v7 Gate 5 — Engineer Contract¶
exp-expert가 v6 n=22 back-test로 empirical cutoff를 도출하고 9-run smoke preview를 검증한 결과(보고서 v7_stage05_gate5_redesign_v2.md), Gate 5 구현을 아래 계약대로 전환해 달라.
1. 수정 대상 (1 파일)¶
experiments/federated/v7_stage05_smoke_analysis.py
1.1 evaluate_gate5 함수 전면 재작성¶
Before (line 488~521):
def evaluate_gate5(runs: list[Any]) -> list[GateResult]:
"""Gate 5: Convergence via ff_val_initial_avg / ff_val_moving_avg ratio."""
...
initial = r.data.metrics.get("ff_val_initial_avg")
moving = r.data.metrics.get("ff_val_moving_avg")
...
ratio = moving / initial
if ratio < 0.5: PASS
elif ratio < 0.6: WARNING
else: FAIL
After:
# ──────────────────────────────────────────────────────────────
# Gate 5 — run-level: convergence via monotonic-decrease metric
# ──────────────────────────────────────────────────────────────
# Empirical cutoffs (v6 n=22 back-test, 2026-04-20):
# See outputs/v7_stage05/v6_gate5_backtest.json.
# Number of val_loss samples averaged at each end of the history
# (the leading block and the trailing block).
GATE5_BLOCK_SIZE = 3
# rel_decrease at or above this value is a PASS.
GATE5_PASS_CUTOFF = 0.0017 # v6 P10
# rel_decrease below this value is a FAIL; between the two cutoffs is WARNING.
GATE5_WARNING_CUTOFF = 0.0 # net-increase boundary
# Minimum number of usable samples; two full blocks, so the leading and
# trailing blocks never overlap.
GATE5_MIN_SAMPLES = 2 * GATE5_BLOCK_SIZE # len(val_loss) >= 6
def evaluate_gate5(
    runs: list[Any], client: MlflowClient | None = None
) -> list[GateResult]:
    """Gate 5: classify each run's convergence from its ``val_loss`` history.

    For every run the full MLflow ``val_loss`` metric history is fetched,
    ordered by step, stripped of NaN entries and reduced to

        rel_decrease = (mean(val_loss[:k]) - mean(val_loss[-k:])) / mean(val_loss[:k])

    with ``k = GATE5_BLOCK_SIZE``. Positive means the loss went down.

    Cutoffs were derived empirically from the v6 n=22 converged-run back-test
    (see outputs/v7_stage05/v6_gate5_backtest.json):

    - PASS    : rel_decrease >= GATE5_PASS_CUTOFF
    - WARNING : GATE5_WARNING_CUTOFF <= rel_decrease < GATE5_PASS_CUTOFF
    - FAIL    : rel_decrease < GATE5_WARNING_CUTOFF (net increase)
    - ERROR   : the history could not be fetched, is empty, has fewer than
      GATE5_MIN_SAMPLES non-NaN samples, or its initial block mean is
      non-positive or non-finite (the ratio would be meaningless).

    Args:
        runs: Run objects exposing ``info.run_id``.
        client: Client used to fetch metric histories. When omitted a
            default ``MlflowClient()`` is created, so existing callers keep
            working; tests can inject a stub.

    Returns:
        One ``GateResult`` per input run, in input order.
    """
    if client is None:
        client = MlflowClient()
    results: list[GateResult] = []
    k = GATE5_BLOCK_SIZE
    for r in runs:
        rid = r.info.run_id
        try:
            hist = client.get_metric_history(rid, "val_loss")
        except Exception as exc:
            # Deliberately broad: any backend failure becomes an ERROR verdict
            # for this run instead of aborting the whole analysis.
            results.append(GateResult(
                "G5", "ERROR", f"val_loss history fetch failed: {exc}", run_id=rid,
            ))
            continue
        if not hist:
            results.append(GateResult(
                "G5", "ERROR", "val_loss history empty", run_id=rid,
            ))
            continue
        # Order by step (the backend order is not relied upon) and drop NaNs.
        vals = [float(h.value) for h in sorted(hist, key=lambda x: x.step)
                if not np.isnan(h.value)]
        if len(vals) < GATE5_MIN_SAMPLES:
            results.append(GateResult(
                "G5", "ERROR",
                f"insufficient val_loss samples (n={len(vals)} < {GATE5_MIN_SAMPLES})",
                run_id=rid,
            ))
            continue
        initial = float(np.mean(vals[:k]))
        final = float(np.mean(vals[-k:]))
        if initial <= 0:
            results.append(GateResult(
                "G5", "ERROR", f"non-positive initial block mean={initial}",
                run_id=rid,
            ))
            continue
        if not np.isfinite(initial):
            # A +inf sample in the leading block makes the ratio NaN, which
            # would otherwise fall through every comparison below and be
            # reported as a "net increase" FAIL.
            results.append(GateResult(
                "G5", "ERROR", f"non-finite initial block mean={initial}",
                run_id=rid,
            ))
            continue
        dec = (initial - final) / initial
        if dec >= GATE5_PASS_CUTOFF:
            results.append(GateResult(
                "G5", "PASS", f"rel_decrease={dec:+.4f} >= {GATE5_PASS_CUTOFF}",
                run_id=rid,
            ))
        elif dec >= GATE5_WARNING_CUTOFF:
            results.append(GateResult(
                "G5", "WARNING",
                f"rel_decrease={dec:+.4f} ∈ [{GATE5_WARNING_CUTOFF}, "
                f"{GATE5_PASS_CUTOFF}) borderline",
                run_id=rid,
            ))
        else:
            results.append(GateResult(
                "G5", "FAIL",
                f"rel_decrease={dec:+.4f} < {GATE5_WARNING_CUTOFF} (net increase)",
                run_id=rid,
            ))
    return results
주요 변경:
1. ff_val_initial_avg / ff_val_moving_avg 사용 → val_loss full history 사용
(MlflowClient.get_metric_history).
2. client: MlflowClient | None = None 인자 추가 (테스트 주입 용이).
3. Block size, cutoff constants는 module-level 상수로 추출.
4. ERROR 조건 명시: empty history, insufficient samples, non-positive initial.
5. Legacy 상수 (0.5 / 0.6)는 완전히 제거 — 추상 재사용 금지.
1.2 Module docstring 갱신¶
Line 21의 현재 G5 기술을 아래 내용으로 교체:
G5 run-level : rel_decrease = (mean(vl[:3]) - mean(vl[-3:])) / mean(vl[:3])
PASS >= 0.0017, WARNING >= 0.0, FAIL < 0.0 (v6 n=22 back-test).
1.3 호출부 업데이트 (있다면)¶
grep으로 evaluate_gate5( 호출을 찾아 client 주입 여부 확인. client 인자를 생략하면 함수 내부에서 MlflowClient를 생성하므로 기존 호출부는 그대로 동작해야 함 (backward-compat).
2. 테스트 갱신 (tests/test_v7_stage05_smoke_analysis.py)¶
2.1 기존 테스트 치환¶
TestGate5::test_gate5_pass_on_good_ratio, test_gate5_fail_on_bad_ratio 두 개는 ff_val_initial_avg/ff_val_moving_avg를 사용하므로 새 계약과 맞지 않음. 아래로 치환:
class TestGate5:
    """Gate 5 verdicts computed from a mocked MLflow ``val_loss`` history."""

    @staticmethod
    def _mk_run_with_val_loss(
        run_id: str, val_loss_series: list[float]
    ) -> SimpleNamespace:
        """Build a minimal FINISHED run stub identified by ``run_id``.

        The stub itself carries no metric history: ``val_loss_series`` is
        accepted only so call sites read naturally. The series reaches
        ``evaluate_gate5`` through the client built by ``_mock_client``.
        """
        del val_loss_series  # history is served by the mocked client
        return SimpleNamespace(
            info=SimpleNamespace(run_id=run_id, status="FINISHED"),
            data=SimpleNamespace(params={"cell_id": "B0", "seed": "42"},
                                 metrics={}, tags={}),
        )

    @staticmethod
    def _mock_client(run_histories: dict[str, list[float]]) -> MagicMock:
        """Return a client stub whose ``get_metric_history`` serves ``run_histories``.

        Only the ``val_loss`` key is served; any other key, or an unknown
        run id, yields an empty history. Steps are the series indices.
        """
        client = MagicMock()

        def _get_metric_history(run_id: str, key: str):
            if key != "val_loss":
                return []
            series = run_histories.get(run_id, [])
            return [
                SimpleNamespace(step=i, value=float(v))
                for i, v in enumerate(series)
            ]

        client.get_metric_history.side_effect = _get_metric_history
        return client

    def test_gate5_pass_on_clear_decrease(self, sa):
        """rel_decrease well above PASS cutoff → PASS."""
        vals = [1.0, 1.0, 1.0, 0.9, 0.9, 0.9]  # rel_decrease = 0.10
        run = self._mk_run_with_val_loss("r1", vals)
        client = self._mock_client({"r1": vals})
        results = sa.evaluate_gate5([run], client=client)
        assert results[0].verdict == "PASS", results[0].detail

    def test_gate5_warning_on_borderline(self, sa):
        """rel_decrease ∈ [0, 0.0017) → WARNING."""
        # mean([1.0]*3)=1.0, mean([0.9995]*3)=0.9995 -> dec=0.0005
        vals = [1.0, 1.0, 1.0, 0.9995, 0.9995, 0.9995]
        run = self._mk_run_with_val_loss("r1", vals)
        client = self._mock_client({"r1": vals})
        results = sa.evaluate_gate5([run], client=client)
        assert results[0].verdict == "WARNING", results[0].detail

    def test_gate5_fail_on_net_increase(self, sa):
        """val_loss net increase → FAIL."""
        vals = [0.6, 0.6, 0.6, 0.61, 0.61, 0.61]  # rel_decrease = -0.0167
        run = self._mk_run_with_val_loss("r1", vals)
        client = self._mock_client({"r1": vals})
        results = sa.evaluate_gate5([run], client=client)
        assert results[0].verdict == "FAIL", results[0].detail

    def test_gate5_error_on_insufficient_samples(self, sa):
        """len(val_loss) < 6 → ERROR."""
        vals = [1.0, 0.9, 0.8]  # too short
        run = self._mk_run_with_val_loss("r1", vals)
        client = self._mock_client({"r1": vals})
        results = sa.evaluate_gate5([run], client=client)
        assert results[0].verdict == "ERROR"
        assert "insufficient" in results[0].detail.lower()

    def test_gate5_error_on_empty_history(self, sa):
        """Missing val_loss metric → ERROR."""
        run = self._mk_run_with_val_loss("r1", [])
        client = self._mock_client({})  # no series
        results = sa.evaluate_gate5([run], client=client)
        assert results[0].verdict == "ERROR"

    def test_gate5_error_on_non_positive_initial(self, sa):
        """Initial block mean <= 0 → ERROR (relative decrease undefined)."""
        vals = [0.0, 0.0, 0.0, 0.1, 0.1, 0.1]
        run = self._mk_run_with_val_loss("r1", vals)
        client = self._mock_client({"r1": vals})
        results = sa.evaluate_gate5([run], client=client)
        assert results[0].verdict == "ERROR"
        assert "non-positive" in results[0].detail.lower()

    def test_gate5_error_on_history_fetch_failure(self, sa):
        """Client raising while fetching val_loss → ERROR, not a crash."""
        run = self._mk_run_with_val_loss("r1", [])
        client = MagicMock()
        client.get_metric_history.side_effect = RuntimeError("boom")
        results = sa.evaluate_gate5([run], client=client)
        assert results[0].verdict == "ERROR"
        assert "fetch failed" in results[0].detail

    def test_gate5_ignores_nan_samples(self, sa):
        """NaN entries are dropped before the sample count and block means."""
        vals = [1.0, 1.0, 1.0, float("nan"), 0.9, 0.9, 0.9]
        run = self._mk_run_with_val_loss("r1", vals)
        client = self._mock_client({"r1": vals})
        results = sa.evaluate_gate5([run], client=client)
        assert results[0].verdict == "PASS", results[0].detail

    def test_gate5_smoke_preview_reproduction(self, sa):
        """Reproduce the 9-run smoke preview outcome (PASS 5 / WARN 1 / FAIL 3)
        using the val_loss series captured in outputs/v7_stage05/smoke_gate5_preview.json.
        """
        import json

        preview_path = ROOT / "outputs" / "v7_stage05" / "smoke_gate5_preview.json"
        if not preview_path.exists():
            pytest.skip("preview JSON not present (run smoke_gate5_preview.py first)")
        preview = json.loads(preview_path.read_text(encoding="utf-8"))
        runs, histories, expected = [], {}, {}
        for row in preview["per_run"]:
            rid = row["run_id"]
            runs.append(SimpleNamespace(
                info=SimpleNamespace(run_id=rid, status="FINISHED"),
                data=SimpleNamespace(params={}, metrics={}, tags={}),
            ))
            histories[rid] = row["val_loss_series"]
            expected[rid] = row["new_metric"]["verdict"]
        client = self._mock_client(histories)
        actual = {r.run_id: r.verdict for r in sa.evaluate_gate5(runs, client=client)}
        assert actual == expected
2.2 make_run 시그니처 정리¶
ff_val_initial_avg, ff_val_moving_avg 파라미터는 Gate 5와 더 이상 무관. include_full_metrics의 기본 branch에서도 제거 (단 ff_val_divergence_ratio는 §2.4 divergence detector에서 여전히 사용되니 유지).
2.3 Aggregation 테스트 영향¶
TestAggregate::test_aggregate_all_pass 등 집계 테스트가 Gate 5 결과에 의존한다면 mock client를 주입하여 val_loss 시리즈를 공급해야 함. grep으로 확인 바람.
3. Verification 요구사항¶
- uv run python -m pytest tests/test_v7_stage05_smoke_analysis.py -v 전체 PASS 유지.
- uv run python -m pytest tests/ -v --ignore=tests/integration_distilts.py 전체 PASS 유지.
- 기존 9-run smoke에 대해: 실행 시 report가 PASS 5 / WARNING 1 / FAIL 3 으로 갱신되어야 함 (Overall verdict = FAIL, Gate 5 행).
- Overall verdict는 FAIL 유지 (Gate 5 FAIL 3개 때문). A3 diagnosis는 orchestrator 별도 결정.
superpowers:verification-before-completion skill로 실제 실행 결과 확인 후에만 완료 보고.
4. 하지 말아야 할 것¶
- v7_runner.py의 compute_fail_fast_metrics 수정 금지: ff_val_initial_avg / ff_val_moving_avg는 §2.4 divergence detector 입력으로 여전히 사용됨. 제거 시 divergence hook이 깨진다.
- smoke-mode flag / full-mode bifurcation 도입 금지 (critic CR1 명시적 금지).
- Legacy constants (0.5, 0.6)를 "하위 호환"으로 남기지 말 것 — silent drift 소스.
- v7 smoke 재실행 금지 (이번 작업은 analysis-only).
5. 질문 생기면¶
- exp-expert(본 보고서 작성자)에게 직접 질의. engineer 독자 해석 금지.
- 특히 "A3 FAIL이 실제 문제인가"는 orchestrator / exp-expert 판단 영역. engineer는 verdict 변환만 한다.
작성자: exp-expert | 2026-04-20 | cycle 2/2