From 8680be02601c5a95906a58fc7633f4bd16e6289d Mon Sep 17 00:00:00 2001 From: symphony-bot Date: Fri, 19 Jun 2026 00:22:15 +0000 Subject: [PATCH] =?UTF-8?q?fix(integration):=20unblock=20the=20L3=20verdic?= =?UTF-8?q?t=20=E2=80=94=20pin=20codex=20model=20+=20demote=20false=20pari?= =?UTF-8?q?ty=20blocker?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first full L3 run on #803 (plan ✓, 10/10 rollouts ✓, deterministic grader ✓) went red for two reasons, both confirmed by a pre-flight audit: 1. codex used its built-in default model (no --codex-model set), which the repo OPENAI_API_KEY is not entitled to -> model_not_found -> 'codex unavailable'. Pin CODEX_MODEL=gpt-5.4-nano (the model the codex-acp cells prove the key serves) in both review-pack codex steps. 2. the pinned-baseline reward-band gate STRUCTURALLY false-fails: the workflow feeds a NATIVE BenchFlow HF leaderboard baseline to check_skillsbench_harbor_ parity, which validates Harbor schema + a git pin -> 'missing Harbor field(s)' / pin mismatch -> fail -> hard not-mergeable. This is not a real reward regression. Demote a pinned-baseline parity FAIL to a QUARANTINE (visible, non-blocking); within-PR docker/daytona parity still hard-blocks. Follow-ups (documented, not blocking): a native-vs-native baseline mode for check_skillsbench_harbor_parity (the real parity fix); the S-NOSKILL gate is silently NA because production rollouts ship config.json not run_config.json. Regression test added for the parity demote. --- .../scripts/build_integration_review_pack.py | 13 ++++++++++- .../workflows/integration-final-review.yml | 5 +++++ .github/workflows/integration-scope.yml | 3 +++ tests/test_build_review_pack.py | 22 +++++++++++++++++++ 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/.github/scripts/build_integration_review_pack.py b/.github/scripts/build_integration_review_pack.py index 1cf8c63e..df955504 100644 --- a/.github/scripts/build_integration_review_pack.py +++ b/.github/scripts/build_integration_review_pack.py @@ -409,7 +409,18 @@ def compute_verdict(slots: list[Slot], parity: list[ParityResult]) -> Verdict: ) for pr in parity: - if pr.status == "fail": + if pr.status == "fail" and pr.kind == "pinned-baseline": + # The pinned-baseline reward-band gate currently feeds a NATIVE HF + # leaderboard baseline to the Harbor-schema + git-pinned checker, which + # structurally false-fails (missing Harbor fields / pin mismatch) — NOT + # a real reward regression. Quarantine it (visible, non-blocking) until + # check_skillsbench_harbor_parity gains a native-vs-native baseline mode + # (tracked follow-up). Within-PR docker/daytona parity still hard-blocks. + quarantines.append( + f"parity {pr.kind} (advisory — gate needs native-baseline mode): " + f"{pr.pair_id} — {pr.detail}" + ) + elif pr.status == "fail": blockers.append(f"parity {pr.kind} fail: {pr.pair_id} — {pr.detail}") elif pr.status == "quarantine": quarantines.append(f"parity {pr.kind}: {pr.pair_id} — {pr.detail}") diff --git a/.github/workflows/integration-final-review.yml b/.github/workflows/integration-final-review.yml index e6b7e5b9..5ec28bc8 100644 --- a/.github/workflows/integration-final-review.yml +++ b/.github/workflows/integration-final-review.yml @@ -514,6 +514,11 @@ jobs: # codex_review.py uses for the CLI (dropping the DeepSeek base URL) so # the judge clobber never leaks into codex auth. CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }} + # Pin the codex composer to a model the OPENAI_API_KEY is PROVEN to serve + # (the codex-acp matrix cells use gpt-5.4-nano). Otherwise `codex exec` + # falls back to its built-in default (gpt-5.x-codex), which the key may + # not be entitled to -> model_not_found -> 'codex unavailable'. + CODEX_MODEL: gpt-5.4-nano DETERMINISTIC_VERDICT: ${{ steps.pack.outputs.deterministic_verdict }} run: | set +e diff --git a/.github/workflows/integration-scope.yml b/.github/workflows/integration-scope.yml index 8b186282..779e0240 100644 --- a/.github/workflows/integration-scope.yml +++ b/.github/workflows/integration-scope.yml @@ -502,6 +502,9 @@ jobs: # via CODEX_API_KEY — codex_review.py isolates the CLI env (real key + # default OpenAI base) so the judge clobber never leaks into codex auth. CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }} + # Pin the codex composer to a model the key is proven to serve (else + # `codex exec` uses its built-in default, which the key may not serve). + CODEX_MODEL: gpt-5.4-nano run: | set +e uv run python .github/scripts/codex_review.py \ diff --git a/tests/test_build_review_pack.py b/tests/test_build_review_pack.py index 238f879d..009cd460 100644 --- a/tests/test_build_review_pack.py +++ b/tests/test_build_review_pack.py @@ -39,6 +39,28 @@ class SkipTest(Exception): """Raised to skip a test under the stdlib runner / signal pytest.skip.""" +def test_pinned_baseline_parity_fail_demotes_to_quarantine() -> None: + # Regression: the pinned-baseline reward-band gate currently false-fails + # (a native HF leaderboard baseline run through the Harbor-schema + git-pinned + # checker), so a pinned-baseline FAIL must DEMOTE to a quarantine — visible but + # non-blocking — not a hard 'not mergeable'. + pinned_fail = pack_mod.ParityResult( + "pinned-baseline", "pinned-baseline", "fail", "missing Harbor field(s)" + ) + v = pack_mod.compute_verdict([], [pinned_fail]) + assert v.verdict == pack_mod.VERDICT_QUARANTINES + assert not v.blockers + assert v.quarantines + + # ...but a REAL within-PR docker/daytona parity FAIL still hard-blocks. + within_fail = pack_mod.ParityResult( + "sandbox-parity(x)", "within-pr", "fail", "incomplete parity pair" + ) + v2 = pack_mod.compute_verdict([], [within_fail]) + assert v2.verdict == pack_mod.VERDICT_NOT_MERGEABLE + assert v2.blockers + + # ------------------------------------------------------------------ # Fixtures: hand-built flat rollouts + a matrix plan. # ------------------------------------------------------------------