# Patch summary (text before the first "diff --git" header is not part of any hunk):
#   * build_integration_review_pack.py, compute_verdict: a parity result with
#     status "fail" and kind "pinned-baseline" is appended to `quarantines`
#     instead of `blockers`; every other "fail" still goes to `blockers`.
#   * integration-final-review.yml and integration-scope.yml: add
#     `CODEX_MODEL: gpt-5.4-nano` next to the existing CODEX_API_KEY env entry.
#   * tests/test_build_review_pack.py: add
#     test_pinned_baseline_parity_fail_demotes_to_quarantine covering both branches.
#
# NOTE(review): the new branch tests only pr.status and pr.kind, never pr.detail,
#   so every failing pinned-baseline result is demoted, not just the structural
#   false-fails the in-hunk comment describes. Confirm that is acceptable.
# NOTE(review): the model name is hard-coded separately in both workflow files;
#   the two values have to be kept in sync by hand.
# NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
#   Line breaks follow the hunk-header counts, but leading whitespace (especially
#   on context lines and in the YAML hunks) is inferred. Verify against the
#   target files, or apply with whitespace-insensitive matching.
diff --git a/.github/scripts/build_integration_review_pack.py b/.github/scripts/build_integration_review_pack.py
index 1cf8c63e..df955504 100644
--- a/.github/scripts/build_integration_review_pack.py
+++ b/.github/scripts/build_integration_review_pack.py
@@ -409,7 +409,18 @@ def compute_verdict(slots: list[Slot], parity: list[ParityResult]) -> Verdict:
         )
 
     for pr in parity:
-        if pr.status == "fail":
+        if pr.status == "fail" and pr.kind == "pinned-baseline":
+            # The pinned-baseline reward-band gate currently feeds a NATIVE HF
+            # leaderboard baseline to the Harbor-schema + git-pinned checker, which
+            # structurally false-fails (missing Harbor fields / pin mismatch) — NOT
+            # a real reward regression. Quarantine it (visible, non-blocking) until
+            # check_skillsbench_harbor_parity gains a native-vs-native baseline mode
+            # (tracked follow-up). Within-PR docker/daytona parity still hard-blocks.
+            quarantines.append(
+                f"parity {pr.kind} (advisory — gate needs native-baseline mode): "
+                f"{pr.pair_id} — {pr.detail}"
+            )
+        elif pr.status == "fail":
             blockers.append(f"parity {pr.kind} fail: {pr.pair_id} — {pr.detail}")
         elif pr.status == "quarantine":
             quarantines.append(f"parity {pr.kind}: {pr.pair_id} — {pr.detail}")
diff --git a/.github/workflows/integration-final-review.yml b/.github/workflows/integration-final-review.yml
index e6b7e5b9..5ec28bc8 100644
--- a/.github/workflows/integration-final-review.yml
+++ b/.github/workflows/integration-final-review.yml
@@ -514,6 +514,11 @@ jobs:
           # codex_review.py uses for the CLI (dropping the DeepSeek base URL) so
           # the judge clobber never leaks into codex auth.
           CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          # Pin the codex composer to a model the OPENAI_API_KEY is PROVEN to serve
+          # (the codex-acp matrix cells use gpt-5.4-nano). Otherwise `codex exec`
+          # falls back to its built-in default (gpt-5.x-codex), which the key may
+          # not be entitled to -> model_not_found -> 'codex unavailable'.
+          CODEX_MODEL: gpt-5.4-nano
           DETERMINISTIC_VERDICT: ${{ steps.pack.outputs.deterministic_verdict }}
         run: |
           set +e
diff --git a/.github/workflows/integration-scope.yml b/.github/workflows/integration-scope.yml
index 8b186282..779e0240 100644
--- a/.github/workflows/integration-scope.yml
+++ b/.github/workflows/integration-scope.yml
@@ -502,6 +502,9 @@ jobs:
           # via CODEX_API_KEY — codex_review.py isolates the CLI env (real key +
           # default OpenAI base) so the judge clobber never leaks into codex auth.
           CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          # Pin the codex composer to a model the key is proven to serve (else
+          # `codex exec` uses its built-in default, which the key may not serve).
+          CODEX_MODEL: gpt-5.4-nano
         run: |
           set +e
           uv run python .github/scripts/codex_review.py \
diff --git a/tests/test_build_review_pack.py b/tests/test_build_review_pack.py
index 238f879d..009cd460 100644
--- a/tests/test_build_review_pack.py
+++ b/tests/test_build_review_pack.py
@@ -39,6 +39,28 @@ class SkipTest(Exception):
     """Raised to skip a test under the stdlib runner / signal pytest.skip."""
 
 
+def test_pinned_baseline_parity_fail_demotes_to_quarantine() -> None:
+    # Regression: the pinned-baseline reward-band gate currently false-fails
+    # (a native HF leaderboard baseline run through the Harbor-schema + git-pinned
+    # checker), so a pinned-baseline FAIL must DEMOTE to a quarantine — visible but
+    # non-blocking — not a hard 'not mergeable'.
+    pinned_fail = pack_mod.ParityResult(
+        "pinned-baseline", "pinned-baseline", "fail", "missing Harbor field(s)"
+    )
+    v = pack_mod.compute_verdict([], [pinned_fail])
+    assert v.verdict == pack_mod.VERDICT_QUARANTINES
+    assert not v.blockers
+    assert v.quarantines
+
+    # ...but a REAL within-PR docker/daytona parity FAIL still hard-blocks.
+    within_fail = pack_mod.ParityResult(
+        "sandbox-parity(x)", "within-pr", "fail", "incomplete parity pair"
+    )
+    v2 = pack_mod.compute_verdict([], [within_fail])
+    assert v2.verdict == pack_mod.VERDICT_NOT_MERGEABLE
+    assert v2.blockers
+
+
 # ------------------------------------------------------------------
 # Fixtures: hand-built flat rollouts + a matrix plan.
 # ------------------------------------------------------------------