Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion .github/scripts/build_integration_review_pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,18 @@ def compute_verdict(slots: list[Slot], parity: list[ParityResult]) -> Verdict:
)

for pr in parity:
if pr.status == "fail":
if pr.status == "fail" and pr.kind == "pinned-baseline":
# The pinned-baseline reward-band gate currently feeds a NATIVE HF
# leaderboard baseline to the Harbor-schema + git-pinned checker, which
# structurally false-fails (missing Harbor fields / pin mismatch) — NOT
# a real reward regression. Quarantine it (visible, non-blocking) until
# check_skillsbench_harbor_parity gains a native-vs-native baseline mode
# (tracked follow-up). Within-PR docker/daytona parity still hard-blocks.
quarantines.append(
f"parity {pr.kind} (advisory — gate needs native-baseline mode): "
f"{pr.pair_id} — {pr.detail}"
)
elif pr.status == "fail":
blockers.append(f"parity {pr.kind} fail: {pr.pair_id} — {pr.detail}")
elif pr.status == "quarantine":
quarantines.append(f"parity {pr.kind}: {pr.pair_id} — {pr.detail}")
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/integration-final-review.yml
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,11 @@ jobs:
# codex_review.py uses for the CLI (dropping the DeepSeek base URL) so
# the judge clobber never leaks into codex auth.
CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# Pin the codex composer to a model the OPENAI_API_KEY is PROVEN to serve
# (the codex-acp matrix cells use gpt-5.4-nano). Otherwise `codex exec`
# falls back to its built-in default (gpt-5.x-codex), which the key may
# not be entitled to -> model_not_found -> 'codex unavailable'.
CODEX_MODEL: gpt-5.4-nano
DETERMINISTIC_VERDICT: ${{ steps.pack.outputs.deterministic_verdict }}
run: |
set +e
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/integration-scope.yml
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,9 @@ jobs:
# via CODEX_API_KEY — codex_review.py isolates the CLI env (real key +
# default OpenAI base) so the judge clobber never leaks into codex auth.
CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# Pin the codex composer to a model the key is proven to serve (else
# `codex exec` uses its built-in default, which the key may not serve).
CODEX_MODEL: gpt-5.4-nano
run: |
set +e
uv run python .github/scripts/codex_review.py \
Expand Down
22 changes: 22 additions & 0 deletions tests/test_build_review_pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,28 @@ class SkipTest(Exception):
"""Raised to skip a test under the stdlib runner / signal pytest.skip."""


def test_pinned_baseline_parity_fail_demotes_to_quarantine() -> None:
    """A pinned-baseline parity FAIL is advisory; other parity FAILs still block.

    Regression guard: the pinned-baseline reward-band gate currently
    false-fails because a native HF leaderboard baseline is run through the
    Harbor-schema + git-pinned checker. Such a FAIL must therefore surface as
    a quarantine (visible, non-blocking) instead of a hard 'not mergeable'.
    """
    # Case 1: a lone pinned-baseline FAIL yields a quarantine-only verdict.
    baseline_failure = pack_mod.ParityResult(
        "pinned-baseline", "pinned-baseline", "fail", "missing Harbor field(s)"
    )
    demoted = pack_mod.compute_verdict([], [baseline_failure])
    assert demoted.verdict == pack_mod.VERDICT_QUARANTINES
    assert not demoted.blockers
    assert demoted.quarantines

    # Case 2: a genuine within-PR docker/daytona parity FAIL keeps hard-blocking.
    sandbox_failure = pack_mod.ParityResult(
        "sandbox-parity(x)", "within-pr", "fail", "incomplete parity pair"
    )
    blocked = pack_mod.compute_verdict([], [sandbox_failure])
    assert blocked.verdict == pack_mod.VERDICT_NOT_MERGEABLE
    assert blocked.blockers


# ------------------------------------------------------------------
# Fixtures: hand-built flat rollouts + a matrix plan.
# ------------------------------------------------------------------
Expand Down
Loading