Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions .github/scripts/build_integration_review_pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,13 +286,33 @@ def _classify_one(slot: Slot, expected_source_sha: str | None) -> None:
slot.detail = f"ungradeable: {exc}"
return
if slot.grade["deterministic_reject"]:
slot.status = "unhealthy"
rejects = [
g["id"]
for g in slot.grade["gates"]
if g["status"] == "fail" and g["enforcement"] == "deterministic"
]
slot.detail = f"deterministic reject: {rejects}"
# R-OUTCOME fails when the rollout's status is not a valid SCORED outcome
# (an errored / unscored-timeout run, often a hard task the agent didn't
# finish in budget) — an experiment-fidelity issue on that one rollout, not
# a regression introduced by the PR. Demote an R-OUTCOME-ONLY reject to a
# QUARANTINE (visible, non-blocking). Any OTHER deterministic reject
# (realness, tamper, artifact, telemetry, schema) is a real health failure
# and still hard-blocks.
non_outcome = [r for r in rejects if r != "R-OUTCOME"]
if non_outcome:
slot.status = "unhealthy"
slot.detail = f"deterministic reject: {non_outcome}"
else:
slot.status = "healthy"
slot.grade["quarantines"] = [
*slot.grade.get("quarantines", []),
"R-OUTCOME: rollout produced no valid scored outcome "
"(error / unscored timeout) — quarantined, not a PR regression",
]
slot.detail = (
f"healthy with {len(slot.grade['quarantines'])} quarantine(s) "
"(incl. R-OUTCOME)"
)
Comment on lines +306 to +315

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 grade["deterministic_reject"] left True after R-OUTCOME demotion

When the R-OUTCOME-only branch fires, slot.status is set to "healthy" but slot.grade["deterministic_reject"] is never cleared — it remains True. agent_judge_summary.json serializes this field directly, so codex (Pass 2) will read a slot where "status": "healthy" and "deterministic_reject": true at the same time. That direct contradiction could cause the codex reviewer to escalate a quarantine to "not mergeable", which is exactly the false-positive this demotion is trying to prevent. Adding slot.grade["deterministic_reject"] = False (alongside the quarantine append) would make the review pack data internally consistent.

else:
slot.status = "healthy"
if slot.grade["quarantines"]:
Expand Down
24 changes: 22 additions & 2 deletions .github/scripts/codex_review.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import argparse
import asyncio
import contextlib
import dataclasses
import json
import os
import re
Expand Down Expand Up @@ -160,7 +161,9 @@ async def _gather_findings(
async def one(rollout: Path) -> dict:
try:
evidence = agent_judge.load_rollout_evidence(rollout)
evidence_json = json.dumps(evidence.to_dict(), indent=2)[:8000]
# RolloutEvidence is a frozen dataclass (no .to_dict()); serialize via
# dataclasses.asdict. flagged_actions etc. are JSON-able.
evidence_json = json.dumps(dataclasses.asdict(evidence), indent=2)[:8000]
except Exception as exc: # evidence we cannot load is unhealthy
return {
"rollout": str(rollout),
Expand Down Expand Up @@ -281,6 +284,20 @@ def _codex_env(env: Mapping[str, str]) -> dict[str, str]:
return out


def _reasoning_config(env: Mapping[str, str]) -> list[str]:
"""Codex `-c` override for the composer's reasoning effort.

``CODEX_REASONING_EFFORT`` (none|minimal|low|medium|high|xhigh) sets how hard
the codex composer reasons over the review pack. Empty -> codex default.
(codex speaks ONLY the OpenAI Responses API, so the composer must be an
OpenAI model — DeepSeek/chat-wire is not supported by codex.)
"""
effort = (env.get("CODEX_REASONING_EFFORT") or "").strip()
if not effort:
return []
return [f'model_reasoning_effort="{effort}"']


def build_codex_command(
prompt: str,
*,
Expand Down Expand Up @@ -558,7 +575,10 @@ def main(argv: Sequence[str] | None = None) -> int:
workdir=args.review_pack.resolve().parent,
codex_bin=args.codex_bin,
model=args.codex_model,
config_overrides=args.config_overrides,
config_overrides=[
*_reasoning_config(codex_env),
*args.config_overrides,
],
env=codex_env,
)
if args.codex_out:
Expand Down
12 changes: 7 additions & 5 deletions .github/workflows/integration-final-review.yml
Original file line number Diff line number Diff line change
Expand Up @@ -514,11 +514,13 @@ jobs:
# codex_review.py uses for the CLI (dropping the DeepSeek base URL) so
# the judge clobber never leaks into codex auth.
CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# Pin the codex composer to a model the OPENAI_API_KEY is PROVEN to serve
# (the codex-acp matrix cells use gpt-5.4-nano). Otherwise `codex exec`
# falls back to its built-in default (gpt-5.x-codex), which the key may
# not be entitled to -> model_not_found -> 'codex unavailable'.
CODEX_MODEL: gpt-5.4-nano
# codex speaks ONLY the OpenAI Responses API — DeepSeek is impossible
# (codex removed chat-wire; DeepSeek doesn't serve Responses). So the
# composer runs on an OpenAI model: gpt-5.5 (full model — supports
# codex's tools, unlike gpt-5.4-nano which rejected `tool_search`) with
# xhigh reasoning effort for a thorough equivalence review.
CODEX_MODEL: gpt-5.5
CODEX_REASONING_EFFORT: xhigh
DETERMINISTIC_VERDICT: ${{ steps.pack.outputs.deterministic_verdict }}
run: |
set +e
Expand Down
7 changes: 4 additions & 3 deletions .github/workflows/integration-scope.yml
Original file line number Diff line number Diff line change
Expand Up @@ -502,9 +502,10 @@ jobs:
# via CODEX_API_KEY — codex_review.py isolates the CLI env (real key +
# default OpenAI base) so the judge clobber never leaks into codex auth.
CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# Pin the codex composer to a model the key is proven to serve (else
# `codex exec` uses its built-in default, which the key may not serve).
CODEX_MODEL: gpt-5.4-nano
# codex requires the OpenAI Responses API, which DeepSeek does not serve, so
# the composer runs on gpt-5.5 with xhigh reasoning effort.
CODEX_MODEL: gpt-5.5
CODEX_REASONING_EFFORT: xhigh
run: |
set +e
uv run python .github/scripts/codex_review.py \
Expand Down
13 changes: 13 additions & 0 deletions tests/test_codex_review.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,19 @@ def test_codex_env_unchanged_without_codex_api_key():
assert cr._codex_env(src) == src


def test_reasoning_config_sets_effort_override():
    """A set CODEX_REASONING_EFFORT becomes one model_reasoning_effort override."""
    env = {"CODEX_REASONING_EFFORT": "xhigh"}
    expected = ['model_reasoning_effort="xhigh"']
    assert cr._reasoning_config(env) == expected


def test_reasoning_config_empty_when_unset():
    """A missing or blank CODEX_REASONING_EFFORT produces no override at all."""
    unset_env = {}
    blank_env = {"CODEX_REASONING_EFFORT": " "}
    for env in (unset_env, blank_env):
        assert cr._reasoning_config(env) == []


# ------------------------------------------------------------------
# worst() — advisory-stricter-only composition
# ------------------------------------------------------------------
Expand Down
Loading