benchflow-ai · Yiminnn · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · greptile-apps
diff --git a/.github/scripts/build_integration_review_pack.py b/.github/scripts/build_integration_review_pack.py
@@ -286,13 +286,33 @@ def _classify_one(slot: Slot, expected_source_sha: str | None) -> None:
         slot.detail = f"ungradeable: {exc}"
         return
     if slot.grade["deterministic_reject"]:
-        slot.status = "unhealthy"
         rejects = [
             g["id"]
             for g in slot.grade["gates"]
             if g["status"] == "fail" and g["enforcement"] == "deterministic"
         ]
-        slot.detail = f"deterministic reject: {rejects}"
+        # R-OUTCOME fails when the rollout's status is not a valid SCORED outcome
+        # (an errored / unscored-timeout run, often a hard task the agent didn't
+        # finish in budget) — an experiment-fidelity issue on that one rollout, not
+        # a regression introduced by the PR. Demote an R-OUTCOME-ONLY reject to a
+        # QUARANTINE (visible, non-blocking). Any OTHER deterministic reject
+        # (realness, tamper, artifact, telemetry, schema) is a real health failure
+        # and still hard-blocks.
+        non_outcome = [r for r in rejects if r != "R-OUTCOME"]
+        if non_outcome or not rejects:  # fail closed: no gate explains the reject
+            slot.status = "unhealthy"
+            slot.detail = f"deterministic reject: {non_outcome}"
+        else:
+            slot.status = "healthy"
+            slot.grade["quarantines"] = [
+                *slot.grade.get("quarantines", []),
+                "R-OUTCOME: rollout produced no valid scored outcome "
+                "(error / unscored timeout) — quarantined, not a PR regression",
+            ]
+            slot.detail = (
+                f"healthy with {len(slot.grade['quarantines'])} quarantine(s) "
+                "(incl. R-OUTCOME)"
+            )
     else:
         slot.status = "healthy"
         if slot.grade["quarantines"]:

diff --git a/.github/scripts/codex_review.py b/.github/scripts/codex_review.py
@@ -38,6 +38,7 @@
 import argparse
 import asyncio
 import contextlib
+import dataclasses
 import json
 import os
 import re
@@ -160,7 +161,9 @@ async def _gather_findings(
     async def one(rollout: Path) -> dict:
         try:
             evidence = agent_judge.load_rollout_evidence(rollout)
-            evidence_json = json.dumps(evidence.to_dict(), indent=2)[:8000]
+            # RolloutEvidence is a frozen dataclass (no .to_dict()); serialize via
+            # dataclasses.asdict. flagged_actions etc. are JSON-able.
+            evidence_json = json.dumps(dataclasses.asdict(evidence), indent=2)[:8000]
         except Exception as exc:  # evidence we cannot load is unhealthy
             return {
                 "rollout": str(rollout),
@@ -281,6 +284,20 @@ def _codex_env(env: Mapping[str, str]) -> dict[str, str]:
     return out
 
 
+def _reasoning_config(env: Mapping[str, str]) -> list[str]:
+    """Codex `-c` override for the composer's reasoning effort.
+
+    ``CODEX_REASONING_EFFORT`` (none|minimal|low|medium|high|xhigh) sets how hard
+    the codex composer reasons over the review pack. Empty -> no override (codex
+    default). The value is JSON-escaped so a stray quote cannot break the override.
+    (codex speaks only the OpenAI Responses API: the composer must be an OpenAI model.)
+    """
+    effort = (env.get("CODEX_REASONING_EFFORT") or "").strip()
+    if not effort:
+        return []
+    return [f"model_reasoning_effort={json.dumps(effort)}"]
+
+
 def build_codex_command(
     prompt: str,
     *,
@@ -558,7 +575,10 @@ def main(argv: Sequence[str] | None = None) -> int:
         workdir=args.review_pack.resolve().parent,
         codex_bin=args.codex_bin,
         model=args.codex_model,
-        config_overrides=args.config_overrides,
+        config_overrides=[
+            *_reasoning_config(codex_env),
+            *args.config_overrides,
+        ],
         env=codex_env,
     )
     if args.codex_out:

diff --git a/.github/workflows/integration-final-review.yml b/.github/workflows/integration-final-review.yml
@@ -514,11 +514,13 @@ jobs:
           # codex_review.py uses for the CLI (dropping the DeepSeek base URL) so
           # the judge clobber never leaks into codex auth.
           CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          # Pin the codex composer to a model the OPENAI_API_KEY is PROVEN to serve
-          # (the codex-acp matrix cells use gpt-5.4-nano). Otherwise `codex exec`
-          # falls back to its built-in default (gpt-5.x-codex), which the key may
-          # not be entitled to -> model_not_found -> 'codex unavailable'.
-          CODEX_MODEL: gpt-5.4-nano
+          # codex speaks ONLY the OpenAI Responses API, so DeepSeek cannot be used
+          # (codex removed chat-wire; DeepSeek doesn't serve Responses). The
+          # composer therefore runs on an OpenAI model: gpt-5.5 (full model — supports
+          # codex's tools, unlike gpt-5.4-nano which rejected `tool_search`) with
+          # xhigh reasoning effort for a thorough equivalence review.
+          CODEX_MODEL: gpt-5.5
+          CODEX_REASONING_EFFORT: xhigh
           DETERMINISTIC_VERDICT: ${{ steps.pack.outputs.deterministic_verdict }}
         run: |
           set +e

diff --git a/.github/workflows/integration-scope.yml b/.github/workflows/integration-scope.yml
@@ -502,9 +502,10 @@ jobs:
           # via CODEX_API_KEY — codex_review.py isolates the CLI env (real key +
           # default OpenAI base) so the judge clobber never leaks into codex auth.
           CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          # Pin the codex composer to a model the key is proven to serve (else
-          # `codex exec` uses its built-in default, which the key may not serve).
-          CODEX_MODEL: gpt-5.4-nano
+          # codex requires the OpenAI Responses API, which DeepSeek does not serve,
+          # so the composer runs on gpt-5.5 with xhigh reasoning effort.
+          CODEX_MODEL: gpt-5.5
+          CODEX_REASONING_EFFORT: xhigh
         run: |
           set +e
           uv run python .github/scripts/codex_review.py \

diff --git a/tests/test_codex_review.py b/tests/test_codex_review.py
@@ -69,6 +69,19 @@ def test_codex_env_unchanged_without_codex_api_key():
     assert cr._codex_env(src) == src
 
 
+def test_reasoning_config_sets_effort_override():
+    # An explicit effort is rendered as one model_reasoning_effort `-c` override.
+    overrides = cr._reasoning_config({"CODEX_REASONING_EFFORT": "xhigh"})
+    expected = ['model_reasoning_effort="xhigh"']
+    assert overrides == expected
+
+
+def test_reasoning_config_empty_when_unset():
+    # A missing or whitespace-only effort yields no override (codex default effort).
+    for env in ({}, {"CODEX_REASONING_EFFORT": "  "}):
+        assert cr._reasoning_config(env) == []
+
+
 # ------------------------------------------------------------------
 # worst() — advisory-stricter-only composition
 # ------------------------------------------------------------------