benchflow-ai · Yiminnn · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · greptile-apps
diff --git a/.github/scripts/build_integration_review_pack.py b/.github/scripts/build_integration_review_pack.py
@@ -286,13 +286,33 @@ def _classify_one(slot: Slot, expected_source_sha: str | None) -> None:
         slot.detail = f"ungradeable: {exc}"
         return
     if slot.grade["deterministic_reject"]:
-        slot.status = "unhealthy"
         rejects = [
             g["id"]
             for g in slot.grade["gates"]
             if g["status"] == "fail" and g["enforcement"] == "deterministic"
         ]
-        slot.detail = f"deterministic reject: {rejects}"
+        # R-OUTCOME fails when the rollout's status is not a valid SCORED outcome
+        # (an errored / unscored-timeout run, often a hard task the agent didn't
+        # finish in budget) — an experiment-fidelity issue on that one rollout, not
+        # a regression introduced by the PR. Demote an R-OUTCOME-ONLY reject to a
+        # QUARANTINE (visible, non-blocking). Any OTHER deterministic reject
+        # (realness, tamper, artifact, telemetry, schema) is a real health failure
+        # and still hard-blocks.
+        non_outcome = [r for r in rejects if r != "R-OUTCOME"]
+        if non_outcome:
+            slot.status = "unhealthy"
+            slot.detail = f"deterministic reject: {non_outcome}"
+        else:
+            slot.status = "healthy"
+            slot.grade["quarantines"] = [
+                *slot.grade.get("quarantines", []),
+                "R-OUTCOME: rollout produced no valid scored outcome "
+                "(error / unscored timeout) — quarantined, not a PR regression",
+            ]
+            slot.detail = (
+                f"healthy with {len(slot.grade['quarantines'])} quarantine(s) "
+                "(incl. R-OUTCOME)"
+            )
     else:
         slot.status = "healthy"
         if slot.grade["quarantines"]:

diff --git a/.github/scripts/codex_review.py b/.github/scripts/codex_review.py
@@ -38,6 +38,7 @@
 import argparse
 import asyncio
 import contextlib
+import dataclasses
 import json
 import os
 import re
@@ -160,7 +161,9 @@ async def _gather_findings(
     async def one(rollout: Path) -> dict:
         try:
             evidence = agent_judge.load_rollout_evidence(rollout)
-            evidence_json = json.dumps(evidence.to_dict(), indent=2)[:8000]
+            # RolloutEvidence is a frozen dataclass (no .to_dict()); serialize via
+            # dataclasses.asdict. flagged_actions etc. are JSON-able.
+            evidence_json = json.dumps(dataclasses.asdict(evidence), indent=2)[:8000]
         except Exception as exc:  # evidence we cannot load is unhealthy
             return {
                 "rollout": str(rollout),
@@ -281,6 +284,32 @@ def _codex_env(env: Mapping[str, str]) -> dict[str, str]:
     return out
 
 
+def _deepseek_codex_config(
+    model: str | None, env: Mapping[str, str]
+) -> list[str]:
+    """Codex `-c` overrides to run the composer on a DeepSeek model.
+
+    A DeepSeek codex model can't use the OpenAI Responses API + its `tool_search`
+    tool (which small/non-OpenAI models reject), so route codex through a custom
+    chat-completions provider pointed at DeepSeek. The composer only reads the
+    review-pack text we hand it (no web tools needed), so the chat wire is enough.
+    Returns [] for a non-DeepSeek model (codex uses its default OpenAI provider).
+    """
+    if not model or "deepseek" not in model.lower():
+        return []
+    base = (env.get("DEEPSEEK_BASE_URL") or "https://api.deepseek.com").rstrip("/")
+    # DeepSeek's OpenAI-compatible chat endpoint lives under /v1.
+    if not base.endswith("/v1"):
+        base = f"{base}/v1"
+    return [
+        "model_provider=deepseek",
+        "model_providers.deepseek.name=DeepSeek",
+        f"model_providers.deepseek.base_url={base}",
+        "model_providers.deepseek.env_key=DEEPSEEK_API_KEY",
+        "model_providers.deepseek.wire_api=chat",
+    ]
+
+
 def build_codex_command(
     prompt: str,
     *,
@@ -558,7 +587,10 @@ def main(argv: Sequence[str] | None = None) -> int:
         workdir=args.review_pack.resolve().parent,
         codex_bin=args.codex_bin,
         model=args.codex_model,
-        config_overrides=args.config_overrides,
+        config_overrides=[
+            *_deepseek_codex_config(args.codex_model, codex_env),
+            *args.config_overrides,
+        ],
         env=codex_env,
     )
     if args.codex_out:

diff --git a/.github/workflows/integration-final-review.yml b/.github/workflows/integration-final-review.yml
@@ -514,11 +514,12 @@ jobs:
           # codex_review.py uses for the CLI (dropping the DeepSeek base URL) so
           # the judge clobber never leaks into codex auth.
           CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          # Pin the codex composer to a model the OPENAI_API_KEY is PROVEN to serve
-          # (the codex-acp matrix cells use gpt-5.4-nano). Otherwise `codex exec`
-          # falls back to its built-in default (gpt-5.x-codex), which the key may
-          # not be entitled to -> model_not_found -> 'codex unavailable'.
-          CODEX_MODEL: gpt-5.4-nano
+          # Run the codex composer on DeepSeek-v4-pro. codex_review.py routes a
+          # deepseek model through a custom chat-completions provider (DeepSeek
+          # serves chat, not the OpenAI Responses API + `tool_search` tool that
+          # gpt-5.4-nano rejected). Reads DEEPSEEK_API_KEY + DEEPSEEK_BASE_URL from
+          # the job env (both present).
+          CODEX_MODEL: deepseek-v4-pro
           DETERMINISTIC_VERDICT: ${{ steps.pack.outputs.deterministic_verdict }}
         run: |
           set +e

diff --git a/.github/workflows/integration-scope.yml b/.github/workflows/integration-scope.yml
@@ -502,9 +502,10 @@ jobs:
           # via CODEX_API_KEY — codex_review.py isolates the CLI env (real key +
           # default OpenAI base) so the judge clobber never leaks into codex auth.
           CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          # Pin the codex composer to a model the key is proven to serve (else
-          # `codex exec` uses its built-in default, which the key may not serve).
-          CODEX_MODEL: gpt-5.4-nano
+          # Run the codex composer on DeepSeek-v4-pro (codex_review.py routes a
+          # deepseek model through a chat-completions provider; DeepSeek can't
+          # serve the OpenAI Responses API + tool_search that small models reject).
+          CODEX_MODEL: deepseek-v4-pro
         run: |
           set +e
           uv run python .github/scripts/codex_review.py \

diff --git a/tests/test_codex_review.py b/tests/test_codex_review.py
@@ -69,6 +69,25 @@ def test_codex_env_unchanged_without_codex_api_key():
     assert cr._codex_env(src) == src
 
 
+def test_deepseek_codex_config_routes_via_chat_provider():
+    # A DeepSeek composer model can't use the OpenAI Responses API + tool_search,
+    # so codex is pointed at a custom chat-completions provider on DeepSeek.
+    cfg = cr._deepseek_codex_config(
+        "deepseek-v4-pro", {"DEEPSEEK_BASE_URL": "https://api.deepseek.com"}
+    )
+    assert "model_provider=deepseek" in cfg
+    assert "model_providers.deepseek.wire_api=chat" in cfg
+    assert "model_providers.deepseek.env_key=DEEPSEEK_API_KEY" in cfg
+    # DeepSeek's OpenAI-compatible chat endpoint is under /v1.
+    assert "model_providers.deepseek.base_url=https://api.deepseek.com/v1" in cfg
+
+
+def test_deepseek_codex_config_empty_for_openai_model():
+    # A non-DeepSeek model leaves codex on its default OpenAI provider.
+    assert cr._deepseek_codex_config("gpt-5.4-nano", {}) == []
+    assert cr._deepseek_codex_config(None, {}) == []
+
+
 # ------------------------------------------------------------------
 # worst() — advisory-stricter-only composition
 # ------------------------------------------------------------------