Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions .github/scripts/build_integration_review_pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,13 +286,33 @@ def _classify_one(slot: Slot, expected_source_sha: str | None) -> None:
slot.detail = f"ungradeable: {exc}"
return
if slot.grade["deterministic_reject"]:
slot.status = "unhealthy"
rejects = [
g["id"]
for g in slot.grade["gates"]
if g["status"] == "fail" and g["enforcement"] == "deterministic"
]
slot.detail = f"deterministic reject: {rejects}"
# R-OUTCOME fails when the rollout's status is not a valid SCORED outcome
# (an errored / unscored-timeout run, often a hard task the agent didn't
# finish in budget) — an experiment-fidelity issue on that one rollout, not
# a regression introduced by the PR. Demote an R-OUTCOME-ONLY reject to a
# QUARANTINE (visible, non-blocking). Any OTHER deterministic reject
# (realness, tamper, artifact, telemetry, schema) is a real health failure
# and still hard-blocks.
non_outcome = [r for r in rejects if r != "R-OUTCOME"]
if non_outcome:
slot.status = "unhealthy"
slot.detail = f"deterministic reject: {non_outcome}"
else:
slot.status = "healthy"
slot.grade["quarantines"] = [
*slot.grade.get("quarantines", []),
"R-OUTCOME: rollout produced no valid scored outcome "
"(error / unscored timeout) — quarantined, not a PR regression",
]
slot.detail = (
f"healthy with {len(slot.grade['quarantines'])} quarantine(s) "
"(incl. R-OUTCOME)"
)
Comment on lines +306 to +315

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 grade["deterministic_reject"] left True after R-OUTCOME demotion

When the R-OUTCOME-only branch fires, slot.status is set to "healthy" but slot.grade["deterministic_reject"] is never cleared — it remains True. agent_judge_summary.json serializes this field directly, so codex (Pass 2) will read a slot where "status": "healthy" and "deterministic_reject": true at the same time. That direct contradiction could cause the codex reviewer to escalate a quarantine to "not mergeable", which is exactly the false-positive this demotion is trying to prevent. Adding slot.grade["deterministic_reject"] = False (alongside the quarantine append) would make the review pack data internally consistent.

else:
slot.status = "healthy"
if slot.grade["quarantines"]:
Expand Down
36 changes: 34 additions & 2 deletions .github/scripts/codex_review.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import argparse
import asyncio
import contextlib
import dataclasses
import json
import os
import re
Expand Down Expand Up @@ -160,7 +161,9 @@ async def _gather_findings(
async def one(rollout: Path) -> dict:
try:
evidence = agent_judge.load_rollout_evidence(rollout)
evidence_json = json.dumps(evidence.to_dict(), indent=2)[:8000]
# RolloutEvidence is a frozen dataclass (no .to_dict()); serialize via
# dataclasses.asdict. flagged_actions etc. are JSON-able.
evidence_json = json.dumps(dataclasses.asdict(evidence), indent=2)[:8000]
except Exception as exc: # evidence we cannot load is unhealthy
return {
"rollout": str(rollout),
Expand Down Expand Up @@ -281,6 +284,32 @@ def _codex_env(env: Mapping[str, str]) -> dict[str, str]:
return out


def _deepseek_codex_config(
model: str | None, env: Mapping[str, str]
) -> list[str]:
"""Codex `-c` overrides to run the composer on a DeepSeek model.

A DeepSeek codex model can't use the OpenAI Responses API + its `tool_search`
tool (which small/non-OpenAI models reject), so route codex through a custom
chat-completions provider pointed at DeepSeek. The composer only reads the
review-pack text we hand it (no web tools needed), so the chat wire is enough.
Returns [] for a non-DeepSeek model (codex uses its default OpenAI provider).
"""
if not model or "deepseek" not in model.lower():
return []
base = (env.get("DEEPSEEK_BASE_URL") or "https://api.deepseek.com").rstrip("/")
# DeepSeek's OpenAI-compatible chat endpoint lives under /v1.
if not base.endswith("/v1"):
base = f"{base}/v1"

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 endswith("/v1") heuristic can double-append on non-standard proxy URLs

base.endswith("/v1") strips no path components before the check, so a DEEPSEEK_BASE_URL like https://proxy.example.com/deepseek/v1/chat/completions would pass through rstrip("/") unchanged, fail the endswith("/v1") guard, and produce …/chat/completions/v1 — an invalid endpoint. The default (https://api.deepseek.com) and the typical …/v1 form work correctly; this only breaks with unusual custom proxy paths.

return [
"model_provider=deepseek",
"model_providers.deepseek.name=DeepSeek",
f"model_providers.deepseek.base_url={base}",
"model_providers.deepseek.env_key=DEEPSEEK_API_KEY",
"model_providers.deepseek.wire_api=chat",
]


def build_codex_command(
prompt: str,
*,
Expand Down Expand Up @@ -558,7 +587,10 @@ def main(argv: Sequence[str] | None = None) -> int:
workdir=args.review_pack.resolve().parent,
codex_bin=args.codex_bin,
model=args.codex_model,
config_overrides=args.config_overrides,
config_overrides=[
*_deepseek_codex_config(args.codex_model, codex_env),
*args.config_overrides,
],
env=codex_env,
)
if args.codex_out:
Expand Down
11 changes: 6 additions & 5 deletions .github/workflows/integration-final-review.yml
Original file line number Diff line number Diff line change
Expand Up @@ -514,11 +514,12 @@ jobs:
# codex_review.py uses for the CLI (dropping the DeepSeek base URL) so
# the judge clobber never leaks into codex auth.
CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# Pin the codex composer to a model the OPENAI_API_KEY is PROVEN to serve
# (the codex-acp matrix cells use gpt-5.4-nano). Otherwise `codex exec`
# falls back to its built-in default (gpt-5.x-codex), which the key may
# not be entitled to -> model_not_found -> 'codex unavailable'.
CODEX_MODEL: gpt-5.4-nano
# Run the codex composer on DeepSeek-v4-pro. codex_review.py routes a
# deepseek model through a custom chat-completions provider (DeepSeek
# serves chat, not the OpenAI Responses API + `tool_search` tool that
# gpt-5.4-nano rejected). Reads DEEPSEEK_API_KEY + DEEPSEEK_BASE_URL from
# the job env (both present).
CODEX_MODEL: deepseek-v4-pro
DETERMINISTIC_VERDICT: ${{ steps.pack.outputs.deterministic_verdict }}
run: |
set +e
Expand Down
7 changes: 4 additions & 3 deletions .github/workflows/integration-scope.yml
Original file line number Diff line number Diff line change
Expand Up @@ -502,9 +502,10 @@ jobs:
# via CODEX_API_KEY — codex_review.py isolates the CLI env (real key +
# default OpenAI base) so the judge clobber never leaks into codex auth.
CODEX_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# Pin the codex composer to a model the key is proven to serve (else
# `codex exec` uses its built-in default, which the key may not serve).
CODEX_MODEL: gpt-5.4-nano
# Run the codex composer on DeepSeek-v4-pro (codex_review.py routes a
# deepseek model through a chat-completions provider; DeepSeek can't
# serve the OpenAI Responses API + tool_search that small models reject).
CODEX_MODEL: deepseek-v4-pro
run: |
set +e
uv run python .github/scripts/codex_review.py \
Expand Down
19 changes: 19 additions & 0 deletions tests/test_codex_review.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,25 @@ def test_codex_env_unchanged_without_codex_api_key():
assert cr._codex_env(src) == src


def test_deepseek_codex_config_routes_via_chat_provider():
# A DeepSeek composer model can't use the OpenAI Responses API + tool_search,
# so codex is pointed at a custom chat-completions provider on DeepSeek.
cfg = cr._deepseek_codex_config(
"deepseek-v4-pro", {"DEEPSEEK_BASE_URL": "https://api.deepseek.com"}
)
assert "model_provider=deepseek" in cfg
assert "model_providers.deepseek.wire_api=chat" in cfg
assert "model_providers.deepseek.env_key=DEEPSEEK_API_KEY" in cfg
# DeepSeek's OpenAI-compatible chat endpoint is under /v1.
assert "model_providers.deepseek.base_url=https://api.deepseek.com/v1" in cfg


def test_deepseek_codex_config_empty_for_openai_model():
# A non-DeepSeek model leaves codex on its default OpenAI provider.
assert cr._deepseek_codex_config("gpt-5.4-nano", {}) == []
assert cr._deepseek_codex_config(None, {}) == []


# ------------------------------------------------------------------
# worst() — advisory-stricter-only composition
# ------------------------------------------------------------------
Expand Down
Loading