Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/scripts/build_integration_review_pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,11 @@ def _classify_one(slot: Slot, expected_source_sha: str | None) -> None:
slot.detail = f"deterministic reject: {non_outcome}"
else:
slot.status = "healthy"
# Demoted to healthy: clear the reject flag too, so the serialized
# agent_judge_summary does not tell codex this healthy slot still has a
# deterministic reject — a contradiction that can spuriously push the
# codex reviewer to downgrade the verdict.
slot.grade["deterministic_reject"] = False
slot.grade["quarantines"] = [
*slot.grade.get("quarantines", []),
"R-OUTCOME: rollout produced no valid scored outcome "
Expand Down
13 changes: 8 additions & 5 deletions .github/scripts/codex_review.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,14 +184,17 @@ async def one(rollout: Path) -> dict:
"error": f"deepseek pass failed: {type(exc).__name__}: {exc}",
}
finding: dict = {"rollout": str(rollout), "raw": raw[:4000]}
match = re.search(r"\{.*\}", raw, re.DOTALL)
if match:
start = raw.find("{")
if start == -1:
finding["parse_error"] = "no JSON object in deepseek finding"
else:
try:
finding["parsed"] = json.loads(match.group(0))
# raw_decode parses the FIRST complete JSON object and ignores any
# trailing prose the model appends. A greedy `{.*}` span would
# instead merge multiple objects into one invalid blob.
finding["parsed"], _ = json.JSONDecoder().raw_decode(raw[start:])
except json.JSONDecodeError:
finding["parse_error"] = "deepseek finding JSON was unparseable"
else:
finding["parse_error"] = "no JSON object in deepseek finding"
return finding

return await asyncio.gather(*(one(r) for r in rollout_dirs))
Expand Down
22 changes: 15 additions & 7 deletions .github/scripts/integration_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -1111,13 +1111,21 @@ def build_plan(


def _changed_files_from_git(base_ref: str, head_sha: str) -> list[str]:
    """Return the paths that differ between ``base_ref`` and ``head_sha``.

    Runs ``git diff --name-only <base_ref>...<head_sha>`` with ``REPO_ROOT``
    as the working directory and returns every non-blank output line with
    surrounding whitespace removed.

    Raises:
        ScopeError: if ``git diff`` exits with a non-zero status. The message
            carries git's stderr when there is any, otherwise the underlying
            ``CalledProcessError``.
    """
    try:
        out = subprocess.run(
            ["git", "diff", "--name-only", f"{base_ref}...{head_sha}"],
            cwd=REPO_ROOT,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        # git diff fails when e.g. head_sha was never fetched (the workflow
        # silences the fetch with `|| true`). Surface it through the ScopeError
        # fail-closed handler in main() rather than as an uncaught traceback.
        raise ScopeError(
            f"git diff {base_ref}...{head_sha} failed: {exc.stderr.strip() or exc}"
        ) from exc
    return [line.strip() for line in out.stdout.splitlines() if line.strip()]


Expand Down
35 changes: 35 additions & 0 deletions tests/test_build_review_pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,41 @@ def test_review_pack_infra_timeout_is_capability_attributed_quarantine() -> None
assert attribution["label"] == "experiment-fidelity"


def test_r_outcome_only_demotion_clears_deterministic_reject() -> None:
    """A slot demoted from an R-OUTCOME-only reject must drop its reject flag.

    Regression (#810 greptile P1): an R-OUTCOME-ONLY reject is demoted to a
    healthy + quarantine slot, but the serialized grade must NOT keep
    deterministic_reject=True. agent_judge_summary serializes that field and
    codex reads it; a "healthy slot that still has a deterministic reject" is
    a contradiction that can spuriously push the codex reviewer to downgrade.
    """
    import rubric_checks as rc

    def _fake_grade(rollout):
        # Build a fresh grade on every call: R-OUTCOME is the only failing
        # gate, yet the grade is flagged as a deterministic reject.
        return {
            "deterministic_reject": True,
            "gates": [
                {"id": "R-OUTCOME", "status": "fail", "enforcement": "deterministic"},
                {"id": "R-REAL", "status": "pass", "enforcement": "deterministic"},
            ],
            "quarantines": [],
        }

    # Swap the grader by hand and always restore it, even if classification
    # raises, so other tests see the real implementation.
    saved_grader = rc.grade_rollout
    rc.grade_rollout = _fake_grade  # type: ignore[assignment]
    try:
        with tempfile.TemporaryDirectory() as workdir:
            rollout_dir = Path(workdir) / "r"
            rollout_dir.mkdir()
            (rollout_dir / "result.json").write_text("{}")
            cell = pack_mod.normalize_cell(_cell("weighted-gdp-calc", "openhands"))
            slot = pack_mod.Slot(cell=cell)
            slot.rollouts = [rollout_dir]
            pack_mod._classify_one(slot, None)
    finally:
        rc.grade_rollout = saved_grader

    grade = slot.grade
    assert slot.status == "healthy"
    assert grade["deterministic_reject"] is False  # the bug: was left True
    assert any("R-OUTCOME" in q for q in grade["quarantines"])


# ------------------------------------------------------------------
# Full review-pack/ layout on disk + the CLI verdict contract.
# ------------------------------------------------------------------
Expand Down
Loading