From fcc9ea5e14d44602b129014f7933592dd068bdc8 Mon Sep 17 00:00:00 2001 From: symphony-bot Date: Sat, 20 Jun 2026 19:32:10 +0000 Subject: [PATCH] fix(integration): clear residual greptile findings on the L3 gate Cross-checked the unaddressed greptile comments on the earlier integration PRs (#802-#810); most were already fixed or stale. Three genuine ones remained: - P1 (build_integration_review_pack): an R-OUTCOME-only deterministic reject is demoted to a healthy+quarantine slot, but grade["deterministic_reject"] was left True and serialized into agent_judge_summary.json. codex reads that and a "healthy slot with a deterministic reject" can spuriously push it to downgrade mergeable -> not mergeable. Clear the flag on demotion. + regression test. - codex_review deepseek-finding parser: replace the greedy `{.*}` DOTALL regex (which merges two JSON objects into one invalid blob) with raw_decode of the first complete object, tolerant of trailing prose. - integration_matrix: `git diff` runs with check=True but CalledProcessError bypassed the ScopeError fail-closed handler (uncaught traceback). Re-raise it as ScopeError so a failed/absent head_sha fails closed cleanly. --- .../scripts/build_integration_review_pack.py | 5 +++ .github/scripts/codex_review.py | 13 ++++--- .github/scripts/integration_matrix.py | 22 ++++++++---- tests/test_build_review_pack.py | 35 +++++++++++++++++++ 4 files changed, 63 insertions(+), 12 deletions(-) diff --git a/.github/scripts/build_integration_review_pack.py b/.github/scripts/build_integration_review_pack.py index 9762403b..13be6b56 100644 --- a/.github/scripts/build_integration_review_pack.py +++ b/.github/scripts/build_integration_review_pack.py @@ -373,6 +373,11 @@ def _classify_one(slot: Slot, expected_source_sha: str | None) -> None: slot.detail = f"deterministic reject: {non_outcome}" else: slot.status = "healthy" + # Demoted to healthy: clear the reject flag too, so the serialized + # agent_judge_summary does not tell codex this healthy slot still has a + # deterministic reject — a contradiction that can spuriously push the + # codex reviewer to downgrade the verdict. + slot.grade["deterministic_reject"] = False slot.grade["quarantines"] = [ *slot.grade.get("quarantines", []), "R-OUTCOME: rollout produced no valid scored outcome " diff --git a/.github/scripts/codex_review.py b/.github/scripts/codex_review.py index 5b021e74..26b22c27 100644 --- a/.github/scripts/codex_review.py +++ b/.github/scripts/codex_review.py @@ -184,14 +184,17 @@ async def one(rollout: Path) -> dict: "error": f"deepseek pass failed: {type(exc).__name__}: {exc}", } finding: dict = {"rollout": str(rollout), "raw": raw[:4000]} - match = re.search(r"\{.*\}", raw, re.DOTALL) - if match: + start = raw.find("{") + if start == -1: + finding["parse_error"] = "no JSON object in deepseek finding" + else: try: - finding["parsed"] = json.loads(match.group(0)) + # raw_decode parses the FIRST complete JSON object and ignores any + # trailing prose the model appends. A greedy `{.*}` span would + # instead merge multiple objects into one invalid blob. + finding["parsed"], _ = json.JSONDecoder().raw_decode(raw[start:]) except json.JSONDecodeError: finding["parse_error"] = "deepseek finding JSON was unparseable" - else: - finding["parse_error"] = "no JSON object in deepseek finding" return finding return await asyncio.gather(*(one(r) for r in rollout_dirs)) diff --git a/.github/scripts/integration_matrix.py b/.github/scripts/integration_matrix.py index 122953d9..3cb180e0 100644 --- a/.github/scripts/integration_matrix.py +++ b/.github/scripts/integration_matrix.py @@ -1111,13 +1111,21 @@ def build_plan( def _changed_files_from_git(base_ref: str, head_sha: str) -> list[str]: - out = subprocess.run( - ["git", "diff", "--name-only", f"{base_ref}...{head_sha}"], - cwd=REPO_ROOT, - capture_output=True, - text=True, - check=True, - ) + try: + out = subprocess.run( + ["git", "diff", "--name-only", f"{base_ref}...{head_sha}"], + cwd=REPO_ROOT, + capture_output=True, + text=True, + check=True, + ) + except subprocess.CalledProcessError as exc: + # git diff fails when e.g. head_sha was never fetched (the workflow + # silences the fetch with `|| true`). Surface it through the ScopeError + # fail-closed handler in main() rather than as an uncaught traceback. + raise ScopeError( + f"git diff {base_ref}...{head_sha} failed: {exc.stderr.strip() or exc}" + ) from exc return [line.strip() for line in out.stdout.splitlines() if line.strip()] diff --git a/tests/test_build_review_pack.py b/tests/test_build_review_pack.py index 06c63035..967e5b08 100644 --- a/tests/test_build_review_pack.py +++ b/tests/test_build_review_pack.py @@ -407,6 +407,41 @@ def test_review_pack_infra_timeout_is_capability_attributed_quarantine() -> None assert attribution["label"] == "experiment-fidelity" +def test_r_outcome_only_demotion_clears_deterministic_reject() -> None: + # Regression (#810 greptile P1): an R-OUTCOME-ONLY reject is demoted to a + # healthy + quarantine slot, but the serialized grade must NOT keep + # deterministic_reject=True. agent_judge_summary serializes that field and + # codex reads it; a "healthy slot that still has a deterministic reject" is a + # contradiction that can spuriously push the codex reviewer to downgrade. + import rubric_checks as rc + + original = rc.grade_rollout + rc.grade_rollout = lambda rollout: { # type: ignore[assignment] + "deterministic_reject": True, + "gates": [ + {"id": "R-OUTCOME", "status": "fail", "enforcement": "deterministic"}, + {"id": "R-REAL", "status": "pass", "enforcement": "deterministic"}, + ], + "quarantines": [], + } + try: + with tempfile.TemporaryDirectory() as tmp: + rollout = Path(tmp) / "r" + rollout.mkdir() + (rollout / "result.json").write_text("{}") + slot = pack_mod.Slot( + cell=pack_mod.normalize_cell(_cell("weighted-gdp-calc", "openhands")) + ) + slot.rollouts = [rollout] + pack_mod._classify_one(slot, None) + finally: + rc.grade_rollout = original + + assert slot.status == "healthy" + assert slot.grade["deterministic_reject"] is False # the bug: was left True + assert any("R-OUTCOME" in q for q in slot.grade["quarantines"]) + + # ------------------------------------------------------------------ # Full review-pack/ layout on disk + the CLI verdict contract. # ------------------------------------------------------------------