From efa7e3300ce24cea95b299e3e09c07e487449e91 Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Wed, 20 May 2026 12:34:31 +0200 Subject: [PATCH 1/9] Multi-turn evaluation support for SWE-Bench. --- evals/common/inference_worker.py | 167 -------- evals/common/multi_turn.py | 180 +++++++++ evals/common/multi_turn_worker.py | 152 +++++++ evals/common/score_aggregator.py | 121 ++++++ evals/common/verifier_set.py | 80 ++++ evals/swe_bench/README.md | 141 ++++--- evals/swe_bench/deploy/job-build-prompts.yaml | 2 +- evals/swe_bench/deploy/job-mirror-images.yaml | 2 +- .../deploy/raycluster-patch-gen.yaml | 4 +- .../deploy/raycluster-test-exec.yaml | 128 ------ evals/swe_bench/grader.py | 79 +--- evals/swe_bench/multi_turn_worker.py | 380 ++++++++++++++++++ evals/swe_bench/patch_worker.py | 106 ----- evals/swe_bench/run_patch_generation.py | 280 ++++++++++--- evals/swe_bench/run_test_execution.py | 324 --------------- evals/swe_bench/test_worker.py | 237 ----------- evals/swe_bench/verifiers/__init__.py | 0 .../swe_bench/verifiers/unit_test_verifier.py | 172 ++++++++ infra/deploy/buildconfig-ray-swe-bench.yaml | 39 ++ infra/images/Containerfile.swe-bench-eval | 7 +- pyproject.toml | 2 +- ...e_bench_phase1.sh => run_swe_bench_eval.sh | 78 +++- run_swe_bench_phase2.sh | 92 ----- verifiers/__init__.py | 0 verifiers/ast_check.py | 145 +++++-- verifiers/base.py | 84 +++- 26 files changed, 1678 insertions(+), 1324 deletions(-) delete mode 100644 evals/common/inference_worker.py create mode 100644 evals/common/multi_turn.py create mode 100644 evals/common/multi_turn_worker.py create mode 100644 evals/common/score_aggregator.py create mode 100644 evals/common/verifier_set.py delete mode 100644 evals/swe_bench/deploy/raycluster-test-exec.yaml create mode 100644 evals/swe_bench/multi_turn_worker.py delete mode 100644 evals/swe_bench/patch_worker.py delete mode 100644 evals/swe_bench/run_test_execution.py delete mode 100644 evals/swe_bench/test_worker.py create mode 100644 
evals/swe_bench/verifiers/__init__.py create mode 100644 evals/swe_bench/verifiers/unit_test_verifier.py create mode 100644 infra/deploy/buildconfig-ray-swe-bench.yaml rename run_swe_bench_phase1.sh => run_swe_bench_eval.sh (58%) delete mode 100755 run_swe_bench_phase2.sh create mode 100644 verifiers/__init__.py diff --git a/evals/common/inference_worker.py b/evals/common/inference_worker.py deleted file mode 100644 index 6aec1d6..0000000 --- a/evals/common/inference_worker.py +++ /dev/null @@ -1,167 +0,0 @@ -"""Generic Ray worker for LLM inference via vLLM. - -Provides a reusable base class for generating predictions from LLM prompts -across different benchmarks. Subclasses customize prompt formatting and -response extraction. -""" - -from __future__ import annotations - -import logging -from typing import Callable - -logger = logging.getLogger(__name__) - - -class InferenceWorker: - """Generates predictions via vLLM for arbitrary evaluation tasks. - - Each worker processes its assigned instances sequentially. - Multiple workers run in parallel across the Ray cluster, - each pointing at the same vLLM endpoint (or different - endpoints when scaling). - - Args: - vllm_urls: List of vLLM OpenAI-compatible base URLs. - Requests are round-robined across them. - model_name: Model name as registered in vLLM. - max_tokens: Maximum tokens for generation. - temperature: Sampling temperature. - system_message: Optional system message to prepend to prompts. 
- """ - - def __init__( - self, - vllm_urls: list[str], - model_name: str, - max_tokens: int = 4096, - temperature: float = 0.0, - system_message: str | None = None, - timeout: float = 600.0, - ): - import openai - - self.model_name = model_name - self.max_tokens = max_tokens - self.temperature = temperature - self.system_message = system_message - self.timeout = timeout - - if not vllm_urls: - raise ValueError("vllm_urls must contain at least one endpoint") - - self.clients = [ - openai.OpenAI(base_url=url, api_key="not-needed", timeout=timeout) - for url in vllm_urls - ] - self._call_count = 0 - - def _get_client(self): - """Round-robin across vLLM clients.""" - client = self.clients[self._call_count % len(self.clients)] - self._call_count += 1 - return client - - def _generate(self, prompt: str) -> str: - """Call vLLM to generate a response for one prompt. - - Args: - prompt: The prompt text. - - Returns: - Raw response from the model. - """ - client = self._get_client() - - messages = [] - if self.system_message: - messages.append({"role": "system", "content": self.system_message}) - messages.append({"role": "user", "content": prompt}) - - response = client.chat.completions.create( - model=self.model_name, - messages=messages, - max_tokens=self.max_tokens, - temperature=self.temperature, - timeout=self.timeout - ) - - return response.choices[0].message.content or "" - - def generate_batch( - self, - instances: list[dict], - prompts: dict[str, str], - extract_fn: Callable[[str], str] | None = None, - instance_id_key: str = "instance_id", - ) -> list[dict]: - """Generate predictions for a batch of instances. - - Args: - instances: List of dataset instances. - prompts: Map of instance_id -> prompt text. - extract_fn: Optional function to extract the prediction from raw response. - If None, uses the raw response as-is. - instance_id_key: Key to use for instance ID in the instance dict. 
- - Returns: - List of dicts with keys: instance_id, prediction, full_output, - model_name_or_path, error. - """ - results = [] - - for instance in instances: - instance_id = instance.get(instance_id_key) - if instance_id is None: - logger.error(f"Missing `{instance_id_key}` in instance: {instance}") - results.append({ - "instance_id": None, - "prediction": "", - "full_output": "", - "model_name_or_path": self.model_name, - "error": f"Missing `{instance_id_key}` in instance", - }) - continue - - prompt = prompts.get(instance_id) - - if prompt is None: - logger.error(f"No prompt found for {instance_id}") - results.append({ - "instance_id": instance_id, - "prediction": "", - "full_output": "", - "model_name_or_path": self.model_name, - "error": "No prompt found for instance", - }) - continue - - logger.info(f"Generating prediction for {instance_id}") - - try: - raw_response = self._generate(prompt) - - if extract_fn: - prediction = extract_fn(raw_response) - else: - prediction = raw_response - - results.append({ - "instance_id": instance_id, - "prediction": prediction, - "full_output": raw_response, - "model_name_or_path": self.model_name, - "error": None, - }) - - except Exception as e: - logger.error(f"Error generating prediction for {instance_id}: {e}") - results.append({ - "instance_id": instance_id, - "prediction": "", - "full_output": "", - "model_name_or_path": self.model_name, - "error": str(e), - }) - - return results diff --git a/evals/common/multi_turn.py b/evals/common/multi_turn.py new file mode 100644 index 0000000..e2fa424 --- /dev/null +++ b/evals/common/multi_turn.py @@ -0,0 +1,180 @@ +""" +Multi-turn evaluation session. + +Manages the iterative generate → evaluate → feedback loop for one instance. +All benchmark-specific logic (inference, evaluation dispatch, output extraction) +is injected via callables so this class remains benchmark-agnostic. + +Loop flow per instance: + 1. Call generate_fn(messages) → raw model response + 2. 
Call extract_fn(response) → structured output string + 3. Call run_intermediate_fn(output) → list[VerifierResult] + 4. Compute aggregate score; check early-exit condition + 5. If not exiting: collect feedback from each verifier and append + user message; go to 1 + After max_turns (or early exit): + 6. Call run_final_fn(last_output) → list[VerifierResult] + 7. Return MultiTurnResult +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import Callable + +from evals.common.score_aggregator import ScoreAggregator +from evals.common.verifier_set import VerifierSet +from verifiers.base import VerifierResult + +logger = logging.getLogger(__name__) + + +@dataclass +class TurnResult: + """Recorded state for a single generation turn.""" + turn: int # 0-based turn index + output: str # extracted output from model response + verifier_results: list[VerifierResult] # intermediate verifier results + aggregate_score: float # aggregated intermediate score + feedback: str # feedback message sent back to model + # (empty string on last turn / early exit) + + +@dataclass +class MultiTurnResult: + """Full result for one multi-turn evaluation instance.""" + final_output: str # output from the last generation turn + final_verifier_results: list[VerifierResult] + final_aggregate_score: float + turns: list[TurnResult] # per-turn intermediate records + stopped_early: bool # True if intermediate eval passed before max_turns + + @property + def num_turns(self) -> int: + return len(self.turns) + + +class MultiTurnSession: + """Runs the multi-turn generate → evaluate → feedback loop for one instance. + + Args: + generate_fn: Callable(messages: list[dict]) -> str. + Calls the model with the current conversation and returns the raw response. + extract_fn: Callable(response: str) -> str. + Extracts the structured output (patch, solution, etc.) from the raw response. + run_intermediate_fn: Callable(output: str) -> list[VerifierResult]. 
+ Runs the intermediate verifier set. May be a no-op if the set is empty. + run_final_fn: Callable(output: str) -> list[VerifierResult]. + Runs the final verifier set on the last output. + aggregator: Combines verifier results into a scalar score. + intermediate_verifier_set: Used for early-exit checking and feedback generation. + Each verifier in the set is responsible for formatting its own feedback. + max_turns: Hard cap on generation attempts. + """ + + def __init__( + self, + generate_fn: Callable[[list[dict]], str], + extract_fn: Callable[[str], str], + run_intermediate_fn: Callable[[str], list[VerifierResult]], + run_final_fn: Callable[[str], list[VerifierResult]], + aggregator: ScoreAggregator, + intermediate_verifier_set: VerifierSet, + max_turns: int = 1, + ): + self.generate_fn = generate_fn + self.extract_fn = extract_fn + self.run_intermediate_fn = run_intermediate_fn + self.run_final_fn = run_final_fn + self.aggregator = aggregator + self.intermediate_verifier_set = intermediate_verifier_set + self.max_turns = max_turns + + def _build_feedback(self, results: list[VerifierResult]) -> str: + """Collect per-verifier feedback and wrap in a single message.""" + lines = ["Your solution was evaluated. Here is the feedback:\n"] + for entry in self.intermediate_verifier_set.entries: + result = next((r for r in results if r.name == entry.verifier.name), None) + if result is not None: + lines.append(entry.verifier.format_feedback(result)) + if all(r.passed for r in results): + lines.append("\nAll checks passed.") + else: + lines.append("\nPlease revise your solution to address the issues above.") + return "\n".join(lines) + + def run(self, initial_messages: list[dict]) -> MultiTurnResult: + """Execute the full multi-turn loop for one instance. + + Args: + initial_messages: Starting conversation (system + first user message). + + Returns: + MultiTurnResult with the last output, final eval results, and turn history. 
+ """ + messages = list(initial_messages) + turns: list[TurnResult] = [] + output = "" + stopped_early = False + + for turn_idx in range(self.max_turns): + logger.debug(f"Turn {turn_idx + 1}/{self.max_turns}") + + # Generate + response = self.generate_fn(messages) + output = self.extract_fn(response) + + # Append assistant turn to conversation + messages.append({"role": "assistant", "content": response}) + + # Intermediate evaluation (skipped when no intermediate verifiers are configured) + intermediate_results: list[VerifierResult] = [] + aggregate_score = 1.0 + feedback = "" + + if self.intermediate_verifier_set: + intermediate_results = self.run_intermediate_fn(output) + aggregate_score = self.aggregator.aggregate(intermediate_results) + + # Early exit: all intermediate verifiers passed + if self.intermediate_verifier_set.all_passed(intermediate_results): + logger.debug( + f"Turn {turn_idx + 1}: all intermediate checks passed " + f"(score={aggregate_score:.3f}), stopping early" + ) + turns.append(TurnResult( + turn=turn_idx, + output=output, + verifier_results=intermediate_results, + aggregate_score=aggregate_score, + feedback="", + )) + stopped_early = True + break + + # Not the last turn: generate feedback and continue + if turn_idx < self.max_turns - 1: + feedback = self._build_feedback(intermediate_results) + messages.append({"role": "user", "content": feedback}) + + turns.append(TurnResult( + turn=turn_idx, + output=output, + verifier_results=intermediate_results, + aggregate_score=aggregate_score, + feedback=feedback, + )) + + # Final evaluation on the last output + logger.debug("Running final evaluation") + final_results = self.run_final_fn(output) + final_score = self.aggregator.aggregate(final_results) + + return MultiTurnResult( + final_output=output, + final_verifier_results=final_results, + final_aggregate_score=final_score, + turns=turns, + stopped_early=stopped_early, + ) diff --git a/evals/common/multi_turn_worker.py b/evals/common/multi_turn_worker.py new file mode 100644 
index 0000000..82e7f93 --- /dev/null +++ b/evals/common/multi_turn_worker.py @@ -0,0 +1,152 @@ +"""Generic multi-turn worker base class. + +Provides the benchmark-agnostic pieces of a Ray-based multi-turn evaluation worker: +- vLLM client management with round-robin across endpoints +- Naive (inline vLLM) generation +- Static and dynamic verifier execution (inline async vs thread pool) +- Batch evaluation entry point + +Subclasses provide the benchmark-specific parts: +- _generate_turn(messages, instance, run_id) -> str +- _evaluate_instance(instance, prompts, run_id) -> dict +""" + +from __future__ import annotations + +import asyncio +import logging +from concurrent.futures import ThreadPoolExecutor, as_completed + +from evals.common.score_aggregator import MeanAggregator, ScoreAggregator +from evals.common.verifier_set import VerifierSet +from verifiers.base import VerifierResult, VerifierStatus + +logger = logging.getLogger(__name__) + + +class MultiTurnWorkerBase: + """Base class for multi-turn evaluation Ray workers. + + Args: + vllm_urls: vLLM OpenAI-compatible base URLs (round-robined). + model_name: Model name as registered in vLLM. + max_tokens: Max tokens per vLLM call. + temperature: Sampling temperature. + verifier_set: Full verifier set; intermediate/final subsets are derived from flags. + aggregator: Aggregates per-verifier scores into a scalar. + max_turns: Hard cap on generation attempts per instance. + max_concurrent_jobs: Max concurrent threads for dynamic verifiers. 
+ """ + + def __init__( + self, + vllm_urls: list[str], + model_name: str, + max_tokens: int = 16000, + temperature: float = 0.15, + verifier_set: VerifierSet | None = None, + aggregator: ScoreAggregator | None = None, + max_turns: int = 1, + max_concurrent_jobs: int = 4, + ): + import openai + + if not vllm_urls: + raise ValueError("vllm_urls must contain at least one endpoint") + + self.model_name = model_name + self.max_tokens = max_tokens + self.temperature = temperature + self.max_turns = max_turns + self.max_concurrent_jobs = max_concurrent_jobs + + self.clients = [ + openai.OpenAI(base_url=url, api_key="not-needed", timeout=600.0) + for url in vllm_urls + ] + self._call_count = 0 + + self.verifier_set = verifier_set or VerifierSet() + self.intermediate_set = self.verifier_set.intermediate_subset() + self.final_set = self.verifier_set.final_subset() + self.aggregator = aggregator or MeanAggregator() + + # ── vLLM (naive generation) ───────────────────────────────────────────── + + def _get_client(self): + client = self.clients[self._call_count % len(self.clients)] + self._call_count += 1 + return client + + def _generate_naive(self, messages: list[dict]) -> str: + """Generate a response via inline vLLM inference.""" + client = self._get_client() + response = client.chat.completions.create( + model=self.model_name, + messages=messages, + max_tokens=self.max_tokens, + temperature=self.temperature, + ) + return response.choices[0].message.content or "" + + # ── Verifier execution ────────────────────────────────────────────────── + + def _run_verifier_set_inline(self, vset: VerifierSet, ctx) -> list[VerifierResult]: + """Run all verifiers in vset sequentially, in entry order (async, in-process).""" + async def _run_all(): + return [await entry.verifier.safe_verify(ctx) for entry in vset.entries] + return asyncio.run(_run_all()) + + def _run_mixed_verifier_set(self, vset: VerifierSet, ctx) -> list[VerifierResult]: + """Run static verifiers inline and dynamic verifiers in a 
thread pool.""" + static_entries = [e for e in vset.entries if e.verifier.execution_mode == "static"] + dynamic_entries = [e for e in vset.entries if e.verifier.execution_mode == "dynamic"] + + results: dict[str, VerifierResult] = {} + + if static_entries: + for r in self._run_verifier_set_inline(VerifierSet(static_entries), ctx): + results[r.name] = r + + if dynamic_entries: + with ThreadPoolExecutor( + max_workers=min(self.max_concurrent_jobs, len(dynamic_entries)) + ) as pool: + futures = { + pool.submit(asyncio.run, entry.verifier.safe_verify(ctx)): entry + for entry in dynamic_entries + } + for future in as_completed(futures): + entry = futures[future] + try: + r = future.result() + except Exception as e: + r = VerifierResult( + name=entry.verifier.name, + status=VerifierStatus.ERROR, + score=0.0, + pass_threshold=entry.verifier.pass_threshold, + details={"error": str(e)}, + ) + results[r.name] = r + + return [results[e.verifier.name] for e in vset.entries if e.verifier.name in results] + + def _run_verifier_set(self, vset: VerifierSet, ctx) -> list[VerifierResult]: + """Run a verifier set against a pre-built context.""" + if not vset: + return [] + if vset.is_dynamic: + return self._run_mixed_verifier_set(vset, ctx) + return self._run_verifier_set_inline(vset, ctx) + + # ── Batch entry point ─────────────────────────────────────────────────── + + def evaluate_batch( + self, + instances: list[dict], + prompts: dict[str, str], + run_id: str, + ) -> list[dict]: + """Evaluate a batch of instances sequentially.""" + return [self._evaluate_instance(inst, prompts, run_id) for inst in instances] diff --git a/evals/common/score_aggregator.py b/evals/common/score_aggregator.py new file mode 100644 index 0000000..f11361b --- /dev/null +++ b/evals/common/score_aggregator.py @@ -0,0 +1,121 @@ +""" +Score aggregation strategies for multi-verifier evaluation. + +Each aggregator combines a list of (verifier_name, score) pairs into a single +aggregate score in [0.0, 1.0]. 
Verifiers that did not run successfully +(ERROR, TIMEOUT, SKIPPED) contribute a score of 0.0 unless excluded. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any + +from verifiers.base import VerifierResult, VerifierStatus + + +class ScoreAggregator(ABC): + """Abstract base for score aggregation strategies.""" + + @abstractmethod + def aggregate(self, results: list[VerifierResult]) -> float: + """ + Combine verifier results into a single aggregate score. + + Args: + results: List of VerifierResult from one evaluation pass. + + Returns: + Aggregate score in [0.0, 1.0]. + """ + ... + + @staticmethod + def _effective_score(result: VerifierResult) -> float: + """Return the result's score, or 0.0 for non-OK statuses.""" + if result.status != VerifierStatus.OK: + return 0.0 + return result.score + + +class MeanAggregator(ScoreAggregator): + """Simple mean across all verifier scores (equal weights). + + Non-OK results contribute 0.0. + Returns 1.0 if the result list is empty. + """ + + def aggregate(self, results: list[VerifierResult]) -> float: + if not results: + return 1.0 + return sum(self._effective_score(r) for r in results) / len(results) + + +class MinAggregator(ScoreAggregator): + """Minimum score across all verifiers (any failure dominates). + + Useful when every check must pass independently. + Non-OK results contribute 0.0. + Returns 1.0 if the result list is empty. + """ + + def aggregate(self, results: list[VerifierResult]) -> float: + if not results: + return 1.0 + return min(self._effective_score(r) for r in results) + + +class WeightedSumAggregator(ScoreAggregator): + """Weighted sum of verifier scores, normalized to [0.0, 1.0]. + + Weights are keyed by verifier name. Any verifier not in the weights + dict defaults to weight 1.0. Non-OK results contribute 0.0. + + Args: + weights: Map of verifier name to non-negative weight. 
+ + Example config:: + + WeightedSumAggregator(weights={"ast_check": 0.2, "swe_test": 0.8}) + """ + + def __init__(self, weights: dict[str, float] | None = None): + self.weights = weights or {} + + def aggregate(self, results: list[VerifierResult]) -> float: + if not results: + return 1.0 + + total_weight = 0.0 + weighted_sum = 0.0 + + for result in results: + w = self.weights.get(result.name, 1.0) + if w < 0: + raise ValueError( + f"Weight for '{result.name}' must be non-negative, got {w}" + ) + weighted_sum += self._effective_score(result) * w + total_weight += w + + if total_weight == 0.0: + return 1.0 + + return weighted_sum / total_weight + + +def build_aggregator(name: str, config: dict[str, Any] | None = None) -> ScoreAggregator: + """Construct a named aggregator from a config dict. + + Supported names: "mean", "min", "weighted_sum". + For "weighted_sum": config may contain {"weights": {"verifier_name": float, ...}} + """ + cfg = config or {} + if name == "mean": + return MeanAggregator() + if name == "min": + return MinAggregator() + if name == "weighted_sum": + return WeightedSumAggregator(weights=cfg.get("weights")) + raise ValueError(f"Unknown aggregator '{name}'. Choose from: mean, min, weighted_sum") + diff --git a/evals/common/verifier_set.py b/evals/common/verifier_set.py new file mode 100644 index 0000000..8c13d44 --- /dev/null +++ b/evals/common/verifier_set.py @@ -0,0 +1,80 @@ +""" +VerifierSet: a configured collection of verifiers with scheduling metadata. + +Each verifier in the set carries two flags: + - run_intermediate: include this verifier after each generation turn + - run_final: include this verifier in the final evaluation pass + +The set dispatches inline (static) vs. via an isolated environment (dynamic) +based on whether any member verifier has execution_mode == "dynamic". 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +from verifiers.base import BaseVerifier, VerifierResult + + +@dataclass +class VerifierEntry: + """A verifier paired with its scheduling configuration.""" + verifier: BaseVerifier + run_intermediate: bool = True + run_final: bool = True + + +class VerifierSet: + """An ordered collection of verifiers with intermediate/final scheduling. + + Args: + entries: List of VerifierEntry objects. + """ + + def __init__(self, entries: list[VerifierEntry] | None = None): + self.entries: list[VerifierEntry] = entries or [] + + def add( + self, + verifier: BaseVerifier, + run_intermediate: bool = True, + run_final: bool = True, + ) -> "VerifierSet": + """Add a verifier with scheduling flags. Returns self for chaining.""" + self.entries.append(VerifierEntry(verifier, run_intermediate, run_final)) + return self + + def intermediate_subset(self) -> "VerifierSet": + """Return a new VerifierSet containing only intermediate-scheduled verifiers.""" + return VerifierSet([e for e in self.entries if e.run_intermediate]) + + def final_subset(self) -> "VerifierSet": + """Return a new VerifierSet containing only final-scheduled verifiers.""" + return VerifierSet([e for e in self.entries if e.run_final]) + + @property + def verifiers(self) -> list[BaseVerifier]: + return [e.verifier for e in self.entries] + + @property + def is_dynamic(self) -> bool: + """True if any verifier in this set requires isolated execution.""" + return any(e.verifier.execution_mode == "dynamic" for e in self.entries) + + def all_passed(self, results: list[VerifierResult]) -> bool: + """True if every result in the list passes its threshold. + + Used for early-exit decisions: if all intermediate verifiers pass, + there is no need to generate another turn. 
+ """ + return all(r.passed for r in results) + + def __bool__(self) -> bool: + return bool(self.entries) + + def __len__(self) -> int: + return len(self.entries) + + def __repr__(self) -> str: + names = [e.verifier.name for e in self.entries] + return f"VerifierSet({names})" diff --git a/evals/swe_bench/README.md b/evals/swe_bench/README.md index 0e66048..8685607 100644 --- a/evals/swe_bench/README.md +++ b/evals/swe_bench/README.md @@ -4,19 +4,23 @@ Evaluate models on [SWE-bench](https://swe-bench.github.io/) using Ray for orche ## Overview -The pipeline has three steps: +The pipeline has two steps: -1. **Build prompts** (one-time) -- Clone repos and construct prompts using swebench's official pipeline. Upload to S3/MinIO. -2. **Phase 1: Patch generation** -- Ray workers call vLLM to generate patches from the pre-built prompts. -3. **Phase 2: Test execution** -- Ray workers create K8s Jobs using SWE-bench's pre-built container images, collect test output, and grade results. +1. **Build prompts** (one-time) — Clone repos and construct prompts using swebench's official pipeline. Upload to S3/MinIO. +2. **Evaluate** — A single unified Ray job handles patch generation and test execution. Two strategies are supported, and both work in single-shot or multi-turn mode: + - `naive`: vLLM inference from pre-built prompts, followed by K8s test execution + - `agent`: agentic loop (mini-swe-agent or custom) running inside K8s Jobs + +**Multi-turn** (`MAX_TURNS > 1`) is orthogonal to strategy. With either strategy the model receives feedback from configurable intermediate verifiers after each attempt and can revise its patch before the final K8s test evaluation. 
## Prerequisites - OpenShift / Kubernetes cluster with KubeRay operator - MinIO deployed with a `swe-bench` bucket (`oc apply -f infra/deploy/minio.yaml`) - vLLM model server deployed (`oc apply -f inference/deploy/vllm-server-deployment.yaml`) +- RBAC for K8s Job creation from Ray worker pods (`oc apply -f evals/swe_bench/deploy/rbac.yaml`) -## Step 0: Build Prompted Dataset (one-time) +## Step 0: Build Prompted Dataset (one-time, naive strategy only) Builds prompts using swebench's style-3 format with oracle file selection. Only needs to run once per dataset. @@ -25,32 +29,28 @@ oc apply -f evals/swe_bench/deploy/job-build-prompts.yaml oc logs -f job/build-swe-bench-prompts ``` -The job uploads the prompted dataset to `s3://swe-bench/swe-bench-verified/prompts/style-3-oracle.jsonl`. +Outputs to: `s3://swe-bench/swe-bench-verified/prompts/style-3-oracle.jsonl` -## Step 1: Generate Patches (Phase 1) +## Step 1: Run Evaluation -Deploy the Ray cluster and run inference: +Deploy the Ray cluster, apply RBAC, and run: ```bash +oc apply -f evals/swe_bench/deploy/rbac.yaml oc apply -f evals/swe_bench/deploy/raycluster-patch-gen.yaml oc port-forward svc/swe-bench-patch-gen-head-svc 8265:8265 +# Naive strategy, single-shot (default) +MODEL_NAME="Qwen/Qwen3-1.7B" RUN_ID=my-run bash run_swe_bench_eval.sh -MODEL_NAME="" RUN_ID= bash run_swe_bench_phase1.sh -``` - -Predictions are uploaded to `s3://swe-bench/runs/{RUN_ID}/predictions.jsonl`. 
+# Naive strategy, multi-turn (3 attempts, AST check between turns) +MAX_TURNS=3 INTERMEDIATE_VERIFIERS="ast_check" RUN_ID=my-run bash run_swe_bench_eval.sh -## Step 2: Run Tests (Phase 2) - -Deploy the RBAC, Ray cluster, and run test execution: - -```bash -oc apply -f evals/swe_bench/deploy/rbac.yaml -oc apply -f evals/swe_bench/deploy/raycluster-test-exec.yaml -oc port-forward svc/swe-bench-test-exec-head-svc 8265:8265 +# Agent strategy, single-shot +STRATEGY=agent RUN_ID=my-run bash run_swe_bench_eval.sh -RUN_ID= bash run_swe_bench_phase2.sh +# Agent strategy, multi-turn +STRATEGY=agent MAX_TURNS=3 INTERMEDIATE_VERIFIERS="ast_check" RUN_ID=my-run bash run_swe_bench_eval.sh ``` Results are uploaded to `s3://swe-bench/runs/{RUN_ID}/results.json`. @@ -60,63 +60,88 @@ Results are uploaded to `s3://swe-bench/runs/{RUN_ID}/results.json`. Run on a small subset to validate the setup: ```bash -# Phase 1: generate 16 patches -INSTANCE_LIMIT=16 RUN_ID=test-16 bash run_swe_bench_phase1.sh - -# Phase 2: run tests on those 16 -INSTANCE_LIMIT=16 RUN_ID=test-16 bash run_swe_bench_phase2.sh -``` - -## Verify Harness with Gold Patches - -Use the ground-truth patches from the dataset to confirm the eval harness works correctly (skips Phase 1): - -```bash -PREDICTIONS=gold INSTANCE_LIMIT=16 RUN_ID=gold-test bash run_swe_bench_phase2.sh +INSTANCE_LIMIT=16 RUN_ID=test-16 bash run_swe_bench_eval.sh ``` -If gold patches resolve, the harness is working. If they don't, there's a setup issue. 
- ## Configuration -### Phase 1 (`run_swe_bench_phase1.sh`) +### `run_swe_bench_eval.sh` — common variables | Variable | Default | Description | |---|---|---| +| `STRATEGY` | `naive` | `naive` \| `agent` | | `VLLM_URL` | `http://vllm-server:8000/v1` | vLLM endpoint | | `MODEL_NAME` | `Qwen/Qwen3-1.7B` | Model name in vLLM | | `DATASET` | `SWE-bench/SWE-bench_Verified` | HuggingFace dataset | -| `PROMPTS` | `s3://swe-bench/swe-bench-verified/prompts/style-3-oracle.jsonl` | Prompted dataset | | `NUM_WORKERS` | `2` | Ray workers | -| `MAX_TOKENS` | `4096` | Max tokens for generation | -| `TEMPERATURE` | `0.0` | Sampling temperature | | `INSTANCE_LIMIT` | `0` (all) | Limit instances for testing | | `RUN_ID` | `eval-run` | Unique run identifier | +| `MAX_CONCURRENT_JOBS` | `4` | K8s Jobs per Ray worker | +| `JOB_TIMEOUT` | `1800` | Per-instance K8s Job timeout (seconds) | -### Phase 2 (`run_swe_bench_phase2.sh`) +### Multi-turn variables (apply to both strategies) | Variable | Default | Description | |---|---|---| -| `PREDICTIONS` | `s3://.../{RUN_ID}/predictions.jsonl` | Predictions from Phase 1, or `gold` | -| `NUM_WORKERS` | `4` | Ray workers | -| `MAX_CONCURRENT_JOBS` | `4` | K8s Jobs per worker | -| `TIMEOUT` | `1800` | Per-instance timeout (seconds) | -| `INSTANCE_LIMIT` | `0` (all) | Limit instances for testing | -| `RUN_ID` | `eval-run` | Must match Phase 1 | +| `MAX_TURNS` | `1` | Max generation attempts (1 = single-shot) | +| `INTERMEDIATE_VERIFIERS` | _(none)_ | Space-separated verifiers to run between turns (e.g. 
`ast_check`) | +| `AGGREGATOR` | `mean` | Score aggregation: `mean` \| `min` \| `weighted_sum` | -## Architecture +### Naive strategy variables -```markdown -Step 0: Build Prompts (one-time K8s Job) - clone repos -> read source files -> build style-3 prompts -> upload to S3 +| Variable | Default | Description | +|---|---|---| +| `PROMPTS` | `s3://swe-bench/verified/prompts/style-3-oracle.jsonl` | Prompted dataset | +| `MAX_TOKENS` | `16000` | Max tokens per generation call | +| `TEMPERATURE` | `0.15` | Sampling temperature | -Phase 1: Patch Generation (Ray cluster) - download prompts from S3 -> distribute to Ray workers -> call vLLM -> upload predictions to S3 +### Agent strategy variables + +| Variable | Default | Description | +|---|---|---| +| `AGENT_CONFIG` | `evals/swe_bench/agents/mini_swe_agent.yaml` | Agent config YAML | +| `STEP_LIMIT` | `150` | Max agent steps per instance | +| `COST_LIMIT` | `3.0` | Max cost in dollars per instance | + +## Architecture -Phase 2: Test Execution (Ray cluster) - download predictions from S3 -> distribute to Ray workers -> - each worker creates K8s Jobs using SWE-bench pre-built images -> - collect pod logs -> grade with swebench -> upload results to S3 ``` +Step 0: Build Prompts (one-time K8s Job, naive strategy only) + clone repos → read source files → build style-3 prompts → upload to S3 + +Evaluation (single Ray cluster): + + naive, single-shot (MAX_TURNS=1): + download prompts → distribute to workers → call vLLM → + create K8s test Jobs → collect pod logs → grade → upload results + + naive, multi-turn (MAX_TURNS>1): + for each turn (up to MAX_TURNS): + call vLLM → extract patch → + run intermediate verifiers: + static (e.g. ast_check) → inline in worker process + dynamic (e.g. 
swe_test) → K8s Job + if all pass → stop early + otherwise → format feedback → append to conversation + run final K8s test Job → grade → upload results + + agent, single-shot (MAX_TURNS=1): + distribute to workers → each worker creates K8s Jobs running + the agent loop → collect results → grade → upload results + + agent, multi-turn (MAX_TURNS>1): + for each turn (up to MAX_TURNS): + run agent K8s Job (no in-container eval) → extract patch → + run intermediate verifiers → format feedback → + inject prior patches + feedback as context for next turn + run final K8s test Job → grade → upload results +``` + +No nested containers — SWE-bench test images run as native K8s Jobs. + +## Multi-turn Design Notes -No nested containers -- SWE-bench test images run as native K8s Jobs. +- **Final patch**: always the last generated attempt (not the best-scoring one). Per-turn scores are recorded in `results.json` for analysis. +- **Early exit**: if all intermediate verifiers pass their `pass_threshold`, the loop stops and proceeds directly to final evaluation. +- **Static vs. dynamic verifiers**: static verifiers (e.g. `ast_check`) work from the patch diff alone and run inline with no K8s overhead. Dynamic verifiers (e.g. `swe_test`) require a K8s Job. Configuring only static verifiers as intermediate checks keeps turn latency low. +- **`MAX_TURNS=1`** is single-shot behavior — no intermediate evaluation, one generation followed by the final K8s test. 
diff --git a/evals/swe_bench/deploy/job-build-prompts.yaml b/evals/swe_bench/deploy/job-build-prompts.yaml index 3ebeb28..e5c2020 100644 --- a/evals/swe_bench/deploy/job-build-prompts.yaml +++ b/evals/swe_bench/deploy/job-build-prompts.yaml @@ -31,7 +31,7 @@ spec: runAsNonRoot: true containers: - name: build-prompts - image: quay.io/michaelclifford/ray-swe-bench:latest + image: image-registry.openshift-image-registry.svc:5000/code-agent/ray-swe-bench:latest imagePullPolicy: Always securityContext: allowPrivilegeEscalation: false diff --git a/evals/swe_bench/deploy/job-mirror-images.yaml b/evals/swe_bench/deploy/job-mirror-images.yaml index 78ce6a1..6877b62 100644 --- a/evals/swe_bench/deploy/job-mirror-images.yaml +++ b/evals/swe_bench/deploy/job-mirror-images.yaml @@ -55,7 +55,7 @@ spec: initContainers: # Step 1: Generate the deduplicated image list - name: generate-list - image: quay.io/michaelclifford/ray-swe-bench@sha256:d35839b2c3bf6ac0d1ae1d8cddf1ea134706b951ff5e5a87f527b760e2e3341e + image: image-registry.openshift-image-registry.svc:5000/code-agent/ray-swe-bench:latest command: - python - -m diff --git a/evals/swe_bench/deploy/raycluster-patch-gen.yaml b/evals/swe_bench/deploy/raycluster-patch-gen.yaml index 80c7ba8..ca7b146 100644 --- a/evals/swe_bench/deploy/raycluster-patch-gen.yaml +++ b/evals/swe_bench/deploy/raycluster-patch-gen.yaml @@ -49,7 +49,7 @@ spec: runAsNonRoot: true containers: - name: ray-head - image: quay.io/michaelclifford/ray-swe-bench:latest + image: image-registry.openshift-image-registry.svc:5000/code-agent/ray-swe-bench:latest imagePullPolicy: Always securityContext: allowPrivilegeEscalation: false @@ -108,7 +108,7 @@ spec: runAsNonRoot: true containers: - name: ray-worker - image: quay.io/michaelclifford/ray-swe-bench:latest + image: image-registry.openshift-image-registry.svc:5000/code-agent/ray-swe-bench:latest imagePullPolicy: Always securityContext: allowPrivilegeEscalation: false diff --git 
a/evals/swe_bench/deploy/raycluster-test-exec.yaml b/evals/swe_bench/deploy/raycluster-test-exec.yaml deleted file mode 100644 index 51568ca..0000000 --- a/evals/swe_bench/deploy/raycluster-test-exec.yaml +++ /dev/null @@ -1,128 +0,0 @@ ---- -# ConfigMap required by the ODH KubeRay operator's kube-rbac-proxy sidecar. -# See agent-notes/odh-kuberay-rbac-proxy-configmap-bug.md -apiVersion: v1 -kind: ConfigMap -metadata: - name: kube-rbac-proxy-config-swe-bench-test-exec -data: - config-file.yaml: |+ - authorization: - resourceAttributes: - apiVersion: v1 - resource: services - subresource: proxy - name: swe-bench-test-exec ---- -# Phase 2 RayCluster: Test execution via K8s Jobs. -# -# Workers are thin orchestrators -- they create K8s Jobs using pre-built -# SWE-bench container images, watch for completion, and collect logs. -# Each worker manages a concurrency window of M K8s Jobs in parallel. -# The actual test execution happens in the K8s Job pods, not here. -# -# With 4 workers x 4 concurrent jobs = 16 parallel test executions. 
-# -# Prerequisites: -# - oc apply -f evals/swe_bench/deploy/rbac.yaml -# - predictions.jsonl from Phase 1 (in S3/MinIO) -# - MinIO credentials secret (same as Phase 1) -# - Port-forward: oc port-forward svc/swe-bench-test-exec-head-svc 8265:8265 -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: swe-bench-test-exec -spec: - rayVersion: "2.53.0" - headGroupSpec: - rayStartParams: - dashboard-host: "0.0.0.0" - num-cpus: "0" - template: - metadata: - labels: - mlflow-client: "true" - spec: - serviceAccountName: swe-bench-eval - securityContext: - runAsNonRoot: true - containers: - - name: ray-head - image: quay.io/michaelclifford/ray-swe-bench:latest - imagePullPolicy: Always - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - requests: - cpu: "4" - memory: "8Gi" - limits: - cpu: "4" - memory: "8Gi" - env: - - name: NVIDIA_VISIBLE_DEVICES - value: "void" - # MinIO/S3 credentials for reading predictions and writing results - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: minio-credentials - key: MINIO_ROOT_USER - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: minio-credentials - key: MINIO_ROOT_PASSWORD - - name: S3_ENDPOINT_URL - valueFrom: - secretKeyRef: - name: minio-credentials - key: MINIO_ENDPOINT_URL - volumeMounts: - - name: ray-tmp - mountPath: /tmp - volumes: - - name: ray-tmp - emptyDir: {} - workerGroupSpecs: - - groupName: test-exec - replicas: 4 - minReplicas: 1 - maxReplicas: 8 - rayStartParams: - num-cpus: "2" - template: - spec: - serviceAccountName: swe-bench-eval - securityContext: - runAsNonRoot: true - containers: - - name: ray-worker - image: quay.io/michaelclifford/ray-swe-bench:latest - imagePullPolicy: Always - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] - resources: - requests: - cpu: 
"2" - memory: "4Gi" - limits: - cpu: "2" - memory: "4Gi" - volumeMounts: - - name: ray-tmp - mountPath: /tmp - volumes: - - name: ray-tmp - emptyDir: {} diff --git a/evals/swe_bench/grader.py b/evals/swe_bench/grader.py index 681475c..cfba2a0 100644 --- a/evals/swe_bench/grader.py +++ b/evals/swe_bench/grader.py @@ -33,80 +33,19 @@ class InstanceResult: @dataclass class AggregateReport(BaseAggregateReport): - """Aggregate report across all evaluated SWE-bench instances. - - Inherits common aggregation structure from BaseAggregateReport. - Adds SWE-bench-specific field aliases for compatibility. - """ - - # Aliases for base class fields (for backward compatibility) - @property - def total_instances(self) -> int: - return self.total - - @total_instances.setter - def total_instances(self, value: int) -> None: - self.total = value - - @property - def resolved_instances(self) -> int: - return self.passed - - @resolved_instances.setter - def resolved_instances(self, value: int) -> None: - self.passed = value - - @property - def unresolved_instances(self) -> int: - return self.failed - - @unresolved_instances.setter - def unresolved_instances(self, value: int) -> None: - self.failed = value - - @property - def error_instances(self) -> int: - return self.errors - - @error_instances.setter - def error_instances(self, value: int) -> None: - self.errors = value - - @property - def empty_patch_instances(self) -> int: - return self.empty - - @empty_patch_instances.setter - def empty_patch_instances(self, value: int) -> None: - self.empty = value - - @property - def resolve_rate(self) -> float: - return self.pass_rate - - @resolve_rate.setter - def resolve_rate(self, value: float) -> None: - self.pass_rate = value - - @property - def resolved_ids(self) -> list[str]: - return self.passed_ids - - @property - def unresolved_ids(self) -> list[str]: - return self.failed_ids + """Aggregate report across all evaluated SWE-bench instances.""" def to_dict(self) -> dict[str, Any]: 
"""Convert to dict with SWE-bench field names.""" return { - "total_instances": self.total_instances, - "resolved_instances": self.resolved_instances, - "unresolved_instances": self.unresolved_instances, - "error_instances": self.error_instances, - "empty_patch_instances": self.empty_patch_instances, - "resolve_rate": self.resolve_rate, - "resolved_ids": self.resolved_ids, - "unresolved_ids": self.unresolved_ids, + "total_instances": self.total, + "resolved_instances": self.passed, + "unresolved_instances": self.failed, + "error_instances": self.errors, + "empty_patch_instances": self.empty, + "resolve_rate": self.pass_rate, + "resolved_ids": self.passed_ids, + "unresolved_ids": self.failed_ids, "error_ids": self.error_ids, } diff --git a/evals/swe_bench/multi_turn_worker.py b/evals/swe_bench/multi_turn_worker.py new file mode 100644 index 0000000..eb9f0a6 --- /dev/null +++ b/evals/swe_bench/multi_turn_worker.py @@ -0,0 +1,380 @@ +""" +SWE-bench multi-turn Ray worker. + +Subclasses MultiTurnWorkerBase with SWE-bench-specific generation and evaluation: + - naive — inline vLLM inference per turn + - agent — K8s agent Job per turn (run_eval=False; grading via SWEBenchUnitTestVerifier) + Prior patches and feedback are injected into the problem statement + for subsequent agent turns. + +The base class (MultiTurnWorkerBase) handles vLLM client management, verifier +execution, and the batch entry point. This class wires them to SWE-bench. 
+""" + +from __future__ import annotations + +import logging +from typing import Any, Literal + +import ray + +from evals.common.multi_turn import MultiTurnResult, MultiTurnSession +from evals.common.multi_turn_worker import MultiTurnWorkerBase +from evals.common.score_aggregator import ScoreAggregator +from evals.common.verifier_set import VerifierSet +from evals.swe_bench.instance_runner import InstanceRunner +from evals.swe_bench.prompt import extract_diff_from_response +from verifiers.base import PatchContext, VerifierResult + +logger = logging.getLogger(__name__) + + +@ray.remote(num_cpus=1) +class SWEBenchMultiTurnWorker(MultiTurnWorkerBase): + """SWE-bench multi-turn evaluation worker. + + Extends MultiTurnWorkerBase with: + - SWE-bench verifier context construction (_make_context) + - naive strategy: inline vLLM + patch extraction from diff + - agent strategy: K8s Job per turn with prior context injection + - SWE-bench result format (model_patch, eval_report, resolved) + + Agent-only args: + agent_config_dict: Serialised AgentConfig dict. + model_api_key: API key forwarded to the agent. + subset: SWE-bench subset name forwarded to the agent. + split: Dataset split forwarded to the agent. + step_limit: Max agent steps per turn. + cost_limit: Max cost per turn in dollars. 
+ """ + + def __init__( + self, + vllm_urls: list[str], + model_name: str, + strategy: Literal["naive", "agent"] = "naive", + max_tokens: int = 16000, + temperature: float = 0.15, + verifier_set: VerifierSet | None = None, + aggregator: ScoreAggregator | None = None, + max_turns: int = 1, + k8s_namespace: str | None = None, + timeout: int = 1800, + service_account: str = "swe-bench-eval", + max_concurrent_jobs: int = 4, + swebench_namespace: str = "swebench", + image_registry: str | None = None, + # agent strategy only + agent_config_dict: dict | None = None, + model_api_key: str = "dummy", + subset: str = "verified", + split: str = "test", + step_limit: int = 100, + cost_limit: float = 3.0, + ): + from evals.swe_bench.agent_config import AgentConfig + from evals.swe_bench.agent_worker import _detect_namespace, _init_k8s + + super().__init__( + vllm_urls=vllm_urls, + model_name=model_name, + max_tokens=max_tokens, + temperature=temperature, + verifier_set=verifier_set, + aggregator=aggregator, + max_turns=max_turns, + max_concurrent_jobs=max_concurrent_jobs, + ) + + self.strategy = strategy + self.swebench_namespace = swebench_namespace + self.image_registry = image_registry + self.vllm_urls = vllm_urls + + self.k8s_namespace = k8s_namespace or _detect_namespace() + self.service_account = service_account + self.batch_api, self.core_api = _init_k8s() + + self.runner = InstanceRunner( + k8s_namespace=self.k8s_namespace, + timeout=timeout, + service_account=service_account, + ) + + self.agent_config = AgentConfig(**agent_config_dict) if agent_config_dict else None + self.model_api_key = model_api_key + self.subset = subset + self.split = split + self.step_limit = step_limit + self.cost_limit = cost_limit + + # ── SWE-bench verifier context ────────────────────────────────────────── + + def _make_context(self, patch: str, instance_data: dict, run_id: str) -> PatchContext: + return PatchContext( + patch_diff=patch, + changed_files=_extract_changed_files(patch), + 
task_id=instance_data["instance_id"], + metadata={ + "instance_data": instance_data, + "run_id": run_id, + "runner": self.runner, + "model_name": self.model_name, + "image_registry": self.image_registry, + "swebench_namespace": self.swebench_namespace, + }, + ) + + # ── Generation dispatch ───────────────────────────────────────────────── + + def _generate_turn( + self, + messages: list[dict], + instance: dict, + run_id: str, + ) -> str: + if self.strategy == "naive": + return self._generate_naive(messages) + return self._generate_agent(messages, instance, run_id) + + def _generate_agent( + self, + messages: list[dict], + instance: dict, + run_id: str, + ) -> str: + """Generate a patch via a K8s agent Job (eval disabled; grading done separately). + + For turns after the first, prior patches and feedback are injected into + the problem statement so the agent has full context of previous attempts. + """ + from swebench.harness.constants import DOCKER_WORKDIR + + from evals.swe_bench.agent_worker import ( + _build_job_command, + _build_job_manifest, + _delete_job, + _get_pod_logs, + _wait_for_job, + extract_prediction_from_logs, + resolve_instance, + ) + + assert self.agent_config is not None, "agent_config_dict required for agent strategy" + + instance_id = instance["instance_id"] + n_initial = 2 if messages and messages[0]["role"] == "system" else 1 + turn_idx = (len(messages) - n_initial) // 2 + + modified_instance = dict(instance) + if turn_idx > 0 and not self.agent_config.needs_swebench_dataset: + prior_context = self._extract_prior_context(messages) + if prior_context: + modified_instance["problem_statement"] = ( + instance.get("problem_statement", "") + + "\n\n---\nPrior attempts and feedback:\n\n" + + prior_context + ) + + job_name = None + try: + info = resolve_instance( + modified_instance, + self.image_registry or "", + self.swebench_namespace, + ) + + template_vars = { + "instance_id": instance_id, + "model_name": self.model_name, + "model_base_url": 
self.vllm_urls[0], + "model_api_key": self.model_api_key, + "workdir": DOCKER_WORKDIR, + "problem_statement_file": "/tmp/problem_statement.txt", + "subset": self.subset, + "split": self.split, + "step_limit": str(self.step_limit), + "cost_limit": str(self.cost_limit), + } + + command = _build_job_command( + agent_config=self.agent_config, + instance=modified_instance, + template_vars=template_vars, + eval_script="", + run_eval=False, + ) + job = _build_job_manifest( + instance_id=instance_id, + run_id=f"{run_id}-t{turn_idx}", + image=info.image, + command=command, + namespace=self.k8s_namespace, + agent_config=self.agent_config, + service_account=self.service_account, + ) + + self.batch_api.create_namespaced_job(namespace=self.k8s_namespace, body=job) + job_name = job.metadata.name + logger.info(f"[{instance_id}] Turn {turn_idx}: created agent Job {job_name}") + + _, timed_out = _wait_for_job(self.batch_api, job_name, self.k8s_namespace) + if timed_out: + logger.warning(f"[{instance_id}] Turn {turn_idx}: agent Job timed out") + + logs = _get_pod_logs(self.core_api, job_name, self.k8s_namespace) + prediction = extract_prediction_from_logs( + logs=logs, + instance_id=instance_id, + model_name=self.model_name, + patch_extraction=self.agent_config.patch_extraction, + ) + return prediction.get("model_patch", "") + + except Exception as e: + logger.error(f"[{instance_id}] Turn {turn_idx}: agent Job failed: {e}") + return "" + finally: + if job_name: + _delete_job(self.batch_api, job_name, self.k8s_namespace) + + def _extract_prior_context(self, messages: list[dict]) -> str: + """Format prior assistant+user message pairs as plain text for context injection.""" + start = 2 if messages and messages[0]["role"] == "system" else 1 + parts = [] + attempt = 1 + i = start + while i < len(messages): + if messages[i]["role"] == "assistant": + parts.append(f"### Attempt {attempt}\n{messages[i]['content']}") + i += 1 + if i < len(messages) and messages[i]["role"] == "user": + 
parts.append(f"### Feedback on Attempt {attempt}\n{messages[i]['content']}") + i += 1 + attempt += 1 + else: + i += 1 + return "\n\n".join(parts) + + # ── Per-instance loop ─────────────────────────────────────────────────── + + def _evaluate_instance( + self, + instance: dict, + prompts: dict[str, str], + run_id: str, + ) -> dict[str, Any]: + instance_id = instance["instance_id"] + + if self.strategy == "naive": + prompt = prompts.get(instance_id, "") + if not prompt: + logger.error(f"No prompt found for {instance_id}") + return _error_result(instance_id, self.model_name, "No prompt found for instance") + lines = prompt.split("\n", 1) + if len(lines) == 2: + initial_messages = [ + {"role": "system", "content": lines[0]}, + {"role": "user", "content": lines[1]}, + ] + else: + initial_messages = [{"role": "user", "content": prompt}] + extract_fn = extract_diff_from_response + else: + initial_messages = [ + {"role": "user", "content": instance.get("problem_statement", "")}, + ] + extract_fn = lambda x: x # agent jobs return the patch directly + + logger.info( + f"[{instance_id}] Starting multi-turn evaluation " + f"(strategy={self.strategy}, max_turns={self.max_turns})" + ) + + session = MultiTurnSession( + generate_fn=lambda messages: self._generate_turn(messages, instance, run_id), + extract_fn=extract_fn, + run_intermediate_fn=lambda patch: self._run_verifier_set( + self.intermediate_set, self._make_context(patch, instance, run_id) + ), + run_final_fn=lambda patch: self._run_verifier_set( + self.final_set, self._make_context(patch, instance, run_id) + ), + aggregator=self.aggregator, + intermediate_verifier_set=self.intermediate_set, + max_turns=self.max_turns, + ) + + try: + result: MultiTurnResult = session.run(initial_messages) + except Exception as e: + logger.error(f"[{instance_id}] Multi-turn session failed: {e}") + return _error_result(instance_id, self.model_name, str(e)) + + logger.info( + f"[{instance_id}] Completed: {result.num_turns} turn(s), " + 
f"early_exit={result.stopped_early}, " + f"final_score={result.final_aggregate_score:.3f}" + ) + + swe_test_result = next( + (r for r in result.final_verifier_results if r.name == "swe_test"), None + ) + eval_report = swe_test_result.details if swe_test_result else {} + + return { + "instance_id": instance_id, + "model_patch": result.final_output, + "model_name_or_path": self.model_name, + "error": None, + "eval_report": eval_report, + "resolved": eval_report.get("resolved", False), + "multi_turn": { + "num_turns": result.num_turns, + "stopped_early": result.stopped_early, + "final_aggregate_score": result.final_aggregate_score, + "turns": [ + { + "turn": t.turn, + "aggregate_score": t.aggregate_score, + "verifier_results": [ + { + "name": r.name, + "status": r.status.value, + "score": r.score, + "passed": r.passed, + } + for r in t.verifier_results + ], + } + for t in result.turns + ], + }, + } + + +# ── Utilities ──────────────────────────────────────────────────────────────── + +def _extract_changed_files(patch_diff: str) -> list[str]: + files = [] + for line in patch_diff.splitlines(): + if line.startswith("+++ "): + path = line[4:] + if path.startswith("b/"): + path = path[2:] + if path != "/dev/null": + files.append(path) + return files + + +def _error_result(instance_id: str, model_name: str, error: str) -> dict: + return { + "instance_id": instance_id, + "model_patch": "", + "model_name_or_path": model_name, + "error": error, + "eval_report": {}, + "resolved": False, + "multi_turn": {"num_turns": 0, "stopped_early": False, + "final_aggregate_score": 0.0, "turns": []}, + } diff --git a/evals/swe_bench/patch_worker.py b/evals/swe_bench/patch_worker.py deleted file mode 100644 index 1819541..0000000 --- a/evals/swe_bench/patch_worker.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Phase 1 Ray worker: generate patches via vLLM. 
- -Each worker receives a sub-list of SWE-bench instances with pre-built -prompts (from swebench's prompt pipeline), calls the vLLM endpoint -to generate patches, and returns structured results. - -Workers do NOT build prompts -- they receive them pre-built. Prompt -construction (which requires git cloning) is done separately via -build_prompt_dataset.py / job-build-prompts.yaml. -""" - -from __future__ import annotations - -import logging - -import ray - -from evals.common.inference_worker import InferenceWorker -from .prompt import extract_diff_from_response - -logger = logging.getLogger(__name__) - - -@ray.remote(num_cpus=1) -class PatchWorker(InferenceWorker): - """Generates patches for SWE-bench instances via vLLM. - - Inherits from InferenceWorker and customizes for SWE-bench: - - Handles SWE-bench's prompt format (first line = system message) - - Extracts patches using swebench's extract_diff - - Returns predictions with SWE-bench schema (model_patch) - - Args: - vllm_urls: List of vLLM OpenAI-compatible base URLs. - Requests are round-robined across them. - model_name: Model name as registered in vLLM. - max_tokens: Maximum tokens for patch generation. - temperature: Sampling temperature. - """ - - def _generate(self, prompt: str) -> str: - """Override to handle SWE-bench's prompt format. - - SWE-bench convention: first line is the system message, - rest is the user message. - - Args: - prompt: Pre-built prompt text from swebench's pipeline. - - Returns: - Raw response from the model. 
- """ - client = self._get_client() - - # Split prompt into system and user messages - lines = prompt.split("\n", 1) - if len(lines) == 2: - system_msg = lines[0] - user_msg = lines[1] - else: - system_msg = None - user_msg = prompt - - messages = [] - if system_msg: - messages.append({"role": "system", "content": system_msg}) - messages.append({"role": "user", "content": user_msg}) - - response = client.chat.completions.create( - model=self.model_name, - messages=messages, - max_tokens=self.max_tokens, - temperature=self.temperature, - ) - - return response.choices[0].message.content or "" - - def generate_patches( - self, - instances: list[dict], - prompts: dict[str, str], - ) -> list[dict]: - """Generate patches for a batch of SWE-bench instances. - - Uses parent's generate_batch with SWE-bench-specific configuration. - - Args: - instances: List of SWE-bench dataset instances. - prompts: Map of instance_id -> pre-built prompt text. - - Returns: - List of dicts with keys: instance_id, model_patch, - full_output, model_name_or_path, error. - """ - results = self.generate_batch( - instances=instances, - prompts=prompts, - extract_fn=extract_diff_from_response, - instance_id_key="instance_id", - ) - - # Rename 'prediction' to 'model_patch' for SWE-bench schema - for result in results: - result["model_patch"] = result.pop("prediction") - - return results diff --git a/evals/swe_bench/run_patch_generation.py b/evals/swe_bench/run_patch_generation.py index 4387f94..7bcf65f 100644 --- a/evals/swe_bench/run_patch_generation.py +++ b/evals/swe_bench/run_patch_generation.py @@ -1,27 +1,49 @@ -"""Phase 1: Generate patches for SWE-bench instances. - -Supports two strategies: - - naive: Single-shot vLLM inference from pre-built prompts. - - agent: Agentic loop using mini-swe-agent (or any agent via YAML config) - running inside K8s Jobs with SWE-bench container images. 
- -Distributes work across Ray workers, saves per-instance results, -and merges into predictions.jsonl for Phase 2 or sb-cli evaluation. +"""SWE-bench evaluation: patch generation and test execution. + +Supports two strategies, both of which optionally support multi-turn iteration: + - naive: vLLM inference from pre-built prompts, followed by K8s-based test + execution. With --max-turns > 1, the model receives intermediate + feedback (from configurable verifiers) and can revise its patch. + - agent: Agentic loop using mini-swe-agent (or any agent via YAML config) + running inside K8s Jobs with SWE-bench container images. + With --max-turns > 1, the agent runs repeatedly, receiving feedback + from intermediate verifiers between attempts. + +Usage (naive, single-shot): + python -m evals.swe_bench.run_patch_generation \ + --strategy naive \ + --vllm-url http://vllm-server:8000/v1 \ + --model-name Qwen/Qwen3-1.7B \ + --prompts s3://swe-bench/prompts/style-3-oracle.jsonl \ + --output-dir /tmp/swe-bench-results/ -Usage (naive): +Usage (naive, multi-turn): python -m evals.swe_bench.run_patch_generation \ --strategy naive \ --vllm-url http://vllm-server:8000/v1 \ --model-name Qwen/Qwen3-1.7B \ --prompts s3://swe-bench/prompts/style-3-oracle.jsonl \ + --max-turns 3 \ + --intermediate-verifiers ast_check \ + --aggregator mean \ + --output-dir /tmp/swe-bench-results/ + +Usage (agent, single-shot): + python -m evals.swe_bench.run_patch_generation \ + --strategy agent \ + --vllm-url http://vllm-server:8000/v1 \ + --model-name Qwen/Qwen3-1.7B \ + --agent-config evals/swe_bench/agents/mini_swe_agent.yaml \ --output-dir /tmp/swe-bench-results/ -Usage (agent): +Usage (agent, multi-turn): python -m evals.swe_bench.run_patch_generation \ --strategy agent \ --vllm-url http://vllm-server:8000/v1 \ --model-name Qwen/Qwen3-1.7B \ --agent-config evals/swe_bench/agents/mini_swe_agent.yaml \ + --max-turns 3 \ + --intermediate-verifiers ast_check \ --output-dir /tmp/swe-bench-results/ """ @@ 
-172,11 +194,47 @@ def _upload_to_s3(local_path: Path, s3_uri: str | None) -> None: upload_file(local_path, s3_uri) -# ── Strategy: naive (single-shot vLLM) ────────────────────────────── +# ── Verifier set builder (shared across strategies) ───────────────── + +def _build_verifier_set(args): + """Build the VerifierSet and aggregator from CLI args.""" + from evals.common.score_aggregator import build_aggregator + from evals.common.verifier_set import VerifierSet + from evals.swe_bench.verifiers.unit_test_verifier import SWEBenchUnitTestVerifier + from verifiers.ast_check import ASTCheckVerifier + + vset = VerifierSet() + intermediate_names = set(args.intermediate_verifiers or []) + + if "ast_check" in intermediate_names: + vset.add(ASTCheckVerifier(), run_intermediate=True, run_final=False) + + # SWE-bench test verifier always runs as final; optionally also intermediate + run_swe_intermediate = "swe_test" in intermediate_names + vset.add( + SWEBenchUnitTestVerifier( + swebench_namespace=args.swebench_namespace, + image_registry=args.image_registry or None, + timeout=float(args.job_timeout or 1800), + ), + run_intermediate=run_swe_intermediate, + run_final=True, + ) + + aggregator = build_aggregator(args.aggregator) + return vset, aggregator + + +# ── Strategy: naive (vLLM inference, single or multi-turn) ────────── def _run_naive(args, pending: list, output_dir: Path) -> list[dict]: - """Run naive single-shot inference via PatchWorker Ray actors.""" - from evals.swe_bench.patch_worker import PatchWorker + """Run naive vLLM inference via SWEBenchMultiTurnWorker Ray actors. + + With max_turns=1 this is equivalent to single-shot generation followed by + K8s test execution. With max_turns>1 the model receives intermediate + verifier feedback and can revise its patch. 
+ """ + from evals.swe_bench.multi_turn_worker import SWEBenchMultiTurnWorker from evals.swe_bench.prompt import load_prompt_dataset prompts_path = _resolve_prompts(args.prompts, output_dir) @@ -191,13 +249,24 @@ def _run_naive(args, pending: list, output_dir: Path) -> list[dict]: f"First missing: {missing[:5]}" ) + vset, aggregator = _build_verifier_set(args) num_workers = min(args.num_workers, len(pending)) workers = [ - PatchWorker.remote( + SWEBenchMultiTurnWorker.remote( vllm_urls=args.vllm_url, model_name=args.model_name, + strategy="naive", max_tokens=args.max_tokens, temperature=args.temperature, + verifier_set=vset, + aggregator=aggregator, + max_turns=args.max_turns, + k8s_namespace=args.k8s_namespace, + timeout=int(args.job_timeout or 1800), + service_account=args.service_account, + max_concurrent_jobs=args.max_concurrent_jobs, + swebench_namespace=args.swebench_namespace, + image_registry=args.image_registry or None, ) for _ in range(num_workers) ] @@ -207,9 +276,12 @@ def _run_naive(args, pending: list, output_dir: Path) -> list[dict]: batches[i % num_workers].append(dict(instance)) prompts_ref = ray.put(prompts) - logger.info(f"Distributing {len(pending)} instances across {num_workers} workers") + logger.info( + f"Distributing {len(pending)} instances across {num_workers} workers " + f"(max_turns={args.max_turns})" + ) futures = [ - worker.generate_patches.remote(batch, prompts_ref) + worker.evaluate_batch.remote(batch, prompts_ref, args.run_id) for worker, batch in zip(workers, batches) if batch ] @@ -217,53 +289,103 @@ def _run_naive(args, pending: list, output_dir: Path) -> list[dict]: return _collect_results(futures) -# ── Strategy: agent (agentic loop via K8s Jobs) ───────────────────── +# ── Strategy: agent (agentic loop via K8s Jobs, single or multi-turn) def _run_agent(args, pending: list, output_dir: Path) -> list[dict]: - """Run agent-based patch generation via AgentWorker Ray actors.""" + """Run agent-based patch generation via AgentWorker 
or SWEBenchMultiTurnWorker. + + With max_turns=1, uses AgentWorker which is self-contained (generation + + evaluation happen inside the K8s Job). With max_turns>1, uses + SWEBenchMultiTurnWorker with strategy="agent" so intermediate verifier feedback + can be injected between attempts. + """ from evals.swe_bench.agent_config import load_agent_config - from evals.swe_bench.agent_worker import AgentWorker agent_config = load_agent_config(args.agent_config) logger.info(f"Agent: {agent_config.name}") subset = _resolve_subset_name(args.dataset) - num_workers = min(args.num_workers, len(pending)) - workers = [ - AgentWorker.remote( - agent_config_dict=asdict(agent_config), - model_name=args.model_name, - model_base_url=args.vllm_url[0], - model_api_key=args.model_api_key, - k8s_namespace=args.k8s_namespace, - service_account=args.service_account, - image_registry=args.image_registry, - swebench_namespace=args.swebench_namespace, - subset=subset, - split=args.split, - step_limit=args.step_limit, - cost_limit=args.cost_limit, - max_concurrent_jobs=args.max_concurrent_jobs, - job_timeout=args.job_timeout, - run_eval=args.run_eval, - ) - for _ in range(num_workers) - ] batches = [[] for _ in range(num_workers)] for i, instance in enumerate(pending): batches[i % num_workers].append(dict(instance)) - logger.info( - f"Distributing {len(pending)} instances across {num_workers} workers " - f"(max {args.max_concurrent_jobs} concurrent Jobs per worker)" - ) - futures = [ - worker.generate_patches.remote(batch, args.run_id) - for worker, batch in zip(workers, batches) - if batch - ] + if args.max_turns > 1: + from evals.swe_bench.multi_turn_worker import SWEBenchMultiTurnWorker + + vset, aggregator = _build_verifier_set(args) + + workers = [ + SWEBenchMultiTurnWorker.remote( + vllm_urls=args.vllm_url, + model_name=args.model_name, + strategy="agent", + max_tokens=args.max_tokens, + temperature=args.temperature, + verifier_set=vset, + aggregator=aggregator, + 
max_turns=args.max_turns, + k8s_namespace=args.k8s_namespace, + timeout=int(args.job_timeout or 1800), + service_account=args.service_account, + max_concurrent_jobs=args.max_concurrent_jobs, + swebench_namespace=args.swebench_namespace, + image_registry=args.image_registry or None, + agent_config_dict=asdict(agent_config), + model_api_key=args.model_api_key, + subset=subset, + split=args.split, + step_limit=args.step_limit, + cost_limit=args.cost_limit, + ) + for _ in range(num_workers) + ] + + prompts_ref = ray.put({}) # agent strategy does not use pre-built prompts + logger.info( + f"Distributing {len(pending)} instances across {num_workers} workers " + f"(agent, max_turns={args.max_turns}, " + f"max {args.max_concurrent_jobs} concurrent Jobs per worker)" + ) + futures = [ + worker.evaluate_batch.remote(batch, prompts_ref, args.run_id) + for worker, batch in zip(workers, batches) + if batch + ] + else: + from evals.swe_bench.agent_worker import AgentWorker + + workers = [ + AgentWorker.remote( + agent_config_dict=asdict(agent_config), + model_name=args.model_name, + model_base_url=args.vllm_url[0], + model_api_key=args.model_api_key, + k8s_namespace=args.k8s_namespace, + service_account=args.service_account, + image_registry=args.image_registry, + swebench_namespace=args.swebench_namespace, + subset=subset, + split=args.split, + step_limit=args.step_limit, + cost_limit=args.cost_limit, + max_concurrent_jobs=args.max_concurrent_jobs, + job_timeout=args.job_timeout, + run_eval=args.run_eval, + ) + for _ in range(num_workers) + ] + + logger.info( + f"Distributing {len(pending)} instances across {num_workers} workers " + f"(agent, max {args.max_concurrent_jobs} concurrent Jobs per worker)" + ) + futures = [ + worker.generate_patches.remote(batch, args.run_id) + for worker, batch in zip(workers, batches) + if batch + ] return _collect_results(futures) @@ -320,7 +442,7 @@ def _dry_run(args, dataset: list) -> None: def _parse_args() -> argparse.Namespace: parser = 
argparse.ArgumentParser( - description="Phase 1: Generate SWE-bench patches (naive or agent strategy)" + description="Generate SWE-bench patches (naive or agent strategy, single or multi-turn)" ) # Common args @@ -346,6 +468,16 @@ def _parse_args() -> argparse.Namespace: help="Max instances to evaluate (0 = no limit)") common.add_argument("--run-id", type=str, default="eval-run", help="Unique run identifier") + common.add_argument("--max-turns", type=int, default=1, + help="Max generation attempts per instance " + "(1 = single-shot, >1 = multi-turn with feedback)") + common.add_argument("--intermediate-verifiers", type=str, nargs="*", default=[], + choices=["ast_check"], + help="Verifiers to run after each intermediate turn. " + "Options: ast_check") + common.add_argument("--aggregator", type=str, default="mean", + choices=["mean", "min", "weighted_sum"], + help="Score aggregation strategy for multi-turn") # Naive strategy args naive = parser.add_argument_group("naive strategy") @@ -381,9 +513,10 @@ def _parse_args() -> argparse.Namespace: agent.add_argument("--job-timeout", type=int, default=0, help="K8s Job timeout in seconds (0 = use agent config default)") agent.add_argument("--run-eval", action="store_true", default=True, - help="Run in-container evaluation after agent (default: True)") + help="Run in-container evaluation after agent (default: True, " + "only applies when max_turns=1)") agent.add_argument("--skip-eval", action="store_true", - help="Skip in-container evaluation") + help="Skip in-container evaluation (only applies when max_turns=1)") # Debug agent.add_argument("--dry-run", action="store_true", @@ -456,8 +589,10 @@ def main(): # Upload to S3 _upload_to_s3(predictions_path, args.s3_output) - # Write aggregate results.json (agent strategy with eval) - if args.strategy == "agent" and args.run_eval: + # Write aggregate results.json when evaluation was performed + uses_multi_turn = args.max_turns > 1 + agent_with_eval = args.strategy == "agent" and 
args.max_turns == 1 and args.run_eval + if uses_multi_turn or agent_with_eval: _write_aggregate_results(output_dir, all_results) # Summary @@ -469,13 +604,14 @@ def main(): if (output_dir / inst["instance_id"] / "prediction.json").exists() ) + turn_label = f", max_turns={args.max_turns}" if args.max_turns > 1 else "" logger.info("=" * 60) - logger.info(f"Phase 1 complete ({args.strategy} strategy)") + logger.info(f"Evaluation complete ({args.strategy}{turn_label})") logger.info("=" * 60) logger.info(f" Patches generated: {patches_generated}/{total}") logger.info(f" Errors: {errors}") - if args.strategy == "agent" and args.run_eval: + if uses_multi_turn or agent_with_eval: resolved = sum(1 for r in all_results if r.get("resolved") is True) evaluated = sum(1 for r in all_results if r.get("resolved") is not None) rate = resolved / evaluated if evaluated > 0 else 0 @@ -483,6 +619,20 @@ def main(): logger.info(f" Resolved: {resolved}") logger.info(f" Resolve rate: {rate:.1%}") + if uses_multi_turn: + turn_counts = [ + r.get("multi_turn", {}).get("num_turns", 1) + for r in all_results if not r.get("error") + ] + if turn_counts: + avg_turns = sum(turn_counts) / len(turn_counts) + early_exits = sum( + 1 for r in all_results + if r.get("multi_turn", {}).get("stopped_early", False) + ) + logger.info(f" Avg turns: {avg_turns:.2f}") + logger.info(f" Early exits: {early_exits}/{len(turn_counts)}") + logger.info(f" Total time: {elapsed / 60:.1f} minutes") logger.info("=" * 60) @@ -496,8 +646,6 @@ def main(): def _write_aggregate_results(output_dir: Path, all_results: list[dict]) -> None: """Write aggregate results.json from eval results.""" - from evals.swe_bench.grader import AggregateReport - evaluated = [r for r in all_results if r.get("resolved") is not None] resolved = [r for r in evaluated if r.get("resolved") is True] errors = [r for r in all_results if r.get("error")] @@ -536,8 +684,8 @@ def _log_to_mlflow(args, total, patches_generated, errors, all_results, 
os.environ["MLFLOW_S3_ENDPOINT_URL"] = s3_endpoint mlflow.set_experiment("swe-bench-eval") - with mlflow.start_run(run_name=f"{args.run_id}-phase1", - tags={"run_id": args.run_id, "phase": "1", + with mlflow.start_run(run_name=f"{args.run_id}-{args.strategy}", + tags={"run_id": args.run_id, "strategy": args.strategy}): params = { "strategy": args.strategy, @@ -547,6 +695,7 @@ def _log_to_mlflow(args, total, patches_generated, errors, all_results, "run_id": args.run_id, "num_workers": args.num_workers, "instance_limit": args.instance_limit, + "max_turns": args.max_turns, } if args.strategy == "naive": params.update({ @@ -559,15 +708,22 @@ def _log_to_mlflow(args, total, patches_generated, errors, all_results, "step_limit": args.step_limit, "cost_limit": args.cost_limit, }) + if args.max_turns > 1: + params.update({ + "intermediate_verifiers": ",".join(args.intermediate_verifiers or []), + "aggregator": args.aggregator, + }) mlflow.log_params(params) + uses_multi_turn = args.max_turns > 1 + agent_with_eval = args.strategy == "agent" and args.max_turns == 1 and args.run_eval metrics = { "total_instances": total, "patches_generated": patches_generated, "generation_errors": errors, } - if args.strategy == "agent" and args.run_eval: + if uses_multi_turn or agent_with_eval: resolved = sum(1 for r in all_results if r.get("resolved") is True) evaluated = sum(1 for r in all_results if r.get("resolved") is not None) metrics["evaluated"] = evaluated diff --git a/evals/swe_bench/run_test_execution.py b/evals/swe_bench/run_test_execution.py deleted file mode 100644 index 1e5bede..0000000 --- a/evals/swe_bench/run_test_execution.py +++ /dev/null @@ -1,324 +0,0 @@ -"""Phase 2: Run SWE-bench test execution via K8s Jobs. - -Reads predictions from Phase 1 (from S3/MinIO or a local path), -distributes test execution across Ray workers, each managing -concurrent K8s Jobs using pre-built SWE-bench container images. -Results are graded, aggregated, and uploaded to S3/MinIO. 
- -Results are optionally logged to MLflow when MLFLOW_TRACKING_URI is set. - -Usage: - python run_test_execution.py \ - --predictions s3://swe-bench/runs/run-001/predictions.jsonl \ - --output-dir /tmp/swe-bench-results/ \ - --s3-output s3://swe-bench/runs/run-001/results.json -""" - -from __future__ import annotations - -import argparse -import json -import logging -import os -from pathlib import Path - -import ray -from swebench.harness.utils import load_swebench_dataset - -from evals.common.s3_storage import download_file, upload_file -from evals.swe_bench.grader import InstanceResult, aggregate_reports -from evals.swe_bench.test_worker import TestWorker - -logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") -logger = logging.getLogger(__name__) - - -def load_predictions(predictions_path: Path) -> list[dict]: - """Load predictions from a JSONL file.""" - predictions = [] - with open(predictions_path) as f: - for line in f: - line = line.strip() - if line: - predictions.append(json.loads(line)) - return predictions - - -def _build_gold_predictions(dataset: list, output_dir: Path, limit: int = 0) -> Path: - """Build predictions from gold patches in the dataset. - - Uses the dataset's own 'patch' field as the model prediction. - This is the standard way to verify the eval harness is working. - - Args: - dataset: SWE-bench dataset instances. - output_dir: Directory to write the gold predictions file. - limit: Max instances (0 = no limit). - - Returns: - Path to the generated predictions.jsonl. 
- """ - instances = dataset[:limit] if limit > 0 else dataset - predictions_path = output_dir / "gold_predictions.jsonl" - - with open(predictions_path, "w") as f: - for inst in instances: - f.write(json.dumps({ - "instance_id": inst["instance_id"], - "model_patch": inst["patch"], - "model_name_or_path": "gold", - }) + "\n") - - logger.info(f"Built {len(instances)} gold predictions from dataset") - return predictions_path - - -def _resolve_predictions(source: str, output_dir: Path) -> Path: - """Resolve predictions source to a local file path. - - Handles: - - "gold": special value, resolved later after dataset is loaded - - "s3://...": download from S3/MinIO - - local path: used as-is - - Returns Path for s3/local, or None for gold (handled separately). - """ - if source == "gold": - # Sentinel -- caller handles this after loading the dataset - return None - if source.startswith("s3://"): - local_path = output_dir / "predictions.jsonl" - download_file(source, local_path) - return local_path - return Path(source) - - -def main(): - parser = argparse.ArgumentParser(description="Phase 2: Run SWE-bench test execution") - parser.add_argument("--predictions", type=str, required=True, - help="'gold' to use dataset ground-truth patches, " - "S3 URI (s3://...), or local path to predictions.jsonl") - parser.add_argument("--dataset", type=str, default="SWE-bench/SWE-bench_Lite", - help="HuggingFace dataset name") - parser.add_argument("--split", type=str, default="test", - help="Dataset split") - parser.add_argument("--output-dir", type=str, required=True, - help="Directory to write results") - parser.add_argument("--run-id", type=str, default="eval-run", - help="Unique run identifier") - parser.add_argument("--num-workers", type=int, default=4, - help="Number of Ray workers") - parser.add_argument("--max-concurrent-jobs", type=int, default=4, - help="Max concurrent K8s Jobs per worker") - parser.add_argument("--k8s-namespace", type=str, default=None, - help="K8s namespace 
for eval Jobs (auto-detected if not set)") - parser.add_argument("--service-account", type=str, default="swe-bench-eval", - help="K8s ServiceAccount for eval Job pods") - parser.add_argument("--timeout", type=int, default=1800, - help="Per-instance timeout in seconds") - parser.add_argument("--swebench-namespace", type=str, default="swebench", - help="DockerHub namespace for pre-built images") - parser.add_argument("--image-registry", type=str, default=None, - help="Pull SWE-bench images from this registry instead of " - "DockerHub (e.g. image-registry.openshift-image-registry" - ".svc:5000/code-agent)") - parser.add_argument("--instance-limit", type=int, default=0, - help="Max instances to evaluate (0 = no limit)") - parser.add_argument("--s3-output", type=str, default=None, - help="S3 URI to upload results.json " - "(e.g. s3://swe-bench/runs/run-001/results.json)") - args = parser.parse_args() - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # Load dataset for instance metadata (needed by make_test_spec and gold mode) - logger.info(f"Loading dataset: {args.dataset} split={args.split}") - dataset = load_swebench_dataset(args.dataset, split=args.split) - instances_by_id = {inst["instance_id"]: dict(inst) for inst in dataset} - logger.info(f"Loaded {len(dataset)} dataset instances") - - # Load predictions (gold, S3, or local) - is_gold = args.predictions == "gold" - if is_gold: - logger.info("Using GOLD patches from dataset (harness verification mode)") - predictions_path = _build_gold_predictions( - dataset, output_dir, limit=args.instance_limit, - ) - else: - predictions_path = _resolve_predictions(args.predictions, output_dir) - - logger.info(f"Loading predictions from {predictions_path}") - predictions = load_predictions(predictions_path) - logger.info(f"Loaded {len(predictions)} predictions") - - # Apply instance limit (for non-gold mode; gold already applies it) - if not is_gold and args.instance_limit > 0: - 
predictions = predictions[:args.instance_limit] - logger.info(f"Limited to {len(predictions)} instances") - - if not predictions: - logger.info("No predictions to evaluate") - return - - # Initialize Ray - ray.init() - - # Check for already-completed results from a previous run (resumability). - # This runs on the head node where results are persisted. - completed_results = {} - for pred in predictions: - iid = pred["instance_id"] - report_path = output_dir / iid / "report.json" - if report_path.exists(): - try: - completed_results[iid] = json.loads(report_path.read_text()) - logger.info(f"Skipping {iid} (already completed)") - except (json.JSONDecodeError, OSError): - pass - - pending = [p for p in predictions if p["instance_id"] not in completed_results] - logger.info( - f"{len(completed_results)} already completed, " - f"{len(pending)} pending" - ) - - # Create workers and distribute pending predictions - all_results = list(completed_results.values()) - - if pending: - num_workers = min(args.num_workers, len(pending)) - workers = [ - TestWorker.remote( - k8s_namespace=args.k8s_namespace, - timeout=args.timeout, - service_account=args.service_account, - max_concurrent_jobs=args.max_concurrent_jobs, - swebench_namespace=args.swebench_namespace, - image_registry=args.image_registry, - ) - for _ in range(num_workers) - ] - - # Split pending predictions across workers - batches = [[] for _ in range(num_workers)] - for i, pred in enumerate(pending): - batches[i % num_workers].append(pred) - - # Submit work -- instances dict is shared via Ray object store - # (serialized once, not per-worker) - instances_ref = ray.put(instances_by_id) - logger.info( - f"Distributing {len(pending)} instances across {num_workers} workers " - f"({args.max_concurrent_jobs} concurrent jobs per worker)" - ) - futures = [ - worker.evaluate_batch.remote(batch, instances_ref, args.run_id) - for worker, batch in zip(workers, batches) - if batch - ] - - # Collect results -- process futures one at 
a time so a single - # worker failure doesn't discard results from other workers - pending_futures = list(futures) - while pending_futures: - ready, pending_futures = ray.wait(pending_futures, num_returns=1) - try: - batch_results = ray.get(ready[0]) - except Exception as e: - logger.error(f"Test worker batch failed: {e}") - continue - - for result in batch_results: - iid = result["instance_id"] - instance_dir = output_dir / iid - instance_dir.mkdir(parents=True, exist_ok=True) - (instance_dir / "report.json").write_text( - json.dumps(result, indent=2) - ) - all_results.append(result) - - # Aggregate - instance_results = [ - InstanceResult( - instance_id=r["instance_id"], - resolved=r.get("resolved", False), - patch_exists=r.get("patch_exists", False), - patch_successfully_applied=r.get("patch_successfully_applied", False), - error=r.get("error"), - tests_status=r.get("tests_status"), - ) - for r in all_results - ] - report = aggregate_reports(instance_results) - - # Write final report - full_report = { - "summary": report.to_dict(), - "instance_results": all_results, - } - results_path = output_dir / "results.json" - results_path.write_text(json.dumps(full_report, indent=2)) - logger.info(f"Wrote full report to {results_path}") - - # Upload to S3/MinIO - if args.s3_output: - upload_file(results_path, args.s3_output) - - # Summary - logger.info( - f"Phase 2 complete: " - f"{report.resolved_instances}/{report.total_instances} resolved " - f"({report.resolve_rate:.1%}), " - f"{report.error_instances} errors, " - f"{report.empty_patch_instances} empty patches" - ) - - if report.resolved_ids: - logger.info(f"Resolved: {report.resolved_ids}") - - # ── MLflow tracking (optional) ────────────────────────────── - # Logs params, metrics, and the results artifact to MLflow. - # Activated when MLFLOW_TRACKING_URI is set in the environment. 
- if os.environ.get("MLFLOW_TRACKING_URI"): - try: - import mlflow - - # Ensure MLflow's artifact client talks to the in-cluster MinIO, - # not AWS. The S3_ENDPOINT_URL env var is set on the Ray head - # from the minio-credentials secret. - s3_endpoint = os.environ.get("S3_ENDPOINT_URL") or os.environ.get("MINIO_ENDPOINT_URL") - if s3_endpoint and not os.environ.get("MLFLOW_S3_ENDPOINT_URL"): - os.environ["MLFLOW_S3_ENDPOINT_URL"] = s3_endpoint - - mlflow.set_experiment("swe-bench-eval") - with mlflow.start_run(run_name=f"{args.run_id}-phase2", - tags={"run_id": args.run_id, "phase": "2"}): - mlflow.log_params({ - "dataset": args.dataset, - "split": args.split, - "run_id": args.run_id, - "predictions_source": args.predictions, - "num_workers": args.num_workers, - "max_concurrent_jobs": args.max_concurrent_jobs, - "timeout": args.timeout, - "instance_limit": args.instance_limit, - }) - mlflow.log_metrics({ - "total_instances": report.total_instances, - "resolved_instances": report.resolved_instances, - "unresolved_instances": report.unresolved_instances, - "error_instances": report.error_instances, - "empty_patch_instances": report.empty_patch_instances, - "resolve_rate": report.resolve_rate, - }) - mlflow.log_artifact(str(results_path)) - logger.info("Phase 2 results logged to MLflow") - except Exception as e: - logger.warning(f"MLflow logging failed (non-fatal): {e}") - - ray.shutdown() - - -if __name__ == "__main__": - main() diff --git a/evals/swe_bench/test_worker.py b/evals/swe_bench/test_worker.py deleted file mode 100644 index e5195b6..0000000 --- a/evals/swe_bench/test_worker.py +++ /dev/null @@ -1,237 +0,0 @@ -"""Phase 2 Ray worker: run SWE-bench test execution via K8s Jobs. - -Each worker receives a sub-list of predictions and manages K8s Jobs -for test execution. Workers maintain a concurrency window of M -concurrent jobs to balance throughput with cluster capacity. 
- -Workers are stateless with respect to results -- all results are -returned to the head node via ray.get(). The head handles persistence -and resumability to avoid stale/split state across worker pods. -""" - -from __future__ import annotations - -import logging -from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import Any - -import ray - -from .grader import InstanceResult, grade_instance -from .instance_runner import InstanceRunner, JobResult -from swebench.harness.test_spec.test_spec import make_test_spec - -logger = logging.getLogger(__name__) - - -@ray.remote(num_cpus=1) -class TestWorker: - """Runs SWE-bench test evaluation via K8s Jobs. - - Each worker manages a concurrency window of K8s Jobs, - running M jobs in parallel within a single worker. - - Args: - k8s_namespace: K8s namespace to create Jobs in. - timeout: Per-instance timeout in seconds. - service_account: K8s ServiceAccount for Job pods. - max_concurrent_jobs: Max K8s Jobs to run in parallel per worker. - swebench_namespace: DockerHub namespace for pre-built images. - image_registry: If set, pull images from this registry instead of - DockerHub. The swebench namespace prefix (e.g. "swebench/") is - replaced with "//". 
For the OpenShift - internal registry this looks like: - image-registry.openshift-image-registry.svc:5000/code-agent - """ - - def __init__( - self, - k8s_namespace: str | None = None, - timeout: int = 1800, - service_account: str = "swe-bench-eval", - max_concurrent_jobs: int = 4, - swebench_namespace: str = "swebench", - image_registry: str | None = None, - ): - self.max_concurrent_jobs = max_concurrent_jobs - self.swebench_namespace = swebench_namespace - self.image_registry = image_registry - - self.runner = InstanceRunner( - k8s_namespace=k8s_namespace, - timeout=timeout, - service_account=service_account, - ) - - def _evaluate_one( - self, - prediction: dict, - instance_data: dict, - run_id: str, - ) -> dict: - """Evaluate a single instance via a K8s Job. - - Args: - prediction: Dict with instance_id, model_patch, model_name_or_path. - instance_data: Full instance data from the dataset. - run_id: Unique run identifier. - - Returns: - Result dict with grading information. - """ - instance_id = prediction["instance_id"] - model_patch = prediction.get("model_patch", "") - - # Build TestSpec to get eval_script and image name - test_spec = make_test_spec( - instance_data, - namespace=self.swebench_namespace, - ) - - image = test_spec.instance_image_key - eval_script = test_spec.eval_script - - # Rewrite image ref to point at the internal registry when configured. - # DockerHub image: swebench/sweb.eval.x86_64.django_1776_django-16938:latest - # Internal image: /sweb.eval.x86_64.django_1776_django-16938:v1 - # The :latest -> :v1 retag is done during mirroring so that - # imagePullPolicy=IfNotPresent is respected by the kubelet - # (K8s forces Always for :latest tags). - if self.image_registry: - # Strip the DockerHub namespace prefix (e.g. 
"swebench/") - _, _, image_name = image.partition("/") - # Retag :latest -> :v1 to match the mirrored tag - if not image_name.endswith(":latest"): - logger.warning( - f"Expected :latest tag for {instance_id}, " - f"got {image_name} -- using as-is" - ) - else: - image_name = image_name.removesuffix(":latest") + ":v1" - image = f"{self.image_registry}/{image_name}" - - logger.info(f"Evaluating {instance_id} with image {image}") - - # Run the K8s Job - job_result = self.runner.run_instance( - instance_id=instance_id, - run_id=run_id, - image=image, - model_patch=model_patch, - eval_script=eval_script, - ) - - if job_result.error: - logger.error(f"Job error for {instance_id}: {job_result.error}") - return { - "instance_id": instance_id, - "resolved": False, - "patch_exists": bool(model_patch), - "patch_successfully_applied": False, - "error": job_result.error, - "tests_status": None, - } - - if not job_result.test_output: - logger.warning(f"No test output for {instance_id}") - error = ( - "K8s Job timed out" - if job_result.timed_out - else "No test output captured from pod logs" - ) - return { - "instance_id": instance_id, - "resolved": False, - "patch_exists": bool(model_patch), - "patch_successfully_applied": False, - "error": error, - "tests_status": None, - } - - # Grade the result - grade_result = grade_instance( - test_spec=test_spec, - prediction=prediction, - test_output=job_result.test_output, - ) - - status = "RESOLVED" if grade_result.resolved else "NOT RESOLVED" - logger.info(f"Instance {instance_id}: {status}") - - return { - "instance_id": grade_result.instance_id, - "resolved": grade_result.resolved, - "patch_exists": grade_result.patch_exists, - "patch_successfully_applied": grade_result.patch_successfully_applied, - "error": grade_result.error, - "tests_status": grade_result.tests_status, - } - - def evaluate_batch( - self, - predictions: list[dict], - instances_by_id: dict[str, dict], - run_id: str, - ) -> list[dict]: - """Evaluate a batch of 
predictions with concurrent K8s Jobs. - - Uses a thread pool to manage multiple K8s Jobs in parallel. - All results are returned to the caller (head node) -- no - local persistence on the worker. - - Args: - predictions: List of prediction dicts (instance_id, model_patch, ...). - instances_by_id: Map of instance_id to full dataset instance. - run_id: Unique run identifier. - - Returns: - List of result dicts for all predictions in this batch. - """ - results = [] - - # Run instances with a concurrency window - with ThreadPoolExecutor(max_workers=self.max_concurrent_jobs) as pool: - future_to_pred = {} - for pred in predictions: - instance_id = pred["instance_id"] - instance_data = instances_by_id.get(instance_id) - - if instance_data is None: - logger.error(f"Instance {instance_id} not found in dataset") - results.append({ - "instance_id": instance_id, - "resolved": False, - "patch_exists": bool(pred.get("model_patch")), - "patch_successfully_applied": False, - "error": "Instance not found in dataset", - "tests_status": None, - }) - continue - - future = pool.submit( - self._evaluate_one, - pred, - instance_data, - run_id, - ) - future_to_pred[future] = pred - - for future in as_completed(future_to_pred): - pred = future_to_pred[future] - instance_id = pred["instance_id"] - try: - result = future.result() - except Exception as e: - logger.error(f"Unexpected error for {instance_id}: {e}") - result = { - "instance_id": instance_id, - "resolved": False, - "patch_exists": bool(pred.get("model_patch")), - "patch_successfully_applied": False, - "error": str(e), - "tests_status": None, - } - - results.append(result) - - return results diff --git a/evals/swe_bench/verifiers/__init__.py b/evals/swe_bench/verifiers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/swe_bench/verifiers/unit_test_verifier.py b/evals/swe_bench/verifiers/unit_test_verifier.py new file mode 100644 index 0000000..31b78cb --- /dev/null +++ 
b/evals/swe_bench/verifiers/unit_test_verifier.py @@ -0,0 +1,172 @@ +""" +SWE-bench unit test verifier (dynamic). + +Runs the official SWE-bench unit test suite for a single instance via a K8s Job, +grades the result, and returns a VerifierResult. + +This verifier requires: + ctx.metadata["instance_data"]: full dataset instance dict (for make_test_spec) + ctx.metadata["run_id"]: unique run identifier (for K8s Job naming) + ctx.metadata["runner"]: InstanceRunner instance (injected by SWEBenchMultiTurnWorker) + +The K8s Job applies the patch and runs the eval script inside the pre-built +SWE-bench container image. The result is scored as 1.0 (resolved) or 0.0 (not). +""" + +from __future__ import annotations + +import logging +from typing import ClassVar, Literal + +from swebench.harness.test_spec.test_spec import make_test_spec + +from evals.swe_bench.grader import grade_instance +from evals.swe_bench.instance_runner import InstanceRunner +from verifiers.base import BaseVerifier, PatchContext, VerifierResult, VerifierStatus + +logger = logging.getLogger(__name__) + + +class SWEBenchUnitTestVerifier(BaseVerifier): + """Dynamic verifier: runs the SWE-bench unit test suite via a K8s Job. + + Args: + swebench_namespace: DockerHub namespace for pre-built images. + image_registry: If set, pull from this internal registry instead of DockerHub. + pass_threshold: Score threshold for pass/fail (default 1.0 = must fully resolve). + timeout: Per-instance job timeout in seconds. 
+ """ + + execution_mode: ClassVar[Literal["static", "dynamic"]] = "dynamic" + + def __init__( + self, + swebench_namespace: str = "swebench", + image_registry: str | None = None, + pass_threshold: float = 1.0, + timeout: float = 1800.0, + ): + super().__init__(timeout=timeout, pass_threshold=pass_threshold) + self.swebench_namespace = swebench_namespace + self.image_registry = image_registry + + @property + def name(self) -> str: + return "swe_test" + + def format_feedback(self, result: VerifierResult) -> str: + if result.status != VerifierStatus.OK: + error = result.details.get("error", "unknown error") + return f"[{self.name}] {result.status.value.upper()}: {error}" + if result.passed: + return f"[{self.name}] PASSED: issue resolved" + lines = [f"[{self.name}] FAILED (score: {result.score:.2f})"] + details = result.details + if not details.get("patch_successfully_applied"): + lines.append(" The patch could not be applied to the repository.") + return "\n".join(lines) + tests_status = details.get("tests_status") or {} + failed_tests = [ + name for name, status in tests_status.items() + if status in ("FAILED", "ERROR") + ] + if failed_tests: + lines.append(f" Failing tests ({len(failed_tests)}):") + for t in failed_tests[:20]: + lines.append(f" - {t}") + if len(failed_tests) > 20: + lines.append(f" ... 
and {len(failed_tests) - 20} more") + stdout = result.stdout or "" + if stdout and len(stdout) < 4000: + lines.append("\n Test output:\n " + stdout.replace("\n", "\n ")) + elif stdout: + tail = stdout[-3000:] + lines.append("\n Test output (last 3000 chars):\n " + tail.replace("\n", "\n ")) + return "\n".join(lines) + + async def verify(self, ctx: PatchContext) -> VerifierResult: + instance_data = ctx.metadata["instance_data"] + run_id = ctx.metadata["run_id"] + runner: InstanceRunner = ctx.metadata["runner"] + + instance_id = ctx.task_id + model_patch = ctx.patch_diff + + # Build TestSpec to get eval_script and image + test_spec = make_test_spec(instance_data, namespace=self.swebench_namespace) + image = test_spec.instance_image_key + eval_script = test_spec.eval_script + + # Rewrite image ref if using an internal registry + if self.image_registry: + _, _, image_name = image.partition("/") + if image_name.endswith(":latest"): + image_name = image_name.removesuffix(":latest") + ":v1" + else: + logger.warning( + f"Expected :latest tag for {instance_id}, got {image_name} -- using as-is" + ) + image = f"{self.image_registry}/{image_name}" + + logger.info(f"[{instance_id}] Running K8s Job with image {image}") + + job_result = runner.run_instance( + instance_id=instance_id, + run_id=run_id, + image=image, + model_patch=model_patch, + eval_script=eval_script, + ) + + if job_result.error: + return VerifierResult( + name=self.name, + status=VerifierStatus.ERROR, + score=0.0, + pass_threshold=self.pass_threshold, + details={"error": job_result.error}, + ) + + if not job_result.test_output: + error = ( + "K8s Job timed out" + if job_result.timed_out + else "No test output captured from pod logs" + ) + return VerifierResult( + name=self.name, + status=VerifierStatus.ERROR, + score=0.0, + pass_threshold=self.pass_threshold, + details={"error": error}, + ) + + prediction = { + "instance_id": instance_id, + "model_patch": model_patch, + "model_name_or_path": 
ctx.metadata.get("model_name", "unknown"), + } + grade_result = grade_instance( + test_spec=test_spec, + prediction=prediction, + test_output=job_result.test_output, + ) + + score = 1.0 if grade_result.resolved else 0.0 + status = "RESOLVED" if grade_result.resolved else "NOT RESOLVED" + logger.info(f"[{instance_id}] {status} (score={score:.1f})") + + return VerifierResult( + name=self.name, + status=VerifierStatus.OK, + score=score, + pass_threshold=self.pass_threshold, + stdout=job_result.test_output, + details={ + "resolved": grade_result.resolved, + "patch_exists": grade_result.patch_exists, + "patch_successfully_applied": grade_result.patch_successfully_applied, + "tests_status": grade_result.tests_status, + "error": grade_result.error, + }, + ) diff --git a/infra/deploy/buildconfig-ray-swe-bench.yaml b/infra/deploy/buildconfig-ray-swe-bench.yaml new file mode 100644 index 0000000..74cda4d --- /dev/null +++ b/infra/deploy/buildconfig-ray-swe-bench.yaml @@ -0,0 +1,39 @@ +--- +# ImageStream for the Ray SWE-bench worker image. +apiVersion: image.openshift.io/v1 +kind: ImageStream +metadata: + name: ray-swe-bench + namespace: code-agent +spec: + lookupPolicy: + local: true # allows pods in this namespace to reference the image by name + +--- +# BuildConfig for the Ray SWE-bench worker image. +# +# Builds from local source using the binary strategy so no git remote +# access is required from the cluster. +# +# Usage: +# oc apply -f infra/deploy/buildconfig-ray-swe-bench.yaml +# oc start-build ray-swe-bench --from-dir=. 
--follow +# +# The built image is pushed to the internal registry and available as: +# image-registry.openshift-image-registry.svc:5000/code-agent/ray-swe-bench:latest +apiVersion: build.openshift.io/v1 +kind: BuildConfig +metadata: + name: ray-swe-bench + namespace: code-agent +spec: + source: + type: Binary + strategy: + type: Docker + dockerStrategy: + dockerfilePath: infra/images/Containerfile.swe-bench-eval + output: + to: + kind: ImageStreamTag + name: ray-swe-bench:latest diff --git a/infra/images/Containerfile.swe-bench-eval b/infra/images/Containerfile.swe-bench-eval index dcf1090..c7062d4 100644 --- a/infra/images/Containerfile.swe-bench-eval +++ b/infra/images/Containerfile.swe-bench-eval @@ -1,9 +1,9 @@ # Ray worker image for SWE-bench evaluation orchestration. # # This image runs the Ray head and workers that: -# - Phase 1 head: Builds prompts using swebench's pipeline (clones repos) -# - Phase 1 workers: Call vLLM to generate patches, run AST verification -# - Phase 2 workers: Create K8s Jobs for test execution, collect results +# - (one-time Job) Build prompts using swebench's pipeline (clones repos) +# - (eval workers) Call vLLM to generate patches, run static verifiers inline, +# and dispatch K8s Jobs for dynamic verification and test execution # # Actual test execution happens in the pre-built SWE-bench container # images (from DockerHub) running as native K8s Jobs. 
@@ -37,6 +37,7 @@ RUN pip install --no-cache-dir \ # Install the code-agent evals package COPY pyproject.toml /app/pyproject.toml COPY evals/ /app/evals/ +COPY verifiers/ /app/verifiers/ RUN pip install --no-cache-dir /app # Copy agent config YAMLs to the installed package location diff --git a/pyproject.toml b/pyproject.toml index 67efa6f..9787562 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ eval = [ [tool.setuptools.packages.find] where = ["."] -include = ["evals*"] +include = ["evals*", "verifiers*"] [tool.setuptools.package-data] "evals.swe_bench" = ["agents/*.yaml"] diff --git a/run_swe_bench_phase1.sh b/run_swe_bench_eval.sh similarity index 58% rename from run_swe_bench_phase1.sh rename to run_swe_bench_eval.sh index 3a609b2..2603af7 100755 --- a/run_swe_bench_phase1.sh +++ b/run_swe_bench_eval.sh @@ -1,28 +1,39 @@ #!/bin/bash # -# Phase 1: Generate patches for SWE-bench instances. +# SWE-bench evaluation: patch generation and test execution. # -# Supports two strategies: -# - naive: Single-shot vLLM inference from pre-built prompts. -# - agent: Agentic loop using mini-swe-agent (or any agent via YAML config) -# running inside K8s Jobs with SWE-bench container images. +# Supports two strategies, both of which support single-shot and multi-turn: +# - naive: vLLM inference from pre-built prompts, followed by K8s test +# execution. Set MAX_TURNS > 1 for multi-turn with feedback. +# - agent: Agentic loop (mini-swe-agent or custom) inside K8s Jobs. +# Set MAX_TURNS > 1 for multi-turn with feedback between runs. +# +# All strategies produce results.json with resolve rate and per-instance details. 
# # Prerequisites: -# - RayCluster deployed +# - RayCluster deployed (see evals/swe_bench/deploy/) +# - RBAC applied: oc apply -f evals/swe_bench/deploy/rbac.yaml # - vLLM server deployed and accessible from the cluster # - Port-forward active: oc port-forward svc/ 8265:8265 -# - For naive: prompted dataset built -# - For agent: SWE-bench images available (DockerHub or internal registry) +# - For naive strategy: prompted dataset built (see job-build-prompts.yaml) +# - SWE-bench images available (DockerHub or internal registry) # # Usage: -# # Naive strategy (default) -# bash run_swe_bench_phase1.sh +# # Naive strategy, single-shot (default) +# bash run_swe_bench_eval.sh +# +# # Naive strategy, multi-turn (3 attempts, AST check between turns) +# MAX_TURNS=3 INTERMEDIATE_VERIFIERS="ast_check" bash run_swe_bench_eval.sh # -# # Agent strategy -# STRATEGY=agent bash run_swe_bench_phase1.sh +# # Agent strategy, single-shot +# STRATEGY=agent bash run_swe_bench_eval.sh +# +# # Agent strategy, multi-turn +# STRATEGY=agent MAX_TURNS=3 INTERMEDIATE_VERIFIERS="ast_check" \ +# bash run_swe_bench_eval.sh # # # Quick test with 2 instances -# STRATEGY=agent INSTANCE_LIMIT=2 bash run_swe_bench_phase1.sh +# INSTANCE_LIMIT=2 bash run_swe_bench_eval.sh set -euo pipefail @@ -38,9 +49,14 @@ INSTANCE_LIMIT="${INSTANCE_LIMIT:-0}" RUN_ID="${RUN_ID:-eval-run}" OUTPUT_DIR="${OUTPUT_DIR:-/tmp/swe-bench-results/${RUN_ID}}" S3_BUCKET="${S3_BUCKET:-swe-bench}" -S3_OUTPUT="${S3_OUTPUT:-s3://${S3_BUCKET}/runs/${RUN_ID}/predictions.jsonl}" +S3_OUTPUT="${S3_OUTPUT:-s3://${S3_BUCKET}/runs/${RUN_ID}/results.json}" MLFLOW_TRACKING_URI="${MLFLOW_TRACKING_URI:-}" +# ── Multi-turn config (applies to both strategies) ───────────── +MAX_TURNS="${MAX_TURNS:-1}" +INTERMEDIATE_VERIFIERS="${INTERMEDIATE_VERIFIERS:-}" # space-separated, e.g. 
"ast_check" +AGGREGATOR="${AGGREGATOR:-mean}" + # ── Naive strategy config ────────────────────────────────────── PROMPTS="${PROMPTS:-s3://${S3_BUCKET}/verified/prompts/style-3-oracle.jsonl}" MAX_TOKENS="${MAX_TOKENS:-16000}" @@ -51,13 +67,15 @@ AGENT_CONFIG="${AGENT_CONFIG:-evals/swe_bench/agents/mini_swe_agent.yaml}" MODEL_API_KEY="${MODEL_API_KEY:-dummy}" STEP_LIMIT="${STEP_LIMIT:-150}" COST_LIMIT="${COST_LIMIT:-3.0}" +RUN_EVAL="${RUN_EVAL:-1}" + +# ── Shared K8s / registry config ─────────────────────────────── K8S_NAMESPACE="${K8S_NAMESPACE:-}" SERVICE_ACCOUNT="${SERVICE_ACCOUNT:-swe-bench-eval}" IMAGE_REGISTRY="${IMAGE_REGISTRY:-}" SWEBENCH_NAMESPACE="${SWEBENCH_NAMESPACE:-swebench}" MAX_CONCURRENT_JOBS="${MAX_CONCURRENT_JOBS:-4}" -JOB_TIMEOUT="${JOB_TIMEOUT:-600}" -RUN_EVAL="${RUN_EVAL:-1}" +JOB_TIMEOUT="${JOB_TIMEOUT:-1800}" if [[ "${DEBUG:-0}" == "1" ]]; then set -x @@ -76,8 +94,17 @@ CMD_ARGS=( --instance-limit "${INSTANCE_LIMIT}" --s3-output "${S3_OUTPUT}" --run-id "${RUN_ID}" + --max-turns "${MAX_TURNS}" + --aggregator "${AGGREGATOR}" ) +# Append each intermediate verifier as a separate arg +if [[ -n "${INTERMEDIATE_VERIFIERS}" ]]; then + for v in ${INTERMEDIATE_VERIFIERS}; do + CMD_ARGS+=(--intermediate-verifiers "${v}") + done +fi + if [[ "${STRATEGY}" == "naive" ]]; then CMD_ARGS+=( --prompts "${PROMPTS}" @@ -95,14 +122,27 @@ elif [[ "${STRATEGY}" == "agent" ]]; then --max-concurrent-jobs "${MAX_CONCURRENT_JOBS}" --job-timeout "${JOB_TIMEOUT}" ) + if [[ "${RUN_EVAL}" == "0" ]]; then + CMD_ARGS+=(--skip-eval) + fi +fi + +# K8s and registry args (needed for agent and multi-turn naive) +if [[ "${STRATEGY}" == "agent" ]] || [[ "${MAX_TURNS}" -gt 1 ]]; then if [[ -n "${K8S_NAMESPACE}" ]]; then CMD_ARGS+=(--k8s-namespace "${K8S_NAMESPACE}") fi if [[ -n "${IMAGE_REGISTRY}" ]]; then CMD_ARGS+=(--image-registry "${IMAGE_REGISTRY}") fi - if [[ "${RUN_EVAL}" == "0" ]]; then - CMD_ARGS+=(--skip-eval) + if [[ "${STRATEGY}" == "naive" ]]; then + # naive multi-turn 
also needs K8s args for SWEBenchUnitTestVerifier + CMD_ARGS+=( + --service-account "${SERVICE_ACCOUNT}" + --swebench-namespace "${SWEBENCH_NAMESPACE}" + --max-concurrent-jobs "${MAX_CONCURRENT_JOBS}" + --job-timeout "${JOB_TIMEOUT}" + ) fi fi @@ -112,7 +152,7 @@ if [[ -n "${MLFLOW_TRACKING_URI}" ]]; then ENV_ARGS+=(--runtime-env-json "{\"env_vars\": {\"MLFLOW_TRACKING_URI\": \"${MLFLOW_TRACKING_URI}\"}}") fi -echo "Strategy: ${STRATEGY}" +echo "Strategy: ${STRATEGY}, max_turns: ${MAX_TURNS}" echo "Running: ray job submit -- ${CMD_ARGS[*]}" ray job submit \ diff --git a/run_swe_bench_phase2.sh b/run_swe_bench_phase2.sh deleted file mode 100755 index 61956ea..0000000 --- a/run_swe_bench_phase2.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/bash -# -# Phase 2: Run SWE-bench test execution via K8s Jobs. -# -# Reads predictions from S3/MinIO (output of Phase 1), distributes test -# execution across Ray workers. Each worker creates K8s Jobs using -# pre-built SWE-bench container images. Results are graded, aggregated, -# and uploaded to S3/MinIO. 
-# -# Prerequisites: -# - RayCluster deployed: oc apply -f evals/swe_bench/deploy/raycluster-test-exec.yaml -# - RBAC applied: oc apply -f evals/swe_bench/deploy/rbac.yaml -# - predictions.jsonl from Phase 1 (in S3) -# - MinIO credentials secret (same as Phase 1) -# - Port-forward active: oc port-forward svc/swe-bench-test-exec-head-svc 8265:8265 -# - (optional) Images mirrored to internal registry: -# oc apply -f evals/swe_bench/deploy/job-mirror-images.yaml -# -# Usage: -# bash run_swe_bench_phase2.sh -# -# Quick test with 16 instances: -# INSTANCE_LIMIT=16 bash run_swe_bench_phase2.sh -# -# Verify harness with gold patches (skips Phase 1 entirely): -# PREDICTIONS=gold INSTANCE_LIMIT=16 RUN_ID=gold-test bash run_swe_bench_phase2.sh -# -# Use internal registry (after mirroring images): -# IMAGE_REGISTRY=image-registry.openshift-image-registry.svc:5000/code-agent \ -# bash run_swe_bench_phase2.sh - -set -euo pipefail - -# ── Configurable ──────────────────────────────────────────────── -RAY_ADDRESS="${RAY_ADDRESS:-http://127.0.0.1:8265}" -DATASET="${DATASET:-SWE-bench/SWE-bench_Verified}" -RUN_ID="${RUN_ID:-eval-run}" -OUTPUT_DIR="${OUTPUT_DIR:-/tmp/swe-bench-results/${RUN_ID}}" -NUM_WORKERS="${NUM_WORKERS:-4}" -MAX_CONCURRENT_JOBS="${MAX_CONCURRENT_JOBS:-4}" -K8S_NAMESPACE="${K8S_NAMESPACE:-}" -SERVICE_ACCOUNT="${SERVICE_ACCOUNT:-swe-bench-eval}" -TIMEOUT="${TIMEOUT:-1800}" -INSTANCE_LIMIT="${INSTANCE_LIMIT:-0}" -IMAGE_REGISTRY="${IMAGE_REGISTRY:-}" -S3_BUCKET="${S3_BUCKET:-swe-bench}" -PREDICTIONS="${PREDICTIONS:-s3://${S3_BUCKET}/runs/${RUN_ID}/predictions.jsonl}" -S3_OUTPUT="${S3_OUTPUT:-s3://${S3_BUCKET}/runs/${RUN_ID}/results.json}" -# MLflow tracking (optional). Set to the in-cluster MLflow service URL -# to enable experiment tracking. Unset to disable. -# e.g. 
MLFLOW_TRACKING_URI=http://mlflow-server:5000 -MLFLOW_TRACKING_URI="${MLFLOW_TRACKING_URI:-}" - -if [[ "${DEBUG:-0}" == "1" ]]; then - set -x -fi - -# Build command args -CMD_ARGS=( - python3 -m evals.swe_bench.run_test_execution - --predictions "${PREDICTIONS}" - --dataset "${DATASET}" - --output-dir "${OUTPUT_DIR}" - --run-id "${RUN_ID}" - --num-workers "${NUM_WORKERS}" - --max-concurrent-jobs "${MAX_CONCURRENT_JOBS}" - --service-account "${SERVICE_ACCOUNT}" - --timeout "${TIMEOUT}" - --instance-limit "${INSTANCE_LIMIT}" - --s3-output "${S3_OUTPUT}" -) - -# Only pass --k8s-namespace if explicitly set (otherwise auto-detected in-cluster) -if [[ -n "${K8S_NAMESPACE}" ]]; then - CMD_ARGS+=(--k8s-namespace "${K8S_NAMESPACE}") -fi - -# Use internal registry instead of DockerHub when set -if [[ -n "${IMAGE_REGISTRY}" ]]; then - CMD_ARGS+=(--image-registry "${IMAGE_REGISTRY}") -fi - -# Pass MLflow tracking URI via Ray runtime env so workers pick it up -ENV_ARGS=() -if [[ -n "${MLFLOW_TRACKING_URI}" ]]; then - ENV_ARGS+=(--runtime-env-json "{\"env_vars\": {\"MLFLOW_TRACKING_URI\": \"${MLFLOW_TRACKING_URI}\"}}") -fi - -ray job submit \ - --address="${RAY_ADDRESS}" \ - "${ENV_ARGS[@]}" \ - -- "${CMD_ARGS[@]}" diff --git a/verifiers/__init__.py b/verifiers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/verifiers/ast_check.py b/verifiers/ast_check.py index f02fd18..f161ef3 100644 --- a/verifiers/ast_check.py +++ b/verifiers/ast_check.py @@ -1,53 +1,134 @@ """ -AST validity verifier. +AST validity verifier (static). -Checks that all changed Python files parse successfully. -This is the cheapest possible check — catches syntax errors -that small models produce more frequently than frontier models. +Checks that all changed Python files introduced by the patch parse successfully. +Works entirely from the patch diff — no repo checkout required. 
+ +For each changed .py file, new content is reconstructed by taking the context +lines (unchanged) and added lines (+) from the unified diff. This is a heuristic: +deletions create gaps, so the reconstructed content may differ from the true new +file. However, syntax errors introduced by the model (missing colons, unclosed +brackets, bad indentation in new code) will reliably surface. """ from __future__ import annotations import ast -from typing import Any +import re +from typing import Any, ClassVar, Literal from .base import BaseVerifier, PatchContext, VerifierResult, VerifierStatus +def _extract_new_content(patch_diff: str, filepath: str) -> str | None: + """Reconstruct the new content of a file from a unified diff. + + Collects context lines (prefix ' ') and added lines (prefix '+'), + skipping hunk headers and deleted lines. + + Returns None if the file is not found in the diff (e.g. pure deletion). + """ + lines = patch_diff.splitlines() + in_file = False + in_hunk = False + new_lines: list[str] = [] + found = False + + for line in lines: + # Detect file header (unified diff: +++ b/path or +++ path) + if line.startswith("+++ "): + candidate = line[4:] + if candidate.startswith("b/"): + candidate = candidate[2:] + # Match by suffix to handle path prefixes + in_file = candidate == filepath or candidate.endswith("/" + filepath) + if in_file: + found = True + new_lines = [] + in_hunk = False + continue + + if line.startswith("--- "): + continue + + if in_file: + if line.startswith("@@"): + in_hunk = True + continue + if not in_hunk: + continue + if line.startswith("+") and not line.startswith("+++"): + new_lines.append(line[1:]) # added line + elif line.startswith(" "): + new_lines.append(line[1:]) # context line + elif line.startswith("-"): + pass # deleted line — skip + elif line.startswith("\\"): + pass # "No newline at end of file" + else: + # Next file section begins + in_file = False + in_hunk = False + + if not found: + return None + return 
"\n".join(new_lines) + + class ASTCheckVerifier(BaseVerifier): + """Static verifier: checks Python syntax of all changed files in the patch.""" + + execution_mode: ClassVar[Literal["static", "dynamic"]] = "static" + @property def name(self) -> str: return "ast_check" + def format_feedback(self, result: VerifierResult) -> str: + if result.status != VerifierStatus.OK: + return f"[{self.name}] {result.status.value.upper()}" + if result.passed: + return f"[{self.name}] PASSED: no syntax errors detected" + errors = result.details.get("errors", []) + lines = [f"[{self.name}] FAILED: syntax errors in the following files:"] + for e in errors: + loc = f"line {e['line']}" if e.get("line") else "" + offset = f", offset {e['offset']}" if e.get("offset") else "" + msg = e.get("message", "syntax error") + lines.append(f" - {e.get('file', '?')}: {loc}{offset}: {msg}") + return "\n".join(lines) + async def verify(self, ctx: PatchContext) -> VerifierResult: python_files = [f for f in ctx.changed_files if f.endswith(".py")] - #e.g. changed files are md files or JS files - patch is okay as far as py is concerned if not python_files: return VerifierResult( name=self.name, - status=VerifierStatus.PASS, + status=VerifierStatus.SKIPPED, score=1.0, + pass_threshold=self.pass_threshold, details={"message": "No Python files changed"}, ) errors: list[dict[str, Any]] = [] - parsed, deleted = 0, 0 + parsed = 0 + skipped = 0 # files not present in the diff (e.g. 
pure deletions) - #py files only - three possible states - #deleted by patch - #ast parsing successful (parsed += 1) - #ast parsing unsuccessful (errors) for filepath in python_files: - full_path = ctx.repo_path / filepath - if not full_path.exists(): - # File was deleted by patch — that's fine - deleted += 1 + content = _extract_new_content(ctx.patch_diff, filepath) + + if content is None: + # File was deleted by the patch — nothing to parse + skipped += 1 + continue + + if not content.strip(): + # Empty new content (file fully cleared) — counts as parseable + parsed += 1 continue try: - source = full_path.read_text(encoding="utf-8", errors="replace") - ast.parse(source, filename=filepath) + ast.parse(content, filename=filepath) parsed += 1 except SyntaxError as e: errors.append({ @@ -57,24 +138,20 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: "message": e.msg, }) - assert deleted + parsed + len(errors) == len(python_files) - - status = VerifierStatus.FAIL if errors else VerifierStatus.PASS - - #score computation - parsed_or_errored = parsed + len(errors) #can be zero if all py files deleted - score = parsed / parsed_or_errored if parsed_or_errored > 0 else 1.0 #else if all files deleted + checkable = parsed + len(errors) + score = parsed / checkable if checkable > 0 else 1.0 return VerifierResult( name=self.name, - status=status, + status=VerifierStatus.OK, score=score, + pass_threshold=self.pass_threshold, details={ - "errors": errors, - "files_total": len(python_files), - "files_checked": parsed_or_errored, - "files_parsed": parsed, - "files_deleted": deleted, - "files_errored": len(errors) - }, - ) \ No newline at end of file + "errors": errors, + "files_total": len(python_files), + "files_checkable": checkable, + "files_parsed": parsed, + "files_skipped": skipped, + "files_errored": len(errors), + }, + ) diff --git a/verifiers/base.py b/verifiers/base.py index bc90230..d7de5b0 100644 --- a/verifiers/base.py +++ b/verifiers/base.py @@ -3,6 
+3,9 @@ All verifiers implement this interface. A verifier takes a patch and repo context, runs a check, and returns a VerifierResult with a score and metadata. + +VerifierStatus only signals execution errors (ERROR, TIMEOUT, SKIPPED). +Pass/fail semantics are derived from score >= pass_threshold, not from status. """ from __future__ import annotations @@ -12,15 +15,14 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Any +from typing import Any, ClassVar, Literal class VerifierStatus(Enum): - PASS = "pass" - FAIL = "fail" - ERROR = "error" # verifier itself crashed - TIMEOUT = "timeout" - SKIPPED = "skipped" # skipped due to early exit in prior stage + OK = "ok" # ran successfully; consult score for pass/fail + ERROR = "error" # verifier itself crashed + TIMEOUT = "timeout" # verifier timed out + SKIPPED = "skipped" # not applicable (e.g., no relevant files changed) @dataclass @@ -29,6 +31,7 @@ class VerifierResult: name: str status: VerifierStatus score: float # normalized to [0.0, 1.0] + pass_threshold: float = 1.0 # score must meet this to be considered passing wall_clock_seconds: float = 0.0 details: dict[str, Any] = field(default_factory=dict) # Raw output for debugging @@ -37,33 +40,61 @@ class VerifierResult: @property def passed(self) -> bool: - return self.status == VerifierStatus.PASS + """True only when the verifier ran successfully and score >= pass_threshold.""" + return self.status == VerifierStatus.OK and self.score >= self.pass_threshold def __repr__(self) -> str: return ( f"VerifierResult(name={self.name!r}, status={self.status.value}, " - f"score={self.score:.3f}, time={self.wall_clock_seconds:.1f}s)" + f"score={self.score:.3f}, threshold={self.pass_threshold:.3f}, " + f"passed={self.passed}, time={self.wall_clock_seconds:.1f}s)" ) @dataclass class PatchContext: """Everything a verifier needs to check a patch.""" - repo_path: Path # path to the repo with patch applied - patch_diff: str 
# the raw unified diff - changed_files: list[str] # list of files modified by the patch - task_id: str # SWE-bench task identifier - test_cmd: str | None = None # repo-specific test command - ground_truth_patch: str | None = None # for differential comparison + patch_diff: str # the raw unified diff + changed_files: list[str] # list of files modified by the patch + task_id: str # benchmark task identifier + repo_path: Path | None = None # path to the repo with patch applied (dynamic verifiers) + test_cmd: str | None = None # repo-specific test command + ground_truth_patch: str | None = None # for differential comparison metadata: dict[str, Any] = field(default_factory=dict) class BaseVerifier(ABC): - """Abstract base class for all verifiers.""" - - def __init__(self, config: dict[str, Any] | None = None, timeout: float = 60.0): + """Abstract base class for all verifiers. + + Subclasses must define: + - execution_mode (ClassVar): "static" or "dynamic" + static — runs inline in the calling process; works from patch_diff alone + dynamic — requires an isolated execution environment (e.g. K8s Job) + - name (property): unique verifier identifier + - verify (coroutine): core verification logic + """ + + execution_mode: ClassVar[Literal["static", "dynamic"]] + + def __init_subclass__(cls, **kwargs: Any) -> None: + super().__init_subclass__(**kwargs) + # Only enforce on concrete (non-abstract) subclasses + if not getattr(cls, "__abstractmethods__", None): + if not hasattr(cls, "execution_mode"): + raise TypeError( + f"{cls.__name__} must define an 'execution_mode' class attribute " + f"(Literal['static', 'dynamic'])" + ) + + def __init__( + self, + config: dict[str, Any] | None = None, + timeout: float = 60.0, + pass_threshold: float = 1.0, + ): self.config = config or {} self.timeout = timeout + self.pass_threshold = pass_threshold @property @abstractmethod @@ -77,13 +108,26 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: Run the verification check. 
Args: - ctx: The patch context containing repo path, diff, etc. + ctx: The patch context. Static verifiers use patch_diff/changed_files. + Dynamic verifiers may also use repo_path and ctx.metadata. Returns: - VerifierResult with status, score, and details. + VerifierResult with status OK and a score, or ERROR/TIMEOUT/SKIPPED. """ ... + def format_feedback(self, result: VerifierResult) -> str: + """Convert a VerifierResult into a human-readable feedback string. + + Called by the multi-turn loop to build the feedback message shown to + the model before its next attempt. Subclasses override to provide + verifier-specific detail (error locations, failing tests, etc.). + """ + if result.status != VerifierStatus.OK: + return f"[{self.name}] {result.status.value.upper()}" + status_label = "PASSED" if result.passed else "FAILED" + return f"[{self.name}] {status_label} (score: {result.score:.2f})" + async def safe_verify(self, ctx: PatchContext) -> VerifierResult: """ Run verify() with timeout and error handling. @@ -104,6 +148,7 @@ async def safe_verify(self, ctx: PatchContext) -> VerifierResult: name=self.name, status=VerifierStatus.TIMEOUT, score=0.0, + pass_threshold=self.pass_threshold, wall_clock_seconds=time.monotonic() - start, details={"timeout": self.timeout}, ) @@ -112,6 +157,7 @@ async def safe_verify(self, ctx: PatchContext) -> VerifierResult: name=self.name, status=VerifierStatus.ERROR, score=0.0, + pass_threshold=self.pass_threshold, wall_clock_seconds=time.monotonic() - start, details={"error": str(e), "error_type": type(e).__name__}, ) From 6887cda8c8c1bd8f4c217bbfbd773b4e2c7c6ab5 Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Thu, 21 May 2026 14:21:34 +0200 Subject: [PATCH 2/9] Fixing issues related to the multi-turn execution feature. 
--- evals/common/s3_storage.py | 28 ++++ .../deploy/raycluster-patch-gen.yaml | 4 +- evals/swe_bench/multi_turn_worker.py | 2 + evals/swe_bench/run_patch_generation.py | 21 ++- verifiers/ast_check.py | 158 +++++++++++++++--- 5 files changed, 189 insertions(+), 24 deletions(-) diff --git a/evals/common/s3_storage.py b/evals/common/s3_storage.py index 610a995..b272c23 100644 --- a/evals/common/s3_storage.py +++ b/evals/common/s3_storage.py @@ -54,6 +54,34 @@ def upload_file(local_path: str | Path, s3_uri: str) -> None: logger.info(f"Upload complete") +def upload_dir(local_dir: str | Path, s3_prefix: str) -> None: + """Upload all files in a directory tree to S3/MinIO. + + Skips files that are not inside a per-instance subdirectory (e.g. + prompted_dataset.jsonl, predictions.jsonl) to avoid re-uploading + bulk files. Only files one level deep (instance_id/filename) are + uploaded. + + Args: + local_dir: Local directory containing per-instance subdirectories. + s3_prefix: S3 URI prefix (e.g. s3://bucket/runs/my-run). + """ + bucket, prefix = parse_s3_uri(s3_prefix) + s3 = _get_s3_client() + local_dir = Path(local_dir) + uploaded = 0 + for instance_dir in local_dir.iterdir(): + if not instance_dir.is_dir(): + continue + for file in instance_dir.iterdir(): + if not file.is_file(): + continue + key = f"{prefix}/{instance_dir.name}/{file.name}" + s3.upload_file(str(file), bucket, key) + uploaded += 1 + logger.info(f"Uploaded {uploaded} per-instance files to s3://{bucket}/{prefix}/") + + def download_file(s3_uri: str, local_path: str | Path) -> None: """Download a file from S3/MinIO to a local path. 
diff --git a/evals/swe_bench/deploy/raycluster-patch-gen.yaml b/evals/swe_bench/deploy/raycluster-patch-gen.yaml index ca7b146..1d68274 100644 --- a/evals/swe_bench/deploy/raycluster-patch-gen.yaml +++ b/evals/swe_bench/deploy/raycluster-patch-gen.yaml @@ -117,10 +117,10 @@ spec: resources: requests: cpu: "2" - memory: "4Gi" + memory: "8Gi" limits: cpu: "2" - memory: "4Gi" + memory: "8Gi" volumeMounts: - name: ray-tmp mountPath: /tmp diff --git a/evals/swe_bench/multi_turn_worker.py b/evals/swe_bench/multi_turn_worker.py index eb9f0a6..2c65eb7 100644 --- a/evals/swe_bench/multi_turn_worker.py +++ b/evals/swe_bench/multi_turn_worker.py @@ -336,6 +336,8 @@ def _evaluate_instance( "turns": [ { "turn": t.turn, + "output": t.output, + "feedback": t.feedback, "aggregate_score": t.aggregate_score, "verifier_results": [ { diff --git a/evals/swe_bench/run_patch_generation.py b/evals/swe_bench/run_patch_generation.py index 7bcf65f..c5c94a2 100644 --- a/evals/swe_bench/run_patch_generation.py +++ b/evals/swe_bench/run_patch_generation.py @@ -118,6 +118,7 @@ def save_prediction(output_dir: Path, result: dict) -> None: Each instance gets a directory with: - prediction.json (the prediction) - report.json (eval grading result) + - multi_turn.json (per-turn patches, feedback, and scores; multi-turn only) - pod_logs.txt (full pod stdout, agent strategy only) - .traj.json (agent trajectory, if available) """ @@ -141,6 +142,12 @@ def save_prediction(output_dir: Path, result: dict) -> None: json.dumps(report_data, indent=2) ) + # multi_turn.json (multi-turn runs only) + if result.get("multi_turn"): + (instance_dir / "multi_turn.json").write_text( + json.dumps(result["multi_turn"], indent=2) + ) + # pod_logs.txt (agent strategy only) if result.get("full_logs"): (instance_dir / "pod_logs.txt").write_text(result["full_logs"]) @@ -187,12 +194,24 @@ def _write_merged_predictions(output_dir: Path, dataset: list) -> Path: def _upload_to_s3(local_path: Path, s3_uri: str | None) -> None: - 
"""Upload predictions file to S3 if an S3 URI was provided.""" + """Upload the predictions file and all per-instance directories to S3. + + The predictions.jsonl goes to s3_uri directly. Per-instance directories + are uploaded alongside it under the same S3 prefix. + """ if not s3_uri: logger.info("No --s3-output specified, skipping S3 upload") return + + from evals.common.s3_storage import upload_dir + upload_file(local_path, s3_uri) + # Upload per-instance dirs to the same prefix as the predictions file + s3_prefix = s3_uri.rsplit("/", 1)[0] + output_dir = local_path.parent + upload_dir(output_dir, s3_prefix) + # ── Verifier set builder (shared across strategies) ───────────────── diff --git a/verifiers/ast_check.py b/verifiers/ast_check.py index f161ef3..57560cb 100644 --- a/verifiers/ast_check.py +++ b/verifiers/ast_check.py @@ -9,29 +9,55 @@ deletions create gaps, so the reconstructed content may differ from the true new file. However, syntax errors introduced by the model (missing colons, unclosed brackets, bad indentation in new code) will reliably surface. + +Parsing strategy (applied per file, in order): + 1. Try ast.parse() on the raw reconstructed fragment. + 2. If that fails with an indentation error on line 1 (fragment starts mid-block), + apply textwrap.dedent() and retry. + 3. If that still fails with an indentation error on line 1 (mixed indentation + levels after dedent), wrap in "async def _():\\n if True:\\n" and retry. + 4. A failure at any stage with a non-indentation error, or a failure of the + wrapped version, is reported as a real syntax error. + +Line numbers in reported errors are adjusted to refer to the reconstructed +fragment, not to any wrapper lines added during parsing. + +Error feedback includes a short code snippet showing the lines around the error, +with '+' prefixes on added lines and ' ' prefixes on context lines. 
""" from __future__ import annotations import ast -import re +import textwrap from typing import Any, ClassVar, Literal from .base import BaseVerifier, PatchContext, VerifierResult, VerifierStatus +# Number of lines prepended by the wrapper in step 3. +_WRAPPER = "async def _():\n if True:\n" +_WRAPPER_LINES = _WRAPPER.count("\n") +_WRAPPER_INDENT = " " # 8 spaces (2 levels of 4) + +# Lines of context to show before/after the error line in feedback. +_SNIPPET_BEFORE = 2 +_SNIPPET_AFTER = 1 -def _extract_new_content(patch_diff: str, filepath: str) -> str | None: + +def _extract_new_content(patch_diff: str, filepath: str) -> list[tuple[str, str]] | None: """Reconstruct the new content of a file from a unified diff. Collects context lines (prefix ' ') and added lines (prefix '+'), skipping hunk headers and deleted lines. - Returns None if the file is not found in the diff (e.g. pure deletion). + Returns a list of (prefix, text) tuples where prefix is '+' (added line) + or ' ' (context line), or None if the file is not found in the diff + (e.g. pure deletion). 
""" lines = patch_diff.splitlines() in_file = False in_hunk = False - new_lines: list[str] = [] + line_records: list[tuple[str, str]] = [] found = False for line in lines: @@ -44,7 +70,7 @@ def _extract_new_content(patch_diff: str, filepath: str) -> str | None: in_file = candidate == filepath or candidate.endswith("/" + filepath) if in_file: found = True - new_lines = [] + line_records = [] in_hunk = False continue @@ -58,13 +84,13 @@ def _extract_new_content(patch_diff: str, filepath: str) -> str | None: if not in_hunk: continue if line.startswith("+") and not line.startswith("+++"): - new_lines.append(line[1:]) # added line + line_records.append(("+", line[1:])) # added line elif line.startswith(" "): - new_lines.append(line[1:]) # context line + line_records.append((" ", line[1:])) # context line elif line.startswith("-"): - pass # deleted line — skip + pass # deleted line — skip elif line.startswith("\\"): - pass # "No newline at end of file" + pass # "No newline at end of file" else: # Next file section begins in_file = False @@ -72,7 +98,99 @@ def _extract_new_content(patch_diff: str, filepath: str) -> str | None: if not found: return None - return "\n".join(new_lines) + return line_records + + +def _records_to_content(line_records: list[tuple[str, str]]) -> str: + """Join line records into a plain content string for ast.parse().""" + return "\n".join(text for _, text in line_records) + + +def _is_indentation_error_on_line_1(exc: SyntaxError) -> bool: + """True if the exception is an indentation problem on the very first line.""" + if exc.lineno != 1: + return False + if isinstance(exc, IndentationError): + return True + msg = exc.msg.lower() + return "indent" in msg + + +def _try_parse(content: str, filename: str) -> SyntaxError | None: + """Attempt ast.parse(); return the SyntaxError or None on success.""" + try: + ast.parse(content, filename=filename) + return None + except SyntaxError as e: + return e + + +def _build_snippet(line_records: list[tuple[str, 
str]], error_lineno: int | None) -> str | None: + """Build a short code snippet around the error line for display in feedback. + + Line numbers are 1-based and refer to the reconstructed fragment. + Returns None if error_lineno is None or out of range. + """ + if error_lineno is None or not (1 <= error_lineno <= len(line_records)): + return None + + start = max(0, error_lineno - 1 - _SNIPPET_BEFORE) + end = min(len(line_records), error_lineno + _SNIPPET_AFTER) + + snippet_lines = [] + for i in range(start, end): + lineno = i + 1 + prefix, text = line_records[i] + marker = "-->" if lineno == error_lineno else " " + snippet_lines.append(f" {marker} {lineno:4d} {prefix} {text}") + + return "\n".join(snippet_lines) + + +def _parse_fragment( + line_records: list[tuple[str, str]], + filepath: str, +) -> dict[str, Any] | None: + """Parse a reconstructed diff fragment using the multi-stage strategy. + + Returns an error dict {file, line, offset, message, snippet} if a real + syntax error is found, or None if the fragment parses successfully at + some stage. 
+ """ + content = _records_to_content(line_records) + + def _make_error(lineno: int | None, offset: int | None, msg: str) -> dict[str, Any]: + return { + "file": filepath, + "line": lineno, + "offset": offset, + "message": msg, + "snippet": _build_snippet(line_records, lineno), + } + + # Stage 1: raw content + err = _try_parse(content, filepath) + if err is None: + return None + if not _is_indentation_error_on_line_1(err): + return _make_error(err.lineno, err.offset, err.msg) + + # Stage 2: dedented content + dedented = textwrap.dedent(content) + err = _try_parse(dedented, filepath) + if err is None: + return None + if not _is_indentation_error_on_line_1(err): + return _make_error(err.lineno, err.offset, err.msg) + + # Stage 3: wrap in "async def _():\n if True:\n" + wrapped = _WRAPPER + textwrap.indent(dedented, _WRAPPER_INDENT) + err = _try_parse(wrapped, filepath) + if err is None: + return None + # Adjust line number back to the fragment (subtract wrapper lines) + lineno = err.lineno - _WRAPPER_LINES if err.lineno is not None else None + return _make_error(lineno, err.offset, err.msg) class ASTCheckVerifier(BaseVerifier): @@ -96,6 +214,8 @@ def format_feedback(self, result: VerifierResult) -> str: offset = f", offset {e['offset']}" if e.get("offset") else "" msg = e.get("message", "syntax error") lines.append(f" - {e.get('file', '?')}: {loc}{offset}: {msg}") + if e.get("snippet"): + lines.append(e["snippet"]) return "\n".join(lines) async def verify(self, ctx: PatchContext) -> VerifierResult: @@ -115,28 +235,24 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: skipped = 0 # files not present in the diff (e.g. 
pure deletions) for filepath in python_files: - content = _extract_new_content(ctx.patch_diff, filepath) + line_records = _extract_new_content(ctx.patch_diff, filepath) - if content is None: + if line_records is None: # File was deleted by the patch — nothing to parse skipped += 1 continue + content = _records_to_content(line_records) if not content.strip(): # Empty new content (file fully cleared) — counts as parseable parsed += 1 continue - try: - ast.parse(content, filename=filepath) + error = _parse_fragment(line_records, filepath) + if error is None: parsed += 1 - except SyntaxError as e: - errors.append({ - "file": filepath, - "line": e.lineno, - "offset": e.offset, - "message": e.msg, - }) + else: + errors.append(error) checkable = parsed + len(errors) score = parsed / checkable if checkable > 0 else 1.0 From 02c7cfcc13ff0aacc160db437aab91b7e798e441 Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Fri, 22 May 2026 11:33:56 +0200 Subject: [PATCH 3/9] Fixed a bug in propagating the custom agent job timeout. --- evals/swe_bench/run_patch_generation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evals/swe_bench/run_patch_generation.py b/evals/swe_bench/run_patch_generation.py index c5c94a2..c9862af 100644 --- a/evals/swe_bench/run_patch_generation.py +++ b/evals/swe_bench/run_patch_generation.py @@ -321,6 +321,8 @@ def _run_agent(args, pending: list, output_dir: Path) -> list[dict]: from evals.swe_bench.agent_config import load_agent_config agent_config = load_agent_config(args.agent_config) + if args.job_timeout > 0: + agent_config.job_timeout = args.job_timeout logger.info(f"Agent: {agent_config.name}") subset = _resolve_subset_name(args.dataset) From ea52bf29fedd8ab81116a5bae51927f79d257091 Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Fri, 22 May 2026 12:05:56 +0200 Subject: [PATCH 4/9] Switched to building from the repo instead of local sources. 
--- infra/deploy/buildconfig-ray-swe-bench.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/infra/deploy/buildconfig-ray-swe-bench.yaml b/infra/deploy/buildconfig-ray-swe-bench.yaml index 74cda4d..eb630db 100644 --- a/infra/deploy/buildconfig-ray-swe-bench.yaml +++ b/infra/deploy/buildconfig-ray-swe-bench.yaml @@ -12,12 +12,11 @@ spec: --- # BuildConfig for the Ray SWE-bench worker image. # -# Builds from local source using the binary strategy so no git remote -# access is required from the cluster. +# Builds from the code-agent GitHub repository (main branch). # # Usage: # oc apply -f infra/deploy/buildconfig-ray-swe-bench.yaml -# oc start-build ray-swe-bench --from-dir=. --follow +# oc start-build ray-swe-bench --follow # # The built image is pushed to the internal registry and available as: # image-registry.openshift-image-registry.svc:5000/code-agent/ray-swe-bench:latest @@ -28,7 +27,10 @@ metadata: namespace: code-agent spec: source: - type: Binary + type: Git + git: + uri: https://github.com/redhat-et/code-agent.git + ref: main strategy: type: Docker dockerStrategy: From 2e2729eecff9ee7894ed2177aaddee2d8b805185 Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Tue, 26 May 2026 13:57:48 +0200 Subject: [PATCH 5/9] Fixed CodeRabbit CR comments. 
--- evals/common/multi_turn_worker.py | 29 ++++- evals/common/s3_storage.py | 7 +- evals/swe_bench/agent_worker.py | 14 +- evals/swe_bench/multi_turn_worker.py | 9 +- evals/swe_bench/run_patch_generation.py | 75 +++++++---- .../swe_bench/verifiers/unit_test_verifier.py | 46 +++++-- run_swe_bench_eval.sh | 13 +- verifiers/ast_check.py | 120 +++++++++++------- verifiers/base.py | 9 +- 9 files changed, 223 insertions(+), 99 deletions(-) diff --git a/evals/common/multi_turn_worker.py b/evals/common/multi_turn_worker.py index 82e7f93..7ff10c4 100644 --- a/evals/common/multi_turn_worker.py +++ b/evals/common/multi_turn_worker.py @@ -60,6 +60,7 @@ def __init__( self.max_turns = max_turns self.max_concurrent_jobs = max_concurrent_jobs + self.vllm_urls = vllm_urls self.clients = [ openai.OpenAI(base_url=url, api_key="not-needed", timeout=600.0) for url in vllm_urls @@ -73,10 +74,16 @@ def __init__( # ── vLLM (naive generation) ───────────────────────────────────────────── - def _get_client(self): - client = self.clients[self._call_count % len(self.clients)] + def _next_index(self) -> int: + idx = self._call_count % len(self.clients) self._call_count += 1 - return client + return idx + + def _get_client(self): + return self.clients[self._next_index()] + + def _get_vllm_url(self) -> str: + return self.vllm_urls[self._next_index()] def _generate_naive(self, messages: list[dict]) -> str: """Generate a response via inline vLLM inference.""" @@ -94,7 +101,7 @@ def _generate_naive(self, messages: list[dict]) -> str: def _run_verifier_set_inline(self, vset: VerifierSet, ctx) -> list[VerifierResult]: """Run all verifiers in vset concurrently (async, in-process).""" async def _run_all(): - return [await entry.verifier.safe_verify(ctx) for entry in vset.entries] + return await asyncio.gather(*(entry.verifier.safe_verify(ctx) for entry in vset.entries)) return asyncio.run(_run_all()) def _run_mixed_verifier_set(self, vset: VerifierSet, ctx) -> list[VerifierResult]: @@ -149,4 +156,16 @@ 
def evaluate_batch( run_id: str, ) -> list[dict]: """Evaluate a batch of instances sequentially.""" - return [self._evaluate_instance(inst, prompts, run_id) for inst in instances] + results: list[dict] = [] + for inst in instances: + try: + results.append(self._evaluate_instance(inst, prompts, run_id)) + except Exception as e: + instance_id = inst.get("instance_id") + logger.exception("Failed evaluating instance %s", instance_id) + results.append({ + "instance_id": instance_id, + "error": str(e), + "resolved": False, + }) + return results diff --git a/evals/common/s3_storage.py b/evals/common/s3_storage.py index b272c23..cde371f 100644 --- a/evals/common/s3_storage.py +++ b/evals/common/s3_storage.py @@ -67,17 +67,18 @@ def upload_dir(local_dir: str | Path, s3_prefix: str) -> None: s3_prefix: S3 URI prefix (e.g. s3://bucket/runs/my-run). """ bucket, prefix = parse_s3_uri(s3_prefix) + prefix = prefix.rstrip("/") s3 = _get_s3_client() local_dir = Path(local_dir) uploaded = 0 for instance_dir in local_dir.iterdir(): if not instance_dir.is_dir(): continue - for file in instance_dir.iterdir(): + for file in instance_dir.rglob("*"): if not file.is_file(): continue - key = f"{prefix}/{instance_dir.name}/{file.name}" - s3.upload_file(str(file), bucket, key) + rel = file.relative_to(local_dir).as_posix() + key = f"{prefix}/{rel}" if prefix else rel uploaded += 1 logger.info(f"Uploaded {uploaded} per-instance files to s3://{bucket}/{prefix}/") diff --git a/evals/swe_bench/agent_worker.py b/evals/swe_bench/agent_worker.py index 7593e52..29ce852 100644 --- a/evals/swe_bench/agent_worker.py +++ b/evals/swe_bench/agent_worker.py @@ -519,7 +519,7 @@ def __init__( self, agent_config_dict: dict, model_name: str, - model_base_url: str, + vllm_urls: list[str], model_api_key: str = "dummy", k8s_namespace: str | None = None, service_account: str = "swe-bench-eval", @@ -538,7 +538,10 @@ def __init__( if job_timeout > 0: self.agent_config.job_timeout = job_timeout self.model_name = 
model_name - self.model_base_url = model_base_url + if not vllm_urls: + raise ValueError("vllm_urls must contain at least one endpoint") + self._vllm_urls = vllm_urls + self._call_count = 0 self.model_api_key = model_api_key self.k8s_namespace = k8s_namespace or _detect_namespace() self.service_account = service_account @@ -553,6 +556,11 @@ def __init__( self.batch_api, self.core_api = _init_k8s() + def _get_vllm_url(self) -> str: + url = self._vllm_urls[self._call_count % len(self._vllm_urls)] + self._call_count += 1 + return url + def _generate_one(self, instance: dict, run_id: str) -> dict: """Run an agent on a single SWE-bench instance via a K8s Job. @@ -570,7 +578,7 @@ def _generate_one(self, instance: dict, run_id: str) -> dict: template_vars = { "instance_id": instance_id, "model_name": self.model_name, - "model_base_url": self.model_base_url, + "model_base_url": self._get_vllm_url(), "model_api_key": self.model_api_key, "workdir": DOCKER_WORKDIR, "problem_statement_file": "/tmp/problem_statement.txt", diff --git a/evals/swe_bench/multi_turn_worker.py b/evals/swe_bench/multi_turn_worker.py index 2c65eb7..87a2c5e 100644 --- a/evals/swe_bench/multi_turn_worker.py +++ b/evals/swe_bench/multi_turn_worker.py @@ -89,7 +89,6 @@ def __init__( self.strategy = strategy self.swebench_namespace = swebench_namespace self.image_registry = image_registry - self.vllm_urls = vllm_urls self.k8s_namespace = k8s_namespace or _detect_namespace() self.service_account = service_account @@ -187,7 +186,7 @@ def _generate_agent( template_vars = { "instance_id": instance_id, "model_name": self.model_name, - "model_base_url": self.vllm_urls[0], + "model_base_url": self._get_vllm_url(), "model_api_key": self.model_api_key, "workdir": DOCKER_WORKDIR, "problem_statement_file": "/tmp/problem_statement.txt", @@ -232,8 +231,8 @@ def _generate_agent( return prediction.get("model_patch", "") except Exception as e: - logger.error(f"[{instance_id}] Turn {turn_idx}: agent Job failed: {e}") - 
return "" + logger.exception(f"[{instance_id}] Turn {turn_idx}: agent Job failed") + raise RuntimeError(f"Agent turn {turn_idx} failed for {instance_id}") from e finally: if job_name: _delete_job(self.batch_api, job_name, self.k8s_namespace) @@ -376,7 +375,7 @@ def _error_result(instance_id: str, model_name: str, error: str) -> dict: "model_name_or_path": model_name, "error": error, "eval_report": {}, - "resolved": False, + "resolved": None, "multi_turn": {"num_turns": 0, "stopped_early": False, "final_aggregate_score": 0.0, "turns": []}, } diff --git a/evals/swe_bench/run_patch_generation.py b/evals/swe_bench/run_patch_generation.py index c9862af..607d5d7 100644 --- a/evals/swe_bench/run_patch_generation.py +++ b/evals/swe_bench/run_patch_generation.py @@ -224,21 +224,31 @@ def _build_verifier_set(args): vset = VerifierSet() intermediate_names = set(args.intermediate_verifiers or []) + final_names = set(args.final_verifiers or []) - if "ast_check" in intermediate_names: - vset.add(ASTCheckVerifier(), run_intermediate=True, run_final=False) + # If --final-verifiers is not provided, default to swe_test only + if not final_names: + final_names = {"swe_test"} + + run_ast_intermediate = "ast_check" in intermediate_names + run_ast_final = "ast_check" in final_names + if run_ast_intermediate or run_ast_final: + vset.add(ASTCheckVerifier(), + run_intermediate=run_ast_intermediate, + run_final=run_ast_final) - # SWE-bench test verifier always runs as final; optionally also intermediate run_swe_intermediate = "swe_test" in intermediate_names - vset.add( - SWEBenchUnitTestVerifier( - swebench_namespace=args.swebench_namespace, - image_registry=args.image_registry or None, - timeout=float(args.job_timeout or 1800), - ), - run_intermediate=run_swe_intermediate, - run_final=True, - ) + run_swe_final = "swe_test" in final_names + if run_swe_intermediate or run_swe_final: + vset.add( + SWEBenchUnitTestVerifier( + swebench_namespace=args.swebench_namespace, + 
image_registry=args.image_registry or None, + timeout=float(args.job_timeout or 1800), + ), + run_intermediate=run_swe_intermediate, + run_final=run_swe_final, + ) aggregator = build_aggregator(args.aggregator) return vset, aggregator @@ -381,7 +391,7 @@ def _run_agent(args, pending: list, output_dir: Path) -> list[dict]: AgentWorker.remote( agent_config_dict=asdict(agent_config), model_name=args.model_name, - model_base_url=args.vllm_url[0], + vllm_urls=args.vllm_url, model_api_key=args.model_api_key, k8s_namespace=args.k8s_namespace, service_account=args.service_account, @@ -493,9 +503,13 @@ def _parse_args() -> argparse.Namespace: help="Max generation attempts per instance " "(1 = single-shot, >1 = multi-turn with feedback)") common.add_argument("--intermediate-verifiers", type=str, nargs="*", default=[], - choices=["ast_check"], + choices=["ast_check", "swe_test"], help="Verifiers to run after each intermediate turn. " - "Options: ast_check") + "Options: ast_check, swe_test") + common.add_argument("--final-verifiers", type=str, nargs="*", default=None, + choices=["ast_check", "swe_test"], + help="Verifiers to run on the final patch. 
" + "Default: swe_test only") common.add_argument("--aggregator", type=str, default="mean", choices=["mean", "min", "weighted_sum"], help="Score aggregation strategy for multi-turn") @@ -633,12 +647,15 @@ def main(): logger.info(f" Errors: {errors}") if uses_multi_turn or agent_with_eval: - resolved = sum(1 for r in all_results if r.get("resolved") is True) - evaluated = sum(1 for r in all_results if r.get("resolved") is not None) - rate = resolved / evaluated if evaluated > 0 else 0 - logger.info(f" Evaluated: {evaluated}") - logger.info(f" Resolved: {resolved}") - logger.info(f" Resolve rate: {rate:.1%}") + evaluated = [r for r in all_results + if not r.get("error") and r.get("resolved") is not None] + resolved = [r for r in evaluated if r.get("resolved") is True] + resolve_rate = len(resolved) / len(evaluated) if evaluated else 0 + completion_rate = len(evaluated) / total if total > 0 else 0 + logger.info(f" Evaluated: {len(evaluated)}/{total}") + logger.info(f" Completion rate: {completion_rate:.1%}") + logger.info(f" Resolved: {len(resolved)}") + logger.info(f" Resolve rate: {resolve_rate:.1%}") if uses_multi_turn: turn_counts = [ @@ -667,13 +684,16 @@ def main(): def _write_aggregate_results(output_dir: Path, all_results: list[dict]) -> None: """Write aggregate results.json from eval results.""" - evaluated = [r for r in all_results if r.get("resolved") is not None] - resolved = [r for r in evaluated if r.get("resolved") is True] errors = [r for r in all_results if r.get("error")] + evaluated = [r for r in all_results + if not r.get("error") and r.get("resolved") is not None] + resolved = [r for r in evaluated if r.get("resolved") is True] + total = len(all_results) report = { - "total_instances": len(all_results), + "total_instances": total, "evaluated": len(evaluated), + "completion_rate": len(evaluated) / total if total > 0 else 0, "resolved_instances": len(resolved), "unresolved_instances": len(evaluated) - len(resolved), "error_instances": len(errors), 
@@ -745,9 +765,12 @@ def _log_to_mlflow(args, total, patches_generated, errors, all_results, "generation_errors": errors, } if uses_multi_turn or agent_with_eval: - resolved = sum(1 for r in all_results if r.get("resolved") is True) - evaluated = sum(1 for r in all_results if r.get("resolved") is not None) + evaluated = sum(1 for r in all_results + if not r.get("error") and r.get("resolved") is not None) + resolved = sum(1 for r in all_results + if not r.get("error") and r.get("resolved") is True) metrics["evaluated"] = evaluated + metrics["completion_rate"] = evaluated / total if total else 0 metrics["resolved"] = resolved metrics["resolve_rate"] = resolved / evaluated if evaluated else 0 diff --git a/evals/swe_bench/verifiers/unit_test_verifier.py b/evals/swe_bench/verifiers/unit_test_verifier.py index 31b78cb..ca17410 100644 --- a/evals/swe_bench/verifiers/unit_test_verifier.py +++ b/evals/swe_bench/verifiers/unit_test_verifier.py @@ -110,13 +110,23 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: logger.info(f"[{instance_id}] Running K8s Job with image {image}") - job_result = runner.run_instance( - instance_id=instance_id, - run_id=run_id, - image=image, - model_patch=model_patch, - eval_script=eval_script, - ) + try: + job_result = runner.run_instance( + instance_id=instance_id, + run_id=run_id, + image=image, + model_patch=model_patch, + eval_script=eval_script, + ) + except Exception as e: + logger.exception("[%s] unit-test job execution failed", instance_id) + return VerifierResult( + name=self.name, + status=VerifierStatus.ERROR, + score=0.0, + pass_threshold = self.pass_threshold, + details = {"error": str(e)}, + ) if job_result.error: return VerifierResult( @@ -146,11 +156,23 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: "model_patch": model_patch, "model_name_or_path": ctx.metadata.get("model_name", "unknown"), } - grade_result = grade_instance( - test_spec=test_spec, - prediction=prediction, - 
test_output=job_result.test_output, - ) + + try: + grade_result = grade_instance( + test_spec=test_spec, + prediction=prediction, + test_output=job_result.test_output, + ) + except Exception as e: + logger.exception("[%s] grading failed", instance_id) + return VerifierResult( + name=self.name, + status=VerifierStatus.ERROR, + score=0.0, + pass_threshold=self.pass_threshold, + details={"error": str(e)}, + stdout=job_result.test_output, + ) score = 1.0 if grade_result.resolved else 0.0 status = "RESOLVED" if grade_result.resolved else "NOT RESOLVED" diff --git a/run_swe_bench_eval.sh b/run_swe_bench_eval.sh index 2603af7..91eb513 100755 --- a/run_swe_bench_eval.sh +++ b/run_swe_bench_eval.sh @@ -55,6 +55,7 @@ MLFLOW_TRACKING_URI="${MLFLOW_TRACKING_URI:-}" # ── Multi-turn config (applies to both strategies) ───────────── MAX_TURNS="${MAX_TURNS:-1}" INTERMEDIATE_VERIFIERS="${INTERMEDIATE_VERIFIERS:-}" # space-separated, e.g. "ast_check" +FINAL_VERIFIERS="${FINAL_VERIFIERS:-}" # space-separated, default: swe_test AGGREGATOR="${AGGREGATOR:-mean}" # ── Naive strategy config ────────────────────────────────────── @@ -105,6 +106,13 @@ if [[ -n "${INTERMEDIATE_VERIFIERS}" ]]; then done fi +# Append each final verifier as a separate arg +if [[ -n "${FINAL_VERIFIERS}" ]]; then + for v in ${FINAL_VERIFIERS}; do + CMD_ARGS+=(--final-verifiers "${v}") + done +fi + if [[ "${STRATEGY}" == "naive" ]]; then CMD_ARGS+=( --prompts "${PROMPTS}" @@ -153,7 +161,10 @@ if [[ -n "${MLFLOW_TRACKING_URI}" ]]; then fi echo "Strategy: ${STRATEGY}, max_turns: ${MAX_TURNS}" -echo "Running: ray job submit -- ${CMD_ARGS[*]}" +# Redact secrets before logging +DISPLAY_CMD="${CMD_ARGS[*]}" +DISPLAY_CMD=$(echo "${DISPLAY_CMD}" | sed -E 's/(--model-api-key )[^ ]+/\1***REDACTED***/g') +echo "Running: ray job submit -- ${DISPLAY_CMD}" ray job submit \ --address="${RAY_ADDRESS}" \ diff --git a/verifiers/ast_check.py b/verifiers/ast_check.py index 57560cb..468160f 100644 --- a/verifiers/ast_check.py +++ 
b/verifiers/ast_check.py @@ -4,13 +4,14 @@ Checks that all changed Python files introduced by the patch parse successfully. Works entirely from the patch diff — no repo checkout required. -For each changed .py file, new content is reconstructed by taking the context -lines (unchanged) and added lines (+) from the unified diff. This is a heuristic: -deletions create gaps, so the reconstructed content may differ from the true new -file. However, syntax errors introduced by the model (missing colons, unclosed -brackets, bad indentation in new code) will reliably surface. - -Parsing strategy (applied per file, in order): +For each changed .py file, every hunk in the unified diff is parsed +independently. The @@ hunk header supplies the new-file line offset so +that reported error line numbers map back to the patched file. Parsing +each hunk in isolation avoids the false adjacency problem that occurs +when multiple hunks are naively concatenated (gaps between hunks are +unknown original lines that would break syntax if omitted). + +Parsing strategy (applied per hunk, in order): 1. Try ast.parse() on the raw reconstructed fragment. 2. If that fails with an indentation error on line 1 (fragment starts mid-block), apply textwrap.dedent() and retry. @@ -19,8 +20,8 @@ 4. A failure at any stage with a non-indentation error, or a failure of the wrapped version, is reported as a real syntax error. -Line numbers in reported errors are adjusted to refer to the reconstructed -fragment, not to any wrapper lines added during parsing. +Line numbers in reported errors are adjusted to refer to the new file, +not to the hunk-local fragment or any wrapper lines added during parsing. Error feedback includes a short code snippet showing the lines around the error, with '+' prefixes on added lines and ' ' prefixes on context lines. 
@@ -29,6 +30,7 @@ from __future__ import annotations import ast +import re import textwrap from typing import Any, ClassVar, Literal @@ -43,34 +45,46 @@ _SNIPPET_BEFORE = 2 _SNIPPET_AFTER = 1 +_HUNK_HEADER_RE = re.compile(r"@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@") + + +def _parse_hunk_new_start(header: str) -> int: + """Extract the new-file start line from a unified diff hunk header.""" + m = _HUNK_HEADER_RE.search(header) + return int(m.group(1)) if m else 1 -def _extract_new_content(patch_diff: str, filepath: str) -> list[tuple[str, str]] | None: - """Reconstruct the new content of a file from a unified diff. - Collects context lines (prefix ' ') and added lines (prefix '+'), - skipping hunk headers and deleted lines. +def _extract_new_content( + patch_diff: str, filepath: str, +) -> list[tuple[int, list[tuple[str, str]]]] | None: + """Extract per-hunk new content from a unified diff. - Returns a list of (prefix, text) tuples where prefix is '+' (added line) - or ' ' (context line), or None if the file is not found in the diff - (e.g. pure deletion). + Each hunk is returned as (new_start, records) where new_start is the + 1-based starting line in the new file (from the @@ header) and records + is a list of (prefix, text) tuples ('+' for added, ' ' for context). + + Returns None if the file is not found in the diff (e.g. pure deletion). 
""" lines = patch_diff.splitlines() in_file = False in_hunk = False - line_records: list[tuple[str, str]] = [] found = False + hunks: list[tuple[int, list[tuple[str, str]]]] = [] + current_records: list[tuple[str, str]] = [] + current_start = 1 for line in lines: - # Detect file header (unified diff: +++ b/path or +++ path) if line.startswith("+++ "): + if in_file and current_records: + hunks.append((current_start, current_records)) candidate = line[4:] if candidate.startswith("b/"): candidate = candidate[2:] - # Match by suffix to handle path prefixes in_file = candidate == filepath or candidate.endswith("/" + filepath) if in_file: found = True - line_records = [] + hunks = [] + current_records = [] in_hunk = False continue @@ -79,26 +93,32 @@ def _extract_new_content(patch_diff: str, filepath: str) -> list[tuple[str, str] if in_file: if line.startswith("@@"): + if current_records: + hunks.append((current_start, current_records)) + current_records = [] + current_start = _parse_hunk_new_start(line) in_hunk = True continue if not in_hunk: continue if line.startswith("+") and not line.startswith("+++"): - line_records.append(("+", line[1:])) # added line + current_records.append(("+", line[1:])) elif line.startswith(" "): - line_records.append((" ", line[1:])) # context line + current_records.append((" ", line[1:])) elif line.startswith("-"): - pass # deleted line — skip + pass elif line.startswith("\\"): - pass # "No newline at end of file" + pass else: - # Next file section begins in_file = False in_hunk = False + if in_file and current_records: + hunks.append((current_start, current_records)) + if not found: return None - return line_records + return hunks def _records_to_content(line_records: list[tuple[str, str]]) -> str: @@ -125,10 +145,15 @@ def _try_parse(content: str, filename: str) -> SyntaxError | None: return e -def _build_snippet(line_records: list[tuple[str, str]], error_lineno: int | None) -> str | None: +def _build_snippet( + line_records: 
list[tuple[str, str]], + error_lineno: int | None, + line_offset: int = 0, +) -> str | None: """Build a short code snippet around the error line for display in feedback. - Line numbers are 1-based and refer to the reconstructed fragment. + error_lineno is 1-based within line_records. line_offset is added to + displayed line numbers so they reflect new-file positions. Returns None if error_lineno is None or out of range. """ if error_lineno is None or not (1 <= error_lineno <= len(line_records)): @@ -140,9 +165,10 @@ def _build_snippet(line_records: list[tuple[str, str]], error_lineno: int | None snippet_lines = [] for i in range(start, end): lineno = i + 1 + display_lineno = lineno + line_offset prefix, text = line_records[i] marker = "-->" if lineno == error_lineno else " " - snippet_lines.append(f" {marker} {lineno:4d} {prefix} {text}") + snippet_lines.append(f" {marker} {display_lineno:4d} {prefix} {text}") return "\n".join(snippet_lines) @@ -150,9 +176,13 @@ def _build_snippet(line_records: list[tuple[str, str]], error_lineno: int | None def _parse_fragment( line_records: list[tuple[str, str]], filepath: str, + line_offset: int = 0, ) -> dict[str, Any] | None: """Parse a reconstructed diff fragment using the multi-stage strategy. + line_offset is added to reported line numbers so they refer to new-file + positions rather than hunk-local positions. + Returns an error dict {file, line, offset, message, snippet} if a real syntax error is found, or None if the fragment parses successfully at some stage. 
@@ -160,12 +190,13 @@ def _parse_fragment( content = _records_to_content(line_records) def _make_error(lineno: int | None, offset: int | None, msg: str) -> dict[str, Any]: + display_lineno = lineno + line_offset if lineno is not None else None return { "file": filepath, - "line": lineno, + "line": display_lineno, "offset": offset, "message": msg, - "snippet": _build_snippet(line_records, lineno), + "snippet": _build_snippet(line_records, lineno, line_offset), } # Stage 1: raw content @@ -235,24 +266,27 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: skipped = 0 # files not present in the diff (e.g. pure deletions) for filepath in python_files: - line_records = _extract_new_content(ctx.patch_diff, filepath) + hunks = _extract_new_content(ctx.patch_diff, filepath) - if line_records is None: - # File was deleted by the patch — nothing to parse + if hunks is None: skipped += 1 continue - content = _records_to_content(line_records) - if not content.strip(): - # Empty new content (file fully cleared) — counts as parseable - parsed += 1 - continue + file_has_error = False + for new_start, hunk_records in hunks: + content = _records_to_content(hunk_records) + if not content.strip(): + continue - error = _parse_fragment(line_records, filepath) - if error is None: + error = _parse_fragment( + hunk_records, filepath, line_offset=new_start - 1, + ) + if error is not None: + errors.append(error) + file_has_error = True + + if not file_has_error: parsed += 1 - else: - errors.append(error) checkable = parsed + len(errors) score = parsed / checkable if checkable > 0 else 1.0 diff --git a/verifiers/base.py b/verifiers/base.py index d7de5b0..2b40dd5 100644 --- a/verifiers/base.py +++ b/verifiers/base.py @@ -80,7 +80,8 @@ def __init_subclass__(cls, **kwargs: Any) -> None: super().__init_subclass__(**kwargs) # Only enforce on concrete (non-abstract) subclasses if not getattr(cls, "__abstractmethods__", None): - if not hasattr(cls, "execution_mode"): + mode = 
getattr(cls, "execution_mode", None) + if mode not in {"static", "dynamic"}: raise TypeError( f"{cls.__name__} must define an 'execution_mode' class attribute " f"(Literal['static', 'dynamic'])" @@ -94,6 +95,11 @@ def __init__( ): self.config = config or {} self.timeout = timeout + + if not 0.0 <= pass_threshold <= 1.0: + raise ValueError( + f"pass_threshold must be in [0.0, 1.0], got {pass_threshold}" + ) self.pass_threshold = pass_threshold @property @@ -141,6 +147,7 @@ async def safe_verify(self, ctx: PatchContext) -> VerifierResult: self.verify(ctx), timeout=self.timeout ) + result.pass_threshold = self.pass_threshold result.wall_clock_seconds = time.monotonic() - start return result except asyncio.TimeoutError: From 33360ddfaa30952d5e3f8cce3bb2efe5cac5db33 Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Tue, 26 May 2026 16:29:19 +0200 Subject: [PATCH 6/9] Pinned the K8s client version to prevent weird bugs. --- infra/images/Containerfile.swe-bench-eval | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/images/Containerfile.swe-bench-eval b/infra/images/Containerfile.swe-bench-eval index c7062d4..e503834 100644 --- a/infra/images/Containerfile.swe-bench-eval +++ b/infra/images/Containerfile.swe-bench-eval @@ -25,7 +25,7 @@ RUN dnf install -y git && dnf clean all # SWE-bench harness (for TestSpec generation, prompt construction, and grading) RUN pip install --no-cache-dir \ "swebench @ git+https://github.com/SWE-bench/SWE-bench.git@v4.1.0" \ - "kubernetes>=28.0.0" \ + "kubernetes>=28.0.0,<32.0.0" \ "openai>=1.0.0" \ "datasets" \ "boto3" \ From 3fc72e18acc30a818e912c02d76f8277067e4403 Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Wed, 27 May 2026 12:13:04 +0200 Subject: [PATCH 7/9] Fixed a few additional bugs. 
--- evals/common/s3_storage.py | 1 + run_swe_bench_eval.sh | 12 ++++-------- verifiers/ast_check.py | 9 ++++++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/evals/common/s3_storage.py b/evals/common/s3_storage.py index cde371f..72f3810 100644 --- a/evals/common/s3_storage.py +++ b/evals/common/s3_storage.py @@ -79,6 +79,7 @@ def upload_dir(local_dir: str | Path, s3_prefix: str) -> None: continue rel = file.relative_to(local_dir).as_posix() key = f"{prefix}/{rel}" if prefix else rel + s3.upload_file(str(file), bucket, key) uploaded += 1 logger.info(f"Uploaded {uploaded} per-instance files to s3://{bucket}/{prefix}/") diff --git a/run_swe_bench_eval.sh b/run_swe_bench_eval.sh index 91eb513..e2ef072 100755 --- a/run_swe_bench_eval.sh +++ b/run_swe_bench_eval.sh @@ -99,18 +99,14 @@ CMD_ARGS=( --aggregator "${AGGREGATOR}" ) -# Append each intermediate verifier as a separate arg +# Append intermediate verifiers as a single flag followed by all values if [[ -n "${INTERMEDIATE_VERIFIERS}" ]]; then - for v in ${INTERMEDIATE_VERIFIERS}; do - CMD_ARGS+=(--intermediate-verifiers "${v}") - done + CMD_ARGS+=(--intermediate-verifiers ${INTERMEDIATE_VERIFIERS}) fi -# Append each final verifier as a separate arg +# Append final verifiers as a single flag followed by all values if [[ -n "${FINAL_VERIFIERS}" ]]; then - for v in ${FINAL_VERIFIERS}; do - CMD_ARGS+=(--final-verifiers "${v}") - done + CMD_ARGS+=(--final-verifiers ${FINAL_VERIFIERS}) fi if [[ "${STRATEGY}" == "naive" ]]; then diff --git a/verifiers/ast_check.py b/verifiers/ast_check.py index 468160f..9139bc1 100644 --- a/verifiers/ast_check.py +++ b/verifiers/ast_check.py @@ -263,6 +263,7 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: errors: list[dict[str, Any]] = [] parsed = 0 + errored = 0 skipped = 0 # files not present in the diff (e.g. 
pure deletions) for filepath in python_files: @@ -285,10 +286,12 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: errors.append(error) file_has_error = True - if not file_has_error: + if file_has_error: + errored += 1 + else: parsed += 1 - checkable = parsed + len(errors) + checkable = parsed + errored score = parsed / checkable if checkable > 0 else 1.0 return VerifierResult( @@ -302,6 +305,6 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: "files_checkable": checkable, "files_parsed": parsed, "files_skipped": skipped, - "files_errored": len(errors), + "files_errored": errored, }, ) From 26bd6ccb80a7c9f3765b067f9a03025f9bba3719 Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Thu, 28 May 2026 13:16:10 +0200 Subject: [PATCH 8/9] Modified the AST Check Verifier to work with complete files only. Existing hunk-based implementation is left as a fallback. --- verifiers/ast_check.py | 225 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 209 insertions(+), 16 deletions(-) diff --git a/verifiers/ast_check.py b/verifiers/ast_check.py index 9139bc1..eaa1b88 100644 --- a/verifiers/ast_check.py +++ b/verifiers/ast_check.py @@ -2,16 +2,20 @@ AST validity verifier (static). Checks that all changed Python files introduced by the patch parse successfully. -Works entirely from the patch diff — no repo checkout required. -For each changed .py file, every hunk in the unified diff is parsed -independently. The @@ hunk header supplies the new-file line offset so -that reported error line numbers map back to the patched file. Parsing -each hunk in isolation avoids the false adjacency problem that occurs -when multiple hunks are naively concatenated (gaps between hunks are -unknown original lines that would break syntax if omitted). +Two parsing strategies, tried in order: -Parsing strategy (applied per hunk, in order): + 1. 
**Full-file parsing** (preferred): when SWE-bench metadata is available + (repo name and base commit), performs a shallow sparse clone of the + repository, checks out the base commit, applies the patch, and runs + ast.parse() on each complete modified Python file. This eliminates + the false positives inherent in fragment-based parsing. + + 2. **Hunk-based parsing** (fallback): when repo metadata is unavailable or + the clone fails, falls back to parsing each unified-diff hunk in + isolation. This works entirely from the patch diff string with no I/O. + +Hunk-based parsing strategy (applied per hunk, in order): 1. Try ast.parse() on the raw reconstructed fragment. 2. If that fails with an indentation error on line 1 (fragment starts mid-block), apply textwrap.dedent() and retry. @@ -19,23 +23,24 @@ levels after dedent), wrap in "async def _():\\n if True:\\n" and retry. 4. A failure at any stage with a non-indentation error, or a failure of the wrapped version, is reported as a real syntax error. - -Line numbers in reported errors are adjusted to refer to the new file, -not to the hunk-local fragment or any wrapper lines added during parsing. - -Error feedback includes a short code snippet showing the lines around the error, -with '+' prefixes on added lines and ' ' prefixes on context lines. """ from __future__ import annotations import ast +import asyncio +import logging +import os import re +import shutil +import tempfile import textwrap from typing import Any, ClassVar, Literal from .base import BaseVerifier, PatchContext, VerifierResult, VerifierStatus +logger = logging.getLogger(__name__) + # Number of lines prepended by the wrapper in step 3. _WRAPPER = "async def _():\n if True:\n" _WRAPPER_LINES = _WRAPPER.count("\n") @@ -48,6 +53,159 @@ _HUNK_HEADER_RE = re.compile(r"@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? 
@@") +# ── Full-file parsing (preferred) ────────────────────────────────────── + + +async def _run_git(*args: str, cwd: str | None = None) -> tuple[int, str, str]: + """Run a git command asynchronously. Returns (returncode, stdout, stderr).""" + proc = await asyncio.create_subprocess_exec( + "git", *args, + cwd=cwd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + return proc.returncode, stdout.decode(), stderr.decode() + + +async def _clone_and_parse( + repo: str, + base_commit: str, + patch_diff: str, + python_files: list[str], + verifier_name: str, + pass_threshold: float, +) -> VerifierResult | None: + """Clone the repo, apply the patch, and parse full Python files. + + Returns a VerifierResult on success, or None if the git operations fail + (signaling the caller to fall back to hunk-based parsing). + """ + tmpdir = tempfile.mkdtemp(prefix="ast_check_") + repo_dir = os.path.join(tmpdir, "repo") + + try: + url = f"https://github.com/{repo}.git" + + rc, _, stderr = await _run_git( + "clone", "--depth", "1", "--filter=blob:none", + "--sparse", url, repo_dir, + ) + if rc != 0: + logger.warning("ast_check: git clone failed: %s", stderr.strip()) + return None + + rc, _, stderr = await _run_git( + "sparse-checkout", "set", "--skip-checks", *python_files, cwd=repo_dir, + ) + if rc != 0: + logger.warning("ast_check: sparse-checkout failed: %s", stderr.strip()) + return None + + rc, _, stderr = await _run_git( + "fetch", "--depth", "1", "origin", base_commit, cwd=repo_dir, + ) + if rc != 0: + logger.warning("ast_check: git fetch failed: %s", stderr.strip()) + return None + + rc, _, stderr = await _run_git( + "checkout", base_commit, cwd=repo_dir, + ) + if rc != 0: + logger.warning("ast_check: git checkout failed: %s", stderr.strip()) + return None + + patch_file = os.path.join(tmpdir, "patch.diff") + with open(patch_file, "w") as f: + f.write(patch_diff) + + rc, _, stderr = await _run_git( + "apply", 
patch_file, cwd=repo_dir, + ) + if rc != 0: + logger.warning("ast_check: git apply failed: %s", stderr.strip()) + return None + + errors: list[dict[str, Any]] = [] + parsed = 0 + errored = 0 + skipped = 0 + + for filepath in python_files: + full_path = os.path.join(repo_dir, filepath) + if not os.path.isfile(full_path): + skipped += 1 + continue + + try: + with open(full_path) as f: + content = f.read() + except OSError: + skipped += 1 + continue + + try: + ast.parse(content, filename=filepath) + parsed += 1 + except SyntaxError as e: + errored += 1 + snippet = _build_full_file_snippet(content, e.lineno) + errors.append({ + "file": filepath, + "line": e.lineno, + "offset": e.offset, + "message": e.msg, + "snippet": snippet, + }) + + checkable = parsed + errored + score = parsed / checkable if checkable > 0 else 1.0 + + return VerifierResult( + name=verifier_name, + status=VerifierStatus.OK, + score=score, + pass_threshold=pass_threshold, + details={ + "errors": errors, + "method": "full_file", + "files_total": len(python_files), + "files_checkable": checkable, + "files_parsed": parsed, + "files_skipped": skipped, + "files_errored": errored, + }, + ) + + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +def _build_full_file_snippet(content: str, error_lineno: int | None) -> str | None: + """Build a code snippet around the error line from full file content.""" + if error_lineno is None: + return None + + lines = content.splitlines() + if not (1 <= error_lineno <= len(lines)): + return None + + start = max(0, error_lineno - 1 - _SNIPPET_BEFORE) + end = min(len(lines), error_lineno + _SNIPPET_AFTER) + + snippet_lines = [] + for i in range(start, end): + lineno = i + 1 + marker = "-->" if lineno == error_lineno else " " + snippet_lines.append(f" {marker} {lineno:4d} {lines[i]}") + + return "\n".join(snippet_lines) + + +# ── Hunk-based parsing (fallback) ────────────────────────────────────── + + def _parse_hunk_new_start(header: str) -> int: """Extract the 
new-file start line from a unified diff hunk header.""" m = _HUNK_HEADER_RE.search(header) @@ -224,11 +382,22 @@ def _make_error(lineno: int | None, offset: int | None, msg: str) -> dict[str, A return _make_error(lineno, err.offset, err.msg) +# ── Verifier class ───────────────────────────────────────────────────── + + class ASTCheckVerifier(BaseVerifier): - """Static verifier: checks Python syntax of all changed files in the patch.""" + """Static verifier: checks Python syntax of all changed files in the patch. + + Uses full-file parsing when SWE-bench metadata (repo, base_commit) is + available; falls back to hunk-based parsing otherwise. + """ execution_mode: ClassVar[Literal["static", "dynamic"]] = "static" + def __init__(self, **kwargs: Any) -> None: + kwargs.setdefault("timeout", 120.0) + super().__init__(**kwargs) + @property def name(self) -> str: return "ast_check" @@ -261,10 +430,33 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: details={"message": "No Python files changed"}, ) + instance_data = ctx.metadata.get("instance_data", {}) + repo = instance_data.get("repo") + base_commit = instance_data.get("base_commit") + + if repo and base_commit: + result = await _clone_and_parse( + repo=repo, + base_commit=base_commit, + patch_diff=ctx.patch_diff, + python_files=python_files, + verifier_name=self.name, + pass_threshold=self.pass_threshold, + ) + if result is not None: + return result + logger.info("ast_check: full-file parsing failed, falling back to hunk parsing") + + return self._verify_from_hunks(ctx, python_files) + + def _verify_from_hunks( + self, ctx: PatchContext, python_files: list[str], + ) -> VerifierResult: + """Fallback: parse each diff hunk in isolation.""" errors: list[dict[str, Any]] = [] parsed = 0 errored = 0 - skipped = 0 # files not present in the diff (e.g. 
pure deletions) + skipped = 0 for filepath in python_files: hunks = _extract_new_content(ctx.patch_diff, filepath) @@ -301,6 +493,7 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: pass_threshold=self.pass_threshold, details={ "errors": errors, + "method": "hunk", "files_total": len(python_files), "files_checkable": checkable, "files_parsed": parsed, From 4c3f80bcc8783097f321d487a8b9edc633be3324 Mon Sep 17 00:00:00 2001 From: ilya-kolchinsky Date: Thu, 28 May 2026 13:37:44 +0200 Subject: [PATCH 9/9] CodeRabbit CR fixes. --- verifiers/ast_check.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/verifiers/ast_check.py b/verifiers/ast_check.py index eaa1b88..d44762e 100644 --- a/verifiers/ast_check.py +++ b/verifiers/ast_check.py @@ -64,14 +64,21 @@ async def _run_git(*args: str, cwd: str | None = None) -> tuple[int, str, str]: stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - stdout, stderr = await proc.communicate() - return proc.returncode, stdout.decode(), stderr.decode() + try: + stdout, stderr = await proc.communicate() + return proc.returncode, stdout.decode(), stderr.decode() + except asyncio.CancelledError: + if proc.returncode is None: + proc.kill() + await proc.wait() + raise async def _clone_and_parse( repo: str, base_commit: str, patch_diff: str, + all_changed_files: list[str], python_files: list[str], verifier_name: str, pass_threshold: float, @@ -96,7 +103,7 @@ async def _clone_and_parse( return None rc, _, stderr = await _run_git( - "sparse-checkout", "set", "--skip-checks", *python_files, cwd=repo_dir, + "sparse-checkout", "set", "--skip-checks", *all_changed_files, cwd=repo_dir, ) if rc != 0: logger.warning("ast_check: sparse-checkout failed: %s", stderr.strip()) @@ -439,6 +446,7 @@ async def verify(self, ctx: PatchContext) -> VerifierResult: repo=repo, base_commit=base_commit, patch_diff=ctx.patch_diff, + all_changed_files=ctx.changed_files, python_files=python_files, 
verifier_name=self.name, pass_threshold=self.pass_threshold,