diff --git a/.agents/skills/benchflow-experiment-review/scripts/extract_harness_skills.py b/.agents/skills/benchflow-experiment-review/scripts/extract_harness_skills.py index 76fb3df0..862357a3 100755 --- a/.agents/skills/benchflow-experiment-review/scripts/extract_harness_skills.py +++ b/.agents/skills/benchflow-experiment-review/scripts/extract_harness_skills.py @@ -121,7 +121,13 @@ def walk_dicts(value: Any) -> list[dict[str, Any]]: def infer_skill_mode(value: Any) -> str: text = str(value or "").strip().lower().replace("-", "_") - if text in {"with_skill", "with_skills", "with_task_skills", "task_skills", "skills"}: + if text in { + "with_skill", + "with_skills", + "with_task_skills", + "task_skills", + "skills", + }: return "with_skills" if text in {"without_skill", "without_skills", "no_skill", "no_skills", "baseline"}: return "without_skills" @@ -253,8 +259,16 @@ def task_skill_loading_fields( catalog_names = catalog_skill_names(extracted.get("skills", [])) catalog_norm_to_name = {normalize_skill_name(name): name for name in catalog_names} - loaded = [catalog_norm_to_name[name] for name in sorted(expected_norm) if name in catalog_norm_to_name] - missing = [name for name in expected if normalize_skill_name(name) not in catalog_norm_to_name] + loaded = [ + catalog_norm_to_name[name] + for name in sorted(expected_norm) + if name in catalog_norm_to_name + ] + missing = [ + name + for name in expected + if normalize_skill_name(name) not in catalog_norm_to_name + ] mode = task_skill_context.get("task_skill_mode", "unknown") if expected and mode == "without_skills": @@ -290,7 +304,9 @@ def task_skill_loading_fields( } -def request_bodies(events: list[dict[str, Any]]) -> list[tuple[int, str, dict[str, Any]]]: +def request_bodies( + events: list[dict[str, Any]], +) -> list[tuple[int, str, dict[str, Any]]]: bodies: list[tuple[int, str, dict[str, Any]]] = [] for idx, event in enumerate(events): request = event.get("request") @@ -479,7 +495,13 @@ def extract_pi(body: dict[str, Any]) -> dict[str, Any] | None: ) -EXTRACTORS = [extract_codex, extract_openhands, extract_gemini, extract_claude_code, extract_pi] +EXTRACTORS = [ + extract_codex, + extract_openhands, + extract_gemini, + extract_claude_code, + extract_pi, +] def extract_from_body(body: dict[str, Any]) -> dict[str, Any] | None: @@ -497,7 +519,12 @@ def bodies_from_text(text: str) -> list[dict[str, Any]]: 0, { "instructions": "", - "input": [{"role": "developer", "content": [{"type": "input_text", "text": text}]}], + "input": [ + { + "role": "developer", + "content": [{"type": "input_text", "text": text}], + } + ], }, ) if "# Available Agent Skills" in text: @@ -550,16 +577,14 @@ def finalize_result( } ) extracted.update(task_loading) - extracted["manual_review_required"] = ( - skill_count == 0 - or task_loading["task_skills_loading_status"] - in { - "expected_with_skills_but_no_task_skill_manifest", - "missing_expected_task_skills", - "partial_unexpected_task_skills_loaded_without_skills", - "unexpected_complete_task_skills_loaded_without_skills", - } - ) + extracted["manual_review_required"] = skill_count == 0 or task_loading[ + "task_skills_loading_status" + ] in { + "expected_with_skills_but_no_task_skill_manifest", + "missing_expected_task_skills", + "partial_unexpected_task_skills_loaded_without_skills", + "unexpected_complete_task_skills_loaded_without_skills", + } return extracted @@ -655,7 +680,9 @@ def main() -> None: args.task_path, ) events = read_jsonl(args.trajectory, limit=limit) - extracted = extract_from_events(args.trajectory, events, checked_files, task_skill_context) + extracted = extract_from_events( + args.trajectory, events, checked_files, task_skill_context + ) if extracted is None: fallback = sibling_acp_path(args.trajectory) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25cf1044..28517688 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,60 @@ ## [Unreleased] +## 0.6.4 — 2026-06-27 + +### Added +- **Environment and config as run-time axes on `bench eval run`.** `--state` + binds the environment (S-axis) per run — inline JSON, a registry + `name@version` resolved through the environment registry, or a manifest path + (takes precedence over `--environment-manifest`). `--config-override` overlays + the task config (C-axis) — inline JSON/YAML/TOML or `@file`, deep-merged into + each task's resolved config. `--config` also gains a `--run-config` alias. + (#790) +- **Content-addressed environment binding.** Registry environment resolution is + content-addressed — `env_hash = sha256(manifest)` — so a `name@version` + resolves to an exact, pinned environment that is recorded for replay; the + C-axis `--config-override` is likewise persisted with its content hash and the + applied patch. Every rollout is attributable to the precise world and config + it ran against. (#790) +- **MLE-bench adapter.** Adds an MLE-bench benchmark adapter, parity fixture, and + task plumbing for running and auditing MLE-bench through BenchFlow. (#792) +- **Agent adapter skill.** Adds the canonical adapter skill under `.agents/skills` + for harness-side adapter work. (#793) +- **Prime-RL SFT export.** Adds `bench train convert prime-sft` support for + exporting BenchFlow trajectories into Prime SFT-ready JSONL artifacts. (#828) + ### Changed - **`bench continue` is now `bench eval continue`.** The command (and its `continue-batch` companion) moved under the `eval` group, where it is now discoverable in `bench eval --help` alongside `run`/`adopt`. The original top-level `bench continue` / `bench continue-batch` remain as hidden, deprecated aliases (they print a deprecation notice) so existing scripts keep - working. + working. (#800) +- **Routable agents always go through the LiteLLM usage proxy.** OpenCode-family + and pi-acp model calls now stay on the proxy path so token usage, cost, and + trajectory capture are preserved consistently. (#797, #803, #820) +- **Agent manifest loading is now the additive decoupling path.** The core agent + manifest loader and Omnigent/session-factory seam are gated in while preserving + existing ACP manifests and byte-identical parity coverage. (#825, #836, #837) + +### Fixed +- Resolved the sharded and run-config paths so the S-axis environment and C-axis + config overlay are applied consistently in `bench eval run`. (#804) +- Added `bench eval run --context-root` plumbing and early validation for missing + paths. (#816) +- Fixed verifier-error resume logging and streaming `claude-agent-acp` + trajectory emission so failed or streamed runs retain the expected evidence. + (#819, #839) +- Resolved bare model IDs to their provider, avoided pi-acp context-window retry + storms, and kept provider failure causes visible while preserving redaction. + (#805, #831, #834, #835) +- Preserved Codex subscription-auth behavior and auth-file permissions in the + launcher path. (#825) +- Rejected `.git` and `file://` source paths with clear errors. (#822) +- Hardened experiment-review and integration gates around missing trajectories, + summaryless roots, file-editor false positives, and L3 review calibration. + (#802, #806, #807, #808, #809, #810, #811, #812, #814, #817, #821, #823, #824) ## 0.6.3 — 2026-06-16 diff --git a/CITATION.cff b/CITATION.cff index 8070f92b..d0717d71 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -9,8 +9,8 @@ authors: repository-code: "https://github.com/benchflow-ai/benchflow" url: "https://github.com/benchflow-ai/benchflow" license: Apache-2.0 -version: 0.6.3 -date-released: 2026-06-16 +version: 0.6.4 +date-released: 2026-06-27 keywords: - benchmark - llm-agents diff --git a/docs/examples/swebench_pro_progressive_disclosure.ipynb b/docs/examples/swebench_pro_progressive_disclosure.ipynb index 51a5dcdd..9943a58e 100644 --- a/docs/examples/swebench_pro_progressive_disclosure.ipynb +++ b/docs/examples/swebench_pro_progressive_disclosure.ipynb @@ -60,8 +60,7 @@ "\n", "# Run from the repo root (this notebook lives in docs/examples/).\n", "while (\n", - " not (Path.cwd() / \"src\" / \"benchflow\").exists()\n", - " and Path.cwd() != Path.cwd().parent\n", + " not (Path.cwd() / \"src\" / \"benchflow\").exists() and Path.cwd() != Path.cwd().parent\n", "):\n", " os.chdir(Path.cwd().parent)\n", "\n", diff --git a/pyproject.toml b/pyproject.toml index 8c52dc68..c885a43e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "benchflow" -version = "0.6.3" +version = "0.6.4" description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider." readme = "README.md" requires-python = ">=3.12" diff --git a/uv.lock b/uv.lock index eecb0746..f3612963 100644 --- a/uv.lock +++ b/uv.lock @@ -288,7 +288,7 @@ wheels = [ [[package]] name = "benchflow" -version = "0.6.3" +version = "0.6.4" source = { editable = "." } dependencies = [ { name = "agent-client-protocol" },