diff --git a/.agents/skills/benchflow-experiment-review/scripts/extract_harness_skills.py b/.agents/skills/benchflow-experiment-review/scripts/extract_harness_skills.py
index 76fb3df0..862357a3 100755
--- a/.agents/skills/benchflow-experiment-review/scripts/extract_harness_skills.py
+++ b/.agents/skills/benchflow-experiment-review/scripts/extract_harness_skills.py
@@ -121,7 +121,13 @@ def walk_dicts(value: Any) -> list[dict[str, Any]]:
 
 def infer_skill_mode(value: Any) -> str:
     text = str(value or "").strip().lower().replace("-", "_")
-    if text in {"with_skill", "with_skills", "with_task_skills", "task_skills", "skills"}:
+    if text in {
+        "with_skill",
+        "with_skills",
+        "with_task_skills",
+        "task_skills",
+        "skills",
+    }:
         return "with_skills"
     if text in {"without_skill", "without_skills", "no_skill", "no_skills", "baseline"}:
         return "without_skills"
@@ -253,8 +259,16 @@ def task_skill_loading_fields(
     catalog_names = catalog_skill_names(extracted.get("skills", []))
     catalog_norm_to_name = {normalize_skill_name(name): name for name in catalog_names}
 
-    loaded = [catalog_norm_to_name[name] for name in sorted(expected_norm) if name in catalog_norm_to_name]
-    missing = [name for name in expected if normalize_skill_name(name) not in catalog_norm_to_name]
+    loaded = [
+        catalog_norm_to_name[name]
+        for name in sorted(expected_norm)
+        if name in catalog_norm_to_name
+    ]
+    missing = [
+        name
+        for name in expected
+        if normalize_skill_name(name) not in catalog_norm_to_name
+    ]
     mode = task_skill_context.get("task_skill_mode", "unknown")
 
     if expected and mode == "without_skills":
@@ -290,7 +304,9 @@ def task_skill_loading_fields(
     }
 
 
-def request_bodies(events: list[dict[str, Any]]) -> list[tuple[int, str, dict[str, Any]]]:
+def request_bodies(
+    events: list[dict[str, Any]],
+) -> list[tuple[int, str, dict[str, Any]]]:
     bodies: list[tuple[int, str, dict[str, Any]]] = []
     for idx, event in enumerate(events):
         request = event.get("request")
@@ -479,7 +495,13 @@ def extract_pi(body: dict[str, Any]) -> dict[str, Any] | None:
     )
 
 
-EXTRACTORS = [extract_codex, extract_openhands, extract_gemini, extract_claude_code, extract_pi]
+EXTRACTORS = [
+    extract_codex,
+    extract_openhands,
+    extract_gemini,
+    extract_claude_code,
+    extract_pi,
+]
 
 
 def extract_from_body(body: dict[str, Any]) -> dict[str, Any] | None:
@@ -497,7 +519,12 @@ def bodies_from_text(text: str) -> list[dict[str, Any]]:
             0,
             {
                 "instructions": "",
-                "input": [{"role": "developer", "content": [{"type": "input_text", "text": text}]}],
+                "input": [
+                    {
+                        "role": "developer",
+                        "content": [{"type": "input_text", "text": text}],
+                    }
+                ],
             },
         )
     if "# Available Agent Skills" in text:
@@ -550,16 +577,14 @@ def finalize_result(
         }
     )
     extracted.update(task_loading)
-    extracted["manual_review_required"] = (
-        skill_count == 0
-        or task_loading["task_skills_loading_status"]
-        in {
-            "expected_with_skills_but_no_task_skill_manifest",
-            "missing_expected_task_skills",
-            "partial_unexpected_task_skills_loaded_without_skills",
-            "unexpected_complete_task_skills_loaded_without_skills",
-        }
-    )
+    extracted["manual_review_required"] = skill_count == 0 or task_loading[
+        "task_skills_loading_status"
+    ] in {
+        "expected_with_skills_but_no_task_skill_manifest",
+        "missing_expected_task_skills",
+        "partial_unexpected_task_skills_loaded_without_skills",
+        "unexpected_complete_task_skills_loaded_without_skills",
+    }
     return extracted
 
 
@@ -655,7 +680,9 @@ def main() -> None:
         args.task_path,
     )
     events = read_jsonl(args.trajectory, limit=limit)
-    extracted = extract_from_events(args.trajectory, events, checked_files, task_skill_context)
+    extracted = extract_from_events(
+        args.trajectory, events, checked_files, task_skill_context
+    )
 
     if extracted is None:
         fallback = sibling_acp_path(args.trajectory)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 25cf1044..28517688 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,13 +2,60 @@
 
 ## [Unreleased]
 
+## 0.6.4 — 2026-06-27
+
+### Added
+- **Environment and config as run-time axes on `bench eval run`.** `--state`
+  binds the environment (S-axis) per run — inline JSON, a registry
+  `name@version` resolved through the environment registry, or a manifest path
+  (takes precedence over `--environment-manifest`). `--config-override` overlays
+  the task config (C-axis) — inline JSON/YAML/TOML or `@file`, deep-merged into
+  each task's resolved config. `--config` also gains a `--run-config` alias.
+  (#790)
+- **Content-addressed environment binding.** Registry environment resolution is
+  content-addressed — `env_hash = sha256(manifest)` — so a `name@version`
+  resolves to an exact, pinned environment that is recorded for replay; the
+  C-axis `--config-override` is likewise persisted with its content hash and the
+  applied patch. Every rollout is attributable to the precise world and config
+  it ran against. (#790)
+- **MLE-bench adapter.** Adds an MLE-bench benchmark adapter, parity fixture, and
+  task plumbing for running and auditing MLE-bench through BenchFlow. (#792)
+- **Agent adapter skill.** Adds the canonical adapter skill under `.agents/skills`
+  for harness-side adapter work. (#793)
+- **Prime-RL SFT export.** Adds `bench train convert prime-sft` support for
+  exporting BenchFlow trajectories into Prime SFT-ready JSONL artifacts. (#828)
+
 ### Changed
 - **`bench continue` is now `bench eval continue`.** The command (and its
   `continue-batch` companion) moved under the `eval` group, where it is now
   discoverable in `bench eval --help` alongside `run`/`adopt`. The original
   top-level `bench continue` / `bench continue-batch` remain as hidden,
   deprecated aliases (they print a deprecation notice) so existing scripts keep
-  working.
+  working. (#800)
+- **Routable agents always go through the LiteLLM usage proxy.** OpenCode-family
+  and pi-acp model calls now stay on the proxy path so token usage, cost, and
+  trajectory capture are preserved consistently. (#797, #803, #820)
+- **Agent manifest loading is now the additive decoupling path.** The core agent
+  manifest loader and Omnigent/session-factory seam are gated in while preserving
+  existing ACP manifests and byte-identical parity coverage. (#825, #836, #837)
+
+### Fixed
+- Fixed the sharded and run-config code paths of `bench eval run` so that the
+  S-axis environment and the C-axis config overlay are applied consistently. (#804)
+- Added `bench eval run --context-root` plumbing and early validation for missing
+  paths. (#816)
+- Fixed verifier-error resume logging and streaming `claude-agent-acp`
+  trajectory emission so failed or streamed runs retain the expected evidence.
+  (#819, #839)
+- Resolved bare model IDs to their provider, avoided pi-acp context-window retry
+  storms, and kept provider failure causes visible while preserving redaction.
+  (#805, #831, #834, #835)
+- Preserved Codex subscription-auth behavior and auth-file permissions in the
+  launcher path. (#825)
+- Rejected `.git` and `file://` source paths with clear errors. (#822)
+- Hardened experiment-review and integration gates around missing trajectories,
+  summaryless roots, file-editor false positives, and L3 review calibration.
+  (#802, #806, #807, #808, #809, #810, #811, #812, #814, #817, #821, #823, #824)
 
 ## 0.6.3 — 2026-06-16
 
diff --git a/CITATION.cff b/CITATION.cff
index 8070f92b..d0717d71 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -9,8 +9,8 @@ authors:
 repository-code: "https://github.com/benchflow-ai/benchflow"
 url: "https://github.com/benchflow-ai/benchflow"
 license: Apache-2.0
-version: 0.6.3
-date-released: 2026-06-16
+version: 0.6.4
+date-released: 2026-06-27
 keywords:
   - benchmark
   - llm-agents
diff --git a/docs/examples/swebench_pro_progressive_disclosure.ipynb b/docs/examples/swebench_pro_progressive_disclosure.ipynb
index 51a5dcdd..9943a58e 100644
--- a/docs/examples/swebench_pro_progressive_disclosure.ipynb
+++ b/docs/examples/swebench_pro_progressive_disclosure.ipynb
@@ -60,8 +60,7 @@
     "\n",
     "# Run from the repo root (this notebook lives in docs/examples/).\n",
     "while (\n",
-    "    not (Path.cwd() / \"src\" / \"benchflow\").exists()\n",
-    "    and Path.cwd() != Path.cwd().parent\n",
+    "    not (Path.cwd() / \"src\" / \"benchflow\").exists() and Path.cwd() != Path.cwd().parent\n",
     "):\n",
     "    os.chdir(Path.cwd().parent)\n",
     "\n",
diff --git a/pyproject.toml b/pyproject.toml
index 8c52dc68..c885a43e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "benchflow"
-version = "0.6.3"
+version = "0.6.4"
 description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
 readme = "README.md"
 requires-python = ">=3.12"
diff --git a/uv.lock b/uv.lock
index eecb0746..f3612963 100644
--- a/uv.lock
+++ b/uv.lock
@@ -288,7 +288,7 @@ wheels = [
 
 [[package]]
 name = "benchflow"
-version = "0.6.3"
+version = "0.6.4"
 source = { editable = "." }
 dependencies = [
     { name = "agent-client-protocol" },