OpenFn · hanna-paasivirta · Jun 30, 2026 · Jun 29, 2026
diff --git a/services/testing/README.md b/services/testing/README.md
@@ -24,10 +24,20 @@ This directory is on the Python path via `pyproject.toml`
 - `spec_collector.py` — pytest plugin (registered via `pytest_plugins` in the
   repo-root `conftest.py`). Turns each MD spec into a pytest item that builds
   the service payload, calls the service via `ApolloClient`, and runs the judge.
-  Any project YAML in the response (`response_yaml`, `workflow_yaml`,
-  `content_yaml`, or a `workflow_yaml` attachment) is written to a `tmp/`
-  folder next to the spec file (e.g.
-  `services/workflow_chat/tests/acceptance/tmp/<spec_id>.yaml`) for inspection.
+  For inspection, up to three artifacts are written to a `tmp/` folder next
+  to the spec file (each is skipped when the run produced no such content):
+  - `<spec_id>.yaml` — any project YAML in the response (`response_yaml`,
+    `workflow_yaml`, `content_yaml`, or a `workflow_yaml` attachment).
+  - `<spec_id>.txt` — the response text, prefixed with the agent path
+    (e.g. `agents: router -> planner -> job_code_agent`).
+  - `<spec_id>.judges.txt` — the judge verdict(s) in the same format printed
+    during the run (per-judge PASS/FAIL header + criteria/flags summary).
+
+  Filenames use `__` as the metadata separator (the spec id and extension only
+  use `.`/`-`), so they stay splittable. Multi-run specs append `__run-N`. Pass
+  `-E <label>` / `--experiment=<label>` to append a sanitized `__<label>` after
+  that on every captured filename, so runs with different settings or dates
+  don't overwrite each other (e.g. `tmp/<spec_id>__sonnet-2026-06-29.txt`).
 - `apollo_client.py` — `ApolloClient` for dispatching to a chat service.
   Currently a subprocess-based stub; the integration tier will replace its
   internals with a real HTTP client (same `.call()` signature, no test changes).

diff --git a/services/testing/spec_collector.py b/services/testing/spec_collector.py
@@ -13,6 +13,7 @@
   5. Fails with the judge's reasoning summary if `verdict.passed` is False.
 """
 
+import re
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 
@@ -28,6 +29,34 @@
 _session_verdicts: list[tuple[str, judge.Verdict]] = []
 
 
+def pytest_addoption(parser):  # pytest hook: registers the CLI option below
+    parser.addoption(
+        "-E",
+        "--experiment",
+        action="store",
+        default="",  # empty label -> _experiment_suffix() adds no suffix
+        help="Optional experiment label appended to captured output filenames "
+        "in tmp/ (e.g. --experiment=sonnet-2026-06-29) so runs with different "
+        "settings/dates don't overwrite each other.",
+    )
+
+
+def _experiment_suffix(config) -> str:
+    """Return a filesystem-safe `__<experiment>` suffix, or '' if none given.
+
+    `__` is the reserved metadata-boundary separator (the spec id and extension
+    use `.`/`-` but never `__`), so a downstream `name.split("__")` can recover
+    the fields. `_` runs in the label collapse to a single `_` so a user label
+    can't inject a false boundary; a label that sanitizes to nothing gives ''.
+    """
+    raw = (config.getoption("experiment") or "").strip()
+    if not raw:
+        return ""
+    safe = re.sub(r"[^A-Za-z0-9_-]+", "-", raw)  # other char runs -> one "-"
+    safe = re.sub(r"_+", "_", safe).strip("-_")  # no "__"; trim edge "-"/"_"
+    return f"__{safe}" if safe else ""
+
+
 def pytest_collect_file(parent, file_path):
     if (
         file_path.suffix == ".md"
@@ -69,14 +98,17 @@ def runtest(self):
         response = client.call(spec.service, payload)
         print("  ✓ service responded")
 
+        tmp_dir = self.path.parent / "tmp"
+        experiment = _experiment_suffix(self.config)
+
         yaml_path = _capture_response_yaml(
-            response, spec.id, self.run_index, spec.runs, self.path.parent / "tmp"
+            response, spec.id, self.run_index, spec.runs, tmp_dir, experiment
         )
         if yaml_path is not None:
             print(f"  ✓ project YAML saved to {yaml_path}")
 
         text_path = _capture_response_text(
-            response, spec.id, self.run_index, spec.runs, self.path.parent / "tmp"
+            response, spec.id, self.run_index, spec.runs, tmp_dir, experiment
         )
         if text_path is not None:
             print(f"  ✓ response text saved to {text_path}")
@@ -102,6 +134,12 @@ def _run_judge(judge_name: str) -> judge.Verdict:
                   f"(score={v.score:.2f}, flags={len(v.general_flags)})")
             _session_verdicts.append((spec.id, v))
 
+        judges_path = _capture_judge_verdicts(
+            verdicts, spec.id, self.run_index, spec.runs, tmp_dir, experiment
+        )
+        if judges_path is not None:
+            print(f"  ✓ judge verdicts saved to {judges_path}")
+
         failing = [v for v in verdicts if not v.passed]
         if failing:
             summary = "\n\n".join(v.summary for v in failing)
@@ -166,11 +204,14 @@ def _capture_response_yaml(
     run_index: int,
     runs: int,
     output_dir: Path,
+    experiment: str = "",
 ) -> Path | None:
     """Write the response's project YAML to `output_dir/<spec_id>.yaml`.
 
     For multi-run specs, appends `__run-N` to the filename so each run is
-    preserved. Returns the written path, or None if no YAML was present.
+    preserved. `experiment` (already a `__<label>` suffix or '') is appended
+    last so runs with different settings don't overwrite each other. Returns
+    the written path, or None if no YAML was present.
     """
     yaml_str = _extract_yaml_from_response(response)
     if yaml_str is None:
@@ -179,7 +220,7 @@ def _capture_response_yaml(
     output_dir.mkdir(parents=True, exist_ok=True)
     suffix = f"__run-{run_index}" if runs > 1 else ""
     safe_id = spec_id.replace("/", "_")
-    path = output_dir / f"{safe_id}{suffix}.yaml"
+    path = output_dir / f"{safe_id}{suffix}{experiment}.yaml"
     path.write_text(yaml_str)
     return path
 
@@ -219,6 +260,7 @@ def _capture_response_text(
     run_index: int,
     runs: int,
     output_dir: Path,
+    experiment: str = "",
 ) -> Path | None:
     """Write the agent path and response text to `output_dir/<spec_id>.txt`."""
     if not isinstance(response, dict):
@@ -233,7 +275,40 @@ def _capture_response_text(
     output_dir.mkdir(parents=True, exist_ok=True)
     suffix = f"__run-{run_index}" if runs > 1 else ""
     safe_id = spec_id.replace("/", "_")
-    path = output_dir / f"{safe_id}{suffix}.txt"
+    path = output_dir / f"{safe_id}{suffix}{experiment}.txt"
+    path.write_text(body)
+    return path
+
+
+def _capture_judge_verdicts(
+    verdicts: list[judge.Verdict],
+    spec_id: str,
+    run_index: int,
+    runs: int,
+    output_dir: Path,
+    experiment: str = "",
+) -> Path | None:
+    """Write the judge verdicts to `output_dir/<spec_id>.judges.txt`.
+
+    Reuses the same formatting printed during the run: a one-line header per
+    judge (`✓ general: PASS (score=..., flags=...)`) followed by that judge's
+    `verdict.summary` block (criteria + reasoning + flags).
+    """
+    if not verdicts:
+        return None  # nothing was judged, so no file is written
+
+    blocks = []
+    for v in verdicts:
+        mark = "✓" if v.passed else "✗"
+        header = (f"{mark} {v.judge_name}: {'PASS' if v.passed else 'FAIL'} "
+                  f"(score={v.score:.2f}, flags={len(v.general_flags)})")
+        blocks.append(f"{header}\n\n{v.summary}")
+    body = "\n\n===\n\n".join(blocks) + "\n"  # "===" line separates judges
+    # NOTE(review): body holds ✓/✗ but write_text() passes no encoding= (confirm).
+    output_dir.mkdir(parents=True, exist_ok=True)
+    suffix = f"__run-{run_index}" if runs > 1 else ""  # multi-run specs only
+    safe_id = spec_id.replace("/", "_")  # keep the id a single path component
+    path = output_dir / f"{safe_id}.judges{suffix}{experiment}.txt"
     path.write_text(body)
     return path