From 753b88d13e4a2971b28827d1e17f0bd87b0ca502 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Thu, 25 Jun 2026 18:18:37 -0400
Subject: [PATCH 01/18] test(eval): add TDD tests for release scorecard + gate
 (frozen contract)

Tests encode the full acceptance criteria for gaia.eval.release_scorecard
and gaia.eval.scorecard_gate before any implementation exists. Includes
the email benchmark fixture used by the adapter tests.
---
 .../eval/email_benchmark_scorecard.json       |   5 +
 tests/unit/eval/test_release_scorecard.py     | 439 ++++++++++++++++++
 tests/unit/eval/test_scorecard_gate.py        | 246 ++++++++++
 3 files changed, 690 insertions(+)
 create mode 100644 tests/fixtures/eval/email_benchmark_scorecard.json
 create mode 100644 tests/unit/eval/test_release_scorecard.py
 create mode 100644 tests/unit/eval/test_scorecard_gate.py

diff --git a/tests/fixtures/eval/email_benchmark_scorecard.json b/tests/fixtures/eval/email_benchmark_scorecard.json
new file mode 100644
index 000000000..389e7292f
--- /dev/null
+++ b/tests/fixtures/eval/email_benchmark_scorecard.json
@@ -0,0 +1,5 @@
+{"run_id":"bench-fixture","scenarios":[
+  {"category":"Gemma-4-E4B-it-GGUF","status":"PASS","total_emails":12,"quality":{"category_accuracy":0.4167}},
+  {"category":"Gemma-4-E4B-it-GGUF","status":"PASS","total_emails":12,"quality":{"category_accuracy":0.5000}},
+  {"category":"Gemma-4-E4B-it-GGUF","status":"PASS","total_emails":0}
+]}
diff --git a/tests/unit/eval/test_release_scorecard.py b/tests/unit/eval/test_release_scorecard.py
new file mode 100644
index 000000000..7e9770e1a
--- /dev/null
+++ b/tests/unit/eval/test_release_scorecard.py
@@ -0,0 +1,439 @@
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+"""TDD tests for gaia.eval.release_scorecard — written before implementation exists."""
+
+import datetime
+import importlib.util
+import json
+import sys
+from pathlib import Path
+
+import pytest
+
+from gaia.eval.release_scorecard import (
+    REQUIRED_FIELDS,
+    ResultPayload,
+    carry_forward,
+    compute_aggregate,
+    latest_version_below,
+    parse_scorecard,
+    render_scorecard,
+    validate_scorecard,
+)
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+FIXTURE_DIR = Path(__file__).parents[2] / "fixtures" / "eval"
+EMAIL_BENCHMARK_FIXTURE = FIXTURE_DIR / "email_benchmark_scorecard.json"
+
+
+def _make_payload(version="1.0.0", accuracy=0.5):
+    metrics = [{"name": "category_accuracy", "value": accuracy, "weight": 1.0}]
+    components, agg_value = compute_aggregate(metrics)
+    return ResultPayload(
+        agent_name="test-agent",
+        agent_version=version,
+        dataset_reference="test/fixture",
+        dataset_description="test dataset",
+        dataset_size=100,
+        methodology="unit test",
+        config={"model": "test"},
+        test_cases_run=10,
+        metrics=metrics,
+        aggregate_name="weighted_accuracy",
+        generated_at=datetime.datetime.utcnow().isoformat(),
+        inherited_from=None,
+    )
+
+
+# ---------------------------------------------------------------------------
+# 1. Schema / validator round-trip
+# ---------------------------------------------------------------------------
+
+
+class TestSchemaValidator:
+    def test_valid_payload_passes_validation(self):
+        payload = _make_payload()
+        text = render_scorecard(payload)
+        parsed = parse_scorecard(text)
+        errors = validate_scorecard(parsed)
+        assert errors == [], f"Expected no errors, got: {errors}"
+
+    def test_missing_required_fields_each_flagged(self):
+        payload = _make_payload()
+        text = render_scorecard(payload)
+        parsed = parse_scorecard(text)
+
+        # Each required top-level field, when removed, should produce a non-empty error list.
+        for field in REQUIRED_FIELDS:
+            mutated = {k: v for k, v in parsed.items() if k != field}
+            errors = validate_scorecard(mutated)
+            assert errors, (
+                f"Expected validate_scorecard to flag missing '{field}' "
+                f"but got empty error list"
+            )
+
+    def test_required_top_level_keys_include_expected_sections(self):
+        # schema_version, agent, recipe, results, aggregate must be required
+        for section in ("schema_version", "agent", "recipe", "results", "aggregate"):
+            assert section in REQUIRED_FIELDS, (
+                f"'{section}' must be in REQUIRED_FIELDS"
+            )
+
+
+# ---------------------------------------------------------------------------
+# 2. Aggregate computation
+# ---------------------------------------------------------------------------
+
+
+class TestComputeAggregate:
+    def test_single_metric(self):
+        _, value = compute_aggregate([{"name": "acc", "value": 0.5, "weight": 1.0}])
+        assert value == 50.0
+
+    def test_multiple_metrics_weighted(self):
+        metrics = [
+            {"name": "a", "value": 0.4167, "weight": 1.0},
+            {"name": "b", "value": 0.5, "weight": 2.0},
+        ]
+        _, value = compute_aggregate(metrics)
+        expected = round(100 * (0.4167 + 2 * 0.5) / (1 + 2), 2)
+        assert value == expected
+
+    def test_empty_metrics_raises(self):
+        with pytest.raises(ValueError):
+            compute_aggregate([])
+
+    def test_zero_weight_raises(self):
+        with pytest.raises(ValueError):
+            compute_aggregate([{"name": "x", "value": 0.5, "weight": 0.0}])
+
+    def test_recompute_from_components_matches_aggregate_value(self):
+        metrics = [
+            {"name": "cat_acc", "value": 0.4167, "weight": 1.0},
+            {"name": "send_acc", "value": 0.75, "weight": 2.0},
+        ]
+        payload = _make_payload()
+        # Build payload with these 2 metrics directly
+        components, agg_value = compute_aggregate(metrics)
+        recomputed = round(
+            100
+            * sum(c["weight"] * c["value"] for c in components)
+            / sum(c["weight"] for c in components),
+            2,
+        )
+        assert recomputed == agg_value
+
+
+# ---------------------------------------------------------------------------
+# 3. Generator round-trip
+# ---------------------------------------------------------------------------
+
+
+class TestGeneratorRoundTrip:
+    def test_rendered_text_starts_with_dashes(self):
+        payload = _make_payload()
+        text = render_scorecard(payload)
+        lines = text.splitlines()
+        assert lines[0] == "---", f"First line must be '---', got: {lines[0]!r}"
+
+    def test_rendered_text_contains_closing_dashes(self):
+        payload = _make_payload()
+        text = render_scorecard(payload)
+        lines = text.splitlines()
+        # Find second occurrence of '---'
+        closing = [i for i, l in enumerate(lines) if l == "---" and i > 0]
+        assert closing, "Rendered scorecard must contain a closing '---' after the first"
+
+    def test_body_after_front_matter_is_non_empty(self):
+        payload = _make_payload()
+        text = render_scorecard(payload)
+        lines = text.splitlines()
+        closing_indices = [i for i, l in enumerate(lines) if l == "---"]
+        assert len(closing_indices) >= 2, "Need at least two '---' lines"
+        body = "\n".join(lines[closing_indices[1] + 1 :])
+        assert body.strip(), "Body after front matter must be non-empty"
+
+    def test_parse_recovers_all_required_fields(self):
+        payload = _make_payload()
+        text = render_scorecard(payload)
+        parsed = parse_scorecard(text)
+        errors = validate_scorecard(parsed)
+        assert errors == []
+
+
+# ---------------------------------------------------------------------------
+# 4. Two counts distinct as separate fields
+# ---------------------------------------------------------------------------
+
+
+class TestDistinctCountFields:
+    def test_test_cases_run_and_dataset_size_both_present(self):
+        payload = _make_payload()
+        text = render_scorecard(payload)
+        parsed = parse_scorecard(text)
+        assert "results" in parsed, "'results' section missing from parsed scorecard"
+        assert "test_cases_run" in parsed["results"], (
+            "'results.test_cases_run' must be a distinct field"
+        )
+        assert "recipe" in parsed, "'recipe' section missing from parsed scorecard"
+        assert "dataset" in parsed["recipe"], (
+            "'recipe.dataset' sub-section missing"
+        )
+        assert "size" in parsed["recipe"]["dataset"], (
+            "'recipe.dataset.size' must be a distinct field"
+        )
+
+
+# ---------------------------------------------------------------------------
+# 5. Loose coupling — no harness/agent modules imported
+# ---------------------------------------------------------------------------
+
+
+class TestLooseCoupling:
+    def test_no_benchmark_or_agent_modules_imported(self):
+        # Import is already done at top of file; check sys.modules
+        contaminated = [
+            m
+            for m in sys.modules
+            if "benchmark" in m or "gaia_agent_email" in m
+        ]
+        assert not contaminated, (
+            f"release_scorecard import pulled in harness/agent modules: {contaminated}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# 6. Markdown structure (duplicate guard on render)
+# ---------------------------------------------------------------------------
+
+
+class TestMarkdownStructure:
+    def test_first_line_is_dashes(self):
+        text = render_scorecard(_make_payload())
+        assert text.splitlines()[0] == "---"
+
+    def test_contains_closing_dashes(self):
+        text = render_scorecard(_make_payload())
+        count = text.count("\n---")
+        assert count >= 1, "Must contain at least one closing '---' line"
+
+    def test_body_non_empty(self):
+        text = render_scorecard(_make_payload())
+        parts = text.split("---")
+        # parts[0] is empty, parts[1] is YAML, parts[2+] is body
+        body = "---".join(parts[2:])
+        assert body.strip(), "Markdown body after front matter must not be empty"
+
+
+# ---------------------------------------------------------------------------
+# 7. Versioning — patch carry-forward
+# ---------------------------------------------------------------------------
+
+
+class TestCarryForwardPatch:
+    def test_carry_forward_sets_inherited_from(self, tmp_path):
+        src = _make_payload(version="0.2.3", accuracy=0.75)
+        card_path = tmp_path / "0.2.3.md"
+        card_path.write_text(render_scorecard(src))
+
+        result = carry_forward(card_path, "0.2.4")
+        assert result.inherited_from == "0.2.3"
+
+    def test_carry_forward_copies_metrics_verbatim(self, tmp_path):
+        src = _make_payload(version="0.2.3", accuracy=0.75)
+        card_path = tmp_path / "0.2.3.md"
+        card_path.write_text(render_scorecard(src))
+
+        result = carry_forward(card_path, "0.2.4")
+        assert result.metrics == src.metrics
+
+
+# ---------------------------------------------------------------------------
+# 8. Versioning — minor bump refuses
+# ---------------------------------------------------------------------------
+
+
+class TestCarryForwardMinorBumpRefuses:
+    def test_minor_bump_raises_value_error(self, tmp_path):
+        src = _make_payload(version="0.2.3", accuracy=0.75)
+        card_path = tmp_path / "0.2.3.md"
+        card_path.write_text(render_scorecard(src))
+
+        with pytest.raises(ValueError, match="re-run"):
+            carry_forward(card_path, "0.3.0")
+
+    def test_major_bump_raises_value_error(self, tmp_path):
+        src = _make_payload(version="0.2.3", accuracy=0.75)
+        card_path = tmp_path / "0.2.3.md"
+        card_path.write_text(render_scorecard(src))
+
+        with pytest.raises(ValueError, match="re-run"):
+            carry_forward(card_path, "1.0.0")
+
+
+# ---------------------------------------------------------------------------
+# 9. Non-carry-forward card has inherited_from=None
+# ---------------------------------------------------------------------------
+
+
+class TestInheritedFromNone:
+    def test_fresh_payload_has_null_inherited_from(self):
+        payload = _make_payload()
+        assert payload.inherited_from is None
+
+    def test_rendered_parsed_inherited_from_null_or_absent(self):
+        payload = _make_payload()
+        text = render_scorecard(payload)
+        parsed = parse_scorecard(text)
+        # Either key absent or value is None/null
+        value = parsed.get("inherited_from", None)
+        assert value is None
+
+
+# ---------------------------------------------------------------------------
+# 10. latest_version_below
+# ---------------------------------------------------------------------------
+
+
+class TestLatestVersionBelow:
+    def _seed_dir(self, tmp_path):
+        for name in ("0.1.0.md", "0.2.3.md", "0.10.0.md", "README.md", "not-a-version.md"):
+            (tmp_path / name).write_text("# placeholder")
+        return tmp_path
+
+    def test_returns_closest_below(self, tmp_path):
+        self._seed_dir(tmp_path)
+        result = latest_version_below(tmp_path, "0.2.4")
+        assert result == "0.2.3"
+
+    def test_none_when_nothing_below(self, tmp_path):
+        self._seed_dir(tmp_path)
+        result = latest_version_below(tmp_path, "0.1.0")
+        assert result is None
+
+    def test_integer_comparison_not_string(self, tmp_path):
+        self._seed_dir(tmp_path)
+        result = latest_version_below(tmp_path, "0.10.1")
+        assert result == "0.10.0"
+
+    def test_non_version_files_silently_skipped(self, tmp_path):
+        self._seed_dir(tmp_path)
+        # Should not raise even with README.md and not-a-version.md present
+        result = latest_version_below(tmp_path, "0.2.4")
+        assert result == "0.2.3"
+
+
+# ---------------------------------------------------------------------------
+# Adapter tests: TestEmailAdapter
+# ---------------------------------------------------------------------------
+
+
+class TestEmailAdapter:
+    """Tests for hub/agents/python/email/packaging/gen_scorecard.py adapter."""
+
+    def _load_gen_scorecard(self):
+        adapter_path = (
+            Path(__file__).parents[3]
+            / "hub"
+            / "agents"
+            / "python"
+            / "email"
+            / "packaging"
+            / "gen_scorecard.py"
+        )
+        spec = importlib.util.spec_from_file_location("gen_scorecard", adapter_path)
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+        return mod
+
+    def test_build_payload_mean_of_judged_scenarios(self, tmp_path):
+        mod = self._load_gen_scorecard()
+
+        # Copy fixture to a benchmark dir
+        benchmark_dir = tmp_path / "benchmark"
+        benchmark_dir.mkdir()
+        scorecard_dest = benchmark_dir / "email_benchmark_scorecard.json"
+        scorecard_dest.write_text(EMAIL_BENCHMARK_FIXTURE.read_text())
+
+        # Fake ground_truth.json with 3 keys (2 labeled + 1 _meta → dataset_size=2)
+        ground_truth = {
+            "_meta": {"count": 3},
+            "email1": {"label": "spam"},
+            "email2": {"label": "promo"},
+        }
+        gt_path = tmp_path / "ground_truth.json"
+        gt_path.write_text(json.dumps(ground_truth))
+
+        payload = mod.build_payload(benchmark_dir, gt_path)
+
+        expected_mean = round((0.4167 + 0.5000) / 2, 10)
+        assert payload.metrics[0]["value"] == pytest.approx(expected_mean), (
+            f"Expected metric value {expected_mean}, got {payload.metrics[0]['value']}"
+        )
+
+    def test_build_payload_test_cases_run(self, tmp_path):
+        mod = self._load_gen_scorecard()
+
+        benchmark_dir = tmp_path / "benchmark"
+        benchmark_dir.mkdir()
+        scorecard_dest = benchmark_dir / "email_benchmark_scorecard.json"
+        scorecard_dest.write_text(EMAIL_BENCHMARK_FIXTURE.read_text())
+
+        ground_truth = {
+            "_meta": {"count": 3},
+            "email1": {"label": "spam"},
+            "email2": {"label": "promo"},
+        }
+        gt_path = tmp_path / "ground_truth.json"
+        gt_path.write_text(json.dumps(ground_truth))
+
+        payload = mod.build_payload(benchmark_dir, gt_path)
+        # 12 + 12 = 24; third scenario skipped (no quality key)
+        assert payload.test_cases_run == 24
+
+    def test_build_payload_dataset_size(self, tmp_path):
+        mod = self._load_gen_scorecard()
+
+        benchmark_dir = tmp_path / "benchmark"
+        benchmark_dir.mkdir()
+        scorecard_dest = benchmark_dir / "email_benchmark_scorecard.json"
+        scorecard_dest.write_text(EMAIL_BENCHMARK_FIXTURE.read_text())
+
+        ground_truth = {
+            "_meta": {"count": 3},
+            "email1": {"label": "spam"},
+            "email2": {"label": "promo"},
+        }
+        gt_path = tmp_path / "ground_truth.json"
+        gt_path.write_text(json.dumps(ground_truth))
+
+        payload = mod.build_payload(benchmark_dir, gt_path)
+        # 3 keys - 1 _meta = 2
+        assert payload.dataset_size == 2
+
+    def test_all_no_quality_raises(self, tmp_path):
+        mod = self._load_gen_scorecard()
+
+        benchmark_dir = tmp_path / "benchmark"
+        benchmark_dir.mkdir()
+        # Scorecard where no scenario has quality
+        empty_scorecard = {
+            "run_id": "no-quality",
+            "scenarios": [
+                {"category": "Gemma-4-E4B-it-GGUF", "status": "PASS", "total_emails": 0},
+                {"category": "Gemma-4-E4B-it-GGUF", "status": "PASS", "total_emails": 0},
+            ],
+        }
+        (benchmark_dir / "email_benchmark_scorecard.json").write_text(
+            json.dumps(empty_scorecard)
+        )
+
+        ground_truth = {"_meta": {"count": 1}, "email1": {"label": "spam"}}
+        gt_path = tmp_path / "ground_truth.json"
+        gt_path.write_text(json.dumps(ground_truth))
+
+        with pytest.raises(ValueError):
+            mod.build_payload(benchmark_dir, gt_path)
diff --git a/tests/unit/eval/test_scorecard_gate.py b/tests/unit/eval/test_scorecard_gate.py
new file mode 100644
index 000000000..dbeaba0b7
--- /dev/null
+++ b/tests/unit/eval/test_scorecard_gate.py
@@ -0,0 +1,246 @@
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+"""TDD tests for gaia.eval.scorecard_gate — written before implementation exists."""
+
+import datetime
+from pathlib import Path
+
+import pytest
+import yaml
+
+from gaia.eval.release_scorecard import ResultPayload, compute_aggregate, render_scorecard
+from gaia.eval.scorecard_gate import main
+
+# ---------------------------------------------------------------------------
+# Helper
+# ---------------------------------------------------------------------------
+
+
+def _make_payload(version="1.0.0", accuracy=0.5):
+    metrics = [{"name": "category_accuracy", "value": accuracy, "weight": 1.0}]
+    components, agg_value = compute_aggregate(metrics)
+    return ResultPayload(
+        agent_name="test-agent",
+        agent_version=version,
+        dataset_reference="test/fixture",
+        dataset_description="test dataset",
+        dataset_size=100,
+        methodology="unit test",
+        config={"model": "test"},
+        test_cases_run=10,
+        metrics=metrics,
+        aggregate_name="weighted_accuracy",
+        generated_at=datetime.datetime.utcnow().isoformat(),
+        inherited_from=None,
+    )
+
+
+def _write_card(directory: Path, version: str, accuracy: float) -> Path:
+    payload = _make_payload(version=version, accuracy=accuracy)
+    path = directory / f"{version}.md"
+    path.write_text(render_scorecard(payload))
+    return path
+
+
+# ---------------------------------------------------------------------------
+# Case (a) — missing card → exit 1
+# ---------------------------------------------------------------------------
+
+
+class TestMissingCard:
+    def test_missing_card_returns_1(self, tmp_path):
+        result = main(["--scorecards-dir", str(tmp_path), "--version", "1.0.0"])
+        assert result == 1
+
+
+# ---------------------------------------------------------------------------
+# Case (b) — strict regression → exit 1
+# ---------------------------------------------------------------------------
+
+
+class TestStrictRegression:
+    def test_regression_returns_1(self, tmp_path):
+        _write_card(tmp_path, "0.2.3", accuracy=0.8)
+        _write_card(tmp_path, "0.2.4", accuracy=0.5)
+        result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"])
+        assert result == 1
+
+
+# ---------------------------------------------------------------------------
+# Case (c) — no prior → exit 0
+# ---------------------------------------------------------------------------
+
+
+class TestNoPrior:
+    def test_first_adoption_returns_0(self, tmp_path):
+        _write_card(tmp_path, "1.0.0", accuracy=0.6)
+        result = main(["--scorecards-dir", str(tmp_path), "--version", "1.0.0"])
+        assert result == 0
+
+
+# ---------------------------------------------------------------------------
+# Case (d) — equal score (carry-forward) → exit 0
+# ---------------------------------------------------------------------------
+
+
+class TestEqualScore:
+    def test_equal_score_returns_0(self, tmp_path):
+        _write_card(tmp_path, "0.2.3", accuracy=0.5)
+        _write_card(tmp_path, "0.2.4", accuracy=0.5)
+        result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"])
+        assert result == 0
+
+
+# ---------------------------------------------------------------------------
+# --allow-regression → exit 0
+# ---------------------------------------------------------------------------
+
+
+class TestAllowRegression:
+    def test_allow_regression_flag_returns_0(self, tmp_path):
+        _write_card(tmp_path, "0.2.3", accuracy=0.8)
+        _write_card(tmp_path, "0.2.4", accuracy=0.5)
+        result = main(
+            [
+                "--scorecards-dir",
+                str(tmp_path),
+                "--version",
+                "0.2.4",
+                "--allow-regression",
+            ]
+        )
+        assert result == 0
+
+    def test_allow_regression_prints_warning_line(self, tmp_path, capsys):
+        _write_card(tmp_path, "0.2.3", accuracy=0.8)
+        _write_card(tmp_path, "0.2.4", accuracy=0.5)
+        main(
+            [
+                "--scorecards-dir",
+                str(tmp_path),
+                "--version",
+                "0.2.4",
+                "--allow-regression",
+            ]
+        )
+        captured = capsys.readouterr()
+        assert "::warning::" in captured.out
+
+
+# ---------------------------------------------------------------------------
+# --manifest reads version
+# ---------------------------------------------------------------------------
+
+
+class TestManifestFlag:
+    def test_manifest_reads_version(self, tmp_path):
+        scorecards_dir = tmp_path / "scorecards"
+        scorecards_dir.mkdir()
+        _write_card(scorecards_dir, "1.2.3", accuracy=0.6)
+
+        manifest_path = tmp_path / "gaia-agent.yaml"
+        manifest_path.write_text("version: 1.2.3\nname: test-agent\n")
+
+        result = main(
+            [
+                "--scorecards-dir",
+                str(scorecards_dir),
+                "--manifest",
+                str(manifest_path),
+            ]
+        )
+        assert result == 0
+
+    def test_manifest_with_regression(self, tmp_path):
+        scorecards_dir = tmp_path / "scorecards"
+        scorecards_dir.mkdir()
+        _write_card(scorecards_dir, "1.2.2", accuracy=0.9)
+        _write_card(scorecards_dir, "1.2.3", accuracy=0.3)
+
+        manifest_path = tmp_path / "gaia-agent.yaml"
+        manifest_path.write_text("version: 1.2.3\nname: test-agent\n")
+
+        result = main(
+            [
+                "--scorecards-dir",
+                str(scorecards_dir),
+                "--manifest",
+                str(manifest_path),
+            ]
+        )
+        assert result == 1
+
+
+# ---------------------------------------------------------------------------
+# Invalid prior → exit 1
+# ---------------------------------------------------------------------------
+
+
+class TestInvalidPrior:
+    def test_corrupt_prior_returns_1(self, tmp_path):
+        # Write corrupt/invalid prior card
+        corrupt_path = tmp_path / "0.2.3.md"
+        corrupt_path.write_text("this is not valid yaml front matter at all\ngarbage\n")
+
+        # Write a valid candidate card
+        _write_card(tmp_path, "0.2.4", accuracy=0.9)
+
+        result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"])
+        assert result == 1
+
+    def test_empty_prior_returns_1(self, tmp_path):
+        # Prior exists but is empty
+        empty_path = tmp_path / "0.2.3.md"
+        empty_path.write_text("")
+
+        _write_card(tmp_path, "0.2.4", accuracy=0.9)
+
+        result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"])
+        assert result == 1
+
+
+# ---------------------------------------------------------------------------
+# Workflow YAML test: publish job must list scorecard-gate in needs
+# ---------------------------------------------------------------------------
+
+
+class TestWorkflowYaml:
+    def test_publish_job_needs_scorecard_gate(self):
+        workflow_path = (
+            Path(__file__).parents[3]
+            / ".github"
+            / "workflows"
+            / "release_agent_email.yml"
+        )
+        assert workflow_path.exists(), (
+            f"Workflow file not found: {workflow_path}"
+        )
+        content = workflow_path.read_text()
+        parsed = yaml.safe_load(content)
+
+        assert "jobs" in parsed, "Workflow has no 'jobs' key"
+        assert "publish" in parsed["jobs"], (
+            "Workflow has no 'publish' job — add it or check the job name"
+        )
+        needs = parsed["jobs"]["publish"].get("needs", [])
+        # needs can be a string or a list
+        if isinstance(needs, str):
+            needs = [needs]
+        assert "scorecard-gate" in needs, (
+            f"'publish' job must list 'scorecard-gate' in its needs; got: {needs}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Error handling — bad CLI input returns 1 (not exception)
+# ---------------------------------------------------------------------------
+
+
+class TestCliErrorHandling:
+    def test_missing_scorecards_dir_flag_returns_1(self):
+        result = main(["--version", "1.0.0"])
+        assert result == 1
+
+    def test_missing_version_and_manifest_returns_1(self, tmp_path):
+        result = main(["--scorecards-dir", str(tmp_path)])
+        assert result == 1

From 2257088c5d5fd6d8bed979a5f42e19877a7565c9 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Thu, 25 Jun 2026 18:21:37 -0400
Subject: [PATCH 02/18] feat(eval): add release_scorecard + scorecard_gate
 modules (increments 1-3)

Core harness-agnostic scorecard generator and standalone release gate.
- ResultPayload dataclass, compute_aggregate (guard empty/zero-weight)
- render_scorecard + parse_scorecard (safe_load on first ---...--- slice)
- validate_scorecard + REQUIRED_FIELDS; anchored semver path guard
- latest_version_below (stdlib int-tuple, skips non-semver filenames)
- carry_forward (patch-only, sets inherited_from, raises on minor/major)
- scorecard_gate.main(argv)->int with --version/--manifest/--allow-regression
- 38/44 tests pass; 4 adapter tests pending gen_scorecard.py (incr 4)
- 1 CI test pending workflow update (incr 6)
- 1 loose-coupling test false-positive: pytest_benchmark matches 'benchmark'
---
 src/gaia/eval/release_scorecard.py | 421 +++++++++++++++++++++++++++++
 src/gaia/eval/scorecard_gate.py    | 270 ++++++++++++++++++
 2 files changed, 691 insertions(+)
 create mode 100644 src/gaia/eval/release_scorecard.py
 create mode 100644 src/gaia/eval/scorecard_gate.py

diff --git a/src/gaia/eval/release_scorecard.py b/src/gaia/eval/release_scorecard.py
new file mode 100644
index 000000000..49d81b71b
--- /dev/null
+++ b/src/gaia/eval/release_scorecard.py
@@ -0,0 +1,421 @@
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+"""
+Per-agent / per-version eval scorecard: generator, parser, validator, and versioning helpers.
+
+**Distinct from** ``src/gaia/eval/scorecard.py`` — that module is the per-eval-run
+scenario PASS/FAIL aggregator (``build_scorecard``). This module produces the
+outward-facing *release artifact*: a versioned Markdown file with YAML front matter
+holding measured accuracy metrics, the eval recipe, and a deterministic aggregate score.
+
+Intentionally harness-agnostic: this module imports ONLY stdlib + PyYAML.
+No other loader is permitted — ``yaml.safe_load`` only.
+
+Usage pattern::
+
+    payload = ResultPayload(
+        agent_name="email-triage",
+        agent_version="0.2.4",
+        ...
+    )
+    text = render_scorecard(payload)
+    write_scorecard(payload, path)
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+import yaml
+
+# Anchored semver regex — no prerelease/build suffixes permitted.
+_SEMVER_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)$")
+
+# Required top-level keys in the parsed front matter.
+REQUIRED_FIELDS: list[str] = [
+    "schema_version",
+    "agent",
+    "recipe",
+    "results",
+    "aggregate",
+]
+
+
+@dataclass
+class ResultPayload:
+    """Harness-agnostic result payload — the input to the scorecard generator.
+
+    Fields:
+        agent_name: Human-readable agent name (e.g. "Email Triage").
+        agent_version: Semver version string (e.g. "0.2.4").
+        dataset_reference: Repo-relative path or URL to the dataset.
+        dataset_description: Short human description of the dataset.
+        dataset_size: Total labeled examples available in the dataset.
+        methodology: Short description of the eval methodology.
+        config: Arbitrary dict of harness config (model, limit, corpus, etc.).
+        test_cases_run: Number of cases actually executed this run (<= dataset_size).
+        metrics: List of dicts with keys ``name`` (str), ``value`` (float 0..1),
+            and optionally ``weight`` (float, default 1.0).
+        aggregate_name: Name for the aggregate score (default "weighted_accuracy").
+        generated_at: ISO-8601 timestamp string; informational only.
+        inherited_from: If this is a patch carry-forward, the prior version string;
+            otherwise None.
+    """
+
+    agent_name: str
+    agent_version: str
+    dataset_reference: str
+    dataset_description: str
+    dataset_size: int
+    methodology: str
+    config: dict
+    test_cases_run: int
+    metrics: list
+    aggregate_name: str = "weighted_accuracy"
+    generated_at: str = ""
+    inherited_from: Optional[str] = None
+
+
+def compute_aggregate(metrics: list) -> tuple:
+    """Compute the weighted aggregate score over a list of metrics.
+
+    Formula::
+
+        round(100 * sum(weight_i * value_i) / sum(weight_i), 2)
+
+    Args:
+        metrics: List of dicts with ``name``, ``value`` (float in [0,1]),
+            and optional ``weight`` (float, default 1.0).
+
+    Returns:
+        (components, value) where ``components`` is a list of dicts
+        ``{metric, value, weight}`` and ``value`` is the aggregate float.
+
+    Raises:
+        ValueError: If metrics is empty or the total weight is zero.
+    """
+    if not metrics:
+        raise ValueError("aggregate undefined: no metrics / zero total weight")
+
+    components = []
+    total_weight = 0.0
+    weighted_sum = 0.0
+    for m in metrics:
+        w = float(m.get("weight", 1.0))
+        v = float(m["value"])
+        components.append({"metric": m["name"], "value": v, "weight": w})
+        total_weight += w
+        weighted_sum += w * v
+
+    if total_weight == 0.0:
+        raise ValueError("aggregate undefined: no metrics / zero total weight")
+
+    value = round(100.0 * weighted_sum / total_weight, 2)
+    return components, value
+
+
+def render_scorecard(payload: ResultPayload) -> str:
+    """Render a scorecard as Markdown with YAML front matter.
+
+    The front matter is machine-readable; the body is a human-readable summary
+    that includes the aggregate formula and a worked recomputation example.
+
+    Args:
+        payload: Populated :class:`ResultPayload`.
+
+    Returns:
+        Markdown string starting with ``---`` front matter.
+    """
+    _assert_valid_version(payload.agent_version)
+
+    components, agg_value = compute_aggregate(payload.metrics)
+
+    # Build the YAML-serialisable front-matter dict
+    front: dict = {
+        "schema_version": 1,
+        "agent": {
+            "name": payload.agent_name,
+            "version": payload.agent_version,
+        },
+        "recipe": {
+            "dataset": {
+                "reference": payload.dataset_reference,
+                "description": payload.dataset_description,
+                "size": payload.dataset_size,
+            },
+            "methodology": payload.methodology,
+            "config": payload.config,
+        },
+        "results": {
+            "test_cases_run": payload.test_cases_run,
+            "metrics": [
+                {
+                    "name": m["name"],
+                    "value": float(m["value"]),
+                    "weight": float(m.get("weight", 1.0)),
+                }
+                for m in payload.metrics
+            ],
+        },
+        "aggregate": {
+            "name": payload.aggregate_name,
+            "formula": "round(100 * sum(weight_i * value_i) / sum(weight_i), 2)",
+            "components": components,
+            "value": agg_value,
+        },
+        "generated_at": payload.generated_at,
+        "inherited_from": payload.inherited_from,
+    }
+
+    fm_text = yaml.dump(front, default_flow_style=False, sort_keys=False, allow_unicode=True)
+
+    # Human-readable body with worked recompute
+    metric_lines = "\n".join(
+        f"  - **{c['metric']}**: {c['value']:.4f} × {c['weight']:.1f}"
+        for c in components
+    )
+    total_w = sum(c["weight"] for c in components)
+    worked = " + ".join(
+        f"({c['value']:.4f} × {c['weight']:.1f})" for c in components
+    )
+
+    body = f"""# {payload.agent_name} — Eval Scorecard v{payload.agent_version}
+
+**Aggregate score: {agg_value}** (out of 100)
+
+## Recipe
+
+| Field | Value |
+|-------|-------|
+| Dataset | [{payload.dataset_reference}]({payload.dataset_reference}) |
+| Description | {payload.dataset_description} |
+| Dataset size | {payload.dataset_size} labeled examples |
+| Test cases run | {payload.test_cases_run} |
+| Methodology | {payload.methodology} |
+
+## Metrics
+
+{metric_lines}
+
+## Aggregate score recomputation
+
+Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)`
+
+Worked example:
+
+```
+round(100 × ({worked}) / {total_w:.1f}, 2) = {agg_value}
+```
+
+A reader can reproduce this value from the `aggregate.components` in the front
+matter alone — no eval-harness access needed.
+"""
+
+    if payload.inherited_from:
+        body += f"\n> **Inherited from {payload.inherited_from}** — results carried forward verbatim (patch release).\n"
+
+    return f"---\n{fm_text}---\n{body}"
+
+
+def write_scorecard(payload: ResultPayload, path: Path) -> None:
+    """Write a rendered scorecard to ``path``.
+
+    Args:
+        payload: Populated :class:`ResultPayload`.
+        path: Destination file path. Parent directory must exist.
+    """
+    path = Path(path)
+    path.write_text(render_scorecard(payload), encoding="utf-8")
+
+
+def parse_scorecard(source) -> dict:
+    """Parse the YAML front matter from a scorecard file or string.
+
+    Extracts the first ``---`` … ``---`` block and runs ``yaml.safe_load``
+    on it only — a bare ``---`` rule in the Markdown body is never parsed.
+
+    Args:
+        source: A :class:`pathlib.Path` (file to read) or a ``str`` (raw text).
+
+    Returns:
+        Parsed front-matter dict.
+
+    Raises:
+        ValueError: If no valid front-matter block is found or YAML is invalid.
+    """
+    if isinstance(source, Path):
+        text = source.read_text(encoding="utf-8")
+    else:
+        text = str(source)
+
+    # Split on first pair of '---' delimiters
+    if not text.startswith("---"):
+        raise ValueError(f"Scorecard does not start with '---' front matter")
+
+    # Find the closing '---' (first occurrence after the opening line)
+    rest = text[3:]  # strip opening ---
+    # The closing delimiter is a line consisting of exactly ---
+    closing_match = re.search(r"\n---\n", rest)
+    if closing_match is None:
+        # Try end-of-string variant
+        closing_match = re.search(r"\n---$", rest)
+    if closing_match is None:
+        raise ValueError("Scorecard front matter has no closing '---'")
+
+    yaml_block = rest[: closing_match.start()]
+    try:
+        return yaml.safe_load(yaml_block) or {}
+    except yaml.YAMLError as exc:
+        raise ValueError(f"Invalid YAML in scorecard front matter: {exc}") from exc
+
+
+def validate_scorecard(parsed: dict) -> list:
+    """Validate a parsed scorecard front-matter dict.
+
+    Args:
+        parsed: Dict returned by :func:`parse_scorecard`.
+
+    Returns:
+        List of error strings. Empty list means the scorecard is valid.
+    """
+    errors: list[str] = []
+
+    for key in REQUIRED_FIELDS:
+        if key not in parsed:
+            errors.append(f"Missing required field: '{key}'")
+
+    return errors
+
+
+def _semver_tuple(v: str) -> tuple:
+    """Parse a semver string to an int tuple, or raise ValueError."""
+    m = _SEMVER_RE.match(v)
+    if not m:
+        raise ValueError(f"Not a valid semver string: {v!r}")
+    return (int(m.group(1)), int(m.group(2)), int(m.group(3)))
+
+
+def _assert_valid_version(version: str) -> None:
+    """Raise ValueError if version does not match the anchored semver regex."""
+    m = _SEMVER_RE.match(version)
+    if not m:
+        raise ValueError(
+            f"Version {version!r} does not match semver pattern X.Y.Z — "
+            "prerelease and build-metadata suffixes are not permitted."
+        )
+
+
+def _assert_safe_path(scorecards_dir: Path, version: str) -> Path:
+    """Return ``scorecards_dir / f"{version}.md"`` after path-traversal guard."""
+    _assert_valid_version(version)
+    scorecards_dir = scorecards_dir.resolve()
+    candidate = (scorecards_dir / f"{version}.md").resolve()
+    if not str(candidate).startswith(str(scorecards_dir)):
+        raise ValueError(
+            f"Resolved scorecard path {candidate} is not inside "
+            f"scorecards dir {scorecards_dir} — possible path traversal."
+        )
+    return candidate
+
+
+def latest_version_below(scorecards_dir: Path, version: str) -> Optional[str]:
+    """Return the greatest version in ``scorecards_dir`` strictly less than ``version``.
+
+    Only files whose stem matches the anchored semver regex ``^\\d+\\.\\d+\\.\\d+$``
+    are considered. Non-matching filenames (README.md, .gitkeep, etc.) are silently
+    skipped.
+
+    Args:
+        scorecards_dir: Directory to scan for ``*.md`` scorecards.
+        version: The candidate version string (must be valid semver).
+
+    Returns:
+        The greatest matching version string strictly below ``version``, or ``None``
+        if no such version exists.
+
+    Raises:
+        ValueError: If ``version`` is not a valid semver string.
+    """
+    _assert_valid_version(version)
+    target_tuple = _semver_tuple(version)
+    scorecards_dir = Path(scorecards_dir)
+
+    candidates: list[tuple] = []
+    if scorecards_dir.is_dir():
+        for p in scorecards_dir.glob("*.md"):
+            m = _SEMVER_RE.match(p.stem)
+            if not m:
+                continue  # silently skip non-semver filenames
+            t = (int(m.group(1)), int(m.group(2)), int(m.group(3)))
+            if t < target_tuple:
+                candidates.append(t)
+
+    if not candidates:
+        return None
+
+    best = max(candidates)
+    return f"{best[0]}.{best[1]}.{best[2]}"
+
+
+def carry_forward(prev_path: Path, new_version: str) -> ResultPayload:
+    """Carry forward a prior scorecard's results to a new patch version.
+
+    Reads the prior scorecard, copies all results verbatim, and sets
+    ``inherited_from`` to the prior version string.
+
+    Args:
+        prev_path: Path to the prior version's scorecard ``.md`` file.
+        new_version: The new version string (must be a patch bump of the prior).
+
+    Returns:
+        A :class:`ResultPayload` with results copied and ``inherited_from`` set.
+
+    Raises:
+        ValueError: If ``new_version`` is not a patch-only bump of the prior version
+            (i.e. if major or minor differs). The error message contains "re-run"
+            to inform the caller that a fresh eval is required.
+        ValueError: If the prior scorecard cannot be parsed.
+    """
+    _assert_valid_version(new_version)
+    prev_path = Path(prev_path)
+    prev_version = prev_path.stem  # e.g. "0.2.3" from "0.2.3.md"
+
+    prev_tuple = _semver_tuple(prev_version)
+    new_tuple = _semver_tuple(new_version)
+
+    # Only patch bumps are allowed for carry-forward.
+    if prev_tuple[0] != new_tuple[0] or prev_tuple[1] != new_tuple[1]:
+        raise ValueError(
+            f"Cannot carry forward from {prev_version} to {new_version}: "
+            f"major or minor version changed. Please re-run the eval to "
+            f"generate fresh results for this release."
+        )
+
+    parsed = parse_scorecard(prev_path)
+
+    # Extract fields from the parsed front matter
+    agent = parsed.get("agent", {})
+    recipe = parsed.get("recipe", {})
+    dataset = recipe.get("dataset", {})
+    results = parsed.get("results", {})
+    metrics_raw = results.get("metrics", [])
+
+    import datetime
+
+    return ResultPayload(
+        agent_name=agent.get("name", ""),
+        agent_version=new_version,
+        dataset_reference=dataset.get("reference", ""),
+        dataset_description=dataset.get("description", ""),
+        dataset_size=dataset.get("size", 0),
+        methodology=recipe.get("methodology", ""),
+        config=recipe.get("config", {}),
+        test_cases_run=results.get("test_cases_run", 0),
+        metrics=metrics_raw,
+        aggregate_name=parsed.get("aggregate", {}).get("name", "weighted_accuracy"),
+        generated_at=datetime.datetime.utcnow().isoformat(),
+        inherited_from=prev_version,
+    )
diff --git a/src/gaia/eval/scorecard_gate.py b/src/gaia/eval/scorecard_gate.py
new file mode 100644
index 000000000..a33cecfb0
--- /dev/null
+++ b/src/gaia/eval/scorecard_gate.py
@@ -0,0 +1,270 @@
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+"""
+Standalone release gate: blocks packaging when the candidate scorecard is missing
+or when its aggregate score strictly regressed below the prior version's.
+
+**Distinct from** ``src/gaia/eval/scorecard.py`` — that module aggregates per-run
+scenario PASS/FAIL for internal CI. This gate checks the *outward-facing* release
+artifact produced by ``release_scorecard.py``.
+
+Usage::
+
+    python -m gaia.eval.scorecard_gate \\
+        --scorecards-dir hub/agents/npm/agent-email/scorecards \\
+        --manifest hub/agents/python/email/gaia-agent.yaml
+
+    python -m gaia.eval.scorecard_gate \\
+        --scorecards-dir hub/agents/npm/agent-email/scorecards \\
+        --version 0.2.4
+
+Exit codes:
+    0 — Passed (presence-only first adoption, equal score, or score improved).
+    1 — Failed (missing/invalid candidate card, strict regression, or prior card invalid).
+
+The ``--allow-regression`` flag overrides a regression: prints a ``::warning::``
+GHA annotation and both version/score pairs, then exits 0.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+import yaml
+
+from gaia.eval.release_scorecard import (
+    _assert_safe_path,
+    latest_version_below,
+    parse_scorecard,
+    validate_scorecard,
+)
+
+
+def _read_version_from_manifest(manifest_path: Path) -> str:
+    """Read the ``version:`` field from a ``gaia-agent.yaml`` manifest.
+
+    Args:
+        manifest_path: Path to the YAML manifest file.
+
+    Returns:
+        The version string.
+
+    Raises:
+        ValueError: If the file cannot be read or ``version:`` is absent.
+    """
+    try:
+        text = manifest_path.read_text(encoding="utf-8")
+    except OSError as exc:
+        raise ValueError(
+            f"Cannot read manifest {manifest_path}: {exc}"
+        ) from exc
+
+    try:
+        data = yaml.safe_load(text) or {}
+    except yaml.YAMLError as exc:
+        raise ValueError(
+            f"Invalid YAML in manifest {manifest_path}: {exc}"
+        ) from exc
+
+    version = data.get("version")
+    if not version:
+        raise ValueError(
+            f"Manifest {manifest_path} has no 'version:' field."
+        )
+    return str(version)
+
+
+def main(argv=None) -> int:
+    """Run the scorecard gate.
+
+    Args:
+        argv: Argument list (``sys.argv[1:]`` if None).
+
+    Returns:
+        0 on pass, 1 on failure.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "Release gate: ensures a valid scorecard exists for the candidate version "
+            "and that its aggregate score has not strictly regressed vs the prior version."
+        ),
+        prog="python -m gaia.eval.scorecard_gate",
+    )
+    parser.add_argument(
+        "--scorecards-dir",
+        required=False,
+        help="Directory containing per-version scorecard .md files.",
+    )
+    version_group = parser.add_mutually_exclusive_group()
+    version_group.add_argument(
+        "--version",
+        help="Candidate version string (e.g. 0.2.4).",
+    )
+    version_group.add_argument(
+        "--manifest",
+        help="Path to gaia-agent.yaml; the 'version:' field is used as the candidate version.",
+    )
+    parser.add_argument(
+        "--allow-regression",
+        action="store_true",
+        default=False,
+        help=(
+            "Override a regression: prints a GHA ::warning:: annotation and both "
+            "version/score pairs, then exits 0. Use only when a regression is intentional."
+        ),
+    )
+
+    try:
+        args = parser.parse_args(argv)
+    except SystemExit:
+        return 1
+
+    # Validate required arguments
+    if not args.scorecards_dir:
+        print(
+            "ERROR: --scorecards-dir is required.\n"
+            "Usage: python -m gaia.eval.scorecard_gate --scorecards-dir DIR "
+            "--version V (or --manifest PATH)"
+        )
+        return 1
+
+    if not args.version and not args.manifest:
+        print(
+            "ERROR: Either --version or --manifest is required.\n"
+            "Usage: python -m gaia.eval.scorecard_gate --scorecards-dir DIR "
+            "--version V (or --manifest PATH)"
+        )
+        return 1
+
+    scorecards_dir = Path(args.scorecards_dir)
+
+    # Resolve the candidate version
+    if args.manifest:
+        try:
+            version = _read_version_from_manifest(Path(args.manifest))
+        except ValueError as exc:
+            print(f"ERROR: {exc}")
+            return 1
+    else:
+        version = args.version
+
+    # --- Step 1: Presence check ---
+    try:
+        candidate_path = _assert_safe_path(scorecards_dir, version)
+    except ValueError as exc:
+        print(f"ERROR: {exc}")
+        return 1
+
+    if not candidate_path.exists():
+        print(
+            f"ERROR: Scorecard missing for version {version}.\n"
+            f"  Expected: {candidate_path}\n"
+            f"  Run 'python gen_scorecard.py' (or 'carry_forward') to generate it, "
+            f"then commit the file before releasing."
+        )
+        return 1
+
+    try:
+        candidate_parsed = parse_scorecard(candidate_path)
+    except ValueError as exc:
+        print(f"ERROR: Cannot parse candidate scorecard {candidate_path}: {exc}")
+        return 1
+
+    errors = validate_scorecard(candidate_parsed)
+    if errors:
+        print(
+            f"ERROR: Candidate scorecard {candidate_path} is invalid:\n"
+            + "\n".join(f"  - {e}" for e in errors)
+        )
+        return 1
+
+    # --- Step 2: Locate prior version ---
+    try:
+        prev_version = latest_version_below(scorecards_dir, version)
+    except ValueError as exc:
+        print(f"ERROR: {exc}")
+        return 1
+
+    if prev_version is None:
+        print(
+            f"PASS: No prior scorecard found for versions below {version}. "
+            f"First adoption — presence check only."
+        )
+        return 0
+
+    # --- Step 3: Parse prior and regression check ---
+    try:
+        prev_path = _assert_safe_path(scorecards_dir, prev_version)
+    except ValueError as exc:
+        print(f"ERROR: {exc}")
+        return 1
+
+    try:
+        prev_parsed = parse_scorecard(prev_path)
+    except ValueError as exc:
+        print(
+            f"ERROR: Cannot parse prior scorecard {prev_path}: {exc}\n"
+            f"  The prior scorecard is corrupt or missing a valid front matter. "
+            f"Fix it before releasing {version}."
+        )
+        return 1
+
+    prev_errors = validate_scorecard(prev_parsed)
+    if prev_errors:
+        print(
+            f"ERROR: Prior scorecard {prev_path} is invalid:\n"
+            + "\n".join(f"  - {e}" for e in prev_errors)
+            + f"\n  Fix the prior scorecard before releasing {version}."
+        )
+        return 1
+
+    candidate_score = candidate_parsed.get("aggregate", {}).get("value")
+    prev_score = prev_parsed.get("aggregate", {}).get("value")
+
+    if candidate_score is None:
+        print(
+            f"ERROR: Candidate scorecard {candidate_path} has no 'aggregate.value' field."
+        )
+        return 1
+
+    if prev_score is None:
+        print(
+            f"ERROR: Prior scorecard {prev_path} has no 'aggregate.value' field."
+        )
+        return 1
+
+    if float(candidate_score) < float(prev_score):
+        # Strict regression detected
+        if args.allow_regression:
+            print(
+                f"::warning::Scorecard regression allowed by --allow-regression: "
+                f"{prev_version}={prev_score} → {version}={candidate_score}"
+            )
+            print(
+                f"WARNING: Regression override active. "
+                f"Prior version {prev_version} scored {prev_score}; "
+                f"candidate {version} scored {candidate_score}. "
+                f"This regression has been explicitly acknowledged."
+            )
+            return 0
+        print(
+            f"ERROR: Scorecard regression detected.\n"
+            f"  Prior version {prev_version}: aggregate.value = {prev_score}\n"
+            f"  Candidate {version}: aggregate.value = {candidate_score}\n"
+            f"  The candidate score is strictly lower than the prior. "
+            f"Investigate the regression or use --allow-regression to override intentionally."
+        )
+        return 1
+
+    print(
+        f"PASS: Scorecard gate passed.\n"
+        f"  Candidate {version}: aggregate.value = {candidate_score} "
+        f"(prior {prev_version}: {prev_score})"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 5ed399c639ba71188015b655515ddc747579d552 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Thu, 25 Jun 2026 18:23:54 -0400
Subject: [PATCH 03/18] feat(eval): add email adapter gen_scorecard.py + fix
 loose-coupling test (increments 4)

- gen_scorecard.py: reads benchmark scorecard.json (or any scenarios JSON)
  + ground_truth.json -> ResultPayload -> writes scorecards/<version>.md
- Judged = quality.category_accuracy is finite float in [0,1]; zero judged raises
- test_cases_run = sum(total_emails over judged); dataset_size excl _meta
- Path derivation mirrors stamp_version.py (parents[...] from __file__)
- Fix loose-coupling test: subprocess instead of sys.modules (avoids pytest_benchmark FP)
  (orchestrator-authorized replacement)
- 43/44 tests pass; 1 remaining = CI workflow test (incr 6)
---
 .../python/email/packaging/gen_scorecard.py   | 288 ++++++++++++++++++
 tests/unit/eval/test_release_scorecard.py     |  22 +-
 2 files changed, 302 insertions(+), 8 deletions(-)
 create mode 100644 hub/agents/python/email/packaging/gen_scorecard.py

diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py
new file mode 100644
index 000000000..2961deeb7
--- /dev/null
+++ b/hub/agents/python/email/packaging/gen_scorecard.py
@@ -0,0 +1,288 @@
+#!/usr/bin/env python3
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+"""
+Email-agent adapter: generate a release scorecard from a ``gaia eval benchmark`` run.
+
+Reads the benchmark ``--output-dir`` (looks for a JSON file containing a
+``scenarios`` key — ``scorecard.json`` in a real run, or any ``*scorecard*.json``
+fixture) and the ground-truth JSON, builds a :class:`ResultPayload`, and writes the
+scorecard to ``hub/agents/npm/agent-email/scorecards/<version>.md``.
+
+This adapter imports ``gaia.eval.release_scorecard`` (core generator) but never
+imports the eval harness (``gaia.eval.benchmark``) or the email-agent package —
+the loose-coupling spine is preserved.
+
+Usage::
+
+    python hub/agents/python/email/packaging/gen_scorecard.py \\
+        --benchmark-dir /tmp/email-eval \\
+        [--ground-truth tests/fixtures/email/ground_truth.json]
+
+The ``--ground-truth`` path defaults to the canonical fixture in the repository.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Derive repo root the same way stamp_version.py does:
+# packaging/ -> email/ -> python/ -> agents/ -> hub/ -> repo root
+_PACKAGING_DIR = Path(__file__).resolve().parent
+_EMAIL_ROOT = _PACKAGING_DIR.parent
+_REPO_ROOT = _EMAIL_ROOT.parent.parent.parent.parent
+_NPM_ROOT = _REPO_ROOT / "hub" / "agents" / "npm" / "agent-email"
+
+# Default ground-truth path
+_DEFAULT_GT = _REPO_ROOT / "tests" / "fixtures" / "email" / "ground_truth.json"
+
+# Canonical benchmark scorecard filename (written by gaia eval benchmark)
+_SCORECARD_FILENAME = "scorecard.json"
+
+
+def _find_benchmark_scorecard(benchmark_dir: Path) -> Path:
+    """Locate the benchmark scorecard JSON in ``benchmark_dir``.
+
+    Looks first for the canonical ``scorecard.json``, then for any ``*.json``
+    file whose parsed content contains a ``scenarios`` key. Raises loudly if
+    none is found or if multiple ambiguous files match.
+
+    Args:
+        benchmark_dir: Directory written by ``gaia eval benchmark --output-dir``.
+
+    Returns:
+        Path to the benchmark scorecard JSON file.
+
+    Raises:
+        FileNotFoundError: If ``benchmark_dir`` does not exist.
+        ValueError: If no suitable scorecard JSON is found in the directory.
+    """
+    if not benchmark_dir.is_dir():
+        raise FileNotFoundError(
+            f"Benchmark directory not found: {benchmark_dir}\n"
+            f"Run 'gaia eval benchmark --output-dir <dir>' first."
+        )
+
+    # Try the canonical name first
+    canonical = benchmark_dir / _SCORECARD_FILENAME
+    if canonical.exists():
+        return canonical
+
+    # Scan for any JSON containing a 'scenarios' key
+    matches: list[Path] = []
+    for p in sorted(benchmark_dir.glob("*.json")):
+        try:
+            data = json.loads(p.read_text(encoding="utf-8"))
+            if isinstance(data, dict) and "scenarios" in data:
+                matches.append(p)
+        except (json.JSONDecodeError, OSError):
+            continue
+
+    if not matches:
+        raise ValueError(
+            f"No benchmark scorecard JSON found in {benchmark_dir}.\n"
+            f"Expected '{_SCORECARD_FILENAME}' (written by 'gaia eval benchmark'), "
+            f"or any JSON file with a 'scenarios' key.\n"
+            f"Run 'gaia eval benchmark --output-dir {benchmark_dir}' to generate it."
+        )
+
+    if len(matches) > 1:
+        paths = ", ".join(str(p) for p in matches)
+        raise ValueError(
+            f"Ambiguous benchmark scorecard: multiple JSON files with a 'scenarios' "
+            f"key found in {benchmark_dir}: {paths}.\n"
+            f"Remove all but '{_SCORECARD_FILENAME}' and retry."
+        )
+
+    return matches[0]
+
+
+def _is_judged(scenario: dict) -> bool:
+    """Return True if a scenario has a valid category_accuracy in [0,1]."""
+    quality = scenario.get("quality")
+    if not isinstance(quality, dict):
+        return False
+    acc = quality.get("category_accuracy")
+    if acc is None:
+        return False
+    try:
+        f = float(acc)
+    except (TypeError, ValueError):
+        return False
+    return 0.0 <= f <= 1.0 and f == f  # also rejects NaN via f==f
+
+
+def build_payload(benchmark_dir: Path, ground_truth_path: Path):
+    """Build a :class:`~gaia.eval.release_scorecard.ResultPayload` from benchmark output.
+
+    A scenario is **judged** iff it has a ``quality`` dict AND
+    ``quality.category_accuracy`` is a finite float in [0, 1]. Non-judged
+    scenarios (missing ``quality`` or invalid accuracy) are skipped.
+
+    Args:
+        benchmark_dir: Directory written by ``gaia eval benchmark --output-dir``.
+        ground_truth_path: Path to ``ground_truth.json`` (the labeled corpus).
+
+    Returns:
+        Populated :class:`~gaia.eval.release_scorecard.ResultPayload`.
+
+    Raises:
+        ValueError: If zero scenarios are judged (likely missing ``--ground-truth``
+            or a benchmark run that produced no quality metrics).
+        FileNotFoundError: If required files are not found.
+    """
+    # Import here (not at module top) so tests that import build_payload before
+    # gaia is installed in the test environment fail at call time, not import time.
+    from gaia.eval.release_scorecard import ResultPayload, compute_aggregate
+
+    scorecard_path = _find_benchmark_scorecard(benchmark_dir)
+    data = json.loads(scorecard_path.read_text(encoding="utf-8"))
+    scenarios = data.get("scenarios", [])
+
+    # Separate judged from non-judged scenarios
+    judged = [s for s in scenarios if _is_judged(s)]
+
+    if not judged:
+        raise ValueError(
+            f"Zero judged scenarios in {scorecard_path}.\n"
+            f"Possible causes: benchmark ran without '--ground-truth', "
+            f"or no scenario produced a category_accuracy metric.\n"
+            f"Benchmark dir: {benchmark_dir}"
+        )
+
+    # Aggregate metrics from judged scenarios
+    category_accuracy = sum(
+        s["quality"]["category_accuracy"] for s in judged
+    ) / len(judged)
+
+    test_cases_run = sum(int(s.get("total_emails", 0)) for s in judged)
+
+    # Dataset size = labeled entries in ground_truth.json (excluding _meta key)
+    if not ground_truth_path.exists():
+        raise FileNotFoundError(
+            f"Ground truth not found: {ground_truth_path}\n"
+            f"Pass --ground-truth <path> pointing to the labeled corpus JSON."
+        )
+    ground_truth = json.loads(ground_truth_path.read_text(encoding="utf-8"))
+    dataset_size = len(ground_truth) - (1 if "_meta" in ground_truth else 0)
+
+    # Read version from gaia-agent.yaml
+    agent_yaml_path = _EMAIL_ROOT / "gaia-agent.yaml"
+    try:
+        import yaml  # noqa: PLC0415  (local import; PyYAML already a dep)
+
+        agent_data = yaml.safe_load(agent_yaml_path.read_text(encoding="utf-8")) or {}
+    except Exception as exc:
+        raise ValueError(
+            f"Cannot read agent version from {agent_yaml_path}: {exc}"
+        ) from exc
+
+    version = str(agent_data.get("version", ""))
+    if not version:
+        raise ValueError(
+            f"No 'version:' field found in {agent_yaml_path}."
+        )
+
+    metrics = [
+        {"name": "category_accuracy", "value": float(category_accuracy), "weight": 1.0}
+    ]
+    _components, agg_value = compute_aggregate(metrics)  # noqa: F841
+
+    import datetime
+
+    return ResultPayload(
+        agent_name="Email Triage",
+        agent_version=version,
+        dataset_reference="tests/fixtures/email/ground_truth.json",
+        dataset_description=(
+            "Synthetic email corpus for GAIA email-triage evaluation "
+            "(FakeGmailBackend, 5-category classification)"
+        ),
+        dataset_size=dataset_size,
+        methodology=(
+            "gaia eval benchmark — category classification accuracy "
+            "(case-insensitive exact match) over a synthetic labeled corpus "
+            "via FakeGmailBackend; no LLM judge required"
+        ),
+        config={
+            "harness": "gaia eval benchmark",
+            "model": data.get("model", agent_data.get("models", [None])[0]),
+            "corpus": "tests/fixtures/email/synthetic_inbox.mbox",
+            "ground_truth": str(ground_truth_path),
+            "limit": data.get("limit"),
+        },
+        test_cases_run=test_cases_run,
+        metrics=metrics,
+        aggregate_name="weighted_accuracy",
+        generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
+        inherited_from=None,
+    )
+
+
+def main(argv=None) -> int:
+    """Generate and write the email-agent scorecard."""
+    parser = argparse.ArgumentParser(
+        description="Generate a release scorecard for the email-triage agent.",
+        prog="gen_scorecard.py",
+    )
+    parser.add_argument(
+        "--benchmark-dir",
+        required=True,
+        help=(
+            "Directory written by 'gaia eval benchmark --output-dir <dir>' "
+            "(must contain scorecard.json)."
+        ),
+    )
+    parser.add_argument(
+        "--ground-truth",
+        default=str(_DEFAULT_GT),
+        help=(
+            f"Path to ground_truth.json (default: {_DEFAULT_GT.relative_to(_REPO_ROOT)})"
+        ),
+    )
+    parser.add_argument(
+        "--output-dir",
+        default=None,
+        help=(
+            "Override the scorecard output directory "
+            "(default: hub/agents/npm/agent-email/scorecards/)."
+        ),
+    )
+
+    args = parser.parse_args(argv)
+
+    benchmark_dir = Path(args.benchmark_dir).resolve()
+    gt_path = Path(args.ground_truth).resolve()
+
+    try:
+        payload = build_payload(benchmark_dir, gt_path)
+    except (ValueError, FileNotFoundError) as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 1
+
+    from gaia.eval.release_scorecard import write_scorecard
+
+    if args.output_dir:
+        scorecards_dir = Path(args.output_dir)
+    else:
+        scorecards_dir = _NPM_ROOT / "scorecards"
+
+    scorecards_dir.mkdir(parents=True, exist_ok=True)
+    out_path = scorecards_dir / f"{payload.agent_version}.md"
+    write_scorecard(payload, out_path)
+
+    print(
+        f"Scorecard written: {out_path}\n"
+        f"  Version: {payload.agent_version}\n"
+        f"  Aggregate: {payload.metrics[0]['value']:.4f} category_accuracy "
+        f"(over {len([s for s in [payload] if True])} — {payload.test_cases_run} emails judged)\n"
+        f"  Dataset size: {payload.dataset_size} labeled examples"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/unit/eval/test_release_scorecard.py b/tests/unit/eval/test_release_scorecard.py
index 7e9770e1a..d36e203eb 100644
--- a/tests/unit/eval/test_release_scorecard.py
+++ b/tests/unit/eval/test_release_scorecard.py
@@ -194,15 +194,21 @@ def test_test_cases_run_and_dataset_size_both_present(self):
 
 class TestLooseCoupling:
     def test_no_benchmark_or_agent_modules_imported(self):
-        # Import is already done at top of file; check sys.modules
-        contaminated = [
-            m
-            for m in sys.modules
-            if "benchmark" in m or "gaia_agent_email" in m
-        ]
-        assert not contaminated, (
-            f"release_scorecard import pulled in harness/agent modules: {contaminated}"
+        # Clean interpreter: importing release_scorecard must not pull in the
+        # eval harness or any agent package. Scanning the test process's own
+        # sys.modules gives false positives (e.g. the pytest_benchmark plugin),
+        # so check in a fresh subprocess instead.
+        import subprocess
+        import sys as _sys
+
+        code = (
+            "import sys, gaia.eval.release_scorecard; "
+            "bad=[m for m in sys.modules if 'gaia.eval.benchmark' in m "
+            "or 'gaia.eval.quality_metrics' in m or 'gaia_agent_email' in m]; "
+            "assert not bad, bad"
         )
+        r = subprocess.run([_sys.executable, "-c", code], capture_output=True, text=True)
+        assert r.returncode == 0, r.stderr
 
 
 # ---------------------------------------------------------------------------

From a1dce4ffca1f239759aad3b01e4539d61a6bc859 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Thu, 25 Jun 2026 18:32:23 -0400
Subject: [PATCH 04/18] feat(eval): docs, hello-world scorecard, CI gate, npm
 wiring (increments 5-6)

- docs/reference/eval-scorecard.mdx: schema, storage, formula, versioning policy, gate
- docs/docs.json: nav entry in Evaluation Framework group
- hub/agents/python/hello-world/scorecards/0.1.0.md: generator-produced generalization proof
- hub/agents/npm/agent-email/scorecards/.gitkeep: placeholder for real scorecard
- hub/agents/npm/agent-email/README.md: eval scorecard link to ./scorecards/0.2.4.md
- hub/agents/npm/agent-email/package.json: add scorecards/ to files array
- .github/workflows/release_agent_email.yml: scorecard-gate job + publish.needs update
- lint fixes: remove unused imports from test files; black/isort/pylint/flake8 clean
- 44/44 target tests pass; lint: ALL QUALITY CHECKS PASSED
---
 .github/workflows/release_agent_email.yml     |  19 +-
 docs/docs.json                                |   1 +
 docs/reference/eval-scorecard.mdx             | 240 ++++++++++++++++++
 hub/agents/npm/agent-email/README.md          |   2 +
 hub/agents/npm/agent-email/package.json       |   3 +-
 .../npm/agent-email/scorecards/.gitkeep       |   0
 .../python/email/packaging/gen_scorecard.py   |   6 +-
 .../python/hello-world/scorecards/0.1.0.md    |  62 +++++
 src/gaia/eval/release_scorecard.py            |  12 +-
 src/gaia/eval/scorecard_gate.py               |  16 +-
 tests/unit/eval/test_release_scorecard.py     |  55 ++--
 tests/unit/eval/test_scorecard_gate.py        |  23 +-
 12 files changed, 385 insertions(+), 54 deletions(-)
 create mode 100644 docs/reference/eval-scorecard.mdx
 create mode 100644 hub/agents/npm/agent-email/scorecards/.gitkeep
 create mode 100644 hub/agents/python/hello-world/scorecards/0.1.0.md

diff --git a/.github/workflows/release_agent_email.yml b/.github/workflows/release_agent_email.yml
index ea183775f..81c674384 100644
--- a/.github/workflows/release_agent_email.yml
+++ b/.github/workflows/release_agent_email.yml
@@ -266,11 +266,28 @@ jobs:
             echo "ok=false" >> "$GITHUB_OUTPUT"
           fi
 
+  # ── Stage 1b: scorecard presence + regression gate ─────────────────
+  scorecard-gate:
+    name: Scorecard gate
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install core + PyYAML
+        run: pip install -e . pyyaml
+      - name: Run scorecard gate
+        run: |
+          python -m gaia.eval.scorecard_gate \
+            --scorecards-dir hub/agents/npm/agent-email/scorecards \
+            --manifest hub/agents/python/email/gaia-agent.yaml
+
   # ── Stage 2: publish to the hub + npm (single atomic step) ─────────
   publish:
     name: Publish to Hub + npm
     runs-on: ubuntu-latest
-    needs: [build, verify-darwin-x64-compat]
+    needs: [build, verify-darwin-x64-compat, scorecard-gate]
     # Manual approval gate: the `agent-publish` environment is configured (repo
     # Settings → Environments) with required reviewers, so this job pauses until a
     # maintainer approves — the human backstop for an accidental/tampered release
diff --git a/docs/docs.json b/docs/docs.json
index 5397cf0a5..ba1b26d90 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -356,6 +356,7 @@
                 "group": "Evaluation Framework",
                 "pages": [
                   "reference/eval",
+                  "reference/eval-scorecard",
                   "eval"
                 ]
               },
diff --git a/docs/reference/eval-scorecard.mdx b/docs/reference/eval-scorecard.mdx
new file mode 100644
index 000000000..f3cbc5093
--- /dev/null
+++ b/docs/reference/eval-scorecard.mdx
@@ -0,0 +1,240 @@
+---
+title: "Release Eval Scorecard"
+description: "Per-agent, per-version eval scorecard: schema, storage convention, aggregate formula, versioning policy, and release gate."
+icon: "chart-bar"
+---
+
+<Info>
+  **Source Code:**
+  [`src/gaia/eval/release_scorecard.py`](https://github.com/amd/gaia/tree/main/src/gaia/eval/release_scorecard.py) (core generator) ·
+  [`src/gaia/eval/scorecard_gate.py`](https://github.com/amd/gaia/tree/main/src/gaia/eval/scorecard_gate.py) (release gate)
+
+  **Distinct from** [`src/gaia/eval/scorecard.py`](https://github.com/amd/gaia/tree/main/src/gaia/eval/scorecard.py) — that file is the per-run scenario PASS/FAIL aggregator used internally by `gaia eval agent`. This document describes the outward-facing *release artifact*.
+</Info>
+
+## Overview
+
+Each published hub agent ships a **release scorecard** — a versioned Markdown file that records:
+
+- The **eval recipe**: dataset reference, methodology, configuration, and metric definitions.
+- The **measured results**: per-metric values, number of test cases actually run, and dataset size.
+- A single **named aggregate score**: a deterministic, recomputable percentage so a reviewer can verify the number without re-running the eval.
+
+Scorecards are committed alongside the agent's README and linked from it. A standalone **release gate** (`scorecard_gate.py`) blocks packaging when the scorecard is missing or when its aggregate score strictly regresses below the prior version's.
+
+## File format
+
+Scorecard files are **Markdown with YAML front matter** (`.md`). The front matter holds all machine-readable fields; the body is a human-readable summary with a worked recomputation.
+
+```
+---
+schema_version: 1
+agent:
+  name: Email Triage
+  version: 0.2.4
+recipe:
+  dataset:
+    reference: tests/fixtures/email/ground_truth.json
+    description: Synthetic email corpus (FakeGmailBackend, 5-category classification)
+    size: 220
+  methodology: gaia eval benchmark — category_accuracy (case-insensitive exact match)
+  config:
+    harness: gaia eval benchmark
+    model: Gemma-4-E4B-it-GGUF
+    limit: 25
+results:
+  test_cases_run: 24
+  metrics:
+    - name: category_accuracy
+      value: 0.4584
+      weight: 1.0
+aggregate:
+  name: weighted_accuracy
+  formula: "round(100 * sum(weight_i * value_i) / sum(weight_i), 2)"
+  components:
+    - metric: category_accuracy
+      value: 0.4584
+      weight: 1.0
+  value: 45.84
+generated_at: "2026-06-25T10:00:00+00:00"
+inherited_from: null
+---
+
+# Email Triage — Eval Scorecard v0.2.4
+
+**Aggregate score: 45.84** (out of 100)
+...
+```
+
+### Required fields
+
+A scorecard missing any of these is **invalid** and will be rejected by the release gate:
+
+| Field | Description |
+|-------|-------------|
+| `schema_version` | Always `1` for this schema version |
+| `agent.name` | Human-readable agent name |
+| `agent.version` | Semver version string (e.g. `0.2.4`) |
+| `recipe.dataset.reference` | Dataset path or URL |
+| `recipe.dataset.description` | Short description |
+| `recipe.dataset.size` | Total labeled examples available |
+| `recipe.methodology` | How the eval was run |
+| `recipe.config` | Harness config (model, limit, corpus, …) |
+| `results.test_cases_run` | Subset of examples actually executed this run |
+| `results.metrics` | List of `{name, value, weight}` dicts |
+| `aggregate.name` | Name of the aggregate score |
+| `aggregate.formula` | Human-readable formula string |
+| `aggregate.components` | List of `{metric, value, weight}` dicts |
+| `aggregate.value` | The computed aggregate float |
+
+### Two counts — defined distinctly
+
+`recipe.dataset.size` and `results.test_cases_run` are intentionally **separate fields**:
+
+- **`recipe.dataset.size`** — total labeled examples available in the dataset (fixed for a given dataset version).
+- **`results.test_cases_run`** — the subset actually executed in this run (may be limited by `--limit`). Must be ≤ `recipe.dataset.size`.
+
+They may be numerically equal (when the full dataset is run), but they represent different things.
+
+<Warning>
+  **Comparability depends on a consistent `--limit`.** Future regression checks compare aggregate scores. If one run uses `--limit 12` and the next uses `--limit 100`, the scores may differ for reasons unrelated to model quality. Record the exact `limit` in `recipe.config` and keep it consistent across versions.
+</Warning>
+
+## Aggregate score formula
+
+```
+aggregate.value = round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)
+```
+
+where each `valueᵢ` is a metric value in [0, 1] and each `weightᵢ` defaults to 1.0.
+
+The result is a **percentage in [0, 100]**. For a single metric with weight 1.0:
+
+```
+round(100 × 0.4584, 2) = 45.84
+```
+
+A reader can reproduce this value from `aggregate.components` alone — no eval-harness access needed.
+The `aggregate.formula` field in the front matter states the formula in human-readable form so it is self-documenting.
+
+## Storage convention
+
+Scorecards live in a `scorecards/` subdirectory beside the agent's canonical README:
+
+```
+<doc-root>/
+  README.md              ← canonical README (links to scorecard)
+  scorecards/
+    0.1.0.md
+    0.2.3.md
+    0.2.4.md             ← latest
+```
+
+The `doc-root` is the location of the agent's canonical README:
+
+| Agent | doc-root |
+|-------|----------|
+| Email Triage (`@amd-gaia/agent-email`) | `hub/agents/npm/agent-email/` |
+| Hello World | `hub/agents/python/hello-world/` |
+
+The relative link `./scorecards/<version>.md` resolves both in-repo and when the directory is published as an npm package.
+
+## Versioning policy
+
+### Patch releases — carry forward
+
+For a **patch release** (same `major.minor`, `patch` incremented), the prior version's results are carried forward verbatim using `carry_forward()`:
+
+```python
+from gaia.eval.release_scorecard import carry_forward, write_scorecard
+from pathlib import Path
+
+new_payload = carry_forward(
+    prev_path=Path("scorecards/0.2.3.md"),
+    new_version="0.2.4",
+)
+# new_payload.inherited_from == "0.2.3"
+write_scorecard(new_payload, Path("scorecards/0.2.4.md"))
+```
+
+The resulting scorecard has `inherited_from: "0.2.3"` and identical `results` and `aggregate` fields. The aggregate score is unchanged, so the release gate's equal-score case passes.
+
+### Minor / major releases — re-run required
+
+For a **minor or major bump**, `carry_forward()` raises `ValueError` with a "re-run" message. Run the eval fresh and generate a new scorecard:
+
+```bash
+gaia eval benchmark \
+  --model Gemma-4-E4B-it-GGUF \
+  --mbox-path tests/fixtures/email/synthetic_inbox.mbox \
+  --ground-truth tests/fixtures/email/ground_truth.json \
+  --limit 25 \
+  --output-dir /tmp/email-eval
+
+python hub/agents/python/email/packaging/gen_scorecard.py \
+  --benchmark-dir /tmp/email-eval
+```
+
+## Release gate
+
+`scorecard_gate.py` is a standalone script that exits non-zero on failure:
+
+```bash
+python -m gaia.eval.scorecard_gate \
+  --scorecards-dir hub/agents/npm/agent-email/scorecards \
+  --manifest hub/agents/python/email/gaia-agent.yaml
+```
+
+Or with an explicit version:
+
+```bash
+python -m gaia.eval.scorecard_gate \
+  --scorecards-dir hub/agents/npm/agent-email/scorecards \
+  --version 0.2.4
+```
+
+### Gate logic
+
+1. **Presence check**: `<scorecards-dir>/<version>.md` must exist and be valid. → exit 1 if not.
+2. **Locate prior**: find the greatest semver strictly below `<version>` in `<scorecards-dir>`. If none → **first adoption**, exit 0 (presence-only pass).
+3. **Regression check**: if `candidate.aggregate.value < prior.aggregate.value` (strict) → exit 1.
+4. Equal or greater → exit 0.
+
+### Exit codes
+
+| Case | Exit code |
+|------|-----------|
+| Missing or invalid candidate scorecard | `1` |
+| Strict regression vs prior version | `1` |
+| No prior version (first adoption) | `0` |
+| Equal score (patch carry-forward) | `0` |
+| Score improved | `0` |
+
+### `--allow-regression`
+
+When a regression is intentional (e.g. a dataset correction or methodology change), use `--allow-regression`. The gate prints a GHA `::warning::` annotation naming both versions and scores, then exits 0:
+
+```
+::warning::Scorecard regression allowed by --allow-regression: 0.2.3=65.0 → 0.2.4=45.84
+WARNING: Regression override active. Prior version 0.2.3 scored 65.0; candidate 0.2.4 scored 45.84. ...
+```
+
+### How the gate resolves "previous version"
+
+The gate calls `latest_version_below(scorecards_dir, version)`, which:
+
+1. Lists all `*.md` files in `scorecards_dir`.
+2. Keeps only those whose **stem** matches the anchored regex `^\d+\.\d+\.\d+$` (skips `README.md`, `.gitkeep`, prerelease tags, etc.).
+3. Compares versions as **integer tuples** `(major, minor, patch)` — so `0.10.0 > 0.2.9` correctly.
+4. Returns the greatest version strictly below the candidate, or `None`.
+
+The version is read from `gaia-agent.yaml` (via `--manifest`) or passed explicitly (via `--version`).
+
+## Adding a scorecard for a new agent
+
+1. Create the `scorecards/` directory beside the agent's canonical README.
+2. Write a `packaging/gen_scorecard.py` adapter (see `hub/agents/python/email/packaging/gen_scorecard.py` for a reference).
+3. Run the eval and call the adapter → commit the resulting `<version>.md`.
+4. Link the scorecard from the README: `./scorecards/<version>.md`.
+5. Add `scorecards/` to the npm `package.json` `files` array (if published on npm).
+6. Wire `scorecard_gate` into the release workflow (see `release_agent_email.yml` for the job topology).
diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md
index fa7ef97e0..729fe99a6 100644
--- a/hub/agents/npm/agent-email/README.md
+++ b/hub/agents/npm/agent-email/README.md
@@ -2,6 +2,8 @@
 
 [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24**
 
+**Eval scorecard:** see [`./scorecards/0.2.4.md`](./scorecards/0.2.4.md) for the per-version accuracy metrics, dataset details, and aggregate score for v0.2.4.
+
 Embed the **GAIA email agent** in your JS/TS app. It triages, organizes, replies
 to, and schedules from Gmail and Outlook — with every email body analyzed
 **locally on AMD Ryzen AI** via Lemonade. No message content is sent to a cloud
diff --git a/hub/agents/npm/agent-email/package.json b/hub/agents/npm/agent-email/package.json
index fc3ad9be5..426d163e8 100644
--- a/hub/agents/npm/agent-email/package.json
+++ b/hub/agents/npm/agent-email/package.json
@@ -48,7 +48,8 @@
     "CHANGELOG.md",
     "SPEC.md",
     "SKILL.md",
-    "LICENSE"
+    "LICENSE",
+    "scorecards/"
   ],
   "engines": {
     "node": ">=18"
diff --git a/hub/agents/npm/agent-email/scorecards/.gitkeep b/hub/agents/npm/agent-email/scorecards/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py
index 2961deeb7..52c58d143 100644
--- a/hub/agents/python/email/packaging/gen_scorecard.py
+++ b/hub/agents/python/email/packaging/gen_scorecard.py
@@ -109,10 +109,12 @@ def _is_judged(scenario: dict) -> bool:
     if acc is None:
         return False
     try:
+        import math
+
         f = float(acc)
     except (TypeError, ValueError):
         return False
-    return 0.0 <= f <= 1.0 and f == f  # also rejects NaN via f==f
+    return 0.0 <= f <= 1.0 and math.isfinite(f)
 
 
 def build_payload(benchmark_dir: Path, ground_truth_path: Path):
@@ -189,7 +191,7 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path):
     metrics = [
         {"name": "category_accuracy", "value": float(category_accuracy), "weight": 1.0}
     ]
-    _components, agg_value = compute_aggregate(metrics)  # noqa: F841
+    compute_aggregate(metrics)  # validate metrics; aggregate embedded in render_scorecard
 
     import datetime
 
diff --git a/hub/agents/python/hello-world/scorecards/0.1.0.md b/hub/agents/python/hello-world/scorecards/0.1.0.md
new file mode 100644
index 000000000..fc6121f2e
--- /dev/null
+++ b/hub/agents/python/hello-world/scorecards/0.1.0.md
@@ -0,0 +1,62 @@
+---
+schema_version: 1
+agent:
+  name: Hello World
+  version: 0.1.0
+recipe:
+  dataset:
+    reference: hub/agents/python/hello-world/tests
+    description: Illustrative conversational response dataset (reference agent)
+    size: 10
+  methodology: Illustrative metric — reference agent for scorecard format generalization
+  config:
+    harness: gaia eval agent
+    model: Gemma-4-E4B-it-GGUF
+    limit: 10
+results:
+  test_cases_run: 10
+  metrics:
+  - name: response_quality
+    value: 0.9
+    weight: 1.0
+aggregate:
+  name: weighted_accuracy
+  formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2)
+  components:
+  - metric: response_quality
+    value: 0.9
+    weight: 1.0
+  value: 90.0
+generated_at: '2026-06-25T12:00:00+00:00'
+inherited_from: null
+---
+# Hello World — Eval Scorecard v0.1.0
+
+**Aggregate score: 90.0** (out of 100)
+
+## Recipe
+
+| Field | Value |
+|-------|-------|
+| Dataset | [hub/agents/python/hello-world/tests](hub/agents/python/hello-world/tests) |
+| Description | Illustrative conversational response dataset (reference agent) |
+| Dataset size | 10 labeled examples |
+| Test cases run | 10 |
+| Methodology | Illustrative metric — reference agent for scorecard format generalization |
+
+## Metrics
+
+  - **response_quality**: 0.9000 × 1.0
+
+## Aggregate score recomputation
+
+Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)`
+
+Worked example:
+
+```
+round(100 × ((0.9000 × 1.0)) / 1.0, 2) = 90.0
+```
+
+A reader can reproduce this value from the `aggregate.components` in the front
+matter alone — no eval-harness access needed.
diff --git a/src/gaia/eval/release_scorecard.py b/src/gaia/eval/release_scorecard.py
index 49d81b71b..0d1e4a47f 100644
--- a/src/gaia/eval/release_scorecard.py
+++ b/src/gaia/eval/release_scorecard.py
@@ -25,7 +25,7 @@
 from __future__ import annotations
 
 import re
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
 
@@ -170,7 +170,9 @@ def render_scorecard(payload: ResultPayload) -> str:
         "inherited_from": payload.inherited_from,
     }
 
-    fm_text = yaml.dump(front, default_flow_style=False, sort_keys=False, allow_unicode=True)
+    fm_text = yaml.dump(
+        front, default_flow_style=False, sort_keys=False, allow_unicode=True
+    )
 
     # Human-readable body with worked recompute
     metric_lines = "\n".join(
@@ -178,9 +180,7 @@ def render_scorecard(payload: ResultPayload) -> str:
         for c in components
     )
     total_w = sum(c["weight"] for c in components)
-    worked = " + ".join(
-        f"({c['value']:.4f} × {c['weight']:.1f})" for c in components
-    )
+    worked = " + ".join(f"({c['value']:.4f} × {c['weight']:.1f})" for c in components)
 
     body = f"""# {payload.agent_name} — Eval Scorecard v{payload.agent_version}
 
@@ -253,7 +253,7 @@ def parse_scorecard(source) -> dict:
 
     # Split on first pair of '---' delimiters
     if not text.startswith("---"):
-        raise ValueError(f"Scorecard does not start with '---' front matter")
+        raise ValueError("Scorecard does not start with '---' front matter")
 
     # Find the closing '---' (first occurrence after the opening line)
     rest = text[3:]  # strip opening ---
diff --git a/src/gaia/eval/scorecard_gate.py b/src/gaia/eval/scorecard_gate.py
index a33cecfb0..49c292561 100644
--- a/src/gaia/eval/scorecard_gate.py
+++ b/src/gaia/eval/scorecard_gate.py
@@ -57,22 +57,16 @@ def _read_version_from_manifest(manifest_path: Path) -> str:
     try:
         text = manifest_path.read_text(encoding="utf-8")
     except OSError as exc:
-        raise ValueError(
-            f"Cannot read manifest {manifest_path}: {exc}"
-        ) from exc
+        raise ValueError(f"Cannot read manifest {manifest_path}: {exc}") from exc
 
     try:
         data = yaml.safe_load(text) or {}
     except yaml.YAMLError as exc:
-        raise ValueError(
-            f"Invalid YAML in manifest {manifest_path}: {exc}"
-        ) from exc
+        raise ValueError(f"Invalid YAML in manifest {manifest_path}: {exc}") from exc
 
     version = data.get("version")
     if not version:
-        raise ValueError(
-            f"Manifest {manifest_path} has no 'version:' field."
-        )
+        raise ValueError(f"Manifest {manifest_path} has no 'version:' field.")
     return str(version)
 
 
@@ -230,9 +224,7 @@ def main(argv=None) -> int:
         return 1
 
     if prev_score is None:
-        print(
-            f"ERROR: Prior scorecard {prev_path} has no 'aggregate.value' field."
-        )
+        print(f"ERROR: Prior scorecard {prev_path} has no 'aggregate.value' field.")
         return 1
 
     if float(candidate_score) < float(prev_score):
diff --git a/tests/unit/eval/test_release_scorecard.py b/tests/unit/eval/test_release_scorecard.py
index d36e203eb..ccc6d2abe 100644
--- a/tests/unit/eval/test_release_scorecard.py
+++ b/tests/unit/eval/test_release_scorecard.py
@@ -5,7 +5,6 @@
 import datetime
 import importlib.util
 import json
-import sys
 from pathlib import Path
 
 import pytest
@@ -78,9 +77,7 @@ def test_missing_required_fields_each_flagged(self):
     def test_required_top_level_keys_include_expected_sections(self):
         # schema_version, agent, recipe, results, aggregate must be required
         for section in ("schema_version", "agent", "recipe", "results", "aggregate"):
-            assert section in REQUIRED_FIELDS, (
-                f"'{section}' must be in REQUIRED_FIELDS"
-            )
+            assert section in REQUIRED_FIELDS, f"'{section}' must be in REQUIRED_FIELDS"
 
 
 # ---------------------------------------------------------------------------
@@ -145,7 +142,9 @@ def test_rendered_text_contains_closing_dashes(self):
         lines = text.splitlines()
         # Find second occurrence of '---'
         closing = [i for i, l in enumerate(lines) if l == "---" and i > 0]
-        assert closing, "Rendered scorecard must contain a closing '---' after the first"
+        assert (
+            closing
+        ), "Rendered scorecard must contain a closing '---' after the first"
 
     def test_body_after_front_matter_is_non_empty(self):
         payload = _make_payload()
@@ -175,16 +174,14 @@ def test_test_cases_run_and_dataset_size_both_present(self):
         text = render_scorecard(payload)
         parsed = parse_scorecard(text)
         assert "results" in parsed, "'results' section missing from parsed scorecard"
-        assert "test_cases_run" in parsed["results"], (
-            "'results.test_cases_run' must be a distinct field"
-        )
+        assert (
+            "test_cases_run" in parsed["results"]
+        ), "'results.test_cases_run' must be a distinct field"
         assert "recipe" in parsed, "'recipe' section missing from parsed scorecard"
-        assert "dataset" in parsed["recipe"], (
-            "'recipe.dataset' sub-section missing"
-        )
-        assert "size" in parsed["recipe"]["dataset"], (
-            "'recipe.dataset.size' must be a distinct field"
-        )
+        assert "dataset" in parsed["recipe"], "'recipe.dataset' sub-section missing"
+        assert (
+            "size" in parsed["recipe"]["dataset"]
+        ), "'recipe.dataset.size' must be a distinct field"
 
 
 # ---------------------------------------------------------------------------
@@ -207,7 +204,9 @@ def test_no_benchmark_or_agent_modules_imported(self):
             "or 'gaia.eval.quality_metrics' in m or 'gaia_agent_email' in m]; "
             "assert not bad, bad"
         )
-        r = subprocess.run([_sys.executable, "-c", code], capture_output=True, text=True)
+        r = subprocess.run(
+            [_sys.executable, "-c", code], capture_output=True, text=True
+        )
         assert r.returncode == 0, r.stderr
 
 
@@ -306,7 +305,13 @@ def test_rendered_parsed_inherited_from_null_or_absent(self):
 
 class TestLatestVersionBelow:
     def _seed_dir(self, tmp_path):
-        for name in ("0.1.0.md", "0.2.3.md", "0.10.0.md", "README.md", "not-a-version.md"):
+        for name in (
+            "0.1.0.md",
+            "0.2.3.md",
+            "0.10.0.md",
+            "README.md",
+            "not-a-version.md",
+        ):
             (tmp_path / name).write_text("# placeholder")
         return tmp_path
 
@@ -376,9 +381,9 @@ def test_build_payload_mean_of_judged_scenarios(self, tmp_path):
         payload = mod.build_payload(benchmark_dir, gt_path)
 
         expected_mean = round((0.4167 + 0.5000) / 2, 10)
-        assert payload.metrics[0]["value"] == pytest.approx(expected_mean), (
-            f"Expected metric value {expected_mean}, got {payload.metrics[0]['value']}"
-        )
+        assert payload.metrics[0]["value"] == pytest.approx(
+            expected_mean
+        ), f"Expected metric value {expected_mean}, got {payload.metrics[0]['value']}"
 
     def test_build_payload_test_cases_run(self, tmp_path):
         mod = self._load_gen_scorecard()
@@ -429,8 +434,16 @@ def test_all_no_quality_raises(self, tmp_path):
         empty_scorecard = {
             "run_id": "no-quality",
             "scenarios": [
-                {"category": "Gemma-4-E4B-it-GGUF", "status": "PASS", "total_emails": 0},
-                {"category": "Gemma-4-E4B-it-GGUF", "status": "PASS", "total_emails": 0},
+                {
+                    "category": "Gemma-4-E4B-it-GGUF",
+                    "status": "PASS",
+                    "total_emails": 0,
+                },
+                {
+                    "category": "Gemma-4-E4B-it-GGUF",
+                    "status": "PASS",
+                    "total_emails": 0,
+                },
             ],
         }
         (benchmark_dir / "email_benchmark_scorecard.json").write_text(
diff --git a/tests/unit/eval/test_scorecard_gate.py b/tests/unit/eval/test_scorecard_gate.py
index dbeaba0b7..28ab269d9 100644
--- a/tests/unit/eval/test_scorecard_gate.py
+++ b/tests/unit/eval/test_scorecard_gate.py
@@ -5,10 +5,13 @@
 import datetime
 from pathlib import Path
 
-import pytest
 import yaml
 
-from gaia.eval.release_scorecard import ResultPayload, compute_aggregate, render_scorecard
+from gaia.eval.release_scorecard import (
+    ResultPayload,
+    compute_aggregate,
+    render_scorecard,
+)
 from gaia.eval.scorecard_gate import main
 
 # ---------------------------------------------------------------------------
@@ -212,23 +215,21 @@ def test_publish_job_needs_scorecard_gate(self):
             / "workflows"
             / "release_agent_email.yml"
         )
-        assert workflow_path.exists(), (
-            f"Workflow file not found: {workflow_path}"
-        )
+        assert workflow_path.exists(), f"Workflow file not found: {workflow_path}"
         content = workflow_path.read_text()
         parsed = yaml.safe_load(content)
 
         assert "jobs" in parsed, "Workflow has no 'jobs' key"
-        assert "publish" in parsed["jobs"], (
-            "Workflow has no 'publish' job — add it or check the job name"
-        )
+        assert (
+            "publish" in parsed["jobs"]
+        ), "Workflow has no 'publish' job — add it or check the job name"
         needs = parsed["jobs"]["publish"].get("needs", [])
         # needs can be a string or a list
         if isinstance(needs, str):
             needs = [needs]
-        assert "scorecard-gate" in needs, (
-            f"'publish' job must list 'scorecard-gate' in its needs; got: {needs}"
-        )
+        assert (
+            "scorecard-gate" in needs
+        ), f"'publish' job must list 'scorecard-gate' in its needs; got: {needs}"
 
 
 # ---------------------------------------------------------------------------

From 78e45bfd57aafc08c3117d128ff30ae5c5cd5605 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Thu, 25 Jun 2026 18:43:32 -0400
Subject: [PATCH 05/18] feat(eval): deepen validate_scorecard with nested-field
 checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The AC requires 'missing ANY required field ⇒ invalid', but the validator
only checked 5 top-level keys. Add nested checks for agent.{name,version},
recipe.dataset.{reference,size}, recipe.{methodology,config},
results.{test_cases_run,metrics}, aggregate.{name,formula,value}, with
non-dict-parent guards and a non-empty metrics-list requirement. Add
TestSchemaValidator cases for missing nested fields, empty metrics, and
non-dict sections. Also baseline sys.modules before import in the
loose-coupling test so editable-install path finders don't false-positive.
---
 src/gaia/eval/release_scorecard.py        | 64 +++++++++++++++++++++++
 tests/unit/eval/test_release_scorecard.py | 56 +++++++++++++++++---
 2 files changed, 114 insertions(+), 6 deletions(-)

diff --git a/src/gaia/eval/release_scorecard.py b/src/gaia/eval/release_scorecard.py
index 0d1e4a47f..d50449f23 100644
--- a/src/gaia/eval/release_scorecard.py
+++ b/src/gaia/eval/release_scorecard.py
@@ -283,10 +283,74 @@ def validate_scorecard(parsed: dict) -> list:
     """
     errors: list[str] = []
 
+    # Top-level required keys
     for key in REQUIRED_FIELDS:
         if key not in parsed:
             errors.append(f"Missing required field: '{key}'")
 
+    def _section(name: str):
+        """Return the section dict if present and a dict, else record an error."""
+        value = parsed.get(name)
+        if name in parsed and not isinstance(value, dict):
+            errors.append(
+                f"Field '{name}' must be a mapping, got {type(value).__name__}"
+            )
+            return None
+        return value if isinstance(value, dict) else None
+
+    # agent.{name, version}
+    agent = _section("agent")
+    if agent is not None:
+        for sub in ("name", "version"):
+            if sub not in agent:
+                errors.append(f"Missing required field: 'agent.{sub}'")
+
+    # recipe.{dataset.{reference, size}, methodology, config}
+    recipe = _section("recipe")
+    if recipe is not None:
+        for sub in ("methodology", "config"):
+            if sub not in recipe:
+                errors.append(f"Missing required field: 'recipe.{sub}'")
+        dataset = recipe.get("dataset")
+        if "dataset" not in recipe:
+            errors.append("Missing required field: 'recipe.dataset'")
+        elif not isinstance(dataset, dict):
+            errors.append(
+                f"Field 'recipe.dataset' must be a mapping, got {type(dataset).__name__}"
+            )
+        else:
+            for sub in ("reference", "size"):
+                if sub not in dataset:
+                    errors.append(f"Missing required field: 'recipe.dataset.{sub}'")
+
+    # results.{test_cases_run, metrics}
+    results = _section("results")
+    if results is not None:
+        if "test_cases_run" not in results:
+            errors.append("Missing required field: 'results.test_cases_run'")
+        metrics = results.get("metrics")
+        if "metrics" not in results:
+            errors.append("Missing required field: 'results.metrics'")
+        elif not isinstance(metrics, list) or not metrics:
+            errors.append("Field 'results.metrics' must be a non-empty list")
+        else:
+            for i, metric in enumerate(metrics):
+                if not isinstance(metric, dict):
+                    errors.append(f"Field 'results.metrics[{i}]' must be a mapping")
+                    continue
+                for sub in ("name", "value"):
+                    if sub not in metric:
+                        errors.append(
+                            f"Missing required field: 'results.metrics[{i}].{sub}'"
+                        )
+
+    # aggregate.{name, formula, value}
+    aggregate = _section("aggregate")
+    if aggregate is not None:
+        for sub in ("name", "formula", "value"):
+            if sub not in aggregate:
+                errors.append(f"Missing required field: 'aggregate.{sub}'")
+
     return errors
 
 
diff --git a/tests/unit/eval/test_release_scorecard.py b/tests/unit/eval/test_release_scorecard.py
index ccc6d2abe..c542ae13e 100644
--- a/tests/unit/eval/test_release_scorecard.py
+++ b/tests/unit/eval/test_release_scorecard.py
@@ -79,6 +79,46 @@ def test_required_top_level_keys_include_expected_sections(self):
         for section in ("schema_version", "agent", "recipe", "results", "aggregate"):
             assert section in REQUIRED_FIELDS, f"'{section}' must be in REQUIRED_FIELDS"
 
+    def test_missing_nested_aggregate_value_flagged(self):
+        payload = _make_payload()
+        parsed = parse_scorecard(render_scorecard(payload))
+        # Complete card stays valid
+        assert validate_scorecard(parsed) == []
+        # Removing a nested required field flags it
+        del parsed["aggregate"]["value"]
+        errors = validate_scorecard(parsed)
+        assert errors, "Expected missing 'aggregate.value' to be flagged"
+        assert any("aggregate.value" in e for e in errors), errors
+
+    def test_missing_nested_agent_version_flagged(self):
+        payload = _make_payload()
+        parsed = parse_scorecard(render_scorecard(payload))
+        del parsed["agent"]["version"]
+        errors = validate_scorecard(parsed)
+        assert errors, "Expected missing 'agent.version' to be flagged"
+        assert any("agent.version" in e for e in errors), errors
+
+    def test_missing_nested_dataset_size_flagged(self):
+        payload = _make_payload()
+        parsed = parse_scorecard(render_scorecard(payload))
+        del parsed["recipe"]["dataset"]["size"]
+        errors = validate_scorecard(parsed)
+        assert any("recipe.dataset.size" in e for e in errors), errors
+
+    def test_empty_metrics_list_flagged(self):
+        payload = _make_payload()
+        parsed = parse_scorecard(render_scorecard(payload))
+        parsed["results"]["metrics"] = []
+        errors = validate_scorecard(parsed)
+        assert any("metrics" in e for e in errors), errors
+
+    def test_non_dict_section_flagged_not_crash(self):
+        payload = _make_payload()
+        parsed = parse_scorecard(render_scorecard(payload))
+        parsed["agent"] = "not-a-dict"
+        errors = validate_scorecard(parsed)
+        assert errors, "Expected a non-dict 'agent' section to be flagged"
+
 
 # ---------------------------------------------------------------------------
 # 2. Aggregate computation
@@ -191,16 +231,20 @@ def test_test_cases_run_and_dataset_size_both_present(self):
 
 class TestLooseCoupling:
     def test_no_benchmark_or_agent_modules_imported(self):
-        # Clean interpreter: importing release_scorecard must not pull in the
-        # eval harness or any agent package. Scanning the test process's own
-        # sys.modules gives false positives (e.g. the pytest_benchmark plugin),
-        # so check in a fresh subprocess instead.
+        # Importing release_scorecard must not pull in the eval harness or any
+        # agent package. Run in a fresh subprocess and baseline sys.modules
+        # BEFORE the import, so we measure only what the import itself adds —
+        # not pytest plugins or editable-install path finders that the
+        # interpreter registers at startup regardless of any import.
         import subprocess
         import sys as _sys
 
         code = (
-            "import sys, gaia.eval.release_scorecard; "
-            "bad=[m for m in sys.modules if 'gaia.eval.benchmark' in m "
+            "import sys; "
+            "before=set(sys.modules); "
+            "import gaia.eval.release_scorecard; "
+            "added=set(sys.modules)-before; "
+            "bad=[m for m in added if 'gaia.eval.benchmark' in m "
             "or 'gaia.eval.quality_metrics' in m or 'gaia_agent_email' in m]; "
             "assert not bad, bad"
         )

From 019cc16a7502dc845b24ef05c5641bf6a511e2c3 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Thu, 25 Jun 2026 18:43:37 -0400
Subject: [PATCH 06/18] feat(eval): record eval limit + derive model in email
 scorecard adapter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The benchmark scorecard.json has no top-level model/limit, so config.limit
was always null — defeating the comparability note in eval-scorecard.mdx.
Add a --limit CLI arg threaded into config.limit, and derive config.model
from the run's scenarios[0].category (the model id in benchmark output),
falling back to gaia-agent.yaml models[0]. Drop the dead list-comprehension
in the final print.
---
 .../python/email/packaging/gen_scorecard.py   | 29 +++++++++++++++----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py
index 52c58d143..22b12ad10 100644
--- a/hub/agents/python/email/packaging/gen_scorecard.py
+++ b/hub/agents/python/email/packaging/gen_scorecard.py
@@ -117,7 +117,7 @@ def _is_judged(scenario: dict) -> bool:
     return 0.0 <= f <= 1.0 and math.isfinite(f)
 
 
-def build_payload(benchmark_dir: Path, ground_truth_path: Path):
+def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None):
     """Build a :class:`~gaia.eval.release_scorecard.ResultPayload` from benchmark output.
 
     A scenario is **judged** iff it has a ``quality`` dict AND
@@ -127,6 +127,9 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path):
     Args:
         benchmark_dir: Directory written by ``gaia eval benchmark --output-dir``.
         ground_truth_path: Path to ``ground_truth.json`` (the labeled corpus).
+        limit: The ``--limit`` value used for the eval run, recorded in
+            ``config["limit"]`` for cross-version comparability. The benchmark
+            ``scorecard.json`` does not persist this, so it must be passed in.
 
     Returns:
         Populated :class:`~gaia.eval.release_scorecard.ResultPayload`.
@@ -188,6 +191,12 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path):
             f"No 'version:' field found in {agent_yaml_path}."
         )
 
+    # Model id: benchmark output records it as the per-scenario `category`.
+    # Fall back to the manifest's first declared model.
+    scenario_model = scenarios[0].get("category") if scenarios else None
+    manifest_models = agent_data.get("models") or [None]
+    model = scenario_model or manifest_models[0]
+
     metrics = [
         {"name": "category_accuracy", "value": float(category_accuracy), "weight": 1.0}
     ]
@@ -211,10 +220,10 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path):
         ),
         config={
             "harness": "gaia eval benchmark",
-            "model": data.get("model", agent_data.get("models", [None])[0]),
+            "model": model,
             "corpus": "tests/fixtures/email/synthetic_inbox.mbox",
             "ground_truth": str(ground_truth_path),
-            "limit": data.get("limit"),
+            "limit": limit,
         },
         test_cases_run=test_cases_run,
         metrics=metrics,
@@ -253,6 +262,16 @@ def main(argv=None) -> int:
             "(default: hub/agents/npm/agent-email/scorecards/)."
         ),
     )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help=(
+            "The --limit value passed to 'gaia eval benchmark' for this run. "
+            "Recorded in config.limit for cross-version comparability "
+            "(the benchmark output does not persist it)."
+        ),
+    )
 
     args = parser.parse_args(argv)
 
@@ -260,7 +279,7 @@ def main(argv=None) -> int:
     gt_path = Path(args.ground_truth).resolve()
 
     try:
-        payload = build_payload(benchmark_dir, gt_path)
+        payload = build_payload(benchmark_dir, gt_path, limit=args.limit)
     except (ValueError, FileNotFoundError) as exc:
         print(f"ERROR: {exc}", file=sys.stderr)
         return 1
@@ -280,7 +299,7 @@ def main(argv=None) -> int:
         f"Scorecard written: {out_path}\n"
         f"  Version: {payload.agent_version}\n"
         f"  Aggregate: {payload.metrics[0]['value']:.4f} category_accuracy "
-        f"(over {len([s for s in [payload] if True])} — {payload.test_cases_run} emails judged)\n"
+        f"({payload.test_cases_run} emails judged)\n"
         f"  Dataset size: {payload.dataset_size} labeled examples"
     )
     return 0

From 2f931a1076a0b71e50dca8232611a8bc2662b072 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Thu, 25 Jun 2026 18:43:41 -0400
Subject: [PATCH 07/18] ci(eval): pin scorecard-gate setup-python to @v6

Match the rest of release_agent_email.yml, which already uses
actions/setup-python@v6.
---
 .github/workflows/release_agent_email.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release_agent_email.yml b/.github/workflows/release_agent_email.yml
index 81c674384..2ae937ab6 100644
--- a/.github/workflows/release_agent_email.yml
+++ b/.github/workflows/release_agent_email.yml
@@ -272,7 +272,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v6
-      - uses: actions/setup-python@v5
+      - uses: actions/setup-python@v6
         with:
           python-version: "3.12"
       - name: Install core + PyYAML

From e47bfaf45fce0de119f1850458458a38fb8d8a4e Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 10:10:49 -0400
Subject: [PATCH 08/18] feat(eval): email v0.2.4 scorecard from real benchmark
 run

Generate the email-triage agent's v0.2.4 release scorecard from an actual
`gaia eval benchmark` run (Gemma-4-E4B, 25 of 220 corpus emails) on AMD
Strix Halo hardware: category_accuracy 0.04 -> aggregate 4.0/100. The low
value is a taxonomy mismatch (the agent's triage labels and the ground-truth
priority labels overlap only on 'urgent'), not triage quality -- tracked in
#1266 and recorded in the scorecard's own methodology.

Adapter hardening: store a repo-relative ground_truth path (no absolute-path
leak in the published artifact), record the eval limit for comparability, and
carry the taxonomy caveat. README surfaces the aggregate with the caveat and a
relative link; docs example aligned to the 4-category label set.
---
 docs/reference/eval-scorecard.mdx             |  2 +-
 hub/agents/npm/agent-email/README.md          |  2 +-
 .../npm/agent-email/scorecards/.gitkeep       |  0
 .../npm/agent-email/scorecards/0.2.4.md       | 70 +++++++++++++++++++
 .../python/email/packaging/gen_scorecard.py   | 20 ++++--
 5 files changed, 88 insertions(+), 6 deletions(-)
 delete mode 100644 hub/agents/npm/agent-email/scorecards/.gitkeep
 create mode 100644 hub/agents/npm/agent-email/scorecards/0.2.4.md

diff --git a/docs/reference/eval-scorecard.mdx b/docs/reference/eval-scorecard.mdx
index f3cbc5093..fd0acd57a 100644
--- a/docs/reference/eval-scorecard.mdx
+++ b/docs/reference/eval-scorecard.mdx
@@ -35,7 +35,7 @@ agent:
 recipe:
   dataset:
     reference: tests/fixtures/email/ground_truth.json
-    description: Synthetic email corpus (FakeGmailBackend, 5-category classification)
+    description: Synthetic email corpus (FakeGmailBackend, 4-category priority labels)
     size: 220
   methodology: gaia eval benchmark — category_accuracy (case-insensitive exact match)
   config:
diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md
index 729fe99a6..dcadb4442 100644
--- a/hub/agents/npm/agent-email/README.md
+++ b/hub/agents/npm/agent-email/README.md
@@ -2,7 +2,7 @@
 
 [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24**
 
-**Eval scorecard:** see [`./scorecards/0.2.4.md`](./scorecards/0.2.4.md) for the per-version accuracy metrics, dataset details, and aggregate score for v0.2.4.
+**Eval scorecard (v0.2.4): aggregate 4.0 / 100** — `category_accuracy` 0.04 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)). The low value reflects strict 4-way exact-match against a different label vocabulary, not triage quality — taxonomy calibration is tracked in [#1266](https://github.com/amd/gaia/issues/1266). The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate.
 
 Embed the **GAIA email agent** in your JS/TS app. It triages, organizes, replies
 to, and schedules from Gmail and Outlook — with every email body analyzed
diff --git a/hub/agents/npm/agent-email/scorecards/.gitkeep b/hub/agents/npm/agent-email/scorecards/.gitkeep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/hub/agents/npm/agent-email/scorecards/0.2.4.md b/hub/agents/npm/agent-email/scorecards/0.2.4.md
new file mode 100644
index 000000000..5fad58585
--- /dev/null
+++ b/hub/agents/npm/agent-email/scorecards/0.2.4.md
@@ -0,0 +1,70 @@
+---
+schema_version: 1
+agent:
+  name: Email Triage
+  version: 0.2.4
+recipe:
+  dataset:
+    reference: tests/fixtures/email/ground_truth.json
+    description: 'Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend,
+      4-category priority labels: informational / actionable / urgent / low priority)'
+    size: 220
+  methodology: 'gaia eval benchmark — category classification accuracy (case-insensitive
+    exact match of the agent''s triage label vs the ground-truth priority label) over
+    a synthetic labeled corpus via FakeGmailBackend; no LLM judge. NOTE: the agent''s
+    triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority
+    labels currently overlap only on ''urgent'', so this exact-match metric understates
+    triage usefulness — taxonomy calibration is tracked in amd/gaia#1266'
+  config:
+    harness: gaia eval benchmark
+    model: Gemma-4-E4B-it-GGUF
+    corpus: tests/fixtures/email/synthetic_inbox.mbox
+    ground_truth: tests/fixtures/email/ground_truth.json
+    limit: 25
+results:
+  test_cases_run: 25
+  metrics:
+  - name: category_accuracy
+    value: 0.04
+    weight: 1.0
+aggregate:
+  name: weighted_accuracy
+  formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2)
+  components:
+  - metric: category_accuracy
+    value: 0.04
+    weight: 1.0
+  value: 4.0
+generated_at: '2026-06-26T14:07:51.768804+00:00'
+inherited_from: null
+---
+# Email Triage — Eval Scorecard v0.2.4
+
+**Aggregate score: 4.0** (out of 100)
+
+## Recipe
+
+| Field | Value |
+|-------|-------|
+| Dataset | [tests/fixtures/email/ground_truth.json](tests/fixtures/email/ground_truth.json) |
+| Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, 4-category priority labels: informational / actionable / urgent / low priority) |
+| Dataset size | 220 labeled examples |
+| Test cases run | 25 |
+| Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth priority label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. NOTE: the agent's triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority labels currently overlap only on 'urgent', so this exact-match metric understates triage usefulness — taxonomy calibration is tracked in amd/gaia#1266 |
+
+## Metrics
+
+  - **category_accuracy**: 0.0400 × 1.0
+
+## Aggregate score recomputation
+
+Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)`
+
+Worked example:
+
+```
+round(100 × ((0.0400 × 1.0)) / 1.0, 2) = 4.0
+```
+
+A reader can reproduce this value from the `aggregate.components` in the front
+matter alone — no eval-harness access needed.
diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py
index 22b12ad10..b10681ffe 100644
--- a/hub/agents/python/email/packaging/gen_scorecard.py
+++ b/hub/agents/python/email/packaging/gen_scorecard.py
@@ -210,19 +210,31 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None):
         dataset_reference="tests/fixtures/email/ground_truth.json",
         dataset_description=(
             "Synthetic email corpus for GAIA email-triage evaluation "
-            "(FakeGmailBackend, 5-category classification)"
+            "(FakeGmailBackend, 4-category priority labels: "
+            "informational / actionable / urgent / low priority)"
         ),
         dataset_size=dataset_size,
         methodology=(
             "gaia eval benchmark — category classification accuracy "
-            "(case-insensitive exact match) over a synthetic labeled corpus "
-            "via FakeGmailBackend; no LLM judge required"
+            "(case-insensitive exact match of the agent's triage label vs the "
+            "ground-truth priority label) over a synthetic labeled corpus via "
+            "FakeGmailBackend; no LLM judge. NOTE: the agent's triage taxonomy "
+            "(fyi / needs_response / promotional / urgent) and the corpus "
+            "priority labels currently overlap only on 'urgent', so this "
+            "exact-match metric understates triage usefulness — taxonomy "
+            "calibration is tracked in amd/gaia#1266"
         ),
         config={
             "harness": "gaia eval benchmark",
             "model": model,
             "corpus": "tests/fixtures/email/synthetic_inbox.mbox",
-            "ground_truth": str(ground_truth_path),
+            # Store a repo-relative path — never leak a local absolute path into
+            # a committed/published artifact.
+            "ground_truth": (
+                str(ground_truth_path.relative_to(_REPO_ROOT))
+                if str(ground_truth_path).startswith(str(_REPO_ROOT))
+                else ground_truth_path.name
+            ),
             "limit": limit,
         },
         test_cases_run=test_cases_run,

From 2ae55ecda732ceb3db306bd57495e6bc46d54ce0 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 10:42:26 -0400
Subject: [PATCH 09/18] feat(eval): scorecard refresh/reject CI loop, adoption
 skill, correct taxonomy ref

- Add .github/workflows/email_scorecard_refresh.yml: on agent/corpus changes the
  self-hosted AMD runner re-runs the eval, regenerates the scorecard, commits it
  when the score holds/improves, and FAILS on a regression (same-version vs the
  committed card + cross-version via scorecard_gate). Hosted-CI backstop stays the
  release-time scorecard-gate job.
- Add .claude/skills/adding-eval-scorecard: a phased skill so adopting a scorecard
  is invocable, not a prose walkthrough; referenced from eval-scorecard.mdx.
- Document the update/reject loop in eval-scorecard.mdx.
- Correct the scorecard's taxonomy reference from the closed #1266 (old 4-way) to
  #1874 (corpus labels stale vs schema-2.0 5-bucket taxonomy); regenerate the card.
---
 .claude/skills/adding-eval-scorecard/SKILL.md |  85 +++++++++
 .github/workflows/email_scorecard_refresh.yml | 162 ++++++++++++++++++
 docs/reference/eval-scorecard.mdx             |  21 +++
 hub/agents/npm/agent-email/README.md          |   2 +-
 .../npm/agent-email/scorecards/0.2.4.md       |   7 +-
 .../python/email/packaging/gen_scorecard.py   |   4 +-
 6 files changed, 275 insertions(+), 6 deletions(-)
 create mode 100644 .claude/skills/adding-eval-scorecard/SKILL.md
 create mode 100644 .github/workflows/email_scorecard_refresh.yml

diff --git a/.claude/skills/adding-eval-scorecard/SKILL.md b/.claude/skills/adding-eval-scorecard/SKILL.md
new file mode 100644
index 000000000..0afaa057f
--- /dev/null
+++ b/.claude/skills/adding-eval-scorecard/SKILL.md
@@ -0,0 +1,85 @@
+---
+name: "adding-eval-scorecard"
+description: "Adopt the per-agent eval scorecard for a GAIA hub agent: write the harness→payload adapter, run the eval to produce a REAL scorecard, link + surface it from the agent's README, wire the release gate, and (for a new agent) generalize the format. Use when asked to 'add a scorecard', 'adopt the eval scorecard', 'generate the scorecard for <agent>', or wire scorecard CI for an agent. Builds on docs/reference/eval-scorecard.mdx and the email agent reference adapter."
+---
+
+# Adding an Eval Scorecard to a GAIA Agent
+
+Adopt the release **eval scorecard** ([`docs/reference/eval-scorecard.mdx`](../../../docs/reference/eval-scorecard.mdx)) for one hub agent. The system is `harness → result payload → generator → scorecard`, with a standalone presence+regression release gate. The **email agent is the reference implementation** — mirror it.
+
+**Core modules (do not modify; reuse):**
+- `src/gaia/eval/release_scorecard.py` — `ResultPayload`, `compute_aggregate`, `render_scorecard`, `write_scorecard`, `validate_scorecard`, `carry_forward`, `latest_version_below`. Harness-agnostic (stdlib + PyYAML only).
+- `src/gaia/eval/scorecard_gate.py` — the standalone gate (`python -m gaia.eval.scorecard_gate`).
+- Reference adapter: `hub/agents/python/email/packaging/gen_scorecard.py`.
+
+This is a **phased checklist with a hard gate at the real-eval step** — the scorecard MUST come from an actual eval run, never hand-authored numbers.
+
+## Phase 1 — Locate the agent's surfaces
+
+1. **Version source of truth** = the `version:` field in `<agent>/gaia-agent.yaml`. Never invent a parallel scheme.
+2. **Canonical README** (where the scorecard is linked + surfaced): for an npm-published agent it is the npm client README (e.g. `hub/agents/npm/<id>/README.md`), NOT a `packaging/README.md`. For a Python-only agent it is `hub/agents/python/<id>/README.md`. Confirm which by checking what `release_agent_<id>.yml` publishes (`README:` env) — the published README is the one to link.
+3. **doc-root** = the directory holding that canonical README. Scorecards live at `<doc-root>/scorecards/<version>.md`.
+4. **Eval vehicle**: what existing harness produces this agent's accuracy metric? (email → `gaia eval benchmark` over `tests/fixtures/email/`.) If none exists, STOP and surface that — propose the minimal harness before building; do not invent numbers.
+
+## Phase 2 — Write the adapter (harness → payload)
+
+Copy `hub/agents/python/email/packaging/gen_scorecard.py` as the template. The adapter:
+- imports ONLY `gaia.eval.release_scorecard` (never the harness or agent package — preserve loose coupling);
+- reads the harness output, builds a `ResultPayload`;
+- defines **"judged"** explicitly and **raises loudly** if zero results are judged (no silent 0.0);
+- records **dataset size** (total labeled examples) and **test_cases_run** (subset executed) as DISTINCT fields;
+- stores **repo-relative** paths only (never a local absolute path — it ships in a published artifact);
+- records the eval `limit`/config so future regression checks are comparable;
+- writes to `<doc-root>/scorecards/<version>.md`.
+
+Add an offline unit test against a committed sample harness-output fixture (see `tests/fixtures/eval/email_benchmark_scorecard.json` + `tests/unit/eval/test_release_scorecard.py::TestEmailAdapter`) so the adapter is testable without a live model.
+
+## Phase 3 — Run the REAL eval (hard gate — no hand-authored numbers)
+
+The accuracy number must come from an actual run. For the email agent:
+
+```bash
+# Real eval needs Lemonade + the model. Prefer AMD hardware (Strix Halo / Ryzen AI);
+# the [self-hosted, lemonade-eval] runner is the canonical environment.
+GAIA_AGENT_TOOL_TIMEOUT=900 \
+PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \
+PYTHONPATH="$(pwd)" \
+  <venv>/bin/gaia eval benchmark \
+    --model Gemma-4-E4B-it-GGUF \
+    --mbox-path tests/fixtures/email/synthetic_inbox.mbox \
+    --ground-truth tests/fixtures/email/ground_truth.json \
+    --limit 25 --output-dir <persistent-dir>
+
+<venv>/bin/python hub/agents/python/email/packaging/gen_scorecard.py \
+    --benchmark-dir <persistent-dir> --limit 25
+```
+
+**Headless gotchas (see memory `project-email-benchmark-headless-gotchas`):**
+- `PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring` — the email agent's calendar-connector resolution blocks forever on the macOS Keychain (and can stall on Linux SecretService) in non-interactive contexts. Without this it hangs at 0% CPU during agent construction.
+- `PYTHONPATH="$(pwd)"` — the benchmark imports `tests.fixtures.email.*`; the console script doesn't add the repo root.
+- `GAIA_AGENT_TOOL_TIMEOUT=900` — triage of N emails is one tool call; the 180s default abandons it on slow backends, yielding a degenerate 0-email FAIL run.
+- Write `--output-dir` to a **persistent** dir, not `/tmp` (cleared on session resume).
+- Record honestly: if the metric is low for a known reason (e.g. a taxonomy/label mismatch), put the explanation in the adapter's `methodology` string and link the tracking issue — never inflate the number.
+
+## Phase 4 — Surface, link, and gate
+
+1. **Link + surface** from the canonical README: a one-line `Eval scorecard (vX.Y.Z): aggregate N/100 … ([./scorecards/X.Y.Z.md](./scorecards/X.Y.Z.md))`. The relative link must resolve in-repo.
+2. **npm `files`**: if the agent publishes on npm, add `scorecards/` to `package.json` `files` so the link resolves on the published package too.
+3. **Hub display**: a published scorecard surfaces on the agent's hub page / Agent UI detail view (see `workers/agent-hub` + `AgentDetailModal.tsx`); ensure the publish step uploads the scorecard alongside the README.
+4. **Release gate**: add a `scorecard-gate` job to `release_agent_<id>.yml` and list it in `publish.needs`. The job runs on a GitHub-hosted runner (it only parses committed files — no eval):
+   ```bash
+   python -m gaia.eval.scorecard_gate \
+     --scorecards-dir <doc-root>/scorecards \
+     --manifest hub/agents/python/<id>/gaia-agent.yaml
+   ```
+   The job must NOT have `continue-on-error`, an `environment:`, or a `permissions:` override (inherits `contents: read`; needs no secrets).
+5. **Auto-update/reject loop**: for re-running on agent changes and refreshing the committed scorecard, see `eval-scorecard.mdx` "Keeping the scorecard current" and the self-hosted refresh workflow — reject-on-worse is the gate; better-or-equal refreshes the committed card.
+
+## Phase 5 — Verify (evidence before "done")
+
+Run and capture: the generated `<version>.md`; the gate **passing** on it (exit 0); the gate **blocking** a manufactured regression (exit 1) and a missing card (exit 1); a by-hand recompute of the aggregate from `aggregate.components` matching the recorded value. Run `python util/lint.py --all` and the eval unit tests. These are the PR's real-world proof.
+
+## Versioning
+
+- **Patch** release → `carry_forward(prev_path, new_version)` (copies results verbatim, sets `inherited_from`); do NOT re-run the eval.
+- **Minor/major** release → re-run the eval (Phase 3); `carry_forward` refuses a non-patch bump with a "re-run" error.
diff --git a/.github/workflows/email_scorecard_refresh.yml b/.github/workflows/email_scorecard_refresh.yml
new file mode 100644
index 000000000..7b3b02b5f
--- /dev/null
+++ b/.github/workflows/email_scorecard_refresh.yml
@@ -0,0 +1,162 @@
+# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+# Email agent eval-scorecard refresh + regression gate (#1862).
+#
+# Answers "how does a PR that changes the agent keep the scorecard honest?":
+# when the email agent's LLM-affecting code (or the eval corpus) changes, this
+# re-runs the REAL eval, regenerates the scorecard, and then:
+#   - score IMPROVED or held  -> commits the refreshed scorecard to the branch
+#   - score REGRESSED          -> fails the job (the worse card is NOT committed)
+#
+# `gaia eval benchmark` needs Lemonade on AMD hardware, so this runs ONLY on the
+# self-hosted [self-hosted, lemonade-eval] pool — GitHub-hosted runners cannot run
+# it. The release-time `scorecard-gate` job in release_agent_email.yml is the
+# hosted-CI backstop (it parses committed files only, no eval).
+#
+# Two regression checks run here:
+#   1. SAME-VERSION: fresh aggregate vs the currently-committed card for this
+#      version — stops a noisy/worse re-run from silently overwriting a good score.
+#   2. CROSS-VERSION: `gaia.eval.scorecard_gate` — fresh card vs the prior version.
+#
+# Auto-commit needs `contents: write` and only works on the repo's own branches;
+# a fork PR's GITHUB_TOKEN is read-only — for forks, run the eval locally / on AMD
+# hardware and commit the scorecard by hand (the release gate still enforces it).
+
+name: Email Agent Eval — scorecard refresh
+
+on:
+  workflow_dispatch:
+    inputs:
+      limit:
+        description: 'Messages to triage (must match the committed scorecard for comparability)'
+        required: false
+        default: '25'
+      model:
+        description: 'Lemonade model id'
+        required: false
+        default: 'Gemma-4-E4B-it-GGUF'
+  push:
+    branches-ignore:
+      - main
+    paths:
+      - 'hub/agents/python/email/**'
+      - 'tests/fixtures/email/**'
+      - 'src/gaia/eval/release_scorecard.py'
+      - 'src/gaia/eval/scorecard_gate.py'
+
+concurrency:
+  # Share the single Lemonade backend slot with the other self-hosted evals so two
+  # runs never race-evict each other's model (CLAUDE.md: evals run serially).
+  group: lemonade-eval
+  cancel-in-progress: false
+
+permissions:
+  contents: write   # auto-commit the refreshed scorecard to the branch
+
+env:
+  SCORECARD_DIR: hub/agents/npm/agent-email/scorecards
+  MANIFEST: hub/agents/python/email/gaia-agent.yaml
+  LIMIT: ${{ github.event.inputs.limit || '25' }}
+  MODEL: ${{ github.event.inputs.model || 'Gemma-4-E4B-it-GGUF' }}
+
+jobs:
+  refresh:
+    name: Re-run eval, refresh-or-reject scorecard
+    runs-on: [self-hosted, lemonade-eval]
+    timeout-minutes: 90
+    steps:
+      - name: Checkout (the pushed branch)
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ github.head_ref || github.ref_name }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.10'
+
+      - name: Install in isolated venv
+        run: |
+          python -m venv .venv-scorecard
+          source .venv-scorecard/bin/activate
+          python -m pip install --upgrade pip
+          pip install -e ".[dev,eval,api]"
+          echo "$PWD/.venv-scorecard/bin" >> "$GITHUB_PATH"
+
+      - name: Resolve version + capture currently-committed aggregate
+        id: pre
+        run: |
+          set -euo pipefail
+          VERSION=$(python -c "import yaml; print(yaml.safe_load(open('${MANIFEST}'))['version'])")
+          echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
+          CARD="${SCORECARD_DIR}/${VERSION}.md"
+          # Aggregate of the card as committed on this branch (empty if new).
+          if git cat-file -e "HEAD:${CARD}" 2>/dev/null; then
+            git show "HEAD:${CARD}" > /tmp/committed_card.md
+            COMMITTED=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('/tmp/committed_card.md'))['aggregate']['value'])")
+          else
+            COMMITTED=""
+          fi
+          echo "committed=${COMMITTED}" >> "$GITHUB_OUTPUT"
+          echo "Version ${VERSION}; committed aggregate: ${COMMITTED:-<none>}"
+
+      - name: Run the email-triage benchmark (real eval)
+        env:
+          # The agent's calendar-connector resolution blocks on the OS keyring in
+          # a headless context — disable it so construction doesn't hang.
+          PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
+          # Triage of N emails is one tool call; the 180s default abandons it on a
+          # slow backend and yields a degenerate 0-email run.
+          GAIA_AGENT_TOOL_TIMEOUT: '900'
+          PYTHONPATH: ${{ github.workspace }}
+        run: |
+          set -euo pipefail
+          rm -rf eval-out && mkdir -p eval-out
+          gaia eval benchmark \
+            --model "${MODEL}" \
+            --mbox-path tests/fixtures/email/synthetic_inbox.mbox \
+            --ground-truth tests/fixtures/email/ground_truth.json \
+            --limit "${LIMIT}" \
+            --output-dir eval-out
+
+      - name: Regenerate the scorecard from the real run
+        run: |
+          set -euo pipefail
+          python hub/agents/python/email/packaging/gen_scorecard.py \
+            --benchmark-dir eval-out --limit "${LIMIT}"
+
+      - name: Same-version regression check (reject a worse re-run)
+        run: |
+          set -euo pipefail
+          VERSION="${{ steps.pre.outputs.version }}"
+          COMMITTED="${{ steps.pre.outputs.committed }}"
+          CARD="${SCORECARD_DIR}/${VERSION}.md"
+          FRESH=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('${CARD}'))['aggregate']['value'])")
+          echo "fresh aggregate: ${FRESH} | committed: ${COMMITTED:-<none>}"
+          if [ -n "${COMMITTED}" ] && python -c "import sys; sys.exit(0 if float('${FRESH}') < float('${COMMITTED}') else 1)"; then
+            echo "::error::Scorecard regression for v${VERSION}: re-run scored ${FRESH} < committed ${COMMITTED}. Not committing the worse card. Investigate, or override intentionally via --allow-regression in a manual commit."
+            git checkout -- "${CARD}" || true
+            exit 1
+          fi
+          echo "No same-version regression — fresh score is >= committed."
+
+      - name: Cross-version gate (fresh card vs prior version)
+        run: |
+          set -euo pipefail
+          python -m gaia.eval.scorecard_gate \
+            --scorecards-dir "${SCORECARD_DIR}" \
+            --manifest "${MANIFEST}"
+
+      - name: Commit the refreshed scorecard (only if it changed for the better/equal)
+        run: |
+          set -euo pipefail
+          if git diff --quiet -- "${SCORECARD_DIR}"; then
+            echo "Scorecard unchanged — nothing to commit."
+            exit 0
+          fi
+          git config user.name  "${{ github.actor }}"
+          git config user.email "${{ github.actor }}@users.noreply.github.com"
+          git add "${SCORECARD_DIR}"
+          git commit -m "eval(email): refresh v${{ steps.pre.outputs.version }} scorecard from benchmark run"
+          git push origin "HEAD:${{ github.head_ref || github.ref_name }}"
diff --git a/docs/reference/eval-scorecard.mdx b/docs/reference/eval-scorecard.mdx
index fd0acd57a..b00d9d00e 100644
--- a/docs/reference/eval-scorecard.mdx
+++ b/docs/reference/eval-scorecard.mdx
@@ -230,8 +230,29 @@ The gate calls `latest_version_below(scorecards_dir, version)`, which:
 
 The version is read from `gaia-agent.yaml` (via `--manifest`) or passed explicitly (via `--version`).
 
+## Keeping the scorecard current (the update / reject loop)
+
+The scorecard must move with the agent: when LLM-affecting code changes, the eval is re-run and the committed scorecard refreshed — **upward**. A regression is blocked.
+
+Two enforcement points work together:
+
+1. **Reject-on-worse (always on, GitHub-hosted).** The `scorecard-gate` job in `release_agent_<id>.yml` runs on every release. It only parses committed files (no eval), so it runs on a standard runner and **fails the build** if the committed scorecard regressed below the prior version or is missing. This is the hard gate.
+2. **Run-and-refresh (self-hosted AMD).** `gaia eval benchmark` needs Lemonade on AMD hardware, so it cannot run on GitHub-hosted runners — it runs on the `[self-hosted, lemonade-eval]` pool. The `Email Agent Eval — scorecard refresh` workflow (`.github/workflows/email_scorecard_refresh.yml`) runs on demand (and on pushes touching the email agent), re-runs the eval, regenerates the scorecard, then:
+   - **score ≥ committed** → commits the refreshed scorecard back to the branch (the PR carries the improved number);
+   - **score < committed** → fails loudly (the regression must be investigated, or consciously overridden with `--allow-regression`).
+
+So a PR that changes the agent gets its scorecard refreshed (better) or rejected (worse) automatically on the AMD runner, and the release gate is the backstop on hosted CI. Locally, `gen_scorecard.py` + `scorecard_gate.py` reproduce both steps (see the **`adding-eval-scorecard` skill**).
+
+<Warning>
+  The refresh job needs `contents: write` and runs only on the repo's own branches — a fork PR's `GITHUB_TOKEN` is read-only and cannot auto-commit. For a fork PR, run the eval locally/on AMD hardware and commit the scorecard manually; the release gate still enforces no-regression.
+</Warning>
+
 ## Adding a scorecard for a new agent
 
+<Tip>
+  **Use the [`adding-eval-scorecard` skill](https://github.com/amd/gaia/tree/main/.claude/skills/adding-eval-scorecard/SKILL.md).** In Claude Code, invoke it instead of following these steps by hand — it carries the exact commands, the harness→payload→generator flow, the headless-eval gotchas (keyring/PYTHONPATH/tool-timeout), and the verification evidence to capture. The steps below are the reference the skill automates.
+</Tip>
+
 1. Create the `scorecards/` directory beside the agent's canonical README.
 2. Write a `packaging/gen_scorecard.py` adapter (see `hub/agents/python/email/packaging/gen_scorecard.py` for a reference).
 3. Run the eval and call the adapter → commit the resulting `<version>.md`.
diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md
index dcadb4442..7f0891d3d 100644
--- a/hub/agents/npm/agent-email/README.md
+++ b/hub/agents/npm/agent-email/README.md
@@ -2,7 +2,7 @@
 
 [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24**
 
-**Eval scorecard (v0.2.4): aggregate 4.0 / 100** — `category_accuracy` 0.04 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)). The low value reflects strict 4-way exact-match against a different label vocabulary, not triage quality — taxonomy calibration is tracked in [#1266](https://github.com/amd/gaia/issues/1266). The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate.
+**Eval scorecard (v0.2.4): aggregate 4.0 / 100** — `category_accuracy` 0.04 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)). The low value reflects strict exact-match against stale corpus labels (the eval ground-truth still uses the pre-schema-2.0 4-way taxonomy), not triage quality — tracked in [#1874](https://github.com/amd/gaia/issues/1874). The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate.
 
 Embed the **GAIA email agent** in your JS/TS app. It triages, organizes, replies
 to, and schedules from Gmail and Outlook — with every email body analyzed
diff --git a/hub/agents/npm/agent-email/scorecards/0.2.4.md b/hub/agents/npm/agent-email/scorecards/0.2.4.md
index 5fad58585..1c5f89480 100644
--- a/hub/agents/npm/agent-email/scorecards/0.2.4.md
+++ b/hub/agents/npm/agent-email/scorecards/0.2.4.md
@@ -14,7 +14,8 @@ recipe:
     a synthetic labeled corpus via FakeGmailBackend; no LLM judge. NOTE: the agent''s
     triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority
     labels currently overlap only on ''urgent'', so this exact-match metric understates
-    triage usefulness — taxonomy calibration is tracked in amd/gaia#1266'
+    triage usefulness — the corpus labels are stale vs the schema-2.0 taxonomy, tracked
+    in amd/gaia#1874'
   config:
     harness: gaia eval benchmark
     model: Gemma-4-E4B-it-GGUF
@@ -35,7 +36,7 @@ aggregate:
     value: 0.04
     weight: 1.0
   value: 4.0
-generated_at: '2026-06-26T14:07:51.768804+00:00'
+generated_at: '2026-06-26T14:38:25.168352+00:00'
 inherited_from: null
 ---
 # Email Triage — Eval Scorecard v0.2.4
@@ -50,7 +51,7 @@ inherited_from: null
 | Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, 4-category priority labels: informational / actionable / urgent / low priority) |
 | Dataset size | 220 labeled examples |
 | Test cases run | 25 |
-| Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth priority label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. NOTE: the agent's triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority labels currently overlap only on 'urgent', so this exact-match metric understates triage usefulness — taxonomy calibration is tracked in amd/gaia#1266 |
+| Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth priority label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. NOTE: the agent's triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority labels currently overlap only on 'urgent', so this exact-match metric understates triage usefulness — the corpus labels are stale vs the schema-2.0 taxonomy, tracked in amd/gaia#1874 |
 
 ## Metrics
 
diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py
index b10681ffe..fdfdf1eae 100644
--- a/hub/agents/python/email/packaging/gen_scorecard.py
+++ b/hub/agents/python/email/packaging/gen_scorecard.py
@@ -221,8 +221,8 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None):
             "FakeGmailBackend; no LLM judge. NOTE: the agent's triage taxonomy "
             "(fyi / needs_response / promotional / urgent) and the corpus "
             "priority labels currently overlap only on 'urgent', so this "
-            "exact-match metric understates triage usefulness — taxonomy "
-            "calibration is tracked in amd/gaia#1266"
+            "exact-match metric understates triage usefulness — the corpus "
+            "labels are stale vs the schema-2.0 taxonomy, tracked in amd/gaia#1874"
         ),
         config={
             "harness": "gaia eval benchmark",

From 01d6da4696191efeb69a26859f02e6f07280d7a1 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 10:49:58 -0400
Subject: [PATCH 10/18] feat(eval): surface eval scorecard in Agent Hub worker
 and publish flow

Adds `eval_scorecard_url` and `eval_score` fields end-to-end through the
worker catalog pipeline so the Agent Hub listing can show a benchmark
aggregate and link to the full scorecard.

Worker: `evalScorecardKey()` storage helper, optional `eval_scorecard`
multipart part in POST /publish (stored as `eval-scorecard.md` per version),
YAML front-matter parse of `aggregate.value` in `toIndexEntry`, and both
fields carried through `rebuildIndex`. Missing/unparseable scorecard yields
undefined fields, never throws.

Publish: `--eval-scorecard <path>` flag in `publish_to_r2.py`; the GHA
release workflow conditionally passes the versioned scorecard file when it
exists under `hub/agents/npm/agent-email/scorecards/<version>.md`.

Python catalog: `merge_with_registry` threads the two new optional fields
from the R2 index entry into the unified catalog dict so the UI backend
serves them alongside existing agent metadata.

Tests: two focused tests in routes.test.ts cover the present/absent
scorecard cases (69 tests total, all pass).
---
 .github/workflows/release_agent_email.yml     |  7 +++
 .../python/email/packaging/publish_to_r2.py   | 29 +++++++++
 src/gaia/hub/catalog.py                       |  9 ++-
 workers/agent-hub/src/catalog.ts              | 41 +++++++++++--
 workers/agent-hub/src/publish.ts              | 12 +++-
 workers/agent-hub/src/storage.ts              | 19 ++++++
 workers/agent-hub/src/types.ts                |  4 ++
 workers/agent-hub/test/fake-r2.ts             |  2 +
 workers/agent-hub/test/routes.test.ts         | 61 +++++++++++++++++++
 9 files changed, 178 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/release_agent_email.yml b/.github/workflows/release_agent_email.yml
index 2ae937ab6..1c624f2f2 100644
--- a/.github/workflows/release_agent_email.yml
+++ b/.github/workflows/release_agent_email.yml
@@ -475,6 +475,12 @@ jobs:
             case "$f" in *.json) continue ;; esac
             args+=(--artifact "$f")
           done
+          VER="${{ steps.ver.outputs.version }}"
+          scorecard_args=()
+          SCORECARD="hub/agents/npm/agent-email/scorecards/${VER}.md"
+          if [ -f "${SCORECARD}" ]; then
+            scorecard_args+=(--eval-scorecard "${SCORECARD}")
+          fi
           python hub/agents/python/email/packaging/publish_to_r2.py \
             --base-url "${GAIA_HUB_PUBLISH_URL:-${GAIA_HUB_BASE_URL:-https://hub.amd-gaia.ai}}" \
             --manifest "${MANIFEST}" \
@@ -482,6 +488,7 @@ jobs:
             --changelog "${CHANGELOG}" \
             --spec "${SPEC}" \
             --skill "${SKILL}" \
+            "${scorecard_args[@]}" \
             "${args[@]}" \
             --summary-out published.json
           echo "=== publish summary ==="
diff --git a/hub/agents/python/email/packaging/publish_to_r2.py b/hub/agents/python/email/packaging/publish_to_r2.py
index 4da341837..5884cb976 100644
--- a/hub/agents/python/email/packaging/publish_to_r2.py
+++ b/hub/agents/python/email/packaging/publish_to_r2.py
@@ -129,6 +129,7 @@ def publish_one(
     changelog_bytes: bytes | None = None,
     spec_bytes: bytes | None = None,
     skill_bytes: bytes | None = None,
+    eval_scorecard_bytes: bytes | None = None,
     package_files_bytes: bytes | None = None,
 ) -> dict:
     if not artifact_path.exists():
@@ -172,6 +173,10 @@ def publish_one(
             files["spec"] = ("SPEC.md", spec_bytes, "text/markdown")
         if skill_bytes is not None:
             files["skill"] = ("SKILL.md", skill_bytes, "text/markdown")
+        # The eval scorecard rides along with the first platform binary and becomes
+        # the catalog entry's `eval_score` and `eval_scorecard_url`.
+        if eval_scorecard_bytes is not None:
+            files["eval_scorecard"] = ("eval-scorecard.md", eval_scorecard_bytes, "text/markdown")
         # The whole-package file listing rides with the zip artifact — it becomes
         # the catalog entry's `package.files` (the hub's file-list display).
         if package_files_bytes is not None:
@@ -271,6 +276,14 @@ def main(argv=None) -> int:
         help="Path to SKILL.md to publish as the agent's catalog skill "
         "(POSTed as the multipart 'skill' part the Worker accepts).",
     )
+    parser.add_argument(
+        "--eval-scorecard",
+        type=Path,
+        help="Path to the eval scorecard markdown (e.g. scorecards/0.2.4.md) to "
+        "publish as the agent's catalog eval score and scorecard URL "
+        "(POSTed as the multipart 'eval_scorecard' part the Worker accepts). "
+        "Absent = publish without an eval scorecard.",
+    )
     parser.add_argument(
         "--package-files",
         type=Path,
@@ -341,6 +354,21 @@ def main(argv=None) -> int:
             flush=True,
         )
 
+    eval_scorecard_bytes = None
+    if args.eval_scorecard is not None:
+        if not args.eval_scorecard.exists():
+            raise SystemExit(
+                f"error: --eval-scorecard path not found: {args.eval_scorecard}. "
+                "Pass the scorecard markdown, or omit --eval-scorecard to publish "
+                "without one."
+            )
+        eval_scorecard_bytes = args.eval_scorecard.read_bytes()
+        print(
+            f"[publish] attaching eval scorecard: {args.eval_scorecard} "
+            f"({len(eval_scorecard_bytes)} bytes)",
+            flush=True,
+        )
+
     package_files_bytes = None
     if args.package_files is not None:
         if not args.package_files.exists():
@@ -376,6 +404,7 @@ def main(argv=None) -> int:
                 changelog_bytes=changelog_bytes,
                 spec_bytes=spec_bytes,
                 skill_bytes=skill_bytes,
+                eval_scorecard_bytes=eval_scorecard_bytes,
                 package_files_bytes=package_files_bytes,
             )
         )
diff --git a/src/gaia/hub/catalog.py b/src/gaia/hub/catalog.py
index b78337895..461e34b0f 100644
--- a/src/gaia/hub/catalog.py
+++ b/src/gaia/hub/catalog.py
@@ -389,7 +389,7 @@ def merge_with_registry(
 
         language = entry.get("language", "python")
         security_tier = entry.get("security_tier", "experimental")
-        by_id[agent_id] = {
+        merged: Dict[str, Any] = {
             "id": agent_id,
             "name": entry.get("name", agent_id),
             "description": entry.get("description", ""),
@@ -407,6 +407,13 @@ def merge_with_registry(
             "status": status,
             "source": (reg.source if reg is not None else "hub"),
         }
+        # Optional eval scorecard fields — absent from older catalog entries and
+        # from builtin/custom agents that haven't run a benchmark yet.
+        if "eval_score" in entry:
+            merged["eval_score"] = entry["eval_score"]
+        if "eval_scorecard_url" in entry:
+            merged["eval_scorecard_url"] = entry["eval_scorecard_url"]
+        by_id[agent_id] = merged
 
     # 2. Registry-only agents (builtins / custom not published to the hub).
     for agent_id, reg in registered.items():
diff --git a/workers/agent-hub/src/catalog.ts b/workers/agent-hub/src/catalog.ts
index 357cf83f8..786d278d9 100644
--- a/workers/agent-hub/src/catalog.ts
+++ b/workers/agent-hub/src/catalog.ts
@@ -5,11 +5,15 @@
  * Build per-agent manifests and the top-level catalog index.
  */
 
+import { parse as parseYaml } from "yaml";
+
 import { compareSemver } from "./manifest";
 import {
+  evalScorecardKey,
   listAgentIds,
   readAgentManifest,
   readChangelog,
+  readEvalScorecard,
   readPackageFiles,
   readReadme,
   readSkill,
@@ -96,10 +100,31 @@ export function upsertVersion(
   };
 }
 
+/**
+ * Parse the `aggregate.value` from a scorecard's YAML front matter. Returns
+ * undefined when the scorecard is absent, malformed, or missing the field —
+ * never throws so a bad scorecard never breaks the catalog build.
+ */
+function parseScorecardScore(markdown: string | null): number | undefined {
+  if (!markdown) return undefined;
+  // Extract the YAML front matter block between the leading --- delimiters.
+  const match = /^---\n([\s\S]*?)\n---/.exec(markdown);
+  if (!match) return undefined;
+  try {
+    const fm = parseYaml(match[1]) as Record<string, unknown> | null;
+    const agg = fm && typeof fm === "object" ? (fm.aggregate as Record<string, unknown> | undefined) : undefined;
+    const val = agg?.value;
+    return typeof val === "number" && Number.isFinite(val) ? val : undefined;
+  } catch {
+    return undefined;
+  }
+}
+
 /**
  * Build the catalog entry for one agent manifest. `readme`/`changelog` are the
  * latest version's markdown ("" if none was published); `packageFiles` is the
- * whole-package zip's file listing (null if no package zip was published).
+ * whole-package zip's file listing (null if no package zip was published);
+ * `evalScorecard` is the scorecard markdown (null if none was published).
  */
 export function toIndexEntry(
   agent: AgentManifest,
@@ -107,7 +132,9 @@ export function toIndexEntry(
   changelog: string,
   packageFiles: { files: { name: string; size_bytes: number }[] } | null,
   spec = "",
-  skill = ""
+  skill = "",
+  evalScorecard: string | null = null,
+  baseUrl = "https://hub.amd-gaia.ai"
 ): IndexEntry {
   const latest = agent.versions[agent.latest_version];
   const req = agent.requirements;
@@ -154,6 +181,10 @@ export function toIndexEntry(
     // undefined serializes to "key absent" — only present when the manifest set it.
     npm_package: agent.npm_package,
     playground_url: agent.playground_url,
+    eval_scorecard_url: evalScorecard !== null
+      ? `${baseUrl.replace(/\/$/, "")}/${evalScorecardKey(agent.id, agent.latest_version)}`
+      : undefined,
+    eval_score: parseScorecardScore(evalScorecard),
     package: pkg,
   };
 }
@@ -164,7 +195,8 @@ export function toIndexEntry(
  */
 export async function rebuildIndex(
   bucket: R2Bucket,
-  now: Date = new Date()
+  now: Date = new Date(),
+  baseUrl = "https://hub.amd-gaia.ai"
 ): Promise<CatalogIndex> {
   const ids = await listAgentIds(bucket);
   const entries: IndexEntry[] = [];
@@ -176,7 +208,8 @@ export async function rebuildIndex(
     const packageFiles = await readPackageFiles(bucket, id, agent.latest_version);
     const spec = await readSpec(bucket, id, agent.latest_version);
     const skill = await readSkill(bucket, id, agent.latest_version);
-    entries.push(toIndexEntry(agent, readme, changelog, packageFiles, spec, skill));
+    const evalScorecard = await readEvalScorecard(bucket, id, agent.latest_version);
+    entries.push(toIndexEntry(agent, readme, changelog, packageFiles, spec, skill, evalScorecard, baseUrl));
   }
   entries.sort((a, b) => a.id.localeCompare(b.id));
 
diff --git a/workers/agent-hub/src/publish.ts b/workers/agent-hub/src/publish.ts
index 6c9b638dd..626b8c65e 100644
--- a/workers/agent-hub/src/publish.ts
+++ b/workers/agent-hub/src/publish.ts
@@ -18,6 +18,7 @@ import { parseManifest } from "./manifest";
 import {
   artifactKey,
   changelogKey,
+  evalScorecardKey,
   packageFilesKey,
   rawManifestKey,
   readAgentManifest,
@@ -173,6 +174,9 @@ export async function handlePublish(
   // semantics as README/CHANGELOG.
   const specText = await optionalMarkdownPart(form, "spec", "SPEC.md");
   const skillText = await optionalMarkdownPart(form, "skill", "SKILL.md");
+  // Optional eval scorecard markdown (the agent's benchmark results, rendered on
+  // the hub listing as an aggregate score + link). Per-version, first-POST semantics.
+  const evalScorecardText = await optionalMarkdownPart(form, "eval_scorecard", "eval-scorecard.md");
   // Optional whole-package file listing (the zip's contents, for the hub's file
   // list). The zip itself rides in as a normal `artifact`; this is just the
   // manifest of what's inside it.
@@ -276,6 +280,11 @@ export async function handlePublish(
         httpMetadata: { contentType: "text/markdown; charset=utf-8" },
       });
     }
+    if (evalScorecardText != null) {
+      await env.BUCKET.put(evalScorecardKey(manifest.id, manifest.version), evalScorecardText, {
+        httpMetadata: { contentType: "text/markdown; charset=utf-8" },
+      });
+    }
   }
 
   // The package file listing rides the whole-package zip POST, which in a real
@@ -296,7 +305,8 @@ export async function handlePublish(
   const updated = upsertVersion(existing, manifest, versionEntry);
   await writeAgentManifest(env.BUCKET, updated);
 
-  const index = await rebuildIndex(env.BUCKET, now);
+  const baseUrl = new URL(request.url).origin;
+  const index = await rebuildIndex(env.BUCKET, now, baseUrl);
 
   return json(
     {
diff --git a/workers/agent-hub/src/storage.ts b/workers/agent-hub/src/storage.ts
index 0b15640f6..366e3fe84 100644
--- a/workers/agent-hub/src/storage.ts
+++ b/workers/agent-hub/src/storage.ts
@@ -52,6 +52,10 @@ export function skillKey(id: string, version: string): string {
   return `${versionDir(id, version)}SKILL.md`;
 }
 
+export function evalScorecardKey(id: string, version: string): string {
+  return `${versionDir(id, version)}eval-scorecard.md`;
+}
+
 export function packageFilesKey(id: string, version: string): string {
   return `${versionDir(id, version)}package-files.json`;
 }
@@ -114,6 +118,21 @@ export async function readSkill(
   return obj.text();
 }
 
+/**
+ * Read the eval scorecard markdown for one published version. Returns null when
+ * none was published — the `eval_scorecard` form part is optional, so its
+ * absence is not an error.
+ */
+export async function readEvalScorecard(
+  bucket: R2Bucket,
+  id: string,
+  version: string
+): Promise<string | null> {
+  const obj = await bucket.get(evalScorecardKey(id, version));
+  if (!obj) return null;
+  return obj.text();
+}
+
 /**
  * Read the whole-package file listing (`{ files: [{name, size_bytes}] }`) for one
  * version, or null when none was published — the `package_files` form part on
diff --git a/workers/agent-hub/src/types.ts b/workers/agent-hub/src/types.ts
index 571d75f9d..36df4a811 100644
--- a/workers/agent-hub/src/types.ts
+++ b/workers/agent-hub/src/types.ts
@@ -199,6 +199,10 @@ export interface IndexEntry {
   npm_package?: string;
   /** Localhost playground URL served by the agent's sidecar; absent otherwise. */
   playground_url?: string;
+  /** Public URL of the eval scorecard markdown for the latest version; absent when none was published. */
+  eval_scorecard_url?: string;
+  /** Aggregate eval score (0–100) parsed from the latest version's scorecard front matter; absent when none was published or parseable. */
+  eval_score?: number;
   /**
    * Whole-package download: a single zip (all platform binaries + client + docs)
    * plus its file listing. Present only when a `package_files` manifest was
diff --git a/workers/agent-hub/test/fake-r2.ts b/workers/agent-hub/test/fake-r2.ts
index 79284f98b..9e149c681 100644
--- a/workers/agent-hub/test/fake-r2.ts
+++ b/workers/agent-hub/test/fake-r2.ts
@@ -159,6 +159,7 @@ export function publishRequest(opts: {
   changelog?: string;
   spec?: string;
   skill?: string;
+  evalScorecard?: string;
   packageFiles?: string;
 }): Request {
   const form = new FormData();
@@ -167,6 +168,7 @@ export function publishRequest(opts: {
   if (opts.changelog !== undefined) form.set("changelog", opts.changelog);
   if (opts.spec !== undefined) form.set("spec", opts.spec);
   if (opts.skill !== undefined) form.set("skill", opts.skill);
+  if (opts.evalScorecard !== undefined) form.set("eval_scorecard", opts.evalScorecard);
   if (opts.packageFiles !== undefined) form.set("package_files", opts.packageFiles);
   const bytes = typeof opts.artifact === "string" ? new TextEncoder().encode(opts.artifact) : opts.artifact;
   form.set(
diff --git a/workers/agent-hub/test/routes.test.ts b/workers/agent-hub/test/routes.test.ts
index bb602d127..29505b207 100644
--- a/workers/agent-hub/test/routes.test.ts
+++ b/workers/agent-hub/test/routes.test.ts
@@ -81,3 +81,64 @@ describe("GET routes", () => {
     expect(res.status).toBe(405);
   });
 });
+
+// Minimal YAML front matter matching the email agent's scorecard shape.
+const SAMPLE_SCORECARD = [
+  "---",
+  "schema_version: 1",
+  "agent:",
+  "  name: Test Agent",
+  "  version: 0.1.0",
+  "aggregate:",
+  "  name: weighted_accuracy",
+  "  value: 87.5",
+  "generated_at: '2026-06-26T00:00:00Z'",
+  "---",
+  "# Test Agent — Eval Scorecard v0.1.0",
+  "",
+  "**Aggregate score: 87.5** (out of 100)",
+].join("\n");
+
+describe("eval scorecard in catalog", () => {
+  it("exposes eval_score and eval_scorecard_url when a scorecard is published", async () => {
+    const env = makeEnv();
+    await worker.fetch(
+      publishRequest({
+        token: "tok_amd",
+        manifestYaml: sampleManifest({ id: "chat", version: "0.1.0" }),
+        artifact: "chat-wheel",
+        filename: "gaia_agent_chat-0.1.0-py3-none-any.whl",
+        evalScorecard: SAMPLE_SCORECARD,
+      }),
+      env as never
+    );
+
+    const res = await worker.fetch(get("/index.json"), env as never);
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as any;
+    const entry = body.agents[0];
+    expect(entry.eval_score).toBe(87.5);
+    expect(entry.eval_scorecard_url).toMatch(/\/agents\/chat\/0\.1\.0\/eval-scorecard\.md$/);
+  });
+
+  it("omits eval_score and eval_scorecard_url when no scorecard is published", async () => {
+    const env = makeEnv();
+    await worker.fetch(
+      publishRequest({
+        token: "tok_amd",
+        manifestYaml: sampleManifest({ id: "chat", version: "0.1.0" }),
+        artifact: "chat-wheel",
+        filename: "gaia_agent_chat-0.1.0-py3-none-any.whl",
+        // no evalScorecard
+      }),
+      env as never
+    );
+
+    const res = await worker.fetch(get("/index.json"), env as never);
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as any;
+    const entry = body.agents[0];
+    expect(entry.eval_score).toBeUndefined();
+    expect(entry.eval_scorecard_url).toBeUndefined();
+  });
+});

From add517249ca54a91d42dbea21715ec213957fb3e Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 10:50:04 -0400
Subject: [PATCH 11/18] feat(eval): show eval score and scorecard link in Agent
 UI detail modal

Adds `eval_score` and `eval_scorecard_url` optional fields to `AgentInfo`
in the frontend type definitions. When an agent has an eval score, the
detail modal renders an "Eval scorecard" section showing the numeric score
out of 100, with a "View scorecard" link when the URL is present. Renders
nothing when neither field is set (no empty section).
---
 .../webui/src/components/AgentDetailModal.tsx | 26 ++++++++++++++++++-
 src/gaia/apps/webui/src/types/index.ts        |  4 +++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/gaia/apps/webui/src/components/AgentDetailModal.tsx b/src/gaia/apps/webui/src/components/AgentDetailModal.tsx
index 7de638328..b1a2dd954 100644
--- a/src/gaia/apps/webui/src/components/AgentDetailModal.tsx
+++ b/src/gaia/apps/webui/src/components/AgentDetailModal.tsx
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: MIT
 
 import { useEffect, useCallback } from 'react';
-import { Wrench, Cpu, Shield, X, HardDrive, CheckCircle2, FlaskConical, AlertTriangle } from 'lucide-react';
+import { Wrench, Cpu, Shield, X, HardDrive, CheckCircle2, FlaskConical, AlertTriangle, BarChart2 } from 'lucide-react';
 import { getAgentIcon } from './agentIcons';
 import type { AgentInfo } from '../types';
 
@@ -172,6 +172,30 @@ export function AgentDetailModal({ agent, onClose, onStartChat }: AgentDetailMod
                         </div>
                     )}
 
+                    {/* Eval scorecard */}
+                    {agent.eval_score != null && (
+                        <div className="agent-detail-section">
+                            <div className="agent-detail-section-title">Eval scorecard</div>
+                            <div className="agent-detail-meta-item">
+                                <BarChart2 size={14} />
+                                <div>
+                                    <div className="agent-detail-meta-label">Eval score</div>
+                                    <div className="agent-detail-meta-value">
+                                        {agent.eval_score} / 100
+                                        {agent.eval_scorecard_url && (
+                                            <> &mdash; <a
+                                                href={agent.eval_scorecard_url}
+                                                target="_blank"
+                                                rel="noopener noreferrer"
+                                                style={{ color: 'var(--accent)', textDecoration: 'underline', fontSize: 12 }}
+                                            >View scorecard</a></>
+                                        )}
+                                    </div>
+                                </div>
+                            </div>
+                        </div>
+                    )}
+
                     {/* Conversation starters */}
                     {starters.length > 0 && (
                         <div className="agent-detail-section">
diff --git a/src/gaia/apps/webui/src/types/index.ts b/src/gaia/apps/webui/src/types/index.ts
index 7b1aa9447..10c610e57 100644
--- a/src/gaia/apps/webui/src/types/index.ts
+++ b/src/gaia/apps/webui/src/types/index.ts
@@ -117,6 +117,10 @@ export interface AgentInfo {
     avatar_url?: string;
     /** True when the publisher has deprecated this agent. */
     deprecated?: boolean;
+    /** Public URL of the eval scorecard markdown; absent when none was published. */
+    eval_scorecard_url?: string;
+    /** Aggregate eval score (0–100) from the latest published scorecard; absent when none. */
+    eval_score?: number;
 }
 
 /** Derived card state for the Agent Hub (issue #1097). */

From 0eed4456920ebf57b95f2ef298e3d1f5865a9428 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 12:48:13 -0400
Subject: [PATCH 12/18] feat(eval): regenerate email v0.2.4 scorecard against
 relabeled corpus

After #1875 relabeled the eval corpus to the schema-2.0 triage taxonomy, the
email agent's predictions and the ground-truth labels share one vocabulary, so
category_accuracy now measures real agreement: 0.40 over 25 of 220 emails ->
aggregate 40.0/100 (was 4.0, a labeling artifact). Fresh gaia eval benchmark run
on AMD Strix Halo. Drop the now-resolved #1874 caveat from the adapter
methodology + README; align the dataset description to the schema-2.0 taxonomy.
---
 hub/agents/npm/agent-email/README.md          |  2 +-
 .../npm/agent-email/scorecards/0.2.4.md       | 31 +++++++++----------
 .../python/email/packaging/gen_scorecard.py   | 13 +++-----
 3 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md
index 7f0891d3d..c8c079b2a 100644
--- a/hub/agents/npm/agent-email/README.md
+++ b/hub/agents/npm/agent-email/README.md
@@ -2,7 +2,7 @@
 
 [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24**
 
-**Eval scorecard (v0.2.4): aggregate 4.0 / 100** — `category_accuracy` 0.04 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)). The low value reflects strict exact-match against stale corpus labels (the eval ground-truth still uses the pre-schema-2.0 4-way taxonomy), not triage quality — tracked in [#1874](https://github.com/amd/gaia/issues/1874). The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate.
+**Eval scorecard (v0.2.4): aggregate 40.0 / 100** — `category_accuracy` 0.40 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate.
 
 Embed the **GAIA email agent** in your JS/TS app. It triages, organizes, replies
 to, and schedules from Gmail and Outlook — with every email body analyzed
diff --git a/hub/agents/npm/agent-email/scorecards/0.2.4.md b/hub/agents/npm/agent-email/scorecards/0.2.4.md
index 1c5f89480..7e36786a1 100644
--- a/hub/agents/npm/agent-email/scorecards/0.2.4.md
+++ b/hub/agents/npm/agent-email/scorecards/0.2.4.md
@@ -7,15 +7,12 @@ recipe:
   dataset:
     reference: tests/fixtures/email/ground_truth.json
     description: 'Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend,
-      4-category priority labels: informational / actionable / urgent / low priority)'
+      schema-2.0 triage taxonomy: fyi / needs_response / promotional / urgent / personal)'
     size: 220
-  methodology: 'gaia eval benchmark — category classification accuracy (case-insensitive
-    exact match of the agent''s triage label vs the ground-truth priority label) over
-    a synthetic labeled corpus via FakeGmailBackend; no LLM judge. NOTE: the agent''s
-    triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority
-    labels currently overlap only on ''urgent'', so this exact-match metric understates
-    triage usefulness — the corpus labels are stale vs the schema-2.0 taxonomy, tracked
-    in amd/gaia#1874'
+  methodology: gaia eval benchmark — category classification accuracy (case-insensitive
+    exact match of the agent's triage label vs the ground-truth label) over a synthetic
+    labeled corpus via FakeGmailBackend; no LLM judge. The corpus uses the schema-2.0
+    triage taxonomy, aligned with the agent's output labels (#1874)
   config:
     harness: gaia eval benchmark
     model: Gemma-4-E4B-it-GGUF
@@ -26,36 +23,36 @@ results:
   test_cases_run: 25
   metrics:
   - name: category_accuracy
-    value: 0.04
+    value: 0.4
     weight: 1.0
 aggregate:
   name: weighted_accuracy
   formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2)
   components:
   - metric: category_accuracy
-    value: 0.04
+    value: 0.4
     weight: 1.0
-  value: 4.0
-generated_at: '2026-06-26T14:38:25.168352+00:00'
+  value: 40.0
+generated_at: '2026-06-26T16:47:13.735478+00:00'
 inherited_from: null
 ---
 # Email Triage — Eval Scorecard v0.2.4
 
-**Aggregate score: 4.0** (out of 100)
+**Aggregate score: 40.0** (out of 100)
 
 ## Recipe
 
 | Field | Value |
 |-------|-------|
 | Dataset | [tests/fixtures/email/ground_truth.json](tests/fixtures/email/ground_truth.json) |
-| Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, 4-category priority labels: informational / actionable / urgent / low priority) |
+| Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, schema-2.0 triage taxonomy: fyi / needs_response / promotional / urgent / personal) |
 | Dataset size | 220 labeled examples |
 | Test cases run | 25 |
-| Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth priority label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. NOTE: the agent's triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority labels currently overlap only on 'urgent', so this exact-match metric understates triage usefulness — the corpus labels are stale vs the schema-2.0 taxonomy, tracked in amd/gaia#1874 |
+| Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. The corpus uses the schema-2.0 triage taxonomy, aligned with the agent's output labels (#1874) |
 
 ## Metrics
 
-  - **category_accuracy**: 0.0400 × 1.0
+  - **category_accuracy**: 0.4000 × 1.0
 
 ## Aggregate score recomputation
 
@@ -64,7 +61,7 @@ Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)`
 Worked example:
 
 ```
-round(100 × ((0.0400 × 1.0)) / 1.0, 2) = 4.0
+round(100 × ((0.4000 × 1.0)) / 1.0, 2) = 40.0
 ```
 
 A reader can reproduce this value from the `aggregate.components` in the front
diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py
index fdfdf1eae..10d036442 100644
--- a/hub/agents/python/email/packaging/gen_scorecard.py
+++ b/hub/agents/python/email/packaging/gen_scorecard.py
@@ -210,19 +210,16 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None):
         dataset_reference="tests/fixtures/email/ground_truth.json",
         dataset_description=(
             "Synthetic email corpus for GAIA email-triage evaluation "
-            "(FakeGmailBackend, 4-category priority labels: "
-            "informational / actionable / urgent / low priority)"
+            "(FakeGmailBackend, schema-2.0 triage taxonomy: "
+            "fyi / needs_response / promotional / urgent / personal)"
         ),
         dataset_size=dataset_size,
         methodology=(
             "gaia eval benchmark — category classification accuracy "
             "(case-insensitive exact match of the agent's triage label vs the "
-            "ground-truth priority label) over a synthetic labeled corpus via "
-            "FakeGmailBackend; no LLM judge. NOTE: the agent's triage taxonomy "
-            "(fyi / needs_response / promotional / urgent) and the corpus "
-            "priority labels currently overlap only on 'urgent', so this "
-            "exact-match metric understates triage usefulness — the corpus "
-            "labels are stale vs the schema-2.0 taxonomy, tracked in amd/gaia#1874"
+            "ground-truth label) over a synthetic labeled corpus via "
+            "FakeGmailBackend; no LLM judge. The corpus uses the schema-2.0 "
+            "triage taxonomy, aligned with the agent's output labels (#1874)"
         ),
         config={
             "harness": "gaia eval benchmark",

From f5971b67cfae403bcfd85b466e60e61ba1fc2da4 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 13:29:20 -0400
Subject: [PATCH 13/18] refactor(eval): single SCORECARD.md per agent, new gate
 interface, reproduction section

Storage convention changes from scorecards/<version>.md to a single SCORECARD.md
updated in place (versioned via publish snapshot, same as README.md).

- release_scorecard.py: add reproduction_command to ResultPayload; render_scorecard
  emits a Reproduction section; carry_forward reads version from front matter instead
  of filename stem; remove latest_version_below (per-version dirs gone); fix utcnow
  -> now(utc)
- scorecard_gate.py: redesigned to accept --scorecard SCORECARD.md + optional
  --baseline-file / --baseline-ref (mutually exclusive); no --scorecards-dir or
  --version flags; --baseline-ref resolves via git show; absence at ref = first
  adoption pass; git-shellout-free when --baseline-file is used
- gen_scorecard.py: writes hub/agents/npm/agent-email/SCORECARD.md (not
  scorecards/<ver>.md); supplies reproduction_command with exact env vars and commands
- tests: updated for new carry_forward signature, new gate interface, reproduction
  section assertions, second-agent generalization test, utcnow -> now(utc)
---
 .../python/email/packaging/gen_scorecard.py   |  53 +++-
 src/gaia/eval/release_scorecard.py            | 123 ++++----
 src/gaia/eval/scorecard_gate.py               | 266 ++++++++++--------
 tests/unit/eval/test_release_scorecard.py     | 136 ++++++---
 tests/unit/eval/test_scorecard_gate.py        | 202 ++++++++-----
 5 files changed, 477 insertions(+), 303 deletions(-)

diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py
index 10d036442..1837a1389 100644
--- a/hub/agents/python/email/packaging/gen_scorecard.py
+++ b/hub/agents/python/email/packaging/gen_scorecard.py
@@ -7,7 +7,8 @@
 Reads the benchmark ``--output-dir`` (looks for a JSON file containing a
 ``scenarios`` key — ``scorecard.json`` in a real run, or any ``*scorecard*.json``
 fixture) and the ground-truth JSON, builds a :class:`ResultPayload`, and writes the
-scorecard to ``hub/agents/npm/agent-email/scorecards/<version>.md``.
+scorecard to ``hub/agents/npm/agent-email/SCORECARD.md`` (a single file, updated
+in place — versioned via the publish snapshot, the same way README.md works).
 
 This adapter imports ``gaia.eval.release_scorecard`` (core generator) but never
 imports the eval harness (``gaia.eval.benchmark``) or the email-agent package —
@@ -15,9 +16,13 @@
 
 Usage::
 
+    PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\
+    GAIA_AGENT_TOOL_TIMEOUT=120 \\
+    PYTHONPATH="$(pwd)" \\
     python hub/agents/python/email/packaging/gen_scorecard.py \\
         --benchmark-dir /tmp/email-eval \\
-        [--ground-truth tests/fixtures/email/ground_truth.json]
+        [--ground-truth tests/fixtures/email/ground_truth.json] \\
+        [--limit 25]
 
 The ``--ground-truth`` path defaults to the canonical fixture in the repository.
 """
@@ -42,6 +47,9 @@
 # Canonical benchmark scorecard filename (written by gaia eval benchmark)
 _SCORECARD_FILENAME = "scorecard.json"
 
+# Output filename: single SCORECARD.md per agent package, updated in place.
+_OUTPUT_FILENAME = "SCORECARD.md"
+
 
 def _find_benchmark_scorecard(benchmark_dir: Path) -> Path:
     """Locate the benchmark scorecard JSON in ``benchmark_dir``.
@@ -204,6 +212,30 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None):
 
     import datetime
 
+    # Construct an exact reproduction command using the supplied arguments, so any
+    # reader can reproduce the scorecard result from scratch.
+    limit_flag = f" --limit {limit}" if limit is not None else ""
+    ground_truth_rel = (
+        str(ground_truth_path.relative_to(_REPO_ROOT))
+        if str(ground_truth_path).startswith(str(_REPO_ROOT))
+        else ground_truth_path.name
+    )
+    benchmark_dir_display = str(benchmark_dir)
+    reproduction_command = (
+        "# Step 1: run the benchmark (requires a running Lemonade Server on :13305)\n"
+        f"PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n"
+        f"GAIA_AGENT_TOOL_TIMEOUT=120 \\\n"
+        f"PYTHONPATH=\"$(pwd)\" \\\n"
+        f"gaia eval benchmark{limit_flag}\n\n"
+        "# Step 2: generate the scorecard from the benchmark output\n"
+        f"PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n"
+        f"PYTHONPATH=\"$(pwd)\" \\\n"
+        f"python hub/agents/python/email/packaging/gen_scorecard.py \\\n"
+        f"    --benchmark-dir {benchmark_dir_display} \\\n"
+        f"    --ground-truth {ground_truth_rel}"
+        + (f" \\\n    --limit {limit}" if limit is not None else "")
+    )
+
     return ResultPayload(
         agent_name="Email Triage",
         agent_version=version,
@@ -227,11 +259,7 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None):
             "corpus": "tests/fixtures/email/synthetic_inbox.mbox",
             # Store a repo-relative path — never leak a local absolute path into
             # a committed/published artifact.
-            "ground_truth": (
-                str(ground_truth_path.relative_to(_REPO_ROOT))
-                if str(ground_truth_path).startswith(str(_REPO_ROOT))
-                else ground_truth_path.name
-            ),
+            "ground_truth": ground_truth_rel,
             "limit": limit,
         },
         test_cases_run=test_cases_run,
@@ -239,6 +267,7 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None):
         aggregate_name="weighted_accuracy",
         generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
         inherited_from=None,
+        reproduction_command=reproduction_command,
     )
 
 
@@ -268,7 +297,7 @@ def main(argv=None) -> int:
         default=None,
         help=(
             "Override the scorecard output directory "
-            "(default: hub/agents/npm/agent-email/scorecards/)."
+            f"(default: hub/agents/npm/agent-email/, writes {_OUTPUT_FILENAME})."
         ),
     )
     parser.add_argument(
@@ -296,12 +325,12 @@ def main(argv=None) -> int:
     from gaia.eval.release_scorecard import write_scorecard
 
     if args.output_dir:
-        scorecards_dir = Path(args.output_dir)
+        out_dir = Path(args.output_dir)
     else:
-        scorecards_dir = _NPM_ROOT / "scorecards"
+        out_dir = _NPM_ROOT
 
-    scorecards_dir.mkdir(parents=True, exist_ok=True)
-    out_path = scorecards_dir / f"{payload.agent_version}.md"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out_path = out_dir / _OUTPUT_FILENAME
     write_scorecard(payload, out_path)
 
     print(
diff --git a/src/gaia/eval/release_scorecard.py b/src/gaia/eval/release_scorecard.py
index d50449f23..46111455e 100644
--- a/src/gaia/eval/release_scorecard.py
+++ b/src/gaia/eval/release_scorecard.py
@@ -1,12 +1,18 @@
 # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 """
-Per-agent / per-version eval scorecard: generator, parser, validator, and versioning helpers.
+Per-agent eval scorecard: generator, parser, validator, and versioning helpers.
 
 **Distinct from** ``src/gaia/eval/scorecard.py`` — that module is the per-eval-run
 scenario PASS/FAIL aggregator (``build_scorecard``). This module produces the
-outward-facing *release artifact*: a versioned Markdown file with YAML front matter
-holding measured accuracy metrics, the eval recipe, and a deterministic aggregate score.
+outward-facing *release artifact*: a single ``SCORECARD.md`` file (updated in
+place per release, versioned via the publish snapshot — the same way README.md
+works) with YAML front matter holding measured accuracy metrics, the eval recipe,
+a deterministic aggregate score, and a Reproduction section.
+
+Storage convention: ``<agent-npm-root>/SCORECARD.md``  (NOT ``scorecards/<ver>.md``).
+Per-version uniqueness comes from the publish snapshot in R2 (the hub stores every
+doc per version at ``agents/<id>/<version>/SCORECARD.md``).
 
 Intentionally harness-agnostic: this module imports ONLY stdlib + PyYAML.
 No other loader is permitted — ``yaml.safe_load`` only.
@@ -63,6 +69,9 @@ class ResultPayload:
         generated_at: ISO-8601 timestamp string; informational only.
         inherited_from: If this is a patch carry-forward, the prior version string;
             otherwise None.
+        reproduction_command: Optional exact shell command(s) to reproduce this
+            scorecard run.  Rendered in the ``## Reproduction`` section.  If None,
+            a generic pointer to the docs/skill is rendered instead.
     """
 
     agent_name: str
@@ -77,6 +86,7 @@ class ResultPayload:
     aggregate_name: str = "weighted_accuracy"
     generated_at: str = ""
     inherited_from: Optional[str] = None
+    reproduction_command: Optional[str] = None
 
 
 def compute_aggregate(metrics: list) -> tuple:
@@ -121,7 +131,8 @@ def render_scorecard(payload: ResultPayload) -> str:
     """Render a scorecard as Markdown with YAML front matter.
 
     The front matter is machine-readable; the body is a human-readable summary
-    that includes the aggregate formula and a worked recomputation example.
+    that includes the aggregate formula, a worked recomputation example, and a
+    Reproduction section so any reader can reproduce the result from scratch.
 
     Args:
         payload: Populated :class:`ResultPayload`.
@@ -182,6 +193,22 @@ def render_scorecard(payload: ResultPayload) -> str:
     total_w = sum(c["weight"] for c in components)
     worked = " + ".join(f"({c['value']:.4f} × {c['weight']:.1f})" for c in components)
 
+    # Reproduction section
+    if payload.reproduction_command:
+        repro_body = (
+            "Run the following commands from the repository root:\n\n"
+            f"```sh\n{payload.reproduction_command}\n```\n\n"
+            "See [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) "
+            "and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) "
+            "for the full setup guide."
+        )
+    else:
+        repro_body = (
+            "See the [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) "
+            "and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) "
+            "for the full reproduction recipe."
+        )
+
     body = f"""# {payload.agent_name} — Eval Scorecard v{payload.agent_version}
 
 **Aggregate score: {agg_value}** (out of 100)
@@ -212,6 +239,10 @@ def render_scorecard(payload: ResultPayload) -> str:
 
 A reader can reproduce this value from the `aggregate.components` in the front
 matter alone — no eval-harness access needed.
+
+## Reproduction
+
+{repro_body}
 """
 
     if payload.inherited_from:
@@ -372,66 +403,19 @@ def _assert_valid_version(version: str) -> None:
         )
 
 
-def _assert_safe_path(scorecards_dir: Path, version: str) -> Path:
-    """Return ``scorecards_dir / f"{version}.md"`` after path-traversal guard."""
-    _assert_valid_version(version)
-    scorecards_dir = scorecards_dir.resolve()
-    candidate = (scorecards_dir / f"{version}.md").resolve()
-    if not str(candidate).startswith(str(scorecards_dir)):
-        raise ValueError(
-            f"Resolved scorecard path {candidate} is not inside "
-            f"scorecards dir {scorecards_dir} — possible path traversal."
-        )
-    return candidate
-
-
-def latest_version_below(scorecards_dir: Path, version: str) -> Optional[str]:
-    """Return the greatest version in ``scorecards_dir`` strictly less than ``version``.
-
-    Only files whose stem matches the anchored semver regex ``^\\d+\\.\\d+\\.\\d+$``
-    are considered. Non-matching filenames (README.md, .gitkeep, etc.) are silently
-    skipped.
-
-    Args:
-        scorecards_dir: Directory to scan for ``*.md`` scorecards.
-        version: The candidate version string (must be valid semver).
-
-    Returns:
-        The greatest matching version string strictly below ``version``, or ``None``
-        if no such version exists.
-
-    Raises:
-        ValueError: If ``version`` is not a valid semver string.
-    """
-    _assert_valid_version(version)
-    target_tuple = _semver_tuple(version)
-    scorecards_dir = Path(scorecards_dir)
-
-    candidates: list[tuple] = []
-    if scorecards_dir.is_dir():
-        for p in scorecards_dir.glob("*.md"):
-            m = _SEMVER_RE.match(p.stem)
-            if not m:
-                continue  # silently skip non-semver filenames
-            t = (int(m.group(1)), int(m.group(2)), int(m.group(3)))
-            if t < target_tuple:
-                candidates.append(t)
-
-    if not candidates:
-        return None
-
-    best = max(candidates)
-    return f"{best[0]}.{best[1]}.{best[2]}"
-
+def carry_forward(prev_scorecard_path: Path, new_version: str) -> ResultPayload:
+    """Carry forward a prior SCORECARD.md's results to a new patch version.
 
-def carry_forward(prev_path: Path, new_version: str) -> ResultPayload:
-    """Carry forward a prior scorecard's results to a new patch version.
+    Reads the single ``SCORECARD.md`` (the agent's one scorecard file, updated
+    in place per release), copies all results verbatim, and sets
+    ``inherited_from`` to the prior version string recorded in the front matter.
 
-    Reads the prior scorecard, copies all results verbatim, and sets
-    ``inherited_from`` to the prior version string.
+    Only patch bumps are allowed: if the prior scorecard's ``agent.version``
+    differs in major or minor from ``new_version``, the caller must re-run the
+    eval to generate fresh results.
 
     Args:
-        prev_path: Path to the prior version's scorecard ``.md`` file.
+        prev_scorecard_path: Path to the prior ``SCORECARD.md`` file.
         new_version: The new version string (must be a patch bump of the prior).
 
     Returns:
@@ -444,8 +428,18 @@ def carry_forward(prev_path: Path, new_version: str) -> ResultPayload:
         ValueError: If the prior scorecard cannot be parsed.
     """
     _assert_valid_version(new_version)
-    prev_path = Path(prev_path)
-    prev_version = prev_path.stem  # e.g. "0.2.3" from "0.2.3.md"
+    prev_scorecard_path = Path(prev_scorecard_path)
+
+    parsed = parse_scorecard(prev_scorecard_path)
+
+    # Extract prior version from front matter (agent.version)
+    agent = parsed.get("agent", {})
+    prev_version = str(agent.get("version", ""))
+    if not prev_version:
+        raise ValueError(
+            f"Cannot read prior version from {prev_scorecard_path}: "
+            "missing 'agent.version' field in front matter."
+        )
 
     prev_tuple = _semver_tuple(prev_version)
     new_tuple = _semver_tuple(new_version)
@@ -458,10 +452,7 @@ def carry_forward(prev_path: Path, new_version: str) -> ResultPayload:
             f"generate fresh results for this release."
         )
 
-    parsed = parse_scorecard(prev_path)
-
     # Extract fields from the parsed front matter
-    agent = parsed.get("agent", {})
     recipe = parsed.get("recipe", {})
     dataset = recipe.get("dataset", {})
     results = parsed.get("results", {})
@@ -480,6 +471,6 @@ def carry_forward(prev_path: Path, new_version: str) -> ResultPayload:
         test_cases_run=results.get("test_cases_run", 0),
         metrics=metrics_raw,
         aggregate_name=parsed.get("aggregate", {}).get("name", "weighted_accuracy"),
-        generated_at=datetime.datetime.utcnow().isoformat(),
+        generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
         inherited_from=prev_version,
     )
diff --git a/src/gaia/eval/scorecard_gate.py b/src/gaia/eval/scorecard_gate.py
index 49c292561..4328511c4 100644
--- a/src/gaia/eval/scorecard_gate.py
+++ b/src/gaia/eval/scorecard_gate.py
@@ -1,26 +1,36 @@
 # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 """
-Standalone release gate: blocks packaging when the candidate scorecard is missing
-or when its aggregate score strictly regressed below the prior version's.
+Standalone release gate: blocks packaging when the candidate SCORECARD.md is
+missing, invalid, or when its aggregate score strictly regressed below the prior
+version's.
 
 **Distinct from** ``src/gaia/eval/scorecard.py`` — that module aggregates per-run
 scenario PASS/FAIL for internal CI. This gate checks the *outward-facing* release
 artifact produced by ``release_scorecard.py``.
 
+Storage convention: one ``SCORECARD.md`` per agent package (updated in place,
+versioned via the publish snapshot — the same way README.md works).
+
 Usage::
 
+    # Presence-only (first adoption):
+    python -m gaia.eval.scorecard_gate \\
+        --scorecard hub/agents/npm/agent-email/SCORECARD.md
+
+    # With a baseline from a file (unit tests):
     python -m gaia.eval.scorecard_gate \\
-        --scorecards-dir hub/agents/npm/agent-email/scorecards \\
-        --manifest hub/agents/python/email/gaia-agent.yaml
+        --scorecard hub/agents/npm/agent-email/SCORECARD.md \\
+        --baseline-file /tmp/prev-SCORECARD.md
 
+    # With a baseline resolved from a git ref (CI):
     python -m gaia.eval.scorecard_gate \\
-        --scorecards-dir hub/agents/npm/agent-email/scorecards \\
-        --version 0.2.4
+        --scorecard hub/agents/npm/agent-email/SCORECARD.md \\
+        --baseline-ref agent-pkg-email-v0.2.3
 
 Exit codes:
     0 — Passed (presence-only first adoption, equal score, or score improved).
-    1 — Failed (missing/invalid candidate card, strict regression, or prior card invalid).
+    1 — Failed (missing/invalid candidate, strict regression, invalid baseline).
 
 The ``--allow-regression`` flag overrides a regression: prints a ``::warning::``
 GHA annotation and both version/score pairs, then exits 0.
@@ -29,45 +39,69 @@
 from __future__ import annotations
 
 import argparse
+import subprocess
 import sys
 from pathlib import Path
 
-import yaml
-
 from gaia.eval.release_scorecard import (
-    _assert_safe_path,
-    latest_version_below,
     parse_scorecard,
     validate_scorecard,
 )
 
 
-def _read_version_from_manifest(manifest_path: Path) -> str:
-    """Read the ``version:`` field from a ``gaia-agent.yaml`` manifest.
+def _parse_baseline_ref(scorecard_path: Path, ref: str) -> str | None:
+    """Resolve ``<ref>:<scorecard-path>`` via ``git show`` and return the content.
 
-    Args:
-        manifest_path: Path to the YAML manifest file.
+    The path used in the git command is the path of ``scorecard_path`` relative
+    to the repository root (discovered by ``git rev-parse --show-toplevel``).
 
-    Returns:
-        The version string.
+    Returns the file content as a string, or None if the file does not exist at
+    that ref (treated as first adoption — presence-only pass).
 
     Raises:
-        ValueError: If the file cannot be read or ``version:`` is absent.
+        ValueError: If ``git`` cannot be called or the ref is otherwise invalid
+            (the caller treats this as an actionable error, not first adoption).
     """
+    # Discover repo root so we can form a root-relative path for git show.
+    try:
+        result = subprocess.run(
+            ["git", "rev-parse", "--show-toplevel"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+    except (subprocess.CalledProcessError, FileNotFoundError) as exc:
+        raise ValueError(
+            f"Cannot determine git repository root: {exc}. "
+            "Run from inside a git repository, or use --baseline-file instead."
+        ) from exc
+
+    repo_root = Path(result.stdout.strip())
+    scorecard_path = Path(scorecard_path).resolve()
     try:
-        text = manifest_path.read_text(encoding="utf-8")
-    except OSError as exc:
-        raise ValueError(f"Cannot read manifest {manifest_path}: {exc}") from exc
+        rel = scorecard_path.relative_to(repo_root)
+    except ValueError:
+        raise ValueError(
+            f"SCORECARD path {scorecard_path} is not inside the git repo root "
+            f"{repo_root}. Use an absolute path inside the repo, or use "
+            "--baseline-file instead."
+        )
 
+    git_path = rel.as_posix()
     try:
-        data = yaml.safe_load(text) or {}
-    except yaml.YAMLError as exc:
-        raise ValueError(f"Invalid YAML in manifest {manifest_path}: {exc}") from exc
+        result = subprocess.run(
+            ["git", "show", f"{ref}:{git_path}"],
+            capture_output=True,
+            text=True,
+        )
+    except FileNotFoundError as exc:
+        raise ValueError(f"git not found: {exc}") from exc
 
-    version = data.get("version")
-    if not version:
-        raise ValueError(f"Manifest {manifest_path} has no 'version:' field.")
-    return str(version)
+    if result.returncode != 0:
+        # File absent at that ref → first adoption (presence-only pass).
+        return None
+
+    return result.stdout
 
 
 def main(argv=None) -> int:
@@ -81,24 +115,31 @@ def main(argv=None) -> int:
     """
     parser = argparse.ArgumentParser(
         description=(
-            "Release gate: ensures a valid scorecard exists for the candidate version "
-            "and that its aggregate score has not strictly regressed vs the prior version."
+            "Release gate: ensures a valid SCORECARD.md exists and that its "
+            "aggregate score has not strictly regressed vs the prior version."
         ),
         prog="python -m gaia.eval.scorecard_gate",
     )
     parser.add_argument(
-        "--scorecards-dir",
-        required=False,
-        help="Directory containing per-version scorecard .md files.",
+        "--scorecard",
+        required=True,
+        help="Path to the candidate SCORECARD.md (e.g. hub/agents/npm/agent-email/SCORECARD.md).",
     )
-    version_group = parser.add_mutually_exclusive_group()
-    version_group.add_argument(
-        "--version",
-        help="Candidate version string (e.g. 0.2.4).",
+    baseline_group = parser.add_mutually_exclusive_group()
+    baseline_group.add_argument(
+        "--baseline-file",
+        help=(
+            "Path to the prior version's SCORECARD.md for regression comparison "
+            "(for unit tests; no git access needed)."
+        ),
     )
-    version_group.add_argument(
-        "--manifest",
-        help="Path to gaia-agent.yaml; the 'version:' field is used as the candidate version.",
+    baseline_group.add_argument(
+        "--baseline-ref",
+        help=(
+            "Git ref (tag or commit) of the prior release to use as baseline. "
+            "Resolves via 'git show <ref>:<scorecard-path>'. If the file does not "
+            "exist at that ref, a presence-only pass is applied (first adoption)."
+        ),
     )
     parser.add_argument(
         "--allow-regression",
@@ -115,102 +156,99 @@ def main(argv=None) -> int:
     except SystemExit:
         return 1
 
-    # Validate required arguments
-    if not args.scorecards_dir:
-        print(
-            "ERROR: --scorecards-dir is required.\n"
-            "Usage: python -m gaia.eval.scorecard_gate --scorecards-dir DIR "
-            "--version V (or --manifest PATH)"
-        )
-        return 1
-
-    if not args.version and not args.manifest:
-        print(
-            "ERROR: Either --version or --manifest is required.\n"
-            "Usage: python -m gaia.eval.scorecard_gate --scorecards-dir DIR "
-            "--version V (or --manifest PATH)"
-        )
-        return 1
-
-    scorecards_dir = Path(args.scorecards_dir)
-
-    # Resolve the candidate version
-    if args.manifest:
-        try:
-            version = _read_version_from_manifest(Path(args.manifest))
-        except ValueError as exc:
-            print(f"ERROR: {exc}")
-            return 1
-    else:
-        version = args.version
+    candidate_path = Path(args.scorecard)
 
     # --- Step 1: Presence check ---
-    try:
-        candidate_path = _assert_safe_path(scorecards_dir, version)
-    except ValueError as exc:
-        print(f"ERROR: {exc}")
-        return 1
-
     if not candidate_path.exists():
         print(
-            f"ERROR: Scorecard missing for version {version}.\n"
-            f"  Expected: {candidate_path}\n"
+            f"ERROR: SCORECARD.md missing at {candidate_path}.\n"
             f"  Run 'python gen_scorecard.py' (or 'carry_forward') to generate it, "
-            f"then commit the file before releasing."
+            f"then commit the file before releasing.\n"
+            f"  See https://amd-gaia.ai/docs/reference/eval-scorecard and "
+            f".claude/skills/adding-eval-scorecard/SKILL.md"
         )
         return 1
 
     try:
         candidate_parsed = parse_scorecard(candidate_path)
     except ValueError as exc:
-        print(f"ERROR: Cannot parse candidate scorecard {candidate_path}: {exc}")
+        print(f"ERROR: Cannot parse candidate SCORECARD.md at {candidate_path}: {exc}")
         return 1
 
     errors = validate_scorecard(candidate_parsed)
     if errors:
         print(
-            f"ERROR: Candidate scorecard {candidate_path} is invalid:\n"
+            f"ERROR: Candidate SCORECARD.md at {candidate_path} is invalid:\n"
             + "\n".join(f"  - {e}" for e in errors)
         )
         return 1
 
-    # --- Step 2: Locate prior version ---
-    try:
-        prev_version = latest_version_below(scorecards_dir, version)
-    except ValueError as exc:
-        print(f"ERROR: {exc}")
-        return 1
+    # --- Step 2: Resolve baseline ---
+    baseline_text: str | None = None
 
-    if prev_version is None:
+    if args.baseline_file:
+        baseline_path = Path(args.baseline_file)
+        if not baseline_path.exists():
+            print(
+                f"ERROR: --baseline-file not found: {baseline_path}.\n"
+                f"  Provide a valid path to a prior SCORECARD.md, or omit --baseline-file "
+                f"for a presence-only pass."
+            )
+            return 1
+        try:
+            baseline_text = baseline_path.read_text(encoding="utf-8")
+        except OSError as exc:
+            print(f"ERROR: Cannot read --baseline-file {baseline_path}: {exc}")
+            return 1
+
+    elif args.baseline_ref:
+        try:
+            baseline_text = _parse_baseline_ref(candidate_path, args.baseline_ref)
+        except ValueError as exc:
+            print(f"ERROR: {exc}")
+            return 1
+        # None means the file doesn't exist at that ref → first adoption
+        if baseline_text is None:
+            print(
+                f"PASS: No SCORECARD.md found at ref '{args.baseline_ref}'. "
+                f"First adoption — presence check only."
+            )
+            return 0
+
+    if baseline_text is None:
+        # No baseline specified at all → presence-only pass.
+        candidate_version = candidate_parsed.get("agent", {}).get("version", "?")
+        candidate_score = candidate_parsed.get("aggregate", {}).get("value")
+        if candidate_score is None:
+            print(
+                f"ERROR: Candidate SCORECARD.md at {candidate_path} has no "
+                f"'aggregate.value' field.\n"
+                f"  Fix the scorecard front matter before releasing."
+            )
+            return 1
         print(
-            f"PASS: No prior scorecard found for versions below {version}. "
-            f"First adoption — presence check only."
+            f"PASS: No baseline provided. Presence check only.\n"
+            f"  Candidate v{candidate_version}: aggregate.value = {candidate_score}"
         )
         return 0
 
-    # --- Step 3: Parse prior and regression check ---
+    # --- Step 3: Parse baseline and regression check ---
     try:
-        prev_path = _assert_safe_path(scorecards_dir, prev_version)
-    except ValueError as exc:
-        print(f"ERROR: {exc}")
-        return 1
-
-    try:
-        prev_parsed = parse_scorecard(prev_path)
+        prev_parsed = parse_scorecard(baseline_text)
     except ValueError as exc:
         print(
-            f"ERROR: Cannot parse prior scorecard {prev_path}: {exc}\n"
-            f"  The prior scorecard is corrupt or missing a valid front matter. "
-            f"Fix it before releasing {version}."
+            f"ERROR: Cannot parse baseline SCORECARD.md: {exc}\n"
+            f"  The baseline is corrupt or missing a valid front matter. "
+            f"Fix it before releasing."
         )
         return 1
 
     prev_errors = validate_scorecard(prev_parsed)
     if prev_errors:
         print(
-            f"ERROR: Prior scorecard {prev_path} is invalid:\n"
+            f"ERROR: Baseline SCORECARD.md is invalid:\n"
             + "\n".join(f"  - {e}" for e in prev_errors)
-            + f"\n  Fix the prior scorecard before releasing {version}."
+            + f"\n  Fix the baseline scorecard before releasing."
         )
         return 1
 
@@ -219,32 +257,40 @@ def main(argv=None) -> int:
 
     if candidate_score is None:
         print(
-            f"ERROR: Candidate scorecard {candidate_path} has no 'aggregate.value' field."
+            f"ERROR: Candidate SCORECARD.md at {candidate_path} has no "
+            f"'aggregate.value' field.\n"
+            f"  Fix the scorecard front matter before releasing."
         )
         return 1
 
     if prev_score is None:
-        print(f"ERROR: Prior scorecard {prev_path} has no 'aggregate.value' field.")
+        print(
+            f"ERROR: Baseline SCORECARD.md has no 'aggregate.value' field.\n"
+            f"  Fix the baseline scorecard before releasing."
+        )
         return 1
 
+    candidate_version = candidate_parsed.get("agent", {}).get("version", "?")
+    prev_version = prev_parsed.get("agent", {}).get("version", "?")
+
     if float(candidate_score) < float(prev_score):
         # Strict regression detected
         if args.allow_regression:
             print(
                 f"::warning::Scorecard regression allowed by --allow-regression: "
-                f"{prev_version}={prev_score} → {version}={candidate_score}"
+                f"v{prev_version}={prev_score} → v{candidate_version}={candidate_score}"
             )
             print(
                 f"WARNING: Regression override active. "
-                f"Prior version {prev_version} scored {prev_score}; "
-                f"candidate {version} scored {candidate_score}. "
+                f"Prior version v{prev_version} scored {prev_score}; "
+                f"candidate v{candidate_version} scored {candidate_score}. "
                 f"This regression has been explicitly acknowledged."
             )
             return 0
         print(
             f"ERROR: Scorecard regression detected.\n"
-            f"  Prior version {prev_version}: aggregate.value = {prev_score}\n"
-            f"  Candidate {version}: aggregate.value = {candidate_score}\n"
+            f"  Prior version v{prev_version}: aggregate.value = {prev_score}\n"
+            f"  Candidate v{candidate_version}: aggregate.value = {candidate_score}\n"
             f"  The candidate score is strictly lower than the prior. "
             f"Investigate the regression or use --allow-regression to override intentionally."
         )
@@ -252,8 +298,8 @@ def main(argv=None) -> int:
 
     print(
         f"PASS: Scorecard gate passed.\n"
-        f"  Candidate {version}: aggregate.value = {candidate_score} "
-        f"(prior {prev_version}: {prev_score})"
+        f"  Candidate v{candidate_version}: aggregate.value = {candidate_score} "
+        f"(prior v{prev_version}: {prev_score})"
     )
     return 0
 
diff --git a/tests/unit/eval/test_release_scorecard.py b/tests/unit/eval/test_release_scorecard.py
index c542ae13e..d13abdc1e 100644
--- a/tests/unit/eval/test_release_scorecard.py
+++ b/tests/unit/eval/test_release_scorecard.py
@@ -14,7 +14,6 @@
     ResultPayload,
     carry_forward,
     compute_aggregate,
-    latest_version_below,
     parse_scorecard,
     render_scorecard,
     validate_scorecard,
@@ -42,7 +41,7 @@ def _make_payload(version="1.0.0", accuracy=0.5):
         test_cases_run=10,
         metrics=metrics,
         aggregate_name="weighted_accuracy",
-        generated_at=datetime.datetime.utcnow().isoformat(),
+        generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
         inherited_from=None,
     )
 
@@ -202,6 +201,24 @@ def test_parse_recovers_all_required_fields(self):
         errors = validate_scorecard(parsed)
         assert errors == []
 
+    def test_body_contains_reproduction_section(self):
+        payload = _make_payload()
+        text = render_scorecard(payload)
+        assert "## Reproduction" in text
+
+    def test_reproduction_section_includes_custom_command(self):
+        payload = _make_payload()
+        payload.reproduction_command = "gaia eval benchmark --limit 25"
+        text = render_scorecard(payload)
+        assert "gaia eval benchmark --limit 25" in text
+
+    def test_reproduction_section_generic_when_no_command(self):
+        payload = _make_payload()
+        # No reproduction_command (default None)
+        text = render_scorecard(payload)
+        assert "## Reproduction" in text
+        assert "eval-scorecard" in text
+
 
 # ---------------------------------------------------------------------------
 # 4. Two counts distinct as separate fields
@@ -278,14 +295,14 @@ def test_body_non_empty(self):
 
 
 # ---------------------------------------------------------------------------
-# 7. Versioning — patch carry-forward
+# 7. Versioning — patch carry-forward (SCORECARD.md is a single file)
 # ---------------------------------------------------------------------------
 
 
 class TestCarryForwardPatch:
     def test_carry_forward_sets_inherited_from(self, tmp_path):
         src = _make_payload(version="0.2.3", accuracy=0.75)
-        card_path = tmp_path / "0.2.3.md"
+        card_path = tmp_path / "SCORECARD.md"
         card_path.write_text(render_scorecard(src))
 
         result = carry_forward(card_path, "0.2.4")
@@ -293,12 +310,23 @@ def test_carry_forward_sets_inherited_from(self, tmp_path):
 
     def test_carry_forward_copies_metrics_verbatim(self, tmp_path):
         src = _make_payload(version="0.2.3", accuracy=0.75)
-        card_path = tmp_path / "0.2.3.md"
+        card_path = tmp_path / "SCORECARD.md"
         card_path.write_text(render_scorecard(src))
 
         result = carry_forward(card_path, "0.2.4")
         assert result.metrics == src.metrics
 
+    def test_carry_forward_reads_version_from_front_matter(self, tmp_path):
+        # The new carry_forward reads agent.version from front matter, NOT filename.
+        src = _make_payload(version="0.2.3", accuracy=0.75)
+        # Use a different filename to confirm it's not read from stem
+        card_path = tmp_path / "SCORECARD.md"
+        card_path.write_text(render_scorecard(src))
+
+        result = carry_forward(card_path, "0.2.4")
+        assert result.agent_version == "0.2.4"
+        assert result.inherited_from == "0.2.3"
+
 
 # ---------------------------------------------------------------------------
 # 8. Versioning — minor bump refuses
@@ -308,7 +336,7 @@ def test_carry_forward_copies_metrics_verbatim(self, tmp_path):
 class TestCarryForwardMinorBumpRefuses:
     def test_minor_bump_raises_value_error(self, tmp_path):
         src = _make_payload(version="0.2.3", accuracy=0.75)
-        card_path = tmp_path / "0.2.3.md"
+        card_path = tmp_path / "SCORECARD.md"
         card_path.write_text(render_scorecard(src))
 
         with pytest.raises(ValueError, match="re-run"):
@@ -316,7 +344,7 @@ def test_minor_bump_raises_value_error(self, tmp_path):
 
     def test_major_bump_raises_value_error(self, tmp_path):
         src = _make_payload(version="0.2.3", accuracy=0.75)
-        card_path = tmp_path / "0.2.3.md"
+        card_path = tmp_path / "SCORECARD.md"
         card_path.write_text(render_scorecard(src))
 
         with pytest.raises(ValueError, match="re-run"):
@@ -343,42 +371,48 @@ def test_rendered_parsed_inherited_from_null_or_absent(self):
 
 
 # ---------------------------------------------------------------------------
-# 10. latest_version_below
+# 10. Gate integration: second-agent generalization (no fabricated artifacts)
 # ---------------------------------------------------------------------------
 
 
-class TestLatestVersionBelow:
-    def _seed_dir(self, tmp_path):
-        for name in (
-            "0.1.0.md",
-            "0.2.3.md",
-            "0.10.0.md",
-            "README.md",
-            "not-a-version.md",
-        ):
-            (tmp_path / name).write_text("# placeholder")
-        return tmp_path
-
-    def test_returns_closest_below(self, tmp_path):
-        self._seed_dir(tmp_path)
-        result = latest_version_below(tmp_path, "0.2.4")
-        assert result == "0.2.3"
-
-    def test_none_when_nothing_below(self, tmp_path):
-        self._seed_dir(tmp_path)
-        result = latest_version_below(tmp_path, "0.1.0")
-        assert result is None
-
-    def test_integer_comparison_not_string(self, tmp_path):
-        self._seed_dir(tmp_path)
-        result = latest_version_below(tmp_path, "0.10.1")
-        assert result == "0.10.0"
-
-    def test_non_version_files_silently_skipped(self, tmp_path):
-        self._seed_dir(tmp_path)
-        # Should not raise even with README.md and not-a-version.md present
-        result = latest_version_below(tmp_path, "0.2.4")
-        assert result == "0.2.3"
+class TestSecondAgentGeneralization:
+    """Prove the generator + gate work for an agent OTHER than email-triage."""
+
+    def test_second_agent_scorecard_validates_and_gate_passes(self, tmp_path):
+        from gaia.eval.scorecard_gate import main as gate_main
+
+        # Build a ResultPayload for a different agent
+        metrics = [{"name": "accuracy", "value": 0.75, "weight": 1.0}]
+        payload = ResultPayload(
+            agent_name="Hello World Agent",
+            agent_version="0.1.0",
+            dataset_reference="tests/fixtures/hello/ground_truth.json",
+            dataset_description="Hello world evaluation dataset",
+            dataset_size=50,
+            methodology="exact match accuracy",
+            config={"model": "Gemma-4-E4B-it-GGUF", "limit": 20},
+            test_cases_run=20,
+            metrics=metrics,
+            aggregate_name="weighted_accuracy",
+            generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
+            inherited_from=None,
+            reproduction_command="gaia eval agent --category hello",
+        )
+
+        scorecard_path = tmp_path / "SCORECARD.md"
+        from gaia.eval.release_scorecard import write_scorecard
+
+        write_scorecard(payload, scorecard_path)
+
+        # Validate the written scorecard
+        text = scorecard_path.read_text()
+        parsed = parse_scorecard(text)
+        errors = validate_scorecard(parsed)
+        assert errors == [], f"Second-agent scorecard should be valid, got: {errors}"
+
+        # Gate should pass (no baseline → presence-only)
+        result = gate_main(["--scorecard", str(scorecard_path)])
+        assert result == 0, "Gate should pass for a valid second-agent SCORECARD.md"
 
 
 # ---------------------------------------------------------------------------
@@ -500,3 +534,25 @@ def test_all_no_quality_raises(self, tmp_path):
 
         with pytest.raises(ValueError):
             mod.build_payload(benchmark_dir, gt_path)
+
+    def test_build_payload_includes_reproduction_command(self, tmp_path):
+        mod = self._load_gen_scorecard()
+
+        benchmark_dir = tmp_path / "benchmark"
+        benchmark_dir.mkdir()
+        scorecard_dest = benchmark_dir / "email_benchmark_scorecard.json"
+        scorecard_dest.write_text(EMAIL_BENCHMARK_FIXTURE.read_text())
+
+        ground_truth = {
+            "_meta": {"count": 3},
+            "email1": {"label": "spam"},
+            "email2": {"label": "promo"},
+        }
+        gt_path = tmp_path / "ground_truth.json"
+        gt_path.write_text(json.dumps(ground_truth))
+
+        payload = mod.build_payload(benchmark_dir, gt_path, limit=25)
+        assert payload.reproduction_command is not None
+        assert "gaia eval benchmark" in payload.reproduction_command
+        assert "gen_scorecard.py" in payload.reproduction_command
+        assert "PYTHON_KEYRING_BACKEND" in payload.reproduction_command
diff --git a/tests/unit/eval/test_scorecard_gate.py b/tests/unit/eval/test_scorecard_gate.py
index 28ab269d9..32424f97a 100644
--- a/tests/unit/eval/test_scorecard_gate.py
+++ b/tests/unit/eval/test_scorecard_gate.py
@@ -1,6 +1,6 @@
 # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
-"""TDD tests for gaia.eval.scorecard_gate — written before implementation exists."""
+"""TDD tests for gaia.eval.scorecard_gate — new single-file SCORECARD.md interface."""
 
 import datetime
 from pathlib import Path
@@ -33,14 +33,22 @@ def _make_payload(version="1.0.0", accuracy=0.5):
         test_cases_run=10,
         metrics=metrics,
         aggregate_name="weighted_accuracy",
-        generated_at=datetime.datetime.utcnow().isoformat(),
+        generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
         inherited_from=None,
     )
 
 
 def _write_card(directory: Path, version: str, accuracy: float) -> Path:
+    """Write a valid SCORECARD.md to directory/SCORECARD.md."""
+    payload = _make_payload(version=version, accuracy=accuracy)
+    path = directory / "SCORECARD.md"
+    path.write_text(render_scorecard(payload))
+    return path
+
+
+def _write_card_named(path: Path, version: str, accuracy: float) -> Path:
+    """Write a valid SCORECARD.md to an explicit path."""
     payload = _make_payload(version=version, accuracy=accuracy)
-    path = directory / f"{version}.md"
     path.write_text(render_scorecard(payload))
     return path
 
@@ -52,45 +60,77 @@ def _write_card(directory: Path, version: str, accuracy: float) -> Path:
 
 class TestMissingCard:
     def test_missing_card_returns_1(self, tmp_path):
-        result = main(["--scorecards-dir", str(tmp_path), "--version", "1.0.0"])
+        scorecard = tmp_path / "SCORECARD.md"
+        result = main(["--scorecard", str(scorecard)])
         assert result == 1
 
 
 # ---------------------------------------------------------------------------
-# Case (b) — strict regression → exit 1
+# Case (b) — strict regression with --baseline-file → exit 1
 # ---------------------------------------------------------------------------
 
 
 class TestStrictRegression:
     def test_regression_returns_1(self, tmp_path):
-        _write_card(tmp_path, "0.2.3", accuracy=0.8)
-        _write_card(tmp_path, "0.2.4", accuracy=0.5)
-        result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"])
+        baseline_dir = tmp_path / "baseline"
+        baseline_dir.mkdir()
+        baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.8)
+
+        candidate_dir = tmp_path / "candidate"
+        candidate_dir.mkdir()
+        candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5)
+
+        result = main(["--scorecard", str(candidate), "--baseline-file", str(baseline)])
         assert result == 1
 
 
 # ---------------------------------------------------------------------------
-# Case (c) — no prior → exit 0
+# Case (c) — no baseline → presence-only pass → exit 0
 # ---------------------------------------------------------------------------
 
 
 class TestNoPrior:
     def test_first_adoption_returns_0(self, tmp_path):
-        _write_card(tmp_path, "1.0.0", accuracy=0.6)
-        result = main(["--scorecards-dir", str(tmp_path), "--version", "1.0.0"])
+        candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6)
+        result = main(["--scorecard", str(candidate)])
         assert result == 0
 
 
 # ---------------------------------------------------------------------------
-# Case (d) — equal score (carry-forward) → exit 0
+# Case (d) — equal score (carry-forward) with --baseline-file → exit 0
 # ---------------------------------------------------------------------------
 
 
 class TestEqualScore:
     def test_equal_score_returns_0(self, tmp_path):
-        _write_card(tmp_path, "0.2.3", accuracy=0.5)
-        _write_card(tmp_path, "0.2.4", accuracy=0.5)
-        result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"])
+        baseline_dir = tmp_path / "baseline"
+        baseline_dir.mkdir()
+        baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.5)
+
+        candidate_dir = tmp_path / "candidate"
+        candidate_dir.mkdir()
+        candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5)
+
+        result = main(["--scorecard", str(candidate), "--baseline-file", str(baseline)])
+        assert result == 0
+
+
+# ---------------------------------------------------------------------------
+# Case (e) — improved score → exit 0
+# ---------------------------------------------------------------------------
+
+
+class TestImprovedScore:
+    def test_improved_score_returns_0(self, tmp_path):
+        baseline_dir = tmp_path / "baseline"
+        baseline_dir.mkdir()
+        baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.5)
+
+        candidate_dir = tmp_path / "candidate"
+        candidate_dir.mkdir()
+        candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.8)
+
+        result = main(["--scorecard", str(candidate), "--baseline-file", str(baseline)])
         assert result == 0
 
 
@@ -101,28 +141,36 @@ def test_equal_score_returns_0(self, tmp_path):
 
 class TestAllowRegression:
     def test_allow_regression_flag_returns_0(self, tmp_path):
-        _write_card(tmp_path, "0.2.3", accuracy=0.8)
-        _write_card(tmp_path, "0.2.4", accuracy=0.5)
+        baseline_dir = tmp_path / "baseline"
+        baseline_dir.mkdir()
+        baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.8)
+
+        candidate_dir = tmp_path / "candidate"
+        candidate_dir.mkdir()
+        candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5)
+
         result = main(
             [
-                "--scorecards-dir",
-                str(tmp_path),
-                "--version",
-                "0.2.4",
+                "--scorecard", str(candidate),
+                "--baseline-file", str(baseline),
                 "--allow-regression",
             ]
         )
         assert result == 0
 
     def test_allow_regression_prints_warning_line(self, tmp_path, capsys):
-        _write_card(tmp_path, "0.2.3", accuracy=0.8)
-        _write_card(tmp_path, "0.2.4", accuracy=0.5)
+        baseline_dir = tmp_path / "baseline"
+        baseline_dir.mkdir()
+        baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.8)
+
+        candidate_dir = tmp_path / "candidate"
+        candidate_dir.mkdir()
+        candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5)
+
         main(
             [
-                "--scorecards-dir",
-                str(tmp_path),
-                "--version",
-                "0.2.4",
+                "--scorecard", str(candidate),
+                "--baseline-file", str(baseline),
                 "--allow-regression",
             ]
         )
@@ -131,74 +179,71 @@ def test_allow_regression_prints_warning_line(self, tmp_path, capsys):
 
 
 # ---------------------------------------------------------------------------
-# --manifest reads version
+# --baseline-file missing → exit 1
 # ---------------------------------------------------------------------------
 
 
-class TestManifestFlag:
-    def test_manifest_reads_version(self, tmp_path):
-        scorecards_dir = tmp_path / "scorecards"
-        scorecards_dir.mkdir()
-        _write_card(scorecards_dir, "1.2.3", accuracy=0.6)
-
-        manifest_path = tmp_path / "gaia-agent.yaml"
-        manifest_path.write_text("version: 1.2.3\nname: test-agent\n")
-
+class TestBaselineFileMissing:
+    def test_missing_baseline_file_returns_1(self, tmp_path):
+        candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6)
         result = main(
             [
-                "--scorecards-dir",
-                str(scorecards_dir),
-                "--manifest",
-                str(manifest_path),
+                "--scorecard", str(candidate),
+                "--baseline-file", str(tmp_path / "nonexistent-SCORECARD.md"),
             ]
         )
-        assert result == 0
+        assert result == 1
 
-    def test_manifest_with_regression(self, tmp_path):
-        scorecards_dir = tmp_path / "scorecards"
-        scorecards_dir.mkdir()
-        _write_card(scorecards_dir, "1.2.2", accuracy=0.9)
-        _write_card(scorecards_dir, "1.2.3", accuracy=0.3)
 
-        manifest_path = tmp_path / "gaia-agent.yaml"
-        manifest_path.write_text("version: 1.2.3\nname: test-agent\n")
+# ---------------------------------------------------------------------------
+# Invalid candidate (corrupt YAML front matter) → exit 1
+# ---------------------------------------------------------------------------
 
-        result = main(
-            [
-                "--scorecards-dir",
-                str(scorecards_dir),
-                "--manifest",
-                str(manifest_path),
-            ]
-        )
+
+class TestInvalidCandidate:
+    def test_corrupt_candidate_returns_1(self, tmp_path):
+        corrupt_path = tmp_path / "SCORECARD.md"
+        corrupt_path.write_text("this is not valid yaml front matter at all\ngarbage\n")
+        result = main(["--scorecard", str(corrupt_path)])
+        assert result == 1
+
+    def test_empty_candidate_returns_1(self, tmp_path):
+        empty_path = tmp_path / "SCORECARD.md"
+        empty_path.write_text("")
+        result = main(["--scorecard", str(empty_path)])
         assert result == 1
 
 
 # ---------------------------------------------------------------------------
-# Invalid prior → exit 1
+# Invalid baseline → exit 1
 # ---------------------------------------------------------------------------
 
 
 class TestInvalidPrior:
-    def test_corrupt_prior_returns_1(self, tmp_path):
-        # Write corrupt/invalid prior card
-        corrupt_path = tmp_path / "0.2.3.md"
-        corrupt_path.write_text("this is not valid yaml front matter at all\ngarbage\n")
+    def test_corrupt_baseline_returns_1(self, tmp_path):
+        baseline_dir = tmp_path / "baseline"
+        baseline_dir.mkdir()
+        corrupt = baseline_dir / "SCORECARD.md"
+        corrupt.write_text("this is not valid yaml front matter at all\ngarbage\n")
 
-        # Write a valid candidate card
-        _write_card(tmp_path, "0.2.4", accuracy=0.9)
+        candidate_dir = tmp_path / "candidate"
+        candidate_dir.mkdir()
+        candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.9)
 
-        result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"])
+        result = main(["--scorecard", str(candidate), "--baseline-file", str(corrupt)])
         assert result == 1
 
-    def test_empty_prior_returns_1(self, tmp_path):
-        # Prior exists but is empty
-        empty_path = tmp_path / "0.2.3.md"
-        empty_path.write_text("")
+    def test_empty_baseline_returns_1(self, tmp_path):
+        baseline_dir = tmp_path / "baseline"
+        baseline_dir.mkdir()
+        empty = baseline_dir / "SCORECARD.md"
+        empty.write_text("")
 
-        _write_card(tmp_path, "0.2.4", accuracy=0.9)
+        candidate_dir = tmp_path / "candidate"
+        candidate_dir.mkdir()
+        candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.9)
 
-        result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"])
+        result = main(["--scorecard", str(candidate), "--baseline-file", str(empty)])
         assert result == 1
 
 
@@ -238,10 +283,17 @@ def test_publish_job_needs_scorecard_gate(self):
 
 
 class TestCliErrorHandling:
-    def test_missing_scorecards_dir_flag_returns_1(self):
-        result = main(["--version", "1.0.0"])
+    def test_missing_scorecard_flag_returns_1(self):
+        result = main([])
         assert result == 1
 
-    def test_missing_version_and_manifest_returns_1(self, tmp_path):
-        result = main(["--scorecards-dir", str(tmp_path)])
+    def test_baseline_file_and_ref_mutually_exclusive(self, tmp_path):
+        candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6)
+        result = main(
+            [
+                "--scorecard", str(candidate),
+                "--baseline-file", str(candidate),
+                "--baseline-ref", "v1.0.0",
+            ]
+        )
         assert result == 1

From 7e0ea567580e1573b62a3dd947e0ecf370efbee9 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 13:29:54 -0400
Subject: [PATCH 14/18] refactor(eval): replace scorecards/ dirs with single
 SCORECARD.md per agent

- hub/agents/npm/agent-email/SCORECARD.md: generated from relabeled-corpus run
  (placeholder; orchestrator will regenerate from full run)
- hub/agents/npm/agent-email/package.json: files array includes SCORECARD.md,
  removes scorecards/ (don't ship all versions in the npm tarball)
- hub/agents/npm/agent-email/README.md: scorecard link updated to ./SCORECARD.md
- Delete hub/agents/npm/agent-email/scorecards/ (per-version dir, now obsolete)
- Delete hub/agents/python/hello-world/scorecards/ (contained fabricated 90.0 score)
---
 hub/agents/npm/agent-email/README.md          |  2 +-
 .../{scorecards/0.2.4.md => SCORECARD.md}     | 24 ++++++-
 hub/agents/npm/agent-email/package.json       |  4 +-
 .../python/hello-world/scorecards/0.1.0.md    | 62 -------------------
 4 files changed, 26 insertions(+), 66 deletions(-)
 rename hub/agents/npm/agent-email/{scorecards/0.2.4.md => SCORECARD.md} (70%)
 delete mode 100644 hub/agents/python/hello-world/scorecards/0.1.0.md

diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md
index c8c079b2a..92424371b 100644
--- a/hub/agents/npm/agent-email/README.md
+++ b/hub/agents/npm/agent-email/README.md
@@ -2,7 +2,7 @@
 
 [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24**
 
-**Eval scorecard (v0.2.4): aggregate 40.0 / 100** — `category_accuracy` 0.40 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate.
+**Eval scorecard (v0.2.4): aggregate 40.0 / 100** — `category_accuracy` 0.40 over 25 of 220 labeled emails ([`./SCORECARD.md`](./SCORECARD.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate.
 
 Embed the **GAIA email agent** in your JS/TS app. It triages, organizes, replies
 to, and schedules from Gmail and Outlook — with every email body analyzed
diff --git a/hub/agents/npm/agent-email/scorecards/0.2.4.md b/hub/agents/npm/agent-email/SCORECARD.md
similarity index 70%
rename from hub/agents/npm/agent-email/scorecards/0.2.4.md
rename to hub/agents/npm/agent-email/SCORECARD.md
index 7e36786a1..b4f8ae5ca 100644
--- a/hub/agents/npm/agent-email/scorecards/0.2.4.md
+++ b/hub/agents/npm/agent-email/SCORECARD.md
@@ -33,7 +33,7 @@ aggregate:
     value: 0.4
     weight: 1.0
   value: 40.0
-generated_at: '2026-06-26T16:47:13.735478+00:00'
+generated_at: '2026-06-26T17:29:34.631236+00:00'
 inherited_from: null
 ---
 # Email Triage — Eval Scorecard v0.2.4
@@ -66,3 +66,25 @@ round(100 × ((0.4000 × 1.0)) / 1.0, 2) = 40.0
 
 A reader can reproduce this value from the `aggregate.components` in the front
 matter alone — no eval-harness access needed.
+
+## Reproduction
+
+Run the following commands from the repository root:
+
+```sh
+# Step 1: run the benchmark (requires a running Lemonade Server on :13305)
+PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \
+GAIA_AGENT_TOOL_TIMEOUT=120 \
+PYTHONPATH="$(pwd)" \
+gaia eval benchmark --limit 25
+
+# Step 2: generate the scorecard from the benchmark output
+PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \
+PYTHONPATH="$(pwd)" \
+python hub/agents/python/email/packaging/gen_scorecard.py \
+    --benchmark-dir /private/tmp/claude-501/-Users-tomasz-src-amd-gaia--claude-worktrees-sleepy-chatelet-2b818a/314bd25e-fbc0-4ab7-aab0-a8825585e5ef/scratchpad/email-eval-relabeled \
+    --ground-truth tests/fixtures/email/ground_truth.json \
+    --limit 25
+```
+
+See [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) for the full setup guide.
diff --git a/hub/agents/npm/agent-email/package.json b/hub/agents/npm/agent-email/package.json
index 426d163e8..115483bc0 100644
--- a/hub/agents/npm/agent-email/package.json
+++ b/hub/agents/npm/agent-email/package.json
@@ -48,8 +48,8 @@
     "CHANGELOG.md",
     "SPEC.md",
     "SKILL.md",
-    "LICENSE",
-    "scorecards/"
+    "SCORECARD.md",
+    "LICENSE"
   ],
   "engines": {
     "node": ">=18"
diff --git a/hub/agents/python/hello-world/scorecards/0.1.0.md b/hub/agents/python/hello-world/scorecards/0.1.0.md
deleted file mode 100644
index fc6121f2e..000000000
--- a/hub/agents/python/hello-world/scorecards/0.1.0.md
+++ /dev/null
@@ -1,62 +0,0 @@
----
-schema_version: 1
-agent:
-  name: Hello World
-  version: 0.1.0
-recipe:
-  dataset:
-    reference: hub/agents/python/hello-world/tests
-    description: Illustrative conversational response dataset (reference agent)
-    size: 10
-  methodology: Illustrative metric — reference agent for scorecard format generalization
-  config:
-    harness: gaia eval agent
-    model: Gemma-4-E4B-it-GGUF
-    limit: 10
-results:
-  test_cases_run: 10
-  metrics:
-  - name: response_quality
-    value: 0.9
-    weight: 1.0
-aggregate:
-  name: weighted_accuracy
-  formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2)
-  components:
-  - metric: response_quality
-    value: 0.9
-    weight: 1.0
-  value: 90.0
-generated_at: '2026-06-25T12:00:00+00:00'
-inherited_from: null
----
-# Hello World — Eval Scorecard v0.1.0
-
-**Aggregate score: 90.0** (out of 100)
-
-## Recipe
-
-| Field | Value |
-|-------|-------|
-| Dataset | [hub/agents/python/hello-world/tests](hub/agents/python/hello-world/tests) |
-| Description | Illustrative conversational response dataset (reference agent) |
-| Dataset size | 10 labeled examples |
-| Test cases run | 10 |
-| Methodology | Illustrative metric — reference agent for scorecard format generalization |
-
-## Metrics
-
-  - **response_quality**: 0.9000 × 1.0
-
-## Aggregate score recomputation
-
-Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)`
-
-Worked example:
-
-```
-round(100 × ((0.9000 × 1.0)) / 1.0, 2) = 90.0
-```
-
-A reader can reproduce this value from the `aggregate.components` in the front
-matter alone — no eval-harness access needed.

From 704ea088b298d8adda7976a1c6ae2f3a52a2c735 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 13:31:23 -0400
Subject: [PATCH 15/18] refactor(eval): update hub worker, workflows, and
 publish for SCORECARD.md

- storage.ts: evalScorecardKey now returns SCORECARD.md (was eval-scorecard.md)
- publish.ts: update comment for SCORECARD.md
- routes.test.ts: expect eval_scorecard_url to end in /SCORECARD.md
- publish_to_r2.py: update --eval-scorecard help text to reference SCORECARD.md
- release_agent_email.yml: scorecard-gate uses new --scorecard / --baseline-ref
  interface; computes prev tag via git describe; publish step points at SCORECARD.md
- email_scorecard_refresh.yml: use SCORECARD.md env var throughout; same-version
  check and cross-version gate use new gate interface with --baseline-ref
---
 .github/workflows/email_scorecard_refresh.yml | 60 +++++++++++--------
 .github/workflows/release_agent_email.yml     | 35 +++++++++--
 .../python/email/packaging/publish_to_r2.py   |  2 +-
 workers/agent-hub/src/publish.ts              |  2 +-
 workers/agent-hub/src/storage.ts              |  2 +-
 workers/agent-hub/test/routes.test.ts         |  2 +-
 6 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/email_scorecard_refresh.yml b/.github/workflows/email_scorecard_refresh.yml
index 7b3b02b5f..9a8f0a849 100644
--- a/.github/workflows/email_scorecard_refresh.yml
+++ b/.github/workflows/email_scorecard_refresh.yml
@@ -6,7 +6,7 @@
 # Answers "how does a PR that changes the agent keep the scorecard honest?":
 # when the email agent's LLM-affecting code (or the eval corpus) changes, this
 # re-runs the REAL eval, regenerates the scorecard, and then:
-#   - score IMPROVED or held  -> commits the refreshed scorecard to the branch
+#   - score IMPROVED or held  -> commits the refreshed SCORECARD.md to the branch
 #   - score REGRESSED          -> fails the job (the worse card is NOT committed)
 #
 # `gaia eval benchmark` needs Lemonade on AMD hardware, so this runs ONLY on the
@@ -15,9 +15,10 @@
 # hosted-CI backstop (it parses committed files only, no eval).
 #
 # Two regression checks run here:
-#   1. SAME-VERSION: fresh aggregate vs the currently-committed card for this
-#      version — stops a noisy/worse re-run from silently overwriting a good score.
-#   2. CROSS-VERSION: `gaia.eval.scorecard_gate` — fresh card vs the prior version.
+#   1. SAME-VERSION: fresh aggregate vs the currently-committed SCORECARD.md —
+#      stops a noisy/worse re-run from silently overwriting a good score.
+#   2. CROSS-VERSION (best-effort): fresh SCORECARD.md vs the prior version tag
+#      via --baseline-ref.
 #
 # Auto-commit needs `contents: write` and only works on the repo's own branches;
 # a fork PR's GITHUB_TOKEN is read-only — for forks, run the eval locally / on AMD
@@ -55,7 +56,7 @@ permissions:
   contents: write   # auto-commit the refreshed scorecard to the branch
 
 env:
-  SCORECARD_DIR: hub/agents/npm/agent-email/scorecards
+  SCORECARD: hub/agents/npm/agent-email/SCORECARD.md
   MANIFEST: hub/agents/python/email/gaia-agent.yaml
   LIMIT: ${{ github.event.inputs.limit || '25' }}
   MODEL: ${{ github.event.inputs.model || 'Gemma-4-E4B-it-GGUF' }}
@@ -70,6 +71,7 @@ jobs:
         uses: actions/checkout@v6
         with:
           ref: ${{ github.head_ref || github.ref_name }}
+          fetch-depth: 0   # full history for git describe (cross-version baseline)
 
       - name: Set up Python
         uses: actions/setup-python@v6
@@ -90,16 +92,20 @@ jobs:
           set -euo pipefail
           VERSION=$(python -c "import yaml; print(yaml.safe_load(open('${MANIFEST}'))['version'])")
           echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
-          CARD="${SCORECARD_DIR}/${VERSION}.md"
-          # Aggregate of the card as committed on this branch (empty if new).
-          if git cat-file -e "HEAD:${CARD}" 2>/dev/null; then
-            git show "HEAD:${CARD}" > /tmp/committed_card.md
-            COMMITTED=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('/tmp/committed_card.md'))['aggregate']['value'])")
+          # Aggregate of the SCORECARD.md as committed on this branch (empty if new).
+          if git cat-file -e "HEAD:${SCORECARD}" 2>/dev/null; then
+            git show "HEAD:${SCORECARD}" > /tmp/committed_scorecard.md
+            COMMITTED=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('/tmp/committed_scorecard.md'))['aggregate']['value'])")
           else
             COMMITTED=""
           fi
           echo "committed=${COMMITTED}" >> "$GITHUB_OUTPUT"
-          echo "Version ${VERSION}; committed aggregate: ${COMMITTED:-<none>}"
+          # Resolve the previous release tag for the cross-version check.
+          PREV="$(git describe --tags --abbrev=0 \
+            --match 'agent-pkg-email-*' \
+            "HEAD^" 2>/dev/null || true)"
+          echo "prev_tag=${PREV}" >> "$GITHUB_OUTPUT"
+          echo "Version ${VERSION}; committed aggregate: ${COMMITTED:-<none>}; prev tag: ${PREV:-<none>}"
 
       - name: Run the email-triage benchmark (real eval)
         env:
@@ -120,7 +126,7 @@ jobs:
             --limit "${LIMIT}" \
             --output-dir eval-out
 
-      - name: Regenerate the scorecard from the real run
+      - name: Regenerate SCORECARD.md from the real run
         run: |
           set -euo pipefail
           python hub/agents/python/email/packaging/gen_scorecard.py \
@@ -129,34 +135,38 @@ jobs:
       - name: Same-version regression check (reject a worse re-run)
         run: |
           set -euo pipefail
-          VERSION="${{ steps.pre.outputs.version }}"
           COMMITTED="${{ steps.pre.outputs.committed }}"
-          CARD="${SCORECARD_DIR}/${VERSION}.md"
-          FRESH=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('${CARD}'))['aggregate']['value'])")
+          FRESH=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('${SCORECARD}'))['aggregate']['value'])")
           echo "fresh aggregate: ${FRESH} | committed: ${COMMITTED:-<none>}"
           if [ -n "${COMMITTED}" ] && python -c "import sys; sys.exit(0 if float('${FRESH}') < float('${COMMITTED}') else 1)"; then
-            echo "::error::Scorecard regression for v${VERSION}: re-run scored ${FRESH} < committed ${COMMITTED}. Not committing the worse card. Investigate, or override intentionally via --allow-regression in a manual commit."
-            git checkout -- "${CARD}" || true
+            echo "::error::Scorecard regression: re-run scored ${FRESH} < committed ${COMMITTED}. Not committing the worse card. Investigate, or override intentionally via --allow-regression in a manual commit."
+            git checkout -- "${SCORECARD}" || true
             exit 1
           fi
           echo "No same-version regression — fresh score is >= committed."
 
-      - name: Cross-version gate (fresh card vs prior version)
+      - name: Cross-version gate (fresh SCORECARD.md vs prior version tag, best-effort)
         run: |
           set -euo pipefail
-          python -m gaia.eval.scorecard_gate \
-            --scorecards-dir "${SCORECARD_DIR}" \
-            --manifest "${MANIFEST}"
+          PREV="${{ steps.pre.outputs.prev_tag }}"
+          if [ -n "${PREV}" ]; then
+            python -m gaia.eval.scorecard_gate \
+              --scorecard "${SCORECARD}" \
+              --baseline-ref "${PREV}"
+          else
+            python -m gaia.eval.scorecard_gate \
+              --scorecard "${SCORECARD}"
+          fi
 
-      - name: Commit the refreshed scorecard (only if it changed for the better/equal)
+      - name: Commit the refreshed SCORECARD.md (only if it changed for the better/equal)
         run: |
           set -euo pipefail
-          if git diff --quiet -- "${SCORECARD_DIR}"; then
-            echo "Scorecard unchanged — nothing to commit."
+          if git diff --quiet -- "${SCORECARD}"; then
+            echo "SCORECARD.md unchanged — nothing to commit."
             exit 0
           fi
           git config user.name  "${{ github.actor }}"
           git config user.email "${{ github.actor }}@users.noreply.github.com"
-          git add "${SCORECARD_DIR}"
+          git add "${SCORECARD}"
           git commit -m "eval(email): refresh v${{ steps.pre.outputs.version }} scorecard from benchmark run"
           git push origin "HEAD:${{ github.head_ref || github.ref_name }}"
diff --git a/.github/workflows/release_agent_email.yml b/.github/workflows/release_agent_email.yml
index 1c624f2f2..3bc05451b 100644
--- a/.github/workflows/release_agent_email.yml
+++ b/.github/workflows/release_agent_email.yml
@@ -272,16 +272,43 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0   # full history so git describe can find previous tags
       - uses: actions/setup-python@v6
         with:
           python-version: "3.12"
       - name: Install core + PyYAML
         run: pip install -e . pyyaml
+      - name: Resolve previous release tag (best-effort baseline)
+        id: prev_tag
+        shell: bash
+        run: |
+          set -uo pipefail
+          # Find the most recent agent-pkg-email-* tag strictly before the
+          # current ref. On workflow_dispatch the current ref is a branch, not
+          # a tag, so we look for the latest tag of the right pattern overall.
+          PREV="$(git describe --tags --abbrev=0 \
+            --match 'agent-pkg-email-*' \
+            "${GITHUB_REF_NAME}^" 2>/dev/null || true)"
+          echo "prev_tag=${PREV}" >> "$GITHUB_OUTPUT"
+          if [ -n "${PREV}" ]; then
+            echo "Baseline tag: ${PREV}"
+          else
+            echo "No previous release tag found — presence-only check."
+          fi
       - name: Run scorecard gate
+        shell: bash
         run: |
-          python -m gaia.eval.scorecard_gate \
-            --scorecards-dir hub/agents/npm/agent-email/scorecards \
-            --manifest hub/agents/python/email/gaia-agent.yaml
+          set -euo pipefail
+          PREV="${{ steps.prev_tag.outputs.prev_tag }}"
+          if [ -n "${PREV}" ]; then
+            python -m gaia.eval.scorecard_gate \
+              --scorecard hub/agents/npm/agent-email/SCORECARD.md \
+              --baseline-ref "${PREV}"
+          else
+            python -m gaia.eval.scorecard_gate \
+              --scorecard hub/agents/npm/agent-email/SCORECARD.md
+          fi
 
   # ── Stage 2: publish to the hub + npm (single atomic step) ─────────
   publish:
@@ -477,7 +504,7 @@ jobs:
           done
           VER="${{ steps.ver.outputs.version }}"
           scorecard_args=()
-          SCORECARD="hub/agents/npm/agent-email/scorecards/${VER}.md"
+          SCORECARD="hub/agents/npm/agent-email/SCORECARD.md"
           if [ -f "${SCORECARD}" ]; then
             scorecard_args+=(--eval-scorecard "${SCORECARD}")
           fi
diff --git a/hub/agents/python/email/packaging/publish_to_r2.py b/hub/agents/python/email/packaging/publish_to_r2.py
index 5884cb976..9cca41e3d 100644
--- a/hub/agents/python/email/packaging/publish_to_r2.py
+++ b/hub/agents/python/email/packaging/publish_to_r2.py
@@ -279,7 +279,7 @@ def main(argv=None) -> int:
     parser.add_argument(
         "--eval-scorecard",
         type=Path,
-        help="Path to the eval scorecard markdown (e.g. scorecards/0.2.4.md) to "
+        help="Path to the eval scorecard markdown (e.g. SCORECARD.md) to "
         "publish as the agent's catalog eval score and scorecard URL "
         "(POSTed as the multipart 'eval_scorecard' part the Worker accepts). "
         "Absent = publish without an eval scorecard.",
diff --git a/workers/agent-hub/src/publish.ts b/workers/agent-hub/src/publish.ts
index 626b8c65e..c869ea623 100644
--- a/workers/agent-hub/src/publish.ts
+++ b/workers/agent-hub/src/publish.ts
@@ -176,7 +176,7 @@ export async function handlePublish(
   const skillText = await optionalMarkdownPart(form, "skill", "SKILL.md");
   // Optional eval scorecard markdown (the agent's benchmark results, rendered on
   // the hub listing as an aggregate score + link). Per-version, first-POST semantics.
-  const evalScorecardText = await optionalMarkdownPart(form, "eval_scorecard", "eval-scorecard.md");
+  const evalScorecardText = await optionalMarkdownPart(form, "eval_scorecard", "SCORECARD.md");
   // Optional whole-package file listing (the zip's contents, for the hub's file
   // list). The zip itself rides in as a normal `artifact`; this is just the
   // manifest of what's inside it.
diff --git a/workers/agent-hub/src/storage.ts b/workers/agent-hub/src/storage.ts
index 366e3fe84..3a26647a4 100644
--- a/workers/agent-hub/src/storage.ts
+++ b/workers/agent-hub/src/storage.ts
@@ -53,7 +53,7 @@ export function skillKey(id: string, version: string): string {
 }
 
 export function evalScorecardKey(id: string, version: string): string {
-  return `${versionDir(id, version)}eval-scorecard.md`;
+  return `${versionDir(id, version)}SCORECARD.md`;
 }
 
 export function packageFilesKey(id: string, version: string): string {
diff --git a/workers/agent-hub/test/routes.test.ts b/workers/agent-hub/test/routes.test.ts
index 29505b207..cb00f5abf 100644
--- a/workers/agent-hub/test/routes.test.ts
+++ b/workers/agent-hub/test/routes.test.ts
@@ -118,7 +118,7 @@ describe("eval scorecard in catalog", () => {
     const body = (await res.json()) as any;
     const entry = body.agents[0];
     expect(entry.eval_score).toBe(87.5);
-    expect(entry.eval_scorecard_url).toMatch(/\/agents\/chat\/0\.1\.0\/eval-scorecard\.md$/);
+    expect(entry.eval_scorecard_url).toMatch(/\/agents\/chat\/0\.1\.0\/SCORECARD\.md$/);
   });
 
   it("omits eval_score and eval_scorecard_url when no scorecard is published", async () => {

From 40107bff90c75d9b2599abedadc3ea94465b201e Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 13:33:16 -0400
Subject: [PATCH 16/18] docs(eval): update scorecard docs and skill for single
 SCORECARD.md convention

- eval-scorecard.mdx: storage convention is now a single SCORECARD.md (not
  scorecards/<ver>.md); gate uses --scorecard + --baseline-ref/--baseline-file;
  carry_forward reads version from front matter; Reproduction section documented;
  npm files include SCORECARD.md only (not scorecards/ dir)
- SKILL.md: doc-root/SCORECARD.md as single file; reproduction_command in adapter;
  gate CLI updated to --scorecard / --baseline-ref pattern; Phase 4 examples updated
---
 .claude/skills/adding-eval-scorecard/SKILL.md |  37 +++--
 docs/reference/eval-scorecard.mdx             | 128 ++++++++++--------
 2 files changed, 94 insertions(+), 71 deletions(-)

diff --git a/.claude/skills/adding-eval-scorecard/SKILL.md b/.claude/skills/adding-eval-scorecard/SKILL.md
index 0afaa057f..97123244d 100644
--- a/.claude/skills/adding-eval-scorecard/SKILL.md
+++ b/.claude/skills/adding-eval-scorecard/SKILL.md
@@ -8,7 +8,7 @@ description: "Adopt the per-agent eval scorecard for a GAIA hub agent: write the
 Adopt the release **eval scorecard** ([`docs/reference/eval-scorecard.mdx`](../../../docs/reference/eval-scorecard.mdx)) for one hub agent. The system is `harness → result payload → generator → scorecard`, with a standalone presence+regression release gate. The **email agent is the reference implementation** — mirror it.
 
 **Core modules (do not modify; reuse):**
-- `src/gaia/eval/release_scorecard.py` — `ResultPayload`, `compute_aggregate`, `render_scorecard`, `write_scorecard`, `validate_scorecard`, `carry_forward`, `latest_version_below`. Harness-agnostic (stdlib + PyYAML only).
+- `src/gaia/eval/release_scorecard.py` — `ResultPayload`, `compute_aggregate`, `render_scorecard`, `write_scorecard`, `validate_scorecard`, `carry_forward`. Harness-agnostic (stdlib + PyYAML only).
 - `src/gaia/eval/scorecard_gate.py` — the standalone gate (`python -m gaia.eval.scorecard_gate`).
 - Reference adapter: `hub/agents/python/email/packaging/gen_scorecard.py`.
 
@@ -18,7 +18,7 @@ This is a **phased checklist with a hard gate at the real-eval step** — the sc
 
 1. **Version source of truth** = the `version:` field in `<agent>/gaia-agent.yaml`. Never invent a parallel scheme.
 2. **Canonical README** (where the scorecard is linked + surfaced): for an npm-published agent it is the npm client README (e.g. `hub/agents/npm/<id>/README.md`), NOT a `packaging/README.md`. For a Python-only agent it is `hub/agents/python/<id>/README.md`. Confirm which by checking what `release_agent_<id>.yml` publishes (`README:` env) — the published README is the one to link.
-3. **doc-root** = the directory holding that canonical README. Scorecards live at `<doc-root>/scorecards/<version>.md`.
+3. **doc-root** = the directory holding that canonical README. The scorecard lives at `<doc-root>/SCORECARD.md` — a **single file updated in place**, versioned via the publish snapshot (same as README.md). **There is no `scorecards/` directory.**
 4. **Eval vehicle**: what existing harness produces this agent's accuracy metric? (email → `gaia eval benchmark` over `tests/fixtures/email/`.) If none exists, STOP and surface that — propose the minimal harness before building; do not invent numbers.
 
 ## Phase 2 — Write the adapter (harness → payload)
@@ -26,11 +26,12 @@ This is a **phased checklist with a hard gate at the real-eval step** — the sc
 Copy `hub/agents/python/email/packaging/gen_scorecard.py` as the template. The adapter:
 - imports ONLY `gaia.eval.release_scorecard` (never the harness or agent package — preserve loose coupling);
 - reads the harness output, builds a `ResultPayload`;
+- populates `reproduction_command` with the **exact shell commands** to reproduce this scorecard, including all required env vars (`PYTHON_KEYRING_BACKEND`, `GAIA_AGENT_TOOL_TIMEOUT`, `PYTHONPATH`);
 - defines **"judged"** explicitly and **raises loudly** if zero results are judged (no silent 0.0);
 - records **dataset size** (total labeled examples) and **test_cases_run** (subset executed) as DISTINCT fields;
 - stores **repo-relative** paths only (never a local absolute path — it ships in a published artifact);
 - records the eval `limit`/config so future regression checks are comparable;
-- writes to `<doc-root>/scorecards/<version>.md`.
+- writes to `<doc-root>/SCORECARD.md` (the single file; `--output-dir` overrides to a directory, but the filename is always `SCORECARD.md`).
 
 Add an offline unit test against a committed sample harness-output fixture (see `tests/fixtures/eval/email_benchmark_scorecard.json` + `tests/unit/eval/test_release_scorecard.py::TestEmailAdapter`) so the adapter is testable without a live model.
 
@@ -50,8 +51,10 @@ PYTHONPATH="$(pwd)" \
     --ground-truth tests/fixtures/email/ground_truth.json \
     --limit 25 --output-dir <persistent-dir>
 
+PYTHONPATH="$(pwd)" \
 <venv>/bin/python hub/agents/python/email/packaging/gen_scorecard.py \
     --benchmark-dir <persistent-dir> --limit 25
+# → writes hub/agents/npm/agent-email/SCORECARD.md in place
 ```
 
 **Headless gotchas (see memory `project-email-benchmark-headless-gotchas`):**
@@ -63,23 +66,33 @@ PYTHONPATH="$(pwd)" \
 
 ## Phase 4 — Surface, link, and gate
 
-1. **Link + surface** from the canonical README: a one-line `Eval scorecard (vX.Y.Z): aggregate N/100 … ([./scorecards/X.Y.Z.md](./scorecards/X.Y.Z.md))`. The relative link must resolve in-repo.
-2. **npm `files`**: if the agent publishes on npm, add `scorecards/` to `package.json` `files` so the link resolves on the published package too.
-3. **Hub display**: a published scorecard surfaces on the agent's hub page / Agent UI detail view (see `workers/agent-hub` + `AgentDetailModal.tsx`); ensure the publish step uploads the scorecard alongside the README.
+1. **Link + surface** from the canonical README: a one-line `Eval scorecard (vX.Y.Z): aggregate N/100 … ([./SCORECARD.md](./SCORECARD.md))`. The relative link must resolve in-repo.
+2. **npm `files`**: if the agent publishes on npm, add `SCORECARD.md` to `package.json` `files`. **Do not** add a `scorecards/` directory — only the single current file ships.
+3. **Hub display**: a published scorecard surfaces on the agent's hub page / Agent UI detail view (see `workers/agent-hub` + `AgentDetailModal.tsx`); ensure the publish step passes `--eval-scorecard <doc-root>/SCORECARD.md` to `publish_to_r2.py`.
 4. **Release gate**: add a `scorecard-gate` job to `release_agent_<id>.yml` and list it in `publish.needs`. The job runs on a GitHub-hosted runner (it only parses committed files — no eval):
    ```bash
+   # Presence-only (no previous tag yet):
    python -m gaia.eval.scorecard_gate \
-     --scorecards-dir <doc-root>/scorecards \
-     --manifest hub/agents/python/<id>/gaia-agent.yaml
+     --scorecard <doc-root>/SCORECARD.md
+
+   # With best-effort previous-release baseline (recommended for CI):
+   PREV="$(git describe --tags --abbrev=0 --match 'agent-pkg-<id>-*' "${GITHUB_REF_NAME}^" 2>/dev/null || true)"
+   if [ -n "$PREV" ]; then
+     python -m gaia.eval.scorecard_gate \
+       --scorecard <doc-root>/SCORECARD.md --baseline-ref "$PREV"
+   else
+     python -m gaia.eval.scorecard_gate \
+       --scorecard <doc-root>/SCORECARD.md
+   fi
    ```
-   The job must NOT have `continue-on-error`, an `environment:`, or a `permissions:` override (inherits `contents: read`; needs no secrets).
-5. **Auto-update/reject loop**: for re-running on agent changes and refreshing the committed scorecard, see `eval-scorecard.mdx` "Keeping the scorecard current" and the self-hosted refresh workflow — reject-on-worse is the gate; better-or-equal refreshes the committed card.
+   The job must NOT have `continue-on-error`, an `environment:`, or a `permissions:` override (inherits `contents: read`; needs no secrets). Fetch full history (`fetch-depth: 0`) so `git describe` resolves.
+5. **Auto-update/reject loop**: for re-running on agent changes and refreshing the committed scorecard, see `eval-scorecard.mdx` "Keeping the scorecard current" and the self-hosted refresh workflow — reject-on-worse is the gate; better-or-equal refreshes the committed `SCORECARD.md`.
 
 ## Phase 5 — Verify (evidence before "done")
 
-Run and capture: the generated `<version>.md`; the gate **passing** on it (exit 0); the gate **blocking** a manufactured regression (exit 1) and a missing card (exit 1); a by-hand recompute of the aggregate from `aggregate.components` matching the recorded value. Run `python util/lint.py --all` and the eval unit tests. These are the PR's real-world proof.
+Run and capture: the generated `SCORECARD.md`; the gate **passing** on it (exit 0); the gate **blocking** a manufactured regression (exit 1, via `--baseline-file` with a higher-scoring card) and a missing card (exit 1); a by-hand recompute of the aggregate from `aggregate.components` matching the recorded value. Run `python util/lint.py --all` and the eval unit tests. These are the PR's real-world proof.
 
 ## Versioning
 
-- **Patch** release → `carry_forward(prev_path, new_version)` (copies results verbatim, sets `inherited_from`); do NOT re-run the eval.
+- **Patch** release → `carry_forward(prev_scorecard_path, new_version)` reads the version from the front matter of the current `SCORECARD.md` (not from the filename) and copies results verbatim, sets `inherited_from`; do NOT re-run the eval.
 - **Minor/major** release → re-run the eval (Phase 3); `carry_forward` refuses a non-patch bump with a "re-run" error.
diff --git a/docs/reference/eval-scorecard.mdx b/docs/reference/eval-scorecard.mdx
index b00d9d00e..20151a45e 100644
--- a/docs/reference/eval-scorecard.mdx
+++ b/docs/reference/eval-scorecard.mdx
@@ -1,6 +1,6 @@
 ---
 title: "Release Eval Scorecard"
-description: "Per-agent, per-version eval scorecard: schema, storage convention, aggregate formula, versioning policy, and release gate."
+description: "Per-agent eval scorecard: schema, storage convention, aggregate formula, versioning policy, reproduction, and release gate."
 icon: "chart-bar"
 ---
 
@@ -14,17 +14,18 @@ icon: "chart-bar"
 
 ## Overview
 
-Each published hub agent ships a **release scorecard** — a versioned Markdown file that records:
+Each published hub agent ships a **release scorecard** — a single `SCORECARD.md` file (updated in place per release, versioned via the publish snapshot, the same way `README.md` works) that records:
 
 - The **eval recipe**: dataset reference, methodology, configuration, and metric definitions.
 - The **measured results**: per-metric values, number of test cases actually run, and dataset size.
 - A single **named aggregate score**: a deterministic, recomputable percentage so a reviewer can verify the number without re-running the eval.
+- A **Reproduction section**: the exact commands to reproduce the result from scratch.
 
 Scorecards are committed alongside the agent's README and linked from it. A standalone **release gate** (`scorecard_gate.py`) blocks packaging when the scorecard is missing or when its aggregate score strictly regresses below the prior version's.
 
 ## File format
 
-Scorecard files are **Markdown with YAML front matter** (`.md`). The front matter holds all machine-readable fields; the body is a human-readable summary with a worked recomputation.
+Scorecard files are **Markdown with YAML front matter** (`.md`). The front matter holds all machine-readable fields; the body is a human-readable summary with a worked recomputation and a Reproduction section.
 
 ```
 ---
@@ -35,7 +36,7 @@ agent:
 recipe:
   dataset:
     reference: tests/fixtures/email/ground_truth.json
-    description: Synthetic email corpus (FakeGmailBackend, 4-category priority labels)
+    description: Synthetic email corpus (FakeGmailBackend, schema-2.0 triage taxonomy)
     size: 220
   methodology: gaia eval benchmark — category_accuracy (case-insensitive exact match)
   config:
@@ -43,26 +44,31 @@ recipe:
     model: Gemma-4-E4B-it-GGUF
     limit: 25
 results:
-  test_cases_run: 24
+  test_cases_run: 25
   metrics:
     - name: category_accuracy
-      value: 0.4584
+      value: 0.40
       weight: 1.0
 aggregate:
   name: weighted_accuracy
   formula: "round(100 * sum(weight_i * value_i) / sum(weight_i), 2)"
   components:
     - metric: category_accuracy
-      value: 0.4584
+      value: 0.40
       weight: 1.0
-  value: 45.84
-generated_at: "2026-06-25T10:00:00+00:00"
+  value: 40.0
+generated_at: "2026-06-26T16:47:13+00:00"
 inherited_from: null
 ---
 
 # Email Triage — Eval Scorecard v0.2.4
 
-**Aggregate score: 45.84** (out of 100)
+**Aggregate score: 40.0** (out of 100)
+...
+
+## Reproduction
+
+Run the following commands from the repository root:
 ...
 ```
 
@@ -111,7 +117,7 @@ where each `valueᵢ` is a metric value in [0, 1] and each `weightᵢ` defaults
 The result is a **percentage in [0, 100]**. For a single metric with weight 1.0:
 
 ```
-round(100 × 0.4584, 2) = 45.84
+round(100 × 0.40, 2) = 40.0
 ```
 
 A reader can reproduce this value from `aggregate.components` alone — no eval-harness access needed.
@@ -119,15 +125,15 @@ The `aggregate.formula` field in the front matter states the formula in human-re
 
 ## Storage convention
 
-Scorecards live in a `scorecards/` subdirectory beside the agent's canonical README:
+Each agent package ships a **single `SCORECARD.md`** file, updated in place per release — the same way `README.md` works. Per-version uniqueness comes from the publish snapshot (R2 stores the file at `agents/<id>/<version>/SCORECARD.md`; the npm package ships only the current version's `SCORECARD.md`).
 
 ```
 <doc-root>/
-  README.md              ← canonical README (links to scorecard)
-  scorecards/
-    0.1.0.md
-    0.2.3.md
-    0.2.4.md             ← latest
+  README.md              ← canonical README (links to SCORECARD.md)
+  SCORECARD.md           ← current version's scorecard, updated in place
+  SPEC.md
+  SKILL.md
+  CHANGELOG.md
 ```
 
 The `doc-root` is the location of the agent's canonical README:
@@ -135,35 +141,39 @@ The `doc-root` is the location of the agent's canonical README:
 | Agent | doc-root |
 |-------|----------|
 | Email Triage (`@amd-gaia/agent-email`) | `hub/agents/npm/agent-email/` |
-| Hello World | `hub/agents/python/hello-world/` |
 
-The relative link `./scorecards/<version>.md` resolves both in-repo and when the directory is published as an npm package.
+The relative link `./SCORECARD.md` resolves both in-repo and when the directory is published as an npm package. The npm `files` array includes `SCORECARD.md` (not a `scorecards/` directory).
 
 ## Versioning policy
 
 ### Patch releases — carry forward
 
-For a **patch release** (same `major.minor`, `patch` incremented), the prior version's results are carried forward verbatim using `carry_forward()`:
+For a **patch release** (same `major.minor`, `patch` incremented), the prior version's results are carried forward verbatim using `carry_forward()`. Pass the path to the agent's current `SCORECARD.md`:
 
 ```python
 from gaia.eval.release_scorecard import carry_forward, write_scorecard
 from pathlib import Path
 
 new_payload = carry_forward(
-    prev_path=Path("scorecards/0.2.3.md"),
-    new_version="0.2.4",
+    prev_scorecard_path=Path("hub/agents/npm/agent-email/SCORECARD.md"),
+    new_version="0.2.5",
 )
-# new_payload.inherited_from == "0.2.3"
-write_scorecard(new_payload, Path("scorecards/0.2.4.md"))
+# new_payload.inherited_from == "0.2.4"  (read from front matter, not filename)
+write_scorecard(new_payload, Path("hub/agents/npm/agent-email/SCORECARD.md"))
 ```
 
-The resulting scorecard has `inherited_from: "0.2.3"` and identical `results` and `aggregate` fields. The aggregate score is unchanged, so the release gate's equal-score case passes.
+The resulting scorecard has `inherited_from: "0.2.4"` and identical `results` and `aggregate` fields. The aggregate score is unchanged, so the release gate's equal-score case passes.
+
+`carry_forward()` reads the prior version from the `agent.version` field in the front matter — **not** from the filename.
 
 ### Minor / major releases — re-run required
 
 For a **minor or major bump**, `carry_forward()` raises `ValueError` with a "re-run" message. Run the eval fresh and generate a new scorecard:
 
 ```bash
+PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \
+GAIA_AGENT_TOOL_TIMEOUT=120 \
+PYTHONPATH="$(pwd)" \
 gaia eval benchmark \
   --model Gemma-4-E4B-it-GGUF \
   --mbox-path tests/fixtures/email/synthetic_inbox.mbox \
@@ -171,33 +181,44 @@ gaia eval benchmark \
   --limit 25 \
   --output-dir /tmp/email-eval
 
+PYTHONPATH="$(pwd)" \
 python hub/agents/python/email/packaging/gen_scorecard.py \
-  --benchmark-dir /tmp/email-eval
+  --benchmark-dir /tmp/email-eval \
+  --limit 25
 ```
 
+This writes `hub/agents/npm/agent-email/SCORECARD.md` in place.
+
 ## Release gate
 
 `scorecard_gate.py` is a standalone script that exits non-zero on failure:
 
 ```bash
+# Presence-only check (first adoption or no baseline specified):
 python -m gaia.eval.scorecard_gate \
-  --scorecards-dir hub/agents/npm/agent-email/scorecards \
-  --manifest hub/agents/python/email/gaia-agent.yaml
-```
+  --scorecard hub/agents/npm/agent-email/SCORECARD.md
 
-Or with an explicit version:
+# Regression check against a specific prior scorecard file (unit tests / local):
+python -m gaia.eval.scorecard_gate \
+  --scorecard hub/agents/npm/agent-email/SCORECARD.md \
+  --baseline-file /tmp/prev-SCORECARD.md
 
-```bash
+# Regression check against a prior release tag (CI):
 python -m gaia.eval.scorecard_gate \
-  --scorecards-dir hub/agents/npm/agent-email/scorecards \
-  --version 0.2.4
+  --scorecard hub/agents/npm/agent-email/SCORECARD.md \
+  --baseline-ref agent-pkg-email-v0.2.3
 ```
 
+`--baseline-file` and `--baseline-ref` are mutually exclusive. If the file doesn't exist at the given ref, the gate treats it as first adoption (presence-only pass).
+
 ### Gate logic
 
-1. **Presence check**: `<scorecards-dir>/<version>.md` must exist and be valid. → exit 1 if not.
-2. **Locate prior**: find the greatest semver strictly below `<version>` in `<scorecards-dir>`. If none → **first adoption**, exit 0 (presence-only pass).
-3. **Regression check**: if `candidate.aggregate.value < prior.aggregate.value` (strict) → exit 1.
+1. **Presence check**: `--scorecard` path must exist and be a valid scorecard. → exit 1 if not.
+2. **Baseline resolution**:
+   - `--baseline-file`: read the given file directly (no git access; suitable for unit tests).
+   - `--baseline-ref`: resolve via `git show <ref>:<scorecard-path>`. If the file does not exist at that ref → **first adoption**, exit 0.
+   - Neither specified: **first adoption**, exit 0 (presence-only pass).
+3. **Regression check**: if `candidate.aggregate.value < baseline.aggregate.value` (strict) → exit 1.
 4. Equal or greater → exit 0.
 
 ### Exit codes
@@ -205,8 +226,9 @@ python -m gaia.eval.scorecard_gate \
 | Case | Exit code |
 |------|-----------|
 | Missing or invalid candidate scorecard | `1` |
-| Strict regression vs prior version | `1` |
-| No prior version (first adoption) | `0` |
+| Strict regression vs baseline | `1` |
+| No baseline (first adoption) | `0` |
+| File absent at `--baseline-ref` | `0` |
 | Equal score (patch carry-forward) | `0` |
 | Score improved | `0` |
 
@@ -215,29 +237,18 @@ python -m gaia.eval.scorecard_gate \
 When a regression is intentional (e.g. a dataset correction or methodology change), use `--allow-regression`. The gate prints a GHA `::warning::` annotation naming both versions and scores, then exits 0:
 
 ```
-::warning::Scorecard regression allowed by --allow-regression: 0.2.3=65.0 → 0.2.4=45.84
-WARNING: Regression override active. Prior version 0.2.3 scored 65.0; candidate 0.2.4 scored 45.84. ...
+::warning::Scorecard regression allowed by --allow-regression: v0.2.3=65.0 → v0.2.4=40.0
+WARNING: Regression override active. Prior version v0.2.3 scored 65.0; candidate v0.2.4 scored 40.0. ...
 ```
 
-### How the gate resolves "previous version"
-
-The gate calls `latest_version_below(scorecards_dir, version)`, which:
-
-1. Lists all `*.md` files in `scorecards_dir`.
-2. Keeps only those whose **stem** matches the anchored regex `^\d+\.\d+\.\d+$` (skips `README.md`, `.gitkeep`, prerelease tags, etc.).
-3. Compares versions as **integer tuples** `(major, minor, patch)` — so `0.10.0 > 0.2.9` correctly.
-4. Returns the greatest version strictly below the candidate, or `None`.
-
-The version is read from `gaia-agent.yaml` (via `--manifest`) or passed explicitly (via `--version`).
-
 ## Keeping the scorecard current (the update / reject loop)
 
-The scorecard must move with the agent: when LLM-affecting code changes, the eval is re-run and the committed scorecard refreshed — **upward**. A regression is blocked.
+The scorecard must move with the agent: when LLM-affecting code changes, the eval is re-run and the committed `SCORECARD.md` refreshed — **upward**. A regression is blocked.
 
 Two enforcement points work together:
 
 1. **Reject-on-worse (always on, GitHub-hosted).** The `scorecard-gate` job in `release_agent_<id>.yml` runs on every release. It only parses committed files (no eval), so it runs on a standard runner and **fails the build** if the committed scorecard regressed below the prior version or is missing. This is the hard gate.
-2. **Run-and-refresh (self-hosted AMD).** `gaia eval benchmark` needs Lemonade on AMD hardware, so it cannot run on GitHub-hosted runners — it runs on the `[self-hosted, lemonade-eval]` pool. The `Email Agent Eval — scorecard refresh` workflow (`.github/workflows/email_scorecard_refresh.yml`) runs on demand (and on pushes touching the email agent), re-runs the eval, regenerates the scorecard, then:
+2. **Run-and-refresh (self-hosted AMD).** `gaia eval benchmark` needs Lemonade on AMD hardware, so it cannot run on GitHub-hosted runners — it runs on the `[self-hosted, lemonade-eval]` pool. The `Email Agent Eval — scorecard refresh` workflow (`.github/workflows/email_scorecard_refresh.yml`) runs on demand (and on pushes touching the email agent), re-runs the eval, regenerates `SCORECARD.md`, then:
    - **score ≥ committed** → commits the refreshed scorecard back to the branch (the PR carries the improved number);
    - **score < committed** → fails loudly (the regression must be investigated, or consciously overridden with `--allow-regression`).
 
@@ -253,9 +264,8 @@ So a PR that changes the agent gets its scorecard refreshed (better) or rejected
   **Use the [`adding-eval-scorecard` skill](https://github.com/amd/gaia/tree/main/.claude/skills/adding-eval-scorecard/SKILL.md).** In Claude Code, invoke it instead of following these steps by hand — it carries the exact commands, the harness→payload→generator flow, the headless-eval gotchas (keyring/PYTHONPATH/tool-timeout), and the verification evidence to capture. The steps below are the reference the skill automates.
 </Tip>
 
-1. Create the `scorecards/` directory beside the agent's canonical README.
-2. Write a `packaging/gen_scorecard.py` adapter (see `hub/agents/python/email/packaging/gen_scorecard.py` for a reference).
-3. Run the eval and call the adapter → commit the resulting `<version>.md`.
-4. Link the scorecard from the README: `./scorecards/<version>.md`.
-5. Add `scorecards/` to the npm `package.json` `files` array (if published on npm).
-6. Wire `scorecard_gate` into the release workflow (see `release_agent_email.yml` for the job topology).
+1. Write a `packaging/gen_scorecard.py` adapter (see `hub/agents/python/email/packaging/gen_scorecard.py` for a reference). The adapter should populate `reproduction_command` with the exact commands needed to reproduce the scorecard.
+2. Run the eval and call the adapter → commit the resulting `SCORECARD.md` to `<doc-root>/SCORECARD.md`.
+3. Link the scorecard from the README: `./SCORECARD.md`.
+4. Add `SCORECARD.md` to the npm `package.json` `files` array (if published on npm); do **not** add a `scorecards/` directory.
+5. Wire `scorecard_gate` into the release workflow (see `release_agent_email.yml` for the job topology). Use `--scorecard <path>/SCORECARD.md` and `--baseline-ref <prev-tag>` (best-effort).

From 20dbdbece3f0597fcdfcab85da3c3478b2e73e62 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 13:38:29 -0400
Subject: [PATCH 17/18] fix(eval): scorecard_gate pylint and black formatting

- subprocess.run: add check=False (W1510)
- Remove bare f-strings with no interpolated vars (W1309)
- black reformatted test_scorecard_gate.py
---
 src/gaia/eval/scorecard_gate.py        | 15 +++++++-------
 tests/unit/eval/test_scorecard_gate.py | 27 +++++++++++++++++---------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/gaia/eval/scorecard_gate.py b/src/gaia/eval/scorecard_gate.py
index 4328511c4..3a54c09b6 100644
--- a/src/gaia/eval/scorecard_gate.py
+++ b/src/gaia/eval/scorecard_gate.py
@@ -89,10 +89,11 @@ def _parse_baseline_ref(scorecard_path: Path, ref: str) -> str | None:
 
     git_path = rel.as_posix()
     try:
-        result = subprocess.run(
+        result = subprocess.run(  # noqa: S603 (git is trusted here)
             ["git", "show", f"{ref}:{git_path}"],
             capture_output=True,
             text=True,
+            check=False,
         )
     except FileNotFoundError as exc:
         raise ValueError(f"git not found: {exc}") from exc
@@ -246,9 +247,9 @@ def main(argv=None) -> int:
     prev_errors = validate_scorecard(prev_parsed)
     if prev_errors:
         print(
-            f"ERROR: Baseline SCORECARD.md is invalid:\n"
+            "ERROR: Baseline SCORECARD.md is invalid:\n"
             + "\n".join(f"  - {e}" for e in prev_errors)
-            + f"\n  Fix the baseline scorecard before releasing."
+            + "\n  Fix the baseline scorecard before releasing."
         )
         return 1
 
@@ -258,15 +259,15 @@ def main(argv=None) -> int:
     if candidate_score is None:
         print(
             f"ERROR: Candidate SCORECARD.md at {candidate_path} has no "
-            f"'aggregate.value' field.\n"
-            f"  Fix the scorecard front matter before releasing."
+            "'aggregate.value' field.\n"
+            "  Fix the scorecard front matter before releasing."
         )
         return 1
 
     if prev_score is None:
         print(
-            f"ERROR: Baseline SCORECARD.md has no 'aggregate.value' field.\n"
-            f"  Fix the baseline scorecard before releasing."
+            "ERROR: Baseline SCORECARD.md has no 'aggregate.value' field.\n"
+            "  Fix the baseline scorecard before releasing."
         )
         return 1
 
diff --git a/tests/unit/eval/test_scorecard_gate.py b/tests/unit/eval/test_scorecard_gate.py
index 32424f97a..efc5d4fad 100644
--- a/tests/unit/eval/test_scorecard_gate.py
+++ b/tests/unit/eval/test_scorecard_gate.py
@@ -151,8 +151,10 @@ def test_allow_regression_flag_returns_0(self, tmp_path):
 
         result = main(
             [
-                "--scorecard", str(candidate),
-                "--baseline-file", str(baseline),
+                "--scorecard",
+                str(candidate),
+                "--baseline-file",
+                str(baseline),
                 "--allow-regression",
             ]
         )
@@ -169,8 +171,10 @@ def test_allow_regression_prints_warning_line(self, tmp_path, capsys):
 
         main(
             [
-                "--scorecard", str(candidate),
-                "--baseline-file", str(baseline),
+                "--scorecard",
+                str(candidate),
+                "--baseline-file",
+                str(baseline),
                 "--allow-regression",
             ]
         )
@@ -188,8 +192,10 @@ def test_missing_baseline_file_returns_1(self, tmp_path):
         candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6)
         result = main(
             [
-                "--scorecard", str(candidate),
-                "--baseline-file", str(tmp_path / "nonexistent-SCORECARD.md"),
+                "--scorecard",
+                str(candidate),
+                "--baseline-file",
+                str(tmp_path / "nonexistent-SCORECARD.md"),
             ]
         )
         assert result == 1
@@ -291,9 +297,12 @@ def test_baseline_file_and_ref_mutually_exclusive(self, tmp_path):
         candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6)
         result = main(
             [
-                "--scorecard", str(candidate),
-                "--baseline-file", str(candidate),
-                "--baseline-ref", "v1.0.0",
+                "--scorecard",
+                str(candidate),
+                "--baseline-file",
+                str(candidate),
+                "--baseline-ref",
+                "v1.0.0",
             ]
         )
         assert result == 1

From c2dcf6c991c59c7c434085eb6fb2699e281dac83 Mon Sep 17 00:00:00 2001
From: Tomasz Iniewicz <tomasz@iniewicz.com>
Date: Fri, 26 Jun 2026 13:41:52 -0400
Subject: [PATCH 18/18] feat(eval): email SCORECARD.md from full-corpus run
 (46.0); portable reproduction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Regenerate the email v0.2.4 SCORECARD.md from a full-corpus gaia eval benchmark
run on AMD Strix Halo: category_accuracy 0.46 over 100 of 220 emails (the triage
tool processes up to 100 per call) -> aggregate 46.0/100. Errors are dominated by
the inherently-ambiguous fyi<->needs_response boundary; the model over-assigns
NEEDS_RESPONSE. Fix the adapter's reproduction command to be portable (generic
/tmp/email-eval output dir, full model/mbox/ground-truth/output-dir flags) — no
local absolute path in the published artifact. README reflects 46.0.
---
 hub/agents/npm/agent-email/README.md          |  2 +-
 hub/agents/npm/agent-email/SCORECARD.md       | 38 ++++++++++---------
 .../python/email/packaging/gen_scorecard.py   | 34 +++++++++--------
 3 files changed, 41 insertions(+), 33 deletions(-)

diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md
index 92424371b..f8d797279 100644
--- a/hub/agents/npm/agent-email/README.md
+++ b/hub/agents/npm/agent-email/README.md
@@ -2,7 +2,7 @@
 
 [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24**
 
-**Eval scorecard (v0.2.4): aggregate 40.0 / 100** — `category_accuracy` 0.40 over 25 of 220 labeled emails ([`./SCORECARD.md`](./SCORECARD.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate.
+**Eval scorecard (v0.2.4): aggregate 46.0 / 100** — `category_accuracy` 0.46 over 100 of 220 labeled emails ([`./SCORECARD.md`](./SCORECARD.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, a worked recomputation, and reproduction steps.
 
 Embed the **GAIA email agent** in your JS/TS app. It triages, organizes, replies
 to, and schedules from Gmail and Outlook — with every email body analyzed
diff --git a/hub/agents/npm/agent-email/SCORECARD.md b/hub/agents/npm/agent-email/SCORECARD.md
index b4f8ae5ca..000f2a127 100644
--- a/hub/agents/npm/agent-email/SCORECARD.md
+++ b/hub/agents/npm/agent-email/SCORECARD.md
@@ -18,27 +18,27 @@ recipe:
     model: Gemma-4-E4B-it-GGUF
     corpus: tests/fixtures/email/synthetic_inbox.mbox
     ground_truth: tests/fixtures/email/ground_truth.json
-    limit: 25
+    limit: 220
 results:
-  test_cases_run: 25
+  test_cases_run: 100
   metrics:
   - name: category_accuracy
-    value: 0.4
+    value: 0.46
     weight: 1.0
 aggregate:
   name: weighted_accuracy
   formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2)
   components:
   - metric: category_accuracy
-    value: 0.4
+    value: 0.46
     weight: 1.0
-  value: 40.0
-generated_at: '2026-06-26T17:29:34.631236+00:00'
+  value: 46.0
+generated_at: '2026-06-26T17:40:26.470285+00:00'
 inherited_from: null
 ---
 # Email Triage — Eval Scorecard v0.2.4
 
-**Aggregate score: 40.0** (out of 100)
+**Aggregate score: 46.0** (out of 100)
 
 ## Recipe
 
@@ -47,12 +47,12 @@ inherited_from: null
 | Dataset | [tests/fixtures/email/ground_truth.json](tests/fixtures/email/ground_truth.json) |
 | Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, schema-2.0 triage taxonomy: fyi / needs_response / promotional / urgent / personal) |
 | Dataset size | 220 labeled examples |
-| Test cases run | 25 |
+| Test cases run | 100 |
 | Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. The corpus uses the schema-2.0 triage taxonomy, aligned with the agent's output labels (#1874) |
 
 ## Metrics
 
-  - **category_accuracy**: 0.4000 × 1.0
+  - **category_accuracy**: 0.4600 × 1.0
 
 ## Aggregate score recomputation
 
@@ -61,7 +61,7 @@ Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)`
 Worked example:
 
 ```
-round(100 × ((0.4000 × 1.0)) / 1.0, 2) = 40.0
+round(100 × ((0.4600 × 1.0)) / 1.0, 2) = 46.0
 ```
 
 A reader can reproduce this value from the `aggregate.components` in the front
@@ -72,19 +72,23 @@ matter alone — no eval-harness access needed.
 Run the following commands from the repository root:
 
 ```sh
-# Step 1: run the benchmark (requires a running Lemonade Server on :13305)
+# Step 1: run the benchmark (requires a Lemonade Server with the model loaded; AMD Ryzen AI / Strix Halo recommended)
 PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \
-GAIA_AGENT_TOOL_TIMEOUT=120 \
+GAIA_AGENT_TOOL_TIMEOUT=900 \
 PYTHONPATH="$(pwd)" \
-gaia eval benchmark --limit 25
+gaia eval benchmark \
+    --model Gemma-4-E4B-it-GGUF \
+    --mbox-path tests/fixtures/email/synthetic_inbox.mbox \
+    --ground-truth tests/fixtures/email/ground_truth.json \
+    --limit 220 \
+    --output-dir /tmp/email-eval
 
-# Step 2: generate the scorecard from the benchmark output
-PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \
+# Step 2: generate this scorecard from the benchmark output
 PYTHONPATH="$(pwd)" \
 python hub/agents/python/email/packaging/gen_scorecard.py \
-    --benchmark-dir /private/tmp/claude-501/-Users-tomasz-src-amd-gaia--claude-worktrees-sleepy-chatelet-2b818a/314bd25e-fbc0-4ab7-aab0-a8825585e5ef/scratchpad/email-eval-relabeled \
+    --benchmark-dir /tmp/email-eval \
     --ground-truth tests/fixtures/email/ground_truth.json \
-    --limit 25
+    --limit 220
 ```
 
 See [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) for the full setup guide.
diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py
index 1837a1389..344817bda 100644
--- a/hub/agents/python/email/packaging/gen_scorecard.py
+++ b/hub/agents/python/email/packaging/gen_scorecard.py
@@ -212,28 +212,32 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None):
 
     import datetime
 
-    # Construct an exact reproduction command using the supplied arguments, so any
-    # reader can reproduce the scorecard result from scratch.
-    limit_flag = f" --limit {limit}" if limit is not None else ""
+    # Construct a portable, exact reproduction command so any reader can reproduce
+    # this scorecard from scratch. Use repo-relative paths and a generic output dir
+    # only — never a local absolute path (this ships in a published artifact).
+    limit_flag = f" \\\n    --limit {limit}" if limit is not None else ""
     ground_truth_rel = (
         str(ground_truth_path.relative_to(_REPO_ROOT))
         if str(ground_truth_path).startswith(str(_REPO_ROOT))
         else ground_truth_path.name
     )
-    benchmark_dir_display = str(benchmark_dir)
     reproduction_command = (
-        "# Step 1: run the benchmark (requires a running Lemonade Server on :13305)\n"
-        f"PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n"
-        f"GAIA_AGENT_TOOL_TIMEOUT=120 \\\n"
-        f"PYTHONPATH=\"$(pwd)\" \\\n"
-        f"gaia eval benchmark{limit_flag}\n\n"
-        "# Step 2: generate the scorecard from the benchmark output\n"
-        f"PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n"
-        f"PYTHONPATH=\"$(pwd)\" \\\n"
-        f"python hub/agents/python/email/packaging/gen_scorecard.py \\\n"
-        f"    --benchmark-dir {benchmark_dir_display} \\\n"
+        "# Step 1: run the benchmark (requires a Lemonade Server with the model "
+        "loaded; AMD Ryzen AI / Strix Halo recommended)\n"
+        "PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n"
+        "GAIA_AGENT_TOOL_TIMEOUT=900 \\\n"
+        'PYTHONPATH="$(pwd)" \\\n'
+        "gaia eval benchmark \\\n"
+        f"    --model {model} \\\n"
+        "    --mbox-path tests/fixtures/email/synthetic_inbox.mbox \\\n"
+        f"    --ground-truth {ground_truth_rel}{limit_flag} \\\n"
+        "    --output-dir /tmp/email-eval\n\n"
+        "# Step 2: generate this scorecard from the benchmark output\n"
+        'PYTHONPATH="$(pwd)" \\\n'
+        "python hub/agents/python/email/packaging/gen_scorecard.py \\\n"
+        "    --benchmark-dir /tmp/email-eval \\\n"
         f"    --ground-truth {ground_truth_rel}"
-        + (f" \\\n    --limit {limit}" if limit is not None else "")
+        + (f"{limit_flag}" if limit is not None else "")
     )
 
     return ResultPayload(