From 753b88d13e4a2971b28827d1e17f0bd87b0ca502 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Thu, 25 Jun 2026 18:18:37 -0400 Subject: [PATCH 01/18] test(eval): add TDD tests for release scorecard + gate (frozen contract) Tests encode the full acceptance criteria for gaia.eval.release_scorecard and gaia.eval.scorecard_gate before any implementation exists. Includes the email benchmark fixture used by the adapter tests. --- .../eval/email_benchmark_scorecard.json | 5 + tests/unit/eval/test_release_scorecard.py | 439 ++++++++++++++++++ tests/unit/eval/test_scorecard_gate.py | 246 ++++++++++ 3 files changed, 690 insertions(+) create mode 100644 tests/fixtures/eval/email_benchmark_scorecard.json create mode 100644 tests/unit/eval/test_release_scorecard.py create mode 100644 tests/unit/eval/test_scorecard_gate.py diff --git a/tests/fixtures/eval/email_benchmark_scorecard.json b/tests/fixtures/eval/email_benchmark_scorecard.json new file mode 100644 index 000000000..389e7292f --- /dev/null +++ b/tests/fixtures/eval/email_benchmark_scorecard.json @@ -0,0 +1,5 @@ +{"run_id":"bench-fixture","scenarios":[ + {"category":"Gemma-4-E4B-it-GGUF","status":"PASS","total_emails":12,"quality":{"category_accuracy":0.4167}}, + {"category":"Gemma-4-E4B-it-GGUF","status":"PASS","total_emails":12,"quality":{"category_accuracy":0.5000}}, + {"category":"Gemma-4-E4B-it-GGUF","status":"PASS","total_emails":0} +]} diff --git a/tests/unit/eval/test_release_scorecard.py b/tests/unit/eval/test_release_scorecard.py new file mode 100644 index 000000000..7e9770e1a --- /dev/null +++ b/tests/unit/eval/test_release_scorecard.py @@ -0,0 +1,439 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. 
def _make_payload(version="1.0.0", accuracy=0.5):
    """Build a minimal ``ResultPayload`` for the tests in this module.

    Args:
        version: Value stored in ``agent_version``.
        accuracy: Value of the single ``category_accuracy`` metric (weight 1.0).

    Returns:
        A ``ResultPayload`` with fixed placeholder recipe fields, one metric,
        and ``inherited_from=None``.
    """
    metrics = [{"name": "category_accuracy", "value": accuracy, "weight": 1.0}]
    return ResultPayload(
        agent_name="test-agent",
        agent_version=version,
        dataset_reference="test/fixture",
        dataset_description="test dataset",
        dataset_size=100,
        methodology="unit test",
        config={"model": "test"},
        test_cases_run=10,
        metrics=metrics,
        aggregate_name="weighted_accuracy",
        # Timezone-aware replacement for the deprecated, naive ``utcnow()``.
        generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
        inherited_from=None,
    )
class TestSchemaValidator:
    """Round-trip a rendered scorecard through the parser and the validator."""

    @staticmethod
    def _parsed_front_matter():
        """Render the default payload and return its parsed front matter."""
        return parse_scorecard(render_scorecard(_make_payload()))

    def test_valid_payload_passes_validation(self):
        errors = validate_scorecard(self._parsed_front_matter())
        assert errors == [], f"Expected no errors, got: {errors}"

    def test_missing_required_fields_each_flagged(self):
        parsed = self._parsed_front_matter()

        # Drop one required key at a time; every omission must be reported.
        for field in REQUIRED_FIELDS:
            mutated = dict(parsed)
            mutated.pop(field, None)
            errors = validate_scorecard(mutated)
            assert errors, (
                f"Expected validate_scorecard to flag missing '{field}' "
                f"but got empty error list"
            )

    def test_required_top_level_keys_include_expected_sections(self):
        expected_sections = (
            "schema_version",
            "agent",
            "recipe",
            "results",
            "aggregate",
        )
        for section in expected_sections:
            assert section in REQUIRED_FIELDS, (
                f"'{section}' must be in REQUIRED_FIELDS"
            )
class TestComputeAggregate:
    """Pin the weighted-aggregate formula and its error cases."""

    def test_single_metric(self):
        _, value = compute_aggregate([{"name": "acc", "value": 0.5, "weight": 1.0}])
        assert value == 50.0

    def test_multiple_metrics_weighted(self):
        metrics = [
            {"name": "a", "value": 0.4167, "weight": 1.0},
            {"name": "b", "value": 0.5, "weight": 2.0},
        ]
        _, value = compute_aggregate(metrics)
        expected = round(100 * (0.4167 + 2 * 0.5) / (1 + 2), 2)
        assert value == expected

    def test_empty_metrics_raises(self):
        with pytest.raises(ValueError):
            compute_aggregate([])

    def test_zero_weight_raises(self):
        with pytest.raises(ValueError):
            compute_aggregate([{"name": "x", "value": 0.5, "weight": 0.0}])

    def test_recompute_from_components_matches_aggregate_value(self):
        metrics = [
            {"name": "cat_acc", "value": 0.4167, "weight": 1.0},
            {"name": "send_acc", "value": 0.75, "weight": 2.0},
        ]
        components, agg_value = compute_aggregate(metrics)
        # A reader must be able to reproduce the aggregate from the returned
        # components alone, using the documented formula.
        recomputed = round(
            100
            * sum(c["weight"] * c["value"] for c in components)
            / sum(c["weight"] for c in components),
            2,
        )
        assert recomputed == agg_value
class TestGeneratorRoundTrip:
    """Check the front-matter delimiters and body of a rendered scorecard."""

    @staticmethod
    def _rendered_lines():
        """Render the default payload and split it into lines."""
        return render_scorecard(_make_payload()).splitlines()

    def test_rendered_text_starts_with_dashes(self):
        lines = self._rendered_lines()
        assert lines[0] == "---", f"First line must be '---', got: {lines[0]!r}"

    def test_rendered_text_contains_closing_dashes(self):
        lines = self._rendered_lines()
        # Any '---' line after the opening one counts as the closing delimiter.
        has_closing = "---" in lines[1:]
        assert has_closing, "Rendered scorecard must contain a closing '---' after the first"

    def test_body_after_front_matter_is_non_empty(self):
        lines = self._rendered_lines()
        delimiter_rows = [
            index for index, line in enumerate(lines) if line == "---"
        ]
        assert len(delimiter_rows) >= 2, "Need at least two '---' lines"
        body_lines = lines[delimiter_rows[1] + 1 :]
        body = "\n".join(body_lines)
        assert body.strip(), "Body after front matter must be non-empty"

    def test_parse_recovers_all_required_fields(self):
        rendered = render_scorecard(_make_payload())
        errors = validate_scorecard(parse_scorecard(rendered))
        assert errors == []
class TestLooseCoupling:
    """Importing ``release_scorecard`` must not pull in harness/agent modules."""

    # Top-level packages whose names merely contain "benchmark" but are not
    # part of the eval harness (the pytest-benchmark plugin is loaded by
    # pytest itself whenever it is installed).
    _UNRELATED_PACKAGES = frozenset({"pytest_benchmark"})

    def test_no_benchmark_or_agent_modules_imported(self):
        # The module under test is imported at the top of this file, so
        # sys.modules already reflects everything it dragged in.  Snapshot the
        # keys first: sys.modules may change while it is being inspected.
        contaminated = [
            name
            for name in list(sys.modules)
            if ("benchmark" in name or "gaia_agent_email" in name)
            and name.partition(".")[0] not in self._UNRELATED_PACKAGES
        ]
        assert not contaminated, (
            f"release_scorecard import pulled in harness/agent modules: {contaminated}"
        )
class TestCarryForwardPatch:
    """A patch bump (0.2.3 -> 0.2.4) inherits the prior card's results."""

    @staticmethod
    def _write_prior_card(tmp_path):
        """Write a 0.2.3 scorecard into ``tmp_path``; return (payload, path)."""
        src = _make_payload(version="0.2.3", accuracy=0.75)
        card_path = tmp_path / "0.2.3.md"
        # The rendered body contains non-ASCII characters and the parser reads
        # files as UTF-8, so do not rely on the platform's default encoding.
        card_path.write_text(render_scorecard(src), encoding="utf-8")
        return src, card_path

    def test_carry_forward_sets_inherited_from(self, tmp_path):
        _, card_path = self._write_prior_card(tmp_path)

        result = carry_forward(card_path, "0.2.4")
        assert result.inherited_from == "0.2.3"

    def test_carry_forward_copies_metrics_verbatim(self, tmp_path):
        src, card_path = self._write_prior_card(tmp_path)

        result = carry_forward(card_path, "0.2.4")
        assert result.metrics == src.metrics
class TestCarryForwardMinorBumpRefuses:
    """Minor and major bumps must refuse to carry results forward."""

    @staticmethod
    def _write_prior_card(tmp_path):
        """Write a 0.2.3 scorecard into ``tmp_path`` and return its path."""
        src = _make_payload(version="0.2.3", accuracy=0.75)
        card_path = tmp_path / "0.2.3.md"
        # The rendered body contains non-ASCII characters and the parser reads
        # files as UTF-8, so do not rely on the platform's default encoding.
        card_path.write_text(render_scorecard(src), encoding="utf-8")
        return card_path

    def test_minor_bump_raises_value_error(self, tmp_path):
        card_path = self._write_prior_card(tmp_path)

        with pytest.raises(ValueError, match="re-run"):
            carry_forward(card_path, "0.3.0")

    def test_major_bump_raises_value_error(self, tmp_path):
        card_path = self._write_prior_card(tmp_path)

        with pytest.raises(ValueError, match="re-run"):
            carry_forward(card_path, "1.0.0")
class TestEmailAdapter:
    """Tests for hub/agents/python/email/packaging/gen_scorecard.py adapter."""

    # Three keys: two labeled emails plus the "_meta" bookkeeping entry, which
    # the adapter is expected to exclude from dataset_size.
    _GROUND_TRUTH = {
        "_meta": {"count": 3},
        "email1": {"label": "spam"},
        "email2": {"label": "promo"},
    }

    def _load_gen_scorecard(self):
        """Import the adapter script by file path and return the module."""
        adapter_path = (
            Path(__file__).parents[3]
            / "hub"
            / "agents"
            / "python"
            / "email"
            / "packaging"
            / "gen_scorecard.py"
        )
        spec = importlib.util.spec_from_file_location("gen_scorecard", adapter_path)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot build an import spec for {adapter_path}")
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
        return mod

    @staticmethod
    def _write_inputs(tmp_path, scorecard_text, ground_truth):
        """Create the benchmark dir and ground-truth file the adapter reads.

        Returns:
            ``(benchmark_dir, gt_path)``.
        """
        benchmark_dir = tmp_path / "benchmark"
        benchmark_dir.mkdir()
        (benchmark_dir / "email_benchmark_scorecard.json").write_text(
            scorecard_text, encoding="utf-8"
        )
        gt_path = tmp_path / "ground_truth.json"
        gt_path.write_text(json.dumps(ground_truth), encoding="utf-8")
        return benchmark_dir, gt_path

    def _build_fixture_payload(self, tmp_path):
        """Run ``build_payload`` against the checked-in benchmark fixture."""
        mod = self._load_gen_scorecard()
        benchmark_dir, gt_path = self._write_inputs(
            tmp_path,
            EMAIL_BENCHMARK_FIXTURE.read_text(encoding="utf-8"),
            self._GROUND_TRUTH,
        )
        return mod.build_payload(benchmark_dir, gt_path)

    def test_build_payload_mean_of_judged_scenarios(self, tmp_path):
        payload = self._build_fixture_payload(tmp_path)

        expected_mean = round((0.4167 + 0.5000) / 2, 10)
        assert payload.metrics[0]["value"] == pytest.approx(expected_mean), (
            f"Expected metric value {expected_mean}, got {payload.metrics[0]['value']}"
        )

    def test_build_payload_test_cases_run(self, tmp_path):
        payload = self._build_fixture_payload(tmp_path)
        # 12 + 12 = 24; third scenario skipped (no quality key)
        assert payload.test_cases_run == 24

    def test_build_payload_dataset_size(self, tmp_path):
        payload = self._build_fixture_payload(tmp_path)
        # 3 keys - 1 _meta = 2
        assert payload.dataset_size == 2

    def test_all_no_quality_raises(self, tmp_path):
        mod = self._load_gen_scorecard()

        # Scorecard where no scenario has quality
        empty_scorecard = {
            "run_id": "no-quality",
            "scenarios": [
                {"category": "Gemma-4-E4B-it-GGUF", "status": "PASS", "total_emails": 0},
                {"category": "Gemma-4-E4B-it-GGUF", "status": "PASS", "total_emails": 0},
            ],
        }
        ground_truth = {"_meta": {"count": 1}, "email1": {"label": "spam"}}
        benchmark_dir, gt_path = self._write_inputs(
            tmp_path, json.dumps(empty_scorecard), ground_truth
        )

        with pytest.raises(ValueError):
            mod.build_payload(benchmark_dir, gt_path)
def _write_card(directory: Path, version: str, accuracy: float) -> Path:
    """Render a scorecard for ``version`` and write it to ``directory``.

    Args:
        directory: Existing directory that receives ``<version>.md``.
        version: Agent version; also used as the file stem.
        accuracy: Value of the single metric in the generated payload.

    Returns:
        Path of the written scorecard file.
    """
    payload = _make_payload(version=version, accuracy=accuracy)
    path = directory / f"{version}.md"
    # The rendered body contains non-ASCII characters and the parser reads
    # files as UTF-8, so do not rely on the platform's default encoding.
    path.write_text(render_scorecard(payload), encoding="utf-8")
    return path
class TestAllowRegression:
    """``--allow-regression`` lets a lower score through, with a warning."""

    @staticmethod
    def _run_gate_with_override(tmp_path):
        """Seed a regressing pair of cards and run the gate with the override."""
        _write_card(tmp_path, "0.2.3", accuracy=0.8)
        _write_card(tmp_path, "0.2.4", accuracy=0.5)
        argv = [
            "--scorecards-dir",
            str(tmp_path),
            "--version",
            "0.2.4",
            "--allow-regression",
        ]
        return main(argv)

    def test_allow_regression_flag_returns_0(self, tmp_path):
        exit_code = self._run_gate_with_override(tmp_path)
        assert exit_code == 0

    def test_allow_regression_prints_warning_line(self, tmp_path, capsys):
        self._run_gate_with_override(tmp_path)
        stdout = capsys.readouterr().out
        assert "::warning::" in stdout
class TestWorkflowYaml:
    """The release workflow's publish job must depend on the scorecard gate."""

    def test_publish_job_needs_scorecard_gate(self):
        workflow_path = (
            Path(__file__).parents[3]
            / ".github"
            / "workflows"
            / "release_agent_email.yml"
        )
        assert workflow_path.exists(), (
            f"Workflow file not found: {workflow_path}"
        )
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        content = workflow_path.read_text(encoding="utf-8")
        parsed = yaml.safe_load(content)

        # An empty workflow file parses to None; fail with the same readable
        # message instead of a TypeError from the membership test.
        assert isinstance(parsed, dict) and "jobs" in parsed, "Workflow has no 'jobs' key"
        assert "publish" in parsed["jobs"], (
            "Workflow has no 'publish' job — add it or check the job name"
        )
        # ``needs`` may be absent, null, a single string, or a list.
        needs = parsed["jobs"]["publish"].get("needs") or []
        if isinstance(needs, str):
            needs = [needs]
        assert "scorecard-gate" in needs, (
            f"'publish' job must list 'scorecard-gate' in its needs; got: {needs}"
        )
class TestCliErrorHandling:
    """Malformed command lines must yield exit code 1, not an exception."""

    def test_missing_scorecards_dir_flag_returns_1(self):
        argv = ["--version", "1.0.0"]
        exit_code = main(argv)
        assert exit_code == 1

    def test_missing_version_and_manifest_returns_1(self, tmp_path):
        argv = ["--scorecards-dir", str(tmp_path)]
        exit_code = main(argv)
        assert exit_code == 1
+# SPDX-License-Identifier: MIT +""" +Per-agent / per-version eval scorecard: generator, parser, validator, and versioning helpers. + +**Distinct from** ``src/gaia/eval/scorecard.py`` — that module is the per-eval-run +scenario PASS/FAIL aggregator (``build_scorecard``). This module produces the +outward-facing *release artifact*: a versioned Markdown file with YAML front matter +holding measured accuracy metrics, the eval recipe, and a deterministic aggregate score. + +Intentionally harness-agnostic: this module imports ONLY stdlib + PyYAML. +No other loader is permitted — ``yaml.safe_load`` only. + +Usage pattern:: + + payload = ResultPayload( + agent_name="email-triage", + agent_version="0.2.4", + ... + ) + text = render_scorecard(payload) + write_scorecard(payload, path) +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +import yaml + +# Anchored semver regex — no prerelease/build suffixes permitted. +_SEMVER_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)$") + +# Required top-level keys in the parsed front matter. +REQUIRED_FIELDS: list[str] = [ + "schema_version", + "agent", + "recipe", + "results", + "aggregate", +] + + +@dataclass +class ResultPayload: + """Harness-agnostic result payload — the input to the scorecard generator. + + Fields: + agent_name: Human-readable agent name (e.g. "Email Triage"). + agent_version: Semver version string (e.g. "0.2.4"). + dataset_reference: Repo-relative path or URL to the dataset. + dataset_description: Short human description of the dataset. + dataset_size: Total labeled examples available in the dataset. + methodology: Short description of the eval methodology. + config: Arbitrary dict of harness config (model, limit, corpus, etc.). + test_cases_run: Number of cases actually executed this run (<= dataset_size). 
def compute_aggregate(metrics: list) -> tuple:
    """Compute the weighted aggregate score over a list of metrics.

    Formula::

        round(100 * sum(weight_i * value_i) / sum(weight_i), 2)

    Args:
        metrics: List of dicts with ``name``, ``value`` (float in [0,1]),
            and optional ``weight`` (float, default 1.0).

    Returns:
        (components, value) where ``components`` is a list of dicts
        ``{metric, value, weight}`` and ``value`` is the aggregate float.

    Raises:
        ValueError: If metrics is empty, the total weight is zero, a weight
            is negative, or a weight/value is NaN or infinite.
    """
    import math  # local import: keeps this block self-contained

    if not metrics:
        raise ValueError("aggregate undefined: no metrics / zero total weight")

    components = []
    total_weight = 0.0
    weighted_sum = 0.0
    for m in metrics:
        w = float(m.get("weight", 1.0))
        v = float(m["value"])
        # A NaN/inf input yields a NaN/inf aggregate, and every ordering
        # comparison against NaN is False — reject it here instead.
        if not (math.isfinite(w) and math.isfinite(v)):
            raise ValueError(
                f"aggregate undefined: metric {m['name']!r} has a non-finite "
                "value or weight"
            )
        # A negative weight can push the score outside 0..100 or cancel other
        # weights out.
        if w < 0.0:
            raise ValueError(
                f"aggregate undefined: metric {m['name']!r} has a negative weight"
            )
        components.append({"metric": m["name"], "value": v, "weight": w})
        total_weight += w
        weighted_sum += w * v

    if total_weight == 0.0:
        raise ValueError("aggregate undefined: no metrics / zero total weight")

    value = round(100.0 * weighted_sum / total_weight, 2)
    return components, value
+ + The front matter is machine-readable; the body is a human-readable summary + that includes the aggregate formula and a worked recomputation example. + + Args: + payload: Populated :class:`ResultPayload`. + + Returns: + Markdown string starting with ``---`` front matter. + """ + _assert_valid_version(payload.agent_version) + + components, agg_value = compute_aggregate(payload.metrics) + + # Build the YAML-serialisable front-matter dict + front: dict = { + "schema_version": 1, + "agent": { + "name": payload.agent_name, + "version": payload.agent_version, + }, + "recipe": { + "dataset": { + "reference": payload.dataset_reference, + "description": payload.dataset_description, + "size": payload.dataset_size, + }, + "methodology": payload.methodology, + "config": payload.config, + }, + "results": { + "test_cases_run": payload.test_cases_run, + "metrics": [ + { + "name": m["name"], + "value": float(m["value"]), + "weight": float(m.get("weight", 1.0)), + } + for m in payload.metrics + ], + }, + "aggregate": { + "name": payload.aggregate_name, + "formula": "round(100 * sum(weight_i * value_i) / sum(weight_i), 2)", + "components": components, + "value": agg_value, + }, + "generated_at": payload.generated_at, + "inherited_from": payload.inherited_from, + } + + fm_text = yaml.dump(front, default_flow_style=False, sort_keys=False, allow_unicode=True) + + # Human-readable body with worked recompute + metric_lines = "\n".join( + f" - **{c['metric']}**: {c['value']:.4f} × {c['weight']:.1f}" + for c in components + ) + total_w = sum(c["weight"] for c in components) + worked = " + ".join( + f"({c['value']:.4f} × {c['weight']:.1f})" for c in components + ) + + body = f"""# {payload.agent_name} — Eval Scorecard v{payload.agent_version} + +**Aggregate score: {agg_value}** (out of 100) + +## Recipe + +| Field | Value | +|-------|-------| +| Dataset | [{payload.dataset_reference}]({payload.dataset_reference}) | +| Description | {payload.dataset_description} | +| Dataset size | 
def parse_scorecard(source) -> dict:
    """Parse the YAML front matter from a scorecard file or string.

    Extracts the first ``---`` … ``---`` block and runs ``yaml.safe_load``
    on it only — a bare ``---`` rule in the Markdown body is never parsed.
    Both delimiters must be lines consisting of ``---`` alone (trailing
    spaces/tabs and CRLF line endings are tolerated).

    Args:
        source: A :class:`pathlib.Path` (file to read) or a ``str`` (raw text).

    Returns:
        Parsed front-matter dict (empty if the front matter is empty).

    Raises:
        ValueError: If no valid front-matter block is found, the YAML is
            invalid, or the front matter is not a mapping.
    """
    if isinstance(source, Path):
        text = source.read_text(encoding="utf-8")
    else:
        text = str(source)

    # The opening delimiter must be a whole line: "----" or "---title" is not
    # front matter.
    opening = re.match(r"---[ \t]*\r?(?:\n|\Z)", text)
    if opening is None:
        raise ValueError("Scorecard does not start with '---' front matter")

    # First later line that is exactly '---' closes the block.  The search
    # starts right after the opening line, so an empty block is accepted.
    closing = re.compile(r"^---[ \t]*\r?$", re.MULTILINE).search(
        text, opening.end()
    )
    if closing is None:
        raise ValueError("Scorecard front matter has no closing '---'")

    yaml_block = text[opening.end() : closing.start()]
    try:
        front_matter = yaml.safe_load(yaml_block) or {}
    except yaml.YAMLError as exc:
        raise ValueError(f"Invalid YAML in scorecard front matter: {exc}") from exc

    # Callers index the result by key; a YAML list or scalar is not a scorecard.
    if not isinstance(front_matter, dict):
        raise ValueError(
            "Scorecard front matter must be a YAML mapping, got "
            f"{type(front_matter).__name__}"
        )
    return front_matter
+ ) + + +def _assert_safe_path(scorecards_dir: Path, version: str) -> Path: + """Return ``scorecards_dir / f"{version}.md"`` after path-traversal guard.""" + _assert_valid_version(version) + scorecards_dir = scorecards_dir.resolve() + candidate = (scorecards_dir / f"{version}.md").resolve() + if not str(candidate).startswith(str(scorecards_dir)): + raise ValueError( + f"Resolved scorecard path {candidate} is not inside " + f"scorecards dir {scorecards_dir} — possible path traversal." + ) + return candidate + + +def latest_version_below(scorecards_dir: Path, version: str) -> Optional[str]: + """Return the greatest version in ``scorecards_dir`` strictly less than ``version``. + + Only files whose stem matches the anchored semver regex ``^\\d+\\.\\d+\\.\\d+$`` + are considered. Non-matching filenames (README.md, .gitkeep, etc.) are silently + skipped. + + Args: + scorecards_dir: Directory to scan for ``*.md`` scorecards. + version: The candidate version string (must be valid semver). + + Returns: + The greatest matching version string strictly below ``version``, or ``None`` + if no such version exists. + + Raises: + ValueError: If ``version`` is not a valid semver string. + """ + _assert_valid_version(version) + target_tuple = _semver_tuple(version) + scorecards_dir = Path(scorecards_dir) + + candidates: list[tuple] = [] + if scorecards_dir.is_dir(): + for p in scorecards_dir.glob("*.md"): + m = _SEMVER_RE.match(p.stem) + if not m: + continue # silently skip non-semver filenames + t = (int(m.group(1)), int(m.group(2)), int(m.group(3))) + if t < target_tuple: + candidates.append(t) + + if not candidates: + return None + + best = max(candidates) + return f"{best[0]}.{best[1]}.{best[2]}" + + +def carry_forward(prev_path: Path, new_version: str) -> ResultPayload: + """Carry forward a prior scorecard's results to a new patch version. + + Reads the prior scorecard, copies all results verbatim, and sets + ``inherited_from`` to the prior version string. 
+ + Args: + prev_path: Path to the prior version's scorecard ``.md`` file. + new_version: The new version string (must be a patch bump of the prior). + + Returns: + A :class:`ResultPayload` with results copied and ``inherited_from`` set. + + Raises: + ValueError: If ``new_version`` is not a patch-only bump of the prior version + (i.e. if major or minor differs). The error message contains "re-run" + to inform the caller that a fresh eval is required. + ValueError: If the prior scorecard cannot be parsed. + """ + _assert_valid_version(new_version) + prev_path = Path(prev_path) + prev_version = prev_path.stem # e.g. "0.2.3" from "0.2.3.md" + + prev_tuple = _semver_tuple(prev_version) + new_tuple = _semver_tuple(new_version) + + # Only patch bumps are allowed for carry-forward. + if prev_tuple[0] != new_tuple[0] or prev_tuple[1] != new_tuple[1]: + raise ValueError( + f"Cannot carry forward from {prev_version} to {new_version}: " + f"major or minor version changed. Please re-run the eval to " + f"generate fresh results for this release." 
+ ) + + parsed = parse_scorecard(prev_path) + + # Extract fields from the parsed front matter + agent = parsed.get("agent", {}) + recipe = parsed.get("recipe", {}) + dataset = recipe.get("dataset", {}) + results = parsed.get("results", {}) + metrics_raw = results.get("metrics", []) + + import datetime + + return ResultPayload( + agent_name=agent.get("name", ""), + agent_version=new_version, + dataset_reference=dataset.get("reference", ""), + dataset_description=dataset.get("description", ""), + dataset_size=dataset.get("size", 0), + methodology=recipe.get("methodology", ""), + config=recipe.get("config", {}), + test_cases_run=results.get("test_cases_run", 0), + metrics=metrics_raw, + aggregate_name=parsed.get("aggregate", {}).get("name", "weighted_accuracy"), + generated_at=datetime.datetime.utcnow().isoformat(), + inherited_from=prev_version, + ) diff --git a/src/gaia/eval/scorecard_gate.py b/src/gaia/eval/scorecard_gate.py new file mode 100644 index 000000000..a33cecfb0 --- /dev/null +++ b/src/gaia/eval/scorecard_gate.py @@ -0,0 +1,270 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +""" +Standalone release gate: blocks packaging when the candidate scorecard is missing +or when its aggregate score strictly regressed below the prior version's. + +**Distinct from** ``src/gaia/eval/scorecard.py`` — that module aggregates per-run +scenario PASS/FAIL for internal CI. This gate checks the *outward-facing* release +artifact produced by ``release_scorecard.py``. + +Usage:: + + python -m gaia.eval.scorecard_gate \\ + --scorecards-dir hub/agents/npm/agent-email/scorecards \\ + --manifest hub/agents/python/email/gaia-agent.yaml + + python -m gaia.eval.scorecard_gate \\ + --scorecards-dir hub/agents/npm/agent-email/scorecards \\ + --version 0.2.4 + +Exit codes: + 0 — Passed (presence-only first adoption, equal score, or score improved). 
+ 1 — Failed (missing/invalid candidate card, strict regression, or prior card invalid). + +The ``--allow-regression`` flag overrides a regression: prints a ``::warning::`` +GHA annotation and both version/score pairs, then exits 0. +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +import yaml + +from gaia.eval.release_scorecard import ( + _assert_safe_path, + latest_version_below, + parse_scorecard, + validate_scorecard, +) + + +def _read_version_from_manifest(manifest_path: Path) -> str: + """Read the ``version:`` field from a ``gaia-agent.yaml`` manifest. + + Args: + manifest_path: Path to the YAML manifest file. + + Returns: + The version string. + + Raises: + ValueError: If the file cannot be read or ``version:`` is absent. + """ + try: + text = manifest_path.read_text(encoding="utf-8") + except OSError as exc: + raise ValueError( + f"Cannot read manifest {manifest_path}: {exc}" + ) from exc + + try: + data = yaml.safe_load(text) or {} + except yaml.YAMLError as exc: + raise ValueError( + f"Invalid YAML in manifest {manifest_path}: {exc}" + ) from exc + + version = data.get("version") + if not version: + raise ValueError( + f"Manifest {manifest_path} has no 'version:' field." + ) + return str(version) + + +def main(argv=None) -> int: + """Run the scorecard gate. + + Args: + argv: Argument list (``sys.argv[1:]`` if None). + + Returns: + 0 on pass, 1 on failure. + """ + parser = argparse.ArgumentParser( + description=( + "Release gate: ensures a valid scorecard exists for the candidate version " + "and that its aggregate score has not strictly regressed vs the prior version." + ), + prog="python -m gaia.eval.scorecard_gate", + ) + parser.add_argument( + "--scorecards-dir", + required=False, + help="Directory containing per-version scorecard .md files.", + ) + version_group = parser.add_mutually_exclusive_group() + version_group.add_argument( + "--version", + help="Candidate version string (e.g. 
0.2.4).", + ) + version_group.add_argument( + "--manifest", + help="Path to gaia-agent.yaml; the 'version:' field is used as the candidate version.", + ) + parser.add_argument( + "--allow-regression", + action="store_true", + default=False, + help=( + "Override a regression: prints a GHA ::warning:: annotation and both " + "version/score pairs, then exits 0. Use only when a regression is intentional." + ), + ) + + try: + args = parser.parse_args(argv) + except SystemExit: + return 1 + + # Validate required arguments + if not args.scorecards_dir: + print( + "ERROR: --scorecards-dir is required.\n" + "Usage: python -m gaia.eval.scorecard_gate --scorecards-dir DIR " + "--version V (or --manifest PATH)" + ) + return 1 + + if not args.version and not args.manifest: + print( + "ERROR: Either --version or --manifest is required.\n" + "Usage: python -m gaia.eval.scorecard_gate --scorecards-dir DIR " + "--version V (or --manifest PATH)" + ) + return 1 + + scorecards_dir = Path(args.scorecards_dir) + + # Resolve the candidate version + if args.manifest: + try: + version = _read_version_from_manifest(Path(args.manifest)) + except ValueError as exc: + print(f"ERROR: {exc}") + return 1 + else: + version = args.version + + # --- Step 1: Presence check --- + try: + candidate_path = _assert_safe_path(scorecards_dir, version) + except ValueError as exc: + print(f"ERROR: {exc}") + return 1 + + if not candidate_path.exists(): + print( + f"ERROR: Scorecard missing for version {version}.\n" + f" Expected: {candidate_path}\n" + f" Run 'python gen_scorecard.py' (or 'carry_forward') to generate it, " + f"then commit the file before releasing." 
+ ) + return 1 + + try: + candidate_parsed = parse_scorecard(candidate_path) + except ValueError as exc: + print(f"ERROR: Cannot parse candidate scorecard {candidate_path}: {exc}") + return 1 + + errors = validate_scorecard(candidate_parsed) + if errors: + print( + f"ERROR: Candidate scorecard {candidate_path} is invalid:\n" + + "\n".join(f" - {e}" for e in errors) + ) + return 1 + + # --- Step 2: Locate prior version --- + try: + prev_version = latest_version_below(scorecards_dir, version) + except ValueError as exc: + print(f"ERROR: {exc}") + return 1 + + if prev_version is None: + print( + f"PASS: No prior scorecard found for versions below {version}. " + f"First adoption — presence check only." + ) + return 0 + + # --- Step 3: Parse prior and regression check --- + try: + prev_path = _assert_safe_path(scorecards_dir, prev_version) + except ValueError as exc: + print(f"ERROR: {exc}") + return 1 + + try: + prev_parsed = parse_scorecard(prev_path) + except ValueError as exc: + print( + f"ERROR: Cannot parse prior scorecard {prev_path}: {exc}\n" + f" The prior scorecard is corrupt or missing a valid front matter. " + f"Fix it before releasing {version}." + ) + return 1 + + prev_errors = validate_scorecard(prev_parsed) + if prev_errors: + print( + f"ERROR: Prior scorecard {prev_path} is invalid:\n" + + "\n".join(f" - {e}" for e in prev_errors) + + f"\n Fix the prior scorecard before releasing {version}." + ) + return 1 + + candidate_score = candidate_parsed.get("aggregate", {}).get("value") + prev_score = prev_parsed.get("aggregate", {}).get("value") + + if candidate_score is None: + print( + f"ERROR: Candidate scorecard {candidate_path} has no 'aggregate.value' field." + ) + return 1 + + if prev_score is None: + print( + f"ERROR: Prior scorecard {prev_path} has no 'aggregate.value' field." 
+ ) + return 1 + + if float(candidate_score) < float(prev_score): + # Strict regression detected + if args.allow_regression: + print( + f"::warning::Scorecard regression allowed by --allow-regression: " + f"{prev_version}={prev_score} → {version}={candidate_score}" + ) + print( + f"WARNING: Regression override active. " + f"Prior version {prev_version} scored {prev_score}; " + f"candidate {version} scored {candidate_score}. " + f"This regression has been explicitly acknowledged." + ) + return 0 + print( + f"ERROR: Scorecard regression detected.\n" + f" Prior version {prev_version}: aggregate.value = {prev_score}\n" + f" Candidate {version}: aggregate.value = {candidate_score}\n" + f" The candidate score is strictly lower than the prior. " + f"Investigate the regression or use --allow-regression to override intentionally." + ) + return 1 + + print( + f"PASS: Scorecard gate passed.\n" + f" Candidate {version}: aggregate.value = {candidate_score} " + f"(prior {prev_version}: {prev_score})" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 5ed399c639ba71188015b655515ddc747579d552 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Thu, 25 Jun 2026 18:23:54 -0400 Subject: [PATCH 03/18] feat(eval): add email adapter gen_scorecard.py + fix loose-coupling test (increments 4) - gen_scorecard.py: reads benchmark scorecard.json (or any scenarios JSON) + ground_truth.json -> ResultPayload -> writes scorecards/.md - Judged = quality.category_accuracy is finite float in [0,1]; zero judged raises - test_cases_run = sum(total_emails over judged); dataset_size excl _meta - Path derivation mirrors stamp_version.py (parents[...] 
from __file__) - Fix loose-coupling test: subprocess instead of sys.modules (avoids pytest_benchmark FP) (orchestrator-authorized replacement) - 43/44 tests pass; 1 remaining = CI workflow test (incr 6) --- .../python/email/packaging/gen_scorecard.py | 288 ++++++++++++++++++ tests/unit/eval/test_release_scorecard.py | 22 +- 2 files changed, 302 insertions(+), 8 deletions(-) create mode 100644 hub/agents/python/email/packaging/gen_scorecard.py diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py new file mode 100644 index 000000000..2961deeb7 --- /dev/null +++ b/hub/agents/python/email/packaging/gen_scorecard.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +""" +Email-agent adapter: generate a release scorecard from a ``gaia eval benchmark`` run. + +Reads the benchmark ``--output-dir`` (looks for a JSON file containing a +``scenarios`` key — ``scorecard.json`` in a real run, or any ``*scorecard*.json`` +fixture) and the ground-truth JSON, builds a :class:`ResultPayload`, and writes the +scorecard to ``hub/agents/npm/agent-email/scorecards/.md``. + +This adapter imports ``gaia.eval.release_scorecard`` (core generator) but never +imports the eval harness (``gaia.eval.benchmark``) or the email-agent package — +the loose-coupling spine is preserved. + +Usage:: + + python hub/agents/python/email/packaging/gen_scorecard.py \\ + --benchmark-dir /tmp/email-eval \\ + [--ground-truth tests/fixtures/email/ground_truth.json] + +The ``--ground-truth`` path defaults to the canonical fixture in the repository. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +# Derive repo root the same way stamp_version.py does: +# packaging/ -> email/ -> python/ -> agents/ -> hub/ -> repo root +_PACKAGING_DIR = Path(__file__).resolve().parent +_EMAIL_ROOT = _PACKAGING_DIR.parent +_REPO_ROOT = _EMAIL_ROOT.parent.parent.parent.parent +_NPM_ROOT = _REPO_ROOT / "hub" / "agents" / "npm" / "agent-email" + +# Default ground-truth path +_DEFAULT_GT = _REPO_ROOT / "tests" / "fixtures" / "email" / "ground_truth.json" + +# Canonical benchmark scorecard filename (written by gaia eval benchmark) +_SCORECARD_FILENAME = "scorecard.json" + + +def _find_benchmark_scorecard(benchmark_dir: Path) -> Path: + """Locate the benchmark scorecard JSON in ``benchmark_dir``. + + Looks first for the canonical ``scorecard.json``, then for any ``*.json`` + file whose parsed content contains a ``scenarios`` key. Raises loudly if + none is found or if multiple ambiguous files match. + + Args: + benchmark_dir: Directory written by ``gaia eval benchmark --output-dir``. + + Returns: + Path to the benchmark scorecard JSON file. + + Raises: + FileNotFoundError: If ``benchmark_dir`` does not exist. + ValueError: If no suitable scorecard JSON is found in the directory. + """ + if not benchmark_dir.is_dir(): + raise FileNotFoundError( + f"Benchmark directory not found: {benchmark_dir}\n" + f"Run 'gaia eval benchmark --output-dir ' first." 
+ ) + + # Try the canonical name first + canonical = benchmark_dir / _SCORECARD_FILENAME + if canonical.exists(): + return canonical + + # Scan for any JSON containing a 'scenarios' key + matches: list[Path] = [] + for p in sorted(benchmark_dir.glob("*.json")): + try: + data = json.loads(p.read_text(encoding="utf-8")) + if isinstance(data, dict) and "scenarios" in data: + matches.append(p) + except (json.JSONDecodeError, OSError): + continue + + if not matches: + raise ValueError( + f"No benchmark scorecard JSON found in {benchmark_dir}.\n" + f"Expected '{_SCORECARD_FILENAME}' (written by 'gaia eval benchmark'), " + f"or any JSON file with a 'scenarios' key.\n" + f"Run 'gaia eval benchmark --output-dir {benchmark_dir}' to generate it." + ) + + if len(matches) > 1: + paths = ", ".join(str(p) for p in matches) + raise ValueError( + f"Ambiguous benchmark scorecard: multiple JSON files with a 'scenarios' " + f"key found in {benchmark_dir}: {paths}.\n" + f"Remove all but '{_SCORECARD_FILENAME}' and retry." + ) + + return matches[0] + + +def _is_judged(scenario: dict) -> bool: + """Return True if a scenario has a valid category_accuracy in [0,1].""" + quality = scenario.get("quality") + if not isinstance(quality, dict): + return False + acc = quality.get("category_accuracy") + if acc is None: + return False + try: + f = float(acc) + except (TypeError, ValueError): + return False + return 0.0 <= f <= 1.0 and f == f # also rejects NaN via f==f + + +def build_payload(benchmark_dir: Path, ground_truth_path: Path): + """Build a :class:`~gaia.eval.release_scorecard.ResultPayload` from benchmark output. + + A scenario is **judged** iff it has a ``quality`` dict AND + ``quality.category_accuracy`` is a finite float in [0, 1]. Non-judged + scenarios (missing ``quality`` or invalid accuracy) are skipped. + + Args: + benchmark_dir: Directory written by ``gaia eval benchmark --output-dir``. + ground_truth_path: Path to ``ground_truth.json`` (the labeled corpus). 
+ + Returns: + Populated :class:`~gaia.eval.release_scorecard.ResultPayload`. + + Raises: + ValueError: If zero scenarios are judged (likely missing ``--ground-truth`` + or a benchmark run that produced no quality metrics). + FileNotFoundError: If required files are not found. + """ + # Import here (not at module top) so tests that import build_payload before + # gaia is installed in the test environment fail at call time, not import time. + from gaia.eval.release_scorecard import ResultPayload, compute_aggregate + + scorecard_path = _find_benchmark_scorecard(benchmark_dir) + data = json.loads(scorecard_path.read_text(encoding="utf-8")) + scenarios = data.get("scenarios", []) + + # Separate judged from non-judged scenarios + judged = [s for s in scenarios if _is_judged(s)] + + if not judged: + raise ValueError( + f"Zero judged scenarios in {scorecard_path}.\n" + f"Possible causes: benchmark ran without '--ground-truth', " + f"or no scenario produced a category_accuracy metric.\n" + f"Benchmark dir: {benchmark_dir}" + ) + + # Aggregate metrics from judged scenarios + category_accuracy = sum( + s["quality"]["category_accuracy"] for s in judged + ) / len(judged) + + test_cases_run = sum(int(s.get("total_emails", 0)) for s in judged) + + # Dataset size = labeled entries in ground_truth.json (excluding _meta key) + if not ground_truth_path.exists(): + raise FileNotFoundError( + f"Ground truth not found: {ground_truth_path}\n" + f"Pass --ground-truth pointing to the labeled corpus JSON." 
+ ) + ground_truth = json.loads(ground_truth_path.read_text(encoding="utf-8")) + dataset_size = len(ground_truth) - (1 if "_meta" in ground_truth else 0) + + # Read version from gaia-agent.yaml + agent_yaml_path = _EMAIL_ROOT / "gaia-agent.yaml" + try: + import yaml # noqa: PLC0415 (local import; PyYAML already a dep) + + agent_data = yaml.safe_load(agent_yaml_path.read_text(encoding="utf-8")) or {} + except Exception as exc: + raise ValueError( + f"Cannot read agent version from {agent_yaml_path}: {exc}" + ) from exc + + version = str(agent_data.get("version", "")) + if not version: + raise ValueError( + f"No 'version:' field found in {agent_yaml_path}." + ) + + metrics = [ + {"name": "category_accuracy", "value": float(category_accuracy), "weight": 1.0} + ] + _components, agg_value = compute_aggregate(metrics) # noqa: F841 + + import datetime + + return ResultPayload( + agent_name="Email Triage", + agent_version=version, + dataset_reference="tests/fixtures/email/ground_truth.json", + dataset_description=( + "Synthetic email corpus for GAIA email-triage evaluation " + "(FakeGmailBackend, 5-category classification)" + ), + dataset_size=dataset_size, + methodology=( + "gaia eval benchmark — category classification accuracy " + "(case-insensitive exact match) over a synthetic labeled corpus " + "via FakeGmailBackend; no LLM judge required" + ), + config={ + "harness": "gaia eval benchmark", + "model": data.get("model", agent_data.get("models", [None])[0]), + "corpus": "tests/fixtures/email/synthetic_inbox.mbox", + "ground_truth": str(ground_truth_path), + "limit": data.get("limit"), + }, + test_cases_run=test_cases_run, + metrics=metrics, + aggregate_name="weighted_accuracy", + generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), + inherited_from=None, + ) + + +def main(argv=None) -> int: + """Generate and write the email-agent scorecard.""" + parser = argparse.ArgumentParser( + description="Generate a release scorecard for the email-triage agent.", 
+ prog="gen_scorecard.py", + ) + parser.add_argument( + "--benchmark-dir", + required=True, + help=( + "Directory written by 'gaia eval benchmark --output-dir ' " + "(must contain scorecard.json)." + ), + ) + parser.add_argument( + "--ground-truth", + default=str(_DEFAULT_GT), + help=( + f"Path to ground_truth.json (default: {_DEFAULT_GT.relative_to(_REPO_ROOT)})" + ), + ) + parser.add_argument( + "--output-dir", + default=None, + help=( + "Override the scorecard output directory " + "(default: hub/agents/npm/agent-email/scorecards/)." + ), + ) + + args = parser.parse_args(argv) + + benchmark_dir = Path(args.benchmark_dir).resolve() + gt_path = Path(args.ground_truth).resolve() + + try: + payload = build_payload(benchmark_dir, gt_path) + except (ValueError, FileNotFoundError) as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 1 + + from gaia.eval.release_scorecard import write_scorecard + + if args.output_dir: + scorecards_dir = Path(args.output_dir) + else: + scorecards_dir = _NPM_ROOT / "scorecards" + + scorecards_dir.mkdir(parents=True, exist_ok=True) + out_path = scorecards_dir / f"{payload.agent_version}.md" + write_scorecard(payload, out_path) + + print( + f"Scorecard written: {out_path}\n" + f" Version: {payload.agent_version}\n" + f" Aggregate: {payload.metrics[0]['value']:.4f} category_accuracy " + f"(over {len([s for s in [payload] if True])} — {payload.test_cases_run} emails judged)\n" + f" Dataset size: {payload.dataset_size} labeled examples" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/unit/eval/test_release_scorecard.py b/tests/unit/eval/test_release_scorecard.py index 7e9770e1a..d36e203eb 100644 --- a/tests/unit/eval/test_release_scorecard.py +++ b/tests/unit/eval/test_release_scorecard.py @@ -194,15 +194,21 @@ def test_test_cases_run_and_dataset_size_both_present(self): class TestLooseCoupling: def test_no_benchmark_or_agent_modules_imported(self): - # Import is already done at top of file; check 
sys.modules - contaminated = [ - m - for m in sys.modules - if "benchmark" in m or "gaia_agent_email" in m - ] - assert not contaminated, ( - f"release_scorecard import pulled in harness/agent modules: {contaminated}" + # Clean interpreter: importing release_scorecard must not pull in the + # eval harness or any agent package. Scanning the test process's own + # sys.modules gives false positives (e.g. the pytest_benchmark plugin), + # so check in a fresh subprocess instead. + import subprocess + import sys as _sys + + code = ( + "import sys, gaia.eval.release_scorecard; " + "bad=[m for m in sys.modules if 'gaia.eval.benchmark' in m " + "or 'gaia.eval.quality_metrics' in m or 'gaia_agent_email' in m]; " + "assert not bad, bad" ) + r = subprocess.run([_sys.executable, "-c", code], capture_output=True, text=True) + assert r.returncode == 0, r.stderr # --------------------------------------------------------------------------- From a1dce4ffca1f239759aad3b01e4539d61a6bc859 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Thu, 25 Jun 2026 18:32:23 -0400 Subject: [PATCH 04/18] feat(eval): docs, hello-world scorecard, CI gate, npm wiring (increments 5-6) - docs/reference/eval-scorecard.mdx: schema, storage, formula, versioning policy, gate - docs/docs.json: nav entry in Evaluation Framework group - hub/agents/python/hello-world/scorecards/0.1.0.md: generator-produced generalization proof - hub/agents/npm/agent-email/scorecards/.gitkeep: placeholder for real scorecard - hub/agents/npm/agent-email/README.md: eval scorecard link to ./scorecards/0.2.4.md - hub/agents/npm/agent-email/package.json: add scorecards/ to files array - .github/workflows/release_agent_email.yml: scorecard-gate job + publish.needs update - lint fixes: remove unused imports from test files; black/isort/pylint/flake8 clean - 44/44 target tests pass; lint: ALL QUALITY CHECKS PASSED --- .github/workflows/release_agent_email.yml | 19 +- docs/docs.json | 1 + docs/reference/eval-scorecard.mdx | 240 
++++++++++++++++++ hub/agents/npm/agent-email/README.md | 2 + hub/agents/npm/agent-email/package.json | 3 +- .../npm/agent-email/scorecards/.gitkeep | 0 .../python/email/packaging/gen_scorecard.py | 6 +- .../python/hello-world/scorecards/0.1.0.md | 62 +++++ src/gaia/eval/release_scorecard.py | 12 +- src/gaia/eval/scorecard_gate.py | 16 +- tests/unit/eval/test_release_scorecard.py | 55 ++-- tests/unit/eval/test_scorecard_gate.py | 23 +- 12 files changed, 385 insertions(+), 54 deletions(-) create mode 100644 docs/reference/eval-scorecard.mdx create mode 100644 hub/agents/npm/agent-email/scorecards/.gitkeep create mode 100644 hub/agents/python/hello-world/scorecards/0.1.0.md diff --git a/.github/workflows/release_agent_email.yml b/.github/workflows/release_agent_email.yml index ea183775f..81c674384 100644 --- a/.github/workflows/release_agent_email.yml +++ b/.github/workflows/release_agent_email.yml @@ -266,11 +266,28 @@ jobs: echo "ok=false" >> "$GITHUB_OUTPUT" fi + # ── Stage 1b: scorecard presence + regression gate ───────────────── + scorecard-gate: + name: Scorecard gate + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install core + PyYAML + run: pip install -e . 
pyyaml + - name: Run scorecard gate + run: | + python -m gaia.eval.scorecard_gate \ + --scorecards-dir hub/agents/npm/agent-email/scorecards \ + --manifest hub/agents/python/email/gaia-agent.yaml + # ── Stage 2: publish to the hub + npm (single atomic step) ───────── publish: name: Publish to Hub + npm runs-on: ubuntu-latest - needs: [build, verify-darwin-x64-compat] + needs: [build, verify-darwin-x64-compat, scorecard-gate] # Manual approval gate: the `agent-publish` environment is configured (repo # Settings → Environments) with required reviewers, so this job pauses until a # maintainer approves — the human backstop for an accidental/tampered release diff --git a/docs/docs.json b/docs/docs.json index 5397cf0a5..ba1b26d90 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -356,6 +356,7 @@ "group": "Evaluation Framework", "pages": [ "reference/eval", + "reference/eval-scorecard", "eval" ] }, diff --git a/docs/reference/eval-scorecard.mdx b/docs/reference/eval-scorecard.mdx new file mode 100644 index 000000000..f3cbc5093 --- /dev/null +++ b/docs/reference/eval-scorecard.mdx @@ -0,0 +1,240 @@ +--- +title: "Release Eval Scorecard" +description: "Per-agent, per-version eval scorecard: schema, storage convention, aggregate formula, versioning policy, and release gate." +icon: "chart-bar" +--- + + + **Source Code:** + [`src/gaia/eval/release_scorecard.py`](https://github.com/amd/gaia/tree/main/src/gaia/eval/release_scorecard.py) (core generator) · + [`src/gaia/eval/scorecard_gate.py`](https://github.com/amd/gaia/tree/main/src/gaia/eval/scorecard_gate.py) (release gate) + + **Distinct from** [`src/gaia/eval/scorecard.py`](https://github.com/amd/gaia/tree/main/src/gaia/eval/scorecard.py) — that file is the per-run scenario PASS/FAIL aggregator used internally by `gaia eval agent`. This document describes the outward-facing *release artifact*. 
+ + +## Overview + +Each published hub agent ships a **release scorecard** — a versioned Markdown file that records: + +- The **eval recipe**: dataset reference, methodology, configuration, and metric definitions. +- The **measured results**: per-metric values, number of test cases actually run, and dataset size. +- A single **named aggregate score**: a deterministic, recomputable percentage so a reviewer can verify the number without re-running the eval. + +Scorecards are committed alongside the agent's README and linked from it. A standalone **release gate** (`scorecard_gate.py`) blocks packaging when the scorecard is missing or when its aggregate score strictly regresses below the prior version's. + +## File format + +Scorecard files are **Markdown with YAML front matter** (`.md`). The front matter holds all machine-readable fields; the body is a human-readable summary with a worked recomputation. + +``` +--- +schema_version: 1 +agent: + name: Email Triage + version: 0.2.4 +recipe: + dataset: + reference: tests/fixtures/email/ground_truth.json + description: Synthetic email corpus (FakeGmailBackend, 5-category classification) + size: 220 + methodology: gaia eval benchmark — category_accuracy (case-insensitive exact match) + config: + harness: gaia eval benchmark + model: Gemma-4-E4B-it-GGUF + limit: 25 +results: + test_cases_run: 24 + metrics: + - name: category_accuracy + value: 0.4584 + weight: 1.0 +aggregate: + name: weighted_accuracy + formula: "round(100 * sum(weight_i * value_i) / sum(weight_i), 2)" + components: + - metric: category_accuracy + value: 0.4584 + weight: 1.0 + value: 45.84 +generated_at: "2026-06-25T10:00:00+00:00" +inherited_from: null +--- + +# Email Triage — Eval Scorecard v0.2.4 + +**Aggregate score: 45.84** (out of 100) +... 
+``` + +### Required fields + +A scorecard missing any of these is **invalid** and will be rejected by the release gate: + +| Field | Description | +|-------|-------------| +| `schema_version` | Always `1` for this schema version | +| `agent.name` | Human-readable agent name | +| `agent.version` | Semver version string (e.g. `0.2.4`) | +| `recipe.dataset.reference` | Dataset path or URL | +| `recipe.dataset.description` | Short description | +| `recipe.dataset.size` | Total labeled examples available | +| `recipe.methodology` | How the eval was run | +| `recipe.config` | Harness config (model, limit, corpus, …) | +| `results.test_cases_run` | Subset of examples actually executed this run | +| `results.metrics` | List of `{name, value, weight}` dicts | +| `aggregate.name` | Name of the aggregate score | +| `aggregate.formula` | Human-readable formula string | +| `aggregate.components` | List of `{metric, value, weight}` dicts | +| `aggregate.value` | The computed aggregate float | + +### Two counts — defined distinctly + +`recipe.dataset.size` and `results.test_cases_run` are intentionally **separate fields**: + +- **`recipe.dataset.size`** — total labeled examples available in the dataset (fixed for a given dataset version). +- **`results.test_cases_run`** — the subset actually executed in this run (may be limited by `--limit`). Must be ≤ `recipe.dataset.size`. + +They may be numerically equal (when the full dataset is run), but they represent different things. + + + **Comparability depends on a consistent `--limit`.** Future regression checks compare aggregate scores. If one run uses `--limit 12` and the next uses `--limit 100`, the scores may differ for reasons unrelated to model quality. Record the exact `limit` in `recipe.config` and keep it consistent across versions. 
+ + +## Aggregate score formula + +``` +aggregate.value = round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2) +``` + +where each `valueᵢ` is a metric value in [0, 1] and each `weightᵢ` defaults to 1.0. + +The result is a **percentage in [0, 100]**. For a single metric with weight 1.0: + +``` +round(100 × 0.4584, 2) = 45.84 +``` + +A reader can reproduce this value from `aggregate.components` alone — no eval-harness access needed. +The `aggregate.formula` field in the front matter states the formula in human-readable form so it is self-documenting. + +## Storage convention + +Scorecards live in a `scorecards/` subdirectory beside the agent's canonical README: + +``` +<doc-root>/ + README.md ← canonical README (links to scorecard) + scorecards/ + 0.1.0.md + 0.2.3.md + 0.2.4.md ← latest +``` + +The `doc-root` is the location of the agent's canonical README: + +| Agent | doc-root | +|-------|----------| +| Email Triage (`@amd-gaia/agent-email`) | `hub/agents/npm/agent-email/` | +| Hello World | `hub/agents/python/hello-world/` | + +The relative link `./scorecards/<version>.md` resolves both in-repo and when the directory is published as an npm package. + +## Versioning policy + +### Patch releases — carry forward + +For a **patch release** (same `major.minor`, `patch` incremented), the prior version's results are carried forward verbatim using `carry_forward()`: + +```python +from gaia.eval.release_scorecard import carry_forward, write_scorecard +from pathlib import Path + +new_payload = carry_forward( + prev_path=Path("scorecards/0.2.3.md"), + new_version="0.2.4", +) +# new_payload.inherited_from == "0.2.3" +write_scorecard(new_payload, Path("scorecards/0.2.4.md")) +``` + +The resulting scorecard has `inherited_from: "0.2.3"` and identical `results` and `aggregate` fields. The aggregate score is unchanged, so the release gate's equal-score case passes. 
+ +### Minor / major releases — re-run required + +For a **minor or major bump**, `carry_forward()` raises `ValueError` with a "re-run" message. Run the eval fresh and generate a new scorecard: + +```bash +gaia eval benchmark \ + --model Gemma-4-E4B-it-GGUF \ + --mbox-path tests/fixtures/email/synthetic_inbox.mbox \ + --ground-truth tests/fixtures/email/ground_truth.json \ + --limit 25 \ + --output-dir /tmp/email-eval + +python hub/agents/python/email/packaging/gen_scorecard.py \ + --benchmark-dir /tmp/email-eval +``` + +## Release gate + +`scorecard_gate.py` is a standalone script that exits non-zero on failure: + +```bash +python -m gaia.eval.scorecard_gate \ + --scorecards-dir hub/agents/npm/agent-email/scorecards \ + --manifest hub/agents/python/email/gaia-agent.yaml +``` + +Or with an explicit version: + +```bash +python -m gaia.eval.scorecard_gate \ + --scorecards-dir hub/agents/npm/agent-email/scorecards \ + --version 0.2.4 +``` + +### Gate logic + +1. **Presence check**: `<scorecards-dir>/<version>.md` must exist and be valid. → exit 1 if not. +2. **Locate prior**: find the greatest semver strictly below `<version>` in `<scorecards-dir>`. If none → **first adoption**, exit 0 (presence-only pass). +3. **Regression check**: if `candidate.aggregate.value < prior.aggregate.value` (strict) → exit 1. +4. Equal or greater → exit 0. + +### Exit codes + +| Case | Exit code | +|------|-----------| +| Missing or invalid candidate scorecard | `1` | +| Strict regression vs prior version | `1` | +| No prior version (first adoption) | `0` | +| Equal score (patch carry-forward) | `0` | +| Score improved | `0` | + +### `--allow-regression` + +When a regression is intentional (e.g. a dataset correction or methodology change), use `--allow-regression`. The gate prints a GHA `::warning::` annotation naming both versions and scores, then exits 0: + +``` +::warning::Scorecard regression allowed by --allow-regression: 0.2.3=65.0 → 0.2.4=45.84 +WARNING: Regression override active. 
Prior version 0.2.3 scored 65.0; candidate 0.2.4 scored 45.84. ... +``` + +### How the gate resolves "previous version" + +The gate calls `latest_version_below(scorecards_dir, version)`, which: + +1. Lists all `*.md` files in `scorecards_dir`. +2. Keeps only those whose **stem** matches the anchored regex `^\d+\.\d+\.\d+$` (skips `README.md`, `.gitkeep`, prerelease tags, etc.). +3. Compares versions as **integer tuples** `(major, minor, patch)` — so `0.10.0 > 0.2.9` correctly. +4. Returns the greatest version strictly below the candidate, or `None`. + +The version is read from `gaia-agent.yaml` (via `--manifest`) or passed explicitly (via `--version`). + +## Adding a scorecard for a new agent + +1. Create the `scorecards/` directory beside the agent's canonical README. +2. Write a `packaging/gen_scorecard.py` adapter (see `hub/agents/python/email/packaging/gen_scorecard.py` for a reference). +3. Run the eval and call the adapter → commit the resulting `.md`. +4. Link the scorecard from the README: `./scorecards/.md`. +5. Add `scorecards/` to the npm `package.json` `files` array (if published on npm). +6. Wire `scorecard_gate` into the release workflow (see `release_agent_email.yml` for the job topology). diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md index fa7ef97e0..729fe99a6 100644 --- a/hub/agents/npm/agent-email/README.md +++ b/hub/agents/npm/agent-email/README.md @@ -2,6 +2,8 @@ [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24** +**Eval scorecard:** see [`./scorecards/0.2.4.md`](./scorecards/0.2.4.md) for the per-version accuracy metrics, dataset details, and aggregate score for v0.2.4. + Embed the **GAIA email agent** in your JS/TS app. 
It triages, organizes, replies to, and schedules from Gmail and Outlook — with every email body analyzed **locally on AMD Ryzen AI** via Lemonade. No message content is sent to a cloud diff --git a/hub/agents/npm/agent-email/package.json b/hub/agents/npm/agent-email/package.json index fc3ad9be5..426d163e8 100644 --- a/hub/agents/npm/agent-email/package.json +++ b/hub/agents/npm/agent-email/package.json @@ -48,7 +48,8 @@ "CHANGELOG.md", "SPEC.md", "SKILL.md", - "LICENSE" + "LICENSE", + "scorecards/" ], "engines": { "node": ">=18" diff --git a/hub/agents/npm/agent-email/scorecards/.gitkeep b/hub/agents/npm/agent-email/scorecards/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py index 2961deeb7..52c58d143 100644 --- a/hub/agents/python/email/packaging/gen_scorecard.py +++ b/hub/agents/python/email/packaging/gen_scorecard.py @@ -109,10 +109,12 @@ def _is_judged(scenario: dict) -> bool: if acc is None: return False try: + import math + f = float(acc) except (TypeError, ValueError): return False - return 0.0 <= f <= 1.0 and f == f # also rejects NaN via f==f + return 0.0 <= f <= 1.0 and math.isfinite(f) def build_payload(benchmark_dir: Path, ground_truth_path: Path): @@ -189,7 +191,7 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path): metrics = [ {"name": "category_accuracy", "value": float(category_accuracy), "weight": 1.0} ] - _components, agg_value = compute_aggregate(metrics) # noqa: F841 + compute_aggregate(metrics) # validate metrics; aggregate embedded in render_scorecard import datetime diff --git a/hub/agents/python/hello-world/scorecards/0.1.0.md b/hub/agents/python/hello-world/scorecards/0.1.0.md new file mode 100644 index 000000000..fc6121f2e --- /dev/null +++ b/hub/agents/python/hello-world/scorecards/0.1.0.md @@ -0,0 +1,62 @@ +--- +schema_version: 1 +agent: + name: Hello World + version: 0.1.0 +recipe: + dataset: + 
reference: hub/agents/python/hello-world/tests + description: Illustrative conversational response dataset (reference agent) + size: 10 + methodology: Illustrative metric — reference agent for scorecard format generalization + config: + harness: gaia eval agent + model: Gemma-4-E4B-it-GGUF + limit: 10 +results: + test_cases_run: 10 + metrics: + - name: response_quality + value: 0.9 + weight: 1.0 +aggregate: + name: weighted_accuracy + formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2) + components: + - metric: response_quality + value: 0.9 + weight: 1.0 + value: 90.0 +generated_at: '2026-06-25T12:00:00+00:00' +inherited_from: null +--- +# Hello World — Eval Scorecard v0.1.0 + +**Aggregate score: 90.0** (out of 100) + +## Recipe + +| Field | Value | +|-------|-------| +| Dataset | [hub/agents/python/hello-world/tests](hub/agents/python/hello-world/tests) | +| Description | Illustrative conversational response dataset (reference agent) | +| Dataset size | 10 labeled examples | +| Test cases run | 10 | +| Methodology | Illustrative metric — reference agent for scorecard format generalization | + +## Metrics + + - **response_quality**: 0.9000 × 1.0 + +## Aggregate score recomputation + +Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)` + +Worked example: + +``` +round(100 × ((0.9000 × 1.0)) / 1.0, 2) = 90.0 +``` + +A reader can reproduce this value from the `aggregate.components` in the front +matter alone — no eval-harness access needed. 
diff --git a/src/gaia/eval/release_scorecard.py b/src/gaia/eval/release_scorecard.py index 49d81b71b..0d1e4a47f 100644 --- a/src/gaia/eval/release_scorecard.py +++ b/src/gaia/eval/release_scorecard.py @@ -25,7 +25,7 @@ from __future__ import annotations import re -from dataclasses import dataclass, field +from dataclasses import dataclass from pathlib import Path from typing import Optional @@ -170,7 +170,9 @@ def render_scorecard(payload: ResultPayload) -> str: "inherited_from": payload.inherited_from, } - fm_text = yaml.dump(front, default_flow_style=False, sort_keys=False, allow_unicode=True) + fm_text = yaml.dump( + front, default_flow_style=False, sort_keys=False, allow_unicode=True + ) # Human-readable body with worked recompute metric_lines = "\n".join( @@ -178,9 +180,7 @@ def render_scorecard(payload: ResultPayload) -> str: for c in components ) total_w = sum(c["weight"] for c in components) - worked = " + ".join( - f"({c['value']:.4f} × {c['weight']:.1f})" for c in components - ) + worked = " + ".join(f"({c['value']:.4f} × {c['weight']:.1f})" for c in components) body = f"""# {payload.agent_name} — Eval Scorecard v{payload.agent_version} @@ -253,7 +253,7 @@ def parse_scorecard(source) -> dict: # Split on first pair of '---' delimiters if not text.startswith("---"): - raise ValueError(f"Scorecard does not start with '---' front matter") + raise ValueError("Scorecard does not start with '---' front matter") # Find the closing '---' (first occurrence after the opening line) rest = text[3:] # strip opening --- diff --git a/src/gaia/eval/scorecard_gate.py b/src/gaia/eval/scorecard_gate.py index a33cecfb0..49c292561 100644 --- a/src/gaia/eval/scorecard_gate.py +++ b/src/gaia/eval/scorecard_gate.py @@ -57,22 +57,16 @@ def _read_version_from_manifest(manifest_path: Path) -> str: try: text = manifest_path.read_text(encoding="utf-8") except OSError as exc: - raise ValueError( - f"Cannot read manifest {manifest_path}: {exc}" - ) from exc + raise ValueError(f"Cannot 
read manifest {manifest_path}: {exc}") from exc try: data = yaml.safe_load(text) or {} except yaml.YAMLError as exc: - raise ValueError( - f"Invalid YAML in manifest {manifest_path}: {exc}" - ) from exc + raise ValueError(f"Invalid YAML in manifest {manifest_path}: {exc}") from exc version = data.get("version") if not version: - raise ValueError( - f"Manifest {manifest_path} has no 'version:' field." - ) + raise ValueError(f"Manifest {manifest_path} has no 'version:' field.") return str(version) @@ -230,9 +224,7 @@ def main(argv=None) -> int: return 1 if prev_score is None: - print( - f"ERROR: Prior scorecard {prev_path} has no 'aggregate.value' field." - ) + print(f"ERROR: Prior scorecard {prev_path} has no 'aggregate.value' field.") return 1 if float(candidate_score) < float(prev_score): diff --git a/tests/unit/eval/test_release_scorecard.py b/tests/unit/eval/test_release_scorecard.py index d36e203eb..ccc6d2abe 100644 --- a/tests/unit/eval/test_release_scorecard.py +++ b/tests/unit/eval/test_release_scorecard.py @@ -5,7 +5,6 @@ import datetime import importlib.util import json -import sys from pathlib import Path import pytest @@ -78,9 +77,7 @@ def test_missing_required_fields_each_flagged(self): def test_required_top_level_keys_include_expected_sections(self): # schema_version, agent, recipe, results, aggregate must be required for section in ("schema_version", "agent", "recipe", "results", "aggregate"): - assert section in REQUIRED_FIELDS, ( - f"'{section}' must be in REQUIRED_FIELDS" - ) + assert section in REQUIRED_FIELDS, f"'{section}' must be in REQUIRED_FIELDS" # --------------------------------------------------------------------------- @@ -145,7 +142,9 @@ def test_rendered_text_contains_closing_dashes(self): lines = text.splitlines() # Find second occurrence of '---' closing = [i for i, l in enumerate(lines) if l == "---" and i > 0] - assert closing, "Rendered scorecard must contain a closing '---' after the first" + assert ( + closing + ), "Rendered 
scorecard must contain a closing '---' after the first" def test_body_after_front_matter_is_non_empty(self): payload = _make_payload() @@ -175,16 +174,14 @@ def test_test_cases_run_and_dataset_size_both_present(self): text = render_scorecard(payload) parsed = parse_scorecard(text) assert "results" in parsed, "'results' section missing from parsed scorecard" - assert "test_cases_run" in parsed["results"], ( - "'results.test_cases_run' must be a distinct field" - ) + assert ( + "test_cases_run" in parsed["results"] + ), "'results.test_cases_run' must be a distinct field" assert "recipe" in parsed, "'recipe' section missing from parsed scorecard" - assert "dataset" in parsed["recipe"], ( - "'recipe.dataset' sub-section missing" - ) - assert "size" in parsed["recipe"]["dataset"], ( - "'recipe.dataset.size' must be a distinct field" - ) + assert "dataset" in parsed["recipe"], "'recipe.dataset' sub-section missing" + assert ( + "size" in parsed["recipe"]["dataset"] + ), "'recipe.dataset.size' must be a distinct field" # --------------------------------------------------------------------------- @@ -207,7 +204,9 @@ def test_no_benchmark_or_agent_modules_imported(self): "or 'gaia.eval.quality_metrics' in m or 'gaia_agent_email' in m]; " "assert not bad, bad" ) - r = subprocess.run([_sys.executable, "-c", code], capture_output=True, text=True) + r = subprocess.run( + [_sys.executable, "-c", code], capture_output=True, text=True + ) assert r.returncode == 0, r.stderr @@ -306,7 +305,13 @@ def test_rendered_parsed_inherited_from_null_or_absent(self): class TestLatestVersionBelow: def _seed_dir(self, tmp_path): - for name in ("0.1.0.md", "0.2.3.md", "0.10.0.md", "README.md", "not-a-version.md"): + for name in ( + "0.1.0.md", + "0.2.3.md", + "0.10.0.md", + "README.md", + "not-a-version.md", + ): (tmp_path / name).write_text("# placeholder") return tmp_path @@ -376,9 +381,9 @@ def test_build_payload_mean_of_judged_scenarios(self, tmp_path): payload = 
mod.build_payload(benchmark_dir, gt_path) expected_mean = round((0.4167 + 0.5000) / 2, 10) - assert payload.metrics[0]["value"] == pytest.approx(expected_mean), ( - f"Expected metric value {expected_mean}, got {payload.metrics[0]['value']}" - ) + assert payload.metrics[0]["value"] == pytest.approx( + expected_mean + ), f"Expected metric value {expected_mean}, got {payload.metrics[0]['value']}" def test_build_payload_test_cases_run(self, tmp_path): mod = self._load_gen_scorecard() @@ -429,8 +434,16 @@ def test_all_no_quality_raises(self, tmp_path): empty_scorecard = { "run_id": "no-quality", "scenarios": [ - {"category": "Gemma-4-E4B-it-GGUF", "status": "PASS", "total_emails": 0}, - {"category": "Gemma-4-E4B-it-GGUF", "status": "PASS", "total_emails": 0}, + { + "category": "Gemma-4-E4B-it-GGUF", + "status": "PASS", + "total_emails": 0, + }, + { + "category": "Gemma-4-E4B-it-GGUF", + "status": "PASS", + "total_emails": 0, + }, ], } (benchmark_dir / "email_benchmark_scorecard.json").write_text( diff --git a/tests/unit/eval/test_scorecard_gate.py b/tests/unit/eval/test_scorecard_gate.py index dbeaba0b7..28ab269d9 100644 --- a/tests/unit/eval/test_scorecard_gate.py +++ b/tests/unit/eval/test_scorecard_gate.py @@ -5,10 +5,13 @@ import datetime from pathlib import Path -import pytest import yaml -from gaia.eval.release_scorecard import ResultPayload, compute_aggregate, render_scorecard +from gaia.eval.release_scorecard import ( + ResultPayload, + compute_aggregate, + render_scorecard, +) from gaia.eval.scorecard_gate import main # --------------------------------------------------------------------------- @@ -212,23 +215,21 @@ def test_publish_job_needs_scorecard_gate(self): / "workflows" / "release_agent_email.yml" ) - assert workflow_path.exists(), ( - f"Workflow file not found: {workflow_path}" - ) + assert workflow_path.exists(), f"Workflow file not found: {workflow_path}" content = workflow_path.read_text() parsed = yaml.safe_load(content) assert "jobs" in parsed, 
"Workflow has no 'jobs' key" - assert "publish" in parsed["jobs"], ( - "Workflow has no 'publish' job — add it or check the job name" - ) + assert ( + "publish" in parsed["jobs"] + ), "Workflow has no 'publish' job — add it or check the job name" needs = parsed["jobs"]["publish"].get("needs", []) # needs can be a string or a list if isinstance(needs, str): needs = [needs] - assert "scorecard-gate" in needs, ( - f"'publish' job must list 'scorecard-gate' in its needs; got: {needs}" - ) + assert ( + "scorecard-gate" in needs + ), f"'publish' job must list 'scorecard-gate' in its needs; got: {needs}" # --------------------------------------------------------------------------- From 78e45bfd57aafc08c3117d128ff30ae5c5cd5605 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Thu, 25 Jun 2026 18:43:32 -0400 Subject: [PATCH 05/18] feat(eval): deepen validate_scorecard with nested-field checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AC requires 'missing ANY required field ⇒ invalid', but the validator only checked 5 top-level keys. Add nested checks for agent.{name,version}, recipe.dataset.{reference,size}, recipe.{methodology,config}, results.{test_cases_run,metrics}, aggregate.{name,formula,value}, with non-dict-parent guards and a non-empty metrics-list requirement. Add TestSchemaValidator cases for missing nested fields, empty metrics, and non-dict sections. Also baseline sys.modules before import in the loose-coupling test so editable-install path finders don't false-positive. 
--- src/gaia/eval/release_scorecard.py | 64 +++++++++++++++++++++++ tests/unit/eval/test_release_scorecard.py | 56 +++++++++++++++++--- 2 files changed, 114 insertions(+), 6 deletions(-) diff --git a/src/gaia/eval/release_scorecard.py b/src/gaia/eval/release_scorecard.py index 0d1e4a47f..d50449f23 100644 --- a/src/gaia/eval/release_scorecard.py +++ b/src/gaia/eval/release_scorecard.py @@ -283,10 +283,74 @@ def validate_scorecard(parsed: dict) -> list: """ errors: list[str] = [] + # Top-level required keys for key in REQUIRED_FIELDS: if key not in parsed: errors.append(f"Missing required field: '{key}'") + def _section(name: str): + """Return the section dict if present and a dict, else record an error.""" + value = parsed.get(name) + if name in parsed and not isinstance(value, dict): + errors.append( + f"Field '{name}' must be a mapping, got {type(value).__name__}" + ) + return None + return value if isinstance(value, dict) else None + + # agent.{name, version} + agent = _section("agent") + if agent is not None: + for sub in ("name", "version"): + if sub not in agent: + errors.append(f"Missing required field: 'agent.{sub}'") + + # recipe.{dataset.{reference, size}, methodology, config} + recipe = _section("recipe") + if recipe is not None: + for sub in ("methodology", "config"): + if sub not in recipe: + errors.append(f"Missing required field: 'recipe.{sub}'") + dataset = recipe.get("dataset") + if "dataset" not in recipe: + errors.append("Missing required field: 'recipe.dataset'") + elif not isinstance(dataset, dict): + errors.append( + f"Field 'recipe.dataset' must be a mapping, got {type(dataset).__name__}" + ) + else: + for sub in ("reference", "size"): + if sub not in dataset: + errors.append(f"Missing required field: 'recipe.dataset.{sub}'") + + # results.{test_cases_run, metrics} + results = _section("results") + if results is not None: + if "test_cases_run" not in results: + errors.append("Missing required field: 'results.test_cases_run'") + metrics = 
results.get("metrics") + if "metrics" not in results: + errors.append("Missing required field: 'results.metrics'") + elif not isinstance(metrics, list) or not metrics: + errors.append("Field 'results.metrics' must be a non-empty list") + else: + for i, metric in enumerate(metrics): + if not isinstance(metric, dict): + errors.append(f"Field 'results.metrics[{i}]' must be a mapping") + continue + for sub in ("name", "value"): + if sub not in metric: + errors.append( + f"Missing required field: 'results.metrics[{i}].{sub}'" + ) + + # aggregate.{name, formula, value} + aggregate = _section("aggregate") + if aggregate is not None: + for sub in ("name", "formula", "value"): + if sub not in aggregate: + errors.append(f"Missing required field: 'aggregate.{sub}'") + return errors diff --git a/tests/unit/eval/test_release_scorecard.py b/tests/unit/eval/test_release_scorecard.py index ccc6d2abe..c542ae13e 100644 --- a/tests/unit/eval/test_release_scorecard.py +++ b/tests/unit/eval/test_release_scorecard.py @@ -79,6 +79,46 @@ def test_required_top_level_keys_include_expected_sections(self): for section in ("schema_version", "agent", "recipe", "results", "aggregate"): assert section in REQUIRED_FIELDS, f"'{section}' must be in REQUIRED_FIELDS" + def test_missing_nested_aggregate_value_flagged(self): + payload = _make_payload() + parsed = parse_scorecard(render_scorecard(payload)) + # Complete card stays valid + assert validate_scorecard(parsed) == [] + # Removing a nested required field flags it + del parsed["aggregate"]["value"] + errors = validate_scorecard(parsed) + assert errors, "Expected missing 'aggregate.value' to be flagged" + assert any("aggregate.value" in e for e in errors), errors + + def test_missing_nested_agent_version_flagged(self): + payload = _make_payload() + parsed = parse_scorecard(render_scorecard(payload)) + del parsed["agent"]["version"] + errors = validate_scorecard(parsed) + assert errors, "Expected missing 'agent.version' to be flagged" + assert 
any("agent.version" in e for e in errors), errors + + def test_missing_nested_dataset_size_flagged(self): + payload = _make_payload() + parsed = parse_scorecard(render_scorecard(payload)) + del parsed["recipe"]["dataset"]["size"] + errors = validate_scorecard(parsed) + assert any("recipe.dataset.size" in e for e in errors), errors + + def test_empty_metrics_list_flagged(self): + payload = _make_payload() + parsed = parse_scorecard(render_scorecard(payload)) + parsed["results"]["metrics"] = [] + errors = validate_scorecard(parsed) + assert any("metrics" in e for e in errors), errors + + def test_non_dict_section_flagged_not_crash(self): + payload = _make_payload() + parsed = parse_scorecard(render_scorecard(payload)) + parsed["agent"] = "not-a-dict" + errors = validate_scorecard(parsed) + assert errors, "Expected a non-dict 'agent' section to be flagged" + # --------------------------------------------------------------------------- # 2. Aggregate computation @@ -191,16 +231,20 @@ def test_test_cases_run_and_dataset_size_both_present(self): class TestLooseCoupling: def test_no_benchmark_or_agent_modules_imported(self): - # Clean interpreter: importing release_scorecard must not pull in the - # eval harness or any agent package. Scanning the test process's own - # sys.modules gives false positives (e.g. the pytest_benchmark plugin), - # so check in a fresh subprocess instead. + # Importing release_scorecard must not pull in the eval harness or any + # agent package. Run in a fresh subprocess and baseline sys.modules + # BEFORE the import, so we measure only what the import itself adds — + # not pytest plugins or editable-install path finders that the + # interpreter registers at startup regardless of any import. 
import subprocess import sys as _sys code = ( - "import sys, gaia.eval.release_scorecard; " - "bad=[m for m in sys.modules if 'gaia.eval.benchmark' in m " + "import sys; " + "before=set(sys.modules); " + "import gaia.eval.release_scorecard; " + "added=set(sys.modules)-before; " + "bad=[m for m in added if 'gaia.eval.benchmark' in m " "or 'gaia.eval.quality_metrics' in m or 'gaia_agent_email' in m]; " "assert not bad, bad" ) From 019cc16a7502dc845b24ef05c5641bf6a511e2c3 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Thu, 25 Jun 2026 18:43:37 -0400 Subject: [PATCH 06/18] feat(eval): record eval limit + derive model in email scorecard adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The benchmark scorecard.json has no top-level model/limit, so config.limit was always null — defeating the comparability note in eval-scorecard.mdx. Add a --limit CLI arg threaded into config.limit, and derive config.model from the run's scenarios[0].category (the model id in benchmark output), falling back to gaia-agent.yaml models[0]. Drop the dead list-comprehension in the final print. --- .../python/email/packaging/gen_scorecard.py | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py index 52c58d143..22b12ad10 100644 --- a/hub/agents/python/email/packaging/gen_scorecard.py +++ b/hub/agents/python/email/packaging/gen_scorecard.py @@ -117,7 +117,7 @@ def _is_judged(scenario: dict) -> bool: return 0.0 <= f <= 1.0 and math.isfinite(f) -def build_payload(benchmark_dir: Path, ground_truth_path: Path): +def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None): """Build a :class:`~gaia.eval.release_scorecard.ResultPayload` from benchmark output. 
A scenario is **judged** iff it has a ``quality`` dict AND @@ -127,6 +127,9 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path): Args: benchmark_dir: Directory written by ``gaia eval benchmark --output-dir``. ground_truth_path: Path to ``ground_truth.json`` (the labeled corpus). + limit: The ``--limit`` value used for the eval run, recorded in + ``config["limit"]`` for cross-version comparability. The benchmark + ``scorecard.json`` does not persist this, so it must be passed in. Returns: Populated :class:`~gaia.eval.release_scorecard.ResultPayload`. @@ -188,6 +191,12 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path): f"No 'version:' field found in {agent_yaml_path}." ) + # Model id: benchmark output records it as the per-scenario `category`. + # Fall back to the manifest's first declared model. + scenario_model = scenarios[0].get("category") if scenarios else None + manifest_models = agent_data.get("models") or [None] + model = scenario_model or manifest_models[0] + metrics = [ {"name": "category_accuracy", "value": float(category_accuracy), "weight": 1.0} ] @@ -211,10 +220,10 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path): ), config={ "harness": "gaia eval benchmark", - "model": data.get("model", agent_data.get("models", [None])[0]), + "model": model, "corpus": "tests/fixtures/email/synthetic_inbox.mbox", "ground_truth": str(ground_truth_path), - "limit": data.get("limit"), + "limit": limit, }, test_cases_run=test_cases_run, metrics=metrics, @@ -253,6 +262,16 @@ def main(argv=None) -> int: "(default: hub/agents/npm/agent-email/scorecards/)." ), ) + parser.add_argument( + "--limit", + type=int, + default=None, + help=( + "The --limit value passed to 'gaia eval benchmark' for this run. " + "Recorded in config.limit for cross-version comparability " + "(the benchmark output does not persist it)." 
+ ), + ) args = parser.parse_args(argv) @@ -260,7 +279,7 @@ def main(argv=None) -> int: gt_path = Path(args.ground_truth).resolve() try: - payload = build_payload(benchmark_dir, gt_path) + payload = build_payload(benchmark_dir, gt_path, limit=args.limit) except (ValueError, FileNotFoundError) as exc: print(f"ERROR: {exc}", file=sys.stderr) return 1 @@ -280,7 +299,7 @@ def main(argv=None) -> int: f"Scorecard written: {out_path}\n" f" Version: {payload.agent_version}\n" f" Aggregate: {payload.metrics[0]['value']:.4f} category_accuracy " - f"(over {len([s for s in [payload] if True])} — {payload.test_cases_run} emails judged)\n" + f"({payload.test_cases_run} emails judged)\n" f" Dataset size: {payload.dataset_size} labeled examples" ) return 0 From 2f931a1076a0b71e50dca8232611a8bc2662b072 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Thu, 25 Jun 2026 18:43:41 -0400 Subject: [PATCH 07/18] ci(eval): pin scorecard-gate setup-python to @v6 Match the rest of release_agent_email.yml, which already uses actions/setup-python@v6. 
--- .github/workflows/release_agent_email.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release_agent_email.yml b/.github/workflows/release_agent_email.yml index 81c674384..2ae937ab6 100644 --- a/.github/workflows/release_agent_email.yml +++ b/.github/workflows/release_agent_email.yml @@ -272,7 +272,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: "3.12" - name: Install core + PyYAML From e47bfaf45fce0de119f1850458458a38fb8d8a4e Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 10:10:49 -0400 Subject: [PATCH 08/18] feat(eval): email v0.2.4 scorecard from real benchmark run Generate the email-triage agent's v0.2.4 release scorecard from an actual `gaia eval benchmark` run (Gemma-4-E4B, 25 of 220 corpus emails) on AMD Strix Halo hardware: category_accuracy 0.04 -> aggregate 4.0/100. The low value is a taxonomy mismatch (the agent's triage labels and the ground-truth priority labels overlap only on 'urgent'), not triage quality -- tracked in #1266 and recorded in the scorecard's own methodology. Adapter hardening: store a repo-relative ground_truth path (no absolute-path leak in the published artifact), record the eval limit for comparability, and carry the taxonomy caveat. README surfaces the aggregate with the caveat and a relative link; docs example aligned to the 4-category label set. 
--- docs/reference/eval-scorecard.mdx | 2 +- hub/agents/npm/agent-email/README.md | 2 +- .../npm/agent-email/scorecards/.gitkeep | 0 .../npm/agent-email/scorecards/0.2.4.md | 70 +++++++++++++++++++ .../python/email/packaging/gen_scorecard.py | 20 ++++-- 5 files changed, 88 insertions(+), 6 deletions(-) delete mode 100644 hub/agents/npm/agent-email/scorecards/.gitkeep create mode 100644 hub/agents/npm/agent-email/scorecards/0.2.4.md diff --git a/docs/reference/eval-scorecard.mdx b/docs/reference/eval-scorecard.mdx index f3cbc5093..fd0acd57a 100644 --- a/docs/reference/eval-scorecard.mdx +++ b/docs/reference/eval-scorecard.mdx @@ -35,7 +35,7 @@ agent: recipe: dataset: reference: tests/fixtures/email/ground_truth.json - description: Synthetic email corpus (FakeGmailBackend, 5-category classification) + description: Synthetic email corpus (FakeGmailBackend, 4-category priority labels) size: 220 methodology: gaia eval benchmark — category_accuracy (case-insensitive exact match) config: diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md index 729fe99a6..dcadb4442 100644 --- a/hub/agents/npm/agent-email/README.md +++ b/hub/agents/npm/agent-email/README.md @@ -2,7 +2,7 @@ [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24** -**Eval scorecard:** see [`./scorecards/0.2.4.md`](./scorecards/0.2.4.md) for the per-version accuracy metrics, dataset details, and aggregate score for v0.2.4. +**Eval scorecard (v0.2.4): aggregate 4.0 / 100** — `category_accuracy` 0.04 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)). The low value reflects strict 4-way exact-match against a different label vocabulary, not triage quality — taxonomy calibration is tracked in [#1266](https://github.com/amd/gaia/issues/1266). 
The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate. Embed the **GAIA email agent** in your JS/TS app. It triages, organizes, replies to, and schedules from Gmail and Outlook — with every email body analyzed diff --git a/hub/agents/npm/agent-email/scorecards/.gitkeep b/hub/agents/npm/agent-email/scorecards/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/hub/agents/npm/agent-email/scorecards/0.2.4.md b/hub/agents/npm/agent-email/scorecards/0.2.4.md new file mode 100644 index 000000000..5fad58585 --- /dev/null +++ b/hub/agents/npm/agent-email/scorecards/0.2.4.md @@ -0,0 +1,70 @@ +--- +schema_version: 1 +agent: + name: Email Triage + version: 0.2.4 +recipe: + dataset: + reference: tests/fixtures/email/ground_truth.json + description: 'Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, + 4-category priority labels: informational / actionable / urgent / low priority)' + size: 220 + methodology: 'gaia eval benchmark — category classification accuracy (case-insensitive + exact match of the agent''s triage label vs the ground-truth priority label) over + a synthetic labeled corpus via FakeGmailBackend; no LLM judge. 
NOTE: the agent''s + triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority + labels currently overlap only on ''urgent'', so this exact-match metric understates + triage usefulness — taxonomy calibration is tracked in amd/gaia#1266' + config: + harness: gaia eval benchmark + model: Gemma-4-E4B-it-GGUF + corpus: tests/fixtures/email/synthetic_inbox.mbox + ground_truth: tests/fixtures/email/ground_truth.json + limit: 25 +results: + test_cases_run: 25 + metrics: + - name: category_accuracy + value: 0.04 + weight: 1.0 +aggregate: + name: weighted_accuracy + formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2) + components: + - metric: category_accuracy + value: 0.04 + weight: 1.0 + value: 4.0 +generated_at: '2026-06-26T14:07:51.768804+00:00' +inherited_from: null +--- +# Email Triage — Eval Scorecard v0.2.4 + +**Aggregate score: 4.0** (out of 100) + +## Recipe + +| Field | Value | +|-------|-------| +| Dataset | [tests/fixtures/email/ground_truth.json](tests/fixtures/email/ground_truth.json) | +| Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, 4-category priority labels: informational / actionable / urgent / low priority) | +| Dataset size | 220 labeled examples | +| Test cases run | 25 | +| Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth priority label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. 
NOTE: the agent's triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority labels currently overlap only on 'urgent', so this exact-match metric understates triage usefulness — taxonomy calibration is tracked in amd/gaia#1266 | + +## Metrics + + - **category_accuracy**: 0.0400 × 1.0 + +## Aggregate score recomputation + +Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)` + +Worked example: + +``` +round(100 × ((0.0400 × 1.0)) / 1.0, 2) = 4.0 +``` + +A reader can reproduce this value from the `aggregate.components` in the front +matter alone — no eval-harness access needed. diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py index 22b12ad10..b10681ffe 100644 --- a/hub/agents/python/email/packaging/gen_scorecard.py +++ b/hub/agents/python/email/packaging/gen_scorecard.py @@ -210,19 +210,31 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None): dataset_reference="tests/fixtures/email/ground_truth.json", dataset_description=( "Synthetic email corpus for GAIA email-triage evaluation " - "(FakeGmailBackend, 5-category classification)" + "(FakeGmailBackend, 4-category priority labels: " + "informational / actionable / urgent / low priority)" ), dataset_size=dataset_size, methodology=( "gaia eval benchmark — category classification accuracy " - "(case-insensitive exact match) over a synthetic labeled corpus " - "via FakeGmailBackend; no LLM judge required" + "(case-insensitive exact match of the agent's triage label vs the " + "ground-truth priority label) over a synthetic labeled corpus via " + "FakeGmailBackend; no LLM judge. 
NOTE: the agent's triage taxonomy " + "(fyi / needs_response / promotional / urgent) and the corpus " + "priority labels currently overlap only on 'urgent', so this " + "exact-match metric understates triage usefulness — taxonomy " + "calibration is tracked in amd/gaia#1266" ), config={ "harness": "gaia eval benchmark", "model": model, "corpus": "tests/fixtures/email/synthetic_inbox.mbox", - "ground_truth": str(ground_truth_path), + # Store a repo-relative path — never leak a local absolute path into + # a committed/published artifact. + "ground_truth": ( + str(ground_truth_path.relative_to(_REPO_ROOT)) + if str(ground_truth_path).startswith(str(_REPO_ROOT)) + else ground_truth_path.name + ), "limit": limit, }, test_cases_run=test_cases_run, From 2ae55ecda732ceb3db306bd57495e6bc46d54ce0 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 10:42:26 -0400 Subject: [PATCH 09/18] feat(eval): scorecard refresh/reject CI loop, adoption skill, correct taxonomy ref - Add .github/workflows/email_scorecard_refresh.yml: on agent/corpus changes the self-hosted AMD runner re-runs the eval, regenerates the scorecard, commits it when the score holds/improves, and FAILS on a regression (same-version vs the committed card + cross-version via scorecard_gate). Hosted-CI backstop stays the release-time scorecard-gate job. - Add .claude/skills/adding-eval-scorecard: a phased skill so adopting a scorecard is invocable, not a prose walkthrough; referenced from eval-scorecard.mdx. - Document the update/reject loop in eval-scorecard.mdx. - Correct the scorecard's taxonomy reference from the closed #1266 (old 4-way) to #1874 (corpus labels stale vs schema-2.0 5-bucket taxonomy); regenerate the card. 
--- .claude/skills/adding-eval-scorecard/SKILL.md | 85 +++++++++ .github/workflows/email_scorecard_refresh.yml | 162 ++++++++++++++++++ docs/reference/eval-scorecard.mdx | 21 +++ hub/agents/npm/agent-email/README.md | 2 +- .../npm/agent-email/scorecards/0.2.4.md | 7 +- .../python/email/packaging/gen_scorecard.py | 4 +- 6 files changed, 275 insertions(+), 6 deletions(-) create mode 100644 .claude/skills/adding-eval-scorecard/SKILL.md create mode 100644 .github/workflows/email_scorecard_refresh.yml diff --git a/.claude/skills/adding-eval-scorecard/SKILL.md b/.claude/skills/adding-eval-scorecard/SKILL.md new file mode 100644 index 000000000..0afaa057f --- /dev/null +++ b/.claude/skills/adding-eval-scorecard/SKILL.md @@ -0,0 +1,85 @@ +--- +name: "adding-eval-scorecard" +description: "Adopt the per-agent eval scorecard for a GAIA hub agent: write the harness→payload adapter, run the eval to produce a REAL scorecard, link + surface it from the agent's README, wire the release gate, and (for a new agent) generalize the format. Use when asked to 'add a scorecard', 'adopt the eval scorecard', 'generate the scorecard for ', or wire scorecard CI for an agent. Builds on docs/reference/eval-scorecard.mdx and the email agent reference adapter." +--- + +# Adding an Eval Scorecard to a GAIA Agent + +Adopt the release **eval scorecard** ([`docs/reference/eval-scorecard.mdx`](../../../docs/reference/eval-scorecard.mdx)) for one hub agent. The system is `harness → result payload → generator → scorecard`, with a standalone presence+regression release gate. The **email agent is the reference implementation** — mirror it. + +**Core modules (do not modify; reuse):** +- `src/gaia/eval/release_scorecard.py` — `ResultPayload`, `compute_aggregate`, `render_scorecard`, `write_scorecard`, `validate_scorecard`, `carry_forward`, `latest_version_below`. Harness-agnostic (stdlib + PyYAML only). +- `src/gaia/eval/scorecard_gate.py` — the standalone gate (`python -m gaia.eval.scorecard_gate`). 
+- Reference adapter: `hub/agents/python/email/packaging/gen_scorecard.py`. + +This is a **phased checklist with a hard gate at the real-eval step** — the scorecard MUST come from an actual eval run, never hand-authored numbers. + +## Phase 1 — Locate the agent's surfaces + +1. **Version source of truth** = the `version:` field in `<agent-dir>/gaia-agent.yaml`. Never invent a parallel scheme. +2. **Canonical README** (where the scorecard is linked + surfaced): for an npm-published agent it is the npm client README (e.g. `hub/agents/npm/<agent>/README.md`), NOT a `packaging/README.md`. For a Python-only agent it is `hub/agents/python/<agent>/README.md`. Confirm which by checking what `release_agent_<agent>.yml` publishes (`README:` env) — the published README is the one to link. +3. **doc-root** = the directory holding that canonical README. Scorecards live at `<doc-root>/scorecards/<version>.md`. +4. **Eval vehicle**: what existing harness produces this agent's accuracy metric? (email → `gaia eval benchmark` over `tests/fixtures/email/`.) If none exists, STOP and surface that — propose the minimal harness before building; do not invent numbers. + +## Phase 2 — Write the adapter (harness → payload) + +Copy `hub/agents/python/email/packaging/gen_scorecard.py` as the template. The adapter: +- imports ONLY `gaia.eval.release_scorecard` (never the harness or agent package — preserve loose coupling); +- reads the harness output, builds a `ResultPayload`; +- defines **"judged"** explicitly and **raises loudly** if zero results are judged (no silent 0.0); +- records **dataset size** (total labeled examples) and **test_cases_run** (subset executed) as DISTINCT fields; +- stores **repo-relative** paths only (never a local absolute path — it ships in a published artifact); +- records the eval `limit`/config so future regression checks are comparable; +- writes to `<doc-root>/scorecards/<version>.md`. 
+ +Add an offline unit test against a committed sample harness-output fixture (see `tests/fixtures/eval/email_benchmark_scorecard.json` + `tests/unit/eval/test_release_scorecard.py::TestEmailAdapter`) so the adapter is testable without a live model. + +## Phase 3 — Run the REAL eval (hard gate — no hand-authored numbers) + +The accuracy number must come from an actual run. For the email agent: + +```bash +# Real eval needs Lemonade + the model. Prefer AMD hardware (Strix Halo / Ryzen AI); +# the [self-hosted, lemonade-eval] runner is the canonical environment. +GAIA_AGENT_TOOL_TIMEOUT=900 \ +PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \ +PYTHONPATH="$(pwd)" \ + <venv>/bin/gaia eval benchmark \ + --model Gemma-4-E4B-it-GGUF \ + --mbox-path tests/fixtures/email/synthetic_inbox.mbox \ + --ground-truth tests/fixtures/email/ground_truth.json \ + --limit 25 --output-dir <out-dir> + +<venv>/bin/python hub/agents/python/email/packaging/gen_scorecard.py \ + --benchmark-dir <out-dir> --limit 25 +``` + +**Headless gotchas (see memory `project-email-benchmark-headless-gotchas`):** +- `PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring` — the email agent's calendar-connector resolution blocks forever on the macOS Keychain (and can stall on Linux SecretService) in non-interactive contexts. Without this it hangs at 0% CPU during agent construction. +- `PYTHONPATH="$(pwd)"` — the benchmark imports `tests.fixtures.email.*`; the console script doesn't add the repo root. +- `GAIA_AGENT_TOOL_TIMEOUT=900` — triage of N emails is one tool call; the 180s default abandons it on slow backends, yielding a degenerate 0-email FAIL run. +- Write `--output-dir` to a **persistent** dir, not `/tmp` (cleared on session resume). +- Record honestly: if the metric is low for a known reason (e.g. a taxonomy/label mismatch), put the explanation in the adapter's `methodology` string and link the tracking issue — never inflate the number. + +## Phase 4 — Surface, link, and gate + +1. 
**Link + surface** from the canonical README: a one-line `Eval scorecard (vX.Y.Z): aggregate N/100 … ([./scorecards/X.Y.Z.md](./scorecards/X.Y.Z.md))`. The relative link must resolve in-repo. +2. **npm `files`**: if the agent publishes on npm, add `scorecards/` to `package.json` `files` so the link resolves on the published package too. +3. **Hub display**: a published scorecard surfaces on the agent's hub page / Agent UI detail view (see `workers/agent-hub` + `AgentDetailModal.tsx`); ensure the publish step uploads the scorecard alongside the README. +4. **Release gate**: add a `scorecard-gate` job to `release_agent_<agent>.yml` and list it in `publish.needs`. The job runs on a GitHub-hosted runner (it only parses committed files — no eval): + ```bash + python -m gaia.eval.scorecard_gate \ + --scorecards-dir <doc-root>/scorecards \ + --manifest hub/agents/python/<agent>/gaia-agent.yaml + ``` + The job must NOT have `continue-on-error`, an `environment:`, or a `permissions:` override (inherits `contents: read`; needs no secrets). +5. **Auto-update/reject loop**: for re-running on agent changes and refreshing the committed scorecard, see `eval-scorecard.mdx` "Keeping the scorecard current" and the self-hosted refresh workflow — reject-on-worse is the gate; better-or-equal refreshes the committed card. + +## Phase 5 — Verify (evidence before "done") + +Run and capture: the generated `<version>.md`; the gate **passing** on it (exit 0); the gate **blocking** a manufactured regression (exit 1) and a missing card (exit 1); a by-hand recompute of the aggregate from `aggregate.components` matching the recorded value. Run `python util/lint.py --all` and the eval unit tests. These are the PR's real-world proof. + +## Versioning + +- **Patch** release → `carry_forward(prev_path, new_version)` (copies results verbatim, sets `inherited_from`); do NOT re-run the eval. +- **Minor/major** release → re-run the eval (Phase 3); `carry_forward` refuses a non-patch bump with a "re-run" error. 
diff --git a/.github/workflows/email_scorecard_refresh.yml b/.github/workflows/email_scorecard_refresh.yml new file mode 100644 index 000000000..7b3b02b5f --- /dev/null +++ b/.github/workflows/email_scorecard_refresh.yml @@ -0,0 +1,162 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT + +# Email agent eval-scorecard refresh + regression gate (#1862). +# +# Answers "how does a PR that changes the agent keep the scorecard honest?": +# when the email agent's LLM-affecting code (or the eval corpus) changes, this +# re-runs the REAL eval, regenerates the scorecard, and then: +# - score IMPROVED or held -> commits the refreshed scorecard to the branch +# - score REGRESSED -> fails the job (the worse card is NOT committed) +# +# `gaia eval benchmark` needs Lemonade on AMD hardware, so this runs ONLY on the +# self-hosted [self-hosted, lemonade-eval] pool — GitHub-hosted runners cannot run +# it. The release-time `scorecard-gate` job in release_agent_email.yml is the +# hosted-CI backstop (it parses committed files only, no eval). +# +# Two regression checks run here: +# 1. SAME-VERSION: fresh aggregate vs the currently-committed card for this +# version — stops a noisy/worse re-run from silently overwriting a good score. +# 2. CROSS-VERSION: `gaia.eval.scorecard_gate` — fresh card vs the prior version. +# +# Auto-commit needs `contents: write` and only works on the repo's own branches; +# a fork PR's GITHUB_TOKEN is read-only — for forks, run the eval locally / on AMD +# hardware and commit the scorecard by hand (the release gate still enforces it). 
+ +name: Email Agent Eval — scorecard refresh + +on: + workflow_dispatch: + inputs: + limit: + description: 'Messages to triage (must match the committed scorecard for comparability)' + required: false + default: '25' + model: + description: 'Lemonade model id' + required: false + default: 'Gemma-4-E4B-it-GGUF' + push: + branches-ignore: + - main + paths: + - 'hub/agents/python/email/**' + - 'tests/fixtures/email/**' + - 'src/gaia/eval/release_scorecard.py' + - 'src/gaia/eval/scorecard_gate.py' + +concurrency: + # Share the single Lemonade backend slot with the other self-hosted evals so two + # runs never race-evict each other's model (CLAUDE.md: evals run serially). + group: lemonade-eval + cancel-in-progress: false + +permissions: + contents: write # auto-commit the refreshed scorecard to the branch + +env: + SCORECARD_DIR: hub/agents/npm/agent-email/scorecards + MANIFEST: hub/agents/python/email/gaia-agent.yaml + LIMIT: ${{ github.event.inputs.limit || '25' }} + MODEL: ${{ github.event.inputs.model || 'Gemma-4-E4B-it-GGUF' }} + +jobs: + refresh: + name: Re-run eval, refresh-or-reject scorecard + runs-on: [self-hosted, lemonade-eval] + timeout-minutes: 90 + steps: + - name: Checkout (the pushed branch) + uses: actions/checkout@v6 + with: + ref: ${{ github.head_ref || github.ref_name }} + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.10' + + - name: Install in isolated venv + run: | + python -m venv .venv-scorecard + source .venv-scorecard/bin/activate + python -m pip install --upgrade pip + pip install -e ".[dev,eval,api]" + echo "$PWD/.venv-scorecard/bin" >> "$GITHUB_PATH" + + - name: Resolve version + capture currently-committed aggregate + id: pre + run: | + set -euo pipefail + VERSION=$(python -c "import yaml; print(yaml.safe_load(open('${MANIFEST}'))['version'])") + echo "version=${VERSION}" >> "$GITHUB_OUTPUT" + CARD="${SCORECARD_DIR}/${VERSION}.md" + # Aggregate of the card as committed on this branch (empty if new). 
+ if git cat-file -e "HEAD:${CARD}" 2>/dev/null; then + git show "HEAD:${CARD}" > /tmp/committed_card.md + COMMITTED=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('/tmp/committed_card.md'))['aggregate']['value'])") + else + COMMITTED="" + fi + echo "committed=${COMMITTED}" >> "$GITHUB_OUTPUT" + echo "Version ${VERSION}; committed aggregate: ${COMMITTED:-}" + + - name: Run the email-triage benchmark (real eval) + env: + # The agent's calendar-connector resolution blocks on the OS keyring in + # a headless context — disable it so construction doesn't hang. + PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring + # Triage of N emails is one tool call; the 180s default abandons it on a + # slow backend and yields a degenerate 0-email run. + GAIA_AGENT_TOOL_TIMEOUT: '900' + PYTHONPATH: ${{ github.workspace }} + run: | + set -euo pipefail + rm -rf eval-out && mkdir -p eval-out + gaia eval benchmark \ + --model "${MODEL}" \ + --mbox-path tests/fixtures/email/synthetic_inbox.mbox \ + --ground-truth tests/fixtures/email/ground_truth.json \ + --limit "${LIMIT}" \ + --output-dir eval-out + + - name: Regenerate the scorecard from the real run + run: | + set -euo pipefail + python hub/agents/python/email/packaging/gen_scorecard.py \ + --benchmark-dir eval-out --limit "${LIMIT}" + + - name: Same-version regression check (reject a worse re-run) + run: | + set -euo pipefail + VERSION="${{ steps.pre.outputs.version }}" + COMMITTED="${{ steps.pre.outputs.committed }}" + CARD="${SCORECARD_DIR}/${VERSION}.md" + FRESH=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('${CARD}'))['aggregate']['value'])") + echo "fresh aggregate: ${FRESH} | committed: ${COMMITTED:-}" + if [ -n "${COMMITTED}" ] && python -c "import sys; sys.exit(0 if float('${FRESH}') < float('${COMMITTED}') else 1)"; then + echo "::error::Scorecard regression for v${VERSION}: 
re-run scored ${FRESH} < committed ${COMMITTED}. Not committing the worse card. Investigate, or override intentionally via --allow-regression in a manual commit." + git checkout -- "${CARD}" || true + exit 1 + fi + echo "No same-version regression — fresh score is >= committed." + + - name: Cross-version gate (fresh card vs prior version) + run: | + set -euo pipefail + python -m gaia.eval.scorecard_gate \ + --scorecards-dir "${SCORECARD_DIR}" \ + --manifest "${MANIFEST}" + + - name: Commit the refreshed scorecard (only if it changed for the better/equal) + run: | + set -euo pipefail + if git diff --quiet -- "${SCORECARD_DIR}"; then + echo "Scorecard unchanged — nothing to commit." + exit 0 + fi + git config user.name "${{ github.actor }}" + git config user.email "${{ github.actor }}@users.noreply.github.com" + git add "${SCORECARD_DIR}" + git commit -m "eval(email): refresh v${{ steps.pre.outputs.version }} scorecard from benchmark run" + git push origin "HEAD:${{ github.head_ref || github.ref_name }}" diff --git a/docs/reference/eval-scorecard.mdx b/docs/reference/eval-scorecard.mdx index fd0acd57a..b00d9d00e 100644 --- a/docs/reference/eval-scorecard.mdx +++ b/docs/reference/eval-scorecard.mdx @@ -230,8 +230,29 @@ The gate calls `latest_version_below(scorecards_dir, version)`, which: The version is read from `gaia-agent.yaml` (via `--manifest`) or passed explicitly (via `--version`). +## Keeping the scorecard current (the update / reject loop) + +The scorecard must move with the agent: when LLM-affecting code changes, the eval is re-run and the committed scorecard refreshed — **upward**. A regression is blocked. + +Two enforcement points work together: + +1. **Reject-on-worse (always on, GitHub-hosted).** The `scorecard-gate` job in `release_agent_.yml` runs on every release. It only parses committed files (no eval), so it runs on a standard runner and **fails the build** if the committed scorecard regressed below the prior version or is missing. 
This is the hard gate. +2. **Run-and-refresh (self-hosted AMD).** `gaia eval benchmark` needs Lemonade on AMD hardware, so it cannot run on GitHub-hosted runners — it runs on the `[self-hosted, lemonade-eval]` pool. The `Email Agent Eval — scorecard refresh` workflow (`.github/workflows/email_scorecard_refresh.yml`) runs on demand (and on pushes touching the email agent), re-runs the eval, regenerates the scorecard, then: + - **score ≥ committed** → commits the refreshed scorecard back to the branch (the PR carries the improved number); + - **score < committed** → fails loudly (the regression must be investigated, or consciously overridden with `--allow-regression`). + +So a PR that changes the agent gets its scorecard refreshed (better) or rejected (worse) automatically on the AMD runner, and the release gate is the backstop on hosted CI. Locally, `gen_scorecard.py` + `scorecard_gate.py` reproduce both steps (see the **`adding-eval-scorecard` skill**). + + + The refresh job needs `contents: write` and runs only on the repo's own branches — a fork PR's `GITHUB_TOKEN` is read-only and cannot auto-commit. For a fork PR, run the eval locally/on AMD hardware and commit the scorecard manually; the release gate still enforces no-regression. + + ## Adding a scorecard for a new agent + + **Use the [`adding-eval-scorecard` skill](https://github.com/amd/gaia/tree/main/.claude/skills/adding-eval-scorecard/SKILL.md).** In Claude Code, invoke it instead of following these steps by hand — it carries the exact commands, the harness→payload→generator flow, the headless-eval gotchas (keyring/PYTHONPATH/tool-timeout), and the verification evidence to capture. The steps below are the reference the skill automates. + + 1. Create the `scorecards/` directory beside the agent's canonical README. 2. Write a `packaging/gen_scorecard.py` adapter (see `hub/agents/python/email/packaging/gen_scorecard.py` for a reference). 3. Run the eval and call the adapter → commit the resulting `.md`. 
diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md index dcadb4442..7f0891d3d 100644 --- a/hub/agents/npm/agent-email/README.md +++ b/hub/agents/npm/agent-email/README.md @@ -2,7 +2,7 @@ [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24** -**Eval scorecard (v0.2.4): aggregate 4.0 / 100** — `category_accuracy` 0.04 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)). The low value reflects strict 4-way exact-match against a different label vocabulary, not triage quality — taxonomy calibration is tracked in [#1266](https://github.com/amd/gaia/issues/1266). The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate. +**Eval scorecard (v0.2.4): aggregate 4.0 / 100** — `category_accuracy` 0.04 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)). The low value reflects strict exact-match against stale corpus labels (the eval ground-truth still uses the pre-schema-2.0 4-way taxonomy), not triage quality — tracked in [#1874](https://github.com/amd/gaia/issues/1874). The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate. Embed the **GAIA email agent** in your JS/TS app. It triages, organizes, replies to, and schedules from Gmail and Outlook — with every email body analyzed diff --git a/hub/agents/npm/agent-email/scorecards/0.2.4.md b/hub/agents/npm/agent-email/scorecards/0.2.4.md index 5fad58585..1c5f89480 100644 --- a/hub/agents/npm/agent-email/scorecards/0.2.4.md +++ b/hub/agents/npm/agent-email/scorecards/0.2.4.md @@ -14,7 +14,8 @@ recipe: a synthetic labeled corpus via FakeGmailBackend; no LLM judge. 
NOTE: the agent''s triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority labels currently overlap only on ''urgent'', so this exact-match metric understates - triage usefulness — taxonomy calibration is tracked in amd/gaia#1266' + triage usefulness — the corpus labels are stale vs the schema-2.0 taxonomy, tracked + in amd/gaia#1874' config: harness: gaia eval benchmark model: Gemma-4-E4B-it-GGUF @@ -35,7 +36,7 @@ aggregate: value: 0.04 weight: 1.0 value: 4.0 -generated_at: '2026-06-26T14:07:51.768804+00:00' +generated_at: '2026-06-26T14:38:25.168352+00:00' inherited_from: null --- # Email Triage — Eval Scorecard v0.2.4 @@ -50,7 +51,7 @@ inherited_from: null | Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, 4-category priority labels: informational / actionable / urgent / low priority) | | Dataset size | 220 labeled examples | | Test cases run | 25 | -| Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth priority label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. NOTE: the agent's triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority labels currently overlap only on 'urgent', so this exact-match metric understates triage usefulness — taxonomy calibration is tracked in amd/gaia#1266 | +| Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth priority label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. 
NOTE: the agent's triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority labels currently overlap only on 'urgent', so this exact-match metric understates triage usefulness — the corpus labels are stale vs the schema-2.0 taxonomy, tracked in amd/gaia#1874 | ## Metrics diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py index b10681ffe..fdfdf1eae 100644 --- a/hub/agents/python/email/packaging/gen_scorecard.py +++ b/hub/agents/python/email/packaging/gen_scorecard.py @@ -221,8 +221,8 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None): "FakeGmailBackend; no LLM judge. NOTE: the agent's triage taxonomy " "(fyi / needs_response / promotional / urgent) and the corpus " "priority labels currently overlap only on 'urgent', so this " - "exact-match metric understates triage usefulness — taxonomy " - "calibration is tracked in amd/gaia#1266" + "exact-match metric understates triage usefulness — the corpus " + "labels are stale vs the schema-2.0 taxonomy, tracked in amd/gaia#1874" ), config={ "harness": "gaia eval benchmark", From 01d6da4696191efeb69a26859f02e6f07280d7a1 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 10:49:58 -0400 Subject: [PATCH 10/18] feat(eval): surface eval scorecard in Agent Hub worker and publish flow Adds `eval_scorecard_url` and `eval_score` fields end-to-end through the worker catalog pipeline so the Agent Hub listing can show a benchmark aggregate and link to the full scorecard. Worker: `evalScorecardKey()` storage helper, optional `eval_scorecard` multipart part in POST /publish (stored as `eval-scorecard.md` per version), YAML front-matter parse of `aggregate.value` in `toIndexEntry`, and both fields carried through `rebuildIndex`. Missing/unparseable scorecard yields undefined fields, never throws. 
Publish: `--eval-scorecard ` flag in `publish_to_r2.py`; the GHA release workflow conditionally passes the versioned scorecard file when it exists under `hub/agents/npm/agent-email/scorecards/.md`. Python catalog: `merge_with_registry` threads the two new optional fields from the R2 index entry into the unified catalog dict so the UI backend serves them alongside existing agent metadata. Tests: two focused tests in routes.test.ts cover the present/absent scorecard cases (69 tests total, all pass). --- .github/workflows/release_agent_email.yml | 7 +++ .../python/email/packaging/publish_to_r2.py | 29 +++++++++ src/gaia/hub/catalog.py | 9 ++- workers/agent-hub/src/catalog.ts | 41 +++++++++++-- workers/agent-hub/src/publish.ts | 12 +++- workers/agent-hub/src/storage.ts | 19 ++++++ workers/agent-hub/src/types.ts | 4 ++ workers/agent-hub/test/fake-r2.ts | 2 + workers/agent-hub/test/routes.test.ts | 61 +++++++++++++++++++ 9 files changed, 178 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release_agent_email.yml b/.github/workflows/release_agent_email.yml index 2ae937ab6..1c624f2f2 100644 --- a/.github/workflows/release_agent_email.yml +++ b/.github/workflows/release_agent_email.yml @@ -475,6 +475,12 @@ jobs: case "$f" in *.json) continue ;; esac args+=(--artifact "$f") done + VER="${{ steps.ver.outputs.version }}" + scorecard_args=() + SCORECARD="hub/agents/npm/agent-email/scorecards/${VER}.md" + if [ -f "${SCORECARD}" ]; then + scorecard_args+=(--eval-scorecard "${SCORECARD}") + fi python hub/agents/python/email/packaging/publish_to_r2.py \ --base-url "${GAIA_HUB_PUBLISH_URL:-${GAIA_HUB_BASE_URL:-https://hub.amd-gaia.ai}}" \ --manifest "${MANIFEST}" \ @@ -482,6 +488,7 @@ jobs: --changelog "${CHANGELOG}" \ --spec "${SPEC}" \ --skill "${SKILL}" \ + "${scorecard_args[@]}" \ "${args[@]}" \ --summary-out published.json echo "=== publish summary ===" diff --git a/hub/agents/python/email/packaging/publish_to_r2.py 
b/hub/agents/python/email/packaging/publish_to_r2.py index 4da341837..5884cb976 100644 --- a/hub/agents/python/email/packaging/publish_to_r2.py +++ b/hub/agents/python/email/packaging/publish_to_r2.py @@ -129,6 +129,7 @@ def publish_one( changelog_bytes: bytes | None = None, spec_bytes: bytes | None = None, skill_bytes: bytes | None = None, + eval_scorecard_bytes: bytes | None = None, package_files_bytes: bytes | None = None, ) -> dict: if not artifact_path.exists(): @@ -172,6 +173,10 @@ def publish_one( files["spec"] = ("SPEC.md", spec_bytes, "text/markdown") if skill_bytes is not None: files["skill"] = ("SKILL.md", skill_bytes, "text/markdown") + # The eval scorecard rides along with the first platform binary and becomes + # the catalog entry's `eval_score` and `eval_scorecard_url`. + if eval_scorecard_bytes is not None: + files["eval_scorecard"] = ("eval-scorecard.md", eval_scorecard_bytes, "text/markdown") # The whole-package file listing rides with the zip artifact — it becomes # the catalog entry's `package.files` (the hub's file-list display). if package_files_bytes is not None: @@ -271,6 +276,14 @@ def main(argv=None) -> int: help="Path to SKILL.md to publish as the agent's catalog skill " "(POSTed as the multipart 'skill' part the Worker accepts).", ) + parser.add_argument( + "--eval-scorecard", + type=Path, + help="Path to the eval scorecard markdown (e.g. scorecards/0.2.4.md) to " + "publish as the agent's catalog eval score and scorecard URL " + "(POSTed as the multipart 'eval_scorecard' part the Worker accepts). " + "Absent = publish without an eval scorecard.", + ) parser.add_argument( "--package-files", type=Path, @@ -341,6 +354,21 @@ def main(argv=None) -> int: flush=True, ) + eval_scorecard_bytes = None + if args.eval_scorecard is not None: + if not args.eval_scorecard.exists(): + raise SystemExit( + f"error: --eval-scorecard path not found: {args.eval_scorecard}. 
" + "Pass the scorecard markdown, or omit --eval-scorecard to publish " + "without one." + ) + eval_scorecard_bytes = args.eval_scorecard.read_bytes() + print( + f"[publish] attaching eval scorecard: {args.eval_scorecard} " + f"({len(eval_scorecard_bytes)} bytes)", + flush=True, + ) + package_files_bytes = None if args.package_files is not None: if not args.package_files.exists(): @@ -376,6 +404,7 @@ def main(argv=None) -> int: changelog_bytes=changelog_bytes, spec_bytes=spec_bytes, skill_bytes=skill_bytes, + eval_scorecard_bytes=eval_scorecard_bytes, package_files_bytes=package_files_bytes, ) ) diff --git a/src/gaia/hub/catalog.py b/src/gaia/hub/catalog.py index b78337895..461e34b0f 100644 --- a/src/gaia/hub/catalog.py +++ b/src/gaia/hub/catalog.py @@ -389,7 +389,7 @@ def merge_with_registry( language = entry.get("language", "python") security_tier = entry.get("security_tier", "experimental") - by_id[agent_id] = { + merged: Dict[str, Any] = { "id": agent_id, "name": entry.get("name", agent_id), "description": entry.get("description", ""), @@ -407,6 +407,13 @@ def merge_with_registry( "status": status, "source": (reg.source if reg is not None else "hub"), } + # Optional eval scorecard fields — absent from older catalog entries and + # from builtin/custom agents that haven't run a benchmark yet. + if "eval_score" in entry: + merged["eval_score"] = entry["eval_score"] + if "eval_scorecard_url" in entry: + merged["eval_scorecard_url"] = entry["eval_scorecard_url"] + by_id[agent_id] = merged # 2. Registry-only agents (builtins / custom not published to the hub). for agent_id, reg in registered.items(): diff --git a/workers/agent-hub/src/catalog.ts b/workers/agent-hub/src/catalog.ts index 357cf83f8..786d278d9 100644 --- a/workers/agent-hub/src/catalog.ts +++ b/workers/agent-hub/src/catalog.ts @@ -5,11 +5,15 @@ * Build per-agent manifests and the top-level catalog index. 
*/ +import { parse as parseYaml } from "yaml"; + import { compareSemver } from "./manifest"; import { + evalScorecardKey, listAgentIds, readAgentManifest, readChangelog, + readEvalScorecard, readPackageFiles, readReadme, readSkill, @@ -96,10 +100,31 @@ export function upsertVersion( }; } +/** + * Parse the `aggregate.value` from a scorecard's YAML front matter. Returns + * undefined when the scorecard is absent, malformed, or missing the field — + * never throws so a bad scorecard never breaks the catalog build. + */ +function parseScorecardScore(markdown: string | null): number | undefined { + if (!markdown) return undefined; + // Extract the YAML front matter block between the leading --- delimiters. + const match = /^---\n([\s\S]*?)\n---/.exec(markdown); + if (!match) return undefined; + try { + const fm = parseYaml(match[1]) as Record | null; + const agg = fm && typeof fm === "object" ? (fm.aggregate as Record | undefined) : undefined; + const val = agg?.value; + return typeof val === "number" && Number.isFinite(val) ? val : undefined; + } catch { + return undefined; + } +} + /** * Build the catalog entry for one agent manifest. `readme`/`changelog` are the * latest version's markdown ("" if none was published); `packageFiles` is the - * whole-package zip's file listing (null if no package zip was published). + * whole-package zip's file listing (null if no package zip was published); + * `evalScorecard` is the scorecard markdown (null if none was published). 
*/ export function toIndexEntry( agent: AgentManifest, @@ -107,7 +132,9 @@ export function toIndexEntry( changelog: string, packageFiles: { files: { name: string; size_bytes: number }[] } | null, spec = "", - skill = "" + skill = "", + evalScorecard: string | null = null, + baseUrl = "https://hub.amd-gaia.ai" ): IndexEntry { const latest = agent.versions[agent.latest_version]; const req = agent.requirements; @@ -154,6 +181,10 @@ export function toIndexEntry( // undefined serializes to "key absent" — only present when the manifest set it. npm_package: agent.npm_package, playground_url: agent.playground_url, + eval_scorecard_url: evalScorecard !== null + ? `${baseUrl.replace(/\/$/, "")}/${evalScorecardKey(agent.id, agent.latest_version)}` + : undefined, + eval_score: parseScorecardScore(evalScorecard), package: pkg, }; } @@ -164,7 +195,8 @@ export function toIndexEntry( */ export async function rebuildIndex( bucket: R2Bucket, - now: Date = new Date() + now: Date = new Date(), + baseUrl = "https://hub.amd-gaia.ai" ): Promise { const ids = await listAgentIds(bucket); const entries: IndexEntry[] = []; @@ -176,7 +208,8 @@ export async function rebuildIndex( const packageFiles = await readPackageFiles(bucket, id, agent.latest_version); const spec = await readSpec(bucket, id, agent.latest_version); const skill = await readSkill(bucket, id, agent.latest_version); - entries.push(toIndexEntry(agent, readme, changelog, packageFiles, spec, skill)); + const evalScorecard = await readEvalScorecard(bucket, id, agent.latest_version); + entries.push(toIndexEntry(agent, readme, changelog, packageFiles, spec, skill, evalScorecard, baseUrl)); } entries.sort((a, b) => a.id.localeCompare(b.id)); diff --git a/workers/agent-hub/src/publish.ts b/workers/agent-hub/src/publish.ts index 6c9b638dd..626b8c65e 100644 --- a/workers/agent-hub/src/publish.ts +++ b/workers/agent-hub/src/publish.ts @@ -18,6 +18,7 @@ import { parseManifest } from "./manifest"; import { artifactKey, changelogKey, + 
evalScorecardKey, packageFilesKey, rawManifestKey, readAgentManifest, @@ -173,6 +174,9 @@ export async function handlePublish( // semantics as README/CHANGELOG. const specText = await optionalMarkdownPart(form, "spec", "SPEC.md"); const skillText = await optionalMarkdownPart(form, "skill", "SKILL.md"); + // Optional eval scorecard markdown (the agent's benchmark results, rendered on + // the hub listing as an aggregate score + link). Per-version, first-POST semantics. + const evalScorecardText = await optionalMarkdownPart(form, "eval_scorecard", "eval-scorecard.md"); // Optional whole-package file listing (the zip's contents, for the hub's file // list). The zip itself rides in as a normal `artifact`; this is just the // manifest of what's inside it. @@ -276,6 +280,11 @@ export async function handlePublish( httpMetadata: { contentType: "text/markdown; charset=utf-8" }, }); } + if (evalScorecardText != null) { + await env.BUCKET.put(evalScorecardKey(manifest.id, manifest.version), evalScorecardText, { + httpMetadata: { contentType: "text/markdown; charset=utf-8" }, + }); + } } // The package file listing rides the whole-package zip POST, which in a real @@ -296,7 +305,8 @@ export async function handlePublish( const updated = upsertVersion(existing, manifest, versionEntry); await writeAgentManifest(env.BUCKET, updated); - const index = await rebuildIndex(env.BUCKET, now); + const baseUrl = new URL(request.url).origin; + const index = await rebuildIndex(env.BUCKET, now, baseUrl); return json( { diff --git a/workers/agent-hub/src/storage.ts b/workers/agent-hub/src/storage.ts index 0b15640f6..366e3fe84 100644 --- a/workers/agent-hub/src/storage.ts +++ b/workers/agent-hub/src/storage.ts @@ -52,6 +52,10 @@ export function skillKey(id: string, version: string): string { return `${versionDir(id, version)}SKILL.md`; } +export function evalScorecardKey(id: string, version: string): string { + return `${versionDir(id, version)}eval-scorecard.md`; +} + export function 
packageFilesKey(id: string, version: string): string { return `${versionDir(id, version)}package-files.json`; } @@ -114,6 +118,21 @@ export async function readSkill( return obj.text(); } +/** + * Read the eval scorecard markdown for one published version. Returns null when + * none was published — the `eval_scorecard` form part is optional, so its + * absence is not an error. + */ +export async function readEvalScorecard( + bucket: R2Bucket, + id: string, + version: string +): Promise { + const obj = await bucket.get(evalScorecardKey(id, version)); + if (!obj) return null; + return obj.text(); +} + /** * Read the whole-package file listing (`{ files: [{name, size_bytes}] }`) for one * version, or null when none was published — the `package_files` form part on diff --git a/workers/agent-hub/src/types.ts b/workers/agent-hub/src/types.ts index 571d75f9d..36df4a811 100644 --- a/workers/agent-hub/src/types.ts +++ b/workers/agent-hub/src/types.ts @@ -199,6 +199,10 @@ export interface IndexEntry { npm_package?: string; /** Localhost playground URL served by the agent's sidecar; absent otherwise. */ playground_url?: string; + /** Public URL of the eval scorecard markdown for the latest version; absent when none was published. */ + eval_scorecard_url?: string; + /** Aggregate eval score (0–100) parsed from the latest version's scorecard front matter; absent when none was published or parseable. */ + eval_score?: number; /** * Whole-package download: a single zip (all platform binaries + client + docs) * plus its file listing. 
Present only when a `package_files` manifest was diff --git a/workers/agent-hub/test/fake-r2.ts b/workers/agent-hub/test/fake-r2.ts index 79284f98b..9e149c681 100644 --- a/workers/agent-hub/test/fake-r2.ts +++ b/workers/agent-hub/test/fake-r2.ts @@ -159,6 +159,7 @@ export function publishRequest(opts: { changelog?: string; spec?: string; skill?: string; + evalScorecard?: string; packageFiles?: string; }): Request { const form = new FormData(); @@ -167,6 +168,7 @@ export function publishRequest(opts: { if (opts.changelog !== undefined) form.set("changelog", opts.changelog); if (opts.spec !== undefined) form.set("spec", opts.spec); if (opts.skill !== undefined) form.set("skill", opts.skill); + if (opts.evalScorecard !== undefined) form.set("eval_scorecard", opts.evalScorecard); if (opts.packageFiles !== undefined) form.set("package_files", opts.packageFiles); const bytes = typeof opts.artifact === "string" ? new TextEncoder().encode(opts.artifact) : opts.artifact; form.set( diff --git a/workers/agent-hub/test/routes.test.ts b/workers/agent-hub/test/routes.test.ts index bb602d127..29505b207 100644 --- a/workers/agent-hub/test/routes.test.ts +++ b/workers/agent-hub/test/routes.test.ts @@ -81,3 +81,64 @@ describe("GET routes", () => { expect(res.status).toBe(405); }); }); + +// Minimal YAML front matter matching the email agent's scorecard shape. 
+const SAMPLE_SCORECARD = [ + "---", + "schema_version: 1", + "agent:", + " name: Test Agent", + " version: 0.1.0", + "aggregate:", + " name: weighted_accuracy", + " value: 87.5", + "generated_at: '2026-06-26T00:00:00Z'", + "---", + "# Test Agent — Eval Scorecard v0.1.0", + "", + "**Aggregate score: 87.5** (out of 100)", +].join("\n"); + +describe("eval scorecard in catalog", () => { + it("exposes eval_score and eval_scorecard_url when a scorecard is published", async () => { + const env = makeEnv(); + await worker.fetch( + publishRequest({ + token: "tok_amd", + manifestYaml: sampleManifest({ id: "chat", version: "0.1.0" }), + artifact: "chat-wheel", + filename: "gaia_agent_chat-0.1.0-py3-none-any.whl", + evalScorecard: SAMPLE_SCORECARD, + }), + env as never + ); + + const res = await worker.fetch(get("/index.json"), env as never); + expect(res.status).toBe(200); + const body = (await res.json()) as any; + const entry = body.agents[0]; + expect(entry.eval_score).toBe(87.5); + expect(entry.eval_scorecard_url).toMatch(/\/agents\/chat\/0\.1\.0\/eval-scorecard\.md$/); + }); + + it("omits eval_score and eval_scorecard_url when no scorecard is published", async () => { + const env = makeEnv(); + await worker.fetch( + publishRequest({ + token: "tok_amd", + manifestYaml: sampleManifest({ id: "chat", version: "0.1.0" }), + artifact: "chat-wheel", + filename: "gaia_agent_chat-0.1.0-py3-none-any.whl", + // no evalScorecard + }), + env as never + ); + + const res = await worker.fetch(get("/index.json"), env as never); + expect(res.status).toBe(200); + const body = (await res.json()) as any; + const entry = body.agents[0]; + expect(entry.eval_score).toBeUndefined(); + expect(entry.eval_scorecard_url).toBeUndefined(); + }); +}); From add517249ca54a91d42dbea21715ec213957fb3e Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 10:50:04 -0400 Subject: [PATCH 11/18] feat(eval): show eval score and scorecard link in Agent UI detail modal Adds `eval_score` and 
`eval_scorecard_url` optional fields to `AgentInfo` in the frontend type definitions. When an agent has an eval score, the detail modal renders an "Eval scorecard" section showing the numeric score out of 100, with a "View scorecard" link when the URL is present. Renders nothing when neither field is set (no empty section). --- .../webui/src/components/AgentDetailModal.tsx | 26 ++++++++++++++++++- src/gaia/apps/webui/src/types/index.ts | 4 +++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/gaia/apps/webui/src/components/AgentDetailModal.tsx b/src/gaia/apps/webui/src/components/AgentDetailModal.tsx index 7de638328..b1a2dd954 100644 --- a/src/gaia/apps/webui/src/components/AgentDetailModal.tsx +++ b/src/gaia/apps/webui/src/components/AgentDetailModal.tsx @@ -2,7 +2,7 @@ // SPDX-License-Identifier: MIT import { useEffect, useCallback } from 'react'; -import { Wrench, Cpu, Shield, X, HardDrive, CheckCircle2, FlaskConical, AlertTriangle } from 'lucide-react'; +import { Wrench, Cpu, Shield, X, HardDrive, CheckCircle2, FlaskConical, AlertTriangle, BarChart2 } from 'lucide-react'; import { getAgentIcon } from './agentIcons'; import type { AgentInfo } from '../types'; @@ -172,6 +172,30 @@ export function AgentDetailModal({ agent, onClose, onStartChat }: AgentDetailMod )} + {/* Eval scorecard */} + {agent.eval_score != null && ( +
+
Eval scorecard
+
+ +
+
Eval score
+
+ {agent.eval_score} / 100 + {agent.eval_scorecard_url && ( + <> — View scorecard + )} +
+
+
+
+ )} + {/* Conversation starters */} {starters.length > 0 && (
diff --git a/src/gaia/apps/webui/src/types/index.ts b/src/gaia/apps/webui/src/types/index.ts index 7b1aa9447..10c610e57 100644 --- a/src/gaia/apps/webui/src/types/index.ts +++ b/src/gaia/apps/webui/src/types/index.ts @@ -117,6 +117,10 @@ export interface AgentInfo { avatar_url?: string; /** True when the publisher has deprecated this agent. */ deprecated?: boolean; + /** Public URL of the eval scorecard markdown; absent when none was published. */ + eval_scorecard_url?: string; + /** Aggregate eval score (0–100) from the latest published scorecard; absent when none. */ + eval_score?: number; } /** Derived card state for the Agent Hub (issue #1097). */ From 0eed4456920ebf57b95f2ef298e3d1f5865a9428 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 12:48:13 -0400 Subject: [PATCH 12/18] feat(eval): regenerate email v0.2.4 scorecard against relabeled corpus After #1875 relabeled the eval corpus to the schema-2.0 triage taxonomy, the email agent's predictions and the ground-truth labels share one vocabulary, so category_accuracy now measures real agreement: 0.40 over 25 of 220 emails -> aggregate 40.0/100 (was 4.0, a labeling artifact). Fresh gaia eval benchmark run on AMD Strix Halo. Drop the now-resolved #1874 caveat from the adapter methodology + README; align the dataset description to the schema-2.0 taxonomy. 
--- hub/agents/npm/agent-email/README.md | 2 +- .../npm/agent-email/scorecards/0.2.4.md | 31 +++++++++---------- .../python/email/packaging/gen_scorecard.py | 13 +++----- 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md index 7f0891d3d..c8c079b2a 100644 --- a/hub/agents/npm/agent-email/README.md +++ b/hub/agents/npm/agent-email/README.md @@ -2,7 +2,7 @@ [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24** -**Eval scorecard (v0.2.4): aggregate 4.0 / 100** — `category_accuracy` 0.04 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)). The low value reflects strict exact-match against stale corpus labels (the eval ground-truth still uses the pre-schema-2.0 4-way taxonomy), not triage quality — tracked in [#1874](https://github.com/amd/gaia/issues/1874). The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate. +**Eval scorecard (v0.2.4): aggregate 40.0 / 100** — `category_accuracy` 0.40 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate. Embed the **GAIA email agent** in your JS/TS app. 
It triages, organizes, replies to, and schedules from Gmail and Outlook — with every email body analyzed diff --git a/hub/agents/npm/agent-email/scorecards/0.2.4.md b/hub/agents/npm/agent-email/scorecards/0.2.4.md index 1c5f89480..7e36786a1 100644 --- a/hub/agents/npm/agent-email/scorecards/0.2.4.md +++ b/hub/agents/npm/agent-email/scorecards/0.2.4.md @@ -7,15 +7,12 @@ recipe: dataset: reference: tests/fixtures/email/ground_truth.json description: 'Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, - 4-category priority labels: informational / actionable / urgent / low priority)' + schema-2.0 triage taxonomy: fyi / needs_response / promotional / urgent / personal)' size: 220 - methodology: 'gaia eval benchmark — category classification accuracy (case-insensitive - exact match of the agent''s triage label vs the ground-truth priority label) over - a synthetic labeled corpus via FakeGmailBackend; no LLM judge. NOTE: the agent''s - triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority - labels currently overlap only on ''urgent'', so this exact-match metric understates - triage usefulness — the corpus labels are stale vs the schema-2.0 taxonomy, tracked - in amd/gaia#1874' + methodology: gaia eval benchmark — category classification accuracy (case-insensitive + exact match of the agent's triage label vs the ground-truth label) over a synthetic + labeled corpus via FakeGmailBackend; no LLM judge. 
The corpus uses the schema-2.0 + triage taxonomy, aligned with the agent's output labels (#1874) config: harness: gaia eval benchmark model: Gemma-4-E4B-it-GGUF @@ -26,36 +23,36 @@ results: test_cases_run: 25 metrics: - name: category_accuracy - value: 0.04 + value: 0.4 weight: 1.0 aggregate: name: weighted_accuracy formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2) components: - metric: category_accuracy - value: 0.04 + value: 0.4 weight: 1.0 - value: 4.0 -generated_at: '2026-06-26T14:38:25.168352+00:00' + value: 40.0 +generated_at: '2026-06-26T16:47:13.735478+00:00' inherited_from: null --- # Email Triage — Eval Scorecard v0.2.4 -**Aggregate score: 4.0** (out of 100) +**Aggregate score: 40.0** (out of 100) ## Recipe | Field | Value | |-------|-------| | Dataset | [tests/fixtures/email/ground_truth.json](tests/fixtures/email/ground_truth.json) | -| Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, 4-category priority labels: informational / actionable / urgent / low priority) | +| Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, schema-2.0 triage taxonomy: fyi / needs_response / promotional / urgent / personal) | | Dataset size | 220 labeled examples | | Test cases run | 25 | -| Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth priority label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. 
NOTE: the agent's triage taxonomy (fyi / needs_response / promotional / urgent) and the corpus priority labels currently overlap only on 'urgent', so this exact-match metric understates triage usefulness — the corpus labels are stale vs the schema-2.0 taxonomy, tracked in amd/gaia#1874 | +| Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. The corpus uses the schema-2.0 triage taxonomy, aligned with the agent's output labels (#1874) | ## Metrics - - **category_accuracy**: 0.0400 × 1.0 + - **category_accuracy**: 0.4000 × 1.0 ## Aggregate score recomputation @@ -64,7 +61,7 @@ Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)` Worked example: ``` -round(100 × ((0.0400 × 1.0)) / 1.0, 2) = 4.0 +round(100 × ((0.4000 × 1.0)) / 1.0, 2) = 40.0 ``` A reader can reproduce this value from the `aggregate.components` in the front diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py index fdfdf1eae..10d036442 100644 --- a/hub/agents/python/email/packaging/gen_scorecard.py +++ b/hub/agents/python/email/packaging/gen_scorecard.py @@ -210,19 +210,16 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None): dataset_reference="tests/fixtures/email/ground_truth.json", dataset_description=( "Synthetic email corpus for GAIA email-triage evaluation " - "(FakeGmailBackend, 4-category priority labels: " - "informational / actionable / urgent / low priority)" + "(FakeGmailBackend, schema-2.0 triage taxonomy: " + "fyi / needs_response / promotional / urgent / personal)" ), dataset_size=dataset_size, methodology=( "gaia eval benchmark — category classification accuracy " "(case-insensitive exact match of the agent's triage label vs the " - "ground-truth priority label) over a synthetic labeled corpus via " - "FakeGmailBackend; no 
LLM judge. NOTE: the agent's triage taxonomy " - "(fyi / needs_response / promotional / urgent) and the corpus " - "priority labels currently overlap only on 'urgent', so this " - "exact-match metric understates triage usefulness — the corpus " - "labels are stale vs the schema-2.0 taxonomy, tracked in amd/gaia#1874" + "ground-truth label) over a synthetic labeled corpus via " + "FakeGmailBackend; no LLM judge. The corpus uses the schema-2.0 " + "triage taxonomy, aligned with the agent's output labels (#1874)" ), config={ "harness": "gaia eval benchmark", From f5971b67cfae403bcfd85b466e60e61ba1fc2da4 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 13:29:20 -0400 Subject: [PATCH 13/18] refactor(eval): single SCORECARD.md per agent, new gate interface, reproduction section Storage convention changes from scorecards/<version>.md to a single SCORECARD.md updated in place (versioned via publish snapshot, same as README.md). - release_scorecard.py: add reproduction_command to ResultPayload; render_scorecard emits a Reproduction section; carry_forward reads version from front matter instead of filename stem; remove latest_version_below (per-version dirs gone); fix utcnow -> now(utc) - scorecard_gate.py: redesigned to accept --scorecard SCORECARD.md + optional --baseline-file / --baseline-ref (mutually exclusive); no --scorecards-dir or --version flags; --baseline-ref resolves via git show; absence at ref = first adoption pass; git-shellout-free when --baseline-file is used - gen_scorecard.py: writes hub/agents/npm/agent-email/SCORECARD.md (not scorecards/<version>.md); supplies reproduction_command with exact env vars and commands - tests: updated for new carry_forward signature, new gate interface, reproduction section assertions, second-agent generalization test, utcnow -> now(utc) --- .../python/email/packaging/gen_scorecard.py | 53 +++- src/gaia/eval/release_scorecard.py | 123 ++++---- src/gaia/eval/scorecard_gate.py | 266 ++++++++++-------- 
tests/unit/eval/test_release_scorecard.py | 136 ++++++--- tests/unit/eval/test_scorecard_gate.py | 202 ++++++++----- 5 files changed, 477 insertions(+), 303 deletions(-) diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py index 10d036442..1837a1389 100644 --- a/hub/agents/python/email/packaging/gen_scorecard.py +++ b/hub/agents/python/email/packaging/gen_scorecard.py @@ -7,7 +7,8 @@ Reads the benchmark ``--output-dir`` (looks for a JSON file containing a ``scenarios`` key — ``scorecard.json`` in a real run, or any ``*scorecard*.json`` fixture) and the ground-truth JSON, builds a :class:`ResultPayload`, and writes the -scorecard to ``hub/agents/npm/agent-email/scorecards/.md``. +scorecard to ``hub/agents/npm/agent-email/SCORECARD.md`` (a single file, updated +in place — versioned via the publish snapshot, the same way README.md works). This adapter imports ``gaia.eval.release_scorecard`` (core generator) but never imports the eval harness (``gaia.eval.benchmark``) or the email-agent package — @@ -15,9 +16,13 @@ Usage:: + PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\ + GAIA_AGENT_TOOL_TIMEOUT=120 \\ + PYTHONPATH="$(pwd)" \\ python hub/agents/python/email/packaging/gen_scorecard.py \\ --benchmark-dir /tmp/email-eval \\ - [--ground-truth tests/fixtures/email/ground_truth.json] + [--ground-truth tests/fixtures/email/ground_truth.json] \\ + [--limit 25] The ``--ground-truth`` path defaults to the canonical fixture in the repository. """ @@ -42,6 +47,9 @@ # Canonical benchmark scorecard filename (written by gaia eval benchmark) _SCORECARD_FILENAME = "scorecard.json" +# Output filename: single SCORECARD.md per agent package, updated in place. +_OUTPUT_FILENAME = "SCORECARD.md" + def _find_benchmark_scorecard(benchmark_dir: Path) -> Path: """Locate the benchmark scorecard JSON in ``benchmark_dir``. 
@@ -204,6 +212,30 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None): import datetime + # Construct an exact reproduction command using the supplied arguments, so any + # reader can reproduce the scorecard result from scratch. + limit_flag = f" --limit {limit}" if limit is not None else "" + ground_truth_rel = ( + str(ground_truth_path.relative_to(_REPO_ROOT)) + if str(ground_truth_path).startswith(str(_REPO_ROOT)) + else ground_truth_path.name + ) + benchmark_dir_display = str(benchmark_dir) + reproduction_command = ( + "# Step 1: run the benchmark (requires a running Lemonade Server on :13305)\n" + f"PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n" + f"GAIA_AGENT_TOOL_TIMEOUT=120 \\\n" + f"PYTHONPATH=\"$(pwd)\" \\\n" + f"gaia eval benchmark{limit_flag}\n\n" + "# Step 2: generate the scorecard from the benchmark output\n" + f"PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n" + f"PYTHONPATH=\"$(pwd)\" \\\n" + f"python hub/agents/python/email/packaging/gen_scorecard.py \\\n" + f" --benchmark-dir {benchmark_dir_display} \\\n" + f" --ground-truth {ground_truth_rel}" + + (f" \\\n --limit {limit}" if limit is not None else "") + ) + return ResultPayload( agent_name="Email Triage", agent_version=version, @@ -227,11 +259,7 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None): "corpus": "tests/fixtures/email/synthetic_inbox.mbox", # Store a repo-relative path — never leak a local absolute path into # a committed/published artifact. 
- "ground_truth": ( - str(ground_truth_path.relative_to(_REPO_ROOT)) - if str(ground_truth_path).startswith(str(_REPO_ROOT)) - else ground_truth_path.name - ), + "ground_truth": ground_truth_rel, "limit": limit, }, test_cases_run=test_cases_run, @@ -239,6 +267,7 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None): aggregate_name="weighted_accuracy", generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), inherited_from=None, + reproduction_command=reproduction_command, ) @@ -268,7 +297,7 @@ def main(argv=None) -> int: default=None, help=( "Override the scorecard output directory " - "(default: hub/agents/npm/agent-email/scorecards/)." + f"(default: hub/agents/npm/agent-email/, writes {_OUTPUT_FILENAME})." ), ) parser.add_argument( @@ -296,12 +325,12 @@ def main(argv=None) -> int: from gaia.eval.release_scorecard import write_scorecard if args.output_dir: - scorecards_dir = Path(args.output_dir) + out_dir = Path(args.output_dir) else: - scorecards_dir = _NPM_ROOT / "scorecards" + out_dir = _NPM_ROOT - scorecards_dir.mkdir(parents=True, exist_ok=True) - out_path = scorecards_dir / f"{payload.agent_version}.md" + out_dir.mkdir(parents=True, exist_ok=True) + out_path = out_dir / _OUTPUT_FILENAME write_scorecard(payload, out_path) print( diff --git a/src/gaia/eval/release_scorecard.py b/src/gaia/eval/release_scorecard.py index d50449f23..46111455e 100644 --- a/src/gaia/eval/release_scorecard.py +++ b/src/gaia/eval/release_scorecard.py @@ -1,12 +1,18 @@ # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT """ -Per-agent / per-version eval scorecard: generator, parser, validator, and versioning helpers. +Per-agent eval scorecard: generator, parser, validator, and versioning helpers. **Distinct from** ``src/gaia/eval/scorecard.py`` — that module is the per-eval-run scenario PASS/FAIL aggregator (``build_scorecard``). 
This module produces the -outward-facing *release artifact*: a versioned Markdown file with YAML front matter -holding measured accuracy metrics, the eval recipe, and a deterministic aggregate score. +outward-facing *release artifact*: a single ``SCORECARD.md`` file (updated in +place per release, versioned via the publish snapshot — the same way README.md +works) with YAML front matter holding measured accuracy metrics, the eval recipe, +a deterministic aggregate score, and a Reproduction section. + +Storage convention: ``<agent-package>/SCORECARD.md`` (NOT ``scorecards/<version>.md``). +Per-version uniqueness comes from the publish snapshot in R2 (the hub stores every +doc per version at ``agents/<id>/<version>/SCORECARD.md``). Intentionally harness-agnostic: this module imports ONLY stdlib + PyYAML. No other loader is permitted — ``yaml.safe_load`` only. @@ -63,6 +69,9 @@ class ResultPayload: generated_at: ISO-8601 timestamp string; informational only. inherited_from: If this is a patch carry-forward, the prior version string; otherwise None. + reproduction_command: Optional exact shell command(s) to reproduce this + scorecard run. Rendered in the ``## Reproduction`` section. If None, + a generic pointer to the docs/skill is rendered instead. """ agent_name: str @@ -77,6 +86,7 @@ class ResultPayload: aggregate_name: str = "weighted_accuracy" generated_at: str = "" inherited_from: Optional[str] = None + reproduction_command: Optional[str] = None def compute_aggregate(metrics: list) -> tuple: @@ -121,7 +131,8 @@ def render_scorecard(payload: ResultPayload) -> str: """Render a scorecard as Markdown with YAML front matter. The front matter is machine-readable; the body is a human-readable summary - that includes the aggregate formula and a worked recomputation example. + that includes the aggregate formula, a worked recomputation example, and a + Reproduction section so any reader can reproduce the result from scratch. Args: payload: Populated :class:`ResultPayload`. 
@@ -182,6 +193,22 @@ def render_scorecard(payload: ResultPayload) -> str: total_w = sum(c["weight"] for c in components) worked = " + ".join(f"({c['value']:.4f} × {c['weight']:.1f})" for c in components) + # Reproduction section + if payload.reproduction_command: + repro_body = ( + "Run the following commands from the repository root:\n\n" + f"```sh\n{payload.reproduction_command}\n```\n\n" + "See [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) " + "and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) " + "for the full setup guide." + ) + else: + repro_body = ( + "See the [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) " + "and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) " + "for the full reproduction recipe." + ) + body = f"""# {payload.agent_name} — Eval Scorecard v{payload.agent_version} **Aggregate score: {agg_value}** (out of 100) @@ -212,6 +239,10 @@ def render_scorecard(payload: ResultPayload) -> str: A reader can reproduce this value from the `aggregate.components` in the front matter alone — no eval-harness access needed. + +## Reproduction + +{repro_body} """ if payload.inherited_from: @@ -372,66 +403,19 @@ def _assert_valid_version(version: str) -> None: ) -def _assert_safe_path(scorecards_dir: Path, version: str) -> Path: - """Return ``scorecards_dir / f"{version}.md"`` after path-traversal guard.""" - _assert_valid_version(version) - scorecards_dir = scorecards_dir.resolve() - candidate = (scorecards_dir / f"{version}.md").resolve() - if not str(candidate).startswith(str(scorecards_dir)): - raise ValueError( - f"Resolved scorecard path {candidate} is not inside " - f"scorecards dir {scorecards_dir} — possible path traversal." - ) - return candidate - - -def latest_version_below(scorecards_dir: Path, version: str) -> Optional[str]: - """Return the greatest version in ``scorecards_dir`` strictly less than ``version``. 
- - Only files whose stem matches the anchored semver regex ``^\\d+\\.\\d+\\.\\d+$`` - are considered. Non-matching filenames (README.md, .gitkeep, etc.) are silently - skipped. - - Args: - scorecards_dir: Directory to scan for ``*.md`` scorecards. - version: The candidate version string (must be valid semver). - - Returns: - The greatest matching version string strictly below ``version``, or ``None`` - if no such version exists. - - Raises: - ValueError: If ``version`` is not a valid semver string. - """ - _assert_valid_version(version) - target_tuple = _semver_tuple(version) - scorecards_dir = Path(scorecards_dir) - - candidates: list[tuple] = [] - if scorecards_dir.is_dir(): - for p in scorecards_dir.glob("*.md"): - m = _SEMVER_RE.match(p.stem) - if not m: - continue # silently skip non-semver filenames - t = (int(m.group(1)), int(m.group(2)), int(m.group(3))) - if t < target_tuple: - candidates.append(t) - - if not candidates: - return None - - best = max(candidates) - return f"{best[0]}.{best[1]}.{best[2]}" - +def carry_forward(prev_scorecard_path: Path, new_version: str) -> ResultPayload: + """Carry forward a prior SCORECARD.md's results to a new patch version. -def carry_forward(prev_path: Path, new_version: str) -> ResultPayload: - """Carry forward a prior scorecard's results to a new patch version. + Reads the single ``SCORECARD.md`` (the agent's one scorecard file, updated + in place per release), copies all results verbatim, and sets + ``inherited_from`` to the prior version string recorded in the front matter. - Reads the prior scorecard, copies all results verbatim, and sets - ``inherited_from`` to the prior version string. + Only patch bumps are allowed: if the prior scorecard's ``agent.version`` + differs in major or minor from ``new_version``, the caller must re-run the + eval to generate fresh results. Args: - prev_path: Path to the prior version's scorecard ``.md`` file. + prev_scorecard_path: Path to the prior ``SCORECARD.md`` file. 
new_version: The new version string (must be a patch bump of the prior). Returns: @@ -444,8 +428,18 @@ def carry_forward(prev_path: Path, new_version: str) -> ResultPayload: ValueError: If the prior scorecard cannot be parsed. """ _assert_valid_version(new_version) - prev_path = Path(prev_path) - prev_version = prev_path.stem # e.g. "0.2.3" from "0.2.3.md" + prev_scorecard_path = Path(prev_scorecard_path) + + parsed = parse_scorecard(prev_scorecard_path) + + # Extract prior version from front matter (agent.version) + agent = parsed.get("agent", {}) + prev_version = str(agent.get("version", "")) + if not prev_version: + raise ValueError( + f"Cannot read prior version from {prev_scorecard_path}: " + "missing 'agent.version' field in front matter." + ) prev_tuple = _semver_tuple(prev_version) new_tuple = _semver_tuple(new_version) @@ -458,10 +452,7 @@ def carry_forward(prev_path: Path, new_version: str) -> ResultPayload: f"generate fresh results for this release." ) - parsed = parse_scorecard(prev_path) - # Extract fields from the parsed front matter - agent = parsed.get("agent", {}) recipe = parsed.get("recipe", {}) dataset = recipe.get("dataset", {}) results = parsed.get("results", {}) @@ -480,6 +471,6 @@ def carry_forward(prev_path: Path, new_version: str) -> ResultPayload: test_cases_run=results.get("test_cases_run", 0), metrics=metrics_raw, aggregate_name=parsed.get("aggregate", {}).get("name", "weighted_accuracy"), - generated_at=datetime.datetime.utcnow().isoformat(), + generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), inherited_from=prev_version, ) diff --git a/src/gaia/eval/scorecard_gate.py b/src/gaia/eval/scorecard_gate.py index 49c292561..4328511c4 100644 --- a/src/gaia/eval/scorecard_gate.py +++ b/src/gaia/eval/scorecard_gate.py @@ -1,26 +1,36 @@ # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. 
# SPDX-License-Identifier: MIT """ -Standalone release gate: blocks packaging when the candidate scorecard is missing -or when its aggregate score strictly regressed below the prior version's. +Standalone release gate: blocks packaging when the candidate SCORECARD.md is +missing, invalid, or when its aggregate score strictly regressed below the prior +version's. **Distinct from** ``src/gaia/eval/scorecard.py`` — that module aggregates per-run scenario PASS/FAIL for internal CI. This gate checks the *outward-facing* release artifact produced by ``release_scorecard.py``. +Storage convention: one ``SCORECARD.md`` per agent package (updated in place, +versioned via the publish snapshot — the same way README.md works). + Usage:: + # Presence-only (first adoption): + python -m gaia.eval.scorecard_gate \\ + --scorecard hub/agents/npm/agent-email/SCORECARD.md + + # With a baseline from a file (unit tests): python -m gaia.eval.scorecard_gate \\ - --scorecards-dir hub/agents/npm/agent-email/scorecards \\ - --manifest hub/agents/python/email/gaia-agent.yaml + --scorecard hub/agents/npm/agent-email/SCORECARD.md \\ + --baseline-file /tmp/prev-SCORECARD.md + # With a baseline resolved from a git ref (CI): python -m gaia.eval.scorecard_gate \\ - --scorecards-dir hub/agents/npm/agent-email/scorecards \\ - --version 0.2.4 + --scorecard hub/agents/npm/agent-email/SCORECARD.md \\ + --baseline-ref agent-pkg-email-v0.2.3 Exit codes: 0 — Passed (presence-only first adoption, equal score, or score improved). - 1 — Failed (missing/invalid candidate card, strict regression, or prior card invalid). + 1 — Failed (missing/invalid candidate, strict regression, invalid baseline). The ``--allow-regression`` flag overrides a regression: prints a ``::warning::`` GHA annotation and both version/score pairs, then exits 0. 
@@ -29,45 +39,69 @@ from __future__ import annotations import argparse +import subprocess import sys from pathlib import Path -import yaml - from gaia.eval.release_scorecard import ( - _assert_safe_path, - latest_version_below, parse_scorecard, validate_scorecard, ) -def _read_version_from_manifest(manifest_path: Path) -> str: - """Read the ``version:`` field from a ``gaia-agent.yaml`` manifest. +def _parse_baseline_ref(scorecard_path: Path, ref: str) -> str | None: + """Resolve ``<ref>:<path>`` via ``git show`` and return the content. - Args: - manifest_path: Path to the YAML manifest file. + The path used in the git command is the path of ``scorecard_path`` relative + to the repository root (discovered by ``git rev-parse --show-toplevel``). - Returns: - The version string. + Returns the file content as a string, or None if the file does not exist at + that ref (treated as first adoption — presence-only pass). Raises: - ValueError: If the file cannot be read or ``version:`` is absent. + ValueError: If ``git`` cannot be called or the ref is otherwise invalid + (the caller treats this as an actionable error, not first adoption). """ + # Discover repo root so we can form a root-relative path for git show. + try: + result = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + capture_output=True, + text=True, + check=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError) as exc: + raise ValueError( + f"Cannot determine git repository root: {exc}. " + "Run from inside a git repository, or use --baseline-file instead." + ) from exc + + repo_root = Path(result.stdout.strip()) + scorecard_path = Path(scorecard_path).resolve() try: - text = manifest_path.read_text(encoding="utf-8") - except OSError as exc: - raise ValueError(f"Cannot read manifest {manifest_path}: {exc}") from exc + rel = scorecard_path.relative_to(repo_root) + except ValueError: + raise ValueError( + f"SCORECARD path {scorecard_path} is not inside the git repo root " + f"{repo_root}. 
Use an absolute path inside the repo, or use " + "--baseline-file instead." + ) + git_path = rel.as_posix() try: - data = yaml.safe_load(text) or {} - except yaml.YAMLError as exc: - raise ValueError(f"Invalid YAML in manifest {manifest_path}: {exc}") from exc + result = subprocess.run( + ["git", "show", f"{ref}:{git_path}"], + capture_output=True, + text=True, + ) + except FileNotFoundError as exc: + raise ValueError(f"git not found: {exc}") from exc - version = data.get("version") - if not version: - raise ValueError(f"Manifest {manifest_path} has no 'version:' field.") - return str(version) + if result.returncode != 0: + # File absent at that ref → first adoption (presence-only pass). + return None + + return result.stdout def main(argv=None) -> int: @@ -81,24 +115,31 @@ def main(argv=None) -> int: """ parser = argparse.ArgumentParser( description=( - "Release gate: ensures a valid scorecard exists for the candidate version " - "and that its aggregate score has not strictly regressed vs the prior version." + "Release gate: ensures a valid SCORECARD.md exists and that its " + "aggregate score has not strictly regressed vs the prior version." ), prog="python -m gaia.eval.scorecard_gate", ) parser.add_argument( - "--scorecards-dir", - required=False, - help="Directory containing per-version scorecard .md files.", + "--scorecard", + required=True, + help="Path to the candidate SCORECARD.md (e.g. hub/agents/npm/agent-email/SCORECARD.md).", ) - version_group = parser.add_mutually_exclusive_group() - version_group.add_argument( - "--version", - help="Candidate version string (e.g. 0.2.4).", + baseline_group = parser.add_mutually_exclusive_group() + baseline_group.add_argument( + "--baseline-file", + help=( + "Path to the prior version's SCORECARD.md for regression comparison " + "(for unit tests; no git access needed)." 
+ ), ) - version_group.add_argument( - "--manifest", - help="Path to gaia-agent.yaml; the 'version:' field is used as the candidate version.", + baseline_group.add_argument( + "--baseline-ref", + help=( + "Git ref (tag or commit) of the prior release to use as baseline. " + "Resolves via 'git show :'. If the file does not " + "exist at that ref, a presence-only pass is applied (first adoption)." + ), ) parser.add_argument( "--allow-regression", @@ -115,102 +156,99 @@ def main(argv=None) -> int: except SystemExit: return 1 - # Validate required arguments - if not args.scorecards_dir: - print( - "ERROR: --scorecards-dir is required.\n" - "Usage: python -m gaia.eval.scorecard_gate --scorecards-dir DIR " - "--version V (or --manifest PATH)" - ) - return 1 - - if not args.version and not args.manifest: - print( - "ERROR: Either --version or --manifest is required.\n" - "Usage: python -m gaia.eval.scorecard_gate --scorecards-dir DIR " - "--version V (or --manifest PATH)" - ) - return 1 - - scorecards_dir = Path(args.scorecards_dir) - - # Resolve the candidate version - if args.manifest: - try: - version = _read_version_from_manifest(Path(args.manifest)) - except ValueError as exc: - print(f"ERROR: {exc}") - return 1 - else: - version = args.version + candidate_path = Path(args.scorecard) # --- Step 1: Presence check --- - try: - candidate_path = _assert_safe_path(scorecards_dir, version) - except ValueError as exc: - print(f"ERROR: {exc}") - return 1 - if not candidate_path.exists(): print( - f"ERROR: Scorecard missing for version {version}.\n" - f" Expected: {candidate_path}\n" + f"ERROR: SCORECARD.md missing at {candidate_path}.\n" f" Run 'python gen_scorecard.py' (or 'carry_forward') to generate it, " - f"then commit the file before releasing." 
+ f"then commit the file before releasing.\n" + f" See https://amd-gaia.ai/docs/reference/eval-scorecard and " + f".claude/skills/adding-eval-scorecard/SKILL.md" ) return 1 try: candidate_parsed = parse_scorecard(candidate_path) except ValueError as exc: - print(f"ERROR: Cannot parse candidate scorecard {candidate_path}: {exc}") + print(f"ERROR: Cannot parse candidate SCORECARD.md at {candidate_path}: {exc}") return 1 errors = validate_scorecard(candidate_parsed) if errors: print( - f"ERROR: Candidate scorecard {candidate_path} is invalid:\n" + f"ERROR: Candidate SCORECARD.md at {candidate_path} is invalid:\n" + "\n".join(f" - {e}" for e in errors) ) return 1 - # --- Step 2: Locate prior version --- - try: - prev_version = latest_version_below(scorecards_dir, version) - except ValueError as exc: - print(f"ERROR: {exc}") - return 1 + # --- Step 2: Resolve baseline --- + baseline_text: str | None = None - if prev_version is None: + if args.baseline_file: + baseline_path = Path(args.baseline_file) + if not baseline_path.exists(): + print( + f"ERROR: --baseline-file not found: {baseline_path}.\n" + f" Provide a valid path to a prior SCORECARD.md, or omit --baseline-file " + f"for a presence-only pass." + ) + return 1 + try: + baseline_text = baseline_path.read_text(encoding="utf-8") + except OSError as exc: + print(f"ERROR: Cannot read --baseline-file {baseline_path}: {exc}") + return 1 + + elif args.baseline_ref: + try: + baseline_text = _parse_baseline_ref(candidate_path, args.baseline_ref) + except ValueError as exc: + print(f"ERROR: {exc}") + return 1 + # None means the file doesn't exist at that ref → first adoption + if baseline_text is None: + print( + f"PASS: No SCORECARD.md found at ref '{args.baseline_ref}'. " + f"First adoption — presence check only." + ) + return 0 + + if baseline_text is None: + # No baseline specified at all → presence-only pass. 
+ candidate_version = candidate_parsed.get("agent", {}).get("version", "?") + candidate_score = candidate_parsed.get("aggregate", {}).get("value") + if candidate_score is None: + print( + f"ERROR: Candidate SCORECARD.md at {candidate_path} has no " + f"'aggregate.value' field.\n" + f" Fix the scorecard front matter before releasing." + ) + return 1 print( - f"PASS: No prior scorecard found for versions below {version}. " - f"First adoption — presence check only." + f"PASS: No baseline provided. Presence check only.\n" + f" Candidate v{candidate_version}: aggregate.value = {candidate_score}" ) return 0 - # --- Step 3: Parse prior and regression check --- + # --- Step 3: Parse baseline and regression check --- try: - prev_path = _assert_safe_path(scorecards_dir, prev_version) - except ValueError as exc: - print(f"ERROR: {exc}") - return 1 - - try: - prev_parsed = parse_scorecard(prev_path) + prev_parsed = parse_scorecard(baseline_text) except ValueError as exc: print( - f"ERROR: Cannot parse prior scorecard {prev_path}: {exc}\n" - f" The prior scorecard is corrupt or missing a valid front matter. " - f"Fix it before releasing {version}." + f"ERROR: Cannot parse baseline SCORECARD.md: {exc}\n" + f" The baseline is corrupt or missing a valid front matter. " + f"Fix it before releasing." ) return 1 prev_errors = validate_scorecard(prev_parsed) if prev_errors: print( - f"ERROR: Prior scorecard {prev_path} is invalid:\n" + f"ERROR: Baseline SCORECARD.md is invalid:\n" + "\n".join(f" - {e}" for e in prev_errors) - + f"\n Fix the prior scorecard before releasing {version}." + + f"\n Fix the baseline scorecard before releasing." ) return 1 @@ -219,32 +257,40 @@ def main(argv=None) -> int: if candidate_score is None: print( - f"ERROR: Candidate scorecard {candidate_path} has no 'aggregate.value' field." + f"ERROR: Candidate SCORECARD.md at {candidate_path} has no " + f"'aggregate.value' field.\n" + f" Fix the scorecard front matter before releasing." 
) return 1 if prev_score is None: - print(f"ERROR: Prior scorecard {prev_path} has no 'aggregate.value' field.") + print( + f"ERROR: Baseline SCORECARD.md has no 'aggregate.value' field.\n" + f" Fix the baseline scorecard before releasing." + ) return 1 + candidate_version = candidate_parsed.get("agent", {}).get("version", "?") + prev_version = prev_parsed.get("agent", {}).get("version", "?") + if float(candidate_score) < float(prev_score): # Strict regression detected if args.allow_regression: print( f"::warning::Scorecard regression allowed by --allow-regression: " - f"{prev_version}={prev_score} → {version}={candidate_score}" + f"v{prev_version}={prev_score} → v{candidate_version}={candidate_score}" ) print( f"WARNING: Regression override active. " - f"Prior version {prev_version} scored {prev_score}; " - f"candidate {version} scored {candidate_score}. " + f"Prior version v{prev_version} scored {prev_score}; " + f"candidate v{candidate_version} scored {candidate_score}. " f"This regression has been explicitly acknowledged." ) return 0 print( f"ERROR: Scorecard regression detected.\n" - f" Prior version {prev_version}: aggregate.value = {prev_score}\n" - f" Candidate {version}: aggregate.value = {candidate_score}\n" + f" Prior version v{prev_version}: aggregate.value = {prev_score}\n" + f" Candidate v{candidate_version}: aggregate.value = {candidate_score}\n" f" The candidate score is strictly lower than the prior. " f"Investigate the regression or use --allow-regression to override intentionally." 
) @@ -252,8 +298,8 @@ def main(argv=None) -> int: print( f"PASS: Scorecard gate passed.\n" - f" Candidate {version}: aggregate.value = {candidate_score} " - f"(prior {prev_version}: {prev_score})" + f" Candidate v{candidate_version}: aggregate.value = {candidate_score} " + f"(prior v{prev_version}: {prev_score})" ) return 0 diff --git a/tests/unit/eval/test_release_scorecard.py b/tests/unit/eval/test_release_scorecard.py index c542ae13e..d13abdc1e 100644 --- a/tests/unit/eval/test_release_scorecard.py +++ b/tests/unit/eval/test_release_scorecard.py @@ -14,7 +14,6 @@ ResultPayload, carry_forward, compute_aggregate, - latest_version_below, parse_scorecard, render_scorecard, validate_scorecard, @@ -42,7 +41,7 @@ def _make_payload(version="1.0.0", accuracy=0.5): test_cases_run=10, metrics=metrics, aggregate_name="weighted_accuracy", - generated_at=datetime.datetime.utcnow().isoformat(), + generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), inherited_from=None, ) @@ -202,6 +201,24 @@ def test_parse_recovers_all_required_fields(self): errors = validate_scorecard(parsed) assert errors == [] + def test_body_contains_reproduction_section(self): + payload = _make_payload() + text = render_scorecard(payload) + assert "## Reproduction" in text + + def test_reproduction_section_includes_custom_command(self): + payload = _make_payload() + payload.reproduction_command = "gaia eval benchmark --limit 25" + text = render_scorecard(payload) + assert "gaia eval benchmark --limit 25" in text + + def test_reproduction_section_generic_when_no_command(self): + payload = _make_payload() + # No reproduction_command (default None) + text = render_scorecard(payload) + assert "## Reproduction" in text + assert "eval-scorecard" in text + # --------------------------------------------------------------------------- # 4. 
Two counts distinct as separate fields @@ -278,14 +295,14 @@ def test_body_non_empty(self): # --------------------------------------------------------------------------- -# 7. Versioning — patch carry-forward +# 7. Versioning — patch carry-forward (SCORECARD.md is a single file) # --------------------------------------------------------------------------- class TestCarryForwardPatch: def test_carry_forward_sets_inherited_from(self, tmp_path): src = _make_payload(version="0.2.3", accuracy=0.75) - card_path = tmp_path / "0.2.3.md" + card_path = tmp_path / "SCORECARD.md" card_path.write_text(render_scorecard(src)) result = carry_forward(card_path, "0.2.4") @@ -293,12 +310,23 @@ def test_carry_forward_sets_inherited_from(self, tmp_path): def test_carry_forward_copies_metrics_verbatim(self, tmp_path): src = _make_payload(version="0.2.3", accuracy=0.75) - card_path = tmp_path / "0.2.3.md" + card_path = tmp_path / "SCORECARD.md" card_path.write_text(render_scorecard(src)) result = carry_forward(card_path, "0.2.4") assert result.metrics == src.metrics + def test_carry_forward_reads_version_from_front_matter(self, tmp_path): + # The new carry_forward reads agent.version from front matter, NOT filename. + src = _make_payload(version="0.2.3", accuracy=0.75) + # Use a different filename to confirm it's not read from stem + card_path = tmp_path / "SCORECARD.md" + card_path.write_text(render_scorecard(src)) + + result = carry_forward(card_path, "0.2.4") + assert result.agent_version == "0.2.4" + assert result.inherited_from == "0.2.3" + # --------------------------------------------------------------------------- # 8. 
Versioning — minor bump refuses @@ -308,7 +336,7 @@ def test_carry_forward_copies_metrics_verbatim(self, tmp_path): class TestCarryForwardMinorBumpRefuses: def test_minor_bump_raises_value_error(self, tmp_path): src = _make_payload(version="0.2.3", accuracy=0.75) - card_path = tmp_path / "0.2.3.md" + card_path = tmp_path / "SCORECARD.md" card_path.write_text(render_scorecard(src)) with pytest.raises(ValueError, match="re-run"): @@ -316,7 +344,7 @@ def test_minor_bump_raises_value_error(self, tmp_path): def test_major_bump_raises_value_error(self, tmp_path): src = _make_payload(version="0.2.3", accuracy=0.75) - card_path = tmp_path / "0.2.3.md" + card_path = tmp_path / "SCORECARD.md" card_path.write_text(render_scorecard(src)) with pytest.raises(ValueError, match="re-run"): @@ -343,42 +371,48 @@ def test_rendered_parsed_inherited_from_null_or_absent(self): # --------------------------------------------------------------------------- -# 10. latest_version_below +# 10. Gate integration: second-agent generalization (no fabricated artifacts) # --------------------------------------------------------------------------- -class TestLatestVersionBelow: - def _seed_dir(self, tmp_path): - for name in ( - "0.1.0.md", - "0.2.3.md", - "0.10.0.md", - "README.md", - "not-a-version.md", - ): - (tmp_path / name).write_text("# placeholder") - return tmp_path - - def test_returns_closest_below(self, tmp_path): - self._seed_dir(tmp_path) - result = latest_version_below(tmp_path, "0.2.4") - assert result == "0.2.3" - - def test_none_when_nothing_below(self, tmp_path): - self._seed_dir(tmp_path) - result = latest_version_below(tmp_path, "0.1.0") - assert result is None - - def test_integer_comparison_not_string(self, tmp_path): - self._seed_dir(tmp_path) - result = latest_version_below(tmp_path, "0.10.1") - assert result == "0.10.0" - - def test_non_version_files_silently_skipped(self, tmp_path): - self._seed_dir(tmp_path) - # Should not raise even with README.md and not-a-version.md 
present - result = latest_version_below(tmp_path, "0.2.4") - assert result == "0.2.3" +class TestSecondAgentGeneralization: + """Prove the generator + gate work for an agent OTHER than email-triage.""" + + def test_second_agent_scorecard_validates_and_gate_passes(self, tmp_path): + from gaia.eval.scorecard_gate import main as gate_main + + # Build a ResultPayload for a different agent + metrics = [{"name": "accuracy", "value": 0.75, "weight": 1.0}] + payload = ResultPayload( + agent_name="Hello World Agent", + agent_version="0.1.0", + dataset_reference="tests/fixtures/hello/ground_truth.json", + dataset_description="Hello world evaluation dataset", + dataset_size=50, + methodology="exact match accuracy", + config={"model": "Gemma-4-E4B-it-GGUF", "limit": 20}, + test_cases_run=20, + metrics=metrics, + aggregate_name="weighted_accuracy", + generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), + inherited_from=None, + reproduction_command="gaia eval agent --category hello", + ) + + scorecard_path = tmp_path / "SCORECARD.md" + from gaia.eval.release_scorecard import write_scorecard + + write_scorecard(payload, scorecard_path) + + # Validate the written scorecard + text = scorecard_path.read_text() + parsed = parse_scorecard(text) + errors = validate_scorecard(parsed) + assert errors == [], f"Second-agent scorecard should be valid, got: {errors}" + + # Gate should pass (no baseline → presence-only) + result = gate_main(["--scorecard", str(scorecard_path)]) + assert result == 0, "Gate should pass for a valid second-agent SCORECARD.md" # --------------------------------------------------------------------------- @@ -500,3 +534,25 @@ def test_all_no_quality_raises(self, tmp_path): with pytest.raises(ValueError): mod.build_payload(benchmark_dir, gt_path) + + def test_build_payload_includes_reproduction_command(self, tmp_path): + mod = self._load_gen_scorecard() + + benchmark_dir = tmp_path / "benchmark" + benchmark_dir.mkdir() + scorecard_dest = 
benchmark_dir / "email_benchmark_scorecard.json" + scorecard_dest.write_text(EMAIL_BENCHMARK_FIXTURE.read_text()) + + ground_truth = { + "_meta": {"count": 3}, + "email1": {"label": "spam"}, + "email2": {"label": "promo"}, + } + gt_path = tmp_path / "ground_truth.json" + gt_path.write_text(json.dumps(ground_truth)) + + payload = mod.build_payload(benchmark_dir, gt_path, limit=25) + assert payload.reproduction_command is not None + assert "gaia eval benchmark" in payload.reproduction_command + assert "gen_scorecard.py" in payload.reproduction_command + assert "PYTHON_KEYRING_BACKEND" in payload.reproduction_command diff --git a/tests/unit/eval/test_scorecard_gate.py b/tests/unit/eval/test_scorecard_gate.py index 28ab269d9..32424f97a 100644 --- a/tests/unit/eval/test_scorecard_gate.py +++ b/tests/unit/eval/test_scorecard_gate.py @@ -1,6 +1,6 @@ # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -"""TDD tests for gaia.eval.scorecard_gate — written before implementation exists.""" +"""TDD tests for gaia.eval.scorecard_gate — new single-file SCORECARD.md interface.""" import datetime from pathlib import Path @@ -33,14 +33,22 @@ def _make_payload(version="1.0.0", accuracy=0.5): test_cases_run=10, metrics=metrics, aggregate_name="weighted_accuracy", - generated_at=datetime.datetime.utcnow().isoformat(), + generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), inherited_from=None, ) def _write_card(directory: Path, version: str, accuracy: float) -> Path: + """Write a valid SCORECARD.md to directory/SCORECARD.md.""" + payload = _make_payload(version=version, accuracy=accuracy) + path = directory / "SCORECARD.md" + path.write_text(render_scorecard(payload)) + return path + + +def _write_card_named(path: Path, version: str, accuracy: float) -> Path: + """Write a valid SCORECARD.md to an explicit path.""" payload = _make_payload(version=version, accuracy=accuracy) - path = directory / f"{version}.md" 
path.write_text(render_scorecard(payload)) return path @@ -52,45 +60,77 @@ def _write_card(directory: Path, version: str, accuracy: float) -> Path: class TestMissingCard: def test_missing_card_returns_1(self, tmp_path): - result = main(["--scorecards-dir", str(tmp_path), "--version", "1.0.0"]) + scorecard = tmp_path / "SCORECARD.md" + result = main(["--scorecard", str(scorecard)]) assert result == 1 # --------------------------------------------------------------------------- -# Case (b) — strict regression → exit 1 +# Case (b) — strict regression with --baseline-file → exit 1 # --------------------------------------------------------------------------- class TestStrictRegression: def test_regression_returns_1(self, tmp_path): - _write_card(tmp_path, "0.2.3", accuracy=0.8) - _write_card(tmp_path, "0.2.4", accuracy=0.5) - result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"]) + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.8) + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5) + + result = main(["--scorecard", str(candidate), "--baseline-file", str(baseline)]) assert result == 1 # --------------------------------------------------------------------------- -# Case (c) — no prior → exit 0 +# Case (c) — no baseline → presence-only pass → exit 0 # --------------------------------------------------------------------------- class TestNoPrior: def test_first_adoption_returns_0(self, tmp_path): - _write_card(tmp_path, "1.0.0", accuracy=0.6) - result = main(["--scorecards-dir", str(tmp_path), "--version", "1.0.0"]) + candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6) + result = main(["--scorecard", str(candidate)]) assert result == 0 # --------------------------------------------------------------------------- -# Case (d) — equal score (carry-forward) → exit 0 +# Case (d) — equal score (carry-forward) 
with --baseline-file → exit 0 # --------------------------------------------------------------------------- class TestEqualScore: def test_equal_score_returns_0(self, tmp_path): - _write_card(tmp_path, "0.2.3", accuracy=0.5) - _write_card(tmp_path, "0.2.4", accuracy=0.5) - result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"]) + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.5) + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5) + + result = main(["--scorecard", str(candidate), "--baseline-file", str(baseline)]) + assert result == 0 + + +# --------------------------------------------------------------------------- +# Case (e) — improved score → exit 0 +# --------------------------------------------------------------------------- + + +class TestImprovedScore: + def test_improved_score_returns_0(self, tmp_path): + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.5) + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.8) + + result = main(["--scorecard", str(candidate), "--baseline-file", str(baseline)]) assert result == 0 @@ -101,28 +141,36 @@ def test_equal_score_returns_0(self, tmp_path): class TestAllowRegression: def test_allow_regression_flag_returns_0(self, tmp_path): - _write_card(tmp_path, "0.2.3", accuracy=0.8) - _write_card(tmp_path, "0.2.4", accuracy=0.5) + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.8) + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5) + result = main( [ - "--scorecards-dir", - str(tmp_path), - "--version", - "0.2.4", + "--scorecard", str(candidate), + "--baseline-file", 
str(baseline), "--allow-regression", ] ) assert result == 0 def test_allow_regression_prints_warning_line(self, tmp_path, capsys): - _write_card(tmp_path, "0.2.3", accuracy=0.8) - _write_card(tmp_path, "0.2.4", accuracy=0.5) + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.8) + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5) + main( [ - "--scorecards-dir", - str(tmp_path), - "--version", - "0.2.4", + "--scorecard", str(candidate), + "--baseline-file", str(baseline), "--allow-regression", ] ) @@ -131,74 +179,71 @@ def test_allow_regression_prints_warning_line(self, tmp_path, capsys): # --------------------------------------------------------------------------- -# --manifest reads version +# --baseline-file missing → exit 1 # --------------------------------------------------------------------------- -class TestManifestFlag: - def test_manifest_reads_version(self, tmp_path): - scorecards_dir = tmp_path / "scorecards" - scorecards_dir.mkdir() - _write_card(scorecards_dir, "1.2.3", accuracy=0.6) - - manifest_path = tmp_path / "gaia-agent.yaml" - manifest_path.write_text("version: 1.2.3\nname: test-agent\n") - +class TestBaselineFileMissing: + def test_missing_baseline_file_returns_1(self, tmp_path): + candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6) result = main( [ - "--scorecards-dir", - str(scorecards_dir), - "--manifest", - str(manifest_path), + "--scorecard", str(candidate), + "--baseline-file", str(tmp_path / "nonexistent-SCORECARD.md"), ] ) - assert result == 0 + assert result == 1 - def test_manifest_with_regression(self, tmp_path): - scorecards_dir = tmp_path / "scorecards" - scorecards_dir.mkdir() - _write_card(scorecards_dir, "1.2.2", accuracy=0.9) - _write_card(scorecards_dir, "1.2.3", accuracy=0.3) - manifest_path = tmp_path / "gaia-agent.yaml" - manifest_path.write_text("version: 1.2.3\nname: 
test-agent\n") +# --------------------------------------------------------------------------- +# Invalid candidate (corrupt YAML front matter) → exit 1 +# --------------------------------------------------------------------------- - result = main( - [ - "--scorecards-dir", - str(scorecards_dir), - "--manifest", - str(manifest_path), - ] - ) + +class TestInvalidCandidate: + def test_corrupt_candidate_returns_1(self, tmp_path): + corrupt_path = tmp_path / "SCORECARD.md" + corrupt_path.write_text("this is not valid yaml front matter at all\ngarbage\n") + result = main(["--scorecard", str(corrupt_path)]) + assert result == 1 + + def test_empty_candidate_returns_1(self, tmp_path): + empty_path = tmp_path / "SCORECARD.md" + empty_path.write_text("") + result = main(["--scorecard", str(empty_path)]) assert result == 1 # --------------------------------------------------------------------------- -# Invalid prior → exit 1 +# Invalid baseline → exit 1 # --------------------------------------------------------------------------- class TestInvalidPrior: - def test_corrupt_prior_returns_1(self, tmp_path): - # Write corrupt/invalid prior card - corrupt_path = tmp_path / "0.2.3.md" - corrupt_path.write_text("this is not valid yaml front matter at all\ngarbage\n") + def test_corrupt_baseline_returns_1(self, tmp_path): + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + corrupt = baseline_dir / "SCORECARD.md" + corrupt.write_text("this is not valid yaml front matter at all\ngarbage\n") - # Write a valid candidate card - _write_card(tmp_path, "0.2.4", accuracy=0.9) + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.9) - result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"]) + result = main(["--scorecard", str(candidate), "--baseline-file", str(corrupt)]) assert result == 1 - def test_empty_prior_returns_1(self, tmp_path): - # Prior exists but is empty - empty_path = tmp_path / 
"0.2.3.md" - empty_path.write_text("") + def test_empty_baseline_returns_1(self, tmp_path): + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + empty = baseline_dir / "SCORECARD.md" + empty.write_text("") - _write_card(tmp_path, "0.2.4", accuracy=0.9) + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.9) - result = main(["--scorecards-dir", str(tmp_path), "--version", "0.2.4"]) + result = main(["--scorecard", str(candidate), "--baseline-file", str(empty)]) assert result == 1 @@ -238,10 +283,17 @@ def test_publish_job_needs_scorecard_gate(self): class TestCliErrorHandling: - def test_missing_scorecards_dir_flag_returns_1(self): - result = main(["--version", "1.0.0"]) + def test_missing_scorecard_flag_returns_1(self): + result = main([]) assert result == 1 - def test_missing_version_and_manifest_returns_1(self, tmp_path): - result = main(["--scorecards-dir", str(tmp_path)]) + def test_baseline_file_and_ref_mutually_exclusive(self, tmp_path): + candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6) + result = main( + [ + "--scorecard", str(candidate), + "--baseline-file", str(candidate), + "--baseline-ref", "v1.0.0", + ] + ) assert result == 1 From 7e0ea567580e1573b62a3dd947e0ecf370efbee9 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 13:29:54 -0400 Subject: [PATCH 14/18] refactor(eval): replace scorecards/ dirs with single SCORECARD.md per agent - hub/agents/npm/agent-email/SCORECARD.md: generated from relabeled-corpus run (placeholder; orchestrator will regenerate from full run) - hub/agents/npm/agent-email/package.json: files array includes SCORECARD.md, removes scorecards/ (don't ship all versions in the npm tarball) - hub/agents/npm/agent-email/README.md: scorecard link updated to ./SCORECARD.md - Delete hub/agents/npm/agent-email/scorecards/ (per-version dir, now obsolete) - Delete hub/agents/python/hello-world/scorecards/ (contained fabricated 90.0 
score) --- hub/agents/npm/agent-email/README.md | 2 +- .../{scorecards/0.2.4.md => SCORECARD.md} | 24 ++++++- hub/agents/npm/agent-email/package.json | 4 +- .../python/hello-world/scorecards/0.1.0.md | 62 ------------------- 4 files changed, 26 insertions(+), 66 deletions(-) rename hub/agents/npm/agent-email/{scorecards/0.2.4.md => SCORECARD.md} (70%) delete mode 100644 hub/agents/python/hello-world/scorecards/0.1.0.md diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md index c8c079b2a..92424371b 100644 --- a/hub/agents/npm/agent-email/README.md +++ b/hub/agents/npm/agent-email/README.md @@ -2,7 +2,7 @@ [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24** -**Eval scorecard (v0.2.4): aggregate 40.0 / 100** — `category_accuracy` 0.40 over 25 of 220 labeled emails ([`./scorecards/0.2.4.md`](./scorecards/0.2.4.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate. +**Eval scorecard (v0.2.4): aggregate 40.0 / 100** — `category_accuracy` 0.40 over 25 of 220 labeled emails ([`./SCORECARD.md`](./SCORECARD.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate. Embed the **GAIA email agent** in your JS/TS app. 
It triages, organizes, replies to, and schedules from Gmail and Outlook — with every email body analyzed diff --git a/hub/agents/npm/agent-email/scorecards/0.2.4.md b/hub/agents/npm/agent-email/SCORECARD.md similarity index 70% rename from hub/agents/npm/agent-email/scorecards/0.2.4.md rename to hub/agents/npm/agent-email/SCORECARD.md index 7e36786a1..b4f8ae5ca 100644 --- a/hub/agents/npm/agent-email/scorecards/0.2.4.md +++ b/hub/agents/npm/agent-email/SCORECARD.md @@ -33,7 +33,7 @@ aggregate: value: 0.4 weight: 1.0 value: 40.0 -generated_at: '2026-06-26T16:47:13.735478+00:00' +generated_at: '2026-06-26T17:29:34.631236+00:00' inherited_from: null --- # Email Triage — Eval Scorecard v0.2.4 @@ -66,3 +66,25 @@ round(100 × ((0.4000 × 1.0)) / 1.0, 2) = 40.0 A reader can reproduce this value from the `aggregate.components` in the front matter alone — no eval-harness access needed. + +## Reproduction + +Run the following commands from the repository root: + +```sh +# Step 1: run the benchmark (requires a running Lemonade Server on :13305) +PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \ +GAIA_AGENT_TOOL_TIMEOUT=120 \ +PYTHONPATH="$(pwd)" \ +gaia eval benchmark --limit 25 --output-dir eval-out + +# Step 2: generate the scorecard from the benchmark output +PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \ +PYTHONPATH="$(pwd)" \ +python hub/agents/python/email/packaging/gen_scorecard.py \ + --benchmark-dir eval-out \ + --ground-truth tests/fixtures/email/ground_truth.json \ + --limit 25 +``` + +See [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) for the full setup guide. 
diff --git a/hub/agents/npm/agent-email/package.json b/hub/agents/npm/agent-email/package.json index 426d163e8..115483bc0 100644 --- a/hub/agents/npm/agent-email/package.json +++ b/hub/agents/npm/agent-email/package.json @@ -48,8 +48,8 @@ "CHANGELOG.md", "SPEC.md", "SKILL.md", - "LICENSE", - "scorecards/" + "SCORECARD.md", + "LICENSE" ], "engines": { "node": ">=18" diff --git a/hub/agents/python/hello-world/scorecards/0.1.0.md b/hub/agents/python/hello-world/scorecards/0.1.0.md deleted file mode 100644 index fc6121f2e..000000000 --- a/hub/agents/python/hello-world/scorecards/0.1.0.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -schema_version: 1 -agent: - name: Hello World - version: 0.1.0 -recipe: - dataset: - reference: hub/agents/python/hello-world/tests - description: Illustrative conversational response dataset (reference agent) - size: 10 - methodology: Illustrative metric — reference agent for scorecard format generalization - config: - harness: gaia eval agent - model: Gemma-4-E4B-it-GGUF - limit: 10 -results: - test_cases_run: 10 - metrics: - - name: response_quality - value: 0.9 - weight: 1.0 -aggregate: - name: weighted_accuracy - formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2) - components: - - metric: response_quality - value: 0.9 - weight: 1.0 - value: 90.0 -generated_at: '2026-06-25T12:00:00+00:00' -inherited_from: null ---- -# Hello World — Eval Scorecard v0.1.0 - -**Aggregate score: 90.0** (out of 100) - -## Recipe - -| Field | Value | -|-------|-------| -| Dataset | [hub/agents/python/hello-world/tests](hub/agents/python/hello-world/tests) | -| Description | Illustrative conversational response dataset (reference agent) | -| Dataset size | 10 labeled examples | -| Test cases run | 10 | -| Methodology | Illustrative metric — reference agent for scorecard format generalization | - -## Metrics - - - **response_quality**: 0.9000 × 1.0 - -## Aggregate score recomputation - -Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)` - -Worked 
example: - -``` -round(100 × ((0.9000 × 1.0)) / 1.0, 2) = 90.0 -``` - -A reader can reproduce this value from the `aggregate.components` in the front -matter alone — no eval-harness access needed. From 704ea088b298d8adda7976a1c6ae2f3a52a2c735 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 13:31:23 -0400 Subject: [PATCH 15/18] refactor(eval): update hub worker, workflows, and publish for SCORECARD.md - storage.ts: evalScorecardKey now returns SCORECARD.md (was eval-scorecard.md) - publish.ts: update comment for SCORECARD.md - routes.test.ts: expect eval_scorecard_url to end in /SCORECARD.md - publish_to_r2.py: update --eval-scorecard help text to reference SCORECARD.md - release_agent_email.yml: scorecard-gate uses new --scorecard / --baseline-ref interface; computes prev tag via git describe; publish step points at SCORECARD.md - email_scorecard_refresh.yml: use SCORECARD.md env var throughout; same-version check and cross-version gate use new gate interface with --baseline-ref --- .github/workflows/email_scorecard_refresh.yml | 60 +++++++++++-------- .github/workflows/release_agent_email.yml | 35 +++++++++-- .../python/email/packaging/publish_to_r2.py | 2 +- workers/agent-hub/src/publish.ts | 2 +- workers/agent-hub/src/storage.ts | 2 +- workers/agent-hub/test/routes.test.ts | 2 +- 6 files changed, 70 insertions(+), 33 deletions(-) diff --git a/.github/workflows/email_scorecard_refresh.yml b/.github/workflows/email_scorecard_refresh.yml index 7b3b02b5f..9a8f0a849 100644 --- a/.github/workflows/email_scorecard_refresh.yml +++ b/.github/workflows/email_scorecard_refresh.yml @@ -6,7 +6,7 @@ # Answers "how does a PR that changes the agent keep the scorecard honest?": # when the email agent's LLM-affecting code (or the eval corpus) changes, this # re-runs the REAL eval, regenerates the scorecard, and then: -# - score IMPROVED or held -> commits the refreshed scorecard to the branch +# - score IMPROVED or held -> commits the refreshed SCORECARD.md 
to the branch # - score REGRESSED -> fails the job (the worse card is NOT committed) # # `gaia eval benchmark` needs Lemonade on AMD hardware, so this runs ONLY on the @@ -15,9 +15,10 @@ # hosted-CI backstop (it parses committed files only, no eval). # # Two regression checks run here: -# 1. SAME-VERSION: fresh aggregate vs the currently-committed card for this -# version — stops a noisy/worse re-run from silently overwriting a good score. -# 2. CROSS-VERSION: `gaia.eval.scorecard_gate` — fresh card vs the prior version. +# 1. SAME-VERSION: fresh aggregate vs the currently-committed SCORECARD.md — +# stops a noisy/worse re-run from silently overwriting a good score. +# 2. CROSS-VERSION (best-effort): fresh SCORECARD.md vs the prior version tag +# via --baseline-ref. # # Auto-commit needs `contents: write` and only works on the repo's own branches; # a fork PR's GITHUB_TOKEN is read-only — for forks, run the eval locally / on AMD @@ -55,7 +56,7 @@ permissions: contents: write # auto-commit the refreshed scorecard to the branch env: - SCORECARD_DIR: hub/agents/npm/agent-email/scorecards + SCORECARD: hub/agents/npm/agent-email/SCORECARD.md MANIFEST: hub/agents/python/email/gaia-agent.yaml LIMIT: ${{ github.event.inputs.limit || '25' }} MODEL: ${{ github.event.inputs.model || 'Gemma-4-E4B-it-GGUF' }} @@ -70,6 +71,7 @@ jobs: uses: actions/checkout@v6 with: ref: ${{ github.head_ref || github.ref_name }} + fetch-depth: 0 # full history for git describe (cross-version baseline) - name: Set up Python uses: actions/setup-python@v6 @@ -90,16 +92,20 @@ jobs: set -euo pipefail VERSION=$(python -c "import yaml; print(yaml.safe_load(open('${MANIFEST}'))['version'])") echo "version=${VERSION}" >> "$GITHUB_OUTPUT" - CARD="${SCORECARD_DIR}/${VERSION}.md" - # Aggregate of the card as committed on this branch (empty if new). 
- if git cat-file -e "HEAD:${CARD}" 2>/dev/null; then - git show "HEAD:${CARD}" > /tmp/committed_card.md - COMMITTED=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('/tmp/committed_card.md'))['aggregate']['value'])") + # Aggregate of the SCORECARD.md as committed on this branch (empty if new). + if git cat-file -e "HEAD:${SCORECARD}" 2>/dev/null; then + git show "HEAD:${SCORECARD}" > /tmp/committed_scorecard.md + COMMITTED=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('/tmp/committed_scorecard.md'))['aggregate']['value'])") else COMMITTED="" fi echo "committed=${COMMITTED}" >> "$GITHUB_OUTPUT" - echo "Version ${VERSION}; committed aggregate: ${COMMITTED:-}" + # Resolve the previous release tag for the cross-version check. + PREV="$(git describe --tags --abbrev=0 \ + --match 'agent-pkg-email-*' \ + "HEAD^" 2>/dev/null || true)" + echo "prev_tag=${PREV}" >> "$GITHUB_OUTPUT" + echo "Version ${VERSION}; committed aggregate: ${COMMITTED:-}; prev tag: ${PREV:-}" - name: Run the email-triage benchmark (real eval) env: @@ -120,7 +126,7 @@ jobs: --limit "${LIMIT}" \ --output-dir eval-out - - name: Regenerate the scorecard from the real run + - name: Regenerate SCORECARD.md from the real run run: | set -euo pipefail python hub/agents/python/email/packaging/gen_scorecard.py \ @@ -129,34 +135,38 @@ jobs: - name: Same-version regression check (reject a worse re-run) run: | set -euo pipefail - VERSION="${{ steps.pre.outputs.version }}" COMMITTED="${{ steps.pre.outputs.committed }}" - CARD="${SCORECARD_DIR}/${VERSION}.md" - FRESH=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('${CARD}'))['aggregate']['value'])") + FRESH=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; 
print(parse_scorecard(__import__('pathlib').Path('${SCORECARD}'))['aggregate']['value'])") echo "fresh aggregate: ${FRESH} | committed: ${COMMITTED:-}" if [ -n "${COMMITTED}" ] && python -c "import sys; sys.exit(0 if float('${FRESH}') < float('${COMMITTED}') else 1)"; then - echo "::error::Scorecard regression for v${VERSION}: re-run scored ${FRESH} < committed ${COMMITTED}. Not committing the worse card. Investigate, or override intentionally via --allow-regression in a manual commit." - git checkout -- "${CARD}" || true + echo "::error::Scorecard regression: re-run scored ${FRESH} < committed ${COMMITTED}. Not committing the worse card. Investigate, or override intentionally via --allow-regression in a manual commit." + git checkout -- "${SCORECARD}" || true exit 1 fi echo "No same-version regression — fresh score is >= committed." - - name: Cross-version gate (fresh card vs prior version) + - name: Cross-version gate (fresh SCORECARD.md vs prior version tag, best-effort) run: | set -euo pipefail - python -m gaia.eval.scorecard_gate \ - --scorecards-dir "${SCORECARD_DIR}" \ - --manifest "${MANIFEST}" + PREV="${{ steps.pre.outputs.prev_tag }}" + if [ -n "${PREV}" ]; then + python -m gaia.eval.scorecard_gate \ + --scorecard "${SCORECARD}" \ + --baseline-ref "${PREV}" + else + python -m gaia.eval.scorecard_gate \ + --scorecard "${SCORECARD}" + fi - - name: Commit the refreshed scorecard (only if it changed for the better/equal) + - name: Commit the refreshed SCORECARD.md (only if it changed for the better/equal) run: | set -euo pipefail - if git diff --quiet -- "${SCORECARD_DIR}"; then - echo "Scorecard unchanged — nothing to commit." + if git diff --quiet -- "${SCORECARD}"; then + echo "SCORECARD.md unchanged — nothing to commit." 
exit 0 fi git config user.name "${{ github.actor }}" git config user.email "${{ github.actor }}@users.noreply.github.com" - git add "${SCORECARD_DIR}" + git add "${SCORECARD}" git commit -m "eval(email): refresh v${{ steps.pre.outputs.version }} scorecard from benchmark run" git push origin "HEAD:${{ github.head_ref || github.ref_name }}" diff --git a/.github/workflows/release_agent_email.yml b/.github/workflows/release_agent_email.yml index 1c624f2f2..3bc05451b 100644 --- a/.github/workflows/release_agent_email.yml +++ b/.github/workflows/release_agent_email.yml @@ -272,16 +272,43 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 + with: + fetch-depth: 0 # full history so git describe can find previous tags - uses: actions/setup-python@v6 with: python-version: "3.12" - name: Install core + PyYAML run: pip install -e . pyyaml + - name: Resolve previous release tag (best-effort baseline) + id: prev_tag + shell: bash + run: | + set -uo pipefail + # Find the most recent agent-pkg-email-* tag strictly before the + # current ref. On workflow_dispatch the current ref is a branch, not + # a tag, so we look for the latest tag of the right pattern overall. + PREV="$(git describe --tags --abbrev=0 \ + --match 'agent-pkg-email-*' \ + "${GITHUB_REF_NAME}^" 2>/dev/null || true)" + echo "prev_tag=${PREV}" >> "$GITHUB_OUTPUT" + if [ -n "${PREV}" ]; then + echo "Baseline tag: ${PREV}" + else + echo "No previous release tag found — presence-only check." 
+ fi - name: Run scorecard gate + shell: bash run: | - python -m gaia.eval.scorecard_gate \ - --scorecards-dir hub/agents/npm/agent-email/scorecards \ - --manifest hub/agents/python/email/gaia-agent.yaml + set -euo pipefail + PREV="${{ steps.prev_tag.outputs.prev_tag }}" + if [ -n "${PREV}" ]; then + python -m gaia.eval.scorecard_gate \ + --scorecard hub/agents/npm/agent-email/SCORECARD.md \ + --baseline-ref "${PREV}" + else + python -m gaia.eval.scorecard_gate \ + --scorecard hub/agents/npm/agent-email/SCORECARD.md + fi # ── Stage 2: publish to the hub + npm (single atomic step) ───────── publish: @@ -477,7 +504,7 @@ jobs: done VER="${{ steps.ver.outputs.version }}" scorecard_args=() - SCORECARD="hub/agents/npm/agent-email/scorecards/${VER}.md" + SCORECARD="hub/agents/npm/agent-email/SCORECARD.md" if [ -f "${SCORECARD}" ]; then scorecard_args+=(--eval-scorecard "${SCORECARD}") fi diff --git a/hub/agents/python/email/packaging/publish_to_r2.py b/hub/agents/python/email/packaging/publish_to_r2.py index 5884cb976..9cca41e3d 100644 --- a/hub/agents/python/email/packaging/publish_to_r2.py +++ b/hub/agents/python/email/packaging/publish_to_r2.py @@ -279,7 +279,7 @@ def main(argv=None) -> int: parser.add_argument( "--eval-scorecard", type=Path, - help="Path to the eval scorecard markdown (e.g. scorecards/0.2.4.md) to " + help="Path to the eval scorecard markdown (e.g. SCORECARD.md) to " "publish as the agent's catalog eval score and scorecard URL " "(POSTed as the multipart 'eval_scorecard' part the Worker accepts). 
" "Absent = publish without an eval scorecard.", diff --git a/workers/agent-hub/src/publish.ts b/workers/agent-hub/src/publish.ts index 626b8c65e..c869ea623 100644 --- a/workers/agent-hub/src/publish.ts +++ b/workers/agent-hub/src/publish.ts @@ -176,7 +176,7 @@ export async function handlePublish( const skillText = await optionalMarkdownPart(form, "skill", "SKILL.md"); // Optional eval scorecard markdown (the agent's benchmark results, rendered on // the hub listing as an aggregate score + link). Per-version, first-POST semantics. - const evalScorecardText = await optionalMarkdownPart(form, "eval_scorecard", "eval-scorecard.md"); + const evalScorecardText = await optionalMarkdownPart(form, "eval_scorecard", "SCORECARD.md"); // Optional whole-package file listing (the zip's contents, for the hub's file // list). The zip itself rides in as a normal `artifact`; this is just the // manifest of what's inside it. diff --git a/workers/agent-hub/src/storage.ts b/workers/agent-hub/src/storage.ts index 366e3fe84..3a26647a4 100644 --- a/workers/agent-hub/src/storage.ts +++ b/workers/agent-hub/src/storage.ts @@ -53,7 +53,7 @@ export function skillKey(id: string, version: string): string { } export function evalScorecardKey(id: string, version: string): string { - return `${versionDir(id, version)}eval-scorecard.md`; + return `${versionDir(id, version)}SCORECARD.md`; } export function packageFilesKey(id: string, version: string): string { diff --git a/workers/agent-hub/test/routes.test.ts b/workers/agent-hub/test/routes.test.ts index 29505b207..cb00f5abf 100644 --- a/workers/agent-hub/test/routes.test.ts +++ b/workers/agent-hub/test/routes.test.ts @@ -118,7 +118,7 @@ describe("eval scorecard in catalog", () => { const body = (await res.json()) as any; const entry = body.agents[0]; expect(entry.eval_score).toBe(87.5); - expect(entry.eval_scorecard_url).toMatch(/\/agents\/chat\/0\.1\.0\/eval-scorecard\.md$/); + 
expect(entry.eval_scorecard_url).toMatch(/\/agents\/chat\/0\.1\.0\/SCORECARD\.md$/); }); it("omits eval_score and eval_scorecard_url when no scorecard is published", async () => { From 40107bff90c75d9b2599abedadc3ea94465b201e Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 13:33:16 -0400 Subject: [PATCH 16/18] docs(eval): update scorecard docs and skill for single SCORECARD.md convention - eval-scorecard.mdx: storage convention is now a single SCORECARD.md (not scorecards/.md); gate uses --scorecard + --baseline-ref/--baseline-file; carry_forward reads version from front matter; Reproduction section documented; npm files include SCORECARD.md only (not scorecards/ dir) - SKILL.md: doc-root/SCORECARD.md as single file; reproduction_command in adapter; gate CLI updated to --scorecard / --baseline-ref pattern; Phase 4 examples updated --- .claude/skills/adding-eval-scorecard/SKILL.md | 37 +++-- docs/reference/eval-scorecard.mdx | 128 ++++++++++-------- 2 files changed, 94 insertions(+), 71 deletions(-) diff --git a/.claude/skills/adding-eval-scorecard/SKILL.md b/.claude/skills/adding-eval-scorecard/SKILL.md index 0afaa057f..97123244d 100644 --- a/.claude/skills/adding-eval-scorecard/SKILL.md +++ b/.claude/skills/adding-eval-scorecard/SKILL.md @@ -8,7 +8,7 @@ description: "Adopt the per-agent eval scorecard for a GAIA hub agent: write the Adopt the release **eval scorecard** ([`docs/reference/eval-scorecard.mdx`](../../../docs/reference/eval-scorecard.mdx)) for one hub agent. The system is `harness → result payload → generator → scorecard`, with a standalone presence+regression release gate. The **email agent is the reference implementation** — mirror it. **Core modules (do not modify; reuse):** -- `src/gaia/eval/release_scorecard.py` — `ResultPayload`, `compute_aggregate`, `render_scorecard`, `write_scorecard`, `validate_scorecard`, `carry_forward`, `latest_version_below`. Harness-agnostic (stdlib + PyYAML only). 
+- `src/gaia/eval/release_scorecard.py` — `ResultPayload`, `compute_aggregate`, `render_scorecard`, `write_scorecard`, `validate_scorecard`, `carry_forward`. Harness-agnostic (stdlib + PyYAML only). - `src/gaia/eval/scorecard_gate.py` — the standalone gate (`python -m gaia.eval.scorecard_gate`). - Reference adapter: `hub/agents/python/email/packaging/gen_scorecard.py`. @@ -18,7 +18,7 @@ This is a **phased checklist with a hard gate at the real-eval step** — the sc 1. **Version source of truth** = the `version:` field in `/gaia-agent.yaml`. Never invent a parallel scheme. 2. **Canonical README** (where the scorecard is linked + surfaced): for an npm-published agent it is the npm client README (e.g. `hub/agents/npm//README.md`), NOT a `packaging/README.md`. For a Python-only agent it is `hub/agents/python//README.md`. Confirm which by checking what `release_agent_.yml` publishes (`README:` env) — the published README is the one to link. -3. **doc-root** = the directory holding that canonical README. Scorecards live at `/scorecards/.md`. +3. **doc-root** = the directory holding that canonical README. The scorecard lives at `/SCORECARD.md` — a **single file updated in place**, versioned via the publish snapshot (same as README.md). **There is no `scorecards/` directory.** 4. **Eval vehicle**: what existing harness produces this agent's accuracy metric? (email → `gaia eval benchmark` over `tests/fixtures/email/`.) If none exists, STOP and surface that — propose the minimal harness before building; do not invent numbers. ## Phase 2 — Write the adapter (harness → payload) @@ -26,11 +26,12 @@ This is a **phased checklist with a hard gate at the real-eval step** — the sc Copy `hub/agents/python/email/packaging/gen_scorecard.py` as the template. 
The adapter: - imports ONLY `gaia.eval.release_scorecard` (never the harness or agent package — preserve loose coupling); - reads the harness output, builds a `ResultPayload`; +- populates `reproduction_command` with the **exact shell commands** to reproduce this scorecard, including all required env vars (`PYTHON_KEYRING_BACKEND`, `GAIA_AGENT_TOOL_TIMEOUT`, `PYTHONPATH`); - defines **"judged"** explicitly and **raises loudly** if zero results are judged (no silent 0.0); - records **dataset size** (total labeled examples) and **test_cases_run** (subset executed) as DISTINCT fields; - stores **repo-relative** paths only (never a local absolute path — it ships in a published artifact); - records the eval `limit`/config so future regression checks are comparable; -- writes to `/scorecards/.md`. +- writes to `/SCORECARD.md` (the single file; `--output-dir` overrides to a directory, but the filename is always `SCORECARD.md`). Add an offline unit test against a committed sample harness-output fixture (see `tests/fixtures/eval/email_benchmark_scorecard.json` + `tests/unit/eval/test_release_scorecard.py::TestEmailAdapter`) so the adapter is testable without a live model. @@ -50,8 +51,10 @@ PYTHONPATH="$(pwd)" \ --ground-truth tests/fixtures/email/ground_truth.json \ --limit 25 --output-dir +PYTHONPATH="$(pwd)" \ /bin/python hub/agents/python/email/packaging/gen_scorecard.py \ --benchmark-dir --limit 25 +# → writes hub/agents/npm/agent-email/SCORECARD.md in place ``` **Headless gotchas (see memory `project-email-benchmark-headless-gotchas`):** @@ -63,23 +66,33 @@ PYTHONPATH="$(pwd)" \ ## Phase 4 — Surface, link, and gate -1. **Link + surface** from the canonical README: a one-line `Eval scorecard (vX.Y.Z): aggregate N/100 … ([./scorecards/X.Y.Z.md](./scorecards/X.Y.Z.md))`. The relative link must resolve in-repo. -2. **npm `files`**: if the agent publishes on npm, add `scorecards/` to `package.json` `files` so the link resolves on the published package too. -3. 
**Hub display**: a published scorecard surfaces on the agent's hub page / Agent UI detail view (see `workers/agent-hub` + `AgentDetailModal.tsx`); ensure the publish step uploads the scorecard alongside the README. +1. **Link + surface** from the canonical README: a one-line `Eval scorecard (vX.Y.Z): aggregate N/100 … ([./SCORECARD.md](./SCORECARD.md))`. The relative link must resolve in-repo. +2. **npm `files`**: if the agent publishes on npm, add `SCORECARD.md` to `package.json` `files`. **Do not** add a `scorecards/` directory — only the single current file ships. +3. **Hub display**: a published scorecard surfaces on the agent's hub page / Agent UI detail view (see `workers/agent-hub` + `AgentDetailModal.tsx`); ensure the publish step passes `--eval-scorecard /SCORECARD.md` to `publish_to_r2.py`. 4. **Release gate**: add a `scorecard-gate` job to `release_agent_.yml` and list it in `publish.needs`. The job runs on a GitHub-hosted runner (it only parses committed files — no eval): ```bash + # Presence-only (no previous tag yet): python -m gaia.eval.scorecard_gate \ - --scorecards-dir /scorecards \ - --manifest hub/agents/python//gaia-agent.yaml + --scorecard /SCORECARD.md + + # With best-effort previous-release baseline (recommended for CI): + PREV="$(git describe --tags --abbrev=0 --match 'agent-pkg--*' "${GITHUB_REF_NAME}^" 2>/dev/null || true)" + if [ -n "$PREV" ]; then + python -m gaia.eval.scorecard_gate \ + --scorecard /SCORECARD.md --baseline-ref "$PREV" + else + python -m gaia.eval.scorecard_gate \ + --scorecard /SCORECARD.md + fi ``` - The job must NOT have `continue-on-error`, an `environment:`, or a `permissions:` override (inherits `contents: read`; needs no secrets). -5. **Auto-update/reject loop**: for re-running on agent changes and refreshing the committed scorecard, see `eval-scorecard.mdx` "Keeping the scorecard current" and the self-hosted refresh workflow — reject-on-worse is the gate; better-or-equal refreshes the committed card. 
+ The job must NOT have `continue-on-error`, an `environment:`, or a `permissions:` override (inherits `contents: read`; needs no secrets). Fetch full history (`fetch-depth: 0`) so `git describe` resolves. +5. **Auto-update/reject loop**: for re-running on agent changes and refreshing the committed scorecard, see `eval-scorecard.mdx` "Keeping the scorecard current" and the self-hosted refresh workflow — reject-on-worse is the gate; better-or-equal refreshes the committed `SCORECARD.md`. ## Phase 5 — Verify (evidence before "done") -Run and capture: the generated `.md`; the gate **passing** on it (exit 0); the gate **blocking** a manufactured regression (exit 1) and a missing card (exit 1); a by-hand recompute of the aggregate from `aggregate.components` matching the recorded value. Run `python util/lint.py --all` and the eval unit tests. These are the PR's real-world proof. +Run and capture: the generated `SCORECARD.md`; the gate **passing** on it (exit 0); the gate **blocking** a manufactured regression (exit 1, via `--baseline-file` with a higher-scoring card) and a missing card (exit 1); a by-hand recompute of the aggregate from `aggregate.components` matching the recorded value. Run `python util/lint.py --all` and the eval unit tests. These are the PR's real-world proof. ## Versioning -- **Patch** release → `carry_forward(prev_path, new_version)` (copies results verbatim, sets `inherited_from`); do NOT re-run the eval. +- **Patch** release → `carry_forward(prev_scorecard_path, new_version)` reads the version from the front matter of the current `SCORECARD.md` (not from the filename) and copies results verbatim, sets `inherited_from`; do NOT re-run the eval. - **Minor/major** release → re-run the eval (Phase 3); `carry_forward` refuses a non-patch bump with a "re-run" error. 
diff --git a/docs/reference/eval-scorecard.mdx b/docs/reference/eval-scorecard.mdx index b00d9d00e..20151a45e 100644 --- a/docs/reference/eval-scorecard.mdx +++ b/docs/reference/eval-scorecard.mdx @@ -1,6 +1,6 @@ --- title: "Release Eval Scorecard" -description: "Per-agent, per-version eval scorecard: schema, storage convention, aggregate formula, versioning policy, and release gate." +description: "Per-agent eval scorecard: schema, storage convention, aggregate formula, versioning policy, reproduction, and release gate." icon: "chart-bar" --- @@ -14,17 +14,18 @@ icon: "chart-bar" ## Overview -Each published hub agent ships a **release scorecard** — a versioned Markdown file that records: +Each published hub agent ships a **release scorecard** — a single `SCORECARD.md` file (updated in place per release, versioned via the publish snapshot, the same way `README.md` works) that records: - The **eval recipe**: dataset reference, methodology, configuration, and metric definitions. - The **measured results**: per-metric values, number of test cases actually run, and dataset size. - A single **named aggregate score**: a deterministic, recomputable percentage so a reviewer can verify the number without re-running the eval. +- A **Reproduction section**: the exact commands to reproduce the result from scratch. Scorecards are committed alongside the agent's README and linked from it. A standalone **release gate** (`scorecard_gate.py`) blocks packaging when the scorecard is missing or when its aggregate score strictly regresses below the prior version's. ## File format -Scorecard files are **Markdown with YAML front matter** (`.md`). The front matter holds all machine-readable fields; the body is a human-readable summary with a worked recomputation. +Scorecard files are **Markdown with YAML front matter** (`.md`). The front matter holds all machine-readable fields; the body is a human-readable summary with a worked recomputation and a Reproduction section. 
``` --- @@ -35,7 +36,7 @@ agent: recipe: dataset: reference: tests/fixtures/email/ground_truth.json - description: Synthetic email corpus (FakeGmailBackend, 4-category priority labels) + description: Synthetic email corpus (FakeGmailBackend, schema-2.0 triage taxonomy) size: 220 methodology: gaia eval benchmark — category_accuracy (case-insensitive exact match) config: @@ -43,26 +44,31 @@ recipe: model: Gemma-4-E4B-it-GGUF limit: 25 results: - test_cases_run: 24 + test_cases_run: 25 metrics: - name: category_accuracy - value: 0.4584 + value: 0.40 weight: 1.0 aggregate: name: weighted_accuracy formula: "round(100 * sum(weight_i * value_i) / sum(weight_i), 2)" components: - metric: category_accuracy - value: 0.4584 + value: 0.40 weight: 1.0 - value: 45.84 -generated_at: "2026-06-25T10:00:00+00:00" + value: 40.0 +generated_at: "2026-06-26T16:47:13+00:00" inherited_from: null --- # Email Triage — Eval Scorecard v0.2.4 -**Aggregate score: 45.84** (out of 100) +**Aggregate score: 40.0** (out of 100) +... + +## Reproduction + +Run the following commands from the repository root: ... ``` @@ -111,7 +117,7 @@ where each `valueᵢ` is a metric value in [0, 1] and each `weightᵢ` defaults The result is a **percentage in [0, 100]**. For a single metric with weight 1.0: ``` -round(100 × 0.4584, 2) = 45.84 +round(100 × 0.40, 2) = 40.0 ``` A reader can reproduce this value from `aggregate.components` alone — no eval-harness access needed. @@ -119,15 +125,15 @@ The `aggregate.formula` field in the front matter states the formula in human-re ## Storage convention -Scorecards live in a `scorecards/` subdirectory beside the agent's canonical README: +Each agent package ships a **single `SCORECARD.md`** file, updated in place per release — the same way `README.md` works. Per-version uniqueness comes from the publish snapshot (R2 stores the file at `agents///SCORECARD.md`; the npm package ships only the current version's `SCORECARD.md`). 
``` / - README.md ← canonical README (links to scorecard) - scorecards/ - 0.1.0.md - 0.2.3.md - 0.2.4.md ← latest + README.md ← canonical README (links to SCORECARD.md) + SCORECARD.md ← current version's scorecard, updated in place + SPEC.md + SKILL.md + CHANGELOG.md ``` The `doc-root` is the location of the agent's canonical README: @@ -135,35 +141,39 @@ The `doc-root` is the location of the agent's canonical README: | Agent | doc-root | |-------|----------| | Email Triage (`@amd-gaia/agent-email`) | `hub/agents/npm/agent-email/` | -| Hello World | `hub/agents/python/hello-world/` | -The relative link `./scorecards/.md` resolves both in-repo and when the directory is published as an npm package. +The relative link `./SCORECARD.md` resolves both in-repo and when the directory is published as an npm package. The npm `files` array includes `SCORECARD.md` (not a `scorecards/` directory). ## Versioning policy ### Patch releases — carry forward -For a **patch release** (same `major.minor`, `patch` incremented), the prior version's results are carried forward verbatim using `carry_forward()`: +For a **patch release** (same `major.minor`, `patch` incremented), the prior version's results are carried forward verbatim using `carry_forward()`. Pass the path to the agent's current `SCORECARD.md`: ```python from gaia.eval.release_scorecard import carry_forward, write_scorecard from pathlib import Path new_payload = carry_forward( - prev_path=Path("scorecards/0.2.3.md"), - new_version="0.2.4", + prev_scorecard_path=Path("hub/agents/npm/agent-email/SCORECARD.md"), + new_version="0.2.5", ) -# new_payload.inherited_from == "0.2.3" -write_scorecard(new_payload, Path("scorecards/0.2.4.md")) +# new_payload.inherited_from == "0.2.4" (read from front matter, not filename) +write_scorecard(new_payload, Path("hub/agents/npm/agent-email/SCORECARD.md")) ``` -The resulting scorecard has `inherited_from: "0.2.3"` and identical `results` and `aggregate` fields. 
The aggregate score is unchanged, so the release gate's equal-score case passes. +The resulting scorecard has `inherited_from: "0.2.4"` and identical `results` and `aggregate` fields. The aggregate score is unchanged, so the release gate's equal-score case passes. + +`carry_forward()` reads the prior version from the `agent.version` field in the front matter — **not** from the filename. ### Minor / major releases — re-run required For a **minor or major bump**, `carry_forward()` raises `ValueError` with a "re-run" message. Run the eval fresh and generate a new scorecard: ```bash +PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \ +GAIA_AGENT_TOOL_TIMEOUT=120 \ +PYTHONPATH="$(pwd)" \ gaia eval benchmark \ --model Gemma-4-E4B-it-GGUF \ --mbox-path tests/fixtures/email/synthetic_inbox.mbox \ @@ -171,33 +181,44 @@ gaia eval benchmark \ --limit 25 \ --output-dir /tmp/email-eval +PYTHONPATH="$(pwd)" \ python hub/agents/python/email/packaging/gen_scorecard.py \ - --benchmark-dir /tmp/email-eval + --benchmark-dir /tmp/email-eval \ + --limit 25 ``` +This writes `hub/agents/npm/agent-email/SCORECARD.md` in place. 
+ ## Release gate `scorecard_gate.py` is a standalone script that exits non-zero on failure: ```bash +# Presence-only check (first adoption or no baseline specified): python -m gaia.eval.scorecard_gate \ - --scorecards-dir hub/agents/npm/agent-email/scorecards \ - --manifest hub/agents/python/email/gaia-agent.yaml -``` + --scorecard hub/agents/npm/agent-email/SCORECARD.md -Or with an explicit version: +# Regression check against a specific prior scorecard file (unit tests / local): +python -m gaia.eval.scorecard_gate \ + --scorecard hub/agents/npm/agent-email/SCORECARD.md \ + --baseline-file /tmp/prev-SCORECARD.md -```bash +# Regression check against a prior release tag (CI): python -m gaia.eval.scorecard_gate \ - --scorecards-dir hub/agents/npm/agent-email/scorecards \ - --version 0.2.4 + --scorecard hub/agents/npm/agent-email/SCORECARD.md \ + --baseline-ref agent-pkg-email-v0.2.3 ``` +`--baseline-file` and `--baseline-ref` are mutually exclusive. If the file doesn't exist at the given ref, the gate treats it as first adoption (presence-only pass). + ### Gate logic -1. **Presence check**: `<scorecards-dir>/<version>.md` must exist and be valid. → exit 1 if not. -2. **Locate prior**: find the greatest semver strictly below `<version>` in `<scorecards-dir>`. If none → **first adoption**, exit 0 (presence-only pass). -3. **Regression check**: if `candidate.aggregate.value < prior.aggregate.value` (strict) → exit 1. +1. **Presence check**: `--scorecard` path must exist and be a valid scorecard. → exit 1 if not. +2. **Baseline resolution**: + - `--baseline-file`: read the given file directly (no git access; suitable for unit tests). + - `--baseline-ref`: resolve via `git show <ref>:<path>`. If the file does not exist at that ref → **first adoption**, exit 0. + - Neither specified: **first adoption**, exit 0 (presence-only pass). +3. **Regression check**: if `candidate.aggregate.value < baseline.aggregate.value` (strict) → exit 1. 4. Equal or greater → exit 0. 
### Exit codes @@ -205,8 +226,9 @@ python -m gaia.eval.scorecard_gate \ | Case | Exit code | |------|-----------| | Missing or invalid candidate scorecard | `1` | -| Strict regression vs prior version | `1` | -| No prior version (first adoption) | `0` | +| Strict regression vs baseline | `1` | +| No baseline (first adoption) | `0` | +| File absent at `--baseline-ref` | `0` | | Equal score (patch carry-forward) | `0` | | Score improved | `0` | @@ -215,29 +237,18 @@ python -m gaia.eval.scorecard_gate \ When a regression is intentional (e.g. a dataset correction or methodology change), use `--allow-regression`. The gate prints a GHA `::warning::` annotation naming both versions and scores, then exits 0: ``` -::warning::Scorecard regression allowed by --allow-regression: 0.2.3=65.0 → 0.2.4=45.84 -WARNING: Regression override active. Prior version 0.2.3 scored 65.0; candidate 0.2.4 scored 45.84. ... +::warning::Scorecard regression allowed by --allow-regression: v0.2.3=65.0 → v0.2.4=40.0 +WARNING: Regression override active. Prior version v0.2.3 scored 65.0; candidate v0.2.4 scored 40.0. ... ``` -### How the gate resolves "previous version" - -The gate calls `latest_version_below(scorecards_dir, version)`, which: - -1. Lists all `*.md` files in `scorecards_dir`. -2. Keeps only those whose **stem** matches the anchored regex `^\d+\.\d+\.\d+$` (skips `README.md`, `.gitkeep`, prerelease tags, etc.). -3. Compares versions as **integer tuples** `(major, minor, patch)` — so `0.10.0 > 0.2.9` correctly. -4. Returns the greatest version strictly below the candidate, or `None`. - -The version is read from `gaia-agent.yaml` (via `--manifest`) or passed explicitly (via `--version`). - ## Keeping the scorecard current (the update / reject loop) -The scorecard must move with the agent: when LLM-affecting code changes, the eval is re-run and the committed scorecard refreshed — **upward**. A regression is blocked. 
+The scorecard must move with the agent: when LLM-affecting code changes, the eval is re-run and the committed `SCORECARD.md` refreshed — **upward**. A regression is blocked. Two enforcement points work together: 1. **Reject-on-worse (always on, GitHub-hosted).** The `scorecard-gate` job in `release_agent_<agent>.yml` runs on every release. It only parses committed files (no eval), so it runs on a standard runner and **fails the build** if the committed scorecard regressed below the prior version or is missing. This is the hard gate. -2. **Run-and-refresh (self-hosted AMD).** `gaia eval benchmark` needs Lemonade on AMD hardware, so it cannot run on GitHub-hosted runners — it runs on the `[self-hosted, lemonade-eval]` pool. The `Email Agent Eval — scorecard refresh` workflow (`.github/workflows/email_scorecard_refresh.yml`) runs on demand (and on pushes touching the email agent), re-runs the eval, regenerates the scorecard, then: +2. **Run-and-refresh (self-hosted AMD).** `gaia eval benchmark` needs Lemonade on AMD hardware, so it cannot run on GitHub-hosted runners — it runs on the `[self-hosted, lemonade-eval]` pool. The `Email Agent Eval — scorecard refresh` workflow (`.github/workflows/email_scorecard_refresh.yml`) runs on demand (and on pushes touching the email agent), re-runs the eval, regenerates `SCORECARD.md`, then: - **score ≥ committed** → commits the refreshed scorecard back to the branch (the PR carries the improved number); - **score < committed** → fails loudly (the regression must be investigated, or consciously overridden with `--allow-regression`). 
@@ -253,9 +264,8 @@ So a PR that changes the agent gets its scorecard refreshed (better) or rejected **Use the [`adding-eval-scorecard` skill](https://github.com/amd/gaia/tree/main/.claude/skills/adding-eval-scorecard/SKILL.md).** In Claude Code, invoke it instead of following these steps by hand — it carries the exact commands, the harness→payload→generator flow, the headless-eval gotchas (keyring/PYTHONPATH/tool-timeout), and the verification evidence to capture. The steps below are the reference the skill automates. -1. Create the `scorecards/` directory beside the agent's canonical README. -2. Write a `packaging/gen_scorecard.py` adapter (see `hub/agents/python/email/packaging/gen_scorecard.py` for a reference). -3. Run the eval and call the adapter → commit the resulting `<version>.md`. -4. Link the scorecard from the README: `./scorecards/<version>.md`. -5. Add `scorecards/` to the npm `package.json` `files` array (if published on npm). -6. Wire `scorecard_gate` into the release workflow (see `release_agent_email.yml` for the job topology). +1. Write a `packaging/gen_scorecard.py` adapter (see `hub/agents/python/email/packaging/gen_scorecard.py` for a reference). The adapter should populate `reproduction_command` with the exact commands needed to reproduce the scorecard. +2. Run the eval and call the adapter → commit the resulting `SCORECARD.md` to `<doc-root>/SCORECARD.md`. +3. Link the scorecard from the README: `./SCORECARD.md`. +4. Add `SCORECARD.md` to the npm `package.json` `files` array (if published on npm); do **not** add a `scorecards/` directory. +5. Wire `scorecard_gate` into the release workflow (see `release_agent_email.yml` for the job topology). Use `--scorecard <doc-root>/SCORECARD.md` and `--baseline-ref <prev-release-tag>` (best-effort). 
From 20dbdbece3f0597fcdfcab85da3c3478b2e73e62 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 13:38:29 -0400 Subject: [PATCH 17/18] fix(eval): scorecard_gate pylint and black formatting - subprocess.run: add check=False (W1510) - Remove bare f-strings with no interpolated vars (W1309) - black reformatted test_scorecard_gate.py --- src/gaia/eval/scorecard_gate.py | 15 +++++++------- tests/unit/eval/test_scorecard_gate.py | 27 +++++++++++++++++--------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/gaia/eval/scorecard_gate.py b/src/gaia/eval/scorecard_gate.py index 4328511c4..3a54c09b6 100644 --- a/src/gaia/eval/scorecard_gate.py +++ b/src/gaia/eval/scorecard_gate.py @@ -89,10 +89,11 @@ def _parse_baseline_ref(scorecard_path: Path, ref: str) -> str | None: git_path = rel.as_posix() try: - result = subprocess.run( + result = subprocess.run( # noqa: S603 (git is trusted here) ["git", "show", f"{ref}:{git_path}"], capture_output=True, text=True, + check=False, ) except FileNotFoundError as exc: raise ValueError(f"git not found: {exc}") from exc @@ -246,9 +247,9 @@ def main(argv=None) -> int: prev_errors = validate_scorecard(prev_parsed) if prev_errors: print( - f"ERROR: Baseline SCORECARD.md is invalid:\n" + "ERROR: Baseline SCORECARD.md is invalid:\n" + "\n".join(f" - {e}" for e in prev_errors) - + f"\n Fix the baseline scorecard before releasing." + + "\n Fix the baseline scorecard before releasing." ) return 1 @@ -258,15 +259,15 @@ def main(argv=None) -> int: if candidate_score is None: print( f"ERROR: Candidate SCORECARD.md at {candidate_path} has no " - f"'aggregate.value' field.\n" - f" Fix the scorecard front matter before releasing." + "'aggregate.value' field.\n" + " Fix the scorecard front matter before releasing." ) return 1 if prev_score is None: print( - f"ERROR: Baseline SCORECARD.md has no 'aggregate.value' field.\n" - f" Fix the baseline scorecard before releasing." 
+ "ERROR: Baseline SCORECARD.md has no 'aggregate.value' field.\n" + " Fix the baseline scorecard before releasing." ) return 1 diff --git a/tests/unit/eval/test_scorecard_gate.py b/tests/unit/eval/test_scorecard_gate.py index 32424f97a..efc5d4fad 100644 --- a/tests/unit/eval/test_scorecard_gate.py +++ b/tests/unit/eval/test_scorecard_gate.py @@ -151,8 +151,10 @@ def test_allow_regression_flag_returns_0(self, tmp_path): result = main( [ - "--scorecard", str(candidate), - "--baseline-file", str(baseline), + "--scorecard", + str(candidate), + "--baseline-file", + str(baseline), "--allow-regression", ] ) @@ -169,8 +171,10 @@ def test_allow_regression_prints_warning_line(self, tmp_path, capsys): main( [ - "--scorecard", str(candidate), - "--baseline-file", str(baseline), + "--scorecard", + str(candidate), + "--baseline-file", + str(baseline), "--allow-regression", ] ) @@ -188,8 +192,10 @@ def test_missing_baseline_file_returns_1(self, tmp_path): candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6) result = main( [ - "--scorecard", str(candidate), - "--baseline-file", str(tmp_path / "nonexistent-SCORECARD.md"), + "--scorecard", + str(candidate), + "--baseline-file", + str(tmp_path / "nonexistent-SCORECARD.md"), ] ) assert result == 1 @@ -291,9 +297,12 @@ def test_baseline_file_and_ref_mutually_exclusive(self, tmp_path): candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6) result = main( [ - "--scorecard", str(candidate), - "--baseline-file", str(candidate), - "--baseline-ref", "v1.0.0", + "--scorecard", + str(candidate), + "--baseline-file", + str(candidate), + "--baseline-ref", + "v1.0.0", ] ) assert result == 1 From c2dcf6c991c59c7c434085eb6fb2699e281dac83 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Fri, 26 Jun 2026 13:41:52 -0400 Subject: [PATCH 18/18] feat(eval): email SCORECARD.md from full-corpus run (46.0); portable reproduction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Regenerate the email v0.2.4 
SCORECARD.md from a full-corpus gaia eval benchmark run on AMD Strix Halo: category_accuracy 0.46 over 100 of 220 emails (the triage tool processes up to 100 per call) -> aggregate 46.0/100. Errors are dominated by the inherently-ambiguous fyi<->needs_response boundary; the model over-assigns NEEDS_RESPONSE. Fix the adapter's reproduction command to be portable (generic /tmp/email-eval output dir, full model/mbox/ground-truth/output-dir flags) — no local absolute path in the published artifact. README reflects 46.0. --- hub/agents/npm/agent-email/README.md | 2 +- hub/agents/npm/agent-email/SCORECARD.md | 38 ++++++++++--------- .../python/email/packaging/gen_scorecard.py | 34 +++++++++-------- 3 files changed, 41 insertions(+), 33 deletions(-) diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md index 92424371b..f8d797279 100644 --- a/hub/agents/npm/agent-email/README.md +++ b/hub/agents/npm/agent-email/README.md @@ -2,7 +2,7 @@ [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24** -**Eval scorecard (v0.2.4): aggregate 40.0 / 100** — `category_accuracy` 0.40 over 25 of 220 labeled emails ([`./SCORECARD.md`](./SCORECARD.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, and a worked recomputation of the aggregate. +**Eval scorecard (v0.2.4): aggregate 46.0 / 100** — `category_accuracy` 0.46 over 100 of 220 labeled emails ([`./SCORECARD.md`](./SCORECARD.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, a worked recomputation, and reproduction steps. Embed the **GAIA email agent** in your JS/TS app. 
It triages, organizes, replies to, and schedules from Gmail and Outlook — with every email body analyzed diff --git a/hub/agents/npm/agent-email/SCORECARD.md b/hub/agents/npm/agent-email/SCORECARD.md index b4f8ae5ca..000f2a127 100644 --- a/hub/agents/npm/agent-email/SCORECARD.md +++ b/hub/agents/npm/agent-email/SCORECARD.md @@ -18,27 +18,27 @@ recipe: model: Gemma-4-E4B-it-GGUF corpus: tests/fixtures/email/synthetic_inbox.mbox ground_truth: tests/fixtures/email/ground_truth.json - limit: 25 + limit: 220 results: - test_cases_run: 25 + test_cases_run: 100 metrics: - name: category_accuracy - value: 0.4 + value: 0.46 weight: 1.0 aggregate: name: weighted_accuracy formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2) components: - metric: category_accuracy - value: 0.4 + value: 0.46 weight: 1.0 - value: 40.0 -generated_at: '2026-06-26T17:29:34.631236+00:00' + value: 46.0 +generated_at: '2026-06-26T17:40:26.470285+00:00' inherited_from: null --- # Email Triage — Eval Scorecard v0.2.4 -**Aggregate score: 40.0** (out of 100) +**Aggregate score: 46.0** (out of 100) ## Recipe @@ -47,12 +47,12 @@ inherited_from: null | Dataset | [tests/fixtures/email/ground_truth.json](tests/fixtures/email/ground_truth.json) | | Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, schema-2.0 triage taxonomy: fyi / needs_response / promotional / urgent / personal) | | Dataset size | 220 labeled examples | -| Test cases run | 25 | +| Test cases run | 100 | | Methodology | gaia eval benchmark — category classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. 
The corpus uses the schema-2.0 triage taxonomy, aligned with the agent's output labels (#1874) | ## Metrics - - **category_accuracy**: 0.4000 × 1.0 + - **category_accuracy**: 0.4600 × 1.0 ## Aggregate score recomputation @@ -61,7 +61,7 @@ Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)` Worked example: ``` -round(100 × ((0.4000 × 1.0)) / 1.0, 2) = 40.0 +round(100 × ((0.4600 × 1.0)) / 1.0, 2) = 46.0 ``` A reader can reproduce this value from the `aggregate.components` in the front @@ -72,19 +72,23 @@ matter alone — no eval-harness access needed. Run the following commands from the repository root: ```sh -# Step 1: run the benchmark (requires a running Lemonade Server on :13305) +# Step 1: run the benchmark (requires a Lemonade Server with the model loaded; AMD Ryzen AI / Strix Halo recommended) PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \ -GAIA_AGENT_TOOL_TIMEOUT=120 \ +GAIA_AGENT_TOOL_TIMEOUT=900 \ PYTHONPATH="$(pwd)" \ -gaia eval benchmark --limit 25 +gaia eval benchmark \ + --model Gemma-4-E4B-it-GGUF \ + --mbox-path tests/fixtures/email/synthetic_inbox.mbox \ + --ground-truth tests/fixtures/email/ground_truth.json \ + --limit 220 \ + --output-dir /tmp/email-eval -# Step 2: generate the scorecard from the benchmark output -PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \ +# Step 2: generate this scorecard from the benchmark output PYTHONPATH="$(pwd)" \ python hub/agents/python/email/packaging/gen_scorecard.py \ - --benchmark-dir /private/tmp/claude-501/-Users-tomasz-src-amd-gaia--claude-worktrees-sleepy-chatelet-2b818a/314bd25e-fbc0-4ab7-aab0-a8825585e5ef/scratchpad/email-eval-relabeled \ + --benchmark-dir /tmp/email-eval \ --ground-truth tests/fixtures/email/ground_truth.json \ - --limit 25 + --limit 220 ``` See [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) for the full setup guide. 
diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py index 1837a1389..344817bda 100644 --- a/hub/agents/python/email/packaging/gen_scorecard.py +++ b/hub/agents/python/email/packaging/gen_scorecard.py @@ -212,28 +212,32 @@ def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None): import datetime - # Construct an exact reproduction command using the supplied arguments, so any - # reader can reproduce the scorecard result from scratch. - limit_flag = f" --limit {limit}" if limit is not None else "" + # Construct a portable, exact reproduction command so any reader can reproduce + # this scorecard from scratch. Use repo-relative paths and a generic output dir + # only — never a local absolute path (this ships in a published artifact). + limit_flag = f" \\\n --limit {limit}" if limit is not None else "" ground_truth_rel = ( str(ground_truth_path.relative_to(_REPO_ROOT)) if str(ground_truth_path).startswith(str(_REPO_ROOT)) else ground_truth_path.name ) - benchmark_dir_display = str(benchmark_dir) reproduction_command = ( - "# Step 1: run the benchmark (requires a running Lemonade Server on :13305)\n" - f"PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n" - f"GAIA_AGENT_TOOL_TIMEOUT=120 \\\n" - f"PYTHONPATH=\"$(pwd)\" \\\n" - f"gaia eval benchmark{limit_flag}\n\n" - "# Step 2: generate the scorecard from the benchmark output\n" - f"PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n" - f"PYTHONPATH=\"$(pwd)\" \\\n" - f"python hub/agents/python/email/packaging/gen_scorecard.py \\\n" - f" --benchmark-dir {benchmark_dir_display} \\\n" + "# Step 1: run the benchmark (requires a Lemonade Server with the model " + "loaded; AMD Ryzen AI / Strix Halo recommended)\n" + "PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n" + "GAIA_AGENT_TOOL_TIMEOUT=900 \\\n" + 'PYTHONPATH="$(pwd)" \\\n' + "gaia eval benchmark \\\n" + f" --model {model} \\\n" + " --mbox-path 
tests/fixtures/email/synthetic_inbox.mbox \\\n" + f" --ground-truth {ground_truth_rel}{limit_flag} \\\n" + " --output-dir /tmp/email-eval\n\n" + "# Step 2: generate this scorecard from the benchmark output\n" + 'PYTHONPATH="$(pwd)" \\\n' + "python hub/agents/python/email/packaging/gen_scorecard.py \\\n" + " --benchmark-dir /tmp/email-eval \\\n" f" --ground-truth {ground_truth_rel}" - + (f" \\\n --limit {limit}" if limit is not None else "") + + (f"{limit_flag}" if limit is not None else "") ) return ResultPayload(