diff --git a/.claude/skills/adding-eval-scorecard/SKILL.md b/.claude/skills/adding-eval-scorecard/SKILL.md new file mode 100644 index 000000000..97123244d --- /dev/null +++ b/.claude/skills/adding-eval-scorecard/SKILL.md @@ -0,0 +1,98 @@ +--- +name: "adding-eval-scorecard" +description: "Adopt the per-agent eval scorecard for a GAIA hub agent: write the harness→payload adapter, run the eval to produce a REAL scorecard, link + surface it from the agent's README, wire the release gate, and (for a new agent) generalize the format. Use when asked to 'add a scorecard', 'adopt the eval scorecard', 'generate the scorecard for ', or wire scorecard CI for an agent. Builds on docs/reference/eval-scorecard.mdx and the email agent reference adapter." +--- + +# Adding an Eval Scorecard to a GAIA Agent + +Adopt the release **eval scorecard** ([`docs/reference/eval-scorecard.mdx`](../../../docs/reference/eval-scorecard.mdx)) for one hub agent. The system is `harness → result payload → generator → scorecard`, with a standalone presence+regression release gate. The **email agent is the reference implementation** — mirror it. + +**Core modules (do not modify; reuse):** +- `src/gaia/eval/release_scorecard.py` — `ResultPayload`, `compute_aggregate`, `render_scorecard`, `write_scorecard`, `validate_scorecard`, `carry_forward`. Harness-agnostic (stdlib + PyYAML only). +- `src/gaia/eval/scorecard_gate.py` — the standalone gate (`python -m gaia.eval.scorecard_gate`). +- Reference adapter: `hub/agents/python/email/packaging/gen_scorecard.py`. + +This is a **phased checklist with a hard gate at the real-eval step** — the scorecard MUST come from an actual eval run, never hand-authored numbers. + +## Phase 1 — Locate the agent's surfaces + +1. **Version source of truth** = the `version:` field in `/gaia-agent.yaml`. Never invent a parallel scheme. +2. **Canonical README** (where the scorecard is linked + surfaced): for an npm-published agent it is the npm client README (e.g. 
`hub/agents/npm/<package>/README.md`), NOT a `packaging/README.md`. For a Python-only agent it is `hub/agents/python/<agent>/README.md`. Confirm which by checking what `release_agent_<agent>.yml` publishes (`README:` env) — the published README is the one to link. +3. **doc-root** = the directory holding that canonical README. The scorecard lives at `<doc-root>/SCORECARD.md` — a **single file updated in place**, versioned via the publish snapshot (same as README.md). **There is no `scorecards/` directory.** +4. **Eval vehicle**: what existing harness produces this agent's accuracy metric? (email → `gaia eval benchmark` over `tests/fixtures/email/`.) If none exists, STOP and surface that — propose the minimal harness before building; do not invent numbers. + +## Phase 2 — Write the adapter (harness → payload) + +Copy `hub/agents/python/email/packaging/gen_scorecard.py` as the template. The adapter: +- imports ONLY `gaia.eval.release_scorecard` (never the harness or agent package — preserve loose coupling); +- reads the harness output, builds a `ResultPayload`; +- populates `reproduction_command` with the **exact shell commands** to reproduce this scorecard, including all required env vars (`PYTHON_KEYRING_BACKEND`, `GAIA_AGENT_TOOL_TIMEOUT`, `PYTHONPATH`); +- defines **"judged"** explicitly and **raises loudly** if zero results are judged (no silent 0.0); +- records **dataset size** (total labeled examples) and **test_cases_run** (subset executed) as DISTINCT fields; +- stores **repo-relative** paths only (never a local absolute path — it ships in a published artifact); +- records the eval `limit`/config so future regression checks are comparable; +- writes to `<doc-root>/SCORECARD.md` (the single file; `--output-dir` overrides to a directory, but the filename is always `SCORECARD.md`).
+ +Add an offline unit test against a committed sample harness-output fixture (see `tests/fixtures/eval/email_benchmark_scorecard.json` + `tests/unit/eval/test_release_scorecard.py::TestEmailAdapter`) so the adapter is testable without a live model. + +## Phase 3 — Run the REAL eval (hard gate — no hand-authored numbers) + +The accuracy number must come from an actual run. For the email agent: + +```bash +# Real eval needs Lemonade + the model. Prefer AMD hardware (Strix Halo / Ryzen AI); +# the [self-hosted, lemonade-eval] runner is the canonical environment. +GAIA_AGENT_TOOL_TIMEOUT=900 \ +PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \ +PYTHONPATH="$(pwd)" \ + <venv>/bin/gaia eval benchmark \ + --model Gemma-4-E4B-it-GGUF \ + --mbox-path tests/fixtures/email/synthetic_inbox.mbox \ + --ground-truth tests/fixtures/email/ground_truth.json \ + --limit 25 --output-dir <persistent-dir> + +PYTHONPATH="$(pwd)" \ +<venv>/bin/python hub/agents/python/email/packaging/gen_scorecard.py \ + --benchmark-dir <persistent-dir> --limit 25 +# → writes hub/agents/npm/agent-email/SCORECARD.md in place +``` + +**Headless gotchas (see memory `project-email-benchmark-headless-gotchas`):** +- `PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring` — the email agent's calendar-connector resolution blocks forever on the macOS Keychain (and can stall on Linux SecretService) in non-interactive contexts. Without this it hangs at 0% CPU during agent construction. +- `PYTHONPATH="$(pwd)"` — the benchmark imports `tests.fixtures.email.*`; the console script doesn't add the repo root. +- `GAIA_AGENT_TOOL_TIMEOUT=900` — triage of N emails is one tool call; the 180s default abandons it on slow backends, yielding a degenerate 0-email FAIL run. +- Write `--output-dir` to a **persistent** dir, not `/tmp` (cleared on session resume). +- Record honestly: if the metric is low for a known reason (e.g. a taxonomy/label mismatch), put the explanation in the adapter's `methodology` string and link the tracking issue — never inflate the number.
+ +## Phase 4 — Surface, link, and gate + +1. **Link + surface** from the canonical README: a one-line `Eval scorecard (vX.Y.Z): aggregate N/100 … ([./SCORECARD.md](./SCORECARD.md))`. The relative link must resolve in-repo. +2. **npm `files`**: if the agent publishes on npm, add `SCORECARD.md` to `package.json` `files`. **Do not** add a `scorecards/` directory — only the single current file ships. +3. **Hub display**: a published scorecard surfaces on the agent's hub page / Agent UI detail view (see `workers/agent-hub` + `AgentDetailModal.tsx`); ensure the publish step passes `--eval-scorecard <doc-root>/SCORECARD.md` to `publish_to_r2.py`. +4. **Release gate**: add a `scorecard-gate` job to `release_agent_<agent>.yml` and list it in `publish.needs`. The job runs on a GitHub-hosted runner (it only parses committed files — no eval): + ```bash + # Presence-only (no previous tag yet): + python -m gaia.eval.scorecard_gate \ + --scorecard <doc-root>/SCORECARD.md + + # With best-effort previous-release baseline (recommended for CI): + PREV="$(git describe --tags --abbrev=0 --match 'agent-pkg-<agent>-*' "${GITHUB_REF_NAME}^" 2>/dev/null || true)" + if [ -n "$PREV" ]; then + python -m gaia.eval.scorecard_gate \ + --scorecard <doc-root>/SCORECARD.md --baseline-ref "$PREV" + else + python -m gaia.eval.scorecard_gate \ + --scorecard <doc-root>/SCORECARD.md + fi + ``` + The job must NOT have `continue-on-error`, an `environment:`, or a `permissions:` override (inherits `contents: read`; needs no secrets). Fetch full history (`fetch-depth: 0`) so `git describe` resolves. +5. **Auto-update/reject loop**: for re-running on agent changes and refreshing the committed scorecard, see `eval-scorecard.mdx` "Keeping the scorecard current" and the self-hosted refresh workflow — reject-on-worse is the gate; better-or-equal refreshes the committed `SCORECARD.md`.
+ +## Phase 5 — Verify (evidence before "done") + +Run and capture: the generated `SCORECARD.md`; the gate **passing** on it (exit 0); the gate **blocking** a manufactured regression (exit 1, via `--baseline-file` with a higher-scoring card) and a missing card (exit 1); a by-hand recompute of the aggregate from `aggregate.components` matching the recorded value. Run `python util/lint.py --all` and the eval unit tests. These are the PR's real-world proof. + +## Versioning + +- **Patch** release → `carry_forward(prev_scorecard_path, new_version)` reads the version from the front matter of the current `SCORECARD.md` (not from the filename) and copies results verbatim, sets `inherited_from`; do NOT re-run the eval. +- **Minor/major** release → re-run the eval (Phase 3); `carry_forward` refuses a non-patch bump with a "re-run" error. diff --git a/.github/workflows/email_scorecard_refresh.yml b/.github/workflows/email_scorecard_refresh.yml new file mode 100644 index 000000000..9a8f0a849 --- /dev/null +++ b/.github/workflows/email_scorecard_refresh.yml @@ -0,0 +1,172 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT + +# Email agent eval-scorecard refresh + regression gate (#1862). +# +# Answers "how does a PR that changes the agent keep the scorecard honest?": +# when the email agent's LLM-affecting code (or the eval corpus) changes, this +# re-runs the REAL eval, regenerates the scorecard, and then: +# - score IMPROVED or held -> commits the refreshed SCORECARD.md to the branch +# - score REGRESSED -> fails the job (the worse card is NOT committed) +# +# `gaia eval benchmark` needs Lemonade on AMD hardware, so this runs ONLY on the +# self-hosted [self-hosted, lemonade-eval] pool — GitHub-hosted runners cannot run +# it. The release-time `scorecard-gate` job in release_agent_email.yml is the +# hosted-CI backstop (it parses committed files only, no eval). +# +# Two regression checks run here: +# 1. 
SAME-VERSION: fresh aggregate vs the currently-committed SCORECARD.md — +# stops a noisy/worse re-run from silently overwriting a good score. +# 2. CROSS-VERSION (best-effort): fresh SCORECARD.md vs the prior version tag +# via --baseline-ref. +# +# Auto-commit needs `contents: write` and only works on the repo's own branches; +# a fork PR's GITHUB_TOKEN is read-only — for forks, run the eval locally / on AMD +# hardware and commit the scorecard by hand (the release gate still enforces it). + +name: Email Agent Eval — scorecard refresh + +on: + workflow_dispatch: + inputs: + limit: + description: 'Messages to triage (must match the committed scorecard for comparability)' + required: false + default: '25' + model: + description: 'Lemonade model id' + required: false + default: 'Gemma-4-E4B-it-GGUF' + push: + branches-ignore: + - main + paths: + - 'hub/agents/python/email/**' + - 'tests/fixtures/email/**' + - 'src/gaia/eval/release_scorecard.py' + - 'src/gaia/eval/scorecard_gate.py' + +concurrency: + # Share the single Lemonade backend slot with the other self-hosted evals so two + # runs never race-evict each other's model (CLAUDE.md: evals run serially). 
+ group: lemonade-eval + cancel-in-progress: false + +permissions: + contents: write # auto-commit the refreshed scorecard to the branch + +env: + SCORECARD: hub/agents/npm/agent-email/SCORECARD.md + MANIFEST: hub/agents/python/email/gaia-agent.yaml + LIMIT: ${{ github.event.inputs.limit || '25' }} + MODEL: ${{ github.event.inputs.model || 'Gemma-4-E4B-it-GGUF' }} + +jobs: + refresh: + name: Re-run eval, refresh-or-reject scorecard + runs-on: [self-hosted, lemonade-eval] + timeout-minutes: 90 + steps: + - name: Checkout (the pushed branch) + uses: actions/checkout@v6 + with: + ref: ${{ github.head_ref || github.ref_name }} + fetch-depth: 0 # full history for git describe (cross-version baseline) + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.10' + + - name: Install in isolated venv + run: | + python -m venv .venv-scorecard + source .venv-scorecard/bin/activate + python -m pip install --upgrade pip + pip install -e ".[dev,eval,api]" + echo "$PWD/.venv-scorecard/bin" >> "$GITHUB_PATH" + + - name: Resolve version + capture currently-committed aggregate + id: pre + run: | + set -euo pipefail + VERSION=$(python -c "import yaml; print(yaml.safe_load(open('${MANIFEST}'))['version'])") + echo "version=${VERSION}" >> "$GITHUB_OUTPUT" + # Aggregate of the SCORECARD.md as committed on this branch (empty if new). + if git cat-file -e "HEAD:${SCORECARD}" 2>/dev/null; then + git show "HEAD:${SCORECARD}" > /tmp/committed_scorecard.md + COMMITTED=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('/tmp/committed_scorecard.md'))['aggregate']['value'])") + else + COMMITTED="" + fi + echo "committed=${COMMITTED}" >> "$GITHUB_OUTPUT" + # Resolve the previous release tag for the cross-version check. 
+ PREV="$(git describe --tags --abbrev=0 \ + --match 'agent-pkg-email-*' \ + "HEAD^" 2>/dev/null || true)" + echo "prev_tag=${PREV}" >> "$GITHUB_OUTPUT" + echo "Version ${VERSION}; committed aggregate: ${COMMITTED:-}; prev tag: ${PREV:-}" + + - name: Run the email-triage benchmark (real eval) + env: + # The agent's calendar-connector resolution blocks on the OS keyring in + # a headless context — disable it so construction doesn't hang. + PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring + # Triage of N emails is one tool call; the 180s default abandons it on a + # slow backend and yields a degenerate 0-email run. + GAIA_AGENT_TOOL_TIMEOUT: '900' + PYTHONPATH: ${{ github.workspace }} + run: | + set -euo pipefail + rm -rf eval-out && mkdir -p eval-out + gaia eval benchmark \ + --model "${MODEL}" \ + --mbox-path tests/fixtures/email/synthetic_inbox.mbox \ + --ground-truth tests/fixtures/email/ground_truth.json \ + --limit "${LIMIT}" \ + --output-dir eval-out + + - name: Regenerate SCORECARD.md from the real run + run: | + set -euo pipefail + python hub/agents/python/email/packaging/gen_scorecard.py \ + --benchmark-dir eval-out --limit "${LIMIT}" + + - name: Same-version regression check (reject a worse re-run) + run: | + set -euo pipefail + COMMITTED="${{ steps.pre.outputs.committed }}" + FRESH=$(python -c "from gaia.eval.release_scorecard import parse_scorecard; print(parse_scorecard(__import__('pathlib').Path('${SCORECARD}'))['aggregate']['value'])") + echo "fresh aggregate: ${FRESH} | committed: ${COMMITTED:-}" + if [ -n "${COMMITTED}" ] && python -c "import sys; sys.exit(0 if float('${FRESH}') < float('${COMMITTED}') else 1)"; then + echo "::error::Scorecard regression: re-run scored ${FRESH} < committed ${COMMITTED}. Not committing the worse card. Investigate, or override intentionally via --allow-regression in a manual commit." + git checkout -- "${SCORECARD}" || true + exit 1 + fi + echo "No same-version regression — fresh score is >= committed." 
+ + - name: Cross-version gate (fresh SCORECARD.md vs prior version tag, best-effort) + run: | + set -euo pipefail + PREV="${{ steps.pre.outputs.prev_tag }}" + if [ -n "${PREV}" ]; then + python -m gaia.eval.scorecard_gate \ + --scorecard "${SCORECARD}" \ + --baseline-ref "${PREV}" + else + python -m gaia.eval.scorecard_gate \ + --scorecard "${SCORECARD}" + fi + + - name: Commit the refreshed SCORECARD.md (only if it changed for the better/equal) + run: | + set -euo pipefail + git add "${SCORECARD}" # stage first: a brand-new (untracked) scorecard is invisible to a plain git diff + if git diff --cached --quiet -- "${SCORECARD}"; then + echo "SCORECARD.md unchanged — nothing to commit." + exit 0 + fi + git config user.name "${GITHUB_ACTOR}" + git config user.email "${GITHUB_ACTOR}@users.noreply.github.com" + git commit -m "eval(email): refresh v${{ steps.pre.outputs.version }} scorecard from benchmark run" + git push origin "HEAD:${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" # runner env vars (not inline expression interpolation) so a hostile branch name cannot inject shell diff --git a/.github/workflows/release_agent_email.yml b/.github/workflows/release_agent_email.yml index ea183775f..3bc05451b 100644 --- a/.github/workflows/release_agent_email.yml +++ b/.github/workflows/release_agent_email.yml @@ -266,11 +266,55 @@ jobs: echo "ok=false" >> "$GITHUB_OUTPUT" fi + # ── Stage 1b: scorecard presence + regression gate ───────────────── + scorecard-gate: + name: Scorecard gate + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 # full history so git describe can find previous tags + - uses: actions/setup-python@v6 + with: + python-version: "3.12" + - name: Install core + PyYAML + run: pip install -e . pyyaml + - name: Resolve previous release tag (best-effort baseline) + id: prev_tag + shell: bash + run: | + set -uo pipefail + # Find the most recent agent-pkg-email-* tag strictly before the + # current ref. On workflow_dispatch the current ref is a branch, not + # a tag, so we look for the latest tag of the right pattern overall.
+ PREV="$(git describe --tags --abbrev=0 \ + --match 'agent-pkg-email-*' \ + "${GITHUB_REF_NAME}^" 2>/dev/null || true)" + echo "prev_tag=${PREV}" >> "$GITHUB_OUTPUT" + if [ -n "${PREV}" ]; then + echo "Baseline tag: ${PREV}" + else + echo "No previous release tag found — presence-only check." + fi + - name: Run scorecard gate + shell: bash + run: | + set -euo pipefail + PREV="${{ steps.prev_tag.outputs.prev_tag }}" + if [ -n "${PREV}" ]; then + python -m gaia.eval.scorecard_gate \ + --scorecard hub/agents/npm/agent-email/SCORECARD.md \ + --baseline-ref "${PREV}" + else + python -m gaia.eval.scorecard_gate \ + --scorecard hub/agents/npm/agent-email/SCORECARD.md + fi + # ── Stage 2: publish to the hub + npm (single atomic step) ───────── publish: name: Publish to Hub + npm runs-on: ubuntu-latest - needs: [build, verify-darwin-x64-compat] + needs: [build, verify-darwin-x64-compat, scorecard-gate] # Manual approval gate: the `agent-publish` environment is configured (repo # Settings → Environments) with required reviewers, so this job pauses until a # maintainer approves — the human backstop for an accidental/tampered release @@ -458,6 +502,12 @@ jobs: case "$f" in *.json) continue ;; esac args+=(--artifact "$f") done + VER="${{ steps.ver.outputs.version }}" + scorecard_args=() + SCORECARD="hub/agents/npm/agent-email/SCORECARD.md" + if [ -f "${SCORECARD}" ]; then + scorecard_args+=(--eval-scorecard "${SCORECARD}") + fi python hub/agents/python/email/packaging/publish_to_r2.py \ --base-url "${GAIA_HUB_PUBLISH_URL:-${GAIA_HUB_BASE_URL:-https://hub.amd-gaia.ai}}" \ --manifest "${MANIFEST}" \ @@ -465,6 +515,7 @@ jobs: --changelog "${CHANGELOG}" \ --spec "${SPEC}" \ --skill "${SKILL}" \ + "${scorecard_args[@]}" \ "${args[@]}" \ --summary-out published.json echo "=== publish summary ===" diff --git a/docs/docs.json b/docs/docs.json index 5397cf0a5..ba1b26d90 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -356,6 +356,7 @@ "group": "Evaluation Framework", "pages": [ 
"reference/eval", + "reference/eval-scorecard", "eval" ] }, diff --git a/docs/reference/eval-scorecard.mdx b/docs/reference/eval-scorecard.mdx new file mode 100644 index 000000000..20151a45e --- /dev/null +++ b/docs/reference/eval-scorecard.mdx @@ -0,0 +1,271 @@ +--- +title: "Release Eval Scorecard" +description: "Per-agent eval scorecard: schema, storage convention, aggregate formula, versioning policy, reproduction, and release gate." +icon: "chart-bar" +--- + + + **Source Code:** + [`src/gaia/eval/release_scorecard.py`](https://github.com/amd/gaia/tree/main/src/gaia/eval/release_scorecard.py) (core generator) · + [`src/gaia/eval/scorecard_gate.py`](https://github.com/amd/gaia/tree/main/src/gaia/eval/scorecard_gate.py) (release gate) + + **Distinct from** [`src/gaia/eval/scorecard.py`](https://github.com/amd/gaia/tree/main/src/gaia/eval/scorecard.py) — that file is the per-run scenario PASS/FAIL aggregator used internally by `gaia eval agent`. This document describes the outward-facing *release artifact*. + + +## Overview + +Each published hub agent ships a **release scorecard** — a single `SCORECARD.md` file (updated in place per release, versioned via the publish snapshot, the same way `README.md` works) that records: + +- The **eval recipe**: dataset reference, methodology, configuration, and metric definitions. +- The **measured results**: per-metric values, number of test cases actually run, and dataset size. +- A single **named aggregate score**: a deterministic, recomputable percentage so a reviewer can verify the number without re-running the eval. +- A **Reproduction section**: the exact commands to reproduce the result from scratch. + +Scorecards are committed alongside the agent's README and linked from it. A standalone **release gate** (`scorecard_gate.py`) blocks packaging when the scorecard is missing or when its aggregate score strictly regresses below the prior version's. 
+ +## File format + +Scorecard files are **Markdown with YAML front matter** (`.md`). The front matter holds all machine-readable fields; the body is a human-readable summary with a worked recomputation and a Reproduction section. + +``` +--- +schema_version: 1 +agent: + name: Email Triage + version: 0.2.4 +recipe: + dataset: + reference: tests/fixtures/email/ground_truth.json + description: Synthetic email corpus (FakeGmailBackend, schema-2.0 triage taxonomy) + size: 220 + methodology: gaia eval benchmark — category_accuracy (case-insensitive exact match) + config: + harness: gaia eval benchmark + model: Gemma-4-E4B-it-GGUF + limit: 25 +results: + test_cases_run: 25 + metrics: + - name: category_accuracy + value: 0.40 + weight: 1.0 +aggregate: + name: weighted_accuracy + formula: "round(100 * sum(weight_i * value_i) / sum(weight_i), 2)" + components: + - metric: category_accuracy + value: 0.40 + weight: 1.0 + value: 40.0 +generated_at: "2026-06-26T16:47:13+00:00" +inherited_from: null +--- + +# Email Triage — Eval Scorecard v0.2.4 + +**Aggregate score: 40.0** (out of 100) +... + +## Reproduction + +Run the following commands from the repository root: +... +``` + +### Required fields + +A scorecard missing any of these is **invalid** and will be rejected by the release gate: + +| Field | Description | +|-------|-------------| +| `schema_version` | Always `1` for this schema version | +| `agent.name` | Human-readable agent name | +| `agent.version` | Semver version string (e.g. 
`0.2.4`) | +| `recipe.dataset.reference` | Dataset path or URL | +| `recipe.dataset.description` | Short description | +| `recipe.dataset.size` | Total labeled examples available | +| `recipe.methodology` | How the eval was run | +| `recipe.config` | Harness config (model, limit, corpus, …) | +| `results.test_cases_run` | Subset of examples actually executed this run | +| `results.metrics` | List of `{name, value, weight}` dicts | +| `aggregate.name` | Name of the aggregate score | +| `aggregate.formula` | Human-readable formula string | +| `aggregate.components` | List of `{metric, value, weight}` dicts | +| `aggregate.value` | The computed aggregate float | + +### Two counts — defined distinctly + +`recipe.dataset.size` and `results.test_cases_run` are intentionally **separate fields**: + +- **`recipe.dataset.size`** — total labeled examples available in the dataset (fixed for a given dataset version). +- **`results.test_cases_run`** — the subset actually executed in this run (may be limited by `--limit`). Must be ≤ `recipe.dataset.size`. + +They may be numerically equal (when the full dataset is run), but they represent different things. + + + **Comparability depends on a consistent `--limit`.** Future regression checks compare aggregate scores. If one run uses `--limit 12` and the next uses `--limit 100`, the scores may differ for reasons unrelated to model quality. Record the exact `limit` in `recipe.config` and keep it consistent across versions. + + +## Aggregate score formula + +``` +aggregate.value = round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2) +``` + +where each `valueᵢ` is a metric value in [0, 1] and each `weightᵢ` defaults to 1.0. + +The result is a **percentage in [0, 100]**. For a single metric with weight 1.0: + +``` +round(100 × 0.40, 2) = 40.0 +``` + +A reader can reproduce this value from `aggregate.components` alone — no eval-harness access needed. 
+The `aggregate.formula` field in the front matter states the formula in human-readable form so it is self-documenting. + +## Storage convention + +Each agent package ships a **single `SCORECARD.md`** file, updated in place per release — the same way `README.md` works. Per-version uniqueness comes from the publish snapshot (R2 stores the file at `agents///SCORECARD.md`; the npm package ships only the current version's `SCORECARD.md`). + +``` +/ + README.md ← canonical README (links to SCORECARD.md) + SCORECARD.md ← current version's scorecard, updated in place + SPEC.md + SKILL.md + CHANGELOG.md +``` + +The `doc-root` is the location of the agent's canonical README: + +| Agent | doc-root | +|-------|----------| +| Email Triage (`@amd-gaia/agent-email`) | `hub/agents/npm/agent-email/` | + +The relative link `./SCORECARD.md` resolves both in-repo and when the directory is published as an npm package. The npm `files` array includes `SCORECARD.md` (not a `scorecards/` directory). + +## Versioning policy + +### Patch releases — carry forward + +For a **patch release** (same `major.minor`, `patch` incremented), the prior version's results are carried forward verbatim using `carry_forward()`. Pass the path to the agent's current `SCORECARD.md`: + +```python +from gaia.eval.release_scorecard import carry_forward, write_scorecard +from pathlib import Path + +new_payload = carry_forward( + prev_scorecard_path=Path("hub/agents/npm/agent-email/SCORECARD.md"), + new_version="0.2.5", +) +# new_payload.inherited_from == "0.2.4" (read from front matter, not filename) +write_scorecard(new_payload, Path("hub/agents/npm/agent-email/SCORECARD.md")) +``` + +The resulting scorecard has `inherited_from: "0.2.4"` and identical `results` and `aggregate` fields. The aggregate score is unchanged, so the release gate's equal-score case passes. + +`carry_forward()` reads the prior version from the `agent.version` field in the front matter — **not** from the filename. 
+ +### Minor / major releases — re-run required + +For a **minor or major bump**, `carry_forward()` raises `ValueError` with a "re-run" message. Run the eval fresh and generate a new scorecard: + +```bash +PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \ +GAIA_AGENT_TOOL_TIMEOUT=900 \ +PYTHONPATH="$(pwd)" \ +gaia eval benchmark \ + --model Gemma-4-E4B-it-GGUF \ + --mbox-path tests/fixtures/email/synthetic_inbox.mbox \ + --ground-truth tests/fixtures/email/ground_truth.json \ + --limit 25 \ + --output-dir /tmp/email-eval + +PYTHONPATH="$(pwd)" \ +python hub/agents/python/email/packaging/gen_scorecard.py \ + --benchmark-dir /tmp/email-eval \ + --limit 25 +``` + +This writes `hub/agents/npm/agent-email/SCORECARD.md` in place. + +## Release gate + +`scorecard_gate.py` is a standalone script that exits non-zero on failure: + +```bash +# Presence-only check (first adoption or no baseline specified): +python -m gaia.eval.scorecard_gate \ + --scorecard hub/agents/npm/agent-email/SCORECARD.md + +# Regression check against a specific prior scorecard file (unit tests / local): +python -m gaia.eval.scorecard_gate \ + --scorecard hub/agents/npm/agent-email/SCORECARD.md \ + --baseline-file /tmp/prev-SCORECARD.md + +# Regression check against a prior release tag (CI): +python -m gaia.eval.scorecard_gate \ + --scorecard hub/agents/npm/agent-email/SCORECARD.md \ + --baseline-ref agent-pkg-email-v0.2.3 +``` + +`--baseline-file` and `--baseline-ref` are mutually exclusive. If the file doesn't exist at the given ref, the gate treats it as first adoption (presence-only pass). + +### Gate logic + +1. **Presence check**: `--scorecard` path must exist and be a valid scorecard. → exit 1 if not. +2. **Baseline resolution**: + - `--baseline-file`: read the given file directly (no git access; suitable for unit tests). + - `--baseline-ref`: resolve via `git show <ref>:<path>`. If the file does not exist at that ref → **first adoption**, exit 0.
+ - Neither specified: **first adoption**, exit 0 (presence-only pass). +3. **Regression check**: if `candidate.aggregate.value < baseline.aggregate.value` (strict) → exit 1. +4. Equal or greater → exit 0. + +### Exit codes + +| Case | Exit code | +|------|-----------| +| Missing or invalid candidate scorecard | `1` | +| Strict regression vs baseline | `1` | +| No baseline (first adoption) | `0` | +| File absent at `--baseline-ref` | `0` | +| Equal score (patch carry-forward) | `0` | +| Score improved | `0` | + +### `--allow-regression` + +When a regression is intentional (e.g. a dataset correction or methodology change), use `--allow-regression`. The gate prints a GHA `::warning::` annotation naming both versions and scores, then exits 0: + +``` +::warning::Scorecard regression allowed by --allow-regression: v0.2.3=65.0 → v0.2.4=40.0 +WARNING: Regression override active. Prior version v0.2.3 scored 65.0; candidate v0.2.4 scored 40.0. ... +``` + +## Keeping the scorecard current (the update / reject loop) + +The scorecard must move with the agent: when LLM-affecting code changes, the eval is re-run and the committed `SCORECARD.md` refreshed — **upward**. A regression is blocked. + +Two enforcement points work together: + +1. **Reject-on-worse (always on, GitHub-hosted).** The `scorecard-gate` job in `release_agent_.yml` runs on every release. It only parses committed files (no eval), so it runs on a standard runner and **fails the build** if the committed scorecard regressed below the prior version or is missing. This is the hard gate. +2. **Run-and-refresh (self-hosted AMD).** `gaia eval benchmark` needs Lemonade on AMD hardware, so it cannot run on GitHub-hosted runners — it runs on the `[self-hosted, lemonade-eval]` pool. 
The `Email Agent Eval — scorecard refresh` workflow (`.github/workflows/email_scorecard_refresh.yml`) runs on demand (and on pushes touching the email agent), re-runs the eval, regenerates `SCORECARD.md`, then: + - **score ≥ committed** → commits the refreshed scorecard back to the branch (the PR carries the improved number); + - **score < committed** → fails loudly (the regression must be investigated, or consciously overridden with `--allow-regression`). + +So a PR that changes the agent gets its scorecard refreshed (better) or rejected (worse) automatically on the AMD runner, and the release gate is the backstop on hosted CI. Locally, `gen_scorecard.py` + `scorecard_gate.py` reproduce both steps (see the **`adding-eval-scorecard` skill**). + + + The refresh job needs `contents: write` and runs only on the repo's own branches — a fork PR's `GITHUB_TOKEN` is read-only and cannot auto-commit. For a fork PR, run the eval locally/on AMD hardware and commit the scorecard manually; the release gate still enforces no-regression. + + +## Adding a scorecard for a new agent + + + **Use the [`adding-eval-scorecard` skill](https://github.com/amd/gaia/tree/main/.claude/skills/adding-eval-scorecard/SKILL.md).** In Claude Code, invoke it instead of following these steps by hand — it carries the exact commands, the harness→payload→generator flow, the headless-eval gotchas (keyring/PYTHONPATH/tool-timeout), and the verification evidence to capture. The steps below are the reference the skill automates. + + +1. Write a `packaging/gen_scorecard.py` adapter (see `hub/agents/python/email/packaging/gen_scorecard.py` for a reference). The adapter should populate `reproduction_command` with the exact commands needed to reproduce the scorecard. +2. Run the eval and call the adapter → commit the resulting `SCORECARD.md` to `/SCORECARD.md`. +3. Link the scorecard from the README: `./SCORECARD.md`. +4. 
Add `SCORECARD.md` to the npm `package.json` `files` array (if published on npm); do **not** add a `scorecards/` directory. +5. Wire `scorecard_gate` into the release workflow (see `release_agent_email.yml` for the job topology). Use `--scorecard <doc-root>/SCORECARD.md` and `--baseline-ref <prev-tag>` (best-effort). diff --git a/hub/agents/npm/agent-email/README.md b/hub/agents/npm/agent-email/README.md index fa7ef97e0..f8d797279 100644 --- a/hub/agents/npm/agent-email/README.md +++ b/hub/agents/npm/agent-email/README.md @@ -2,6 +2,8 @@ [![npm version](https://img.shields.io/npm/v/@amd-gaia/agent-email?label=version)](https://www.npmjs.com/package/@amd-gaia/agent-email) · contract `SCHEMA_VERSION` **2.0** · last updated **2026-06-24** +**Eval scorecard (v0.2.4): aggregate 46.0 / 100** — `category_accuracy` 0.46 over 100 of 220 labeled emails ([`./SCORECARD.md`](./SCORECARD.md)), scored against the schema-2.0 triage taxonomy. The linked scorecard carries the full recipe, metrics, a worked recomputation, and reproduction steps. + Embed the **GAIA email agent** in your JS/TS app. It triages, organizes, replies to, and schedules from Gmail and Outlook — with every email body analyzed **locally on AMD Ryzen AI** via Lemonade.
No message content is sent to a cloud diff --git a/hub/agents/npm/agent-email/SCORECARD.md b/hub/agents/npm/agent-email/SCORECARD.md new file mode 100644 index 000000000..000f2a127 --- /dev/null +++ b/hub/agents/npm/agent-email/SCORECARD.md @@ -0,0 +1,94 @@ +--- +schema_version: 1 +agent: + name: Email Triage + version: 0.2.4 +recipe: + dataset: + reference: tests/fixtures/email/ground_truth.json + description: 'Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, + schema-2.0 triage taxonomy: fyi / needs_response / promotional / urgent / personal)' + size: 220 + methodology: gaia eval benchmark — category classification accuracy (case-insensitive + exact match of the agent's triage label vs the ground-truth label) over a synthetic + labeled corpus via FakeGmailBackend; no LLM judge. The corpus uses the schema-2.0 + triage taxonomy, aligned with the agent's output labels (#1874) + config: + harness: gaia eval benchmark + model: Gemma-4-E4B-it-GGUF + corpus: tests/fixtures/email/synthetic_inbox.mbox + ground_truth: tests/fixtures/email/ground_truth.json + limit: 220 +results: + test_cases_run: 100 + metrics: + - name: category_accuracy + value: 0.46 + weight: 1.0 +aggregate: + name: weighted_accuracy + formula: round(100 * sum(weight_i * value_i) / sum(weight_i), 2) + components: + - metric: category_accuracy + value: 0.46 + weight: 1.0 + value: 46.0 +generated_at: '2026-06-26T17:40:26.470285+00:00' +inherited_from: null +--- +# Email Triage — Eval Scorecard v0.2.4 + +**Aggregate score: 46.0** (out of 100) + +## Recipe + +| Field | Value | +|-------|-------| +| Dataset | [tests/fixtures/email/ground_truth.json](tests/fixtures/email/ground_truth.json) | +| Description | Synthetic email corpus for GAIA email-triage evaluation (FakeGmailBackend, schema-2.0 triage taxonomy: fyi / needs_response / promotional / urgent / personal) | +| Dataset size | 220 labeled examples | +| Test cases run | 100 | +| Methodology | gaia eval benchmark — category 
classification accuracy (case-insensitive exact match of the agent's triage label vs the ground-truth label) over a synthetic labeled corpus via FakeGmailBackend; no LLM judge. The corpus uses the schema-2.0 triage taxonomy, aligned with the agent's output labels (#1874) | + +## Metrics + + - **category_accuracy**: 0.4600 × 1.0 + +## Aggregate score recomputation + +Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)` + +Worked example: + +``` +round(100 × ((0.4600 × 1.0)) / 1.0, 2) = 46.0 +``` + +A reader can reproduce this value from the `aggregate.components` in the front +matter alone — no eval-harness access needed. + +## Reproduction + +Run the following commands from the repository root: + +```sh +# Step 1: run the benchmark (requires a Lemonade Server with the model loaded; AMD Ryzen AI / Strix Halo recommended) +PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \ +GAIA_AGENT_TOOL_TIMEOUT=900 \ +PYTHONPATH="$(pwd)" \ +gaia eval benchmark \ + --model Gemma-4-E4B-it-GGUF \ + --mbox-path tests/fixtures/email/synthetic_inbox.mbox \ + --ground-truth tests/fixtures/email/ground_truth.json \ + --limit 220 \ + --output-dir /tmp/email-eval + +# Step 2: generate this scorecard from the benchmark output +PYTHONPATH="$(pwd)" \ +python hub/agents/python/email/packaging/gen_scorecard.py \ + --benchmark-dir /tmp/email-eval \ + --ground-truth tests/fixtures/email/ground_truth.json \ + --limit 220 +``` + +See [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) for the full setup guide. 
diff --git a/hub/agents/npm/agent-email/package.json b/hub/agents/npm/agent-email/package.json index fc3ad9be5..115483bc0 100644 --- a/hub/agents/npm/agent-email/package.json +++ b/hub/agents/npm/agent-email/package.json @@ -48,6 +48,7 @@ "CHANGELOG.md", "SPEC.md", "SKILL.md", + "SCORECARD.md", "LICENSE" ], "engines": { diff --git a/hub/agents/python/email/packaging/gen_scorecard.py b/hub/agents/python/email/packaging/gen_scorecard.py new file mode 100644 index 000000000..344817bda --- /dev/null +++ b/hub/agents/python/email/packaging/gen_scorecard.py @@ -0,0 +1,351 @@ +#!/usr/bin/env python3 +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +""" +Email-agent adapter: generate a release scorecard from a ``gaia eval benchmark`` run. + +Reads the benchmark ``--output-dir`` (looks for a JSON file containing a +``scenarios`` key — ``scorecard.json`` in a real run, or any ``*scorecard*.json`` +fixture) and the ground-truth JSON, builds a :class:`ResultPayload`, and writes the +scorecard to ``hub/agents/npm/agent-email/SCORECARD.md`` (a single file, updated +in place — versioned via the publish snapshot, the same way README.md works). + +This adapter imports ``gaia.eval.release_scorecard`` (core generator) but never +imports the eval harness (``gaia.eval.benchmark``) or the email-agent package — +the loose-coupling spine is preserved. + +Usage:: + + PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\ + GAIA_AGENT_TOOL_TIMEOUT=120 \\ + PYTHONPATH="$(pwd)" \\ + python hub/agents/python/email/packaging/gen_scorecard.py \\ + --benchmark-dir /tmp/email-eval \\ + [--ground-truth tests/fixtures/email/ground_truth.json] \\ + [--limit 25] + +The ``--ground-truth`` path defaults to the canonical fixture in the repository. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +# Derive repo root the same way stamp_version.py does: +# packaging/ -> email/ -> python/ -> agents/ -> hub/ -> repo root +_PACKAGING_DIR = Path(__file__).resolve().parent +_EMAIL_ROOT = _PACKAGING_DIR.parent +_REPO_ROOT = _EMAIL_ROOT.parent.parent.parent.parent +_NPM_ROOT = _REPO_ROOT / "hub" / "agents" / "npm" / "agent-email" + +# Default ground-truth path +_DEFAULT_GT = _REPO_ROOT / "tests" / "fixtures" / "email" / "ground_truth.json" + +# Canonical benchmark scorecard filename (written by gaia eval benchmark) +_SCORECARD_FILENAME = "scorecard.json" + +# Output filename: single SCORECARD.md per agent package, updated in place. +_OUTPUT_FILENAME = "SCORECARD.md" + + +def _find_benchmark_scorecard(benchmark_dir: Path) -> Path: + """Locate the benchmark scorecard JSON in ``benchmark_dir``. + + Looks first for the canonical ``scorecard.json``, then for any ``*.json`` + file whose parsed content contains a ``scenarios`` key. Raises loudly if + none is found or if multiple ambiguous files match. + + Args: + benchmark_dir: Directory written by ``gaia eval benchmark --output-dir``. + + Returns: + Path to the benchmark scorecard JSON file. + + Raises: + FileNotFoundError: If ``benchmark_dir`` does not exist. + ValueError: If no suitable scorecard JSON is found in the directory. + """ + if not benchmark_dir.is_dir(): + raise FileNotFoundError( + f"Benchmark directory not found: {benchmark_dir}\n" + f"Run 'gaia eval benchmark --output-dir ' first." 
+ ) + + # Try the canonical name first + canonical = benchmark_dir / _SCORECARD_FILENAME + if canonical.exists(): + return canonical + + # Scan for any JSON containing a 'scenarios' key + matches: list[Path] = [] + for p in sorted(benchmark_dir.glob("*.json")): + try: + data = json.loads(p.read_text(encoding="utf-8")) + if isinstance(data, dict) and "scenarios" in data: + matches.append(p) + except (json.JSONDecodeError, OSError): + continue + + if not matches: + raise ValueError( + f"No benchmark scorecard JSON found in {benchmark_dir}.\n" + f"Expected '{_SCORECARD_FILENAME}' (written by 'gaia eval benchmark'), " + f"or any JSON file with a 'scenarios' key.\n" + f"Run 'gaia eval benchmark --output-dir {benchmark_dir}' to generate it." + ) + + if len(matches) > 1: + paths = ", ".join(str(p) for p in matches) + raise ValueError( + f"Ambiguous benchmark scorecard: multiple JSON files with a 'scenarios' " + f"key found in {benchmark_dir}: {paths}.\n" + f"Remove all but '{_SCORECARD_FILENAME}' and retry." + ) + + return matches[0] + + +def _is_judged(scenario: dict) -> bool: + """Return True if a scenario has a valid category_accuracy in [0,1].""" + quality = scenario.get("quality") + if not isinstance(quality, dict): + return False + acc = quality.get("category_accuracy") + if acc is None: + return False + try: + import math + + f = float(acc) + except (TypeError, ValueError): + return False + return 0.0 <= f <= 1.0 and math.isfinite(f) + + +def build_payload(benchmark_dir: Path, ground_truth_path: Path, limit=None): + """Build a :class:`~gaia.eval.release_scorecard.ResultPayload` from benchmark output. + + A scenario is **judged** iff it has a ``quality`` dict AND + ``quality.category_accuracy`` is a finite float in [0, 1]. Non-judged + scenarios (missing ``quality`` or invalid accuracy) are skipped. + + Args: + benchmark_dir: Directory written by ``gaia eval benchmark --output-dir``. + ground_truth_path: Path to ``ground_truth.json`` (the labeled corpus). 
+ limit: The ``--limit`` value used for the eval run, recorded in + ``config["limit"]`` for cross-version comparability. The benchmark + ``scorecard.json`` does not persist this, so it must be passed in. + + Returns: + Populated :class:`~gaia.eval.release_scorecard.ResultPayload`. + + Raises: + ValueError: If zero scenarios are judged (likely missing ``--ground-truth`` + or a benchmark run that produced no quality metrics). + FileNotFoundError: If required files are not found. + """ + # Import here (not at module top) so tests that import build_payload before + # gaia is installed in the test environment fail at call time, not import time. + from gaia.eval.release_scorecard import ResultPayload, compute_aggregate + + scorecard_path = _find_benchmark_scorecard(benchmark_dir) + data = json.loads(scorecard_path.read_text(encoding="utf-8")) + scenarios = data.get("scenarios", []) + + # Separate judged from non-judged scenarios + judged = [s for s in scenarios if _is_judged(s)] + + if not judged: + raise ValueError( + f"Zero judged scenarios in {scorecard_path}.\n" + f"Possible causes: benchmark ran without '--ground-truth', " + f"or no scenario produced a category_accuracy metric.\n" + f"Benchmark dir: {benchmark_dir}" + ) + + # Aggregate metrics from judged scenarios + category_accuracy = sum( + s["quality"]["category_accuracy"] for s in judged + ) / len(judged) + + test_cases_run = sum(int(s.get("total_emails", 0)) for s in judged) + + # Dataset size = labeled entries in ground_truth.json (excluding _meta key) + if not ground_truth_path.exists(): + raise FileNotFoundError( + f"Ground truth not found: {ground_truth_path}\n" + f"Pass --ground-truth pointing to the labeled corpus JSON." 
+ ) + ground_truth = json.loads(ground_truth_path.read_text(encoding="utf-8")) + dataset_size = len(ground_truth) - (1 if "_meta" in ground_truth else 0) + + # Read version from gaia-agent.yaml + agent_yaml_path = _EMAIL_ROOT / "gaia-agent.yaml" + try: + import yaml # noqa: PLC0415 (local import; PyYAML already a dep) + + agent_data = yaml.safe_load(agent_yaml_path.read_text(encoding="utf-8")) or {} + except Exception as exc: + raise ValueError( + f"Cannot read agent version from {agent_yaml_path}: {exc}" + ) from exc + + version = str(agent_data.get("version", "")) + if not version: + raise ValueError( + f"No 'version:' field found in {agent_yaml_path}." + ) + + # Model id: benchmark output records it as the per-scenario `category`. + # Fall back to the manifest's first declared model. + scenario_model = scenarios[0].get("category") if scenarios else None + manifest_models = agent_data.get("models") or [None] + model = scenario_model or manifest_models[0] + + metrics = [ + {"name": "category_accuracy", "value": float(category_accuracy), "weight": 1.0} + ] + compute_aggregate(metrics) # validate metrics; aggregate embedded in render_scorecard + + import datetime + + # Construct a portable, exact reproduction command so any reader can reproduce + # this scorecard from scratch. Use repo-relative paths and a generic output dir + # only — never a local absolute path (this ships in a published artifact). 
+ limit_flag = f" \\\n --limit {limit}" if limit is not None else "" + # Path.relative_to() compares whole path components. A raw str.startswith() + # prefix test also matches sibling directories (e.g. "<repo>-backup"), after + # which relative_to() raises ValueError; fall back to the bare filename instead. + # as_posix() keeps forward slashes so the published sh recipe stays portable. + try: + ground_truth_rel = ground_truth_path.relative_to(_REPO_ROOT).as_posix() + except ValueError: + ground_truth_rel = ground_truth_path.name + reproduction_command = ( + "# Step 1: run the benchmark (requires a Lemonade Server with the model " + "loaded; AMD Ryzen AI / Strix Halo recommended)\n" + "PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring \\\n" + "GAIA_AGENT_TOOL_TIMEOUT=900 \\\n" + 'PYTHONPATH="$(pwd)" \\\n' + "gaia eval benchmark \\\n" + f" --model {model} \\\n" + " --mbox-path tests/fixtures/email/synthetic_inbox.mbox \\\n" + f" --ground-truth {ground_truth_rel}{limit_flag} \\\n" + " --output-dir /tmp/email-eval\n\n" + "# Step 2: generate this scorecard from the benchmark output\n" + 'PYTHONPATH="$(pwd)" \\\n' + "python hub/agents/python/email/packaging/gen_scorecard.py \\\n" + " --benchmark-dir /tmp/email-eval \\\n" + f" --ground-truth {ground_truth_rel}" + # limit_flag is already "" when no --limit was given. + + limit_flag + ) + + return ResultPayload( + agent_name="Email Triage", + agent_version=version, + dataset_reference="tests/fixtures/email/ground_truth.json", + dataset_description=( + "Synthetic email corpus for GAIA email-triage evaluation " + "(FakeGmailBackend, schema-2.0 triage taxonomy: " + "fyi / needs_response / promotional / urgent / personal)" + ), + dataset_size=dataset_size, + methodology=( + "gaia eval benchmark — category classification accuracy " + "(case-insensitive exact match of the agent's triage label vs the " + "ground-truth label) over a synthetic labeled corpus via " + "FakeGmailBackend; no LLM judge. The corpus uses the schema-2.0 " + "triage taxonomy, aligned with the agent's output labels (#1874)" + ), + config={ + "harness": "gaia eval benchmark", + "model": model, + "corpus": "tests/fixtures/email/synthetic_inbox.mbox", + # Store a repo-relative path — never leak a local absolute path into + # a committed/published artifact. 
+ "ground_truth": ground_truth_rel, + "limit": limit, + }, + test_cases_run=test_cases_run, + metrics=metrics, + aggregate_name="weighted_accuracy", + generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), + inherited_from=None, + reproduction_command=reproduction_command, + ) + + +def main(argv=None) -> int: + """Generate and write the email-agent scorecard.""" + parser = argparse.ArgumentParser( + description="Generate a release scorecard for the email-triage agent.", + prog="gen_scorecard.py", + ) + parser.add_argument( + "--benchmark-dir", + required=True, + help=( + "Directory written by 'gaia eval benchmark --output-dir ' " + "(must contain scorecard.json)." + ), + ) + parser.add_argument( + "--ground-truth", + default=str(_DEFAULT_GT), + help=( + f"Path to ground_truth.json (default: {_DEFAULT_GT.relative_to(_REPO_ROOT)})" + ), + ) + parser.add_argument( + "--output-dir", + default=None, + help=( + "Override the scorecard output directory " + f"(default: hub/agents/npm/agent-email/, writes {_OUTPUT_FILENAME})." + ), + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help=( + "The --limit value passed to 'gaia eval benchmark' for this run. " + "Recorded in config.limit for cross-version comparability " + "(the benchmark output does not persist it)." 
+ ), + ) + + args = parser.parse_args(argv) + + benchmark_dir = Path(args.benchmark_dir).resolve() + gt_path = Path(args.ground_truth).resolve() + + try: + payload = build_payload(benchmark_dir, gt_path, limit=args.limit) + except (ValueError, FileNotFoundError) as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 1 + + from gaia.eval.release_scorecard import write_scorecard + + if args.output_dir: + out_dir = Path(args.output_dir) + else: + out_dir = _NPM_ROOT + + out_dir.mkdir(parents=True, exist_ok=True) + out_path = out_dir / _OUTPUT_FILENAME + write_scorecard(payload, out_path) + + print( + f"Scorecard written: {out_path}\n" + f" Version: {payload.agent_version}\n" + f" Aggregate: {payload.metrics[0]['value']:.4f} category_accuracy " + f"({payload.test_cases_run} emails judged)\n" + f" Dataset size: {payload.dataset_size} labeled examples" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hub/agents/python/email/packaging/publish_to_r2.py b/hub/agents/python/email/packaging/publish_to_r2.py index 4da341837..9cca41e3d 100644 --- a/hub/agents/python/email/packaging/publish_to_r2.py +++ b/hub/agents/python/email/packaging/publish_to_r2.py @@ -129,6 +129,7 @@ def publish_one( changelog_bytes: bytes | None = None, spec_bytes: bytes | None = None, skill_bytes: bytes | None = None, + eval_scorecard_bytes: bytes | None = None, package_files_bytes: bytes | None = None, ) -> dict: if not artifact_path.exists(): @@ -172,6 +173,10 @@ def publish_one( files["spec"] = ("SPEC.md", spec_bytes, "text/markdown") if skill_bytes is not None: files["skill"] = ("SKILL.md", skill_bytes, "text/markdown") + # The eval scorecard rides along with the first platform binary and becomes + # the catalog entry's `eval_score` and `eval_scorecard_url`. 
+ if eval_scorecard_bytes is not None: + files["eval_scorecard"] = ("eval-scorecard.md", eval_scorecard_bytes, "text/markdown") # The whole-package file listing rides with the zip artifact — it becomes # the catalog entry's `package.files` (the hub's file-list display). if package_files_bytes is not None: @@ -271,6 +276,14 @@ def main(argv=None) -> int: help="Path to SKILL.md to publish as the agent's catalog skill " "(POSTed as the multipart 'skill' part the Worker accepts).", ) + parser.add_argument( + "--eval-scorecard", + type=Path, + help="Path to the eval scorecard markdown (e.g. SCORECARD.md) to " + "publish as the agent's catalog eval score and scorecard URL " + "(POSTed as the multipart 'eval_scorecard' part the Worker accepts). " + "Absent = publish without an eval scorecard.", + ) parser.add_argument( "--package-files", type=Path, @@ -341,6 +354,21 @@ def main(argv=None) -> int: flush=True, ) + eval_scorecard_bytes = None + if args.eval_scorecard is not None: + if not args.eval_scorecard.exists(): + raise SystemExit( + f"error: --eval-scorecard path not found: {args.eval_scorecard}. " + "Pass the scorecard markdown, or omit --eval-scorecard to publish " + "without one." 
+ ) + eval_scorecard_bytes = args.eval_scorecard.read_bytes() + print( + f"[publish] attaching eval scorecard: {args.eval_scorecard} " + f"({len(eval_scorecard_bytes)} bytes)", + flush=True, + ) + package_files_bytes = None if args.package_files is not None: if not args.package_files.exists(): @@ -376,6 +404,7 @@ def main(argv=None) -> int: changelog_bytes=changelog_bytes, spec_bytes=spec_bytes, skill_bytes=skill_bytes, + eval_scorecard_bytes=eval_scorecard_bytes, package_files_bytes=package_files_bytes, ) ) diff --git a/src/gaia/apps/webui/src/components/AgentDetailModal.tsx b/src/gaia/apps/webui/src/components/AgentDetailModal.tsx index 7de638328..b1a2dd954 100644 --- a/src/gaia/apps/webui/src/components/AgentDetailModal.tsx +++ b/src/gaia/apps/webui/src/components/AgentDetailModal.tsx @@ -2,7 +2,7 @@ // SPDX-License-Identifier: MIT import { useEffect, useCallback } from 'react'; -import { Wrench, Cpu, Shield, X, HardDrive, CheckCircle2, FlaskConical, AlertTriangle } from 'lucide-react'; +import { Wrench, Cpu, Shield, X, HardDrive, CheckCircle2, FlaskConical, AlertTriangle, BarChart2 } from 'lucide-react'; import { getAgentIcon } from './agentIcons'; import type { AgentInfo } from '../types'; @@ -172,6 +172,30 @@ export function AgentDetailModal({ agent, onClose, onStartChat }: AgentDetailMod )} + {/* Eval scorecard */} + {agent.eval_score != null && ( +
+
Eval scorecard
+
+ +
+
Eval score
+
+ {agent.eval_score} / 100 + {agent.eval_scorecard_url && ( + <> — View scorecard + )} +
+
+
+
+ )} + {/* Conversation starters */} {starters.length > 0 && (
diff --git a/src/gaia/apps/webui/src/types/index.ts b/src/gaia/apps/webui/src/types/index.ts index b9ba28282..4404769ea 100644 --- a/src/gaia/apps/webui/src/types/index.ts +++ b/src/gaia/apps/webui/src/types/index.ts @@ -117,6 +117,10 @@ export interface AgentInfo { avatar_url?: string; /** True when the publisher has deprecated this agent. */ deprecated?: boolean; + /** Public URL of the eval scorecard markdown; absent when none was published. */ + eval_scorecard_url?: string; + /** Aggregate eval score (0–100) from the latest published scorecard; absent when none. */ + eval_score?: number; } /** Derived card state for the Agent Hub (issue #1097). */ diff --git a/src/gaia/eval/release_scorecard.py b/src/gaia/eval/release_scorecard.py new file mode 100644 index 000000000..46111455e --- /dev/null +++ b/src/gaia/eval/release_scorecard.py @@ -0,0 +1,476 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +""" +Per-agent eval scorecard: generator, parser, validator, and versioning helpers. + +**Distinct from** ``src/gaia/eval/scorecard.py`` — that module is the per-eval-run +scenario PASS/FAIL aggregator (``build_scorecard``). This module produces the +outward-facing *release artifact*: a single ``SCORECARD.md`` file (updated in +place per release, versioned via the publish snapshot — the same way README.md +works) with YAML front matter holding measured accuracy metrics, the eval recipe, +a deterministic aggregate score, and a Reproduction section. + +Storage convention: ``/SCORECARD.md`` (NOT ``scorecards/.md``). +Per-version uniqueness comes from the publish snapshot in R2 (the hub stores every +doc per version at ``agents///SCORECARD.md``). + +Intentionally harness-agnostic: this module imports ONLY stdlib + PyYAML. +No other loader is permitted — ``yaml.safe_load`` only. + +Usage pattern:: + + payload = ResultPayload( + agent_name="email-triage", + agent_version="0.2.4", + ... 
+ ) + text = render_scorecard(payload) + write_scorecard(payload, path) +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +import yaml + +# Anchored semver regex — no prerelease/build suffixes permitted. +_SEMVER_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)$") + +# Required top-level keys in the parsed front matter. +REQUIRED_FIELDS: list[str] = [ + "schema_version", + "agent", + "recipe", + "results", + "aggregate", +] + + +@dataclass +class ResultPayload: + """Harness-agnostic result payload — the input to the scorecard generator. + + Fields: + agent_name: Human-readable agent name (e.g. "Email Triage"). + agent_version: Semver version string (e.g. "0.2.4"). + dataset_reference: Repo-relative path or URL to the dataset. + dataset_description: Short human description of the dataset. + dataset_size: Total labeled examples available in the dataset. + methodology: Short description of the eval methodology. + config: Arbitrary dict of harness config (model, limit, corpus, etc.). + test_cases_run: Number of cases actually executed this run (<= dataset_size). + metrics: List of dicts with keys ``name`` (str), ``value`` (float 0..1), + and optionally ``weight`` (float, default 1.0). + aggregate_name: Name for the aggregate score (default "weighted_accuracy"). + generated_at: ISO-8601 timestamp string; informational only. + inherited_from: If this is a patch carry-forward, the prior version string; + otherwise None. + reproduction_command: Optional exact shell command(s) to reproduce this + scorecard run. Rendered in the ``## Reproduction`` section. If None, + a generic pointer to the docs/skill is rendered instead. 
+ """ + + agent_name: str + agent_version: str + dataset_reference: str + dataset_description: str + dataset_size: int + methodology: str + config: dict + test_cases_run: int + metrics: list + aggregate_name: str = "weighted_accuracy" + generated_at: str = "" + inherited_from: Optional[str] = None + reproduction_command: Optional[str] = None + + +def compute_aggregate(metrics: list) -> tuple: + """Compute the weighted aggregate score over a list of metrics. + + Formula:: + + round(100 * sum(weight_i * value_i) / sum(weight_i), 2) + + Args: + metrics: List of dicts with ``name``, ``value`` (float in [0,1]), + and optional ``weight`` (float, default 1.0). + + Returns: + (components, value) where ``components`` is a list of dicts + ``{metric, value, weight}`` and ``value`` is the aggregate float. + + Raises: + ValueError: If metrics is empty or the total weight is zero. + """ + if not metrics: + raise ValueError("aggregate undefined: no metrics / zero total weight") + + components = [] + total_weight = 0.0 + weighted_sum = 0.0 + for m in metrics: + w = float(m.get("weight", 1.0)) + v = float(m["value"]) + components.append({"metric": m["name"], "value": v, "weight": w}) + total_weight += w + weighted_sum += w * v + + if total_weight == 0.0: + raise ValueError("aggregate undefined: no metrics / zero total weight") + + value = round(100.0 * weighted_sum / total_weight, 2) + return components, value + + +def render_scorecard(payload: ResultPayload) -> str: + """Render a scorecard as Markdown with YAML front matter. + + The front matter is machine-readable; the body is a human-readable summary + that includes the aggregate formula, a worked recomputation example, and a + Reproduction section so any reader can reproduce the result from scratch. + + Args: + payload: Populated :class:`ResultPayload`. + + Returns: + Markdown string starting with ``---`` front matter. 
+ """ + _assert_valid_version(payload.agent_version) + + components, agg_value = compute_aggregate(payload.metrics) + + # Build the YAML-serialisable front-matter dict + front: dict = { + "schema_version": 1, + "agent": { + "name": payload.agent_name, + "version": payload.agent_version, + }, + "recipe": { + "dataset": { + "reference": payload.dataset_reference, + "description": payload.dataset_description, + "size": payload.dataset_size, + }, + "methodology": payload.methodology, + "config": payload.config, + }, + "results": { + "test_cases_run": payload.test_cases_run, + "metrics": [ + { + "name": m["name"], + "value": float(m["value"]), + "weight": float(m.get("weight", 1.0)), + } + for m in payload.metrics + ], + }, + "aggregate": { + "name": payload.aggregate_name, + "formula": "round(100 * sum(weight_i * value_i) / sum(weight_i), 2)", + "components": components, + "value": agg_value, + }, + "generated_at": payload.generated_at, + "inherited_from": payload.inherited_from, + } + + fm_text = yaml.dump( + front, default_flow_style=False, sort_keys=False, allow_unicode=True + ) + + # Human-readable body with worked recompute + metric_lines = "\n".join( + f" - **{c['metric']}**: {c['value']:.4f} × {c['weight']:.1f}" + for c in components + ) + total_w = sum(c["weight"] for c in components) + worked = " + ".join(f"({c['value']:.4f} × {c['weight']:.1f})" for c in components) + + # Reproduction section + if payload.reproduction_command: + repro_body = ( + "Run the following commands from the repository root:\n\n" + f"```sh\n{payload.reproduction_command}\n```\n\n" + "See [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) " + "and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) " + "for the full setup guide." 
+ ) + else: + repro_body = ( + "See the [eval-scorecard docs](https://amd-gaia.ai/docs/reference/eval-scorecard) " + "and the [`adding-eval-scorecard` skill](.claude/skills/adding-eval-scorecard/SKILL.md) " + "for the full reproduction recipe." + ) + + body = f"""# {payload.agent_name} — Eval Scorecard v{payload.agent_version} + +**Aggregate score: {agg_value}** (out of 100) + +## Recipe + +| Field | Value | +|-------|-------| +| Dataset | [{payload.dataset_reference}]({payload.dataset_reference}) | +| Description | {payload.dataset_description} | +| Dataset size | {payload.dataset_size} labeled examples | +| Test cases run | {payload.test_cases_run} | +| Methodology | {payload.methodology} | + +## Metrics + +{metric_lines} + +## Aggregate score recomputation + +Formula: `round(100 × Σ(weightᵢ × valueᵢ) / Σ(weightᵢ), 2)` + +Worked example: + +``` +round(100 × ({worked}) / {total_w:.1f}, 2) = {agg_value} +``` + +A reader can reproduce this value from the `aggregate.components` in the front +matter alone — no eval-harness access needed. + +## Reproduction + +{repro_body} +""" + + if payload.inherited_from: + body += f"\n> **Inherited from {payload.inherited_from}** — results carried forward verbatim (patch release).\n" + + return f"---\n{fm_text}---\n{body}" + + +def write_scorecard(payload: ResultPayload, path: Path) -> None: + """Write a rendered scorecard to ``path``. + + Args: + payload: Populated :class:`ResultPayload`. + path: Destination file path. Parent directory must exist. + """ + path = Path(path) + path.write_text(render_scorecard(payload), encoding="utf-8") + + +def parse_scorecard(source) -> dict: + """Parse the YAML front matter from a scorecard file or string. + + Extracts the first ``---`` … ``---`` block and runs ``yaml.safe_load`` + on it only — a bare ``---`` rule in the Markdown body is never parsed. + + Args: + source: A :class:`pathlib.Path` (file to read) or a ``str`` (raw text). + + Returns: + Parsed front-matter dict. 
def validate_scorecard(parsed: dict) -> list:
    """Validate a parsed scorecard front-matter dict.

    Only the presence of required keys and the container type of each section
    are checked; individual values are not type- or range-checked here.

    Args:
        parsed: Dict returned by :func:`parse_scorecard`.

    Returns:
        List of error strings. Empty list means the scorecard is valid.
    """
    # The front matter is arbitrary YAML, so it can load as a list or a scalar
    # (e.g. "- a"). Report that as a validation error instead of crashing on
    # the mapping lookups below.
    if not isinstance(parsed, dict):
        return [
            f"Scorecard front matter must be a mapping, got {type(parsed).__name__}"
        ]

    errors: list[str] = []

    def _require(mapping: dict, prefix: str, keys) -> None:
        """Record one error for every key in ``keys`` missing from ``mapping``."""
        for key in keys:
            if key not in mapping:
                errors.append(f"Missing required field: '{prefix}{key}'")

    def _section(name: str):
        """Return the named section if it is a mapping, else None.

        A present-but-wrong-type section is recorded as an error. An absent
        section is not reported here; the top-level required-key pass covers it.
        """
        if name not in parsed:
            return None
        value = parsed[name]
        if not isinstance(value, dict):
            errors.append(
                f"Field '{name}' must be a mapping, got {type(value).__name__}"
            )
            return None
        return value

    # Top-level required keys
    _require(parsed, "", REQUIRED_FIELDS)

    # agent.{name, version}
    agent = _section("agent")
    if agent is not None:
        _require(agent, "agent.", ("name", "version"))

    # recipe.{dataset.{reference, size}, methodology, config}
    recipe = _section("recipe")
    if recipe is not None:
        _require(recipe, "recipe.", ("methodology", "config"))
        if "dataset" not in recipe:
            errors.append("Missing required field: 'recipe.dataset'")
        elif not isinstance(recipe["dataset"], dict):
            errors.append(
                "Field 'recipe.dataset' must be a mapping, "
                f"got {type(recipe['dataset']).__name__}"
            )
        else:
            _require(recipe["dataset"], "recipe.dataset.", ("reference", "size"))

    # results.{test_cases_run, metrics}
    results = _section("results")
    if results is not None:
        _require(results, "results.", ("test_cases_run",))
        if "metrics" not in results:
            errors.append("Missing required field: 'results.metrics'")
        else:
            metrics = results["metrics"]
            if not isinstance(metrics, list) or not metrics:
                errors.append("Field 'results.metrics' must be a non-empty list")
            else:
                for i, metric in enumerate(metrics):
                    if not isinstance(metric, dict):
                        errors.append(
                            f"Field 'results.metrics[{i}]' must be a mapping"
                        )
                        continue
                    _require(metric, f"results.metrics[{i}].", ("name", "value"))

    # aggregate.{name, formula, value}
    aggregate = _section("aggregate")
    if aggregate is not None:
        _require(aggregate, "aggregate.", ("name", "formula", "value"))

    return errors


def _semver_tuple(v: str) -> tuple:
    """Parse a semver string to an int tuple, or raise ValueError."""
    m = _SEMVER_RE.match(v)
    if m is None:
        raise ValueError(f"Not a valid semver string: {v!r}")
    return tuple(int(m.group(i)) for i in (1, 2, 3))


def _assert_valid_version(version: str) -> None:
    """Raise ValueError if version does not match the anchored semver regex."""
    if _SEMVER_RE.match(version) is None:
        raise ValueError(
            f"Version {version!r} does not match semver pattern X.Y.Z — "
            "prerelease and build-metadata suffixes are not permitted."
        )


def carry_forward(prev_scorecard_path: Path, new_version: str) -> "ResultPayload":
    """Carry forward a prior SCORECARD.md's results to a new patch version.

    Reads the single ``SCORECARD.md`` (the agent's one scorecard file, updated
    in place per release), copies all results verbatim, and sets
    ``inherited_from`` to the prior version string recorded in the front matter.

    Only patch bumps are allowed: if the prior scorecard's ``agent.version``
    differs in major or minor from ``new_version``, the caller must re-run the
    eval to generate fresh results.

    Args:
        prev_scorecard_path: Path to the prior ``SCORECARD.md`` file.
        new_version: The new version string (must be a patch bump of the prior).

    Returns:
        A :class:`ResultPayload` with results copied and ``inherited_from`` set.

    Raises:
        ValueError: If ``new_version`` is not a patch-only bump of the prior version
            (i.e. if major or minor differs). The error message contains "re-run"
            to inform the caller that a fresh eval is required.
        ValueError: If the prior scorecard cannot be parsed, or fails
            :func:`validate_scorecard`.
    """
    import datetime

    _assert_valid_version(new_version)
    prev_scorecard_path = Path(prev_scorecard_path)

    parsed = parse_scorecard(prev_scorecard_path)

    # Refuse to inherit from a structurally broken card. Without this check the
    # lookups below would either crash with AttributeError on a non-mapping
    # section or quietly substitute 0 / "" / [] for results that were never
    # measured.
    problems = validate_scorecard(parsed)
    if problems:
        raise ValueError(
            f"Cannot carry forward from {prev_scorecard_path}: "
            "prior scorecard is invalid:\n" + "\n".join(f" - {p}" for p in problems)
        )

    # Extract prior version from front matter (agent.version)
    agent = parsed["agent"]
    prev_version = str(agent.get("version", ""))
    if not prev_version:
        raise ValueError(
            f"Cannot read prior version from {prev_scorecard_path}: "
            "missing 'agent.version' field in front matter."
        )

    prev_major, prev_minor, _ = _semver_tuple(prev_version)
    new_major, new_minor, _ = _semver_tuple(new_version)

    # Only patch bumps are allowed for carry-forward.
    if (prev_major, prev_minor) != (new_major, new_minor):
        raise ValueError(
            f"Cannot carry forward from {prev_version} to {new_version}: "
            f"major or minor version changed. Please re-run the eval to "
            f"generate fresh results for this release."
        )

    # Extract fields from the parsed front matter
    recipe = parsed["recipe"]
    dataset = recipe["dataset"]
    results = parsed["results"]

    return ResultPayload(
        agent_name=agent.get("name", ""),
        agent_version=new_version,
        dataset_reference=dataset.get("reference", ""),
        dataset_description=dataset.get("description", ""),
        dataset_size=dataset.get("size", 0),
        methodology=recipe.get("methodology", ""),
        config=recipe.get("config", {}),
        test_cases_run=results.get("test_cases_run", 0),
        metrics=results.get("metrics", []),
        aggregate_name=parsed["aggregate"].get("name", "weighted_accuracy"),
        generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
        inherited_from=prev_version,
    )
+# SPDX-License-Identifier: MIT +""" +Standalone release gate: blocks packaging when the candidate SCORECARD.md is +missing, invalid, or when its aggregate score strictly regressed below the prior +version's. + +**Distinct from** ``src/gaia/eval/scorecard.py`` — that module aggregates per-run +scenario PASS/FAIL for internal CI. This gate checks the *outward-facing* release +artifact produced by ``release_scorecard.py``. + +Storage convention: one ``SCORECARD.md`` per agent package (updated in place, +versioned via the publish snapshot — the same way README.md works). + +Usage:: + + # Presence-only (first adoption): + python -m gaia.eval.scorecard_gate \\ + --scorecard hub/agents/npm/agent-email/SCORECARD.md + + # With a baseline from a file (unit tests): + python -m gaia.eval.scorecard_gate \\ + --scorecard hub/agents/npm/agent-email/SCORECARD.md \\ + --baseline-file /tmp/prev-SCORECARD.md + + # With a baseline resolved from a git ref (CI): + python -m gaia.eval.scorecard_gate \\ + --scorecard hub/agents/npm/agent-email/SCORECARD.md \\ + --baseline-ref agent-pkg-email-v0.2.3 + +Exit codes: + 0 — Passed (presence-only first adoption, equal score, or score improved). + 1 — Failed (missing/invalid candidate, strict regression, invalid baseline). + +The ``--allow-regression`` flag overrides a regression: prints a ``::warning::`` +GHA annotation and both version/score pairs, then exits 0. +""" + +from __future__ import annotations + +import argparse +import subprocess +import sys +from pathlib import Path + +from gaia.eval.release_scorecard import ( + parse_scorecard, + validate_scorecard, +) + + +def _parse_baseline_ref(scorecard_path: Path, ref: str) -> str | None: + """Resolve ``:`` via ``git show`` and return the content. + + The path used in the git command is the path of ``scorecard_path`` relative + to the repository root (discovered by ``git rev-parse --show-toplevel``). 
+ + Returns the file content as a string, or None if the file does not exist at + that ref (treated as first adoption — presence-only pass). + + Raises: + ValueError: If ``git`` cannot be called or the ref is otherwise invalid + (the caller treats this as an actionable error, not first adoption). + """ + # Discover repo root so we can form a root-relative path for git show. + try: + result = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + capture_output=True, + text=True, + check=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError) as exc: + raise ValueError( + f"Cannot determine git repository root: {exc}. " + "Run from inside a git repository, or use --baseline-file instead." + ) from exc + + repo_root = Path(result.stdout.strip()) + scorecard_path = Path(scorecard_path).resolve() + try: + rel = scorecard_path.relative_to(repo_root) + except ValueError: + raise ValueError( + f"SCORECARD path {scorecard_path} is not inside the git repo root " + f"{repo_root}. Use an absolute path inside the repo, or use " + "--baseline-file instead." + ) + + git_path = rel.as_posix() + try: + result = subprocess.run( # noqa: S603 (git is trusted here) + ["git", "show", f"{ref}:{git_path}"], + capture_output=True, + text=True, + check=False, + ) + except FileNotFoundError as exc: + raise ValueError(f"git not found: {exc}") from exc + + if result.returncode != 0: + # File absent at that ref → first adoption (presence-only pass). + return None + + return result.stdout + + +def main(argv=None) -> int: + """Run the scorecard gate. + + Args: + argv: Argument list (``sys.argv[1:]`` if None). + + Returns: + 0 on pass, 1 on failure. + """ + parser = argparse.ArgumentParser( + description=( + "Release gate: ensures a valid SCORECARD.md exists and that its " + "aggregate score has not strictly regressed vs the prior version." 
+ ), + prog="python -m gaia.eval.scorecard_gate", + ) + parser.add_argument( + "--scorecard", + required=True, + help="Path to the candidate SCORECARD.md (e.g. hub/agents/npm/agent-email/SCORECARD.md).", + ) + baseline_group = parser.add_mutually_exclusive_group() + baseline_group.add_argument( + "--baseline-file", + help=( + "Path to the prior version's SCORECARD.md for regression comparison " + "(for unit tests; no git access needed)." + ), + ) + baseline_group.add_argument( + "--baseline-ref", + help=( + "Git ref (tag or commit) of the prior release to use as baseline. " + "Resolves via 'git show :'. If the file does not " + "exist at that ref, a presence-only pass is applied (first adoption)." + ), + ) + parser.add_argument( + "--allow-regression", + action="store_true", + default=False, + help=( + "Override a regression: prints a GHA ::warning:: annotation and both " + "version/score pairs, then exits 0. Use only when a regression is intentional." + ), + ) + + try: + args = parser.parse_args(argv) + except SystemExit: + return 1 + + candidate_path = Path(args.scorecard) + + # --- Step 1: Presence check --- + if not candidate_path.exists(): + print( + f"ERROR: SCORECARD.md missing at {candidate_path}.\n" + f" Run 'python gen_scorecard.py' (or 'carry_forward') to generate it, " + f"then commit the file before releasing.\n" + f" See https://amd-gaia.ai/docs/reference/eval-scorecard and " + f".claude/skills/adding-eval-scorecard/SKILL.md" + ) + return 1 + + try: + candidate_parsed = parse_scorecard(candidate_path) + except ValueError as exc: + print(f"ERROR: Cannot parse candidate SCORECARD.md at {candidate_path}: {exc}") + return 1 + + errors = validate_scorecard(candidate_parsed) + if errors: + print( + f"ERROR: Candidate SCORECARD.md at {candidate_path} is invalid:\n" + + "\n".join(f" - {e}" for e in errors) + ) + return 1 + + # --- Step 2: Resolve baseline --- + baseline_text: str | None = None + + if args.baseline_file: + baseline_path = 
Path(args.baseline_file) + if not baseline_path.exists(): + print( + f"ERROR: --baseline-file not found: {baseline_path}.\n" + f" Provide a valid path to a prior SCORECARD.md, or omit --baseline-file " + f"for a presence-only pass." + ) + return 1 + try: + baseline_text = baseline_path.read_text(encoding="utf-8") + except OSError as exc: + print(f"ERROR: Cannot read --baseline-file {baseline_path}: {exc}") + return 1 + + elif args.baseline_ref: + try: + baseline_text = _parse_baseline_ref(candidate_path, args.baseline_ref) + except ValueError as exc: + print(f"ERROR: {exc}") + return 1 + # None means the file doesn't exist at that ref → first adoption + if baseline_text is None: + print( + f"PASS: No SCORECARD.md found at ref '{args.baseline_ref}'. " + f"First adoption — presence check only." + ) + return 0 + + if baseline_text is None: + # No baseline specified at all → presence-only pass. + candidate_version = candidate_parsed.get("agent", {}).get("version", "?") + candidate_score = candidate_parsed.get("aggregate", {}).get("value") + if candidate_score is None: + print( + f"ERROR: Candidate SCORECARD.md at {candidate_path} has no " + f"'aggregate.value' field.\n" + f" Fix the scorecard front matter before releasing." + ) + return 1 + print( + f"PASS: No baseline provided. Presence check only.\n" + f" Candidate v{candidate_version}: aggregate.value = {candidate_score}" + ) + return 0 + + # --- Step 3: Parse baseline and regression check --- + try: + prev_parsed = parse_scorecard(baseline_text) + except ValueError as exc: + print( + f"ERROR: Cannot parse baseline SCORECARD.md: {exc}\n" + f" The baseline is corrupt or missing a valid front matter. " + f"Fix it before releasing." + ) + return 1 + + prev_errors = validate_scorecard(prev_parsed) + if prev_errors: + print( + "ERROR: Baseline SCORECARD.md is invalid:\n" + + "\n".join(f" - {e}" for e in prev_errors) + + "\n Fix the baseline scorecard before releasing." 
+ ) + return 1 + + candidate_score = candidate_parsed.get("aggregate", {}).get("value") + prev_score = prev_parsed.get("aggregate", {}).get("value") + + if candidate_score is None: + print( + f"ERROR: Candidate SCORECARD.md at {candidate_path} has no " + "'aggregate.value' field.\n" + " Fix the scorecard front matter before releasing." + ) + return 1 + + if prev_score is None: + print( + "ERROR: Baseline SCORECARD.md has no 'aggregate.value' field.\n" + " Fix the baseline scorecard before releasing." + ) + return 1 + + candidate_version = candidate_parsed.get("agent", {}).get("version", "?") + prev_version = prev_parsed.get("agent", {}).get("version", "?") + + if float(candidate_score) < float(prev_score): + # Strict regression detected + if args.allow_regression: + print( + f"::warning::Scorecard regression allowed by --allow-regression: " + f"v{prev_version}={prev_score} → v{candidate_version}={candidate_score}" + ) + print( + f"WARNING: Regression override active. " + f"Prior version v{prev_version} scored {prev_score}; " + f"candidate v{candidate_version} scored {candidate_score}. " + f"This regression has been explicitly acknowledged." + ) + return 0 + print( + f"ERROR: Scorecard regression detected.\n" + f" Prior version v{prev_version}: aggregate.value = {prev_score}\n" + f" Candidate v{candidate_version}: aggregate.value = {candidate_score}\n" + f" The candidate score is strictly lower than the prior. " + f"Investigate the regression or use --allow-regression to override intentionally." 
+ ) + return 1 + + print( + f"PASS: Scorecard gate passed.\n" + f" Candidate v{candidate_version}: aggregate.value = {candidate_score} " + f"(prior v{prev_version}: {prev_score})" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/gaia/hub/catalog.py b/src/gaia/hub/catalog.py index b78337895..461e34b0f 100644 --- a/src/gaia/hub/catalog.py +++ b/src/gaia/hub/catalog.py @@ -389,7 +389,7 @@ def merge_with_registry( language = entry.get("language", "python") security_tier = entry.get("security_tier", "experimental") - by_id[agent_id] = { + merged: Dict[str, Any] = { "id": agent_id, "name": entry.get("name", agent_id), "description": entry.get("description", ""), @@ -407,6 +407,13 @@ def merge_with_registry( "status": status, "source": (reg.source if reg is not None else "hub"), } + # Optional eval scorecard fields — absent from older catalog entries and + # from builtin/custom agents that haven't run a benchmark yet. + if "eval_score" in entry: + merged["eval_score"] = entry["eval_score"] + if "eval_scorecard_url" in entry: + merged["eval_scorecard_url"] = entry["eval_scorecard_url"] + by_id[agent_id] = merged # 2. Registry-only agents (builtins / custom not published to the hub). 
FIXTURE_DIR = Path(__file__).parents[2] / "fixtures" / "eval"
EMAIL_BENCHMARK_FIXTURE = FIXTURE_DIR / "email_benchmark_scorecard.json"


def _make_payload(version="1.0.0", accuracy=0.5):
    """Build a minimal single-metric ResultPayload for the tests below."""
    metric_rows = [{"name": "category_accuracy", "value": accuracy, "weight": 1.0}]
    # The result is not needed here; the call is kept so the helper still runs
    # compute_aggregate over the metrics exactly as it always has.
    compute_aggregate(metric_rows)
    stamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    return ResultPayload(
        agent_name="test-agent",
        agent_version=version,
        dataset_reference="test/fixture",
        dataset_description="test dataset",
        dataset_size=100,
        methodology="unit test",
        config={"model": "test"},
        test_cases_run=10,
        metrics=metric_rows,
        aggregate_name="weighted_accuracy",
        generated_at=stamp,
        inherited_from=None,
    )


# ---------------------------------------------------------------------------
# 1. Schema / validator round-trip
# ---------------------------------------------------------------------------


class TestSchemaValidator:
    """validate_scorecard accepts rendered cards and flags each missing field."""

    @staticmethod
    def _parsed_default():
        """Render the default payload and parse its front matter back."""
        return parse_scorecard(render_scorecard(_make_payload()))

    def test_valid_payload_passes_validation(self):
        errors = validate_scorecard(self._parsed_default())
        assert errors == [], f"Expected no errors, got: {errors}"

    def test_missing_required_fields_each_flagged(self):
        parsed = self._parsed_default()

        # Each required top-level field, when removed, should produce a non-empty error list.
        for field in REQUIRED_FIELDS:
            without_field = dict(parsed)
            without_field.pop(field, None)
            errors = validate_scorecard(without_field)
            assert errors, (
                f"Expected validate_scorecard to flag missing '{field}' "
                f"but got empty error list"
            )

    def test_required_top_level_keys_include_expected_sections(self):
        # schema_version, agent, recipe, results, aggregate must be required
        expected = ("schema_version", "agent", "recipe", "results", "aggregate")
        for section in expected:
            assert section in REQUIRED_FIELDS, f"'{section}' must be in REQUIRED_FIELDS"

    def test_missing_nested_aggregate_value_flagged(self):
        parsed = self._parsed_default()
        # Complete card stays valid
        assert validate_scorecard(parsed) == []
        # Removing a nested required field flags it
        parsed["aggregate"].pop("value")
        errors = validate_scorecard(parsed)
        assert errors, "Expected missing 'aggregate.value' to be flagged"
        assert any("aggregate.value" in e for e in errors), errors

    def test_missing_nested_agent_version_flagged(self):
        parsed = self._parsed_default()
        parsed["agent"].pop("version")
        errors = validate_scorecard(parsed)
        assert errors, "Expected missing 'agent.version' to be flagged"
        assert any("agent.version" in e for e in errors), errors

    def test_missing_nested_dataset_size_flagged(self):
        parsed = self._parsed_default()
        parsed["recipe"]["dataset"].pop("size")
        errors = validate_scorecard(parsed)
        assert any("recipe.dataset.size" in e for e in errors), errors

    def test_empty_metrics_list_flagged(self):
        parsed = self._parsed_default()
        parsed["results"]["metrics"] = []
        errors = validate_scorecard(parsed)
        assert any("metrics" in e for e in errors), errors

    def test_non_dict_section_flagged_not_crash(self):
        parsed = self._parsed_default()
        parsed["agent"] = "not-a-dict"
        errors = validate_scorecard(parsed)
        assert errors, "Expected a non-dict 'agent' section to be flagged"
# ---------------------------------------------------------------------------
# 2. Aggregate computation
# ---------------------------------------------------------------------------


class TestComputeAggregate:
    """compute_aggregate returns (components, value) with value on a 0-100 scale."""

    def test_single_metric(self):
        _, value = compute_aggregate([{"name": "acc", "value": 0.5, "weight": 1.0}])
        assert value == 50.0

    def test_multiple_metrics_weighted(self):
        metrics = [
            {"name": "a", "value": 0.4167, "weight": 1.0},
            {"name": "b", "value": 0.5, "weight": 2.0},
        ]
        _, value = compute_aggregate(metrics)
        expected = round(100 * (0.4167 + 2 * 0.5) / (1 + 2), 2)
        assert value == expected

    def test_empty_metrics_raises(self):
        with pytest.raises(ValueError):
            compute_aggregate([])

    def test_zero_weight_raises(self):
        with pytest.raises(ValueError):
            compute_aggregate([{"name": "x", "value": 0.5, "weight": 0.0}])

    def test_recompute_from_components_matches_aggregate_value(self):
        metrics = [
            {"name": "cat_acc", "value": 0.4167, "weight": 1.0},
            {"name": "send_acc", "value": 0.75, "weight": 2.0},
        ]
        # The returned components alone must be enough to reproduce the
        # published aggregate, so a reader can audit the number by hand.
        components, agg_value = compute_aggregate(metrics)
        weighted_sum = sum(c["weight"] * c["value"] for c in components)
        total_weight = sum(c["weight"] for c in components)
        recomputed = round(100 * weighted_sum / total_weight, 2)
        assert recomputed == agg_value
# ---------------------------------------------------------------------------
# 3. Generator round-trip
# ---------------------------------------------------------------------------


class TestGeneratorRoundTrip:
    """render_scorecard emits front matter plus a body that parses back cleanly."""

    def test_rendered_text_starts_with_dashes(self):
        first_line = render_scorecard(_make_payload()).splitlines()[0]
        assert first_line == "---", f"First line must be '---', got: {first_line!r}"

    def test_rendered_text_contains_closing_dashes(self):
        rendered_lines = render_scorecard(_make_payload()).splitlines()
        # Any '---' line after the opening one counts as the closing delimiter.
        has_closing = any(line == "---" for line in rendered_lines[1:])
        assert (
            has_closing
        ), "Rendered scorecard must contain a closing '---' after the first"

    def test_body_after_front_matter_is_non_empty(self):
        rendered_lines = render_scorecard(_make_payload()).splitlines()
        delimiter_rows = [
            row for row, line in enumerate(rendered_lines) if line == "---"
        ]
        assert len(delimiter_rows) >= 2, "Need at least two '---' lines"
        body = "\n".join(rendered_lines[delimiter_rows[1] + 1 :])
        assert body.strip(), "Body after front matter must be non-empty"

    def test_parse_recovers_all_required_fields(self):
        parsed = parse_scorecard(render_scorecard(_make_payload()))
        assert validate_scorecard(parsed) == []

    def test_body_contains_reproduction_section(self):
        assert "## Reproduction" in render_scorecard(_make_payload())

    def test_reproduction_section_includes_custom_command(self):
        payload = _make_payload()
        payload.reproduction_command = "gaia eval benchmark --limit 25"
        assert "gaia eval benchmark --limit 25" in render_scorecard(payload)

    def test_reproduction_section_generic_when_no_command(self):
        # No reproduction_command (default None)
        text = render_scorecard(_make_payload())
        assert "## Reproduction" in text
        assert "eval-scorecard" in text
# ---------------------------------------------------------------------------
# 4. Two counts distinct as separate fields
# ---------------------------------------------------------------------------


class TestDistinctCountFields:
    """results.test_cases_run and recipe.dataset.size are separate fields."""

    def test_test_cases_run_and_dataset_size_both_present(self):
        parsed = parse_scorecard(render_scorecard(_make_payload()))
        assert "results" in parsed, "'results' section missing from parsed scorecard"
        results = parsed["results"]
        assert (
            "test_cases_run" in results
        ), "'results.test_cases_run' must be a distinct field"
        assert "recipe" in parsed, "'recipe' section missing from parsed scorecard"
        recipe = parsed["recipe"]
        assert "dataset" in recipe, "'recipe.dataset' sub-section missing"
        assert (
            "size" in recipe["dataset"]
        ), "'recipe.dataset.size' must be a distinct field"


# ---------------------------------------------------------------------------
# 5. Loose coupling — no harness/agent modules imported
# ---------------------------------------------------------------------------


class TestLooseCoupling:
    """Importing release_scorecard must not drag in the harness or an agent."""

    def test_no_benchmark_or_agent_modules_imported(self):
        # The check runs in a fresh interpreter and snapshots sys.modules
        # BEFORE the import, so only modules added by the import itself are
        # inspected — not pytest plugins or editable-install path finders
        # registered at interpreter startup.
        import subprocess
        import sys as _sys

        probe = (
            "import sys; "
            "before=set(sys.modules); "
            "import gaia.eval.release_scorecard; "
            "added=set(sys.modules)-before; "
            "bad=[m for m in added if 'gaia.eval.benchmark' in m "
            "or 'gaia.eval.quality_metrics' in m or 'gaia_agent_email' in m]; "
            "assert not bad, bad"
        )
        completed = subprocess.run(
            [_sys.executable, "-c", probe], capture_output=True, text=True
        )
        assert completed.returncode == 0, completed.stderr


# ---------------------------------------------------------------------------
# 6. Markdown structure (duplicate guard on render)
# ---------------------------------------------------------------------------


class TestMarkdownStructure:
    """Coarse string-level checks on the rendered Markdown layout."""

    def test_first_line_is_dashes(self):
        opening = render_scorecard(_make_payload()).splitlines()[0]
        assert opening == "---"

    def test_contains_closing_dashes(self):
        rendered = render_scorecard(_make_payload())
        occurrences = rendered.count("\n---")
        assert occurrences >= 1, "Must contain at least one closing '---' line"

    def test_body_non_empty(self):
        rendered = render_scorecard(_make_payload())
        # Splitting at most twice yields: '' (before the opener), the YAML,
        # and everything after the closing delimiter.
        pieces = rendered.split("---", 2)
        body = pieces[2] if len(pieces) > 2 else ""
        assert body.strip(), "Markdown body after front matter must not be empty"
# ---------------------------------------------------------------------------
# 7. Versioning — patch carry-forward (SCORECARD.md is a single file)
# ---------------------------------------------------------------------------


class TestCarryForwardPatch:
    """A patch bump inherits the prior card's results."""

    @staticmethod
    def _write_prior(path, version="0.2.3", accuracy=0.75):
        """Render a prior scorecard to ``path`` and return its source payload.

        Written as UTF-8 explicitly because parse_scorecard reads UTF-8; the
        locale default would make the round-trip platform-dependent.
        """
        src = _make_payload(version=version, accuracy=accuracy)
        path.write_text(render_scorecard(src), encoding="utf-8")
        return src

    def test_carry_forward_sets_inherited_from(self, tmp_path):
        card_path = tmp_path / "SCORECARD.md"
        self._write_prior(card_path)

        result = carry_forward(card_path, "0.2.4")
        assert result.inherited_from == "0.2.3"

    def test_carry_forward_copies_metrics_verbatim(self, tmp_path):
        card_path = tmp_path / "SCORECARD.md"
        src = self._write_prior(card_path)

        result = carry_forward(card_path, "0.2.4")
        assert result.metrics == src.metrics

    def test_carry_forward_reads_version_from_front_matter(self, tmp_path):
        # carry_forward reads agent.version from the front matter, NOT the
        # filename. The file stem is a different, valid version on purpose: if
        # the stem were used, inherited_from would come back as "1.9.9".
        card_path = tmp_path / "1.9.9.md"
        self._write_prior(card_path, version="0.2.3")

        result = carry_forward(card_path, "0.2.4")
        assert result.agent_version == "0.2.4"
        assert result.inherited_from == "0.2.3"
# ---------------------------------------------------------------------------
# 8. Versioning — minor bump refuses
# ---------------------------------------------------------------------------


class TestCarryForwardMinorBumpRefuses:
    """Anything other than a patch bump must demand a fresh eval run."""

    @staticmethod
    def _prior_card(tmp_path):
        """Write a 0.2.3 scorecard (UTF-8, matching parse_scorecard) and return its path."""
        card_path = tmp_path / "SCORECARD.md"
        card_path.write_text(
            render_scorecard(_make_payload(version="0.2.3", accuracy=0.75)),
            encoding="utf-8",
        )
        return card_path

    def test_minor_bump_raises_value_error(self, tmp_path):
        card_path = self._prior_card(tmp_path)

        with pytest.raises(ValueError, match="re-run"):
            carry_forward(card_path, "0.3.0")

    def test_major_bump_raises_value_error(self, tmp_path):
        card_path = self._prior_card(tmp_path)

        with pytest.raises(ValueError, match="re-run"):
            carry_forward(card_path, "1.0.0")


# ---------------------------------------------------------------------------
# 9. Non-carry-forward card has inherited_from=None
# ---------------------------------------------------------------------------


class TestInheritedFromNone:
    """A freshly generated card does not claim to inherit from any version."""

    def test_fresh_payload_has_null_inherited_from(self):
        assert _make_payload().inherited_from is None

    def test_rendered_parsed_inherited_from_null_or_absent(self):
        parsed = parse_scorecard(render_scorecard(_make_payload()))
        # Either key absent or value is None/null
        assert parsed.get("inherited_from", None) is None
# ---------------------------------------------------------------------------
# 10. Gate integration: second-agent generalization (no fabricated artifacts)
# ---------------------------------------------------------------------------


class TestSecondAgentGeneralization:
    """Prove the generator + gate work for an agent OTHER than email-triage."""

    def test_second_agent_scorecard_validates_and_gate_passes(self, tmp_path):
        from gaia.eval.release_scorecard import write_scorecard
        from gaia.eval.scorecard_gate import main as gate_main

        # Build a ResultPayload for a different agent
        payload = ResultPayload(
            agent_name="Hello World Agent",
            agent_version="0.1.0",
            dataset_reference="tests/fixtures/hello/ground_truth.json",
            dataset_description="Hello world evaluation dataset",
            dataset_size=50,
            methodology="exact match accuracy",
            config={"model": "Gemma-4-E4B-it-GGUF", "limit": 20},
            test_cases_run=20,
            metrics=[{"name": "accuracy", "value": 0.75, "weight": 1.0}],
            aggregate_name="weighted_accuracy",
            generated_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
            inherited_from=None,
            reproduction_command="gaia eval agent --category hello",
        )

        scorecard_path = tmp_path / "SCORECARD.md"
        write_scorecard(payload, scorecard_path)

        # Validate the written scorecard (read as UTF-8, like parse_scorecard does)
        parsed = parse_scorecard(scorecard_path.read_text(encoding="utf-8"))
        errors = validate_scorecard(parsed)
        assert errors == [], f"Second-agent scorecard should be valid, got: {errors}"

        # Gate should pass (no baseline → presence-only)
        result = gate_main(["--scorecard", str(scorecard_path)])
        assert result == 0, "Gate should pass for a valid second-agent SCORECARD.md"


# ---------------------------------------------------------------------------
# Adapter tests: TestEmailAdapter
# ---------------------------------------------------------------------------


class TestEmailAdapter:
    """Tests for hub/agents/python/email/packaging/gen_scorecard.py adapter."""

    def _load_gen_scorecard(self):
        """Import the adapter script by file path and return the module."""
        adapter_path = (
            Path(__file__).parents[3]
            / "hub"
            / "agents"
            / "python"
            / "email"
            / "packaging"
            / "gen_scorecard.py"
        )
        spec = importlib.util.spec_from_file_location("gen_scorecard", adapter_path)
        # spec_from_file_location can return None; fail with a readable message
        # instead of an AttributeError further down.
        assert spec is not None and spec.loader is not None, (
            f"Cannot build an import spec for {adapter_path}"
        )
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
        return mod

    @staticmethod
    def _write_ground_truth(tmp_path, ground_truth):
        """Dump ``ground_truth`` to tmp_path/ground_truth.json and return the path."""
        gt_path = tmp_path / "ground_truth.json"
        gt_path.write_text(json.dumps(ground_truth), encoding="utf-8")
        return gt_path

    def _stage_fixture_inputs(self, tmp_path):
        """Copy the benchmark fixture into a benchmark dir and write ground truth.

        The ground truth has 3 keys: 2 labeled emails plus ``_meta``.

        Returns:
            Tuple of (benchmark_dir, ground_truth_path).
        """
        benchmark_dir = tmp_path / "benchmark"
        benchmark_dir.mkdir()
        (benchmark_dir / "email_benchmark_scorecard.json").write_text(
            EMAIL_BENCHMARK_FIXTURE.read_text(encoding="utf-8"), encoding="utf-8"
        )
        gt_path = self._write_ground_truth(
            tmp_path,
            {
                "_meta": {"count": 3},
                "email1": {"label": "spam"},
                "email2": {"label": "promo"},
            },
        )
        return benchmark_dir, gt_path

    def test_build_payload_mean_of_judged_scenarios(self, tmp_path):
        mod = self._load_gen_scorecard()
        benchmark_dir, gt_path = self._stage_fixture_inputs(tmp_path)

        payload = mod.build_payload(benchmark_dir, gt_path)

        expected_mean = round((0.4167 + 0.5000) / 2, 10)
        assert payload.metrics[0]["value"] == pytest.approx(
            expected_mean
        ), f"Expected metric value {expected_mean}, got {payload.metrics[0]['value']}"

    def test_build_payload_test_cases_run(self, tmp_path):
        mod = self._load_gen_scorecard()
        benchmark_dir, gt_path = self._stage_fixture_inputs(tmp_path)

        payload = mod.build_payload(benchmark_dir, gt_path)
        # 12 + 12 = 24; third scenario skipped (no quality key)
        assert payload.test_cases_run == 24

    def test_build_payload_dataset_size(self, tmp_path):
        mod = self._load_gen_scorecard()
        benchmark_dir, gt_path = self._stage_fixture_inputs(tmp_path)

        payload = mod.build_payload(benchmark_dir, gt_path)
        # 3 keys - 1 _meta = 2
        assert payload.dataset_size == 2

    def test_all_no_quality_raises(self, tmp_path):
        mod = self._load_gen_scorecard()

        benchmark_dir = tmp_path / "benchmark"
        benchmark_dir.mkdir()
        # Scorecard where no scenario has quality
        unjudged_scenario = {
            "category": "Gemma-4-E4B-it-GGUF",
            "status": "PASS",
            "total_emails": 0,
        }
        empty_scorecard = {
            "run_id": "no-quality",
            "scenarios": [dict(unjudged_scenario), dict(unjudged_scenario)],
        }
        (benchmark_dir / "email_benchmark_scorecard.json").write_text(
            json.dumps(empty_scorecard), encoding="utf-8"
        )
        gt_path = self._write_ground_truth(
            tmp_path, {"_meta": {"count": 1}, "email1": {"label": "spam"}}
        )

        with pytest.raises(ValueError):
            mod.build_payload(benchmark_dir, gt_path)

    def test_build_payload_includes_reproduction_command(self, tmp_path):
        mod = self._load_gen_scorecard()
        benchmark_dir, gt_path = self._stage_fixture_inputs(tmp_path)

        payload = mod.build_payload(benchmark_dir, gt_path, limit=25)
        command = payload.reproduction_command
        assert command is not None
        assert "gaia eval benchmark" in command
        assert "gen_scorecard.py" in command
        assert "PYTHON_KEYRING_BACKEND" in command
--------------------------------------------------------------------------- +# Case (a) — missing card → exit 1 +# --------------------------------------------------------------------------- + + +class TestMissingCard: + def test_missing_card_returns_1(self, tmp_path): + scorecard = tmp_path / "SCORECARD.md" + result = main(["--scorecard", str(scorecard)]) + assert result == 1 + + +# --------------------------------------------------------------------------- +# Case (b) — strict regression with --baseline-file → exit 1 +# --------------------------------------------------------------------------- + + +class TestStrictRegression: + def test_regression_returns_1(self, tmp_path): + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.8) + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5) + + result = main(["--scorecard", str(candidate), "--baseline-file", str(baseline)]) + assert result == 1 + + +# --------------------------------------------------------------------------- +# Case (c) — no baseline → presence-only pass → exit 0 +# --------------------------------------------------------------------------- + + +class TestNoPrior: + def test_first_adoption_returns_0(self, tmp_path): + candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6) + result = main(["--scorecard", str(candidate)]) + assert result == 0 + + +# --------------------------------------------------------------------------- +# Case (d) — equal score (carry-forward) with --baseline-file → exit 0 +# --------------------------------------------------------------------------- + + +class TestEqualScore: + def test_equal_score_returns_0(self, tmp_path): + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.5) + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = 
_write_card(candidate_dir, "0.2.4", accuracy=0.5) + + result = main(["--scorecard", str(candidate), "--baseline-file", str(baseline)]) + assert result == 0 + + +# --------------------------------------------------------------------------- +# Case (e) — improved score → exit 0 +# --------------------------------------------------------------------------- + + +class TestImprovedScore: + def test_improved_score_returns_0(self, tmp_path): + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.5) + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.8) + + result = main(["--scorecard", str(candidate), "--baseline-file", str(baseline)]) + assert result == 0 + + +# --------------------------------------------------------------------------- +# --allow-regression → exit 0 +# --------------------------------------------------------------------------- + + +class TestAllowRegression: + def test_allow_regression_flag_returns_0(self, tmp_path): + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.8) + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5) + + result = main( + [ + "--scorecard", + str(candidate), + "--baseline-file", + str(baseline), + "--allow-regression", + ] + ) + assert result == 0 + + def test_allow_regression_prints_warning_line(self, tmp_path, capsys): + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + baseline = _write_card(baseline_dir, "0.2.3", accuracy=0.8) + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.5) + + main( + [ + "--scorecard", + str(candidate), + "--baseline-file", + str(baseline), + "--allow-regression", + ] + ) + captured = capsys.readouterr() + assert "::warning::" in 
captured.out + + +# --------------------------------------------------------------------------- +# --baseline-file missing → exit 1 +# --------------------------------------------------------------------------- + + +class TestBaselineFileMissing: + def test_missing_baseline_file_returns_1(self, tmp_path): + candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6) + result = main( + [ + "--scorecard", + str(candidate), + "--baseline-file", + str(tmp_path / "nonexistent-SCORECARD.md"), + ] + ) + assert result == 1 + + +# --------------------------------------------------------------------------- +# Invalid candidate (corrupt YAML front matter) → exit 1 +# --------------------------------------------------------------------------- + + +class TestInvalidCandidate: + def test_corrupt_candidate_returns_1(self, tmp_path): + corrupt_path = tmp_path / "SCORECARD.md" + corrupt_path.write_text("this is not valid yaml front matter at all\ngarbage\n") + result = main(["--scorecard", str(corrupt_path)]) + assert result == 1 + + def test_empty_candidate_returns_1(self, tmp_path): + empty_path = tmp_path / "SCORECARD.md" + empty_path.write_text("") + result = main(["--scorecard", str(empty_path)]) + assert result == 1 + + +# --------------------------------------------------------------------------- +# Invalid baseline → exit 1 +# --------------------------------------------------------------------------- + + +class TestInvalidPrior: + def test_corrupt_baseline_returns_1(self, tmp_path): + baseline_dir = tmp_path / "baseline" + baseline_dir.mkdir() + corrupt = baseline_dir / "SCORECARD.md" + corrupt.write_text("this is not valid yaml front matter at all\ngarbage\n") + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.9) + + result = main(["--scorecard", str(candidate), "--baseline-file", str(corrupt)]) + assert result == 1 + + def test_empty_baseline_returns_1(self, tmp_path): + baseline_dir = tmp_path / 
"baseline" + baseline_dir.mkdir() + empty = baseline_dir / "SCORECARD.md" + empty.write_text("") + + candidate_dir = tmp_path / "candidate" + candidate_dir.mkdir() + candidate = _write_card(candidate_dir, "0.2.4", accuracy=0.9) + + result = main(["--scorecard", str(candidate), "--baseline-file", str(empty)]) + assert result == 1 + + +# --------------------------------------------------------------------------- +# Workflow YAML test: publish job must list scorecard-gate in needs +# --------------------------------------------------------------------------- + + +class TestWorkflowYaml: + def test_publish_job_needs_scorecard_gate(self): + workflow_path = ( + Path(__file__).parents[3] + / ".github" + / "workflows" + / "release_agent_email.yml" + ) + assert workflow_path.exists(), f"Workflow file not found: {workflow_path}" + content = workflow_path.read_text() + parsed = yaml.safe_load(content) + + assert "jobs" in parsed, "Workflow has no 'jobs' key" + assert ( + "publish" in parsed["jobs"] + ), "Workflow has no 'publish' job — add it or check the job name" + needs = parsed["jobs"]["publish"].get("needs", []) + # needs can be a string or a list + if isinstance(needs, str): + needs = [needs] + assert ( + "scorecard-gate" in needs + ), f"'publish' job must list 'scorecard-gate' in its needs; got: {needs}" + + +# --------------------------------------------------------------------------- +# Error handling — bad CLI input returns 1 (not exception) +# --------------------------------------------------------------------------- + + +class TestCliErrorHandling: + def test_missing_scorecard_flag_returns_1(self): + result = main([]) + assert result == 1 + + def test_baseline_file_and_ref_mutually_exclusive(self, tmp_path): + candidate = _write_card(tmp_path, "1.0.0", accuracy=0.6) + result = main( + [ + "--scorecard", + str(candidate), + "--baseline-file", + str(candidate), + "--baseline-ref", + "v1.0.0", + ] + ) + assert result == 1 diff --git 
a/workers/agent-hub/src/catalog.ts b/workers/agent-hub/src/catalog.ts
index 357cf83f8..786d278d9 100644
--- a/workers/agent-hub/src/catalog.ts
+++ b/workers/agent-hub/src/catalog.ts
@@ -5,11 +5,15 @@
  * Build per-agent manifests and the top-level catalog index.
  */
 
+import { parse as parseYaml } from "yaml";
+
 import { compareSemver } from "./manifest";
 import {
+  evalScorecardKey,
   listAgentIds,
   readAgentManifest,
   readChangelog,
+  readEvalScorecard,
   readPackageFiles,
   readReadme,
   readSkill,
@@ -96,10 +100,39 @@ export function upsertVersion(
   };
 }
 
+/** True for any non-null object, i.e. a value whose properties can be read. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === "object" && value !== null;
+}
+
+/**
+ * Parse the `aggregate.value` from a scorecard's YAML front matter. Returns
+ * undefined when the scorecard is absent, malformed, or missing the field —
+ * never throws so a bad scorecard never breaks the catalog build.
+ */
+function parseScorecardScore(markdown: string | null): number | undefined {
+  if (!markdown) return undefined;
+  // Extract the YAML front matter block between the leading --- delimiters.
+  // A leading BOM and CRLF line endings are tolerated, so a scorecard whose
+  // newlines are not bare LF still yields its score.
+  const match = /^\uFEFF?---\r?\n([\s\S]*?)\r?\n---/.exec(markdown);
+  if (!match) return undefined;
+  try {
+    const fm: unknown = parseYaml(match[1]);
+    if (!isRecord(fm) || !isRecord(fm.aggregate)) return undefined;
+    const val = fm.aggregate.value;
+    return typeof val === "number" && Number.isFinite(val) ? val : undefined;
+  } catch {
+    return undefined;
+  }
+}
+
 /**
  * Build the catalog entry for one agent manifest. `readme`/`changelog` are the
  * latest version's markdown ("" if none was published); `packageFiles` is the
- * whole-package zip's file listing (null if no package zip was published).
+ * whole-package zip's file listing (null if no package zip was published);
+ * `evalScorecard` is the scorecard markdown (null if none was published);
+ * `baseUrl` is the origin prefixed to the scorecard's storage key to form its URL.
  */
 export function toIndexEntry(
   agent: AgentManifest,
@@ -107,7 +140,9 @@ export function toIndexEntry(
   changelog: string,
   packageFiles: { files: { name: string; size_bytes: number }[] } | null,
   spec = "",
-  skill = ""
+  skill = "",
+  evalScorecard: string | null = null,
+  baseUrl = "https://hub.amd-gaia.ai"
 ): IndexEntry {
   const latest = agent.versions[agent.latest_version];
   const req = agent.requirements;
@@ -154,6 +189,10 @@ export function toIndexEntry(
     // undefined serializes to "key absent" — only present when the manifest set it.
     npm_package: agent.npm_package,
     playground_url: agent.playground_url,
+    eval_scorecard_url: evalScorecard !== null
+      ? `${baseUrl.replace(/\/+$/, "")}/${evalScorecardKey(agent.id, agent.latest_version)}`
+      : undefined,
+    eval_score: parseScorecardScore(evalScorecard),
     package: pkg,
   };
 }
@@ -164,7 +203,8 @@ export function toIndexEntry(
  */
 export async function rebuildIndex(
   bucket: R2Bucket,
-  now: Date = new Date()
+  now: Date = new Date(),
+  baseUrl = "https://hub.amd-gaia.ai"
 ): Promise {
   const ids = await listAgentIds(bucket);
   const entries: IndexEntry[] = [];
@@ -176,7 +216,8 @@ export async function rebuildIndex(
     const packageFiles = await readPackageFiles(bucket, id, agent.latest_version);
     const spec = await readSpec(bucket, id, agent.latest_version);
     const skill = await readSkill(bucket, id, agent.latest_version);
-    entries.push(toIndexEntry(agent, readme, changelog, packageFiles, spec, skill));
+    const evalScorecard = await readEvalScorecard(bucket, id, agent.latest_version);
+    entries.push(toIndexEntry(agent, readme, changelog, packageFiles, spec, skill, evalScorecard, baseUrl));
   }
 
   entries.sort((a, b) => a.id.localeCompare(b.id));
diff --git a/workers/agent-hub/src/publish.ts b/workers/agent-hub/src/publish.ts
index 6c9b638dd..c869ea623 100644
--- a/workers/agent-hub/src/publish.ts
+++ b/workers/agent-hub/src/publish.ts
@@ -18,6 +18,7 @@ import { parseManifest } from "./manifest";
 import {
   artifactKey,
   changelogKey,
+  evalScorecardKey,
   packageFilesKey,
   rawManifestKey,
   readAgentManifest,
@@ -173,6 +174,9 @@ export async function handlePublish(
   // semantics as README/CHANGELOG.
   const specText = await optionalMarkdownPart(form, "spec", "SPEC.md");
   const skillText = await optionalMarkdownPart(form, "skill", "SKILL.md");
+  // Optional eval scorecard markdown (the agent's benchmark results, rendered on
+  // the hub listing as an aggregate score + link). Per-version, first-POST semantics.
+  const evalScorecardText = await optionalMarkdownPart(form, "eval_scorecard", "SCORECARD.md");
   // Optional whole-package file listing (the zip's contents, for the hub's file
   // list). The zip itself rides in as a normal `artifact`; this is just the
   // manifest of what's inside it.
@@ -276,6 +280,11 @@ export async function handlePublish(
         httpMetadata: { contentType: "text/markdown; charset=utf-8" },
       });
     }
+    if (evalScorecardText != null) {
+      await env.BUCKET.put(evalScorecardKey(manifest.id, manifest.version), evalScorecardText, {
+        httpMetadata: { contentType: "text/markdown; charset=utf-8" },
+      });
+    }
   }
 
   // The package file listing rides the whole-package zip POST, which in a real
@@ -296,7 +305,8 @@ export async function handlePublish(
 
   const updated = upsertVersion(existing, manifest, versionEntry);
   await writeAgentManifest(env.BUCKET, updated);
-  const index = await rebuildIndex(env.BUCKET, now);
+  const baseUrl = new URL(request.url).origin;
+  const index = await rebuildIndex(env.BUCKET, now, baseUrl);
 
   return json(
     {
diff --git a/workers/agent-hub/src/storage.ts b/workers/agent-hub/src/storage.ts
index 0b15640f6..3a26647a4 100644
--- a/workers/agent-hub/src/storage.ts
+++ b/workers/agent-hub/src/storage.ts
@@ -52,6 +52,10 @@ export function skillKey(id: string, version: string): string {
   return `${versionDir(id, version)}SKILL.md`;
 }
 
+export function evalScorecardKey(id: string, version: string): string {
+  return `${versionDir(id, version)}SCORECARD.md`;
+}
+
 export function packageFilesKey(id: string, version: string): string {
   return `${versionDir(id, version)}package-files.json`;
 }
@@ -114,6 +118,21 @@ export async function readSkill(
   return obj.text();
 }
 
+/**
+ * Read the eval scorecard markdown for one published version. Returns null when
+ * none was published — the `eval_scorecard` form part is optional, so its
+ * absence is not an error.
+ */
+export async function readEvalScorecard(
+  bucket: R2Bucket,
+  id: string,
+  version: string
+): Promise<string | null> {
+  const obj = await bucket.get(evalScorecardKey(id, version));
+  if (!obj) return null;
+  return obj.text();
+}
+
 /**
  * Read the whole-package file listing (`{ files: [{name, size_bytes}] }`) for one
  * version, or null when none was published — the `package_files` form part on
diff --git a/workers/agent-hub/src/types.ts b/workers/agent-hub/src/types.ts
index 571d75f9d..36df4a811 100644
--- a/workers/agent-hub/src/types.ts
+++ b/workers/agent-hub/src/types.ts
@@ -199,6 +199,10 @@ export interface IndexEntry {
   npm_package?: string;
   /** Localhost playground URL served by the agent's sidecar; absent otherwise. */
   playground_url?: string;
+  /** Public URL of the eval scorecard markdown for the latest version; absent when none was published. */
+  eval_scorecard_url?: string;
+  /** Aggregate eval score (0–100) parsed from the latest version's scorecard front matter; absent when none was published or parseable. */
+  eval_score?: number;
   /**
    * Whole-package download: a single zip (all platform binaries + client + docs)
    * plus its file listing. Present only when a `package_files` manifest was
diff --git a/workers/agent-hub/test/fake-r2.ts b/workers/agent-hub/test/fake-r2.ts
index 79284f98b..9e149c681 100644
--- a/workers/agent-hub/test/fake-r2.ts
+++ b/workers/agent-hub/test/fake-r2.ts
@@ -159,6 +159,7 @@ export function publishRequest(opts: {
   changelog?: string;
   spec?: string;
   skill?: string;
+  evalScorecard?: string;
   packageFiles?: string;
 }): Request {
   const form = new FormData();
@@ -167,6 +168,7 @@ export function publishRequest(opts: {
   if (opts.changelog !== undefined) form.set("changelog", opts.changelog);
   if (opts.spec !== undefined) form.set("spec", opts.spec);
   if (opts.skill !== undefined) form.set("skill", opts.skill);
+  if (opts.evalScorecard !== undefined) form.set("eval_scorecard", opts.evalScorecard);
   if (opts.packageFiles !== undefined) form.set("package_files", opts.packageFiles);
   const bytes = typeof opts.artifact === "string" ? new TextEncoder().encode(opts.artifact) : opts.artifact;
   form.set(
diff --git a/workers/agent-hub/test/routes.test.ts b/workers/agent-hub/test/routes.test.ts
index bb602d127..cb00f5abf 100644
--- a/workers/agent-hub/test/routes.test.ts
+++ b/workers/agent-hub/test/routes.test.ts
@@ -81,3 +81,67 @@ describe("GET routes", () => {
     expect(res.status).toBe(405);
   });
 });
+
+// Minimal YAML front matter matching the email agent's scorecard shape.
+const SAMPLE_SCORECARD = [
+  "---",
+  "schema_version: 1",
+  "agent:",
+  "  name: Test Agent",
+  "  version: 0.1.0",
+  "aggregate:",
+  "  name: weighted_accuracy",
+  "  value: 87.5",
+  "generated_at: '2026-06-26T00:00:00Z'",
+  "---",
+  "# Test Agent — Eval Scorecard v0.1.0",
+  "",
+  "**Aggregate score: 87.5** (out of 100)",
+].join("\n");
+
+/** The slice of an /index.json entry that these tests assert on. */
+interface ScorecardFields {
+  eval_score?: number;
+  eval_scorecard_url?: string;
+}
+
+describe("eval scorecard in catalog", () => {
+  /** Publish chat@0.1.0, optionally with a scorecard, and return its catalog entry. */
+  async function publishAndReadEntry(evalScorecard?: string): Promise<ScorecardFields> {
+    const env = makeEnv();
+    await worker.fetch(
+      publishRequest({
+        token: "tok_amd",
+        manifestYaml: sampleManifest({ id: "chat", version: "0.1.0" }),
+        artifact: "chat-wheel",
+        filename: "gaia_agent_chat-0.1.0-py3-none-any.whl",
+        evalScorecard,
+      }),
+      env as never
+    );
+
+    const res = await worker.fetch(get("/index.json"), env as never);
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as { agents: ScorecardFields[] };
+    const [entry] = body.agents;
+    expect(entry).toBeDefined();
+    return entry as ScorecardFields;
+  }
+
+  it("exposes eval_score and eval_scorecard_url when a scorecard is published", async () => {
+    const entry = await publishAndReadEntry(SAMPLE_SCORECARD);
+    expect(entry.eval_score).toBe(87.5);
+    expect(entry.eval_scorecard_url).toMatch(/\/agents\/chat\/0\.1\.0\/SCORECARD\.md$/);
+  });
+
+  it("parses a scorecard that uses CRLF line endings", async () => {
+    const entry = await publishAndReadEntry(SAMPLE_SCORECARD.replace(/\n/g, "\r\n"));
+    expect(entry.eval_score).toBe(87.5);
+  });
+
+  it("omits eval_score and eval_scorecard_url when no scorecard is published", async () => {
+    const entry = await publishAndReadEntry();
+    expect(entry.eval_score).toBeUndefined();
+    expect(entry.eval_scorecard_url).toBeUndefined();
+  });
+});