From c149b27db94822f9ed89b38bf8bb2e8b06fc1f3d Mon Sep 17 00:00:00 2001
From: ianu82 <ianu82@yahoo.co.uk>
Date: Wed, 24 Jun 2026 17:46:23 +0100
Subject: [PATCH] Add additive per-model USD cost meter to LLM usage telemetry

Turn the token counts Anton already records per LLM call into a dollar
figure a host can surface as "$ this turn / $ this task". Purely
additive telemetry: no budget object, no enforcement, no control-flow
change (those are explicitly later slices).

- New anton/core/llm/pricing.py: a maintained per-model price table
  (input/output/cache USD rates per 1M tokens, matched by model-ID
  prefix like the existing _CONTEXT_WINDOWS table) plus compute_cost().
  Unknown models price at 0.0 and None token counts are treated as 0,
  so an unpriced model or a missing usage field never breaks a turn
  (mirrors compute_context_pressure's defensive posture).
- Usage gains additive cache_write_tokens / cache_read_tokens / cost_usd
  fields, all defaulted so every existing construction site stays valid.
- Both providers populate cost_usd (and cache tokens where the SDK
  reports them) at all 6 Usage construction sites. Anthropic reports
  cache tokens separately, so they're summed; OpenAI folds cached tokens
  into prompt_tokens, so they're surfaced for telemetry but not
  double-priced. cost_usd rides on the existing StreamComplete/usage output.

Adds tests/test_pricing.py (tokens x price table -> expected USD,
including cache and zero/None/unknown-model cases).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 anton/core/llm/anthropic.py |  26 ++++++++-
 anton/core/llm/openai.py    |  43 +++++++++++++-
 anton/core/llm/pricing.py   | 109 ++++++++++++++++++++++++++++++++++++
 anton/core/llm/provider.py  |  10 ++++
 tests/test_pricing.py       | 109 ++++++++++++++++++++++++++++++++++++
 5 files changed, 295 insertions(+), 2 deletions(-)
 create mode 100644 anton/core/llm/pricing.py
 create mode 100644 tests/test_pricing.py

diff --git a/anton/core/llm/anthropic.py b/anton/core/llm/anthropic.py
index 7ca41555..7a074f1f 100644
--- a/anton/core/llm/anthropic.py
+++ b/anton/core/llm/anthropic.py
@@ -21,6 +21,7 @@
     Usage,
     compute_context_pressure,
 )
+from .pricing import compute_cost
 
 # Native server-side web tool type strings exposed by the Anthropic Messages API.
 # The model invokes these inside the provider — Anton's tool-dispatch loop never
@@ -151,13 +152,24 @@ async def complete(
                 )
 
         input_tokens = response.usage.input_tokens
+        output_tokens = response.usage.output_tokens
+        # Cache-token counts when present (Anthropic ships them on usage); 0
+        # otherwise. Anton sends no cache_control today, so these are normally
+        # 0 — read defensively so the cost meter is correct if that changes.
+        cache_write = getattr(response.usage, "cache_creation_input_tokens", 0) or 0
+        cache_read = getattr(response.usage, "cache_read_input_tokens", 0) or 0
         return LLMResponse(
             content=content_text,
             tool_calls=tool_calls,
             usage=Usage(
                 input_tokens=input_tokens,
-                output_tokens=response.usage.output_tokens,
+                output_tokens=output_tokens,
                 context_pressure=compute_context_pressure(model, input_tokens),
+                cache_write_tokens=cache_write,
+                cache_read_tokens=cache_read,
+                cost_usd=compute_cost(
+                    model, input_tokens, output_tokens, cache_write, cache_read
+                ),
             ),
             stop_reason=response.stop_reason,
         )
@@ -192,6 +204,8 @@ async def stream(
         tool_calls: list[ToolCall] = []
         input_tokens = 0
         output_tokens = 0
+        cache_write = 0
+        cache_read = 0
         stop_reason: str | None = None
 
         # Track content blocks by index for tool correlation
@@ -204,6 +218,11 @@ async def stream(
                         usage = event.message.usage
                         input_tokens = usage.input_tokens
                         output_tokens = getattr(usage, "output_tokens", 0)
+                        # Cache-token counts when present; 0 otherwise (Anton
+                        # sends no cache_control today). Read defensively so the
+                        # cost meter stays correct if caching is enabled later.
+                        cache_write = getattr(usage, "cache_creation_input_tokens", 0) or 0
+                        cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
 
                     elif event.type == "content_block_start":
                         idx = event.index
@@ -294,6 +313,11 @@ async def stream(
                     input_tokens=input_tokens,
                     output_tokens=output_tokens,
                     context_pressure=compute_context_pressure(model, input_tokens),
+                    cache_write_tokens=cache_write,
+                    cache_read_tokens=cache_read,
+                    cost_usd=compute_cost(
+                        model, input_tokens, output_tokens, cache_write, cache_read
+                    ),
                 ),
                 stop_reason=stop_reason,
             )
diff --git a/anton/core/llm/openai.py b/anton/core/llm/openai.py
index c2f0549c..b412bc1e 100644
--- a/anton/core/llm/openai.py
+++ b/anton/core/llm/openai.py
@@ -23,6 +23,7 @@
     Usage,
     compute_context_pressure,
 )
+from .pricing import compute_cost
 
 
 def _translate_tools(tools: list[dict]) -> list[dict]:
@@ -42,6 +43,26 @@ def _translate_tools(tools: list[dict]) -> list[dict]:
     return result
 
 
+def _openai_cached_tokens(usage_obj) -> int:
+    """Best-effort cached-prompt-token count off an OpenAI usage object.
+
+    OpenAI reports cached tokens as a subset of the prompt under
+    ``prompt_tokens_details.cached_tokens`` (Chat Completions) or
+    ``input_tokens_details.cached_tokens`` (Responses API). Returns 0 when the
+    field is absent — surfaced for telemetry only; it is NOT priced again on
+    top of the input rate (OpenAI already counts it in the input total).
+    """
+    if not usage_obj:
+        return 0
+    details = (
+        getattr(usage_obj, "prompt_tokens_details", None)
+        or getattr(usage_obj, "input_tokens_details", None)
+    )
+    if details is None:
+        return 0
+    return getattr(details, "cached_tokens", 0) or 0
+
+
 def _translate_tool_choice(tool_choice: dict) -> dict | str:
     """Anthropic tool_choice -> OpenAI tool_choice."""
     tc_type = tool_choice.get("type")
@@ -720,13 +741,22 @@ async def complete(
 
         usage_obj = response.usage
         input_tokens = usage_obj.prompt_tokens if usage_obj else 0
+        output_tokens = usage_obj.completion_tokens if usage_obj else 0
+        # OpenAI folds cached tokens *into* prompt_tokens (a subset, already
+        # priced by the input rate), unlike Anthropic which reports them
+        # separately. So we surface cache_read for telemetry but do NOT add a
+        # cache term to compute_cost — that would double-count. Anton sends no
+        # cache_control today, so this is 0 in practice regardless.
+        cache_read = _openai_cached_tokens(usage_obj)
         return LLMResponse(
             content=content_text,
             tool_calls=tool_calls,
             usage=Usage(
                 input_tokens=input_tokens,
-                output_tokens=usage_obj.completion_tokens if usage_obj else 0,
+                output_tokens=output_tokens,
                 context_pressure=compute_context_pressure(model, input_tokens),
+                cache_read_tokens=cache_read,
+                cost_usd=compute_cost(model, input_tokens, output_tokens),
             ),
             stop_reason=choice.finish_reason,
         )
@@ -777,6 +807,7 @@ async def stream(
         tool_calls: list[ToolCall] = []
         input_tokens = 0
         output_tokens = 0
+        cache_read = 0
         stop_reason: str | None = None
 
         # Track tool call deltas by index
@@ -788,6 +819,7 @@ async def stream(
                 if chunk.usage:
                     input_tokens = chunk.usage.prompt_tokens
                     output_tokens = chunk.usage.completion_tokens
+                    cache_read = _openai_cached_tokens(chunk.usage)
 
                 if not chunk.choices:
                     continue
@@ -888,6 +920,8 @@ async def stream(
                     input_tokens=input_tokens,
                     output_tokens=output_tokens,
                     context_pressure=compute_context_pressure(model, input_tokens),
+                    cache_read_tokens=cache_read,
+                    cost_usd=compute_cost(model, input_tokens, output_tokens),
                 ),
                 stop_reason=stop_reason,
             )
@@ -1009,6 +1043,7 @@ async def _stream_via_responses(
         tool_calls: list[ToolCall] = []
         input_tokens = 0
         output_tokens = 0
+        cache_read = 0
         stop_reason: str | None = None
 
         # Map output_index → in-flight function-call state. Responses API uses
@@ -1083,6 +1118,7 @@ async def _stream_via_responses(
                         if usage is not None:
                             input_tokens = getattr(usage, "input_tokens", 0) or 0
                             output_tokens = getattr(usage, "output_tokens", 0) or 0
+                            cache_read = _openai_cached_tokens(usage)
                         stop_reason = getattr(final_response, "status", None)
         except openai.BadRequestError as exc:
             msg = str(exc).lower()
@@ -1119,6 +1155,8 @@ async def _stream_via_responses(
                     input_tokens=input_tokens,
                     output_tokens=output_tokens,
                     context_pressure=compute_context_pressure(model, input_tokens),
+                    cache_read_tokens=cache_read,
+                    cost_usd=compute_cost(model, input_tokens, output_tokens),
                 ),
                 stop_reason=stop_reason,
             )
@@ -1161,6 +1199,7 @@ def _parse_response_object(response, model: str) -> LLMResponse:
     # which a bare getattr default does NOT catch. Mirrors the streaming path.
     input_tokens = (getattr(usage, "input_tokens", 0) or 0) if usage else 0
     output_tokens = (getattr(usage, "output_tokens", 0) or 0) if usage else 0
+    cache_read = _openai_cached_tokens(usage)
 
     return LLMResponse(
         content=content_text,
@@ -1169,6 +1208,8 @@ def _parse_response_object(response, model: str) -> LLMResponse:
             input_tokens=input_tokens,
             output_tokens=output_tokens,
             context_pressure=compute_context_pressure(model, input_tokens),
+            cache_read_tokens=cache_read,
+            cost_usd=compute_cost(model, input_tokens, output_tokens),
         ),
         stop_reason=getattr(response, "status", None),
     )
diff --git a/anton/core/llm/pricing.py b/anton/core/llm/pricing.py
new file mode 100644
index 00000000..d934275b
--- /dev/null
+++ b/anton/core/llm/pricing.py
@@ -0,0 +1,109 @@
+"""Per-model USD price table + cost computation for LLM token usage.
+
+This module is *purely additive* telemetry: it turns the token counts Anton
+already records on every LLM call (see ``Usage`` in ``provider.py``) into a
+dollar figure a host can surface as "$ this turn / $ this task". It does not
+gate, cap, or alter any request — there is no budget object and no enforcement
+here by design (those are later slices).
+
+Rates are USD per **one million** tokens, matching how providers publish them.
+The table is matched by model-ID prefix in declaration order — exact/most
+specific IDs first, family fallbacks last — mirroring the ``_CONTEXT_WINDOWS``
+table in ``provider.py`` so the two stay stylistically aligned. Models not in
+the table price at ``0.0`` rather than raising: an unpriced model must never
+break a turn, and a zero cost is an honest "we don't have a rate for this".
+
+Cache rates follow Anthropic's published multipliers relative to the base
+input rate: a 5-minute cache *write* costs ~1.25x input, a cache *read* ~0.1x.
+Anton does not enable prompt caching today (no ``cache_control`` is sent), so
+``cache_write_tokens`` / ``cache_read_tokens`` are 0 in practice and the cache
+terms contribute nothing — but the table and ``compute_cost`` carry them so the
+meter stays correct the moment caching is turned on upstream.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class ModelPrice:
+    """USD rates for one model, per **one million** tokens.
+
+    ``cache_write`` / ``cache_read`` default to the Anthropic-published
+    multiples of ``input`` (1.25x write, 0.1x read) so a table entry only has
+    to specify the two base rates unless a provider prices caching differently.
+    """
+
+    input: float
+    output: float
+    cache_write: float | None = None
+    cache_read: float | None = None
+
+    def cache_write_rate(self) -> float:
+        return self.cache_write if self.cache_write is not None else self.input * 1.25
+
+    def cache_read_rate(self) -> float:
+        return self.cache_read if self.cache_read is not None else self.input * 0.1
+
+
+# Matched by prefix in order — exact model IDs first, family fallbacks last.
+# Rates are USD per 1M tokens. Keep this list maintained as the set of models
+# Anton actually runs changes; an absent model prices at 0.0 (see compute_cost).
+_MODEL_PRICES: list[tuple[str, ModelPrice]] = [
+    # Anton defaults (exact model IDs first — see anton/config/settings.py)
+    ("claude-sonnet-4-6", ModelPrice(input=3.00, output=15.00)),
+    ("claude-haiku-4-5-20251001", ModelPrice(input=1.00, output=5.00)),
+    # Claude families (most specific prefix first)
+    ("claude-opus-4", ModelPrice(input=5.00, output=25.00)),
+    ("claude-sonnet-4", ModelPrice(input=3.00, output=15.00)),
+    ("claude-haiku-4", ModelPrice(input=1.00, output=5.00)),
+    # OpenAI families (rates per 1M tokens)
+    ("gpt-5", ModelPrice(input=1.25, output=10.00)),
+    ("gpt-4.1", ModelPrice(input=2.00, output=8.00)),
+    ("gpt-4o", ModelPrice(input=2.50, output=10.00)),
+    ("o3", ModelPrice(input=2.00, output=8.00)),
+    ("o1", ModelPrice(input=15.00, output=60.00)),
+]
+
+_PER_MILLION = 1_000_000.0
+
+
+def get_model_price(model: str) -> ModelPrice | None:
+    """Return the price entry whose prefix matches ``model``, or None.
+
+    The first declared prefix that matches wins (declaration order), the
+    same scheme ``compute_context_pressure`` uses for context windows.
+    """
+    if not model:
+        return None
+    for prefix, price in _MODEL_PRICES:
+        if model.startswith(prefix):
+            return price
+    return None
+
+
+def compute_cost(
+    model: str,
+    input_tokens: int | None,
+    output_tokens: int | None,
+    cache_write_tokens: int | None = 0,
+    cache_read_tokens: int | None = 0,
+) -> float:
+    """Return the USD cost of one LLM call's token usage.
+
+    Any token count may be ``None`` — some providers omit a usage field (e.g.
+    the MindsHub passthrough returns ``input_tokens=None`` on web-search
+    responses), so a missing count is treated as 0 rather than crashing, exactly
+    as ``compute_context_pressure`` does. An unknown/unpriced model returns
+    ``0.0``. The result is always a non-negative float in dollars.
+    """
+    price = get_model_price(model)
+    if price is None:
+        return 0.0
+    cost = (
+        (input_tokens or 0) * price.input
+        + (output_tokens or 0) * price.output
+        + (cache_write_tokens or 0) * price.cache_write_rate()
+        + (cache_read_tokens or 0) * price.cache_read_rate()
+    )
+    return cost / _PER_MILLION
diff --git a/anton/core/llm/provider.py b/anton/core/llm/provider.py
index 10727a20..3aa577e8 100644
--- a/anton/core/llm/provider.py
+++ b/anton/core/llm/provider.py
@@ -26,6 +26,16 @@ class Usage:
     input_tokens: int = 0
     output_tokens: int = 0
     context_pressure: float = 0.0
+    # Cache-token counts, when the provider reports them. Anton does not send
+    # cache_control today, so these are 0 in practice — they exist so the USD
+    # cost meter (see pricing.compute_cost / cost_usd below) stays correct the
+    # moment prompt caching is enabled upstream.
+    cache_write_tokens: int = 0
+    cache_read_tokens: int = 0
+    # Additive telemetry: USD cost of this single call's tokens, priced by
+    # pricing.compute_cost(). 0.0 when the model has no maintained rate. This is
+    # read-only output for hosts ("$ this turn"); it gates nothing.
+    cost_usd: float = 0.0
 
 
 @dataclass
diff --git a/tests/test_pricing.py b/tests/test_pricing.py
new file mode 100644
index 00000000..52277cc8
--- /dev/null
+++ b/tests/test_pricing.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from anton.core.llm.pricing import ModelPrice, compute_cost, get_model_price
+from anton.core.llm.provider import Usage
+
+
+class TestGetModelPrice:
+    def test_exact_default_models_match(self):
+        # Anton's two default models (anton/config/settings.py) must be priced.
+        assert get_model_price("claude-sonnet-4-6") is not None
+        assert get_model_price("claude-haiku-4-5-20251001") is not None
+
+    def test_dated_haiku_id_matches_exact_entry_before_family(self):
+        # The coding default is the dated id; it must resolve to the dated
+        # entry, not fall through to a looser family prefix.
+        price = get_model_price("claude-haiku-4-5-20251001")
+        assert price is not None and price.input == 1.00 and price.output == 5.00
+
+    def test_family_prefix_fallback(self):
+        # An opus model with a minor-version suffix still matches the
+        # "claude-opus-4" family prefix.
+        price = get_model_price("claude-opus-4-8")
+        assert price is not None and price.input == 5.00 and price.output == 25.00
+
+    def test_unknown_model_returns_none(self):
+        assert get_model_price("totally-made-up-model") is None
+
+    def test_empty_model_returns_none(self):
+        assert get_model_price("") is None
+
+
+class TestModelPriceCacheRates:
+    def test_cache_rates_default_to_input_multiples(self):
+        # 1.25x write, 0.1x read, relative to the base input rate.
+        p = ModelPrice(input=4.00, output=20.00)
+        assert p.cache_write_rate() == 5.00
+        assert p.cache_read_rate() == 0.40
+
+    def test_explicit_cache_rates_win(self):
+        p = ModelPrice(input=4.00, output=20.00, cache_write=9.99, cache_read=0.01)
+        assert p.cache_write_rate() == 9.99
+        assert p.cache_read_rate() == 0.01
+
+
+class TestComputeCost:
+    def test_input_and_output_priced_per_million(self):
+        # Sonnet: $3/1M in, $15/1M out → 1M each = $18.00 exactly.
+        assert compute_cost("claude-sonnet-4-6", 1_000_000, 1_000_000) == 18.00
+
+    def test_small_token_counts(self):
+        # Haiku: 1000 in @ $1/1M = $0.001, 2000 out @ $5/1M = $0.010 → $0.011.
+        assert compute_cost("claude-haiku-4-5-20251001", 1000, 2000) == 0.011
+
+    def test_opus_family(self):
+        # Opus: 100 in @ $5/1M + 50 out @ $25/1M = 0.0005 + 0.00125 = 0.00175.
+        assert compute_cost("claude-opus-4-8", 100, 50) == 0.00175
+
+    def test_cache_tokens_priced_additively(self):
+        # Anthropic reports cache tokens separately from input, so compute_cost
+        # adds them: sonnet cache_read 1M = 0.1 * $3 = $0.30.
+        cost = compute_cost("claude-sonnet-4-6", 0, 0, 0, 1_000_000)
+        assert round(cost, 6) == 0.30
+
+    def test_cache_write_more_expensive_than_read(self):
+        write_only = compute_cost("claude-sonnet-4-6", 0, 0, 1_000_000, 0)
+        read_only = compute_cost("claude-sonnet-4-6", 0, 0, 0, 1_000_000)
+        # 1.25x input vs 0.1x input.
+        assert round(write_only, 6) == 3.75
+        assert write_only > read_only
+
+    def test_full_breakdown_sums_all_four_components(self):
+        # input + output + cache_write + cache_read, all per-million.
+        cost = compute_cost("claude-sonnet-4-6", 1_000_000, 1_000_000, 1_000_000, 1_000_000)
+        # 3 + 15 + 3.75 + 0.30 = 22.05
+        assert round(cost, 6) == 22.05
+
+    def test_zero_tokens_is_zero(self):
+        assert compute_cost("claude-sonnet-4-6", 0, 0) == 0.0
+
+    def test_none_tokens_treated_as_zero_not_crash(self):
+        # Mirrors compute_context_pressure: the MindsHub passthrough can return
+        # usage.input_tokens=None; cost must not raise on None * float.
+        assert compute_cost("claude-sonnet-4-6", None, None) == 0.0
+        assert compute_cost("claude-sonnet-4-6", None, 1_000_000) == 15.00
+
+    def test_unknown_model_costs_zero_not_crash(self):
+        # An unpriced model must never break a turn — price it at 0.0.
+        assert compute_cost("some-unlisted-model", 1_000_000, 1_000_000) == 0.0
+
+    def test_empty_model_costs_zero(self):
+        assert compute_cost("", 1000, 1000) == 0.0
+
+
+class TestUsageCarriesCost:
+    def test_usage_has_additive_cost_fields_with_safe_defaults(self):
+        # The cost meter rides on the existing Usage dataclass; defaults keep
+        # every prior construction site valid (additive-only change).
+        u = Usage(input_tokens=10, output_tokens=20)
+        assert u.cost_usd == 0.0
+        assert u.cache_write_tokens == 0
+        assert u.cache_read_tokens == 0
+
+    def test_usage_accepts_populated_cost(self):
+        u = Usage(
+            input_tokens=1_000_000,
+            output_tokens=1_000_000,
+            cost_usd=compute_cost("claude-sonnet-4-6", 1_000_000, 1_000_000),
+        )
+        assert u.cost_usd == 18.00