mindsdb · ianu82 · Jun 24, 2026
diff --git a/anton/core/llm/anthropic.py b/anton/core/llm/anthropic.py
@@ -21,6 +21,7 @@
     Usage,
     compute_context_pressure,
 )
+from .pricing import compute_cost
 
 # Native server-side web tool type strings exposed by the Anthropic Messages API.
 # The model invokes these inside the provider — Anton's tool-dispatch loop never
@@ -151,13 +152,24 @@ async def complete(
                 )
 
         input_tokens = response.usage.input_tokens
+        output_tokens = response.usage.output_tokens
+        # Cache-token counts when present (Anthropic ships them on usage); 0
+        # otherwise. Anton sends no cache_control today, so these are normally
+        # 0 — read defensively so the cost meter is correct if that changes.
+        cache_write = getattr(response.usage, "cache_creation_input_tokens", 0) or 0
+        cache_read = getattr(response.usage, "cache_read_input_tokens", 0) or 0
         return LLMResponse(
             content=content_text,
             tool_calls=tool_calls,
             usage=Usage(
                 input_tokens=input_tokens,
-                output_tokens=response.usage.output_tokens,
+                output_tokens=output_tokens,
                 context_pressure=compute_context_pressure(model, input_tokens),
+                cache_write_tokens=cache_write,
+                cache_read_tokens=cache_read,
+                cost_usd=compute_cost(
+                    model, input_tokens, output_tokens, cache_write, cache_read
+                ),
             ),
             stop_reason=response.stop_reason,
         )
@@ -192,6 +204,8 @@ async def stream(
         tool_calls: list[ToolCall] = []
         input_tokens = 0
         output_tokens = 0
+        cache_write = 0
+        cache_read = 0
         stop_reason: str | None = None
 
         # Track content blocks by index for tool correlation
@@ -204,6 +218,11 @@ async def stream(
                         usage = event.message.usage
                         input_tokens = usage.input_tokens
                         output_tokens = getattr(usage, "output_tokens", 0)
+                        # Cache-token counts when present; 0 otherwise (Anton
+                        # sends no cache_control today). Read defensively so the
+                        # cost meter stays correct if caching is enabled later.
+                        cache_write = getattr(usage, "cache_creation_input_tokens", 0) or 0
+                        cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
 
                     elif event.type == "content_block_start":
                         idx = event.index
@@ -294,6 +313,11 @@ async def stream(
                     input_tokens=input_tokens,
                     output_tokens=output_tokens,
                     context_pressure=compute_context_pressure(model, input_tokens),
+                    cache_write_tokens=cache_write,
+                    cache_read_tokens=cache_read,
+                    cost_usd=compute_cost(
+                        model, input_tokens, output_tokens, cache_write, cache_read
+                    ),
                 ),
                 stop_reason=stop_reason,
             )

diff --git a/anton/core/llm/openai.py b/anton/core/llm/openai.py
@@ -23,6 +23,7 @@
     Usage,
     compute_context_pressure,
 )
+from .pricing import compute_cost
 
 
 def _translate_tools(tools: list[dict]) -> list[dict]:
@@ -42,6 +43,26 @@ def _translate_tools(tools: list[dict]) -> list[dict]:
     return result
 
 
+def _openai_cached_tokens(usage_obj) -> int:
+    """Best-effort cached-prompt-token count off an OpenAI usage object.
+
+    Looks for ``prompt_tokens_details.cached_tokens`` (Chat Completions) and
+    falls back to ``input_tokens_details.cached_tokens`` (Responses API).
+    Returns 0 when ``usage_obj`` is falsy, the details object is missing, or
+    the count is absent/``None``. Telemetry only: the count is a subset of the
+    prompt total, so callers must NOT price it again on top of the input rate.
+    """
+    if not usage_obj:
+        return 0
+    details = (
+        getattr(usage_obj, "prompt_tokens_details", None)
+        or getattr(usage_obj, "input_tokens_details", None)
+    )
+    if details is None:
+        return 0
+    return getattr(details, "cached_tokens", 0) or 0  # explicit None -> 0
+
+
 def _translate_tool_choice(tool_choice: dict) -> dict | str:
     """Anthropic tool_choice -> OpenAI tool_choice."""
     tc_type = tool_choice.get("type")
@@ -720,13 +741,22 @@ async def complete(
 
         usage_obj = response.usage
         input_tokens = usage_obj.prompt_tokens if usage_obj else 0
+        output_tokens = usage_obj.completion_tokens if usage_obj else 0
+        # OpenAI folds cached tokens *into* prompt_tokens (a subset), unlike
+        # Anthropic which reports them separately. So we surface cache_read for
+        # telemetry but do NOT add a cache term to compute_cost — that would
+        # double-count. NOTE(review): OpenAI caches automatically and discounts
+        # cached tokens, so this full-rate figure may overstate — confirm.
+        cache_read = _openai_cached_tokens(usage_obj)
         return LLMResponse(
             content=content_text,
             tool_calls=tool_calls,
             usage=Usage(
                 input_tokens=input_tokens,
-                output_tokens=usage_obj.completion_tokens if usage_obj else 0,
+                output_tokens=output_tokens,
                 context_pressure=compute_context_pressure(model, input_tokens),
+                cache_read_tokens=cache_read,
+                cost_usd=compute_cost(model, input_tokens, output_tokens),
             ),
             stop_reason=choice.finish_reason,
         )
@@ -777,6 +807,7 @@ async def stream(
         tool_calls: list[ToolCall] = []
         input_tokens = 0
         output_tokens = 0
+        cache_read = 0
         stop_reason: str | None = None
 
         # Track tool call deltas by index
@@ -788,6 +819,7 @@ async def stream(
                 if chunk.usage:
                     input_tokens = chunk.usage.prompt_tokens
                     output_tokens = chunk.usage.completion_tokens
+                    cache_read = _openai_cached_tokens(chunk.usage)
 
                 if not chunk.choices:
                     continue
@@ -888,6 +920,8 @@ async def stream(
                     input_tokens=input_tokens,
                     output_tokens=output_tokens,
                     context_pressure=compute_context_pressure(model, input_tokens),
+                    cache_read_tokens=cache_read,
+                    cost_usd=compute_cost(model, input_tokens, output_tokens),
                 ),
                 stop_reason=stop_reason,
             )
@@ -1009,6 +1043,7 @@ async def _stream_via_responses(
         tool_calls: list[ToolCall] = []
         input_tokens = 0
         output_tokens = 0
+        cache_read = 0
         stop_reason: str | None = None
 
         # Map output_index → in-flight function-call state. Responses API uses
@@ -1083,6 +1118,7 @@ async def _stream_via_responses(
                         if usage is not None:
                             input_tokens = getattr(usage, "input_tokens", 0) or 0
                             output_tokens = getattr(usage, "output_tokens", 0) or 0
+                            cache_read = _openai_cached_tokens(usage)
                         stop_reason = getattr(final_response, "status", None)
         except openai.BadRequestError as exc:
             msg = str(exc).lower()
@@ -1119,6 +1155,8 @@ async def _stream_via_responses(
                     input_tokens=input_tokens,
                     output_tokens=output_tokens,
                     context_pressure=compute_context_pressure(model, input_tokens),
+                    cache_read_tokens=cache_read,
+                    cost_usd=compute_cost(model, input_tokens, output_tokens),
                 ),
                 stop_reason=stop_reason,
             )
@@ -1161,6 +1199,7 @@ def _parse_response_object(response, model: str) -> LLMResponse:
     # which a bare getattr default does NOT catch. Mirrors the streaming path.
     input_tokens = (getattr(usage, "input_tokens", 0) or 0) if usage else 0
     output_tokens = (getattr(usage, "output_tokens", 0) or 0) if usage else 0
+    cache_read = _openai_cached_tokens(usage)
 
     return LLMResponse(
         content=content_text,
@@ -1169,6 +1208,8 @@ def _parse_response_object(response, model: str) -> LLMResponse:
             input_tokens=input_tokens,
             output_tokens=output_tokens,
             context_pressure=compute_context_pressure(model, input_tokens),
+            cache_read_tokens=cache_read,
+            cost_usd=compute_cost(model, input_tokens, output_tokens),
         ),
         stop_reason=getattr(response, "status", None),
     )
diff --git a/anton/core/llm/pricing.py b/anton/core/llm/pricing.py
@@ -0,0 +1,109 @@
+"""Per-model USD price table + cost computation for LLM token usage.
+
+This module is *purely additive* telemetry: it turns the token counts Anton
+already records on every LLM call (see ``Usage`` in ``provider.py``) into a
+dollar figure a host can surface as "$ this turn / $ this task". It does not
+gate, cap, or alter any request — there is no budget object and no enforcement
+here by design (those are later slices).
+
+Rates are USD per **one million** tokens, matching how providers publish them.
+The table is matched by model-ID prefix in declaration order — exact/most
+specific IDs first, family fallbacks last — mirroring the ``_CONTEXT_WINDOWS``
+table in ``provider.py`` so the two stay stylistically aligned. Models not in
+the table price at ``0.0`` rather than raising: an unpriced model must never
+break a turn, and a zero cost is an honest "we don't have a rate for this".
+
+Cache rates follow Anthropic's published multipliers relative to the base
+input rate: a 5-minute cache *write* costs ~1.25x input, a cache *read* ~0.1x.
+Anton does not enable prompt caching today (no ``cache_control`` is sent), so
+``cache_write_tokens`` / ``cache_read_tokens`` are 0 in practice and the cache
+terms contribute nothing — but the table and ``compute_cost`` carry them so the
+meter stays correct the moment caching is turned on upstream.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class ModelPrice:
+    """USD rates for one model, per **one million** tokens.
+
+    ``cache_write`` / ``cache_read`` left as ``None`` fall back to multiples of
+    ``input`` (1.25x write, 0.1x read) through ``cache_write_rate()`` and
+    ``cache_read_rate()``, so an entry only needs the two base rates.
+    """
+
+    input: float  # USD per 1M input tokens
+    output: float  # USD per 1M output tokens
+    cache_write: float | None = None  # None -> input * 1.25
+    cache_read: float | None = None  # None -> input * 0.1
+
+    def cache_write_rate(self) -> float:
+        return self.cache_write if self.cache_write is not None else self.input * 1.25
+
+    def cache_read_rate(self) -> float:
+        return self.cache_read if self.cache_read is not None else self.input * 0.1
+
+
+# Matched by prefix in order (first match wins) — exact IDs first, families last.
+# Rates are USD per 1M tokens. Keep this list maintained as the set of models
+# Anton actually runs changes; an absent model prices at 0.0 (see compute_cost).
+_MODEL_PRICES: list[tuple[str, ModelPrice]] = [
+    # Anton defaults (exact model IDs first — see anton/config/settings.py)
+    ("claude-sonnet-4-6", ModelPrice(input=3.00, output=15.00)),
+    ("claude-haiku-4-5-20251001", ModelPrice(input=1.00, output=5.00)),
+    # Claude families. NOTE(review): one rate covers every version — confirm.
+    ("claude-opus-4", ModelPrice(input=5.00, output=25.00)),
+    ("claude-sonnet-4", ModelPrice(input=3.00, output=15.00)),
+    ("claude-haiku-4", ModelPrice(input=1.00, output=5.00)),
+    # OpenAI families. NOTE(review): also match "-mini"/"-nano" IDs — confirm.
+    ("gpt-5", ModelPrice(input=1.25, output=10.00)),
+    ("gpt-4.1", ModelPrice(input=2.00, output=8.00)),
+    ("gpt-4o", ModelPrice(input=2.50, output=10.00)),
+    ("o3", ModelPrice(input=2.00, output=8.00)),
+    ("o1", ModelPrice(input=15.00, output=60.00)),
+]
+
+_PER_MILLION = 1_000_000.0
+
+
+def get_model_price(model: str) -> ModelPrice | None:
+    """Return the first price entry whose prefix ``model`` starts with, or None.
+
+    Entries are tried in declaration order, so the first matching prefix wins
+    (not the longest). An empty/falsy ``model`` returns None without a lookup.
+    """
+    if not model:
+        return None
+    for prefix, price in _MODEL_PRICES:
+        if model.startswith(prefix):
+            return price
+    return None
+
+
+def compute_cost(
+    model: str,
+    input_tokens: int | None,
+    output_tokens: int | None,
+    cache_write_tokens: int | None = 0,
+    cache_read_tokens: int | None = 0,
+) -> float:
+    """Return the USD cost of one LLM call's token usage.
+
+    Any count may be ``None`` (some providers omit a usage field, e.g. the
+    MindsHub passthrough returns ``input_tokens=None`` on web-search responses)
+    and is then treated as 0. Cache counts are billed on top of
+    ``input_tokens``, so pass them only when the provider reports them
+    separately. An unknown/unpriced model returns ``0.0``.
+    """
+    price = get_model_price(model)
+    if price is None:
+        return 0.0
+    cost = (
+        (input_tokens or 0) * price.input
+        + (output_tokens or 0) * price.output
+        + (cache_write_tokens or 0) * price.cache_write_rate()
+        + (cache_read_tokens or 0) * price.cache_read_rate()
+    )
+    return cost / _PER_MILLION  # table rates are quoted per one million tokens
diff --git a/anton/core/llm/provider.py b/anton/core/llm/provider.py
@@ -26,6 +26,16 @@ class Usage:
     input_tokens: int = 0
     output_tokens: int = 0
     context_pressure: float = 0.0
+    # Cache-token counts, when the provider reports them. Anton does not send
+    # cache_control today, so these are 0 in practice — they exist so the USD
+    # cost meter (see pricing.compute_cost / cost_usd below) stays correct the
+    # moment prompt caching is enabled upstream.
+    cache_write_tokens: int = 0
+    cache_read_tokens: int = 0
+    # Additive telemetry: USD cost of this single call's tokens, priced by
+    # pricing.compute_cost(). 0.0 when the model has no maintained rate. This is
+    # read-only output for hosts ("$ this turn"); it gates nothing.
+    cost_usd: float = 0.0
 
 
 @dataclass