From c149b27db94822f9ed89b38bf8bb2e8b06fc1f3d Mon Sep 17 00:00:00 2001 From: ianu82 Date: Wed, 24 Jun 2026 17:46:23 +0100 Subject: [PATCH] Add additive per-model USD cost meter to LLM usage telemetry Turn the token counts Anton already records per LLM call into a dollar figure a host can surface as "$ this turn / $ this task". Purely additive telemetry: no budget object, no enforcement, no control-flow change (those are explicitly later slices). - New anton/core/llm/pricing.py: a maintained per-model price table (input/output/cache USD rates per 1M tokens, matched by model-ID prefix like the existing _CONTEXT_WINDOWS table) plus compute_cost(). Unknown models price at 0.0 and None token counts are treated as 0, so an unpriced model or a missing usage field never breaks a turn (mirrors compute_context_pressure's defensive posture). - Usage gains additive cache_write_tokens / cache_read_tokens / cost_usd fields, all defaulted so every existing construction site stays valid. - Both providers populate cost_usd (and cache tokens where the SDK reports them) at all 6 Usage construction sites. Anthropic reports cache tokens separately, so they're summed; OpenAI folds cached tokens into prompt_tokens, so they're surfaced for telemetry but not double-priced. cost_usd rides on the existing StreamComplete/usage output. Adds tests/test_pricing.py (tokens x price table -> expected USD, including cache and zero/None/unknown-model cases). 
Co-Authored-By: Claude Opus 4.8 --- anton/core/llm/anthropic.py | 26 ++++++++- anton/core/llm/openai.py | 43 +++++++++++++- anton/core/llm/pricing.py | 109 ++++++++++++++++++++++++++++++++++++ anton/core/llm/provider.py | 10 ++++ tests/test_pricing.py | 109 ++++++++++++++++++++++++++++++++++++ 5 files changed, 295 insertions(+), 2 deletions(-) create mode 100644 anton/core/llm/pricing.py create mode 100644 tests/test_pricing.py diff --git a/anton/core/llm/anthropic.py b/anton/core/llm/anthropic.py index 7ca41555..7a074f1f 100644 --- a/anton/core/llm/anthropic.py +++ b/anton/core/llm/anthropic.py @@ -21,6 +21,7 @@ Usage, compute_context_pressure, ) +from .pricing import compute_cost # Native server-side web tool type strings exposed by the Anthropic Messages API. # The model invokes these inside the provider — Anton's tool-dispatch loop never @@ -151,13 +152,24 @@ async def complete( ) input_tokens = response.usage.input_tokens + output_tokens = response.usage.output_tokens + # Cache-token counts when present (Anthropic ships them on usage); 0 + # otherwise. Anton sends no cache_control today, so these are normally + # 0 — read defensively so the cost meter is correct if that changes. 
+ cache_write = getattr(response.usage, "cache_creation_input_tokens", 0) or 0 + cache_read = getattr(response.usage, "cache_read_input_tokens", 0) or 0 return LLMResponse( content=content_text, tool_calls=tool_calls, usage=Usage( input_tokens=input_tokens, - output_tokens=response.usage.output_tokens, + output_tokens=output_tokens, context_pressure=compute_context_pressure(model, input_tokens), + cache_write_tokens=cache_write, + cache_read_tokens=cache_read, + cost_usd=compute_cost( + model, input_tokens, output_tokens, cache_write, cache_read + ), ), stop_reason=response.stop_reason, ) @@ -192,6 +204,8 @@ async def stream( tool_calls: list[ToolCall] = [] input_tokens = 0 output_tokens = 0 + cache_write = 0 + cache_read = 0 stop_reason: str | None = None # Track content blocks by index for tool correlation @@ -204,6 +218,11 @@ async def stream( usage = event.message.usage input_tokens = usage.input_tokens output_tokens = getattr(usage, "output_tokens", 0) + # Cache-token counts when present; 0 otherwise (Anton + # sends no cache_control today). Read defensively so the + # cost meter stays correct if caching is enabled later. 
+ cache_write = getattr(usage, "cache_creation_input_tokens", 0) or 0 + cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0 elif event.type == "content_block_start": idx = event.index @@ -294,6 +313,11 @@ async def stream( input_tokens=input_tokens, output_tokens=output_tokens, context_pressure=compute_context_pressure(model, input_tokens), + cache_write_tokens=cache_write, + cache_read_tokens=cache_read, + cost_usd=compute_cost( + model, input_tokens, output_tokens, cache_write, cache_read + ), ), stop_reason=stop_reason, ) diff --git a/anton/core/llm/openai.py b/anton/core/llm/openai.py index c2f0549c..b412bc1e 100644 --- a/anton/core/llm/openai.py +++ b/anton/core/llm/openai.py @@ -23,6 +23,7 @@ Usage, compute_context_pressure, ) +from .pricing import compute_cost def _translate_tools(tools: list[dict]) -> list[dict]: @@ -42,6 +43,26 @@ def _translate_tools(tools: list[dict]) -> list[dict]: return result +def _openai_cached_tokens(usage_obj) -> int: + """Best-effort cached-prompt-token count off an OpenAI usage object. + + OpenAI reports cached tokens as a subset of the prompt under + ``prompt_tokens_details.cached_tokens`` (Chat Completions) or + ``input_tokens_details.cached_tokens`` (Responses API). Returns 0 when the + field is absent — surfaced for telemetry only; it is NOT priced again on + top of the input rate (OpenAI already counts it in the input total). 
+ """ + if not usage_obj: + return 0 + details = ( + getattr(usage_obj, "prompt_tokens_details", None) + or getattr(usage_obj, "input_tokens_details", None) + ) + if details is None: + return 0 + return getattr(details, "cached_tokens", 0) or 0 + + def _translate_tool_choice(tool_choice: dict) -> dict | str: """Anthropic tool_choice -> OpenAI tool_choice.""" tc_type = tool_choice.get("type") @@ -720,13 +741,22 @@ async def complete( usage_obj = response.usage input_tokens = usage_obj.prompt_tokens if usage_obj else 0 + output_tokens = usage_obj.completion_tokens if usage_obj else 0 + # OpenAI folds cached tokens *into* prompt_tokens (a subset, already + # priced by the input rate), unlike Anthropic which reports them + # separately. So we surface cache_read for telemetry but do NOT add a + # cache term to compute_cost — that would double-count. Anton sends no + # cache_control today, so this is 0 in practice regardless. + cache_read = _openai_cached_tokens(usage_obj) return LLMResponse( content=content_text, tool_calls=tool_calls, usage=Usage( input_tokens=input_tokens, - output_tokens=usage_obj.completion_tokens if usage_obj else 0, + output_tokens=output_tokens, context_pressure=compute_context_pressure(model, input_tokens), + cache_read_tokens=cache_read, + cost_usd=compute_cost(model, input_tokens, output_tokens), ), stop_reason=choice.finish_reason, ) @@ -777,6 +807,7 @@ async def stream( tool_calls: list[ToolCall] = [] input_tokens = 0 output_tokens = 0 + cache_read = 0 stop_reason: str | None = None # Track tool call deltas by index @@ -788,6 +819,7 @@ async def stream( if chunk.usage: input_tokens = chunk.usage.prompt_tokens output_tokens = chunk.usage.completion_tokens + cache_read = _openai_cached_tokens(chunk.usage) if not chunk.choices: continue @@ -888,6 +920,8 @@ async def stream( input_tokens=input_tokens, output_tokens=output_tokens, context_pressure=compute_context_pressure(model, input_tokens), + cache_read_tokens=cache_read, + 
cost_usd=compute_cost(model, input_tokens, output_tokens), ), stop_reason=stop_reason, ) @@ -1009,6 +1043,7 @@ async def _stream_via_responses( tool_calls: list[ToolCall] = [] input_tokens = 0 output_tokens = 0 + cache_read = 0 stop_reason: str | None = None # Map output_index → in-flight function-call state. Responses API uses @@ -1083,6 +1118,7 @@ async def _stream_via_responses( if usage is not None: input_tokens = getattr(usage, "input_tokens", 0) or 0 output_tokens = getattr(usage, "output_tokens", 0) or 0 + cache_read = _openai_cached_tokens(usage) stop_reason = getattr(final_response, "status", None) except openai.BadRequestError as exc: msg = str(exc).lower() @@ -1119,6 +1155,8 @@ async def _stream_via_responses( input_tokens=input_tokens, output_tokens=output_tokens, context_pressure=compute_context_pressure(model, input_tokens), + cache_read_tokens=cache_read, + cost_usd=compute_cost(model, input_tokens, output_tokens), ), stop_reason=stop_reason, ) @@ -1161,6 +1199,7 @@ def _parse_response_object(response, model: str) -> LLMResponse: # which a bare getattr default does NOT catch. Mirrors the streaming path. input_tokens = (getattr(usage, "input_tokens", 0) or 0) if usage else 0 output_tokens = (getattr(usage, "output_tokens", 0) or 0) if usage else 0 + cache_read = _openai_cached_tokens(usage) return LLMResponse( content=content_text, @@ -1169,6 +1208,8 @@ def _parse_response_object(response, model: str) -> LLMResponse: input_tokens=input_tokens, output_tokens=output_tokens, context_pressure=compute_context_pressure(model, input_tokens), + cache_read_tokens=cache_read, + cost_usd=compute_cost(model, input_tokens, output_tokens), ), stop_reason=getattr(response, "status", None), ) diff --git a/anton/core/llm/pricing.py b/anton/core/llm/pricing.py new file mode 100644 index 00000000..d934275b --- /dev/null +++ b/anton/core/llm/pricing.py @@ -0,0 +1,109 @@ +"""Per-model USD price table + cost computation for LLM token usage. 
+ +This module is *purely additive* telemetry: it turns the token counts Anton +already records on every LLM call (see ``Usage`` in ``provider.py``) into a +dollar figure a host can surface as "$ this turn / $ this task". It does not +gate, cap, or alter any request — there is no budget object and no enforcement +here by design (those are later slices). + +Rates are USD per **one million** tokens, matching how providers publish them. +The table is matched by model-ID prefix in declaration order — exact/most +specific IDs first, family fallbacks last — mirroring the ``_CONTEXT_WINDOWS`` +table in ``provider.py`` so the two stay stylistically aligned. Models not in +the table price at ``0.0`` rather than raising: an unpriced model must never +break a turn, and a zero cost is an honest "we don't have a rate for this". + +Cache rates follow Anthropic's published multipliers relative to the base +input rate: a 5-minute cache *write* costs ~1.25x input, a cache *read* ~0.1x. +Anton does not enable prompt caching today (no ``cache_control`` is sent), so +``cache_write_tokens`` / ``cache_read_tokens`` are 0 in practice and the cache +terms contribute nothing — but the table and ``compute_cost`` carry them so the +meter stays correct the moment caching is turned on upstream. +""" +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ModelPrice: + """USD rates for one model, per **one million** tokens. + + ``cache_write`` / ``cache_read`` default to the Anthropic-published + multiples of ``input`` (1.25x write, 0.1x read) so a table entry only has + to specify the two base rates unless a provider prices caching differently. 
+ """ + + input: float + output: float + cache_write: float | None = None + cache_read: float | None = None + + def cache_write_rate(self) -> float: + return self.cache_write if self.cache_write is not None else self.input * 1.25 + + def cache_read_rate(self) -> float: + return self.cache_read if self.cache_read is not None else self.input * 0.1 + + +# Matched by prefix in order — exact model IDs first, family fallbacks last. +# Rates are USD per 1M tokens. Keep this list maintained as the set of models +# Anton actually runs changes; an absent model prices at 0.0 (see compute_cost). +_MODEL_PRICES: list[tuple[str, ModelPrice]] = [ + # Anton defaults (exact model IDs first — see anton/config/settings.py) + ("claude-sonnet-4-6", ModelPrice(input=3.00, output=15.00)), + ("claude-haiku-4-5-20251001", ModelPrice(input=1.00, output=5.00)), + # Claude families (most specific prefix first) + ("claude-opus-4", ModelPrice(input=5.00, output=25.00)), + ("claude-sonnet-4", ModelPrice(input=3.00, output=15.00)), + ("claude-haiku-4", ModelPrice(input=1.00, output=5.00)), + # OpenAI families (rates per 1M tokens) + ("gpt-5", ModelPrice(input=1.25, output=10.00)), + ("gpt-4.1", ModelPrice(input=2.00, output=8.00)), + ("gpt-4o", ModelPrice(input=2.50, output=10.00)), + ("o3", ModelPrice(input=2.00, output=8.00)), + ("o1", ModelPrice(input=15.00, output=60.00)), +] + +_PER_MILLION = 1_000_000.0 + + +def get_model_price(model: str) -> ModelPrice | None: + """Return the price entry whose prefix matches ``model``, or None. + + Matching is longest-declared-first by prefix, the same scheme + ``compute_context_pressure`` uses for context windows. 
+ """ + if not model: + return None + for prefix, price in _MODEL_PRICES: + if model.startswith(prefix): + return price + return None + + +def compute_cost( + model: str, + input_tokens: int | None, + output_tokens: int | None, + cache_write_tokens: int | None = 0, + cache_read_tokens: int | None = 0, +) -> float: + """Return the USD cost of one LLM call's token usage. + + Any token count may be ``None`` — some providers omit a usage field (e.g. + the MindsHub passthrough returns ``input_tokens=None`` on web-search + responses), so a missing count is treated as 0 rather than crashing, exactly + as ``compute_context_pressure`` does. An unknown/unpriced model returns + ``0.0``. The result is always a non-negative float in dollars. + """ + price = get_model_price(model) + if price is None: + return 0.0 + cost = ( + (input_tokens or 0) * price.input + + (output_tokens or 0) * price.output + + (cache_write_tokens or 0) * price.cache_write_rate() + + (cache_read_tokens or 0) * price.cache_read_rate() + ) + return cost / _PER_MILLION diff --git a/anton/core/llm/provider.py b/anton/core/llm/provider.py index 10727a20..3aa577e8 100644 --- a/anton/core/llm/provider.py +++ b/anton/core/llm/provider.py @@ -26,6 +26,16 @@ class Usage: input_tokens: int = 0 output_tokens: int = 0 context_pressure: float = 0.0 + # Cache-token counts, when the provider reports them. Anton does not send + # cache_control today, so these are 0 in practice — they exist so the USD + # cost meter (see pricing.compute_cost / cost_usd below) stays correct the + # moment prompt caching is enabled upstream. + cache_write_tokens: int = 0 + cache_read_tokens: int = 0 + # Additive telemetry: USD cost of this single call's tokens, priced by + # pricing.compute_cost(). 0.0 when the model has no maintained rate. This is + # read-only output for hosts ("$ this turn"); it gates nothing. 
+ cost_usd: float = 0.0 @dataclass diff --git a/tests/test_pricing.py b/tests/test_pricing.py new file mode 100644 index 00000000..52277cc8 --- /dev/null +++ b/tests/test_pricing.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +from anton.core.llm.pricing import ModelPrice, compute_cost, get_model_price +from anton.core.llm.provider import Usage + + +class TestGetModelPrice: + def test_exact_default_models_match(self): + # Anton's two default models (anton/config/settings.py) must be priced. + assert get_model_price("claude-sonnet-4-6") is not None + assert get_model_price("claude-haiku-4-5-20251001") is not None + + def test_dated_haiku_id_matches_exact_entry_before_family(self): + # The coding default is the dated id; it must resolve to the dated + # entry, not fall through to a looser family prefix. + price = get_model_price("claude-haiku-4-5-20251001") + assert price is not None and price.input == 1.00 and price.output == 5.00 + + def test_family_prefix_fallback(self): + # An opus model with a date suffix still matches the "claude-opus-4" + # family prefix. + price = get_model_price("claude-opus-4-8") + assert price is not None and price.input == 5.00 and price.output == 25.00 + + def test_unknown_model_returns_none(self): + assert get_model_price("totally-made-up-model") is None + + def test_empty_model_returns_none(self): + assert get_model_price("") is None + + +class TestModelPriceCacheRates: + def test_cache_rates_default_to_input_multiples(self): + # 1.25x write, 0.1x read, relative to the base input rate. 
+ p = ModelPrice(input=4.00, output=20.00) + assert p.cache_write_rate() == 5.00 + assert p.cache_read_rate() == 0.40 + + def test_explicit_cache_rates_win(self): + p = ModelPrice(input=4.00, output=20.00, cache_write=9.99, cache_read=0.01) + assert p.cache_write_rate() == 9.99 + assert p.cache_read_rate() == 0.01 + + +class TestComputeCost: + def test_input_and_output_priced_per_million(self): + # Sonnet: $3/1M in, $15/1M out → 1M each = $18.00 exactly. + assert compute_cost("claude-sonnet-4-6", 1_000_000, 1_000_000) == 18.00 + + def test_small_token_counts(self): + # Haiku: 1000 in @ $1/1M = $0.001, 2000 out @ $5/1M = $0.010 → $0.011. + assert compute_cost("claude-haiku-4-5-20251001", 1000, 2000) == 0.011 + + def test_opus_family(self): + # Opus: 100 in @ $5/1M + 50 out @ $25/1M = 0.0005 + 0.00125 = 0.00175. + assert compute_cost("claude-opus-4-8", 100, 50) == 0.00175 + + def test_cache_tokens_priced_additively(self): + # Anthropic reports cache tokens separately from input, so compute_cost + # adds them: sonnet cache_read 1M = 0.1 * $3 = $0.30. + cost = compute_cost("claude-sonnet-4-6", 0, 0, 0, 1_000_000) + assert round(cost, 6) == 0.30 + + def test_cache_write_more_expensive_than_read(self): + write_only = compute_cost("claude-sonnet-4-6", 0, 0, 1_000_000, 0) + read_only = compute_cost("claude-sonnet-4-6", 0, 0, 0, 1_000_000) + # 1.25x input vs 0.1x input. + assert round(write_only, 6) == 3.75 + assert write_only > read_only + + def test_full_breakdown_sums_all_four_components(self): + # input + output + cache_write + cache_read, all per-million. 
+ cost = compute_cost("claude-sonnet-4-6", 1_000_000, 1_000_000, 1_000_000, 1_000_000) + # 3 + 15 + 3.75 + 0.30 = 22.05 + assert round(cost, 6) == 22.05 + + def test_zero_tokens_is_zero(self): + assert compute_cost("claude-sonnet-4-6", 0, 0) == 0.0 + + def test_none_tokens_treated_as_zero_not_crash(self): + # Mirrors compute_context_pressure: the MindsHub passthrough can return + # usage.input_tokens=None; cost must not raise on None * float. + assert compute_cost("claude-sonnet-4-6", None, None) == 0.0 + assert compute_cost("claude-sonnet-4-6", None, 1_000_000) == 15.00 + + def test_unknown_model_costs_zero_not_crash(self): + # An unpriced model must never break a turn — price it at 0.0. + assert compute_cost("some-unlisted-model", 1_000_000, 1_000_000) == 0.0 + + def test_empty_model_costs_zero(self): + assert compute_cost("", 1000, 1000) == 0.0 + + +class TestUsageCarriesCost: + def test_usage_has_additive_cost_fields_with_safe_defaults(self): + # The cost meter rides on the existing Usage dataclass; defaults keep + # every prior construction site valid (additive-only change). + u = Usage(input_tokens=10, output_tokens=20) + assert u.cost_usd == 0.0 + assert u.cache_write_tokens == 0 + assert u.cache_read_tokens == 0 + + def test_usage_accepts_populated_cost(self): + u = Usage( + input_tokens=1_000_000, + output_tokens=1_000_000, + cost_usd=compute_cost("claude-sonnet-4-6", 1_000_000, 1_000_000), + ) + assert u.cost_usd == 18.00