Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion anton/core/llm/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
Usage,
compute_context_pressure,
)
from .pricing import compute_cost

# Native server-side web tool type strings exposed by the Anthropic Messages API.
# The model invokes these inside the provider — Anton's tool-dispatch loop never
Expand Down Expand Up @@ -151,13 +152,24 @@ async def complete(
)

input_tokens = response.usage.input_tokens
output_tokens = response.usage.output_tokens
# Cache-token counts when present (Anthropic ships them on usage); 0
# otherwise. Anton sends no cache_control today, so these are normally
# 0 — read defensively so the cost meter is correct if that changes.
cache_write = getattr(response.usage, "cache_creation_input_tokens", 0) or 0
cache_read = getattr(response.usage, "cache_read_input_tokens", 0) or 0
return LLMResponse(
content=content_text,
tool_calls=tool_calls,
usage=Usage(
input_tokens=input_tokens,
output_tokens=response.usage.output_tokens,
output_tokens=output_tokens,
context_pressure=compute_context_pressure(model, input_tokens),
cache_write_tokens=cache_write,
cache_read_tokens=cache_read,
cost_usd=compute_cost(
model, input_tokens, output_tokens, cache_write, cache_read
),
),
stop_reason=response.stop_reason,
)
Expand Down Expand Up @@ -192,6 +204,8 @@ async def stream(
tool_calls: list[ToolCall] = []
input_tokens = 0
output_tokens = 0
cache_write = 0
cache_read = 0
stop_reason: str | None = None

# Track content blocks by index for tool correlation
Expand All @@ -204,6 +218,11 @@ async def stream(
usage = event.message.usage
input_tokens = usage.input_tokens
output_tokens = getattr(usage, "output_tokens", 0)
# Cache-token counts when present; 0 otherwise (Anton
# sends no cache_control today). Read defensively so the
# cost meter stays correct if caching is enabled later.
cache_write = getattr(usage, "cache_creation_input_tokens", 0) or 0
cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0

elif event.type == "content_block_start":
idx = event.index
Expand Down Expand Up @@ -294,6 +313,11 @@ async def stream(
input_tokens=input_tokens,
output_tokens=output_tokens,
context_pressure=compute_context_pressure(model, input_tokens),
cache_write_tokens=cache_write,
cache_read_tokens=cache_read,
cost_usd=compute_cost(
model, input_tokens, output_tokens, cache_write, cache_read
),
),
stop_reason=stop_reason,
)
Expand Down
43 changes: 42 additions & 1 deletion anton/core/llm/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
Usage,
compute_context_pressure,
)
from .pricing import compute_cost


def _translate_tools(tools: list[dict]) -> list[dict]:
Expand All @@ -42,6 +43,26 @@ def _translate_tools(tools: list[dict]) -> list[dict]:
return result


def _openai_cached_tokens(usage_obj) -> int:
"""Best-effort cached-prompt-token count off an OpenAI usage object.

OpenAI reports cached tokens as a subset of the prompt under
``prompt_tokens_details.cached_tokens`` (Chat Completions) or
``input_tokens_details.cached_tokens`` (Responses API). Returns 0 when the
field is absent — surfaced for telemetry only; it is NOT priced again on
top of the input rate (OpenAI already counts it in the input total).
"""
if not usage_obj:
return 0
details = (
getattr(usage_obj, "prompt_tokens_details", None)
or getattr(usage_obj, "input_tokens_details", None)
)
if details is None:
return 0
return getattr(details, "cached_tokens", 0) or 0


def _translate_tool_choice(tool_choice: dict) -> dict | str:
"""Anthropic tool_choice -> OpenAI tool_choice."""
tc_type = tool_choice.get("type")
Expand Down Expand Up @@ -720,13 +741,22 @@ async def complete(

usage_obj = response.usage
input_tokens = usage_obj.prompt_tokens if usage_obj else 0
output_tokens = usage_obj.completion_tokens if usage_obj else 0
# OpenAI folds cached tokens *into* prompt_tokens (a subset of the prompt),
# unlike Anthropic which reports them separately. So we surface cache_read
# for telemetry but do NOT add a cache term to compute_cost — that would
# double-count. NOTE: OpenAI prompt caching is automatic (no cache_control
# is involved), so cache_read can be non-zero; cached tokens are billed at a
# discount, so cost_usd is an upper bound whenever cache_read > 0.
cache_read = _openai_cached_tokens(usage_obj)
return LLMResponse(
content=content_text,
tool_calls=tool_calls,
usage=Usage(
input_tokens=input_tokens,
output_tokens=usage_obj.completion_tokens if usage_obj else 0,
output_tokens=output_tokens,
context_pressure=compute_context_pressure(model, input_tokens),
cache_read_tokens=cache_read,
cost_usd=compute_cost(model, input_tokens, output_tokens),
),
stop_reason=choice.finish_reason,
)
Expand Down Expand Up @@ -777,6 +807,7 @@ async def stream(
tool_calls: list[ToolCall] = []
input_tokens = 0
output_tokens = 0
cache_read = 0
stop_reason: str | None = None

# Track tool call deltas by index
Expand All @@ -788,6 +819,7 @@ async def stream(
if chunk.usage:
input_tokens = chunk.usage.prompt_tokens
output_tokens = chunk.usage.completion_tokens
cache_read = _openai_cached_tokens(chunk.usage)

if not chunk.choices:
continue
Expand Down Expand Up @@ -888,6 +920,8 @@ async def stream(
input_tokens=input_tokens,
output_tokens=output_tokens,
context_pressure=compute_context_pressure(model, input_tokens),
cache_read_tokens=cache_read,
cost_usd=compute_cost(model, input_tokens, output_tokens),
),
stop_reason=stop_reason,
)
Expand Down Expand Up @@ -1009,6 +1043,7 @@ async def _stream_via_responses(
tool_calls: list[ToolCall] = []
input_tokens = 0
output_tokens = 0
cache_read = 0
stop_reason: str | None = None

# Map output_index → in-flight function-call state. Responses API uses
Expand Down Expand Up @@ -1083,6 +1118,7 @@ async def _stream_via_responses(
if usage is not None:
input_tokens = getattr(usage, "input_tokens", 0) or 0
output_tokens = getattr(usage, "output_tokens", 0) or 0
cache_read = _openai_cached_tokens(usage)
stop_reason = getattr(final_response, "status", None)
except openai.BadRequestError as exc:
msg = str(exc).lower()
Expand Down Expand Up @@ -1119,6 +1155,8 @@ async def _stream_via_responses(
input_tokens=input_tokens,
output_tokens=output_tokens,
context_pressure=compute_context_pressure(model, input_tokens),
cache_read_tokens=cache_read,
cost_usd=compute_cost(model, input_tokens, output_tokens),
),
stop_reason=stop_reason,
)
Expand Down Expand Up @@ -1161,6 +1199,7 @@ def _parse_response_object(response, model: str) -> LLMResponse:
# which a bare getattr default does NOT catch. Mirrors the streaming path.
input_tokens = (getattr(usage, "input_tokens", 0) or 0) if usage else 0
output_tokens = (getattr(usage, "output_tokens", 0) or 0) if usage else 0
cache_read = _openai_cached_tokens(usage)

return LLMResponse(
content=content_text,
Expand All @@ -1169,6 +1208,8 @@ def _parse_response_object(response, model: str) -> LLMResponse:
input_tokens=input_tokens,
output_tokens=output_tokens,
context_pressure=compute_context_pressure(model, input_tokens),
cache_read_tokens=cache_read,
cost_usd=compute_cost(model, input_tokens, output_tokens),
),
stop_reason=getattr(response, "status", None),
)
109 changes: 109 additions & 0 deletions anton/core/llm/pricing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
"""Per-model USD price table + cost computation for LLM token usage.

This module is *purely additive* telemetry: it turns the token counts Anton
already records on every LLM call (see ``Usage`` in ``provider.py``) into a
dollar figure a host can surface as "$ this turn / $ this task". It does not
gate, cap, or alter any request — there is no budget object and no enforcement
here by design (those are later slices).

Rates are USD per **one million** tokens, matching how providers publish them.
The table is matched by model-ID prefix in declaration order — exact/most
specific IDs first, family fallbacks last — mirroring the ``_CONTEXT_WINDOWS``
table in ``provider.py`` so the two stay stylistically aligned. Models not in
the table price at ``0.0`` rather than raising: an unpriced model must never
break a turn, and a zero cost is an honest "we don't have a rate for this".

Cache rates follow Anthropic's published multipliers relative to the base
input rate: a 5-minute cache *write* costs ~1.25x input, a cache *read* ~0.1x.
Anton does not enable prompt caching today (no ``cache_control`` is sent), so
``cache_write_tokens`` / ``cache_read_tokens`` are 0 in practice and the cache
terms contribute nothing — but the table and ``compute_cost`` carry them so the
meter stays correct the moment caching is turned on upstream.
"""
from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class ModelPrice:
    """Immutable USD price card for a single model.

    Every rate is expressed per **one million** tokens. Only the two base
    rates are mandatory; the cache rates are optional overrides. When an
    override is left as ``None`` the effective rate is derived from ``input``
    using the Anthropic-published multiples (1.25x for a cache write, 0.1x for
    a cache read), so most table entries need just ``input`` and ``output``.
    """

    # Base rate for ordinary (uncached) prompt tokens.
    input: float
    # Rate for generated tokens.
    output: float
    # Explicit cache-write rate; None means "derive from input".
    cache_write: float | None = None
    # Explicit cache-read rate; None means "derive from input".
    cache_read: float | None = None

    def cache_write_rate(self) -> float:
        """Return the effective cache-write rate (override, else 1.25x input)."""
        if self.cache_write is None:
            return self.input * 1.25
        return self.cache_write

    def cache_read_rate(self) -> float:
        """Return the effective cache-read rate (override, else 0.1x input)."""
        if self.cache_read is None:
            return self.input * 0.1
        return self.cache_read


# Matched by prefix in order — exact model IDs first, family fallbacks last.
# Rates are USD per 1M tokens. Keep this list maintained as the set of models
# Anton actually runs changes; an absent model prices at 0.0 (see compute_cost).
#
# A family prefix also matches every longer sibling ID ("gpt-4o" matches
# "gpt-4o-mini"), so any variant whose rate differs from its family MUST be
# listed *before* the family fallback. Without those entries a mini/nano model
# is over-reported at the flagship rate and a pro model is under-reported.
# NOTE(review): rates are the provider-published list prices at the time of
# writing — re-verify against the pricing pages when touching this table.
_MODEL_PRICES: list[tuple[str, ModelPrice]] = [
    # Anton defaults (exact model IDs first — see anton/config/settings.py)
    ("claude-sonnet-4-6", ModelPrice(input=3.00, output=15.00)),
    ("claude-haiku-4-5-20251001", ModelPrice(input=1.00, output=5.00)),
    # Claude families (most specific prefix first). Opus 4.0 / 4.1 carry the
    # older, higher Opus rate and must not fall through to the cheaper
    # "claude-opus-4" fallback used by the later 4.x Opus models.
    ("claude-opus-4-1", ModelPrice(input=15.00, output=75.00)),
    ("claude-opus-4-0", ModelPrice(input=15.00, output=75.00)),
    ("claude-opus-4-2025", ModelPrice(input=15.00, output=75.00)),
    ("claude-opus-4", ModelPrice(input=5.00, output=25.00)),
    ("claude-sonnet-4", ModelPrice(input=3.00, output=15.00)),
    ("claude-haiku-4", ModelPrice(input=1.00, output=5.00)),
    # OpenAI families (rates per 1M tokens); sized/pro variants precede the
    # family fallback they would otherwise be swallowed by.
    ("gpt-5-mini", ModelPrice(input=0.25, output=2.00)),
    ("gpt-5-nano", ModelPrice(input=0.05, output=0.40)),
    ("gpt-5-pro", ModelPrice(input=15.00, output=120.00)),
    ("gpt-5", ModelPrice(input=1.25, output=10.00)),
    ("gpt-4.1-mini", ModelPrice(input=0.40, output=1.60)),
    ("gpt-4.1-nano", ModelPrice(input=0.10, output=0.40)),
    ("gpt-4.1", ModelPrice(input=2.00, output=8.00)),
    ("gpt-4o-mini", ModelPrice(input=0.15, output=0.60)),
    ("gpt-4o", ModelPrice(input=2.50, output=10.00)),
    ("o3-mini", ModelPrice(input=1.10, output=4.40)),
    ("o3-pro", ModelPrice(input=20.00, output=80.00)),
    ("o3", ModelPrice(input=2.00, output=8.00)),
    ("o1-mini", ModelPrice(input=1.10, output=4.40)),
    ("o1-pro", ModelPrice(input=150.00, output=600.00)),
    ("o1", ModelPrice(input=15.00, output=60.00)),
]

# Divisor that converts "tokens x rate-per-million" into dollars.
_PER_MILLION = 1_000_000.0


def get_model_price(model: str) -> ModelPrice | None:
    """Return the price entry whose prefix best matches ``model``, or None.

    Every ``(prefix, price)`` pair in ``_MODEL_PRICES`` whose prefix ``model``
    starts with is a candidate; the candidate with the **longest** prefix
    wins. Selecting by length (rather than returning the first hit) keeps the
    lookup correct even if a specific entry is accidentally declared after
    its family fallback. When two candidates have equally long prefixes the
    earlier declaration wins, so a correctly ordered table resolves exactly
    as a first-match scan would.

    Args:
        model: The model ID to price. Empty or otherwise falsy values are
            treated as unknown.

    Returns:
        The matching price entry, or ``None`` when ``model`` is falsy or no
        prefix matches.
    """
    if not model:
        return None

    best_price = None
    best_len = -1
    for prefix, price in _MODEL_PRICES:
        # Strict ">" keeps the first-declared entry on a length tie.
        if len(prefix) > best_len and model.startswith(prefix):
            best_price = price
            best_len = len(prefix)
    return best_price


def compute_cost(
    model: str,
    input_tokens: int | None,
    output_tokens: int | None,
    cache_write_tokens: int | None = 0,
    cache_read_tokens: int | None = 0,
) -> float:
    """Return the USD cost of one LLM call's token usage.

    Each token count is multiplied by the matching per-million rate of the
    model's price entry (looked up with ``get_model_price``) and the sum is
    converted to dollars.

    Any token count may be ``None`` — some providers omit a usage field (e.g.
    the MindsHub passthrough returns ``input_tokens=None`` on web-search
    responses) — so a missing count is treated as 0 rather than crashing.
    Negative counts are clamped to 0 as well, which is what makes the
    "never negative" guarantee below actually hold for malformed usage data.

    Args:
        model: Model ID used to look up the rates.
        input_tokens: Prompt tokens billed at the input rate.
        output_tokens: Generated tokens billed at the output rate.
        cache_write_tokens: Tokens billed at the cache-write rate.
        cache_read_tokens: Tokens billed at the cache-read rate.

    Returns:
        The cost in dollars as a non-negative float; ``0.0`` when the model
        has no price entry.
    """
    price = get_model_price(model)
    if price is None:
        return 0.0

    # (count, rate-per-million) pairs, in the order the terms are summed.
    billable = (
        (input_tokens, price.input),
        (output_tokens, price.output),
        (cache_write_tokens, price.cache_write_rate()),
        (cache_read_tokens, price.cache_read_rate()),
    )
    # "count or 0" maps None to 0; max(..., 0) discards negative counts.
    total = sum(max(count or 0, 0) * rate for count, rate in billable)
    return total / _PER_MILLION
10 changes: 10 additions & 0 deletions anton/core/llm/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ class Usage:
input_tokens: int = 0
output_tokens: int = 0
context_pressure: float = 0.0
# Cache-token counts, when the provider reports them. Anton does not send
# Anthropic cache_control today, so the Anthropic counts are 0 in practice;
# OpenAI caches prompts automatically, so cache_read_tokens may be non-zero
# there. They exist so the USD cost meter (see pricing.compute_cost /
# cost_usd below) stays correct the moment prompt caching is enabled upstream.
cache_write_tokens: int = 0
cache_read_tokens: int = 0
# Additive telemetry: USD cost of this single call's tokens, priced by
# pricing.compute_cost(). 0.0 when the model has no maintained rate. This is
# read-only output for hosts ("$ this turn"); it gates nothing.
cost_usd: float = 0.0


@dataclass
Expand Down
Loading
Loading