Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
0be8da0
feat(cartesia): add ink-2 stt
charlotte-zhuang May 24, 2026
7d0bcf5
feat(cartesia): create cartesia example agent
charlotte-zhuang May 24, 2026
f4c1549
docs(cartesia): add changeset
charlotte-zhuang May 24, 2026
3fe673b
refactor(cartesia): add types for server messages
charlotte-zhuang May 24, 2026
013ff24
fix(cartesia): raise api errors rather than swallowing
charlotte-zhuang May 24, 2026
682c1e2
fix(cartesia): tigthen which errors are ignored
charlotte-zhuang May 24, 2026
ce47013
ci(cartesia): add ink-whisper stt test
charlotte-zhuang May 24, 2026
623ea64
docs(cartesia): move where the api doc url is
charlotte-zhuang May 24, 2026
2beb3ea
fix(cartesia): handle flush sentinels with ink-whisper
charlotte-zhuang May 24, 2026
1ebaa4d
docs(cartesia): update changeset
charlotte-zhuang May 24, 2026
cc4747f
ci(cartesia): run make fix
charlotte-zhuang May 24, 2026
8d04745
fix(cartesia): correct error event schema
charlotte-zhuang May 24, 2026
66ebad8
fix(cartesia): send finalize when there is no audio
charlotte-zhuang May 24, 2026
d811bc0
fix(stt): reset state on reconnect
charlotte-zhuang May 24, 2026
02d380e
style(cartesia): remove override decorators
charlotte-zhuang May 26, 2026
5014d51
fix(cartesia): undo language type hint narrowing
charlotte-zhuang May 26, 2026
383ec38
feat(cartesia): support stream flushing
charlotte-zhuang May 27, 2026
29f36f6
feat(cartesia): support stream.flush()
charlotte-zhuang May 27, 2026
52c3d3e
remove unnecessary changeset
charlotte-zhuang May 27, 2026
3ec9c64
fix(cartesia): do not wait for keepalive task to finish before exiting
charlotte-zhuang May 27, 2026
af750cd
docs(cartesia): add docstring to cartesia.py example
charlotte-zhuang May 27, 2026
5501f65
docs(cartesia): add flush example
charlotte-zhuang May 27, 2026
8d69915
docs(cartesia): add flush eval example
charlotte-zhuang May 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions examples/other/cartesia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"""A LiveKit voice agent powered by Cartesia speech-to-text and text-to-speech.

Requires ``CARTESIA_API_KEY`` from https://play.cartesia.ai/keys
and one of:

- ``LIVEKIT_INFERENCE_API_KEY`` + ``LIVEKIT_INFERENCE_API_SECRET``
- or ``ANTHROPIC_API_KEY``
- or ``GOOGLE_API_KEY``
- or ``OPENAI_API_KEY``

Run with:

uv run examples/other/cartesia.py
"""

import logging
import os
from collections.abc import Callable

from dotenv import load_dotenv

from livekit.agents import (
Agent,
AgentServer,
AgentSession,
JobContext,
MetricsCollectedEvent,
cli,
inference,
metrics,
room_io,
)
from livekit.agents.beta.tools.end_call import EndCallTool
from livekit.agents.llm import LLM
from livekit.plugins import anthropic, cartesia, google, openai


class MyAgent(Agent):
    """Voice agent persona ("Katie") used by this example.

    Hands the persona prompt and a single ``EndCallTool`` instance to the
    ``Agent`` base class.
    """

    def __init__(self) -> None:
        # Adjacent string literals are concatenated into one prompt string.
        persona_prompt = (
            "your name is Katie, built by Cartesia."
            " you would interact with users via voice."
            " with that in mind, keep your responses concise and to the point."
            " do not use emojis, asterisks, markdown, or other special characters in your responses."
            " you are curious and friendly, and have a sense of humor."
            " you will speak english to the user."
        )
        super().__init__(instructions=persona_prompt, tools=[EndCallTool()])

    async def on_enter(self) -> None:
        """Ask the session to generate a reply that greets the user."""
        greeting_instructions = "greet the user and introduce yourself"
        self.session.generate_reply(instructions=greeting_instructions)


def main() -> None:
    """Validate credentials, build the agent server, and run the CLI app.

    Loads environment variables with ``load_dotenv()``, reads
    ``CARTESIA_API_KEY``, and tries each LLM factory in order, keeping the
    first one that constructs without raising ``ValueError``.

    Raises:
        ValueError: If ``CARTESIA_API_KEY`` is unset or empty, or if every LLM
            factory raised ``ValueError``. When both problems occur they are
            reported together in a single message.
    """
    load_dotenv()

    api_key = os.environ.get("CARTESIA_API_KEY")

    # Candidates are tried top to bottom; the first that constructs wins.
    llm_factories: list[Callable[[], LLM]] = [
        lambda: inference.LLM("google/gemini-3-flash"),
        lambda: anthropic.LLM(model="claude-haiku-4-5"),
        lambda: google.LLM(model="gemini-3.5-flash"),
        lambda: openai.LLM(model="gpt-5.4-mini"),
    ]

    llm: LLM | None = None
    for build_llm in llm_factories:
        try:
            llm = build_llm()
        except ValueError:
            # Presumably raised when the provider's credentials are missing;
            # fall through to the next candidate.
            continue
        break

    # Collect every configuration problem so the user sees them all at once.
    problems: list[str] = []
    if not api_key:
        problems.append("CARTESIA_API_KEY is required")
    if llm is None:
        problems.append(
            "No LLM keys were provided (e.g. LIVEKIT_INFERENCE_API_KEY + LIVEKIT_INFERENCE_API_SECRET,"
            " ANTHROPIC_API_KEY, GOOGLE_API_KEY, or OPENAI_API_KEY)"
        )
    if problems:
        raise ValueError(". ".join(problems))

    logger = logging.getLogger("cartesia-demo-agent")
    server = AgentServer()

    @server.rtc_session()
    async def entrypoint(ctx: JobContext) -> None:
        """Start an ``AgentSession`` with Cartesia STT/TTS in the job's room."""
        ctx.log_context_fields = {
            "room": ctx.room.name,
        }
        session: AgentSession = AgentSession(
            stt=cartesia.STT(
                model="ink-2",
                api_key=api_key,
            ),
            llm=llm,
            tts=cartesia.TTS(
                model="sonic-3.5",
                api_key=api_key,
            ),
            turn_handling={
                # ink-2 does a great job without VAD
                # you may use ink-2 with VAD if desired
                "turn_detection": "stt",
            },
        )

        @session.on("metrics_collected")
        def _on_metrics_collected(ev: MetricsCollectedEvent) -> None:
            metrics.log_metrics(ev.metrics)

        async def log_usage() -> None:
            # Lazy %-formatting: the message is only built if the record is emitted.
            logger.info("Usage: %s", session.usage)

        ctx.add_shutdown_callback(log_usage)

        await session.start(
            agent=MyAgent(),
            room=ctx.room,
            room_options=room_io.RoomOptions(
                audio_input=room_io.AudioInputOptions(),
            ),
        )

    cli.run_app(server)


# Run the agent only when this file is executed as a script.
if __name__ == "__main__":
    main()
129 changes: 129 additions & 0 deletions examples/other/cartesia_transcribe_on_flush_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""Cartesia STT transcribe on flush use-case: WER example.

If you're building your first voice agent, try examples/other/cartesia.py

When configured to "transcribe_on_flush", Cartesia STT only emits a
:attr:`~livekit.agents.stt.SpeechEventType.FINAL_TRANSCRIPT` when *you* call
:meth:`~livekit.agents.stt.RecognizeStream.flush`.

It never emits ``START_OF_SPEECH`` / ``END_OF_SPEECH``.

That makes it a good fit for offline evaluation:
you push known audio, call ``flush()`` at the segment boundaries you control,
and score the final transcripts.

This script is fully self-contained: it synthesizes the reference audio with
``cartesia.TTS`` (so the TTS input text doubles as the WER reference), feeds it to
``cartesia.STT(behavior="transcribe_on_flush")``, flushes once per segment, and prints the
word error rate.

Run with ``CARTESIA_API_KEY`` set:

uv run examples/other/cartesia_transcribe_on_flush_eval.py
"""

from __future__ import annotations

import asyncio
import logging
import re

import aiohttp
from dotenv import load_dotenv

from livekit import rtc
from livekit.agents import stt
from livekit.plugins import cartesia

# Each entry is flushed as its own segment. In a real eval these would be utterances
# from your dataset, each paired with a ground-truth transcript.
# The entries are later joined with "" to form the full reference, so the leading
# space on the second entry is what separates it from the first.
REFERENCE_SEGMENTS = [
    "The quick brown fox jumps over the lazy dog.",
    " Cartesia STT transcribes speech with low latency.",
]


def word_error_rate(reference: str, hypothesis: str) -> float:
    """Return the word error rate of *hypothesis* measured against *reference*.

    Both strings are lowercased and every run of characters other than
    ``a-z``, ``0-9``, apostrophe, and space is replaced by a space before
    splitting into words. The result is the word-level Levenshtein distance
    divided by the number of reference words, so it can exceed 1.0 when the
    hypothesis is much longer than the reference.

    If the reference has no words, the result is 0.0 for an empty hypothesis
    and 1.0 otherwise.

    Intentionally dependency-free so the example runs with only the plugins
    installed; for serious evaluation prefer a maintained library such as
    ``jiwer``.
    """

    def tokenize(raw: str) -> list[str]:
        lowered = raw.lower()
        return re.sub(r"[^a-z0-9' ]+", " ", lowered).split()

    ref_words = tokenize(reference)
    hyp_words = tokenize(hypothesis)
    if not ref_words:
        return 1.0 if hyp_words else 0.0

    # Single-row dynamic programming: distances[j] holds the edit distance
    # between the reference words consumed so far and the first j hypothesis words.
    distances = list(range(len(hyp_words) + 1))
    for row, ref_word in enumerate(ref_words, start=1):
        diagonal = distances[0]  # value from the previous row, column j - 1
        distances[0] = row
        for col, hyp_word in enumerate(hyp_words, start=1):
            above = distances[col]  # value from the previous row, column j
            substitution = diagonal if ref_word == hyp_word else diagonal + 1
            insertion = distances[col - 1] + 1
            deletion = above + 1
            distances[col] = min(insertion, deletion, substitution)
            diagonal = above
    return distances[-1] / len(ref_words)


async def transcribe_segment(
    logger: logging.Logger, stream: stt.RecognizeStream, audio: rtc.AudioFrame
) -> str:
    """Push one audio segment, flush the stream, and return its final transcript.

    Interim transcripts seen while waiting are logged at DEBUG level. Returns
    the text of the first alternative of the first ``FINAL_TRANSCRIPT`` event,
    or an empty string if the stream ends before one arrives.
    """
    stream.push_frame(audio)
    # In transcribe_on_flush mode each flush() is expected to yield exactly one
    # FINAL_TRANSCRIPT, so the first final event received belongs to this segment.
    stream.flush()

    final_type = stt.SpeechEventType.FINAL_TRANSCRIPT
    interim_type = stt.SpeechEventType.INTERIM_TRANSCRIPT
    async for event in stream:
        if event.type == final_type:
            return event.alternatives[0].text
        if event.type == interim_type:
            logger.debug("interim: %s", event.alternatives[0].text)
    return ""


async def main() -> None:
    """Synthesize each reference segment, transcribe it, and log the overall WER.

    For every entry in ``REFERENCE_SEGMENTS`` the text is turned into audio with
    ``cartesia.TTS``, pushed through one shared ``cartesia.STT`` stream with a
    flush per segment, and the final transcript is collected. The joined
    transcripts are then scored against the joined reference text.
    """
    load_dotenv()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("cartesia-transcribe-on-flush-eval")

    async with aiohttp.ClientSession() as http_session:
        tts = cartesia.TTS(model="sonic-3.5", http_session=http_session)
        speech_to_text = cartesia.STT(
            model="ink-2",
            behavior="transcribe_on_flush",
            http_session=http_session,
        )

        transcripts: list[str] = []
        stream = speech_to_text.stream()
        try:
            for expected_text in REFERENCE_SEGMENTS:
                # TTS produces the audio, so no audio file is needed and the
                # input text doubles as the ground truth.
                synthesized = await tts.synthesize(expected_text).collect()
                # RecognizeStream.push_frame resamples to the STT sample rate automatically.
                heard = await transcribe_segment(logger, stream, synthesized)
                logger.info("segment final: %s", heard)
                transcripts.append(heard)
        finally:
            # Always close the stream, even if synthesis or transcription failed.
            await stream.aclose()

        # Join with the empty string on purpose: do not add or remove spaces.
        # Cartesia's API expects transcript chunks to be joined with no extra formatting.
        full_reference = "".join(REFERENCE_SEGMENTS)
        full_hypothesis = "".join(transcripts)
        logger.info("reference: %s", full_reference)
        logger.info("hypothesis: %s", full_hypothesis)
        wer_percent = word_error_rate(full_reference, full_hypothesis) * 100
        logger.info("WER: %.2f%%", wer_percent)


# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
Loading
Loading