livekit · charlotte-zhuang · May 24, 2026 · May 24, 2026 · May 24, 2026 · May 24, 2026
diff --git a/examples/other/cartesia.py b/examples/other/cartesia.py
@@ -0,0 +1,131 @@
+"""A LiveKit voice agent powered by Cartesia speech-to-text and text-to-speech.
+
+Requires ``CARTESIA_API_KEY`` from https://play.cartesia.ai/keys
+and one of:
+
+- ``LIVEKIT_INFERENCE_API_KEY`` + ``LIVEKIT_INFERENCE_API_SECRET``
+- or ``ANTHROPIC_API_KEY``
+- or ``GOOGLE_API_KEY``
+- or ``OPENAI_API_KEY``
+
+Run with:
+
+    uv run examples/other/cartesia.py
+"""
+
+import logging
+import os
+from collections.abc import Callable
+
+from dotenv import load_dotenv
+
+from livekit.agents import (
+    Agent,
+    AgentServer,
+    AgentSession,
+    JobContext,
+    MetricsCollectedEvent,
+    cli,
+    inference,
+    metrics,
+    room_io,
+)
+from livekit.agents.beta.tools.end_call import EndCallTool
+from livekit.agents.llm import LLM
+from livekit.plugins import anthropic, cartesia, google, openai
+
+
+class MyAgent(Agent):  # demo persona "Katie": fixed prompt plus an end-call tool
+    def __init__(self) -> None:
+        super().__init__(
+            instructions="your name is Katie, built by Cartesia."
+            " you would interact with users via voice."
+            " with that in mind, keep your responses concise and to the point."
+            " do not use emojis, asterisks, markdown, or other special characters in your responses."
+            " you are curious and friendly, and have a sense of humor."
+            " you will speak english to the user.",
+            tools=[EndCallTool()],
+        )
+
+    async def on_enter(self) -> None:  # asks the session for an opening greeting reply
+        self.session.generate_reply(instructions="greet the user and introduce yourself")
+
+
+def main() -> None:
+    """Check credentials, pick the first LLM that can be constructed, and run the agent server.
+
+    Raises:
+        ValueError: if ``CARTESIA_API_KEY`` is unset/empty or no LLM factory succeeded.
+    """
+    load_dotenv()
+    api_key = os.environ.get("CARTESIA_API_KEY")
+
+    # Candidates are tried in order and the first one that constructs wins.
+    # NOTE(review): only ValueError counts as "not configured" here; confirm each plugin
+    # raises ValueError (and nothing else) when its credentials are missing.
+    candidates: list[Callable[[], LLM]] = [
+        lambda: inference.LLM("google/gemini-3-flash"),
+        lambda: anthropic.LLM(model="claude-haiku-4-5"),
+        lambda: google.LLM(model="gemini-3.5-flash"),
+        lambda: openai.LLM(model="gpt-5.4-mini"),
+    ]
+
+    llm: LLM | None = None
+    for build_llm in candidates:
+        try:
+            llm = build_llm()
+        except ValueError:
+            continue
+        break
+
+    if not api_key or llm is None:
+        # Report every missing piece at once rather than stopping at the first.
+        problems: list[str] = []
+        if not api_key:
+            problems.append("CARTESIA_API_KEY is required")
+        if llm is None:
+            problems.append(
+                "No LLM keys were provided (e.g. LIVEKIT_INFERENCE_API_KEY + LIVEKIT_INFERENCE_API_SECRET,"
+                " ANTHROPIC_API_KEY, GOOGLE_API_KEY, or OPENAI_API_KEY)"
+            )
+        raise ValueError(". ".join(problems))
+
+    logger = logging.getLogger("cartesia-demo-agent")
+    server = AgentServer()
+
+    @server.rtc_session()
+    async def entrypoint(ctx: JobContext) -> None:
+        """Build a Cartesia STT/TTS session around the chosen LLM and start it in the room."""
+        ctx.log_context_fields = {"room": ctx.room.name}
+
+        speech_to_text = cartesia.STT(model="ink-2", api_key=api_key)
+        text_to_speech = cartesia.TTS(model="sonic-3.5", api_key=api_key)
+        session: AgentSession = AgentSession(
+            stt=speech_to_text,
+            llm=llm,
+            tts=text_to_speech,
+            # ink-2 does a great job without VAD
+            # you may use ink-2 with VAD if desired
+            turn_handling={"turn_detection": "stt"},
+        )
+
+        @session.on("metrics_collected")
+        def _on_metrics_collected(event: MetricsCollectedEvent) -> None:
+            metrics.log_metrics(event.metrics)
+
+        async def log_usage() -> None:
+            logger.info(f"Usage: {session.usage}")
+
+        # log_usage is registered as a shutdown callback before the session starts.
+        ctx.add_shutdown_callback(log_usage)
+
+        # Audio input uses AudioInputOptions() constructed with no overrides.
+        agent = MyAgent()
+        room_options = room_io.RoomOptions(audio_input=room_io.AudioInputOptions())
+        await session.start(agent=agent, room=ctx.room, room_options=room_options)
+
+    cli.run_app(server)
+
+
+if __name__ == "__main__":  # start the agent only when executed as a script
+    main()
diff --git a/examples/other/cartesia_transcribe_on_flush_eval.py b/examples/other/cartesia_transcribe_on_flush_eval.py
@@ -0,0 +1,129 @@
+"""Cartesia STT transcribe on flush use-case: WER example.
+
+If you're building your first voice agent, try examples/other/cartesia.py
+
+When configured to "transcribe_on_flush", Cartesia STT only emits a
+:attr:`~livekit.agents.stt.SpeechEventType.FINAL_TRANSCRIPT` when *you* call
+:meth:`~livekit.agents.stt.RecognizeStream.flush`.
+
+It never emits ``START_OF_SPEECH`` / ``END_OF_SPEECH``.
+
+That makes it a good fit for offline evaluation:
+you push known audio, call ``flush()`` at the segment boundaries you control,
+and score the final transcripts.
+
+This script is fully self-contained: it synthesizes the reference audio with
+``cartesia.TTS`` (so the TTS input text doubles as the WER reference), feeds it to
+``cartesia.STT(behavior="transcribe_on_flush")``, flushes once per segment, and prints the
+word error rate.
+
+Run with ``CARTESIA_API_KEY`` set:
+
+    uv run examples/other/cartesia_transcribe_on_flush_eval.py
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import re
+
+import aiohttp
+from dotenv import load_dotenv
+
+from livekit import rtc
+from livekit.agents import stt
+from livekit.plugins import cartesia
+
+# Each entry is flushed as its own segment. In a real eval these would be dataset utterances,
+# each with a ground truth. The second entry's leading space separates it after "".join().
+REFERENCE_SEGMENTS = [
+    "The quick brown fox jumps over the lazy dog.",
+    " Cartesia STT transcribes speech with low latency.",
+]
+
+
+def word_error_rate(reference: str, hypothesis: str) -> float:
+    """Return the word error rate of *hypothesis* measured against *reference*.
+
+    Both strings are lowercased, every run of characters other than ``a-z``, ``0-9``,
+    apostrophe and space becomes a single space, and the result is split on whitespace.
+    The score is the word-level Levenshtein distance divided by the reference word count.
+    An empty reference scores 0.0 against an empty hypothesis and 1.0 otherwise.
+
+    Dependency-free on purpose; for serious evaluation prefer a library such as ``jiwer``.
+    """
+    junk = re.compile(r"[^a-z0-9' ]+")
+    ref_words = junk.sub(" ", reference.lower()).split()
+    hyp_words = junk.sub(" ", hypothesis.lower()).split()
+    if not ref_words:
+        return 1.0 if hyp_words else 0.0
+
+    # Rolling-row dynamic programme: ``above[j]`` is the distance between the reference
+    # words consumed so far and the first ``j`` hypothesis words.
+    above = list(range(len(hyp_words) + 1))
+    for i, ref_word in enumerate(ref_words, start=1):
+        row = [i]
+        for j, hyp_word in enumerate(hyp_words, start=1):
+            insertion = row[j - 1] + 1
+            deletion = above[j] + 1
+            substitution = above[j - 1] + (ref_word != hyp_word)
+            row.append(min(insertion, deletion, substitution))
+        above = row
+    return above[-1] / len(ref_words)
+
+
+async def transcribe_segment(
+    logger: logging.Logger, stream: stt.RecognizeStream, audio: rtc.AudioFrame
+) -> str:
+    """Push *audio*, flush, and return the first final transcript ("" if the stream ends first)."""
+    stream.push_frame(audio)
+    # transcribe_on_flush emits exactly one FINAL_TRANSCRIPT per flush().
+    stream.flush()
+    async for event in stream:
+        if event.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
+            return event.alternatives[0].text
+        if event.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
+            logger.debug("interim: %s", event.alternatives[0].text)
+    return ""
+
+
+async def main() -> None:
+    """Log the WER of Cartesia STT on TTS-synthesized reference segments.
+
+    Each segment is synthesized, pushed, and flushed once; finals are joined before scoring.
+    """
+    load_dotenv()
+    logger = logging.getLogger("cartesia-transcribe-on-flush-eval")
+    logging.basicConfig(level=logging.INFO)
+
+    transcripts: list[str] = []
+    async with aiohttp.ClientSession() as http_session:
+        tts = cartesia.TTS(model="sonic-3.5", http_session=http_session)
+        recognizer = cartesia.STT(
+            model="ink-2", behavior="transcribe_on_flush", http_session=http_session
+        )
+        stream = recognizer.stream()
+        try:
+            for text in REFERENCE_SEGMENTS:
+                # Generate the reference audio with TTS so the example needs no audio file.
+                audio = await tts.synthesize(text).collect()
+                # RecognizeStream.push_frame resamples to the STT sample rate automatically.
+                transcript = await transcribe_segment(logger, stream, audio)
+                logger.info("segment final: %s", transcript)
+                transcripts.append(transcript)
+        finally:
+            # Close the stream even when synthesis or transcription fails part-way.
+            await stream.aclose()
+
+    # do not add or remove spaces when joining!
+    # Cartesia's API expects transcript chunks to be joined with no extra formatting
+    reference = "".join(REFERENCE_SEGMENTS)
+    hypothesis = "".join(transcripts)
+    logger.info("reference:  %s", reference)
+    logger.info("hypothesis: %s", hypothesis)
+    logger.info("WER: %.2f%%", word_error_rate(reference, hypothesis) * 100)
+
+
+if __name__ == "__main__":  # run the eval only when executed as a script
+    asyncio.run(main())