diff --git a/.changeset/assemblyai-continuous-partials.md b/.changeset/assemblyai-continuous-partials.md new file mode 100644 index 000000000..e8895fdc6 --- /dev/null +++ b/.changeset/assemblyai-continuous-partials.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents-plugin-assemblyai": patch +--- + +feat(assemblyai): add continuousPartials and interruptionDelay streaming options diff --git a/plugins/assemblyai/src/stt.ts b/plugins/assemblyai/src/stt.ts index 871502814..de42f69b0 100644 --- a/plugins/assemblyai/src/stt.ts +++ b/plugins/assemblyai/src/stt.ts @@ -65,6 +65,17 @@ export interface STTOptions { /** Maximum silence (ms) before end-of-turn is forced regardless of confidence. */ maxTurnSilence?: number; formatTurns?: boolean; + /** + * Whether to emit additional partial transcripts during long turns at a steady + * cadence. Only supported with the `u3-rt-pro` model. Defaults to true for + * `u3-rt-pro`. + */ + continuousPartials?: boolean; + /** + * How soon the first early partial is emitted, in milliseconds. Only supported + * with the `u3-rt-pro` model. + */ + interruptionDelay?: number; keytermsPrompt?: string[]; /** Only supported with the `u3-rt-pro` model. */ prompt?: string; @@ -121,6 +132,18 @@ export class STT extends stt.STT { throw new Error("The 'prompt' parameter is only supported with the 'u3-rt-pro' model."); } + if (opts.continuousPartials !== undefined && opts.speechModel !== 'u3-rt-pro') { + throw new Error( + "The 'continuousPartials' parameter is only supported with the 'u3-rt-pro' model.", + ); + } + + if (opts.interruptionDelay !== undefined && opts.speechModel !== 'u3-rt-pro') { + throw new Error( + "The 'interruptionDelay' parameter is only supported with the 'u3-rt-pro' model.", + ); + } + const apiKey = opts.apiKey ?? defaultSTTOptions.apiKey; if (!apiKey) { throw new Error( @@ -130,12 +153,15 @@ export class STT extends stt.STT { // Minimize latency; matches LK's end-of-turn detector well. const minTurnSilence = opts.minTurnSilence ?? 100; + const continuousPartials = + opts.continuousPartials ?? (opts.speechModel === 'u3-rt-pro' ? true : undefined); this.#opts = { ...defaultSTTOptions, ...opts, apiKey, minTurnSilence, + continuousPartials, }; } @@ -210,6 +236,10 @@ export class SpeechStream extends stt.SpeechStream { if (opts.endOfTurnConfidenceThreshold !== undefined) { configMsg.end_of_turn_confidence_threshold = opts.endOfTurnConfidenceThreshold; } + if (opts.continuousPartials !== undefined) { + configMsg.continuous_partials = opts.continuousPartials; + } + if (opts.interruptionDelay !== undefined) configMsg.interruption_delay = opts.interruptionDelay; if (opts.vadThreshold !== undefined) configMsg.vad_threshold = opts.vadThreshold; // Only send if any actual fields (besides `type`) were specified. @@ -280,6 +310,8 @@ export class SpeechStream extends stt.SpeechStream { encoding: this.#opts.encoding, speech_model: this.#opts.speechModel, format_turns: this.#opts.formatTurns, + continuous_partials: this.#opts.continuousPartials, + interruption_delay: this.#opts.interruptionDelay, end_of_turn_confidence_threshold: this.#opts.endOfTurnConfidenceThreshold, min_turn_silence: minSilence, max_turn_silence: maxSilence,