Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions astrbot/core/config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -1627,6 +1627,8 @@
"mimo-tts-style-prompt": "",
"mimo-tts-dialect": "",
"mimo-tts-seed-text": "Hello, MiMo, have you had lunch?",
"mimo-tts-user-prompt": "",
"mimo-tts-voice-audio-path": "",
"timeout": "20",
"proxy": "",
},
Expand Down Expand Up @@ -2590,18 +2592,28 @@
"mimo-tts-style-prompt": {
"description": "风格提示词",
"type": "string",
"hint": "会以 <style>...</style> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。可留空。",
"hint": "用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。v2 系列会以 <style>...</style> 标签拼接到文本开头,v2.5 系列会以(...)括号形式拼接。可留空。",
},
"mimo-tts-dialect": {
"description": "方言",
"type": "string",
"hint": "会与风格提示词一起写入开头的 <style>...</style> 标签中,例如 东北话、四川话、河南话、粤语。可留空。",
"hint": "与风格提示词一起拼接到文本开头,例如 东北话、四川话、河南话、粤语。v2 系列使用 <style> 标签,v2.5 系列使用()括号。可留空。",
},
"mimo-tts-seed-text": {
"description": "种子文本",
"type": "string",
"hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。",
},
"mimo-tts-user-prompt": {
"description": "用户提示词",
"type": "string",
"hint": "自定义 user 角色消息。使用 voicedesign 模型时为必填项,用于通过自然语言描述音色。其他模型可留空(优先级高于种子文本)。",
},
"mimo-tts-voice-audio-path": {
"description": "克隆音频文件路径",
"type": "string",
"hint": "voiceclone 模型的待模拟音频文件的本地路径,例如 /path/to/audio.wav。仅 voiceclone 模型需要填写。",
},
"fishaudio-tts-character": {
"description": "character",
"type": "string",
Expand Down
48 changes: 45 additions & 3 deletions astrbot/core/provider/sources/mimo_tts_api_source.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import base64
import uuid
from pathlib import Path

from astrbot import logger

from ..entities import ProviderType
from ..provider import TTSProvider
Expand Down Expand Up @@ -41,10 +44,20 @@ def __init__(
self.seed_text = provider_config.get(
"mimo-tts-seed-text", DEFAULT_MIMO_TTS_SEED_TEXT
)
self.user_prompt = provider_config.get("mimo-tts-user-prompt", "")
self.voice_audio_path = provider_config.get("mimo-tts-voice-audio-path", "")
Comment on lines +47 to +48
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Defensively handle cases where mimo-tts-user-prompt or mimo-tts-voice-audio-path might be configured as None (e.g., when cleared in the UI or parsed from null JSON values). Using or "" ensures they are always initialized as strings, preventing potential AttributeError crashes when calling .strip() later. Also, initialize a cache variable for the base64-encoded voice audio to avoid repeated disk reads.

Suggested change
self.user_prompt = provider_config.get("mimo-tts-user-prompt", "")
self.voice_audio_path = provider_config.get("mimo-tts-voice-audio-path", "")
self.user_prompt = provider_config.get("mimo-tts-user-prompt") or ""
self.voice_audio_path = provider_config.get("mimo-tts-voice-audio-path") or ""
self._voice_audio_cache: str | None = None

self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL))
self.client = create_http_client(self.timeout, self.proxy)

def _is_v2_5(self) -> bool:
    """Tell whether the configured model name carries the ``v2.5`` marker.

    This is a plain, case-sensitive substring test on ``self.model_name``.
    """
    series_marker = "v2.5"
    configured_model = self.model_name
    return series_marker in configured_model
Comment on lines +52 to +54

def _build_user_prompt(self) -> str | None:
# For voicedesign models, custom user prompt takes precedence.
if "voicedesign" in self.model_name and self.user_prompt.strip():
return self.user_prompt.strip()
# For other models, use seed_text as fallback.
seed_text = self.seed_text.strip()
return seed_text or None

Expand All @@ -62,13 +75,36 @@ def _build_style_prefix(self) -> str:

# MiMo recommends using only the singing style tag at the very beginning.
if "唱歌" in style_content:
# v2.5 uses parentheses; v2 uses <style> tags.
if self._is_v2_5():
return "(唱歌)"
return "<style>唱歌</style>"

# v2.5 uses parentheses; v2 uses <style> tags.
if self._is_v2_5():
return f"({style_content})"
return f"<style>{style_content}</style>"

def _build_assistant_content(self, text: str) -> str:
    """Prepend the prefix from ``_build_style_prefix()`` to ``text``."""
    style_prefix = self._build_style_prefix()
    return "{}{}".format(style_prefix, text)

def _read_voice_audio_base64(self) -> str:
    """Load the configured clone-audio file as a base64 ``data:`` URI.

    Returns:
        ``data:<mime>;base64,<payload>`` built from the file at
        ``self.voice_audio_path``, or an empty string when no path is
        configured, the path is not a regular file, or the read fails.
        Problems are logged as warnings instead of being raised.
    """
    # The setting may be None rather than ""; normalise before stripping so
    # a cleared value behaves like an empty path.
    raw_path = (self.voice_audio_path or "").strip()
    if not raw_path:
        return ""

    path = Path(raw_path)
    # is_file() also rejects directories, which exists() would let through.
    if not path.is_file():
        logger.warning("Voice audio file not found: %s", path)
        return ""

    # Only the read itself can fail with an I/O error; keep the try minimal.
    try:
        audio_bytes = path.read_bytes()
    except OSError as exc:
        logger.warning("Failed to read voice audio file %s: %s", path, exc)
        return ""

    mime_map = {"wav": "audio/wav", "mp3": "audio/mpeg", "ogg": "audio/ogg"}
    # Extensions outside the map are labelled as WAV.
    suffix = path.suffix.lower().lstrip(".")
    mime = mime_map.get(suffix, "audio/wav")
    b64 = base64.b64encode(audio_bytes).decode("utf-8")
    return f"data:{mime};base64,{b64}"
Comment on lines +91 to +106
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Reading the voice audio file from disk and base64-encoding it on every single payload construction is highly inefficient and blocks the single-threaded asyncio event loop. Since the clone audio file is static for the lifetime of the provider, we should cache the base64-encoded result after the first read. Because this helper is a synchronous function, modifying the shared cache state is safe from race conditions in the single-threaded asyncio event loop.

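Additionally, rather than relying only on the small hardcoded mime_map, we can fall back to Python's standard-library mimetypes module for extensions the map does not cover. The explicit entries should stay, though: mimetypes.guess_type() commonly reports .wav as audio/x-wav (and its answers can vary with the Python version and the system MIME tables), whereas the current code sends audio/wav.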

    def _read_voice_audio_base64(self) -> str:
        """Return the clone audio as a base64 ``data:`` URI, caching successes.

        The first successful read is stored on ``self._voice_audio_cache`` and
        reused by later calls. Failures are not cached, so a file that appears
        later at the configured path is still picked up; the price is one
        warning per call while it is missing.

        Returns:
            ``data:<mime>;base64,<payload>``, or an empty string when no path
            is configured, the path is not a regular file, or the read fails.
        """
        # getattr keeps this safe even when the attribute was never set.
        cached = getattr(self, "_voice_audio_cache", None)
        if cached:
            return cached

        # The setting may be None rather than ""; treat that as "no path".
        raw_path = (self.voice_audio_path or "").strip()
        if not raw_path:
            return ""

        path = Path(raw_path)
        # is_file() also rejects directories, which exists() would let through.
        if not path.is_file():
            logger.warning("Voice audio file not found: %s", path)
            return ""

        try:
            audio_bytes = path.read_bytes()
        except OSError as exc:
            logger.warning("Failed to read voice audio file %s: %s", path, exc)
            return ""

        # Explicit entries come first: mimetypes commonly reports ".wav" as
        # "audio/x-wav", which would change the URI sent for WAV files.
        known_types = {".wav": "audio/wav", ".mp3": "audio/mpeg", ".ogg": "audio/ogg"}
        mime = known_types.get(path.suffix.lower())
        if mime is None:
            import mimetypes

            guessed, _ = mimetypes.guess_type(path.name)
            mime = guessed or "audio/wav"

        b64 = base64.b64encode(audio_bytes).decode("utf-8")
        self._voice_audio_cache = f"data:{mime};base64,{b64}"
        return self._voice_audio_cache
References
  1. In a single-threaded asyncio event loop, synchronous functions (code blocks without 'await') are executed atomically and will not be interrupted by other coroutines. Therefore, they are safe from race conditions when modifying shared state within that block.


def _build_payload(self, text: str) -> dict:
messages: list[dict[str, str]] = []

Expand All @@ -88,10 +124,16 @@ def _build_payload(self, text: str) -> dict:
}
)

audio_params = {"format": self.audio_format}
# voice design 模型不支持 audio.voice 参数
audio_params: dict[str, str] = {"format": self.audio_format}
if "voicedesign" not in self.model_name:
audio_params["voice"] = self.voice
if "voiceclone" in self.model_name:
voice_audio_b64 = self._read_voice_audio_base64()
if voice_audio_b64:
audio_params["voice"] = voice_audio_b64
Comment on lines +129 to +132
else:
audio_params["voice"] = self.voice
else:
audio_params["voice"] = self.voice

return {
"model": self.model_name,
Expand Down
12 changes: 10 additions & 2 deletions dashboard/src/i18n/locales/en-US/features/config-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -1536,16 +1536,24 @@
},
"mimo-tts-style-prompt": {
"description": "Style prompt",
"hint": "Prepended to the synthesis target text as a <style>...</style> tag to control speed, emotion, character, or style, such as happy, faster, Sun Wukong, or whispering. Optional."
"hint": "Controls speed, emotion, character, or style, such as happy, faster, Sun Wukong, or whispering. v2 series prepends a <style>...</style> tag; v2.5 series uses (...) parentheses. Optional."
},
"mimo-tts-dialect": {
"description": "Dialect",
"hint": "Combined with the style prompt inside the leading <style>...</style> tag, for example Northeastern Mandarin, Sichuan dialect, Henan dialect, or Cantonese. Optional."
"hint": "Combined with the style prompt at the beginning of the text, for example Northeastern Mandarin, Sichuan dialect, Henan dialect, or Cantonese. v2 series uses <style> tags; v2.5 series uses () parentheses. Optional."
},
"mimo-tts-seed-text": {
"description": "Seed text",
"hint": "Sent as an optional user message to help guide tone and speaking style. It is not appended to the synthesis target text."
},
"mimo-tts-user-prompt": {
"description": "User prompt",
"hint": "Custom user-role message. Required for voicedesign models, where it describes the voice in natural language and takes precedence over the seed text. Other models ignore it, so it can be left empty."
},
"mimo-tts-voice-audio-path": {
"description": "Clone audio file path",
"hint": "Local path to the audio file for voiceclone models, e.g. /path/to/audio.wav. Only needed for voiceclone models."
},
"fishaudio-tts-character": {
"description": "character",
"hint": "Fishaudio TTS character. Default is Klee. More roles: https://fish.audio/zh-CN/discovery"
Expand Down
12 changes: 10 additions & 2 deletions dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -1533,16 +1533,24 @@
},
"mimo-tts-style-prompt": {
"description": "Подсказка стиля",
"hint": "Добавляется в начало синтезируемого текста в виде тега <style>...</style> и управляет скоростью, эмоцией, ролью или манерой речи. Необязательно."
"hint": "Управляет скоростью, эмоцией, ролью или манерой речи. Серия v2 добавляет тег <style>...</style>; серия v2.5 использует скобки (...). Необязательно."
},
"mimo-tts-dialect": {
"description": "Диалект",
"hint": "Объединяется с подсказкой стиля внутри начального тега <style>...</style>, например северо-восточный, сычуаньский, хэнаньский или кантонский вариант речи. Необязательно."
"hint": "Объединяется с подсказкой стиля в начале текста, например северо-восточный, сычуаньский, хэнаньский или кантонский. Серия v2 использует теги <style>; серия v2.5 использует скобки (). Необязательно."
},
"mimo-tts-seed-text": {
"description": "Начальный текст",
"hint": "Отправляется как необязательное user-сообщение для настройки тона и манеры речи. Не добавляется к самому тексту синтеза."
},
"mimo-tts-user-prompt": {
"description": "User-подсказка",
"hint": "Пользовательское user-сообщение. Обязательно для voicedesign-моделей для описания голоса на естественном языке. Для других моделей оставьте пустым (имеет приоритет над начальным текстом)."
},
"mimo-tts-voice-audio-path": {
"description": "Путь к аудио для клонирования",
"hint": "Локальный путь к аудиофайлу для voiceclone-моделей, например /path/to/audio.wav. Нужно только для voiceclone."
},
"fishaudio-tts-character": {
"description": "Персонаж",
"hint": "Персонаж Fishaudio. По умолчанию Klee."
Expand Down
12 changes: 10 additions & 2 deletions dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -1538,16 +1538,24 @@
},
"mimo-tts-style-prompt": {
"description": "风格提示词",
"hint": "会以 <style>...</style> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。可留空。"
"hint": "用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。v2 系列会以 <style>...</style> 标签拼接到文本开头,v2.5 系列会以(...)括号形式拼接。可留空。"
},
"mimo-tts-dialect": {
"description": "方言",
"hint": "会与风格提示词一起写入开头的 <style>...</style> 标签中,例如 东北话、四川话、河南话、粤语。可留空。"
"hint": "与风格提示词一起拼接到文本开头,例如 东北话、四川话、河南话、粤语。v2 系列使用 <style> 标签,v2.5 系列使用()括号。可留空。"
},
"mimo-tts-seed-text": {
"description": "种子文本",
"hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。"
},
"mimo-tts-user-prompt": {
"description": "用户提示词",
"hint": "自定义 user 角色消息。使用 voicedesign 模型时为必填项,用于通过自然语言描述音色。其他模型可留空(优先级高于种子文本)。"
},
"mimo-tts-voice-audio-path": {
"description": "克隆音频文件路径",
"hint": "voiceclone 模型的待模拟音频文件的本地路径,例如 /path/to/audio.wav。仅 voiceclone 模型需要填写。"
},
"fishaudio-tts-character": {
"description": "character",
"hint": "fishaudio TTS 的角色。默认为可莉。更多角色请访问:https://fish.audio/zh-CN/discovery"
Expand Down
103 changes: 103 additions & 0 deletions tests/test_mimo_api_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,3 +268,106 @@ async def _post(*_args, **_kwargs):
return response

return _post


def test_mimo_tts_v2_5_style_uses_parentheses():
    """A v2.5 model wraps the combined style and dialect in parentheses."""
    config = {
        "model": "mimo-v2.5-tts",
        "mimo-tts-style-prompt": "开心",
        "mimo-tts-dialect": "四川话",
        "mimo-tts-seed-text": "",
    }
    provider = _make_tts_provider(config)
    try:
        first_message = provider._build_payload("hello")["messages"][0]
        assert first_message["content"] == "(开心 四川话)hello"
    finally:
        # Always release the provider, even when the assertion fails.
        asyncio.run(provider.terminate())


def test_mimo_tts_v2_5_singing_uses_parentheses():
    """A v2.5 style prompt containing 唱歌 yields only the singing tag, in parentheses."""
    config = {
        "model": "mimo-v2.5-tts",
        "mimo-tts-style-prompt": "唱歌 开心",
        "mimo-tts-dialect": "",
    }
    provider = _make_tts_provider(config)
    try:
        # This test inspects the second message of the payload.
        second_message = provider._build_payload("歌词")["messages"][1]
        assert second_message["content"] == "(唱歌)歌词"
    finally:
        asyncio.run(provider.terminate())


def test_mimo_tts_voicedesign_uses_custom_user_prompt():
    """A voicedesign model sends the custom user prompt as the first message."""
    config = {
        "model": "mimo-v2.5-tts-voicedesign",
        "mimo-tts-user-prompt": "用活泼的声音",
        "mimo-tts-seed-text": "",
    }
    expected_message = {"role": "user", "content": "用活泼的声音"}
    provider = _make_tts_provider(config)
    try:
        messages = provider._build_payload("hello")["messages"]
        assert messages[0] == expected_message
    finally:
        asyncio.run(provider.terminate())


def test_mimo_tts_voicedesign_seed_text_used_when_no_user_prompt():
    """With an empty user prompt, a voicedesign model sends the seed text instead."""
    config = {
        "model": "mimo-v2.5-tts-voicedesign",
        "mimo-tts-user-prompt": "",
        "mimo-tts-seed-text": "fallback seed",
    }
    expected_message = {"role": "user", "content": "fallback seed"}
    provider = _make_tts_provider(config)
    try:
        messages = provider._build_payload("hello")["messages"]
        assert messages[0] == expected_message
    finally:
        asyncio.run(provider.terminate())


def test_mimo_tts_voiceclone_uses_voice_audio(monkeypatch, tmp_path):
    """A voiceclone model sends the configured audio file as a base64 data URI."""
    import base64

    raw_audio = b"fake audio data"
    sample = tmp_path / "test.wav"
    sample.write_bytes(raw_audio)
    config = {
        "model": "mimo-v2.5-tts-voiceclone",
        "mimo-tts-voice-audio-path": str(sample),
        "mimo-tts-seed-text": "",
    }
    provider = _make_tts_provider(config)
    try:
        voice = provider._build_payload("hello")["audio"]["voice"]
        encoded = base64.b64encode(raw_audio).decode("utf-8")
        assert voice == f"data:audio/wav;base64,{encoded}"
    finally:
        asyncio.run(provider.terminate())


def test_mimo_tts_voiceclone_falls_back_to_voice_when_no_audio():
    """With no clone-audio path, a voiceclone model sends the configured preset voice."""
    config = {
        "model": "mimo-v2.5-tts-voiceclone",
        "mimo-tts-voice": "preset_voice",
        "mimo-tts-voice-audio-path": "",
        "mimo-tts-seed-text": "",
    }
    provider = _make_tts_provider(config)
    try:
        audio_section = provider._build_payload("hello")["audio"]
        assert audio_section["voice"] == "preset_voice"
    finally:
        asyncio.run(provider.terminate())