-
-
Notifications
You must be signed in to change notification settings - Fork 2.3k
feat(mimo): adapt MiMo-V2.5-TTS series with voicedesign and voiceclone support #8428
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,8 @@ | ||
| import base64 | ||
| import uuid | ||
| from pathlib import Path | ||
|
|
||
| from astrbot import logger | ||
|
|
||
| from ..entities import ProviderType | ||
| from ..provider import TTSProvider | ||
|
|
@@ -41,10 +44,20 @@ def __init__( | |
| self.seed_text = provider_config.get( | ||
| "mimo-tts-seed-text", DEFAULT_MIMO_TTS_SEED_TEXT | ||
| ) | ||
| self.user_prompt = provider_config.get("mimo-tts-user-prompt", "") | ||
| self.voice_audio_path = provider_config.get("mimo-tts-voice-audio-path", "") | ||
| self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL)) | ||
| self.client = create_http_client(self.timeout, self.proxy) | ||
|
|
||
| def _is_v2_5(self) -> bool: | ||
| """Check if the current model is a v2.5 series model.""" | ||
| return "v2.5" in self.model_name | ||
|
Comment on lines
+52
to
+54
|
||
|
|
||
| def _build_user_prompt(self) -> str | None: | ||
| # For voicedesign models, custom user prompt takes precedence. | ||
| if "voicedesign" in self.model_name and self.user_prompt.strip(): | ||
| return self.user_prompt.strip() | ||
| # For other models, use seed_text as fallback. | ||
| seed_text = self.seed_text.strip() | ||
| return seed_text or None | ||
|
|
||
|
|
@@ -62,13 +75,36 @@ def _build_style_prefix(self) -> str: | |
|
|
||
| # MiMo recommends using only the singing style tag at the very beginning. | ||
| if "唱歌" in style_content: | ||
| # v2.5 uses parentheses; v2 uses <style> tags. | ||
| if self._is_v2_5(): | ||
| return "(唱歌)" | ||
| return "<style>唱歌</style>" | ||
|
|
||
| # v2.5 uses parentheses; v2 uses <style> tags. | ||
| if self._is_v2_5(): | ||
| return f"({style_content})" | ||
| return f"<style>{style_content}</style>" | ||
|
|
||
| def _build_assistant_content(self, text: str) -> str: | ||
| return f"{self._build_style_prefix()}{text}" | ||
|
|
||
| def _read_voice_audio_base64(self) -> str: | ||
| if not self.voice_audio_path.strip(): | ||
| return "" | ||
| path = Path(self.voice_audio_path.strip()) | ||
| if not path.exists(): | ||
| logger.warning("Voice audio file not found: %s", path) | ||
| return "" | ||
| try: | ||
| suffix = path.suffix.lower().lstrip(".") | ||
| mime_map = {"wav": "audio/wav", "mp3": "audio/mpeg", "ogg": "audio/ogg"} | ||
| mime = mime_map.get(suffix, "audio/wav") | ||
|
Comment on lines
+98
to
+101
|
||
| b64 = base64.b64encode(path.read_bytes()).decode("utf-8") | ||
| return f"data:{mime};base64,{b64}" | ||
| except Exception as exc: | ||
| logger.warning("Failed to read voice audio file %s: %s", path, exc) | ||
| return "" | ||
|
Comment on lines
+91
to
+106
Contributor
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reading the voice audio file from disk and base64-encoding it on every single payload construction is highly inefficient and blocks the single-threaded asyncio event loop. Since the clone audio file is static for the lifetime of the provider, we should cache the base64-encoded result after the first read. Because this helper is a synchronous function, modifying the shared cache state is safe from race conditions in the single-threaded asyncio event loop. Additionally, instead of hardcoding a limited set of audio formats in `mime_map`, the suggested version below derives the MIME type with `mimetypes.guess_type`:

```python
def _read_voice_audio_base64(self) -> str:
    if self._voice_audio_cache is not None:
        return self._voice_audio_cache
    if not self.voice_audio_path.strip():
        self._voice_audio_cache = ""
        return ""
    path = Path(self.voice_audio_path.strip())
    if not path.exists():
        logger.warning("Voice audio file not found: %s", path)
        self._voice_audio_cache = ""
        return ""
    try:
        import mimetypes
        mime, _ = mimetypes.guess_type(str(path))
        mime = mime or "audio/wav"
        b64 = base64.b64encode(path.read_bytes()).decode("utf-8")
        self._voice_audio_cache = f"data:{mime};base64,{b64}"
        return self._voice_audio_cache
    except Exception as exc:
        logger.warning("Failed to read voice audio file %s: %s", path, exc)
        self._voice_audio_cache = ""
        return ""
```

References
|
||
|
|
||
| def _build_payload(self, text: str) -> dict: | ||
| messages: list[dict[str, str]] = [] | ||
|
|
||
|
|
@@ -88,10 +124,16 @@ def _build_payload(self, text: str) -> dict: | |
| } | ||
| ) | ||
|
|
||
| audio_params = {"format": self.audio_format} | ||
| # voice design 模型不支持 audio.voice 参数 | ||
| audio_params: dict[str, str] = {"format": self.audio_format} | ||
| if "voicedesign" not in self.model_name: | ||
| audio_params["voice"] = self.voice | ||
| if "voiceclone" in self.model_name: | ||
| voice_audio_b64 = self._read_voice_audio_base64() | ||
| if voice_audio_b64: | ||
| audio_params["voice"] = voice_audio_b64 | ||
|
Comment on lines
+129
to
+132
|
||
| else: | ||
| audio_params["voice"] = self.voice | ||
| else: | ||
| audio_params["voice"] = self.voice | ||
|
|
||
| return { | ||
| "model": self.model_name, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Defensively handle cases where `mimo-tts-user-prompt` or `mimo-tts-voice-audio-path` might be configured as `None` (e.g., when cleared in the UI or parsed from null JSON values). Using `or ""` ensures they are always initialized as strings, preventing potential `AttributeError` crashes when calling `.strip()` later. Also, initialize a cache variable for the base64-encoded voice audio to avoid repeated disk reads.