# Copyright 2025 Xiaomi Corporation.
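"""Gradio web demo for MiMo-Audio.

Exposes five tasks behind a tabbed interface: audio understanding,
instruction-controlled TTS (text-to-speech), spoken dialogue,
speech-to-text dialogue, and text-to-text dialogue. All tabs share a
single MimoAudio model that is loaded on demand from the
"Model Configuration" tab.

Typical launch (host and port are this script's defaults, adjust as needed):

    python run_mimo_audio.py --host 0.0.0.0 --port 7897
"""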
import gradio as gr
import torch
import os
import tempfile
import argparse
from pathlib import Path
from src.mimo_audio.mimo_audio import MimoAudio
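# The generator classes below are thin task-specific wrappers around a shared
# MimoAudio instance: each simply forwards its arguments to the corresponding
# *_sft method on the model and, where relevant, makes sure the output
# directory exists.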
class TTSGenerator:
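    """Text-to-speech, optionally guided by a natural-language style instruction."""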
def __init__(self, model, device=None):
self.model = model
self.device = device
def generate(self, text, instruct, output_audio_path):
path = Path(output_audio_path)
path.parent.mkdir(parents=True, exist_ok=True)
text_output = self.model.tts_sft(text, output_audio_path, instruct)
return text_output
class AudioUnderstandingGenerator:
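    """Answers a text question about an input audio clip."""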
def __init__(self, model, device=None):
self.model = model
self.device = device
def generate(self, input_speech, input_text, thinking=False):
text = self.model.audio_understanding_sft(input_speech, input_text, thinking=thinking)
return text
class SpokenDialogueGenerator:
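    """Speech-in / speech-out dialogue, with optional voice prompt, system prompt, and history."""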
def __init__(self, model, device=None):
self.model = model
self.device = device
def generate(self, input_speech, output_audio_path, system_prompt="You are MiMo-Audio, a friendly AI assistant and your response needs to be concise.", prompt_speech=None, add_history=False):
path = Path(output_audio_path)
path.parent.mkdir(parents=True, exist_ok=True)
text_response = self.model.spoken_dialogue_sft(input_speech, output_audio_path, system_prompt=system_prompt, prompt_speech=prompt_speech, add_history=add_history)
return text_response
def clear_history(self):
self.model.clear_history()
class Speech2TextDialogueGenerator:
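    """Speech-in / text-out dialogue, with optional thinking mode and history."""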
def __init__(self, model, device=None):
self.model = model
self.device = device
def generate(self, input_speech, thinking=False, add_history=False):
text = self.model.speech2text_dialogue_sft(input_speech, thinking=thinking, add_history=add_history)
return text
def clear_history(self):
self.model.clear_history()
class TextDialogueGenerator:
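    """Text-in / text-out dialogue, with optional thinking mode and history."""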
def __init__(self, model, device=None):
self.model = model
self.device = device
def generate(self, input_text, thinking=False, add_history=False):
text = self.model.text_dialogue_sft(input_text, thinking=thinking, add_history=add_history)
return text
def clear_history(self):
self.model.clear_history()
class MultiModalSpeechInterface:
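    """Owns the shared model, the per-task generators, and the Gradio UI wiring."""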
def __init__(self):
self.model = None
self.tts_generator = None
self.audio_understanding_generator = None
self.spoken_dialogue_generator = None
self.speech2text_dialogue_generator = None
self.text_dialogue_generator = None
self.device = None
self.model_initialized = False
def initialize_model(self, model_path=None, tokenizer_path=None):
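        """Load MimoAudio from the given (or default) paths and build the per-task generators.

        Returns a human-readable status string that is shown in the UI.
        """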
try:
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if model_path is None:
model_path = "./models/MiMo-Audio-7B-Instruct"
if tokenizer_path is None:
tokenizer_path = "./models/MiMo-Audio-Tokenizer"
print(f"Model path: {model_path}")
print(f"Tokenizer path: {tokenizer_path}")
self.model = MimoAudio(model_path, tokenizer_path)
self.tts_generator = TTSGenerator(self.model, self.device)
self.audio_understanding_generator = AudioUnderstandingGenerator(self.model, self.device)
self.spoken_dialogue_generator = SpokenDialogueGenerator(self.model, self.device)
self.speech2text_dialogue_generator = Speech2TextDialogueGenerator(self.model, self.device)
self.text_dialogue_generator = TextDialogueGenerator(self.model, self.device)
self.model_initialized = True
print("Model loaded successfully!")
return "✅ Model loaded successfully!"
except Exception as e:
error_msg = f"❌ Model loading failed: {str(e)}"
print(error_msg)
return error_msg
def generate_tts_audio(self, input_text, instruct="", use_instruct=False):
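        """Synthesize speech for input_text, optionally guided by a style instruction.

        Returns (audio_path, status_message, download-button update) for the TTS tab.
        """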
        if not self.model_initialized:
            # Every TTS return must provide (audio, status, download-button update) to match the outputs wired in create_interface.
            return None, "❌ Error: Model not initialized, please load the model first", gr.update(visible=False)
        if not input_text.strip():
            return None, "❌ Error: Please input text content", gr.update(visible=False)
try:
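            # delete=False keeps the temp file on disk after this handler returns,
            # so Gradio can serve it back to the client.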
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
output_path = tmp_file.name
if not (use_instruct and instruct.strip()):
instruct = None
print(f"Generating TTS audio: {input_text}")
text_channel = self.tts_generator.generate(input_text, instruct, output_path)
status_msg = f"✅ TTS audio generated successfully!\n📝 Input text: {input_text}"
if use_instruct and instruct is not None and instruct.strip():
status_msg += f"\n🎭 Style description: {instruct}"
status_msg += f"\n🎵 Output text channel: {text_channel}"
return output_path, status_msg, gr.update(value=output_path, visible=True)
except Exception as e:
error_msg = f"❌ Error generating TTS audio: {str(e)}"
print(error_msg)
return None, error_msg, gr.update(visible=False)
def generate_audio_understanding_response(self, input_audio, input_text, thinking=False):
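        """Answer a text question about the uploaded audio; returns (answer, status_message)."""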
if not self.model_initialized:
return "", "❌ Error: Model not initialized, please load the model first"
if input_audio is None and not input_text.strip():
return "", "❌ Error: Please provide either audio input or text question"
if input_audio is None:
return "", "❌ Error: Please upload an audio file for Audio Understanding task"
if not input_text.strip():
return "", "❌ Error: Please input your question"
try:
print(f"Performing Audio Understanding task:")
print(f"Audio input: {input_audio}")
print(f"Text question: {input_text}")
audio_understanding_response = self.audio_understanding_generator.generate(input_audio, input_text.strip(), thinking=thinking)
status_msg = f"✅ Audio Understanding task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n❓ Question: {input_text}\n💬 Answer: {audio_understanding_response}"
return audio_understanding_response, status_msg
except Exception as e:
error_msg = f"❌ Error performing Audio Understanding task: {str(e)}"
print(error_msg)
return "", error_msg
def generate_spoken_dialogue_response(self, input_audio, system_prompt=None, prompt_speech=None, add_history=False):
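        """Run one turn of spoken dialogue; returns (output_audio_path, text_response, status_message)."""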
        if not self.model_initialized:
            # Every return must provide (audio, text, status) to match the three outputs wired in create_interface.
            return None, "", "❌ Error: Model not initialized, please load the model first"
        if input_audio is None:
            return None, "", "❌ Error: Please upload an audio file"
try:
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
output_audio_path = tmp_file.name
print(f"Performing spoken dialogue task:")
print(f"Audio input: {input_audio}")
print(f"Output path: {output_audio_path}")
dialogue_response = self.spoken_dialogue_generator.generate(input_audio, output_audio_path, system_prompt=system_prompt, prompt_speech=prompt_speech, add_history=add_history)
status_msg = f"✅ Spoken dialogue task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n💬 Response: {dialogue_response[:300]}..."
return output_audio_path, dialogue_response, status_msg
except Exception as e:
error_msg = f"❌ Error performing spoken dialogue task: {str(e)}"
print(error_msg)
return None, None, error_msg
def generate_speech2text_dialogue_response(self, input_audio, thinking=False, add_history=False):
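        """Run one turn of speech-to-text dialogue; returns (text_response, status_message)."""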
if not self.model_initialized:
return "", "❌ Error: Model not initialized, please load the model first"
if input_audio is None:
return "", "❌ Error: Please upload an audio file for S2T Dialogue task"
try:
print(f"Performing S2T Dialogue task:")
print(f"Audio input: {input_audio}")
s2t_response = self.speech2text_dialogue_generator.generate(input_audio, thinking=thinking, add_history=add_history)
status_msg = f"✅ S2T dialogue task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n❓💬 Answer: {s2t_response}"
return s2t_response, status_msg
except Exception as e:
            error_msg = f"❌ Error performing S2T dialogue task: {str(e)}"
print(error_msg)
return "", error_msg
def generate_text_dialogue_response(self, input_text, thinking=False, add_history=False):
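        """Run one turn of text-to-text dialogue; returns (text_response, status_message)."""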
if not self.model_initialized:
return "", "❌ Error: Model not initialized, please load the model first"
if not input_text or not input_text.strip():
return "", "❌ Error: Please input your text"
try:
print(f"Performing Text Dialogue task:")
print(f"Text input: {input_text}")
print(f"Thinking mode: {thinking}")
print(f"Add history: {add_history}")
t2t_response = self.text_dialogue_generator.generate(input_text.strip(), thinking=thinking, add_history=add_history)
status_msg = f"✅ T2T dialogue task completed successfully!\n💬 Input: {input_text}"
if thinking:
status_msg += f"\n🧠 Thinking mode: Enabled"
status_msg += f"\n💬 Answer: {t2t_response}"
return t2t_response, status_msg
except Exception as e:
error_msg = f"❌ Error performing T2T dialogue task: {str(e)}"
print(error_msg)
return "", error_msg
def clear_spoken_dialogue_history(self):
if not self.model_initialized:
return None, "", "❌ Error: Model not initialized, please load the model first"
try:
self.spoken_dialogue_generator.clear_history()
return None, "", "✅ Spoken dialogue history cleared successfully!"
except Exception as e:
error_msg = f"❌ Error clearing spoken dialogue history: {str(e)}"
print(error_msg)
return None, "", error_msg
def clear_speech2text_dialogue_history(self):
if not self.model_initialized:
return "", "❌ Error: Model not initialized, please load the model first"
try:
self.speech2text_dialogue_generator.clear_history()
return "", "✅ Speech-to-text dialogue history cleared successfully!"
except Exception as e:
error_msg = f"❌ Error clearing S2T dialogue history: {str(e)}"
print(error_msg)
return "", error_msg
def clear_text_dialogue_history(self):
if not self.model_initialized:
return "", "❌ Error: Model not initialized, please load the model first"
try:
self.text_dialogue_generator.clear_history()
return "", "✅ Text dialogue history cleared successfully!"
except Exception as e:
error_msg = f"❌ Error clearing T2T dialogue history: {str(e)}"
print(error_msg)
return "", error_msg
def create_interface(self):
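        """Build the tabbed Gradio Blocks UI and wire its components to the handlers above."""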
with gr.Blocks(title="MiMo-Audio Multimodal Speech Processing System", theme=gr.themes.Soft()) as iface:
gr.Markdown("# 🎵 MiMo-Audio Multimodal Speech Processing System")
gr.Markdown("Supports audio understanding, text-to-speech, spoken dialogue, speech-to-text dialogue and text-to-text dialogue")
with gr.Tabs():
with gr.TabItem("⚙️ Model Configuration"):
gr.Markdown("### 📋 Model initialization configuration")
with gr.Row():
with gr.Column():
model_path = gr.Textbox(
label="Model path",
placeholder="Leave blank to use default path: ./models/MiMo-Audio-7B-Instruct",
lines=3
)
tokenizer_path = gr.Textbox(
label="Tokenizer path",
placeholder="Leave blank to use default path: ./models/MiMo-Audio-Tokenizer",
lines=3
)
init_btn = gr.Button("🔄 Initialize model", variant="primary", size="lg")
with gr.Column():
init_status = gr.Textbox(
label="Initialization status",
interactive=False,
lines=6,
placeholder="Click the initialize model button to start..."
)
gr.Markdown("### 💻 System information")
device_info = gr.Textbox(
label="Device information",
value=f"GPU available: {'Yes' if torch.cuda.is_available() else 'No'}",
interactive=False
)
with gr.TabItem("🔊 Audio Understanding"):
gr.Markdown("### 🎯 Audio Understanding")
with gr.Row():
with gr.Column():
audio_understanding_input_audio = gr.Audio(
label="Upload Audio File",
type="filepath",
interactive=True,
)
audio_understanding_input_text = gr.Textbox(
label="Input Question",
placeholder="Please input your question...",
lines=3,
)
audio_understanding_thinking = gr.Checkbox(
label="Enable Thinking Mode",
value=False,
                                info="Enable thinking mode so the AI performs deeper analysis and reasoning"
)
audio_understanding_generate_btn = gr.Button("🤖 Start Audio Understanding", variant="primary", size="lg")
with gr.Column():
audio_understanding_output_text = gr.Textbox(
label="Answer Result",
lines=8,
interactive=False,
placeholder="AI's answer will be displayed here...",
elem_id="audio_understanding_output_text"
)
audio_understanding_status = gr.Textbox(
label="Processing Status",
lines=6,
interactive=False,
placeholder="Processing status information will be displayed here..."
)
with gr.Row():
audio_understanding_copy_btn = gr.Button("📋 Copy Answer", size="sm")
audio_understanding_clear_btn = gr.Button("🗑️ Clear Result", size="sm")
gr.Markdown("### 🌟 Audio Understanding Examples")
audio_understanding_examples = gr.Examples(
examples=[
[None, "这段音频的主要内容是什么?"],
[None, "说话者的情感状态如何?"],
[None, "音频中提到了哪些关键信息?"],
[None, "Please summarize the main points of this conversation."],
[None, "What viewpoint does the speaker want to express?"]
],
inputs=[audio_understanding_input_audio, audio_understanding_input_text],
                        label="Click an example to fill in the question automatically"
)
with gr.TabItem("🎵 Text-to-Speech"):
gr.Markdown("### 🎵 Text-to-Speech")
with gr.Row():
with gr.Column():
tts_input_text = gr.Textbox(
label="Input Text",
placeholder="Please input the text you want to convert to speech...",
lines=4,
max_lines=8
)
tts_instruct = gr.Textbox(
label="Style Description (Optional)",
placeholder="Please input the style description (optional)...",
lines=3,
max_lines=5
)
tts_use_instruct = gr.Checkbox(
label="Use Style Description",
value=True,
info="Enable to use InstructTTS for style-controlled speech generation"
)
tts_generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
with gr.Column():
tts_output_audio = gr.Audio(
label="Generated Speech",
type="filepath"
)
tts_status = gr.Textbox(
label="Generation Status",
lines=6,
interactive=False
)
tts_download_btn = gr.DownloadButton(
label="Download Generated Audio",
visible=False
)
with gr.TabItem("🎤 Spoken Dialogue"):
gr.Markdown("### 🎯 Spoken Dialogue")
with gr.Row():
with gr.Column():
dialogue_input_audio = gr.Audio(
label="Upload User Speech",
type="filepath",
interactive=True
)
system_prompt = gr.Textbox(
label="System Prompt (Optional)",
placeholder="e.g.: You are MiMo-Audio, a friendly AI assistant and your response needs to be concise.",
lines=1
)
prompt_speech = gr.Audio(
                                label="Prompt Speech (Optional; MiMo-Audio will speak with the same timbre as your prompt)",
type="filepath",
interactive=True
)
spoken_dialogue_add_history = gr.Checkbox(
label="Enable History Record",
value=True,
info="Enable to remember the previous dialogue context"
)
with gr.Row():
dialogue_generate_btn = gr.Button("💬 Start Dialogue", variant="primary", size="lg")
with gr.Row():
dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
with gr.Column():
dialogue_output_audio = gr.Audio(
label="Output Audio",
type="filepath"
)
dialogue_output_text = gr.Textbox(
label="Dialogue Response",
lines=5,
interactive=False,
)
dialogue_status = gr.Textbox(
label="Dialogue Status",
lines=5,
interactive=False,
)
with gr.TabItem("💬 S2T Dialogue"):
gr.Markdown("### 🎯 S2T Dialogue")
with gr.Row():
with gr.Column():
s2t_dialogue_input_audio = gr.Audio(
label="Upload User Speech",
type="filepath",
interactive=True
)
s2t_dialogue_add_history = gr.Checkbox(
label="Enable History Record",
value=True,
info="Enable to remember the previous dialogue context"
)
s2t_dialogue_thinking = gr.Checkbox(
                                label="Enable Thinking Mode",
                                value=False,
                                info="Enable for deeper analysis and reasoning"
)
with gr.Row():
s2t_dialogue_generate_btn = gr.Button("🎧 Start S2T Dialogue", variant="primary", size="lg")
with gr.Row():
s2t_dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
with gr.Column():
s2t_dialogue_output_text = gr.Textbox(
label="Dialogue Response",
lines=8,
interactive=False,
placeholder="AI's dialogue response will be displayed here..."
)
s2t_dialogue_status = gr.Textbox(
label="Dialogue Status",
lines=5,
interactive=False,
placeholder="Dialogue status information will be displayed here..."
)
with gr.TabItem("📝 T2T Dialogue"):
gr.Markdown("### 🎯 T2T Dialogue")
with gr.Row():
with gr.Column():
t2t_dialogue_input_text = gr.Textbox(
label="Input Dialogue Content",
                                placeholder="Please enter the text you want to talk about...",
lines=4,
max_lines=8
)
t2t_dialogue_add_history = gr.Checkbox(
label="Enable History Record",
value=True,
info="Enable to remember the previous dialogue context"
)
t2t_dialogue_thinking = gr.Checkbox(
                                label="Enable Thinking Mode",
                                value=False,
                                info="Enable thinking mode so the AI performs deeper analysis and reasoning"
)
with gr.Row():
t2t_dialogue_generate_btn = gr.Button("💬 Start T2T Dialogue", variant="primary", size="lg")
with gr.Row():
t2t_dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
with gr.Column():
t2t_dialogue_output_text = gr.Textbox(
label="Dialogue Response",
lines=8,
interactive=False,
placeholder="AI's dialogue response will be displayed here..."
)
t2t_dialogue_status = gr.Textbox(
label="Dialogue Status",
lines=5,
interactive=False,
placeholder="Dialogue status information will be displayed here..."
)
gr.Markdown("### 🌟 T2T Dialogue Examples")
t2t_dialogue_examples = gr.Examples(
examples=[
["Hello, how are you?"],
["I want to know the history of the development of artificial intelligence"],
["Please recommend some good movies"],
["Can you help me explain the basic concepts of quantum physics?"],
["I'm learning programming recently, any suggestions?"]
],
inputs=[t2t_dialogue_input_text],
                        label="Click an example to fill in the dialogue content automatically"
)
def copy_text_to_clipboard(text):
return text
def clear_audio_understanding_results():
return "", "🗑️ Audio Understanding Result Cleared"
init_btn.click(
fn=lambda path, tok_path: self.initialize_model(path or None, tok_path or None),
inputs=[model_path, tokenizer_path],
outputs=[init_status]
)
audio_understanding_generate_btn.click(
fn=self.generate_audio_understanding_response,
inputs=[audio_understanding_input_audio, audio_understanding_input_text, audio_understanding_thinking],
outputs=[audio_understanding_output_text, audio_understanding_status]
)
audio_understanding_copy_btn.click(
fn=None,
inputs=[audio_understanding_output_text],
js="(text) => {navigator.clipboard.writeText(text); alert('Copied to clipboard!')}"
)
tts_generate_btn.click(
fn=self.generate_tts_audio,
inputs=[tts_input_text, tts_instruct, tts_use_instruct],
outputs=[tts_output_audio, tts_status, tts_download_btn]
)
dialogue_generate_btn.click(
fn=self.generate_spoken_dialogue_response,
inputs=[dialogue_input_audio, system_prompt, prompt_speech, spoken_dialogue_add_history],
outputs=[dialogue_output_audio, dialogue_output_text, dialogue_status]
)
dialogue_clear_history_btn.click(
fn=self.clear_spoken_dialogue_history,
outputs=[dialogue_output_audio, dialogue_output_text, dialogue_status]
)
s2t_dialogue_generate_btn.click(
fn=self.generate_speech2text_dialogue_response,
inputs=[s2t_dialogue_input_audio, s2t_dialogue_thinking, s2t_dialogue_add_history],
outputs=[s2t_dialogue_output_text, s2t_dialogue_status]
)
s2t_dialogue_clear_history_btn.click(
fn=self.clear_speech2text_dialogue_history,
outputs=[s2t_dialogue_output_text, s2t_dialogue_status]
)
t2t_dialogue_generate_btn.click(
fn=self.generate_text_dialogue_response,
inputs=[t2t_dialogue_input_text, t2t_dialogue_thinking, t2t_dialogue_add_history],
outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
)
t2t_dialogue_clear_history_btn.click(
fn=self.clear_text_dialogue_history,
outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
)
audio_understanding_clear_btn.click(
fn=clear_audio_understanding_results,
outputs=[audio_understanding_output_text, audio_understanding_status]
)
tts_input_text.submit(
fn=self.generate_tts_audio,
inputs=[tts_input_text, tts_instruct, tts_use_instruct],
outputs=[tts_output_audio, tts_status, tts_download_btn]
)
audio_understanding_input_text.submit(
fn=self.generate_audio_understanding_response,
inputs=[audio_understanding_input_audio, audio_understanding_input_text, audio_understanding_thinking],
outputs=[audio_understanding_output_text, audio_understanding_status]
)
t2t_dialogue_input_text.submit(
fn=self.generate_text_dialogue_response,
inputs=[t2t_dialogue_input_text, t2t_dialogue_thinking, t2t_dialogue_add_history],
outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
)
return iface
def main():
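    """Parse command-line arguments and launch the Gradio app."""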
parser = argparse.ArgumentParser(description="MiMo-Audio")
parser.add_argument("--host", default="0.0.0.0", help="Server Address")
parser.add_argument("--port", type=int, default=7897, help="Port")
parser.add_argument("--share", action="store_true", help="Create Public Link")
parser.add_argument("--debug", action="store_true", help="Debug Mode")
args = parser.parse_args()
print("🚀 Launch MiMo-Audio...")
speech_interface = MultiModalSpeechInterface()
print("🎨 Create Gradio Interface...")
iface = speech_interface.create_interface()
print(f"🌐 Launch Service - Address: {args.host}:{args.port}")
iface.launch(
server_name=args.host,
server_port=args.port,
share=args.share,
debug=args.debug
)
if __name__ == "__main__":
main()