Spaces: Running on Zero
| # Copyright 2025 Xiaomi Corporation. | |
| import gradio as gr | |
| import torch | |
| import os | |
| import tempfile | |
| import argparse | |
| from pathlib import Path | |
| from src.mimo_audio.mimo_audio import MimoAudio | |
class TTSGenerator:
    """Thin wrapper around the model's TTS endpoint.

    Ensures the output directory exists before delegating to
    ``model.tts_sft``.
    """

    def __init__(self, model, device=None):
        self.model = model    # object exposing tts_sft(text, path, instruct)
        self.device = device  # kept for interface symmetry; unused here

    def generate(self, text, instruct, output_audio_path):
        """Synthesize `text` into `output_audio_path`; returns the model's text output."""
        Path(output_audio_path).parent.mkdir(parents=True, exist_ok=True)
        return self.model.tts_sft(text, output_audio_path, instruct)
class AudioUnderstandingGenerator:
    """Delegates audio-understanding questions to the model."""

    def __init__(self, model, device=None):
        self.model = model    # object exposing audio_understanding_sft(...)
        self.device = device  # kept for interface symmetry; unused here

    def generate(self, input_speech, input_text, thinking=False):
        """Answer `input_text` about `input_speech`; `thinking` requests deeper reasoning."""
        return self.model.audio_understanding_sft(
            input_speech, input_text, thinking=thinking
        )
class SpokenDialogueGenerator:
    """Speech-in / speech-out dialogue wrapper with optional history."""

    def __init__(self, model, device=None):
        self.model = model    # object exposing spoken_dialogue_sft / clear_history
        self.device = device  # kept for interface symmetry; unused here

    def generate(self, input_speech, output_audio_path, system_prompt="You are MiMo-Audio, a friendly AI assistant and your response needs to be concise.", prompt_speech=None, add_history=False):
        """Run one dialogue turn; reply audio is written to `output_audio_path`.

        `prompt_speech` optionally conditions the reply timbre; `add_history`
        keeps prior turns as context.
        """
        Path(output_audio_path).parent.mkdir(parents=True, exist_ok=True)
        return self.model.spoken_dialogue_sft(
            input_speech,
            output_audio_path,
            system_prompt=system_prompt,
            prompt_speech=prompt_speech,
            add_history=add_history,
        )

    def clear_history(self):
        """Drop any accumulated dialogue context on the model."""
        self.model.clear_history()
class Speech2TextDialogueGenerator:
    """Speech-in / text-out dialogue wrapper with optional history."""

    def __init__(self, model, device=None):
        self.model = model    # object exposing speech2text_dialogue_sft / clear_history
        self.device = device  # kept for interface symmetry; unused here

    def generate(self, input_speech, thinking=False, add_history=False):
        """Return the model's textual reply to `input_speech`."""
        return self.model.speech2text_dialogue_sft(
            input_speech, thinking=thinking, add_history=add_history
        )

    def clear_history(self):
        """Drop any accumulated dialogue context on the model."""
        self.model.clear_history()
class TextDialogueGenerator:
    """Text-in / text-out dialogue wrapper with optional history."""

    def __init__(self, model, device=None):
        self.model = model    # object exposing text_dialogue_sft / clear_history
        self.device = device  # kept for interface symmetry; unused here

    def generate(self, input_text, thinking=False, add_history=False):
        """Return the model's textual reply to `input_text`."""
        return self.model.text_dialogue_sft(
            input_text, thinking=thinking, add_history=add_history
        )

    def clear_history(self):
        """Drop any accumulated dialogue context on the model."""
        self.model.clear_history()
class MultiModalSpeechInterface:
    """Gradio front-end wiring for the MiMo-Audio model and its task generators."""

    def __init__(self):
        # Model and per-task generators are created lazily by initialize_model();
        # until then everything is None and model_initialized stays False.
        for attr in (
            "model",
            "tts_generator",
            "audio_understanding_generator",
            "spoken_dialogue_generator",
            "speech2text_dialogue_generator",
            "text_dialogue_generator",
            "device",
        ):
            setattr(self, attr, None)
        self.model_initialized = False
| def initialize_model(self, model_path=None, tokenizer_path=None): | |
| try: | |
| self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |
| if model_path is None: | |
| model_path = "./models/MiMo-Audio-7B-Instruct" | |
| if tokenizer_path is None: | |
| tokenizer_path = "./models/MiMo-Audio-Tokenizer" | |
| print(f"Model path: {model_path}") | |
| print(f"Tokenizer path: {tokenizer_path}") | |
| self.model = MimoAudio(model_path, tokenizer_path) | |
| self.tts_generator = TTSGenerator(self.model, self.device) | |
| self.audio_understanding_generator = AudioUnderstandingGenerator(self.model, self.device) | |
| self.spoken_dialogue_generator = SpokenDialogueGenerator(self.model, self.device) | |
| self.speech2text_dialogue_generator = Speech2TextDialogueGenerator(self.model, self.device) | |
| self.text_dialogue_generator = TextDialogueGenerator(self.model, self.device) | |
| self.model_initialized = True | |
| print("Model loaded successfully!") | |
| return "✅ Model loaded successfully!" | |
| except Exception as e: | |
| error_msg = f"❌ Model loading failed: {str(e)}" | |
| print(error_msg) | |
| return error_msg | |
| def generate_tts_audio(self, input_text, instruct="", use_instruct=False): | |
| if not self.model_initialized: | |
| return None, "❌ Error: Model not initialized, please load the model first" | |
| if not input_text.strip(): | |
| return None, "❌ Error: Please input text content" | |
| try: | |
| with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file: | |
| output_path = tmp_file.name | |
| if not (use_instruct and instruct.strip()): | |
| instruct = None | |
| print(f"Generating TTS audio: {input_text}") | |
| text_channel = self.tts_generator.generate(input_text, instruct, output_path) | |
| status_msg = f"✅ TTS audio generated successfully!\n📝 Input text: {input_text}" | |
| if use_instruct and instruct is not None and instruct.strip(): | |
| status_msg += f"\n🎭 Style description: {instruct}" | |
| status_msg += f"\n🎵 Output text channel: {text_channel}" | |
| return output_path, status_msg, gr.update(value=output_path, visible=True) | |
| except Exception as e: | |
| error_msg = f"❌ Error generating TTS audio: {str(e)}" | |
| print(error_msg) | |
| return None, error_msg, gr.update(visible=False) | |
| def generate_audio_understanding_response(self, input_audio, input_text, thinking=False): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| if input_audio is None and not input_text.strip(): | |
| return "", "❌ Error: Please provide either audio input or text question" | |
| if input_audio is None: | |
| return "", "❌ Error: Please upload an audio file for Audio Understanding task" | |
| if not input_text.strip(): | |
| return "", "❌ Error: Please input your question" | |
| try: | |
| print(f"Performing Audio Understanding task:") | |
| print(f"Audio input: {input_audio}") | |
| print(f"Text question: {input_text}") | |
| audio_understanding_response = self.audio_understanding_generator.generate(input_audio, input_text.strip(), thinking=thinking) | |
| status_msg = f"✅ Audio Understanding task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n❓ Question: {input_text}\n💬 Answer: {audio_understanding_response}" | |
| return audio_understanding_response, status_msg | |
| except Exception as e: | |
| error_msg = f"❌ Error performing Audio Understanding task: {str(e)}" | |
| print(error_msg) | |
| return "", error_msg | |
| def generate_spoken_dialogue_response(self, input_audio, system_prompt=None, prompt_speech=None, add_history=False): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| if input_audio is None: | |
| return "", "❌ Error: Please upload an audio file" | |
| try: | |
| with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file: | |
| output_audio_path = tmp_file.name | |
| print(f"Performing spoken dialogue task:") | |
| print(f"Audio input: {input_audio}") | |
| print(f"Output path: {output_audio_path}") | |
| dialogue_response = self.spoken_dialogue_generator.generate(input_audio, output_audio_path, system_prompt=system_prompt, prompt_speech=prompt_speech, add_history=add_history) | |
| status_msg = f"✅ Spoken dialogue task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n💬 Response: {dialogue_response[:300]}..." | |
| return output_audio_path, dialogue_response, status_msg | |
| except Exception as e: | |
| error_msg = f"❌ Error performing spoken dialogue task: {str(e)}" | |
| print(error_msg) | |
| return None, None, error_msg | |
| def generate_speech2text_dialogue_response(self, input_audio, thinking=False, add_history=False): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| if input_audio is None: | |
| return "", "❌ Error: Please upload an audio file for S2T Dialogue task" | |
| try: | |
| print(f"Performing S2T Dialogue task:") | |
| print(f"Audio input: {input_audio}") | |
| s2t_response = self.speech2text_dialogue_generator.generate(input_audio, thinking=thinking, add_history=add_history) | |
| status_msg = f"✅ S2T dialogue task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n❓💬 Answer: {s2t_response}" | |
| return s2t_response, status_msg | |
| except Exception as e: | |
| error_msg = f"❌ Error performing QA task: {str(e)}" | |
| print(error_msg) | |
| return "", error_msg | |
| def generate_text_dialogue_response(self, input_text, thinking=False, add_history=False): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| if not input_text or not input_text.strip(): | |
| return "", "❌ Error: Please input your text" | |
| try: | |
| print(f"Performing Text Dialogue task:") | |
| print(f"Text input: {input_text}") | |
| print(f"Thinking mode: {thinking}") | |
| print(f"Add history: {add_history}") | |
| t2t_response = self.text_dialogue_generator.generate(input_text.strip(), thinking=thinking, add_history=add_history) | |
| status_msg = f"✅ T2T dialogue task completed successfully!\n💬 Input: {input_text}" | |
| if thinking: | |
| status_msg += f"\n🧠 Thinking mode: Enabled" | |
| status_msg += f"\n💬 Answer: {t2t_response}" | |
| return t2t_response, status_msg | |
| except Exception as e: | |
| error_msg = f"❌ Error performing T2T dialogue task: {str(e)}" | |
| print(error_msg) | |
| return "", error_msg | |
| def clear_spoken_dialogue_history(self): | |
| if not self.model_initialized: | |
| return None, "", "❌ Error: Model not initialized, please load the model first" | |
| try: | |
| self.spoken_dialogue_generator.clear_history() | |
| return None, "", "✅ Spoken dialogue history cleared successfully!" | |
| except Exception as e: | |
| error_msg = f"❌ Error clearing spoken dialogue history: {str(e)}" | |
| print(error_msg) | |
| return None, "", error_msg | |
| def clear_speech2text_dialogue_history(self): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| try: | |
| self.speech2text_dialogue_generator.clear_history() | |
| return "", "✅ Speech-to-text dialogue history cleared successfully!" | |
| except Exception as e: | |
| error_msg = f"❌ Error clearing S2T dialogue history: {str(e)}" | |
| print(error_msg) | |
| return "", error_msg | |
| def clear_text_dialogue_history(self): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| try: | |
| self.text_dialogue_generator.clear_history() | |
| return "", "✅ Text dialogue history cleared successfully!" | |
| except Exception as e: | |
| error_msg = f"❌ Error clearing T2T dialogue history: {str(e)}" | |
| print(error_msg) | |
| return "", error_msg | |
    def create_interface(self):
        """Build and return the full Gradio Blocks UI.

        Layout: one tab per task (model configuration, audio understanding,
        TTS, spoken dialogue, S2T dialogue, T2T dialogue). All event wiring
        is done after the layout so every component already exists when it
        is referenced as an input/output.
        """
        with gr.Blocks(title="MiMo-Audio Multimodal Speech Processing System", theme=gr.themes.Soft()) as iface:
            gr.Markdown("# 🎵 MiMo-Audio Multimodal Speech Processing System")
            gr.Markdown("Supports audio understanding, text-to-speech, spoken dialogue, speech-to-text dialogue and text-to-text dialogue")
            with gr.Tabs():
                # ---- Tab: model configuration (paths + init button) ----
                with gr.TabItem("⚙️ Model Configuration"):
                    gr.Markdown("### 📋 Model initialization configuration")
                    with gr.Row():
                        with gr.Column():
                            # Empty textboxes mean "use the default path" —
                            # the click handler maps "" to None.
                            model_path = gr.Textbox(
                                label="Model path",
                                placeholder="Leave blank to use default path: ./models/MiMo-Audio-7B-Instruct",
                                lines=3
                            )
                            tokenizer_path = gr.Textbox(
                                label="Tokenizer path",
                                placeholder="Leave blank to use default path: ./models/MiMo-Audio-Tokenizer",
                                lines=3
                            )
                            init_btn = gr.Button("🔄 Initialize model", variant="primary", size="lg")
                        with gr.Column():
                            init_status = gr.Textbox(
                                label="Initialization status",
                                interactive=False,
                                lines=6,
                                placeholder="Click the initialize model button to start..."
                            )
                            gr.Markdown("### 💻 System information")
                            # Static snapshot of GPU availability at build time.
                            device_info = gr.Textbox(
                                label="Device information",
                                value=f"GPU available: {'Yes' if torch.cuda.is_available() else 'No'}",
                                interactive=False
                            )
                # ---- Tab: audio understanding (audio + question -> answer) ----
                with gr.TabItem("🔊 Audio Understanding"):
                    gr.Markdown("### 🎯 Audio Understanding")
                    with gr.Row():
                        with gr.Column():
                            audio_understanding_input_audio = gr.Audio(
                                label="Upload Audio File",
                                type="filepath",
                                interactive=True,
                            )
                            audio_understanding_input_text = gr.Textbox(
                                label="Input Question",
                                placeholder="Please input your question...",
                                lines=3,
                            )
                            audio_understanding_thinking = gr.Checkbox(
                                label="Enable Thinking Mode",
                                value=False,
                                info="Enable thinking mode, AI will perform a deeper analysis and thinking"
                            )
                            audio_understanding_generate_btn = gr.Button("🤖 Start Audio Understanding", variant="primary", size="lg")
                        with gr.Column():
                            audio_understanding_output_text = gr.Textbox(
                                label="Answer Result",
                                lines=8,
                                interactive=False,
                                placeholder="AI's answer will be displayed here...",
                                elem_id="audio_understanding_output_text"
                            )
                            audio_understanding_status = gr.Textbox(
                                label="Processing Status",
                                lines=6,
                                interactive=False,
                                placeholder="Processing status information will be displayed here..."
                            )
                            with gr.Row():
                                audio_understanding_copy_btn = gr.Button("📋 Copy Answer", size="sm")
                                audio_understanding_clear_btn = gr.Button("🗑️ Clear Result", size="sm")
                    gr.Markdown("### 🌟 Audio Understanding Examples")
                    # Examples only prefill the question text; audio stays None.
                    audio_understanding_examples = gr.Examples(
                        examples=[
                            [None, "这段音频的主要内容是什么?"],
                            [None, "说话者的情感状态如何?"],
                            [None, "音频中提到了哪些关键信息?"],
                            [None, "Please summarize the main points of this conversation."],
                            [None, "What viewpoint does the speaker want to express?"]
                        ],
                        inputs=[audio_understanding_input_audio, audio_understanding_input_text],
                        label="Click the example to automatically fill the question"
                    )
                # ---- Tab: text-to-speech (text [+ style instruction] -> audio) ----
                with gr.TabItem("🎵 Text-to-Speech"):
                    gr.Markdown("### 🎵 Text-to-Speech")
                    with gr.Row():
                        with gr.Column():
                            tts_input_text = gr.Textbox(
                                label="Input Text",
                                placeholder="Please input the text you want to convert to speech...",
                                lines=4,
                                max_lines=8
                            )
                            tts_instruct = gr.Textbox(
                                label="Style Description (Optional)",
                                placeholder="Please input the style description (optional)...",
                                lines=3,
                                max_lines=5
                            )
                            tts_use_instruct = gr.Checkbox(
                                label="Use Style Description",
                                value=True,
                                info="Enable to use InstructTTS for style-controlled speech generation"
                            )
                            tts_generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
                        with gr.Column():
                            tts_output_audio = gr.Audio(
                                label="Generated Speech",
                                type="filepath"
                            )
                            tts_status = gr.Textbox(
                                label="Generation Status",
                                lines=6,
                                interactive=False
                            )
                            # Hidden until a file is generated; the handler
                            # toggles visibility via gr.update.
                            tts_download_btn = gr.DownloadButton(
                                label="Download Generated Audio",
                                visible=False
                            )
                # ---- Tab: spoken dialogue (speech in -> speech + text out) ----
                with gr.TabItem("🎤 Spoken Dialogue"):
                    gr.Markdown("### 🎯 Spoken Dialogue")
                    with gr.Row():
                        with gr.Column():
                            dialogue_input_audio = gr.Audio(
                                label="Upload User Speech",
                                type="filepath",
                                interactive=True
                            )
                            system_prompt = gr.Textbox(
                                label="System Prompt (Optional)",
                                placeholder="e.g.: You are MiMo-Audio, a friendly AI assistant and your response needs to be concise.",
                                lines=1
                            )
                            prompt_speech = gr.Audio(
                                label="Prompt Speech (Optional, MiMo-Audio speaks with the same timbre as your prompt.)",
                                type="filepath",
                                interactive=True
                            )
                            spoken_dialogue_add_history = gr.Checkbox(
                                label="Enable History Record",
                                value=True,
                                info="Enable to remember the previous dialogue context"
                            )
                            with gr.Row():
                                dialogue_generate_btn = gr.Button("💬 Start Dialogue", variant="primary", size="lg")
                            with gr.Row():
                                dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
                        with gr.Column():
                            dialogue_output_audio = gr.Audio(
                                label="Output Audio",
                                type="filepath"
                            )
                            dialogue_output_text = gr.Textbox(
                                label="Dialogue Response",
                                lines=5,
                                interactive=False,
                            )
                            dialogue_status = gr.Textbox(
                                label="Dialogue Status",
                                lines=5,
                                interactive=False,
                            )
                # ---- Tab: speech-to-text dialogue (speech in -> text out) ----
                with gr.TabItem("💬 S2T Dialogue"):
                    gr.Markdown("### 🎯 S2T Dialogue")
                    with gr.Row():
                        with gr.Column():
                            s2t_dialogue_input_audio = gr.Audio(
                                label="Upload User Speech",
                                type="filepath",
                                interactive=True
                            )
                            s2t_dialogue_add_history = gr.Checkbox(
                                label="Enable History Record",
                                value=True,
                                info="Enable to remember the previous dialogue context"
                            )
                            s2t_dialogue_thinking = gr.Checkbox(
                                label="Enable Thinking Mode (think mode)",
                                value=False,
                                info="Enable to perform a deeper analysis and reasoning"
                            )
                            with gr.Row():
                                s2t_dialogue_generate_btn = gr.Button("🎧 Start S2T Dialogue", variant="primary", size="lg")
                            with gr.Row():
                                s2t_dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
                        with gr.Column():
                            s2t_dialogue_output_text = gr.Textbox(
                                label="Dialogue Response",
                                lines=8,
                                interactive=False,
                                placeholder="AI's dialogue response will be displayed here..."
                            )
                            s2t_dialogue_status = gr.Textbox(
                                label="Dialogue Status",
                                lines=5,
                                interactive=False,
                                placeholder="Dialogue status information will be displayed here..."
                            )
                # ---- Tab: text-to-text dialogue ----
                with gr.TabItem("📝 T2T Dialogue"):
                    gr.Markdown("### 🎯 T2T Dialogue")
                    with gr.Row():
                        with gr.Column():
                            t2t_dialogue_input_text = gr.Textbox(
                                label="Input Dialogue Content",
                                placeholder="Please input the text content you want to dialogue...",
                                lines=4,
                                max_lines=8
                            )
                            t2t_dialogue_add_history = gr.Checkbox(
                                label="Enable History Record",
                                value=True,
                                info="Enable to remember the previous dialogue context"
                            )
                            t2t_dialogue_thinking = gr.Checkbox(
                                label="Enable Thinking Mode (Thinking)",
                                value=False,
                                info="Enable thinking mode, AI will perform a deeper analysis and thinking"
                            )
                            with gr.Row():
                                t2t_dialogue_generate_btn = gr.Button("💬 Start T2T Dialogue", variant="primary", size="lg")
                            with gr.Row():
                                t2t_dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
                        with gr.Column():
                            t2t_dialogue_output_text = gr.Textbox(
                                label="Dialogue Response",
                                lines=8,
                                interactive=False,
                                placeholder="AI's dialogue response will be displayed here..."
                            )
                            t2t_dialogue_status = gr.Textbox(
                                label="Dialogue Status",
                                lines=5,
                                interactive=False,
                                placeholder="Dialogue status information will be displayed here..."
                            )
                    gr.Markdown("### 🌟 T2T Dialogue Examples")
                    t2t_dialogue_examples = gr.Examples(
                        examples=[
                            ["Hello, how are you?"],
                            ["I want to know the history of the development of artificial intelligence"],
                            ["Please recommend some good movies"],
                            ["Can you help me explain the basic concepts of quantum physics?"],
                            ["I'm learning programming recently, any suggestions?"]
                        ],
                        inputs=[t2t_dialogue_input_text],
                        label="Click the example to automatically fill the dialogue content"
                    )

            # NOTE(review): defined but never wired — the copy button uses a JS
            # snippet directly instead of this Python callback.
            def copy_text_to_clipboard(text):
                return text

            # Resets the audio-understanding output/status pair.
            def clear_audio_understanding_results():
                return "", "🗑️ Audio Understanding Result Cleared"

            # ---- Event wiring ------------------------------------------------
            # Empty path textboxes are mapped to None so initialize_model falls
            # back to its built-in defaults.
            init_btn.click(
                fn=lambda path, tok_path: self.initialize_model(path or None, tok_path or None),
                inputs=[model_path, tokenizer_path],
                outputs=[init_status]
            )
            audio_understanding_generate_btn.click(
                fn=self.generate_audio_understanding_response,
                inputs=[audio_understanding_input_audio, audio_understanding_input_text, audio_understanding_thinking],
                outputs=[audio_understanding_output_text, audio_understanding_status]
            )
            # Copy runs purely client-side via the js= snippet (fn=None).
            audio_understanding_copy_btn.click(
                fn=None,
                inputs=[audio_understanding_output_text],
                js="(text) => {navigator.clipboard.writeText(text); alert('Copied to clipboard!')}"
            )
            tts_generate_btn.click(
                fn=self.generate_tts_audio,
                inputs=[tts_input_text, tts_instruct, tts_use_instruct],
                outputs=[tts_output_audio, tts_status, tts_download_btn]
            )
            dialogue_generate_btn.click(
                fn=self.generate_spoken_dialogue_response,
                inputs=[dialogue_input_audio, system_prompt, prompt_speech, spoken_dialogue_add_history],
                outputs=[dialogue_output_audio, dialogue_output_text, dialogue_status]
            )
            dialogue_clear_history_btn.click(
                fn=self.clear_spoken_dialogue_history,
                outputs=[dialogue_output_audio, dialogue_output_text, dialogue_status]
            )
            s2t_dialogue_generate_btn.click(
                fn=self.generate_speech2text_dialogue_response,
                inputs=[s2t_dialogue_input_audio, s2t_dialogue_thinking, s2t_dialogue_add_history],
                outputs=[s2t_dialogue_output_text, s2t_dialogue_status]
            )
            s2t_dialogue_clear_history_btn.click(
                fn=self.clear_speech2text_dialogue_history,
                outputs=[s2t_dialogue_output_text, s2t_dialogue_status]
            )
            t2t_dialogue_generate_btn.click(
                fn=self.generate_text_dialogue_response,
                inputs=[t2t_dialogue_input_text, t2t_dialogue_thinking, t2t_dialogue_add_history],
                outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
            )
            t2t_dialogue_clear_history_btn.click(
                fn=self.clear_text_dialogue_history,
                outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
            )
            audio_understanding_clear_btn.click(
                fn=clear_audio_understanding_results,
                outputs=[audio_understanding_output_text, audio_understanding_status]
            )
            # Enter-to-submit shortcuts mirroring the buttons above.
            tts_input_text.submit(
                fn=self.generate_tts_audio,
                inputs=[tts_input_text, tts_instruct, tts_use_instruct],
                outputs=[tts_output_audio, tts_status, tts_download_btn]
            )
            audio_understanding_input_text.submit(
                fn=self.generate_audio_understanding_response,
                inputs=[audio_understanding_input_audio, audio_understanding_input_text, audio_understanding_thinking],
                outputs=[audio_understanding_output_text, audio_understanding_status]
            )
            t2t_dialogue_input_text.submit(
                fn=self.generate_text_dialogue_response,
                inputs=[t2t_dialogue_input_text, t2t_dialogue_thinking, t2t_dialogue_add_history],
                outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
            )
        return iface
def main():
    """CLI entry point: parse flags, build the Gradio UI, launch the server."""
    parser = argparse.ArgumentParser(description="MiMo-Audio")
    parser.add_argument("--host", default="0.0.0.0", help="Server Address")
    parser.add_argument("--port", type=int, default=7897, help="Port")
    parser.add_argument("--share", action="store_true", help="Create Public Link")
    parser.add_argument("--debug", action="store_true", help="Debug Mode")
    args = parser.parse_args()

    print("🚀 Launch MiMo-Audio...")
    app = MultiModalSpeechInterface()
    print("🎨 Create Gradio Interface...")
    demo = app.create_interface()
    print(f"🌐 Launch Service - Address: {args.host}:{args.port}")
    demo.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
        debug=args.debug,
    )


if __name__ == "__main__":
    main()