Spaces: Running on Zero
| # Copyright 2025 Xiaomi Corporation. | |
| import gradio as gr | |
| import torch | |
| import os | |
| import tempfile | |
| import argparse | |
| from pathlib import Path | |
| from src.mimo_audio.mimo_audio import MimoAudio | |
class TTSGenerator:
    """Thin wrapper around the model's TTS endpoint.

    Ensures the output directory exists before delegating to
    ``model.tts_sft``.
    """

    def __init__(self, model, device=None):
        self.model = model    # object exposing tts_sft(text, path, instruct)
        self.device = device  # kept for interface symmetry; unused here

    def generate(self, text, instruct, output_audio_path):
        """Synthesize `text` into `output_audio_path`; returns the model's text output."""
        Path(output_audio_path).parent.mkdir(parents=True, exist_ok=True)
        return self.model.tts_sft(text, output_audio_path, instruct)
class AudioUnderstandingGenerator:
    """Delegates audio-understanding questions to the model."""

    def __init__(self, model, device=None):
        self.model = model    # object exposing audio_understanding_sft(...)
        self.device = device  # kept for interface symmetry; unused here

    def generate(self, input_speech, input_text, thinking=False):
        """Answer `input_text` about `input_speech`; `thinking` requests deeper reasoning."""
        return self.model.audio_understanding_sft(
            input_speech, input_text, thinking=thinking
        )
class SpokenDialogueGenerator:
    """Speech-in / speech-out dialogue wrapper with optional history."""

    def __init__(self, model, device=None):
        self.model = model    # object exposing spoken_dialogue_sft / clear_history
        self.device = device  # kept for interface symmetry; unused here

    def generate(self, input_speech, output_audio_path, system_prompt="You are MiMo-Audio, a friendly AI assistant and your response needs to be concise.", prompt_speech=None, add_history=False):
        """Run one dialogue turn; reply audio is written to `output_audio_path`.

        `prompt_speech` optionally conditions the reply timbre; `add_history`
        keeps prior turns as context.
        """
        Path(output_audio_path).parent.mkdir(parents=True, exist_ok=True)
        return self.model.spoken_dialogue_sft(
            input_speech,
            output_audio_path,
            system_prompt=system_prompt,
            prompt_speech=prompt_speech,
            add_history=add_history,
        )

    def clear_history(self):
        """Drop any accumulated dialogue context on the model."""
        self.model.clear_history()
class Speech2TextDialogueGenerator:
    """Speech-in / text-out dialogue wrapper with optional history."""

    def __init__(self, model, device=None):
        self.model = model    # object exposing speech2text_dialogue_sft / clear_history
        self.device = device  # kept for interface symmetry; unused here

    def generate(self, input_speech, thinking=False, add_history=False):
        """Return the model's textual reply to `input_speech`."""
        return self.model.speech2text_dialogue_sft(
            input_speech, thinking=thinking, add_history=add_history
        )

    def clear_history(self):
        """Drop any accumulated dialogue context on the model."""
        self.model.clear_history()
class TextDialogueGenerator:
    """Text-in / text-out dialogue wrapper with optional history."""

    def __init__(self, model, device=None):
        self.model = model    # object exposing text_dialogue_sft / clear_history
        self.device = device  # kept for interface symmetry; unused here

    def generate(self, input_text, thinking=False, add_history=False):
        """Return the model's textual reply to `input_text`."""
        return self.model.text_dialogue_sft(
            input_text, thinking=thinking, add_history=add_history
        )

    def clear_history(self):
        """Drop any accumulated dialogue context on the model."""
        self.model.clear_history()
class MultiModalSpeechInterface:
    """Gradio front-end wiring for the MiMo-Audio model and its task generators."""

    def __init__(self):
        # Model and per-task generators are created lazily by initialize_model();
        # until then everything is None and model_initialized stays False.
        for attr in (
            "model",
            "tts_generator",
            "audio_understanding_generator",
            "spoken_dialogue_generator",
            "speech2text_dialogue_generator",
            "text_dialogue_generator",
            "device",
        ):
            setattr(self, attr, None)
        self.model_initialized = False
| def initialize_model(self, model_path=None, tokenizer_path=None): | |
| try: | |
| self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |
| if model_path is None: | |
| model_path = "./models/MiMo-Audio-7B-Instruct" | |
| if tokenizer_path is None: | |
| tokenizer_path = "./models/MiMo-Audio-Tokenizer" | |
| print(f"Model path: {model_path}") | |
| print(f"Tokenizer path: {tokenizer_path}") | |
| self.model = MimoAudio(model_path, tokenizer_path) | |
| self.tts_generator = TTSGenerator(self.model, self.device) | |
| self.audio_understanding_generator = AudioUnderstandingGenerator(self.model, self.device) | |
| self.spoken_dialogue_generator = SpokenDialogueGenerator(self.model, self.device) | |
| self.speech2text_dialogue_generator = Speech2TextDialogueGenerator(self.model, self.device) | |
| self.text_dialogue_generator = TextDialogueGenerator(self.model, self.device) | |
| self.model_initialized = True | |
| print("Model loaded successfully!") | |
| return "✅ Model loaded successfully!" | |
| except Exception as e: | |
| error_msg = f"❌ Model loading failed: {str(e)}" | |
| print(error_msg) | |
| return error_msg | |
| def generate_tts_audio(self, input_text, instruct="", use_instruct=False): | |
| if not self.model_initialized: | |
| return None, "❌ Error: Model not initialized, please load the model first" | |
| if not input_text.strip(): | |
| return None, "❌ Error: Please input text content" | |
| try: | |
| with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file: | |
| output_path = tmp_file.name | |
| if not (use_instruct and instruct.strip()): | |
| instruct = None | |
| print(f"Generating TTS audio: {input_text}") | |
| text_channel = self.tts_generator.generate(input_text, instruct, output_path) | |
| status_msg = f"✅ TTS audio generated successfully!\n📝 Input text: {input_text}" | |
| if use_instruct and instruct is not None and instruct.strip(): | |
| status_msg += f"\n🎭 Style description: {instruct}" | |
| status_msg += f"\n🎵 Output text channel: {text_channel}" | |
| return output_path, status_msg, gr.update(value=output_path, visible=True) | |
| except Exception as e: | |
| error_msg = f"❌ Error generating TTS audio: {str(e)}" | |
| print(error_msg) | |
| return None, error_msg, gr.update(visible=False) | |
| def generate_audio_understanding_response(self, input_audio, input_text, thinking=False): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| if input_audio is None and not input_text.strip(): | |
| return "", "❌ Error: Please provide either audio input or text question" | |
| if input_audio is None: | |
| return "", "❌ Error: Please upload an audio file for Audio Understanding task" | |
| if not input_text.strip(): | |
| return "", "❌ Error: Please input your question" | |
| try: | |
| print(f"Performing Audio Understanding task:") | |
| print(f"Audio input: {input_audio}") | |
| print(f"Text question: {input_text}") | |
| audio_understanding_response = self.audio_understanding_generator.generate(input_audio, input_text.strip(), thinking=thinking) | |
| status_msg = f"✅ Audio Understanding task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n❓ Question: {input_text}\n💬 Answer: {audio_understanding_response}" | |
| return audio_understanding_response, status_msg | |
| except Exception as e: | |
| error_msg = f"❌ Error performing Audio Understanding task: {str(e)}" | |
| print(error_msg) | |
| return "", error_msg | |
| def generate_spoken_dialogue_response(self, input_audio, system_prompt=None, prompt_speech=None, add_history=False): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| if input_audio is None: | |
| return "", "❌ Error: Please upload an audio file" | |
| try: | |
| with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file: | |
| output_audio_path = tmp_file.name | |
| print(f"Performing spoken dialogue task:") | |
| print(f"Audio input: {input_audio}") | |
| print(f"Output path: {output_audio_path}") | |
| dialogue_response = self.spoken_dialogue_generator.generate(input_audio, output_audio_path, system_prompt=system_prompt, prompt_speech=prompt_speech, add_history=add_history) | |
| status_msg = f"✅ Spoken dialogue task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n💬 Response: {dialogue_response[:300]}..." | |
| return output_audio_path, dialogue_response, status_msg | |
| except Exception as e: | |
| error_msg = f"❌ Error performing spoken dialogue task: {str(e)}" | |
| print(error_msg) | |
| return None, None, error_msg | |
| def generate_speech2text_dialogue_response(self, input_audio, thinking=False, add_history=False): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| if input_audio is None: | |
| return "", "❌ Error: Please upload an audio file for S2T Dialogue task" | |
| try: | |
| print(f"Performing S2T Dialogue task:") | |
| print(f"Audio input: {input_audio}") | |
| s2t_response = self.speech2text_dialogue_generator.generate(input_audio, thinking=thinking, add_history=add_history) | |
| status_msg = f"✅ S2T dialogue task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n❓💬 Answer: {s2t_response}" | |
| return s2t_response, status_msg | |
| except Exception as e: | |
| error_msg = f"❌ Error performing QA task: {str(e)}" | |
| print(error_msg) | |
| return "", error_msg | |
| def generate_text_dialogue_response(self, input_text, thinking=False, add_history=False): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| if not input_text or not input_text.strip(): | |
| return "", "❌ Error: Please input your text" | |
| try: | |
| print(f"Performing Text Dialogue task:") | |
| print(f"Text input: {input_text}") | |
| print(f"Thinking mode: {thinking}") | |
| print(f"Add history: {add_history}") | |
| t2t_response = self.text_dialogue_generator.generate(input_text.strip(), thinking=thinking, add_history=add_history) | |
| status_msg = f"✅ T2T dialogue task completed successfully!\n💬 Input: {input_text}" | |
| if thinking: | |
| status_msg += f"\n🧠 Thinking mode: Enabled" | |
| status_msg += f"\n💬 Answer: {t2t_response}" | |
| return t2t_response, status_msg | |
| except Exception as e: | |
| error_msg = f"❌ Error performing T2T dialogue task: {str(e)}" | |
| print(error_msg) | |
| return "", error_msg | |
| def clear_spoken_dialogue_history(self): | |
| if not self.model_initialized: | |
| return None, "", "❌ Error: Model not initialized, please load the model first" | |
| try: | |
| self.spoken_dialogue_generator.clear_history() | |
| return None, "", "✅ Spoken dialogue history cleared successfully!" | |
| except Exception as e: | |
| error_msg = f"❌ Error clearing spoken dialogue history: {str(e)}" | |
| print(error_msg) | |
| return None, "", error_msg | |
| def clear_speech2text_dialogue_history(self): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| try: | |
| self.speech2text_dialogue_generator.clear_history() | |
| return "", "✅ Speech-to-text dialogue history cleared successfully!" | |
| except Exception as e: | |
| error_msg = f"❌ Error clearing S2T dialogue history: {str(e)}" | |
| print(error_msg) | |
| return "", error_msg | |
| def clear_text_dialogue_history(self): | |
| if not self.model_initialized: | |
| return "", "❌ Error: Model not initialized, please load the model first" | |
| try: | |
| self.text_dialogue_generator.clear_history() | |
| return "", "✅ Text dialogue history cleared successfully!" | |
| except Exception as e: | |
| error_msg = f"❌ Error clearing T2T dialogue history: {str(e)}" | |
| print(error_msg) | |
| return "", error_msg | |
    def create_interface(self):
        """Build and return the full Gradio Blocks UI.

        Layout: one tab per task (model configuration, audio understanding,
        TTS, spoken dialogue, S2T dialogue, T2T dialogue). All event wiring
        is done after the layout so every component already exists when it
        is referenced as an input/output.
        """
        with gr.Blocks(title="MiMo-Audio Multimodal Speech Processing System", theme=gr.themes.Soft()) as iface:
            gr.Markdown("# 🎵 MiMo-Audio Multimodal Speech Processing System")
            gr.Markdown("Supports audio understanding, text-to-speech, spoken dialogue, speech-to-text dialogue and text-to-text dialogue")
            with gr.Tabs():
                # ---- Tab: model configuration (paths + init button) ----
                with gr.TabItem("⚙️ Model Configuration"):
                    gr.Markdown("### 📋 Model initialization configuration")
                    with gr.Row():
                        with gr.Column():
                            # Empty textboxes mean "use the default path" —
                            # the click handler maps "" to None.
                            model_path = gr.Textbox(
                                label="Model path",
                                placeholder="Leave blank to use default path: ./models/MiMo-Audio-7B-Instruct",
                                lines=3
                            )
                            tokenizer_path = gr.Textbox(
                                label="Tokenizer path",
                                placeholder="Leave blank to use default path: ./models/MiMo-Audio-Tokenizer",
                                lines=3
                            )
                            init_btn = gr.Button("🔄 Initialize model", variant="primary", size="lg")
                        with gr.Column():
                            init_status = gr.Textbox(
                                label="Initialization status",
                                interactive=False,
                                lines=6,
                                placeholder="Click the initialize model button to start..."
                            )
                            gr.Markdown("### 💻 System information")
                            # Static snapshot of GPU availability at build time.
                            device_info = gr.Textbox(
                                label="Device information",
                                value=f"GPU available: {'Yes' if torch.cuda.is_available() else 'No'}",
                                interactive=False
                            )
                # ---- Tab: audio understanding (audio + question -> answer) ----
                with gr.TabItem("🔊 Audio Understanding"):
                    gr.Markdown("### 🎯 Audio Understanding")
                    with gr.Row():
                        with gr.Column():
                            audio_understanding_input_audio = gr.Audio(
                                label="Upload Audio File",
                                type="filepath",
                                interactive=True,
                            )
                            audio_understanding_input_text = gr.Textbox(
                                label="Input Question",
                                placeholder="Please input your question...",
                                lines=3,
                            )
                            audio_understanding_thinking = gr.Checkbox(
                                label="Enable Thinking Mode",
                                value=False,
                                info="Enable thinking mode, AI will perform a deeper analysis and thinking"
                            )
                            audio_understanding_generate_btn = gr.Button("🤖 Start Audio Understanding", variant="primary", size="lg")
                        with gr.Column():
                            audio_understanding_output_text = gr.Textbox(
                                label="Answer Result",
                                lines=8,
                                interactive=False,
                                placeholder="AI's answer will be displayed here...",
                                elem_id="audio_understanding_output_text"
                            )
                            audio_understanding_status = gr.Textbox(
                                label="Processing Status",
                                lines=6,
                                interactive=False,
                                placeholder="Processing status information will be displayed here..."
                            )
                            with gr.Row():
                                audio_understanding_copy_btn = gr.Button("📋 Copy Answer", size="sm")
                                audio_understanding_clear_btn = gr.Button("🗑️ Clear Result", size="sm")
                    gr.Markdown("### 🌟 Audio Understanding Examples")
                    # Examples only prefill the question text; audio stays None.
                    audio_understanding_examples = gr.Examples(
                        examples=[
                            [None, "这段音频的主要内容是什么?"],
                            [None, "说话者的情感状态如何?"],
                            [None, "音频中提到了哪些关键信息?"],
                            [None, "Please summarize the main points of this conversation."],
                            [None, "What viewpoint does the speaker want to express?"]
                        ],
                        inputs=[audio_understanding_input_audio, audio_understanding_input_text],
                        label="Click the example to automatically fill the question"
                    )
                # ---- Tab: text-to-speech (text [+ style instruction] -> audio) ----
                with gr.TabItem("🎵 Text-to-Speech"):
                    gr.Markdown("### 🎵 Text-to-Speech")
                    with gr.Row():
                        with gr.Column():
                            tts_input_text = gr.Textbox(
                                label="Input Text",
                                placeholder="Please input the text you want to convert to speech...",
                                lines=4,
                                max_lines=8
                            )
                            tts_instruct = gr.Textbox(
                                label="Style Description (Optional)",
                                placeholder="Please input the style description (optional)...",
                                lines=3,
                                max_lines=5
                            )
                            tts_use_instruct = gr.Checkbox(
                                label="Use Style Description",
                                value=True,
                                info="Enable to use InstructTTS for style-controlled speech generation"
                            )
                            tts_generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
                        with gr.Column():
                            tts_output_audio = gr.Audio(
                                label="Generated Speech",
                                type="filepath"
                            )
                            tts_status = gr.Textbox(
                                label="Generation Status",
                                lines=6,
                                interactive=False
                            )
                            # Hidden until a file is generated; the handler
                            # toggles visibility via gr.update.
                            tts_download_btn = gr.DownloadButton(
                                label="Download Generated Audio",
                                visible=False
                            )
                # ---- Tab: spoken dialogue (speech in -> speech + text out) ----
                with gr.TabItem("🎤 Spoken Dialogue"):
                    gr.Markdown("### 🎯 Spoken Dialogue")
                    with gr.Row():
                        with gr.Column():
                            dialogue_input_audio = gr.Audio(
                                label="Upload User Speech",
                                type="filepath",
                                interactive=True
                            )
                            system_prompt = gr.Textbox(
                                label="System Prompt (Optional)",
                                placeholder="e.g.: You are MiMo-Audio, a friendly AI assistant and your response needs to be concise.",
                                lines=1
                            )
                            prompt_speech = gr.Audio(
                                label="Prompt Speech (Optional, MiMo-Audio speaks with the same timbre as your prompt.)",
                                type="filepath",
                                interactive=True
                            )
                            spoken_dialogue_add_history = gr.Checkbox(
                                label="Enable History Record",
                                value=True,
                                info="Enable to remember the previous dialogue context"
                            )
                            with gr.Row():
                                dialogue_generate_btn = gr.Button("💬 Start Dialogue", variant="primary", size="lg")
                            with gr.Row():
                                dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
                        with gr.Column():
                            dialogue_output_audio = gr.Audio(
                                label="Output Audio",
                                type="filepath"
                            )
                            dialogue_output_text = gr.Textbox(
                                label="Dialogue Response",
                                lines=5,
                                interactive=False,
                            )
                            dialogue_status = gr.Textbox(
                                label="Dialogue Status",
                                lines=5,
                                interactive=False,
                            )
                # ---- Tab: speech-to-text dialogue (speech in -> text out) ----
                with gr.TabItem("💬 S2T Dialogue"):
                    gr.Markdown("### 🎯 S2T Dialogue")
                    with gr.Row():
                        with gr.Column():
                            s2t_dialogue_input_audio = gr.Audio(
                                label="Upload User Speech",
                                type="filepath",
                                interactive=True
                            )
                            s2t_dialogue_add_history = gr.Checkbox(
                                label="Enable History Record",
                                value=True,
                                info="Enable to remember the previous dialogue context"
                            )
                            s2t_dialogue_thinking = gr.Checkbox(
                                label="Enable Thinking Mode (think mode)",
                                value=False,
                                info="Enable to perform a deeper analysis and reasoning"
                            )
                            with gr.Row():
                                s2t_dialogue_generate_btn = gr.Button("🎧 Start S2T Dialogue", variant="primary", size="lg")
                            with gr.Row():
                                s2t_dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
                        with gr.Column():
                            s2t_dialogue_output_text = gr.Textbox(
                                label="Dialogue Response",
                                lines=8,
                                interactive=False,
                                placeholder="AI's dialogue response will be displayed here..."
                            )
                            s2t_dialogue_status = gr.Textbox(
                                label="Dialogue Status",
                                lines=5,
                                interactive=False,
                                placeholder="Dialogue status information will be displayed here..."
                            )
                # ---- Tab: text-to-text dialogue ----
                with gr.TabItem("📝 T2T Dialogue"):
                    gr.Markdown("### 🎯 T2T Dialogue")
                    with gr.Row():
                        with gr.Column():
                            t2t_dialogue_input_text = gr.Textbox(
                                label="Input Dialogue Content",
                                placeholder="Please input the text content you want to dialogue...",
                                lines=4,
                                max_lines=8
                            )
                            t2t_dialogue_add_history = gr.Checkbox(
                                label="Enable History Record",
                                value=True,
                                info="Enable to remember the previous dialogue context"
                            )
                            t2t_dialogue_thinking = gr.Checkbox(
                                label="Enable Thinking Mode (Thinking)",
                                value=False,
                                info="Enable thinking mode, AI will perform a deeper analysis and thinking"
                            )
                            with gr.Row():
                                t2t_dialogue_generate_btn = gr.Button("💬 Start T2T Dialogue", variant="primary", size="lg")
                            with gr.Row():
                                t2t_dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")
                        with gr.Column():
                            t2t_dialogue_output_text = gr.Textbox(
                                label="Dialogue Response",
                                lines=8,
                                interactive=False,
                                placeholder="AI's dialogue response will be displayed here..."
                            )
                            t2t_dialogue_status = gr.Textbox(
                                label="Dialogue Status",
                                lines=5,
                                interactive=False,
                                placeholder="Dialogue status information will be displayed here..."
                            )
                    gr.Markdown("### 🌟 T2T Dialogue Examples")
                    t2t_dialogue_examples = gr.Examples(
                        examples=[
                            ["Hello, how are you?"],
                            ["I want to know the history of the development of artificial intelligence"],
                            ["Please recommend some good movies"],
                            ["Can you help me explain the basic concepts of quantum physics?"],
                            ["I'm learning programming recently, any suggestions?"]
                        ],
                        inputs=[t2t_dialogue_input_text],
                        label="Click the example to automatically fill the dialogue content"
                    )

            # NOTE(review): defined but never wired — the copy button uses a JS
            # snippet directly instead of this Python callback.
            def copy_text_to_clipboard(text):
                return text

            # Resets the audio-understanding output/status pair.
            def clear_audio_understanding_results():
                return "", "🗑️ Audio Understanding Result Cleared"

            # ---- Event wiring ------------------------------------------------
            # Empty path textboxes are mapped to None so initialize_model falls
            # back to its built-in defaults.
            init_btn.click(
                fn=lambda path, tok_path: self.initialize_model(path or None, tok_path or None),
                inputs=[model_path, tokenizer_path],
                outputs=[init_status]
            )
            audio_understanding_generate_btn.click(
                fn=self.generate_audio_understanding_response,
                inputs=[audio_understanding_input_audio, audio_understanding_input_text, audio_understanding_thinking],
                outputs=[audio_understanding_output_text, audio_understanding_status]
            )
            # Copy runs purely client-side via the js= snippet (fn=None).
            audio_understanding_copy_btn.click(
                fn=None,
                inputs=[audio_understanding_output_text],
                js="(text) => {navigator.clipboard.writeText(text); alert('Copied to clipboard!')}"
            )
            tts_generate_btn.click(
                fn=self.generate_tts_audio,
                inputs=[tts_input_text, tts_instruct, tts_use_instruct],
                outputs=[tts_output_audio, tts_status, tts_download_btn]
            )
            dialogue_generate_btn.click(
                fn=self.generate_spoken_dialogue_response,
                inputs=[dialogue_input_audio, system_prompt, prompt_speech, spoken_dialogue_add_history],
                outputs=[dialogue_output_audio, dialogue_output_text, dialogue_status]
            )
            dialogue_clear_history_btn.click(
                fn=self.clear_spoken_dialogue_history,
                outputs=[dialogue_output_audio, dialogue_output_text, dialogue_status]
            )
            s2t_dialogue_generate_btn.click(
                fn=self.generate_speech2text_dialogue_response,
                inputs=[s2t_dialogue_input_audio, s2t_dialogue_thinking, s2t_dialogue_add_history],
                outputs=[s2t_dialogue_output_text, s2t_dialogue_status]
            )
            s2t_dialogue_clear_history_btn.click(
                fn=self.clear_speech2text_dialogue_history,
                outputs=[s2t_dialogue_output_text, s2t_dialogue_status]
            )
            t2t_dialogue_generate_btn.click(
                fn=self.generate_text_dialogue_response,
                inputs=[t2t_dialogue_input_text, t2t_dialogue_thinking, t2t_dialogue_add_history],
                outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
            )
            t2t_dialogue_clear_history_btn.click(
                fn=self.clear_text_dialogue_history,
                outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
            )
            audio_understanding_clear_btn.click(
                fn=clear_audio_understanding_results,
                outputs=[audio_understanding_output_text, audio_understanding_status]
            )
            # Enter-to-submit shortcuts mirroring the buttons above.
            tts_input_text.submit(
                fn=self.generate_tts_audio,
                inputs=[tts_input_text, tts_instruct, tts_use_instruct],
                outputs=[tts_output_audio, tts_status, tts_download_btn]
            )
            audio_understanding_input_text.submit(
                fn=self.generate_audio_understanding_response,
                inputs=[audio_understanding_input_audio, audio_understanding_input_text, audio_understanding_thinking],
                outputs=[audio_understanding_output_text, audio_understanding_status]
            )
            t2t_dialogue_input_text.submit(
                fn=self.generate_text_dialogue_response,
                inputs=[t2t_dialogue_input_text, t2t_dialogue_thinking, t2t_dialogue_add_history],
                outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
            )
        return iface
def main():
    """CLI entry point: parse flags, build the Gradio UI, launch the server."""
    parser = argparse.ArgumentParser(description="MiMo-Audio")
    parser.add_argument("--host", default="0.0.0.0", help="Server Address")
    parser.add_argument("--port", type=int, default=7897, help="Port")
    parser.add_argument("--share", action="store_true", help="Create Public Link")
    parser.add_argument("--debug", action="store_true", help="Debug Mode")
    args = parser.parse_args()

    print("🚀 Launch MiMo-Audio...")
    app = MultiModalSpeechInterface()
    print("🎨 Create Gradio Interface...")
    demo = app.create_interface()
    print(f"🌐 Launch Service - Address: {args.host}:{args.port}")
    demo.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
        debug=args.debug,
    )


if __name__ == "__main__":
    main()