# Spaces: mrfakename / Voice-Acting-TTS
# Running on Zero
# File size: 33,748 Bytes
# ef96930

# Copyright 2025 Xiaomi Corporation.
import gradio as gr
import torch
import os
import tempfile
import argparse
from pathlib import Path
from src.mimo_audio.mimo_audio import MimoAudio


class TTSGenerator:
    """Text-to-speech front end that delegates to ``model.tts_sft``."""

    def __init__(self, model, device=None):
        """Keep references to the shared model and the selected device."""
        self.device = device
        self.model = model

    def generate(self, text, instruct, output_audio_path):
        """Run ``model.tts_sft`` for ``text`` and return its result.

        The parent directory of ``output_audio_path`` is created first if it
        does not exist yet. ``instruct`` is forwarded unchanged.
        """
        Path(output_audio_path).parent.mkdir(parents=True, exist_ok=True)
        return self.model.tts_sft(text, output_audio_path, instruct)

class AudioUnderstandingGenerator:
    """Audio question-answering front end over ``model.audio_understanding_sft``."""

    def __init__(self, model, device=None):
        """Keep references to the shared model and the selected device."""
        self.device = device
        self.model = model

    def generate(self, input_speech, input_text, thinking=False):
        """Return the model's result for ``input_text`` asked about ``input_speech``."""
        return self.model.audio_understanding_sft(
            input_speech,
            input_text,
            thinking=thinking,
        )

class SpokenDialogueGenerator:
    """Speech-in/speech-out dialogue front end over ``model.spoken_dialogue_sft``."""

    def __init__(self, model, device=None):
        """Keep references to the shared model and the selected device."""
        self.device = device
        self.model = model

    def generate(self, input_speech, output_audio_path, system_prompt="You are MiMo-Audio, a friendly AI assistant and your response needs to be concise.", prompt_speech=None, add_history=False):
        """Run one spoken-dialogue turn and return the model's result.

        The parent directory of ``output_audio_path`` is created first if it
        does not exist yet; the remaining arguments are forwarded unchanged.
        """
        Path(output_audio_path).parent.mkdir(parents=True, exist_ok=True)
        options = {
            "system_prompt": system_prompt,
            "prompt_speech": prompt_speech,
            "add_history": add_history,
        }
        return self.model.spoken_dialogue_sft(input_speech, output_audio_path, **options)

    def clear_history(self):
        """Ask the underlying model to drop its dialogue history."""
        self.model.clear_history()

class Speech2TextDialogueGenerator:
    """Speech-in/text-out dialogue front end over ``model.speech2text_dialogue_sft``."""

    def __init__(self, model, device=None):
        """Keep references to the shared model and the selected device."""
        self.device = device
        self.model = model

    def generate(self, input_speech, thinking=False, add_history=False):
        """Run one dialogue turn on ``input_speech`` and return the model's result."""
        return self.model.speech2text_dialogue_sft(
            input_speech,
            thinking=thinking,
            add_history=add_history,
        )

    def clear_history(self):
        """Ask the underlying model to drop its dialogue history."""
        self.model.clear_history()


class TextDialogueGenerator:
    """Text-in/text-out dialogue front end over ``model.text_dialogue_sft``."""

    def __init__(self, model, device=None):
        """Keep references to the shared model and the selected device."""
        self.device = device
        self.model = model

    def generate(self, input_text, thinking=False, add_history=False):
        """Run one dialogue turn on ``input_text`` and return the model's result."""
        return self.model.text_dialogue_sft(
            input_text,
            thinking=thinking,
            add_history=add_history,
        )

    def clear_history(self):
        """Ask the underlying model to drop its dialogue history."""
        self.model.clear_history()


class MultiModalSpeechInterface:
    def __init__(self):
        self.model = None
        self.tts_generator = None
        self.audio_understanding_generator = None
        self.spoken_dialogue_generator = None
        self.speech2text_dialogue_generator = None
        self.text_dialogue_generator = None
        
        self.device = None
        self.model_initialized = False
        
    def initialize_model(self, model_path=None, tokenizer_path=None):

        try:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            
            if model_path is None:
                model_path = "./models/MiMo-Audio-7B-Instruct"
            if tokenizer_path is None:
                tokenizer_path = "./models/MiMo-Audio-Tokenizer"
            

            print(f"Model path: {model_path}")
            print(f"Tokenizer path: {tokenizer_path}")
            
            self.model = MimoAudio(model_path, tokenizer_path)
            self.tts_generator = TTSGenerator(self.model, self.device)
            self.audio_understanding_generator = AudioUnderstandingGenerator(self.model, self.device)
            self.spoken_dialogue_generator = SpokenDialogueGenerator(self.model, self.device)
            self.speech2text_dialogue_generator = Speech2TextDialogueGenerator(self.model, self.device)
            self.text_dialogue_generator = TextDialogueGenerator(self.model, self.device)
            
            
            self.model_initialized = True
            print("Model loaded successfully!")
            return "✅ Model loaded successfully!"
            
        except Exception as e:
            error_msg = f"❌ Model loading failed: {str(e)}"
            print(error_msg)
            return error_msg

    def generate_tts_audio(self, input_text, instruct="", use_instruct=False):
        """Synthesize speech for ``input_text`` into a temporary ``.wav`` file.

        Args:
            input_text: Text to convert to speech; ``None`` or blank text is
                rejected with an error status.
            instruct: Optional style description for the TTS generator.
            use_instruct: When false, or when ``instruct`` is blank, no style
                description is forwarded (``None`` is passed instead).

        Returns:
            A 3-tuple ``(audio_path, status_message, download_update)`` on
            every path, so the handler always supplies one value for each of
            the three Gradio outputs it is wired to. ``audio_path`` is
            ``None`` and the download button is hidden whenever no audio was
            produced.
        """
        if not self.model_initialized:
            return None, "❌ Error: Model not initialized, please load the model first", gr.update(visible=False)

        if not input_text or not input_text.strip():
            return None, "❌ Error: Please input text content", gr.update(visible=False)

        # Forward the style description only when it is enabled and non-blank.
        style = instruct if (use_instruct and instruct and instruct.strip()) else None

        output_path = None
        try:
            # delete=False: only the file name is needed here; the file must
            # still exist after the context manager closes it.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                output_path = tmp_file.name

            print(f"Generating TTS audio: {input_text}")

            text_channel = self.tts_generator.generate(input_text, style, output_path)
            status_msg = f"✅ TTS audio generated successfully!\n📝 Input text: {input_text}"
            if style is not None:
                status_msg += f"\n🎭 Style description: {style}"
            status_msg += f"\n🎵 Output text channel: {text_channel}"

            return output_path, status_msg, gr.update(value=output_path, visible=True)

        except Exception as e:
            # Nothing will reference the temp file after a failure, so remove
            # it instead of leaking it; a failed removal must not mask the
            # original error.
            if output_path is not None:
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
            error_msg = f"❌ Error generating TTS audio: {str(e)}"
            print(error_msg)
            return None, error_msg, gr.update(visible=False)


    def generate_audio_understanding_response(self, input_audio, input_text, thinking=False):
        if not self.model_initialized:
            return "", "❌ Error: Model not initialized, please load the model first"
        
        if input_audio is None and not input_text.strip():
            return "", "❌ Error: Please provide either audio input or text question"
        
        if input_audio is None:
            return "", "❌ Error: Please upload an audio file for Audio Understanding task"
        
        if not input_text.strip():
            return "", "❌ Error: Please input your question"
        
        try:
            print(f"Performing Audio Understanding task:")
            print(f"Audio input: {input_audio}")
            print(f"Text question: {input_text}")
            
            
            audio_understanding_response = self.audio_understanding_generator.generate(input_audio, input_text.strip(), thinking=thinking)
            
            status_msg = f"✅ Audio Understanding task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n❓ Question: {input_text}\n💬 Answer: {audio_understanding_response}"
            
            return audio_understanding_response, status_msg
            
        except Exception as e:
            error_msg = f"❌ Error performing Audio Understanding task: {str(e)}"
            print(error_msg)
            return "", error_msg

    def generate_spoken_dialogue_response(self, input_audio, system_prompt=None, prompt_speech=None, add_history=False):
        if not self.model_initialized:
            return "", "❌ Error: Model not initialized, please load the model first"
        
        if input_audio is None:
            return "", "❌ Error: Please upload an audio file"
        
        try:
            
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                output_audio_path = tmp_file.name
            
            print(f"Performing spoken dialogue task:")
            print(f"Audio input: {input_audio}")
            print(f"Output path: {output_audio_path}")
            
            
            dialogue_response = self.spoken_dialogue_generator.generate(input_audio, output_audio_path, system_prompt=system_prompt, prompt_speech=prompt_speech, add_history=add_history)
            
            status_msg = f"✅ Spoken dialogue task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n💬 Response: {dialogue_response[:300]}..."
            
            return output_audio_path, dialogue_response, status_msg
            
        except Exception as e:
            error_msg = f"❌ Error performing spoken dialogue task: {str(e)}"
            print(error_msg)
            return None, None, error_msg


    def generate_speech2text_dialogue_response(self, input_audio, thinking=False, add_history=False):
        if not self.model_initialized:
            return "", "❌ Error: Model not initialized, please load the model first"
        
        if input_audio is None:
            return "", "❌ Error: Please upload an audio file for S2T Dialogue task"
        
        
        try:
            print(f"Performing S2T Dialogue task:")
            print(f"Audio input: {input_audio}")
            
            
            s2t_response = self.speech2text_dialogue_generator.generate(input_audio, thinking=thinking, add_history=add_history)
            
            status_msg = f"✅ S2T dialogue task completed successfully!\n🎵 Audio input: {os.path.basename(input_audio)}\n❓💬 Answer: {s2t_response}"
            
            return s2t_response, status_msg
            
        except Exception as e:
            error_msg = f"❌ Error performing QA task: {str(e)}"
            print(error_msg)
            return "", error_msg

    def generate_text_dialogue_response(self, input_text, thinking=False, add_history=False):
        if not self.model_initialized:
            return "", "❌ Error: Model not initialized, please load the model first"
        
        if not input_text or not input_text.strip():
            return "", "❌ Error: Please input your text"
        
        try:
            print(f"Performing Text Dialogue task:")
            print(f"Text input: {input_text}")
            print(f"Thinking mode: {thinking}")
            print(f"Add history: {add_history}")
            
            
            t2t_response = self.text_dialogue_generator.generate(input_text.strip(), thinking=thinking, add_history=add_history)
            
            status_msg = f"✅ T2T dialogue task completed successfully!\n💬 Input: {input_text}"
            if thinking:
                status_msg += f"\n🧠 Thinking mode: Enabled"
            status_msg += f"\n💬 Answer: {t2t_response}"
            
            return t2t_response, status_msg
            
        except Exception as e:
            error_msg = f"❌ Error performing T2T dialogue task: {str(e)}"
            print(error_msg)
            return "", error_msg

    def clear_spoken_dialogue_history(self):
        if not self.model_initialized:
            return None, "", "❌ Error: Model not initialized, please load the model first"
        
        try:
            self.spoken_dialogue_generator.clear_history()
            return None, "", "✅ Spoken dialogue history cleared successfully!"
        except Exception as e:
            error_msg = f"❌ Error clearing spoken dialogue history: {str(e)}"
            print(error_msg)
            return None, "", error_msg
    
    def clear_speech2text_dialogue_history(self):
        if not self.model_initialized:
            return "", "❌ Error: Model not initialized, please load the model first"
        
        try:
            self.speech2text_dialogue_generator.clear_history()
            return "", "✅ Speech-to-text dialogue history cleared successfully!"
        except Exception as e:
            error_msg = f"❌ Error clearing S2T dialogue history: {str(e)}"
            print(error_msg)
            return "", error_msg

    def clear_text_dialogue_history(self):
        if not self.model_initialized:
            return "", "❌ Error: Model not initialized, please load the model first"
        
        try:
            self.text_dialogue_generator.clear_history()
            return "", "✅ Text dialogue history cleared successfully!"
        except Exception as e:
            error_msg = f"❌ Error clearing T2T dialogue history: {str(e)}"
            print(error_msg)
            return "", error_msg



    def create_interface(self):
        """Build the Gradio UI and wire its events to this object's handlers.

        The layout is one ``gr.Blocks`` containing a model-configuration tab
        plus one tab per task (audio understanding, text-to-speech, spoken
        dialogue, S2T dialogue, T2T dialogue). Event wiring happens after all
        components exist.

        Returns:
            The constructed ``gr.Blocks`` object. It is not launched here.
        """

        with gr.Blocks(title="MiMo-Audio Multimodal Speech Processing System", theme=gr.themes.Soft()) as iface:
            gr.Markdown("# 🎵 MiMo-Audio Multimodal Speech Processing System")
            gr.Markdown("Supports audio understanding, text-to-speech, spoken dialogue, speech-to-text dialogue and text-to-text dialogue")

            with gr.Tabs():

                # ---- Tab: model configuration --------------------------------
                with gr.TabItem("⚙️ Model Configuration"):
                    gr.Markdown("### 📋 Model initialization configuration")

                    with gr.Row():
                        # Left column: paths and the initialize button.
                        with gr.Column():

                            model_path = gr.Textbox(
                                label="Model path",
                                placeholder="Leave blank to use default path: ./models/MiMo-Audio-7B-Instruct",
                                lines=3
                            )

                            tokenizer_path = gr.Textbox(
                                label="Tokenizer path",
                                placeholder="Leave blank to use default path: ./models/MiMo-Audio-Tokenizer",
                                lines=3
                            )

                            init_btn = gr.Button("🔄 Initialize model", variant="primary", size="lg")

                        # Right column: read-only status and device info.
                        with gr.Column():
                            init_status = gr.Textbox(
                                label="Initialization status",
                                interactive=False,
                                lines=6,
                                placeholder="Click the initialize model button to start..."
                            )


                            gr.Markdown("### 💻 System information")
                            # Evaluated once, when the interface is built.
                            device_info = gr.Textbox(
                                label="Device information",
                                value=f"GPU available: {'Yes' if torch.cuda.is_available() else 'No'}",
                                interactive=False
                            )


                # ---- Tab: audio understanding --------------------------------
                with gr.TabItem("🔊 Audio Understanding"):
                    gr.Markdown("### 🎯 Audio Understanding")

                    with gr.Row():
                        # Left column: inputs.
                        with gr.Column():
                            audio_understanding_input_audio = gr.Audio(
                                label="Upload Audio File",
                                type="filepath",
                                interactive=True,
                            )

                            audio_understanding_input_text = gr.Textbox(
                                label="Input Question",
                                placeholder="Please input your question...",
                                lines=3,
                            )

                            audio_understanding_thinking = gr.Checkbox(
                                label="Enable Thinking Mode",
                                value=False,
                                info="Enable thinking mode, AI will perform a deeper analysis and thinking"
                            )

                            audio_understanding_generate_btn = gr.Button("🤖 Start Audio Understanding", variant="primary", size="lg")



                        # Right column: outputs plus copy/clear buttons.
                        with gr.Column():
                            audio_understanding_output_text = gr.Textbox(
                                label="Answer Result",
                                lines=8,
                                interactive=False,
                                placeholder="AI's answer will be displayed here...",
                                elem_id="audio_understanding_output_text"
                            )

                            audio_understanding_status = gr.Textbox(
                                label="Processing Status",
                                lines=6,
                                interactive=False,
                                placeholder="Processing status information will be displayed here..."
                            )

                            with gr.Row():
                                audio_understanding_copy_btn = gr.Button("📋 Copy Answer", size="sm")
                                audio_understanding_clear_btn = gr.Button("🗑️ Clear Result", size="sm")

                    gr.Markdown("### 🌟 Audio Understanding Examples")
                    # Each example supplies only a question; the audio slot is None.
                    audio_understanding_examples = gr.Examples(
                        examples=[
                            [None, "这段音频的主要内容是什么？"],
                            [None, "说话者的情感状态如何？"],
                            [None, "音频中提到了哪些关键信息？"],
                            [None, "Please summarize the main points of this conversation."],
                            [None, "What viewpoint does the speaker want to express?"]
                        ],
                        inputs=[audio_understanding_input_audio, audio_understanding_input_text],
                        label="Click the example to automatically fill the question"
                    )




                # ---- Tab: text-to-speech -------------------------------------
                with gr.TabItem("🎵 Text-to-Speech"):
                    gr.Markdown("### 🎵 Text-to-Speech")

                    with gr.Row():
                        # Left column: inputs.
                        with gr.Column():

                            tts_input_text = gr.Textbox(
                                label="Input Text",
                                placeholder="Please input the text you want to convert to speech...",
                                lines=4,
                                max_lines=8
                            )

                            tts_instruct = gr.Textbox(
                                label="Style Description (Optional)",
                                placeholder="Please input the style description (optional)...",
                                lines=3,
                                max_lines=5
                            )

                            tts_use_instruct = gr.Checkbox(
                                label="Use Style Description",
                                value=True,
                                info="Enable to use InstructTTS for style-controlled speech generation"
                            )

                            tts_generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

                        # Right column: outputs.
                        with gr.Column():

                            tts_output_audio = gr.Audio(
                                label="Generated Speech",
                                type="filepath"
                            )

                            tts_status = gr.Textbox(
                                label="Generation Status",
                                lines=6,
                                interactive=False
                            )


                            # Starts hidden; it is the third output of the TTS
                            # handler wired below, which controls its visibility.
                            tts_download_btn = gr.DownloadButton(
                                label="Download Generated Audio",
                                visible=False
                            )




                # ---- Tab: spoken dialogue ------------------------------------
                with gr.TabItem("🎤 Spoken Dialogue"):
                    gr.Markdown("### 🎯 Spoken Dialogue")

                    with gr.Row():
                        # Left column: inputs.
                        with gr.Column():

                            dialogue_input_audio = gr.Audio(
                                label="Upload User Speech",
                                type="filepath",
                                interactive=True
                            )
                            system_prompt = gr.Textbox(
                                label="System Prompt (Optional)",
                                placeholder="e.g.: You are MiMo-Audio, a friendly AI assistant and your response needs to be concise.",
                                lines=1
                            )
                            prompt_speech = gr.Audio(
                                label="Prompt Speech (Optional, MiMo-Audio speaks with the same timbre as your prompt.)",
                                type="filepath",
                                interactive=True
                            )
                            spoken_dialogue_add_history = gr.Checkbox(
                                label="Enable History Record",
                                value=True,
                                info="Enable to remember the previous dialogue context"
                            )

                            with gr.Row():
                                dialogue_generate_btn = gr.Button("💬 Start Dialogue", variant="primary", size="lg")

                            with gr.Row():
                                dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")





                        # Right column: outputs.
                        with gr.Column():

                            dialogue_output_audio = gr.Audio(
                                label="Output Audio",
                                type="filepath"
                            )
                            dialogue_output_text = gr.Textbox(
                                label="Dialogue Response",
                                lines=5,
                                interactive=False,
                            )
                            dialogue_status = gr.Textbox(
                                label="Dialogue Status",
                                lines=5,
                                interactive=False,
                            )





                # ---- Tab: speech-to-text dialogue ----------------------------
                with gr.TabItem("💬 S2T Dialogue"):
                    gr.Markdown("### 🎯 S2T Dialogue")

                    with gr.Row():
                        # Left column: inputs.
                        with gr.Column():

                            s2t_dialogue_input_audio = gr.Audio(
                                label="Upload User Speech",
                                type="filepath",
                                interactive=True
                            )


                            s2t_dialogue_add_history = gr.Checkbox(
                                label="Enable History Record",
                                value=True,
                                info="Enable to remember the previous dialogue context"
                            )

                            s2t_dialogue_thinking = gr.Checkbox(
                                label="Enable Thinking Mode (think mode)",
                                value=False,
                                info="Enable to perform a deeper analysis and reasoning"
                            )

                            with gr.Row():
                                s2t_dialogue_generate_btn = gr.Button("🎧 Start S2T Dialogue", variant="primary", size="lg")

                            with gr.Row():
                                s2t_dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")


                        # Right column: outputs.
                        with gr.Column():

                            s2t_dialogue_output_text = gr.Textbox(
                                label="Dialogue Response",
                                lines=8,
                                interactive=False,
                                placeholder="AI's dialogue response will be displayed here..."
                            )

                            s2t_dialogue_status = gr.Textbox(
                                label="Dialogue Status",
                                lines=5,
                                interactive=False,
                                placeholder="Dialogue status information will be displayed here..."
                            )



                # ---- Tab: text-to-text dialogue ------------------------------
                with gr.TabItem("📝 T2T Dialogue"):
                    gr.Markdown("### 🎯 T2T Dialogue")

                    with gr.Row():
                        # Left column: inputs.
                        with gr.Column():

                            t2t_dialogue_input_text = gr.Textbox(
                                label="Input Dialogue Content",
                                placeholder="Please input the text content you want to dialogue...",
                                lines=4,
                                max_lines=8
                            )

                            t2t_dialogue_add_history = gr.Checkbox(
                                label="Enable History Record",
                                value=True,
                                info="Enable to remember the previous dialogue context"
                            )

                            t2t_dialogue_thinking = gr.Checkbox(
                                label="Enable Thinking Mode (Thinking)",
                                value=False,
                                info="Enable thinking mode, AI will perform a deeper analysis and thinking"
                            )

                            with gr.Row():
                                t2t_dialogue_generate_btn = gr.Button("💬 Start T2T Dialogue", variant="primary", size="lg")

                            with gr.Row():
                                t2t_dialogue_clear_history_btn = gr.Button("🗑️ Clear Dialogue History", size="sm", variant="secondary")



                        # Right column: outputs.
                        with gr.Column():
                            t2t_dialogue_output_text = gr.Textbox(
                                label="Dialogue Response",
                                lines=8,
                                interactive=False,
                                placeholder="AI's dialogue response will be displayed here..."
                            )

                            t2t_dialogue_status = gr.Textbox(
                                label="Dialogue Status",
                                lines=5,
                                interactive=False,
                                placeholder="Dialogue status information will be displayed here..."
                            )

                    gr.Markdown("### 🌟 T2T Dialogue Examples")
                    t2t_dialogue_examples = gr.Examples(
                        examples=[
                            ["Hello, how are you?"],
                            ["I want to know the history of the development of artificial intelligence"],
                            ["Please recommend some good movies"],
                            ["Can you help me explain the basic concepts of quantum physics?"],
                            ["I'm learning programming recently, any suggestions?"]
                        ],
                        inputs=[t2t_dialogue_input_text],
                        label="Click the example to automatically fill the dialogue content"
                    )



            # ---- Local helpers used by the event wiring below ----------------

            # NOTE(review): not referenced anywhere else in this method; the
            # copy button below passes fn=None and a js snippet instead.
            def copy_text_to_clipboard(text):
                return text

            # Resets the answer box and replaces the status with a notice.
            def clear_audio_understanding_results():
                return "", "🗑️ Audio Understanding Result Cleared"


            # ---- Event wiring -------------------------------------------------

            # Empty textbox values are converted to None so initialize_model
            # falls back to its default paths.
            init_btn.click(
                fn=lambda path, tok_path: self.initialize_model(path or None, tok_path or None),
                inputs=[model_path, tokenizer_path],
                outputs=[init_status]
            )


            audio_understanding_generate_btn.click(
                fn=self.generate_audio_understanding_response,
                inputs=[audio_understanding_input_audio, audio_understanding_input_text, audio_understanding_thinking],
                outputs=[audio_understanding_output_text, audio_understanding_status]
            )

            # fn=None plus a js snippet: presumably handled entirely in the
            # browser - confirm against the Gradio version in use.
            audio_understanding_copy_btn.click(
                fn=None,
                inputs=[audio_understanding_output_text],
                js="(text) => {navigator.clipboard.writeText(text); alert('Copied to clipboard!')}"
            )

            # Three outputs: generated audio, status text, download button.
            tts_generate_btn.click(
                fn=self.generate_tts_audio,
                inputs=[tts_input_text, tts_instruct, tts_use_instruct],
                outputs=[tts_output_audio, tts_status, tts_download_btn]
            )

            # Three outputs: reply audio, reply text, status text.
            dialogue_generate_btn.click(
                fn=self.generate_spoken_dialogue_response,
                inputs=[dialogue_input_audio, system_prompt, prompt_speech, spoken_dialogue_add_history],
                outputs=[dialogue_output_audio, dialogue_output_text, dialogue_status]
            )



            # The clear-history handlers take no inputs; their return values
            # overwrite the listed output components.
            dialogue_clear_history_btn.click(
                fn=self.clear_spoken_dialogue_history,
                outputs=[dialogue_output_audio, dialogue_output_text, dialogue_status]
            )


            # Input order matches the handler signature: (audio, thinking, add_history).
            s2t_dialogue_generate_btn.click(
                fn=self.generate_speech2text_dialogue_response,
                inputs=[s2t_dialogue_input_audio, s2t_dialogue_thinking, s2t_dialogue_add_history],
                outputs=[s2t_dialogue_output_text, s2t_dialogue_status]
            )



            s2t_dialogue_clear_history_btn.click(
                fn=self.clear_speech2text_dialogue_history,
                outputs=[s2t_dialogue_output_text, s2t_dialogue_status]
            )


            # Input order matches the handler signature: (text, thinking, add_history).
            t2t_dialogue_generate_btn.click(
                fn=self.generate_text_dialogue_response,
                inputs=[t2t_dialogue_input_text, t2t_dialogue_thinking, t2t_dialogue_add_history],
                outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
            )


            t2t_dialogue_clear_history_btn.click(
                fn=self.clear_text_dialogue_history,
                outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
            )




            audio_understanding_clear_btn.click(
                fn=clear_audio_understanding_results,
                outputs=[audio_understanding_output_text, audio_understanding_status]
            )






            # Textbox submit events reuse the same handler, inputs and outputs
            # as the corresponding buttons above.
            tts_input_text.submit(
                fn=self.generate_tts_audio,
                inputs=[tts_input_text, tts_instruct, tts_use_instruct],
                outputs=[tts_output_audio, tts_status, tts_download_btn]
            )


            audio_understanding_input_text.submit(
                fn=self.generate_audio_understanding_response,
                inputs=[audio_understanding_input_audio, audio_understanding_input_text, audio_understanding_thinking],
                outputs=[audio_understanding_output_text, audio_understanding_status]
            )

            t2t_dialogue_input_text.submit(
                fn=self.generate_text_dialogue_response,
                inputs=[t2t_dialogue_input_text, t2t_dialogue_thinking, t2t_dialogue_add_history],
                outputs=[t2t_dialogue_output_text, t2t_dialogue_status]
            )


        return iface

def main():
    """Command-line entry point: parse options, build the UI and serve it."""
    cli = argparse.ArgumentParser(description="MiMo-Audio")
    cli.add_argument("--host", default="0.0.0.0", help="Server Address")
    cli.add_argument("--port", type=int, default=7897, help="Port")
    # The two boolean switches share the same shape, so register them together.
    for flag, help_text in (("--share", "Create Public Link"), ("--debug", "Debug Mode")):
        cli.add_argument(flag, action="store_true", help=help_text)
    args = cli.parse_args()

    print("🚀 Launch MiMo-Audio...")
    speech_interface = MultiModalSpeechInterface()

    # The model is not loaded here; only the interface object is created.
    print("🎨 Create Gradio Interface...")
    iface = speech_interface.create_interface()

    print(f"🌐 Launch Service - Address: {args.host}:{args.port}")
    launch_options = {
        "server_name": args.host,
        "server_port": args.port,
        "share": args.share,
        "debug": args.debug,
    }
    iface.launch(**launch_options)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()