Spaces:

nsfwalex
/

whisper-transcribe-new

Runtime error

App Files Community

liuyang committed on Jul 4

Commit

e045021

1 Parent(s): c0cf9b3

update requirements.txt

Browse files

Files changed (2) hide show

app.py +18 -51
requirements.txt +4 -0

app.py CHANGED Viewed

@@ -14,7 +14,6 @@ from transformers import pipeline
 from pyannote.audio import Pipeline
 import requests
 import base64
-from typing import List, Dict, Any, Optional, Tuple
 # Install flash attention for acceleration
 try:
@@ -59,7 +58,7 @@ class WhisperTranscriber:
                 print(f"Could not load diarization model: {e}")
                 self.diarization_model = None
-    def convert_audio_format(self, audio_path: str) -> str:
         """Convert audio to 16kHz mono WAV format"""
         temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         temp_wav_path = temp_wav.name
@@ -76,13 +75,7 @@ class WhisperTranscriber:
             raise RuntimeError(f"Audio conversion failed: {e}")
     @spaces.GPU
-    def transcribe_audio(
-        self,
-        audio_path: str,
-        language: Optional[str] = None,
-        translate: bool = False,
-        prompt: Optional[str] = None
-    ) -> Tuple[List[Dict], str]:
         """Transcribe audio using Whisper with flash attention"""
         if self.pipe is None:
             self.setup_models()
@@ -132,11 +125,7 @@ class WhisperTranscriber:
         return segments, detected_language
     @spaces.GPU
-    def perform_diarization(
-        self,
-        audio_path: str,
-        num_speakers: Optional[int] = None
-    ) -> Tuple[List[Dict], int]:
         """Perform speaker diarization"""
         if self.diarization_model is None:
             print("Diarization model not available, assigning single speaker")
@@ -173,11 +162,7 @@ class WhisperTranscriber:
         return diarize_segments, detected_num_speakers
-    def merge_transcription_and_diarization(
-        self,
-        transcription_segments: List[Dict],
-        diarization_segments: List[Dict]
-    ) -> List[Dict]:
         """Merge transcription segments with speaker information"""
         if not diarization_segments:
             # No diarization available, assign single speaker
@@ -214,12 +199,7 @@ class WhisperTranscriber:
         return final_segments
-    def group_segments_by_speaker(
-        self,
-        segments: List[Dict],
-        max_gap: float = 1.0,
-        max_duration: float = 30.0
-    ) -> List[Dict]:
         """Group consecutive segments from the same speaker"""
         if not segments:
             return segments
@@ -260,15 +240,8 @@ class WhisperTranscriber:
         return grouped_segments
     @spaces.GPU
-    def process_audio(
-        self,
-        audio_file,
-        num_speakers: Optional[int] = None,
-        language: Optional[str] = None,
-        translate: bool = False,
-        prompt: Optional[str] = None,
-        group_segments: bool = True
-    ) -> Dict[str, Any]:
         """Main processing function"""
         if audio_file is None:
             return {"error": "No audio file provided"}
@@ -318,7 +291,7 @@ class WhisperTranscriber:
 # Initialize transcriber
 transcriber = WhisperTranscriber()
-def format_segments_for_display(result: Dict[str, Any]) -> str:
     """Format segments for display in Gradio"""
     if "error" in result:
         return f"❌ Error: {result['error']}"
@@ -345,21 +318,14 @@ def format_segments_for_display(result: Dict[str, Any]) -> str:
     return output
-def process_audio_gradio(
-    audio_file,
-    num_speakers,
-    language,
-    translate,
-    prompt,
-    group_segments
-):
     """Gradio interface function"""
     result = transcriber.process_audio(
         audio_file=audio_file,
         num_speakers=num_speakers if num_speakers > 0 else None,
         language=language if language != "auto" else None,
         translate=translate,
-        prompt=prompt if prompt.strip() else None,
         group_segments=group_segments
     )
@@ -367,10 +333,12 @@ def process_audio_gradio(
     return formatted_output, result
 # Create Gradio interface
-with gr.Blocks(
     title="🎙️ Whisper Transcription with Speaker Diarization",
-    theme=gr.themes.Soft()
-) as demo:
     gr.Markdown("""
     # 🎙️ Advanced Audio Transcription & Speaker Diarization
@@ -385,7 +353,7 @@ with gr.Blocks(
             audio_input = gr.Audio(
                 label="🎵 Upload Audio File",
                 type="filepath",
-                sources=["upload", "microphone"]
             )
             with gr.Accordion("⚙️ Advanced Settings", open=False):
@@ -419,7 +387,7 @@ with gr.Blocks(
                     value=True
                 )
-            process_btn = gr.Button("🚀 Transcribe Audio", variant="primary", size="lg")
         with gr.Column():
             output_text = gr.Markdown(
@@ -443,8 +411,7 @@ with gr.Blocks(
             prompt,
             group_segments
         ],
-        outputs=[output_text, output_json],
-        show_progress=True
     )
     # Examples

 from pyannote.audio import Pipeline
 import requests
 import base64
 # Install flash attention for acceleration
 try:
                 print(f"Could not load diarization model: {e}")
                 self.diarization_model = None
+    def convert_audio_format(self, audio_path):
         """Convert audio to 16kHz mono WAV format"""
         temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         temp_wav_path = temp_wav.name
             raise RuntimeError(f"Audio conversion failed: {e}")
     @spaces.GPU
+    def transcribe_audio(self, audio_path, language=None, translate=False, prompt=None):
         """Transcribe audio using Whisper with flash attention"""
         if self.pipe is None:
             self.setup_models()
         return segments, detected_language
     @spaces.GPU
+    def perform_diarization(self, audio_path, num_speakers=None):
         """Perform speaker diarization"""
         if self.diarization_model is None:
             print("Diarization model not available, assigning single speaker")
         return diarize_segments, detected_num_speakers
+    def merge_transcription_and_diarization(self, transcription_segments, diarization_segments):
         """Merge transcription segments with speaker information"""
         if not diarization_segments:
             # No diarization available, assign single speaker
         return final_segments
+    def group_segments_by_speaker(self, segments, max_gap=1.0, max_duration=30.0):
         """Group consecutive segments from the same speaker"""
         if not segments:
             return segments
         return grouped_segments
     @spaces.GPU
+    def process_audio(self, audio_file, num_speakers=None, language=None,
+                     translate=False, prompt=None, group_segments=True):
         """Main processing function"""
         if audio_file is None:
             return {"error": "No audio file provided"}
 # Initialize transcriber
 transcriber = WhisperTranscriber()
+def format_segments_for_display(result):
     """Format segments for display in Gradio"""
     if "error" in result:
         return f"❌ Error: {result['error']}"
     return output
+def process_audio_gradio(audio_file, num_speakers, language, translate, prompt, group_segments):
     """Gradio interface function"""
     result = transcriber.process_audio(
         audio_file=audio_file,
         num_speakers=num_speakers if num_speakers > 0 else None,
         language=language if language != "auto" else None,
         translate=translate,
+        prompt=prompt if prompt and prompt.strip() else None,
         group_segments=group_segments
     )
     return formatted_output, result
 # Create Gradio interface
+demo = gr.Blocks(
     title="🎙️ Whisper Transcription with Speaker Diarization",
+    theme="default"
+)
+with demo:
     gr.Markdown("""
     # 🎙️ Advanced Audio Transcription & Speaker Diarization
             audio_input = gr.Audio(
                 label="🎵 Upload Audio File",
                 type="filepath",
+                source="upload"
             )
             with gr.Accordion("⚙️ Advanced Settings", open=False):
                     value=True
                 )
+            process_btn = gr.Button("🚀 Transcribe Audio", variant="primary")
         with gr.Column():
             output_text = gr.Markdown(
             prompt,
             group_segments
         ],
+        outputs=[output_text, output_json]
     )
     # Examples

requirements.txt CHANGED Viewed

@@ -4,6 +4,10 @@ torchaudio>=2.0.0
 transformers>=4.35.0
 accelerate>=0.24.0
 # Audio processing and transcription
 ffmpeg-python>=0.2.0
 librosa>=0.10.0

 transformers>=4.35.0
 accelerate>=0.24.0
+# Gradio and Spaces - matching SDK version 4.44.1
+gradio==4.44.1
+spaces>=0.19.0
 # Audio processing and transcription
 ffmpeg-python>=0.2.0
 librosa>=0.10.0