liuyang committed on
Commit 8c68b8b · 1 Parent(s): d441278

Add full audio transcription functionality and update Gradio interface

Files changed (1)
  1. app.py +146 -27
app.py CHANGED
@@ -52,7 +52,6 @@ except OSError as e:
 _whisper = None
 _diarizer = None
 
-
 # Create global diarization pipeline
 try:
     print("Loading diarization model...")
@@ -63,17 +62,8 @@ try:
     _diarizer = Pipeline.from_pretrained(
         "pyannote/speaker-diarization-3.1",
         use_auth_token=os.getenv("HF_TOKEN"),
-        #torch_dtype=torch.float16,
     ).to(torch.device("cuda"))
-    '''
-    _diarizer.model.half()  # FP16
-
-    for m in _diarizer.model.modules():  # compact LSTM weights
-        if isinstance(m, torch.nn.LSTM):
-            m.flatten_parameters()
-
-    _diarizer.model = torch.compile(_diarizer.model, mode="reduce-overhead")
-    '''
+
     print("Diarization model loaded successfully")
 except Exception as e:
     import traceback
@@ -116,6 +106,68 @@ class WhisperTranscriber:
         except subprocess.CalledProcessError as e:
             raise RuntimeError(f"Audio conversion failed: {e}")
 
+    @spaces.GPU  # each call gets a GPU slice
+    def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None):
+        """Transcribe the entire audio file without speaker diarization"""
+        whisper, _ = _load_models()  # models live on the GPU
+
+        print("Transcribing full audio...")
+        start_time = time.time()
+
+        # Prepare options
+        options = dict(
+            language=language,
+            beam_size=5,
+            vad_filter=True,
+            vad_parameters=VadOptions(
+                max_speech_duration_s=whisper.feature_extractor.chunk_length,
+                min_speech_duration_ms=100,
+                speech_pad_ms=100,
+                threshold=0.25,
+                neg_threshold=0.2,
+            ),
+            word_timestamps=True,
+            initial_prompt=prompt,
+            language_detection_segments=1,
+            task="translate" if translate else "transcribe",
+        )
+
+        # Transcribe the entire audio
+        segments, transcript_info = whisper.transcribe(audio_path, **options)
+        segments = list(segments)
+
+        detected_language = transcript_info.language
+
+        # Process segments
+        results = []
+        for seg in segments:
+            # Create result entry with detailed format
+            words_list = []
+            if seg.words:
+                for word in seg.words:
+                    words_list.append({
+                        "start": float(word.start),
+                        "end": float(word.end),
+                        "word": word.word,
+                        "probability": word.probability,
+                        "speaker": "SPEAKER_00"  # No speaker identification in full transcription
+                    })
+
+            results.append({
+                "start": float(seg.start),
+                "end": float(seg.end),
+                "text": seg.text,
+                "speaker": "SPEAKER_00",  # Single speaker assumption
+                "avg_logprob": seg.avg_logprob,
+                "words": words_list,
+                "duration": float(seg.end - seg.start)
+            })
+
+        transcription_time = time.time() - start_time
+        print(f"Full audio transcribed in {transcription_time:.2f} seconds")
+
+        return results, detected_language
+
     def cut_audio_segments(self, audio_path, diarization_segments):
         """Cut audio into segments based on diarization results"""
         print("Cutting audio into segments...")
@@ -309,6 +361,47 @@ class WhisperTranscriber:
 
         return grouped_segments
 
+    @spaces.GPU  # each call gets a GPU slice
+    def process_audio_full(self, audio_file, language=None, translate=False, prompt=None, group_segments=True):
+        """Process audio with full transcription (no speaker diarization)"""
+        if audio_file is None:
+            return {"error": "No audio file provided"}
+
+        converted_audio_path = None
+        try:
+            print("Starting full transcription pipeline...")
+
+            # Step 1: Convert audio format
+            print("Converting audio format...")
+            converted_audio_path = self.convert_audio_format(audio_file)
+
+            # Step 2: Transcribe the entire audio
+            transcription_results, detected_language = self.transcribe_full_audio(
+                converted_audio_path, language, translate, prompt
+            )
+
+            # Step 3: Group segments if requested (based on time gaps and sentence endings)
+            if group_segments:
+                transcription_results = self.group_segments_by_speaker(transcription_results)
+
+            # Step 4: Return results
+            return {
+                "segments": transcription_results,
+                "language": detected_language,
+                "num_speakers": 1,  # Single speaker assumption
+                "transcription_method": "full_audio"
+            }
+
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {"error": f"Processing failed: {str(e)}"}
+        finally:
+            # Clean up converted audio file
+            if converted_audio_path and os.path.exists(converted_audio_path):
+                os.unlink(converted_audio_path)
+                print("Cleaned up converted audio file")
+
     @spaces.GPU  # each call gets a GPU slice
     def process_audio(self, audio_file, num_speakers=None, language=None,
                       translate=False, prompt=None, group_segments=True):
@@ -345,7 +438,8 @@ class WhisperTranscriber:
             return {
                 "segments": transcription_results,
                 "language": detected_language,
-                "num_speakers": detected_num_speakers
+                "num_speakers": detected_num_speakers,
+                "transcription_method": "diarized_segments"
             }
 
         except Exception as e:
@@ -369,11 +463,13 @@ def format_segments_for_display(result):
     segments = result.get("segments", [])
     language = result.get("language", "unknown")
    num_speakers = result.get("num_speakers", 1)
+    method = result.get("transcription_method", "unknown")
 
     output = f"🎯 **Detection Results:**\n"
     output += f"- Language: {language}\n"
     output += f"- Speakers: {num_speakers}\n"
-    output += f"- Segments: {len(segments)}\n\n"
+    output += f"- Segments: {len(segments)}\n"
+    output += f"- Method: {method}\n\n"
 
     output += "📝 **Transcription:**\n\n"
 
@@ -389,16 +485,25 @@ def format_segments_for_display(result):
     return output
 
 @spaces.GPU
-def process_audio_gradio(audio_file, num_speakers, language, translate, prompt, group_segments):
+def process_audio_gradio(audio_file, num_speakers, language, translate, prompt, group_segments, use_diarization):
     """Gradio interface function"""
-    result = transcriber.process_audio(
-        audio_file=audio_file,
-        num_speakers=num_speakers if num_speakers > 0 else None,
-        language=language if language != "auto" else None,
-        translate=translate,
-        prompt=prompt if prompt and prompt.strip() else None,
-        group_segments=group_segments
-    )
+    if use_diarization:
+        result = transcriber.process_audio(
+            audio_file=audio_file,
+            num_speakers=num_speakers if num_speakers > 0 else None,
+            language=language if language != "auto" else None,
+            translate=translate,
+            prompt=prompt if prompt and prompt.strip() else None,
+            group_segments=group_segments
+        )
+    else:
+        result = transcriber.process_audio_full(
+            audio_file=audio_file,
+            language=language if language != "auto" else None,
+            translate=translate,
+            prompt=prompt if prompt and prompt.strip() else None,
+            group_segments=group_segments
+        )
 
     formatted_output = format_segments_for_display(result)
     return formatted_output, result
@@ -424,16 +529,22 @@ with demo:
             audio_input = gr.Audio(
                 label="🎵 Upload Audio File",
                 type="filepath",
-                #source="upload"
             )
 
             with gr.Accordion("⚙️ Advanced Settings", open=False):
+                use_diarization = gr.Checkbox(
+                    label="Enable Speaker Diarization",
+                    value=True,
+                    info="Uncheck for faster transcription without speaker identification"
+                )
+
                 num_speakers = gr.Slider(
                     minimum=0,
                     maximum=20,
                     value=0,
                     step=1,
-                    label="Number of Speakers (0 = auto-detect)"
+                    label="Number of Speakers (0 = auto-detect)",
+                    visible=True
                 )
 
                 language = gr.Dropdown(
@@ -454,7 +565,7 @@
                 )
 
                 group_segments = gr.Checkbox(
-                    label="Group segments by speaker",
+                    label="Group segments by speaker/time",
                     value=True
                 )
 
@@ -471,6 +582,13 @@
                 visible=False
             )
 
+    # Update visibility of num_speakers based on diarization toggle
+    use_diarization.change(
+        fn=lambda x: gr.update(visible=x),
+        inputs=[use_diarization],
+        outputs=[num_speakers]
+    )
+
     # Event handlers
     process_btn.click(
         fn=process_audio_gradio,
@@ -480,7 +598,8 @@
            language,
            translate,
            prompt,
-           group_segments
+           group_segments,
+           use_diarization
        ],
        outputs=[output_text, output_json]
    )
@@ -490,7 +609,7 @@
    gr.Markdown("""
    - **Supported formats**: MP3, WAV, M4A, FLAC, OGG, and more
    - **Max duration**: Recommended under 10 minutes for optimal performance
-    - **Speaker detection**: Works best with clear, distinct voices
+    - **Speaker diarization**: Enable for speaker identification (slower), disable for faster transcription
    - **Languages**: Supports 100+ languages with auto-detection
    - **Vocabulary**: Add names and technical terms in the prompt for better accuracy
    """)