liuyang committed
Commit 6475331 · 1 Parent(s): c8b690c

try use diarization as clip_timestamp
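A rough sketch of the idea behind this commit: instead of letting the VAD pick speech regions, the speaker-diarization output is fed to the transcriber as clip timestamps so Whisper only decodes the regions the diarizer marked as speech. The helper below is illustrative only and is not part of app.py; it assumes the diarizer yields dicts with "start"/"end" in seconds, and it emits the comma-separated "start,end,start,end,..." string format that faster_whisper.WhisperModel.transcribe() documents for clip_timestamps (the batched pipeline used in this app may expect a different shape, e.g. the raw segment list that the commit passes through).

    # Hypothetical helper, not taken from app.py: flatten speaker turns into
    # clip timestamps. Segment keys and the output format are assumptions.
    def diarization_to_clip_timestamps(diarization_segments, max_gap_s=0.3):
        """Merge near-adjacent speaker turns and flatten them to start/end pairs."""
        spans = sorted(
            (float(seg["start"]), float(seg["end"])) for seg in diarization_segments
        )
        merged = []
        for start, end in spans:
            if merged and start - merged[-1][1] <= max_gap_s:
                merged[-1][1] = max(merged[-1][1], end)  # extend the previous span
            else:
                merged.append([start, end])
        # "s1,e1,s2,e2,..." so Whisper only decodes the diarized speech regions
        return ",".join(f"{t:.2f}" for span in merged for t in span)

    if __name__ == "__main__":
        segs = [
            {"speaker": "SPEAKER_00", "start": 0.4, "end": 3.1},
            {"speaker": "SPEAKER_01", "start": 3.2, "end": 7.8},
            {"speaker": "SPEAKER_00", "start": 9.5, "end": 12.0},
        ]
        print(diarization_to_clip_timestamps(segs))  # -> "0.40,7.80,9.50,12.00"

Merging turns separated by small gaps keeps the clip list short and avoids clipping words at tight speaker changes; the trade-off is that anything the diarizer misses is never transcribed, which is why the diff also turns vad_filter off only when clip timestamps are supplied.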

Files changed (1)
  1. app.py +19 -15
app.py CHANGED
@@ -362,7 +362,7 @@ class WhisperTranscriber:
         return meta
 
     @spaces.GPU  # each call gets a GPU slice
-    def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0):
+    def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0, clip_timestamps=None):
         """Transcribe the entire audio file without speaker diarization using batched inference"""
         whisper, batched_whisper, _ = _load_models()  # models live on the GPU
 
@@ -373,21 +373,24 @@ class WhisperTranscriber:
         options = dict(
             language=language,
             beam_size=5,
-            vad_filter=True,  # VAD is enabled by default for batched transcription
-            vad_parameters=VadOptions(
-                max_speech_duration_s=whisper.feature_extractor.chunk_length,
-                min_speech_duration_ms=150,  # ignore ultra-short blips
-                min_silence_duration_ms=150,  # split on short Mandarin pauses (if supported) speech_pad_ms=100,
-                threshold=0.25,
-                neg_threshold=0.2,
-
-            ),
             word_timestamps=True,
             initial_prompt=prompt,
             condition_on_previous_text=False,  # avoid runaway context
             language_detection_segments=1,
             task="translate" if translate else "transcribe",
         )
+        if clip_timestamps:
+            options["vad_filter"] = False
+            options["clip_timestamps"] = clip_timestamps
+        else:
+            options["vad_filter"] = True  # VAD is enabled by default for batched transcription
+            options["vad_parameters"] = VadOptions(
+                max_speech_duration_s=whisper.feature_extractor.chunk_length,
+                min_speech_duration_ms=150,  # ignore ultra-short blips
+                min_silence_duration_ms=150,  # split on short Mandarin pauses (if supported) speech_pad_ms=100,
+                threshold=0.25,
+                neg_threshold=0.2,
+            )
         if batch_size > 1:
             # Use batched inference for better performance
             segments, transcript_info = batched_whisper.transcribe(
@@ -907,17 +910,18 @@ class WhisperTranscriber:
             return {"segments": [], "language": "unknown", "num_speakers": 0, "transcription_method": "diarized_segments_batched", "batch_size": batch_size}
         wav_path = pre_meta["out_wav_path"]
         base_offset_s = float(pre_meta.get("abs_start_ms", 0)) / 1000.0
-
-        # Step 2: Transcribe full audio once
-        transcription_results, detected_language = self.transcribe_full_audio(
-            wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s
-        )
 
         # Step 3: Perform diarization with global offset
         diarization_segments, detected_num_speakers, speaker_embeddings = self.perform_diarization(
             wav_path, num_speakers, base_offset_s=base_offset_s
         )
 
+        # Step 2: Transcribe full audio once
+        transcription_results, detected_language = self.transcribe_full_audio(
+            wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s, clip_timestamps=diarization_segments
+        )
+
+
         # Step 4: Merge diarization into transcription (assign speakers)
         transcription_results, unmatched_diarization_segments = self.assign_speakers_to_transcription(
             transcription_results, diarization_segments