liuyang committed
Commit 6475331 · 1 Parent(s): c8b690c

try use diarization as clip_timestamp
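A rough sketch of the idea behind this commit: instead of letting the VAD pick speech regions, the speaker-diarization output is fed to the transcriber as clip timestamps so Whisper only decodes the regions the diarizer marked as speech. The helper below is illustrative only and is not part of app.py; it assumes the diarizer yields dicts with "start"/"end" in seconds, and it emits the comma-separated "start,end,start,end,..." string format that faster_whisper.WhisperModel.transcribe() documents for clip_timestamps (the batched pipeline used in this app may expect a different shape, e.g. the raw segment list that the commit passes through).

    # Hypothetical helper, not taken from app.py: flatten speaker turns into
    # clip timestamps. Segment keys and the output format are assumptions.
    def diarization_to_clip_timestamps(diarization_segments, max_gap_s=0.3):
        """Merge near-adjacent speaker turns and flatten them to start/end pairs."""
        spans = sorted(
            (float(seg["start"]), float(seg["end"])) for seg in diarization_segments
        )
        merged = []
        for start, end in spans:
            if merged and start - merged[-1][1] <= max_gap_s:
                merged[-1][1] = max(merged[-1][1], end)  # extend the previous span
            else:
                merged.append([start, end])
        # "s1,e1,s2,e2,..." so Whisper only decodes the diarized speech regions
        return ",".join(f"{t:.2f}" for span in merged for t in span)

    if __name__ == "__main__":
        segs = [
            {"speaker": "SPEAKER_00", "start": 0.4, "end": 3.1},
            {"speaker": "SPEAKER_01", "start": 3.2, "end": 7.8},
            {"speaker": "SPEAKER_00", "start": 9.5, "end": 12.0},
        ]
        print(diarization_to_clip_timestamps(segs))  # -> "0.40,7.80,9.50,12.00"

Merging turns separated by small gaps keeps the clip list short and avoids clipping words at tight speaker changes; the trade-off is that anything the diarizer misses is never transcribed, which is why the diff also turns vad_filter off only when clip timestamps are supplied.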

Files changed (1)
  1. app.py +19 -15
app.py CHANGED
@@ -362,7 +362,7 @@ class WhisperTranscriber:
         return meta
 
     @spaces.GPU  # each call gets a GPU slice
-    def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0):
+    def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0, clip_timestamps=None):
         """Transcribe the entire audio file without speaker diarization using batched inference"""
         whisper, batched_whisper, _ = _load_models()  # models live on the GPU
 
@@ -373,21 +373,24 @@ class WhisperTranscriber:
         options = dict(
             language=language,
             beam_size=5,
-            vad_filter=True,  # VAD is enabled by default for batched transcription
-            vad_parameters=VadOptions(
-                max_speech_duration_s=whisper.feature_extractor.chunk_length,
-                min_speech_duration_ms=150,  # ignore ultra-short blips
-                min_silence_duration_ms=150,  # split on short Mandarin pauses (if supported) speech_pad_ms=100,
-                threshold=0.25,
-                neg_threshold=0.2,
-
-            ),
             word_timestamps=True,
             initial_prompt=prompt,
             condition_on_previous_text=False,  # avoid runaway context
             language_detection_segments=1,
             task="translate" if translate else "transcribe",
         )
+        if clip_timestamps:
+            options["vad_filter"] = False
+            options["clip_timestamps"] = clip_timestamps
+        else:
+            options["vad_filter"] = True  # VAD is enabled by default for batched transcription
+            options["vad_parameters"] = VadOptions(
+                max_speech_duration_s=whisper.feature_extractor.chunk_length,
+                min_speech_duration_ms=150,  # ignore ultra-short blips
+                min_silence_duration_ms=150,  # split on short Mandarin pauses (if supported) speech_pad_ms=100,
+                threshold=0.25,
+                neg_threshold=0.2,
+            )
         if batch_size > 1:
             # Use batched inference for better performance
             segments, transcript_info = batched_whisper.transcribe(
@@ -907,17 +910,18 @@ class WhisperTranscriber:
             return {"segments": [], "language": "unknown", "num_speakers": 0, "transcription_method": "diarized_segments_batched", "batch_size": batch_size}
         wav_path = pre_meta["out_wav_path"]
         base_offset_s = float(pre_meta.get("abs_start_ms", 0)) / 1000.0
-
-        # Step 2: Transcribe full audio once
-        transcription_results, detected_language = self.transcribe_full_audio(
-            wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s
-        )
 
         # Step 3: Perform diarization with global offset
         diarization_segments, detected_num_speakers, speaker_embeddings = self.perform_diarization(
             wav_path, num_speakers, base_offset_s=base_offset_s
         )
 
+        # Step 2: Transcribe full audio once
+        transcription_results, detected_language = self.transcribe_full_audio(
+            wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s, clip_timestamps=diarization_segments
+        )
+
+
         # Step 4: Merge diarization into transcription (assign speakers)
         transcription_results, unmatched_diarization_segments = self.assign_speakers_to_transcription(
             transcription_results, diarization_segments