liuyang committed
Commit 6475331 · 1 parent: c8b690c

try use diarization as clip_timestamp

app.py CHANGED
@@ -362,7 +362,7 @@ class WhisperTranscriber:
         return meta
 
     @spaces.GPU  # each call gets a GPU slice
-    def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0):
+    def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0, clip_timestamps=None):
         """Transcribe the entire audio file without speaker diarization using batched inference"""
         whisper, batched_whisper, _ = _load_models()  # models live on the GPU
 
@@ -373,21 +373,24 @@ class WhisperTranscriber:
         options = dict(
             language=language,
             beam_size=5,
-            vad_filter=True,  # VAD is enabled by default for batched transcription
-            vad_parameters=VadOptions(
-                max_speech_duration_s=whisper.feature_extractor.chunk_length,
-                min_speech_duration_ms=150,  # ignore ultra-short blips
-                min_silence_duration_ms=150,  # split on short Mandarin pauses (if supported) speech_pad_ms=100,
-                threshold=0.25,
-                neg_threshold=0.2,
-
-            ),
             word_timestamps=True,
             initial_prompt=prompt,
             condition_on_previous_text=False,  # avoid runaway context
             language_detection_segments=1,
             task="translate" if translate else "transcribe",
         )
+        if clip_timestamps:
+            options["vad_filter"] = False
+            options["clip_timestamps"] = clip_timestamps
+        else:
+            options["vad_filter"] = True  # VAD is enabled by default for batched transcription
+            options["vad_parameters"] = VadOptions(
+                max_speech_duration_s=whisper.feature_extractor.chunk_length,
+                min_speech_duration_ms=150,  # ignore ultra-short blips
+                min_silence_duration_ms=150,  # split on short Mandarin pauses (if supported) speech_pad_ms=100,
+                threshold=0.25,
+                neg_threshold=0.2,
+            )
         if batch_size > 1:
             # Use batched inference for better performance
             segments, transcript_info = batched_whisper.transcribe(
 
@@ -907,17 +910,18 @@ class WhisperTranscriber:
             return {"segments": [], "language": "unknown", "num_speakers": 0, "transcription_method": "diarized_segments_batched", "batch_size": batch_size}
         wav_path = pre_meta["out_wav_path"]
         base_offset_s = float(pre_meta.get("abs_start_ms", 0)) / 1000.0
-
-        # Step 2: Transcribe full audio once
-        transcription_results, detected_language = self.transcribe_full_audio(
-            wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s
-        )
 
         # Step 3: Perform diarization with global offset
         diarization_segments, detected_num_speakers, speaker_embeddings = self.perform_diarization(
             wav_path, num_speakers, base_offset_s=base_offset_s
         )
 
+        # Step 2: Transcribe full audio once
+        transcription_results, detected_language = self.transcribe_full_audio(
+            wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s, clip_timestamps=diarization_segments
+        )
+
+
         # Step 4: Merge diarization into transcription (assign speakers)
         transcription_results, unmatched_diarization_segments = self.assign_speakers_to_transcription(
             transcription_results, diarization_segments