liuyang committed on
Commit
70438f0
·
1 Parent(s): 7c60c3b

Implement audio preprocessing and speaker diarization enhancements in WhisperTranscriber. Introduce methods for audio chunk preparation, VAD-based trimming, and speaker embedding extraction. Update process_audio methods to utilize task JSON for improved workflow and metadata handling. Add webrtcvad dependency for voice activity detection.
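
The new prepare_and_save_audio_for_model helper consumes a per-chunk task dict. Below is a minimal sketch of that structure, reconstructed only from the fields the diff actually reads (source_uri, ingest_recipe.channel_extract_filter, ffmpeg_seek.pre_ss_sec/post_ss_sec/t_sec, channel, chunk.idx/dur_ms/global_offset_ms, job_id); every value is illustrative, not a fixed schema.

# Hypothetical per-chunk task JSON, limited to the fields that
# prepare_and_save_audio_for_model() reads in this commit; the URI,
# filter, and numbers are placeholders, not a documented schema.
example_task = {
    "job_id": "demo-job",
    "source_uri": "https://example.com/recording.wav",  # any ffmpeg-readable source
    "channel": "L",                                      # label used in output file names
    "ingest_recipe": {
        "channel_extract_filter": "pan=mono|c0=c0",      # optional ffmpeg -af filter
    },
    "ffmpeg_seek": {
        "pre_ss_sec": 600.0,   # coarse seek placed before -i
        "post_ss_sec": 0.0,    # fine seek placed after -i
        "t_sec": 300.0,        # duration to decode for this chunk
    },
    "chunk": {
        "idx": 3,
        "dur_ms": 300_000,           # planned chunk duration
        "global_offset_ms": 600_000, # chunk start within the full recording
    },
}

# The Gradio textbox receives this as a JSON string, e.g.:
# transcriber.process_audio(task_json=json.dumps(example_task), batch_size=8)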

Files changed (2)
  1. app.py +365 -187
  2. requirements.txt +2 -1
app.py CHANGED
@@ -32,9 +32,11 @@ from faster_whisper import WhisperModel, BatchedInferencePipeline
32
  from faster_whisper.vad import VadOptions
33
  import requests
34
  import base64
35
- from pyannote.audio import Pipeline
36
 
37
  import os, sys, importlib.util, pathlib, ctypes, tempfile, wave, math
 
 
38
  spec = importlib.util.find_spec("nvidia.cudnn")
39
  if spec is None:
40
  sys.exit("❌ nvidia-cudnn-cu12 wheel not found. Run: pip install nvidia-cudnn-cu12")
@@ -53,6 +55,183 @@ from huggingface_hub import snapshot_download
53
  MODEL_REPO = "deepdml/faster-whisper-large-v3-turbo-ct2" # CT2 format
54
  LOCAL_DIR = f"{CACHE_ROOT}/whisper_turbo"
55
 
56
  # Download once; later runs are instant
57
  snapshot_download(
58
  repo_id=MODEL_REPO,
@@ -66,6 +245,7 @@ model_cache_path = LOCAL_DIR # <-- this is what we pass to WhisperModel
66
  _whisper = None
67
  _batched_whisper = None
68
  _diarizer = None
 
69
 
70
  # Create global diarization pipeline
71
  try:
@@ -108,24 +288,20 @@ class WhisperTranscriber:
108
  # do **not** create the models here!
109
  pass
110
 
111
- def convert_audio_format(self, audio_path):
112
- """Convert audio to 16kHz mono WAV format"""
113
- temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
114
- temp_wav_path = temp_wav.name
115
- temp_wav.close()
116
-
117
  try:
118
- subprocess.run([
119
- "ffmpeg", "-i", audio_path,
120
- "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
121
- temp_wav_path, "-y"
122
- ], check=True, capture_output=True)
123
- return temp_wav_path
124
- except subprocess.CalledProcessError as e:
125
- raise RuntimeError(f"Audio conversion failed: {e}")
126
 
127
  @spaces.GPU # each call gets a GPU slice
128
- def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16):
129
  """Transcribe the entire audio file without speaker diarization using batched inference"""
130
  whisper, batched_whisper, _ = _load_models() # models live on the GPU
131
 
@@ -168,16 +344,16 @@ class WhisperTranscriber:
168
  if seg.words:
169
  for word in seg.words:
170
  words_list.append({
171
- "start": float(word.start),
172
- "end": float(word.end),
173
  "word": word.word,
174
  "probability": word.probability,
175
  "speaker": "SPEAKER_00" # No speaker identification in full transcription
176
  })
177
 
178
  results.append({
179
- "start": float(seg.start),
180
- "end": float(seg.end),
181
  "text": seg.text,
182
  "speaker": "SPEAKER_00", # Single speaker assumption
183
  "avg_logprob": seg.avg_logprob,
@@ -190,118 +366,14 @@ class WhisperTranscriber:
190
  print(results)
191
  return results, detected_language
192
 
193
- def cut_audio_segments(self, audio_path, diarization_segments):
194
- """Cut audio into segments based on diarization results"""
195
- print("Cutting audio into segments...")
196
-
197
- # Load the full audio
198
- waveform, sample_rate = torchaudio.load(audio_path)
199
-
200
- audio_segments = []
201
- for segment in diarization_segments:
202
- start_sample = int(segment["start"] * sample_rate)
203
- end_sample = int(segment["end"] * sample_rate)
204
-
205
- # Extract the segment
206
- segment_waveform = waveform[:, start_sample:end_sample]
207
-
208
- # Create temporary file for this segment
209
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
210
- temp_file.close()
211
-
212
- # Save the segment
213
- torchaudio.save(temp_file.name, segment_waveform, sample_rate)
214
-
215
- audio_segments.append({
216
- "audio_path": temp_file.name,
217
- "start": segment["start"],
218
- "end": segment["end"],
219
- "speaker": segment["speaker"]
220
- })
221
-
222
- return audio_segments
223
 
224
  @spaces.GPU # each call gets a GPU slice
225
- def transcribe_audio_segments(self, audio_segments, language=None, translate=False, prompt=None, batch_size=8):
226
- """Transcribe multiple audio segments using faster_whisper with batching"""
227
- whisper, batched_whisper, _ = _load_models() # models live on the GPU
228
-
229
- print(f"Transcribing {len(audio_segments)} audio segments with batch size {batch_size}...")
230
- start_time = time.time()
231
-
232
- # Prepare options
233
- options = dict(
234
- language=language,
235
- beam_size=5,
236
- vad_filter=True,
237
- vad_parameters=VadOptions(
238
- max_speech_duration_s=whisper.feature_extractor.chunk_length,
239
- min_speech_duration_ms=100,
240
- speech_pad_ms=100,
241
- threshold=0.25,
242
- neg_threshold=0.2,
243
- ),
244
- word_timestamps=True,
245
- initial_prompt=prompt,
246
- language_detection_segments=1,
247
- task="translate" if translate else "transcribe",
248
- )
249
-
250
- results = []
251
- detected_language = None
252
-
253
- for i, segment in enumerate(audio_segments):
254
- print(f"Processing segment {i+1}/{len(audio_segments)}")
255
-
256
- # Use batched inference for each segment
257
- segments, transcript_info = batched_whisper.transcribe(
258
- segment["audio_path"],
259
- batch_size=batch_size,
260
- **options
261
- )
262
- segments = list(segments)
263
-
264
- # Get detected language from first segment
265
- if detected_language is None:
266
- detected_language = transcript_info.language
267
-
268
- # Process each transcribed segment
269
- for seg in segments:
270
- # Create result entry with detailed format
271
- words_list = []
272
- if seg.words:
273
- for word in seg.words:
274
- words_list.append({
275
- "start": float(word.start) + segment["start"],
276
- "end": float(word.end) + segment["start"],
277
- "word": word.word,
278
- "probability": word.probability,
279
- "speaker": segment["speaker"]
280
- })
281
-
282
- results.append({
283
- "start": float(seg.start) + segment["start"],
284
- "end": float(seg.end) + segment["start"],
285
- "text": seg.text,
286
- "speaker": segment["speaker"],
287
- "avg_logprob": seg.avg_logprob,
288
- "words": words_list,
289
- "duration": float(seg.end - seg.start)
290
- })
291
-
292
- # Clean up temporary files
293
- for segment in audio_segments:
294
- if os.path.exists(segment["audio_path"]):
295
- os.unlink(segment["audio_path"])
296
-
297
- transcription_time = time.time() - start_time
298
- print(f"All segments transcribed in {transcription_time:.2f} seconds using batch size {batch_size}")
299
-
300
- return results, detected_language
301
 
302
  @spaces.GPU # each call gets a GPU slice
303
- def perform_diarization(self, audio_path, num_speakers=None):
304
- """Perform speaker diarization"""
305
  _, _, diarizer = _load_models() # models live on the GPU
306
 
307
  if diarizer is None:
@@ -309,11 +381,20 @@ class WhisperTranscriber:
309
  # Load audio to get duration
310
  waveform, sample_rate = torchaudio.load(audio_path)
311
  duration = waveform.shape[1] / sample_rate
312
  return [{
313
- "start": 0.0,
314
- "end": duration,
315
  "speaker": "SPEAKER_00"
316
- }], 1
317
 
318
  print("Starting diarization...")
319
  start_time = time.time()
@@ -330,21 +411,100 @@ class WhisperTranscriber:
330
  # Convert to list format
331
  diarize_segments = []
332
  diarization_list = list(diarization.itertracks(yield_label=True))
333
-
334
  for turn, _, speaker in diarization_list:
335
  diarize_segments.append({
336
- "start": turn.start,
337
- "end": turn.end,
338
  "speaker": speaker
339
  })
340
 
341
  unique_speakers = {speaker for segment in diarize_segments for speaker in [segment["speaker"]]}
342
  detected_num_speakers = len(unique_speakers)
343
 
344
  diarization_time = time.time() - start_time
345
  print(f"Diarization completed in {diarization_time:.2f} seconds")
346
 
347
- return diarize_segments, detected_num_speakers
348
 
349
  def group_segments_by_speaker(self, segments, max_gap=1.0, max_duration=30.0):
350
  """Group consecutive segments from the same speaker"""
@@ -388,22 +548,27 @@ class WhisperTranscriber:
388
  return grouped_segments
389
 
390
  @spaces.GPU # each call gets a GPU slice
391
- def process_audio_full(self, audio_file, language=None, translate=False, prompt=None, group_segments=True, batch_size=16):
392
- """Process audio with full transcription (no speaker diarization)"""
393
- if audio_file is None:
394
- return {"error": "No audio file provided"}
395
-
396
- converted_audio_path = None
397
  try:
398
  print("Starting full transcription pipeline...")
399
 
400
- # Step 1: Convert audio format
401
- print("Converting audio format...")
402
- converted_audio_path = self.convert_audio_format(audio_file)
403
 
404
  # Step 2: Transcribe the entire audio with batching
405
  transcription_results, detected_language = self.transcribe_full_audio(
406
- converted_audio_path, language, translate, prompt, batch_size
407
  )
408
 
409
  # Step 3: Group segments if requested (based on time gaps and sentence endings)
@@ -424,38 +589,47 @@ class WhisperTranscriber:
424
  traceback.print_exc()
425
  return {"error": f"Processing failed: {str(e)}"}
426
  finally:
427
- # Clean up converted audio file
428
- if converted_audio_path and os.path.exists(converted_audio_path):
429
- os.unlink(converted_audio_path)
430
- print("Cleaned up converted audio file")
 
 
431
 
432
  @spaces.GPU # each call gets a GPU slice
433
- def process_audio(self, audio_file, num_speakers=None, language=None,
434
  translate=False, prompt=None, group_segments=True, batch_size=8):
435
- """Main processing function - diarization first, then transcription"""
436
- if audio_file is None:
437
- return {"error": "No audio file provided"}
438
-
439
- converted_audio_path = None
 
 
 
440
  try:
441
  print("Starting new processing pipeline...")
442
 
443
- # Step 1: Convert audio format first
444
- print("Converting audio format...")
445
- converted_audio_path = self.convert_audio_format(audio_file)
446
 
447
- # Step 2: Perform diarization on converted audio
448
- diarization_segments, detected_num_speakers = self.perform_diarization(
449
- converted_audio_path, num_speakers
450
  )
451
-
452
- # Step 3: Cut audio into segments based on diarization
453
- audio_segments = self.cut_audio_segments(converted_audio_path, diarization_segments)
454
-
455
- # Step 4: Transcribe each segment with batching
456
- transcription_results, detected_language = self.transcribe_audio_segments(
457
- audio_segments, language, translate, prompt, batch_size
458
  )
 
 
 
459
 
460
  # Step 5: Group segments if requested
461
  if group_segments:
@@ -467,7 +641,8 @@ class WhisperTranscriber:
467
  "language": detected_language,
468
  "num_speakers": detected_num_speakers,
469
  "transcription_method": "diarized_segments_batched",
470
- "batch_size": batch_size
 
471
  }
472
 
473
  except Exception as e:
@@ -475,10 +650,12 @@ class WhisperTranscriber:
475
  traceback.print_exc()
476
  return {"error": f"Processing failed: {str(e)}"}
477
  finally:
478
- # Clean up converted audio file
479
- if converted_audio_path and os.path.exists(converted_audio_path):
480
- os.unlink(converted_audio_path)
481
- print("Cleaned up converted audio file")
 
 
482
 
483
  # Initialize transcriber
484
  transcriber = WhisperTranscriber()
@@ -515,11 +692,11 @@ def format_segments_for_display(result):
515
  return output
516
 
517
  @spaces.GPU
518
- def process_audio_gradio(audio_file, num_speakers, language, translate, prompt, group_segments, use_diarization, batch_size):
519
  """Gradio interface function"""
520
  if use_diarization:
521
  result = transcriber.process_audio(
522
- audio_file=audio_file,
523
  num_speakers=num_speakers if num_speakers > 0 else None,
524
  language=language if language != "auto" else None,
525
  translate=translate,
@@ -529,7 +706,7 @@ def process_audio_gradio(audio_file, num_speakers, language, translate, prompt,
529
  )
530
  else:
531
  result = transcriber.process_audio_full(
532
- audio_file=audio_file,
533
  language=language if language != "auto" else None,
534
  translate=translate,
535
  prompt=prompt if prompt and prompt.strip() else None,
@@ -558,9 +735,10 @@ with demo:
558
 
559
  with gr.Row():
560
  with gr.Column():
561
- audio_input = gr.Audio(
562
- label="๐ŸŽต Upload Audio File",
563
- type="filepath",
 
564
  )
565
 
566
  with gr.Accordion("⚙️ Advanced Settings", open=False):
@@ -572,7 +750,7 @@ with demo:
572
 
573
  batch_size = gr.Slider(
574
  minimum=1,
575
- maximum=32,
576
  value=16,
577
  step=1,
578
  label="Batch Size",
@@ -615,7 +793,7 @@ with demo:
615
  with gr.Column():
616
  output_text = gr.Markdown(
617
  label="๐Ÿ“ Transcription Results",
618
- value="Upload an audio file and click 'Transcribe Audio' to get started!"
619
  )
620
 
621
  output_json = gr.JSON(
@@ -634,7 +812,7 @@ with demo:
634
  process_btn.click(
635
  fn=process_audio_gradio,
636
  inputs=[
637
- audio_input,
638
  num_speakers,
639
  language,
640
  translate,
@@ -649,11 +827,11 @@ with demo:
649
  # Examples
650
  gr.Markdown("### 📋 Usage Tips:")
651
  gr.Markdown("""
652
- - **Supported formats**: MP3, WAV, M4A, FLAC, OGG, and more
653
- - **Batch Size**: Higher values (16-24) = faster processing but more GPU memory
654
- - **Speaker diarization**: Enable for speaker identification (slower), disable for faster transcription
655
- - **Languages**: Supports 100+ languages with auto-detection
656
- - **Vocabulary**: Add names and technical terms in the prompt for better accuracy
657
  """)
658
 
659
  if __name__ == "__main__":
 
32
  from faster_whisper.vad import VadOptions
33
  import requests
34
  import base64
35
+ from pyannote.audio import Pipeline, Inference
36
 
37
  import os, sys, importlib.util, pathlib, ctypes, tempfile, wave, math
38
+ import json
39
+ import webrtcvad
40
  spec = importlib.util.find_spec("nvidia.cudnn")
41
  if spec is None:
42
  sys.exit("โŒ nvidia-cudnn-cu12 wheel not found. Run: pip install nvidia-cudnn-cu12")
 
55
  MODEL_REPO = "deepdml/faster-whisper-large-v3-turbo-ct2" # CT2 format
56
  LOCAL_DIR = f"{CACHE_ROOT}/whisper_turbo"
57
 
58
+ # -----------------------------------------------------------------------------
59
+ # Audio preprocess helper (from input_and_preprocess rule)
60
+ # -----------------------------------------------------------------------------
61
+
62
+ TRIM_THRESHOLD_MS = 10_000 # 10 seconds
63
+ DEFAULT_PAD_MS = 250 # safety context around detected speech
64
+ FRAME_MS = 30 # VAD frame
65
+ HANG_MS = 240 # hangover (keep speech "on" after silence)
66
+ VAD_LEVEL = 2 # 0-3
67
+
68
+ def _decode_chunk_to_pcm(task: dict) -> bytes:
69
+ """Use ffmpeg to decode the chunk to s16le mono @ 16k PCM bytes."""
70
+ src = task["source_uri"]
71
+ ing = task["ingest_recipe"]
72
+ seek = task["ffmpeg_seek"]
73
+
74
+ cmd = [
75
+ "ffmpeg", "-nostdin", "-hide_banner", "-v", "error",
76
+ "-ss", f"{max(0.0, float(seek['pre_ss_sec'])):.3f}",
77
+ "-i", src,
78
+ "-map", "0:a:0",
79
+ "-ss", f"{float(seek['post_ss_sec']):.2f}",
80
+ "-t", f"{float(seek['t_sec']):.3f}",
81
+ ]
82
+
83
+ # Optional L/R extraction
84
+ if ing.get("channel_extract_filter"):
85
+ cmd += ["-af", ing["channel_extract_filter"]]
86
+
87
+ # Force mono 16k s16le to stdout
88
+ cmd += ["-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", "-f", "s16le", "pipe:1"]
89
+
90
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
91
+ pcm, err = p.communicate()
92
+ if p.returncode != 0:
93
+ raise RuntimeError(f"ffmpeg failed: {err.decode('utf-8', 'ignore')}")
94
+ return pcm
95
+
96
+ def _find_head_tail_speech_ms(
97
+ pcm: bytes,
98
+ sr: int = 16000,
99
+ frame_ms: int = FRAME_MS,
100
+ vad_level: int = VAD_LEVEL,
101
+ hang_ms: int = HANG_MS,
102
+ ):
103
+ """Return (first_ms, last_ms) speech boundaries using webrtcvad with hangover."""
104
+ if not pcm:
105
+ return None, None
106
+ vad = webrtcvad.Vad(int(vad_level))
107
+ bpf = 2 # bytes per sample (s16)
108
+ samples_per_ms = sr // 1000 # 16
109
+ bytes_per_frame = samples_per_ms * bpf * frame_ms
110
+
111
+ n_frames = len(pcm) // bytes_per_frame
112
+ if n_frames == 0:
113
+ return None, None
114
+
115
+ first_ms, last_ms = None, None
116
+ t_ms = 0
117
+ in_speech = False
118
+ silence_run = 0
119
+
120
+ view = memoryview(pcm)[: n_frames * bytes_per_frame]
121
+ for i in range(n_frames):
122
+ frame = view[i * bytes_per_frame : (i + 1) * bytes_per_frame]
123
+ if vad.is_speech(frame, sr):
124
+ if first_ms is None:
125
+ first_ms = t_ms
126
+ in_speech = True
127
+ silence_run = 0
128
+ else:
129
+ if in_speech:
130
+ silence_run += frame_ms
131
+ if silence_run >= hang_ms:
132
+ last_ms = t_ms - (silence_run - hang_ms)
133
+ in_speech = False
134
+ silence_run = 0
135
+ t_ms += frame_ms
136
+ if in_speech:
137
+ last_ms = t_ms
138
+ return first_ms, last_ms
139
+
140
+ def _write_wav(path: str, pcm: bytes, sr: int = 16000):
141
+ os.makedirs(os.path.dirname(path), exist_ok=True)
142
+ with wave.open(path, "wb") as w:
143
+ w.setnchannels(1)
144
+ w.setsampwidth(2) # s16
145
+ w.setframerate(sr)
146
+ w.writeframes(pcm)
147
+
148
+ def prepare_and_save_audio_for_model(task: dict, out_dir: str) -> dict:
149
+ """
150
+ 1) Decode chunk to mono 16k PCM.
151
+ 2) Run VAD to locate head/tail silence.
152
+ 3) Trim only if head or tail silence >= 10s.
153
+ 4) Save the (possibly trimmed) WAV to local file.
154
+ 5) Return timing metadata, including 'trimmed_start_ms' to preserve global timestamps.
155
+ """
156
+ # 0) Names & constants
157
+ sr = 16000
158
+ bpf = 2
159
+ samples_per_ms = sr // 1000
160
+
161
+ def bytes_from_ms(ms: int) -> int:
162
+ return int(ms * samples_per_ms) * bpf
163
+
164
+ ch = task["channel"]
165
+ ck = task["chunk"]
166
+ job = task.get("job_id", "job")
167
+ idx = str(ck["idx"])
168
+
169
+ # 1) Decode chunk
170
+ pcm = _decode_chunk_to_pcm(task)
171
+ planned_dur_ms = int(ck["dur_ms"])
172
+
173
+ # 2) VAD head/tail detection
174
+ first_ms, last_ms = _find_head_tail_speech_ms(pcm, sr=sr)
175
+ head_sil_ms = int(first_ms) if first_ms is not None else planned_dur_ms
176
+ tail_sil_ms = int(planned_dur_ms - last_ms) if last_ms is not None else planned_dur_ms
177
+
178
+ # 3) Decide trimming (only if head or tail >= 10s)
179
+ trim_applied = False
180
+ eff_start_ms = 0
181
+ eff_end_ms = planned_dur_ms
182
+ trimmed_pcm = pcm
183
+
184
+ if (head_sil_ms >= TRIM_THRESHOLD_MS) or (tail_sil_ms >= TRIM_THRESHOLD_MS):
185
+ # If no speech found at all, mark skip
186
+ if first_ms is None or last_ms is None or last_ms <= first_ms:
187
+ out_wav_path = os.path.join(out_dir, f"{job}_{ch}_{idx}_nospeech.wav")
188
+ _write_wav(out_wav_path, b"", sr)
189
+ return {
190
+ "out_wav_path": out_wav_path,
191
+ "sr": sr,
192
+ "trim_applied": False,
193
+ "trimmed_start_ms": 0,
194
+ "head_silence_ms": head_sil_ms,
195
+ "tail_silence_ms": tail_sil_ms,
196
+ "effective_start_ms": 0,
197
+ "effective_dur_ms": 0,
198
+ "abs_start_ms": ck["global_offset_ms"],
199
+ "chunk_idx": idx,
200
+ "channel": ch,
201
+ "skip": True,
202
+ }
203
+
204
+ # Apply padding & slice
205
+ start_ms = max(0, int(first_ms) - DEFAULT_PAD_MS)
206
+ end_ms = min(planned_dur_ms, int(last_ms) + DEFAULT_PAD_MS)
207
+
208
+ if end_ms > start_ms:
209
+ eff_start_ms = start_ms
210
+ eff_end_ms = end_ms
211
+ trimmed_pcm = pcm[bytes_from_ms(start_ms) : bytes_from_ms(end_ms)]
212
+ trim_applied = True
213
+
214
+ # 4) Write WAV to local file (trimmed or original)
215
+ tag = "trim" if trim_applied else "full"
216
+ out_wav_path = os.path.join(out_dir, f"{job}_{ch}_{idx}_{tag}.wav")
217
+ _write_wav(out_wav_path, trimmed_pcm, sr)
218
+
219
+ # 5) Return metadata
220
+ return {
221
+ "out_wav_path": out_wav_path,
222
+ "sr": sr,
223
+ "trim_applied": trim_applied,
224
+ "trimmed_start_ms": eff_start_ms if trim_applied else 0,
225
+ "head_silence_ms": head_sil_ms,
226
+ "tail_silence_ms": tail_sil_ms,
227
+ "effective_start_ms": eff_start_ms,
228
+ "effective_dur_ms": eff_end_ms - eff_start_ms,
229
+ "abs_start_ms": int(ck["global_offset_ms"]) + eff_start_ms,
230
+ "chunk_idx": idx,
231
+ "channel": ch,
232
+ "skip": False if (trim_applied or len(pcm) > 0) else True,
233
+ }
234
+
235
  # Download once; later runs are instant
236
  snapshot_download(
237
  repo_id=MODEL_REPO,
 
245
  _whisper = None
246
  _batched_whisper = None
247
  _diarizer = None
248
+ _embedder = None
249
 
250
  # Create global diarization pipeline
251
  try:
 
288
  # do **not** create the models here!
289
  pass
290
 
291
+ def preprocess_from_task_json(self, task_json: str) -> dict:
292
+ """Parse task JSON and run prepare_and_save_audio_for_model, returning metadata."""
293
  try:
294
+ task = json.loads(task_json)
295
+ except Exception as e:
296
+ raise RuntimeError(f"Invalid JSON: {e}")
297
+
298
+ out_dir = os.path.join(CACHE_ROOT, "preprocessed")
299
+ os.makedirs(out_dir, exist_ok=True)
300
+ meta = prepare_and_save_audio_for_model(task, out_dir)
301
+ return meta
302
 
303
  @spaces.GPU # each call gets a GPU slice
304
+ def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0):
305
  """Transcribe the entire audio file without speaker diarization using batched inference"""
306
  whisper, batched_whisper, _ = _load_models() # models live on the GPU
307
 
 
344
  if seg.words:
345
  for word in seg.words:
346
  words_list.append({
347
+ "start": float(word.start) + float(base_offset_s),
348
+ "end": float(word.end) + float(base_offset_s),
349
  "word": word.word,
350
  "probability": word.probability,
351
  "speaker": "SPEAKER_00" # No speaker identification in full transcription
352
  })
353
 
354
  results.append({
355
+ "start": float(seg.start) + float(base_offset_s),
356
+ "end": float(seg.end) + float(base_offset_s),
357
  "text": seg.text,
358
  "speaker": "SPEAKER_00", # Single speaker assumption
359
  "avg_logprob": seg.avg_logprob,
 
366
  print(results)
367
  return results, detected_language
368
 
369
+ # Removed audio cutting; transcription is done once on the full (preprocessed) audio
370
 
371
  @spaces.GPU # each call gets a GPU slice
372
+ # Removed segment-wise transcription; using single full-audio transcription
373
 
374
  @spaces.GPU # each call gets a GPU slice
375
+ def perform_diarization(self, audio_path, num_speakers=None, base_offset_s: float = 0.0):
376
+ """Perform speaker diarization; return segments with global timestamps and per-speaker embeddings."""
377
  _, _, diarizer = _load_models() # models live on the GPU
378
 
379
  if diarizer is None:
 
381
  # Load audio to get duration
382
  waveform, sample_rate = torchaudio.load(audio_path)
383
  duration = waveform.shape[1] / sample_rate
384
+ # Try to compute a single-speaker embedding
385
+ speaker_embeddings = {}
386
+ try:
387
+ embedder = self._load_embedder()
388
+ # waveform is (1, T); embedder expects mono 1D
389
+ emb = embedder({"waveform": waveform.squeeze(0), "sample_rate": sample_rate})
390
+ speaker_embeddings["SPEAKER_00"] = emb.squeeze().tolist()
391
+ except Exception:
392
+ pass
393
  return [{
394
+ "start": 0.0 + float(base_offset_s),
395
+ "end": duration + float(base_offset_s),
396
  "speaker": "SPEAKER_00"
397
+ }], 1, speaker_embeddings
398
 
399
  print("Starting diarization...")
400
  start_time = time.time()
 
411
  # Convert to list format
412
  diarize_segments = []
413
  diarization_list = list(diarization.itertracks(yield_label=True))
414
+ print(diarization_list)
415
  for turn, _, speaker in diarization_list:
416
  diarize_segments.append({
417
+ "start": float(turn.start) + float(base_offset_s),
418
+ "end": float(turn.end) + float(base_offset_s),
419
  "speaker": speaker
420
  })
421
 
422
  unique_speakers = {speaker for segment in diarize_segments for speaker in [segment["speaker"]]}
423
  detected_num_speakers = len(unique_speakers)
424
 
425
+ # Compute per-speaker embeddings by averaging segment embeddings
426
+ speaker_embeddings = {}
427
+ try:
428
+ embedder = self._load_embedder()
429
+ spk_to_embs = {spk: [] for spk in unique_speakers}
430
+ for turn, _, speaker in diarization_list:
431
+ start_sample = int(float(turn.start) * sample_rate)
432
+ end_sample = int(float(turn.end) * sample_rate)
433
+ if end_sample > start_sample:
434
+ seg_wav = waveform[0, start_sample:end_sample].contiguous()
435
+ emb = embedder({"waveform": seg_wav, "sample_rate": sample_rate})
436
+ spk_to_embs[speaker].append(emb.squeeze())
437
+ # average
438
+ for spk, embs in spk_to_embs.items():
439
+ if len(embs) == 0:
440
+ continue
441
+ # stack and mean
442
+ try:
443
+ import torch as _torch
444
+ embs_tensor = _torch.stack([_torch.as_tensor(e) for e in embs], dim=0)
445
+ centroid = embs_tensor.mean(dim=0)
446
+ # L2 normalize
447
+ centroid = centroid / (centroid.norm(p=2) + 1e-12)
448
+ speaker_embeddings[spk] = centroid.cpu().tolist()
449
+ except Exception:
450
+ # fallback to first embedding
451
+ speaker_embeddings[spk] = embs[0].cpu().tolist()
452
+ except Exception:
453
+ pass
454
+
455
  diarization_time = time.time() - start_time
456
  print(f"Diarization completed in {diarization_time:.2f} seconds")
457
 
458
+ return diarize_segments, detected_num_speakers, speaker_embeddings
459
+
460
+ def _load_embedder(self):
461
+ """Lazy-load speaker embedding inference model on GPU."""
462
+ global _embedder
463
+ if _embedder is None:
464
+ # window="whole" to compute one embedding per provided chunk
465
+ _embedder = Inference("pyannote/embedding", window="whole", device=torch.device("cuda"))
466
+ return _embedder
467
+
468
+ def assign_speakers_to_transcription(self, transcription_results, diarization_segments):
469
+ """Assign speakers to words and segments based on overlap with diarization segments."""
470
+ if not diarization_segments:
471
+ return transcription_results
472
+ # simple helper to find speaker at given time
473
+ def speaker_at(t: float):
474
+ for seg in diarization_segments:
475
+ if seg["start"] <= t < seg["end"]:
476
+ return seg["speaker"]
477
+ # if not inside, return closest segment's speaker
478
+ closest = None
479
+ best = float("inf")
480
+ for seg in diarization_segments:
481
+ if t < seg["start"]:
482
+ d = seg["start"] - t
483
+ elif t > seg["end"]:
484
+ d = t - seg["end"]
485
+ else:
486
+ d = 0.0
487
+ if d < best:
488
+ best = d
489
+ closest = seg
490
+ return closest["speaker"] if closest else "SPEAKER_00"
491
+
492
+ for seg in transcription_results:
493
+ # Assign per-word speakers
494
+ if seg.get("words"):
495
+ speaker_counts = {}
496
+ for w in seg["words"]:
497
+ mid = (float(w["start"]) + float(w["end"])) / 2.0
498
+ spk = speaker_at(mid)
499
+ w["speaker"] = spk
500
+ speaker_counts[spk] = speaker_counts.get(spk, 0) + (float(w["end"]) - float(w["start"]))
501
+ # Segment speaker = speaker with max accumulated word duration
502
+ if speaker_counts:
503
+ seg["speaker"] = max(speaker_counts.items(), key=lambda kv: kv[1])[0]
504
+ else:
505
+ mid = (float(seg["start"]) + float(seg["end"])) / 2.0
506
+ seg["speaker"] = speaker_at(mid)
507
+ return transcription_results
508
 
509
  def group_segments_by_speaker(self, segments, max_gap=1.0, max_duration=30.0):
510
  """Group consecutive segments from the same speaker"""
 
548
  return grouped_segments
549
 
550
  @spaces.GPU # each call gets a GPU slice
551
+ def process_audio_full(self, task_json, language=None, translate=False, prompt=None, group_segments=True, batch_size=16):
552
+ """Process a single chunk using task JSON (no diarization)."""
553
+ if not task_json or not str(task_json).strip():
554
+ return {"error": "No JSON provided"}
555
+
556
+ pre_meta = None
557
  try:
558
  print("Starting full transcription pipeline...")
559
 
560
+ # Step 1: Preprocess per chunk JSON
561
+ print("Preprocessing chunk JSON...")
562
+ pre_meta = self.preprocess_from_task_json(task_json)
563
+ if pre_meta.get("skip"):
564
+ return {"segments": [], "language": "unknown", "num_speakers": 1, "transcription_method": "full_audio_batched", "batch_size": batch_size}
565
+ wav_path = pre_meta["out_wav_path"]
566
+ # Adjust timestamps by trimmed_start_ms: abs_start_ms is already global start for saved file
567
+ base_offset_s = float(pre_meta.get("abs_start_ms", 0)) / 1000.0
568
 
569
  # Step 2: Transcribe the entire audio with batching
570
  transcription_results, detected_language = self.transcribe_full_audio(
571
+ wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s
572
  )
573
 
574
  # Step 3: Group segments if requested (based on time gaps and sentence endings)
 
589
  traceback.print_exc()
590
  return {"error": f"Processing failed: {str(e)}"}
591
  finally:
592
+ # Clean up preprocessed wav
593
+ if pre_meta and pre_meta.get("out_wav_path") and os.path.exists(pre_meta["out_wav_path"]):
594
+ try:
595
+ os.unlink(pre_meta["out_wav_path"])
596
+ except Exception:
597
+ pass
598
 
599
  @spaces.GPU # each call gets a GPU slice
600
+ def process_audio(self, task_json, num_speakers=None, language=None,
601
  translate=False, prompt=None, group_segments=True, batch_size=8):
602
+ """Main processing function with diarization using task JSON for a single chunk.
603
+
604
+ Transcribes full (preprocessed) audio once, performs diarization, merges speakers into transcription.
605
+ """
606
+ if not task_json or not str(task_json).strip():
607
+ return {"error": "No JSON provided"}
608
+
609
+ pre_meta = None
610
  try:
611
  print("Starting new processing pipeline...")
612
 
613
+ # Step 1: Preprocess per chunk JSON
614
+ print("Preprocessing chunk JSON...")
615
+ pre_meta = self.preprocess_from_task_json(task_json)
616
+ if pre_meta.get("skip"):
617
+ return {"segments": [], "language": "unknown", "num_speakers": 0, "transcription_method": "diarized_segments_batched", "batch_size": batch_size}
618
+ wav_path = pre_meta["out_wav_path"]
619
+ base_offset_s = float(pre_meta.get("abs_start_ms", 0)) / 1000.0
620
 
621
+ # Step 2: Transcribe full audio once
622
+ transcription_results, detected_language = self.transcribe_full_audio(
623
+ wav_path, language, translate, prompt, batch_size, base_offset_s=base_offset_s
624
  )
625
+
626
+ # Step 3: Perform diarization with global offset
627
+ diarization_segments, detected_num_speakers, speaker_embeddings = self.perform_diarization(
628
+ wav_path, num_speakers, base_offset_s=base_offset_s
 
 
 
629
  )
630
+
631
+ # Step 4: Merge diarization into transcription (assign speakers)
632
+ transcription_results = self.assign_speakers_to_transcription(transcription_results, diarization_segments)
633
 
634
  # Step 5: Group segments if requested
635
  if group_segments:
 
641
  "language": detected_language,
642
  "num_speakers": detected_num_speakers,
643
  "transcription_method": "diarized_segments_batched",
644
+ "batch_size": batch_size,
645
+ "speaker_embeddings": speaker_embeddings,
646
  }
647
 
648
  except Exception as e:
 
650
  traceback.print_exc()
651
  return {"error": f"Processing failed: {str(e)}"}
652
  finally:
653
+ # Clean up preprocessed wav
654
+ if pre_meta and pre_meta.get("out_wav_path") and os.path.exists(pre_meta["out_wav_path"]):
655
+ try:
656
+ os.unlink(pre_meta["out_wav_path"])
657
+ except Exception:
658
+ pass
659
 
660
  # Initialize transcriber
661
  transcriber = WhisperTranscriber()
 
692
  return output
693
 
694
  @spaces.GPU
695
+ def process_audio_gradio(task_json, num_speakers, language, translate, prompt, group_segments, use_diarization, batch_size):
696
  """Gradio interface function"""
697
  if use_diarization:
698
  result = transcriber.process_audio(
699
+ task_json=task_json,
700
  num_speakers=num_speakers if num_speakers > 0 else None,
701
  language=language if language != "auto" else None,
702
  translate=translate,
 
706
  )
707
  else:
708
  result = transcriber.process_audio_full(
709
+ task_json=task_json,
710
  language=language if language != "auto" else None,
711
  translate=translate,
712
  prompt=prompt if prompt and prompt.strip() else None,
 
735
 
736
  with gr.Row():
737
  with gr.Column():
738
+ task_json_input = gr.Textbox(
739
+ label="🧾 Paste Task JSON",
740
+ placeholder="Paste the per-chunk task JSON here...",
741
+ lines=16,
742
  )
743
 
744
  with gr.Accordion("⚙️ Advanced Settings", open=False):
 
750
 
751
  batch_size = gr.Slider(
752
  minimum=1,
753
+ maximum=128,
754
  value=16,
755
  step=1,
756
  label="Batch Size",
 
793
  with gr.Column():
794
  output_text = gr.Markdown(
795
  label="๐Ÿ“ Transcription Results",
796
+ value="Paste task JSON and click 'Transcribe Audio' to get started!"
797
  )
798
 
799
  output_json = gr.JSON(
 
812
  process_btn.click(
813
  fn=process_audio_gradio,
814
  inputs=[
815
+ task_json_input,
816
  num_speakers,
817
  language,
818
  translate,
 
827
  # Examples
828
  gr.Markdown("### 📋 Usage Tips:")
829
  gr.Markdown("""
830
+ - Paste a single-chunk task JSON matching the preprocess schema
831
+ - Batch Size: Higher values (16-24) = faster but uses more GPU memory
832
+ - Speaker diarization: Enable for speaker identification (slower)
833
+ - Languages: Supports 100+ languages with auto-detection
834
+ - Vocabulary: Add names and technical terms in the prompt for better accuracy
835
  """)
836
 
837
  if __name__ == "__main__":
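
For orientation, here is a sketch of the dict shape that the updated process_audio returns on success. The keys are taken from the assignments in this diff; the values shown are purely illustrative.

# Illustrative result shape for process_audio(); keys come from this diff,
# values are made up. Timestamps are global seconds (chunk offset applied).
example_result = {
    "segments": [
        {
            "start": 600.42,
            "end": 603.10,
            "text": " Hello there.",
            "speaker": "SPEAKER_01",
            "avg_logprob": -0.21,
            "words": [
                {"start": 600.42, "end": 600.80, "word": " Hello",
                 "probability": 0.98, "speaker": "SPEAKER_01"},
            ],
            "duration": 2.68,
        },
    ],
    "language": "en",
    "num_speakers": 2,
    "transcription_method": "diarized_segments_batched",
    "batch_size": 8,
    "speaker_embeddings": {  # L2-normalized per-speaker centroids (truncated here)
        "SPEAKER_00": [0.012, -0.034],
        "SPEAKER_01": [-0.008, 0.051],
    },
}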
requirements.txt CHANGED
@@ -19,4 +19,5 @@ librosa>=0.10.0
19
  soundfile>=0.12.0
20
  ffmpeg-python>=0.2.0
21
  requests>=2.28.0
22
- nvidia-cudnn-cu12==9.1.0.70 # any 9.1.x that pip can find is fine
 
 
19
  soundfile>=0.12.0
20
  ffmpeg-python>=0.2.0
21
  requests>=2.28.0
22
+ nvidia-cudnn-cu12==9.1.0.70 # any 9.1.x that pip can find is fine
23
+ webrtcvad>=2.0.10
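
The newly added webrtcvad dependency accepts only 16-bit mono PCM at 8, 16, 32, or 48 kHz in 10/20/30 ms frames, which is why the preprocessing helper decodes to s16le 16 kHz and walks fixed 30 ms frames. A minimal standalone sanity check (illustrative only):

# Quick check of the webrtcvad contract the new helper depends on.
import webrtcvad

sr = 16000                                        # sample rate supported by webrtcvad
frame_ms = 30                                     # allowed frame sizes: 10, 20, or 30 ms
vad = webrtcvad.Vad(2)                            # aggressiveness 0-3 (matches VAD_LEVEL)
silence = b"\x00\x00" * (sr // 1000) * frame_ms   # one 30 ms frame of s16le silence
print(vad.is_speech(silence, sr))                 # expected: False for pure silence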