liuyang committed on
Commit 726a091 · 1 Parent(s): 54da026

restore to whisper

Files changed (2):
  1. app.py +55 -178
  2. requirements.txt +2 -2
app.py CHANGED
@@ -37,7 +37,6 @@ import tempfile
 import spaces
 from faster_whisper import WhisperModel, BatchedInferencePipeline
 from faster_whisper.vad import VadOptions
-import whisperx
 import requests
 import base64
 from pyannote.audio import Pipeline, Inference, Model
@@ -133,17 +132,14 @@ MODELS = {
 }
 DEFAULT_MODEL = "large-v3-turbo"
 
-# Supported languages for alignment models (whisperX)
-ALIGN_LANGUAGES = ["en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh", "ar", "nl", "tr", "pl", "cs", "sv", "da", "fi", "no", "uk"]
-
 def _download_model(model_name: str):
-    """Downloads a faster-whisper model from the hub if not already present."""
+    """Downloads a model from the hub if not already present."""
     if model_name not in MODELS:
         raise ValueError(f"Model '{model_name}' not found in MODELS registry.")
 
     model_info = MODELS[model_name]
     if not os.path.exists(model_info["local_dir"]):
-        print(f"Downloading faster-whisper model '{model_name}' from {model_info['repo_id']}...")
+        print(f"Downloading model '{model_name}' from {model_info['repo_id']}...")
         snapshot_download(
             repo_id=model_info["repo_id"],
             local_dir=model_info["local_dir"],
@@ -152,11 +148,9 @@ def _download_model(model_name: str):
         )
     return model_info["local_dir"]
 
-# Download all faster-whisper models on startup
-print("Downloading all faster-whisper models...")
+# Download the default model on startup
 for model in MODELS:
     _download_model(model)
-print("All faster-whisper models downloaded!")
 
 
 # -----------------------------------------------------------------------------
@@ -384,32 +378,13 @@ def _process_single_chunk(task: dict, out_dir: str) -> dict:
 # Lazy global holder ----------------------------------------------------------
 _whisper_models = {}
 _batched_whisper_models = {}
+_whipser_x_transcribe_models = {}
 _whipser_x_align_models = {}
 
 _diarizer = None
 _embedder = None
 
-# Preload WhisperX alignment models at startup (no GPU decorator needed)
-print("Preloading all WhisperX alignment models...")
-for lang in ALIGN_LANGUAGES:
-    try:
-        print(f"Loading alignment model for language '{lang}'...")
-        device = "cuda"
-
-        align_model, align_metadata = whisperx.load_align_model(
-            language_code=lang,
-            device=device,
-            model_dir=CACHE_ROOT
-        )
-        _whipser_x_align_models[lang] = {
-            "model": align_model,
-            "metadata": align_metadata
-        }
-        print(f"Alignment model for '{lang}' loaded successfully")
-    except Exception as e:
-        print(f"Could not load alignment model for '{lang}': {e}")
-
-# Create global diarization pipeline at startup
+# Create global diarization pipeline
 try:
     print("Loading diarization model...")
     torch.backends.cuda.matmul.allow_tf32 = True
@@ -428,24 +403,15 @@ except Exception as e:
     print(f"Could not load diarization model: {e}")
     _diarizer = None
 
-print("WhisperX alignment and diarization models preloaded successfully!")
-
 @spaces.GPU  # GPU is guaranteed to exist *inside* this function
-def _load_faster_whisper_model(model_name: str):
-    """Load a specific faster-whisper model on GPU (lazy loading)"""
-    global _whisper_models, _batched_whisper_models
-
-    if model_name in _whisper_models:
-        print(f"Faster-whisper model '{model_name}' already loaded")
-        return _whisper_models[model_name], _batched_whisper_models[model_name]
-
-    if model_name not in MODELS:
-        raise ValueError(f"Model '{model_name}' not found in MODELS registry. Available: {list(MODELS.keys())}")
+def _load_models(model_name: str = DEFAULT_MODEL):
+    global _whisper_models, _batched_whisper_models, _diarizer
 
-    print(f"Loading faster-whisper model '{model_name}' on GPU...")
-    model_cache_path = _download_model(model_name)
-
-    try:
+    if model_name not in _whisper_models:
+        print(f"Loading Whisper model '{model_name}'...")
+
+        model_cache_path = _download_model(model_name)
+
         model = WhisperModel(
             model_cache_path,
             device="cuda",
@@ -458,24 +424,12 @@ def _load_faster_whisper_model(model_name: str):
         _whisper_models[model_name] = model
         _batched_whisper_models[model_name] = batched_model
 
-        print(f"Faster-whisper model '{model_name}' and batched pipeline loaded successfully")
-        return model, batched_model
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        raise RuntimeError(f"Could not load faster-whisper model '{model_name}': {e}")
-
-# Optional: Preload all faster-whisper models explicitly
-@spaces.GPU
-def preload_all_whisper_models():
-    """Preload all faster-whisper models - optional, for faster first-time use"""
-    print("Preloading all faster-whisper models...")
-    for model_name in MODELS.keys():
-        try:
-            _load_faster_whisper_model(model_name)
-        except Exception as e:
-            print(f"Failed to preload model '{model_name}': {e}")
-    print("All faster-whisper models preloaded!")
+        print(f"Whisper model '{model_name}' and batched pipeline loaded successfully")
+
+    whisper = _whisper_models[model_name]
+    batched_whisper = _batched_whisper_models[model_name]
+
+    return whisper, batched_whisper, _diarizer
 
 # -----------------------------------------------------------------------------
 class WhisperTranscriber:
@@ -504,18 +458,10 @@ class WhisperTranscriber:
 
     @spaces.GPU  # each call gets a GPU slice
     def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None, batch_size=16, base_offset_s: float = 0.0, clip_timestamps=None, model_name: str = DEFAULT_MODEL, transcribe_options: dict = None):
-        """Transcribe the entire audio file using faster-whisper, then align with WhisperX"""
-        global _whisper_models, _batched_whisper_models, _whipser_x_align_models
+        """Transcribe the entire audio file without speaker diarization using batched inference"""
+        whisper, batched_whisper, _ = _load_models(model_name)  # models live on the GPU
 
-        # Get preloaded faster-whisper model, or load it if not available
-        if model_name not in _whisper_models:
-            print(f"Faster-whisper model '{model_name}' not preloaded, loading now...")
-            _load_faster_whisper_model(model_name)
-
-        whisper = _whisper_models[model_name]
-        batched_whisper = _batched_whisper_models[model_name]
-
-        print(f"Transcribing full audio with faster-whisper '{model_name}' and batch size {batch_size}...")
+        print(f"Transcribing full audio with '{model_name}' and batch size {batch_size}...")
         start_time = time.time()
 
         # Prepare options for batched inference
@@ -528,125 +474,65 @@ class WhisperTranscriber:
             language_detection_segments=1,
             task="translate" if translate else "transcribe",
         )
-
         if clip_timestamps:
             options["vad_filter"] = False
             options["clip_timestamps"] = clip_timestamps
         else:
-            vad_options = transcribe_options.get("vad_parameters", None) if transcribe_options else None
+            vad_options = transcribe_options.get("vad_parameters", None)
             options["vad_filter"] = True  # VAD is enabled by default for batched transcription
             options["vad_parameters"] = VadOptions(**vad_options) if vad_options else VadOptions(
                 max_speech_duration_s=whisper.feature_extractor.chunk_length,
                 min_speech_duration_ms=180,  # ignore ultra-short blips
-                min_silence_duration_ms=120,  # split on short Mandarin pauses (if supported)
+                min_silence_duration_ms=120,  # split on short Mandarin pauses (if supported)
                 speech_pad_ms=120,
                 threshold=0.35,
                 neg_threshold=0.2,
             )
-
         if batch_size > 1:
             # Use batched inference for better performance
             segments, transcript_info = batched_whisper.transcribe(
-                audio_path,
-                batch_size=batch_size,
+                audio_path,
+                batch_size=batch_size,
                 **options
             )
         else:
             segments, transcript_info = whisper.transcribe(
-                audio_path,
+                audio_path,
                 **options
             )
         segments = list(segments)
 
         detected_language = transcript_info.language
-        print(f"Detected language: {detected_language}, segments: {len(segments)}, transcribing done in {time.time() - start_time:.2f} seconds")
-
-        # Align with WhisperX if alignment model is available
-        aligned_segments = segments
-        if detected_language in _whipser_x_align_models:
-            print(f"Performing WhisperX alignment for language '{detected_language}'...")
-            align_start = time.time()
-            try:
-                # Load audio for whisperx alignment
-                audio = whisperx.load_audio(audio_path)
-
-                # Convert faster-whisper segments to whisperx format
-                whisperx_segments = []
-                for seg in segments:
-                    whisperx_segments.append({
-                        "start": seg.start,
-                        "end": seg.end,
-                        "text": seg.text
-                    })
-
-                align_info = _whipser_x_align_models[detected_language]
-                result = whisperx.align(
-                    whisperx_segments,
-                    align_info["model"],
-                    align_info["metadata"],
-                    audio,
-                    "cuda",
-                    return_char_alignments=False
-                )
-                aligned_segments = result.get("segments", segments)
-                print(f"WhisperX alignment completed in {time.time() - align_start:.2f} seconds")
-            except Exception as e:
-                print(f"WhisperX alignment failed: {e}, using original timestamps")
-                aligned_segments = segments
-        else:
-            print(f"No WhisperX alignment model available for language '{detected_language}', using faster-whisper timestamps")
+        print("Detected language: ", detected_language, "segments: ", len(segments))
 
-        # Process segments into the expected format
+        # Process segments
         results = []
-        for i, seg in enumerate(aligned_segments):
-            # Check if this is a whisperx aligned segment (dict) or faster-whisper segment (object)
-            if isinstance(seg, dict):
-                # WhisperX aligned segment
-                words_list = []
-                if "words" in seg:
-                    for word in seg["words"]:
-                        words_list.append({
-                            "start": float(word.get("start", 0.0)) + float(base_offset_s),
-                            "end": float(word.get("end", 0.0)) + float(base_offset_s),
-                            "word": word.get("word", ""),
-                            "probability": word.get("score", 1.0),
-                            "speaker": "SPEAKER_00"
-                        })
-
-                results.append({
-                    "start": float(seg.get("start", 0.0)) + float(base_offset_s),
-                    "end": float(seg.get("end", 0.0)) + float(base_offset_s),
-                    "text": seg.get("text", ""),
-                    "speaker": "SPEAKER_00",
-                    "avg_logprob": segments[i].avg_logprob if i < len(segments) else 0.0,
-                    "words": words_list,
-                    "duration": float(seg.get("end", 0.0)) - float(seg.get("start", 0.0))
-                })
-            else:
-                # Faster-whisper segment (not aligned)
-                words_list = []
-                if seg.words:
-                    for word in seg.words:
-                        words_list.append({
-                            "start": float(word.start) + float(base_offset_s),
-                            "end": float(word.end) + float(base_offset_s),
-                            "word": word.word,
-                            "probability": word.probability,
-                            "speaker": "SPEAKER_00"
-                        })
-
-                results.append({
-                    "start": float(seg.start) + float(base_offset_s),
-                    "end": float(seg.end) + float(base_offset_s),
-                    "text": seg.text,
-                    "speaker": "SPEAKER_00",
-                    "avg_logprob": seg.avg_logprob,
-                    "words": words_list,
-                    "duration": float(seg.end - seg.start)
-                })
+        for seg in segments:
+            # Create result entry with detailed format
+            words_list = []
+            if seg.words:
+                for word in seg.words:
+                    words_list.append({
+                        "start": float(word.start) + float(base_offset_s),
+                        "end": float(word.end) + float(base_offset_s),
+                        "word": word.word,
+                        "probability": word.probability,
+                        "speaker": "SPEAKER_00"  # No speaker identification in full transcription
+                    })
+
+            results.append({
+                "start": float(seg.start) + float(base_offset_s),
+                "end": float(seg.end) + float(base_offset_s),
+                "text": seg.text,
+                "speaker": "SPEAKER_00",  # Single speaker assumption
+                "avg_logprob": seg.avg_logprob,
+                "words": words_list,
+                "duration": float(seg.end - seg.start)
+            })
 
         transcription_time = time.time() - start_time
-        print(f"Full audio transcribed and aligned in {transcription_time:.2f} seconds using batch size {batch_size}")
+        print(f"Full audio transcribed in {transcription_time:.2f} seconds using batch size {batch_size}")
+        print(results)
         return results, detected_language
 
     # Removed audio cutting; transcription is done once on the full (preprocessed) audio
@@ -654,9 +540,9 @@ class WhisperTranscriber:
     @spaces.GPU  # each call gets a GPU slice
     def perform_diarization(self, audio_path, num_speakers=None, base_offset_s: float = 0.0):
         """Perform speaker diarization; return segments with global timestamps and per-speaker embeddings."""
-        global _diarizer
+        _, _, diarizer = _load_models()  # models live on the GPU
 
-        if _diarizer is None:
+        if diarizer is None:
             print("Diarization model not available, creating single speaker segment")
             # Load audio to get duration
             waveform, sample_rate = torchaudio.load(audio_path)
@@ -689,7 +575,7 @@ class WhisperTranscriber:
         waveform, sample_rate = torchaudio.load(audio_path)
 
         # Perform diarization
-        diarization = _diarizer(
+        diarization = diarizer(
             {"waveform": waveform, "sample_rate": sample_rate},
             num_speakers=num_speakers,
         )
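
The hunk above only swaps the global `_diarizer` for the locally returned `diarizer`; for orientation, here is a minimal, hedged sketch of how a pyannote diarization result is typically consumed (the checkpoint name, audio path, and speaker count are illustrative assumptions, not values from this commit):

# Illustrative sketch only: run a pyannote pipeline and read back (start, end, speaker) turns.
import torchaudio
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")  # assumed checkpoint; gated, needs an HF access token
waveform, sample_rate = torchaudio.load("audio.wav")                     # placeholder path
diarization = pipeline({"waveform": waveform, "sample_rate": sample_rate}, num_speakers=2)

for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{turn.start:.2f}s - {turn.end:.2f}s: {speaker}")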
@@ -1604,14 +1490,5 @@ with demo:
     - Vocabulary: Add names and technical terms in the prompt for better accuracy
     """)
 
-    # Preload all whisper models once at service initialization
-    print("Preloading all WhisperX transcribe models at startup...")
-    try:
-        preload_all_whisper_models()
-        print("All WhisperX transcribe models preloaded at startup!")
-    except Exception as e:
-        print(f"Warning: Could not preload whisper models at startup: {e}")
-        print("Models will be loaded on first use instead.")
-
 if __name__ == "__main__":
     demo.launch(debug=True)
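
Taken together, the app.py changes drop the WhisperX alignment pass and return to a plain faster-whisper pipeline: load the CTranslate2 model once, wrap it in a BatchedInferencePipeline, and transcribe with VAD filtering. A minimal sketch of that flow, with an illustrative model size, audio path, and parameters rather than the app's exact values:

# Hedged sketch of the restored faster-whisper path (not the app's exact code).
from faster_whisper import WhisperModel, BatchedInferencePipeline
from faster_whisper.vad import VadOptions

model = WhisperModel("large-v3-turbo", device="cuda", compute_type="float16")  # size string assumed available in this faster-whisper release
batched = BatchedInferencePipeline(model=model)

vad = VadOptions(min_speech_duration_ms=180, speech_pad_ms=120, threshold=0.35)
segments, info = batched.transcribe(
    "audio.wav",           # placeholder path
    batch_size=16,
    vad_filter=True,
    vad_parameters=vad,
    word_timestamps=True,
)
segments = list(segments)  # the generator must be consumed for transcription to actually run
print(info.language, len(segments))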
 
requirements.txt CHANGED
@@ -5,8 +5,8 @@ transformers==4.48.0
 pydantic==2.10.6
 
 # 2. Main whisper model - using whisperx instead of faster-whisper
-ctranslate2==4.4.0
-whisperx
+faster-whisper==1.1.1
+ctranslate2==4.5.0
 torch
 
 # 3. Extra libs your app really needs
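
As a quick, optional sanity check after installing the pins above (assuming both packages expose __version__, which recent releases do):

import faster_whisper
import ctranslate2

print(faster_whisper.__version__, ctranslate2.__version__)  # expected to match the pins: 1.1.1 and 4.5.0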