Spaces:

nsfwalex
/

whisper-transcribe-new

Runtime error

App Files Community

liuyang committed on Jul 20

Commit

6d56dd1

1 Parent(s): 99ff812

lazy load

Browse files

Files changed (1) hide show

app.py +42 -30

app.py CHANGED Viewed

@@ -30,37 +30,44 @@ import tempfile
 import spaces
 from faster_whisper import WhisperModel
 from faster_whisper.vad import VadOptions
-from pyannote.audio import Pipeline
 import requests
 import base64
-# Create global Whisper model
-print("Loading Whisper model...")
-model = WhisperModel(
-    "large-v3-turbo",
-    device="cuda",
-    compute_type="float16",
-)
-print("Whisper model loaded successfully")
-# Create global diarization pipeline
-diarization_pipe = None
-try:
-    print("Loading diarization model...")
-    diarization_pipe = Pipeline.from_pretrained(
-        "pyannote/speaker-diarization-3.1",
-        use_auth_token=os.getenv("HF_TOKEN"),
-        torch_dtype=torch.float16,
-    ).to(torch.device("cuda"))
-    print("Diarization model loaded successfully")
-except Exception as e:
-    print(f"Could not load diarization model: {e}")
-    diarization_pipe = None
 class WhisperTranscriber:
     def __init__(self):
-        self.model = model  # Use global Whisper model
-        self.diarization_model = diarization_pipe  # Use global diarization pipeline
     def convert_audio_format(self, audio_path):
         """Convert audio to 16kHz mono WAV format"""
@@ -109,9 +116,11 @@ class WhisperTranscriber:
         return audio_segments
-    @spaces.GPU
     def transcribe_audio_segments(self, audio_segments, language=None, translate=False, prompt=None):
         """Transcribe multiple audio segments using faster_whisper"""
         print(f"Transcribing {len(audio_segments)} audio segments...")
         start_time = time.time()
@@ -121,7 +130,7 @@ class WhisperTranscriber:
             beam_size=5,
             vad_filter=True,
             vad_parameters=VadOptions(
-                max_speech_duration_s=self.model.feature_extractor.chunk_length,
                 min_speech_duration_ms=100,
                 speech_pad_ms=100,
                 threshold=0.25,
@@ -140,7 +149,7 @@ class WhisperTranscriber:
             print(f"Processing segment {i+1}/{len(audio_segments)}")
             # Transcribe this segment
-            segments, transcript_info = self.model.transcribe(segment["audio_path"], **options)
             segments = list(segments)
             # Get detected language from first segment
@@ -181,9 +190,12 @@ class WhisperTranscriber:
         return results, detected_language
     def perform_diarization(self, audio_path, num_speakers=None):
         """Perform speaker diarization"""
-        if self.diarization_model is None:
             print("Diarization model not available, creating single speaker segment")
             # Load audio to get duration
             waveform, sample_rate = torchaudio.load(audio_path)
@@ -201,7 +213,7 @@ class WhisperTranscriber:
         waveform, sample_rate = torchaudio.load(audio_path)
         # Perform diarization
-        diarization = self.diarization_model(
             {"waveform": waveform, "sample_rate": sample_rate},
             num_speakers=num_speakers,
         )
@@ -266,7 +278,7 @@ class WhisperTranscriber:
         return grouped_segments
-    @spaces.GPU
     def process_audio(self, audio_file, num_speakers=None, language=None,
                      translate=False, prompt=None, group_segments=True):
         """Main processing function - diarization first, then transcription"""

 import spaces
 from faster_whisper import WhisperModel
 from faster_whisper.vad import VadOptions
 import requests
 import base64
+# Lazy global holder ----------------------------------------------------------
+_whisper = None
+_diarizer = None
+@spaces.GPU   # GPU is guaranteed to exist *inside* this function
+def _load_models():  # Build both models on first use and return them as (whisper, diarizer).
+    global _whisper, _diarizer  # module-level cache; NOTE(review): confirm it persists across @spaces.GPU calls
+    if _whisper is None:  # construct Whisper only while nothing is cached yet
+        print("Loading Whisper model...")
+        _whisper = WhisperModel(
+            "large-v3-turbo",
+            device="cuda",
+            compute_type="float16",
+        )
+        print("Whisper model loaded successfully")
+    if _diarizer is None:  # still None after a failed load, so the load is attempted again on the next call
+        print("Loading diarization model...")
+        try:
+            from pyannote.audio import Pipeline  # imported inside the try so an import failure is caught as well
+            _diarizer = Pipeline.from_pretrained(
+                "pyannote/speaker-diarization-3.1",
+                use_auth_token=os.getenv("HF_TOKEN"),  # token is read from the HF_TOKEN environment variable
+                torch_dtype=torch.float16,
+            ).to(torch.device("cuda"))
+            print("Diarization model loaded successfully")
+        except Exception as e:  # best-effort: the failure is printed and the diarizer is left unset
+            print(f"Could not load diarization model: {e}")
+            _diarizer = None
+    return _whisper, _diarizer  # the second element is None when the diarization pipeline could not be loaded
+# -----------------------------------------------------------------------------
 class WhisperTranscriber:
     def __init__(self):
+        # do **not** create the models here!
+        pass
     def convert_audio_format(self, audio_path):
         """Convert audio to 16kHz mono WAV format"""
         return audio_segments
+    @spaces.GPU           # each call gets a GPU slice
     def transcribe_audio_segments(self, audio_segments, language=None, translate=False, prompt=None):
         """Transcribe multiple audio segments using faster_whisper"""
+        whisper, diarizer = _load_models()   # models live on the GPU
         print(f"Transcribing {len(audio_segments)} audio segments...")
         start_time = time.time()
             beam_size=5,
             vad_filter=True,
             vad_parameters=VadOptions(
+                max_speech_duration_s=whisper.feature_extractor.chunk_length,
                 min_speech_duration_ms=100,
                 speech_pad_ms=100,
                 threshold=0.25,
             print(f"Processing segment {i+1}/{len(audio_segments)}")
             # Transcribe this segment
+            segments, transcript_info = whisper.transcribe(segment["audio_path"], **options)
             segments = list(segments)
             # Get detected language from first segment
         return results, detected_language
+    @spaces.GPU           # each call gets a GPU slice
     def perform_diarization(self, audio_path, num_speakers=None):
         """Perform speaker diarization"""
+        whisper, diarizer = _load_models()   # models live on the GPU
+        if diarizer is None:
             print("Diarization model not available, creating single speaker segment")
             # Load audio to get duration
             waveform, sample_rate = torchaudio.load(audio_path)
         waveform, sample_rate = torchaudio.load(audio_path)
         # Perform diarization
+        diarization = diarizer(
             {"waveform": waveform, "sample_rate": sample_rate},
             num_speakers=num_speakers,
         )
         return grouped_segments
+    @spaces.GPU           # each call gets a GPU slice
     def process_audio(self, audio_file, num_speakers=None, language=None,
                      translate=False, prompt=None, group_segments=True):
         """Main processing function - diarization first, then transcription"""