liuyang committed · Commit e045021 · 1 Parent(s): c0cf9b3

update requirements.txt

Files changed (2):
  1. app.py +18 -51
  2. requirements.txt +4 -0
app.py CHANGED
@@ -14,7 +14,6 @@ from transformers import pipeline
 from pyannote.audio import Pipeline
 import requests
 import base64
-from typing import List, Dict, Any, Optional, Tuple
 
 # Install flash attention for acceleration
 try:
@@ -59,7 +58,7 @@ class WhisperTranscriber:
             print(f"Could not load diarization model: {e}")
             self.diarization_model = None
 
-    def convert_audio_format(self, audio_path: str) -> str:
+    def convert_audio_format(self, audio_path):
         """Convert audio to 16kHz mono WAV format"""
         temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         temp_wav_path = temp_wav.name
@@ -76,13 +75,7 @@ class WhisperTranscriber:
             raise RuntimeError(f"Audio conversion failed: {e}")
 
     @spaces.GPU
-    def transcribe_audio(
-        self,
-        audio_path: str,
-        language: Optional[str] = None,
-        translate: bool = False,
-        prompt: Optional[str] = None
-    ) -> Tuple[List[Dict], str]:
+    def transcribe_audio(self, audio_path, language=None, translate=False, prompt=None):
         """Transcribe audio using Whisper with flash attention"""
         if self.pipe is None:
             self.setup_models()
@@ -132,11 +125,7 @@ class WhisperTranscriber:
         return segments, detected_language
 
     @spaces.GPU
-    def perform_diarization(
-        self,
-        audio_path: str,
-        num_speakers: Optional[int] = None
-    ) -> Tuple[List[Dict], int]:
+    def perform_diarization(self, audio_path, num_speakers=None):
         """Perform speaker diarization"""
         if self.diarization_model is None:
             print("Diarization model not available, assigning single speaker")
@@ -173,11 +162,7 @@ class WhisperTranscriber:
 
         return diarize_segments, detected_num_speakers
 
-    def merge_transcription_and_diarization(
-        self,
-        transcription_segments: List[Dict],
-        diarization_segments: List[Dict]
-    ) -> List[Dict]:
+    def merge_transcription_and_diarization(self, transcription_segments, diarization_segments):
         """Merge transcription segments with speaker information"""
         if not diarization_segments:
             # No diarization available, assign single speaker
@@ -214,12 +199,7 @@ class WhisperTranscriber:
 
         return final_segments
 
-    def group_segments_by_speaker(
-        self,
-        segments: List[Dict],
-        max_gap: float = 1.0,
-        max_duration: float = 30.0
-    ) -> List[Dict]:
+    def group_segments_by_speaker(self, segments, max_gap=1.0, max_duration=30.0):
         """Group consecutive segments from the same speaker"""
         if not segments:
             return segments
@@ -260,15 +240,8 @@ class WhisperTranscriber:
         return grouped_segments
 
     @spaces.GPU
-    def process_audio(
-        self,
-        audio_file,
-        num_speakers: Optional[int] = None,
-        language: Optional[str] = None,
-        translate: bool = False,
-        prompt: Optional[str] = None,
-        group_segments: bool = True
-    ) -> Dict[str, Any]:
+    def process_audio(self, audio_file, num_speakers=None, language=None,
+                      translate=False, prompt=None, group_segments=True):
         """Main processing function"""
         if audio_file is None:
             return {"error": "No audio file provided"}
@@ -318,7 +291,7 @@ class WhisperTranscriber:
 # Initialize transcriber
 transcriber = WhisperTranscriber()
 
-def format_segments_for_display(result: Dict[str, Any]) -> str:
+def format_segments_for_display(result):
     """Format segments for display in Gradio"""
     if "error" in result:
         return f"❌ Error: {result['error']}"
@@ -345,21 +318,14 @@ def format_segments_for_display(result: Dict[str, Any]) -> str:
 
     return output
 
-def process_audio_gradio(
-    audio_file,
-    num_speakers,
-    language,
-    translate,
-    prompt,
-    group_segments
-):
+def process_audio_gradio(audio_file, num_speakers, language, translate, prompt, group_segments):
     """Gradio interface function"""
     result = transcriber.process_audio(
         audio_file=audio_file,
         num_speakers=num_speakers if num_speakers > 0 else None,
         language=language if language != "auto" else None,
         translate=translate,
-        prompt=prompt if prompt.strip() else None,
+        prompt=prompt if prompt and prompt.strip() else None,
         group_segments=group_segments
     )
 
@@ -367,10 +333,12 @@ def process_audio_gradio(
     return formatted_output, result
 
 # Create Gradio interface
-with gr.Blocks(
+demo = gr.Blocks(
     title="🎙️ Whisper Transcription with Speaker Diarization",
-    theme=gr.themes.Soft()
-) as demo:
+    theme="default"
+)
+
+with demo:
     gr.Markdown("""
     # 🎙️ Advanced Audio Transcription & Speaker Diarization
 
@@ -385,7 +353,7 @@ with gr.Blocks(
             audio_input = gr.Audio(
                 label="🎵 Upload Audio File",
                 type="filepath",
-                sources=["upload", "microphone"]
+                source="upload"
             )
 
             with gr.Accordion("⚙️ Advanced Settings", open=False):
@@ -419,7 +387,7 @@ with gr.Blocks(
                     value=True
                 )
 
            process_btn = gr.Button("🚀 Transcribe Audio", variant="primary", size="lg")
+            process_btn = gr.Button("🚀 Transcribe Audio", variant="primary")
 
         with gr.Column():
             output_text = gr.Markdown(
@@ -443,8 +411,7 @@ with gr.Blocks(
             prompt,
             group_segments
         ],
-        outputs=[output_text, output_json],
-        show_progress=True
+        outputs=[output_text, output_json]
     )
 
     # Examples
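
For reference, the two `gr.Blocks` construction styles this diff switches between are interchangeable; here is a minimal sketch of the pattern the diff moves to (the `launch()` call and placeholder names are assumptions for illustration, not part of this commit):

```python
import gradio as gr

# Style the diff adopts: construct the Blocks object first, then populate it.
demo = gr.Blocks(title="Demo")
with demo:
    greeting = gr.Markdown("Hello")

# Equivalent context-manager style the diff moves away from:
# with gr.Blocks(title="Demo") as demo:
#     greeting = gr.Markdown("Hello")

if __name__ == "__main__":
    demo.launch()  # assumed entry point; app.py's launch code is outside this diff
```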
requirements.txt CHANGED
@@ -4,6 +4,10 @@ torchaudio>=2.0.0
 transformers>=4.35.0
 accelerate>=0.24.0
 
+# Gradio and Spaces - matching SDK version 4.44.1
+gradio==4.44.1
+spaces>=0.19.0
+
 # Audio processing and transcription
 ffmpeg-python>=0.2.0
 librosa>=0.10.0
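
Since the new comment ties the pin to the Space's SDK version, a runtime guard can surface version drift early; a minimal sketch (the check itself is an assumption, not part of this commit):

```python
import gradio as gr

# Fail fast if the installed Gradio drifts from the version pinned in requirements.txt.
PINNED = "4.44.1"
if gr.__version__ != PINNED:
    raise RuntimeError(f"Gradio {gr.__version__} installed; requirements.txt pins {PINNED}")
```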