PlotweaverModel committed on
Commit
b73bcd3
·
verified ·
1 Parent(s): fdd3dce

update app file

Browse files
Files changed (1) hide show
  1. app.py +130 -25
app.py CHANGED
@@ -81,6 +81,29 @@ tts_pipe = hf_pipeline(
81
  print(" TTS loaded")
82
  print("All models loaded!")
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  # =============================================================================
86
  # Pipeline functions
@@ -110,29 +133,64 @@ def split_into_sentences(text):
110
  def transcribe(audio_array, sample_rate=16000):
111
  """ASR: English audio to text.
112
 
113
- Automatically handles both short (<30s) and long audio by enabling
114
- timestamps and chunking for longer audio.
 
115
  """
116
  if len(audio_array) < 1600: # Less than 0.1s
117
  return ""
118
 
119
  duration_s = len(audio_array) / sample_rate
120
 
121
- if duration_s > 28:
122
- # Long-form: enable chunking and timestamps (required by Whisper)
123
- result = asr_pipe(
124
- {"raw": audio_array, "sampling_rate": sample_rate},
125
- return_timestamps=True,
126
- chunk_length_s=25,
127
- stride_length_s=5,
128
- )
129
- else:
130
- # Short: standard single-pass transcription
131
  result = asr_pipe(
132
  {"raw": audio_array, "sampling_rate": sample_rate},
133
  return_timestamps=False,
134
  )
135
- return result["text"].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
 
138
  def translate_sentence(text, max_length=256, fast=False):
@@ -468,7 +526,7 @@ def stretch_audio_to_duration(input_audio_path, output_audio_path, target_durati
468
 
469
 
470
  def mux_video_with_new_audio(video_path, audio_path, output_video_path):
471
- """Combine original video (no audio) with new audio track into final MP4."""
472
  cmd = [
473
  "ffmpeg", "-y",
474
  "-i", video_path, # input video (with original audio)
@@ -486,6 +544,31 @@ def mux_video_with_new_audio(video_path, audio_path, output_video_path):
486
  return output_video_path
487
 
488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  def dub_video(video_path, progress=gr.Progress()):
490
  """
491
  Full video dubbing pipeline:
@@ -598,25 +681,47 @@ def dub_video(video_path, progress=gr.Progress()):
598
  log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
599
  log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio ({n_yo} sentences)")
600
 
601
- # Step 5: Time-align Yoruba audio to match video duration
602
- progress(0.85, desc="Aligning audio to video duration...")
 
 
603
  t0 = time.time()
 
604
  stretch_ratio = yoruba_duration / video_duration
605
- log_lines.append(f"\n**Alignment** ({time.time()-t0:.2f}s)")
606
- log_lines.append(f"Stretch ratio: {stretch_ratio:.2f}x (target: {video_duration:.1f}s)")
607
-
608
- if abs(stretch_ratio - 1.0) > 0.02: # Only stretch if >2% difference
609
- stretch_audio_to_duration(yoruba_audio_raw, yoruba_audio_aligned, video_duration)
 
 
 
 
 
 
 
610
  else:
611
- # Ratios close enough just copy
 
 
612
  import shutil
613
  shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
 
 
 
 
614
 
615
- # Step 6: Mux with original video
616
  progress(0.95, desc="Combining audio and video...")
617
  t0 = time.time()
618
- mux_video_with_new_audio(video_path, yoruba_audio_aligned, output_video)
619
- log_lines.append(f"\n**Muxing** ({time.time()-t0:.2f}s)")
 
 
 
 
 
 
620
 
621
  total = time.time() - total_start
622
  log_lines.append(f"\n---\n**Total processing time:** {total:.1f}s")
 
81
  print(" TTS loaded")
82
  print("All models loaded!")
83
 
84
# Diagnostic: confirm models are actually on the expected device.
# Runs once at startup, right after model loading. Each per-model check is
# wrapped in its own try/except so a missing attribute on one pipeline can
# never block app startup — failures are printed and skipped.
print(f"\n=== Device diagnostics ===")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
try:
    # The device of the first parameter tells us where the model lives
    asr_device = next(asr_pipe.model.parameters()).device
    print(f"ASR model on: {asr_device}")
except Exception as e:
    print(f"ASR device check failed: {e}")
try:
    mt_device = next(mt_model.parameters()).device
    print(f"MT model on: {mt_device}")
except Exception as e:
    print(f"MT device check failed: {e}")
try:
    tts_device = next(tts_pipe.model.parameters()).device
    print(f"TTS model on: {tts_device}")
except Exception as e:
    print(f"TTS device check failed: {e}")
print(f"==========================\n")
106
+
107
 
108
  # =============================================================================
109
  # Pipeline functions
 
133
def transcribe(audio_array, sample_rate=16000):
    """ASR: English audio to text.

    For short audio (<=28s): uses the HF pipeline (fast, single-pass).
    For long audio: uses native Whisper generate() with long-form support,
    which is dramatically faster than the pipeline's chunking mode.

    Args:
        audio_array: 1-D numpy array of audio samples.
        sample_rate: sample rate of ``audio_array`` in Hz. Audio is
            resampled to 16 kHz internally if different (Whisper requires
            exactly 16 kHz).

    Returns:
        The transcribed text, whitespace-stripped; "" for near-silent or
        too-short input.
    """
    # Guard: skip clips shorter than 0.1 s. Expressed in samples at the
    # *incoming* rate so the threshold is correct for any sample_rate
    # (the previous hardcoded 1600 was only right at 16 kHz).
    if len(audio_array) < int(0.1 * sample_rate):
        return ""

    duration_s = len(audio_array) / sample_rate

    # Resample to 16kHz if needed (Whisper requires exactly 16kHz)
    if sample_rate != 16000:
        import torchaudio.functional as F_audio
        audio_tensor = torch.from_numpy(audio_array).float()
        audio_tensor = F_audio.resample(audio_tensor, sample_rate, 16000)
        audio_array = audio_tensor.numpy()
        sample_rate = 16000

    if duration_s <= 28:
        # Short audio: standard single-pass transcription via pipeline
        result = asr_pipe(
            {"raw": audio_array, "sampling_rate": sample_rate},
            return_timestamps=False,
        )
        return result["text"].strip()

    # Long audio: use native Whisper generate() with built-in long-form support.
    # This is dramatically faster than pipeline(chunk_length_s=...).
    model = asr_pipe.model
    processor = asr_pipe.feature_extractor
    tokenizer = asr_pipe.tokenizer

    # Process the full audio - Whisper's native long-form handles chunking internally
    inputs = processor(
        audio_array,
        sampling_rate=16000,
        return_tensors="pt",
        truncation=False,          # keep full length; do not cut at 30 s
        padding="longest",
        return_attention_mask=True,
    )
    input_features = inputs.input_features.to(DEVICE, dtype=TORCH_DTYPE)
    attention_mask = inputs.attention_mask.to(DEVICE) if "attention_mask" in inputs else None

    generate_kwargs = {
        # Timestamps are required by Whisper's long-form generation path
        "return_timestamps": True,
        "language": "en",
        "task": "transcribe",
    }
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask

    with torch.no_grad():
        predicted_ids = model.generate(input_features, **generate_kwargs)

    transcription = tokenizer.batch_decode(
        predicted_ids, skip_special_tokens=True
    )[0]
    return transcription.strip()
194
 
195
 
196
  def translate_sentence(text, max_length=256, fast=False):
 
526
 
527
 
528
  def mux_video_with_new_audio(video_path, audio_path, output_video_path):
529
+ """Combine original video with new audio track into final MP4."""
530
  cmd = [
531
  "ffmpeg", "-y",
532
  "-i", video_path, # input video (with original audio)
 
544
  return output_video_path
545
 
546
 
547
def mux_video_extended_with_audio(video_path, audio_path, output_video_path, target_duration_s):
    """
    Combine video with longer audio by extending video (freezing last frame).
    Uses ffmpeg's tpad filter to hold the last frame until audio ends.

    Args:
        video_path: path to the input video; its own audio track is discarded.
        audio_path: path to the replacement (dubbed) audio track.
        output_video_path: where the final MP4 is written.
        target_duration_s: desired output duration in seconds (the dubbed
            audio's length); output is hard-capped to this via ffmpeg "-t".

    Returns:
        output_video_path (for caller convenience).

    Raises:
        RuntimeError: if ffmpeg exits non-zero (stderr included in the message).
    """
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-i", audio_path,
        # tpad with stop_mode=clone freezes (clones) the last video frame.
        # NOTE(review): stop_duration is the amount of padding *added*, not a
        # total duration, so padding by the full target over-pads — harmless
        # only because the "-t" option below truncates the output. Confirm.
        "-filter_complex",
        f"[0:v]tpad=stop_mode=clone:stop_duration={target_duration_s}[v]",
        "-map", "[v]",      # take video from the padded filter output
        "-map", "1:a:0",    # take audio exclusively from the second input
        "-c:v", "libx264",  # re-encode is required once a filter graph is used
        "-preset", "fast",
        "-c:a", "aac",
        "-t", str(target_duration_s),  # cap output at the target duration
        output_video_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg video extension failed:\n{result.stderr}")
    return output_video_path
570
+
571
+
572
  def dub_video(video_path, progress=gr.Progress()):
573
  """
574
  Full video dubbing pipeline:
 
681
  log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
682
  log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio ({n_yo} sentences)")
683
 
684
+ # Step 5: Decide alignment strategy
685
+ # Cap stretch at 1.2x to avoid unnatural-sounding audio.
686
+ # If Yoruba needs more compression than that, extend the video instead.
687
+ progress(0.85, desc="Aligning audio to video...")
688
  t0 = time.time()
689
+ MAX_STRETCH = 1.2 # Maximum 1.2x speedup allowed
690
  stretch_ratio = yoruba_duration / video_duration
691
+ log_lines.append(f"\n**Alignment** (ratio: {stretch_ratio:.2f}x)")
692
+
693
+ if stretch_ratio <= MAX_STRETCH:
694
+ # Stretch is acceptable - shrink Yoruba audio to fit video
695
+ log_lines.append(f"Stretching audio to fit {video_duration:.1f}s video")
696
+ if abs(stretch_ratio - 1.0) > 0.02:
697
+ stretch_audio_to_duration(yoruba_audio_raw, yoruba_audio_aligned, video_duration)
698
+ else:
699
+ import shutil
700
+ shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
701
+ final_duration = video_duration
702
+ extend_video = False
703
  else:
704
+ # Stretch would be too aggressive - keep natural speed and extend video
705
+ log_lines.append(f"Ratio exceeds {MAX_STRETCH}x cap - keeping natural speed")
706
+ log_lines.append(f"Video will be extended from {video_duration:.1f}s to {yoruba_duration:.1f}s")
707
  import shutil
708
  shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
709
+ final_duration = yoruba_duration
710
+ extend_video = True
711
+
712
+ log_lines.append(f"Alignment took {time.time()-t0:.2f}s")
713
 
714
+ # Step 6: Mux with video (extend if needed)
715
  progress(0.95, desc="Combining audio and video...")
716
  t0 = time.time()
717
+ if extend_video:
718
+ mux_video_extended_with_audio(
719
+ video_path, yoruba_audio_aligned, output_video, final_duration
720
+ )
721
+ log_lines.append(f"\n**Muxing** ({time.time()-t0:.2f}s) - video extended by freezing last frame")
722
+ else:
723
+ mux_video_with_new_audio(video_path, yoruba_audio_aligned, output_video)
724
+ log_lines.append(f"\n**Muxing** ({time.time()-t0:.2f}s)")
725
 
726
  total = time.time() - total_start
727
  log_lines.append(f"\n---\n**Total processing time:** {total:.1f}s")