update app file
Browse files
app.py
CHANGED
|
@@ -81,6 +81,29 @@ tts_pipe = hf_pipeline(
|
|
| 81 |
print(" TTS loaded")
|
| 82 |
print("All models loaded!")
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
# =============================================================================
|
| 86 |
# Pipeline functions
|
|
@@ -110,29 +133,64 @@ def split_into_sentences(text):
|
|
| 110 |
def transcribe(audio_array, sample_rate=16000):
|
| 111 |
"""ASR: English audio to text.
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
|
|
|
| 115 |
"""
|
| 116 |
if len(audio_array) < 1600: # Less than 0.1s
|
| 117 |
return ""
|
| 118 |
|
| 119 |
duration_s = len(audio_array) / sample_rate
|
| 120 |
|
| 121 |
-
if
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
# Short: standard single-pass transcription
|
| 131 |
result = asr_pipe(
|
| 132 |
{"raw": audio_array, "sampling_rate": sample_rate},
|
| 133 |
return_timestamps=False,
|
| 134 |
)
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
|
| 138 |
def translate_sentence(text, max_length=256, fast=False):
|
|
@@ -468,7 +526,7 @@ def stretch_audio_to_duration(input_audio_path, output_audio_path, target_durati
|
|
| 468 |
|
| 469 |
|
| 470 |
def mux_video_with_new_audio(video_path, audio_path, output_video_path):
|
| 471 |
-
"""Combine original video
|
| 472 |
cmd = [
|
| 473 |
"ffmpeg", "-y",
|
| 474 |
"-i", video_path, # input video (with original audio)
|
|
@@ -486,6 +544,31 @@ def mux_video_with_new_audio(video_path, audio_path, output_video_path):
|
|
| 486 |
return output_video_path
|
| 487 |
|
| 488 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
def dub_video(video_path, progress=gr.Progress()):
|
| 490 |
"""
|
| 491 |
Full video dubbing pipeline:
|
|
@@ -598,25 +681,47 @@ def dub_video(video_path, progress=gr.Progress()):
|
|
| 598 |
log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
|
| 599 |
log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio ({n_yo} sentences)")
|
| 600 |
|
| 601 |
-
# Step 5:
|
| 602 |
-
|
|
|
|
|
|
|
| 603 |
t0 = time.time()
|
|
|
|
| 604 |
stretch_ratio = yoruba_duration / video_duration
|
| 605 |
-
log_lines.append(f"\n**Alignment** ({
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
else:
|
| 611 |
-
#
|
|
|
|
|
|
|
| 612 |
import shutil
|
| 613 |
shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
|
| 615 |
-
# Step 6: Mux with
|
| 616 |
progress(0.95, desc="Combining audio and video...")
|
| 617 |
t0 = time.time()
|
| 618 |
-
|
| 619 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
|
| 621 |
total = time.time() - total_start
|
| 622 |
log_lines.append(f"\n---\n**Total processing time:** {total:.1f}s")
|
|
|
|
| 81 |
print(" TTS loaded")
|
| 82 |
print("All models loaded!")
|
| 83 |
|
| 84 |
+
# Diagnostic: confirm models are actually on the expected device
print("\n=== Device diagnostics ===")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
# Report each model's device.  The getters are lazy (zero-arg lambdas) so a
# missing attribute or empty parameter list is caught per-model inside the
# loop instead of aborting the whole report.
for _label, _get_model in (
    ("ASR", lambda: asr_pipe.model),
    ("MT", lambda: mt_model),
    ("TTS", lambda: tts_pipe.model),
):
    try:
        print(f"{_label} model on: {next(_get_model().parameters()).device}")
    except Exception as e:
        print(f"{_label} device check failed: {e}")
print("==========================\n")
|
| 106 |
+
|
| 107 |
|
| 108 |
# =============================================================================
|
| 109 |
# Pipeline functions
|
|
|
|
| 133 |
def transcribe(audio_array, sample_rate=16000):
    """ASR: English audio to text.

    For short audio (<=28s): uses the HF pipeline (fast, single-pass).
    For long audio: uses native Whisper generate() with long-form support,
    which is dramatically faster than the pipeline's chunking mode.

    Args:
        audio_array: 1-D numpy float array of mono audio samples.
        sample_rate: sample rate of ``audio_array`` in Hz.  Non-16kHz input
            is resampled internally (Whisper requires exactly 16 kHz).

    Returns:
        Transcribed English text stripped of surrounding whitespace;
        "" for clips shorter than 0.1 s.
    """
    # Skip clips shorter than 0.1s at the *incoming* rate.  (Previously this
    # was a hard-coded 1600 samples, which is 0.1s only at 16 kHz even though
    # other rates are accepted and resampled below.)
    if len(audio_array) < 0.1 * sample_rate:
        return ""

    duration_s = len(audio_array) / sample_rate

    # Resample to 16kHz if needed (Whisper requires exactly 16kHz)
    if sample_rate != 16000:
        import torchaudio.functional as F_audio
        audio_tensor = torch.from_numpy(audio_array).float()
        audio_tensor = F_audio.resample(audio_tensor, sample_rate, 16000)
        audio_array = audio_tensor.numpy()
        sample_rate = 16000

    if duration_s <= 28:
        # Short audio: standard single-pass transcription via pipeline
        result = asr_pipe(
            {"raw": audio_array, "sampling_rate": sample_rate},
            return_timestamps=False,
        )
        return result["text"].strip()

    # Long audio: use native Whisper generate() with built-in long-form support.
    # This is dramatically faster than pipeline(chunk_length_s=...).
    model = asr_pipe.model
    processor = asr_pipe.feature_extractor
    tokenizer = asr_pipe.tokenizer

    # Feed the full audio; Whisper's native long-form mode chunks internally,
    # so we must disable truncation and request an attention mask.
    inputs = processor(
        audio_array,
        sampling_rate=16000,
        return_tensors="pt",
        truncation=False,
        padding="longest",
        return_attention_mask=True,
    )
    input_features = inputs.input_features.to(DEVICE, dtype=TORCH_DTYPE)
    attention_mask = inputs.attention_mask.to(DEVICE) if "attention_mask" in inputs else None

    # Long-form generation needs timestamp tokens to segment the audio.
    generate_kwargs = {
        "return_timestamps": True,
        "language": "en",
        "task": "transcribe",
    }
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask

    with torch.no_grad():
        predicted_ids = model.generate(input_features, **generate_kwargs)

    transcription = tokenizer.batch_decode(
        predicted_ids, skip_special_tokens=True
    )[0]
    return transcription.strip()
|
| 194 |
|
| 195 |
|
| 196 |
def translate_sentence(text, max_length=256, fast=False):
|
|
|
|
| 526 |
|
| 527 |
|
| 528 |
def mux_video_with_new_audio(video_path, audio_path, output_video_path):
|
| 529 |
+
"""Combine original video with new audio track into final MP4."""
|
| 530 |
cmd = [
|
| 531 |
"ffmpeg", "-y",
|
| 532 |
"-i", video_path, # input video (with original audio)
|
|
|
|
| 544 |
return output_video_path
|
| 545 |
|
| 546 |
|
| 547 |
+
def mux_video_extended_with_audio(video_path, audio_path, output_video_path, target_duration_s):
    """Mux *audio_path* onto *video_path*, freeze-framing the video's last
    frame (ffmpeg ``tpad`` filter, clone mode) so the picture lasts until
    the audio ends, then trimming the output to *target_duration_s*.

    Returns the output path; raises RuntimeError if ffmpeg fails.
    """
    freeze_filter = f"[0:v]tpad=stop_mode=clone:stop_duration={target_duration_s}[v]"

    ffmpeg_args = ["ffmpeg", "-y", "-i", video_path, "-i", audio_path]
    ffmpeg_args += ["-filter_complex", freeze_filter]
    ffmpeg_args += ["-map", "[v]", "-map", "1:a:0"]
    ffmpeg_args += ["-c:v", "libx264", "-preset", "fast", "-c:a", "aac"]
    ffmpeg_args += ["-t", str(target_duration_s), output_video_path]

    proc = subprocess.run(ffmpeg_args, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"ffmpeg video extension failed:\n{proc.stderr}")
    return output_video_path
|
| 570 |
+
|
| 571 |
+
|
| 572 |
def dub_video(video_path, progress=gr.Progress()):
|
| 573 |
"""
|
| 574 |
Full video dubbing pipeline:
|
|
|
|
| 681 |
log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
|
| 682 |
log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio ({n_yo} sentences)")
|
| 683 |
|
| 684 |
+
# Step 5: Decide alignment strategy
|
| 685 |
+
# Cap stretch at 1.2x to avoid unnatural-sounding audio.
|
| 686 |
+
# If Yoruba needs more compression than that, extend the video instead.
|
| 687 |
+
progress(0.85, desc="Aligning audio to video...")
|
| 688 |
t0 = time.time()
|
| 689 |
+
MAX_STRETCH = 1.2 # Maximum 1.2x speedup allowed
|
| 690 |
stretch_ratio = yoruba_duration / video_duration
|
| 691 |
+
log_lines.append(f"\n**Alignment** (ratio: {stretch_ratio:.2f}x)")
|
| 692 |
+
|
| 693 |
+
if stretch_ratio <= MAX_STRETCH:
|
| 694 |
+
# Stretch is acceptable - shrink Yoruba audio to fit video
|
| 695 |
+
log_lines.append(f"Stretching audio to fit {video_duration:.1f}s video")
|
| 696 |
+
if abs(stretch_ratio - 1.0) > 0.02:
|
| 697 |
+
stretch_audio_to_duration(yoruba_audio_raw, yoruba_audio_aligned, video_duration)
|
| 698 |
+
else:
|
| 699 |
+
import shutil
|
| 700 |
+
shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
|
| 701 |
+
final_duration = video_duration
|
| 702 |
+
extend_video = False
|
| 703 |
else:
|
| 704 |
+
# Stretch would be too aggressive - keep natural speed and extend video
|
| 705 |
+
log_lines.append(f"Ratio exceeds {MAX_STRETCH}x cap - keeping natural speed")
|
| 706 |
+
log_lines.append(f"Video will be extended from {video_duration:.1f}s to {yoruba_duration:.1f}s")
|
| 707 |
import shutil
|
| 708 |
shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
|
| 709 |
+
final_duration = yoruba_duration
|
| 710 |
+
extend_video = True
|
| 711 |
+
|
| 712 |
+
log_lines.append(f"Alignment took {time.time()-t0:.2f}s")
|
| 713 |
|
| 714 |
+
# Step 6: Mux with video (extend if needed)
|
| 715 |
progress(0.95, desc="Combining audio and video...")
|
| 716 |
t0 = time.time()
|
| 717 |
+
if extend_video:
|
| 718 |
+
mux_video_extended_with_audio(
|
| 719 |
+
video_path, yoruba_audio_aligned, output_video, final_duration
|
| 720 |
+
)
|
| 721 |
+
log_lines.append(f"\n**Muxing** ({time.time()-t0:.2f}s) - video extended by freezing last frame")
|
| 722 |
+
else:
|
| 723 |
+
mux_video_with_new_audio(video_path, yoruba_audio_aligned, output_video)
|
| 724 |
+
log_lines.append(f"\n**Muxing** ({time.time()-t0:.2f}s)")
|
| 725 |
|
| 726 |
total = time.time() - total_start
|
| 727 |
log_lines.append(f"\n---\n**Total processing time:** {total:.1f}s")
|