update app file
Browse files
app.py
CHANGED
|
@@ -81,6 +81,29 @@ tts_pipe = hf_pipeline(
|
|
| 81 |
print(" TTS loaded")
|
| 82 |
print("All models loaded!")
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
# =============================================================================
|
| 86 |
# Pipeline functions
|
|
@@ -110,29 +133,64 @@ def split_into_sentences(text):
|
|
| 110 |
def transcribe(audio_array, sample_rate=16000):
|
| 111 |
"""ASR: English audio to text.
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
|
|
|
| 115 |
"""
|
| 116 |
if len(audio_array) < 1600: # Less than 0.1s
|
| 117 |
return ""
|
| 118 |
|
| 119 |
duration_s = len(audio_array) / sample_rate
|
| 120 |
|
| 121 |
-
if
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
# Short: standard single-pass transcription
|
| 131 |
result = asr_pipe(
|
| 132 |
{"raw": audio_array, "sampling_rate": sample_rate},
|
| 133 |
return_timestamps=False,
|
| 134 |
)
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
|
| 138 |
def translate_sentence(text, max_length=256, fast=False):
|
|
@@ -468,7 +526,7 @@ def stretch_audio_to_duration(input_audio_path, output_audio_path, target_durati
|
|
| 468 |
|
| 469 |
|
| 470 |
def mux_video_with_new_audio(video_path, audio_path, output_video_path):
|
| 471 |
-
"""Combine original video
|
| 472 |
cmd = [
|
| 473 |
"ffmpeg", "-y",
|
| 474 |
"-i", video_path, # input video (with original audio)
|
|
@@ -486,6 +544,31 @@ def mux_video_with_new_audio(video_path, audio_path, output_video_path):
|
|
| 486 |
return output_video_path
|
| 487 |
|
| 488 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
def dub_video(video_path, progress=gr.Progress()):
|
| 490 |
"""
|
| 491 |
Full video dubbing pipeline:
|
|
@@ -598,25 +681,47 @@ def dub_video(video_path, progress=gr.Progress()):
|
|
| 598 |
log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
|
| 599 |
log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio ({n_yo} sentences)")
|
| 600 |
|
| 601 |
-
# Step 5:
|
| 602 |
-
|
|
|
|
|
|
|
| 603 |
t0 = time.time()
|
|
|
|
| 604 |
stretch_ratio = yoruba_duration / video_duration
|
| 605 |
-
log_lines.append(f"\n**Alignment** ({
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
else:
|
| 611 |
-
#
|
|
|
|
|
|
|
| 612 |
import shutil
|
| 613 |
shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
|
| 615 |
-
# Step 6: Mux with
|
| 616 |
progress(0.95, desc="Combining audio and video...")
|
| 617 |
t0 = time.time()
|
| 618 |
-
|
| 619 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
|
| 621 |
total = time.time() - total_start
|
| 622 |
log_lines.append(f"\n---\n**Total processing time:** {total:.1f}s")
|
|
|
|
| 81 |
print(" TTS loaded")
|
| 82 |
print("All models loaded!")
|
| 83 |
|
| 84 |
+
# Diagnostic: confirm models are actually on the expected device
print("\n=== Device diagnostics ===")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
# Report each model's device.  The getters are lazy (zero-arg lambdas) so a
# missing attribute or empty parameter list is caught per-model inside the
# loop instead of aborting the whole report.
for _label, _get_model in (
    ("ASR", lambda: asr_pipe.model),
    ("MT", lambda: mt_model),
    ("TTS", lambda: tts_pipe.model),
):
    try:
        print(f"{_label} model on: {next(_get_model().parameters()).device}")
    except Exception as e:
        print(f"{_label} device check failed: {e}")
print("==========================\n")
|
| 106 |
+
|
| 107 |
|
| 108 |
# =============================================================================
|
| 109 |
# Pipeline functions
|
|
|
|
| 133 |
def transcribe(audio_array, sample_rate=16000):
    """ASR: English audio to text.

    For short audio (<=28s): uses the HF pipeline (fast, single-pass).
    For long audio: uses native Whisper generate() with long-form support,
    which is dramatically faster than the pipeline's chunking mode.

    Args:
        audio_array: 1-D numpy float array of mono audio samples.
        sample_rate: sample rate of ``audio_array`` in Hz.  Non-16kHz input
            is resampled internally (Whisper requires exactly 16 kHz).

    Returns:
        Transcribed English text stripped of surrounding whitespace;
        "" for clips shorter than 0.1 s.
    """
    # Skip clips shorter than 0.1s at the *incoming* rate.  (Previously this
    # was a hard-coded 1600 samples, which is 0.1s only at 16 kHz even though
    # other rates are accepted and resampled below.)
    if len(audio_array) < 0.1 * sample_rate:
        return ""

    duration_s = len(audio_array) / sample_rate

    # Resample to 16kHz if needed (Whisper requires exactly 16kHz)
    if sample_rate != 16000:
        import torchaudio.functional as F_audio
        audio_tensor = torch.from_numpy(audio_array).float()
        audio_tensor = F_audio.resample(audio_tensor, sample_rate, 16000)
        audio_array = audio_tensor.numpy()
        sample_rate = 16000

    if duration_s <= 28:
        # Short audio: standard single-pass transcription via pipeline
        result = asr_pipe(
            {"raw": audio_array, "sampling_rate": sample_rate},
            return_timestamps=False,
        )
        return result["text"].strip()

    # Long audio: use native Whisper generate() with built-in long-form support.
    # This is dramatically faster than pipeline(chunk_length_s=...).
    model = asr_pipe.model
    processor = asr_pipe.feature_extractor
    tokenizer = asr_pipe.tokenizer

    # Feed the full audio; Whisper's native long-form mode chunks internally,
    # so we must disable truncation and request an attention mask.
    inputs = processor(
        audio_array,
        sampling_rate=16000,
        return_tensors="pt",
        truncation=False,
        padding="longest",
        return_attention_mask=True,
    )
    input_features = inputs.input_features.to(DEVICE, dtype=TORCH_DTYPE)
    attention_mask = inputs.attention_mask.to(DEVICE) if "attention_mask" in inputs else None

    # Long-form generation needs timestamp tokens to segment the audio.
    generate_kwargs = {
        "return_timestamps": True,
        "language": "en",
        "task": "transcribe",
    }
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask

    with torch.no_grad():
        predicted_ids = model.generate(input_features, **generate_kwargs)

    transcription = tokenizer.batch_decode(
        predicted_ids, skip_special_tokens=True
    )[0]
    return transcription.strip()
|
| 194 |
|
| 195 |
|
| 196 |
def translate_sentence(text, max_length=256, fast=False):
|
|
|
|
| 526 |
|
| 527 |
|
| 528 |
def mux_video_with_new_audio(video_path, audio_path, output_video_path):
|
| 529 |
+
"""Combine original video with new audio track into final MP4."""
|
| 530 |
cmd = [
|
| 531 |
"ffmpeg", "-y",
|
| 532 |
"-i", video_path, # input video (with original audio)
|
|
|
|
| 544 |
return output_video_path
|
| 545 |
|
| 546 |
|
| 547 |
+
def mux_video_extended_with_audio(video_path, audio_path, output_video_path, target_duration_s):
    """Mux *audio_path* onto *video_path*, freeze-framing the video's last
    frame (ffmpeg ``tpad`` filter, clone mode) so the picture lasts until
    the audio ends, then trimming the output to *target_duration_s*.

    Returns the output path; raises RuntimeError if ffmpeg fails.
    """
    freeze_filter = f"[0:v]tpad=stop_mode=clone:stop_duration={target_duration_s}[v]"

    ffmpeg_args = ["ffmpeg", "-y", "-i", video_path, "-i", audio_path]
    ffmpeg_args += ["-filter_complex", freeze_filter]
    ffmpeg_args += ["-map", "[v]", "-map", "1:a:0"]
    ffmpeg_args += ["-c:v", "libx264", "-preset", "fast", "-c:a", "aac"]
    ffmpeg_args += ["-t", str(target_duration_s), output_video_path]

    proc = subprocess.run(ffmpeg_args, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"ffmpeg video extension failed:\n{proc.stderr}")
    return output_video_path
|
| 570 |
+
|
| 571 |
+
|
| 572 |
def dub_video(video_path, progress=gr.Progress()):
|
| 573 |
"""
|
| 574 |
Full video dubbing pipeline:
|
|
|
|
| 681 |
log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
|
| 682 |
log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio ({n_yo} sentences)")
|
| 683 |
|
| 684 |
+
# Step 5: Decide alignment strategy
|
| 685 |
+
# Cap stretch at 1.2x to avoid unnatural-sounding audio.
|
| 686 |
+
# If Yoruba needs more compression than that, extend the video instead.
|
| 687 |
+
progress(0.85, desc="Aligning audio to video...")
|
| 688 |
t0 = time.time()
|
| 689 |
+
MAX_STRETCH = 1.2 # Maximum 1.2x speedup allowed
|
| 690 |
stretch_ratio = yoruba_duration / video_duration
|
| 691 |
+
log_lines.append(f"\n**Alignment** (ratio: {stretch_ratio:.2f}x)")
|
| 692 |
+
|
| 693 |
+
if stretch_ratio <= MAX_STRETCH:
|
| 694 |
+
# Stretch is acceptable - shrink Yoruba audio to fit video
|
| 695 |
+
log_lines.append(f"Stretching audio to fit {video_duration:.1f}s video")
|
| 696 |
+
if abs(stretch_ratio - 1.0) > 0.02:
|
| 697 |
+
stretch_audio_to_duration(yoruba_audio_raw, yoruba_audio_aligned, video_duration)
|
| 698 |
+
else:
|
| 699 |
+
import shutil
|
| 700 |
+
shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
|
| 701 |
+
final_duration = video_duration
|
| 702 |
+
extend_video = False
|
| 703 |
else:
|
| 704 |
+
# Stretch would be too aggressive - keep natural speed and extend video
|
| 705 |
+
log_lines.append(f"Ratio exceeds {MAX_STRETCH}x cap - keeping natural speed")
|
| 706 |
+
log_lines.append(f"Video will be extended from {video_duration:.1f}s to {yoruba_duration:.1f}s")
|
| 707 |
import shutil
|
| 708 |
shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
|
| 709 |
+
final_duration = yoruba_duration
|
| 710 |
+
extend_video = True
|
| 711 |
+
|
| 712 |
+
log_lines.append(f"Alignment took {time.time()-t0:.2f}s")
|
| 713 |
|
| 714 |
+
# Step 6: Mux with video (extend if needed)
|
| 715 |
progress(0.95, desc="Combining audio and video...")
|
| 716 |
t0 = time.time()
|
| 717 |
+
if extend_video:
|
| 718 |
+
mux_video_extended_with_audio(
|
| 719 |
+
video_path, yoruba_audio_aligned, output_video, final_duration
|
| 720 |
+
)
|
| 721 |
+
log_lines.append(f"\n**Muxing** ({time.time()-t0:.2f}s) - video extended by freezing last frame")
|
| 722 |
+
else:
|
| 723 |
+
mux_video_with_new_audio(video_path, yoruba_audio_aligned, output_video)
|
| 724 |
+
log_lines.append(f"\n**Muxing** ({time.time()-t0:.2f}s)")
|
| 725 |
|
| 726 |
total = time.time() - total_start
|
| 727 |
log_lines.append(f"\n---\n**Total processing time:** {total:.1f}s")
|