Spaces:

PlotweaverModel
/

Live-Football-Commentary

Running

App Files Files Community

PlotweaverModel committed 11 days ago

Commit

db26bd8

verified ·

1 Parent(s): 8955646

Package file upload and updating app file

Browse files

Files changed (2) hide show

app.py +245 -3
packages.txt +1 -0

app.py CHANGED Viewed

@@ -12,7 +12,11 @@ import numpy as np
 import re
 import time
 import io
 import logging
 import gradio as gr
 from transformers import (
     pipeline as hf_pipeline,
@@ -375,9 +379,200 @@ def clear_stream_state():
 # =============================================================================
-# Gradio UI
 # =============================================================================
 DESCRIPTION = """
 # Live Football Commentary \u2014 English \u2192 Yoruba
@@ -390,9 +585,11 @@ STREAMING_INSTRUCTIONS = """
 ### How to use live streaming:
 1. Click the **microphone** button to start recording
 2. Speak English commentary naturally
-3. The transcript updates live below
-4. Click **Clear** to reset
 """.format(chunk_dur=CHUNK_DURATION_S)
 EXAMPLES_TEXT = [
@@ -538,6 +735,51 @@ with gr.Blocks(
                 outputs=[text_audio_output, text_log],
             )
     gr.Markdown("""
 ---
 **Models:**

 import re
 import time
 import io
+import os
+import subprocess
+import tempfile
 import logging
+import soundfile as sf
 import gradio as gr
 from transformers import (
     pipeline as hf_pipeline,
 # =============================================================================
+# Video Dubbing Pipeline
 # =============================================================================
+def extract_audio_from_video(video_path, output_audio_path, target_sr=16000):
+    """Extract audio track from video file as 16kHz mono WAV using ffmpeg."""
+    cmd = [
+        "ffmpeg", "-y",           # overwrite output
+        "-i", video_path,         # input video
+        "-vn",                    # no video
+        "-acodec", "pcm_s16le",   # 16-bit PCM
+        "-ar", str(target_sr),    # sample rate
+        "-ac", "1",               # mono
+        output_audio_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"ffmpeg audio extraction failed:\n{result.stderr}")
+    return output_audio_path
+def get_video_duration(video_path):
+    """Get video duration in seconds using ffprobe."""
+    cmd = [
+        "ffprobe", "-v", "error",
+        "-show_entries", "format=duration",
+        "-of", "default=noprint_wrappers=1:nokey=1",
+        video_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"ffprobe failed: {result.stderr}")
+    return float(result.stdout.strip())
+def stretch_audio_to_duration(input_audio_path, output_audio_path, target_duration_s):
+    """
+    Stretch or compress audio to match a target duration using ffmpeg's atempo filter.
+    atempo accepts 0.5-2.0 per filter; chain multiple for larger ratios.
+    """
+    # Get current audio duration
+    current_duration = get_video_duration(input_audio_path)
+    if current_duration <= 0:
+        raise RuntimeError("Invalid audio duration")
+    # Calculate the tempo ratio (>1 speeds up, <1 slows down)
+    ratio = current_duration / target_duration_s
+    # atempo filter is limited to 0.5-2.0; chain if needed
+    filters = []
+    remaining = ratio
+    while remaining > 2.0:
+        filters.append("atempo=2.0")
+        remaining /= 2.0
+    while remaining < 0.5:
+        filters.append("atempo=0.5")
+        remaining /= 0.5
+    filters.append(f"atempo={remaining:.4f}")
+    filter_str = ",".join(filters)
+    cmd = [
+        "ffmpeg", "-y",
+        "-i", input_audio_path,
+        "-filter:a", filter_str,
+        output_audio_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"ffmpeg tempo adjustment failed:\n{result.stderr}")
+    return output_audio_path
+def mux_video_with_new_audio(video_path, audio_path, output_video_path):
+    """Combine original video (no audio) with new audio track into final MP4."""
+    cmd = [
+        "ffmpeg", "-y",
+        "-i", video_path,              # input video (with original audio)
+        "-i", audio_path,              # new audio track
+        "-c:v", "copy",                # copy video stream without re-encoding
+        "-c:a", "aac",                 # encode audio as AAC (standard for MP4)
+        "-map", "0:v:0",               # take video from first input
+        "-map", "1:a:0",               # take audio from second input
+        "-shortest",                   # stop at shortest stream
+        output_video_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"ffmpeg muxing failed:\n{result.stderr}")
+    return output_video_path
+def dub_video(video_path, progress=gr.Progress()):
+    """
+    Full video dubbing pipeline:
+    1. Extract audio from video
+    2. Transcribe English audio
+    3. Translate to Yoruba
+    4. Synthesize Yoruba audio
+    5. Stretch to match original duration
+    6. Combine with video
+    """
+    if video_path is None:
+        return None, "Please upload a video file."
+    total_start = time.time()
+    log_lines = []
+    try:
+        # Create working directory
+        work_dir = tempfile.mkdtemp(prefix="dub_")
+        extracted_audio = os.path.join(work_dir, "original_audio.wav")
+        yoruba_audio_raw = os.path.join(work_dir, "yoruba_raw.wav")
+        yoruba_audio_aligned = os.path.join(work_dir, "yoruba_aligned.wav")
+        output_video = os.path.join(work_dir, "dubbed_output.mp4")
+        # Step 1: Extract audio from video
+        progress(0.1, desc="Extracting audio from video...")
+        t0 = time.time()
+        extract_audio_from_video(video_path, extracted_audio)
+        video_duration = get_video_duration(video_path)
+        log_lines.append(f"**Video duration:** {video_duration:.1f}s")
+        log_lines.append(f"**Audio extraction:** {time.time()-t0:.2f}s")
+        # Load extracted audio for ASR
+        audio_array, sample_rate = sf.read(extracted_audio, dtype="float32")
+        if audio_array.ndim > 1:
+            audio_array = audio_array.mean(axis=1)
+        # Step 2: ASR
+        progress(0.25, desc="Transcribing English speech...")
+        t0 = time.time()
+        english_text = transcribe(audio_array, sample_rate)
+        log_lines.append(f"\n**ASR** ({time.time()-t0:.2f}s)")
+        log_lines.append(f"{english_text[:300]}{'...' if len(english_text) > 300 else ''}")
+        if not english_text:
+            return None, "ASR returned empty text. The video may have no audible speech."
+        # Step 3: Translate (using beam search for best quality since this is batch)
+        progress(0.5, desc="Translating English to Yoruba...")
+        t0 = time.time()
+        sentences = split_into_sentences(english_text)
+        translations = []
+        for s in sentences:
+            yo = translate_sentence(s, fast=False)  # beam search for quality
+            translations.append(yo)
+        yoruba_text = ' '.join(translations)
+        log_lines.append(f"\n**MT** ({time.time()-t0:.2f}s, {len(sentences)} sentences)")
+        log_lines.append(f"{yoruba_text[:300]}{'...' if len(yoruba_text) > 300 else ''}")
+        if not yoruba_text:
+            return None, "Translation returned empty text."
+        # Step 4: TTS
+        progress(0.7, desc="Synthesizing Yoruba speech...")
+        t0 = time.time()
+        yoruba_audio, output_sr = synthesize(yoruba_text)
+        sf.write(yoruba_audio_raw, yoruba_audio, output_sr)
+        yoruba_duration = len(yoruba_audio) / output_sr
+        log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
+        log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio")
+        # Step 5: Time-align Yoruba audio to match video duration
+        progress(0.85, desc="Aligning audio to video duration...")
+        t0 = time.time()
+        stretch_ratio = yoruba_duration / video_duration
+        log_lines.append(f"\n**Alignment** ({time.time()-t0:.2f}s)")
+        log_lines.append(f"Stretch ratio: {stretch_ratio:.2f}x (target: {video_duration:.1f}s)")
+        if abs(stretch_ratio - 1.0) > 0.02:  # Only stretch if >2% difference
+            stretch_audio_to_duration(yoruba_audio_raw, yoruba_audio_aligned, video_duration)
+        else:
+            # Ratios close enough — just copy
+            import shutil
+            shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
+        # Step 6: Mux with original video
+        progress(0.95, desc="Combining audio and video...")
+        t0 = time.time()
+        mux_video_with_new_audio(video_path, yoruba_audio_aligned, output_video)
+        log_lines.append(f"\n**Muxing** ({time.time()-t0:.2f}s)")
+        total = time.time() - total_start
+        log_lines.append(f"\n---\n**Total processing time:** {total:.1f}s")
+        progress(1.0, desc="Done!")
+        return output_video, "\n".join(log_lines)
+    except Exception as e:
+        logger.exception("Video dubbing failed")
+        return None, f"Error: {str(e)}"
 DESCRIPTION = """
 # Live Football Commentary \u2014 English \u2192 Yoruba
 ### How to use live streaming:
 1. Click the **microphone** button to start recording
 2. Speak English commentary naturally
+3. Every **{chunk_dur}s**, the pipeline processes your audio and plays back Yoruba
+4. The transcript updates live below
+5. Click **Clear** to reset
+**Expected latency:** ~3\u20135 seconds behind your speech.
 """.format(chunk_dur=CHUNK_DURATION_S)
 EXAMPLES_TEXT = [
                 outputs=[text_audio_output, text_log],
             )
+        # ---- Tab 4: Video Dubbing ----
+        # Tab that uploads a video, runs dub_video on it, and shows the dubbed
+        # result together with a Markdown processing log.
+        with gr.TabItem("Video Dubbing"):
+            # Static help text shown at the top of the tab.
+            gr.Markdown("""
+### Video Dubbing (English \u2192 Yoruba)
+Upload a video with English commentary and get back the same video with Yoruba dubbed audio.
+**How it works:**
+1. Audio is extracted from your video
+2. Transcribed to English text (Whisper)
+3. Translated to Yoruba (NLLB-200 with beam search)
+4. Synthesized into Yoruba speech (MMS-TTS)
+5. Time-aligned to match the original video duration
+6. Combined with the original video (visuals preserved)
+**Note:** Processing takes approximately 30\u201360% of the video duration on GPU. A 5-minute video takes about 2\u20133 minutes to process. Lip sync is not preserved \u2014 this is standard AI dubbing.
+""")
+            with gr.Row():
+                # First column: the video input and the button that starts dubbing.
+                with gr.Column():
+                    video_input = gr.Video(
+                        label="Upload English Commentary Video",
+                        # Only file upload is listed as an input source.
+                        sources=["upload"],
+                    )
+                    video_submit = gr.Button(
+                        "Dub to Yoruba",
+                        variant="primary",
+                        size="lg"
+                    )
+                # Second column: the dubbed video and the processing log.
+                with gr.Column():
+                    video_output = gr.Video(
+                        label="Yoruba Dubbed Video (Download from player)",
+                    )
+                    video_log = gr.Markdown(
+                        label="Processing Log",
+                        value="Upload a video and click 'Dub to Yoruba' to start."
+                    )
+            # dub_video receives the value of video_input; its two return values
+            # (video path or None, log/error text) fill video_output and video_log.
+            video_submit.click(
+                fn=dub_video,
+                inputs=[video_input],
+                outputs=[video_output, video_log],
+            )
     gr.Markdown("""
 ---
 **Models:**

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg