PlotweaverModel committed on
Commit
db26bd8
·
verified ·
1 Parent(s): 8955646

Package file upload and updating app file

Browse files
Files changed (2) hide show
  1. app.py +245 -3
  2. packages.txt +1 -0
app.py CHANGED
@@ -12,7 +12,11 @@ import numpy as np
12
  import re
13
  import time
14
  import io
 
 
 
15
  import logging
 
16
  import gradio as gr
17
  from transformers import (
18
  pipeline as hf_pipeline,
@@ -375,9 +379,200 @@ def clear_stream_state():
375
 
376
 
377
  # =============================================================================
378
- # Gradio UI
379
  # =============================================================================
380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  DESCRIPTION = """
382
  # Live Football Commentary \u2014 English \u2192 Yoruba
383
 
@@ -390,9 +585,11 @@ STREAMING_INSTRUCTIONS = """
390
  ### How to use live streaming:
391
  1. Click the **microphone** button to start recording
392
  2. Speak English commentary naturally
393
- 3. The transcript updates live below
394
- 4. Click **Clear** to reset
 
395
 
 
396
  """.format(chunk_dur=CHUNK_DURATION_S)
397
 
398
  EXAMPLES_TEXT = [
@@ -538,6 +735,51 @@ with gr.Blocks(
538
  outputs=[text_audio_output, text_log],
539
  )
540
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
  gr.Markdown("""
542
  ---
543
  **Models:**
 
12
  import re
13
  import time
14
  import io
15
+ import os
16
+ import subprocess
17
+ import tempfile
18
  import logging
19
+ import soundfile as sf
20
  import gradio as gr
21
  from transformers import (
22
  pipeline as hf_pipeline,
 
379
 
380
 
381
  # =============================================================================
382
+ # Video Dubbing Pipeline
383
  # =============================================================================
384
 
385
def extract_audio_from_video(video_path, output_audio_path, target_sr=16000):
    """Extract the audio track of a video as mono 16-bit PCM WAV via ffmpeg.

    Args:
        video_path: path to the input video file.
        output_audio_path: destination path for the extracted WAV.
        target_sr: output sample rate in Hz (default 16000, the rate ASR expects).

    Returns:
        output_audio_path, for convenient chaining.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status.
    """
    ffmpeg_args = [
        "ffmpeg",
        "-y",                    # overwrite output if it exists
        "-i", video_path,        # input video
        "-vn",                   # drop the video stream
        "-acodec", "pcm_s16le",  # 16-bit PCM
        "-ar", str(target_sr),   # resample to target rate
        "-ac", "1",              # downmix to mono
        output_audio_path,
    ]
    proc = subprocess.run(ffmpeg_args, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"ffmpeg audio extraction failed:\n{proc.stderr}")
    return output_audio_path
400
+
401
+
402
def get_video_duration(video_path):
    """Return the container duration of a media file, in seconds, via ffprobe.

    Raises:
        RuntimeError: if ffprobe exits with a non-zero status.
    """
    probe_args = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        video_path,
    ]
    proc = subprocess.run(probe_args, capture_output=True, text=True)
    if proc.returncode == 0:
        return float(proc.stdout.strip())
    raise RuntimeError(f"ffprobe failed: {proc.stderr}")
414
+
415
+
416
def _atempo_filter_chain(ratio):
    """Build an ffmpeg atempo filter string for an arbitrary tempo ratio.

    A single atempo filter only accepts values in [0.5, 2.0], so larger or
    smaller ratios are achieved by chaining filters whose product is *ratio*.
    """
    filters = []
    remaining = ratio
    while remaining > 2.0:
        filters.append("atempo=2.0")
        remaining /= 2.0
    while remaining < 0.5:
        filters.append("atempo=0.5")
        remaining /= 0.5
    filters.append(f"atempo={remaining:.4f}")
    return ",".join(filters)


def stretch_audio_to_duration(input_audio_path, output_audio_path, target_duration_s):
    """
    Stretch or compress audio to match a target duration using ffmpeg's atempo filter.
    atempo accepts 0.5-2.0 per filter; chain multiple for larger ratios.

    Args:
        input_audio_path: source audio file.
        output_audio_path: destination for the tempo-adjusted audio.
        target_duration_s: desired duration in seconds (must be > 0).

    Returns:
        output_audio_path, for convenient chaining.

    Raises:
        RuntimeError: on a non-positive target/source duration or an ffmpeg failure.
    """
    # Guard explicitly instead of letting the ratio computation raise
    # ZeroDivisionError on a zero target.
    if target_duration_s <= 0:
        raise RuntimeError("Invalid target duration")

    # ffprobe reads container duration for plain audio files too, so the
    # video-duration helper is safe to reuse here.
    current_duration = get_video_duration(input_audio_path)
    if current_duration <= 0:
        raise RuntimeError("Invalid audio duration")

    # Tempo ratio: >1 speeds playback up (shorter output), <1 slows it down.
    ratio = current_duration / target_duration_s
    filter_str = _atempo_filter_chain(ratio)

    cmd = [
        "ffmpeg", "-y",
        "-i", input_audio_path,
        "-filter:a", filter_str,
        output_audio_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg tempo adjustment failed:\n{result.stderr}")
    return output_audio_path
451
+
452
+
453
def mux_video_with_new_audio(video_path, audio_path, output_video_path):
    """Mux the video stream of one file with a replacement audio track into an MP4.

    The video stream is stream-copied (no re-encode); only the audio from the
    original input is discarded. The new audio is encoded as AAC, the standard
    codec for MP4 containers, and the output stops at the shorter stream.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status.
    """
    mux_args = [
        "ffmpeg", "-y",
        "-i", video_path,         # input 0: original video
        "-i", audio_path,         # input 1: replacement audio
        "-c:v", "copy",           # copy video stream without re-encoding
        "-c:a", "aac",            # encode audio as AAC
        "-map", "0:v:0",          # video from input 0
        "-map", "1:a:0",          # audio from input 1
        "-shortest",              # truncate to the shorter stream
        output_video_path,
    ]
    proc = subprocess.run(mux_args, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"ffmpeg muxing failed:\n{proc.stderr}")
    return output_video_path
470
+
471
+
472
def dub_video(video_path, progress=gr.Progress()):
    """
    Full video dubbing pipeline:
    1. Extract audio from video
    2. Transcribe English audio
    3. Translate to Yoruba
    4. Synthesize Yoruba audio
    5. Stretch to match original duration
    6. Combine with video

    Args:
        video_path: path to the uploaded video, or None when nothing was uploaded.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        (output_video_path, markdown_log) on success, or (None, error_message)
        on any failure — errors are reported to the UI, never raised.
    """
    import shutil  # used for the no-stretch copy path below

    if video_path is None:
        return None, "Please upload a video file."

    total_start = time.time()
    log_lines = []

    try:
        # Working directory for all intermediate artifacts and the final MP4.
        work_dir = tempfile.mkdtemp(prefix="dub_")
        extracted_audio = os.path.join(work_dir, "original_audio.wav")
        yoruba_audio_raw = os.path.join(work_dir, "yoruba_raw.wav")
        yoruba_audio_aligned = os.path.join(work_dir, "yoruba_aligned.wav")
        output_video = os.path.join(work_dir, "dubbed_output.mp4")

        # Step 1: Extract audio from video
        progress(0.1, desc="Extracting audio from video...")
        t0 = time.time()
        extract_audio_from_video(video_path, extracted_audio)
        video_duration = get_video_duration(video_path)
        # Guard before it is used as a divisor in the alignment step.
        if video_duration <= 0:
            return None, "Could not determine video duration."
        log_lines.append(f"**Video duration:** {video_duration:.1f}s")
        log_lines.append(f"**Audio extraction:** {time.time()-t0:.2f}s")

        # Load extracted audio for ASR; downmix to mono if multi-channel.
        audio_array, sample_rate = sf.read(extracted_audio, dtype="float32")
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Step 2: ASR
        progress(0.25, desc="Transcribing English speech...")
        t0 = time.time()
        english_text = transcribe(audio_array, sample_rate)
        log_lines.append(f"\n**ASR** ({time.time()-t0:.2f}s)")
        log_lines.append(f"{english_text[:300]}{'...' if len(english_text) > 300 else ''}")

        if not english_text:
            return None, "ASR returned empty text. The video may have no audible speech."

        # Step 3: Translate (using beam search for best quality since this is batch)
        progress(0.5, desc="Translating English to Yoruba...")
        t0 = time.time()
        sentences = split_into_sentences(english_text)
        translations = [translate_sentence(s, fast=False) for s in sentences]
        yoruba_text = ' '.join(translations)
        log_lines.append(f"\n**MT** ({time.time()-t0:.2f}s, {len(sentences)} sentences)")
        log_lines.append(f"{yoruba_text[:300]}{'...' if len(yoruba_text) > 300 else ''}")

        if not yoruba_text:
            return None, "Translation returned empty text."

        # Step 4: TTS
        progress(0.7, desc="Synthesizing Yoruba speech...")
        t0 = time.time()
        yoruba_audio, output_sr = synthesize(yoruba_text)
        sf.write(yoruba_audio_raw, yoruba_audio, output_sr)
        yoruba_duration = len(yoruba_audio) / output_sr
        log_lines.append(f"\n**TTS** ({time.time()-t0:.2f}s)")
        log_lines.append(f"Generated {yoruba_duration:.1f}s of Yoruba audio")

        # Step 5: Time-align Yoruba audio to match video duration
        progress(0.85, desc="Aligning audio to video duration...")
        t0 = time.time()
        stretch_ratio = yoruba_duration / video_duration
        if abs(stretch_ratio - 1.0) > 0.02:  # Only stretch if >2% difference
            stretch_audio_to_duration(yoruba_audio_raw, yoruba_audio_aligned, video_duration)
        else:
            # Ratios close enough — just copy
            shutil.copy(yoruba_audio_raw, yoruba_audio_aligned)
        # Bug fix: timing was previously logged before the stretch ran,
        # so the alignment step always reported ~0.00s.
        log_lines.append(f"\n**Alignment** ({time.time()-t0:.2f}s)")
        log_lines.append(f"Stretch ratio: {stretch_ratio:.2f}x (target: {video_duration:.1f}s)")

        # Step 6: Mux with original video
        progress(0.95, desc="Combining audio and video...")
        t0 = time.time()
        mux_video_with_new_audio(video_path, yoruba_audio_aligned, output_video)
        log_lines.append(f"\n**Muxing** ({time.time()-t0:.2f}s)")

        total = time.time() - total_start
        log_lines.append(f"\n---\n**Total processing time:** {total:.1f}s")

        progress(1.0, desc="Done!")
        return output_video, "\n".join(log_lines)

    except Exception as e:
        logger.exception("Video dubbing failed")
        return None, f"Error: {str(e)}"
572
+
573
+
574
+
575
+
576
  DESCRIPTION = """
577
  # Live Football Commentary \u2014 English \u2192 Yoruba
578
 
 
585
  ### How to use live streaming:
586
  1. Click the **microphone** button to start recording
587
  2. Speak English commentary naturally
588
+ 3. Every **{chunk_dur}s**, the pipeline processes your audio and plays back Yoruba
589
+ 4. The transcript updates live below
590
+ 5. Click **Clear** to reset
591
 
592
+ **Expected latency:** ~3\u20135 seconds behind your speech.
593
  """.format(chunk_dur=CHUNK_DURATION_S)
594
 
595
  EXAMPLES_TEXT = [
 
735
  outputs=[text_audio_output, text_log],
736
  )
737
 
738
# ---- Tab 4: Video Dubbing ----
# Batch flow: upload a video, run the full extract -> ASR -> MT -> TTS ->
# align -> mux pipeline (dub_video), and return a downloadable dubbed MP4.
with gr.TabItem("Video Dubbing"):
    gr.Markdown("""
### Video Dubbing (English \u2192 Yoruba)

Upload a video with English commentary and get back the same video with Yoruba dubbed audio.

**How it works:**
1. Audio is extracted from your video
2. Transcribed to English text (Whisper)
3. Translated to Yoruba (NLLB-200 with beam search)
4. Synthesized into Yoruba speech (MMS-TTS)
5. Time-aligned to match the original video duration
6. Combined with the original video (visuals preserved)

**Note:** Processing takes approximately 30\u201360% of the video duration on GPU. A 5-minute video takes about 2\u20133 minutes to process. Lip sync is not preserved \u2014 this is standard AI dubbing.
""")

    with gr.Row():
        # Left column: input video and the button that kicks off dubbing.
        with gr.Column():
            video_input = gr.Video(
                label="Upload English Commentary Video",
                sources=["upload"],
            )
            video_submit = gr.Button(
                "Dub to Yoruba",
                variant="primary",
                size="lg"
            )

        # Right column: dubbed result player and a markdown processing log.
        with gr.Column():
            video_output = gr.Video(
                label="Yoruba Dubbed Video (Download from player)",
            )
            video_log = gr.Markdown(
                label="Processing Log",
                value="Upload a video and click 'Dub to Yoruba' to start."
            )

    # Wire the button to the dubbing pipeline defined above.
    video_submit.click(
        fn=dub_video,
        inputs=[video_input],
        outputs=[video_output, video_log],
    )
782
+
783
  gr.Markdown("""
784
  ---
785
  **Models:**
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg