# Source: Hugging Face Space "Rafii/videovoice-dramabox" (Running on Zero).
# File size: 16,871 bytes.
"""
Step 6: Audio sync — match synthesised segment durations to original timestamps.

For each segment:
  - Too long  → speed up using ffmpeg atempo filter
  - Too short → pad with silence at the end
Then stitch all segments into a single final audio track.
"""
import array
import math
import os
import subprocess
import wave
from pathlib import Path


def _get_wav_duration(wav_path: str) -> float:
    """Return the length of the WAV file at `wav_path` in seconds.

    The duration is the frame count divided by the frame rate, both read
    from the file header.
    """
    with wave.open(wav_path, 'r') as wav_file:
        header = wav_file.getparams()
    return header.nframes / float(header.framerate)


def _speedup_audio(input_path: str, output_path: str, factor: float) -> None:
    """Change the tempo of an audio file by `factor` using ffmpeg's atempo filter.

    Each atempo stage is kept within 0.5–2.0 here, so a factor outside that
    range is split into a chain of stages whose product equals `factor`
    (e.g. 5.0 -> atempo=2.0,atempo=2.0,atempo=1.2500).

    Args:
        input_path: Audio file to read.
        output_path: File to write; ffmpeg is invoked with `-y`, so an
            existing file is overwritten.
        factor: Tempo multiplier. Values above 1 speed the audio up, values
            below 1 slow it down. Must be a finite number greater than 0.

    Raises:
        ValueError: If `factor` is zero, negative, NaN or infinite. Without
            this check the chaining loops below would never terminate.
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    if not (math.isfinite(factor) and factor > 0):
        raise ValueError(f"atempo factor must be a finite number > 0, got {factor!r}")

    # Peel off full 2.0x / 0.5x stages until the remainder fits in one stage.
    filters = []
    remaining = factor
    while remaining > 2.0:
        filters.append("atempo=2.0")
        remaining /= 2.0
    while remaining < 0.5:
        filters.append("atempo=0.5")
        remaining /= 0.5
    filters.append(f"atempo={remaining:.4f}")
    filter_str = ",".join(filters)

    cmd = [
        "ffmpeg", "-y", "-i", input_path,
        "-filter:a", filter_str,
        output_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg atempo failed:\n{result.stderr}")


def _pad_silence(input_path: str, output_path: str, target_duration: float) -> None:
    """Write a copy of `input_path` extended with trailing silence.

    The amount of silence is the difference between `target_duration` and the
    input's current length (never negative). ffmpeg is also given
    `-t target_duration`, the same option `_trim_audio` uses to limit length.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    existing = _get_wav_duration(input_path)
    missing = max(0, target_duration - existing)
    apad_filter = f"apad=pad_dur={missing:.4f}"

    ffmpeg_args = ["ffmpeg", "-y", "-i", input_path]
    ffmpeg_args += ["-af", apad_filter]
    ffmpeg_args += ["-t", str(target_duration)]
    ffmpeg_args.append(output_path)

    proc = subprocess.run(ffmpeg_args, capture_output=True, text=True)
    if proc.returncode == 0:
        return
    raise RuntimeError(f"ffmpeg apad failed:\n{proc.stderr}")


def _trim_audio(input_path: str, output_path: str, duration: float) -> None:
    """Cut the audio down to `duration` seconds with ffmpeg's `-t` option.

    The result is first written to a sibling temp file and then moved over
    `output_path`, which lets callers pass the same path for input and output.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    scratch_path = output_path + ".trim.wav"
    proc = subprocess.run(
        ["ffmpeg", "-y", "-i", input_path, "-t", str(duration), scratch_path],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"ffmpeg trim failed:\n{proc.stderr}")
    os.replace(scratch_path, output_path)


def _detect_pauses(words: list[dict], min_pause: float = 0.15) -> list[dict]:
    """Collect the gaps between neighbouring words that last at least `min_pause` seconds.

    Each word dict must carry "start" and "end" times. The result has one
    entry per qualifying gap, in word order, shaped as
    {after_word_idx, position, duration}, where `position` is the end time of
    the word preceding the gap.
    """
    found = []
    for idx, (current, following) in enumerate(zip(words, words[1:])):
        silence = following["start"] - current["end"]
        if silence >= min_pause:
            found.append({
                "after_word_idx": idx,
                "position": current["end"],
                "duration": silence,
            })
    return found


def _find_tts_silences(wav_path: str, threshold_db: float = -35.0,
                       min_dur: float = 0.08) -> list[dict]:
    """Locate quiet stretches in a WAV file by thresholding windowed RMS energy.

    The raw frames are interpreted as 16-bit signed samples and scanned with
    20 ms windows advanced by half a window. A window counts as silent when
    its RMS falls below `threshold_db` (converted to a linear amplitude on a
    32768 full scale). Consecutive silent windows form a region; regions
    shorter than `min_dur` seconds are discarded.

    Returns a list of {start, end, duration} dicts (seconds), in file order.
    """
    with wave.open(wav_path, "r") as wav_file:
        total_frames = wav_file.getnframes()
        sample_rate = wav_file.getframerate()
        pcm_bytes = wav_file.readframes(total_frames)

    # Raw bytes -> 16-bit signed samples
    samples = array.array("h", pcm_bytes)

    win = int(0.02 * sample_rate)  # 20 ms analysis window
    step = win // 2
    rms_floor = 10 ** (threshold_db / 20.0) * 32768  # dBFS -> linear amplitude

    regions: list[dict] = []
    run_start = None  # start time of the silent run in progress, if any

    for offset in range(0, len(samples) - win, step):
        window = samples[offset:offset + win]
        energy = math.sqrt(sum(v * v for v in window) / win)
        now = offset / sample_rate

        if energy < rms_floor:
            if run_start is None:
                run_start = now
            continue

        # Loud window: finish any silent run that was open.
        if run_start is not None:
            length = now - run_start
            if length >= min_dur:
                regions.append({"start": run_start, "end": now, "duration": length})
            run_start = None

    # A run still open at the end of the scan extends to the end of the file.
    if run_start is not None:
        file_end = len(samples) / sample_rate
        length = file_end - run_start
        if length >= min_dur:
            regions.append({"start": run_start, "end": file_end, "duration": length})

    return regions


def _read_wav_samples(wav_path: str) -> tuple[array.array, int]:
    """Load every frame of a WAV file as 16-bit signed samples.

    Returns (samples, sample_rate). The bytes are decoded as type code "h"
    without checking the header, so the file is expected to be mono 16-bit.
    """
    with wave.open(wav_path, "r") as wav_file:
        rate = wav_file.getframerate()
        frame_count = wav_file.getnframes()
        payload = wav_file.readframes(frame_count)
    pcm = array.array("h")
    pcm.frombytes(payload)
    return pcm, rate


def _write_wav_samples(samples: array.array, sample_rate: int, output_path: str) -> None:
    """Save `samples` to `output_path` as a mono, 16-bit WAV at `sample_rate` Hz."""
    with wave.open(output_path, "w") as wav_out:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # the frame count in the header is fixed up when frames are written.
        wav_out.setparams((1, 2, sample_rate, 0, "NONE", "not compressed"))
        wav_out.writeframes(samples.tobytes())


def _pause_aware_sync(tts_path: str, synced_path: str, target_duration: float,
                      words: list[dict], max_speed: float,
                      max_overflow: float = 0.0) -> None:
    """Fit a TTS clip to `target_duration`, preferring to edit silences over changing speech tempo.

    Strategy, based on the TTS clip's duration:
      - More than 2% too long:
          * already within `target_duration + max_overflow`: copied unchanged;
          * silences detected: shrink them (keeping at least 10%), then apply
            a tempo speedup only if that is still not enough;
          * no silences detected: uniform tempo speedup.
        Any speedup is capped at `max_speed`; when capped the output stays
        longer than the cap and no audio is trimmed.
      - More than 2% too short: extra silence is spread over the detected
        silence regions when both the original words and the TTS have pauses,
        otherwise appended at the end.
      - Otherwise: copied unchanged.

    Args:
        tts_path: Synthesised WAV for this segment.
        synced_path: Where the adjusted WAV is written.
        target_duration: Length of the original segment window, in seconds.
        words: Word dicts with "start"/"end" times from the original audio;
            used only to decide whether the original had natural pauses.
        max_speed: Upper bound for the tempo speedup factor.
        max_overflow: Extra seconds the output may exceed `target_duration`.
            The caller borrows this budget from the inter-segment silence that
            follows, so trailing words are never dropped just to hit
            `target_duration` exactly.

    Raises:
        RuntimeError: Propagated from the ffmpeg helpers on failure.
    """
    import shutil

    tts_duration = _get_wav_duration(tts_path)
    original_pauses = _detect_pauses(words)
    tts_silences = _find_tts_silences(tts_path)

    total_tts_silence = sum(s["duration"] for s in tts_silences)
    hard_cap = target_duration + max_overflow
    overshoot_vs_cap = tts_duration - hard_cap

    if tts_duration > target_duration * 1.02:
        if overshoot_vs_cap <= 0:
            # Already within hard_cap once the borrow budget is counted — keep
            # the TTS as-is. This must be checked before picking a strategy:
            # on the no-silence path tts_duration / hard_cap would be below 1
            # here and would slow the clip down to fill the whole budget.
            shutil.copy(tts_path, synced_path)
            print(f"[s5]   pause-aware: within +{max_overflow:.2f}s borrow, no compression")
        elif tts_silences and total_tts_silence > 0:
            # Remove at most 90% of the silence, and no more than needed.
            removable = min(total_tts_silence * 0.9, overshoot_vs_cap)
            if removable >= overshoot_vs_cap:
                # Silence alone covers the overshoot.
                compression_ratio = 1.0 - (removable / total_tts_silence)
                _compress_silences(tts_path, synced_path, tts_silences, compression_ratio)
                print(f"[s5]   pause-aware: compressed silences (ratio {compression_ratio:.2f}, +{max_overflow:.2f}s borrow)")
            else:
                # Not enough silence: squeeze it to the 10% floor, then speed
                # up whatever is left.
                _compress_silences(tts_path, synced_path, tts_silences, 0.1)  # keep 10%
                remaining_dur = _get_wav_duration(synced_path)
                speed_factor = remaining_dur / hard_cap if hard_cap > 0 else max_speed
                if speed_factor > max_speed:
                    print(f"[s5]   pause-aware: WARNING speed x{speed_factor:.2f} exceeds max, capping at x{max_speed} (will overflow next gap)")
                    speed_factor = max_speed
                print(f"[s5]   pause-aware: compressed silences + speedup x{speed_factor:.2f} (+{max_overflow:.2f}s borrow)")
                tmp = synced_path + ".tmp.wav"
                _speedup_audio(synced_path, tmp, speed_factor)
                os.replace(tmp, synced_path)
        else:
            # No silences detected — uniform speedup, but use hard_cap as the target.
            speed_factor = tts_duration / hard_cap if hard_cap > 0 else max_speed
            if speed_factor > max_speed:
                print(f"[s5]   pause-aware: WARNING speed x{speed_factor:.2f} exceeds max, capping at x{max_speed} (will overflow next gap)")
                speed_factor = max_speed
            print(f"[s5]   pause-aware: uniform speedup x{speed_factor:.2f} (no silences, +{max_overflow:.2f}s borrow)")
            _speedup_audio(tts_path, synced_path, speed_factor)

    elif tts_duration < target_duration * 0.98:
        shortfall = target_duration - tts_duration
        if tts_silences and original_pauses:
            # Distribute padding at detected silence positions
            _distribute_padding(tts_path, synced_path, tts_silences, shortfall, target_duration)
            print(f"[s5]   pause-aware: distributed {shortfall:.2f}s padding across {len(tts_silences)} pause points")
        else:
            # No pause points — pad at end
            _pad_silence(tts_path, synced_path, target_duration)
            print(f"[s5]   pause-aware: padded {shortfall:.2f}s at end (no pause points)")
    else:
        # Within ±2% of the target: close enough, leave untouched.
        shutil.copy(tts_path, synced_path)


def _compress_silences(input_path: str, output_path: str,
                       silences: list[dict], keep_ratio: float) -> None:
    """Copy a WAV while shortening each listed silence to `keep_ratio` of its length.

    `silences` holds {start, end} times in seconds and is walked in the given
    order. Audio outside the regions is copied unchanged; within a region only
    the leading `keep_ratio` fraction of its samples is kept.
    """
    pcm, rate = _read_wav_samples(input_path)
    squeezed = array.array("h")

    cursor = 0  # index of the first input sample not yet handled
    for region in silences:
        first = int(region["start"] * rate)
        last = int(region["end"] * rate)

        # Audio between the previous silence and this one is kept verbatim.
        squeezed.extend(pcm[cursor:first])

        # Retain only the head of the silent region.
        retained = int((last - first) * keep_ratio)
        if retained > 0:
            squeezed.extend(pcm[first:first + retained])

        cursor = last

    # Tail after the final silence.
    squeezed.extend(pcm[cursor:])
    _write_wav_samples(squeezed, rate, output_path)


def _distribute_padding(input_path: str, output_path: str,
                        tts_silences: list[dict], shortfall: float,
                        target_duration: float) -> None:
    """Lengthen a WAV by inserting equal slices of silence after each detected pause.

    `shortfall` seconds are divided evenly across the regions in
    `tts_silences`, and each share is inserted at that region's end. If the
    result ends up more than 2% longer than `target_duration`, it is trimmed
    back to `target_duration`.
    """
    pcm, rate = _read_wav_samples(input_path)
    share_seconds = shortfall / len(tts_silences)
    # The same block of zero samples is inserted at every pause point.
    filler = array.array("h", [0] * int(share_seconds * rate))

    padded = array.array("h")
    cursor = 0
    for region in tts_silences:
        boundary = int(region["end"] * rate)
        # Everything up to the end of this pause, then the extra silence.
        padded.extend(pcm[cursor:boundary])
        padded.extend(filler)
        cursor = boundary

    # Audio after the last pause.
    padded.extend(pcm[cursor:])

    _write_wav_samples(padded, rate, output_path)

    # Trim back to the target when the result overshoots it by more than 2%.
    if len(padded) / rate > target_duration * 1.02:
        _trim_audio(output_path, output_path, target_duration)


def _generate_silence(output_path: str, duration: float, sample_rate: int = 16000) -> None:
    """Create a mono 16-bit WAV of `duration` seconds containing only zero samples."""
    frame_total = int(duration * sample_rate)
    zeros = array.array("h", [0]) * frame_total
    with wave.open(output_path, "w") as wav_out:
        wav_out.setframerate(sample_rate)
        wav_out.setsampwidth(2)  # 16-bit
        wav_out.setnchannels(1)
        wav_out.writeframes(zeros.tobytes())


def sync_and_stitch(
    segments: list[dict],
    output_path: str = "tmp/audio/final_audio.wav",
    synced_dir: str = "tmp/audio/tts_synced",
    max_speed: float = 1.8,
) -> str:
    """
    Sync each TTS segment to its original timestamp window and stitch into a single WAV.

    Segments are processed in order. Before each one, silence is inserted to
    bring the playback cursor up to the segment's `start`. The segment itself
    is then sped up, padded or copied so that it fits its window, and may
    overflow into the silence before the next segment rather than being
    trimmed. All pieces are finally joined with ffmpeg's concat demuxer.

    Args:
        segments: List of dicts with {start, end, tts_path}. A segment may
            also carry `words` (dicts with start/end times); when it has more
            than one word, the pause-aware sync is used for it.
        output_path: Where to write the final stitched audio.
        synced_dir: Temp directory for per-segment synced WAVs.
        max_speed: Maximum allowed speedup factor (default 1.8x to preserve naturalness).

    Returns:
        Path to the final stitched audio WAV.

    Raises:
        RuntimeError: If an ffmpeg invocation fails.
    """
    import shutil

    Path(synced_dir).mkdir(parents=True, exist_ok=True)
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Detect TTS sample rate from the first segment
    with wave.open(segments[0]["tts_path"], 'r') as f:
        tts_sample_rate = f.getframerate()
    print(f"[s5] TTS sample rate: {tts_sample_rate} Hz")

    concat_list_path = "tmp/concat_list.txt"
    # The list lives in a fixed location that the two mkdir calls above only
    # cover when the default paths are used, so create it explicitly.
    Path(concat_list_path).parent.mkdir(parents=True, exist_ok=True)
    concat_entries = []

    # Track the real wall-clock playback cursor. When a segment overflows its
    # original window, the cursor moves past the segment's nominal end, and the
    # next inter-segment silence shrinks accordingly — overflow is absorbed by
    # the following gap instead of being trimmed off the end of the audio.
    playback_cursor = 0.0
    for i, seg in enumerate(segments):
        start = seg["start"]
        end = seg["end"]
        target_duration = end - start
        tts_path = seg["tts_path"]

        # Fill gap before this segment with silence — but only as much as the
        # cursor is actually behind. If a prior segment overflowed past `start`,
        # `gap` goes negative and we skip the silence (and start slightly late).
        gap = start - playback_cursor
        if gap > 0.01:
            sil_path = os.path.join(synced_dir, f"silence_{i:04d}.wav")
            _generate_silence(sil_path, gap, sample_rate=tts_sample_rate)
            concat_entries.append(sil_path)
            playback_cursor += gap
        elif gap < -0.05:
            print(f"[s5] Seg {i}: running {-gap:.2f}s behind original timeline (prior overflow absorbed)")

        # Borrow budget: how much we may overflow `target_duration` without
        # trimming. We can use the silence between this segment's `end` and the
        # next segment's `start`. Last segment has no follower → 0.
        if i + 1 < len(segments):
            allowed_overflow = max(segments[i + 1]["start"] - end, 0.0)
        else:
            allowed_overflow = 0.0

        tts_duration = _get_wav_duration(tts_path)
        synced_path = os.path.join(synced_dir, f"synced_{i:04d}.wav")
        hard_cap = target_duration + allowed_overflow

        words = seg.get("words")
        if words and len(words) > 1:
            print(f"[s5] Seg {i}: pause-aware sync ({tts_duration:.2f}s -> {target_duration:.2f}s, +{allowed_overflow:.2f}s borrow)")
            _pause_aware_sync(tts_path, synced_path, target_duration, words, max_speed,
                              max_overflow=allowed_overflow)
        elif tts_duration > target_duration * 1.02:
            # Speed up only as far as needed to land within hard_cap; if the
            # required factor exceeds max_speed, cap it and let it overflow —
            # the next gap will shrink to absorb it. Never trim.
            if tts_duration <= hard_cap:
                speed_factor = 1.0
            else:
                speed_factor = tts_duration / hard_cap if hard_cap > 0 else max_speed
            if speed_factor > max_speed:
                print(f"[s5] Seg {i}: WARNING speed x{speed_factor:.2f} exceeds max, capping at x{max_speed} (will overflow next gap)")
                speed_factor = max_speed
            if speed_factor > 1.001:
                print(f"[s5] Seg {i}: speeding up x{speed_factor:.2f} (+{allowed_overflow:.2f}s borrow)")
                _speedup_audio(tts_path, synced_path, speed_factor)
            else:
                shutil.copy(tts_path, synced_path)
                print(f"[s5] Seg {i}: within +{allowed_overflow:.2f}s borrow, no speedup")
        elif tts_duration < target_duration * 0.98:
            print(f"[s5] Seg {i}: padding {target_duration - tts_duration:.2f}s silence")
            _pad_silence(tts_path, synced_path, target_duration)
        else:
            # Within ±2% of the window: use the TTS clip unchanged.
            shutil.copy(tts_path, synced_path)

        concat_entries.append(synced_path)
        # Advance by the length actually produced, not the nominal window.
        playback_cursor += _get_wav_duration(synced_path)

    # Write concat list for ffmpeg
    with open(concat_list_path, "w", encoding="utf-8") as f:
        for entry in concat_entries:
            abs_entry = os.path.abspath(entry)
            # Inside a single-quoted concat entry a literal quote has to be
            # written as '\'' (close quote, escaped quote, reopen quote).
            quoted_entry = abs_entry.replace("'", "'\\''")
            f.write(f"file '{quoted_entry}'\n")

    # Concatenate all segments (re-encode to normalize sample rates)
    cmd = [
        "ffmpeg", "-y",
        "-f", "concat", "-safe", "0",
        "-i", concat_list_path,
        "-ar", str(tts_sample_rate),
        "-ac", "1",
        "-acodec", "pcm_s16le",
        output_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg concat failed:\n{result.stderr}")

    print(f"[s5] Audio sync complete → {output_path} ✓")
    return output_path