Create chunked_convert.py
core/chunked_convert.py (added, +185 -0)
"""
core/chunked_convert.py
-----------------------
VRAM-aware chunked voice conversion using the Kanade model.

On CUDA devices, the source waveform is split into overlapping chunks so that
peak activation memory stays within a configurable fraction of total VRAM
(default 50%). On CPU the waveform is still chunked to respect the model's
RoPE sequence-length limit.

RoPE ceiling (why chunks must be small)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Kanade ``mel_decoder`` Transformer processes mel-spectrogram frames of the
source chunk. Its RoPE positional embeddings are precomputed for
``_ROPE_MAX_FRAMES = 1024`` positions. The mel frame count for a window of
``W`` samples is ``W // hop_length + 1``. Keeping that ≤ 1024 requires:

    W ≤ (1024 − 1) × hop_length = 1023 × 256 = 261,888 samples ≈ 10.9 s

Each chunk window includes a 0.5 s overlap on both sides for boundary
smoothing, so the *chunk* itself must be:

    chunk ≤ 261,888 − 2 × (0.5 s × sample_rate) ≈ 9.9 s

A 25 % safety margin (``_ROPE_SAFETY_MARGIN = 0.75``) is then applied,
leaving roughly 7.4 s worth of source audio per chunk.

Overlap / boundary handling
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each chunk includes a short overlap window on both sides. After the
voice-conversion forward pass, the overlap frames are trimmed from the mel
output before the pieces are concatenated. The final assembled mel is vocoded
in a single pass.
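
For illustration (times in seconds, using the ≈7.4 s chunk derived above and
the 0.5 s overlap; each window carries context that is trimmed afterwards)::

    window 1:  [0.0 .............. 7.9]              (no left context)
    window 2:            [6.9 .............. 15.3]
    kept mel:  [0.0 ...... 7.4][7.4 ...... 14.8] ...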
"""

from __future__ import annotations

import time
import torch
from kanade_tokenizer import vocode


# Empirical constant: ~10 seconds of audio fit in 1 GB of VRAM budget for the
# Kanade-12.5hz model. Adjust downward if you observe OOM errors.
_SECONDS_PER_GB: float = 10.0

# Overlap window on each side of a chunk (seconds).
_OVERLAP_SECONDS: float = 0.5

# --------------------------------------------------------------------------
# RoPE safety ceiling, derived from the mel_decoder Transformer
# --------------------------------------------------------------------------
# mel_decoder seqlen = audio_length // hop_length + 1 (center-padded mel).
# Its RoPE freqs_cis is precomputed for _ROPE_MAX_FRAMES positions.
# hop_length comes directly from KanadeModelConfig (hop_length = 256).
_ROPE_MAX_FRAMES: int = 1024  # precomputed RoPE window (freqs_cis.shape[0])
_MEL_HOP_LENGTH: int = 256  # KanadeModelConfig.hop_length
_ROPE_SAFETY_MARGIN: float = 0.75  # keep 25 % headroom below the hard limit

# Output mel frame rate, kept for reference only; NOT used for overlap trimming.
# Mel frames used internally are at sample_rate / hop_length (93.75 fps), not 12.5 fps.
_MEL_FPS: float = 12.5
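
# Worked example of the ceiling, assuming the model's 24 kHz sample rate (the
# rate implied by the 10.9 s figure above; the code reads the real value from
# its `sample_rate` argument at call time):
#   rope_max_window = (1024 - 1) * 256                   = 261,888 samples (~10.9 s)
#   overlap_samples = int(0.5 * 24_000)                  = 12,000 samples
#   rope_safe_chunk = int((261_888 - 2 * 12_000) * 0.75) = 178,416 samples (~7.4 s)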


def chunked_voice_conversion(
    kanade,
    vocoder_model,
    source_wav: torch.Tensor,
    ref_wav: torch.Tensor,
    sample_rate: int,
    vram_fraction: float = 0.5,
) -> torch.Tensor:
    """Convert *source_wav* to the reference voice in VRAM-safe chunks.

    Parameters
    ----------
    kanade:
        A loaded ``KanadeModel`` instance (already on the target device).
    vocoder_model:
        The vocoder loaded via ``load_vocoder`` (already on the target device).
    source_wav:
        Source waveform tensor of shape ``[T]`` or ``[1, T]``, on the same
        device as *kanade*.
    ref_wav:
        Reference waveform tensor of shape ``[T]`` or ``[1, T]``, on the same
        device as *kanade*.
    sample_rate:
        Audio sample rate in Hz (taken from ``kanade.config.sample_rate``).
    vram_fraction:
        Fraction of total VRAM to target per chunk. Default ``0.5`` → 50 %.

    Returns
    -------
    torch.Tensor
        Converted waveform as a 1-D CPU float32 tensor.
    """
    device: torch.device = source_wav.device
    n_samples: int = source_wav.shape[-1]
    _start = time.perf_counter()

    # ── 1. Determine chunk size ──────────────────────────────────────────
    # The mel_decoder RoPE ceiling limits the total window (chunk + overlaps).
    # Max window in samples: (_ROPE_MAX_FRAMES - 1) * _MEL_HOP_LENGTH.
    # Subtract both overlap sides, then apply a safety margin.
    overlap_samples = int(_OVERLAP_SECONDS * sample_rate)
    rope_max_window = (_ROPE_MAX_FRAMES - 1) * _MEL_HOP_LENGTH  # 261,888 samples ≈ 10.9 s
    rope_safe_chunk = int((rope_max_window - 2 * overlap_samples) * _ROPE_SAFETY_MARGIN)
    rope_safe_seconds = rope_safe_chunk / sample_rate

    if device.type == "cuda":
        total_vram_bytes = torch.cuda.get_device_properties(device).total_memory
        budget_bytes = total_vram_bytes * vram_fraction
        budget_gb = budget_bytes / (1024 ** 3)

        # Never go below 5 s per chunk, even on tiny budgets.
        vram_chunk_samples = int(max(5.0, budget_gb * _SECONDS_PER_GB) * sample_rate)

        # Take the smaller of the VRAM-based and RoPE-safe limits.
        chunk_samples = min(vram_chunk_samples, rope_safe_chunk)
        chunk_seconds = chunk_samples / sample_rate

        print(
            f"[chunked_convert] VRAM budget: {budget_gb:.2f} GB "
            f"({vram_fraction*100:.0f}% of {total_vram_bytes / (1024**3):.2f} GB) "
            f"→ chunk size: {chunk_seconds:.1f}s / {chunk_samples:,} samples "
            f"(RoPE ceiling: {rope_safe_seconds:.1f}s)"
        )
    else:
        # CPU: no VRAM limit, but still respect the RoPE ceiling for quality.
        chunk_samples = rope_safe_chunk
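
    # Worked sizing example (hypothetical 24 GB card, vram_fraction=0.5, 24 kHz):
    #   budget_gb          = 24 * 0.5                 = 12 GB
    #   vram_chunk_samples = 12 * 10 s/GB * 24,000 Hz = 2,880,000 samples (120 s)
    #   chunk_samples      = min(2,880,000, 178,416)  = 178,416  (RoPE-bound)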

    # ── 2. Short-circuit when the whole file fits in one chunk ───────────
    if n_samples <= chunk_samples:
        with torch.inference_mode():
            mel = kanade.voice_conversion(
                source_waveform=source_wav, reference_waveform=ref_wav
            )
            wav = vocode(vocoder_model, mel.unsqueeze(0))
        elapsed = time.perf_counter() - _start
        print(f"[chunked_convert] Completed in {elapsed:.1f}s")
        return wav.squeeze().cpu()

    # ── 3. Chunked processing with overlap ───────────────────────────────
    # Mel frames corresponding to the overlap window.
    # The mel output is at sample_rate / hop_length = 93.75 fps, NOT _MEL_FPS.
    overlap_frames = overlap_samples // _MEL_HOP_LENGTH  # 12000 // 256 = 46 at 24 kHz
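
    # Worked trim example (assuming the 24 kHz rate and ~7.4 s chunk above):
    #   middle window = 178,416 + 2 * 12,000 = 202,416 samples
    #   mel frames    = 202,416 // 256 + 1   = 791
    #   kept frames   = 791 - 2 * 46         = 699  (≈ 7.4 s of mel)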

    mel_parts: list[torch.Tensor] = []
    pos = 0

    while pos < n_samples:
        # Extend the window on both sides by overlap_samples so the model has
        # context at each boundary.
        win_start = max(0, pos - overlap_samples)
        win_end = min(n_samples, pos + chunk_samples + overlap_samples)

        chunk = source_wav[..., win_start:win_end]

        with torch.inference_mode():
            mel_chunk: torch.Tensor = kanade.voice_conversion(
                source_waveform=chunk, reference_waveform=ref_wav
            )

        # Move to CPU immediately so the GPU buffer is freed before the next chunk.
        mel_chunk = mel_chunk.cpu()

        # Trim overlap frames that were only there for context.
        left_trim = 0 if pos == 0 else overlap_frames
        right_trim = (
            mel_chunk.shape[-1]
            if win_end >= n_samples
            else mel_chunk.shape[-1] - overlap_frames
        )

        mel_parts.append(mel_chunk[..., left_trim:right_trim])

        pos += chunk_samples

        if device.type == "cuda":
            torch.cuda.empty_cache()

    # ── 4. Assemble full mel and vocode in one pass ──────────────────────
    full_mel = torch.cat(mel_parts, dim=-1).to(device)

    with torch.inference_mode():
        wav = vocode(vocoder_model, full_mel.unsqueeze(0))

    elapsed = time.perf_counter() - _start
    print(f"[chunked_convert] Completed in {elapsed:.1f}s")
    return wav.squeeze().cpu()
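

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not wired into the app). Model loading
# lives outside this module, so the caller passes in already-loaded objects;
# torchaudio and the file names below are assumptions for the example.
# ---------------------------------------------------------------------------
def _example_usage(kanade, vocoder_model) -> None:
    """Drive chunked_voice_conversion end to end on two local files."""
    import torchaudio

    target_sr = kanade.config.sample_rate  # the rate the model expects

    src, src_sr = torchaudio.load("source.wav")
    ref, ref_sr = torchaudio.load("reference.wav")

    # Mix down to mono and resample to the model's rate.
    src = torchaudio.functional.resample(src.mean(0, keepdim=True), src_sr, target_sr)
    ref = torchaudio.functional.resample(ref.mean(0, keepdim=True), ref_sr, target_sr)

    # Assumes the models live on the default CUDA device when available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    out = chunked_voice_conversion(
        kanade,
        vocoder_model,
        src.to(device),
        ref.to(device),
        sample_rate=target_sr,
    )
    torchaudio.save("converted.wav", out.unsqueeze(0), target_sr)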