clone

Runtime error

App Files Files Community

clone / core /chunked_convert.py

PatnaikAshish

Create chunked_convert.py

66525f3 verified 29 days ago

raw

history blame contribute delete

7.7 kB

	"""
	core/chunked_convert.py
	-----------------------
	VRAM-aware chunked voice conversion using the Kanade model.

	On CUDA devices, the source waveform is split into overlapping chunks so that
	peak activation memory stays within a configurable fraction of total VRAM
	(the ``vram_fraction`` argument, default 0.9, i.e. 90 %). On CPU the waveform
	is still chunked to respect the model's RoPE sequence-length limit.

	RoPE ceiling (why chunks must be small)
	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	The Kanade ``mel_decoder`` Transformer processes mel-spectrogram frames of the
	source chunk. Its RoPE positional embeddings are precomputed for
	``_ROPE_MAX_FRAMES = 1024`` positions. The mel frame count for a window of
	``W`` samples is ``W // hop_length + 1``. Keeping that ≤ 1024 requires:

	W ≤ (1024 − 1) × hop_length = 1023 × 256 = 261,888 samples ≈ 10.9 s

	Each chunk window includes a 0.5 s overlap on both sides for boundary
	smoothing, so the chunk itself must be:

	chunk ≤ 261,888 − 2 × (0.5 s × sample_rate) ≈ 9.9 s

	A safety factor of ``_ROPE_SAFETY_MARGIN = 0.75`` is then applied, giving
	roughly 7.4 s worth of source audio per chunk at a 24 kHz sample rate.

	Overlap / boundary handling
	~~~~~~~~~~~~~~~~~~~~~~~~~~~
	Each chunk includes a short overlap window on both sides. After the
	voice-conversion forward pass, the overlap frames are trimmed from the mel
	output before the pieces are concatenated. The final assembled mel is vocoded
	in a single pass.
	"""

	from __future__ import annotations

	import time
	import torch
	from kanade_tokenizer import vocode


	# Empirical constant: roughly 10 seconds of audio fit in each 1 GB of VRAM
	# budget for the Kanade-12.5hz model. Adjust downward if you observe OOM errors.
	_SECONDS_PER_GB: float = 10.0

	# Extra context added on each side of a chunk (seconds). The mel frames that
	# correspond to this context are trimmed again after conversion.
	_OVERLAP_SECONDS: float = 0.5

	# --------------------------------------------------------------------------
	# RoPE safety ceiling — derived from the mel_decoder Transformer
	# --------------------------------------------------------------------------
	# mel_decoder seqlen = audio_length // hop_length + 1 (center-padding mel).
	# Its RoPE freqs_cis is precomputed for _ROPE_MAX_FRAMES positions.
	# hop_length comes directly from KanadeModelConfig (hop_length = 256).
	_ROPE_MAX_FRAMES: int = 1024 # precomputed RoPE window (freqs_cis.shape[0])
	_MEL_HOP_LENGTH: int = 256 # KanadeModelConfig.hop_length
	# Fraction of the RoPE-limited chunk length that is actually used: the
	# largest chunk that would fit is multiplied by this value, so 0.75 leaves
	# 25 % headroom below the hard ceiling.
	_ROPE_SAFETY_MARGIN: float = 0.75

	# Output mel frame rate — kept for reference only; NOT used for overlap trimming.
	# Mel frames used internally are at sample_rate / hop_length (93.75 fps at
	# 24 kHz), not 12.5 fps.
	_MEL_FPS: float = 12.5


	def chunked_voice_conversion(
	    kanade,
	    vocoder_model,
	    source_wav: torch.Tensor,
	    ref_wav: torch.Tensor,
	    sample_rate: int,
	    vram_fraction: float = 0.9,
	) -> torch.Tensor:
	    """Convert ``source_wav`` to the reference voice in VRAM-safe chunks.

	    The source is cut into windows of ``chunk + overlap on each side``. Every
	    window is converted to a mel-spectrogram with ``kanade.voice_conversion``,
	    the frames belonging to the overlap are discarded, and the remaining
	    pieces are concatenated and vocoded in a single pass.

	    Parameters
	    ----------
	    kanade:
	        Model object exposing ``voice_conversion(source_waveform=...,
	        reference_waveform=...)``; expected to already be on the target device.
	    vocoder_model:
	        Vocoder passed through to ``vocode``; expected to already be on the
	        target device.
	    source_wav:
	        Source waveform tensor whose last dimension is time (``[T]`` or
	        ``[1, T]``). Its device decides whether the CUDA or CPU path is used.
	    ref_wav:
	        Reference waveform tensor, passed unchanged to every conversion call.
	    sample_rate:
	        Audio sample rate in Hz.
	    vram_fraction:
	        Fraction of total VRAM to budget per chunk on CUDA devices.
	        Default ``0.9`` → 90 %. Ignored on CPU.

	    Returns
	    -------
	    torch.Tensor
	        Converted waveform, squeezed and moved to the CPU (expected to be a
	        1-D tensor for mono audio).

	    Raises
	    ------
	    ValueError
	        If the derived chunk size is not positive (for example when the two
	        overlap windows alone exceed the RoPE window at the given sample
	        rate), because chunking could never make progress.
	    """
	    device: torch.device = source_wav.device
	    n_samples: int = source_wav.shape[-1]
	    _start = time.perf_counter()
	    hop = _MEL_HOP_LENGTH

	    # ── 1. Determine chunk size ──────────────────────────────────────────────
	    # The mel_decoder RoPE ceiling limits the total window (chunk + overlaps).
	    # Max window in samples: (ROPE_MAX_FRAMES - 1) * MEL_HOP_LENGTH.
	    # Overlap and chunk length are rounded DOWN to whole mel hops so that every
	    # chunk boundary coincides with a mel-frame boundary; otherwise the frame
	    # trimming below cannot match the sample overlap and frames get duplicated
	    # at each seam.
	    overlap_samples = int(_OVERLAP_SECONDS * sample_rate) // hop * hop
	    rope_max_window = (_ROPE_MAX_FRAMES - 1) * hop
	    rope_safe_chunk = (
	        int((rope_max_window - 2 * overlap_samples) * _ROPE_SAFETY_MARGIN) // hop * hop
	    )

	    if device.type == "cuda":
	        total_vram_bytes = torch.cuda.get_device_properties(device).total_memory
	        budget_bytes = total_vram_bytes * vram_fraction
	        budget_gb = budget_bytes / (1024 ** 3)

	        # Never go below 5 s worth of audio, however small the VRAM budget is.
	        vram_chunk_samples = int(max(5.0, budget_gb * _SECONDS_PER_GB) * sample_rate)

	        # Take the smaller of VRAM-based and RoPE-safe limits.
	        chunk_samples = min(vram_chunk_samples // hop * hop, rope_safe_chunk)
	    else:
	        # CPU: no VRAM limit, but still respect the RoPE ceiling for quality.
	        chunk_samples = rope_safe_chunk

	    if chunk_samples <= 0:
	        # A non-positive stride would make the chunk loop spin forever.
	        raise ValueError(
	            f"Cannot derive a positive chunk size for sample_rate={sample_rate}"
	        )

	    if device.type == "cuda":
	        chunk_seconds = chunk_samples / sample_rate
	        rope_safe_seconds = rope_safe_chunk / sample_rate
	        print(
	            f"[chunked_convert] VRAM budget: {budget_gb:.2f} GB "
	            f"({vram_fraction * 100:.0f}% of {total_vram_bytes / (1024 ** 3):.2f} GB) "
	            f"→ chunk size: {chunk_seconds:.1f}s / {chunk_samples:,} samples "
	            f"(RoPE ceiling: {rope_safe_seconds:.1f}s)"
	        )

	    # ── 2. Short-circuit when the whole file fits in one chunk ───────────────
	    if n_samples <= chunk_samples:
	        with torch.inference_mode():
	            mel = kanade.voice_conversion(
	                source_waveform=source_wav, reference_waveform=ref_wav
	            )
	            wav = vocode(vocoder_model, mel.unsqueeze(0))
	        elapsed = time.perf_counter() - _start
	        print(f"[chunked_convert] Completed in {elapsed:.1f}s")
	        return wav.squeeze().cpu()

	    # ── 3. Chunked processing with overlap ──────────────────────────────────
	    # Number of mel frames each non-final chunk contributes. The mel output is
	    # at sample_rate / hop_length frames per second, NOT _MEL_FPS.
	    chunk_frames = chunk_samples // hop

	    mel_parts: list[torch.Tensor] = []
	    pos = 0

	    while pos < n_samples:
	        # Extend the window on both sides by overlap_samples so the model has
	        # context at each boundary.
	        win_start = max(0, pos - overlap_samples)
	        win_end = min(n_samples, pos + chunk_samples + overlap_samples)
	        is_last = win_end >= n_samples

	        chunk = source_wav[..., win_start:win_end]

	        with torch.inference_mode():
	            mel_chunk: torch.Tensor = kanade.voice_conversion(
	                source_waveform=chunk, reference_waveform=ref_wav
	            )

	        # Move to CPU immediately so the GPU buffer is freed before the next chunk.
	        mel_chunk = mel_chunk.cpu()

	        # Drop the frames that only served as left context. Deriving the count
	        # from the actual window start also covers a window clamped at 0.
	        left_trim = (pos - win_start) // hop

	        if is_last:
	            # The window already reaches the end of the input: keep the tail.
	            mel_parts.append(mel_chunk[..., left_trim:])
	        else:
	            # Keep exactly one chunk worth of frames; the rest is right context.
	            mel_parts.append(mel_chunk[..., left_trim:left_trim + chunk_frames])

	        if device.type == "cuda":
	            torch.cuda.empty_cache()

	        if is_last:
	            # Stop here: another iteration would re-convert and append audio
	            # that this window has already covered.
	            break

	        pos += chunk_samples

	    # ── 4. Assemble full mel and vocode in one pass ──────────────────────────
	    full_mel = torch.cat(mel_parts, dim=-1).to(device)

	    with torch.inference_mode():
	        wav = vocode(vocoder_model, full_mel.unsqueeze(0))

	    elapsed = time.perf_counter() - _start
	    print(f"[chunked_convert] Completed in {elapsed:.1f}s")
	    return wav.squeeze().cpu()