internal-engine-x888

Sleeping

File size: 281,635 Bytes

3fd8c76

"""
RVC + Beatrice v2 Voice Conversion - Single-file app for HuggingFace Spaces
RVC-Project + Beatrice v2 (fierce-cats/beatrice-trainer), consolidated into single file

- Inference: RVC v2 (.pth) + Beatrice v2 (.pt.gz), CPU or GPU
- Training: RVC v2 + Beatrice v2, GPU recommended

Usage:
  CLI:    python app.py infer -i input.wav -m model.pth -o output.wav
          python app.py infer -i input.wav -m beatrice.pt.gz -o output.wav
  Gradio: python app.py
"""
import os
import sys

# MPS fallback for macOS
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import argparse
import gc
import gzip
import json as json_module
import logging
import math
import re
import shutil
import tempfile
import warnings

# Suppress known harmless warnings from HF Spaces / torch internals
warnings.filterwarnings("ignore", message=".*torch.distributed.reduce_op.*", category=FutureWarning)
warnings.filterwarnings("ignore", message=".*torch.nn.utils.weight_norm.*", category=FutureWarning)
from collections import defaultdict
from fractions import Fraction
from functools import partial
from pathlib import Path
from random import Random
from typing import Optional, List, Tuple, Union, BinaryIO, Literal, Sequence, Iterable, Callable

import gradio as gr
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Conv1d, ConvTranspose1d
from torch.nn.utils import weight_norm, remove_weight_norm
import librosa
import pyworld
import soundfile as sf
import torchaudio
from scipy import signal
from huggingface_hub import hf_hub_download
from tqdm.auto import tqdm

# 48 Hz high-pass filter to remove low-frequency artifacts (same as Applio).
# The coefficients are computed once at import time and shared module-wide.
FILTER_ORDER = 5  # order passed to the Butterworth design below
CUTOFF_FREQUENCY = 48  # Hz
SAMPLE_RATE = 16000  # Hz
# bh / ah receive the two values returned by scipy.signal.butter
# (presumably numerator / denominator coefficients for signal.filtfilt or
# lfilter — confirm against the call sites further down the file).
bh, ah = signal.butter(N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE)

def sanitize_model_name(name: str) -> str:
    """Return a version of *name* that is safe to use as a single path component.

    Steps:
      1. Strip surrounding whitespace and keep only the final path component.
      2. Replace every character that is not a word character, ``-`` or ``.``
         with ``_``.
      3. Fall back to ``"unnamed_model"`` when nothing usable is left.

    Names made up solely of dots (``"."``, ``".."``, ``"..."``) pass the
    character filter untouched but would resolve to the current or parent
    directory when joined onto a path, so they are rejected as well.

    Args:
        name: User-supplied model name, possibly containing directories.

    Returns:
        The sanitized name, or ``"unnamed_model"`` if it would be empty or
        consist only of dots.
    """
    name = os.path.basename(name.strip())
    name = re.sub(r'[^\w\-.]', '_', name)
    # strip(".") is empty for "" as well as ".", "..", "..." etc.
    if not name.strip("."):
        return "unnamed_model"
    return name

def list_rvc_models() -> list:
    """Return the alphabetically sorted ``.pth`` filenames found in ``weights/``.

    The directory is resolved relative to the current working directory.
    An empty list is returned when it does not exist.
    """
    root = Path("weights")
    if root.exists():
        names = [entry.name for entry in root.glob("*.pth")]
        names.sort()
        return names
    return []

# Default example model: HuggingFace repo id plus the checkpoint and index
# filenames inside that repo.
DEFAULT_MODEL_REPO = "audo/Benee-RVC"
DEFAULT_MODEL_FILE = "BENEE8000.pth"
DEFAULT_INDEX_FILE = "added_IVF1054_Flat_nprobe_8.index"

# RVC v2 pretrained weights from official repo.
# RVC_PRETRAINED_V2 maps a short name (looked up by download_pretrained_rvc)
# to the file path inside RVC_PRETRAINED_REPO. The 48k / 40k / 32k suffix in
# each name matches the sample-rate variant in the file name.
RVC_PRETRAINED_REPO = "lj1995/VoiceConversionWebUI"
RVC_PRETRAINED_V2 = {
    # Generator with f0 (pitch)
    "f0G48k": "pretrained_v2/f0G48k.pth",
    "f0G40k": "pretrained_v2/f0G40k.pth",
    "f0G32k": "pretrained_v2/f0G32k.pth",
    # Discriminator with f0
    "f0D48k": "pretrained_v2/f0D48k.pth",
    "f0D40k": "pretrained_v2/f0D40k.pth",
    "f0D32k": "pretrained_v2/f0D32k.pth",
    # Generator without f0
    "G48k": "pretrained_v2/G48k.pth",
    "G40k": "pretrained_v2/G40k.pth",
    "G32k": "pretrained_v2/G32k.pth",
    # Discriminator without f0
    "D48k": "pretrained_v2/D48k.pth",
    "D40k": "pretrained_v2/D40k.pth",
    "D32k": "pretrained_v2/D32k.pth",
}

def download_pretrained_rvc(name: str) -> str:
    """Fetch one RVC v2 pretrained checkpoint from the HuggingFace Hub.

    Args:
        name: Key of ``RVC_PRETRAINED_V2`` (e.g. ``"f0G40k"``).

    Returns:
        Whatever ``hf_hub_download`` returns for the requested file.

    Raises:
        ValueError: If *name* is not a key of ``RVC_PRETRAINED_V2``.
    """
    repo_file = RVC_PRETRAINED_V2.get(name)
    if repo_file is None:
        raise ValueError(f"Unknown pretrained: {name}. Available: {list(RVC_PRETRAINED_V2.keys())}")
    logger.info(f"Downloading pretrained {name} from {RVC_PRETRAINED_REPO}...")
    return hf_hub_download(repo_id=RVC_PRETRAINED_REPO, filename=repo_file)

# Beatrice v2 pretrained assets.
# BEATRICE_PRETRAINED maps an asset name (looked up by download_beatrice_asset)
# to the file path inside BEATRICE_REPO.
BEATRICE_REPO = "fierce-cats/beatrice-trainer"
BEATRICE_PRETRAINED = {
    "phone_extractor": "assets/pretrained/122_checkpoint_03000000.pt",
    "pitch_estimator": "assets/pretrained/104_3_checkpoint_00300000.pt",
    "pretrained_model": "assets/pretrained/151_checkpoint_libritts_r_200_02750000.pt.gz",
}

def download_beatrice_asset(name: str) -> str:
    """Fetch one Beatrice v2 pretrained asset from the HuggingFace Hub.

    Args:
        name: Key of ``BEATRICE_PRETRAINED`` (e.g. ``"phone_extractor"``).

    Returns:
        Whatever ``hf_hub_download`` returns for the requested file.

    Raises:
        ValueError: If *name* is not a key of ``BEATRICE_PRETRAINED``.
    """
    repo_file = BEATRICE_PRETRAINED.get(name)
    if repo_file is None:
        raise ValueError(f"Unknown asset: {name}. Available: {list(BEATRICE_PRETRAINED.keys())}")
    logger.info(f"Downloading Beatrice asset {name} from {BEATRICE_REPO}...")
    return hf_hub_download(repo_id=BEATRICE_REPO, filename=repo_file)

def download_beatrice_augmentation():
    """Best-effort download of the Beatrice noise / impulse-response assets.

    Returns:
        ``(noise_dir, ir_dir)`` when both directories exist in the downloaded
        snapshot, otherwise ``(None, None)``. Any exception raised during the
        download is logged as a warning and also yields ``(None, None)``.
    """
    try:
        from huggingface_hub import snapshot_download
        snapshot_root = snapshot_download(repo_id=BEATRICE_REPO, allow_patterns=["assets/noise/*", "assets/ir/*"])
        noise_dir, ir_dir = (
            os.path.join(snapshot_root, "assets", sub) for sub in ("noise", "ir")
        )
        # Both directories are required; one without the other is unusable.
        if not (os.path.isdir(noise_dir) and os.path.isdir(ir_dir)):
            return None, None
        return noise_dir, ir_dir
    except Exception as e:
        logger.warning(f"Could not download augmentation assets: {e}")
        return None, None

def load_pretrained_weights(model: nn.Module, pretrained_path: str) -> None:
    """Load pretrained weights into *model*, tolerating shape mismatches.

    The checkpoint is read on CPU with ``weights_only=True``. If it contains
    a ``"model"`` entry, that entry is used as the state dict. Only tensors
    whose key exists in *model* and whose shape matches are loaded.

    ``emb_g.weight`` is special-cased: when only the number of rows differs,
    every row of the model's table is initialised with the mean of the
    pretrained rows, which is a better starting point than random
    initialisation. If the embedding width differs too, the key is skipped
    like any other mismatch (expanding the mean would yield a wrong-shaped
    tensor and make ``load_state_dict`` raise).

    Args:
        model: Module to receive the weights (updated in place).
        pretrained_path: Path to the checkpoint file.
    """
    logger.info(f"Loading pretrained weights: {pretrained_path}")
    state_dict = torch.load(pretrained_path, map_location="cpu", weights_only=True)
    # Handle different checkpoint formats
    if "model" in state_dict:
        state_dict = state_dict["model"]

    model_state = model.state_dict()
    filtered_state = {}
    skipped = []
    for k, v in state_dict.items():
        target = model_state.get(k)
        if target is None:
            skipped.append(f"{k}: not in model")
        elif v.shape == target.shape:
            filtered_state[k] = v
        elif (
            k == "emb_g.weight"
            and v.dim() == 2
            and target.dim() == 2
            and v.shape[1] == target.shape[1]
        ):
            # Only the speaker count differs: broadcast the pretrained mean.
            mean_emb = v.mean(dim=0, keepdim=True)
            num_speakers = target.shape[0]
            filtered_state[k] = mean_emb.expand(num_speakers, -1).clone()
            logger.info(f"Initialized emb_g from pretrained mean ({v.shape[0]} -> {num_speakers} speakers)")
        else:
            skipped.append(f"{k}: {v.shape} vs {target.shape}")

    if skipped:
        logger.info(f"Skipped {len(skipped)} mismatched keys")
        # Details are kept at debug level so normal logs stay short.
        for entry in skipped:
            logger.debug("Skipped pretrained key %s", entry)

    # strict=False: keys that were filtered out keep the model's own init.
    model.load_state_dict(filtered_state, strict=False)
    logger.info(f"Loaded {len(filtered_state)}/{len(state_dict)} pretrained weights")

# Module-wide logger. Functions defined earlier in this file reference
# `logger` too; that works because the name is only looked up when they run.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Device selection:
# - Inference: Always CPU (HF Spaces free tier, also works everywhere)
# - Training: GPU if available for speed, CPU fallback
device = torch.device("cpu")  # For inference
train_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # For training

# Record the chosen devices once at import time.
logger.info(f"Inference device: {device}")
logger.info(f"Training device: {train_device}")

# ============================================================
# CPU OPTIMIZATION — Locked 2-core HuggingFace Spaces config
# ============================================================
# Cap PyTorch at 2 threads for both intra-op and inter-op parallelism.
_CPU_CORES = 2
torch.set_num_threads(_CPU_CORES)
try:
    # PyTorch only allows this before any inter-op parallel work has started;
    # afterwards it raises RuntimeError. Treat that as non-fatal so importing
    # this module from an already-running process does not crash.
    torch.set_num_interop_threads(_CPU_CORES)
except RuntimeError as exc:
    logger.warning(f"Could not set inter-op thread count: {exc}")

# NOTE(review): torch / numpy are already imported at this point, so these
# variables may be read too late to affect their thread pools; the
# torch.set_num_threads call above is what actually caps PyTorch. They are
# kept for libraries initialised later and for child processes.
os.environ["OMP_NUM_THREADS"]        = str(_CPU_CORES)
os.environ["MKL_NUM_THREADS"]        = str(_CPU_CORES)
os.environ["OPENBLAS_NUM_THREADS"]   = str(_CPU_CORES)
os.environ["VECLIB_MAXIMUM_THREADS"] = str(_CPU_CORES)
os.environ["NUMEXPR_NUM_THREADS"]    = str(_CPU_CORES)

# Enable oneDNN (mkldnn) kernels and, where this torch build supports it,
# oneDNN graph fusion for TorchScript. The fusion switch is best-effort.
torch.backends.mkldnn.enabled = True
try:
    torch.jit.enable_onednn_fusion(True)
except Exception as exc:
    logger.debug(f"oneDNN fusion not enabled: {exc}")

logger.info(f"PyTorch CPU threads: {torch.get_num_threads()} (interop={torch.get_num_interop_threads()})")

# ============================================================
# MEMORY MANAGEMENT — purge_memory()
# Call this between every heavy operation to prevent OOM on
# the 16 GB HuggingFace Spaces free-tier CPU instance.
# ============================================================
import ctypes, platform

def purge_memory(*tensors_or_arrays):
    """Aggressively release memory after a heavy generation step.

    The function:
      1. Drops its own references to the passed objects. It cannot unbind
         names in the caller's scope: an object is only freed if the caller
         has no remaining reference to it (``del x`` or rebind it first).
      2. Runs the Python garbage collector twice (cycles first, then
         anything the first pass made collectable).
      3. On Linux, calls ``malloc_trim(0)`` via ctypes so glibc can return
         freed pages to the OS; failures are ignored (e.g. non-glibc libc).
      4. Clears the CUDA caching allocator if a GPU is available.

    Args:
        *tensors_or_arrays: Objects the caller is done with. Accepted for
            convenience; see step 1 for what passing them does and does not do.
    """
    # The varargs tuple would otherwise keep every argument alive until this
    # function returns, i.e. through both gc passes below.
    del tensors_or_arrays
    # Two-pass gc: cycles first, then their referents
    gc.collect()
    gc.collect()
    # Return glibc memory to the OS (Linux only — HF Spaces is Linux)
    if platform.system() == "Linux":
        try:
            ctypes.CDLL("libc.so.6").malloc_trim(0)
        except (OSError, AttributeError):
            # libc.so.6 missing or libc without malloc_trim: nothing to trim.
            pass
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

# ============================================================
# COMMONS - Helper functions from infer/lib/infer_pack/commons.py
# ============================================================

def init_weights(m, mean=0.0, std=0.01):
    """Re-draw ``m.weight`` from N(mean, std) when *m*'s class name contains "Conv".

    Modules whose class name does not contain "Conv" are left untouched.
    Intended to be used with ``nn.Module.apply``.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)

def get_padding(kernel_size, dilation=1):
    """Return the padding ``int((kernel_size * dilation - dilation) / 2)``.

    For an odd kernel and stride 1 this is the amount that keeps a dilated
    convolution's output length equal to its input length.
    """
    span = kernel_size * dilation - dilation
    return int(span / 2)

def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
    """Build a boolean mask that is True at positions below each length.

    Args:
        length: Tensor of sequence lengths.
        max_length: Number of positions in the mask; defaults to
            ``length.max()``.

    Returns:
        Boolean tensor where entry ``[i, j]`` is ``j < length[i]``.
    """
    limit = length.max() if max_length is None else max_length
    positions = torch.arange(limit, dtype=length.dtype, device=length.device)
    return positions[None, :] < length[:, None]

@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: tanh of the first channels times sigmoid of the rest.

    The two inputs are summed, then split along dim 1 at ``n_channels[0]``.
    The first part goes through tanh, the remainder through sigmoid, and
    the two results are multiplied element-wise.
    """
    split_at = n_channels[0]
    summed = input_a + input_b
    gate = torch.sigmoid(summed[:, split_at:, :])
    filt = torch.tanh(summed[:, :split_at, :])
    return filt * gate

def slice_segments(x, ids_str, segment_size=4):
    """Cut one window of ``segment_size`` steps from the last axis of each batch item.

    For batch item ``i`` the window starts at ``ids_str[i]``. *x* is indexed
    as a 3-D tensor (batch, channels, time).
    """
    out = torch.zeros_like(x[:, :, :segment_size])
    for b in range(x.size(0)):
        start = ids_str[b]
        out[b] = x[b, :, start:start + segment_size]
    return out

def slice_segments2(x, ids_str, segment_size=4):
    """Cut one window of ``segment_size`` steps from each row of a 2-D tensor.

    For row ``i`` the window starts at ``ids_str[i]``.
    """
    out = torch.zeros_like(x[:, :segment_size])
    for b in range(x.size(0)):
        start = ids_str[b]
        out[b] = x[b, start:start + segment_size]
    return out

def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Slice a random window of ``segment_size`` steps from each batch item.

    Args:
        x: Tensor of shape (batch, channels, time).
        x_lengths: Valid length per batch item (tensor, or a single int
            applied to every item). Defaults to the full time dimension.
        segment_size: Window length.

    Returns:
        ``(segments, ids_str)``: the slices produced by ``slice_segments``
        and the start index chosen for every batch item.
    """
    b, _, t = x.size()
    if x_lengths is None:
        x_lengths = t
    # torch.clamp needs a tensor; the default (and any plain int) used to
    # reach it as a Python int and raise TypeError.
    lengths = torch.as_tensor(x_lengths, device=x.device)
    # Number of valid start positions, never below 1, so short inputs still
    # get start index 0.
    ids_str_max = torch.clamp(lengths - segment_size + 1, min=1)
    ids_str = (torch.rand([b], device=x.device) * ids_str_max.float()).long()
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str

# ============================================================
# MODULES - From infer/lib/infer_pack/modules.py
# ============================================================

# Negative slope passed to F.leaky_relu by the residual blocks below.
LRELU_SLOPE = 0.1

class LayerNorm(nn.Module):
    """Layer normalisation over dim 1 (the channel axis) of the input.

    ``F.layer_norm`` normalises the last axis, so the input is transposed
    (dim 1 <-> last dim) before the call and transposed back afterwards.
    ``gamma`` and ``beta`` are the learnable scale and shift.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        channels_last = x.transpose(1, -1)
        normed = F.layer_norm(
            channels_last,
            (self.channels,),
            weight=self.gamma,
            bias=self.beta,
            eps=self.eps,
        )
        return normed.transpose(1, -1)

class WN(nn.Module):
    """Stack of dilated, gated 1-D convolutions with residual and skip paths.

    Layer ``i`` uses dilation ``dilation_rate ** i``. Each layer produces
    ``2 * hidden_channels`` pre-activations which are gated (tanh * sigmoid),
    then projected by a 1x1 conv into a residual half (added back to the
    running input) and a skip half (accumulated into the output). The last
    layer only produces the skip part. When ``gin_channels`` is non-zero, a
    conditioning tensor ``g`` is projected once and sliced per layer.
    """

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super().__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = float(p_dropout)

        self.in_layers = nn.ModuleList()
        self.res_skip_layers = nn.ModuleList()
        self.drop = nn.Dropout(float(p_dropout))

        if gin_channels != 0:
            # One projection yields the conditioning slices for all layers.
            self.cond_layer = weight_norm(
                nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1),
                name="weight",
            )

        final_idx = n_layers - 1
        for layer_idx in range(n_layers):
            dil = dilation_rate ** layer_idx
            pad = int((kernel_size * dil - dil) / 2)
            gated_conv = nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dil, padding=pad)
            self.in_layers.append(weight_norm(gated_conv, name="weight"))

            # The final layer has no residual branch, hence half the channels.
            out_ch = hidden_channels if layer_idx == final_idx else 2 * hidden_channels
            projection = nn.Conv1d(hidden_channels, out_ch, 1)
            self.res_skip_layers.append(weight_norm(projection, name="weight"))

    def forward(self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None):
        skip_sum = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        width = 2 * self.hidden_channels
        for idx, (conv_in, conv_rs) in enumerate(zip(self.in_layers, self.res_skip_layers)):
            pre_act = conv_in(x)
            if g is None:
                cond = torch.zeros_like(pre_act)
            else:
                start = idx * width
                cond = g[:, start:start + width, :]

            gated = fused_add_tanh_sigmoid_multiply(pre_act, cond, n_channels_tensor)
            gated = self.drop(gated)
            res_skip = conv_rs(gated)

            if idx >= self.n_layers - 1:
                # Last layer: everything goes to the skip accumulator.
                skip_sum = skip_sum + res_skip
                continue
            x = (x + res_skip[:, :self.hidden_channels, :]) * x_mask
            skip_sum = skip_sum + res_skip[:, self.hidden_channels:, :]
        return skip_sum * x_mask

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution in the stack."""
        if self.gin_channels != 0:
            remove_weight_norm(self.cond_layer)
        for conv in self.in_layers:
            remove_weight_norm(conv)
        for conv in self.res_skip_layers:
            remove_weight_norm(conv)

class ResBlock1(nn.Module):
    """Residual block of three (dilated conv -> plain conv) pairs.

    The first conv of pair ``n`` uses ``dilation[n]``; the second always uses
    dilation 1. Each pair is preceded by leaky-ReLU activations and its
    result is added back to the input. An optional mask is applied before
    every convolution and once more on the final output.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[n], padding=get_padding(kernel_size, dilation[n])))
            for n in range(3)
        ])
        self.convs1.apply(init_weights)
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)))
            for _ in range(3)
        ])
        self.convs2.apply(init_weights)
        self.lrelu_slope = LRELU_SLOPE

    def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None):
        use_mask = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            h = F.leaky_relu(x, self.lrelu_slope)
            if use_mask:
                h = h * x_mask
            h = dilated_conv(h)
            h = F.leaky_relu(h, self.lrelu_slope)
            if use_mask:
                h = h * x_mask
            x = plain_conv(h) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from all six convolutions."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)

class ResBlock2(nn.Module):
    """Lighter residual block: two dilated convolutions, each with a skip.

    Conv ``n`` uses ``dilation[n]``. Each is preceded by a leaky-ReLU and its
    output is added back to its input. An optional mask is applied before
    every convolution and once more on the final output.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[n], padding=get_padding(kernel_size, dilation[n])))
            for n in range(2)
        ])
        self.convs.apply(init_weights)
        self.lrelu_slope = LRELU_SLOPE

    def forward(self, x, x_mask: Optional[torch.Tensor] = None):
        use_mask = x_mask is not None
        for conv in self.convs:
            h = F.leaky_relu(x, self.lrelu_slope)
            if use_mask:
                h = h * x_mask
            x = conv(h) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from both convolutions."""
        for conv in self.convs:
            remove_weight_norm(conv)

class Flip(nn.Module):
    """Flow step that reverses the order of dim 1 (channels).

    ``x_mask`` and ``g`` are accepted for interface parity with the other
    flow layers and are not used. The second return value is an all-zero
    log-determinant: one entry per batch item in the forward direction, a
    single-element placeholder when ``reverse`` is True.
    """

    def forward(self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None, reverse: bool = False):
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped, torch.zeros([1], device=flipped.device)
        zero_logdet = torch.zeros(flipped.size(0)).to(dtype=flipped.dtype, device=flipped.device)
        return flipped, zero_logdet

class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer of a normalising flow.

    The input is split along dim 1 into two halves ``x0`` and ``x1``. ``x0``
    passes through unchanged and drives a ``WN`` network that predicts a
    shift ``m`` (and a log-scale ``logs`` unless ``mean_only``) for ``x1``.
    The output projection ``post`` is zero-initialised, so a fresh layer is
    the identity map.

    Args:
        channels: Number of input channels; must be even.
        hidden_channels: Width of the internal WN network.
        kernel_size, dilation_rate, n_layers, p_dropout, gin_channels:
            Forwarded to ``WN``.
        mean_only: If True, only the shift is predicted and ``logs`` is zero.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
        assert channels % 2 == 0
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=float(p_dropout), gin_channels=gin_channels)
        # (2 - mean_only): one set of stats when mean_only, two otherwise.
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None, reverse: bool = False):
        """Apply the coupling transform, or its inverse when ``reverse`` is True.

        Returns:
            ``(x, logdet)``. In the forward direction ``logdet`` is the sum of
            ``logs`` over dims 1 and 2; in the reverse direction it is a
            single-element zero placeholder on ``x``'s device.
        """
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            # Keep the placeholder on the input's device (as Flip does) so
            # callers never receive a CPU tensor alongside GPU data.
            return x, torch.zeros([1], device=x.device)

    def remove_weight_norm(self):
        """Strip weight normalisation from the internal WN network."""
        self.enc.remove_weight_norm()

# ============================================================
# ATTENTIONS - From infer/lib/infer_pack/attentions.py
# ============================================================

class MultiHeadAttention(nn.Module):
    """Multi-head attention over (batch, channels, time) tensors.

    Queries, keys, values and the output are produced by 1x1 convolutions.
    When ``window_size`` is set, learned relative-position embeddings covering
    offsets in ``[-window_size, window_size]`` are added to the attention
    logits and to the attended values. ``heads_share`` makes all heads use a
    single embedding table. ``proximal_bias`` adds a ``-log(1 + |i - j|)``
    bias to the logits; ``proximal_init`` starts the key projection as a copy
    of the query projection.
    """

    def __init__(self, channels, out_channels, n_heads, p_dropout=0.0, window_size=None, heads_share=True, proximal_bias=False, proximal_init=False):
        super().__init__()
        assert channels % n_heads == 0
        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        # Per-head channel width.
        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One table per head, or a single shared table (broadcast over heads).
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels ** -0.5
            self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
            self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)

        # Only q/k/v get Xavier init; conv_o keeps the Conv1d default.
        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
        """Attend from ``x`` (queries) to ``c`` (keys/values) and project the result."""
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)
        x, _ = self.attention(q, k, v, mask=attn_mask)
        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention; returns ``(output, attention_weights)``.

        Inputs are (batch, channels, time). The output has shape
        (batch, channels, t_t), the weights (batch, n_heads, t_t, t_s).
        NOTE(review): the relative-position branches size everything with
        ``t_s`` only, so they appear to assume self-attention (t_s == t_t).
        """
        b, d, t_s = key.size()
        t_t = query.size(2)
        # Split channels into heads: (b, n_heads, time, k_channels).
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            # Add the relative-position contribution to the logits.
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            # Large negative logit where mask == 0, so softmax gives ~0 weight there.
            scores = scores.masked_fill(mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            # Add the relative-position contribution to the attended values.
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
        # Merge heads back into the channel axis.
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """Multiply relative-position weights ``x`` by the value embeddings ``y``."""
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        """Multiply queries ``x`` by the transposed key embeddings ``y``."""
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Return the ``2 * length - 1`` embedding rows needed for ``length`` positions.

        The table holds ``2 * window_size + 1`` rows. If that is too few it is
        zero-padded on both sides; if it is too many, the centred slice is taken.
        """
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(relative_embeddings, [0, 0, pad_length, pad_length, 0, 0])
        else:
            padded_relative_embeddings = relative_embeddings
        return padded_relative_embeddings[:, slice_start_position:slice_end_position]

    def _relative_position_to_absolute_position(self, x):
        """Re-index (b, h, l, 2l-1) relative logits as (b, h, l, l) absolute logits.

        Implemented with a pad / flatten / pad / reshape sequence instead of
        explicit gather indices.
        """
        batch, heads, length, _ = x.size()
        # Pad one column so each row has 2l entries.
        x = F.pad(x, [0, 1, 0, 0, 0, 0, 0, 0])
        x_flat = x.view([batch, heads, length * 2 * length])
        # Pad so the flat buffer reshapes to (l + 1, 2l - 1).
        x_flat = F.pad(x_flat, [0, int(length) - 1, 0, 0, 0, 0])
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """Re-index (b, h, l, l) absolute weights as (b, h, l, 2l-1) relative weights."""
        batch, heads, length, _ = x.size()
        # Pad each row from l to 2l - 1 entries.
        x = F.pad(x, [0, int(length) - 1, 0, 0, 0, 0, 0, 0])
        x_flat = x.view([batch, heads, int(length ** 2) + int(length * (length - 1))])
        # Prepend l zeros so the buffer reshapes to (l, 2l).
        x_flat = F.pad(x_flat, [length, 0, 0, 0, 0, 0])
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Return a (1, 1, length, length) bias equal to ``-log(1 + |i - j|)``."""
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)

class FFN(nn.Module):
    """Two-layer convolutional feed-forward network with masking.

    Each convolution sees the masked input, padded so that the output length
    equals the input length: left-only padding when ``causal``, split
    left/right otherwise. The non-linearity is ``x * sigmoid(1.702 * x)``
    when ``activation == "gelu"`` and ReLU for any other value.
    """

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0, activation=None, causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.causal = causal
        self.is_activation = activation == "gelu"

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x: torch.Tensor, x_mask: torch.Tensor):
        hidden = self.conv_1(self._padding(x, x_mask))
        if self.is_activation:
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        out = self.conv_2(self._padding(hidden, x_mask))
        return out * x_mask

    def _padding(self, x, x_mask):
        """Mask ``x`` and pad its last axis for a length-preserving convolution."""
        masked = x * x_mask
        if self.kernel_size == 1:
            # A 1x1 convolution needs no padding in either mode.
            return masked
        if self.causal:
            left, right = self.kernel_size - 1, 0
        else:
            left, right = (self.kernel_size - 1) // 2, self.kernel_size // 2
        return F.pad(masked, [left, right, 0, 0, 0, 0])

class Encoder(nn.Module):
    """Stack of self-attention + feed-forward layers with post-layer-norm.

    Each of the ``n_layers`` layers applies windowed multi-head
    self-attention, dropout, a residual add and LayerNorm, followed by an FFN,
    dropout, a residual add and LayerNorm. Extra keyword arguments are
    accepted and ignored.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.0, window_size=10, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.n_layers = int(n_layers)
        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        for _ in range(self.n_layers):
            attention = MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        # Pairwise mask: a position pair is valid only if both positions are.
        pair_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2)
        for attend, post_attn_norm, feed_forward, post_ffn_norm in layers:
            attended = self.drop(attend(x, x, pair_mask))
            x = post_attn_norm(x + attended)
            transformed = self.drop(feed_forward(x, x_mask))
            x = post_ffn_norm(x + transformed)
        return x * x_mask

# ============================================================
# MODELS - From infer/lib/infer_pack/models.py
# ============================================================

# Sample-rate tag ("32k" / "40k" / "48k") -> sample rate in Hz.
sr2sr = {"32k": 32000, "40k": 40000, "48k": 48000}

class TextEncoder256(nn.Module):
    """Prior encoder for 256-dimensional phone features.

    Phone features are linearly projected to ``hidden_channels``, optionally
    summed with a pitch embedding (256 entries; only created when ``f0`` is
    True), scaled by ``sqrt(hidden_channels)``, passed through a leaky ReLU
    and the attention ``Encoder``, then projected to a mean and a log-scale
    of ``out_channels`` each.
    """

    def __init__(self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)
        self.encoder = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout))
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor):
        """Return ``(m, logs, x_mask)`` for the given phone / pitch sequences."""
        h = self.emb_phone(phone)
        if pitch is not None:
            h = h + self.emb_pitch(pitch)
        h = self.lrelu(h * math.sqrt(self.hidden_channels))
        # Swap dim 1 and the last dim so channels come before time.
        h = torch.transpose(h, 1, -1)
        x_mask = sequence_mask(lengths, h.size(2)).unsqueeze(1).to(h.dtype)
        h = self.encoder(h * x_mask, x_mask)
        m, logs = torch.split(self.proj(h) * x_mask, self.out_channels, dim=1)
        return m, logs, x_mask

class TextEncoder768(nn.Module):
    """Prior encoder for 768-dimensional phone features.

    Phone features are linearly projected to ``hidden_channels``, optionally
    summed with a pitch embedding (256 entries; only created when ``f0`` is
    True), scaled by ``sqrt(hidden_channels)``, passed through a leaky ReLU
    and the attention ``Encoder``, then projected to a mean and a log-scale
    of ``out_channels`` each.
    """

    def __init__(self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)
        self.encoder = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout))
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor):
        """Return ``(m, logs, x_mask)`` for the given phone / pitch sequences."""
        h = self.emb_phone(phone)
        if pitch is not None:
            h = h + self.emb_pitch(pitch)
        h = self.lrelu(h * math.sqrt(self.hidden_channels))
        # Swap dim 1 and the last dim so channels come before time.
        h = torch.transpose(h, 1, -1)
        x_mask = sequence_mask(lengths, h.size(2)).unsqueeze(1).to(h.dtype)
        h = self.encoder(h * x_mask, x_mask)
        m, logs = torch.split(self.proj(h) * x_mask, self.out_channels, dim=1)
        return m, logs, x_mask

class ResidualCouplingBlock(nn.Module):
    """Normalising flow made of ``n_flows`` (mean-only coupling, Flip) pairs.

    In the forward direction the flows run in order; with ``reverse=True``
    they run in the opposite order, each in its inverse mode. The
    log-determinants returned by the individual flows are discarded.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0):
        super().__init__()
        self.n_flows = n_flows
        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)
            self.flows.append(coupling)
            self.flows.append(Flip())

    def forward(self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None, reverse: bool = False):
        if reverse:
            for flow in self.flows[::-1]:
                x, _ = flow.forward(x, x_mask, g=g, reverse=reverse)
            return x
        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from the coupling layers (even indices)."""
        for idx in range(0, 2 * self.n_flows, 2):
            self.flows[idx].remove_weight_norm()

class PosteriorEncoder(nn.Module):
    """Encode an input sequence into per-frame mean/log-scale statistics and a sampled latent."""

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
        super().__init__()
        self.out_channels = out_channels
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        # One 1x1 conv emits both statistics; they are split apart in forward().
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None):
        """Return ``(z, m, logs, x_mask)`` where ``z = (m + noise * exp(logs)) * x_mask``."""
        x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)
        hidden = self.enc(self.pre(x) * x_mask, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Strip weight norm from the inner ``WN`` stack."""
        self.enc.remove_weight_norm()

class Generator(nn.Module):
    """Decoder without a pitch source.

    Upsamples its input through weight-normalised transposed convolutions, averages a
    bank of residual blocks after every stage, and ends in a single-channel tanh output.
    """

    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        resblock_class = ResBlock1 if resblock == "1" else ResBlock2

        # Stage i maps C / 2**i channels to C / 2**(i + 1) channels.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            in_ch = upsample_initial_channel // (2 ** stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            self.ups.append(weight_norm(ConvTranspose1d(in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2)))

        # num_kernels residual blocks per stage, stored flat in stage order.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            self.resblocks.extend(
                [resblock_class(ch, k, d) for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)]
            )

        # `ch` is the channel width of the last stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
        """Decode ``x``; when ``g`` is given it is projected by ``self.cond`` and added after ``conv_pre``."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)
        for stage in range(self.num_upsamples):
            x = self.ups[stage](F.leaky_relu(x, LRELU_SLOPE))
            first = stage * self.num_kernels
            xs = None
            for idx in range(first, first + self.num_kernels):
                branch = self.resblocks[idx](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        x = self.conv_post(F.leaky_relu(x))
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight norm from every upsampling layer and residual block."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()

class SineGen(nn.Module):
    """Turn a frame-level f0 track into sample-level sine waves plus noise.

    ``forward`` returns ``(sine_waves, uv, noise)``; ``uv`` is 1 where f0 exceeds
    ``voiced_threshold`` and 0 elsewhere, upsampled by ``upp``.
    """

    def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = harmonic_num + 1  # fundamental + harmonics
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return a float mask that is 1 where ``f0`` is above the voiced threshold."""
        return (f0 > self.voiced_threshold).float()

    def _stretch(self, values, upp, mode, align_corners=None):
        """Interpolate a (batch, time, channel) tensor along time by a factor of ``upp``."""
        stretched = F.interpolate(values.transpose(2, 1), scale_factor=float(upp), mode=mode, align_corners=align_corners)
        return stretched.transpose(2, 1)

    def forward(self, f0: torch.Tensor, upp: int):
        with torch.no_grad():
            f0 = f0.unsqueeze(1).transpose(1, 2)
            # Column 0 carries f0 itself, column k carries (k + 1) * f0.
            freqs = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            freqs[:, :, 0] = f0[:, :, 0]
            for k in range(1, self.harmonic_num + 1):
                freqs[:, :, k] = freqs[:, :, 0] * (k + 1)

            # Per-frame phase increment in cycles, wrapped to [0, 1).
            phase_step = (freqs / self.sampling_rate) % 1
            # Random start phase for every column except column 0.
            start_phase = torch.rand(freqs.shape[0], freqs.shape[2], device=freqs.device)
            start_phase[:, 0] = 0
            phase_step[:, 0, :] = phase_step[:, 0, :] + start_phase

            wrapped = torch.cumsum(phase_step, 1)
            wrapped *= upp
            wrapped = self._stretch(wrapped, upp, "linear", align_corners=True)
            phase_step = self._stretch(phase_step, upp, "nearest")
            wrapped %= 1
            # True wherever the wrapped phase decreases from one sample to the next.
            wrap_points = (wrapped[:, 1:, :] - wrapped[:, :-1, :]) < 0
            correction = torch.zeros_like(phase_step)
            correction[:, 1:, :] = wrap_points * -1.0

            sine_waves = torch.sin(torch.cumsum(phase_step + correction, dim=1) * 2 * torch.pi)
            sine_waves = sine_waves * self.sine_amp

            uv = self._stretch(self._f02uv(f0), upp, "nearest")
            # Noise uses noise_std where uv is 1 and sine_amp / 3 where uv is 0.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise

class SourceModuleHnNSF(nn.Module):
    """Merge the ``SineGen`` harmonics into one excitation channel via a linear layer and tanh."""

    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0, is_half=False):
        super().__init__()
        # `is_half` is accepted but not used in this implementation.
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod)
        self.l_linear = nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = nn.Tanh()

    def forward(self, x: torch.Tensor, upp: int = 1):
        """Return ``(sine_merge, None, None)``; the last two slots are always ``None``."""
        sine_wavs, _uv, _noise = self.l_sin_gen(x, upp)
        # Match the linear layer's dtype before projecting.
        projected = self.l_linear(sine_wavs.to(dtype=self.l_linear.weight.dtype))
        return self.l_tanh(projected), None, None

class GeneratorNSF(nn.Module):
    """Decoder driven by an f0-derived source signal.

    Same upsample + residual-block layout as ``Generator``; in addition, every stage adds
    a convolved copy of the ``SourceModuleHnNSF`` output before its residual blocks.
    """

    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels, sr, is_half=False):
        super().__init__()
        total_upp = math.prod(upsample_rates)
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.f0_upsamp = nn.Upsample(scale_factor=total_upp)
        self.m_source = SourceModuleHnNSF(sampling_rate=sr, harmonic_num=0, is_half=is_half)
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        resblock_class = ResBlock1 if resblock == "1" else ResBlock2

        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            in_ch = upsample_initial_channel // (2 ** stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            self.ups.append(weight_norm(ConvTranspose1d(in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2)))
            if stage + 1 < len(upsample_rates):
                # Stride by the product of the remaining upsample rates.
                stride_f0 = math.prod(upsample_rates[stage + 1:])
                source_conv = Conv1d(1, out_ch, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)
            else:
                source_conv = Conv1d(1, out_ch, kernel_size=1)
            self.noise_convs.append(source_conv)

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            self.resblocks.extend(
                [resblock_class(ch, k, d) for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)]
            )

        # `ch` is the channel width of the last stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
        self.upp = total_upp
        self.lrelu_slope = LRELU_SLOPE

    def forward(self, x, f0, g: Optional[torch.Tensor] = None):
        """Decode ``x`` using the source signal generated from ``f0`` (and optional ``g``)."""
        har_source, _, _ = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)
        for stage, (up, source_conv) in enumerate(zip(self.ups, self.noise_convs)):
            x = up(F.leaky_relu(x, self.lrelu_slope))
            x = x + source_conv(har_source)
            # Residual blocks of this stage occupy a contiguous index range.
            first = stage * self.num_kernels
            xs = None
            for idx in range(first, first + self.num_kernels):
                branch = self.resblocks[idx](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        x = self.conv_post(F.leaky_relu(x))
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight norm from every upsampling layer and residual block."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()

# Synthesizer classes for different model versions
class SynthesizerTrnMs256NSFsid(nn.Module):
    """RVC v1 model with f0 (``TextEncoder256`` prior encoder + ``GeneratorNSF`` decoder)."""

    def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr, **kwargs):
        super().__init__()
        # A string sample rate is translated through the sr2sr table.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout))
        self.dec = GeneratorNSF(
            inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes,
            upsample_rates, upsample_initial_channel, upsample_kernel_sizes,
            gin_channels=gin_channels, sr=sr, is_half=kwargs.get("is_half", False),
        )
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels)
        self.emb_g = nn.Embedding(spk_embed_dim, gin_channels)

    @torch.jit.export
    def infer(self, phone: torch.Tensor, phone_lengths: torch.Tensor, pitch: torch.Tensor, nsff0: torch.Tensor, sid: torch.Tensor, rate: Optional[torch.Tensor] = None):
        """Synthesize audio; returns ``(o, x_mask, (z, z_p, m_p, logs_p))``."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        noise = torch.randn_like(m_p)
        z_p = (m_p + torch.exp(logs_p) * noise * 0.66666) * x_mask
        if rate is not None:
            # Keep only the trailing `rate` fraction of the frames.
            head = int(z_p.shape[2] * (1.0 - rate.item()))
            z_p, x_mask, nsff0 = z_p[:, :, head:], x_mask[:, :, head:], nsff0[:, head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)

class SynthesizerTrnMs768NSFsid(nn.Module):
    """RVC v2 model with f0 (``TextEncoder768`` prior encoder + ``GeneratorNSF`` decoder)."""

    def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr, **kwargs):
        super().__init__()
        # A string sample rate is translated through the sr2sr table.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder768(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout))
        self.dec = GeneratorNSF(
            inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes,
            upsample_rates, upsample_initial_channel, upsample_kernel_sizes,
            gin_channels=gin_channels, sr=sr, is_half=kwargs.get("is_half", False),
        )
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels)
        self.emb_g = nn.Embedding(spk_embed_dim, gin_channels)

    def forward(self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds):
        """Training forward pass.

        Returns ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``;
        only a random ``segment_size`` slice of the posterior latent is decoded.
        """
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size)
        # Slice the continuous f0 track with the same segment indices as the latent.
        pitchf_slice = slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf_slice, g=g)
        latents = (z, z_p, m_p, logs_p, m_q, logs_q)
        return o, ids_slice, x_mask, y_mask, latents

    @torch.jit.export
    def infer(self, phone: torch.Tensor, phone_lengths: torch.Tensor, pitch: torch.Tensor, nsff0: torch.Tensor, sid: torch.Tensor, rate: Optional[torch.Tensor] = None):
        """Synthesize audio; returns ``(o, x_mask, (z, z_p, m_p, logs_p))``."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        noise = torch.randn_like(m_p)
        z_p = (m_p + torch.exp(logs_p) * noise * 0.66666) * x_mask
        if rate is not None:
            # Keep only the trailing `rate` fraction of the frames.
            head = int(z_p.shape[2] * (1.0 - rate.item()))
            z_p, x_mask, nsff0 = z_p[:, :, head:], x_mask[:, :, head:], nsff0[:, head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)

class SynthesizerTrnMs256NSFsid_nono(nn.Module):
    """RVC v1 model without f0 (``TextEncoder256`` with ``f0=False`` + plain ``Generator``)."""

    def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr=None, **kwargs):
        super().__init__()
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.enc_p = TextEncoder256(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout), f0=False)
        self.dec = Generator(
            inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes,
            upsample_rates, upsample_initial_channel, upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels)
        self.emb_g = nn.Embedding(spk_embed_dim, gin_channels)

    @torch.jit.export
    def infer(self, phone: torch.Tensor, phone_lengths: torch.Tensor, sid: torch.Tensor, rate: Optional[torch.Tensor] = None):
        """Synthesize audio without pitch input; returns ``(o, x_mask, (z, z_p, m_p, logs_p))``."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        noise = torch.randn_like(m_p)
        z_p = (m_p + torch.exp(logs_p) * noise * 0.66666) * x_mask
        if rate is not None:
            # Keep only the trailing `rate` fraction of the frames.
            head = int(z_p.shape[2] * (1.0 - rate.item()))
            z_p, x_mask = z_p[:, :, head:], x_mask[:, :, head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)

class SynthesizerTrnMs768NSFsid_nono(nn.Module):
    """RVC v2 model without f0 (``TextEncoder768`` with ``f0=False`` + plain ``Generator``)."""

    def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr=None, **kwargs):
        super().__init__()
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.enc_p = TextEncoder768(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout), f0=False)
        self.dec = Generator(
            inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes,
            upsample_rates, upsample_initial_channel, upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels)
        self.emb_g = nn.Embedding(spk_embed_dim, gin_channels)

    @torch.jit.export
    def infer(self, phone: torch.Tensor, phone_lengths: torch.Tensor, sid: torch.Tensor, rate: Optional[torch.Tensor] = None):
        """Synthesize audio without pitch input; returns ``(o, x_mask, (z, z_p, m_p, logs_p))``."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        noise = torch.randn_like(m_p)
        z_p = (m_p + torch.exp(logs_p) * noise * 0.66666) * x_mask
        if rate is not None:
            # Keep only the trailing `rate` fraction of the frames.
            head = int(z_p.shape[2] * (1.0 - rate.item()))
            z_p, x_mask = z_p[:, :, head:], x_mask[:, :, head:]
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)

# ============================================================
# DISCRIMINATOR - For training
# ============================================================

class DiscriminatorS(nn.Module):
    """1-D convolutional discriminator that also returns every intermediate feature map."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        norm_f = nn.utils.spectral_norm if use_spectral_norm else weight_norm
        # (in_channels, out_channels, kernel, stride, groups, padding)
        layer_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
            for c_in, c_out, kernel, stride, groups, pad in layer_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_score, fmap)`` where ``fmap`` lists each layer's output."""
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), 0.1)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap

class DiscriminatorP(nn.Module):
    """Discriminator that folds the signal into rows of ``period`` samples and applies 2-D convs."""

    def __init__(self, period, use_spectral_norm=False):
        super().__init__()
        self.period = period
        norm_f = nn.utils.spectral_norm if use_spectral_norm else weight_norm
        # Four stride-3 layers followed by one stride-1 layer, all with (5, 1) kernels.
        channel_plan = [(1, 32), (32, 128), (128, 512), (512, 1024)]
        layers = [norm_f(nn.Conv2d(c_in, c_out, (5, 1), (3, 1), padding=(2, 0))) for c_in, c_out in channel_plan]
        layers.append(norm_f(nn.Conv2d(1024, 1024, (5, 1), 1, padding=(2, 0))))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_score, fmap)`` where ``fmap`` lists each layer's output."""
        fmap = []
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            # Reflect-pad the tail so the length becomes a multiple of the period.
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t += n_pad
        x = x.view(b, c, t // self.period, self.period)
        for conv in self.convs:
            x = F.leaky_relu(conv(x), 0.1)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap

class MultiPeriodDiscriminator(nn.Module):
    """One ``DiscriminatorS`` followed by a ``DiscriminatorP`` for each configured period."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        periods = [2, 3, 5, 7, 11, 17, 23, 37]  # 8 periods for v2 pretrained (9 total discriminators)
        members = [DiscriminatorS(use_spectral_norm)]
        members += [DiscriminatorP(p, use_spectral_norm) for p in periods]
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Score ``y`` and ``y_hat`` with every sub-discriminator.

        Returns four parallel lists: scores for ``y``, scores for ``y_hat``,
        feature maps for ``y`` and feature maps for ``y_hat``.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for disc in self.discriminators:
            score_r, feats_r = disc(y)
            score_g, feats_g = disc(y_hat)
            y_d_rs.append(score_r)
            y_d_gs.append(score_g)
            fmap_rs.append(feats_r)
            fmap_gs.append(feats_g)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs

# ============================================================
# TRAINING LOSSES
# ============================================================

def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean absolute difference between paired feature maps.

    The first argument's maps are detached, so gradients only reach ``fmap_g``.
    """
    total = sum(
        torch.mean(torch.abs(real.float().detach() - fake.float()))
        for real_maps, fake_maps in zip(fmap_r, fmap_g)
        for real, fake in zip(real_maps, fake_maps)
    )
    return total * 2

def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Sum, over output pairs, ``mean((1 - real)^2) + mean(generated^2)``."""
    return sum(
        torch.mean((1 - real.float()) ** 2) + torch.mean(fake.float() ** 2)
        for real, fake in zip(disc_real_outputs, disc_generated_outputs)
    )

def generator_loss(disc_outputs):
    """Sum ``mean((1 - output)^2)`` over all discriminator outputs."""
    return sum(torch.mean((1 - out.float()) ** 2) for out in disc_outputs)

def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """Return the masked KL term, normalised by the sum of ``z_mask``.

    All inputs are cast to float32 before the computation.
    """
    z_p, logs_q, m_p, logs_p, z_mask = (t.float() for t in (z_p, logs_q, m_p, logs_p, z_mask))
    log_ratio = logs_p - logs_q - 0.5
    quadratic = 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
    kl = log_ratio + quadratic
    return torch.sum(kl * z_mask) / torch.sum(z_mask)

# ============================================================
# CONTENT FEATURE EXTRACTION - ContentVec (HF Hub), torchaudio HuBERT as fallback
# ============================================================

# Lazily-initialised embedder caches, filled in by load_contentvec() / load_hubert().
# extract_hubert_features() tries ContentVec first for every model version and only
# falls back to the torchaudio HuBERT bundle when ContentVec could not be loaded.
_contentvec_model = None  # HubertModel subclass with a final_proj head; None until loaded (or after a failed load)
_hubert_model = None      # torchaudio HUBERT_BASE model used as the fallback; None until loaded
_hubert_bundle = None     # torchaudio pipeline bundle the fallback model was created from

# HuggingFace Hub repository and in-repo file paths passed to hf_hub_download().
CONTENTVEC_REPO = "IAHispano/Applio"
CONTENTVEC_MODEL = "Resources/embedders/contentvec/pytorch_model.bin"
CONTENTVEC_CONFIG = "Resources/embedders/contentvec/config.json"

def load_contentvec():
    """Return the cached ContentVec model, downloading and building it on first use.

    Any failure is logged as a warning and ``None`` is returned; nothing is cached
    in that case, so the next call attempts the load again.
    """
    global _contentvec_model
    if _contentvec_model is not None:
        return _contentvec_model

    try:
        from transformers import HubertModel, HubertConfig
        logger.info("Loading ContentVec model from HuggingFace...")

        # Fetch weights and config from the Hub.
        model_path = hf_hub_download(repo_id=CONTENTVEC_REPO, filename=CONTENTVEC_MODEL)
        config_path = hf_hub_download(repo_id=CONTENTVEC_REPO, filename=CONTENTVEC_CONFIG)

        class HubertModelWithFinalProj(HubertModel):
            """HubertModel plus the ``final_proj`` linear head used for v1 features."""

            def __init__(self, config):
                super().__init__(config)
                self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)

        config = HubertConfig.from_pretrained(config_path)
        model = HubertModelWithFinalProj(config)
        model.load_state_dict(torch.load(model_path, map_location="cpu", weights_only=True))
        model.to(device).eval()
        logger.info(f"ContentVec loaded: hidden={config.hidden_size}, proj={config.classifier_proj_size}")
        # Publish only once the model is fully built.
        _contentvec_model = model
    except Exception as e:
        logger.warning(f"Failed to load ContentVec: {e}, falling back to torchaudio HuBERT")
        _contentvec_model = None
    return _contentvec_model

def load_hubert():
    """Return ``(model, bundle)`` for torchaudio's HUBERT_BASE, loading them on first use."""
    global _hubert_model, _hubert_bundle
    if _hubert_model is not None:
        return _hubert_model, _hubert_bundle

    import torchaudio
    logger.info("Loading HuBERT model via torchaudio...")
    _hubert_bundle = torchaudio.pipelines.HUBERT_BASE
    _hubert_model = _hubert_bundle.get_model().to(device)
    _hubert_model.eval()
    logger.info("HuBERT model loaded")
    return _hubert_model, _hubert_bundle

def extract_hubert_features(audio: np.ndarray, sr: int = 16000, version: str = "v2") -> torch.Tensor:
    """Extract content features from audio (ContentVec first, torchaudio HuBERT as fallback).

    v1 models: ContentVec hidden state passed through ``final_proj`` (256-dim)
    v2 models: raw ContentVec last hidden state (768-dim)

    ``sr`` is accepted but not used inside this function.
    """
    audio = audio.astype(np.float32)
    # Only scale down; audio already within [-1, 1] is left untouched.
    peak = np.abs(audio).max()
    if peak > 1.0:
        audio = audio / peak

    inputs = torch.from_numpy(audio).unsqueeze(0).to(device)
    is_v1 = version == "v1"

    contentvec = load_contentvec()
    if contentvec is not None:
        with torch.no_grad():
            hidden = contentvec(inputs).last_hidden_state
            return contentvec.final_proj(hidden) if is_v1 else hidden

    # ContentVec could not be loaded: use the torchaudio HuBERT model instead.
    logger.warning("ContentVec not available, using torchaudio HuBERT (results may be degraded)")
    hubert, _bundle = load_hubert()
    with torch.no_grad():
        features, _ = hubert.extract_features(inputs)
        layer_idx = 11 if version == "v2" else 8
        feats = features[min(layer_idx, len(features) - 1)]

        if is_v1:
            # Fixed 768 -> 256 projection: each output averages three consecutive inputs.
            proj = nn.Linear(768, 256, bias=False).to(device)
            w = torch.zeros(256, 768)
            rows = torch.arange(256)
            for offset in range(3):
                w[rows, rows * 3 + offset] = 1/3
            proj.weight.copy_(w)
            feats = proj(feats)

    return feats

# ============================================================
# F0 EXTRACTION
# ============================================================

def extract_f0_pm(audio: np.ndarray, sr: int = 16000, f0_up_key: int = 0) -> Tuple[np.ndarray, np.ndarray]:
    """Extract F0 with parselmouth's autocorrelation pitch tracker (pm method).

    The analysis is hard-wired to 16 kHz audio with a 10 ms step; ``sr`` is not used.
    ``f0_up_key`` transposes the result by that many semitones.
    Returns ``(f0_coarse, f0)`` as produced by ``f0_to_coarse``.
    """
    import parselmouth

    f0_min, f0_max = 65, 1100
    n_frames = audio.shape[0] // 160 + 1

    # Pad by 1.5 periods of the lowest pitch (at 16 kHz) on the left, one sample more on the right.
    l_pad = int(np.ceil(1.5 / f0_min * 16000))
    r_pad = l_pad + 1
    padded = np.pad(audio, (l_pad, r_pad))

    pitch = parselmouth.Sound(padded, 16000).to_pitch_ac(
        time_step=0.01, voicing_threshold=0.6, pitch_floor=f0_min, pitch_ceiling=f0_max,
    )
    f0 = pitch.selected_array["frequency"]

    # Zero-pad or truncate to exactly n_frames values.
    shortfall = n_frames - len(f0)
    if shortfall > 0:
        f0 = np.pad(f0, (0, shortfall))
    f0 = f0[:n_frames]
    f0 *= pow(2, f0_up_key / 12)

    return f0_to_coarse(f0)

def extract_f0_harvest(audio: np.ndarray, sr: int = 16000, f0_up_key: int = 0) -> Tuple[np.ndarray, np.ndarray]:
    """Extract F0 with pyworld's harvest, median-filter it and transpose by ``f0_up_key`` semitones.

    The analysis is hard-wired to 16 kHz audio with a 10 ms frame period; ``sr`` is not used.
    Returns ``(f0_coarse, f0)`` as produced by ``f0_to_coarse``.
    """
    import pyworld
    from scipy import signal as scipy_signal

    samples = audio.astype(np.double)
    f0, _times = pyworld.harvest(samples, fs=16000, f0_ceil=1100, f0_floor=50, frame_period=10)
    f0 = scipy_signal.medfilt(f0, 3)
    f0 *= pow(2, f0_up_key / 12)

    return f0_to_coarse(f0)

def f0_to_coarse(f0: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Quantise f0 (Hz) onto integer bins 1..255 along a mel scale spanning 50-1100 Hz.

    Returns ``(f0_coarse, f0bak)``: the int32 bins and an untouched copy of the input.
    Frames whose mel value is not positive (e.g. f0 == 0) end up in bin 1.
    """
    f0_min, f0_max = 50, 1100
    mel_lo = 1127 * np.log(1 + f0_min / 700)
    mel_hi = 1127 * np.log(1 + f0_max / 700)

    f0bak = f0.copy()
    f0_mel = 1127 * np.log(1 + f0 / 700)
    # Rescale only frames with a positive mel value so the 50-1100 Hz range maps to 1..255.
    voiced = f0_mel > 0
    f0_mel[voiced] = (f0_mel[voiced] - mel_lo) * 254 / (mel_hi - mel_lo) + 1
    np.clip(f0_mel, 1, 255, out=f0_mel)

    return np.rint(f0_mel).astype(np.int32), f0bak

# ============================================================
# RMVPE F0 EXTRACTION (from Applio - IAHispano/Applio)
# ============================================================

class RMVPE_ConvBlockRes(nn.Module):
    """Two 3x3 conv + BatchNorm + ReLU layers with a residual connection.

    A 1x1 conv shortcut is used when the channel counts differ; otherwise the input is added directly.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, (3, 3), (1, 1), (1, 1), bias=False),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, (3, 3), (1, 1), (1, 1), bias=False),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
        else:
            self.shortcut = None

    def forward(self, x):
        residual = self.conv(x)
        if self.shortcut:
            return residual + self.shortcut(x)
        return residual + x

class RMVPE_ResEncoderBlock(nn.Module):
    """``n_blocks`` residual conv blocks, optionally followed by average pooling.

    With a ``kernel_size`` the forward pass returns ``(features, pooled)``;
    with ``kernel_size=None`` it returns the features alone.
    """

    def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01):
        super().__init__()
        blocks = [RMVPE_ConvBlockRes(in_channels, out_channels, momentum)]
        blocks += [RMVPE_ConvBlockRes(out_channels, out_channels, momentum) for _ in range(n_blocks - 1)]
        self.conv = nn.ModuleList(blocks)
        self.kernel_size = kernel_size
        if kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        for block in self.conv:
            x = block(x)
        if self.kernel_size is None:
            return x
        return x, self.pool(x)

class RMVPE_Encoder(nn.Module):
    """Stack of pooled encoder blocks whose channel count doubles at every level.

    ``forward`` returns the final pooled tensor and the list of pre-pooling
    outputs, one per level, in encoder order.
    """

    def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01):
        super().__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        level_in, level_out, size = in_channels, out_channels, in_size
        for _ in range(n_encoders):
            self.layers.append(RMVPE_ResEncoderBlock(level_in, level_out, kernel_size, n_blocks, momentum))
            level_in, level_out = level_out, level_out * 2
            size //= 2
        self.out_size = size
        # Stored after the last doubling, i.e. twice the channels of the final level.
        self.out_channel = level_out

    def forward(self, x):
        x = self.bn(x)
        skips = []
        for layer in self.layers:
            skip, x = layer(x)
            skips.append(skip)
        return x, skips

class RMVPE_Intermediate(nn.Module):
    """Sequence of ``n_inters`` unpooled encoder blocks applied one after another."""

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super().__init__()
        stages = [RMVPE_ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)]
        stages += [RMVPE_ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) for _ in range(n_inters - 1)]
        self.layers = nn.ModuleList(stages)

    def forward(self, x):
        for stage in self.layers:
            x = stage(x)
        return x

class RMVPE_ResDecoderBlock(nn.Module):
    """Transposed-conv upsampling, concatenation with a skip tensor, then residual conv blocks."""

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super().__init__()
        out_padding = (0, 1) if stride == (1, 2) else (1, 1)
        self.conv1 = nn.Sequential(
            nn.ConvTranspose2d(in_channels, out_channels, (3, 3), stride, (1, 1), out_padding, bias=False),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        # The first block takes twice the channels because of the skip concatenation.
        blocks = [RMVPE_ConvBlockRes(out_channels * 2, out_channels, momentum)]
        blocks += [RMVPE_ConvBlockRes(out_channels, out_channels, momentum) for _ in range(n_blocks - 1)]
        self.conv2 = nn.ModuleList(blocks)

    def forward(self, x, concat_tensor):
        merged = torch.cat((self.conv1(x), concat_tensor), dim=1)
        for block in self.conv2:
            merged = block(merged)
        return merged

class RMVPE_Decoder(nn.Module):
    """Stack of decoder blocks that halves the channel count at every level."""

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super().__init__()
        self.layers = nn.ModuleList()
        channels = in_channels
        for _ in range(n_decoders):
            self.layers.append(RMVPE_ResDecoderBlock(channels, channels // 2, stride, n_blocks, momentum))
            channels //= 2
        self.n_decoders = n_decoders

    def forward(self, x, concat_tensors):
        # Skip tensors are consumed last-to-first.
        for depth in range(self.n_decoders):
            skip = concat_tensors[-1 - depth]
            x = self.layers[depth](x, skip)
        return x

class RMVPE_DeepUnet(nn.Module):
    """Encoder -> intermediate -> decoder pipeline with skip connections.

    The encoder is built with a fixed second argument of 128. The intermediate
    stage goes from half the encoder's `out_channel` to the full `out_channel`,
    and the decoder starts from `out_channel`. `kernel_size` is forwarded to the
    encoder and is also passed to the decoder as its stride argument.
    """

    def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
        super().__init__()
        self.encoder = RMVPE_Encoder(in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels)
        bottleneck_width = self.encoder.out_channel
        self.intermediate = RMVPE_Intermediate(bottleneck_width // 2, bottleneck_width, inter_layers, n_blocks)
        self.decoder = RMVPE_Decoder(bottleneck_width, en_de_layers, kernel_size, n_blocks)

    def forward(self, x):
        """Encode `x`, transform the bottleneck, then decode using the encoder's skip tensors."""
        encoded, skips = self.encoder(x)
        bottleneck = self.intermediate(encoded)
        return self.decoder(bottleneck, skips)

class RMVPE_BiGRU(nn.Module):
    """Batch-first bidirectional GRU that returns only the per-step outputs."""

    def __init__(self, input_features, hidden_features, num_layers):
        super().__init__()
        self.gru = nn.GRU(
            input_features,
            hidden_features,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x):
        """Return the GRU output sequence for `x`, discarding the final hidden state."""
        outputs, _final_state = self.gru(x)
        return outputs

# Mel-band count used to size the RMVPE_E2E head (its input feature width is 3 * RMVPE_N_MELS).
RMVPE_N_MELS = 128
# Width of the RMVPE_E2E output layer; load_rmvpe assigns each bin a cents value, 20 cents apart.
RMVPE_N_CLASS = 360

class RMVPE_E2E(nn.Module):
    """U-Net followed by a 3-channel conv and a sigmoid classification head.

    With a non-zero `n_gru` the head is BiGRU -> Linear -> Dropout -> Sigmoid;
    otherwise it is Linear -> Dropout -> Sigmoid. Either way the last linear
    layer has `RMVPE_N_CLASS` outputs.
    """

    def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
        super().__init__()
        self.unet = RMVPE_DeepUnet(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels)
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        # The conv emits 3 channels which are flattened together with the mel axis.
        head_inputs = 3 * RMVPE_N_MELS
        if n_gru:
            # Bidirectional GRU with 256 hidden units -> 512 features per step.
            head = [RMVPE_BiGRU(head_inputs, 256, n_gru), nn.Linear(512, RMVPE_N_CLASS)]
        else:
            head = [nn.Linear(head_inputs, RMVPE_N_CLASS)]
        self.fc = nn.Sequential(*head, nn.Dropout(0.25), nn.Sigmoid())

    def forward(self, mel):
        """Swap the last two axes of `mel`, add a channel axis, and run unet -> cnn -> fc."""
        image = mel.transpose(-1, -2).unsqueeze(1)
        features = self.cnn(self.unet(image))
        # Move channels next to the last axis and merge the two into one feature axis.
        flattened = features.transpose(1, 2).flatten(-2)
        return self.fc(flattened)

class RMVPE_MelSpectrogram(nn.Module):
    """Log-mel spectrogram front end built on `torch.stft` and a librosa mel filterbank.

    The filterbank (HTK scale) is stored as the `mel_basis` buffer. Hann windows
    are created lazily and cached in `self.hann_window`, keyed by key shift and device.
    """

    def __init__(self, n_mel_channels=128, sample_rate=16000, win_length=1024, hop_length=160, n_fft=None, mel_fmin=30, mel_fmax=8000, clamp=1e-5):
        super().__init__()
        from librosa.filters import mel as librosa_mel
        # The FFT size falls back to the window length when not given.
        if n_fft is None:
            n_fft = win_length
        self.hann_window = {}
        filterbank = librosa_mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax, htk=True)
        self.register_buffer("mel_basis", torch.from_numpy(filterbank).float())
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.clamp = clamp

    def forward(self, audio, keyshift=0, speed=1, center=True):
        """Return the log of the clamped mel spectrogram of `audio`.

        `keyshift` (in semitones) scales the FFT and window sizes by
        2 ** (keyshift / 12); `speed` scales the hop length.
        """
        ratio = 2 ** (keyshift / 12)
        scaled_n_fft = int(np.round(self.n_fft * ratio))
        scaled_win = int(np.round(self.win_length * ratio))
        scaled_hop = int(np.round(self.hop_length * speed))

        cache_key = f"{keyshift}_{audio.device}"
        if cache_key not in self.hann_window:
            self.hann_window[cache_key] = torch.hann_window(scaled_win).to(audio.device)
        window = self.hann_window[cache_key]

        spectrum = torch.stft(
            audio,
            n_fft=scaled_n_fft,
            hop_length=scaled_hop,
            win_length=scaled_win,
            window=window,
            center=center,
            return_complex=True,
        )
        magnitude = torch.sqrt(spectrum.real.pow(2) + spectrum.imag.pow(2))

        if keyshift != 0:
            # Bring the frequency axis back to the bin count the mel basis expects,
            # zero-padding when the shifted FFT produced fewer bins.
            n_bins = self.n_fft // 2 + 1
            available = magnitude.size(1)
            if available < n_bins:
                magnitude = F.pad(magnitude, (0, 0, 0, n_bins - available))
            magnitude = magnitude[:, :n_bins, :] * self.win_length / scaled_win

        mel_energy = torch.matmul(self.mel_basis, magnitude)
        return torch.log(torch.clamp(mel_energy, min=self.clamp))

_rmvpe_model = None

def load_rmvpe():
    """Return the cached RMVPE bundle, downloading and building it on first use.

    The bundle is a tuple `(model, mel_extractor, cents_mapping)` where
    `cents_mapping` holds one cents value per output bin (20 cents apart),
    zero-padded by 4 entries on each side.
    """
    global _rmvpe_model
    if _rmvpe_model is not None:
        return _rmvpe_model

    logger.info("Downloading RMVPE model...")
    weights_file = hf_hub_download(repo_id="IAHispano/Applio", filename="Resources/predictors/rmvpe.pt")
    network = RMVPE_E2E(4, 1, (2, 2))
    state = torch.load(weights_file, map_location="cpu", weights_only=True)
    network.load_state_dict(state)
    network.eval().to(device)
    mel_frontend = RMVPE_MelSpectrogram().to(device)
    bin_cents = 20 * np.arange(RMVPE_N_CLASS) + 1997.3794084376191
    # Padding matches the 4-bin margin added to the salience map during decoding.
    _rmvpe_model = (network, mel_frontend, np.pad(bin_cents, (4, 4)))
    logger.info("RMVPE model loaded")
    return _rmvpe_model

def extract_f0_rmvpe(audio: np.ndarray, sr: int = 16000, f0_up_key: int = 0, thred: float = 0.03) -> Tuple[np.ndarray, np.ndarray]:
    """Estimate F0 with the RMVPE network and return the result of `f0_to_coarse`.

    Args:
        audio: Mono waveform as a NumPy array.
        sr: Accepted for signature parity with the other extractors; not read here.
        f0_up_key: Pitch shift in semitones applied to the decoded F0.
        thred: Frames whose peak salience is at or below this value are set to 0.
    """
    network, mel_frontend, cents_mapping = load_rmvpe()

    waveform = torch.from_numpy(audio).float().to(device).unsqueeze(0)
    mel = mel_frontend(waveform, center=True)
    del waveform

    # Run the network on the mel spectrogram, padded to a multiple of 32 frames
    # and processed in chunks of at most 32000 frames.
    chunk_frames = 32000
    with torch.no_grad():
        n_frames = mel.shape[-1]
        mel_padded = F.pad(mel, (0, -n_frames % 32), mode="reflect")
        total_frames = mel_padded.shape[-1]
        outputs = [
            network(mel_padded[..., begin:begin + chunk_frames])
            for begin in range(0, total_frames, chunk_frames)
        ]
        hidden = torch.cat(outputs, dim=1)[:, :n_frames].squeeze(0).cpu().numpy()

    # Decode: salience-weighted average of cents over a 9-bin window around each peak.
    peak = np.argmax(hidden, axis=1)
    salience = np.pad(hidden, ((0, 0), (4, 4)))
    peak += 4
    window_idx = (peak - 4)[:, None] + np.arange(9)[None, :]
    local_salience = np.take_along_axis(salience, window_idx, axis=1)
    local_cents = cents_mapping[window_idx]
    cents_pred = np.sum(local_salience * local_cents, 1) / np.sum(local_salience, 1)
    cents_pred[np.max(salience, axis=1) <= thred] = 0

    # Cents -> Hz relative to 10 Hz; a value of exactly 10 Hz marks an unvoiced frame.
    f0 = 10 * (2 ** (cents_pred / 1200))
    f0[f0 == 10] = 0
    f0 *= pow(2, f0_up_key / 12)

    return f0_to_coarse(f0)

# ============================================================
# MODEL LOADING
# ============================================================

_model_cache = {}

def load_rvc_model(model_path: str):
    """Load an RVC checkpoint, build the matching synthesizer and cache the result.

    Returns:
        `(model, sr, version, if_f0)`. The same tuple is stored in `_model_cache`
        under `model_path` and returned directly on later calls.

    Raises:
        ValueError: If none of the known weight keys is present in the checkpoint.
    """
    cached = _model_cache.get(model_path)
    if cached is not None:
        return cached

    logger.info(f"Loading RVC model: {model_path}")
    try:
        cpt = torch.load(model_path, map_location="cpu", weights_only=True)
    except Exception:
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects, which can
        # execute code embedded in an untrusted checkpoint.
        logger.warning("Model requires unsafe loading - may be an older format")
        cpt = torch.load(model_path, map_location="cpu", weights_only=False)

    # The state dict may live under any of these keys; the first match wins.
    weight_key = next((name for name in ("weight", "model", "state_dict", "net_g") if name in cpt), None)
    if weight_key is None:
        raise ValueError(f"Cannot find model weights. Keys: {list(cpt.keys())}")
    weights = cpt[weight_key]

    config = cpt.get("config", None)
    if config is None:
        config = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000]
        logger.warning("No config found, using v2 defaults")

    version = cpt.get("version", "v1")
    if_f0 = cpt.get("f0", 1)

    # Overwrite the third-from-last config entry with the row count of the
    # speaker embedding table actually stored in the weights.
    emb_weight = weights.get("emb_g.weight")
    if emb_weight is not None:
        config[-3] = emb_weight.shape[0]

    last_entry = config[-1]
    sr = last_entry if isinstance(last_entry, int) else 40000

    uses_f0 = if_f0 == 1
    if version == "v1":
        model_class = SynthesizerTrnMs256NSFsid if uses_f0 else SynthesizerTrnMs256NSFsid_nono
    else:
        model_class = SynthesizerTrnMs768NSFsid if uses_f0 else SynthesizerTrnMs768NSFsid_nono

    # Constructor keywords in the order they appear in the config list.
    arg_names = (
        "spec_channels", "segment_size", "inter_channels", "hidden_channels",
        "filter_channels", "n_heads", "n_layers", "kernel_size", "p_dropout",
        "resblock", "resblock_kernel_sizes", "resblock_dilation_sizes",
        "upsample_rates", "upsample_initial_channel", "upsample_kernel_sizes",
        "spk_embed_dim", "gin_channels",
    )
    model_kwargs = {name: config[pos] for pos, name in enumerate(arg_names)}
    model = model_class(**model_kwargs, sr=sr, is_half=False)

    # strict=False: keys that do not match between checkpoint and model are ignored.
    model.load_state_dict(weights, strict=False)
    model.eval().to(device)

    entry = (model, sr, version, if_f0)
    _model_cache[model_path] = entry
    logger.info(f"Model loaded: version={version}, f0={if_f0}, sr={sr}")

    return entry

# ============================================================
# TRAINING - Simplified for CPU testing
# ============================================================

def spectrogram_torch(y, n_fft, hop_size, win_size, center=False):
    """Return the magnitude spectrogram of the waveform batch `y`.

    `y` is reflect-padded by `(n_fft - hop_size) / 2` samples (truncated to int)
    on each side before the STFT. A small constant (1e-6) is added under the
    square root.
    """
    window = torch.hann_window(win_size).to(y.device)
    margin = int((n_fft - hop_size) / 2)
    # F.pad's reflect mode needs a channel axis, so add one temporarily.
    padded = F.pad(y.unsqueeze(1), (margin, margin), mode='reflect').squeeze(1)
    stft = torch.stft(
        padded,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=window,
        center=center,
        pad_mode='reflect',
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    return torch.sqrt(stft.real.pow(2) + stft.imag.pow(2) + 1e-6)

# Mel spectrogram for training loss
_mel_basis_cache = {}

def spec_to_mel_torch(spec, n_fft=2048, num_mels=125, sampling_rate=40000, fmin=0, fmax=None):
    """Project a linear spectrogram onto a mel filterbank and take the log.

    The filterbank is built with librosa once per combination of parameters,
    dtype and device, and kept in `_mel_basis_cache`. `fmax` defaults to half
    the sampling rate. The mel energies are clamped to at least 1e-5 before the log.
    """
    from librosa.filters import mel as librosa_mel_fn

    if fmax is None:
        fmax = sampling_rate // 2

    cache_key = f"{n_fft}_{num_mels}_{sampling_rate}_{fmin}_{fmax}_{spec.dtype}_{spec.device}"
    basis = _mel_basis_cache.get(cache_key)
    if basis is None:
        filterbank = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        basis = torch.from_numpy(filterbank).to(dtype=spec.dtype, device=spec.device)
        _mel_basis_cache[cache_key] = basis

    mel_energy = torch.matmul(basis, spec)
    return torch.log(torch.clamp(mel_energy, min=1e-5))  # Log-amplitude

def preprocess_audio_for_training(audio_path: str, output_dir: str, target_sr: int = 40000, f0_method: str = "rmvpe") -> Optional[str]:
    """Slice one audio file into training chunks and extract per-chunk features.

    For every non-silent chunk this writes `wavs/NNNN.wav`, `hubert/NNNN.npy`
    and `f0/NNNN.npy` (coarse and fine F0 stacked on axis 0) under `output_dir`,
    plus a `manifest.txt` listing the chunk ids, one per line.

    Args:
        audio_path: Audio file to preprocess.
        output_dir: Directory that receives the dataset (created if missing).
        target_sr: Sample rate the audio is loaded at and the wav chunks are saved at.
        f0_method: "rmvpe", "harvest", or anything else to select the pm extractor.

    Returns:
        `output_dir` on success, or None when no usable chunk was found.
    """
    import scipy.signal as signal

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(f"{output_dir}/wavs", exist_ok=True)
    os.makedirs(f"{output_dir}/hubert", exist_ok=True)
    os.makedirs(f"{output_dir}/f0", exist_ok=True)

    logger.info(f"Preprocessing: {audio_path}")

    # Load and resample audio
    audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)

    # High-pass filter
    bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=target_sr)
    audio = signal.lfilter(bh, ah, audio)

    # Slice into chunks (3.7 seconds with 0.3 overlap)
    chunk_size = int(3.7 * target_sr)
    hop = int(3.4 * target_sr)

    chunks = []
    # "+ 1" keeps the last full window when it ends exactly at the end of the
    # audio (including audio that is exactly one chunk long); without it that
    # window was silently dropped.
    for i, start in enumerate(range(0, len(audio) - chunk_size + 1, hop)):
        chunk = audio[start:start + chunk_size]
        # Normalize
        max_val = np.abs(chunk).max()
        if max_val > 0.01:  # Skip silence
            chunk = chunk / max_val * 0.9
            # Keep the window index so file names reflect the position in the source.
            chunks.append((i, chunk))

    if not chunks:
        logger.warning("No valid audio chunks found")
        return None

    logger.info(f"Created {len(chunks)} chunks")

    # Save chunks and extract features
    manifest = []
    for idx, chunk in chunks:
        # Save wav
        wav_path = f"{output_dir}/wavs/{idx:04d}.wav"
        sf.write(wav_path, chunk, target_sr)

        # Resample to 16k for HuBERT
        chunk_16k = librosa.resample(chunk, orig_sr=target_sr, target_sr=16000)

        # Extract HuBERT features
        feats = extract_hubert_features(chunk_16k, sr=16000, version="v2")
        hubert_path = f"{output_dir}/hubert/{idx:04d}.npy"
        np.save(hubert_path, feats.squeeze(0).cpu().numpy())

        # Extract F0
        if f0_method == "rmvpe":
            f0_coarse, f0 = extract_f0_rmvpe(chunk_16k, 16000, 0)
        elif f0_method == "harvest":
            f0_coarse, f0 = extract_f0_harvest(chunk_16k, 16000, 0)
        else:
            f0_coarse, f0 = extract_f0_pm(chunk_16k, 16000, 0)
        f0_path = f"{output_dir}/f0/{idx:04d}.npy"
        np.save(f0_path, np.stack([f0_coarse, f0], axis=0))

        manifest.append(f"{idx:04d}")

    # Save manifest
    with open(f"{output_dir}/manifest.txt", "w") as f:
        f.write("\n".join(manifest))

    logger.info(f"Preprocessing complete: {len(manifest)} samples")
    return output_dir

def train_rvc_generator(
    data_dir: str,
    output_dir: str,
    epochs: int = 10,
    batch_size: int = 2,
    lr: float = 1e-5,  # Lower LR prevents overfitting on small data
    target_sr: int = 40000,
    progress_callback=None
):
    """Train an RVC v2 model on a preprocessed dataset, yielding progress as it goes.

    Args:
        data_dir: Dataset directory containing `manifest.txt`, `wavs/`, `hubert/` and `f0/`.
        output_dir: Directory that receives `model.pth` and, when the index build
            succeeds, `model.index`.
        epochs: Number of passes over the dataset.
        batch_size: Samples per optimisation step.
        lr: Learning rate for both AdamW optimisers.
        target_sr: Sample rate of the training wavs; also selects the pretrained weights.
        progress_callback: Optional callable invoked as `(fraction, message)` after each epoch.

    Yields:
        `(message, ckpt_path, index_path)` 3-tuples. After each epoch both paths
        are None; the last item is `("Training complete!", ckpt_path, index_path)`,
        where `index_path` is None if the index could not be built. Nothing is
        yielded when the manifest is empty.
    """
    logger.info(f"Starting training: {data_dir} -> {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

    # Load manifest
    with open(f"{data_dir}/manifest.txt") as f:
        samples = [l.strip() for l in f if l.strip()]

    if len(samples) < 1:
        logger.error("No training samples found")
        return None

    logger.info(f"Training with {len(samples)} samples")

    # Model config (v2 40k defaults)
    config = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 1, 256, target_sr]

    # Create models (v2 only - 768-dim HuBERT features)
    net_g = SynthesizerTrnMs768NSFsid(
        spec_channels=config[0], segment_size=config[1], inter_channels=config[2],
        hidden_channels=config[3], filter_channels=config[4], n_heads=config[5],
        n_layers=config[6], kernel_size=config[7], p_dropout=config[8],
        resblock=config[9], resblock_kernel_sizes=config[10],
        resblock_dilation_sizes=config[11], upsample_rates=config[12],
        upsample_initial_channel=config[13], upsample_kernel_sizes=config[14],
        spk_embed_dim=config[15], gin_channels=config[16], sr=target_sr
    ).to(train_device)

    net_d = MultiPeriodDiscriminator().to(train_device)
    logger.info(f"Training on device: {train_device}")

    # Download and load pretrained weights (essential for good results)
    sr_key = f"{target_sr // 1000}k"  # e.g., "40k"
    try:
        pretrain_g_path = download_pretrained_rvc(f"f0G{sr_key}")
        pretrain_d_path = download_pretrained_rvc(f"f0D{sr_key}")
        load_pretrained_weights(net_g, pretrain_g_path)
        load_pretrained_weights(net_d, pretrain_d_path)
    except Exception as e:
        logger.warning(f"Failed to load pretrained weights: {e}")
        logger.warning("Training from scratch (results may be poor)")

    # Optimizers (after loading pretrained weights)
    optim_g = torch.optim.AdamW(net_g.parameters(), lr=lr, betas=(0.8, 0.99))
    optim_d = torch.optim.AdamW(net_d.parameters(), lr=lr, betas=(0.8, 0.99))

    # LR scheduler (matches Applio - exponential decay)
    lr_decay = 0.999875
    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=lr_decay)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=lr_decay)

    net_g.train()
    net_d.train()

    # Training loop
    for epoch in range(epochs):
        total_loss_g, total_loss_d = 0, 0
        # Count only batches that completed an optimisation step, so the epoch
        # averages are right for partial final batches and skipped batches.
        n_batches = 0
        np.random.shuffle(samples)

        for i in range(0, len(samples), batch_size):
            batch_samples = samples[i:i+batch_size]

            # Load batch data
            wavs, huberts, f0s = [], [], []
            for s in batch_samples:
                wav, _ = librosa.load(f"{data_dir}/wavs/{s}.wav", sr=target_sr, mono=True)
                hubert = np.load(f"{data_dir}/hubert/{s}.npy")
                # Upsample 50Hz -> 100Hz using interpolation (same as inference)
                hubert_t = torch.from_numpy(hubert).unsqueeze(0).permute(0, 2, 1)  # (1, 768, seq)
                hubert_t = F.interpolate(hubert_t, scale_factor=2, mode='linear', align_corners=False)
                hubert = hubert_t.permute(0, 2, 1).squeeze(0).numpy()  # (seq*2, 768)
                f0_data = np.load(f"{data_dir}/f0/{s}.npy")
                wavs.append(wav)
                huberts.append(hubert)
                f0s.append(f0_data)

            # Compute spectrogram first to get target length
            max_wav_len = max(len(w) for w in wavs)
            wav_batch = np.zeros((len(wavs), max_wav_len))
            for j, w in enumerate(wavs):
                wav_batch[j, :len(w)] = w
            wav_t = torch.FloatTensor(wav_batch).unsqueeze(1).to(train_device)
            spec = spectrogram_torch(wav_t.squeeze(1), 2048, 400, 2048)
            spec_len = spec.shape[2]  # Target length for all features

            # Pad/truncate features to match spec length exactly
            hubert_batch = np.zeros((len(huberts), spec_len, huberts[0].shape[1]))
            f0_batch = np.zeros((len(f0s), spec_len))
            f0f_batch = np.zeros((len(f0s), spec_len))

            for j, (h, f) in enumerate(zip(huberts, f0s)):
                # Truncate or pad HuBERT to spec_len
                h_len = min(h.shape[0], spec_len)
                hubert_batch[j, :h_len] = h[:h_len]
                # Truncate or pad F0 to spec_len (row 0 = coarse, row 1 = fine)
                f0_len = min(f.shape[1], spec_len)
                f0_batch[j, :f0_len] = f[0, :f0_len]
                f0f_batch[j, :f0_len] = f[1, :f0_len]

            # To tensors - all features now have spec_len
            hubert_t = torch.FloatTensor(hubert_batch).to(train_device)
            f0_t = torch.LongTensor(f0_batch.astype(np.int64)).to(train_device)
            f0f_t = torch.FloatTensor(f0f_batch).to(train_device)
            lengths_t = torch.LongTensor([spec_len] * len(batch_samples)).to(train_device)
            sid_t = torch.LongTensor([0] * len(batch_samples)).to(train_device)
            spec_lengths = torch.LongTensor([spec_len] * len(batch_samples)).to(train_device)

            # Forward pass generator
            # Args: phone, phone_lengths, pitch, pitchf, y (spec), y_lengths, ds
            try:
                y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(
                    hubert_t, lengths_t, f0_t, f0f_t, spec, spec_lengths, sid_t
                )
            except Exception as e:
                # Best effort: a failing batch is logged and skipped, not fatal.
                logger.warning(f"Generator forward failed: {e}")
                continue

            # Slice wav at same position model generated (CRITICAL for proper loss)
            # ids_slice is in latent space, multiply by hop_length to get waveform position
            hop_length = 400
            segment_size_wav = 32 * hop_length  # segment_size in latent * hop_length
            y = slice_segments(wav_t, ids_slice * hop_length, segment_size_wav)

            # Discriminator forward (generator output detached so only net_d gets gradients)
            y_d_rs, y_d_gs, fmap_rs, fmap_gs = net_d(y, y_hat.detach())

            # Discriminator loss
            loss_d = discriminator_loss(y_d_rs, y_d_gs)

            optim_d.zero_grad()
            loss_d.backward()
            optim_d.step()

            # Generator loss
            y_d_rs, y_d_gs, fmap_rs, fmap_gs = net_d(y, y_hat)
            loss_gen = generator_loss(y_d_gs)
            loss_fm = feature_loss(fmap_rs, fmap_gs)
            loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask)

            # Mel spectrogram loss (crucial for quality)
            # Config: n_fft=2048, hop=400, win=2048, n_mels=125, fmin=0, fmax=None
            y_mel = spec_to_mel_torch(spectrogram_torch(y.squeeze(1), 2048, 400, 2048),
                                      n_fft=2048, num_mels=125, sampling_rate=target_sr, fmin=0, fmax=None)
            y_hat_mel = spec_to_mel_torch(spectrogram_torch(y_hat.squeeze(1), 2048, 400, 2048),
                                          n_fft=2048, num_mels=125, sampling_rate=target_sr, fmin=0, fmax=None)
            # Align lengths if needed
            min_len = min(y_mel.shape[2], y_hat_mel.shape[2])
            loss_mel = F.l1_loss(y_mel[:, :, :min_len], y_hat_mel[:, :, :min_len]) * 45  # c_mel = 45

            loss_g = loss_gen + loss_fm + loss_mel + loss_kl

            optim_g.zero_grad()
            loss_g.backward()
            optim_g.step()

            total_loss_g += loss_g.item()
            total_loss_d += loss_d.item()
            n_batches += 1

        # max(1, ...) keeps the message well-defined when every batch was skipped.
        avg_loss_g = total_loss_g / max(1, n_batches)
        avg_loss_d = total_loss_d / max(1, n_batches)
        epoch_msg = f"Epoch {epoch+1}/{epochs} - G: {avg_loss_g:.2f}, D: {avg_loss_d:.2f}"
        logger.info(epoch_msg)

        # Update progress callback if provided
        if progress_callback:
            progress_pct = 0.30 + (0.65 * (epoch + 1) / epochs)
            progress_callback(progress_pct, epoch_msg)

        # Yield epoch message for live UI updates
        yield epoch_msg, None, None

        # Step LR schedulers
        scheduler_g.step()
        scheduler_d.step()

    # Save checkpoint
    ckpt_path = f"{output_dir}/model.pth"
    torch.save({
        "weight": net_g.state_dict(),
        "config": config,
        "version": "v2",  # v2 only
        "f0": 1,
    }, ckpt_path)
    logger.info(f"Saved checkpoint: {ckpt_path}")

    # Generate index file for better speaker similarity
    index_path = None
    try:
        import faiss
        hubert_dir = f"{data_dir}/hubert"
        npys = []
        for name in sorted(os.listdir(hubert_dir)):
            if name.endswith('.npy'):
                phone = np.load(os.path.join(hubert_dir, name))
                npys.append(phone)
        if npys:
            big_npy = np.concatenate(npys, axis=0)
            n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
            n_ivf = max(1, n_ivf)  # Ensure at least 1
            index = faiss.index_factory(big_npy.shape[1], f"IVF{n_ivf},Flat")
            index.train(big_npy)
            index.add(big_npy)
            index_path = f"{output_dir}/model.index"
            faiss.write_index(index, index_path)
            logger.info(f"Saved index: {index_path}")
    except Exception as e:
        # The index is optional; training output is still usable without it.
        logger.warning(f"Failed to generate index: {e}")

    # Cleanup training models to free memory
    purge_memory(net_g, net_d, optim_g, optim_d, scheduler_g, scheduler_d)

    yield "Training complete!", ckpt_path, index_path


def train_rvc(
    data_dir: str,
    output_dir: str,
    epochs: int = 10,
    batch_size: int = 2,
    lr: float = 1e-5,  # Lower LR prevents overfitting on small data
    target_sr: int = 40000,
    progress_callback=None
):
    """Run `train_rvc_generator` to completion and return `(checkpoint_path, index_path)`.

    Each returned element is the last truthy value the generator produced for
    it, or None if it never produced one.
    """
    ckpt, idx = None, None
    updates = train_rvc_generator(data_dir, output_dir, epochs, batch_size, lr, target_sr, progress_callback)
    for _msg, path, index in updates:
        # Per-epoch updates carry None for both paths; keep what we already have.
        ckpt = path or ckpt
        idx = index or idx
    return ckpt, idx

# ============================================================
# INFERENCE
# ============================================================

def convert_voice(
    source_audio: str,
    model_file,
    index_file=None,
    pitch_shift: int = 0,
    f0_method: str = "pm",
    index_rate: float = 0.5,
    protect: float = 0.33,
    volume_envelope: float = 1.0,
    progress=gr.Progress()
) -> Tuple[Optional[str], str]:
    """Convert voice using RVC model (Applio-compatible pipeline).

    Args:
        source_audio: Path of the audio to convert.
        model_file: RVC checkpoint path, or an object exposing the path as `.name`.
        index_file: Optional faiss index path (or object with `.name`) for feature retrieval.
        pitch_shift: Pitch shift in semitones passed to the F0 extractor.
        f0_method: "rmvpe", "harvest", or anything else to select the pm extractor.
        index_rate: Blend weight of retrieved features (0 disables retrieval).
        protect: Weight of the converted features on unvoiced frames; values of
            0.5 or more disable the protect blend.
        volume_envelope: 1.0 leaves the output loudness alone; other values mix
            the source RMS envelope into the output.
        progress: Progress reporter called as `progress(fraction, message)`.

    Returns:
        `(output_path, status_message)`. `output_path` is None when an input is
        missing or the conversion fails; failures are logged, not raised.
    """
    try:
        if source_audio is None:
            return None, "Please upload source audio"
        if model_file is None:
            return None, "Please upload RVC model (.pth)"

        model_path = model_file.name if hasattr(model_file, 'name') else model_file

        progress(0.1, "Loading model...")
        model, tgt_sr, version, if_f0 = load_rvc_model(model_path)

        progress(0.2, "Loading audio...")
        audio, sr = librosa.load(source_audio, sr=16000, mono=True)

        # Apply 48Hz high-pass filter (critical - removes low-frequency artifacts)
        audio = signal.filtfilt(bh, ah, audio)

        # Normalize audio
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max

        # Pipeline constants (same as Applio)
        window = 160  # Critical for feature/pitch alignment
        x_pad = 1  # Padding in seconds
        t_pad = 16000 * x_pad  # Padding in samples

        # Pad audio
        audio_pad = np.pad(audio, (t_pad, t_pad), mode="reflect")

        progress(0.3, "Extracting features...")
        feats = extract_hubert_features(audio_pad, sr=16000, version=version)

        # Save original features for protect mechanism
        feats0 = feats.clone() if if_f0 == 1 and protect < 0.5 else None

        # Index retrieval (speaker similarity)
        if index_file is not None and index_rate > 0:
            try:
                import faiss
                index_path = index_file.name if hasattr(index_file, 'name') else index_file
                progress(0.4, "Loading index...")
                index = faiss.read_index(index_path)
                big_npy = index.reconstruct_n(0, index.ntotal)

                npy = feats[0].cpu().numpy().astype("float32")
                score, ix = index.search(npy, k=8)
                # Inverse-square-distance weights over the 8 nearest neighbours.
                weight = np.square(1 / score)
                weight /= weight.sum(axis=1, keepdims=True)
                npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
                feats = torch.from_numpy(npy).unsqueeze(0).to(device) * index_rate + (1 - index_rate) * feats
            except Exception as e:
                # Retrieval is optional; fall back to the raw features.
                logger.warning(f"Index retrieval failed: {e}")

        # Feature upsampling by 2x
        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)

        # Adjust length based on audio
        p_len = min(audio_pad.shape[0] // window, feats.shape[1])

        pitch, pitchf = None, None
        if if_f0 == 1:
            progress(0.5, f"Extracting F0 ({f0_method})...")
            if f0_method == "rmvpe":
                pitch, pitchf = extract_f0_rmvpe(audio_pad, 16000, pitch_shift)
            elif f0_method == "harvest":
                pitch, pitchf = extract_f0_harvest(audio_pad, 16000, pitch_shift)
            else:
                pitch, pitchf = extract_f0_pm(audio_pad, 16000, pitch_shift)

            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]

            # Bring the pitch tracks to exactly p_len frames BEFORE the protect
            # blend: the blend multiplies per-frame against feats[:, :p_len], so
            # a shorter pitch track would fail with a shape mismatch.
            if len(pitch) < p_len:
                pitch = np.pad(pitch, (0, p_len - len(pitch)))
                pitchf = np.pad(pitchf, (0, p_len - len(pitchf)))

            # Upsample feats0 for protect
            if feats0 is not None:
                feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)

            # Apply protect mechanism (preserve original features for unvoiced segments)
            if protect < 0.5 and feats0 is not None:
                pitchf_tensor = torch.from_numpy(pitchf).float().to(device)
                pitchff = pitchf_tensor.clone()
                pitchff[pitchf_tensor > 0] = 1
                pitchff[pitchf_tensor < 1] = protect
                pitchff = pitchff.unsqueeze(0).unsqueeze(-1)
                feats = feats[:, :p_len, :] * pitchff + feats0[:, :p_len, :] * (1 - pitchff)

            pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
            pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)

        p_len_tensor = torch.LongTensor([p_len]).to(device)
        sid = torch.LongTensor([0]).to(device)

        progress(0.7, "Running inference...")
        with torch.no_grad():
            if if_f0 == 1:
                audio_out = model.infer(feats[:, :p_len, :], p_len_tensor, pitch, pitchf, sid)[0][0, 0].data.cpu().float().numpy()
            else:
                audio_out = model.infer(feats[:, :p_len, :], p_len_tensor, sid)[0][0, 0].data.cpu().float().numpy()

        # Remove padding from output
        t_pad_tgt = int(t_pad * tgt_sr / 16000)
        if len(audio_out) > 2 * t_pad_tgt:
            audio_out = audio_out[t_pad_tgt:-t_pad_tgt]

        # RMS mixing - match volume dynamics of source audio
        if volume_envelope != 1.0:
            try:
                source_at_tgt_sr = librosa.resample(audio, orig_sr=16000, target_sr=tgt_sr)
                frame_len = tgt_sr // 2 * 2
                hop_len = tgt_sr // 2

                rms_source = librosa.feature.rms(y=source_at_tgt_sr, frame_length=frame_len, hop_length=hop_len)
                rms_output = librosa.feature.rms(y=audio_out, frame_length=frame_len, hop_length=hop_len)

                rms_source = F.interpolate(
                    torch.from_numpy(rms_source).float().unsqueeze(0),
                    size=audio_out.shape[0], mode="linear"
                ).squeeze()
                rms_output = F.interpolate(
                    torch.from_numpy(rms_output).float().unsqueeze(0),
                    size=audio_out.shape[0], mode="linear"
                ).squeeze()
                # Floor the output RMS so the negative power below cannot divide by zero.
                rms_output = torch.maximum(rms_output, torch.zeros_like(rms_output) + 1e-6)

                # Applio formula: target * (source^(1-rate) * output^(rate-1))
                audio_out = audio_out * (torch.pow(rms_source, 1 - volume_envelope) * torch.pow(rms_output, volume_envelope - 1)).numpy()
            except Exception as e:
                # Best effort: keep the unmixed output if the envelope step fails.
                logger.warning(f"RMS mixing failed: {e}")

        # Final normalization
        audio_max = np.abs(audio_out).max() / 0.99
        if audio_max > 1:
            audio_out /= audio_max

        progress(0.9, "Saving output...")
        # The file is intentionally left on disk: its path is the return value.
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        sf.write(output_path, audio_out, tgt_sr)

        # Aggressive memory purge after inference — frees glibc arena on Linux
        _model_cache.clear()
        _cleanup_args = [model, feats, audio_out, audio, audio_pad]
        if feats0 is not None:
            _cleanup_args.append(feats0)
        purge_memory(*_cleanup_args)

        return output_path, f"Converted: {version}, sr={tgt_sr}, pitch={pitch_shift:+d}"

    except Exception as e:
        logger.exception("Conversion failed")
        _model_cache.clear()
        purge_memory()
        return None, f"Error: {str(e)}"

# ============================================================
# DEFAULT MODEL DOWNLOAD
# ============================================================

def load_example_model():
    """Fetch the default model and index from HuggingFace and stage them in a temp dir.

    Returns:
        `(model_path, index_path, status)` on success, or `(None, None, "Error: ...")`
        when the download or copy fails (the failure is logged, not raised).
    """
    import shutil
    try:
        logger.info(f"Downloading example model from {DEFAULT_MODEL_REPO}...")
        cached_files = [
            hf_hub_download(repo_id=DEFAULT_MODEL_REPO, filename=DEFAULT_MODEL_FILE),
            hf_hub_download(repo_id=DEFAULT_MODEL_REPO, filename=DEFAULT_INDEX_FILE),
        ]

        # Gradio 6 requires files to be in allowed directories (cwd or /tmp),
        # so copy them out of the HF cache into a fresh temp directory.
        staging_dir = tempfile.mkdtemp()
        staged_files = [
            os.path.join(staging_dir, DEFAULT_MODEL_FILE),
            os.path.join(staging_dir, DEFAULT_INDEX_FILE),
        ]
        for cached, staged in zip(cached_files, staged_files):
            shutil.copy2(cached, staged)

        temp_model, temp_index = staged_files
        return temp_model, temp_index, f"Loaded: {DEFAULT_MODEL_REPO}"
    except Exception as e:
        logger.exception("Failed to download example model")
        return None, None, f"Error: {str(e)}"

# ============================================================
# BEATRICE V2 MODEL
# ============================================================

def beatrice_load_audio(file, **kwargs):
    """Read `file` with soundfile and return `(waveform, sample_rate)`.

    The waveform is a float32 torch tensor laid out as (channels, samples);
    mono input gets a leading channel axis. Extra keyword arguments are
    accepted and ignored.
    """
    samples, rate = sf.read(file, dtype='float32')
    tensor = torch.from_numpy(samples)
    # soundfile gives (samples,) for mono and (samples, channels) otherwise.
    tensor = tensor.unsqueeze(0) if tensor.ndim == 1 else tensor.T
    return tensor, rate


class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        # Populate the mapping exactly like a plain dict ...
        super().__init__(*args, **kwargs)
        # ... then alias the instance namespace to the mapping itself, so that
        # ``obj.key`` and ``obj["key"]`` refer to the same storage.
        self.__dict__ = self



def dump_params(params: Optional[torch.Tensor], f: BinaryIO):
    """Append the raw values of ``params`` to the binary stream ``f``.

    Args:
        params: Tensor to serialize, or ``None`` (e.g. a missing bias), in
            which case nothing is written.
        f: Writable binary stream; it is flushed after the write.

    Values are written in logical (row-major) order using the host byte order.
    bfloat16 tensors are written as 2 bytes per element: each value is widened
    to float32 and only the upper 16 bits are kept.
    """
    if params is None:
        return
    # Move to host memory first: ``Tensor.numpy()`` only works on CPU tensors.
    tensor = params.detach().cpu()
    if tensor.dtype == torch.bfloat16:
        # Flatten before reinterpreting so that 0-dim and non-contiguous
        # tensors are supported (``view(dtype)`` rejects 0-dim input).
        halves = tensor.float().reshape(-1).view(torch.short).numpy()
        # Each float32 becomes two int16 words; the odd-indexed word is the
        # high half. NOTE(review): this assumes a little-endian host.
        raw = halves[1::2]
    else:
        raw = tensor.numpy().ravel()
    f.write(raw.tobytes())
    f.flush()


def dump_layer(layer: nn.Module, f: BinaryIO):
    """Serialize the parameters of ``layer`` to the binary stream ``f``.

    Dispatch order:
      * objects exposing a ``dump`` method serialize themselves;
      * ``nn.Linear`` / ``nn.Conv1d`` / ``nn.LayerNorm``: weight, then bias;
      * ``nn.MultiheadAttention``: input projection regrouped per head with the
        query and key parts pre-scaled, then the output projection;
      * ``nn.Embedding``: weight;
      * ``nn.Parameter``: the parameter itself;
      * ``nn.ModuleList``: every child, in order.

    Raises:
        AssertionError: If ``layer`` is of an unsupported type. This is raised
            explicitly (not via ``assert``) so that running with ``-O`` cannot
            silently produce a truncated dump.
    """
    if hasattr(layer, "dump"):
        layer.dump(f)
    elif isinstance(layer, (nn.Linear, nn.Conv1d, nn.LayerNorm)):
        dump_params(layer.weight, f)
        dump_params(layer.bias, f)
    elif isinstance(layer, nn.MultiheadAttention):
        embed_dim = layer.embed_dim
        num_heads = layer.num_heads
        head_dim = embed_dim // num_heads
        # Query and key each receive head_dim ** -0.25, so their dot product
        # carries the combined head_dim ** -0.5 factor.
        qk_scale = 1.0 / math.sqrt(math.sqrt(head_dim))
        # [3 * embed_dim, embed_dim]; rows are ordered q, k, v
        in_proj_weight = layer.in_proj_weight.data.clone()
        in_proj_weight[: 2 * embed_dim] *= qk_scale
        in_proj_weight = in_proj_weight.view(3, num_heads, head_dim, embed_dim)
        # [num_heads, 3, embed_dim / num_heads, embed_dim]
        in_proj_weight = in_proj_weight.transpose(0, 1)
        # [3 * embed_dim]
        in_proj_bias = layer.in_proj_bias.data.clone()
        in_proj_bias[: 2 * embed_dim] *= qk_scale
        in_proj_bias = in_proj_bias.view(3, num_heads, head_dim)
        # [num_heads, 3, embed_dim / num_heads]
        in_proj_bias = in_proj_bias.transpose(0, 1)
        dump_params(in_proj_weight, f)
        dump_params(in_proj_bias, f)
        dump_params(layer.out_proj.weight, f)
        dump_params(layer.out_proj.bias, f)
    elif isinstance(layer, nn.Embedding):
        dump_params(layer.weight, f)
    elif isinstance(layer, nn.Parameter):
        dump_params(layer, f)
    elif isinstance(layer, nn.ModuleList):
        for child in layer:
            dump_layer(child, f)
    else:
        raise AssertionError(layer)


class CausalConv1d(nn.Conv1d):
    """1-D convolution that pads symmetrically and then trims the output tail.

    With ``delay=0`` the output at step ``t`` depends only on inputs up to
    ``t``; each unit of ``delay`` lets the layer see one more future step.
    ``delay`` may not exceed half of ``(kernel_size - 1) * dilation``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        delay: int = 0,
    ):
        span = (kernel_size - 1) * dilation
        trim = span - 2 * delay
        # Number of trailing output steps dropped in forward()
        self.trim = trim
        if trim < 0:
            raise ValueError
        super().__init__(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=span - delay,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Convolve ``input`` ([batch, channels, length]) and drop the last ``trim`` steps."""
        convolved = super().forward(input)
        if self.trim != 0:
            convolved = convolved[:, :, : -self.trim]
        return convolved


class WSConv1d(CausalConv1d):
    """Causal 1-D convolution with weight standardization and a learnable gain.

    The effective kernel is the stored weight with each output channel
    mean-centred and rescaled by ``rsqrt(fan_in * var + 1e-8)``, multiplied by
    the per-output-channel ``gain``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        delay: int = 0,
    ):
        super().__init__(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            groups=groups,
            bias=bias,
            delay=delay,
        )
        fan_in = in_channels * kernel_size // groups
        self.weight.data.normal_(0.0, math.sqrt(1.0 / fan_in))
        if bias:
            self.bias.data.zero_()
        self.gain = nn.Parameter(torch.ones((out_channels, 1, 1)))

    def standardized_weight(self) -> torch.Tensor:
        """Return the standardized, gain-scaled kernel actually used by ``forward``."""
        # Statistics are taken per output channel, over (in_channels / groups, kernel)
        var, mean = torch.var_mean(self.weight, [1, 2], keepdim=True)
        fan_in = self.in_channels * self.kernel_size[0] // self.groups
        inv_std = (fan_in * var + 1e-8).rsqrt()
        return self.gain * inv_std * (self.weight - mean)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Convolve with the standardized kernel, then drop the last ``trim`` steps."""
        convolved = F.conv1d(
            input,
            self.standardized_weight(),
            self.bias,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            groups=self.groups,
        )
        if self.trim != 0:
            convolved = convolved[:, :, : -self.trim]
        return convolved

    def merge_weights(self):
        """Overwrite the stored weight with the standardized kernel and reset ``gain`` to 1."""
        merged = self.standardized_weight().detach()
        self.weight.data.copy_(merged)
        self.gain.data.fill_(1.0)


class WSLinear(nn.Linear):
    """Linear layer with weight standardization and a learnable per-output gain.

    The effective weight is the stored weight with each output row
    mean-centred and rescaled by ``rsqrt(in_features * var + 1e-8)``,
    multiplied by ``gain``.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__(in_features, out_features, bias)
        self.weight.data.normal_(0.0, math.sqrt(1.0 / in_features))
        # nn.Linear registers ``bias`` as None when bias=False, so guard the
        # zero-init (mirrors the ``if bias`` check in WSConv1d).
        if self.bias is not None:
            self.bias.data.zero_()
        self.gain = nn.Parameter(torch.ones((out_features, 1)))

    def standardized_weight(self) -> torch.Tensor:
        """Return the standardized, gain-scaled weight actually used by ``forward``."""
        # Statistics are taken per output row, over the input features
        var, mean = torch.var_mean(self.weight, 1, keepdim=True)
        scale = self.gain * (self.in_features * var + 1e-8).rsqrt()
        return scale * (self.weight - mean)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply the affine map using the standardized weight."""
        return F.linear(input, self.standardized_weight(), self.bias)

    def merge_weights(self):
        """Overwrite the stored weight with the standardized weight and reset ``gain`` to 1."""
        self.weight.data[:] = self.standardized_weight().detach()
        self.gain.data.fill_(1.0)


class CrossAttention(nn.Module):
    """Multi-head cross-attention with independent query/key and value/output widths.

    Queries are projected from ``in_q_channels`` to ``qk_channels``; keys and
    values are produced by one shared projection from ``in_kv_channels`` to
    ``qk_channels + vo_channels``. The attended values are projected to
    ``out_channels``. All projections are initialised with zero bias and
    normally distributed weights of variance ``1 / fan_in``.
    """

    def __init__(
        self,
        qk_channels: int,
        vo_channels: int,
        num_heads: int,
        in_q_channels: int,
        in_kv_channels: int,
        out_channels: int,
        dropout: float = 0.0,
    ):
        super().__init__()
        assert qk_channels % num_heads == 0
        self.qk_channels = qk_channels
        self.vo_channels = vo_channels
        self.num_heads = num_heads
        self.in_q_channels = in_q_channels
        self.in_kv_channels = in_kv_channels
        self.out_channels = out_channels
        self.dropout = dropout
        self.head_qk_channels = qk_channels // num_heads
        self.head_vo_channels = vo_channels // num_heads
        self.q_projection = nn.Linear(in_q_channels, qk_channels)
        self.q_projection.weight.data.normal_(0.0, math.sqrt(1.0 / in_q_channels))
        self.q_projection.bias.data.zero_()
        self.kv_projection = nn.Linear(in_kv_channels, qk_channels + vo_channels)
        self.kv_projection.weight.data.normal_(0.0, math.sqrt(1.0 / in_kv_channels))
        self.kv_projection.bias.data.zero_()
        self.out_projection = nn.Linear(vo_channels, out_channels)
        self.out_projection.weight.data.normal_(0.0, math.sqrt(1.0 / vo_channels))
        self.out_projection.bias.data.zero_()

    def forward(
        self,
        q: torch.Tensor,
        kv: torch.Tensor,
    ) -> torch.Tensor:
        """Attend from ``q`` over ``kv``.

        Args:
            q: [batch_size, q_length, in_q_channels]
            kv: [batch_size, kv_length, in_kv_channels]

        Returns:
            [batch_size, q_length, out_channels]
        """
        batch_size, q_length, _ = q.size()
        _, kv_length, _ = kv.size()
        # [batch_size, q_length, qk_channels]
        q = self.q_projection(q)
        # [batch_size, kv_length, qk_channels + vo_channels]
        kv = self.kv_projection(kv)
        # [batch_size, kv_length, qk_channels], [batch_size, kv_length, vo_channels]
        k, v = kv.split([self.qk_channels, self.vo_channels], dim=2)
        # Split channels into heads: [batch_size, num_heads, length, head_channels]
        q = q.view(
            batch_size, q_length, self.num_heads, self.head_qk_channels
        ).transpose(1, 2)
        k = k.view(
            batch_size, kv_length, self.num_heads, self.head_qk_channels
        ).transpose(1, 2)
        v = v.view(
            batch_size, kv_length, self.num_heads, self.head_vo_channels
        ).transpose(1, 2)
        # The functional API applies dropout whenever dropout_p > 0, regardless
        # of module mode, so it must be switched off explicitly outside training.
        dropout_p = self.dropout if self.training else 0.0
        # [batch_size, num_heads, q_length, head_vo_channels]
        attn_out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
        # [batch_size, q_length, vo_channels]
        attn_out = (
            attn_out.transpose(1, 2)
            .contiguous()
            .view(batch_size, q_length, self.vo_channels)
        )
        # [batch_size, q_length, out_channels]
        attn_out = self.out_projection(attn_out)
        return attn_out

    def dump(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        """Write the query-side weights: scaled q projection, then the output projection.

        ``f`` may be a writable binary stream or a path (opened in ``"wb"`` mode).

        Raises:
            TypeError: If ``f`` is neither a path nor an object with ``write``.
        """
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as stream:
                self.dump(stream)
            return
        if not hasattr(f, "write"):
            raise TypeError

        # Query and key are each scaled by head_qk_channels ** -0.25 (the key
        # half is applied in dump_kv), giving the combined ** -0.5 factor on
        # their dot product.
        qk_scale = 1.0 / math.sqrt(math.sqrt(self.head_qk_channels))
        q_projection_weight = self.q_projection.weight.data.clone()
        q_projection_bias = self.q_projection.bias.data.clone()
        q_projection_weight *= qk_scale
        q_projection_bias *= qk_scale
        dump_params(q_projection_weight, f)
        dump_params(q_projection_bias, f)
        dump_layer(self.out_projection, f)

    def dump_kv(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        """Write the key/value projection, grouped per head.

        Layout: for every head the key weight then the value weight, followed
        by, for every head, the key bias then the value bias. Key weights and
        biases are pre-scaled by ``head_qk_channels ** -0.25``.

        ``f`` may be a writable binary stream or a path (opened in ``"wb"`` mode).

        Raises:
            TypeError: If ``f`` is neither a path nor an object with ``write``.
        """
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as stream:
                self.dump_kv(stream)
            return
        if not hasattr(f, "write"):
            raise TypeError

        qk_scale = 1.0 / math.sqrt(math.sqrt(self.head_qk_channels))
        kv_projection_weight = self.kv_projection.weight.data.clone()
        kv_projection_bias = self.kv_projection.bias.data.clone()
        # The splits are views into the clones above, so the in-place scaling
        # below never touches the live parameters.
        k_projection_weight, v_projection_weight = kv_projection_weight.split(
            [self.qk_channels, self.vo_channels]
        )
        k_projection_bias, v_projection_bias = kv_projection_bias.split(
            [self.qk_channels, self.vo_channels]
        )
        k_projection_weight *= qk_scale
        k_projection_bias *= qk_scale
        # [qk_channels, in_kv_channels] -> [num_heads, head_qk_channels, in_kv_channels]
        k_projection_weight = k_projection_weight.view(
            self.num_heads, self.head_qk_channels, self.in_kv_channels
        )
        # [qk_channels] -> [num_heads, head_qk_channels]
        k_projection_bias = k_projection_bias.view(
            self.num_heads, self.head_qk_channels
        )
        # [vo_channels, in_kv_channels] -> [num_heads, head_vo_channels, in_kv_channels]
        v_projection_weight = v_projection_weight.view(
            self.num_heads, self.head_vo_channels, self.in_kv_channels
        )
        # [vo_channels] -> [num_heads, head_vo_channels]
        v_projection_bias = v_projection_bias.view(
            self.num_heads, self.head_vo_channels
        )
        for head in range(self.num_heads):
            # [head_qk_channels, in_kv_channels]
            dump_params(k_projection_weight[head], f)
            # [head_vo_channels, in_kv_channels]
            dump_params(v_projection_weight[head], f)
        for head in range(self.num_heads):
            # [head_qk_channels]
            dump_params(k_projection_bias[head], f)
            # [head_vo_channels]
            dump_params(v_projection_bias[head], f)


class ConvNeXtBlock(nn.Module):
    """Residual ConvNeXt-style block operating on [batch_size, channels, length].

    The block optionally starts with an attention sub-block (self-attention via
    ``nn.MultiheadAttention`` or cross-attention via ``CrossAttention``) with its
    own residual connection, followed by: depthwise causal convolution ->
    normalization -> pointwise expansion -> GELU -> pointwise projection,
    again with a residual connection.

    Variants selected in the constructor:
      * ``use_weight_standardization``: the LayerNorm becomes ``nn.Identity``,
        the conv/linear layers are replaced by ``WSConv1d`` / ``WSLinear`` and
        the ``gamma`` layer-scale parameter is removed.
      * ``enable_scaling``: the branch input is multiplied by the fixed
        ``pre_scale`` and the branch output by ``post_scale`` times the
        learnable scalar ``post_scale_weight``.
    """

    def __init__(
        self,
        channels: int,
        intermediate_channels: int,
        layer_scale_init_value: float,
        kernel_size: int = 7,
        use_weight_standardization: bool = False,
        enable_scaling: bool = False,
        pre_scale: float = 1.0,
        post_scale: float = 1.0,
        use_mha: bool = False,
        cross_attention: bool = False,
        num_heads: int = 4,
        attention_dropout: float = 0.1,
        attention_channels: Optional[int] = None,
        kv_channels: Optional[int] = None,
    ):
        super().__init__()
        self.use_weight_standardization = use_weight_standardization
        self.enable_scaling = enable_scaling
        self.use_mha = use_mha
        self.cross_attention = cross_attention
        if use_mha:
            # Pre-attention normalization (applied to the query side only in
            # the cross-attention case; see forward()).
            self.attn_norm = nn.LayerNorm(channels)
            if cross_attention:
                self.mha = CrossAttention(
                    qk_channels=attention_channels,
                    vo_channels=attention_channels,
                    num_heads=num_heads,
                    in_q_channels=channels,
                    in_kv_channels=kv_channels,
                    out_channels=channels,
                    dropout=attention_dropout,
                )
            else:  # self-attention
                # These options are only meaningful for cross-attention.
                assert attention_channels is None
                assert kv_channels is None
                self.mha = nn.MultiheadAttention(
                    embed_dim=channels,
                    num_heads=num_heads,
                    dropout=attention_dropout,
                    batch_first=True,
                )
        # Depthwise convolution: groups == channels.
        self.dwconv = CausalConv1d(
            channels, channels, kernel_size=kernel_size, groups=channels
        )
        self.norm = nn.LayerNorm(channels)
        self.pwconv1 = nn.Linear(channels, intermediate_channels)
        self.pwconv2 = nn.Linear(intermediate_channels, channels)
        # Per-channel layer scale applied to the branch output.
        self.gamma = nn.Parameter(torch.full((channels,), layer_scale_init_value))
        self.dwconv.weight.data.normal_(0.0, math.sqrt(1.0 / kernel_size))
        self.dwconv.bias.data.zero_()
        self.pwconv1.weight.data.normal_(0.0, math.sqrt(2.0 / channels))
        self.pwconv1.bias.data.zero_()
        self.pwconv2.weight.data.normal_(0.0, math.sqrt(1.0 / intermediate_channels))
        self.pwconv2.bias.data.zero_()
        if use_weight_standardization:
            # Replace the layers built above with their weight-standardized
            # counterparts; normalization and layer scale are dropped.
            self.norm = nn.Identity()
            self.dwconv = WSConv1d(channels, channels, kernel_size, groups=channels)
            self.pwconv1 = WSLinear(channels, intermediate_channels)
            self.pwconv2 = WSLinear(intermediate_channels, channels)
            del self.gamma
        if enable_scaling:
            # pre_scale / post_scale are fixed buffers; only post_scale_weight
            # is a trainable parameter.
            self.register_buffer("pre_scale", torch.tensor(pre_scale))
            self.register_buffer("post_scale", torch.tensor(post_scale))
            self.post_scale_weight = nn.Parameter(torch.ones(()))

    def forward(
        self,
        x: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        kv: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Apply the block.

        Args:
            x: [batch_size, channels, length]. For self-attention blocks,
                ``length`` must be a multiple of 4.
            attn_mask: Mask forwarded to ``nn.MultiheadAttention`` in the
                self-attention case; unused otherwise.
            kv: Key/value source, required for cross-attention blocks and
                required to be ``None`` for self-attention blocks.

        Returns:
            Tensor with the same shape as ``x``.
        """
        if self.use_mha:
            batch_size, channels, length = x.size()
            if self.cross_attention:
                assert kv is not None
            else:
                assert kv is None
                assert length % 4 == 0
            identity = x
            if self.cross_attention:
                # kv: [batch_size, kv_length, kv_channels]
                x = x.transpose(1, 2)
                x = self.attn_norm(x)
                x = self.mha(x, kv)
                x = x.transpose(1, 2)
            else:
                # Split the time axis into 4 interleaved sub-sequences
                # (positions p, p + 4, p + 8, ... for p in 0..3) and fold them
                # into the batch axis, so attention runs within each
                # sub-sequence of length ``length // 4``.
                x = x.view(batch_size, channels, length // 4, 4)
                x = x.permute(0, 3, 2, 1)
                x = x.reshape(batch_size * 4, length // 4, channels)
                x = self.attn_norm(x)
                x, _ = self.mha(
                    x, x, x, attn_mask=attn_mask, is_causal=True, need_weights=False
                )
                # Undo the interleaving back to [batch_size, channels, length].
                x = x.view(batch_size, 4, length // 4, channels)
                x = x.permute(0, 3, 2, 1)
                x = x.reshape(batch_size, channels, length)
            x += identity

        identity = x
        if self.enable_scaling:
            x = x * self.pre_scale
        x = self.dwconv(x)
        # Channels-last for LayerNorm and the pointwise (linear) layers.
        x = x.transpose(1, 2)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = F.gelu(x, approximate="tanh")
        x = self.pwconv2(x)
        if not self.use_weight_standardization:
            # gamma only exists without weight standardization.
            x *= self.gamma
        if self.enable_scaling:
            x *= self.post_scale * self.post_scale_weight
        x = x.transpose(1, 2)
        x += identity
        return x

    def merge_weights(self):
        """Fold normalization affines, layer scale and scaling factors into adjacent weights.

        After this call the folded modules hold identity values (LayerNorm
        weight 1 / bias 0, gamma 1, scales 1, gains 1). dump() writes only the
        attention, dwconv and pwconv layers, so this is presumably meant to be
        called before dumping — confirm with the export code.
        """
        if self.use_mha:
            if self.cross_attention:
                assert isinstance(self.mha, CrossAttention)
                # attn_norm precedes only the query projection, so fold its
                # affine into q_projection: bias first (uses the unscaled
                # weight), then the per-input-channel scale.
                self.mha.q_projection.bias.data += torch.mv(
                    self.mha.q_projection.weight.data, self.attn_norm.bias.data
                )
                self.mha.q_projection.weight.data *= self.attn_norm.weight.data[None, :]
                self.attn_norm.bias.data[:] = 0.0
                self.attn_norm.weight.data[:] = 1.0
            else:  # self-attention
                assert isinstance(self.mha, nn.MultiheadAttention)
                # Same folding, applied to the packed q/k/v input projection.
                self.mha.in_proj_bias.data += torch.mv(
                    self.mha.in_proj_weight.data, self.attn_norm.bias.data
                )
                self.mha.in_proj_weight.data *= self.attn_norm.weight.data[None, :]
                self.attn_norm.bias.data[:] = 0.0
                self.attn_norm.weight.data[:] = 1.0
        if self.use_weight_standardization:
            self.dwconv.merge_weights()
            self.pwconv1.merge_weights()
            self.pwconv2.merge_weights()
        else:
            # Fold the LayerNorm affine into pwconv1 ...
            self.pwconv1.bias.data += torch.mv(
                self.pwconv1.weight.data, self.norm.bias.data
            )
            self.pwconv1.weight.data *= self.norm.weight.data[None, :]
            self.norm.bias.data[:] = 0.0
            self.norm.weight.data[:] = 1.0
            # ... and the per-channel layer scale into pwconv2's outputs.
            self.pwconv2.weight.data *= self.gamma.data[:, None]
            self.pwconv2.bias.data *= self.gamma.data
            self.gamma.data[:] = 1.0
        if self.enable_scaling:
            # pre_scale multiplies the dwconv input, so it folds into the
            # dwconv weight; the post scales fold into pwconv2.
            self.dwconv.weight.data *= self.pre_scale.data
            self.pre_scale.data.fill_(1.0)
            self.pwconv2.weight.data *= (
                self.post_scale.data * self.post_scale_weight.data
            )
            self.pwconv2.bias.data *= self.post_scale.data * self.post_scale_weight.data
            self.post_scale.data.fill_(1.0)
            self.post_scale_weight.data.fill_(1.0)

    def dump(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        """Write the attention (if any), dwconv, pwconv1 and pwconv2 layers to ``f``.

        ``f`` may be a writable binary stream or a path (opened in ``"wb"``
        mode). Raises ``TypeError`` for anything else.
        """
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as f:
                self.dump(f)
            return
        if not hasattr(f, "write"):
            raise TypeError

        if self.use_mha:
            dump_layer(self.mha, f)
        dump_layer(self.dwconv, f)
        dump_layer(self.pwconv1, f)
        dump_layer(self.pwconv2, f)


class ConvNeXtStack(nn.Module):
    """Embedding convolution followed by a stack of ``ConvNeXtBlock`` layers.

    Input and output are channels-first: [batch_size, channels, length]. With
    weight standardization the embedding becomes a ``WSConv1d`` and both
    LayerNorms are replaced by ``nn.Identity``.
    """

    def __init__(
        self,
        in_channels: int,
        channels: int,
        intermediate_channels: int,
        n_blocks: int,
        delay: int,
        embed_kernel_size: int,
        kernel_size: int,
        use_weight_standardization: bool = False,
        enable_scaling: bool = False,
        use_mha: bool = False,
        cross_attention: bool = False,
        kv_channels: Optional[int] = None,
    ):
        super().__init__()
        assert delay * 2 + 1 <= embed_kernel_size
        assert not (use_weight_standardization and use_mha)  # not supported
        self.use_weight_standardization = use_weight_standardization
        self.use_mha = use_mha
        self.cross_attention = cross_attention
        self.embed = CausalConv1d(in_channels, channels, embed_kernel_size, delay=delay)
        self.norm = nn.LayerNorm(channels)
        self.convnext = nn.ModuleList()
        for block_index in range(n_blocks):
            if enable_scaling:
                pre_scale = 1.0 / math.sqrt(1.0 + block_index / n_blocks)
                post_scale = 1.0 / math.sqrt(n_blocks)
            else:
                pre_scale = 1.0
                post_scale = 1.0
            self.convnext.append(
                ConvNeXtBlock(
                    channels=channels,
                    intermediate_channels=intermediate_channels,
                    layer_scale_init_value=1.0 / n_blocks,
                    kernel_size=kernel_size,
                    use_weight_standardization=use_weight_standardization,
                    enable_scaling=enable_scaling,
                    pre_scale=pre_scale,
                    post_scale=post_scale,
                    use_mha=use_mha,
                    cross_attention=cross_attention,
                    num_heads=4,
                    attention_dropout=0.1,
                    attention_channels=kv_channels,
                    kv_channels=kv_channels,
                )
            )
        self.final_layer_norm = nn.LayerNorm(channels)
        embed_fan_in = embed_kernel_size * in_channels
        self.embed.weight.data.normal_(0.0, math.sqrt(0.5 / embed_fan_in))
        self.embed.bias.data.zero_()
        if use_weight_standardization:
            self.embed = WSConv1d(in_channels, channels, embed_kernel_size, delay=delay)
            self.norm = nn.Identity()
            self.final_layer_norm = nn.Identity()

    def forward(
        self, x: torch.Tensor, kv: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Run the stack on ``x`` ([batch_size, in_channels, length]).

        For self-attention stacks the sequence is right-padded to a multiple of
        4 before the blocks and cropped back afterwards. ``kv`` is forwarded to
        every block.
        """
        x = self.embed(x)
        x = self.norm(x.transpose(1, 2)).transpose(1, 2)

        attn_mask = None
        pad_length = 0
        if self.use_mha and not self.cross_attention:
            pad_length = -x.size(2) % 4
            if pad_length:
                x = F.pad(x, (0, pad_length))
            # Each block attends over length // 4 steps; build a strictly
            # upper-triangular boolean mask of that size.
            n_steps = x.size(2) // 4
            all_true = torch.ones((n_steps, n_steps), dtype=torch.bool, device=x.device)
            attn_mask = all_true.triu(1)

        for block in self.convnext:
            x = block(x, attn_mask=attn_mask, kv=kv)

        if pad_length:
            x = x[:, :, :-pad_length]
        return self.final_layer_norm(x.transpose(1, 2)).transpose(1, 2)

    def merge_weights(self):
        """Merge weights in the embedding (when weight-standardized) and in every block."""
        if self.use_weight_standardization:
            self.embed.merge_weights()
        for block in self.convnext:
            block.merge_weights()

    def dump(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        """Write embed, (norm), all blocks and (final norm) to ``f``.

        The two norms are only written when weight standardization is off.
        ``f`` may be a writable binary stream or a path (opened in ``"wb"``
        mode); anything else raises ``TypeError``.
        """
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as stream:
                self.dump(stream)
            return
        if not hasattr(f, "write"):
            raise TypeError

        has_layer_norms = not self.use_weight_standardization
        dump_layer(self.embed, f)
        if has_layer_norms:
            dump_layer(self.norm, f)
        dump_layer(self.convnext, f)
        if has_layer_norms:
            dump_layer(self.final_layer_norm, f)

    def dump_kv(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        """Write the key/value projections of every cross-attention block to ``f``.

        Only valid for cross-attention stacks. ``f`` may be a writable binary
        stream or a path (opened in ``"wb"`` mode); anything else raises
        ``TypeError``.
        """
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as stream:
                self.dump_kv(stream)
            return
        if not hasattr(f, "write"):
            raise TypeError

        assert self.use_mha and self.cross_attention
        for block in self.convnext:
            if not (block.use_mha and block.cross_attention):
                continue
            assert isinstance(block, ConvNeXtBlock)
            assert hasattr(block, "mha")
            assert isinstance(block.mha, CrossAttention)
            block.mha.dump_kv(f)


class FeatureExtractor(nn.Module):
    """Six weight-normalized, bias-free strided convolutions with tanh-GELU activations.

    The strides multiply to 160 (5 * 2 * 2 * 2 * 2 * 2), so a waveform of
    ``wav_length`` samples yields ``wav_length / 160`` frames.
    """

    def __init__(self, hidden_channels: int):
        super().__init__()

        def make_conv(c_in: int, c_out: int, kernel: int, stride: int) -> nn.Module:
            return weight_norm(nn.Conv1d(c_in, c_out, kernel, stride, bias=False))

        eighth = hidden_channels // 8
        quarter = hidden_channels // 4
        half = hidden_channels // 2
        self.conv0 = make_conv(1, eighth, 10, 5)
        self.conv1 = make_conv(eighth, quarter, 3, 2)
        self.conv2 = make_conv(quarter, half, 3, 2)
        self.conv3 = make_conv(half, hidden_channels, 3, 2)
        self.conv4 = make_conv(hidden_channels, hidden_channels, 3, 2)
        self.conv5 = make_conv(hidden_channels, hidden_channels, 2, 2)

    def _conv_layers(self):
        """Return the six convolutions in application order."""
        return (
            self.conv0,
            self.conv1,
            self.conv2,
            self.conv3,
            self.conv4,
            self.conv5,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map [batch_size, 1, wav_length] to [batch_size, hidden_channels, wav_length / 160].

        Emits a warning when ``wav_length`` is not a multiple of 160.
        """
        if x.size(2) % 160 != 0:
            warnings.warn("wav_length % 160 != 0")
        x = F.pad(x, (40, 40))
        for conv in self._conv_layers():
            x = F.gelu(conv(x), approximate="tanh")
        return x

    def remove_weight_norm(self):
        """Strip the weight-norm reparameterization from every convolution."""
        for conv in self._conv_layers():
            remove_weight_norm(conv)

    def dump(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        """Write the six convolutions, in order, to ``f``.

        ``f`` may be a writable binary stream or a path (opened in ``"wb"``
        mode); anything else raises ``TypeError``.
        """
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as stream:
                self.dump(stream)
            return
        if not hasattr(f, "write"):
            raise TypeError

        for conv in self._conv_layers():
            dump_layer(conv, f)


class FeatureProjection(nn.Module):
    """LayerNorm over the channel axis followed by dropout (p=0.1)."""

    def __init__(self, channels: int):
        super().__init__()
        self.norm = nn.LayerNorm(channels)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` ([batch_size, channels, length]) per time step and apply dropout."""
        # LayerNorm acts on the last axis, so go channels-last and back.
        normalized = self.norm(x.transpose(1, 2)).transpose(1, 2)
        return self.dropout(normalized)


class PhoneExtractor(nn.Module):
    """Waveform-to-phone-feature network.

    Pipeline: ``FeatureExtractor`` -> ``FeatureProjection`` -> self-attention
    ``ConvNeXtStack`` backbone -> tanh-GELU -> weight-normalized 1x1 conv head.
    """

    def __init__(
        self,
        phone_channels: int = 128,
        hidden_channels: int = 128,
        backbone_embed_kernel_size: int = 9,
        kernel_size: int = 17,
        n_blocks: int = 20,
    ):
        super().__init__()
        self.feature_extractor = FeatureExtractor(hidden_channels)
        self.feature_projection = FeatureProjection(hidden_channels)
        self.backbone = ConvNeXtStack(
            in_channels=hidden_channels,
            channels=hidden_channels,
            intermediate_channels=hidden_channels * 3,
            n_blocks=n_blocks,
            delay=0,
            embed_kernel_size=backbone_embed_kernel_size,
            kernel_size=kernel_size,
            use_mha=True,
        )
        self.head = weight_norm(nn.Conv1d(hidden_channels, phone_channels, 1))

    def forward(
        self, x: torch.Tensor, return_stats: bool = True
    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, float]]]:
        """Compute phone features for ``x`` ([batch_size, 1, wav_length]).

        Returns:
            ``phone`` ([batch_size, phone_channels, length]) when
            ``return_stats`` is false; otherwise ``(phone, stats)`` where
            ``stats`` holds the mean channel-wise norms ``"feature_norm"``
            (extractor output) and ``"code_norm"`` (head output).
        """
        # [batch_size, 1, wav_length] -> [batch_size, hidden_channels, length]
        features = self.feature_extractor(x)
        # Normalization + dropout, shape unchanged
        hidden = self.feature_projection(features)
        # [batch_size, hidden_channels, length]
        hidden = self.backbone(hidden)
        # [batch_size, hidden_channels, length] -> [batch_size, phone_channels, length]
        phone = self.head(F.gelu(hidden, approximate="tanh"))

        if not return_stats:
            return phone
        stats = {
            "feature_norm": features.detach().norm(dim=1).mean(),
            "code_norm": phone.detach().norm(dim=1).mean(),
        }
        return (phone, stats)

    @torch.inference_mode()
    def units(self, x: torch.Tensor) -> torch.Tensor:
        """Return phone features channels-last: [batch_size, length, phone_channels]."""
        # x: [batch_size, 1, wav_length] -> [batch_size, phone_channels, length]
        phone = self.forward(x, return_stats=False)
        return phone.transpose(1, 2)

    def remove_weight_norm(self):
        """Strip weight norm from the feature extractor and from the head."""
        self.feature_extractor.remove_weight_norm()
        remove_weight_norm(self.head)

    def merge_weights(self):
        """Merge backbone weights, then fold the projection LayerNorm into the backbone embedding."""
        self.backbone.merge_weights()

        embed = self.backbone.embed
        proj_norm = self.feature_projection.norm
        # Bias contribution of the norm bias through every kernel tap:
        # weight is [out, in, kernel]; reduce over in, then over kernel.
        bias_shift = proj_norm.bias.data[None, :, None] * embed.weight.data
        embed.bias.data += bias_shift.sum(1).sum(1)
        # Scale each input channel of the embedding by the norm weight.
        embed.weight.data *= proj_norm.weight.data[None, :, None]
        proj_norm.bias.data.fill_(0.0)
        proj_norm.weight.data.fill_(1.0)

    def dump(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        """Write feature extractor, backbone and head, in that order, to ``f``.

        ``f`` may be a writable binary stream or a path (opened in ``"wb"``
        mode); anything else raises ``TypeError``.
        """
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as stream:
                self.dump(stream)
            return
        if not hasattr(f, "write"):
            raise TypeError

        for part in (self.feature_extractor, self.backbone, self.head):
            dump_layer(part, f)


class VectorQuantizer(nn.Module):
    """Per-speaker top-k vector quantizer over channel vectors.

    Every time step of the input is replaced by the mean of the ``topk``
    codebook entries of the selected speaker that have the highest dot product
    with the L2-normalized input vector. Codebooks are stored as a half
    precision buffer of shape [n_speakers, codebook_size, channels] and are
    uninitialised until ``build_codebooks`` runs (or a state dict is loaded).

    The quantizer can be attached to an arbitrary layer as a forward hook
    (``enable_hook``); the hook quantizes that layer's output using
    ``target_speaker_ids``.
    """

    def __init__(
        self,
        n_speakers: int,
        codebook_size: int,
        channels: int,
        topk: int = 4,
        training_time_vq: Literal["none", "self", "random"] = "none",
    ):
        super().__init__()
        assert 1 <= topk <= codebook_size
        self.n_speakers = n_speakers
        self.codebook_size = codebook_size
        self.channels = channels
        self.topk = topk
        self.training_time_vq = training_time_vq

        self.register_buffer(
            "codebooks",
            torch.empty(n_speakers, codebook_size, channels, dtype=torch.half),
        )
        self.codebooks: torch.Tensor

        # Implemented as a hook so that the place where VQ is applied is easy
        # to change.
        self._hook_handle: Optional[torch.utils.hooks.RemovableHandle] = None
        self.target_speaker_ids: Optional[torch.Tensor] = None

        def _hook(_, __, output):
            return self(output, self.target_speaker_ids)

        self._hook_fn = _hook

    @torch.no_grad()
    def build_codebooks(
        self,
        collector_func: Callable,
        target_layer: nn.Module,
        inputs: Sequence[Iterable[torch.Tensor]],
        kmeans_n_iters: int = 50,
    ):
        """Fill the codebook of every speaker from activations of ``target_layer``.

        Args:
            collector_func: Called with each input tensor (moved to the
                codebook device); it is expected to run ``target_layer``.
            target_layer: Layer whose outputs ([batch_size, channels, length])
                are collected through a temporary forward hook.
            inputs: One iterable of input tensors per speaker.
            kmeans_n_iters: Number of k-means refinement iterations.

        Raises:
            RuntimeError: If no activation was collected for a speaker.
        """
        assert len(inputs) == self.n_speakers
        assert self._hook_handle is None, "hook already installed"
        device = next(self.buffers()).device

        for spk_id, inps in enumerate(tqdm(inputs, desc="Building codebooks")):
            collected: list[torch.Tensor] = []

            # TODO: thin out the data when there is too much of it

            def _collect(_, __, output):
                # output: [batch_size, channels, length]
                collected.append(output.detach())

            handle = target_layer.register_forward_hook(_collect)
            try:
                for x in inps:
                    collector_func(x.to(device))
            finally:
                # Never leave the temporary hook attached, even when the
                # collector raises.
                handle.remove()

            if not collected:
                raise RuntimeError(f"No activation collected for speaker {spk_id}")

            # [n_data, channels]
            activations = torch.cat(
                [
                    a.transpose(1, 2).reshape(a.size(0) * a.size(2), self.channels)
                    for a in collected
                ]
            )
            activations = activations.float()
            activations = F.normalize(activations, dim=1, eps=1e-6)
            # [codebook_size, channels]
            centers = (
                self._kmeans_plus_plus(activations, self.codebook_size, kmeans_n_iters)
                if activations.size(0) >= self.codebook_size
                else self._pad_replicate(activations, self.codebook_size)
            )
            self.codebooks[spk_id] = centers.to(self.codebooks.dtype)

    def forward(
        self, x: torch.Tensor, speaker_ids: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Quantize ``x`` ([batch_size, channels, length]) with per-speaker codebooks.

        Which speakers are used:
          * eval mode: ``speaker_ids``; ``x`` is returned unchanged when it is
            ``None``.
          * training, ``training_time_vq == "none"``: ``x`` is returned unchanged.
          * training, ``"self"``: ``speaker_ids`` if given, otherwise
            ``target_speaker_ids`` (which must be set).
          * training, ``"random"``: uniformly random speakers per batch item.

        Raises:
            ValueError: In ``"self"`` mode without ``target_speaker_ids``, or
                for an unknown ``training_time_vq`` value.
        """
        batch_size, channels, length = x.size()
        assert channels == self.channels
        device = x.device
        dtype = x.dtype

        if self.training:
            if self.training_time_vq == "none":
                return x
            elif self.training_time_vq == "self":
                if self.target_speaker_ids is None:
                    raise ValueError("target_speaker_ids is not set")
                if speaker_ids is None:
                    # Direct calls (not through the hook) may omit the ids;
                    # indexing the codebooks with None would silently add an
                    # axis instead of selecting speakers.
                    speaker_ids = self.target_speaker_ids
            elif self.training_time_vq == "random":
                speaker_ids = torch.randint(
                    0, self.n_speakers, (batch_size,), device=device
                )
            else:
                raise ValueError(f"Unknown training_time_vq: {self.training_time_vq}")
        else:
            if speaker_ids is None:
                return x
            speaker_ids = speaker_ids.to(device)

        # L2-normalize along the channel axis; still [batch_size, channels, length]
        q = F.normalize(x, dim=1, eps=1e-6)
        # [batch_size, codebook_size, channels]
        codes = self.codebooks[speaker_ids].to(q.dtype)
        # [batch_size, length, codebook_size]
        sim = torch.einsum("bcl,bkc->blk", q, codes)

        # [batch_size, length, topk]
        _, topk_idx = sim.topk(self.topk, dim=-1)
        # [batch_size, length, codebook_size, channels]
        expanded_codes = codes[:, None, :, :].expand(-1, length, -1, -1)
        # [batch_size, length, topk, channels]
        expanded_topk_idx = topk_idx[:, :, :, None].expand(-1, -1, -1, channels)
        # [batch_size, length, topk, channels]
        gathered = expanded_codes.gather(2, expanded_topk_idx)
        # [batch_size, length, channels]
        gathered = gathered.mean(2)
        # [batch_size, channels, length]
        return gathered.transpose(1, 2).to(dtype)

    def enable_hook(self, target_layer: nn.Module):
        """Attach the quantizer to ``target_layer`` as a forward hook.

        Raises:
            RuntimeError: If a hook is already installed.
        """
        if self._hook_handle is not None:
            raise RuntimeError("hook already installed")
        self._hook_handle = target_layer.register_forward_hook(self._hook_fn)

    def disable_hook(self):
        """Detach the forward hook.

        Raises:
            RuntimeError: If no hook is installed.
        """
        if self._hook_handle is None:
            raise RuntimeError("hook not installed")
        self._hook_handle.remove()
        self._hook_handle = None

    def set_target_speaker_ids(self, speaker_ids: Optional[torch.Tensor]):
        """Set the speaker ids used by the hook; see forward() for when they apply."""
        self.target_speaker_ids = speaker_ids

    @staticmethod
    def _pad_replicate(x: torch.Tensor, n: int) -> torch.Tensor:
        """Return ``n`` rows of ``x``, cycling through them when ``x`` has fewer than ``n``."""
        idx = torch.arange(n, device=x.device) % x.size(0)
        return x[idx]

    @staticmethod
    def _kmeans_plus_plus(
        x: torch.Tensor, n_clusters: int, n_iters: int = 50
    ) -> torch.Tensor:
        """Cluster the rows of ``x`` ([n_data, dim]) with k-means++ seeding.

        Returns the [n_clusters, dim] centers after ``n_iters`` Lloyd
        iterations. Clusters that end up empty are left at zero and a warning
        is emitted.
        """
        n_data, _ = x.size()
        center_indices = [torch.randint(0, n_data, ()).item()]
        min_distances = torch.full((n_data,), math.inf, device=x.device)
        for _ in range(1, n_clusters):
            last_center_index = center_indices[-1]
            # Squared distance of every point to its closest chosen center
            min_distances = min_distances.minimum(
                torch.cdist(x, x[last_center_index : last_center_index + 1])
                .float()
                .square_()
                .squeeze_(1)
            )
            total = min_distances.sum()
            if total.item() > 0.0:
                probs = min_distances / (total + 1e-12)
                next_index = torch.multinomial(probs, 1).item()
            else:
                # Every point coincides with a chosen center, so the sampling
                # weights are all zero and multinomial would fail; any point
                # is an equally good seed.
                next_index = torch.randint(0, n_data, ()).item()
            center_indices.append(next_index)
        centers = x[center_indices]
        del min_distances
        for _ in range(n_iters):
            distances = torch.cdist(x, centers)  # [n_data, n_clusters]
            labels = distances.argmin(1)  # [n_data]
            # [n_clusters, dim]
            new_centers = torch.zeros_like(centers).index_add_(0, labels, x)
            # [n_clusters]
            counts = labels.bincount(minlength=n_clusters)
            if (counts == 0).sum().item() != 0:
                # TODO: handle clusters without any assigned points
                warnings.warn("Some clusters have no assigned data points.")
            new_centers /= counts[:, None].clamp_(min=1).float()
            centers = new_centers
        return centers



def extract_pitch_features(
    y: torch.Tensor,  # [..., wav_length]
    hop_length: int = 160,  # 10ms
    win_length: int = 560,  # 35ms
    max_corr_period: int = 256,  # 16ms, 62.5Hz (16000 / 256)
    corr_win_length: int = 304,  # 19ms
    instfreq_features_cutoff_bin: int = 64,  # 1828Hz (16000 * 64 / 560)
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Compute frame-wise input features for pitch estimation.

    Returns a tuple of:
        * instantaneous-frequency features,
          [..., instfreq_features_cutoff_bin * 3, n_frames]: log-magnitude
          spectrum plus real and imaginary parts of the unit-normalized
          frame-to-frame phase difference;
        * normalized difference function, [..., max_corr_period, n_frames];
        * log energy of each cosine-windowed frame, [..., 1, n_frames].
    """
    assert max_corr_period + corr_win_length == win_length

    # Pad both ends, then slice into frames: [..., win_length, n_frames]
    pad = (win_length - hop_length) // 2
    padded = F.pad(y, (pad, pad))
    frames = padded.unfold(-1, win_length, hop_length).transpose(-2, -1)

    # Complex spectrogram restricted to the lowest bins:
    # Complex[..., instfreq_features_cutoff_bin, n_frames]
    spectrum: torch.Tensor = torch.fft.rfft(frames, n=win_length, dim=-2)
    spectrum = spectrum[..., :instfreq_features_cutoff_bin, :]

    # Log spectrogram of the magnitudes
    log_power_spec = torch.log10(spectrum.abs() + 1e-5)

    # Time difference of the instantaneous phase; the value at time 0 is 0
    phase_step = spectrum[..., :, 1:] * spectrum[..., :, :-1].conj()
    phase_step /= phase_step.abs().add_(1e-5)
    first_frame = torch.zeros_like(phase_step[..., :, :1])
    phase_step = torch.cat([first_frame, phase_step], dim=-1)

    # [..., instfreq_features_cutoff_bin * 3, n_frames]
    instfreq_features = torch.cat(
        [log_power_spec, phase_step.real, phase_step.imag], dim=-2
    )

    # Correlation term, computed through the FFT.
    # The original plan was to use this scaled by 2.0 / corr_win_length, but
    # it is proportional to the squared amplitude and no good way to
    # standardize its variance for the network input was found, so it is
    # only used inside the difference function below.
    reversed_frames = frames.flip((-2,))
    reversed_spec = torch.fft.rfft(reversed_frames, n=win_length, dim=-2)
    tail_spec = torch.fft.rfft(
        frames[..., -corr_win_length:, :], n=win_length, dim=-2
    )
    # [..., max_corr_period, n_frames]
    corr = torch.fft.irfft(reversed_spec * tail_spec, n=win_length, dim=-2)
    corr = corr[..., corr_win_length:, :]

    # Energy terms from a running sum of squares
    running_energy = reversed_frames.square().cumsum(-2)
    reference_energy = running_energy[..., corr_win_length - 1 : corr_win_length, :]
    lagged_energy = (
        running_energy[..., corr_win_length:, :]
        - running_energy[..., :-corr_win_length, :]
    )

    # Difference function
    corr_diff = (reference_energy + lagged_energy).sub_(corr.mul_(2.0))
    assert corr_diff.min() >= -1e-3, corr_diff.min()
    corr_diff.clamp_(min=0.0)  # guard against rounding error

    # Standardize
    corr_diff *= 2.0 / corr_win_length
    corr_diff.sqrt_()

    # Energy fed to the conversion model
    cosine_window = torch.signal.windows.cosine(win_length, device=padded.device)
    energy = (frames * cosine_window[..., None]).square_().sum(-2, keepdim=True)
    # >= -3; roughly 2.15 for a sine wave of amplitude 1
    energy.clamp_(min=1e-3).log10_()
    # >= -1.5; roughly 1.07 for a sine wave of amplitude 1;
    # a difference of 1 corresponds to 20dB in amplitude
    energy *= 0.5

    return (
        instfreq_features,  # [..., instfreq_features_cutoff_bin * 3, n_frames]
        corr_diff,  # [..., max_corr_period, n_frames]
        energy,  # [..., 1, n_frames]
    )


class PitchEstimator(nn.Module):
    """Pitch estimator built on the features of ``extract_pitch_features``.

    ``forward`` maps a waveform to per-frame logits over ``pitch_bins`` and
    the frame energy; ``sample_pitch`` turns such logits into a quantized
    pitch index per frame.
    """

    def __init__(
        self,
        input_instfreq_channels: int = 192,
        input_corr_channels: int = 256,
        pitch_bins: int = 448,
        channels: int = 192,
        intermediate_channels: int = 192 * 2,
        n_blocks: int = 9,
        delay: int = 1,  # 10ms; 22.5ms together with feature extraction
        embed_kernel_size: int = 3,
        kernel_size: int = 33,
        pitch_bins_per_octave: int = 96,
    ):
        super().__init__()
        self.pitch_bins_per_octave = pitch_bins_per_octave

        # Two pointwise convolutions per input feature stream.
        self.instfreq_embed_0 = nn.Conv1d(input_instfreq_channels, channels, 1)
        self.instfreq_embed_1 = nn.Conv1d(channels, channels, 1)
        self.corr_embed_0 = nn.Conv1d(input_corr_channels, channels, 1)
        self.corr_embed_1 = nn.Conv1d(channels, channels, 1)
        self.backbone = ConvNeXtStack(
            channels,
            channels,
            intermediate_channels,
            n_blocks,
            delay,
            embed_kernel_size,
            kernel_size,
            enable_scaling=True,
        )
        # Pointwise projection to one logit per pitch bin.
        self.head = nn.Conv1d(channels, pitch_bins, 1)

    def forward(self, wav: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Return ``(pitch logits, energy)`` for ``wav`` [batch_size, 1, wav_length]."""
        # Feature extraction runs with autocast disabled.
        autocast_device = "cuda" if torch.cuda.is_available() else "cpu"
        with torch.amp.autocast(autocast_device, enabled=False):
            # [batch_size, input_instfreq_channels, length],
            # [batch_size, input_corr_channels, length]
            instfreq_features, corr_diff, energy = extract_pitch_features(
                wav.squeeze(1),
                hop_length=160,
                win_length=560,
                max_corr_period=256,
                corr_win_length=304,
                instfreq_features_cutoff_bin=64,
            )

        instfreq_hidden = self.instfreq_embed_1(
            F.gelu(self.instfreq_embed_0(instfreq_features), approximate="tanh")
        )
        corr_hidden = self.corr_embed_1(
            F.gelu(self.corr_embed_0(corr_diff), approximate="tanh")
        )
        # [batch_size, channels, length]
        hidden = F.gelu(instfreq_hidden + corr_hidden, approximate="tanh")
        # [batch_size, pitch_bins, length]
        logits = self.head(self.backbone(hidden))
        return logits, energy

    def sample_pitch(
        self, pitch: torch.Tensor, band_width: int = 4, return_features: bool = False
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        """Pick a quantized pitch bin per frame from logits.

        ``pitch`` is [batch_size, pitch_bins, length]. The band of
        ``band_width`` consecutive bins with the largest summed probability
        is located first, then the most probable bin inside that band is
        returned. Returned pitch values never include 0.

        With ``return_features=True`` a second tensor [batch_size, 3, length]
        is returned holding the probability of bin 0 and the ratios of the
        band probability one octave below / above to that of the chosen band.
        """
        batch_size, pitch_bins, length = pitch.size()
        n_frames = batch_size * length
        octave = self.pitch_bins_per_octave

        proba = pitch.softmax(1)
        if return_features:
            unvoiced_proba = proba[:, :1, :].clone()
        # Bin 0 is pushed far down so no band containing it can be selected.
        proba[:, 0, :] = -100.0
        proba = proba.transpose(1, 2).contiguous().view(n_frames, 1, pitch_bins)

        # Sum of every run of `band_width` neighbouring bins.
        box_kernel = torch.ones((1, 1, 1), device=proba.device).expand(
            1, 1, band_width
        )
        # [batch_size * length, 1, pitch_bins - band_width + 1]
        band_sums = F.conv1d(proba, box_kernel)
        # Long[batch_size * length, 1]
        best_band = band_sums.argmax(2)

        # Long[1, pitch_bins]
        bin_index = torch.arange(pitch_bins, device=proba.device)[None, :]
        # bool[batch_size * length, pitch_bins]
        inside_band = (best_band <= bin_index) & (bin_index < best_band + band_width)
        # Long[batch_size, length]
        quantized_pitch = (
            (proba.squeeze(1) * inside_band).argmax(1).view(batch_size, length)
        )

        if not return_features:
            return quantized_pitch

        # Probability mass of the selected band.
        best_band_proba = band_sums.gather(2, best_band[:, :, None])
        denominator = best_band_proba + 1e-6

        # Band one octave below (zeroed where that band would not exist).
        lower_index = (best_band - octave).clamp_(min=1)
        lower_band_proba = band_sums.gather(2, lower_index[:, :, None])
        lower_band_proba[best_band <= octave] = 0.0
        half_pitch_proba = (lower_band_proba / denominator).view(
            batch_size, 1, length
        )

        # Band one octave above (zeroed where that band would not exist).
        upper_index = (best_band + octave).clamp_(max=pitch_bins - band_width)
        upper_band_proba = band_sums.gather(2, upper_index[:, :, None])
        upper_band_proba[best_band > pitch_bins - band_width - octave] = 0.0
        double_pitch_proba = (upper_band_proba / denominator).view(
            batch_size, 1, length
        )

        features = torch.cat(
            [unvoiced_proba, half_pitch_proba, double_pitch_proba], dim=1
        )
        # Long[batch_size, length], [batch_size, 3, length]
        return quantized_pitch, features

    def merge_weights(self):
        """Delegate weight merging to the backbone."""
        self.backbone.merge_weights()

    def dump(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        """Write all layers to ``f`` with ``dump_layer`` in a fixed order.

        ``f`` may be a path (opened in binary write mode) or an object with a
        ``write`` attribute; anything else raises ``TypeError``.
        """
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as stream:
                self.dump(stream)
            return
        if not hasattr(f, "write"):
            raise TypeError

        layers = (
            self.instfreq_embed_0,
            self.instfreq_embed_1,
            self.corr_embed_0,
            self.corr_embed_1,
            self.backbone,
            self.head,
        )
        for layer in layers:
            dump_layer(layer, f)



def overlap_add(
    ir_amp: torch.Tensor,
    ir_phase: torch.Tensor,
    window: torch.Tensor,
    pitch: torch.Tensor,
    hop_length: int = 240,
    delay: int = 0,
    sr: float = 24000.0,
) -> torch.Tensor:
    """Place windowed impulse responses at pitch marks and sum them.

    Args:
        ir_amp: Per-frame amplitude spectra, [batch_size, ir_length / 2 + 1, length].
        ir_phase: Per-frame phase spectra, same shape as ``ir_amp``.
        window: Window applied to each impulse response, [ir_length].
        pitch: Sample-rate pitch in Hz, [batch_size, length * hop_length].
        hop_length: Number of samples per frame.
        delay: Offset of the returned slice, ``0 <= delay < ir_length``.
        sr: Sampling rate of ``pitch`` and of the output.

    Returns:
        Signal of shape [batch_size, length * hop_length].
    """
    batch_size, n_bins, length = ir_amp.size()
    ir_length = (n_bins - 1) * 2
    assert ir_phase.size() == ir_amp.size()
    assert window.size() == (ir_length,), (window.size(), ir_amp.size())
    assert pitch.size() == (batch_size, length * hop_length)
    assert 0 <= delay < ir_length, (delay, ir_length)

    # Normalized frequency in cycles per sample
    cycles_per_sample = pitch / sr
    # The initial phase (in cycles) is drawn at random
    cycles_per_sample[:, 0] = torch.rand(batch_size, device=pitch.device)
    autocast_device = "cuda" if torch.cuda.is_available() else "cpu"
    with torch.amp.autocast(autocast_device, enabled=False):
        phase = (cycles_per_sample.double().cumsum_(1) % 1.0).float()

    # A pitch mark lies wherever the wrapped phase drops between two samples.
    # [n_pitchmarks], [n_pitchmarks]
    phase_wraps = phase[:, :-1] > phase[:, 1:]
    batch_index, sample_index = torch.nonzero(phase_wraps, as_tuple=True)

    # Sub-sample position of the mark between the two samples
    # [n_pitchmarks]
    remaining = 1.0 - phase[batch_index, sample_index]
    fractional_part = remaining / (remaining + phase[batch_index, sample_index + 1])

    # Spectra of the frame each pitch mark falls in
    # [n_pitchmarks, ir_length / 2 + 1]
    frame_index = sample_index // hop_length
    mark_amp = ir_amp[batch_index, :, frame_index]
    mark_phase = ir_phase[batch_index, :, frame_index]

    # Linear phase realizing the fractional delay [rad]
    # [n_pitchmarks, ir_length / 2 + 1]
    bin_numbers = torch.arange(
        ir_length // 2 + 1, device=pitch.device, dtype=torch.float32
    )
    delay_phase = (
        bin_numbers[None, :] * (-math.tau / ir_length) * fractional_part[:, None]
    )

    # Complex[n_pitchmarks, ir_length / 2 + 1] -> [n_pitchmarks, ir_length]
    spec = torch.polar(mark_amp, mark_phase + delay_phase)
    ir = torch.fft.irfft(spec, n=ir_length, dim=1)
    ir *= window

    # Flatten to one (batch, position, value) triple per output sample
    # Long[n_pitchmarks * ir_length]
    flat_batch = batch_index[:, None].expand(-1, ir_length).ravel()
    sample_offsets = torch.arange(ir_length, device=pitch.device)
    flat_position = (sample_index[:, None] + sample_offsets).ravel()

    # Overlap-add by accumulating into a zero buffer
    output = torch.zeros(
        (batch_size, length * hop_length + ir_length), device=pitch.device
    )
    output.index_put_((flat_batch, flat_position), ir.ravel(), accumulate=True)
    return output[:, delay : -ir_length + delay]


def generate_noise(
    aperiodicity: torch.Tensor, delay: int = 0
) -> tuple[torch.Tensor, torch.Tensor]:
    """Shape uniform noise with a per-frame spectral envelope.

    Args:
        aperiodicity: Per-frame gains, [batch_size, hop_length, length].
        delay: Offset of the returned slices; must be below ``hop_length``.

    Returns:
        ``(noise, excitation)``, both [batch_size, length * hop_length]:
        the shaped noise and the raw uniform excitation it was made from.
    """
    batch_size, hop_length, length = aperiodicity.size()
    n_fft = 2 * hop_length
    total_length = (length + 1) * hop_length

    # Zero-mean uniform excitation
    excitation = torch.rand(batch_size, total_length, device=aperiodicity.device)
    excitation -= 0.5

    # Analysis with a rectangular window
    # Complex[batch_size, hop_length + 1, length]
    rectangular_window = torch.ones(n_fft, device=excitation.device)
    noise = torch.stft(
        excitation,
        n_fft=n_fft,
        hop_length=hop_length,
        window=rectangular_window,
        center=False,
        return_complex=True,
    )
    assert noise.size(2) == aperiodicity.size(2)
    # Remove DC and scale the remaining bins
    noise[:, 0, :] = 0.0
    noise[:, 1:, :] *= aperiodicity

    # Synthesis with a Hann window.
    # Note that torch.istft cannot be used here because it applies the
    # optimal synthesis window.
    # [batch_size, 2 * hop_length, length]
    noise = torch.fft.irfft(noise, n=n_fft, dim=1)
    noise *= torch.hann_window(n_fft, device=noise.device)[None, :, None]
    # [batch_size, (length + 1) * hop_length]
    noise = F.fold(
        noise,
        (1, total_length),
        (1, n_fft),
        stride=(1, hop_length),
    ).squeeze_((1, 2))

    assert delay < hop_length
    stop = delay - hop_length
    # [batch_size, length * hop_length]
    return noise[:, delay:stop], excitation[:, delay:stop]


# Switch for the zero-division guards used by the D4C helper functions.
D4C_PREVENT_ZERO_DIVISION = True  # setting this to False gives the original (upstream) processing


def interp(x: torch.Tensor, y: torch.Tensor, xi: torch.Tensor) -> torch.Tensor:
    """Linearly interpolate ``y`` (sampled at ``x``) at the points ``xi``.

    ``x`` is assumed to be monotonically increasing and evenly spaced along
    its last axis, and no extrapolation is assumed to happen; both
    assumptions are asserted. ``xi`` is given leading singleton dimensions
    and expanded so that its leading shape matches that of ``y``.
    """
    x = torch.as_tensor(x)
    y = torch.as_tensor(y)
    xi = torch.as_tensor(xi)

    # Align the leading dimensions of xi with those of y.
    missing_dims = y.ndim - xi.ndim
    if missing_dims > 0:
        xi = xi.view((1,) * missing_dims + tuple(xi.size()))
    leading_shape = y.size()[:-1]
    if xi.size()[:-1] != leading_shape:
        xi = xi.expand(leading_shape + (xi.size(-1),))

    x_first = x[..., 0]
    x_last = x[..., -1]
    assert (x.min(-1).values == x_first).all()
    assert (x.max(-1).values == x_last).all()
    assert (xi.min(-1).values >= x_first).all()
    assert (xi.max(-1).values <= x_last).all()

    # Grid spacing, computed in double precision and cast back.
    step = (x_last.double() - x_first.double()) / (x.size(-1) - 1.0)
    step = step.to(x.dtype)

    # Position of each query point measured in grid cells.
    cell_position = (xi - x[..., :1]).div_(step[..., None])
    cell_floor = cell_position.floor()
    fraction = cell_position - cell_floor
    cell_index = cell_floor.long()

    # Forward differences; the appended last value makes the final one zero.
    rise = y.diff(dim=-1, append=y[..., -1:])
    return y.gather(-1, cell_index) + rise.gather(-1, cell_index) * fraction


def linear_smoothing(
    group_delay: torch.Tensor, sr: int, n_fft: int, width: torch.Tensor
) -> torch.Tensor:
    """Average ``group_delay`` over a band of ``width`` Hz around each bin.

    The last axis of ``group_delay`` must have ``n_fft // 2 + 1`` bins. The
    input is reflect-padded, integrated along frequency, and the integral is
    read off (via ``interp``) at ``center - width / 2`` and
    ``center + width / 2``; their difference divided by ``width`` is returned.
    """
    group_delay = torch.as_tensor(group_delay)
    n_bins = n_fft // 2 + 1
    assert group_delay.size(-1) == n_bins
    width = torch.as_tensor(width)

    dtype, device = group_delay.dtype, group_delay.device
    bin_hz = sr / n_fft
    # Number of bins mirrored on each side (enough for the widest band).
    boundary = (width.max() * n_fft / sr).long() + 1

    # Frequency axis of the padded cumulative sum (upper edge of each bin).
    mirroring_freq_axis = (
        torch.arange(-boundary, n_bins + boundary, dtype=dtype, device=device)
        .add(0.5)
        .mul(bin_hz)
    )

    # Reflect-pad along frequency; the input is reshaped where needed so
    # that F.pad receives a rank it is given in the other branches.
    padding = (boundary, boundary)
    if group_delay.ndim == 1:
        mirroring_spec = F.pad(group_delay[None], padding, mode="reflect").squeeze_(0)
    elif group_delay.ndim >= 4:
        shape = group_delay.size()
        flattened = group_delay.view(math.prod(shape[:-1]), group_delay.size(-1))
        mirroring_spec = F.pad(flattened, padding, mode="reflect").view(
            shape[:-1] + (shape[-1] + 2 * boundary,)
        )
    else:
        mirroring_spec = F.pad(group_delay, padding, mode="reflect")

    # Running integral over frequency.
    mirroring_segment = mirroring_spec.mul(bin_hz).cumsum_(-1)

    center_freq = torch.arange(n_bins, dtype=dtype, device=device).mul_(bin_hz)
    half_width = width[..., None] * 0.5
    band_edges = torch.cat([center_freq - half_width, center_freq + half_width], dim=-1)

    levels = interp(mirroring_freq_axis, mirroring_segment, band_edges)
    low_levels, high_levels = levels.split([n_bins] * 2, dim=-1)
    return (high_levels - low_levels).div_(width[..., None])


def dc_correction(
    spec: torch.Tensor, sr: int, n_fft: int, f0: torch.Tensor
) -> torch.Tensor:
    """Add a mirrored replica of the spectrum around ``f0`` to the lowest bins.

    For every bin below ``f0`` the value of ``spec`` at ``f0 - frequency``
    (linearly interpolated with ``interp``) is added to the bin. ``spec`` is
    not modified; a corrected copy is returned.
    """
    spec = torch.as_tensor(spec)
    f0 = torch.as_tensor(f0)
    dtype, device = spec.dtype, spec.device
    bin_hz = sr / n_fft

    upper_limit = 2 + (f0 * (n_fft / sr)).long()
    max_upper_limit = upper_limit.max()
    n_corrected = max_upper_limit - 1

    # True for the bins that lie below each item's own upper limit.
    within_limit = (
        torch.arange(n_corrected, device=device) < (upper_limit - 1)[..., None]
    )
    low_freq_axis = (
        torch.arange(max_upper_limit + 1, dtype=dtype, device=device) * bin_hz
    )

    # Both axes are flipped so that the sample points passed to interp
    # (f0 - frequency) are increasing.
    low_freq_replica = interp(
        f0[..., None] - low_freq_axis.flip(-1),
        spec[..., : max_upper_limit + 1].flip(-1),
        low_freq_axis[..., :n_corrected] * within_limit,
    )

    corrected = spec.clone()
    corrected[..., :n_corrected] += low_freq_replica * within_limit
    return corrected


def nuttall(n: int, device: torch.types.Device) -> torch.Tensor:
    """Return an ``n``-point four-term cosine-sum (Nuttall) window on ``device``."""
    angles = torch.linspace(0, math.tau, n, device=device)
    coefs = torch.tensor([0.355768, -0.487396, 0.144232, -0.012604], device=device)
    harmonics = torch.arange(4.0, device=device)  # 0., 1., 2., 3.
    # Row k holds cos(k * angle): [4, n]
    cosines = torch.cos(torch.outer(harmonics, angles))
    return coefs @ cosines


def get_windowed_waveform(
    x: torch.Tensor,
    sr: int,
    f0: torch.Tensor,
    position: torch.Tensor,
    half_window_length_ratio: float,
    window_type: Literal["hann", "blackman"],
    n_fft: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Cut pitch-adaptive windowed segments out of ``x``.

    For every entry of ``f0`` / ``position`` (in seconds) a segment of
    ``n_fft`` samples is gathered around the position and multiplied by a
    window whose half length is ``half_window_length_ratio * sr / f0``
    samples. The windowed segment is then made zero-mean with respect to the
    window.

    Returns:
        ``(waveform, window)``, both with ``n_fft`` entries on the last axis.
    """
    x = torch.as_tensor(x)
    f0 = torch.as_tensor(f0)
    position = torch.as_tensor(position)

    # [...]
    center_sample = (position * sr + 0.501).long()
    half_window_length = (half_window_length_ratio * sr / f0).add_(0.5).long()

    # Sample offsets relative to the center; entries past +half length are
    # masked out. [..., fft_size]
    offsets = torch.arange(n_fft, device=x.device) - half_window_length[..., None]
    inside_window = offsets <= half_window_length[..., None]
    # Indices into x, clamped to the valid range. [..., fft_size]
    gather_index = (center_sample[..., None] + offsets).clamp_(0, x.size(-1) - 1)

    # Window phase. [..., fft_size]
    normalized_f0 = math.pi / sr * f0
    phase = (
        offsets.to(x.dtype)
        .div_(half_window_length_ratio)
        .mul_(normalized_f0[..., None])
    )

    if window_type == "hann":
        window = torch.cos(phase) * 0.5 + 0.5
    elif window_type == "blackman":
        window = torch.cos(phase * 2.0) * 0.08 + torch.cos(phase) * 0.5 + 0.42
    else:
        assert False
    window = window * inside_window

    # Broadcast x and the indices to a common leading shape before gathering.
    prefix_shape = tuple(
        max(x_size, i_size) for x_size, i_size in zip(x.size(), gather_index.size())
    )[:-1]
    target_shape = prefix_shape + (-1,)
    waveform = (
        x.expand(target_shape).gather(-1, gather_index.expand(target_shape)).mul_(window)
    )
    if not D4C_PREVENT_ZERO_DIVISION:
        # Tiny noise added only in the unguarded mode.
        waveform += torch.randn_like(window).mul_(1e-12)
    waveform *= inside_window

    # Subtract the window scaled so that the segment sums to zero.
    dc_ratio = waveform.sum(-1, keepdim=True).div_(window.sum(-1, keepdim=True))
    waveform -= window * dc_ratio
    return waveform, window


def get_centroid(x: torch.Tensor, n_fft: int) -> torch.Tensor:
    """Return ``Re(X) * Re(X') + Im(X) * Im(X')`` per frequency bin.

    ``X`` is the spectrum of the norm-normalized input and ``X'`` that of the
    same signal multiplied by a linear time ramp ``(1..len) / n_fft``.
    """
    x = torch.as_tensor(x)
    norm = x.norm(dim=-1, keepdim=True)
    if D4C_PREVENT_ZERO_DIVISION:
        norm = norm.clamp(min=6e-8)
    x = x / norm

    time_ramp = torch.arange(
        1, x.size(-1) + 1, dtype=x.dtype, device=x.device
    ).div_(n_fft)
    plain_spec = torch.fft.rfft(x, n_fft)
    ramped_spec = torch.fft.rfft(x * time_ramp, n_fft)
    return plain_spec.real * ramped_spec.real + plain_spec.imag * ramped_spec.imag


def get_static_centroid(
    x: torch.Tensor, sr: int, f0: torch.Tensor, position: torch.Tensor, n_fft: int
) -> torch.Tensor:
    """First step: calculation of temporally static parameters on basis of group delay.

    Centroids are computed for two Blackman-windowed segments located a
    quarter period after and before ``position``; their sum is passed through
    ``dc_correction``.
    """
    quarter_period = 0.25 / f0
    centroids = []
    for shifted_position in (position + quarter_period, position - quarter_period):
        segment, _ = get_windowed_waveform(
            x, sr, f0, shifted_position, 2.0, "blackman", n_fft
        )
        centroids.append(get_centroid(segment, n_fft))
    later_centroid, earlier_centroid = centroids
    return dc_correction(later_centroid + earlier_centroid, sr, n_fft, f0)


def get_smoothed_power_spec(
    x: torch.Tensor, sr: int, f0: torch.Tensor, position: torch.Tensor, n_fft: int
) -> tuple[torch.Tensor, torch.Tensor]:
    """Return the smoothed power spectrum and the RMS of each Hann-windowed frame.

    The power spectrum is DC-corrected and then smoothed over a band of
    ``f0`` Hz. The RMS is detached and has its trailing axis removed.
    """
    x = torch.as_tensor(x)
    f0 = torch.as_tensor(f0)
    segment, window = get_windowed_waveform(x, sr, f0, position, 2.0, "hann", n_fft)

    # RMS of the segment relative to the window energy.
    window_energy = window.square().sum(-1, keepdim=True)
    rms = segment.square().sum(-1, keepdim=True).div_(window_energy).sqrt_()
    if D4C_PREVENT_ZERO_DIVISION:
        # Normalize the level (with a floor on the divisor).
        segment = segment / (rms * math.sqrt(n_fft)).clamp_(min=6e-8)

    power_spec = torch.fft.rfft(segment, n_fft).abs().square_()
    power_spec = dc_correction(power_spec, sr, n_fft, f0)
    power_spec = linear_smoothing(power_spec, sr, n_fft, f0)
    return power_spec, rms.detach().squeeze(-1)


def get_static_group_delay(
    static_centroid: torch.Tensor,
    smoothed_power_spec: torch.Tensor,
    sr: int,
    f0: torch.Tensor,
    n_fft: int,
) -> torch.Tensor:
    """Second step: calculation of parameter shaping.

    Divides the centroid by the (optionally floored) power spectrum, smooths
    the result over ``f0 / 2`` Hz, and subtracts a further ``f0`` Hz smoothing
    of that result.
    """
    if D4C_PREVENT_ZERO_DIVISION:
        denominator = smoothed_power_spec.clamp(min=6e-8)
    else:
        denominator = smoothed_power_spec
    raw_group_delay = static_centroid / denominator  # t_g
    narrow_smoothed = linear_smoothing(raw_group_delay, sr, n_fft, f0 * 0.5)  # t_gs
    wide_smoothed = linear_smoothing(narrow_smoothed, sr, n_fft, f0)  # t_gb
    return narrow_smoothed - wide_smoothed  # t_D


def get_coarse_aperiodicity(
    group_delay: torch.Tensor,
    sr: int,
    n_fft: int,
    freq_interval: int,
    n_aperiodicities: int,
    window: torch.Tensor,
) -> torch.Tensor:
    """Third step: estimation of band-aperiodicity.

    For each of the ``n_aperiodicities`` bands (centered at multiples of
    ``freq_interval`` Hz) the windowed group delay is transformed, its power
    values are sorted, and the ratio of a partial cumulative sum to the total
    is stored. The result is returned in dB (``10 * log10``) with the bands
    on the last axis.
    """
    group_delay = torch.as_tensor(group_delay)
    window = torch.as_tensor(window)

    window_length = window.size(-1)
    half_window_length = window_length // 2
    boundary = int(round(n_fft * 8 / window_length))
    # Index in the sorted cumulative power used as the numerator.
    partial_index = n_fft // 2 - boundary - 1

    coarse_aperiodicity = torch.empty(
        group_delay.size()[:-1] + (n_aperiodicities,),
        dtype=group_delay.dtype,
        device=group_delay.device,
    )
    for band in range(n_aperiodicities):
        center = freq_interval * (band + 1) * n_fft // sr
        start = center - half_window_length
        stop = center + half_window_length + 1
        segment = group_delay[..., start:stop] * window
        power_spec: torch.Tensor = torch.fft.rfft(segment, n_fft).abs().square_()
        cumulative_power_spec = power_spec.sort(-1).values.cumsum_(-1)
        if D4C_PREVENT_ZERO_DIVISION:
            cumulative_power_spec.clamp_(min=6e-8)
        ratio = cumulative_power_spec[..., partial_index] / cumulative_power_spec[..., -1]
        coarse_aperiodicity[..., band] = ratio
    coarse_aperiodicity.log10_().mul_(10.0)
    return coarse_aperiodicity


def d4c_love_train(
    x: torch.Tensor, sr: int, f0: torch.Tensor, position: torch.Tensor, threshold: float
) -> torch.Tensor:
    """Refine the voiced/unvoiced decision from the spectral energy balance.

    A frame is reported as voiced only if ``f0`` is non-zero and the
    cumulative power up to about 4000 Hz exceeds ``threshold`` times the
    cumulative power up to about 7900 Hz, with the bins up to about 100 Hz
    excluded.

    Args:
        x: Waveform, last axis is time.
        sr: Sampling rate; must be a Python ``int`` (``int.bit_length`` is used).
        f0: Fundamental frequency per frame; 0 marks unvoiced frames.
        position: Frame positions in seconds.
        threshold: Required ratio between the two cumulative powers.

    Returns:
        Boolean tensor with the shape of ``f0`` (True = voiced).
    """
    x = torch.as_tensor(x)
    position = torch.as_tensor(position)
    f0 = torch.as_tensor(f0)
    vuv = f0 != 0
    lowest_f0 = 40
    f0 = f0.clamp(min=lowest_f0)
    n_fft = 1 << (3 * sr // lowest_f0).bit_length()
    # Bin indices of 100 Hz, 4000 Hz and 7900 Hz, rounded up.
    boundary0 = (100 * n_fft - 1) // sr + 1
    boundary1 = (4000 * n_fft - 1) // sr + 1
    boundary2 = (7900 * n_fft - 1) // sr + 1
    waveform, _ = get_windowed_waveform(x, sr, f0, position, 1.5, "blackman", n_fft)
    power_spec = torch.fft.rfft(waveform, n_fft).abs().square_()
    # Ignore the lowest bins before accumulating.
    power_spec[..., : boundary0 + 1] = 0.0
    cumulative_spec = power_spec.cumsum_(-1)
    vuv = vuv & (
        cumulative_spec[..., boundary1] > threshold * cumulative_spec[..., boundary2]
    )
    return vuv


def d4c_general_body(
    x: torch.Tensor,
    sr: int,
    f0: torch.Tensor,
    freq_interval: int,
    position: torch.Tensor,
    n_fft: int,
    n_aperiodicities: int,
    window: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Run the three D4C steps and return ``(coarse_aperiodicity, rms)``.

    The coarse aperiodicity (dB) is shifted by ``(f0 - 100) / 50`` and capped
    at 0.
    """
    centroid = get_static_centroid(x, sr, f0, position, n_fft)
    power_spec, rms = get_smoothed_power_spec(x, sr, f0, position, n_fft)
    group_delay = get_static_group_delay(centroid, power_spec, sr, f0, n_fft)
    coarse_aperiodicity = get_coarse_aperiodicity(
        group_delay, sr, n_fft, freq_interval, n_aperiodicities, window
    )
    # f0-dependent offset, then cap at 0 dB.
    f0_offset = (f0[..., None] - 100.0).div_(50.0)
    coarse_aperiodicity.add_(f0_offset).clamp_(max=0.0)
    return coarse_aperiodicity, rms


def d4c(
    x: torch.Tensor,
    f0: torch.Tensor,
    t: torch.Tensor,
    sr: int,
    threshold: float = 0.85,
    n_fft_spec: Optional[int] = None,
    coarse_only: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Adapted from https://github.com/tuanad121/Python-WORLD/blob/master/world/d4c.py

    Estimates aperiodicity from waveform ``x``, fundamental frequency ``f0``
    and frame positions ``t`` (seconds).

    Returns:
        ``(coarse_aperiodicity, rms)`` when ``coarse_only`` is True;
        otherwise ``(aperiodicity, coarse_aperiodicity)``, where
        ``aperiodicity`` is interpolated onto ``n_fft_spec // 2 + 1`` bins,
        converted from dB to amplitude, and set to ``1 - 1e-12`` for frames
        judged unvoiced by ``d4c_love_train``.
    """
    FLOOR_F0 = 71
    FLOOR_F0_D4C = 47
    UPPER_LIMIT = 15000
    FREQ_INTERVAL = 3000

    assert sr == int(sr)
    sr = int(sr)
    assert sr % 2 == 0
    x = torch.as_tensor(x)
    f0 = torch.as_tensor(f0)
    temporal_positions = torch.as_tensor(t)
    device = x.device
    # Extra axis so the waveform broadcasts against the per-frame tensors.
    x_frames = x[..., None, :]

    n_fft_d4c = 1 << (4 * sr // FLOOR_F0_D4C).bit_length()
    if n_fft_spec is None:
        n_fft_spec = 1 << (3 * sr // FLOOR_F0).bit_length()
    n_aperiodicities = min(UPPER_LIMIT, sr // 2 - FREQ_INTERVAL) // FREQ_INTERVAL
    assert n_aperiodicities >= 1
    window_length = FREQ_INTERVAL * n_fft_d4c // sr * 2 + 1
    window = nuttall(window_length, device=device)
    freq_axis = torch.arange(n_fft_spec // 2 + 1, device=device) * (sr / n_fft_spec)

    coarse_aperiodicity, rms = d4c_general_body(
        x_frames,
        sr,
        f0.clamp(min=FLOOR_F0_D4C),
        FREQ_INTERVAL,
        temporal_positions,
        n_fft_d4c,
        n_aperiodicities,
        window,
    )
    if coarse_only:
        return coarse_aperiodicity, rms

    # Highest frequency covered by the coarse bands.
    top_band_freq = n_aperiodicities * FREQ_INTERVAL
    even_coarse_axis = (
        torch.arange(n_aperiodicities + 3, device=device) * FREQ_INTERVAL
    )
    assert even_coarse_axis[-2] <= sr // 2 < even_coarse_axis[-1], sr

    # Below the top band: interpolate between the coarse values, with
    # -60 dB prepended at 0 Hz.
    coarse_axis_low = (
        torch.arange(n_aperiodicities + 1, dtype=torch.float, device=device)
        * FREQ_INTERVAL
    )
    aperiodicity_low = interp(
        coarse_axis_low,
        F.pad(coarse_aperiodicity, (1, 0), value=-60.0),
        freq_axis[freq_axis < top_band_freq],
    )

    # From the top band up to Nyquist: interpolate towards -1e-12 dB.
    coarse_axis_high = torch.tensor([top_band_freq, sr * 0.5], device=device)
    aperiodicity_high = interp(
        coarse_axis_high,
        F.pad(coarse_aperiodicity[..., -1:], (0, 1), value=-1e-12),
        freq_axis[freq_axis >= top_band_freq],
    )

    aperiodicity_db = torch.cat([aperiodicity_low, aperiodicity_high], -1)
    aperiodicity = 10.0 ** (aperiodicity_db / 20.0)
    vuv = d4c_love_train(x_frames, sr, f0, temporal_positions, threshold)
    aperiodicity = torch.where(vuv[..., None], aperiodicity, 1 - 1e-12)

    return aperiodicity, coarse_aperiodicity


class Vocoder(nn.Module):
    """Source-filter style waveform generator.

    ``forward`` predicts, per frame, an impulse response (amplitude and phase
    spectrum), noise gains and a post filter. A periodic signal is built by
    overlap-adding the impulse responses at pitch marks, an aperiodic signal
    by shaping noise, and both are convolved with the post filter and summed.
    """

    def __init__(
        self,
        channels: int,
        speaker_embedding_channels: int = 128,
        hop_length: int = 240,
        n_pre_blocks: int = 4,
        out_sample_rate: float = 24000.0,
    ):
        super().__init__()
        self.hop_length = hop_length
        self.out_sample_rate = out_sample_rate

        # Shared front end; receives the speaker embedding as second input.
        self.prenet = ConvNeXtStack(
            in_channels=channels,
            channels=channels,
            intermediate_channels=channels * 2,
            n_blocks=n_pre_blocks,
            delay=2,  # 20ms delay
            embed_kernel_size=7,
            kernel_size=33,
            enable_scaling=True,
            use_mha=True,
            cross_attention=True,
            kv_channels=speaker_embedding_channels,
        )
        # Branch predicting the impulse response (512 output channels).
        self.ir_generator = ConvNeXtStack(
            in_channels=channels,
            channels=channels,
            intermediate_channels=channels * 2,
            n_blocks=2,
            delay=0,
            embed_kernel_size=3,
            kernel_size=33,
            use_weight_standardization=True,
            enable_scaling=True,
        )
        self.ir_generator_post = WSConv1d(channels, 512, 1)
        self.register_buffer("ir_scale", torch.tensor(1.0))
        # Learnable window applied to each impulse response in overlap_add.
        self.ir_window = nn.Parameter(torch.ones(512))
        # Branch predicting the noise gains (hop_length output channels).
        self.aperiodicity_generator = ConvNeXtStack(
            in_channels=channels,
            channels=channels,
            intermediate_channels=channels * 2,
            n_blocks=1,
            delay=0,
            embed_kernel_size=3,
            kernel_size=33,
            use_weight_standardization=True,
            enable_scaling=True,
        )
        self.aperiodicity_generator_post = WSConv1d(channels, hop_length, 1, bias=False)
        self.register_buffer("aperiodicity_scale", torch.tensor(0.005))
        # Branch predicting the per-frame post filter (512 taps).
        self.post_filter_generator = ConvNeXtStack(
            in_channels=channels,
            channels=channels,
            intermediate_channels=channels * 2,
            n_blocks=1,
            delay=0,
            embed_kernel_size=3,
            kernel_size=33,
            use_weight_standardization=True,
            enable_scaling=True,
        )
        self.post_filter_generator_post = WSConv1d(channels, 512, 1, bias=False)
        self.register_buffer("post_filter_scale", torch.tensor(0.01))

    def forward(
        self, x: torch.Tensor, pitch: torch.Tensor, speaker_embedding: torch.Tensor
    ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
        """Generate a waveform.

        Returns the waveform ``[batch_size, 1, length * hop_length]`` and a
        dict with the detached ``periodic_signal``, ``aperiodic_signal`` and
        ``noise_excitation`` tensors.
        """
        # x: [batch_size, channels, length]
        # pitch: [batch_size, length]
        # speaker_embedding: [batch_size, speaker_embedding_length, speaker_embedding_channels]
        batch_size, _, length = x.size()

        x = self.prenet(x, speaker_embedding)
        ir = self.ir_generator(x)
        ir = F.silu(ir, inplace=True)
        # [batch_size, 512, length]
        ir = self.ir_generator_post(ir)
        ir *= self.ir_scale
        # The first half of the channels (plus one) is the log amplitude; the
        # rest is the phase, zero-padded by one bin at both ends of the
        # frequency axis.
        ir_amp = ir[:, : ir.size(1) // 2 + 1, :].exp()
        ir_phase = F.pad(ir[:, ir.size(1) // 2 + 1 :, :], (0, 0, 1, 1))
        # Add pi to the phase of every odd-numbered bin.
        ir_phase[:, 1::2, :] += math.pi
        # TODO: fix the DC component only being able to take positive values

        # Nearest-neighbor interpolation
        # [batch_size, length * hop_length]
        pitch = torch.repeat_interleave(pitch, self.hop_length, dim=1)

        # [batch_size, length * hop_length]
        periodic_signal = overlap_add(
            ir_amp,
            ir_phase,
            self.ir_window,
            pitch,
            self.hop_length,
            delay=0,
            sr=self.out_sample_rate,
        )

        aperiodicity = self.aperiodicity_generator(x)
        aperiodicity = F.silu(aperiodicity, inplace=True)
        # [batch_size, hop_length, length]
        aperiodicity = self.aperiodicity_generator_post(aperiodicity)
        aperiodicity *= self.aperiodicity_scale
        # [batch_size, length * hop_length], [batch_size, length * hop_length]
        aperiodic_signal, noise_excitation = generate_noise(aperiodicity, delay=0)

        post_filter = self.post_filter_generator(x)
        post_filter = F.silu(post_filter, inplace=True)
        # [batch_size, 512, length]
        post_filter = self.post_filter_generator_post(post_filter)
        post_filter *= self.post_filter_scale
        # Adding 1 to the first tap makes a zero prediction an identity filter.
        post_filter[:, 0, :] += 1.0
        # [batch_size, length, 512]
        post_filter = post_filter.transpose(1, 2)
        # The FFT-based filtering below runs with autocast disabled, in float32.
        with torch.amp.autocast("cuda" if torch.cuda.is_available() else "cpu", enabled=False):
            periodic_signal = periodic_signal.float()
            aperiodic_signal = aperiodic_signal.float()
            post_filter = post_filter.float()
            post_filter = torch.fft.rfft(post_filter, n=768)

            # Each frame of hop_length samples is convolved with its own
            # filter via a 768-point FFT.
            # [batch_size, length, 768]
            periodic_signal = torch.fft.irfft(
                torch.fft.rfft(
                    periodic_signal.view(batch_size, length, self.hop_length), n=768
                )
                * post_filter,
                n=768,
            )
            aperiodic_signal = torch.fft.irfft(
                torch.fft.rfft(
                    aperiodic_signal.view(batch_size, length, self.hop_length), n=768
                )
                * post_filter,
                n=768,
            )
            # Overlap-add the filtered frames at a stride of hop_length.
            periodic_signal = F.fold(
                periodic_signal.transpose(1, 2),
                (1, (length - 1) * self.hop_length + 768),
                (1, 768),
                stride=(1, self.hop_length),
            ).squeeze_((1, 2))
            aperiodic_signal = F.fold(
                aperiodic_signal.transpose(1, 2),
                (1, (length - 1) * self.hop_length + 768),
                (1, 768),
                stride=(1, self.hop_length),
            ).squeeze_((1, 2))
        # NOTE(review): the fixed offset of 120 samples presumably compensates
        # the post filter's delay — confirm.
        periodic_signal = periodic_signal[:, 120 : 120 + length * self.hop_length]
        aperiodic_signal = aperiodic_signal[:, 120 : 120 + length * self.hop_length]
        noise_excitation = noise_excitation[:, 120:]

        # TODO: the accuracy of the compensation is becoming questionable.
        # Is it really still needed?

        # [batch_size, 1, length * hop_length]
        y_g_hat = (periodic_signal + aperiodic_signal)[:, None, :]

        return y_g_hat, {
            "periodic_signal": periodic_signal.detach(),
            "aperiodic_signal": aperiodic_signal.detach(),
            "noise_excitation": noise_excitation.detach(),
        }

    def merge_weights(self):
        """Merge sub-module weights and fold the scale buffers into the post layers.

        After each sub-module's own ``merge_weights`` the scale buffers are
        multiplied into the weights (and, for the impulse-response branch,
        the bias) of the corresponding post layer and reset to 1.0.
        """
        self.prenet.merge_weights()
        self.ir_generator.merge_weights()
        self.ir_generator_post.merge_weights()
        self.aperiodicity_generator.merge_weights()
        self.aperiodicity_generator_post.merge_weights()
        self.ir_generator_post.weight.data *= self.ir_scale
        self.ir_generator_post.bias.data *= self.ir_scale
        self.ir_scale.fill_(1.0)
        self.aperiodicity_generator_post.weight.data *= self.aperiodicity_scale
        self.aperiodicity_scale.fill_(1.0)
        self.post_filter_generator.merge_weights()
        self.post_filter_generator_post.merge_weights()
        self.post_filter_generator_post.weight.data *= self.post_filter_scale
        self.post_filter_scale.fill_(1.0)

    def dump(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        """Write all layers to ``f`` with ``dump_layer`` in a fixed order.

        ``f`` may be a path (opened in binary write mode) or an object with a
        ``write`` attribute; anything else raises ``TypeError``.
        """
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as f:
                self.dump(f)
            return
        if not hasattr(f, "write"):
            raise TypeError

        dump_layer(self.prenet, f)
        dump_layer(self.ir_generator, f)
        dump_layer(self.ir_generator_post, f)
        dump_layer(self.ir_window, f)
        dump_layer(self.aperiodicity_generator, f)
        dump_layer(self.aperiodicity_generator_post, f)
        dump_layer(self.post_filter_generator, f)
        dump_layer(self.post_filter_generator_post, f)


def compute_loudness(
    x: torch.Tensor, sr: int, win_lengths: list[int]
) -> list[torch.Tensor]:
    """Compute frame-wise log energies of a frequency-weighted waveform.

    The waveform is filtered with a fixed FIR weighting filter (a treble
    shelf followed by a high-pass biquad; this looks like a K-weighting
    style curve -- TODO confirm), squared, and then summarised with a
    Hann-weighted sliding sum for every requested window length.

    The filter's frequency response is cached per sample rate on the
    function attribute ``compute_loudness.filter``.  It is built on the CPU
    and moved to ``x.device`` on use, so calls from different devices do
    not fail on a device mismatch and backends without float64 support can
    still build it.

    Args:
        x: Waveforms of shape ``[batch_size, wav_length]``.
        sr: Sample rate in Hz; selects the cached weighting filter.
        win_lengths: Analysis window lengths in samples.  The hop length of
            each window is ``win_length // 4``.

    Returns:
        One tensor per entry of ``win_lengths``, each of shape
        ``[batch_size, n_frames]`` holding ``log10`` of the windowed energy.
    """
    # x: [batch_size, wav_length]
    assert x.ndim == 2
    n_fft = 2048
    chunk_length = n_fft // 2
    n_taps = chunk_length + 1

    results = []
    with torch.amp.autocast("cuda" if torch.cuda.is_available() else "cpu", enabled=False):
        if not hasattr(compute_loudness, "filter"):
            compute_loudness.filter = {}
        weighting = compute_loudness.filter.get(sr)
        if weighting is None:
            # Build the impulse response in float64 on the CPU: this runs
            # once per sample rate and some backends (e.g. MPS) have no
            # float64 support.
            ir = torch.zeros(n_taps, dtype=torch.double)
            # Start from a half-amplitude impulse and double afterwards;
            # presumably this keeps the biquad output inside torchaudio's
            # default [-1, 1] clamp -- TODO confirm.
            ir[0] = 0.5
            ir = torchaudio.functional.treble_biquad(
                ir, sr, 4.0, 1500.0, 1.0 / math.sqrt(2)
            )
            ir = torchaudio.functional.highpass_biquad(ir, sr, 38.0, 0.5)
            ir *= 2.0
            weighting = torch.fft.rfft(ir, n=n_fft).to(torch.complex64)
            compute_loudness.filter[sr] = weighting
        if weighting.device != x.device:
            # The cache is keyed by sample rate only, so follow the input.
            weighting = weighting.to(x.device)
            compute_loudness.filter[sr] = weighting

        x = x.float()
        wav_length = x.size(-1)
        if wav_length % chunk_length != 0:
            x = F.pad(x, (0, chunk_length - wav_length % chunk_length))
        padded_wav_length = x.size(-1)
        # FFT convolution by overlap-add: chunks of chunk_length samples are
        # zero-padded to n_fft by rfft, so each chunk convolved with the
        # n_taps-long filter fits without circular wrap-around.
        x = x.view(x.size()[:-1] + (padded_wav_length // chunk_length, chunk_length))
        x = torch.fft.irfft(
            torch.fft.rfft(x, n=n_fft) * weighting,
            n=n_fft,
        )
        # F.fold sums the overlapping n_fft-long segments at a stride of
        # chunk_length; the filter tail beyond the input is cut off.
        x = F.fold(
            x.transpose(-2, -1),
            (1, padded_wav_length + chunk_length),
            (1, n_fft),
            stride=(1, chunk_length),
        ).squeeze_((-3, -2))[..., :wav_length]

        # Instantaneous power (in place on the freshly created tensor).
        x.square_()
        for win_length in win_lengths:
            hop_length = win_length // 4
            # [..., n_frames]
            energy = (
                x.unfold(-1, win_length, hop_length)
                .matmul(torch.hann_window(win_length, device=x.device))
                .add_(win_length / 4.0 * 1e-5)  # floor so log10 stays finite
                .log10_()
            )
            # If the filtered waveform is a sine of amplitude 1 this is
            # roughly log10(win_length / 4); a difference of 1 equals 10 dB.
            results.append(energy)
    return results


def beatrice_slice_segments(
    x: torch.Tensor, start_indices: torch.Tensor, segment_length: int
) -> torch.Tensor:
    """Cut one fixed-length window per batch item out of the last axis.

    Args:
        x: Tensor of shape ``[batch_size, channels, length]``.
        start_indices: Integer tensor of shape ``[batch_size]`` giving the
            first index of each item's window.
        segment_length: Number of positions to take per item.

    Returns:
        Tensor of shape ``[batch_size, channels, segment_length]``.
    """
    n_items, n_channels, _ = x.size()
    offsets = torch.arange(segment_length, device=start_indices.device)
    # [batch_size, 1, segment_length]: absolute positions for every item
    gather_index = start_indices[:, None, None] + offsets
    # [batch_size, channels, segment_length]: same positions for all channels
    gather_index = gather_index.expand(n_items, n_channels, segment_length)
    return torch.gather(x, 2, gather_index)


class ConverterNetwork(nn.Module):
    def __init__(
        self,
        phone_extractor: PhoneExtractor,
        pitch_estimator: PitchEstimator,
        n_speakers: int,
        pitch_bins: int,
        hidden_channels: int,
        vq_topk: int = 4,
        training_time_vq: Literal["none", "self", "random"] = "none",
        phone_noise_ratio: float = 0.5,
        floor_noise_level: float = 1e-3,
    ):
        """Build the converter: frozen feature extractors, embeddings and vocoder.

        Args:
            phone_extractor: Phone feature extractor; switched to eval mode
                with gradients disabled and kept in ``self.frozen_modules``.
            pitch_estimator: Pitch estimator; treated the same way.
            n_speakers: Number of target speakers (size of the speaker
                embedding tables and passed to the vector quantizer).
            pitch_bins: Number of quantized pitch bins.
            hidden_channels: Channel width of the embeddings and the vocoder.
            vq_topk: Forwarded to ``VectorQuantizer`` as ``topk``.
            training_time_vq: Forwarded to ``VectorQuantizer``.
            phone_noise_ratio: Share of noise mixed into the phone features
                in ``forward`` while training (0 disables it).
            floor_noise_level: Standard deviation of the noise added to the
                target and predicted waveforms in ``forward_and_compute_loss``.
        """
        super().__init__()
        # Held in a plain dict instead of being assigned as module attributes.
        # NOTE(review): presumably so they stay out of parameters() and
        # state_dict() -- confirm against the checkpoint code.
        self.frozen_modules = {
            "phone_extractor": phone_extractor.eval().requires_grad_(False),
            "pitch_estimator": pitch_estimator.eval().requires_grad_(False),
        }
        self.pitch_bins = pitch_bins
        self.phone_noise_ratio = phone_noise_ratio
        self.floor_noise_level = floor_noise_level
        self.out_sample_rate = out_sample_rate = 24000
        phone_channels = 128
        self.vq = VectorQuantizer(
            n_speakers=n_speakers,
            codebook_size=512,
            channels=phone_channels,
            topk=vq_topk,
            training_time_vq=training_time_vq,
        )
        # 1x1 convolution projecting phone features to the hidden width.
        self.embed_phone = nn.Conv1d(phone_channels, hidden_channels, 1)
        self.embed_phone.weight.data.normal_(0.0, math.sqrt(2.0 / (256 * 5)))
        self.embed_phone.bias.data.zero_()
        # Quantized-pitch embedding: a fixed sinusoidal table (sin in even
        # columns, cos in odd columns), scaled and then frozen.
        self.embed_quantized_pitch = nn.Embedding(pitch_bins, hidden_channels)
        phase = (
            torch.arange(pitch_bins, dtype=torch.float)[:, None]
            * (
                torch.arange(0, hidden_channels, 2, dtype=torch.float)
                * (-math.log(10000.0) / hidden_channels)
            ).exp_()
        )
        self.embed_quantized_pitch.weight.data[:, 0::2] = phase.sin()
        # cos_() overwrites `phase` in place; the sin copy above is already stored.
        self.embed_quantized_pitch.weight.data[:, 1::2] = phase.cos_()
        self.embed_quantized_pitch.weight.data *= math.sqrt(4.0 / 5.0)
        self.embed_quantized_pitch.weight.requires_grad_(False)
        # 4 input channels: energy (1) + pitch features (3), see forward().
        self.embed_pitch_features = nn.Conv1d(4, hidden_channels, 1)
        self.embed_pitch_features.weight.data.normal_(0.0, math.sqrt(2.0 / (4 * 5)))
        self.embed_pitch_features.bias.data.zero_()
        self.embed_speaker = nn.Embedding(n_speakers, hidden_channels)
        self.embed_speaker.weight.data.normal_(0.0, math.sqrt(2.0 / 5.0))
        # 9 entries: forward() maps a shift s to index round((s + 2) * 2),
        # i.e. -2..+2 semitones in 0.5 steps.
        self.embed_formant_shift = nn.Embedding(9, hidden_channels)
        self.embed_formant_shift.weight.data.normal_(0.0, math.sqrt(2.0 / 5.0))

        # Per-speaker embedding of 384 x 128 values, stored flattened and
        # reshaped in forward() before being passed to the vocoder.
        self.key_value_speaker_embedding_length = 384
        self.key_value_speaker_embedding_channels = 128
        self.key_value_speaker_embedding = nn.Embedding(
            n_speakers,
            self.key_value_speaker_embedding_length
            * self.key_value_speaker_embedding_channels,
        )
        # Every speaker starts from the same random vector (row 0 copied to all rows).
        self.key_value_speaker_embedding.weight.data[0].normal_()
        self.key_value_speaker_embedding.weight.data[1:] = (
            self.key_value_speaker_embedding.weight.data[0]
        )

        self.vocoder = Vocoder(
            channels=hidden_channels,
            speaker_embedding_channels=self.key_value_speaker_embedding_channels,
            hop_length=out_sample_rate // 100,  # 240 samples at 24 kHz
            n_pre_blocks=4,
            out_sample_rate=out_sample_rate,
        )
        # Mel spectrograms at several resolutions, used by the mel loss in
        # forward_and_compute_loss(); hop is always a quarter of the window.
        self.melspectrograms = nn.ModuleList()
        for win_length, n_mels in [
            (32, 5),
            (64, 10),
            (128, 20),
            (256, 40),
            (512, 80),
            (1024, 160),
            (2048, 320),
        ]:
            self.melspectrograms.append(
                torchaudio.transforms.MelSpectrogram(
                    sample_rate=out_sample_rate,
                    n_fft=win_length,
                    win_length=win_length,
                    hop_length=win_length // 4,
                    n_mels=n_mels,
                    power=2,
                    norm="slaney",
                    mel_scale="slaney",
                )
            )

    def initialize_vq(self, inputs: Sequence[Iterable[torch.Tensor]]):
        """Build the quantizer codebooks from ``inputs`` and enable its hook.

        The phone extractor's ``units`` callable and its ``head`` layer are
        handed to ``self.vq.build_codebooks`` together with ``inputs``; the
        quantizer's hook is then enabled on that same ``head`` layer.
        """
        extractor = self.frozen_modules["phone_extractor"]
        units_fn, head = extractor.units, extractor.head
        self.vq.build_codebooks(units_fn, head, inputs)
        self.vq.enable_hook(head)

    def enable_hook(self):
        """Enable the vector quantizer's hook on the phone extractor's ``head`` layer."""
        phone_extractor = self.frozen_modules["phone_extractor"]
        self.vq.enable_hook(phone_extractor.head)

    def _get_resampler(
        self, orig_freq, new_freq, device, cache={}
    ) -> torchaudio.transforms.Resample:
        key = orig_freq, new_freq
        if key in cache:
            return cache[key]
        resampler = torchaudio.transforms.Resample(orig_freq, new_freq).to(
            device, non_blocking=True
        )
        cache[key] = resampler
        return resampler

    def forward(
        self,
        x: torch.Tensor,
        target_speaker_id: torch.Tensor,
        formant_shift_semitone: torch.Tensor,
        pitch_shift_semitone: Optional[torch.Tensor] = None,
        slice_start_indices: Optional[torch.Tensor] = None,
        slice_segment_length: Optional[int] = None,
        return_stats: bool = False,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, float]]]:
        """Convert input waveforms to the target speakers' voices.

        Phone, pitch and energy features are extracted with the frozen
        modules under ``torch.inference_mode``.  In training mode noise is
        mixed into the phone features and the pitch/energy features are
        re-estimated from a randomly pitch-shifted (resampled) copy of the
        input.  The features are embedded, summed with the speaker and
        formant-shift embeddings, optionally sliced, and decoded by the
        vocoder.

        Args:
            x: Input waveform, ``[batch_size, 1, wav_length]``.
            target_speaker_id: ``Long[batch_size]`` target speaker indices.
            formant_shift_semitone: ``[batch_size]``; mapped to an embedding
                index by ``round((shift + 2) * 2)``.
            pitch_shift_semitone: Optional ``[batch_size]`` shift applied to
                the quantized pitch of voiced frames (bin 0 is left alone).
            slice_start_indices: Optional ``[batch_size]`` frame offsets; if
                given, only a window of the hidden sequence is decoded.
            slice_segment_length: Window length in frames; required when
                ``slice_start_indices`` is given.
            return_stats: If true, also return the vocoder statistics dict
                extended with the ``"pitch"`` tensor.

        Returns:
            The generated waveform ``y_g_hat``, or ``(y_g_hat, stats)`` when
            ``return_stats`` is true.
        """
        # x: [batch_size, 1, wav_length]
        # target_speaker_id: Long[batch_size]
        # formant_shift_semitone: [batch_size]
        # pitch_shift_semitone: [batch_size]
        # slice_start_indices: [batch_size]

        batch_size, _, _ = x.size()
        self.vq.set_target_speaker_ids(target_speaker_id)

        # Feature extraction never needs gradients: the extractors are frozen.
        with torch.inference_mode():
            phone_extractor: PhoneExtractor = self.frozen_modules["phone_extractor"]
            pitch_estimator: PitchEstimator = self.frozen_modules["pitch_estimator"]
            # [batch_size, 1, wav_length] -> [batch_size, phone_channels, length]
            phone = phone_extractor.units(x).transpose(1, 2)

            if self.training and self.phone_noise_ratio != 0.0:
                # Rescale phone to RMS (1 - ratio) and add noise of RMS ratio
                # (both RMS values taken over the channel axis).
                phone *= (1.0 - self.phone_noise_ratio) / phone.square().mean(
                    1, keepdim=True
                ).sqrt_()
                noise = torch.randn_like(phone)
                noise *= (
                    self.phone_noise_ratio
                    / noise.square().mean(1, keepdim=True).sqrt_()
                )
                phone += noise
            # F.rms_norm would require PyTorch >= 2.4, so normalize by hand.
            phone *= (
                1.0
                / phone.square()
                .mean(1, keepdim=True)
                .add_(torch.finfo(torch.float).eps)
                .sqrt_()
            )

            # [batch_size, 1, wav_length] -> [batch_size, pitch_bins, length], [batch_size, 1, length]
            pitch, energy = pitch_estimator(x)
            # augmentation
            if self.training:
                # Estimate each item's mean pitch bin from the time-averaged
                # softmax over bins 1.. (bin 0 is excluded).
                # [batch_size, pitch_bins - 1]
                weights = pitch.softmax(1)[:, 1:, :].mean(2)
                # [batch_size]
                mean_pitch = (
                    weights
                    * torch.arange(
                        1,
                        self.embed_quantized_pitch.num_embeddings,
                        device=weights.device,
                    )
                ).sum(1) / weights.sum(1)
                mean_pitch = mean_pitch.round_().long()
                # Draw a random target bin in [64, 257) and derive the
                # frequency ratio that moves the mean pitch there.
                target_pitch = torch.randint_like(mean_pitch, 64, 257)
                shift = target_pitch - mean_pitch
                shift_ratio = (
                    2.0 ** (shift.float() / pitch_estimator.pitch_bins_per_octave)
                ).tolist()
                shift = []
                interval_length = 100  # 1s
                interval_zeros = torch.zeros(
                    (1, 1, interval_length * 160), device=x.device
                )
                concatenated_shifted_x = []
                offsets = [0]
                # NOTE(review): cudnn.benchmark is switched off here and
                # unconditionally set to True after the loop, which overrides
                # whatever the caller had configured -- confirm this is intended.
                torch.backends.cudnn.benchmark = False
                for i in range(batch_size):
                    shift_ratio_i = shift_ratio[i]
                    # Approximate the ratio by a fraction with a small
                    # denominator so it can be used as resampling rates.
                    shift_ratio_fraction_i = Fraction.from_float(
                        shift_ratio_i
                    ).limit_denominator(30)
                    shift_numer_i = shift_ratio_fraction_i.numerator
                    shift_denom_i = shift_ratio_fraction_i.denominator
                    shift_ratio_i = shift_numer_i / shift_denom_i
                    # Bin shift that corresponds to the rounded ratio.
                    shift_i = int(
                        round(
                            math.log2(shift_ratio_i)
                            * pitch_estimator.pitch_bins_per_octave
                        )
                    )
                    shift.append(shift_i)
                    shift_ratio[i] = shift_ratio_i
                    # [1, 1, wav_length / shift_ratio]
                    with torch.amp.autocast("cuda" if torch.cuda.is_available() else "cpu", enabled=False):
                        shifted_x_i = self._get_resampler(
                            shift_numer_i, shift_denom_i, x.device
                        )(x[i])[None]
                    # Pad to a whole number of 160-sample frames.
                    if shifted_x_i.size(2) % 160 != 0:
                        shifted_x_i = F.pad(
                            shifted_x_i,
                            (0, 160 - shifted_x_i.size(2) % 160),
                            mode="reflect",
                        )
                    assert shifted_x_i.size(2) % 160 == 0
                    # offsets are counted in frames; every item is preceded
                    # by interval_length frames of silence.
                    offsets.append(
                        offsets[-1] + interval_length + shifted_x_i.size(2) // 160
                    )
                    concatenated_shifted_x.extend([interval_zeros, shifted_x_i])
                if offsets[-1] % 256 != 0:
                    # Equal lengths seem to hit some cache and run faster, so
                    # pad to a multiple of 256 frames to reduce the number of
                    # distinct lengths.
                    concatenated_shifted_x.append(
                        torch.zeros(
                            (1, 1, (256 - offsets[-1] % 256) * 160), device=x.device
                        )
                    )
                # [1, 1, sum(shifted wav_length) + batch_size * 16000 (+ padding)]
                concatenated_shifted_x = torch.cat(concatenated_shifted_x, dim=2)
                assert concatenated_shifted_x.size(2) % (256 * 160) == 0
                # [1, pitch_bins, length / shift_ratio], [1, 1, length / shift_ratio]
                concatenated_pitch, concatenated_energy = pitch_estimator(
                    concatenated_shifted_x
                )
                for i in range(batch_size):
                    shift_i = shift[i]
                    shift_ratio_i = shift_ratio[i]
                    # Frame range of item i inside the concatenated result.
                    left = offsets[i] + interval_length
                    right = offsets[i + 1]
                    pitch_i = concatenated_pitch[:, :, left:right]
                    energy_i = concatenated_energy[:, :, left:right]
                    # Stretch back to the time scale of the unshifted input.
                    pitch_i = F.interpolate(
                        pitch_i,
                        scale_factor=shift_ratio_i,
                        mode="linear",
                        align_corners=False,
                    )
                    energy_i = F.interpolate(
                        energy_i,
                        scale_factor=shift_ratio_i,
                        mode="linear",
                        align_corners=False,
                    )
                    assert pitch_i.size(2) == energy_i.size(2)
                    assert abs(pitch_i.size(2) - pitch.size(2)) <= 10
                    length = min(pitch_i.size(2), pitch.size(2))

                    # Undo the pitch shift along the bin axis.  Bin 0 is
                    # copied as is; bins with no source are filled with -10.0.
                    if shift_i > 0:
                        pitch[i : i + 1, :1, :length] = pitch_i[:, :1, :length]
                        pitch[i : i + 1, 1:-shift_i, :length] = pitch_i[
                            :, 1 + shift_i :, :length
                        ]
                        pitch[i : i + 1, -shift_i:, :length] = -10.0
                    elif shift_i < 0:
                        pitch[i : i + 1, :1, :length] = pitch_i[:, :1, :length]
                        pitch[i : i + 1, 1 : 1 - shift_i, :length] = -10.0
                        pitch[i : i + 1, 1 - shift_i :, :length] = pitch_i[
                            :, 1:shift_i, :length
                        ]
                    energy[i : i + 1, :, :length] = energy_i[:, :, :length]
                torch.backends.cudnn.benchmark = True

            # [batch_size, pitch_bins, length] -> Long[batch_size, length], [batch_size, 3, length]
            quantized_pitch, pitch_features = pitch_estimator.sample_pitch(
                pitch, return_features=True
            )
            if pitch_shift_semitone is not None:
                # Shift only frames whose bin is non-zero; the result is
                # clamped to the valid non-zero bin range.
                quantized_pitch = torch.where(
                    quantized_pitch == 0,
                    quantized_pitch,
                    (
                        quantized_pitch
                        + (
                            pitch_shift_semitone[:, None]
                            * (pitch_estimator.pitch_bins_per_octave / 12.0)
                        )
                        .round_()
                        .long()
                    ).clamp_(1, self.pitch_bins - 1),
                )
            # Bin index -> pitch value (presumably Hz, with bin 0 at 55 -- TODO confirm).
            pitch = 55.0 * 2.0 ** (
                quantized_pitch.float() / pitch_estimator.pitch_bins_per_octave
            )
            # phone looks ahead 2.5 ms, whereas energy looks ahead 12.5 ms and
            # pitch_features 22.5 ms, so delay them (1 and 2 frames) to align
            # them with phone.
            energy = F.pad(energy[:, :, :-1], (1, 0), mode="reflect")
            quantized_pitch = F.pad(quantized_pitch[:, :-2], (2, 0), mode="reflect")
            pitch_features = F.pad(pitch_features[:, :, :-2], (2, 0), mode="reflect")
            # [batch_size, 1, length], [batch_size, 3, length] -> [batch_size, 4, length]
            pitch_features = torch.cat([energy, pitch_features], dim=1)
            formant_shift_indices = (
                ((formant_shift_semitone + 2.0) * 2.0).round_().long()
            )

        # Tensors created under inference_mode cannot take part in autograd;
        # cloning them outside the block yields normal tensors.
        phone = phone.clone()
        quantized_pitch = quantized_pitch.clone()
        pitch_features = pitch_features.clone()
        formant_shift_indices = formant_shift_indices.clone()
        pitch = pitch.clone()

        # [batch_size, hidden_channels, length]
        x = (
            self.embed_phone(phone)
            + self.embed_quantized_pitch(quantized_pitch).transpose(1, 2)
            + self.embed_pitch_features(pitch_features)
            + (
                self.embed_speaker(target_speaker_id)[:, :, None]
                + self.embed_formant_shift(formant_shift_indices)[:, :, None]
            )
        )
        if slice_start_indices is not None:
            assert slice_segment_length is not None
            # [batch_size, hidden_channels, length] -> [batch_size, hidden_channels, segment_length]
            x = beatrice_slice_segments(x, slice_start_indices, slice_segment_length)
        x = F.silu(x, inplace=True)

        # Flat per-speaker embedding -> [batch_size, 384, 128]
        speaker_embedding = self.key_value_speaker_embedding(target_speaker_id).view(
            batch_size,
            self.key_value_speaker_embedding_length,
            self.key_value_speaker_embedding_channels,
        )

        # [batch_size, hidden_channels, segment_length] -> [batch_size, 1, segment_length * 240]
        y_g_hat, stats = self.vocoder(x, pitch, speaker_embedding)
        stats["pitch"] = pitch
        if return_stats:
            return y_g_hat, stats
        else:
            return y_g_hat

    def _normalize_melsp(self, x):
        return x.clamp(min=1e-10).log_()

    def forward_and_compute_loss(
        self,
        noisy_wavs_16k: torch.Tensor,
        target_speaker_id: torch.Tensor,
        formant_shift_semitone: torch.Tensor,
        slice_start_indices: torch.Tensor,
        slice_segment_length: int,
        y_all: torch.Tensor,
        enable_loss_ap: bool = False,
    ) -> tuple[
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        dict[str, float],
    ]:
        """Run the converter on full utterances and compute the reconstruction losses.

        The whole input is converted without slicing.  Loudness, mel and
        (optionally) aperiodicity losses are computed on the full-length
        output; the target and the output are then sliced to the requested
        segment.

        Args:
            noisy_wavs_16k: Input waveforms, ``[batch_size, 1, wav_length]``.
            target_speaker_id: ``Long[batch_size]`` target speaker indices.
            formant_shift_semitone: ``[batch_size]`` formant shifts.
            slice_start_indices: ``[batch_size]`` segment starts in frames
                (multiplied by 240 to index output samples).
            slice_segment_length: Segment length in frames.
            y_all: Target waveforms, ``[batch_size, 1, wav_length]``, at the
                output sample rate.
            enable_loss_ap: Whether to compute the aperiodicity loss.

        Returns:
            ``(y, y_hat, y_hat_all, loss_loudness, loss_mel, loss_ap, stats)``
            where ``y`` / ``y_hat`` are the sliced target / prediction (both
            with floor noise added), ``y_hat_all`` is the full prediction
            and ``stats`` maps loss names to Python floats.  ``loss_ap`` is
            a zero scalar on ``y_all``'s device when ``enable_loss_ap`` is
            false.
        """
        # noisy_wavs_16k: [batch_size, 1, wav_length]
        # target_speaker_id: Long[batch_size]
        # formant_shift_semitone: [batch_size]
        # slice_start_indices: [batch_size]
        # slice_segment_length: int
        # y_all: [batch_size, 1, wav_length]

        stats = {}
        loss_mel = 0.0
        loss_loudness = 0.0
        loudness_win_lengths = [512, 1024, 2048, 4096]

        # Convert the whole utterance (no slicing) so the losses see all of it.
        y_hat_all, intermediates = self(
            noisy_wavs_16k,
            target_speaker_id,
            formant_shift_semitone,
            return_stats=True,
        )
        # Where the target is exactly zero, use the detached prediction so
        # no gradient flows through those samples.
        y_hat_all = y_hat_all.detach().where(y_all == 0.0, y_hat_all)

        # Loss computation runs in full precision.
        with torch.amp.autocast("cuda" if torch.cuda.is_available() else "cpu", enabled=False):
            periodic_signal = intermediates["periodic_signal"].float()
            aperiodic_signal = intermediates["aperiodic_signal"].float()
            noise_excitation = intermediates["noise_excitation"].float()
            # Bring all intermediate signals to the noise excitation's length.
            periodic_signal = periodic_signal[:, : noise_excitation.size(1)]
            aperiodic_signal = aperiodic_signal[:, : noise_excitation.size(1)]
            y_hat_all = y_hat_all.float()
            # The same floor noise is added to target and prediction.
            floor_noise = torch.randn_like(y_all) * self.floor_noise_level
            y_all = y_all + floor_noise
            y_hat_all += floor_noise
            y_hat_all_truncated = y_hat_all.squeeze(1)[:, : periodic_signal.size(1)]
            y_all_truncated = y_all.squeeze(1)[:, : periodic_signal.size(1)]

            # Loudness loss: MSE between log energies, weighted by
            # sqrt(win_length) for every window length.
            y_loudness = compute_loudness(
                y_all_truncated, self.out_sample_rate, loudness_win_lengths
            )
            y_hat_loudness = compute_loudness(
                y_hat_all_truncated, self.out_sample_rate, loudness_win_lengths
            )
            for win_length, y_loudness_i, y_hat_loudness_i in zip(
                loudness_win_lengths, y_loudness, y_hat_loudness
            ):
                loss_loudness_i = F.mse_loss(y_hat_loudness_i, y_loudness_i)
                loss_loudness += loss_loudness_i * math.sqrt(win_length)
                stats[f"loss_loudness_{win_length}"] = loss_loudness_i.item()

            # Multi-resolution mel loss with a correction for the aperiodic part.
            for melspectrogram in self.melspectrograms:
                melsp_periodic_signal = melspectrogram(periodic_signal)
                melsp_aperiodic_signal = melspectrogram(aperiodic_signal)
                melsp_noise_excitation = melspectrogram(noise_excitation)
                # [1, n_mels, 1]
                # Factors, as noted by the original author:
                # 1/6 ... mean power of uniform random numbers in [-0.5, 0.5]
                # 3/8 ... power attenuation caused by the Hann window
                # 0.5 ... unexplained
                reference_melsp = melspectrogram.mel_scale(
                    torch.full(
                        (1, melspectrogram.n_fft // 2 + 1, 1),
                        (1 / 6) * (3 / 8) * 0.5 * melspectrogram.win_length,
                        device=noisy_wavs_16k.device,
                    )
                )
                # Share of the aperiodic component in every mel bin.
                aperiodic_ratio = melsp_aperiodic_signal / (
                    melsp_periodic_signal + melsp_aperiodic_signal + 1e-5
                )
                # Ratio of the reference noise power to the power of the
                # noise excitation actually drawn.
                compensation_ratio = reference_melsp / (melsp_noise_excitation + 1e-5)

                melsp_y_hat = melspectrogram(y_hat_all_truncated)
                # Rescale only the aperiodic share by the compensation ratio.
                melsp_y_hat = melsp_y_hat * (
                    (1.0 - aperiodic_ratio) + aperiodic_ratio * compensation_ratio
                )
                y_hat_mel = self._normalize_melsp(melsp_y_hat)

                y_mel = self._normalize_melsp(melspectrogram(y_all_truncated))
                loss_mel_i = F.l1_loss(y_hat_mel, y_mel)
                loss_mel += loss_mel_i
                stats[
                    f"loss_mel_{melspectrogram.win_length}_{melspectrogram.n_mels}"
                ] = loss_mel_i.item()

            loss_mel /= len(self.melspectrograms)

            if enable_loss_ap:
                # Frame centre times: 10 ms hop, offset by half a frame.
                t = (
                    torch.arange(intermediates["pitch"].size(1), device=y_all.device)
                    * 0.01
                    + 0.005
                )
                y_coarse_aperiodicity, y_rms = d4c(
                    y_all.squeeze(1),
                    intermediates["pitch"],
                    t,
                    self.vocoder.out_sample_rate,
                    coarse_only=True,
                )
                # d4c output is converted with 10 ** (v / 10) (dB -> power ratio).
                y_coarse_aperiodicity = 10.0 ** (y_coarse_aperiodicity / 10.0)
                y_hat_coarse_aperiodicity, y_hat_rms = d4c(
                    y_hat_all.squeeze(1),
                    intermediates["pitch"],
                    t,
                    self.vocoder.out_sample_rate,
                    coarse_only=True,
                )
                y_hat_coarse_aperiodicity = 10.0 ** (y_hat_coarse_aperiodicity / 10.0)
                rms = torch.maximum(y_rms, y_hat_rms)
                loss_ap = F.mse_loss(
                    y_hat_coarse_aperiodicity, y_coarse_aperiodicity, reduction="none"
                )
                # Down-weight quiet frames and drop frames that are nearly silent.
                loss_ap *= (rms / (rms + 1e-3) * (rms > 1e-5))[:, :, None]
                loss_ap = loss_ap.mean()
            else:
                # Keep the placeholder on the same device as the other losses
                # so callers can stack, reduce or log them together.
                loss_ap = torch.tensor(0.0, device=y_all.device)

        # [batch_size, 1, wav_length] -> [batch_size, 1, slice_segment_length * 240]
        y_hat = beatrice_slice_segments(
            y_hat_all, slice_start_indices * 240, slice_segment_length * 240
        )
        # [batch_size, 1, wav_length] -> [batch_size, 1, slice_segment_length * 240]
        y = beatrice_slice_segments(y_all, slice_start_indices * 240, slice_segment_length * 240)
        return y, y_hat, y_hat_all, loss_loudness, loss_mel, loss_ap, stats

    def merge_weights(self):
        """Merge weights by delegating to the vocoder's ``merge_weights``."""
        vocoder = self.vocoder
        vocoder.merge_weights()

    def dump(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as f:
                self.dump(f)
            return
        if not hasattr(f, "write"):
            raise TypeError

        dump_layer(self.embed_phone, f)
        dump_layer(self.embed_quantized_pitch, f)
        dump_layer(self.embed_pitch_features, f)
        dump_layer(self.vocoder, f)

    def dump_speaker_embeddings(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as f:
                self.dump_speaker_embeddings(f)
            return
        if not hasattr(f, "write"):
            raise TypeError

        dump_params(self.vq.codebooks, f)
        dump_layer(self.embed_speaker, f)
        dump_layer(self.embed_formant_shift, f)
        dump_layer(self.key_value_speaker_embedding, f)

    def dump_embedding_setter(self, f: Union[BinaryIO, str, bytes, os.PathLike]):
        if isinstance(f, (str, bytes, os.PathLike)):
            with open(f, "wb") as f:
                self.dump_embedding_setter(f)
            return
        if not hasattr(f, "write"):
            raise TypeError

        self.vocoder.prenet.dump_kv(f)


# Discriminator


def _normalize(tensor: torch.Tensor, dim: int) -> torch.Tensor:
    """Divide ``tensor`` by its L2 norm over ``dim``, with the norm floored at 1e-6."""
    l2_norm = torch.norm(tensor, p=2.0, dim=dim, keepdim=True)
    # The floor keeps all-zero slices from producing a division by zero.
    return torch.div(tensor, l2_norm.clamp_min(1e-6))


class SANConv2d(nn.Conv2d):
    """2-D convolution whose kernel is split into a unit-norm direction and a scale.

    ``weight`` holds the kernel normalized per output channel, ``scale``
    holds the per-output-channel magnitude, and ``bias`` (when enabled) has
    ``in_channels`` entries and is added to the *input* before the
    convolution.  (Presumably the "SAN" discriminator output layer --
    confirm against the training code.)
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        bias: bool = True,
        padding_mode="zeros",
        device=None,
        dtype=None,
    ):
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=padding,
            dilation=dilation,
            groups=1,
            bias=bias,
            padding_mode=padding_mode,
            device=device,
            dtype=dtype,
        )
        # Split the freshly initialized kernel into magnitude and direction.
        magnitude = self.weight.norm(p=2.0, dim=[1, 2, 3], keepdim=True).clamp_min(1e-6)
        direction = self.weight / magnitude.expand_as(self.weight)
        self.weight = nn.parameter.Parameter(direction)
        self.scale = nn.parameter.Parameter(magnitude.view(out_channels))
        if not bias:
            self.register_parameter("bias", None)
        else:
            # Replaces the output-sized bias created by nn.Conv2d with an
            # input-sized one, because forward() adds it to the input.
            self.bias = nn.parameter.Parameter(
                torch.zeros(in_channels, device=device, dtype=dtype)
            )

    def forward(
        self, input: torch.Tensor, flg_san_train: bool = False
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        """Apply the convolution.

        Returns a single tensor normally.  With ``flg_san_train`` a pair
        ``(out_fun, out_dir)`` is returned: ``out_fun`` uses a detached
        kernel direction, ``out_dir`` uses a detached input and a detached
        scale.
        """
        if self.bias is not None:
            input = input + self.bias.view(self.in_channels, 1, 1)
        direction = self._get_normalized_weight()
        magnitude = self.scale.view(self.out_channels, 1, 1)

        def convolve(signal: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
            # Bias-free convolution using this layer's geometry.
            return F.conv2d(
                signal,
                kernel,
                None,
                self.stride,
                self.padding,
                self.dilation,
                self.groups,
            )

        if not flg_san_train:
            return convolve(input, direction) * magnitude

        out_fun = convolve(input, direction.detach())
        out_dir = convolve(input.detach(), direction)
        return out_fun * magnitude, out_dir * magnitude.detach()

    @torch.no_grad()
    def normalize_weight(self):
        """Overwrite the stored kernel with its normalized version."""
        self.weight.data = self._get_normalized_weight()

    def _get_normalized_weight(self) -> torch.Tensor:
        """Return the kernel normalized over all axes except the output channel."""
        return _normalize(self.weight, dim=[1, 2, 3])


def get_padding(kernel_size: int, dilation: int = 1) -> int:
    """Return half the dilated kernel span, ``dilation * (kernel_size - 1) // 2``."""
    dilated_span = dilation * (kernel_size - 1)
    return dilated_span // 2


class BeatriceDiscriminatorP(nn.Module):
    """Period discriminator: folds the waveform by ``period`` and applies 2-D convs.

    The waveform is reshaped to ``[batch, channels, time // period, period]``
    and passed through five weight-normalized convolutions that only span
    the folded time axis, followed by an output convolution (``SANConv2d``
    when ``san`` is true).
    """

    def __init__(
        self, period: int, kernel_size: int = 5, stride: int = 3, san: bool = False
    ):
        super().__init__()
        self.period = period
        self.san = san
        kernel = (kernel_size, 1)
        pad = (get_padding(kernel_size, 1), 0)
        widths = [1, 32, 128, 512, 1024]
        # Four strided layers that widen the channels ...
        stack = [
            weight_norm(nn.Conv2d(c_in, c_out, kernel, (stride, 1), pad))
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ]
        # ... and one stride-1 layer at full width.
        stack.append(weight_norm(nn.Conv2d(1024, 1024, kernel, 1, pad)))
        self.convs = nn.ModuleList(stack)
        if san:
            self.conv_post = SANConv2d(1024, 1, (3, 1), 1, (1, 0))
        else:
            self.conv_post = weight_norm(nn.Conv2d(1024, 1, (3, 1), 1, (1, 0)))

    def forward(
        self, x: torch.Tensor, flg_san_train: bool = False
    ) -> tuple[
        Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], list[torch.Tensor]
    ]:
        """Score ``x`` (``[batch, channels, time]``) and collect feature maps.

        Returns ``(score, feature_maps)``.  ``score`` is flattened from
        dimension 1 on; with ``flg_san_train`` it is a pair
        ``(score_fun, score_dir)`` and the feature maps end with the
        unflattened ``score_fun`` branch.
        """
        feature_maps = []

        batch, channels, n_samples = x.shape
        remainder = n_samples % self.period
        if remainder != 0:
            # Reflect-pad the tail so the length divides evenly by the period.
            pad_amount = self.period - remainder
            x = F.pad(x, (0, pad_amount), "reflect")
            n_samples += pad_amount
        x = x.view(batch, channels, n_samples // self.period, self.period)

        for layer in self.convs:
            x = F.silu(layer(x), inplace=True)
            feature_maps.append(x)

        if self.san:
            x = self.conv_post(x, flg_san_train=flg_san_train)
        else:
            x = self.conv_post(x)

        if not flg_san_train:
            feature_maps.append(x)
            return torch.flatten(x, 1, -1), feature_maps

        x_fun, x_dir = x
        feature_maps.append(x_fun)
        scores = torch.flatten(x_fun, 1, -1), torch.flatten(x_dir, 1, -1)
        return scores, feature_maps


class BeatriceDiscriminatorR(nn.Module):
    """Resolution discriminator operating on an STFT magnitude spectrogram.

    ``resolution`` must be a three-item sequence
    ``(n_fft, hop_length, win_length)``.  The magnitude spectrogram is fed
    through five weight-normalized 2-D convolutions and an output
    convolution (``SANConv2d`` when ``san`` is true).
    """

    def __init__(self, resolution: int, san: bool = False):
        super().__init__()
        self.resolution = resolution
        self.san = san
        assert len(self.resolution) == 3
        # One input layer, three layers striding along the last axis, and
        # one 3x3 layer.
        layers = [weight_norm(nn.Conv2d(1, 32, (3, 9), padding=(1, 4)))]
        layers += [
            weight_norm(nn.Conv2d(32, 32, (3, 9), (1, 2), (1, 4))) for _ in range(3)
        ]
        layers.append(weight_norm(nn.Conv2d(32, 32, (3, 3), padding=(1, 1))))
        self.convs = nn.ModuleList(layers)
        if san:
            self.conv_post = SANConv2d(32, 1, (3, 3), padding=(1, 1))
        else:
            self.conv_post = weight_norm(nn.Conv2d(32, 1, (3, 3), padding=(1, 1)))

    def forward(
        self, x: torch.Tensor, flg_san_train: bool = False
    ) -> tuple[
        Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], list[torch.Tensor]
    ]:
        """Score the waveform ``x`` and collect the intermediate feature maps.

        Returns ``(score, feature_maps)``.  ``score`` is flattened from
        dimension 1 on; with ``flg_san_train`` it is a pair
        ``(score_fun, score_dir)`` and the feature maps end with the
        unflattened ``score_fun`` branch.
        """
        feature_maps = []

        # [batch, freq, frames] -> [batch, 1, freq, frames]
        x = self._spectrogram(x).unsqueeze(1)
        for layer in self.convs:
            x = F.silu(layer(x), inplace=True)
            feature_maps.append(x)

        if self.san:
            x = self.conv_post(x, flg_san_train=flg_san_train)
        else:
            x = self.conv_post(x)

        if not flg_san_train:
            feature_maps.append(x)
            return torch.flatten(x, 1, -1), feature_maps

        x_fun, x_dir = x
        feature_maps.append(x_fun)
        scores = torch.flatten(x_fun, 1, -1), torch.flatten(x_dir, 1, -1)
        return scores, feature_maps

    def _spectrogram(self, x: torch.Tensor) -> torch.Tensor:
        """Return the STFT magnitude of ``x`` using a rectangular window.

        The signal is reflect-padded by ``(n_fft - hop_length) // 2`` on
        both sides and transformed with ``center=False`` in float32, with
        autocast disabled.
        """
        n_fft, hop_length, win_length = self.resolution
        margin = (n_fft - hop_length) // 2
        padded = F.pad(x, (margin, margin), mode="reflect").squeeze(1)
        with torch.amp.autocast("cuda" if torch.cuda.is_available() else "cpu", enabled=False):
            spectrum = torch.stft(
                padded.float(),
                n_fft=n_fft,
                hop_length=hop_length,
                win_length=win_length,
                window=torch.ones(win_length, device=padded.device),
                center=False,
                return_complex=True,
            )
            magnitude = spectrum.abs()

        return magnitude


class BeatriceMultiPeriodDiscriminator(nn.Module):
    """Ensemble of resolution ("R") and period ("P") discriminators.

    Three ``BeatriceDiscriminatorR`` instances and five
    ``BeatriceDiscriminatorP`` instances are held in ``self.discriminators``;
    ``self.discriminator_names`` carries a matching label for each, used as a
    prefix for the statistics produced by ``forward_and_compute_loss``.
    """

    def __init__(self, san: bool = False):
        super().__init__()
        resolutions = [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]]
        periods = [2, 3, 5, 7, 11]

        resolution_heads = [BeatriceDiscriminatorR(r, san=san) for r in resolutions]
        period_heads = [BeatriceDiscriminatorP(p, san=san) for p in periods]
        self.discriminators = nn.ModuleList(resolution_heads + period_heads)

        names = [f"R_{n}_{h}_{w}" for n, h, w in resolutions]
        names.extend(f"P_{p}" for p in periods)
        self.discriminator_names = names
        self.san = san

    def forward(
        self, y: torch.Tensor, y_hat: torch.Tensor, flg_san_train: bool = False
    ) -> tuple[
        list[Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]],
        list[Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]],
        list[list[torch.Tensor]],
        list[list[torch.Tensor]],
    ]:
        """Run every discriminator on real (``y``) and generated (``y_hat``) audio.

        Both inputs are concatenated along dim 0 so each discriminator is
        evaluated once; its outputs are then split back into a real half and
        a generated half of ``y.size(0)`` items each.

        Returns:
            ``(real_scores, fake_scores, real_feature_maps, fake_feature_maps)``
            with one entry per discriminator. In SAN-training mode each score
            is a ``(fun, dir)`` pair.
        """
        batch_size = y.size(0)
        stacked = torch.cat([y, y_hat])

        real_scores, fake_scores = [], []
        real_fmaps, fake_fmaps = [], []
        for disc in self.discriminators:
            score, fmap = disc(stacked, flg_san_train=flg_san_train)
            if flg_san_train:
                score_fun, score_dir = score
                real_fun, fake_fun = torch.split(score_fun, batch_size)
                real_dir, fake_dir = torch.split(score_dir, batch_size)
                real_score = (real_fun, real_dir)
                fake_score = (fake_fun, fake_dir)
            else:
                real_score, fake_score = torch.split(score, batch_size)

            halves = [torch.split(fm, batch_size) for fm in fmap]
            real_scores.append(real_score)
            fake_scores.append(fake_score)
            real_fmaps.append([real_half for real_half, _ in halves])
            fake_fmaps.append([fake_half for _, fake_half in halves])
        return real_scores, fake_scores, real_fmaps, fake_fmaps

    def forward_and_compute_loss(
        self, y: torch.Tensor, y_hat: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict[str, float]]:
        """Evaluate the ensemble and accumulate the three training losses.

        Returns:
            ``(d_loss, adv_loss, fm_loss, stats)``: the discriminator loss,
            the generator's adversarial loss, the feature-matching loss (mean
            absolute difference between real and generated feature maps, with
            the real side detached) and a dict of per-discriminator scalar
            values keyed by discriminator name.
        """
        real_scores, fake_scores, real_fmaps, fake_fmaps = self(
            y, y_hat, flg_san_train=self.san
        )
        assert len(fake_scores) == len(real_scores) == len(self.discriminators)

        stats = {}
        names = self.discriminator_names
        autocast_device = "cuda" if torch.cuda.is_available() else "cpu"
        # All loss arithmetic runs in float32 with autocast switched off.
        with torch.amp.autocast(autocast_device, enabled=False):
            # Discriminator loss
            d_loss = 0.0
            for real, fake, name in zip(real_scores, fake_scores, names):
                if self.san:
                    real_fun, real_dir = (t.float() for t in real)
                    fake_fun, fake_dir = (t.float() for t in fake)
                    real_fun_term = F.softplus(1.0 - real_fun).square().mean()
                    fake_fun_term = F.softplus(fake_fun).square().mean()
                    real_dir_term = F.softplus(1.0 - real_dir).square().mean()
                    # The generated-sample "dir" term enters with a minus sign.
                    fake_dir_term = -F.softplus(1.0 - fake_dir).square().mean()
                    real_term = real_fun_term + real_dir_term
                    fake_term = fake_fun_term + fake_dir_term
                else:
                    real_term = (1.0 - real.float()).square().mean()
                    fake_term = fake.float().square().mean()
                stats[f"{name}_dr_loss"] = real_term.item()
                stats[f"{name}_dg_loss"] = fake_term.item()
                d_loss += real_term + fake_term

            # Adversarial (generator) loss
            adv_loss = 0.0
            for fake, name in zip(fake_scores, names):
                if self.san:
                    gen_term = F.softplus(1.0 - fake[0].float()).square().mean()
                else:
                    gen_term = (1.0 - fake.float()).square().mean()
                stats[f"{name}_gg_loss"] = gen_term.item()
                adv_loss += gen_term

            # Feature matching loss
            fm_loss = 0.0
            for real_maps, fake_maps, name in zip(real_fmaps, fake_fmaps, names):
                disc_fm_loss = 0.0
                for layer_idx, (real_fm, fake_fm) in enumerate(
                    zip(real_maps, fake_maps)
                ):
                    layer_l1 = (real_fm.detach().float() - fake_fm.float()).abs().mean()
                    stats[f"~{name}_fm_loss_{layer_idx}"] = layer_l1.item()
                    disc_fm_loss += layer_l1
                stats[f"{name}_fm_loss"] = disc_fm_loss.item()
                fm_loss += disc_fm_loss
        return d_loss, adv_loss, fm_loss, stats



class GradBalancer:
    """Balance the gradients that several losses send into one shared tensor.

    Adapted from https://github.com/facebookresearch/encodec/blob/main/encodec/balancer.py

    For every loss the gradient with respect to a common ``input`` tensor is
    computed, its norm is tracked with an exponential moving average, and the
    per-loss gradients are rescaled by their configured weight before the
    combined gradient is back-propagated through ``input``.

    Attributes are plain Python values; the EMA state is exposed through
    ``state_dict`` / ``load_state_dict`` so it can be checkpointed.
    """

    def __init__(
        self,
        weights: dict[str, float],
        rescale_grads: bool = True,
        total_norm: float = 1.0,
        ema_decay: float = 0.999,
        per_batch_item: bool = True,
    ):
        """Store the balancing configuration.

        Args:
            weights: Relative weight of each loss, keyed by loss name.
            rescale_grads: If true, each gradient is divided by its EMA norm
                and scaled by its weight ratio times ``total_norm``; if false
                the raw weight is used as the scale.
            total_norm: Target overall norm used when ``rescale_grads`` is true.
            ema_decay: Decay factor of the moving average of gradient norms.
            per_batch_item: If true, norms are taken over all dims except dim 0
                and then averaged; otherwise one norm over the whole tensor.
        """
        self.weights = weights
        self.per_batch_item = per_batch_item
        self.total_norm = total_norm
        self.ema_decay = ema_decay
        self.rescale_grads = rescale_grads

        # Running (decayed) sum of norms and of the matching normalizer; their
        # quotient is the bias-corrected moving average of the norm.
        self.ema_total: dict[str, float] = defaultdict(float)
        self.ema_fix: dict[str, float] = defaultdict(float)

    def backward(
        self,
        losses: dict[str, torch.Tensor],
        input: torch.Tensor,
        scaler: Optional[torch.amp.GradScaler] = None,
        skip_update_ema: bool = False,
    ) -> dict[str, float]:
        """Back-propagate the balanced combination of ``losses`` through ``input``.

        Args:
            losses: Loss tensors keyed by name; keys must exist in ``self.weights``.
            input: Tensor with respect to which the per-loss gradients are taken
                and on which ``backward`` is finally called.
            scaler: Optional gradient scaler; losses are scaled before
                differentiation and the measured gradients are unscaled again
                before their norm is taken.
            skip_update_ema: If true, the stored EMA norms are reused without
                being updated, and the weighted sum of the losses themselves
                is differentiated once instead of one gradient per loss.

        Returns:
            Logging statistics (EMA norm and its share per loss). Empty when
            ``skip_update_ema`` is true or when a non-finite gradient was found.
        """
        stats = {}
        if skip_update_ema:
            assert len(losses) == len(self.ema_total)
            ema_norms = {k: tot / self.ema_fix[k] for k, tot in self.ema_total.items()}
        else:
            # Compute d loss / d input and its norm for each loss
            norms = {}
            grads = {}
            for name, loss in losses.items():
                if scaler is not None:
                    loss = scaler.scale(loss)
                (grad,) = torch.autograd.grad(loss, [input], retain_graph=True)
                if not grad.isfinite().all():
                    # Push the non-finite gradient through as-is and stop
                    # without touching the EMA state.
                    # NOTE(review): presumably this lets the caller's
                    # GradScaler detect inf/NaN and skip the step -- confirm.
                    input.backward(grad)
                    return {}
                grad = grad.detach() / (1.0 if scaler is None else scaler.get_scale())
                if self.per_batch_item:
                    dims = tuple(range(1, grad.dim()))
                    ema_norm = grad.norm(dim=dims).mean()
                else:
                    ema_norm = grad.norm()
                norms[name] = float(ema_norm)
                grads[name] = grad

            # Update the moving average of the norms
            for key, value in norms.items():
                self.ema_total[key] = self.ema_total[key] * self.ema_decay + value
                self.ema_fix[key] = self.ema_fix[key] * self.ema_decay + 1.0
            ema_norms = {k: tot / self.ema_fix[k] for k, tot in self.ema_total.items()}

            # Record statistics for logging
            total_ema_norm = sum(ema_norms.values())
            for k, ema_norm in ema_norms.items():
                stats[f"grad_norm_value_{k}"] = ema_norm
                stats[f"grad_norm_ratio_{k}"] = ema_norm / (total_ema_norm + 1e-12)

        # Compute the relative weight ratio of each loss
        if self.rescale_grads:
            total_weights = sum([self.weights[k] for k in ema_norms])
            ratios = {k: w / total_weights for k, w in self.weights.items()}

        # Rescale and combine the gradients
        loss = 0.0
        for name, ema_norm in ema_norms.items():
            if self.rescale_grads:
                scale = ratios[name] * self.total_norm / (ema_norm + 1e-12)
            else:
                scale = self.weights[name]
            # With skip_update_ema the losses are combined (and differentiated
            # below); otherwise the already-computed gradients are combined.
            loss += (losses if skip_update_ema else grads)[name] * scale
        if scaler is not None:
            loss = scaler.scale(loss)
        if skip_update_ema:
            (loss,) = torch.autograd.grad(loss, [input])
        # At this point `loss` holds the combined gradient w.r.t. `input`.
        input.backward(loss)
        return stats

    def state_dict(self) -> dict[str, dict[str, float]]:
        """Return the EMA state as plain dicts (copies of the defaultdicts)."""
        return {
            "ema_total": dict(self.ema_total),
            "ema_fix": dict(self.ema_fix),
        }

    def load_state_dict(self, state_dict: dict[str, dict[str, float]]) -> None:
        """Restore the EMA state produced by ``state_dict``."""
        self.ema_total = defaultdict(float, state_dict["ema_total"])
        self.ema_fix = defaultdict(float, state_dict["ema_fix"])


class QualityTester(nn.Module):
    """Objective quality scorer backed by the UTMOS model from torch.hub."""

    def __init__(self):
        super().__init__()
        predictor = torch.hub.load(
            "tarepan/SpeechMOS:v1.0.0", "utmos22_strong", trust_repo=True
        )
        self.utmos = predictor.eval()

    @torch.inference_mode()
    def compute_mos(self, wav: torch.Tensor) -> dict[str, list[float]]:
        """Score ``wav`` with the UTMOS predictor, passing ``sr=16000``."""
        scores = self.utmos(wav, sr=16000)
        return {"utmos": scores.tolist()}

    def test(
        self, converted_wav: torch.Tensor, source_wav: torch.Tensor
    ) -> dict[str, list[float]]:
        """Return all metrics for one batch; ``source_wav`` is currently unused."""
        # [batch_size, wav_length]
        return dict(self.compute_mos(converted_wav))

    def test_many(
        self, converted_wavs: list[torch.Tensor], source_wavs: list[torch.Tensor]
    ) -> tuple[dict[str, float], dict[str, list[float]]]:
        """Score several batches.

        Returns:
            ``(means, per_item)``: the mean of each metric over every item of
            every batch, and the concatenated per-item values per metric.
        """
        # list[batch_size, wav_length]
        assert len(converted_wavs) == len(source_wavs)
        per_metric = defaultdict(list)
        for converted, source in zip(converted_wavs, source_wavs):
            for metric_name, values in self.test(converted, source).items():
                per_metric[metric_name].extend(values)
        means = {
            metric_name: sum(values) / len(values)
            for metric_name, values in per_metric.items()
        }
        return means, per_metric


def compute_grad_norm(
    model: nn.Module, return_stats: bool = False
) -> Union[float, tuple[float, dict[str, float]]]:
    """Compute the global L2 norm of all parameter gradients of ``model``.

    Args:
        model: Module whose ``named_parameters()`` are inspected. Parameters
            without a gradient are ignored.
        return_stats: When true, also return the norm of every individual
            parameter gradient.

    Returns:
        The total norm as a float, or ``(total_norm, stats)`` when
        ``return_stats`` is true, where ``stats`` maps
        ``"grad_norm_<parameter name>"`` to that parameter's gradient norm.
    """
    squared_total = 0.0
    stats: dict[str, float] = {}
    for name, param in model.named_parameters():
        if param.grad is None:
            continue
        grad = param.grad.detach()
        param_norm = grad.norm().item()
        if not math.isfinite(param_norm):
            # Retry in float32 in case the non-finite value comes from the
            # norm overflowing in the gradient's own (narrower) dtype.
            param_norm = grad.float().norm().item()
        squared_total += param_norm * param_norm
        if return_stats:
            stats[f"grad_norm_{name}"] = param_norm
    total_norm = math.sqrt(squared_total)
    if return_stats:
        return total_norm, stats
    return total_norm


def compute_mean_f0(
    files: list[Path], method: Literal["dio", "harvest"] = "dio"
) -> float:
    """Return the geometric mean of the voiced F0 values over ``files``.

    Each file is loaded with ``beatrice_load_audio`` and analysed with the
    selected pyworld estimator; frames whose F0 is not positive are dropped
    and the remaining values are averaged in the log domain.

    Returns:
        ``exp(mean(log f0))``, or NaN when no positive-F0 frame was found
        (including when ``files`` is empty).

    Raises:
        ValueError: If ``method`` is neither ``"dio"`` nor ``"harvest"``
            (checked per file, after it has been loaded).
    """
    log_f0_total = 0.0
    voiced_frames = 0
    for path in files:
        wav, sr = beatrice_load_audio(path)
        if method == "dio":
            estimate_f0 = pyworld.dio
        elif method == "harvest":
            estimate_f0 = pyworld.harvest
        else:
            raise ValueError(f"Invalid method: {method}")
        f0, _ = estimate_f0(wav.ravel().numpy().astype(np.float64), sr)
        voiced_f0 = f0[f0 > 0]
        log_f0_total += float(np.log(voiced_f0).sum())
        voiced_frames += len(voiced_f0)

    if voiced_frames == 0:
        return math.nan
    return math.exp(log_f0_total / voiced_frames)



def get_resampler(
    sr_before: int, sr_after: int, device="cpu", cache={}
) -> torchaudio.transforms.Resample:
    """Return a memoized ``torchaudio.transforms.Resample`` module.

    The mutable default ``cache`` is intentional: it persists between calls
    and is keyed by ``(sr_before, sr_after, str(device))`` so each resampler
    is built and moved to its device only once.
    """
    device_name = device if isinstance(device, str) else str(device)
    key = (sr_before, sr_after, device_name)
    if key not in cache:
        resampler = torchaudio.transforms.Resample(sr_before, sr_after)
        cache[key] = resampler.to(device_name)
    return cache[key]


def convolve(signal: torch.Tensor, ir: torch.Tensor) -> torch.Tensor:
    """Convolve ``signal`` with ``ir`` along the last dim using the FFT.

    The FFT size is a power of two large enough to hold the full linear
    convolution; the result is truncated to the length of ``signal``.
    """
    signal_length = signal.size(-1)
    fft_size = 1 << (signal_length + ir.size(-1) - 2).bit_length()
    product = torch.fft.rfft(signal, n=fft_size) * torch.fft.rfft(ir, n=fft_size)
    full = torch.fft.irfft(product, n=fft_size)
    return full[..., :signal_length]


def random_formant_shift(
    wav: torch.Tensor,
    sample_rate: int,
    formant_shift_semitone_min: float = -3.0,
    formant_shift_semitone_max: float = 3.0,
) -> torch.Tensor:
    """Warp the spectral envelope of ``wav`` by a random amount.

    A WORLD (pyworld) spectral envelope is estimated, stretched along the
    frequency axis by a ratio drawn uniformly (in semitones) from the given
    range, and the ratio between the stretched and original envelopes is
    applied as a per-bin gain to the STFT of ``wav`` before resynthesis.

    Args:
        wav: Tensor of shape ``[1, wav_length]`` (enforced by assertions).
        sample_rate: Sampling rate handed to pyworld.
        formant_shift_semitone_min: Lower bound of the random shift.
        formant_shift_semitone_max: Upper bound of the random shift.

    Returns:
        The result of ``torch.istft`` with ``length=wav.size(-1)``.
    """
    assert wav.ndim == 2
    assert wav.size(0) == 1

    device = wav.device
    hop_length = 256
    frame_period_ms = hop_length * 1000 / sample_rate

    # [wav_length]
    samples = wav.ravel().double().cpu().numpy()
    coarse_f0, frame_times = pyworld.dio(
        samples,
        sample_rate,
        f0_floor=55,
        f0_ceil=1400,
        frame_period=frame_period_ms,
    )
    refined_f0 = pyworld.stonemask(samples, coarse_f0, frame_times, sample_rate)
    envelope_np = pyworld.cheaptrick(samples, refined_f0, frame_times, sample_rate)
    # Square root of the cheaptrick output -> [1, length, n_fft // 2 + 1]
    envelope = torch.from_numpy(envelope_np).float().to(device).sqrt_().unsqueeze(0)

    # Derive the STFT size from the envelope so both share the same bin count.
    n_fft = win_length = (envelope.size(2) - 1) * 2
    n_bins = n_fft // 2 + 1
    window = torch.hann_window(win_length, device=device)

    # [1, n_fft // 2 + 1, length]
    spectrum = torch.stft(
        wav,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        return_complex=True,
    )
    assert envelope.size(1) == spectrum.size(2), (envelope.size(), spectrum.size())
    assert envelope.size(2) == spectrum.size(1), (envelope.size(), spectrum.size())

    semitone_span = formant_shift_semitone_max - formant_shift_semitone_min
    shift_semitones = torch.rand(()).item() * semitone_span + formant_shift_semitone_min
    shift_ratio = 2.0 ** (shift_semitones / 12.0)
    warped = F.interpolate(
        envelope, scale_factor=shift_ratio, mode="linear", align_corners=True
    )

    # Bring the stretched envelope back to exactly n_bins frequency bins.
    missing_bins = n_bins - warped.size(2)
    if missing_bins < 0:
        warped = warped[:, :, :n_bins]
    elif missing_bins > 0:
        warped = F.pad(warped, (0, missing_bins))

    gain = ((warped + 1e-5) / (envelope + 1e-5)).clamp(0.1, 10.0)
    spectrum *= gain.transpose(-2, -1)  # [1, n_fft // 2 + 1, length]

    return torch.istft(
        spectrum,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        length=wav.size(-1),
    )


def random_filter(audio: torch.Tensor) -> torch.Tensor:
    """Filter each row of ``audio`` with its own random second-order filter.

    Six coefficients per row are drawn uniformly from ``[-0.375, 0.375)``;
    the leading coefficient of each three-tap half is then fixed to 1 and the
    two halves are passed to ``torchaudio.functional.lfilter`` without
    clamping.
    """
    assert audio.ndim == 2
    coefficients = torch.rand(audio.size(0), 6) * 0.75 - 0.375
    coefficients[:, 0] = 1.0
    coefficients[:, 3] = 1.0
    first_half = coefficients[:, :3]
    second_half = coefficients[:, 3:]
    return torchaudio.functional.lfilter(audio, first_half, second_half, clamp=False)


def get_noise(
    n_samples: int, sample_rate: float, files: list[Union[str, bytes, os.PathLike]]
) -> torch.Tensor:
    """Assemble a ``[1, n_samples]`` noise excerpt from randomly chosen files.

    Files are drawn (with replacement) until at least ``n_samples`` samples
    have been collected. Each one is resampled to ``sample_rate`` times a
    random factor from a small candidate set, passed through
    ``random_filter`` and peak-normalized to about 0.99. A random window of
    ``n_samples`` is then cut from the concatenation.
    """
    resample_augmentation_candidates = [0.9, 0.95, 1.0, 1.05, 1.1]
    pieces = []
    collected = 0
    while collected < n_samples:
        path = files[torch.randint(0, len(files), ())]
        wav, sr = beatrice_load_audio(path)
        assert wav.size(0) == 1
        rate_factor = resample_augmentation_candidates[
            torch.randint(0, len(resample_augmentation_candidates), ())
        ]
        augmented_sample_rate = int(round(sample_rate * rate_factor))
        wav = random_filter(get_resampler(sr, augmented_sample_rate)(wav))
        wav *= 0.99 / (wav.abs().max() + 1e-5)
        pieces.append(wav)
        collected += wav.size(1)

    start = torch.randint(0, collected - n_samples + 1, ())
    noise = torch.cat(pieces, dim=1)[:, start : start + n_samples]
    assert noise.size() == (1, n_samples), noise.size()
    return noise


def get_butterworth_lpf(
    cutoff_freq: float, sample_rate: int, cache={}
) -> tuple[torch.Tensor, torch.Tensor]:
    """Return memoized second-order low-pass coefficients ``(b, a)``.

    The design uses ``Q = sqrt(0.5)``. ``b`` is ``[b0, b1, b0]`` and ``a`` is
    ``[1, a1, a2]``, both already divided by the leading denominator term.
    The mutable default ``cache`` is intentional and keyed by
    ``(cutoff_freq, sample_rate)``.
    """
    key = (cutoff_freq, sample_rate)
    if key in cache:
        return cache[key]

    q = math.sqrt(0.5)
    omega = math.tau * cutoff_freq / sample_rate
    cos_omega = math.cos(omega)
    alpha = math.sin(omega) / (2.0 * q)
    a0 = 1.0 + alpha  # common normalizer

    b1 = (1.0 - cos_omega) / a0
    b0 = b1 * 0.5
    a1 = -2.0 * cos_omega / a0
    a2 = (1.0 - alpha) / a0

    coefficients = (torch.tensor([b0, b1, b0]), torch.tensor([1.0, a1, a2]))
    cache[key] = coefficients
    return coefficients


def augment_audio(
    clean: torch.Tensor,
    sample_rate: int,
    noise_files: list[Union[str, bytes, os.PathLike]],
    ir_files: list[Union[str, bytes, os.PathLike]],
    snr_candidates: list[float] = [20.0, 25.0, 30.0, 35.0, 40.0, 45.0],
    formant_shift_probability: float = 0.5,
    formant_shift_semitone_min: float = -3.0,
    formant_shift_semitone_max: float = 3.0,
    reverb_probability: float = 0.5,
    lpf_probability: float = 0.2,
    lpf_cutoff_freq_candidates: list[float] = [2000.0, 3000.0, 4000.0, 6000.0],
) -> torch.Tensor:
    """Degrade a clean waveform with random filtering, reverb, LPF and noise.

    Steps, each applied with the stated probability where relevant:
    random formant shift of the clean signal, independent random filters on
    clean and noise, convolution with a randomly chosen impulse response,
    a shared low-pass filter, restoration of the clean signal's original RMS,
    and finally additive noise at a randomly chosen SNR.

    Args:
        clean: Waveform of shape ``[1, wav_length]``.
        sample_rate: Sampling rate of ``clean``; impulse responses must match.
        noise_files: Candidate noise recordings (see ``get_noise``).
        ir_files: Candidate impulse responses; each must load as ``[2, sr]``
            (one row for the clean signal, one for the noise).
        snr_candidates: SNR values in dB to choose from. When empty, no noise
            is mixed in and the processed clean signal is returned.
        formant_shift_probability: Probability of the formant shift.
        formant_shift_semitone_min: Lower bound of the formant shift.
        formant_shift_semitone_max: Upper bound of the formant shift.
        reverb_probability: Probability of applying an impulse response.
        lpf_probability: Probability of applying the low-pass filter.
        lpf_cutoff_freq_candidates: Cut-off frequencies to choose from.

    Returns:
        A 1-D tensor with ``wav_length`` samples.
    """
    # [1, wav_length]
    assert clean.size(0) == 1
    n_samples = clean.size(1)

    original_clean_rms = clean.square().mean().sqrt_()

    # Apply a random formant shift to clean
    if torch.rand(()) < formant_shift_probability:
        clean = random_formant_shift(
            clean, sample_rate, formant_shift_semitone_min, formant_shift_semitone_max
        )

    # Fetch noise and stack it with clean -> [2, wav_length]
    noise = get_noise(n_samples, sample_rate, noise_files)
    signals = torch.cat([clean, noise])

    # Apply different random filters to clean and noise
    signals = random_filter(signals)

    # Apply reverb to clean and noise
    if torch.rand(()) < reverb_probability:
        ir_file = ir_files[torch.randint(0, len(ir_files), ())]
        ir, sr = beatrice_load_audio(ir_file)
        assert ir.size() == (2, sr), ir.size()
        assert sr == sample_rate, (sr, sample_rate)
        signals = convolve(signals, ir)

    # Apply the same LPF to clean and noise
    if torch.rand(()) < lpf_probability:
        # Leave some headroom before filtering when the peak is high.
        if signals.abs().max() > 0.8:
            signals /= signals.abs().max() * 1.25
        cutoff_freq = lpf_cutoff_freq_candidates[
            torch.randint(0, len(lpf_cutoff_freq_candidates), ())
        ]
        b, a = get_butterworth_lpf(cutoff_freq, sample_rate)
        signals = torchaudio.functional.lfilter(signals, a, b, clamp=False)

    # Restore the original level of clean
    clean, noise = signals
    clean_rms = clean.square().mean().sqrt_()
    clean *= original_clean_rms / clean_rms

    if len(snr_candidates) == 0:
        # Previously `noisy` was never bound on this path and the function
        # failed with UnboundLocalError; with no SNR to pick, skip the mix.
        return clean

    # Measure the levels of clean and noise with emphasis on peaks
    # (fourth root of the mean fourth power)
    clean_level = clean.square().square_().mean().sqrt_().sqrt_()
    noise_level = noise.square().square_().mean().sqrt_().sqrt_()
    # SNR
    snr = snr_candidates[torch.randint(0, len(snr_candidates), ())]
    # Build the noisy signal
    noisy = clean + noise * (
        0.1 ** (snr / 20.0) * clean_level / (noise_level + 1e-5)
    )

    return noisy


class WavDataset(torch.utils.data.Dataset):
    """Dataset yielding (clean target, possibly degraded input) waveform pairs.

    ``__getitem__`` returns a whole, zero-padded utterance; ``collate`` then
    cuts a random fixed-length window out of every item and stacks the batch.
    The clean waveform is kept at ``out_sample_rate`` and the input waveform
    at ``in_sample_rate``.
    """

    def __init__(
        self,
        audio_files: list[tuple[Path, int]],
        in_sample_rate: int = 16000,
        out_sample_rate: int = 24000,
        wav_length: int = 4 * 24000,  # 4s
        segment_length: int = 100,  # 1s
        noise_files: Optional[list[Union[str, bytes, os.PathLike]]] = None,
        ir_files: Optional[list[Union[str, bytes, os.PathLike]]] = None,
        augmentation_snr_candidates: list[float] = [20.0, 25.0, 30.0, 35.0, 40.0, 45.0],
        augmentation_formant_shift_probability: float = 0.5,
        augmentation_formant_shift_semitone_min: float = -3.0,
        augmentation_formant_shift_semitone_max: float = 3.0,
        augmentation_reverb_probability: float = 0.5,
        augmentation_lpf_probability: float = 0.2,
        augmentation_lpf_cutoff_freq_candidates: list[float] = [
            2000.0,
            3000.0,
            4000.0,
            6000.0,
        ],
    ):
        """Store the configuration.

        Args:
            audio_files: ``(path, speaker_id)`` pairs.
            in_sample_rate: Sampling rate of the model input waveform.
            out_sample_rate: Sampling rate of the clean target waveform.
            wav_length: Window length in samples at ``out_sample_rate``.
            segment_length: Slice length in frames of ``out_hop_length`` samples.
            noise_files: Noise recordings for augmentation, or ``None``.
            ir_files: Impulse responses for augmentation, or ``None``.
            augmentation_*: Forwarded unchanged to ``augment_audio``.

        Raises:
            ValueError: If exactly one of ``noise_files`` / ``ir_files`` is ``None``.
        """
        self.audio_files = audio_files
        self.in_sample_rate = in_sample_rate
        self.out_sample_rate = out_sample_rate
        self.wav_length = wav_length
        self.segment_length = segment_length
        self.noise_files = noise_files
        self.ir_files = ir_files
        self.augmentation_snr_candidates = augmentation_snr_candidates
        self.augmentation_formant_shift_probability = (
            augmentation_formant_shift_probability
        )
        self.augmentation_formant_shift_semitone_min = (
            augmentation_formant_shift_semitone_min
        )
        self.augmentation_formant_shift_semitone_max = (
            augmentation_formant_shift_semitone_max
        )
        self.augmentation_reverb_probability = augmentation_reverb_probability
        self.augmentation_lpf_probability = augmentation_lpf_probability
        self.augmentation_lpf_cutoff_freq_candidates = (
            augmentation_lpf_cutoff_freq_candidates
        )

        # Augmentation needs both asset lists or neither.
        if (noise_files is None) is not (ir_files is None):
            raise ValueError("noise_files and ir_files must be both None or not None")

        self.in_hop_length = in_sample_rate // 100
        self.out_hop_length = out_sample_rate // 100  # 10 ms steps

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor, int, float]:
        """Load one utterance and build its clean / input waveform pair.

        Returns:
            ``(clean_wav, noisy_wav_16k, speaker_id, formant_shift)`` where
            ``clean_wav`` is at ``out_sample_rate`` and padded with
            ``wav_length`` zeros on both sides, ``noisy_wav_16k`` is its
            version at ``in_sample_rate`` (augmented when noise/IR files were
            given) and ``formant_shift`` is the randomly chosen shift in
            semitones that was folded into the resampling ratio.
        """
        file, speaker_id = self.audio_files[index]
        clean_wav, sample_rate = beatrice_load_audio(file)
        # Multi-channel files: keep one randomly chosen channel.
        if clean_wav.size(0) != 1:
            ch = torch.randint(0, clean_wav.size(0), ())
            clean_wav = clean_wav[ch : ch + 1]

        formant_shift_candidates = [-2.0, -1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0]
        formant_shift = formant_shift_candidates[
            torch.randint(0, len(formant_shift_candidates), ()).item()
        ]

        # Resample to out_sample_rate while additionally stretching by
        # 2 ** (formant_shift / 12); the ratio is approximated by a fraction
        # with denominator <= 300 and used as the (from, to) rate pair.
        resampler_fraction = Fraction(
            sample_rate / self.out_sample_rate * 2.0 ** (formant_shift / 12.0)
        ).limit_denominator(300)
        clean_wav = get_resampler(
            resampler_fraction.numerator, resampler_fraction.denominator
        )(clean_wav)

        assert clean_wav.size(0) == 1
        assert clean_wav.size(1) != 0

        # Zero-pad so that collate can place a window around any sample.
        clean_wav = F.pad(clean_wav, (self.wav_length, self.wav_length))

        if self.noise_files is None:
            noisy_wav_16k = get_resampler(self.out_sample_rate, self.in_sample_rate)(
                clean_wav
            )
        else:
            clean_wav_16k = get_resampler(self.out_sample_rate, self.in_sample_rate)(
                clean_wav
            )
            noisy_wav_16k = augment_audio(
                clean_wav_16k,
                self.in_sample_rate,
                self.noise_files,
                self.ir_files,
                self.augmentation_snr_candidates,
                self.augmentation_formant_shift_probability,
                self.augmentation_formant_shift_semitone_min,
                self.augmentation_formant_shift_semitone_max,
                self.augmentation_reverb_probability,
                self.augmentation_lpf_probability,
                self.augmentation_lpf_cutoff_freq_candidates,
            )

        clean_wav = clean_wav.squeeze_(0)
        noisy_wav_16k = noisy_wav_16k.squeeze_(0)

        # Randomize the volume: peak of clean_wav drawn from [0.1, 0.999)
        amplitude = torch.rand(()).item() * 0.899 + 0.1
        factor = amplitude / clean_wav.abs().max()
        clean_wav *= factor
        noisy_wav_16k *= factor
        # Halve both signals until the input no longer reaches full scale.
        while noisy_wav_16k.abs().max() >= 1.0:
            clean_wav *= 0.5
            noisy_wav_16k *= 0.5

        return clean_wav, noisy_wav_16k, speaker_id, formant_shift

    def __len__(self) -> int:
        """Return the number of ``(file, speaker_id)`` entries."""
        return len(self.audio_files)

    def collate(
        self, batch: list[tuple[torch.Tensor, torch.Tensor, int, float]]
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Cut a random aligned window from every item and stack the batch.

        For each item a non-zero sample of the clean waveform is picked, a
        slice of ``segment_length`` frames is centered on it, and a window of
        ``wav_length`` samples containing that slice is cut from the clean
        waveform together with the corresponding window of the input waveform.

        Returns:
            ``(clean_wavs, noisy_wavs, slice_starts, speaker_ids,
            formant_shifts)``; ``slice_starts`` gives the position of the
            slice inside each window in frames.
        """
        assert self.wav_length % self.out_hop_length == 0
        length = self.wav_length // self.out_hop_length  # window length in frames
        clean_wavs = []
        noisy_wavs = []
        slice_starts = []
        speaker_ids = []
        formant_shifts = []
        for clean_wav, noisy_wav, speaker_id, formant_shift in batch:
            # Pick one voiced (non-zero) sample position at random
            (voiced,) = clean_wav.nonzero(as_tuple=True)
            assert voiced.numel() != 0
            center = voiced[torch.randint(0, voiced.numel(), ()).item()].item()
            # Choose the slice interval so that this position is at its center
            slice_start = center - self.segment_length * self.out_hop_length // 2
            assert slice_start >= 0
            # Randomly cut out wav_length samples so that the slice interval
            # is contained in the window
            r = torch.randint(0, length - self.segment_length + 1, ()).item()
            offset = slice_start - r * self.out_hop_length
            clean_wavs.append(clean_wav[offset : offset + self.wav_length])
            # Same window start expressed in input-rate samples.
            offset_in_sample_rate = int(
                round(offset * self.in_sample_rate / self.out_sample_rate)
            )
            noisy_wavs.append(
                noisy_wav[
                    offset_in_sample_rate : offset_in_sample_rate
                    + length * self.in_hop_length
                ]
            )
            # From here on slice_start is relative to the window, in frames.
            slice_start = r
            slice_starts.append(slice_start)
            speaker_ids.append(speaker_id)
            formant_shifts.append(formant_shift)
        clean_wavs = torch.stack(clean_wavs)
        noisy_wavs = torch.stack(noisy_wavs)
        slice_starts = torch.tensor(slice_starts)
        speaker_ids = torch.tensor(speaker_ids)
        formant_shifts = torch.tensor(formant_shifts)
        return (
            clean_wavs,  # [batch_size, wav_length]
            noisy_wavs,  # [batch_size, length * in_hop_length]
            slice_starts,  # Long[batch_size]
            speaker_ids,  # Long[batch_size]
            formant_shifts,  # Float[batch_size] (semitones)
        )



# Lower-case file extensions (with leading dot) treated as audio files.
AUDIO_FILE_SUFFIXES = {
    ".aif", ".aiff", ".wav",
    ".fla", ".flac",
    ".oga", ".ogg", ".opus",
    ".mp3",
}


def get_compressed_optimizer_state_dict(
    optimizer: torch.optim.Optimizer,
) -> dict:
    """Return ``optimizer.state_dict()`` with state tensors cast to bfloat16.

    Only the tensors nested under the ``"state"`` entry are converted; every
    other top-level entry and every non-tensor state value is passed through
    unchanged. Each converted tensor is asserted to be finite.
    """

    def _to_bfloat16(value):
        if not isinstance(value, torch.Tensor):
            return value
        compact = value.bfloat16()
        assert compact.isfinite().all()
        return compact

    compressed = {}
    for section, payload in optimizer.state_dict().items():
        if section == "state":
            compressed[section] = {
                param_key: {
                    slot: _to_bfloat16(value) for slot, value in param_state.items()
                }
                for param_key, param_state in payload.items()
            }
        else:
            compressed[section] = payload
    return compressed


def get_decompressed_optimizer_state_dict(compressed_state_dict: dict) -> dict:
    """Return a copy of an optimizer state dict with state tensors as float32.

    Only the tensors nested under the ``"state"`` entry are converted; every
    other top-level entry and every non-tensor state value is passed through
    unchanged. Each converted tensor is asserted to be finite.
    """

    def _to_float32(value):
        if not isinstance(value, torch.Tensor):
            return value
        restored = value.float()
        assert restored.isfinite().all()
        return restored

    decompressed = {}
    for section, payload in compressed_state_dict.items():
        if section == "state":
            decompressed[section] = {
                param_key: {
                    slot: _to_float32(value) for slot, value in param_state.items()
                }
                for param_key, param_state in payload.items()
            }
        else:
            decompressed[section] = payload
    return decompressed




# ============================================================
# BEATRICE V2 TRAINING - Embedded (downloads assets from HuggingFace)
# ============================================================

# Lower-case file extensions (with leading dot) accepted as Beatrice training audio.
BEATRICE_AUDIO_FILE_SUFFIXES = {
    ".aif", ".aiff", ".wav",
    ".fla", ".flac",
    ".oga", ".ogg", ".opus",
    ".mp3",
}


def preprocess_audio_for_beatrice(audio_path: str, output_dir: str, speaker_name: str = "speaker"):
    """Cut one recording into fixed-length training chunks for Beatrice.

    The audio is loaded as 16 kHz mono and scanned with a 4 s window advanced
    in 3.5 s steps (0.5 s overlap). Windows whose RMS is above a fixed
    threshold are peak-normalized to 0.9 and written as
    ``<output_dir>/<speaker_name>/<speaker_name>_NNNN.wav``; quieter windows
    are skipped. Audio shorter than one window produces no chunks.

    Args:
        audio_path: Path of the source recording.
        output_dir: Root directory of the dataset; a sub-directory named
            ``speaker_name`` is created inside it.
        speaker_name: Name of the speaker directory and chunk-file prefix.

    Returns:
        ``(chunks_saved, output_dir)``.
    """
    # Create speaker directory structure required by Beatrice
    speaker_dir = os.path.join(output_dir, speaker_name)
    os.makedirs(speaker_dir, exist_ok=True)

    # Load audio at 16kHz (Beatrice input sample rate)
    audio, sr = librosa.load(audio_path, sr=16000, mono=True)

    chunk_size = int(4.0 * sr)  # 4 second chunks
    hop = int(3.5 * sr)  # 0.5s overlap
    threshold = 0.01  # RMS threshold

    chunks_saved = 0
    # "+ 1" makes the last valid start position inclusive, so a window that
    # ends exactly on the final sample (e.g. a recording of exactly 4 s) is
    # no longer dropped.
    for start in range(0, len(audio) - chunk_size + 1, hop):
        chunk = audio[start:start + chunk_size]
        rms = np.sqrt(np.mean(chunk ** 2))
        if rms <= threshold:  # Skip silence
            continue

        # Normalize
        max_val = np.abs(chunk).max()
        if max_val > 0:
            chunk = chunk / max_val * 0.9

        chunk_path = os.path.join(speaker_dir, f"{speaker_name}_{chunks_saved:04d}.wav")
        sf.write(chunk_path, chunk, sr)
        chunks_saved += 1

    logger.info(f"Beatrice preprocessing: {chunks_saved} chunks saved to {speaker_dir}")
    return chunks_saved, output_dir


def train_beatrice_generator(
    data_dir: str,
    output_dir: str,
    epochs: int = 30,
    batch_size: int = 8,
    lr_g: float = 5e-5,
    lr_d: float = 5e-5,
    use_augmentation: bool = False,
    resume: bool = False,
    progress_callback=None,
):
    """Train a Beatrice v2 model, reporting progress through a generator.

    Every yielded item is a ``(message, model_path)`` tuple. ``model_path`` is
    ``None`` for plain status messages; it carries the checkpoint path on
    "Saved checkpoint" items and on the final "Training complete!" item
    (where it is ``None`` if no checkpoint was ever written).

    Args:
        data_dir: Dataset root. Each immediate sub-directory is one speaker;
            audio files inside it are collected recursively by suffix
            (see BEATRICE_AUDIO_FILE_SUFFIXES).
        output_dir: Directory for ``checkpoint_<step>.pt.gz`` files and the
            ``checkpoint_latest.pt.gz`` copy; created if missing.
        epochs: Number of epochs; each runs
            ``max(1, n_files // batch_size)`` optimizer steps.
        batch_size: Requested batch size; the DataLoader uses
            ``min(batch_size, n_files)``.
        lr_g: AdamW learning rate for the generator.
        lr_d: AdamW learning rate for the discriminator.
        use_augmentation: If True, try to download noise / impulse-response
            assets and pass them to the dataset. A download error is reported
            as a warning message and training continues.
        resume: If True and ``output_dir/checkpoint_latest.pt.gz`` exists,
            restore models, optimizers and iteration count from it; otherwise
            fall back to the pretrained weights.
        progress_callback: Optional callable invoked after every step with
            ``step / total_steps``.

    Yields:
        ``(message, model_path)`` tuples as described above. If no speaker
        directory contains audio, a single error message is yielded and the
        generator ends.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Download pretrained
    yield "Downloading pretrained models...", None
    phone_extractor_path = download_beatrice_asset("phone_extractor")
    pitch_estimator_path = download_beatrice_asset("pitch_estimator")
    pretrained_model_path = download_beatrice_asset("pretrained_model")

    # Discover speakers from directory structure
    # Expected: data_dir/speaker_name/*.wav
    # Speaker ids are assigned in sorted directory order, skipping directories
    # that contain no audio files.
    speakers = []
    training_filelist = []
    speaker_audio_files = []
    for speaker_dir in sorted(Path(data_dir).iterdir()):
        if not speaker_dir.is_dir():
            continue
        candidates = [f for f in sorted(speaker_dir.rglob("*"))
                      if f.is_file() and f.suffix.lower() in BEATRICE_AUDIO_FILE_SUFFIXES]
        if candidates:
            speaker_id = len(speakers)
            speakers.append(speaker_dir.name)
            training_filelist.extend([(f, speaker_id) for f in candidates])
            speaker_audio_files.append(candidates)

    n_speakers = len(speakers)
    if n_speakers == 0:
        yield "Error: No speakers found in data directory", None
        return

    yield f"Found {n_speakers} speaker(s), {len(training_filelist)} files", None

    # Augmentation assets (optional)
    # Both lists stay None unless noise AND impulse-response files are found.
    noise_files = None
    ir_files = None
    if use_augmentation:
        try:
            noise_dir, ir_dir = download_beatrice_augmentation()
            if noise_dir and ir_dir:
                noise_files = sorted(list(Path(noise_dir).rglob("*.wav")) + list(Path(noise_dir).rglob("*.flac")))
                ir_files = sorted(list(Path(ir_dir).rglob("*.wav")) + list(Path(ir_dir).rglob("*.flac")))
                if noise_files and ir_files:
                    yield f"Loaded augmentation: {len(noise_files)} noise, {len(ir_files)} IR files", None
                else:
                    noise_files = None
                    ir_files = None
        except Exception as e:
            yield f"Warning: Could not load augmentation assets: {e}", None

    # Build models
    yield "Building models...", None

    # The phone extractor and pitch estimator are frozen (eval mode, no grads).
    phone_extractor = PhoneExtractor().to(device).eval().requires_grad_(False)
    pe_ckpt = torch.load(phone_extractor_path, map_location="cpu", weights_only=True)
    phone_extractor.load_state_dict(pe_ckpt["phone_extractor"], strict=False)
    del pe_ckpt

    pitch_estimator = PitchEstimator().to(device).eval().requires_grad_(False)
    pi_ckpt = torch.load(pitch_estimator_path, map_location="cpu", weights_only=True)
    pitch_estimator.load_state_dict(pi_ckpt["pitch_estimator"])
    del pi_ckpt

    # These two values are also written into the checkpoint's "h" dict below.
    hidden_channels = 256
    pitch_bins = 448

    net_g = ConverterNetwork(
        phone_extractor, pitch_estimator,
        n_speakers=n_speakers,
        pitch_bins=pitch_bins,
        hidden_channels=hidden_channels,
        vq_topk=4,
        training_time_vq="none",
        phone_noise_ratio=0.5,
        floor_noise_level=1e-3,
    ).to(device)

    net_d = BeatriceMultiPeriodDiscriminator(san=True).to(device)

    # Optimizers
    optim_g = torch.optim.AdamW(net_g.parameters(), lr_g, betas=(0.8, 0.99), eps=1e-6)
    optim_d = torch.optim.AdamW(net_d.parameters(), lr_d, betas=(0.8, 0.99), eps=1e-6)

    # Mixed-precision loss scaling is only enabled on CUDA.
    grad_scaler = torch.amp.GradScaler(device.type, enabled=device.type == "cuda")
    grad_balancer = GradBalancer(
        weights={
            "loss_loudness": 1.0,
            "loss_mel": 45.0,
            "loss_adv": 1.0,
            "loss_fm": 2.0,
        },
        ema_decay=0.999,
    )

    initial_iteration = 0
    os.makedirs(output_dir, exist_ok=True)

    # Load pretrained or resume
    if resume:
        latest_ckpt = os.path.join(output_dir, "checkpoint_latest.pt.gz")
        if os.path.isfile(latest_ckpt):
            yield "Resuming from checkpoint...", None
            with gzip.open(latest_ckpt, "rb") as f:
                ckpt = torch.load(f, map_location="cpu", weights_only=True)
            net_g.load_state_dict(ckpt["net_g"], strict=False)
            # Filter discriminator for shape mismatches
            net_d_state = net_d.state_dict()
            filtered_d = {k: v for k, v in ckpt["net_d"].items()
                          if k in net_d_state and v.shape == net_d_state[k].shape}
            net_d.load_state_dict(filtered_d, strict=False)
            optim_g.load_state_dict(get_decompressed_optimizer_state_dict(ckpt["optim_g"]))
            optim_d.load_state_dict(get_decompressed_optimizer_state_dict(ckpt["optim_d"]))
            if "grad_balancer" in ckpt:
                grad_balancer.load_state_dict(ckpt["grad_balancer"])
            if "grad_scaler" in ckpt:
                grad_scaler.load_state_dict(ckpt["grad_scaler"])
            initial_iteration = ckpt.get("iteration", 0)
            del ckpt
        else:
            yield "No checkpoint found, starting fresh with pretrained", None
            # Clearing the flag routes execution into the pretrained branch below.
            resume = False

    if not resume:
        yield "Loading pretrained weights...", None
        with gzip.open(pretrained_model_path, "rb") as f:
            pretrained_ckpt = torch.load(f, map_location="cpu", weights_only=True)
        # Adapt pretrained for our n_speakers
        # Row 0 of each per-speaker tensor is replicated n_speakers times.
        initial_speaker_emb = pretrained_ckpt["net_g"]["embed_speaker.weight"][:1]
        pretrained_ckpt["net_g"]["embed_speaker.weight"] = initial_speaker_emb[[0] * n_speakers]
        initial_kv_emb = pretrained_ckpt["net_g"]["key_value_speaker_embedding.weight"][:1]
        pretrained_ckpt["net_g"]["key_value_speaker_embedding.weight"] = initial_kv_emb[[0] * n_speakers]
        pretrained_ckpt["net_g"]["vq.codebooks"] = pretrained_ckpt["net_g"]["vq.codebooks"][[0] * n_speakers]
        net_g.load_state_dict(pretrained_ckpt["net_g"], strict=False)
        # Filter discriminator state dict for shape mismatches (pretrained may use san=False)
        net_d_state = net_d.state_dict()
        filtered_d = {k: v for k, v in pretrained_ckpt["net_d"].items()
                      if k in net_d_state and v.shape == net_d_state[k].shape}
        net_d.load_state_dict(filtered_d, strict=False)
        logger.info(f"Loaded {len(filtered_d)}/{len(pretrained_ckpt['net_d'])} discriminator weights")
        # Don't load grad_balancer/grad_scaler from pretrained - our loss weights may differ
        # These will be re-initialized fresh for fine-tuning
        del pretrained_ckpt

    # Build VQ codebooks
    yield "Building VQ codebooks...", None

    def wav_iterator(files):
        # Lazily yields each file as a 16 kHz waveform on `device`, with an
        # extra axis inserted at position 1.
        for file in files:
            wav, sr = beatrice_load_audio(file)
            wav = wav.to(device)
            if sr != 16000:
                wav = get_resampler(sr, 16000, str(device))(wav)
            yield wav[:, None, :]

    # On resume the codebooks come from the checkpoint, so only the hook is
    # enabled; on a fresh run they are initialized from each speaker's audio.
    if resume:
        net_g.enable_hook()
    else:
        net_g.initialize_vq([wav_iterator(files) for files in speaker_audio_files])

    # Dataset
    dataset = WavDataset(
        training_filelist,
        in_sample_rate=16000,
        out_sample_rate=24000,
        wav_length=96000,
        segment_length=100,
        noise_files=noise_files,
        ir_files=ir_files,
    )

    _num_workers = min(4, os.cpu_count() or 1)
    # Cap the batch at the dataset size so tiny datasets still form a batch;
    # drop_last is only enabled when more than one batch worth of files exists.
    effective_batch = min(batch_size, len(training_filelist))
    dataloader = torch.utils.data.DataLoader(
        dataset,
        num_workers=_num_workers,
        collate_fn=dataset.collate,
        shuffle=True,
        batch_size=effective_batch,
        pin_memory=True,
        drop_last=len(training_filelist) > effective_batch,
        persistent_workers=_num_workers > 0,
    )

    # Calculate steps
    # NOTE(review): uses the requested batch_size rather than effective_batch;
    # max(1, ...) makes both give the same result.
    steps_per_epoch = max(1, len(training_filelist) // batch_size)
    total_steps = epochs * steps_per_epoch
    warmup_steps = min(total_steps // 4, 5000)

    # LR scheduler with warmup
    def lr_lambda(step):
        # Linear ramp from 0 to 1 over warmup_steps, then decay by 0.999 per step.
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        return 0.999 ** (step - warmup_steps)

    scheduler_g = torch.optim.lr_scheduler.LambdaLR(optim_g, lr_lambda)
    scheduler_d = torch.optim.lr_scheduler.LambdaLR(optim_d, lr_lambda)

    # Advance schedulers if resuming
    # Both schedulers are stepped initial_iteration + 1 times (so once even on
    # a fresh run); the filter hides PyTorch's warning about calling
    # lr_scheduler.step() before optimizer.step().
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message=r"Detected call of `lr_scheduler\.step\(\)")
        for _ in range(initial_iteration + 1):
            scheduler_g.step()
            scheduler_d.step()

    net_g.train()
    net_d.train()

    yield f"Training {total_steps} steps ({epochs} epochs x {steps_per_epoch} steps/epoch)", None

    # Training loop
    step = initial_iteration
    data_iter = None
    ckpt_path = None

    for epoch in range(epochs):
        epoch_loss_g = 0.0
        epoch_loss_d = 0.0
        epoch_steps = 0

        for batch_idx in range(steps_per_epoch):
            # One DataLoader iterator is reused across epochs and restarted
            # whenever it runs out; an empty restart ends the epoch.
            if data_iter is None:
                data_iter = iter(dataloader)
            batch = next(data_iter, None)
            if batch is None:
                data_iter = iter(dataloader)
                batch = next(data_iter, None)
            if batch is None:
                break

            clean_wavs, noisy_wavs_16k, slice_starts, speaker_ids, formant_shifts = \
                [x.to(device, non_blocking=True) for x in batch]

            with torch.amp.autocast(device.type, enabled=device.type == "cuda"):
                # Generator forward
                y, y_hat, y_hat_for_backward, loss_loudness, loss_mel, loss_ap, gen_stats = \
                    net_g.forward_and_compute_loss(
                        noisy_wavs_16k[:, None, :],
                        speaker_ids,
                        formant_shifts,
                        slice_start_indices=slice_starts,
                        slice_segment_length=100,
                        y_all=clean_wavs[:, None, :],
                    )

                # Discriminator forward
                loss_disc, loss_adv, loss_fm, disc_stats = \
                    net_d.forward_and_compute_loss(y, y_hat)

            # Discriminator backward
            # retain_graph=True keeps the graph for the generator backward
            # below; inputs= limits this pass to discriminator parameters.
            optim_d.zero_grad(set_to_none=True)
            grad_scaler.scale(loss_disc).backward(retain_graph=True, inputs=list(net_d.parameters()))
            grad_scaler.unscale_(optim_d)

            # Generator backward
            # The four weighted losses are back-propagated through
            # y_hat_for_backward by the balancer. After step 10 the
            # skip_update_ema flag is False only on every 5th step.
            optim_g.zero_grad(set_to_none=True)
            grad_balancer.backward(
                {"loss_loudness": loss_loudness, "loss_mel": loss_mel,
                 "loss_adv": loss_adv, "loss_fm": loss_fm},
                y_hat_for_backward, grad_scaler,
                skip_update_ema=step > 10 and step % 5 != 0,
            )
            grad_scaler.unscale_(optim_g)

            # Update
            grad_scaler.step(optim_g)
            grad_scaler.step(optim_d)
            grad_scaler.update()
            optim_g.zero_grad(set_to_none=True)
            optim_d.zero_grad(set_to_none=True)

            scheduler_g.step()
            scheduler_d.step()

            # Only the mel loss is tracked as the reported "G loss".
            epoch_loss_g += loss_mel.item()
            epoch_loss_d += loss_disc.item()
            epoch_steps += 1
            step += 1

            # NOTE(review): when resuming, step starts at initial_iteration
            # while total_steps only counts this run, so the fraction passed
            # here can exceed 1.0 - confirm the callback tolerates that.
            if progress_callback:
                progress_callback(step / total_steps)

        avg_loss_g = epoch_loss_g / max(1, epoch_steps)
        avg_loss_d = epoch_loss_d / max(1, epoch_steps)

        yield f"Epoch {epoch+1}/{epochs} | G loss: {avg_loss_g:.4f} | D loss: {avg_loss_d:.4f} | LR: {scheduler_g.get_last_lr()[0]:.2e}", None

        # Save checkpoint periodically
        # Every max(1, epochs // 5) epochs, and always after the final epoch.
        if (epoch + 1) % max(1, epochs // 5) == 0 or epoch == epochs - 1:
            ckpt_path = os.path.join(output_dir, f"checkpoint_{step:08d}.pt.gz")
            with gzip.open(ckpt_path, "wb") as f:
                # Discriminator weights are stored in half precision; "h"
                # records the hyper-parameters used to build net_g / net_d.
                torch.save({
                    "iteration": step,
                    "net_g": net_g.state_dict(),
                    "phone_extractor": phone_extractor.state_dict(),
                    "pitch_estimator": pitch_estimator.state_dict(),
                    "net_d": {k: v.half() for k, v in net_d.state_dict().items()},
                    "optim_g": get_compressed_optimizer_state_dict(optim_g),
                    "optim_d": get_compressed_optimizer_state_dict(optim_d),
                    "grad_balancer": grad_balancer.state_dict(),
                    "grad_scaler": grad_scaler.state_dict(),
                    "h": {
                        "hidden_channels": hidden_channels,
                        "pitch_bins": pitch_bins,
                        "vq_topk": 4,
                        "training_time_vq": "none",
                        "phone_noise_ratio": 0.5,
                        "floor_noise_level": 1e-3,
                        "san": True,
                    },
                    "speakers": speakers,
                }, f)
            # The "latest" copy is what resume=True looks for.
            shutil.copy(ckpt_path, os.path.join(output_dir, "checkpoint_latest.pt.gz"))
            yield f"Saved checkpoint: {ckpt_path}", ckpt_path

    # Cleanup
    purge_memory(net_g, net_d, optim_g, optim_d, phone_extractor, pitch_estimator)

    yield "Training complete!", ckpt_path


def convert_voice_beatrice(
    source_audio,
    model_file,
    target_speaker: int = 0,
    pitch_shift: int = 0,
    formant_shift: float = 0.0,
    progress=None,
):
    """Convert voice using Beatrice v2 model

    Inputs are validated before any model asset is downloaded or loaded, so
    an unusable model or a missing source fails fast.

    Args:
        source_audio: Path to source audio file, or file object with .name
        model_file: Path to Beatrice checkpoint (.pt.gz) or file object with .name
        target_speaker: Target speaker index; values outside
            ``[0, n_speakers)`` fall back to speaker 0
        pitch_shift: Pitch shift in semitones
        formant_shift: Formant shift in semitones (-2 to 2)
        progress: Gradio progress callback

    Returns:
        (output_path, status_message) tuple; output_path is None on failure
        and status_message then describes the error
    """
    # Get model path. Plain paths are handled first: a pathlib.Path also has a
    # ``.name`` attribute, but that is only the base name, not a usable path.
    if isinstance(model_file, (str, os.PathLike)):
        model_path = os.fspath(model_file)
    elif hasattr(model_file, 'name'):
        model_path = model_file.name
    else:
        return None, "Invalid model file"

    if not model_path or not os.path.exists(model_path):
        return None, f"Model file not found: {model_path}"

    # Resolve the source the same way: paths are used as-is, uploaded file
    # objects are unwrapped through ``.name``.
    if source_audio is None:
        return None, "Please upload source audio"
    if isinstance(source_audio, (str, os.PathLike)):
        audio_path = source_audio
    else:
        audio_path = getattr(source_audio, "name", source_audio)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    try:
        if progress:
            progress(0.1, "Loading models...")

        # Download pretrained assets (phone extractor + pitch estimator)
        phone_extractor_path = download_beatrice_asset("phone_extractor")
        pitch_estimator_path = download_beatrice_asset("pitch_estimator")

        # Build phone extractor
        phone_extractor = PhoneExtractor().to(device).eval().requires_grad_(False)
        pe_ckpt = torch.load(phone_extractor_path, map_location="cpu", weights_only=True)
        phone_extractor.load_state_dict(pe_ckpt["phone_extractor"], strict=False)
        del pe_ckpt

        # Build pitch estimator
        pitch_estimator = PitchEstimator().to(device).eval().requires_grad_(False)
        pi_ckpt = torch.load(pitch_estimator_path, map_location="cpu", weights_only=True)
        pitch_estimator.load_state_dict(pi_ckpt["pitch_estimator"])
        del pi_ckpt

        if progress:
            progress(0.3, "Loading trained model...")

        # Load trained checkpoint
        with gzip.open(model_path, "rb") as f:
            checkpoint = torch.load(f, map_location="cpu", weights_only=True)

        # Determine model params from checkpoint
        n_speakers = checkpoint["net_g"]["embed_speaker.weight"].shape[0]
        h = checkpoint.get("h", {})
        hidden_channels = h.get("hidden_channels", 256)
        pitch_bins = h.get("pitch_bins", 448)
        speakers = checkpoint.get("speakers", [f"Speaker {i}" for i in range(n_speakers)])

        # Coerce to a plain int (the UI may hand over a float or None) and
        # reject negative as well as too-large indices.
        target_speaker = int(target_speaker or 0)
        if not 0 <= target_speaker < n_speakers:
            target_speaker = 0

        net_g = ConverterNetwork(
            phone_extractor, pitch_estimator,
            n_speakers=n_speakers,
            pitch_bins=pitch_bins,
            hidden_channels=hidden_channels,
            vq_topk=h.get("vq_topk", 4),
            training_time_vq=h.get("training_time_vq", "none"),
            phone_noise_ratio=h.get("phone_noise_ratio", 0.5),
            floor_noise_level=h.get("floor_noise_level", 1e-3),
        ).to(device).eval()

        net_g.load_state_dict(checkpoint["net_g"], strict=False)
        net_g.enable_hook()
        del checkpoint

        if progress:
            progress(0.5, "Converting voice...")

        # Load audio at 16kHz
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)
        audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).unsqueeze(0).to(device)

        # Pad to multiple of 160 (phone extractor stride)
        original_length = audio_tensor.shape[-1]
        if original_length % 160 != 0:
            pad_len = 160 - original_length % 160
            audio_tensor = F.pad(audio_tensor, (0, pad_len))

        # Convert
        with torch.inference_mode():
            y_hat = net_g(
                audio_tensor,
                torch.tensor([target_speaker], device=device),
                torch.tensor([formant_shift], device=device),
                torch.tensor([float(pitch_shift)], device=device),
            )

        # Output is 24kHz, trim to match input duration
        output_length = original_length // 160 * 240  # 16kHz→24kHz frame ratio
        output = y_hat.squeeze().cpu().numpy()[:output_length]

        # Save
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            sf.write(output_path, output, 24000)
        except Exception:
            # Don't leave an empty temp file behind when writing fails
            try:
                os.unlink(output_path)
            except OSError:
                pass
            raise

        # Cleanup
        purge_memory(net_g, phone_extractor, pitch_estimator)

        speaker_name = speakers[target_speaker] if target_speaker < len(speakers) else f"Speaker {target_speaker}"
        return output_path, f"Converted using Beatrice v2 | 24kHz | Speaker: {speaker_name} | Pitch: {pitch_shift:+d} | Formant: {formant_shift:+.1f}"

    except Exception as e:
        logger.exception("Beatrice inference error")
        return None, f"Error: {str(e)}"


# ============================================================
# GRADIO UI - Gradio 6 Compatible
# ============================================================

def train_ui(
    audio_file,
    model_name: str,
    epochs: int,
    batch_size: int,
    sample_rate: int,
    f0_method: str = "rmvpe",
    progress=gr.Progress()
):
    """Gradio handler that preprocesses audio and trains an RVC model.

    Implemented as a generator so the log textbox updates live. Every item
    is ``(model_path, index_path, log_text)``; the two paths stay ``None``
    until training has finished successfully.
    """
    # Input validation: both the audio and a non-blank name are required
    if audio_file is None:
        yield None, None, "❌ Please upload training audio"
        return

    if not model_name or not model_name.strip():
        yield None, None, "❌ Please enter a model name"
        return

    device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"

    # Accumulated log lines; rendered as one newline-joined string per update
    log_lines = []

    def render():
        return "\n".join(log_lines)

    try:
        model_name = sanitize_model_name(model_name)
        output_dir = f"trained_models/{model_name}"
        data_dir = f"{output_dir}/data"

        # --- Preprocessing ---
        log_lines.extend([
            f"🚀 Starting on {device_info}",
            f"📂 Output: {output_dir}",
        ])
        yield None, None, render()

        progress(0.1, "Preprocessing...")
        log_lines.append("🔄 Preprocessing audio...")
        yield None, None, render()

        # Gradio may pass either a path string or an uploaded-file object
        if isinstance(audio_file, str):
            audio_path = audio_file
        else:
            audio_path = audio_file.name
        preprocessed = preprocess_audio_for_training(
            audio_path, data_dir, target_sr=sample_rate, f0_method=f0_method
        )

        if preprocessed is None:
            log_lines.append("❌ Preprocessing failed - no valid audio chunks")
            yield None, None, render()
            return

        log_lines.extend([
            "✅ Preprocessing complete",
            f"🏋️ Training {epochs} epochs...",
            "─" * 40,
        ])
        yield None, None, render()

        # --- Training: relay each message, remembering the latest artifacts ---
        final_model = None
        final_index = None
        updates = train_rvc_generator(
            data_dir=data_dir,
            output_dir=output_dir,
            epochs=epochs,
            batch_size=batch_size,
            lr=1e-5,
            target_sr=sample_rate,
            progress_callback=progress
        )
        for message, model_path, index_path in updates:
            log_lines.append(message)
            yield None, None, render()
            final_model = model_path or final_model
            final_index = index_path or final_index

        if not final_model:
            log_lines.append("❌ Training failed")
            yield None, None, render()
        else:
            log_lines.append("─" * 40)
            log_lines.append("✅ Training complete!")
            log_lines.append(f"📦 Model: {final_model}")
            if final_index:
                log_lines.append(f"📦 Index: {final_index}")
            progress(1.0, "Done!")
            yield final_model, final_index, render()

    except Exception as e:
        logger.exception("Training error")
        log_lines.append(f"❌ Error: {str(e)}")
        yield None, None, render()

# ============================================================
# SPLIT-AND-STITCH BACKGROUND PROCESSOR
#
# Architecture rationale:
#   Heavy RVC inference on a 2-core CPU can push 10–14 GB RAM for
#   long audio. The solution is to:
#     1. Chop the input into CHUNK_SEC-second slices with an
#        OVERLAP_SEC-second overlap on each side. The overlap gives
#        the vocoder context at boundaries so there are no clicks.
#     2. Process each chunk independently through the full RVC
#        pipeline (RMVPE + FAISS + vocoder). After each chunk,
#        call purge_memory() so RAM never accumulates.
#     3. Cross-fade adjacent chunks over the overlap region using a
#        linear fade-out/fade-in (equal-power is not needed here
#        because RVC output is already bandlimited). The crossfade
#        is the only "small render" step — it's just numpy slicing
#        and a linear ramp, not another model pass.
#     4. Concatenate the stitched segments and write the final WAV.
#
# Non-negotiable settings preserved:
#   - f0_method is always forwarded as-is (RMVPE by default).
#   - index_file is always forwarded (FAISS stays active).
#   - Model sample rate (40k/48k) is respected — chunks are saved
#     at 16k for processing and the stitched output is at tgt_sr.
# ============================================================

# Defaults for convert_voice_chunked(); all three values are durations in seconds.
CHUNK_SEC   = 30    # seconds per chunk fed to RVC (keeps RAM under ~4 GB/chunk)
OVERLAP_SEC = 0.4   # seconds shared by consecutive chunks; converted to samples at 16 kHz for slicing and at the output rate for the crossfade
MIN_CHUNK_SEC = 5   # audio of at most this many seconds is converted in one pass, without chunking


def _crossfade_join(seg_a: np.ndarray, seg_b: np.ndarray,
                    overlap_samples: int) -> np.ndarray:
    """
    Stitch two segments together with a linear crossfade.

    The last ``overlap_samples`` of seg_a and the first ``overlap_samples``
    of seg_b are treated as the same stretch of audio: seg_a is faded out
    while seg_b is faded in, and the blend replaces both copies.

    If the overlap is not positive, or either segment is shorter than the
    overlap, the segments are simply concatenated.
    """
    n = overlap_samples
    can_blend = n > 0 and len(seg_a) >= n and len(seg_b) >= n
    if not can_blend:
        return np.concatenate([seg_a, seg_b])

    # Complementary linear ramps for the seam
    ramp_down = np.linspace(1.0, 0.0, n, dtype=np.float32)
    ramp_up = np.linspace(0.0, 1.0, n, dtype=np.float32)

    a_body, a_tail = seg_a[:-n], seg_a[-n:]
    b_head, b_rest = seg_b[:n], seg_b[n:]

    seam = a_tail * ramp_down + b_head * ramp_up
    return np.concatenate([a_body, seam, b_rest])


def convert_voice_chunked(
    source_audio,
    model_file,
    index_file=None,
    pitch_shift: int = 0,
    f0_method: str = "rmvpe",   # RMVPE is non-negotiable — forwarded as-is
    index_rate: float = 0.75,   # FAISS active — forwarded as-is
    protect: float = 0.33,
    volume_envelope: float = 1.0,
    progress=None,
    chunk_sec: int = CHUNK_SEC,
    overlap_sec: float = OVERLAP_SEC,
) -> Tuple[str, str]:
    """
    Split-and-stitch wrapper around convert_voice().

    For audio of at most MIN_CHUNK_SEC seconds, falls straight through to
    convert_voice() with no splitting overhead.

    For longer audio:
      • Loads the full waveform once at 16 kHz (source SR for RVC).
      • Slices into overlapping chunks at the 16k level. A trailing chunk
        that is no longer than the overlap is dropped, because the previous
        chunk already covers it completely.
      • Each chunk is written to a temp WAV, run through convert_voice(),
        and the result is loaded back as a numpy array.
      • purge_memory() is called after every chunk.
      • Chunks are crossfaded and concatenated. A chunk that fails is logged
        and skipped; its neighbours are joined directly.
      • The final stitched audio is written to a single output WAV.

    Args:
        source_audio: Path of the audio to convert.
        model_file, index_file, pitch_shift, f0_method, index_rate, protect,
            volume_envelope: Forwarded unchanged to convert_voice().
        progress: Optional progress callback; a fresh gr.Progress() is
            handed to convert_voice() when this is None.
        chunk_sec: Chunk length in seconds.
        overlap_sec: Overlap between consecutive chunks in seconds; must be
            non-negative and smaller than chunk_sec.

    Returns:
        ``(output_path, status_message)``; output_path is None when the
        inputs are invalid or no chunk could be converted.
    """
    if source_audio is None:
        return None, "Please upload source audio"
    if model_file is None:
        return None, "Please upload RVC model (.pth)"

    sr_in = 16000

    # ------------------------------------------------------------------
    # 1. Load source audio once to check duration
    # ------------------------------------------------------------------
    try:
        audio_full, _ = librosa.load(source_audio, sr=sr_in, mono=True)
    except Exception as e:
        return None, f"Failed to load audio: {e}"

    total_samp = len(audio_full)
    duration_sec = total_samp / float(sr_in)

    # convert_voice() always receives a progress object
    inner_progress = progress if progress is not None else gr.Progress()

    # Short file — no splitting needed, avoids overhead
    if duration_sec <= MIN_CHUNK_SEC:
        purge_memory(audio_full)
        return convert_voice(
            source_audio, model_file, index_file,
            pitch_shift, f0_method, index_rate, protect, volume_envelope,
            inner_progress
        )

    # ------------------------------------------------------------------
    # 2. Determine chunk boundaries (in 16k samples)
    # ------------------------------------------------------------------
    chunk_samp = int(chunk_sec * sr_in)
    overlap_samp = int(overlap_sec * sr_in)
    hop_samp = chunk_samp - overlap_samp  # step between chunk starts

    # A non-positive hop would make range() raise (step 0) or yield nothing.
    if chunk_samp <= 0 or overlap_samp < 0 or hop_samp <= 0:
        purge_memory(audio_full)
        return None, "Invalid chunk settings: need chunk_sec > overlap_sec >= 0"

    # Skip a trailing start whose remaining audio fits inside the overlap:
    # the previous chunk already reaches the end of the file, and the tiny
    # extra chunk would be too short to crossfade, so it would be appended
    # as a duplicate of the tail.
    starts = [
        s for s in range(0, total_samp, hop_samp)
        if s == 0 or total_samp - s > overlap_samp
    ]
    n_chunks = len(starts)

    logger.info(f"[Chunked] {duration_sec:.1f}s → {n_chunks} chunks "
                f"({chunk_sec}s each, {overlap_sec}s overlap)")

    # ------------------------------------------------------------------
    # 3. Process each chunk through convert_voice()
    # ------------------------------------------------------------------
    stitched_segments: list[np.ndarray] = []
    tgt_sr = None          # learned from first chunk output
    overlap_out_samp = 0   # overlap at target SR (learned after first chunk)

    for i, start in enumerate(starts):
        # Slicing clamps at the end of the array, so the last chunk may be short
        chunk = audio_full[start:start + chunk_samp]

        if progress is not None:
            try:
                progress((i + 0.5) / n_chunks,
                         f"Processing chunk {i+1}/{n_chunks}…")
            except Exception:
                # Progress reporting is best-effort; never fail a conversion over it
                pass

        # Write chunk to temp WAV; the finally-clause removes it on every path
        fd, chunk_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        out_path = None
        try:
            sf.write(chunk_path, chunk, sr_in)
            # Run full RVC pipeline on this chunk (RMVPE + FAISS inside)
            out_path, status = convert_voice(
                chunk_path, model_file, index_file,
                pitch_shift, f0_method, index_rate, protect, volume_envelope,
                inner_progress
            )
        except Exception as e:
            # Skip this chunk rather than abort the whole conversion
            logger.warning(f"[Chunked] Chunk {i+1} failed: {e}")
            purge_memory(chunk)
            continue
        finally:
            try:
                os.unlink(chunk_path)
            except OSError:
                pass

        if out_path is None:
            logger.warning(f"[Chunked] Chunk {i+1} returned no output: {status}")
            purge_memory(chunk)
            continue

        # Load the converted chunk; its temp file is removed even if reading fails
        try:
            chunk_out, chunk_sr = sf.read(out_path, dtype="float32")
        except Exception as e:
            logger.warning(f"[Chunked] Chunk {i+1} output could not be read: {e}")
            purge_memory(chunk)
            continue
        finally:
            try:
                os.unlink(out_path)
            except OSError:
                pass

        # Learn target SR from first successful chunk
        if tgt_sr is None:
            tgt_sr = chunk_sr
            # Scale overlap to target SR
            overlap_out_samp = int(overlap_sec * tgt_sr)

        stitched_segments.append(chunk_out)

        # Critical: free everything before next chunk loads the model
        purge_memory(chunk, chunk_out)

    # ------------------------------------------------------------------
    # 4. Stitch segments with crossfade
    # ------------------------------------------------------------------
    if not stitched_segments:
        return None, "All chunks failed — no output produced"

    result = stitched_segments[0]
    for seg in stitched_segments[1:]:
        result = _crossfade_join(result, seg, overlap_out_samp)

    # Final normalization (matches convert_voice behaviour); an empty result
    # has no peak to measure
    if result.size:
        audio_max = np.abs(result).max() / 0.99
        if audio_max > 1.0:
            result /= audio_max

    # ------------------------------------------------------------------
    # 5. Write stitched output
    # ------------------------------------------------------------------
    final_sr = tgt_sr if tgt_sr is not None else 40000
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    sf.write(output_path, result, final_sr)

    purge_memory(result, audio_full)
    logger.info(f"[Chunked] Stitched output: {len(stitched_segments)} chunks → {output_path}")
    return output_path, (
        f"Chunked conversion: {n_chunks} chunks stitched | "
        f"sr={final_sr} | pitch={pitch_shift:+d} | f0={f0_method}"
    )


with gr.Blocks() as demo:
    gr.Markdown(f"# 🎤 Voice Conversion (RVC + Beatrice)\nInference: CPU • Training: {'GPU (CUDA)' if torch.cuda.is_available() else 'CPU'}")

    with gr.Tabs():
        # ==================== TAB 1: VOICE CONVERSION ====================
        with gr.Tab("🎵 Voice Conversion"):
            with gr.Row():
                with gr.Column():
                    source_audio = gr.Audio(label="Source Audio", type="filepath")

                    gr.Markdown("### Model")
                    model_type = gr.Radio(
                        ["RVC v2", "Beatrice v2"], value="RVC v2",
                        label="Model Type",
                        info="RVC: .pth files | Beatrice: .pt.gz files"
                    )

                    # RVC model inputs
                    with gr.Group(visible=True) as rvc_model_group:
                        _available_models = list_rvc_models()
                        rvc_model_dropdown = gr.Dropdown(
                            choices=_available_models,
                            label="Select Voice Model",
                            info="Models auto-loaded from weights/ folder",
                            value="model_kunni.pth" if "model_kunni.pth" in _available_models else (_available_models[0] if _available_models else None),
                        )
                        with gr.Row():
                            model_file = gr.File(label="OR Upload New (.pth)", file_types=[".pth"])
                            load_example_btn = gr.Button("Load Example (Benee)", size="sm")
                        index_file = gr.File(label="Index File (.index) - Optional", file_types=[".index"])

                    # Beatrice model inputs
                    with gr.Group(visible=False) as beatrice_model_group:
                        beatrice_model_file = gr.File(label="Beatrice Model (.pt.gz)", file_types=[".gz"])
                        with gr.Row():
                            beatrice_target_speaker = gr.Number(value=0, label="Target Speaker", precision=0)
                            beatrice_formant_shift = gr.Slider(-2, 2, value=0.0, step=0.5, label="Formant Shift")

                    with gr.Row():
                        pitch_shift = gr.Slider(-12, 12, value=0, step=1, label="Pitch (semitones)")
                        f0_method = gr.Radio(["rmvpe", "pm", "harvest"], value="rmvpe", label="F0 Method", visible=True)

                    with gr.Row(visible=True) as rvc_extra_options:
                        index_rate = gr.Slider(0, 1, value=0.75, step=0.05, label="Index Rate")
                        protect = gr.Slider(0, 0.5, value=0.33, step=0.01, label="Protect (voiceless consonants)")
                    convert_btn = gr.Button("Convert", variant="primary")

                with gr.Column():
                    output_audio = gr.Audio(label="Converted Audio", type="filepath")
                    output_info = gr.Textbox(label="Status", lines=2)

            def update_model_type(model_type_val):
                """Toggle the model-specific widgets when the model type changes.

                Returns a 4-tuple of ``gr.update`` objects in the order the
                ``change`` event lists its outputs: RVC model group, Beatrice
                model group, F0 method selector, RVC extra-options row.
                """
                show_rvc = model_type_val == "RVC v2"
                # Only the Beatrice group is shown when RVC is *not* selected;
                # the other three targets follow the RVC selection.
                visibility = (show_rvc, not show_rvc, show_rvc, show_rvc)
                return tuple(gr.update(visible=flag) for flag in visibility)

            model_type.change(
                update_model_type,
                [model_type],
                [rvc_model_group, beatrice_model_group, f0_method, rvc_extra_options]
            )

            load_example_btn.click(
                load_example_model,
                [],
                [model_file, index_file, output_info]
            )

            def convert_unified(source, m_type, rvc_dropdown_model, rvc_model, rvc_index, beat_model,
                                beat_speaker, beat_formant, pitch, f0, idx_rate, prot,
                                progress=gr.Progress()):
                """Dispatch a Convert request to the RVC or the Beatrice backend.

                Args:
                    source: Value of the source-audio component.
                    m_type: Selected model type; "RVC v2" selects the RVC path,
                        any other value selects Beatrice.
                    rvc_dropdown_model: File name picked in the voice-model
                        dropdown, resolved relative to ``weights/``.
                    rvc_model: Uploaded RVC model; takes precedence over the
                        dropdown selection.
                    rvc_index: Uploaded index file, or None.
                    beat_model: Uploaded Beatrice model.
                    beat_speaker: Target speaker number (Beatrice only).
                    beat_formant: Formant shift (Beatrice only).
                    pitch: Pitch shift in semitones.
                    f0: F0 extraction method (RVC only).
                    idx_rate: Index rate (RVC only).
                    prot: Protect value (RVC only).
                    progress: Gradio progress tracker forwarded to the backend.

                Returns:
                    Whatever the selected backend returns; the click event maps
                    it onto the converted-audio and status outputs.
                """
                if m_type != "RVC v2":
                    # gr.Number hands over None when its field is cleared (and API
                    # callers may send null); fall back to the widget defaults
                    # instead of letting int(None) / float(None) raise TypeError.
                    return convert_voice_beatrice(
                        source, beat_model,
                        target_speaker=int(beat_speaker) if beat_speaker is not None else 0,
                        pitch_shift=int(pitch) if pitch is not None else 0,
                        formant_shift=float(beat_formant) if beat_formant is not None else 0.0,
                        progress=progress
                    )

                # Resolve model: prefer uploaded file, fall back to dropdown selection
                resolved_model = rvc_model
                resolved_index = rvc_index
                if resolved_model is None and rvc_dropdown_model:
                    # Wrap paths in a simple object so convert_voice can call .name on it
                    class _FileObj:
                        def __init__(self, p):
                            self.name = str(p)

                    pth_path = Path("weights") / rvc_dropdown_model
                    resolved_model = _FileObj(pth_path)
                    # Auto-hunt for a matching .index next to the model in weights/
                    if resolved_index is None:
                        index_path = Path("weights") / f"{pth_path.stem}.index"
                        if index_path.exists():
                            resolved_index = _FileObj(index_path)

                # Use the split-and-stitch processor for RVC.
                # It falls through to plain convert_voice() for short clips
                # and auto-chunks long audio to prevent OOM on HF Spaces.
                return convert_voice_chunked(
                    source, resolved_model, resolved_index,
                    pitch, f0, idx_rate, prot, progress=progress
                )

            convert_btn.click(
                convert_unified,
                [source_audio, model_type, rvc_model_dropdown, model_file, index_file, beatrice_model_file,
                 beatrice_target_speaker, beatrice_formant_shift,
                 pitch_shift, f0_method, index_rate, protect],
                [output_audio, output_info],
                api_name="convert",
                concurrency_limit=1,
            )

            gr.Markdown("**Models:** [HuggingFace](https://huggingface.co/models?search=rvc) | [Weights.gg](https://weights.gg)")

        # ==================== TAB 2: TRAINING ====================
        with gr.Tab("🏋️ Training"):
            # GPU Warning
            gpu_status = "🟢 GPU Available (CUDA)" if torch.cuda.is_available() else "🟡 CPU Only (Training will be slow)"
            gr.Markdown(f"""
            ### Training Status: {gpu_status}

            {'**GPU detected!** Training will use CUDA acceleration.' if torch.cuda.is_available() else '**No GPU detected.** Training will run on CPU (~30 sec/epoch). For faster training, run locally with CUDA GPU.'}
            """)

            # Trainer selector - Beatrice always available (downloads assets from HF)
            trainer_selector = gr.Dropdown(
                choices=["RVC v2", "Beatrice v2"],
                value="RVC v2",
                label="Trainer",
                info="RVC v2: general purpose | Beatrice v2: low latency (~50ms)"
            )

            with gr.Row():
                with gr.Column():
                    train_audio = gr.Audio(label="Training Audio", type="filepath")
                    train_model_name = gr.Textbox(label="Model Name", placeholder="my_voice", value="my_voice")

                    # RVC-specific options
                    with gr.Group(visible=True) as rvc_options:
                        with gr.Row():
                            train_epochs = gr.Slider(1, 500, value=50, step=1, label="Epochs")
                            train_batch = gr.Slider(1, 8, value=2, step=1, label="Batch Size")
                        with gr.Row():
                            train_sr = gr.Radio([32000, 40000, 48000], value=40000, label="Sample Rate")
                            train_f0 = gr.Radio(["rmvpe", "pm", "harvest"], value="rmvpe", label="F0 Method")

                    # Beatrice-specific options
                    with gr.Group(visible=False) as beatrice_options:
                        with gr.Row():
                            beatrice_epochs = gr.Slider(1, 100, value=30, step=1, label="Epochs (30 recommended)")
                            beatrice_batch = gr.Slider(1, 64, value=8, step=1, label="Batch Size")
                        beatrice_resume = gr.Checkbox(label="Resume from checkpoint", value=False)

                    train_btn = gr.Button("Start Training", variant="primary")

                with gr.Column():
                    train_output_model = gr.File(label="Trained Model (.pth)")
                    train_output_index = gr.File(label="Index File (.index)")
                    train_status = gr.Textbox(label="Training Status", lines=6)

            # Toggle visibility based on trainer selection
            def update_trainer_options(trainer):
                """Show the option group of the selected trainer and hide the other.

                Returns two ``gr.update`` objects: the first for the RVC options
                group, the second for the Beatrice options group.
                """
                rvc_selected = trainer == "RVC v2"
                rvc_group_update = gr.update(visible=rvc_selected)
                beatrice_group_update = gr.update(visible=not rvc_selected)
                return rvc_group_update, beatrice_group_update

            trainer_selector.change(
                update_trainer_options,
                [trainer_selector],
                [rvc_options, beatrice_options]
            )

            # Unified training function
            def train_unified(trainer, audio, name, rvc_epochs, rvc_batch, rvc_sr, rvc_f0,
                            beat_epochs, beat_batch, beat_resume, progress=gr.Progress()):
                """Run the selected trainer, streaming (model, index, status) tuples.

                For "RVC v2" every tuple produced by ``train_ui`` is forwarded
                unchanged. For any other trainer value the Beatrice pipeline runs:
                validate inputs, preprocess the audio into chunks, then relay the
                messages of ``train_beatrice_generator`` as a growing log. The index
                slot is always None on the Beatrice path.
                """
                if trainer == "RVC v2":
                    # train_ui is itself a generator of 3-tuples; relay it as-is.
                    yield from train_ui(audio, name, rvc_epochs, rvc_batch, rvc_sr, rvc_f0, progress)
                    return

                # ---- Beatrice training (embedded - downloads assets from HF) ----
                if audio is None:
                    yield None, None, "❌ Please upload training audio"
                    return
                if not name or not name.strip():
                    yield None, None, "❌ Please enter a model name"
                    return

                try:
                    name = sanitize_model_name(name)
                    output_dir = f"trained_models/beatrice_{name}"
                    data_dir = f"{output_dir}/training_data"
                    log_lines = []
                    separator = "─" * 40

                    device_label = 'GPU (CUDA)' if torch.cuda.is_available() else 'CPU'
                    log_lines.append(f"🚀 Starting Beatrice training on {device_label}")
                    yield None, None, "\n".join(log_lines)

                    # Step 1: cut the uploaded audio into speaker chunks.
                    progress(0.05, "Preprocessing audio...")
                    audio_path = audio if isinstance(audio, str) else audio.name
                    chunk_count, _ = preprocess_audio_for_beatrice(audio_path, data_dir, name)
                    if chunk_count == 0:
                        yield None, None, "❌ Preprocessing failed - no valid audio chunks"
                        return

                    log_lines.append(f"✅ Preprocessed {chunk_count} audio chunks")
                    log_lines.append(separator)
                    yield None, None, "\n".join(log_lines)

                    # Step 2: train, remembering the most recent checkpoint path reported.
                    latest_ckpt = None
                    training_events = train_beatrice_generator(
                        data_dir=data_dir,
                        output_dir=output_dir,
                        epochs=beat_epochs,
                        batch_size=beat_batch,
                        resume=beat_resume,
                        progress_callback=progress
                    )
                    for message, ckpt_path in training_events:
                        log_lines.append(message)
                        yield None, None, "\n".join(log_lines)
                        if ckpt_path:
                            latest_ckpt = ckpt_path

                    if not latest_ckpt:
                        log_lines.append("❌ Training failed")
                        yield None, None, "\n".join(log_lines)
                        return

                    log_lines.append(separator)
                    log_lines.append("✅ Training complete!")
                    log_lines.append(f"📦 Model: {latest_ckpt}")
                    progress(1.0, "Done!")
                    yield latest_ckpt, None, "\n".join(log_lines)

                except Exception as exc:
                    # UI boundary: log the traceback and surface the message as status.
                    logger.exception("Beatrice training error")
                    yield None, None, f"❌ Error: {str(exc)}"

            train_btn.click(
                train_unified,
                [trainer_selector, train_audio, train_model_name,
                 train_epochs, train_batch, train_sr, train_f0,
                 beatrice_epochs, beatrice_batch, beatrice_resume],
                [train_output_model, train_output_index, train_status],
                api_name="train",
                concurrency_limit=1,
            )

            gr.Markdown("""
            ---
            ### Training Tips

            **RVC v2:**
            - 50-100 epochs for quick test, 200-500 for quality
            - CPU training: ~30 sec/epoch (100 epochs ≈ 50 min)

            **Beatrice v2:**
            - 20-50 epochs recommended, GPU recommended (CPU works but slow)
            - Pretrained assets downloaded automatically from HuggingFace
            - Output: .pt.gz checkpoint (use in Voice Conversion tab)
            - Lower latency (~50ms vs ~100ms)

            ### CLI
            ```bash
            python app.py train -a voice.mp3 -o ./model --epochs 100
            python app.py train-beatrice -a voice.mp3 -o ./beatrice_model --epochs 30
            python app.py infer -i input.wav -m beatrice_model.pt.gz -o output.wav
            ```
            """)


def cli_convert(args):
    """CLI mode conversion - supports both RVC and Beatrice models.

    Args:
        args: Parsed ``infer`` arguments. Reads ``input``, ``output``, ``model``,
            ``example``, ``pitch``, ``f0`` and ``index_rate`` directly, and
            ``type``, ``index``, ``speaker`` and ``formant_shift`` when present.

    Behaviour:
        * ``--example`` downloads the default RVC model and index and overrides
          any ``-m`` / ``--index`` / ``--type`` values.
        * Without an explicit type, a model path ending in ``.gz`` (any case,
          which covers ``.pt.gz``) is treated as Beatrice; anything else as RVC.
        * On success the converted file is copied to ``args.output``; the parent
          directory is created first so a long conversion is not lost to a
          missing output folder.

    Exits the process with status 1 when no model is given or conversion fails.
    """
    print(f"Converting: {args.input}")
    if args.model:
        # Avoid printing "Model: None" when only --example was passed.
        print(f"Model: {args.model}")

    # Auto-detect model type from extension or --type flag
    model_type = getattr(args, 'type', None)
    model_path = args.model
    index_path = getattr(args, 'index', None)

    if args.example:
        print("Downloading example model...")
        model_path = hf_hub_download(repo_id=DEFAULT_MODEL_REPO, filename=DEFAULT_MODEL_FILE)
        index_path = hf_hub_download(repo_id=DEFAULT_MODEL_REPO, filename=DEFAULT_INDEX_FILE)
        model_type = "rvc"
        print(f"Model: {model_path}")

    if not model_path:
        print("Error: No model specified. Use -m MODEL.pth or --example")
        sys.exit(1)

    # Auto-detect type from extension if not specified (case-insensitive)
    if model_type is None:
        is_gzip = str(model_path).lower().endswith('.gz')
        model_type = "beatrice" if is_gzip else "rvc"

    if model_type == "beatrice":
        # Beatrice inference
        print("Using Beatrice v2 inference")
        output_path, status = convert_voice_beatrice(
            source_audio=args.input,
            model_file=model_path,
            target_speaker=getattr(args, 'speaker', 0),
            pitch_shift=args.pitch,
            formant_shift=getattr(args, 'formant_shift', 0.0),
        )
    else:
        # RVC inference: convert_voice is handed objects exposing a ``.name``
        # path attribute, so wrap the plain paths.
        class FileObj:
            def __init__(self, path):
                self.name = path

        output_path, status = convert_voice(
            source_audio=args.input,
            model_file=FileObj(model_path),
            index_file=FileObj(index_path) if index_path else None,
            pitch_shift=args.pitch,
            f0_method=args.f0,
            index_rate=args.index_rate,
            progress=lambda *a, **k: None
        )

    if not output_path:
        print(f"Failed: {status}")
        sys.exit(1)

    # Make sure the destination folder exists before copying the result.
    output_dir = os.path.dirname(os.path.abspath(args.output))
    os.makedirs(output_dir, exist_ok=True)
    shutil.copy(output_path, args.output)
    print(f"Output: {args.output}")
    print(status)

def cli_train_beatrice(args):
    """CLI Beatrice training mode (embedded - downloads assets from HF).

    Preprocesses ``args.audio`` into ``<args.output>/training_data`` using the
    audio file's base name (without extension) as the speaker name, then runs
    ``train_beatrice_generator`` and prints every message it emits. Exits with
    status 1 when preprocessing yields zero chunks or no checkpoint path is
    reported by training.
    """
    print("=== Beatrice v2 Training (Embedded) ===")
    print(f"Input audio: {args.audio}")
    print(f"Output dir: {args.output}")

    # Beatrice expects the layout data_dir/speaker_name/*.wav
    audio_filename = os.path.basename(args.audio)
    speaker_name, _extension = os.path.splitext(audio_filename)
    data_dir = os.path.join(args.output, "training_data")

    print("\n[1/2] Preprocessing audio for Beatrice...")
    chunk_count, _ = preprocess_audio_for_beatrice(args.audio, data_dir, speaker_name)
    if chunk_count == 0:
        print("Preprocessing failed - no valid audio chunks")
        sys.exit(1)
    print(f"Created {chunk_count} audio chunks")

    # Train using the embedded generator, keeping the last checkpoint reported.
    print(f"\n[2/2] Training Beatrice model ({args.epochs} epochs)...")
    latest_ckpt = None
    training_events = train_beatrice_generator(
        data_dir=data_dir,
        output_dir=args.output,
        epochs=args.epochs,
        batch_size=args.batch,
        resume=args.resume,
    )
    for message, ckpt_path in training_events:
        print(message)
        latest_ckpt = ckpt_path or latest_ckpt

    if not latest_ckpt:
        print("Training failed!")
        sys.exit(1)

    print("\nTraining complete!")
    print(f"Model: {latest_ckpt}")

def cli_train(args):
    """CLI training mode for RVC.

    Preprocesses ``args.audio`` into ``<args.output>/data`` and then calls
    ``train_rvc`` with the epoch, batch-size, learning-rate and sample-rate
    arguments. Exits with status 1 when preprocessing returns None or when
    training returns no checkpoint.
    """
    print("=== RVC Training ===")
    print(f"Input audio: {args.audio}")
    print(f"Output dir: {args.output}")

    # Preprocessed data is kept in a "data" folder under the output directory.
    data_dir = f"{args.output}/data"

    # Step 1: preprocess
    print("\n[1/2] Preprocessing audio...")
    preprocessed = preprocess_audio_for_training(
        args.audio, data_dir, target_sr=args.sr, f0_method=args.f0
    )
    if preprocessed is None:
        print("Preprocessing failed!")
        sys.exit(1)

    # Step 2: train
    print(f"\n[2/2] Training for {args.epochs} epochs...")
    train_kwargs = {
        "data_dir": data_dir,
        "output_dir": args.output,
        "epochs": args.epochs,
        "batch_size": args.batch,
        "lr": args.lr,
        "target_sr": args.sr,
    }
    checkpoint_path, index_path = train_rvc(**train_kwargs)

    if not checkpoint_path:
        print("Training failed!")
        sys.exit(1)

    print("\nTraining complete!")
    print(f"Model saved: {checkpoint_path}")
    if index_path:
        print(f"Index saved: {index_path}")

def build_cli_parser():
    """Build the argument parser for the ``infer``, ``train`` and ``train-beatrice`` commands.

    Every sub-parser is given ``RawDescriptionHelpFormatter`` explicitly:
    sub-parsers do not inherit the parent's formatter, and with the default
    formatter the multi-line "Examples:" epilogs are re-wrapped into a single
    paragraph in ``--help`` output.

    Returns:
        The configured ``argparse.ArgumentParser``; the chosen sub-command is
        stored in ``args.command`` (None when no sub-command is given).
    """
    raw_formatter = argparse.RawDescriptionHelpFormatter
    parser = argparse.ArgumentParser(
        description="RVC Voice Conversion - Inference and Training",
        formatter_class=raw_formatter,
    )
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # Inference subcommand
    infer_parser = subparsers.add_parser(
        "infer",
        help="Voice conversion inference (RVC + Beatrice)",
        formatter_class=raw_formatter,
        epilog=(
            "Examples:\n"
            "  python app.py infer -i voice.wav -m model.pth -o output.wav\n"
            "  python app.py infer -i voice.wav -m beatrice_model.pt.gz -o output.wav --type beatrice\n"
            "  python app.py infer -i voice.wav --example -o output.wav\n"
        ),
    )
    infer_parser.add_argument("-i", "--input", required=True, help="Input audio file")
    infer_parser.add_argument("-o", "--output", required=True, help="Output audio file")
    infer_parser.add_argument("-m", "--model", help="Model file (.pth for RVC, .pt.gz for Beatrice)")
    infer_parser.add_argument("--type", choices=["rvc", "beatrice"], default=None, help="Model type (auto-detected from extension)")
    infer_parser.add_argument("--index", help="Index file (.index) - RVC only")
    infer_parser.add_argument("--example", action="store_true", help="Use example model (Benee-RVC)")
    infer_parser.add_argument("-p", "--pitch", type=int, default=0, help="Pitch shift (-12 to 12)")
    infer_parser.add_argument("--f0", choices=["rmvpe", "pm", "harvest"], default="rmvpe", help="F0 method (RVC only)")
    infer_parser.add_argument("--index-rate", type=float, default=0.75, help="Index rate 0-1 (RVC only)")
    infer_parser.add_argument("--speaker", type=int, default=0, help="Target speaker index (Beatrice only)")
    infer_parser.add_argument("--formant-shift", type=float, default=0.0, help="Formant shift -2 to 2 (Beatrice only)")

    # Training subcommand (RVC)
    train_parser = subparsers.add_parser(
        "train",
        help="Train RVC model",
        formatter_class=raw_formatter,
        epilog=(
            "Examples:\n"
            "  python app.py train -a voice.mp3 -o ./my_model --epochs 5\n"
            "  python app.py train -a dataset.wav -o ./trained --epochs 10 --batch 4\n"
        ),
    )
    train_parser.add_argument("-a", "--audio", required=True, help="Training audio file")
    train_parser.add_argument("-o", "--output", required=True, help="Output directory for model")
    train_parser.add_argument("--epochs", type=int, default=5, help="Number of epochs (default: 5)")
    train_parser.add_argument("--batch", type=int, default=2, help="Batch size (default: 2)")
    train_parser.add_argument("--lr", type=float, default=1e-5, help="Learning rate (default: 1e-5)")
    train_parser.add_argument("--sr", type=int, default=40000, help="Sample rate (default: 40000)")
    train_parser.add_argument("--f0", choices=["rmvpe", "pm", "harvest"], default="rmvpe", help="F0 method (default: rmvpe)")

    # Beatrice v2 Training subcommand
    beatrice_parser = subparsers.add_parser(
        "train-beatrice",
        help="Train Beatrice v2 model (downloads assets from HF)",
        formatter_class=raw_formatter,
        epilog=(
            "Examples:\n"
            "  python app.py train-beatrice -a voice.mp3 -o ./beatrice_model --epochs 20\n"
            "  python app.py train-beatrice -a dataset.wav -o ./trained --epochs 50 --batch 8 --resume\n"
            "\n"
            "Pretrained assets are downloaded automatically from HuggingFace.\n"
        ),
    )
    beatrice_parser.add_argument("-a", "--audio", required=True, help="Training audio file")
    beatrice_parser.add_argument("-o", "--output", required=True, help="Output directory for model")
    beatrice_parser.add_argument("--epochs", type=int, default=20, help="Number of epochs (default: 20)")
    beatrice_parser.add_argument("--batch", type=int, default=24, help="Batch size (default: 24, reduce for less VRAM)")
    beatrice_parser.add_argument("--resume", action="store_true", help="Resume from checkpoint")

    return parser


if __name__ == "__main__":
    # Any CLI argument (besides the script name) selects CLI mode;
    # with no arguments the Gradio app is launched instead.
    if len(sys.argv) > 1:
        parser = build_cli_parser()
        args = parser.parse_args()

        if args.command == "infer":
            cli_convert(args)
        elif args.command == "train":
            cli_train(args)
        elif args.command == "train-beatrice":
            cli_train_beatrice(args)
        else:
            # Arguments were given but no sub-command was selected.
            parser.print_help()
    else:
        # ============================================================
        # GRADIO QUEUE — Stability config for 2-core CPU HF Spaces
        #
        # Why these settings matter:
        #
        # max_size=3
        #   Hard cap on pending jobs. On a 2-core CPU, RVC can take
        #   3–10 minutes per job. Without a cap, users pile up and
        #   the Space exhausts RAM while holding dozens of audio
        #   blobs in the queue. 3 is a safe upper bound: one running
        #   + two waiting. Any new request beyond that gets a clear
        #   "queue full" HTTP 503 instead of a silent timeout.
        #
        # default_concurrency_limit=1
        #   Only one inference job runs at a time. Two simultaneous
        #   RVC jobs on a 2-core CPU would each get 1 thread, which
        #   is slower than one job using both cores sequentially.
        #   Serial execution is strictly faster here.
        #
        # The convert_btn.click already has concurrency_limit=1 set
        # at the event level; this queue-level setting enforces it
        # globally (including any API calls) so nothing bypasses it.
        #
        # status_update_rate="auto"
        #   Gradio sends SSE heartbeats to the browser on this cadence.
        #   "auto" (≈ 1 Hz) is enough to keep the WebSocket / SSE
        #   connection alive across long jobs without flooding the
        #   2-core CPU with keep-alive overhead.
        # ============================================================
        demo.queue(
            max_size=3,
            default_concurrency_limit=1,
            status_update_rate="auto",
        ).launch(
            mcp_server=True,
            show_error=True,
            ssr_mode=False,
        )