himahande45 commited on
Commit
402a61f
·
verified ·
1 Parent(s): 5903ab3

Add IndicVox paper demo Space

Browse files
Files changed (42) hide show
  1. .gitattributes +3 -0
  2. README.md +29 -5
  3. app.py +451 -0
  4. assets/voices/hin_m_ref_00.wav +3 -0
  5. assets/voices/tam_f_ref_00.wav +3 -0
  6. assets/voices/tam_m_ref_00.wav +3 -0
  7. code_switch_prompts.json +166 -0
  8. packages.txt +2 -0
  9. requirements.txt +13 -0
  10. voxcpm/__init__.py +5 -0
  11. voxcpm/cli.py +598 -0
  12. voxcpm/core.py +333 -0
  13. voxcpm/model/__init__.py +4 -0
  14. voxcpm/model/utils.py +121 -0
  15. voxcpm/model/voxcpm.py +985 -0
  16. voxcpm/model/voxcpm2.py +1224 -0
  17. voxcpm/modules/__init__.py +0 -0
  18. voxcpm/modules/audiovae/__init__.py +2 -0
  19. voxcpm/modules/audiovae/audio_vae.py +377 -0
  20. voxcpm/modules/audiovae/audio_vae_v2.py +486 -0
  21. voxcpm/modules/layers/__init__.py +1 -0
  22. voxcpm/modules/layers/lora.py +130 -0
  23. voxcpm/modules/layers/scalar_quantization_layer.py +26 -0
  24. voxcpm/modules/locdit/__init__.py +3 -0
  25. voxcpm/modules/locdit/local_dit.py +114 -0
  26. voxcpm/modules/locdit/local_dit_v2.py +116 -0
  27. voxcpm/modules/locdit/unified_cfm.py +232 -0
  28. voxcpm/modules/locenc/__init__.py +1 -0
  29. voxcpm/modules/locenc/local_encoder.py +30 -0
  30. voxcpm/modules/minicpm4/__init__.py +3 -0
  31. voxcpm/modules/minicpm4/cache.py +47 -0
  32. voxcpm/modules/minicpm4/config.py +30 -0
  33. voxcpm/modules/minicpm4/model.py +429 -0
  34. voxcpm/training/__init__.py +27 -0
  35. voxcpm/training/accelerator.py +163 -0
  36. voxcpm/training/config.py +38 -0
  37. voxcpm/training/data.py +214 -0
  38. voxcpm/training/packers.py +296 -0
  39. voxcpm/training/state.py +20 -0
  40. voxcpm/training/tracker.py +78 -0
  41. voxcpm/utils/text_normalize.py +188 -0
  42. voxcpm/zipenhancer.py +72 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/voices/hin_m_ref_00.wav filter=lfs diff=lfs merge=lfs -text
37
+ assets/voices/tam_f_ref_00.wav filter=lfs diff=lfs merge=lfs -text
38
+ assets/voices/tam_m_ref_00.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,36 @@
1
  ---
2
- title: Indicvox Hindi Tamil Codeswitching Tts
3
- emoji: 🏃
4
- colorFrom: pink
5
- colorTo: gray
6
  sdk: gradio
7
  sdk_version: 6.12.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: "IndicVox: Hindi & Tamil Code-Switching TTS"
3
+ emoji: "🎙️"
4
+ colorFrom: indigo
5
+ colorTo: green
6
  sdk: gradio
7
  sdk_version: 6.12.0
8
  app_file: app.py
9
  pinned: false
10
+ python_version: "3.10.16"
11
+ suggested_hardware: a10g-small
12
  ---
13
 
14
+ # IndicVox
15
+
16
+ IndicVox is a GPU-backed research demo for multilingual text-to-speech across Hindi, Tamil, and code-switched prompts. The Space exposes the paper checkpoints through a clean Gradio UI with built-in voice presets and example prompts.
17
+
18
+ ## What it includes
19
+
20
+ - `Hindi Focus` for Hindi and Hindi-English prompts
21
+ - `Tamil Focus` for Tamil and Tamil-English prompts
22
+ - `Research Baseline` for direct comparison against the untuned multilingual model
23
+ - Built-in research voice presets for fast demo playback
24
+ - Zero-shot `Text Only` mode if you want to skip reference conditioning
25
+
26
+ ## Usage
27
+
28
+ 1. Pick a model profile.
29
+ 2. Type a Hindi, Tamil, or code-switched prompt.
30
+ 3. Pick a built-in voice preset or `Text Only`.
31
+ 4. Click `Generate Speech`.
32
+
33
+ ## Notes
34
+
35
+ - The base multilingual model stays resident in GPU memory and the paper checkpoints are swapped on demand.
36
+ - The Space is meant for inference/demo usage, not batch evaluation.
app.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+ import threading
7
+ import time
8
+ import traceback
9
+ from pathlib import Path
10
+
11
+ import gradio as gr
12
+ import numpy as np
13
+ import torch
14
+ from huggingface_hub import snapshot_download
15
+
16
+ APP_DIR = Path(__file__).resolve().parent
17
+
18
+
19
def resolve_persist_root() -> Path:
    """Pick a writable root directory for persistent caches.

    Prefers the Space's persistent ``/data`` volume when it exists and is
    writable; otherwise falls back to a ``.cache`` directory next to the app.
    """
    persistent = Path("/data")
    if persistent.exists() and os.access(persistent, os.W_OK):
        return persistent

    fallback = APP_DIR / ".cache"
    fallback.mkdir(parents=True, exist_ok=True)
    return fallback
27
+
28
+
29
# Route Hugging Face caches to the persistent volume (or the local fallback)
# before any hub download runs; setdefault respects pre-set env overrides.
PERSIST_ROOT = resolve_persist_root()
HF_HOME = PERSIST_ROOT / "huggingface"
HF_HOME.mkdir(parents=True, exist_ok=True)
os.environ.setdefault("HF_HOME", str(HF_HOME))
os.environ.setdefault("HF_HUB_CACHE", str(HF_HOME / "hub"))

# Make the vendored `voxcpm` package (shipped inside this Space) importable.
sys.path.insert(0, str(APP_DIR))

from voxcpm import VoxCPM
from voxcpm.model.voxcpm import LoRAConfig
39
+
40
SPACE_TITLE = "IndicVox: Hindi & Tamil Code-Switching TTS"
MODEL_REPO_ID = "himahande45/multilingual-tts"
PROMPTS_FILE = APP_DIR / "code_switch_prompts.json"
VOICE_DIR = APP_DIR / "assets" / "voices"
DEFAULT_PROFILE = "Tamil Focus"
DEFAULT_VOICE = "Tamil Female Research Voice"
# Tamil-English code-switched sentence pre-filled in the prompt textbox.
DEFAULT_TEXT = "இந்த experimentக்கு clean reference audio use பண்ணணும், இல்லனா output quality drop ஆகும்."

# Only these files are pulled from the model repo: the base model directory
# plus the two LoRA checkpoints referenced by PROFILES below.
MODEL_PATTERNS = [
    "VoxCPM2_local/*",
    "finetune_checkpoints/step_0000500/lora_config.json",
    "finetune_checkpoints/step_0000500/lora_weights.safetensors",
    "finetune_checkpoints/step_0001000/lora_config.json",
    "finetune_checkpoints/step_0001000/lora_weights.safetensors",
]

# UI profile name -> LoRA checkpoint dir relative to the snapshot.
# checkpoint_dir=None means the base model with LoRA disabled.
PROFILES = {
    "Tamil Focus": {
        "description": "Best for Tamil and Tamil-English code-switched prompts.",
        "checkpoint_dir": "finetune_checkpoints/step_0001000",
    },
    "Hindi Focus": {
        "description": "Best for Hindi and Hindi-English code-switched prompts.",
        "checkpoint_dir": "finetune_checkpoints/step_0000500",
    },
    "Research Baseline": {
        "description": "Base multilingual checkpoint without paper fine-tuning.",
        "checkpoint_dir": None,
    },
}
70
+
71
# Voice preset name -> reference wav, its transcript (required by the model
# for prompt conditioning), and a short UI summary. "Text Only" (path=None)
# selects zero-shot synthesis without a reference clip.
VOICE_PRESETS = {
    "Hindi Research Voice": {
        "path": VOICE_DIR / "hin_m_ref_00.wav",
        "transcript": "लेकिन क्या यह हम सभी कार्यक्रमों के साथ कर सकते?",
        "summary": "Short Hindi reference used for sharper Hindi + English prompting.",
    },
    "Tamil Female Research Voice": {
        "path": VOICE_DIR / "tam_f_ref_00.wav",
        "transcript": "விக்கற நேரத்தையும் லாபத்தையும் பொறுத்து, இந்த டேக்ஸை ஷார்ட் டேர்ம் இல்ல லாங் டேர்ம்னு பிரிப்பாங்க.",
        "summary": "Clear Tamil reference with stable conversational prosody.",
    },
    "Tamil Male Research Voice": {
        "path": VOICE_DIR / "tam_m_ref_00.wav",
        "transcript": "கொரோனா பாதிப்பு காலத்தில் எண்பது கோடி மக்களுக்கு உணவு தானியம் வழங்கப்பட்டதாகவும் அவர் தெரிவித்தார்.",
        "summary": "Tamil male reference that holds rhythm well on longer prompts.",
    },
    "Text Only": {
        "path": None,
        "transcript": None,
        "summary": "Zero-shot generation without a reference voice clip.",
    },
}
93
+
94
# Styling for the hero card, stat chips, and footnotes; the `footer` rule
# hides the default Gradio footer.
CUSTOM_CSS = """
#app-shell {
    max-width: 1180px;
    margin: 0 auto;
}
#hero {
    padding: 24px 26px 12px 26px;
    border: 1px solid rgba(255, 255, 255, 0.08);
    border-radius: 22px;
    background:
        radial-gradient(circle at top right, rgba(99, 102, 241, 0.16), transparent 34%),
        radial-gradient(circle at bottom left, rgba(16, 185, 129, 0.14), transparent 30%),
        rgba(15, 23, 42, 0.74);
}
.stat-chip {
    display: inline-block;
    margin: 6px 8px 0 0;
    padding: 8px 12px;
    border-radius: 999px;
    background: rgba(255, 255, 255, 0.06);
    font-size: 0.92rem;
}
.footnote {
    opacity: 0.78;
    font-size: 0.94rem;
}
footer {
    visibility: hidden;
}
"""

# TF32 trades a little matmul precision for throughput on Ampere+ GPUs.
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.set_float32_matmul_precision("high")

THEME = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
131
+
132
+
133
def load_examples() -> list[list[str]]:
    """Build the example rows shown in the UI tabs.

    Each row is ``[prompt text, profile name, voice preset name]``, drawn
    from the bundled code-switch prompt bank.
    """
    with PROMPTS_FILE.open("r", encoding="utf-8") as handle:
        bank = json.load(handle)

    picks = [
        ("hi_en", 0, "Hindi Focus", "Hindi Research Voice"),
        ("hi_en", 9, "Hindi Focus", "Hindi Research Voice"),
        ("hi_en", 16, "Hindi Focus", "Hindi Research Voice"),
        ("ta_en", 0, "Tamil Focus", "Tamil Female Research Voice"),
        ("ta_en", 9, "Tamil Focus", "Tamil Female Research Voice"),
        ("ta_en", 14, "Tamil Focus", "Tamil Male Research Voice"),
    ]
    return [[bank[lang][idx]["text"], profile, voice] for lang, idx, profile, voice in picks]
145
+
146
+
147
def profile_markdown(profile_name: str) -> str:
    """Render the Markdown blurb shown under the model-profile dropdown."""
    blurb = PROFILES[profile_name]["description"]
    return f"**{profile_name}** \n{blurb}"
150
+
151
+
152
def voice_markdown(voice_name: str) -> str:
    """Render a Markdown blurb for a voice preset, including its reference transcript when one exists."""
    preset = VOICE_PRESETS[voice_name]
    header = f"**{voice_name}** \n{preset['summary']}"
    if preset["path"] is None:
        return header
    return header + f" \nReference transcript: `{preset['transcript']}`"
158
+
159
+
160
def dynamic_max_len(text: str) -> int:
    """Estimate a generation-length budget from the prompt size.

    Scales ~7.5 units per visible character, clamped to [280, 900] so short
    prompts still get headroom and long prompts stay bounded.
    """
    visible_chars = len(text.strip()) or 1
    scaled = int(visible_chars * 7.5)
    return min(max(scaled, 280), 900)
163
+
164
+
165
class ModelManager:
    """Owns the VoxCPM model and swaps LoRA checkpoints between UI profiles.

    The base multilingual model is loaded once at startup and kept resident;
    switching profiles only loads/unloads the (much smaller) LoRA weights.
    Inference and profile switching are serialized under ``self.lock``.
    """

    def __init__(self) -> None:
        self.lock = threading.Lock()
        self.repo_dir = self._resolve_repo_dir()
        self.base_dir = self.repo_dir / "VoxCPM2_local"
        # Profile whose LoRA weights are currently loaded into the model.
        self.loaded_profile: str | None = None
        # Profile currently selected for inference; may be the baseline
        # (LoRA disabled) while `loaded_profile` still holds weights.
        self.active_profile: str | None = None
        self.model = self._load_model()
        self.activate_profile(DEFAULT_PROFILE)

    def _resolve_repo_dir(self) -> Path:
        """Locate the model repo: a local override dir, or a hub snapshot.

        ``INDICVOX_LOCAL_MODEL_REPO`` short-circuits the download for local
        development; otherwise only MODEL_PATTERNS files are fetched.
        """
        local_repo = os.getenv("INDICVOX_LOCAL_MODEL_REPO")
        if local_repo:
            path = Path(local_repo).expanduser().resolve()
            if path.exists():
                return path
            raise FileNotFoundError(f"INDICVOX_LOCAL_MODEL_REPO does not exist: {path}")

        token = os.getenv("HF_TOKEN")
        snapshot_path = snapshot_download(
            repo_id=MODEL_REPO_ID,
            repo_type="model",
            allow_patterns=MODEL_PATTERNS,
            token=token,
        )
        return Path(snapshot_path)

    def _load_lora_config(self, checkpoint_dir: Path) -> LoRAConfig:
        """Parse a checkpoint's lora_config.json into a LoRAConfig."""
        payload = json.loads((checkpoint_dir / "lora_config.json").read_text(encoding="utf-8"))
        return LoRAConfig(**payload["lora_config"])

    def _load_model(self) -> VoxCPM:
        """Load the base model (GPU required), wired for the default profile's LoRA shape."""
        if not torch.cuda.is_available():
            raise RuntimeError("A GPU runtime is required. Request an A10G/L4 Space and restart.")

        # The LoRA config (not weights) is needed at construction time so the
        # adapter layers exist; weights are attached via activate_profile().
        checkpoint_dir = self.repo_dir / PROFILES[DEFAULT_PROFILE]["checkpoint_dir"]
        lora_config = self._load_lora_config(checkpoint_dir)
        model = VoxCPM.from_pretrained(
            hf_model_id=str(self.base_dir),
            load_denoiser=False,
            optimize=False,
            lora_config=lora_config,
        )
        return model

    def activate_profile(self, profile_name: str) -> None:
        """Make *profile_name* the inference profile, (re)loading LoRA weights if needed.

        Baseline profile: disable LoRA but keep any loaded weights cached.
        Fine-tuned profile: swap in its weights only when a different
        checkpoint is currently loaded.
        """
        spec = PROFILES[profile_name]
        checkpoint_dir = spec["checkpoint_dir"]

        if checkpoint_dir is None:
            self.model.set_lora_enabled(False)
            self.active_profile = profile_name
            return

        if self.loaded_profile != profile_name:
            if self.loaded_profile is not None:
                self.model.unload_lora()
            self.model.load_lora(str(self.repo_dir / checkpoint_dir))
            self.loaded_profile = profile_name

        self.model.set_lora_enabled(True)
        self.active_profile = profile_name

    def synthesize(
        self,
        text: str,
        profile_name: str,
        voice_name: str,
        cfg_value: float,
        inference_steps: int,
    ) -> tuple[tuple[int, np.ndarray], str]:
        """Generate audio for *text* and return ((sample_rate, wav), status markdown).

        Raises gr.Error on an empty prompt. The lock serializes profile
        switching and generation across concurrent requests.
        """
        clean_text = text.strip()
        if not clean_text:
            raise gr.Error("Enter a prompt first.")

        start = time.perf_counter()
        with self.lock:
            self.activate_profile(profile_name)
            kwargs = {
                "text": clean_text,
                "cfg_value": float(cfg_value),
                "inference_timesteps": int(inference_steps),
                "max_len": dynamic_max_len(clean_text),
            }

            # Reference-conditioned presets pass the clip and its transcript;
            # the "Text Only" preset (path=None) runs zero-shot.
            voice = VOICE_PRESETS[voice_name]
            if voice["path"] is not None:
                kwargs["prompt_wav_path"] = str(voice["path"])
                kwargs["prompt_text"] = voice["transcript"]

            wav = self.model.generate(**kwargs)
            sample_rate = int(self.model.tts_model.sample_rate)

            # Normalize the output to a clipped float32 numpy vector for gr.Audio.
            if isinstance(wav, torch.Tensor):
                wav = wav.detach().cpu().numpy()
            wav = np.asarray(wav, dtype=np.float32).squeeze()
            wav = np.clip(wav, -1.0, 1.0)

            elapsed = time.perf_counter() - start
            duration = float(wav.shape[-1]) / sample_rate if wav.size else 0.0
            # Real-time factor: generation seconds per second of audio.
            rtf = elapsed / duration if duration > 0 else float("nan")
            speed_line = f"RTF {rtf:.2f}x" if np.isfinite(rtf) else "RTF n/a"
            status = (
                f"**Ready** \n"
                f"Profile: `{profile_name}` \n"
                f"Voice: `{voice_name}` \n"
                f"Audio length: `{duration:.2f}s` \n"
                f"Generation time: `{elapsed:.2f}s` ({speed_line})"
            )
            return (sample_rate, wav), status

    def boot_markdown(self) -> str:
        """Markdown for the status panel after a successful startup."""
        gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU"
        active_profile = self.active_profile or DEFAULT_PROFILE
        return (
            f"**GPU Ready** \n"
            f"Runtime: `{gpu_name}` \n"
            f"Warm profile: `{active_profile}` \n"
            f"Model source: `{MODEL_REPO_ID}`"
        )
285
+
286
+
287
# Set on startup failure so the UI can surface the traceback instead of
# leaving the Space in a crash loop.
BOOT_ERROR: str | None = None
MODEL_MANAGER: ModelManager | None = None

try:
    MODEL_MANAGER = ModelManager()
except Exception:
    # Keep the app importable; boot_status() renders this traceback in the UI.
    BOOT_ERROR = traceback.format_exc()

EXAMPLES = load_examples()
296
+
297
+
298
def synthesize(text: str, profile_name: str, voice_name: str, cfg_value: float, inference_steps: int):
    """Gradio entry point: delegate to the model manager, or surface the boot error."""
    manager = MODEL_MANAGER
    if manager is None:
        raise gr.Error(f"Model initialization failed.\n\n{BOOT_ERROR}")
    return manager.synthesize(text, profile_name, voice_name, cfg_value, inference_steps)
302
+
303
+
304
def voice_preview(voice_name: str):
    """Return (preview audio path or None, Markdown blurb) for the chosen voice preset."""
    preset_path = VOICE_PRESETS[voice_name]["path"]
    if preset_path is None:
        return None, voice_markdown(voice_name)
    return str(preset_path), voice_markdown(voice_name)
308
+
309
+
310
def clear_prompt() -> str:
    """Value handed back to the prompt textbox when the user clicks Clear."""
    return ""
312
+
313
+
314
def boot_status() -> str:
    """Markdown for the status panel: runtime info on success, traceback on failure."""
    if MODEL_MANAGER is None:
        return f"**Startup Error** \n```text\n{BOOT_ERROR}\n```"
    return MODEL_MANAGER.boot_markdown()
318
+
319
+
320
# FIX: `Blocks.launch()` has no `theme`/`css` keyword arguments — they belong
# to the `gr.Blocks(...)` constructor. The original code built `gr.Blocks()`
# bare and called `demo.launch(theme=THEME, css=CUSTOM_CSS)`, so the custom
# theme/CSS were never applied and launch raised a TypeError at startup.
with gr.Blocks(theme=THEME, css=CUSTOM_CSS) as demo:
    with gr.Column(elem_id="app-shell"):
        # Hero banner styled by CUSTOM_CSS (#hero / .stat-chip).
        gr.HTML(
            """
            <div id="hero">
                <h1>IndicVox</h1>
                <p>Research demo for multilingual TTS across Hindi, Tamil, and code-switched prompts.</p>
                <div>
                    <span class="stat-chip">GPU-backed Space</span>
                    <span class="stat-chip">Warm-loaded model</span>
                    <span class="stat-chip">Hindi + Tamil + English prompts</span>
                </div>
            </div>
            """
        )

        with gr.Row():
            # Left column: prompt, profile/voice selection, advanced knobs.
            with gr.Column(scale=5):
                prompt = gr.Textbox(
                    label="Prompt",
                    value=DEFAULT_TEXT,
                    lines=5,
                    max_lines=8,
                    placeholder="Type Hindi, Tamil, or code-switched text here...",
                )

                with gr.Row():
                    profile = gr.Dropdown(
                        choices=list(PROFILES.keys()),
                        value=DEFAULT_PROFILE,
                        label="Model Profile",
                        info="Switch between the Hindi-tuned and Tamil-tuned research profiles.",
                    )
                    voice = gr.Dropdown(
                        choices=list(VOICE_PRESETS.keys()),
                        value=DEFAULT_VOICE,
                        label="Voice Preset",
                        info="Built-in research voices plus a zero-shot option.",
                    )

                with gr.Accordion("Advanced Settings", open=False):
                    with gr.Row():
                        cfg_value = gr.Slider(
                            minimum=1.0,
                            maximum=4.0,
                            value=2.0,
                            step=0.1,
                            label="CFG",
                            info="Higher values usually sound more guided but less relaxed.",
                        )
                        inference_steps = gr.Slider(
                            minimum=6,
                            maximum=16,
                            value=10,
                            step=1,
                            label="Diffusion Steps",
                            info="10 is the paper demo default.",
                        )

                with gr.Row():
                    generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
                    clear_btn = gr.Button("Clear Prompt")

                with gr.Row():
                    profile_info = gr.Markdown(profile_markdown(DEFAULT_PROFILE))
                    voice_info = gr.Markdown(voice_markdown(DEFAULT_VOICE))

            # Right column: status, generated audio, and preset preview.
            with gr.Column(scale=4):
                status = gr.Markdown(boot_status())
                output_audio = gr.Audio(
                    label="Synthesized Audio",
                    autoplay=False,
                    format="wav",
                )
                voice_preview_audio = gr.Audio(
                    label="Voice Preset Preview",
                    value=str(VOICE_PRESETS[DEFAULT_VOICE]["path"]),
                    interactive=False,
                    autoplay=False,
                    format="wav",
                )
                gr.Markdown(
                    "The demo keeps the base model resident on GPU and swaps paper checkpoints on demand.",
                    elem_classes=["footnote"],
                )

        # Example tabs, filtered from the shared EXAMPLES rows by profile.
        with gr.Tabs():
            with gr.Tab("Hindi + English Examples"):
                gr.Examples(
                    examples=[row for row in EXAMPLES if row[1] == "Hindi Focus"],
                    inputs=[prompt, profile, voice],
                    cache_examples=False,
                )
            with gr.Tab("Tamil + English Examples"):
                gr.Examples(
                    examples=[row for row in EXAMPLES if row[1] == "Tamil Focus"],
                    inputs=[prompt, profile, voice],
                    cache_examples=False,
                )

        gr.Markdown(
            """
            **Demo notes**

            - `Hindi Focus` maps to the Hindi-strong checkpoint from the paper experiments.
            - `Tamil Focus` maps to the Tamil + code-switch checkpoint and is the default for the Space.
            - `Text Only` skips the reference clip and runs zero-shot synthesis.
            """,
            elem_classes=["footnote"],
        )

    # Event wiring. Only the generate button is exposed on the API surface.
    generate_btn.click(
        fn=synthesize,
        inputs=[prompt, profile, voice, cfg_value, inference_steps],
        outputs=[output_audio, status],
        api_name="synthesize",
    )
    prompt.submit(
        fn=synthesize,
        inputs=[prompt, profile, voice, cfg_value, inference_steps],
        outputs=[output_audio, status],
        api_name=False,
    )
    profile.change(fn=profile_markdown, inputs=profile, outputs=profile_info, api_name=False)
    voice.change(fn=voice_preview, inputs=voice, outputs=[voice_preview_audio, voice_info], api_name=False)
    clear_btn.click(fn=clear_prompt, outputs=prompt, api_name=False)


# Serialize generation (the model is a single GPU resource) and bound the queue.
demo.queue(default_concurrency_limit=1, max_size=16)

if __name__ == "__main__":
    demo.launch()
assets/voices/hin_m_ref_00.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5400ed6ce26df5efddce5e264153423f378623af05f987cc1c435c06cfd24df2
3
+ size 398732
assets/voices/tam_f_ref_00.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71564e96ac0378d91f35cf70e27f56e5ad814db267c59eac91ac370e612998f0
3
+ size 605228
assets/voices/tam_m_ref_00.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07ba36b8a8ac46246b0ae5cabe12b90325bd62eb4532dd4b6a117b306c8658d3
3
+ size 573452
code_switch_prompts.json ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "hi_en": [
3
+ {
4
+ "id": "hi_en_001",
5
+ "text": "आज morning standup में हमने Hindi और English prompts पर ASR output compare किया।"
6
+ },
7
+ {
8
+ "id": "hi_en_002",
9
+ "text": "कल client demo से पहले तुम latest checkpoint का audio sample एक बार verify कर लो।"
10
+ },
11
+ {
12
+ "id": "hi_en_003",
13
+ "text": "अगर final report ready है तो उसे shared drive में upload कर दो।"
14
+ },
15
+ {
16
+ "id": "hi_en_004",
17
+ "text": "ये model normal sentences अच्छा बोलता है, लेकिन code-switch parts में अभी भी थोड़ा hesitation आता है।"
18
+ },
19
+ {
20
+ "id": "hi_en_005",
21
+ "text": "मुझे लगता है कि speaker similarity के लिए हमें clean reference clip use करना चाहिए।"
22
+ },
23
+ {
24
+ "id": "hi_en_006",
25
+ "text": "तुमने meeting notes में Tamil section add किया या वो अभी pending है?"
26
+ },
27
+ {
28
+ "id": "hi_en_007",
29
+ "text": "आज lab में GPU free है, इसलिए full evaluation run अभी start कर देते हैं।"
30
+ },
31
+ {
32
+ "id": "hi_en_008",
33
+ "text": "अगर transcript में punctuation ज्यादा हो तो Whisper कभी कभी extra words insert कर देता है।"
34
+ },
35
+ {
36
+ "id": "hi_en_009",
37
+ "text": "इस experiment के लिए मैंने short reference audio चुना ताकि cloning stable रहे।"
38
+ },
39
+ {
40
+ "id": "hi_en_010",
41
+ "text": "हम paper में monolingual results और code-switch results अलग tables में दिखाएँगे।"
42
+ },
43
+ {
44
+ "id": "hi_en_011",
45
+ "text": "please final plots save कर लेना, वरना thesis draft फिर से update करना पड़ेगा।"
46
+ },
47
+ {
48
+ "id": "hi_en_012",
49
+ "text": "आज के test set में proper nouns, news style और casual conversation तीनों mix किए गए हैं।"
50
+ },
51
+ {
52
+ "id": "hi_en_013",
53
+ "text": "अगर base model Tamil शब्द गलत बोलता है तो LoRA adaptation का effect तुरंत दिख जाएगा।"
54
+ },
55
+ {
56
+ "id": "hi_en_014",
57
+ "text": "मैंने summary sheet में WER, CER, switch-WER और speaker similarity सब add कर दिया है।"
58
+ },
59
+ {
60
+ "id": "hi_en_015",
61
+ "text": "आज evening तक तुम generated audio folders को model-wise sort कर दो।"
62
+ },
63
+ {
64
+ "id": "hi_en_016",
65
+ "text": "meeting के बाद हम ASR transcripts manually spot-check भी करेंगे ताकि obvious errors miss न हों।"
66
+ },
67
+ {
68
+ "id": "hi_en_017",
69
+ "text": "ये checkpoint short prompts पर ठीक है, पर long mixed sentences में इसकी rhythm थोड़ी uneven लगती है।"
70
+ },
71
+ {
72
+ "id": "hi_en_018",
73
+ "text": "अगर inference time ज्यादा हुआ तो पहले pilot run करेंगे और फिर full batch launch करेंगे।"
74
+ },
75
+ {
76
+ "id": "hi_en_019",
77
+ "text": "reference speaker clean है, लेकिन generated output में English words का stress अभी consistent नहीं है।"
78
+ },
79
+ {
80
+ "id": "hi_en_020",
81
+ "text": "इस बार final appendix में example prompts, transcripts और metric formulas तीनों include करना।"
82
+ }
83
+ ],
84
+ "ta_en": [
85
+ {
86
+ "id": "ta_en_001",
87
+ "text": "நேத்து team meetingல புதிய checkpoint பற்றி detailedஆ பேசினோம்."
88
+ },
89
+ {
90
+ "id": "ta_en_002",
91
+ "text": "இந்த experimentக்கு clean reference audio use பண்ணணும், இல்லனா output quality drop ஆகும்."
92
+ },
93
+ {
94
+ "id": "ta_en_003",
95
+ "text": "final report ready ஆனதும் அதை shared folderல upload பண்ணிடு."
96
+ },
97
+ {
98
+ "id": "ta_en_004",
99
+ "text": "இந்த model Tamil words நல்லா பேசுது, ஆனா English switch வரும் இடங்களில் இன்னும் slight hesitation இருக்கு."
100
+ },
101
+ {
102
+ "id": "ta_en_005",
103
+ "text": "speaker similarity score stable ஆகணும்னா same voice reference தொடர்ந்து use பண்ணணும்."
104
+ },
105
+ {
106
+ "id": "ta_en_006",
107
+ "text": "இன்று full evaluation run start பண்ணலாம், ஏன்னா GPU slot இப்போ free இருக்கு."
108
+ },
109
+ {
110
+ "id": "ta_en_007",
111
+ "text": "Whisper transcriptல punctuation இல்லாதப்போ சில code-switch words betterஆ capture ஆகுது."
112
+ },
113
+ {
114
+ "id": "ta_en_008",
115
+ "text": "paper tableல monolingual Tamil resultsவும் Tamil-English resultsவும் separateஆ காட்டணும்."
116
+ },
117
+ {
118
+ "id": "ta_en_009",
119
+ "text": "இந்த promptல proper noun, news style, casual speech மூன்றும் mixedஆ இருக்கு."
120
+ },
121
+ {
122
+ "id": "ta_en_010",
123
+ "text": "latest checkpoint load பண்ணதுக்குப் பிறகு ஒரு short sanity test முதலில் run பண்ணலாம்."
124
+ },
125
+ {
126
+ "id": "ta_en_011",
127
+ "text": "please generated audio files எல்லாம் model-wise sort பண்ணி metrics folderக்குள் move பண்ணு."
128
+ },
129
+ {
130
+ "id": "ta_en_012",
131
+ "text": "இந்த setupல base modelக்கு Tamil pronunciation கொஞ்சம் weakஆ இருந்தா LoRA gain clearஆ தெரியும்."
132
+ },
133
+ {
134
+ "id": "ta_en_013",
135
+ "text": "summary sheetல WER, CER, switch-WER, speaker similarity எல்லாமே சேர்க்கணும்."
136
+ },
137
+ {
138
+ "id": "ta_en_014",
139
+ "text": "meeting முடிஞ்சதும் manual spot-check பண்ணி obvious ASR mistakes இருக்கா என்று பார்க்கலாம்."
140
+ },
141
+ {
142
+ "id": "ta_en_015",
143
+ "text": "short promptsல output cleanஆ இருக்கு, ஆனா long mixed sentenceல rhythm கொஞ்சம் unevenஆ இருக்கு."
144
+ },
145
+ {
146
+ "id": "ta_en_016",
147
+ "text": "if the plots look clean, appendixல example promptsமும் generated transcriptsமும் add பண்ணலாம்."
148
+ },
149
+ {
150
+ "id": "ta_en_017",
151
+ "text": "இந்த reference clip calmஆ இருக்குது, அதனால் generated voiceவும் naturalஆ வர வாய்ப்பு அதிகம்."
152
+ },
153
+ {
154
+ "id": "ta_en_018",
155
+ "text": "tonightக்குள் full batch finish ஆயிடுச்சுனா நாளைக்கு paper draftல numbers insert பண்ணலாம்."
156
+ },
157
+ {
158
+ "id": "ta_en_019",
159
+ "text": "speaker clone நல்லா இருக்கு, ஆனால் English stress pattern இன்னும் fully consistent இல்ல."
160
+ },
161
+ {
162
+ "id": "ta_en_020",
163
+ "text": "இந்த round முடிஞ்சதும் next stepஆ human listening test plan பண்ணலாம்."
164
+ }
165
+ ]
166
+ }
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ libsndfile1
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=6,<7
2
+ huggingface_hub>=1.0
3
+ numpy<3
4
+ torch>=2.5.0
5
+ torchaudio>=2.5.0
6
+ transformers>=4.36.2
7
+ einops>=0.8.0
8
+ inflect>=7.0.0
9
+ wetext
10
+ librosa>=0.10.2
11
+ soundfile>=0.12.1
12
+ pydantic>=2
13
+ safetensors>=0.4.5
voxcpm/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .core import VoxCPM
2
+
3
+ __all__ = [
4
+ "VoxCPM",
5
+ ]
voxcpm/cli.py ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ VoxCPM Command Line Interface
4
+
5
+ VoxCPM2-first CLI for voice design, cloning, and batch processing.
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ import soundfile as sf
15
+
16
+ from voxcpm.core import VoxCPM
17
+
18
+
19
+ DEFAULT_HF_MODEL_ID = "openbmb/VoxCPM2"
20
+
21
+ # -----------------------------
22
+ # Validators
23
+ # -----------------------------
24
+
25
+
26
def validate_file_exists(file_path: str, file_type: str = "file") -> Path:
    """Return *file_path* as a Path, raising FileNotFoundError when it is absent.

    *file_type* is a human-readable label used in the error message.
    """
    candidate = Path(file_path)
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"{file_type} '{file_path}' does not exist")
31
+
32
+
33
def require_file_exists(file_path: str, parser, file_type: str = "file") -> Path:
    """Resolve *file_path* via validate_file_exists, converting a missing file into an argparse error."""
    try:
        resolved = validate_file_exists(file_path, file_type)
    except FileNotFoundError as missing:
        parser.error(str(missing))
    else:
        return resolved
38
+
39
+
40
def validate_output_path(output_path: str) -> Path:
    """Return *output_path* as a Path, creating its parent directory if needed."""
    target = Path(output_path)
    parent_dir = target.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    return target
44
+
45
+
46
def validate_ranges(args, parser):
    """Validate numeric argument ranges, reporting the first violation via parser.error()."""
    violations = [
        (
            not (0.1 <= args.cfg_value <= 10.0),
            "--cfg-value must be between 0.1 and 10.0 (recommended: 1.0–3.0)",
        ),
        (
            not (1 <= args.inference_timesteps <= 100),
            "--inference-timesteps must be between 1 and 100 (recommended: 4–30)",
        ),
        (args.lora_r <= 0, "--lora-r must be a positive integer"),
        (args.lora_alpha <= 0, "--lora-alpha must be a positive integer"),
        (
            not (0.0 <= args.lora_dropout <= 1.0),
            "--lora-dropout must be between 0.0 and 1.0",
        ),
    ]
    # parser.error() raises SystemExit, so only the first failure is reported —
    # same short-circuit behavior as sequential if/error checks.
    for failed, message in violations:
        if failed:
            parser.error(message)
62
+
63
+
64
def warn_legacy_mode():
    """Emit a deprecation notice on stderr for the legacy flat CLI invocation."""
    message = "Warning: legacy root CLI arguments are deprecated. Prefer `voxcpm design|clone|batch ...`."
    print(message, file=sys.stderr)
69
+
70
+
71
def build_final_text(text: str, control: str | None) -> str:
    """Prefix *text* with a parenthesized control tag when one is supplied.

    Empty or whitespace-only control strings (and None) leave the text as-is.
    """
    tag = control.strip() if control else ""
    if not tag:
        return text
    return f"({tag}){text}"
74
+
75
+
76
def resolve_prompt_text(args, parser) -> str | None:
    """Resolve prompt text from --prompt-text / --prompt-file.

    Returns the stripped text, or None when neither option is set; supplying
    both is an argparse error.
    """
    inline_text = getattr(args, "prompt_text", None)
    text_file = getattr(args, "prompt_file", None)

    if inline_text and text_file:
        parser.error("Use either --prompt-text or --prompt-file, not both.")

    if text_file:
        source = require_file_exists(text_file, parser, "prompt text file")
        return source.read_text(encoding="utf-8").strip()

    return inline_text.strip() if inline_text else None
91
+
92
+
93
def detect_model_architecture(args) -> str | None:
    """Best-effort guess of the model architecture ("voxcpm" / "voxcpm2").

    Local directories are inspected via their config.json (missing key
    defaults to "voxcpm"); otherwise the repo id / path string itself is
    matched against known naming patterns. Returns None when undetermined.
    """
    model_location = getattr(args, "model_path", None) or getattr(args, "hf_model_id", None)
    if not model_location:
        return None

    if os.path.isdir(model_location):
        config_path = Path(model_location) / "config.json"
        if not config_path.exists():
            return None
        config = json.loads(config_path.read_text(encoding="utf-8"))
        return config.get("architecture", "voxcpm").lower()

    hint = str(model_location).lower()
    if "voxcpm2" in hint:
        return "voxcpm2"
    legacy_markers = ("voxcpm1.5", "voxcpm-1.5", "voxcpm_1.5")
    if any(marker in hint for marker in legacy_markers):
        return "voxcpm"

    return None
119
+
120
+
121
+ def validate_prompt_related_args(args, parser, prompt_text: str | None):
122
+ if prompt_text and not args.prompt_audio:
123
+ parser.error("--prompt-text/--prompt-file requires --prompt-audio.")
124
+
125
+ if args.prompt_audio and not prompt_text:
126
+ parser.error("--prompt-audio requires --prompt-text or --prompt-file.")
127
+
128
+ if args.control and prompt_text:
129
+ parser.error(
130
+ "--control cannot be used together with --prompt-text or --prompt-file."
131
+ )
132
+
133
+
134
def validate_reference_support(args, parser):
    """Reject --reference-audio when the selected model family cannot use it."""
    if not getattr(args, "reference_audio", None):
        return
    # Only the first-generation "voxcpm" architecture lacks reference support.
    if detect_model_architecture(args) == "voxcpm":
        parser.error("--reference-audio is only supported with VoxCPM2 models.")
141
+
142
+
143
def validate_design_args(args, parser):
    """`design` synthesizes from scratch; any prompt/reference audio is an error."""
    resolved_prompt = resolve_prompt_text(args, parser)
    if args.prompt_audio or args.reference_audio or resolved_prompt:
        parser.error(
            "`design` does not accept prompt/reference audio. Use `clone` instead."
        )
149
+
150
+
151
def validate_clone_args(args, parser):
    """Validate `clone` inputs and return the resolved prompt transcript."""
    prompt_text = resolve_prompt_text(args, parser)
    validate_prompt_related_args(args, parser, prompt_text)
    validate_reference_support(args, parser)

    # Cloning needs at least one voice source to imitate.
    has_voice_source = args.prompt_audio or args.reference_audio
    if not has_voice_source:
        parser.error(
            "`clone` requires --reference-audio, or --prompt-audio with --prompt-text/--prompt-file."
        )

    return prompt_text
162
+
163
+
164
def validate_batch_args(args, parser):
    """Validate batch-mode voice inputs; returns the prompt transcript (or None).

    Unlike `clone`, batch mode may run with no voice source at all.
    """
    prompt_text = resolve_prompt_text(args, parser)
    validate_prompt_related_args(args, parser, prompt_text)
    validate_reference_support(args, parser)
    return prompt_text
169
+
170
+
171
+ # -----------------------------
172
+ # Model loading
173
+ # -----------------------------
174
+
175
+
176
def load_model(args) -> VoxCPM:
    """Build a :class:`VoxCPM` instance from parsed CLI arguments.

    Resolution order: an explicit local --model-path wins; otherwise the
    model is fetched from the Hugging Face Hub via ``from_pretrained``.
    Optional LoRA weights are attached when --lora-path is given.
    Exits the process with status 1 if loading fails.
    """
    print("Loading VoxCPM model...", file=sys.stderr)

    # CLI flag takes precedence over the ZIPENHANCER_MODEL_PATH env var.
    zipenhancer_path = getattr(args, "zipenhancer_path", None) or os.environ.get(
        "ZIPENHANCER_MODEL_PATH", None
    )

    # Build LoRA config if provided
    lora_config = None
    lora_weights_path = getattr(args, "lora_path", None)
    if lora_weights_path:
        # Imported lazily so plain (non-LoRA) runs don't pay for it.
        from voxcpm.model.voxcpm import LoRAConfig

        lora_config = LoRAConfig(
            enable_lm=not args.lora_disable_lm,
            enable_dit=not args.lora_disable_dit,
            enable_proj=args.lora_enable_proj,
            r=args.lora_r,
            alpha=args.lora_alpha,
            dropout=args.lora_dropout,
        )

        print(
            f"LoRA config: r={lora_config.r}, alpha={lora_config.alpha}, "
            f"lm={lora_config.enable_lm}, dit={lora_config.enable_dit}, proj={lora_config.enable_proj}",
            file=sys.stderr,
        )

    # Load local model if specified
    if args.model_path:
        try:
            model = VoxCPM(
                voxcpm_model_path=args.model_path,
                zipenhancer_model_path=zipenhancer_path,
                enable_denoiser=not args.no_denoiser,
                optimize=not args.no_optimize,
                lora_config=lora_config,
                lora_weights_path=lora_weights_path,
            )
            print("Model loaded (local).", file=sys.stderr)
            return model
        except Exception as e:
            print(f"Failed to load model (local): {e}", file=sys.stderr)
            sys.exit(1)

    # Load from Hugging Face Hub
    try:
        model = VoxCPM.from_pretrained(
            hf_model_id=args.hf_model_id,
            load_denoiser=not args.no_denoiser,
            zipenhancer_model_id=zipenhancer_path,
            cache_dir=args.cache_dir,
            local_files_only=args.local_files_only,
            optimize=not args.no_optimize,
            lora_config=lora_config,
            lora_weights_path=lora_weights_path,
        )
        print("Model loaded (from_pretrained).", file=sys.stderr)
        return model
    except Exception as e:
        print(f"Failed to load model (from_pretrained): {e}", file=sys.stderr)
        sys.exit(1)
238
+
239
+
240
+ # -----------------------------
241
+ # Commands
242
+ # -----------------------------
243
+
244
+
245
def _run_single(args, parser, *, text: str, output: str, prompt_text: str | None):
    """Generate one utterance and write it to ``output``.

    Shared implementation behind `design` and `clone` (and their legacy
    forms): validates the audio inputs, loads the model, runs a single
    generation pass, and saves the waveform via soundfile.
    """
    output_path = validate_output_path(output)

    # Fail fast on missing input files before paying the model-load cost.
    if args.prompt_audio:
        require_file_exists(args.prompt_audio, parser, "prompt audio file")
    if args.reference_audio:
        require_file_exists(args.reference_audio, parser, "reference audio file")

    model = load_model(args)

    audio_array = model.generate(
        text=text,
        prompt_wav_path=args.prompt_audio,
        prompt_text=prompt_text,
        reference_wav_path=args.reference_audio,
        cfg_value=args.cfg_value,
        inference_timesteps=args.inference_timesteps,
        normalize=args.normalize,
        # Denoising only makes sense when there is prompt/reference audio.
        denoise=args.denoise
        and (args.prompt_audio is not None or args.reference_audio is not None),
    )

    sf.write(str(output_path), audio_array, model.tts_model.sample_rate)

    duration = len(audio_array) / model.tts_model.sample_rate
    print(f"Saved audio to: {output_path} ({duration:.2f}s)", file=sys.stderr)
271
+
272
+
273
def cmd_design(args, parser):
    """Handle the `design` subcommand: synthesize with an optional control tag."""
    validate_design_args(args, parser)
    text_with_control = build_final_text(args.text, args.control)
    return _run_single(
        args,
        parser,
        text=text_with_control,
        output=args.output,
        prompt_text=None,
    )
279
+
280
+
281
def cmd_clone(args, parser):
    """Handle the `clone` subcommand: synthesize in a reference voice."""
    resolved_prompt = validate_clone_args(args, parser)
    text_with_control = build_final_text(args.text, args.control)
    return _run_single(
        args,
        parser,
        text=text_with_control,
        output=args.output,
        prompt_text=resolved_prompt,
    )
287
+
288
+
289
def cmd_batch(args, parser):
    """Handle the `batch` subcommand: one output WAV per non-empty input line.

    Reads texts from --input, generates audio for each, and writes
    ``output_NNN.wav`` files into --output-dir. A failure on one line is
    logged and skipped rather than aborting the whole batch.
    """
    input_file = require_file_exists(args.input, parser, "input file")
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(input_file, "r", encoding="utf-8") as f:
        texts = [line.strip() for line in f if line.strip()]

    if not texts:
        sys.exit("Error: Input file is empty")

    prompt_text = validate_batch_args(args, parser)
    model = load_model(args)

    prompt_audio_path = None
    if args.prompt_audio:
        prompt_audio_path = str(
            require_file_exists(args.prompt_audio, parser, "prompt audio file")
        )

    reference_audio_path = None
    if args.reference_audio:
        reference_audio_path = str(
            require_file_exists(args.reference_audio, parser, "reference audio file")
        )

    success_count = 0

    for i, text in enumerate(texts, 1):
        try:
            final_text = build_final_text(text, args.control)
            audio_array = model.generate(
                text=final_text,
                prompt_wav_path=prompt_audio_path,
                prompt_text=prompt_text,
                reference_wav_path=reference_audio_path,
                cfg_value=args.cfg_value,
                inference_timesteps=args.inference_timesteps,
                normalize=args.normalize,
                # Denoising is only meaningful with prompt/reference audio.
                denoise=args.denoise
                and (prompt_audio_path is not None or reference_audio_path is not None),
            )

            output_file = output_dir / f"output_{i:03d}.wav"
            sf.write(str(output_file), audio_array, model.tts_model.sample_rate)

            duration = len(audio_array) / model.tts_model.sample_rate
            print(f"Saved: {output_file} ({duration:.2f}s)", file=sys.stderr)
            success_count += 1

        except Exception as e:
            # Keep going: one bad line should not kill the whole batch.
            print(f"Failed on line {i}: {e}", file=sys.stderr)

    print(f"\nBatch finished: {success_count}/{len(texts)} succeeded", file=sys.stderr)
343
+
344
+
345
+ # -----------------------------
346
+ # Parser
347
+ # -----------------------------
348
+
349
+
350
def _add_common_generation_args(parser):
    """Register generation options shared by the `design` and `clone` subcommands."""
    parser.add_argument("--text", "-t", help="Text to synthesize")
    parser.add_argument(
        "--control",
        type=str,
        help="Control instruction for VoxCPM2 voice design/cloning",
    )
    parser.add_argument(
        "--cfg-value",
        type=float,
        default=2.0,
        help="CFG guidance scale (float, recommended 1.0–3.0, default: 2.0)",
    )
    parser.add_argument(
        "--inference-timesteps",
        type=int,
        default=10,
        help="Inference steps (int, recommended 4–30, default: 10)",
    )
    parser.add_argument(
        "--normalize", action="store_true", help="Enable text normalization"
    )
372
+
373
+
374
def _add_prompt_reference_args(parser):
    """Register prompt-audio / reference-audio options (voice-cloning inputs)."""
    parser.add_argument(
        "--prompt-audio",
        "-pa",
        help="Prompt audio file path (continuation mode, requires --prompt-text or --prompt-file)",
    )
    parser.add_argument(
        "--prompt-text", "-pt", help="Text corresponding to the prompt audio"
    )
    parser.add_argument(
        "--prompt-file", type=str, help="Text file corresponding to the prompt audio"
    )
    parser.add_argument(
        "--reference-audio",
        "-ra",
        help="Reference audio for voice cloning (VoxCPM2 only)",
    )
    parser.add_argument(
        "--denoise",
        action="store_true",
        help="Enable prompt/reference speech enhancement",
    )
396
+
397
+
398
def _add_model_args(parser):
    """Register model-selection and model-loading options."""
    parser.add_argument("--model-path", type=str, help="Local VoxCPM model path")
    parser.add_argument(
        "--hf-model-id",
        type=str,
        default=DEFAULT_HF_MODEL_ID,
        help=f"Hugging Face repo id (default: {DEFAULT_HF_MODEL_ID})",
    )
    parser.add_argument(
        "--cache-dir", type=str, help="Cache directory for Hub downloads"
    )
    parser.add_argument(
        "--local-files-only", action="store_true", help="Disable network access"
    )
    parser.add_argument(
        "--no-denoiser", action="store_true", help="Disable denoiser model loading"
    )
    parser.add_argument(
        "--no-optimize",
        action="store_true",
        help="Disable model optimization during loading",
    )
    parser.add_argument(
        "--zipenhancer-path",
        type=str,
        help="ZipEnhancer model id or local path (or env ZIPENHANCER_MODEL_PATH)",
    )
425
+
426
+
427
def _add_lora_args(parser):
    """Register LoRA fine-tuning options (weights path plus hyperparameters)."""
    parser.add_argument("--lora-path", type=str, help="Path to LoRA weights")
    parser.add_argument(
        "--lora-r", type=int, default=32, help="LoRA rank (positive int, default: 32)"
    )
    parser.add_argument(
        "--lora-alpha",
        type=int,
        default=16,
        help="LoRA alpha (positive int, default: 16)",
    )
    parser.add_argument(
        "--lora-dropout",
        type=float,
        default=0.0,
        help="LoRA dropout rate (0.0–1.0, default: 0.0)",
    )
    parser.add_argument(
        "--lora-disable-lm", action="store_true", help="Disable LoRA on LM layers"
    )
    parser.add_argument(
        "--lora-disable-dit", action="store_true", help="Disable LoRA on DiT layers"
    )
    parser.add_argument(
        "--lora-enable-proj",
        action="store_true",
        help="Enable LoRA on projection layers",
    )
455
+
456
+
457
def _build_parser():
    """Construct the CLI argument parser.

    Defines the `design`, `clone`, and `batch` subcommands, then registers
    the same option groups at the root level so legacy (subcommand-less)
    invocations keep working.
    """
    parser = argparse.ArgumentParser(
        description="VoxCPM CLI - VoxCPM2-first voice design, cloning, and batch processing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  voxcpm design --text "Hello world" --output out.wav
  voxcpm design --text "Hello world" --control "warm female voice" --output out.wav
  voxcpm clone --text "Hello" --reference-audio ref.wav --output out.wav
  voxcpm batch --input texts.txt --output-dir ./outs --reference-audio ref.wav
""",
    )

    subparsers = parser.add_subparsers(dest="command")

    # `design`: generate a new voice from text (plus optional control tag).
    design_parser = subparsers.add_parser(
        "design", help="Generate speech with VoxCPM2-first voice design"
    )
    _add_common_generation_args(design_parser)
    _add_prompt_reference_args(design_parser)
    _add_model_args(design_parser)
    _add_lora_args(design_parser)
    design_parser.add_argument(
        "--output", "-o", required=True, help="Output audio file path"
    )

    # `clone`: reproduce a voice from reference/prompt audio.
    clone_parser = subparsers.add_parser(
        "clone", help="Clone a voice with reference/prompt audio"
    )
    _add_common_generation_args(clone_parser)
    _add_prompt_reference_args(clone_parser)
    _add_model_args(clone_parser)
    _add_lora_args(clone_parser)
    clone_parser.add_argument(
        "--output", "-o", required=True, help="Output audio file path"
    )

    # `batch`: one generated file per input line.
    batch_parser = subparsers.add_parser(
        "batch", help="Batch-generate one line per output file"
    )
    batch_parser.add_argument(
        "--input", "-i", required=True, help="Input text file (one text per line)"
    )
    batch_parser.add_argument(
        "--output-dir", "-od", required=True, help="Output directory"
    )
    batch_parser.add_argument(
        "--control",
        type=str,
        help="Control instruction for VoxCPM2 voice design/cloning",
    )
    _add_prompt_reference_args(batch_parser)
    batch_parser.add_argument(
        "--cfg-value",
        type=float,
        default=2.0,
        help="CFG guidance scale (float, recommended 1.0–3.0, default: 2.0)",
    )
    batch_parser.add_argument(
        "--inference-timesteps",
        type=int,
        default=10,
        help="Inference steps (int, recommended 4–30, default: 10)",
    )
    batch_parser.add_argument(
        "--normalize", action="store_true", help="Enable text normalization"
    )
    _add_model_args(batch_parser)
    _add_lora_args(batch_parser)

    # Legacy root arguments
    parser.add_argument("--input", "-i", help="Input text file (batch mode only)")
    parser.add_argument(
        "--output-dir", "-od", help="Output directory (batch mode only)"
    )
    _add_common_generation_args(parser)
    parser.add_argument(
        "--output", "-o", help="Output audio file path (single or clone mode)"
    )
    _add_prompt_reference_args(parser)
    _add_model_args(parser)
    _add_lora_args(parser)

    return parser
541
+
542
+
543
def _dispatch_legacy(args, parser):
    """Route a legacy (no-subcommand) invocation to the matching handler."""
    warn_legacy_mode()

    if args.input and args.text:
        parser.error(
            "Use either batch mode (--input) or single mode (--text), not both."
        )

    # Legacy batch mode is selected by --input alone.
    if args.input:
        if not args.output_dir:
            parser.error("Batch mode requires --output-dir")
        return cmd_batch(args, parser)

    if not args.text or not args.output:
        parser.error("Single-sample legacy mode requires --text and --output")

    # Any voice-source option turns the request into a clone; else a design.
    uses_voice_source = bool(
        args.prompt_audio
        or args.prompt_text
        or args.prompt_file
        or args.reference_audio
    )
    if uses_voice_source:
        return cmd_clone(args, parser)
    return cmd_design(args, parser)
568
+
569
+
570
+ # -----------------------------
571
+ # Entrypoint
572
+ # -----------------------------
573
+
574
+
575
def main():
    """CLI entrypoint: parse arguments, validate, and dispatch to a command."""
    parser = _build_parser()
    args = parser.parse_args()

    validate_ranges(args, parser)

    command = args.command
    if command == "design":
        if not args.text:
            parser.error("`design` requires --text")
        return cmd_design(args, parser)

    if command == "clone":
        if not args.text or not args.output:
            parser.error("`clone` requires --text and --output")
        return cmd_clone(args, parser)

    if command == "batch":
        return cmd_batch(args, parser)

    # No subcommand given: fall back to the deprecated root-argument mode.
    return _dispatch_legacy(args, parser)
595
+
596
+
597
+ if __name__ == "__main__":
598
+ main()
voxcpm/core.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import re
4
+ import json
5
+ import tempfile
6
+ import numpy as np
7
+ from typing import Generator, Optional
8
+ from huggingface_hub import snapshot_download
9
+ from .model.voxcpm import VoxCPMModel, LoRAConfig
10
+ from .model.voxcpm2 import VoxCPM2Model
11
+
12
+
13
class VoxCPM:
    """High-level TTS pipeline wrapping a VoxCPM/VoxCPM2 model plus an optional denoiser."""

    def __init__(
        self,
        voxcpm_model_path: str,
        zipenhancer_model_path: str | None = "iic/speech_zipenhancer_ans_multiloss_16k_base",
        enable_denoiser: bool = True,
        optimize: bool = True,
        lora_config: Optional[LoRAConfig] = None,
        lora_weights_path: Optional[str] = None,
    ):
        """Initialize VoxCPM TTS pipeline.

        Args:
            voxcpm_model_path: Local filesystem path to the VoxCPM model assets
                (weights, configs, etc.). Typically the directory returned by
                a prior download step.
            zipenhancer_model_path: ModelScope acoustic noise suppression model
                id or local path. If None, denoiser will not be initialized.
            enable_denoiser: Whether to initialize the denoiser pipeline.
            optimize: Whether to optimize the model with torch.compile. True by
                default, but can be disabled for debugging.
            lora_config: LoRA configuration for fine-tuning. If lora_weights_path is
                provided without lora_config, a default config will be created.
            lora_weights_path: Path to pre-trained LoRA weights (.pth file or directory
                containing lora_weights.ckpt). If provided, LoRA weights will be loaded.

        Raises:
            ValueError: If config.json declares an unsupported architecture.
        """
        print(
            f"voxcpm_model_path: {voxcpm_model_path}, zipenhancer_model_path: {zipenhancer_model_path}, enable_denoiser: {enable_denoiser}",
            file=sys.stderr,
        )

        # If lora_weights_path is provided but no lora_config, create a default one
        if lora_weights_path is not None and lora_config is None:
            lora_config = LoRAConfig(
                enable_lm=True,
                enable_dit=True,
                enable_proj=False,
            )
            print(f"Auto-created default LoRAConfig for loading weights from: {lora_weights_path}", file=sys.stderr)

        # Determine model type from config.json architecture field
        config_path = os.path.join(voxcpm_model_path, "config.json")
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
        arch = config.get("architecture", "voxcpm").lower()

        if arch == "voxcpm2":
            self.tts_model = VoxCPM2Model.from_local(voxcpm_model_path, optimize=optimize, lora_config=lora_config)
            print("Loaded VoxCPM2Model", file=sys.stderr)
        elif arch == "voxcpm":
            self.tts_model = VoxCPMModel.from_local(voxcpm_model_path, optimize=optimize, lora_config=lora_config)
            print("Loaded VoxCPMModel", file=sys.stderr)
        else:
            raise ValueError(f"Unsupported architecture: {arch}")

        # Load LoRA weights if path is provided
        if lora_weights_path is not None:
            print(f"Loading LoRA weights from: {lora_weights_path}", file=sys.stderr)
            loaded_keys, skipped_keys = self.tts_model.load_lora_weights(lora_weights_path)
            print(f"Loaded {len(loaded_keys)} LoRA parameters, skipped {len(skipped_keys)}", file=sys.stderr)

        # Text normalizer is created lazily on first use (see _generate).
        self.text_normalizer = None
        # Denoiser is optional; a single assignment replaces the previous
        # redundant double initialization.
        self.denoiser = None
        if enable_denoiser and zipenhancer_model_path is not None:
            from .zipenhancer import ZipEnhancer

            self.denoiser = ZipEnhancer(zipenhancer_model_path)

        # torch.compile needs a real forward pass to trigger compilation; warm
        # up here so the first user request does not absorb compile latency.
        if optimize:
            print("Warm up VoxCPMModel...", file=sys.stderr)
            self.tts_model.generate(
                target_text="Hello, this is the first test sentence.",
                max_len=10,
            )

    @classmethod
    def from_pretrained(
        cls,
        hf_model_id: str = "openbmb/VoxCPM2",
        load_denoiser: bool = True,
        zipenhancer_model_id: str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
        cache_dir: Optional[str] = None,
        local_files_only: bool = False,
        optimize: bool = True,
        lora_config: Optional[LoRAConfig] = None,
        lora_weights_path: Optional[str] = None,
        **kwargs,
    ):
        """Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot.

        Args:
            hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo") or local path.
            load_denoiser: Whether to initialize the denoiser pipeline.
            zipenhancer_model_id: Denoiser model id or path for ModelScope
                acoustic noise suppression.
            cache_dir: Custom cache directory for the snapshot.
            local_files_only: If True, only use local files and do not attempt
                to download.
            optimize: Whether to optimize the model with torch.compile. True by
                default, but can be disabled for debugging.
            lora_config: LoRA configuration for fine-tuning. If lora_weights_path is
                provided without lora_config, a default config will be created with
                enable_lm=True and enable_dit=True.
            lora_weights_path: Path to pre-trained LoRA weights (.pth file or directory
                containing lora_weights.ckpt). If provided, LoRA weights will be loaded
                after model initialization.
        Kwargs:
            Additional keyword arguments passed to the ``VoxCPM`` constructor.

        Returns:
            VoxCPM: Initialized instance whose ``voxcpm_model_path`` points to
            the downloaded snapshot directory.

        Raises:
            ValueError: If ``hf_model_id`` is empty.
        """
        repo_id = hf_model_id
        if not repo_id:
            raise ValueError("You must provide hf_model_id")

        # Load from local path if provided
        if os.path.isdir(repo_id):
            local_path = repo_id
        else:
            # Otherwise resolve a Hub snapshot (may hit the network).
            local_path = snapshot_download(
                repo_id=repo_id,
                cache_dir=cache_dir,
                local_files_only=local_files_only,
            )

        return cls(
            voxcpm_model_path=local_path,
            zipenhancer_model_path=zipenhancer_model_id if load_denoiser else None,
            enable_denoiser=load_denoiser,
            optimize=optimize,
            lora_config=lora_config,
            lora_weights_path=lora_weights_path,
            **kwargs,
        )

    def generate(self, *args, **kwargs) -> np.ndarray:
        """Synthesize the full utterance and return one waveform array."""
        return next(self._generate(*args, streaming=False, **kwargs))

    def generate_streaming(self, *args, **kwargs) -> Generator[np.ndarray, None, None]:
        """Synthesize incrementally, yielding waveform chunks as they are produced."""
        return self._generate(*args, streaming=True, **kwargs)

    def _generate(
        self,
        text: str,
        prompt_wav_path: str = None,
        prompt_text: str = None,
        reference_wav_path: str = None,
        cfg_value: float = 2.0,
        inference_timesteps: int = 10,
        min_len: int = 2,
        max_len: int = 4096,
        normalize: bool = False,
        denoise: bool = False,
        retry_badcase: bool = True,
        retry_badcase_max_times: int = 3,
        retry_badcase_ratio_threshold: float = 6.0,
        streaming: bool = False,
    ) -> Generator[np.ndarray, None, None]:
        """Synthesize speech for the given text and return a single waveform.

        Args:
            text: Input text to synthesize.
            prompt_wav_path: Path to prompt audio for continuation mode.
                Must be paired with ``prompt_text``.
            prompt_text: Text content corresponding to the prompt audio.
            reference_wav_path: Path to reference audio for voice cloning
                (structurally isolated via ref_audio tokens). Can be used
                alone or combined with ``prompt_wav_path`` + ``prompt_text``.
            cfg_value: Guidance scale for the generation model.
            inference_timesteps: Number of inference steps.
            min_len: Minimum audio length.
            max_len: Maximum token length during generation.
            normalize: Whether to run text normalization before generation.
            denoise: Whether to denoise the prompt/reference audio if a
                denoiser is available.
            retry_badcase: Whether to retry badcase.
            retry_badcase_max_times: Maximum number of times to retry badcase.
            retry_badcase_ratio_threshold: Threshold for audio-to-text ratio.
            streaming: Whether to return a generator of audio chunks.
        Returns:
            Generator of numpy.ndarray: 1D waveform array (float32) on CPU.
            Yields audio chunks for each generation step if ``streaming=True``,
            otherwise yields a single array containing the final audio.
        """
        # Check the type BEFORE calling .strip(); the previous order raised
        # AttributeError (not the intended ValueError) for non-string input.
        if not isinstance(text, str) or not text.strip():
            raise ValueError("target text must be a non-empty string")

        if prompt_wav_path is not None:
            if not os.path.exists(prompt_wav_path):
                raise FileNotFoundError(f"prompt_wav_path does not exist: {prompt_wav_path}")

        if reference_wav_path is not None:
            if not os.path.exists(reference_wav_path):
                raise FileNotFoundError(f"reference_wav_path does not exist: {reference_wav_path}")

        if (prompt_wav_path is None) != (prompt_text is None):
            raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None")

        is_v2 = isinstance(self.tts_model, VoxCPM2Model)
        if reference_wav_path is not None and not is_v2:
            raise ValueError("reference_wav_path is only supported with VoxCPM2 models")

        # Collapse all whitespace (incl. newlines) into single spaces.
        text = text.replace("\n", " ")
        text = re.sub(r"\s+", " ", text)
        temp_files = []

        try:
            actual_prompt_path = prompt_wav_path
            actual_ref_path = reference_wav_path

            # Optionally run the enhancer over prompt/reference audio, writing
            # the cleaned copies to temp files that are removed in `finally`.
            if denoise and self.denoiser is not None:
                if prompt_wav_path is not None:
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                        temp_files.append(tmp.name)
                    self.denoiser.enhance(prompt_wav_path, output_path=temp_files[-1])
                    actual_prompt_path = temp_files[-1]
                if reference_wav_path is not None:
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                        temp_files.append(tmp.name)
                    self.denoiser.enhance(reference_wav_path, output_path=temp_files[-1])
                    actual_ref_path = temp_files[-1]

            if actual_prompt_path is not None or actual_ref_path is not None:
                if is_v2:
                    fixed_prompt_cache = self.tts_model.build_prompt_cache(
                        prompt_text=prompt_text,
                        prompt_wav_path=actual_prompt_path,
                        reference_wav_path=actual_ref_path,
                    )
                else:
                    # v1 models have no reference-audio channel.
                    fixed_prompt_cache = self.tts_model.build_prompt_cache(
                        prompt_text=prompt_text,
                        prompt_wav_path=actual_prompt_path,
                    )
            else:
                fixed_prompt_cache = None

            if normalize:
                # Lazy import/instantiation: the normalizer is heavy and only
                # needed when normalization is requested.
                if self.text_normalizer is None:
                    from .utils.text_normalize import TextNormalizer

                    self.text_normalizer = TextNormalizer()
                text = self.text_normalizer.normalize(text)

            generate_result = self.tts_model._generate_with_prompt_cache(
                target_text=text,
                prompt_cache=fixed_prompt_cache,
                min_len=min_len,
                max_len=max_len,
                inference_timesteps=inference_timesteps,
                cfg_value=cfg_value,
                retry_badcase=retry_badcase,
                retry_badcase_max_times=retry_badcase_max_times,
                retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
                streaming=streaming,
            )

            for wav, _, _ in generate_result:
                yield wav.squeeze(0).cpu().numpy()

        finally:
            # Best-effort cleanup of any denoised temp copies.
            for tmp_path in temp_files:
                if tmp_path and os.path.exists(tmp_path):
                    try:
                        os.unlink(tmp_path)
                    except OSError:
                        pass

    # ------------------------------------------------------------------ #
    # LoRA Interface (delegated to VoxCPMModel)
    # ------------------------------------------------------------------ #
    def load_lora(self, lora_weights_path: str) -> tuple:
        """Load LoRA weights from a checkpoint file.

        Args:
            lora_weights_path: Path to LoRA weights (.pth file or directory
                containing lora_weights.ckpt).

        Returns:
            tuple: (loaded_keys, skipped_keys) - lists of loaded and skipped parameter names.

        Raises:
            RuntimeError: If model was not initialized with LoRA config.
        """
        if self.tts_model.lora_config is None:
            raise RuntimeError(
                "Cannot load LoRA weights: model was not initialized with LoRA config. "
                "Please reinitialize with lora_config or lora_weights_path parameter."
            )
        return self.tts_model.load_lora_weights(lora_weights_path)

    def unload_lora(self):
        """Unload LoRA by resetting all LoRA weights to initial state (effectively disabling LoRA)."""
        self.tts_model.reset_lora_weights()

    def set_lora_enabled(self, enabled: bool):
        """Enable or disable LoRA layers without unloading weights.

        Args:
            enabled: If True, LoRA layers are active; if False, only base model is used.
        """
        self.tts_model.set_lora_enabled(enabled)

    def get_lora_state_dict(self) -> dict:
        """Get current LoRA parameters state dict.

        Returns:
            dict: State dict containing all LoRA parameters (lora_A, lora_B).
        """
        return self.tts_model.get_lora_state_dict()

    @property
    def lora_enabled(self) -> bool:
        """Check if LoRA is currently configured."""
        return self.tts_model.lora_config is not None
voxcpm/model/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .voxcpm import VoxCPMModel
2
+ from .voxcpm2 import VoxCPM2Model
3
+
4
+ __all__ = ["VoxCPMModel", "VoxCPM2Model"]
voxcpm/model/utils.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import torch
3
+ from transformers import PreTrainedTokenizer
4
+
5
+
6
def mask_multichar_chinese_tokens(tokenizer: PreTrainedTokenizer):
    """Create a tokenizer wrapper that converts multi-character Chinese tokens to single characters.

    This function creates a wrapper around the provided tokenizer that automatically
    splits multi-character Chinese tokens into individual characters. This is useful
    for ensuring consistent tokenization of Chinese text.

    Args:
        tokenizer: The base tokenizer to wrap

    Returns:
        A CharTokenizerWrapper instance that handles multi-character Chinese tokens

    Example:
        >>> from transformers import LlamaTokenizerFast
        >>> tokenizer = LlamaTokenizerFast.from_pretrained("path/to/tokenizer")
        >>> wrapped_tokenizer = mask_multichar_chinese_tokens(tokenizer)
        >>> tokens = wrapped_tokenizer("你好世界")
    """
    # Pre-compute multi-character tokens (length >= 2, pure Chinese characters)
    # \u4e00-\u9fff is the CJK Unified Ideographs block.
    multichar_tokens = {
        token for token in tokenizer.vocab.keys() if len(token) >= 2 and all("\u4e00" <= c <= "\u9fff" for c in token)
    }

    class CharTokenizerWrapper:
        """Wrapper class for tokenizers that handles multi-character Chinese tokens.

        This wrapper automatically splits multi-character Chinese tokens into
        individual characters while preserving the original tokenizer's interface.
        """

        def __init__(self, base_tokenizer: PreTrainedTokenizer) -> None:
            """Initialize the wrapper with a base tokenizer.

            Args:
                base_tokenizer: The tokenizer to wrap
            """
            self.tokenizer = base_tokenizer
            # Captured from the enclosing function; computed once per wrap.
            self.multichar_tokens = multichar_tokens

        def tokenize(self, text: str, **kwargs) -> List[str]:
            """Tokenize text and split multi-character Chinese tokens into single characters.

            Args:
                text: Input text to tokenize
                **kwargs: Additional arguments passed to the base tokenizer

            Returns:
                List of processed tokens with multi-character Chinese tokens split

            Example:
                >>> wrapper = CharTokenizerWrapper(tokenizer)
                >>> tokens = wrapper.tokenize("你好世界")
                >>> # Returns ["你", "好", "世", "界"] instead of ["你好", "世界"]
            """
            if not isinstance(text, str):
                raise TypeError(f"Expected string input, got {type(text)}")

            tokens = self.tokenizer.tokenize(text, **kwargs)
            processed = []

            for token in tokens:
                # Remove possible subword prefix (SentencePiece word-boundary marker)
                clean_token = token.replace("▁", "")

                if clean_token in self.multichar_tokens:
                    # Split multi-character token into single characters
                    chars = list(clean_token)
                    processed.extend(chars)
                else:
                    processed.append(token)

            return processed

        def __call__(self, text: str, **kwargs) -> List[int]:
            """Call the tokenizer and return token IDs.

            This method provides the same interface as the original tokenizer
            but with multi-character Chinese token handling.

            Args:
                text: Input text to tokenize
                **kwargs: Additional arguments passed to the base tokenizer

            Returns:
                List of token IDs

            Raises:
                TypeError: If input is not a string
                ValueError: If tokenization fails
            """
            try:
                tokens = self.tokenize(text, **kwargs)
                result = self.tokenizer.convert_tokens_to_ids(tokens)
                return result
            except Exception as e:
                raise ValueError(f"Tokenization failed: {str(e)}") from e

    return CharTokenizerWrapper(tokenizer)
105
+
106
+
107
def get_dtype(dtype: str):
    """Map a dtype name (e.g. "bf16", "float32") to the torch dtype object.

    Accepts both long ("bfloat16") and short ("bf16") aliases.

    Raises:
        ValueError: If the name is not a recognized alias.
    """
    aliases = {
        "bfloat16": torch.bfloat16,
        "bf16": torch.bfloat16,
        "float16": torch.float16,
        "fp16": torch.float16,
        "float32": torch.float32,
        "fp32": torch.float32,
    }
    if dtype not in aliases:
        raise ValueError(f"Unsupported dtype: {dtype}")
    return aliases[dtype]
voxcpm/model/voxcpm.py ADDED
@@ -0,0 +1,985 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoxCPM: A Tokenizer-free speech generation model
3
+
4
+ This module contains the main VoxCPM model implementation, including configuration classes
5
+ and the core VoxCPMModel for text-to-speech generation.
6
+
7
+ Copyright 2025 OpenBMB
8
+ Licensed under the Apache License, Version 2.0 (the "License");
9
+ you may not use this file except in compliance with the License.
10
+ You may obtain a copy of the License at
11
+
12
+ http://www.apache.org/licenses/LICENSE-2.0
13
+
14
+ Unless required by applicable law or agreed to in writing, software
15
+ distributed under the License is distributed on an "AS IS" BASIS,
16
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ See the License for the specific language governing permissions and
18
+ limitations under the License.
19
+ """
20
+
21
+ import os
22
+ import sys
23
+ from typing import Tuple, Union, Generator, List, Optional
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ import torchaudio
28
+ import warnings
29
+ from einops import rearrange
30
+ from pydantic import BaseModel
31
+
32
+ try:
33
+ from safetensors.torch import load_file
34
+
35
+ SAFETENSORS_AVAILABLE = True
36
+ except ImportError:
37
+ SAFETENSORS_AVAILABLE = False
38
+ from tqdm import tqdm
39
+ from transformers import LlamaTokenizerFast
40
+
41
+ from ..modules.audiovae import AudioVAE, AudioVAEConfig
42
+ from ..modules.layers import ScalarQuantizationLayer
43
+ from ..modules.layers.lora import apply_lora_to_named_linear_modules
44
+ from ..modules.locdit import CfmConfig, UnifiedCFM, VoxCPMLocDiT
45
+ from ..modules.locenc import VoxCPMLocEnc
46
+ from ..modules.minicpm4 import MiniCPM4Config, MiniCPMModel
47
+ from .utils import get_dtype, mask_multichar_chinese_tokens
48
+
49
+
50
class VoxCPMEncoderConfig(BaseModel):
    """Configuration for the local (audio-feature) encoder transformer."""

    hidden_dim: int = 1024
    ffn_dim: int = 4096
    num_heads: int = 16
    num_layers: int = 4
    # None means "use the downstream default"; the previous plain `int`
    # annotation would reject an explicit `kv_channels=None` under pydantic v2.
    kv_channels: Optional[int] = None
56
+
57
+
58
class VoxCPMDitConfig(BaseModel):
    """Configuration for the local DiT (diffusion transformer) decoder."""

    hidden_dim: int = 1024
    ffn_dim: int = 4096
    num_heads: int = 16
    num_layers: int = 4
    # None means "use the downstream default"; the previous plain `int`
    # annotation would reject an explicit `kv_channels=None` under pydantic v2.
    kv_channels: Optional[int] = None

    cfm_config: CfmConfig
66
+
67
+
68
class VoxCPMConfig(BaseModel):
    """Top-level VoxCPM model configuration (LM + encoder + DiT + audio VAE)."""

    lm_config: MiniCPM4Config  # backbone text-semantic LM configuration
    patch_size: int = 2  # audio-feature patches consumed/produced per LM step
    feat_dim: int = 64  # audio VAE latent feature dimension
    residual_lm_num_layers: int = 6  # depth of the residual acoustic LM
    scalar_quantization_latent_dim: int = 256  # FSQ latent dimension
    scalar_quantization_scale: int = 9  # FSQ quantization scale

    encoder_config: VoxCPMEncoderConfig
    dit_config: VoxCPMDitConfig
    audio_vae_config: Optional[AudioVAEConfig] = None

    max_length: int = 4096  # KV-cache / sequence-length budget
    device: str = "cuda"  # model falls back to mps/cpu at init when CUDA is absent
    dtype: str = "bfloat16"  # see get_dtype for accepted names
    dit_mean_mode: bool = False  # forwarded to UnifiedCFM(mean_mode=...)
84
+
85
+
86
class LoRAConfig(BaseModel):
    """LoRA injection settings for the LMs, the DiT, and the projection layers."""

    enable_lm: bool = False  # Apply LoRA to base_lm + residual_lm
    enable_dit: bool = False  # Apply LoRA to VoxCPMLocDiT
    enable_proj: bool = False  # Apply LoRA to projection Linear layers

    r: int = 8  # LoRA rank
    alpha: int = 16  # LoRA scaling factor
    dropout: float = 0.0  # dropout applied inside the LoRA branch

    # Target linear layer names for LM & DiT (matched by attribute name)
    target_modules_lm: list[str] = ["q_proj", "v_proj", "k_proj", "o_proj"]
    target_modules_dit: list[str] = ["q_proj", "v_proj", "k_proj", "o_proj"]
    # Projection layer attribute names to find on VoxCPMModel
    target_proj_modules: list[str] = ["enc_to_lm_proj", "lm_to_dit_proj", "res_to_dit_proj"]
100
+
101
+
102
+ VoxCPMConfig.model_rebuild()
103
+
104
+
105
+ class VoxCPMModel(nn.Module):
106
    def __init__(
        self,
        config: VoxCPMConfig,
        tokenizer: LlamaTokenizerFast,
        audio_vae: AudioVAE,
        lora_config: Optional[LoRAConfig] = None,
    ):
        """Build the VoxCPM model from its sub-module configurations.

        Args:
            config: Top-level model configuration.
            tokenizer: Base tokenizer; wrapped so multi-character Chinese
                tokens are split into single characters.
            audio_vae: Audio VAE used to encode/decode waveform latents.
            lora_config: Optional LoRA settings; when given, adapters are
                injected after all sub-modules are constructed.
        """
        super().__init__()
        self.config = config
        self.lora_config = lora_config
        self.feat_dim = config.feat_dim
        self.patch_size = config.patch_size
        self.device = config.device
        # Fall back to MPS, then CPU, when CUDA is unavailable.
        if not torch.cuda.is_available():
            if torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"
        print(f"Running on device: {self.device}, dtype: {self.config.dtype}", file=sys.stderr)

        # Text-Semantic LM
        self.base_lm = MiniCPMModel(config.lm_config)
        self.base_lm.setup_cache(1, config.max_length, self.device, get_dtype(self.config.dtype))

        self.text_tokenizer = mask_multichar_chinese_tokens(tokenizer)
        # Special token ids marking audio-segment boundaries in the token stream.
        self.audio_start_token = 101
        self.audio_end_token = 102

        # Residual Acoustic LM: same family as the base LM, but shallower and
        # with no embedding table of its own (vocab_size = 0).
        residual_lm_config = config.lm_config.model_copy(deep=True)
        residual_lm_config.num_hidden_layers = config.residual_lm_num_layers
        residual_lm_config.vocab_size = 0
        self.residual_lm = MiniCPMModel(residual_lm_config)
        self.residual_lm.setup_cache(1, config.max_length, self.device, get_dtype(self.config.dtype))

        # Local Encoder: base LM config overridden by the encoder-specific dims.
        encoder_config = config.lm_config.model_copy(deep=True)
        encoder_config.hidden_size = config.encoder_config.hidden_dim
        encoder_config.intermediate_size = config.encoder_config.ffn_dim
        encoder_config.num_attention_heads = config.encoder_config.num_heads
        encoder_config.num_hidden_layers = config.encoder_config.num_layers
        encoder_config.kv_channels = config.encoder_config.kv_channels
        encoder_config.vocab_size = 0
        self.feat_encoder = VoxCPMLocEnc(encoder_config, input_dim=config.feat_dim)

        # Local DiT: base LM config overridden by the DiT-specific dims.
        decoder_config = config.lm_config.model_copy(deep=True)
        decoder_config.hidden_size = config.dit_config.hidden_dim
        decoder_config.intermediate_size = config.dit_config.ffn_dim
        decoder_config.num_attention_heads = config.dit_config.num_heads
        decoder_config.num_hidden_layers = config.dit_config.num_layers
        decoder_config.kv_channels = config.dit_config.kv_channels
        decoder_config.vocab_size = 0
        self.feat_decoder = UnifiedCFM(
            in_channels=config.feat_dim,
            cfm_params=config.dit_config.cfm_config,
            estimator=VoxCPMLocDiT(decoder_config, in_channels=config.feat_dim),
            mean_mode=config.dit_mean_mode,
        )

        # Projection layers between encoder / LM / DiT hidden spaces.
        self.fsq_layer = ScalarQuantizationLayer(
            config.lm_config.hidden_size,
            config.lm_config.hidden_size,
            config.scalar_quantization_latent_dim,
            config.scalar_quantization_scale,
        )
        self.enc_to_lm_proj = nn.Linear(config.encoder_config.hidden_dim, config.lm_config.hidden_size)
        self.lm_to_dit_proj = nn.Linear(config.lm_config.hidden_size, config.dit_config.hidden_dim)
        self.res_to_dit_proj = nn.Linear(config.lm_config.hidden_size, config.dit_config.hidden_dim)

        # Stop Predictor: 2-way head deciding when generation should stop.
        self.stop_proj = nn.Linear(config.lm_config.hidden_size, config.lm_config.hidden_size)
        self.stop_actn = nn.SiLU()
        self.stop_head = nn.Linear(config.lm_config.hidden_size, 2, bias=False)
        self.stop_loss = nn.CrossEntropyLoss(reduction="none")

        # Audio VAE
        self.audio_vae = audio_vae
        self.chunk_size = audio_vae.chunk_size
        self.sample_rate = audio_vae.sample_rate

        if self.lora_config is not None:
            self._apply_lora()
190
+
191
    def _apply_lora(self):
        """Inject LoRA adapters into the LM / DiT / projection layers."""
        cfg = self.lora_config
        lora_kwargs = dict(r=cfg.r, alpha=cfg.alpha, dropout=cfg.dropout)

        # LM: base_lm + residual_lm
        if cfg.enable_lm:
            for lm in [self.base_lm, self.residual_lm]:
                apply_lora_to_named_linear_modules(lm, target_submodule_names=cfg.target_modules_lm, **lora_kwargs)

        # DiT: feat_decoder.estimator
        if cfg.enable_dit:
            apply_lora_to_named_linear_modules(
                self.feat_decoder.estimator, target_submodule_names=cfg.target_modules_dit, **lora_kwargs
            )

        # Projection layers: wrap matching nn.Linear attributes in LoRALinear.
        if cfg.enable_proj:
            from ..modules.layers.lora import LoRALinear

            for attr_name in cfg.target_proj_modules:
                module = getattr(self, attr_name, None)
                if isinstance(module, nn.Linear):
                    setattr(self, attr_name, LoRALinear(base=module, **lora_kwargs))
215
+
216
    def optimize(self, disable: bool = False):
        """Optionally compile hot-path sub-modules with torch.compile.

        Compilation requires a CUDA device and the triton package; on any
        failure the model is left unmodified and a warning is printed.

        Args:
            disable: When True, skip optimization entirely.

        Returns:
            self, so the call can be chained.
        """
        if disable:
            return self
        try:
            if self.device != "cuda":
                raise ValueError("VoxCPMModel can only be optimized on CUDA device")
            try:
                import triton  # noqa: F401
            except ImportError:
                raise ValueError("triton is not installed")
            self.base_lm.forward_step = torch.compile(self.base_lm.forward_step, mode="reduce-overhead", fullgraph=True)
            self.residual_lm.forward_step = torch.compile(
                self.residual_lm.forward_step, mode="reduce-overhead", fullgraph=True
            )
            self.feat_encoder = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True)
            self.feat_decoder.estimator = torch.compile(
                self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True
            )
        except Exception as e:
            # Best-effort: compilation is an optimization, never a hard failure.
            print(f"Warning: torch.compile disabled - {e}", file=sys.stderr)
        return self
237
+
238
    def forward(
        self,
        text_tokens: torch.Tensor,
        text_mask: torch.Tensor,
        audio_feats: torch.Tensor,
        audio_mask: torch.Tensor,
        loss_mask: torch.Tensor,
        position_ids: torch.Tensor,
        labels: torch.Tensor,
        *,
        progress: float = 0.0,
        sample_generate: bool = False,
    ):
        """Training forward pass.

        Computes the diffusion (flow-matching) loss over audio feature patches
        and the stop-prediction cross-entropy loss.

        Args:
            text_tokens: Text token ids; indexed as (batch, time).
            text_mask: 1 where a position holds a text token, else 0.
            audio_feats: Audio feature patches shaped (B, T, P, D).
            audio_mask: 1 where a position holds an audio patch, else 0.
            loss_mask: Positions contributing to both losses.
            position_ids: Unused (deleted immediately; reserved).
            labels: Per-position class labels for the stop head.
            progress: Training progress forwarded to the CFM loss.
            sample_generate: When True, also sample predicted features.

        Returns:
            Dict with "loss/diff", "loss/stop", "feat_gt", and "feat_pred"
            ("feat_pred" is None unless sample_generate is True).
        """
        del position_ids  # not used yet

        text_tokens = text_tokens.to(self.device, dtype=torch.long)
        text_mask = text_mask.to(self.device, dtype=self._dtype())
        audio_feats = audio_feats.to(self.device, dtype=self._dtype())
        audio_mask = audio_mask.to(self.device, dtype=self._dtype())
        loss_mask = loss_mask.to(self.device, dtype=self._dtype())
        labels = labels.to(self.device, dtype=torch.long)

        B, T, P, D = audio_feats.shape
        feat_embed = self.feat_encoder(audio_feats)
        feat_embed = self.enc_to_lm_proj(feat_embed)

        # Embedding scale only applies under the muP parametrization.
        scale_emb = getattr(self.config.lm_config, "scale_emb", 1.0)
        if not getattr(self.config.lm_config, "use_mup", False):
            scale_emb = 1.0
        text_embed = self.base_lm.embed_tokens(text_tokens) * scale_emb
        combined_embed = text_mask.unsqueeze(-1) * text_embed + audio_mask.unsqueeze(-1) * feat_embed

        enc_outputs, _ = self.base_lm(inputs_embeds=combined_embed, is_causal=True)
        enc_outputs = enc_outputs.to(self._dtype())
        # Quantize only audio positions; text positions pass through unchanged.
        enc_outputs = self.fsq_layer(enc_outputs) * audio_mask.unsqueeze(-1) + enc_outputs * text_mask.unsqueeze(-1)
        # Shift right by one step so the state at t-1 conditions the prediction for t.
        lm_hidden = torch.cat((torch.zeros_like(enc_outputs[:, 0:1, :]), enc_outputs[:, :-1, :]), dim=1)

        residual_inputs = enc_outputs + audio_mask.unsqueeze(-1) * feat_embed
        residual_outputs, _ = self.residual_lm(inputs_embeds=residual_inputs, is_causal=True)
        residual_outputs = residual_outputs.to(self._dtype())
        residual_hidden = torch.cat(
            (torch.zeros_like(residual_outputs[:, 0:1, :]), residual_outputs[:, :-1, :]),
            dim=1,
        )

        dit_hidden = self.lm_to_dit_proj(lm_hidden) + self.res_to_dit_proj(residual_hidden)
        dit_hidden = rearrange(dit_hidden, "b t c -> (b t) c")

        # Keep diffusion inputs in the same dtype as the model (e.g., bfloat16)
        target_dtype = self._dtype()

        feat_gt = rearrange(audio_feats.to(target_dtype), "b t p d -> (b t) p d")
        # Condition on the previous patch (zeros at t=0).
        feat_cond = torch.cat(
            (torch.zeros_like(audio_feats[:, 0:1, ...]), audio_feats[:, :-1, ...]),
            dim=1,
        )
        feat_cond = rearrange(feat_cond.to(target_dtype), "b t p d -> (b t) p d")

        loss_seq_mask = loss_mask.unsqueeze(-1).repeat(1, 1, self.patch_size)
        loss_seq_mask = rearrange(loss_seq_mask, "b t p -> (b t) p 1").to(target_dtype)

        diff_loss = self.feat_decoder.compute_loss(
            feat_gt.transpose(1, 2).contiguous(),
            dit_hidden,
            cond=feat_cond.transpose(1, 2).contiguous(),
            tgt_mask=loss_seq_mask.transpose(1, 2).contiguous(),
            progress=progress,
        )

        stop_logits = self.stop_head(self.stop_actn(self.stop_proj(lm_hidden)))
        stop_losses = self.stop_loss(stop_logits.transpose(1, 2), labels)
        # Clamp the denominator so an all-zero mask cannot divide by zero.
        denom = torch.clamp(loss_mask.sum(), min=1.0)
        stop_loss = (stop_losses * loss_mask).sum() / denom

        feat_pred = None
        if sample_generate:
            feat_cond_for_sample = feat_cond.transpose(1, 2).contiguous()
            feat_pred_seq = self.feat_decoder(
                mu=dit_hidden,
                patch_size=self.patch_size,
                cond=feat_cond_for_sample,
                # NOTE(review): this passes inference_cfg_rate as n_timesteps,
                # which looks like a config-field mix-up — confirm the intent.
                n_timesteps=(
                    self.config.dit_config.cfm_config.inference_cfg_rate
                    if hasattr(self.config.dit_config.cfm_config, "inference_cfg_rate")
                    else 10
                ),
            )
            feat_pred = rearrange(feat_pred_seq.transpose(1, 2), "(b t) d p -> b d (t p)", b=B, p=self.patch_size)

        feat_gt_tensor = rearrange(feat_gt, "(b t) p d -> b d (t p)", b=B, p=self.patch_size)

        return {
            "loss/diff": diff_loss,
            "loss/stop": stop_loss,
            "feat_gt": feat_gt_tensor,
            "feat_pred": feat_pred,
        }
335
+
336
    def _dtype(self):
        """Return the torch dtype corresponding to ``config.dtype``."""
        return get_dtype(self.config.dtype)
338
+
339
    def generate(self, *args, **kwargs) -> torch.Tensor:
        """Non-streaming generation: return the single final decoded audio tensor."""
        return next(self._generate(*args, streaming=False, **kwargs))
341
+
342
    def generate_streaming(self, *args, **kwargs) -> Generator[torch.Tensor, None, None]:
        """Streaming generation: yield decoded audio chunks as they are produced."""
        return self._generate(*args, streaming=True, **kwargs)
344
+
345
    @torch.inference_mode()
    def _generate(
        self,
        target_text: str,
        prompt_text: str = "",
        prompt_wav_path: str = "",
        min_len: int = 2,
        max_len: int = 2000,
        inference_timesteps: int = 10,
        cfg_value: float = 2.0,
        retry_badcase: bool = False,
        retry_badcase_max_times: int = 3,
        retry_badcase_ratio_threshold: float = 6.0,  # setting acceptable ratio of audio length to text length (for badcase detection)
        streaming: bool = False,
    ) -> Generator[torch.Tensor, None, None]:
        """Generate audio for *target_text*, optionally voice-cloning from a prompt wav.

        When a prompt wav is given, the prompt text is prepended and the prompt
        audio is VAE-encoded as conditioning. Yields decoded audio chunks when
        ``streaming=True``; otherwise yields a single final audio tensor.
        """
        if retry_badcase and streaming:
            warnings.warn("Retry on bad cases is not supported in streaming mode, setting retry_badcase=False.")
            retry_badcase = False
        if len(prompt_wav_path) == 0:
            # No prompt audio: condition on the target text only.
            text = target_text
            text_token = torch.LongTensor(self.text_tokenizer(text))
            text_token = torch.cat(
                [
                    text_token,
                    torch.tensor(
                        [self.audio_start_token],
                        dtype=torch.int32,
                        device=text_token.device,
                    ),
                ],
                dim=-1,
            )
            text_length = text_token.shape[0]

            audio_feat = torch.zeros(
                (text_length, self.patch_size, self.audio_vae.latent_dim),
                dtype=torch.float32,
                device=text_token.device,
            )
            text_mask = torch.ones(text_length).type(torch.int32).to(text_token.device)
            audio_mask = torch.zeros(text_length).type(torch.int32).to(text_token.device)

        else:
            # Prompt audio given: prepend prompt text and encode the prompt wav.
            text = prompt_text + target_text
            text_token = torch.LongTensor(self.text_tokenizer(text))
            text_token = torch.cat(
                [
                    text_token,
                    torch.tensor([self.audio_start_token], dtype=torch.int32, device=text_token.device),
                ],
                dim=-1,
            )
            text_length = text_token.shape[0]

            audio, sr = torchaudio.load(prompt_wav_path)
            if audio.size(0) > 1:
                # Downmix multi-channel audio to mono.
                audio = audio.mean(dim=0, keepdim=True)

            if sr != self.sample_rate:
                audio = torchaudio.functional.resample(audio, sr, self.sample_rate)

            patch_len = self.patch_size * self.chunk_size

            if audio.size(1) % patch_len != 0:
                # Left padding: pad at the start so valid audio stays at the end of the sequence.
                padding_size = patch_len - audio.size(1) % patch_len
                audio = torch.nn.functional.pad(audio, (padding_size, 0))

            # (B, D, T)
            audio_feat = self.audio_vae.encode(audio.to(self.device), self.sample_rate).cpu()
            audio_feat = audio_feat.view(
                self.audio_vae.latent_dim,
                -1,
                self.patch_size,
            ).permute(1, 2, 0)
            audio_length = audio_feat.size(0)
            text_pad_token = torch.zeros(audio_length, dtype=torch.int32, device=text_token.device)
            text_token = torch.cat([text_token, text_pad_token])
            audio_pad_feat = torch.zeros(
                (text_length, self.patch_size, self.audio_vae.latent_dim),
                dtype=torch.float32,
                device=text_token.device,
            )
            audio_feat = torch.cat([audio_pad_feat, audio_feat], dim=0)
            text_mask = (
                torch.cat([torch.ones(text_length), torch.zeros(audio_length)]).type(torch.int32).to(text_token.device)
            )
            audio_mask = (
                torch.cat([torch.zeros(text_length), torch.ones(audio_length)]).type(torch.int32).to(text_token.device)
            )

        # Add a batch dimension and move everything onto the model device.
        text_token = text_token.unsqueeze(0).to(self.device)
        text_mask = text_mask.unsqueeze(0).to(self.device)
        audio_feat = audio_feat.unsqueeze(0).to(self.device).to(get_dtype(self.config.dtype))
        audio_mask = audio_mask.unsqueeze(0).to(self.device)

        target_text_length = len(self.text_tokenizer(target_text))

        retry_badcase_times = 0
        while retry_badcase_times < retry_badcase_max_times:
            inference_result = self._inference(
                text_token,
                text_mask,
                audio_feat,
                audio_mask,
                min_len=min_len,
                max_len=min(
                    int(target_text_length * retry_badcase_ratio_threshold + 10), max_len
                ),  # avoid too long audio
                inference_timesteps=inference_timesteps,
                cfg_value=cfg_value,
                streaming=streaming,
            )
            if streaming:
                patch_len = self.patch_size * self.chunk_size
                for latent_pred, _ in inference_result:
                    decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
                    # Keep only the newest patch from the decoded context window.
                    decode_audio = decode_audio[..., -patch_len:].squeeze(1).cpu()
                    yield decode_audio
                break
            else:
                latent_pred, pred_audio_feat = next(inference_result)
                if retry_badcase:
                    if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
                        print(
                            f" Badcase detected, audio_text_ratio={pred_audio_feat.shape[0] / target_text_length}, retrying...",
                            file=sys.stderr,
                        )
                        retry_badcase_times += 1
                        continue
                    else:
                        break
                else:
                    break

        if not streaming:
            decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
            yield decode_audio
483
+
484
    @torch.inference_mode()
    def build_prompt_cache(
        self,
        prompt_text: str,
        prompt_wav_path: str,
    ):
        """
        Build prompt cache for subsequent fast generation.

        Args:
            prompt_text: prompt text (required)
            prompt_wav_path: prompt audio path (required)

        Returns:
            prompt_cache: dict with prompt_text (raw text) and audio features.
            Text tokenization will be done during generation for consistency.

        Raises:
            ValueError: If either argument is missing/empty.
        """
        if not prompt_text or not prompt_wav_path:
            raise ValueError("prompt_text and prompt_wav_path are required")

        # load audio
        audio, sr = torchaudio.load(prompt_wav_path)
        if audio.size(0) > 1:
            # Downmix multi-channel audio to mono.
            audio = audio.mean(dim=0, keepdim=True)

        if sr != self.sample_rate:
            audio = torchaudio.functional.resample(audio, sr, self.sample_rate)

        patch_len = self.patch_size * self.chunk_size

        if audio.size(1) % patch_len != 0:
            # Left padding: pad at the beginning of the audio to keep valid audio data at the end of the sequence
            padding_size = patch_len - audio.size(1) % patch_len
            audio = torch.nn.functional.pad(audio, (padding_size, 0))

        # extract audio features
        audio_feat = self.audio_vae.encode(audio.to(self.device), self.sample_rate).cpu()

        audio_feat = audio_feat.view(
            self.audio_vae.latent_dim,
            -1,
            self.patch_size,
        ).permute(
            1, 2, 0
        )  # (D, T, P)
        # build prompt cache - only save raw text and audio features
        prompt_cache = {
            "prompt_text": prompt_text,
            "audio_feat": audio_feat,
        }

        return prompt_cache
536
+
537
+ def merge_prompt_cache(
538
+ self,
539
+ original_cache: dict,
540
+ new_text: str,
541
+ new_audio_feat: torch.Tensor,
542
+ ):
543
+ """
544
+ Merge original prompt cache with newly generated content to stabilize voice.
545
+
546
+ Args:
547
+ original_cache: original prompt cache
548
+ new_text: newly generated text
549
+ new_audio_feat: newly generated audio features
550
+
551
+ Returns:
552
+ merged_cache: merged cache with prompt_text and audio_feat
553
+ """
554
+ if original_cache is None:
555
+ return {
556
+ "prompt_text": new_text,
557
+ "audio_feat": new_audio_feat,
558
+ }
559
+ original_prompt_text = original_cache["prompt_text"]
560
+ original_audio_feat = original_cache["audio_feat"]
561
+ # Merge text by concatenation
562
+ merged_prompt_text = original_prompt_text + new_text
563
+ merged_audio_feat = torch.cat([original_audio_feat, new_audio_feat], dim=0)
564
+
565
+ # build new cache
566
+ merged_cache = {
567
+ "prompt_text": merged_prompt_text,
568
+ "audio_feat": merged_audio_feat,
569
+ }
570
+
571
+ return merged_cache
572
+
573
    def generate_with_prompt_cache(self, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Non-streaming cache-based generation: return (audio, target tokens, features)."""
        return next(self._generate_with_prompt_cache(*args, streaming=False, **kwargs))
575
+
576
    def generate_with_prompt_cache_streaming(
        self, *args, **kwargs
    ) -> Generator[Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]], None, None]:
        """Streaming cache-based generation: yield (audio chunk, target tokens, features) per step."""
        return self._generate_with_prompt_cache(*args, streaming=True, **kwargs)
580
+
581
    @torch.inference_mode()
    def _generate_with_prompt_cache(
        self,
        target_text: str,
        prompt_cache: dict,
        min_len: int = 2,
        max_len: int = 2000,
        inference_timesteps: int = 10,
        cfg_value: float = 2.0,
        retry_badcase: bool = False,
        retry_badcase_max_times: int = 3,
        retry_badcase_ratio_threshold: float = 6.0,
        streaming: bool = False,
        streaming_prefix_len: int = 3,
    ) -> Generator[Tuple[torch.Tensor, torch.Tensor, Union[torch.Tensor, List[torch.Tensor]]], None, None]:
        """
        Generate audio using pre-built prompt cache.

        Args:
            target_text: Text to convert to speech
            prompt_cache: Cache built by build_prompt_cache (can be None)
            min_len: Minimum audio length to avoid very short audio
            max_len: Maximum audio length
            inference_timesteps: Number of diffusion sampling steps
            cfg_value: Classifier-free guidance value
            retry_badcase: Whether to retry on bad cases
            retry_badcase_max_times: Maximum retry attempts
            retry_badcase_ratio_threshold: Threshold for audio-to-text ratio
            streaming: Whether to return a generator of audio chunks
            streaming_prefix_len: Number of prefix audio patches to use for streaming mode

        Returns:
            Generator of Tuple containing:
                - Decoded audio tensor for the current step if ``streaming=True``, else final decoded audio tensor
                - Tensor of new text tokens
                - New audio features up to the current step as a List if ``streaming=True``, else as a concatenated Tensor
        """
        if retry_badcase and streaming:
            warnings.warn("Retry on bad cases is not supported in streaming mode, setting retry_badcase=False.")
            retry_badcase = False
        # get prompt from cache
        if prompt_cache is None:
            prompt_audio_feat = torch.empty((0, self.patch_size, self.audio_vae.latent_dim), dtype=torch.float32)
            text = target_text
        else:
            prompt_audio_feat = prompt_cache["audio_feat"]
            prompt_text = prompt_cache["prompt_text"]
            text = prompt_text + target_text

        text_token = torch.LongTensor(self.text_tokenizer(text))
        text_token = torch.cat(
            [
                text_token,
                torch.tensor(
                    [self.audio_start_token],
                    dtype=torch.int32,
                    device=text_token.device,
                ),
            ],
            dim=-1,
        )

        target_text_token = torch.LongTensor(self.text_tokenizer(target_text))

        # Lay out [text | prompt audio] with matching pad regions and masks.
        audio_length = prompt_audio_feat.size(0)
        text_length = text_token.shape[0]
        text_pad_token = torch.zeros(audio_length, dtype=torch.int32, device=text_token.device)
        audio_pad_feat = torch.zeros(
            (text_token.shape[0], self.patch_size, self.audio_vae.latent_dim),
            dtype=torch.float32,
            device=text_token.device,
        )
        text_token = torch.cat([text_token, text_pad_token])
        audio_feat = torch.cat([audio_pad_feat, prompt_audio_feat], dim=0)
        text_mask = (
            torch.cat([torch.ones(text_length), torch.zeros(audio_length)]).type(torch.int32).to(text_token.device)
        )
        audio_mask = (
            torch.cat([torch.zeros(text_length), torch.ones(audio_length)]).type(torch.int32).to(text_token.device)
        )

        text_token = text_token.unsqueeze(0).to(self.device)
        text_mask = text_mask.unsqueeze(0).to(self.device)
        audio_feat = audio_feat.unsqueeze(0).to(self.device).to(get_dtype(self.config.dtype))
        audio_mask = audio_mask.unsqueeze(0).to(self.device)

        # run inference
        target_text_length = len(self.text_tokenizer(target_text))
        retry_badcase_times = 0
        while retry_badcase_times < retry_badcase_max_times:
            inference_result = self._inference(
                text_token,
                text_mask,
                audio_feat,
                audio_mask,
                min_len=min_len,
                max_len=min(
                    int(target_text_length * retry_badcase_ratio_threshold + 10), max_len
                ),  # avoid too long audio
                inference_timesteps=inference_timesteps,
                cfg_value=cfg_value,
                streaming=streaming,
                streaming_prefix_len=streaming_prefix_len,
            )
            if streaming:
                patch_len = self.patch_size * self.chunk_size
                for latent_pred, pred_audio_feat in inference_result:
                    decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
                    # Keep only the newest patch from the decoded context window.
                    decode_audio = decode_audio[..., -patch_len:].squeeze(1).cpu()
                    yield (decode_audio, target_text_token, pred_audio_feat)
                break
            else:
                latent_pred, pred_audio_feat = next(inference_result)
                if retry_badcase:
                    if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
                        print(
                            f" Badcase detected, audio_text_ratio={pred_audio_feat.shape[0] / target_text_length}, retrying...",
                            file=sys.stderr,
                        )
                        retry_badcase_times += 1
                        continue
                    else:
                        break
                else:
                    break
        if not streaming:
            decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
            patch_len = self.patch_size * self.chunk_size
            if audio_mask.sum().item() > 0:
                # Presumably trims the prompt-context patches that were prepended
                # to the predicted sequence for decoding continuity — TODO confirm
                # against _inference's prompt_context_patches handling.
                decode_audio = decode_audio[..., patch_len * (streaming_prefix_len - 1) :].squeeze(1).cpu()
            else:
                decode_audio = decode_audio[..., :].squeeze(1).cpu()
            yield (decode_audio, target_text_token, pred_audio_feat)
714
+
715
    def inference(self, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
        """Non-streaming core inference: return (final latents, predicted feature sequence)."""
        return next(self._inference(*args, streaming=False, **kwargs))
717
+
718
    def inference_streaming(self, *args, **kwargs) -> Generator[Tuple[torch.Tensor, List[torch.Tensor]], None, None]:
        """Streaming core inference: yield (step latents, feature sequence so far) per step."""
        return self._inference(*args, streaming=True, **kwargs)
720
+
721
+ @torch.inference_mode()
722
+ def _inference(
723
+ self,
724
+ text: torch.Tensor,
725
+ text_mask: torch.Tensor,
726
+ feat: torch.Tensor,
727
+ feat_mask: torch.Tensor,
728
+ min_len: int = 2,
729
+ max_len: int = 2000,
730
+ inference_timesteps: int = 10,
731
+ cfg_value: float = 2.0,
732
+ streaming: bool = False,
733
+ streaming_prefix_len: int = 3,
734
+ ) -> Generator[Tuple[torch.Tensor, Union[torch.Tensor, List[torch.Tensor]]], None, None]:
735
+ """Core inference method for audio generation.
736
+
737
+ This is the main inference loop that generates audio features
738
+ using the language model and diffusion transformer.
739
+
740
+ Args:
741
+ text: Input text tokens
742
+ text_mask: Mask for text tokens
743
+ feat: Input audio features
744
+ feat_mask: Mask for audio features
745
+ min_len: Minimum generation length
746
+ max_len: Maximum generation length
747
+ inference_timesteps: Number of diffusion steps
748
+ cfg_value: Classifier-free guidance value
749
+ streaming: Whether to yield each step latent feature or just the final result
750
+
751
+ Returns:
752
+ Generator of Tuple containing:
753
+ - Predicted latent feature at the current step if ``streaming=True``, else final latent features
754
+ - Predicted audio feature sequence so far as a List if ``streaming=True``, else as a concatenated Tensor
755
+ """
756
+ B, T, P, D = feat.shape
757
+
758
+ feat_embed = self.feat_encoder(feat) # [b, t, h_feat]
759
+ feat_embed = self.enc_to_lm_proj(feat_embed)
760
+
761
+ if self.config.lm_config.use_mup:
762
+ scale_emb = self.config.lm_config.scale_emb
763
+ else:
764
+ scale_emb = 1.0
765
+
766
+ text_embed = self.base_lm.embed_tokens(text) * scale_emb
767
+ combined_embed = text_mask.unsqueeze(-1) * text_embed + feat_mask.unsqueeze(-1) * feat_embed
768
+
769
+ prefix_feat_cond = feat[:, -1, ...] # b, p, d
770
+ pred_feat_seq = [] # b, t, p, d
771
+ curr_embed = None
772
+
773
+ # Prepare prompt context patches for streaming mode
774
+ # When there's a prompt audio, use its last (streaming_prefix_len - 1) patches as initial context
775
+ prompt_context_patches = []
776
+ audio_patch_count = int(feat_mask.sum().item())
777
+ if audio_patch_count > 0:
778
+ context_len = min(streaming_prefix_len - 1, audio_patch_count)
779
+ # Take the last context_len patches from prompt audio as initial context
780
+ # Split into list of [b, 1, p, d] tensors to match pred_feat_seq format
781
+ prompt_context_patches = list(feat[:, -context_len:, :, :].split(1, dim=1))
782
+ pred_feat_seq = prompt_context_patches + pred_feat_seq
783
+
784
+ enc_outputs, kv_cache_tuple = self.base_lm(
785
+ inputs_embeds=combined_embed,
786
+ is_causal=True,
787
+ )
788
+ self.base_lm.kv_cache.fill_caches(kv_cache_tuple)
789
+
790
+ enc_outputs = self.fsq_layer(enc_outputs) * feat_mask.unsqueeze(-1) + enc_outputs * text_mask.unsqueeze(-1)
791
+ lm_hidden = enc_outputs[:, -1, :]
792
+
793
+ residual_enc_outputs, residual_kv_cache_tuple = self.residual_lm(
794
+ inputs_embeds=enc_outputs + feat_mask.unsqueeze(-1) * feat_embed,
795
+ is_causal=True,
796
+ )
797
+ self.residual_lm.kv_cache.fill_caches(residual_kv_cache_tuple)
798
+ residual_hidden = residual_enc_outputs[:, -1, :]
799
+
800
+ for i in tqdm(range(max_len)):
801
+ dit_hidden_1 = self.lm_to_dit_proj(lm_hidden) # [b, h_dit]
802
+ dit_hidden_2 = self.res_to_dit_proj(residual_hidden) # [b, h_dit]
803
+ dit_hidden = dit_hidden_1 + dit_hidden_2 # [b, h_dit]
804
+
805
+ pred_feat = self.feat_decoder(
806
+ mu=dit_hidden,
807
+ patch_size=self.patch_size,
808
+ cond=prefix_feat_cond.transpose(1, 2).contiguous(),
809
+ n_timesteps=inference_timesteps,
810
+ cfg_value=cfg_value,
811
+ ).transpose(
812
+ 1, 2
813
+ ) # [b, p, d]
814
+
815
+ curr_embed = self.feat_encoder(pred_feat.unsqueeze(1)) # b, 1, c
816
+ curr_embed = self.enc_to_lm_proj(curr_embed)
817
+
818
+ pred_feat_seq.append(pred_feat.unsqueeze(1)) # b, 1, p, d
819
+ prefix_feat_cond = pred_feat
820
+
821
+ if streaming:
822
+ # return the last three predicted latent features to provide enough context for smooth decoding
823
+ pred_feat_chunk = torch.cat(pred_feat_seq[-streaming_prefix_len:], dim=1)
824
+ feat_pred = rearrange(pred_feat_chunk, "b t p d -> b d (t p)", b=B, p=self.patch_size)
825
+
826
+ yield feat_pred, pred_feat_seq
827
+
828
+ stop_flag = self.stop_head(self.stop_actn(self.stop_proj(lm_hidden))).argmax(dim=-1)[0].cpu().item()
829
+ if i > min_len and stop_flag == 1:
830
+ break
831
+
832
+ lm_hidden = self.base_lm.forward_step(
833
+ curr_embed[:, 0, :], torch.tensor([self.base_lm.kv_cache.step()], device=curr_embed.device)
834
+ ).clone()
835
+
836
+ lm_hidden = self.fsq_layer(lm_hidden)
837
+ residual_hidden = self.residual_lm.forward_step(
838
+ lm_hidden + curr_embed[:, 0, :],
839
+ torch.tensor([self.residual_lm.kv_cache.step()], device=curr_embed.device),
840
+ ).clone()
841
+
842
+ if not streaming:
843
+ pred_feat_seq = torch.cat(pred_feat_seq, dim=1) # b, t, p, d
844
+ feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size)
845
+ yield feat_pred, pred_feat_seq.squeeze(0).cpu()
846
+
847
+ @classmethod
848
+ def from_local(cls, path: str, optimize: bool = True, training: bool = False, lora_config: LoRAConfig = None):
849
+ config = VoxCPMConfig.model_validate_json(open(os.path.join(path, "config.json")).read())
850
+ tokenizer = LlamaTokenizerFast.from_pretrained(path)
851
+ audio_vae_config = getattr(config, "audio_vae_config", None)
852
+ audio_vae = AudioVAE(config=audio_vae_config) if audio_vae_config else AudioVAE()
853
+ # Try to load AudioVAE from safetensors first, fallback to pytorch
854
+ audiovae_safetensors_path = os.path.join(path, "audiovae.safetensors")
855
+ audiovae_pth_path = os.path.join(path, "audiovae.pth")
856
+ if os.path.exists(audiovae_safetensors_path) and SAFETENSORS_AVAILABLE:
857
+ print(f"Loading AudioVAE from safetensors: {audiovae_safetensors_path}", file=sys.stderr)
858
+ vae_state_dict = load_file(audiovae_safetensors_path, device="cpu")
859
+ elif os.path.exists(audiovae_pth_path):
860
+ print(f"Loading AudioVAE from pytorch: {audiovae_pth_path}", file=sys.stderr)
861
+ checkpoint = torch.load(
862
+ audiovae_pth_path,
863
+ map_location="cpu",
864
+ weights_only=True,
865
+ )
866
+ vae_state_dict = checkpoint.get("state_dict", checkpoint)
867
+ else:
868
+ raise FileNotFoundError(
869
+ f"AudioVAE checkpoint not found. Expected either {audiovae_safetensors_path} or {audiovae_pth_path}"
870
+ )
871
+ model = cls(config, tokenizer, audio_vae, lora_config)
872
+ if not training:
873
+ lm_dtype = get_dtype(model.config.dtype)
874
+ model = model.to(lm_dtype)
875
+ else: # training mode
876
+ for name, param in model.named_parameters():
877
+ if "audio_vae" in name: # freeze VAE weights
878
+ param.requires_grad = False
879
+ continue
880
+ if lora_config is not None:
881
+ if "lora" not in name: # freeze non-LoRA weights
882
+ param.requires_grad = False
883
+ model.audio_vae = model.audio_vae.to(torch.float32)
884
+
885
+ # Try to load from safetensors first, fallback to pytorch_model.bin
886
+ safetensors_path = os.path.join(path, "model.safetensors")
887
+ pytorch_model_path = os.path.join(path, "pytorch_model.bin")
888
+
889
+ if os.path.exists(safetensors_path) and SAFETENSORS_AVAILABLE:
890
+ print(f"Loading model from safetensors: {safetensors_path}", file=sys.stderr)
891
+ model_state_dict = load_file(safetensors_path)
892
+ elif os.path.exists(pytorch_model_path):
893
+ print(f"Loading model from pytorch_model.bin: {pytorch_model_path}", file=sys.stderr)
894
+ checkpoint = torch.load(
895
+ pytorch_model_path,
896
+ map_location="cpu",
897
+ weights_only=True,
898
+ )
899
+ model_state_dict = checkpoint.get("state_dict", checkpoint)
900
+ else:
901
+ raise FileNotFoundError(f"Model file not found. Expected either {safetensors_path} or {pytorch_model_path}")
902
+
903
+ for kw, val in vae_state_dict.items():
904
+ model_state_dict[f"audio_vae.{kw}"] = val
905
+
906
+ # LoRALinear holds weight/bias directly, compatible with nn.Linear state_dict keys.
907
+ # Using strict=False since pretrained weights don't contain lora_A/lora_B.
908
+ model.load_state_dict(model_state_dict, strict=False)
909
+ if training:
910
+ return model
911
+ return model.to(model.device).eval().optimize(disable=not optimize)
912
+
913
+ # ------------------------------------------------------------------ #
914
+ # LoRA Weight Management
915
+ # ------------------------------------------------------------------ #
916
+ def _iter_lora_modules(self):
917
+ """Iterate over all LoRA modules."""
918
+ from ..modules.layers.lora import LoRALinear
919
+
920
+ for module in self.modules():
921
+ if isinstance(module, LoRALinear):
922
+ yield module
923
+
924
+ def load_lora_weights(self, lora_path: str, device: str = None):
925
+ """
926
+ Load LoRA weights from file, supports calling after torch.compile.
927
+ Uses named_parameters() to handle compile's _orig_mod wrapper.
928
+ Supports both safetensors and pytorch formats.
929
+
930
+ Args:
931
+ lora_path: Checkpoint path (directory or .safetensors/.ckpt file)
932
+ device: Target device, defaults to model's current device
933
+ Returns:
934
+ tuple: (loaded_keys, skipped_keys)
935
+ """
936
+ from pathlib import Path
937
+
938
+ device = device or self.device
939
+ lora_p = Path(lora_path)
940
+
941
+ # Try safetensors first, then fallback to .ckpt
942
+ if lora_p.is_dir():
943
+ safetensors_file = lora_p / "lora_weights.safetensors"
944
+ ckpt_file = lora_p / "lora_weights.ckpt"
945
+ else:
946
+ safetensors_file = lora_p if lora_p.suffix == ".safetensors" else None
947
+ ckpt_file = lora_p if lora_p.suffix in [".ckpt", ".pth"] else None
948
+
949
+ # Load from safetensors if available
950
+ if safetensors_file and safetensors_file.exists() and SAFETENSORS_AVAILABLE:
951
+ state_dict = load_file(str(safetensors_file), device=device)
952
+ elif ckpt_file and ckpt_file.exists():
953
+ ckpt = torch.load(ckpt_file, map_location=device, weights_only=False)
954
+ state_dict = ckpt.get("state_dict", ckpt)
955
+ else:
956
+ raise FileNotFoundError(f"LoRA checkpoint not found. Expected either {safetensors_file} or {ckpt_file}")
957
+
958
+ # Build param mapping (handle torch.compile's _orig_mod prefix)
959
+ model_params = dict(self.named_parameters())
960
+ key_mapping = {k.replace("._orig_mod.", "."): k for k in model_params if "._orig_mod." in k}
961
+
962
+ loaded_keys, skipped_keys = [], []
963
+ for key, value in state_dict.items():
964
+ target_key = key if key in model_params else key_mapping.get(key)
965
+ if target_key:
966
+ model_params[target_key].data.copy_(value.to(device))
967
+ loaded_keys.append(key)
968
+ else:
969
+ skipped_keys.append(key)
970
+
971
+ return loaded_keys, skipped_keys
972
+
973
+ def set_lora_enabled(self, enabled: bool):
974
+ """Enable/disable all LoRA layers."""
975
+ for module in self._iter_lora_modules():
976
+ module.set_enabled(enabled)
977
+
978
+ def reset_lora_weights(self):
979
+ """Reset all LoRA weights (A: kaiming, B: zeros), effectively unloading LoRA."""
980
+ for module in self._iter_lora_modules():
981
+ module.reset_lora_parameters()
982
+
983
+ def get_lora_state_dict(self) -> dict:
984
+ """Get all LoRA parameters (lora_A/lora_B)."""
985
+ return {name: param.data.clone() for name, param in self.named_parameters() if "lora_" in name}
voxcpm/model/voxcpm2.py ADDED
@@ -0,0 +1,1224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoxCPM: A Tokenizer-free speech generation model
3
+
4
+ This module contains the main VoxCPM model implementation, including configuration classes
5
+ and the core VoxCPMModel for text-to-speech generation.
6
+
7
+ Copyright 2026 OpenBMB
8
+ Licensed under the Apache License, Version 2.0 (the "License");
9
+ you may not use this file except in compliance with the License.
10
+ You may obtain a copy of the License at
11
+
12
+ http://www.apache.org/licenses/LICENSE-2.0
13
+
14
+ Unless required by applicable law or agreed to in writing, software
15
+ distributed under the License is distributed on an "AS IS" BASIS,
16
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ See the License for the specific language governing permissions and
18
+ limitations under the License.
19
+ """
20
+
21
+ import os
22
+ import sys
23
+ from typing import Tuple, Union, Generator, List, Optional
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ import warnings
28
+ import librosa
29
+ import numpy as np
30
+ from einops import rearrange
31
+ from pydantic import BaseModel
32
+
33
+ try:
34
+ from safetensors.torch import load_file
35
+
36
+ SAFETENSORS_AVAILABLE = True
37
+ except ImportError:
38
+ SAFETENSORS_AVAILABLE = False
39
+ from tqdm import tqdm
40
+ from transformers import LlamaTokenizerFast
41
+
42
+ from ..modules.audiovae import AudioVAEV2, AudioVAEConfigV2
43
+ from ..modules.layers import ScalarQuantizationLayer
44
+ from ..modules.layers.lora import apply_lora_to_named_linear_modules
45
+ from ..modules.locdit import CfmConfig, UnifiedCFM, VoxCPMLocDiTV2
46
+ from ..modules.locenc import VoxCPMLocEnc
47
+ from ..modules.minicpm4 import MiniCPM4Config, MiniCPMModel
48
+ from .utils import get_dtype, mask_multichar_chinese_tokens
49
+
50
+
51
def _trim_audio_silence_vad(
    audio: torch.Tensor,
    sample_rate: int,
    max_silence_ms: float = 200.0,
    top_db: float = 35.0,
) -> torch.Tensor:
    """Trim leading/trailing silence with an energy-threshold VAD.

    Besides the plain head/tail trim, the tail is additionally cut back to the
    last frame with sustained energy, so long trailing pseudo-silence (low
    energy but not exactly zero, e.g. a noise floor) is removed as well. Up to
    ``max_silence_ms`` milliseconds of silence are kept at each end.

    Args:
        audio: (1, T) audio tensor.
        sample_rate: Sample rate of ``audio`` in Hz.
        max_silence_ms: Maximum silence kept at each end, in milliseconds.
        top_db: Frames this many dB below the reference peak count as silence.

    Returns:
        The trimmed (1, T') tensor.
    """
    if audio.numel() == 0:
        return audio

    samples = audio.squeeze(0).numpy()
    total = len(samples)
    frame_length = 2048
    hop_length = 512

    peak = np.max(np.abs(samples))
    if peak <= 0:
        return audio  # pure digital silence: no reference level to trim against
    threshold = peak * (10.0 ** (-top_db / 20.0))

    # Coarse head/tail trim; keep the full span if librosa is unavailable/fails.
    try:
        _, (start, end) = librosa.effects.trim(
            samples, top_db=top_db, ref=np.max, frame_length=frame_length, hop_length=hop_length
        )
    except Exception:
        start, end = 0, total

    # Frame-wise RMS scan: remember the last frame with real energy so the
    # tail can be cut back past long pseudo-silent stretches.
    last_voiced = -1
    for frame_idx, offset in enumerate(range(0, total - frame_length + 1, hop_length)):
        window = samples[offset : offset + frame_length]
        if np.sqrt(np.mean(window**2)) >= threshold:
            last_voiced = frame_idx
    if last_voiced >= 0:
        vad_end = min(total, (last_voiced + 1) * hop_length + (frame_length - hop_length))
        end = min(end, vad_end)

    keep = int(max_silence_ms * sample_rate / 1000.0)
    return audio[:, max(0, start - keep) : min(total, end + keep)]
106
+
107
+
108
class VoxCPMEncoderConfig(BaseModel):
    """Hyper-parameters of the local (patch) encoder transformer."""

    hidden_dim: int = 1024
    ffn_dim: int = 4096
    num_heads: int = 16
    num_layers: int = 4
    # Fix: default is None, so the annotation must be Optional[int], not int.
    # Semantics of None: presumably "use the transformer default" — verify downstream.
    kv_channels: Optional[int] = None
114
+
115
+
116
class VoxCPMDitConfig(BaseModel):
    """Hyper-parameters of the local DiT (diffusion transformer) decoder."""

    hidden_dim: int = 1024
    ffn_dim: int = 4096
    num_heads: int = 16
    num_layers: int = 4
    # Fix: default is None, so the annotation must be Optional[int], not int.
    kv_channels: Optional[int] = None
    dit_mean_mode: bool = False

    cfm_config: CfmConfig
125
+
126
+
127
class VoxCPMConfig(BaseModel):
    """Top-level VoxCPM model configuration, composing all sub-module configs."""

    lm_config: MiniCPM4Config  # backbone text-semantic LM configuration
    patch_size: int = 4  # latent frames grouped into one LM step (the P axis)
    feat_dim: int = 64  # per-frame audio latent dimension (the D axis)
    residual_lm_num_layers: int = 8  # depth of the residual acoustic LM
    residual_lm_no_rope: bool = False  # disable RoPE in the residual LM
    scalar_quantization_latent_dim: int = 512  # FSQ bottleneck width
    scalar_quantization_scale: int = 9  # FSQ quantization scale

    encoder_config: VoxCPMEncoderConfig
    dit_config: VoxCPMDitConfig
    audio_vae_config: Optional[AudioVAEConfigV2] = None  # None -> default VAE config

    max_length: int = 8192  # KV-cache / sequence-length budget
    device: str = "cuda"  # requested device; runtime falls back to mps/cpu
    dtype: str = "bfloat16"  # compute dtype name, resolved via get_dtype()
143
+
144
+
145
class LoRAConfig(BaseModel):
    """Switches and hyper-parameters controlling optional LoRA adapter injection."""

    enable_lm: bool = False  # Apply LoRA to base_lm + residual_lm
    enable_dit: bool = False  # Apply LoRA to VoxCPMLocDiT
    enable_proj: bool = False  # Apply LoRA to projection Linear layers

    # Standard LoRA hyper-parameters: rank, scaling alpha, adapter dropout.
    r: int = 8
    alpha: int = 16
    dropout: float = 0.0

    # Target linear layer names for LM & DiT (matched by attribute name)
    # NOTE: mutable list defaults are safe on pydantic models — pydantic copies
    # field defaults per instance, unlike plain Python default arguments.
    target_modules_lm: list[str] = ["q_proj", "v_proj", "k_proj", "o_proj"]
    target_modules_dit: list[str] = ["q_proj", "v_proj", "k_proj", "o_proj"]
    # Projection layer attribute names to find on VoxCPM2Model
    target_proj_modules: list[str] = ["enc_to_lm_proj", "lm_to_dit_proj", "res_to_dit_proj", "fusion_concat_proj"]
159
+
160
+
161
+ VoxCPMConfig.model_rebuild()
162
+
163
+
164
class VoxCPM2Model(nn.Module):
    """VoxCPM v2: tokenizer-free TTS model.

    Composes a text-semantic LM (``base_lm``), a residual acoustic LM, a local
    patch encoder, a CFM-based local DiT decoder and an audio VAE whose latent
    space the model generates in.
    """

    def __init__(
        self,
        config: VoxCPMConfig,
        tokenizer: LlamaTokenizerFast,
        audio_vae: AudioVAEV2,
        lora_config: LoRAConfig = None,
    ):
        """Assemble all sub-modules from ``config``.

        Args:
            config: Full model configuration.
            tokenizer: Text tokenizer; multi-char Chinese tokens are masked out.
            audio_vae: Pretrained audio VAE providing the latent feature space.
            lora_config: Optional LoRA config; adapters are injected when given.
        """
        super().__init__()
        self.config = config
        self.lora_config = lora_config
        self.feat_dim = config.feat_dim
        self.patch_size = config.patch_size
        self.device = config.device
        # Fall back to MPS, then CPU, when CUDA is not available.
        if not torch.cuda.is_available():
            if torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"
        print(f"Running on device: {self.device}, dtype: {self.config.dtype}", file=sys.stderr)

        # Text-Semantic LM
        self.base_lm = MiniCPMModel(config.lm_config)
        self.base_lm.setup_cache(1, config.max_length, self.device, get_dtype(self.config.dtype))

        self.text_tokenizer = mask_multichar_chinese_tokens(tokenizer)
        # Hard-coded special token ids (presumably reserved in the tokenizer
        # vocabulary — verify against the tokenizer files).
        self.audio_start_token = 101
        self.audio_end_token = 102
        self.ref_audio_start_token = 103
        self.ref_audio_end_token = 104

        # Residual Acoustic LM: same architecture as the base LM but shallower,
        # with no embedding table (vocab_size=0) and optionally no RoPE.
        residual_lm_config = config.lm_config.model_copy(deep=True)
        residual_lm_config.num_hidden_layers = config.residual_lm_num_layers
        residual_lm_config.vocab_size = 0
        residual_lm_config.no_rope = config.residual_lm_no_rope
        self.residual_lm = MiniCPMModel(residual_lm_config)
        self.residual_lm.setup_cache(1, config.max_length, self.device, get_dtype(self.config.dtype))

        # Local Encoder: maps (T, P, D) latent patches into LM-sized embeddings.
        encoder_config = config.lm_config.model_copy(deep=True)
        encoder_config.hidden_size = config.encoder_config.hidden_dim
        encoder_config.intermediate_size = config.encoder_config.ffn_dim
        encoder_config.num_attention_heads = config.encoder_config.num_heads
        encoder_config.num_hidden_layers = config.encoder_config.num_layers
        encoder_config.kv_channels = config.encoder_config.kv_channels
        encoder_config.vocab_size = 0
        self.feat_encoder = VoxCPMLocEnc(encoder_config, input_dim=config.feat_dim)

        # Local DiT: CFM decoder that denoises one latent patch per LM step.
        decoder_config = config.lm_config.model_copy(deep=True)
        decoder_config.hidden_size = config.dit_config.hidden_dim
        decoder_config.intermediate_size = config.dit_config.ffn_dim
        decoder_config.num_attention_heads = config.dit_config.num_heads
        decoder_config.num_hidden_layers = config.dit_config.num_layers
        decoder_config.kv_channels = config.dit_config.kv_channels
        decoder_config.vocab_size = 0
        self.feat_decoder = UnifiedCFM(
            in_channels=config.feat_dim,
            cfm_params=config.dit_config.cfm_config,
            estimator=VoxCPMLocDiTV2(decoder_config, in_channels=config.feat_dim),
            mean_mode=config.dit_config.dit_mean_mode,
        )

        # Projection layers
        self.fsq_layer = ScalarQuantizationLayer(
            config.lm_config.hidden_size,
            config.lm_config.hidden_size,
            config.scalar_quantization_latent_dim,
            config.scalar_quantization_scale,
        )
        self.enc_to_lm_proj = nn.Linear(config.encoder_config.hidden_dim, config.lm_config.hidden_size)
        self.lm_to_dit_proj = nn.Linear(config.lm_config.hidden_size, config.dit_config.hidden_dim)
        self.res_to_dit_proj = nn.Linear(config.lm_config.hidden_size, config.dit_config.hidden_dim)
        self.fusion_concat_proj = nn.Linear(config.lm_config.hidden_size * 2, config.lm_config.hidden_size)

        # Stop Predictor: binary head (continue / stop) over the LM hidden state.
        self.stop_proj = nn.Linear(config.lm_config.hidden_size, config.lm_config.hidden_size)
        self.stop_actn = nn.SiLU()
        self.stop_head = nn.Linear(config.lm_config.hidden_size, 2, bias=False)
        self.stop_loss = nn.CrossEntropyLoss(reduction="none")

        # Audio VAE
        self.audio_vae = audio_vae
        self.chunk_size = audio_vae.chunk_size
        self._encode_sample_rate = audio_vae.sample_rate
        # Output sample rate may differ from the encode rate (VAE v2).
        self.sample_rate = getattr(audio_vae, "out_sample_rate", audio_vae.sample_rate)

        if self.lora_config is not None:
            self._apply_lora()
254
+
255
    def _apply_lora(self):
        """Inject LoRA adapters into the LM / DiT / projection layers."""
        cfg = self.lora_config
        lora_kwargs = dict(r=cfg.r, alpha=cfg.alpha, dropout=cfg.dropout)

        # LM: base_lm + residual_lm
        if cfg.enable_lm:
            for lm in [self.base_lm, self.residual_lm]:
                apply_lora_to_named_linear_modules(lm, target_submodule_names=cfg.target_modules_lm, **lora_kwargs)

        # DiT: feat_decoder.estimator
        if cfg.enable_dit:
            apply_lora_to_named_linear_modules(
                self.feat_decoder.estimator, target_submodule_names=cfg.target_modules_dit, **lora_kwargs
            )

        # Projection layers: wrap the plain nn.Linear attributes in LoRALinear.
        if cfg.enable_proj:
            from ..modules.layers.lora import LoRALinear

            for attr_name in cfg.target_proj_modules:
                module = getattr(self, attr_name, None)
                if isinstance(module, nn.Linear):
                    setattr(self, attr_name, LoRALinear(base=module, **lora_kwargs))
279
+
280
+ def optimize(self, disable: bool = False):
281
+ if disable:
282
+ return self
283
+ try:
284
+ if self.device != "cuda":
285
+ raise ValueError("VoxCPMModel can only be optimized on CUDA device")
286
+ try:
287
+ import triton # noqa: F401
288
+ except ImportError:
289
+ raise ValueError("triton is not installed")
290
+ self.base_lm.forward_step = torch.compile(self.base_lm.forward_step, mode="reduce-overhead", fullgraph=True)
291
+ self.residual_lm.forward_step = torch.compile(
292
+ self.residual_lm.forward_step, mode="reduce-overhead", fullgraph=True
293
+ )
294
+ self.feat_encoder = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True)
295
+ self.feat_decoder.estimator = torch.compile(
296
+ self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True
297
+ )
298
+ except Exception as e:
299
+ print(f"Warning: torch.compile disabled - {e}", file=sys.stderr)
300
+ return self
301
+
302
    def forward(
        self,
        text_tokens: torch.Tensor,
        text_mask: torch.Tensor,
        audio_feats: torch.Tensor,
        audio_mask: torch.Tensor,
        loss_mask: torch.Tensor,
        position_ids: torch.Tensor,
        labels: torch.Tensor,
        *,
        progress: float = 0.0,
        sample_generate: bool = False,
    ):
        """Training forward pass: diffusion (CFM) loss + stop-prediction loss.

        Args:
            text_tokens: (B, T) int token ids.
            text_mask: (B, T) 1 where the step carries text.
            audio_feats: (B, T, P, D) audio latent patches.
            audio_mask: (B, T) 1 where the step carries audio.
            loss_mask: (B, T) steps contributing to both losses.
            position_ids: Unused for now (kept for interface compatibility).
            labels: (B, T) stop labels for the binary stop head.
            progress: Training progress, forwarded to the CFM loss.
            sample_generate: When True, also decode a sample prediction.

        Returns:
            Dict with "loss/diff", "loss/stop", "feat_gt" shaped (B, D, T*P)
            and "feat_pred" (same shape, or None unless sample_generate).
        """
        del position_ids  # not used yet

        text_tokens = text_tokens.to(self.device, dtype=torch.long)
        text_mask = text_mask.to(self.device, dtype=self._dtype())
        audio_feats = audio_feats.to(self.device, dtype=self._dtype())
        audio_mask = audio_mask.to(self.device, dtype=self._dtype())
        loss_mask = loss_mask.to(self.device, dtype=self._dtype())
        labels = labels.to(self.device, dtype=torch.long)

        B, T, P, D = audio_feats.shape
        feat_embed = self.feat_encoder(audio_feats)
        feat_embed = self.enc_to_lm_proj(feat_embed)

        # muP models scale token embeddings by scale_emb; others leave them as-is.
        scale_emb = getattr(self.config.lm_config, "scale_emb", 1.0)
        if not getattr(self.config.lm_config, "use_mup", False):
            scale_emb = 1.0
        text_embed = self.base_lm.embed_tokens(text_tokens) * scale_emb
        # Each step is either text or audio; the two masks select the embedding.
        combined_embed = text_mask.unsqueeze(-1) * text_embed + audio_mask.unsqueeze(-1) * feat_embed

        enc_outputs, _ = self.base_lm(inputs_embeds=combined_embed, is_causal=True)
        enc_outputs = enc_outputs.to(self._dtype())
        # FSQ-quantize only the audio positions; text positions pass through.
        enc_outputs = self.fsq_layer(enc_outputs) * audio_mask.unsqueeze(-1) + enc_outputs * text_mask.unsqueeze(-1)
        # Shift right by one step so position t conditions on outputs < t.
        lm_hidden = torch.cat((torch.zeros_like(enc_outputs[:, 0:1, :]), enc_outputs[:, :-1, :]), dim=1)

        residual_inputs = self.fusion_concat_proj(
            torch.cat((enc_outputs, audio_mask.unsqueeze(-1) * feat_embed), dim=-1)
        )
        residual_outputs, _ = self.residual_lm(inputs_embeds=residual_inputs, is_causal=True)
        residual_outputs = residual_outputs.to(self._dtype())
        # Same one-step right shift for the residual stream.
        residual_hidden = torch.cat(
            (torch.zeros_like(residual_outputs[:, 0:1, :]), residual_outputs[:, :-1, :]),
            dim=1,
        )

        dit_hidden = torch.cat((self.lm_to_dit_proj(lm_hidden), self.res_to_dit_proj(residual_hidden)), dim=-1)
        dit_hidden = rearrange(dit_hidden, "b t c -> (b t) c")

        # Keep diffusion inputs in the same dtype as the model (e.g., bfloat16)
        target_dtype = self._dtype()

        feat_gt = rearrange(audio_feats.to(target_dtype), "b t p d -> (b t) p d")
        # Condition each patch on the previous ground-truth patch (teacher forcing).
        feat_cond = torch.cat(
            (torch.zeros_like(audio_feats[:, 0:1, ...]), audio_feats[:, :-1, ...]),
            dim=1,
        )
        feat_cond = rearrange(feat_cond.to(target_dtype), "b t p d -> (b t) p d")

        loss_seq_mask = loss_mask.unsqueeze(-1).repeat(1, 1, self.patch_size)
        loss_seq_mask = rearrange(loss_seq_mask, "b t p -> (b t) p 1").to(target_dtype)

        diff_loss = self.feat_decoder.compute_loss(
            feat_gt.transpose(1, 2).contiguous(),
            dit_hidden,
            cond=feat_cond.transpose(1, 2).contiguous(),
            tgt_mask=loss_seq_mask.transpose(1, 2).contiguous(),
            progress=progress,
        )

        stop_logits = self.stop_head(self.stop_actn(self.stop_proj(lm_hidden)))
        stop_losses = self.stop_loss(stop_logits.transpose(1, 2), labels)
        # Clamp avoids a 0/0 when no step is selected by loss_mask.
        denom = torch.clamp(loss_mask.sum(), min=1.0)
        stop_loss = (stop_losses * loss_mask).sum() / denom

        feat_pred = None
        if sample_generate:
            feat_cond_for_sample = feat_cond.transpose(1, 2).contiguous()
            feat_pred_seq = self.feat_decoder(
                mu=dit_hidden,
                patch_size=self.patch_size,
                cond=feat_cond_for_sample,
                # NOTE(review): inference_cfg_rate reads like a CFG *rate*, not a
                # step count — passing it as n_timesteps looks like a bug; confirm
                # against UnifiedCFM's signature before relying on sampled output.
                n_timesteps=(
                    self.config.dit_config.cfm_config.inference_cfg_rate
                    if hasattr(self.config.dit_config.cfm_config, "inference_cfg_rate")
                    else 10
                ),
            )
            feat_pred = rearrange(feat_pred_seq.transpose(1, 2), "(b t) d p -> b d (t p)", b=B, p=self.patch_size)

        feat_gt_tensor = rearrange(feat_gt, "(b t) p d -> b d (t p)", b=B, p=self.patch_size)

        return {
            "loss/diff": diff_loss,
            "loss/stop": stop_loss,
            "feat_gt": feat_gt_tensor,
            "feat_pred": feat_pred,
        }
401
+
402
+ def _dtype(self):
403
+ return get_dtype(self.config.dtype)
404
+
405
+ def _encode_wav(self, wav_path: str, padding_mode: str = "right") -> torch.Tensor:
406
+ """Load, trim, pad and VAE-encode an audio file.
407
+
408
+ Args:
409
+ wav_path: path to the audio file.
410
+ padding_mode: "right" (default) or "left" padding for alignment.
411
+
412
+ Returns:
413
+ audio_feat: (T, P, D) tensor of latent patches.
414
+ """
415
+ audio, _ = librosa.load(wav_path, sr=self._encode_sample_rate, mono=True)
416
+ audio = torch.from_numpy(audio).unsqueeze(0)
417
+ audio = _trim_audio_silence_vad(audio, self._encode_sample_rate, max_silence_ms=200.0)
418
+ patch_len = self.patch_size * self.chunk_size
419
+ if audio.size(1) % patch_len != 0:
420
+ padding_size = patch_len - audio.size(1) % patch_len
421
+ pad = (padding_size, 0) if padding_mode == "left" else (0, padding_size)
422
+ audio = torch.nn.functional.pad(audio, pad)
423
+ feat = self.audio_vae.encode(audio.to(self.device), self._encode_sample_rate).cpu()
424
+ return feat.view(self.audio_vae.latent_dim, -1, self.patch_size).permute(1, 2, 0)
425
+
426
+ def _make_ref_prefix(self, ref_feat: torch.Tensor, device: torch.device):
427
+ """Build the [ref_start ref_audio ref_end] prefix segments.
428
+
429
+ Returns:
430
+ tokens, feats, text_mask, audio_mask (all 1-D / 2-D tensors)
431
+ """
432
+ ref_len = ref_feat.size(0)
433
+ z1 = torch.zeros((1, self.patch_size, self.audio_vae.latent_dim), dtype=torch.float32, device=device)
434
+ tokens = torch.cat(
435
+ [
436
+ torch.tensor([self.ref_audio_start_token], dtype=torch.int32, device=device),
437
+ torch.zeros(ref_len, dtype=torch.int32, device=device),
438
+ torch.tensor([self.ref_audio_end_token], dtype=torch.int32, device=device),
439
+ ]
440
+ )
441
+ feats = torch.cat([z1, ref_feat, z1], dim=0)
442
+ t_mask = torch.cat(
443
+ [
444
+ torch.tensor([1], dtype=torch.int32),
445
+ torch.zeros(ref_len, dtype=torch.int32),
446
+ torch.tensor([1], dtype=torch.int32),
447
+ ]
448
+ ).to(device)
449
+ a_mask = torch.cat(
450
+ [
451
+ torch.tensor([0], dtype=torch.int32),
452
+ torch.ones(ref_len, dtype=torch.int32),
453
+ torch.tensor([0], dtype=torch.int32),
454
+ ]
455
+ ).to(device)
456
+ return tokens, feats, t_mask, a_mask
457
+
458
+ def generate(self, *args, **kwargs) -> torch.Tensor:
459
+ return next(self._generate(*args, streaming=False, **kwargs))
460
+
461
+ def generate_streaming(self, *args, **kwargs) -> Generator[torch.Tensor, None, None]:
462
+ return self._generate(*args, streaming=True, **kwargs)
463
+
464
+ @torch.inference_mode()
465
+ def _generate(
466
+ self,
467
+ target_text: str,
468
+ prompt_text: str = "",
469
+ prompt_wav_path: str = "",
470
+ reference_wav_path: str = "",
471
+ min_len: int = 2,
472
+ max_len: int = 2000,
473
+ inference_timesteps: int = 10,
474
+ cfg_value: float = 2.0,
475
+ retry_badcase: bool = False,
476
+ retry_badcase_max_times: int = 3,
477
+ retry_badcase_ratio_threshold: float = 6.0,
478
+ streaming: bool = False,
479
+ streaming_prefix_len: int = 4,
480
+ ) -> Generator[torch.Tensor, None, None]:
481
+ if retry_badcase and streaming:
482
+ warnings.warn("Retry on bad cases is not supported in streaming mode, setting retry_badcase=False.")
483
+ retry_badcase = False
484
+
485
+ if reference_wav_path and prompt_wav_path:
486
+ # Combined mode: reference isolation prefix + continuation suffix
487
+ text = prompt_text + target_text
488
+ text_token = torch.LongTensor(self.text_tokenizer(text))
489
+ text_token = torch.cat(
490
+ [
491
+ text_token,
492
+ torch.tensor([self.audio_start_token], dtype=torch.int32, device=text_token.device),
493
+ ],
494
+ dim=-1,
495
+ )
496
+ text_length = text_token.shape[0]
497
+
498
+ ref_feat = self._encode_wav(reference_wav_path, padding_mode="right")
499
+ prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left")
500
+ prompt_audio_length = prompt_feat.size(0)
501
+
502
+ ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device)
503
+
504
+ prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32, device=text_token.device)
505
+ text_pad_feat = torch.zeros(
506
+ (text_length, self.patch_size, self.audio_vae.latent_dim),
507
+ dtype=torch.float32,
508
+ device=text_token.device,
509
+ )
510
+
511
+ text_token = torch.cat([ref_tokens, text_token, prompt_pad_token])
512
+ audio_feat = torch.cat([ref_feats, text_pad_feat, prompt_feat], dim=0)
513
+ text_mask = torch.cat(
514
+ [
515
+ ref_t_mask,
516
+ torch.ones(text_length, dtype=torch.int32).to(text_token.device),
517
+ torch.zeros(prompt_audio_length, dtype=torch.int32).to(text_token.device),
518
+ ]
519
+ )
520
+ audio_mask = torch.cat(
521
+ [
522
+ ref_a_mask,
523
+ torch.zeros(text_length, dtype=torch.int32).to(text_token.device),
524
+ torch.ones(prompt_audio_length, dtype=torch.int32).to(text_token.device),
525
+ ]
526
+ )
527
+
528
+ elif reference_wav_path:
529
+ # Reference-only mode (prompt isolation)
530
+ text = target_text
531
+ text_token = torch.LongTensor(self.text_tokenizer(text))
532
+ text_token = torch.cat(
533
+ [
534
+ text_token,
535
+ torch.tensor([self.audio_start_token], dtype=torch.int32, device=text_token.device),
536
+ ],
537
+ dim=-1,
538
+ )
539
+ text_length = text_token.shape[0]
540
+
541
+ ref_feat = self._encode_wav(reference_wav_path, padding_mode="right")
542
+ ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_feat, text_token.device)
543
+
544
+ text_pad_feat = torch.zeros(
545
+ (text_length, self.patch_size, self.audio_vae.latent_dim),
546
+ dtype=torch.float32,
547
+ device=text_token.device,
548
+ )
549
+ text_token = torch.cat([ref_tokens, text_token])
550
+ audio_feat = torch.cat([ref_feats, text_pad_feat], dim=0)
551
+ text_mask = torch.cat(
552
+ [
553
+ ref_t_mask,
554
+ torch.ones(text_length, dtype=torch.int32).to(text_token.device),
555
+ ]
556
+ )
557
+ audio_mask = torch.cat(
558
+ [
559
+ ref_a_mask,
560
+ torch.zeros(text_length, dtype=torch.int32).to(text_token.device),
561
+ ]
562
+ )
563
+
564
+ elif len(prompt_wav_path) == 0:
565
+ # Zero-shot mode
566
+ text = target_text
567
+ text_token = torch.LongTensor(self.text_tokenizer(text))
568
+ text_token = torch.cat(
569
+ [
570
+ text_token,
571
+ torch.tensor([self.audio_start_token], dtype=torch.int32, device=text_token.device),
572
+ ],
573
+ dim=-1,
574
+ )
575
+ text_length = text_token.shape[0]
576
+
577
+ audio_feat = torch.zeros(
578
+ (text_length, self.patch_size, self.audio_vae.latent_dim),
579
+ dtype=torch.float32,
580
+ device=text_token.device,
581
+ )
582
+ text_mask = torch.ones(text_length, dtype=torch.int32).to(text_token.device)
583
+ audio_mask = torch.zeros(text_length, dtype=torch.int32).to(text_token.device)
584
+
585
+ else:
586
+ # Continuation-only mode
587
+ text = prompt_text + target_text
588
+ text_token = torch.LongTensor(self.text_tokenizer(text))
589
+ text_token = torch.cat(
590
+ [
591
+ text_token,
592
+ torch.tensor([self.audio_start_token], dtype=torch.int32, device=text_token.device),
593
+ ],
594
+ dim=-1,
595
+ )
596
+ text_length = text_token.shape[0]
597
+
598
+ prompt_feat = self._encode_wav(prompt_wav_path, padding_mode="left")
599
+ prompt_audio_length = prompt_feat.size(0)
600
+ prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32, device=text_token.device)
601
+ text_pad_feat = torch.zeros(
602
+ (text_length, self.patch_size, self.audio_vae.latent_dim),
603
+ dtype=torch.float32,
604
+ device=text_token.device,
605
+ )
606
+ text_token = torch.cat([text_token, prompt_pad_token])
607
+ audio_feat = torch.cat([text_pad_feat, prompt_feat], dim=0)
608
+ text_mask = torch.cat(
609
+ [
610
+ torch.ones(text_length, dtype=torch.int32),
611
+ torch.zeros(prompt_audio_length, dtype=torch.int32),
612
+ ]
613
+ ).to(text_token.device)
614
+ audio_mask = torch.cat(
615
+ [
616
+ torch.zeros(text_length, dtype=torch.int32),
617
+ torch.ones(prompt_audio_length, dtype=torch.int32),
618
+ ]
619
+ ).to(text_token.device)
620
+
621
+ text_token = text_token.unsqueeze(0).to(self.device)
622
+ text_mask = text_mask.unsqueeze(0).to(self.device)
623
+ audio_feat = audio_feat.unsqueeze(0).to(self.device).to(get_dtype(self.config.dtype))
624
+ audio_mask = audio_mask.unsqueeze(0).to(self.device)
625
+
626
+ target_text_length = len(self.text_tokenizer(target_text))
627
+
628
+ retry_badcase_times = 0
629
+ while retry_badcase_times < retry_badcase_max_times:
630
+ inference_result = self._inference(
631
+ text_token,
632
+ text_mask,
633
+ audio_feat,
634
+ audio_mask,
635
+ min_len=min_len,
636
+ max_len=min(int(target_text_length * retry_badcase_ratio_threshold + 10), max_len),
637
+ inference_timesteps=inference_timesteps,
638
+ cfg_value=cfg_value,
639
+ streaming=streaming,
640
+ streaming_prefix_len=streaming_prefix_len,
641
+ )
642
+ if streaming:
643
+ out_patch_len = self.patch_size * self.chunk_size * (self.sample_rate // self._encode_sample_rate)
644
+ for latent_pred, _ in inference_result:
645
+ decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
646
+ decode_audio = decode_audio[..., -out_patch_len:].squeeze(1).cpu()
647
+ yield decode_audio
648
+ break
649
+ else:
650
+ latent_pred, pred_audio_feat = next(inference_result)
651
+ if retry_badcase:
652
+ if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
653
+ print(
654
+ f" Badcase detected, audio_text_ratio={pred_audio_feat.shape[0] / target_text_length}, retrying...",
655
+ file=sys.stderr,
656
+ )
657
+ retry_badcase_times += 1
658
+ continue
659
+ else:
660
+ break
661
+ else:
662
+ break
663
+
664
+ if not streaming:
665
+ decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
666
+ out_patch_len = self.patch_size * self.chunk_size * (self.sample_rate // self._encode_sample_rate)
667
+ has_continuation = bool(prompt_wav_path)
668
+ if has_continuation:
669
+ decode_audio = decode_audio[..., out_patch_len * (streaming_prefix_len - 1):].squeeze(1).cpu()
670
+ else:
671
+ decode_audio = decode_audio.squeeze(1).cpu()
672
+ yield decode_audio
673
+
674
+ @torch.inference_mode()
675
+ def build_prompt_cache(
676
+ self,
677
+ prompt_text: str = None,
678
+ prompt_wav_path: str = None,
679
+ reference_wav_path: str = None,
680
+ ):
681
+ """
682
+ Build prompt cache for subsequent generation.
683
+
684
+ Supports the same parameter combinations as ``generate()``:
685
+ - ``reference_wav_path`` only -> reference mode (voice cloning, isolated)
686
+ - ``prompt_text`` + ``prompt_wav_path`` -> continuation mode
687
+ - all three -> combined ref + continuation mode
688
+
689
+ Args:
690
+ prompt_text: prompt text for continuation mode.
691
+ Must be paired with ``prompt_wav_path``.
692
+ prompt_wav_path: prompt audio path for continuation mode.
693
+ Must be paired with ``prompt_text``.
694
+ reference_wav_path: reference audio path for voice cloning
695
+ (structurally isolated via ref_audio tokens).
696
+
697
+ Returns:
698
+ prompt_cache: dict used by ``_generate_with_prompt_cache``.
699
+ """
700
+ if (prompt_wav_path is None) != (prompt_text is None):
701
+ raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None")
702
+ if prompt_wav_path is None and reference_wav_path is None:
703
+ raise ValueError("At least one of prompt_wav_path or reference_wav_path must be provided")
704
+
705
+ cache = {}
706
+
707
+ if reference_wav_path:
708
+ cache["ref_audio_feat"] = self._encode_wav(reference_wav_path, padding_mode="right")
709
+
710
+ if prompt_wav_path and prompt_text is not None:
711
+ cache["prompt_text"] = prompt_text
712
+ cache["audio_feat"] = self._encode_wav(prompt_wav_path, padding_mode="left")
713
+
714
+ has_ref = "ref_audio_feat" in cache
715
+ has_prompt = "audio_feat" in cache
716
+ if has_ref and has_prompt:
717
+ cache["mode"] = "ref_continuation"
718
+ elif has_ref:
719
+ cache["mode"] = "reference"
720
+ else:
721
+ cache["mode"] = "continuation"
722
+
723
+ return cache
724
+
725
+ def merge_prompt_cache(
726
+ self,
727
+ original_cache: dict,
728
+ new_text: str,
729
+ new_audio_feat: torch.Tensor,
730
+ ):
731
+ """
732
+ Merge original prompt cache with newly generated content to stabilize voice.
733
+
734
+ Args:
735
+ original_cache: original prompt cache (any mode)
736
+ new_text: newly generated text
737
+ new_audio_feat: newly generated audio features
738
+
739
+ Returns:
740
+ merged_cache: merged cache with prompt_text and audio_feat
741
+ """
742
+ if original_cache is None:
743
+ return {
744
+ "prompt_text": new_text,
745
+ "audio_feat": new_audio_feat,
746
+ "mode": "continuation",
747
+ }
748
+ merged = {}
749
+ if "ref_audio_feat" in original_cache:
750
+ merged["ref_audio_feat"] = original_cache["ref_audio_feat"]
751
+ merged["prompt_text"] = original_cache.get("prompt_text", "") + new_text
752
+ old_feat = original_cache.get("audio_feat", new_audio_feat.new_empty(0, *new_audio_feat.shape[1:]))
753
+ merged["audio_feat"] = torch.cat([old_feat, new_audio_feat], dim=0)
754
+ merged["mode"] = "ref_continuation" if "ref_audio_feat" in merged else "continuation"
755
+ return merged
756
+
757
+ def generate_with_prompt_cache(self, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
758
+ return next(self._generate_with_prompt_cache(*args, streaming=False, **kwargs))
759
+
760
+ def generate_with_prompt_cache_streaming(
761
+ self, *args, **kwargs
762
+ ) -> Generator[Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]], None, None]:
763
+ return self._generate_with_prompt_cache(*args, streaming=True, **kwargs)
764
+
765
    @torch.inference_mode()
    def _generate_with_prompt_cache(
        self,
        target_text: str,
        prompt_cache: dict,
        min_len: int = 2,
        max_len: int = 2000,
        inference_timesteps: int = 10,
        cfg_value: float = 2.0,
        retry_badcase: bool = False,
        retry_badcase_max_times: int = 3,
        retry_badcase_ratio_threshold: float = 6.0,
        streaming: bool = False,
        streaming_prefix_len: int = 4,
    ) -> Generator[Tuple[torch.Tensor, torch.Tensor, Union[torch.Tensor, List[torch.Tensor]]], None, None]:
        """
        Generate audio using pre-built prompt cache.

        Args:
            target_text: Text to convert to speech
            prompt_cache: Cache built by ``build_prompt_cache()``. Can be None
                for zero-shot generation.
            min_len: Minimum audio length to avoid very short audio
            max_len: Maximum audio length
            inference_timesteps: Number of diffusion sampling steps
            cfg_value: Classifier-free guidance value
            retry_badcase: Whether to retry on bad cases
            retry_badcase_max_times: Maximum retry attempts
            retry_badcase_ratio_threshold: Threshold for audio-to-text ratio
            streaming: Whether to return a generator of audio chunks
            streaming_prefix_len: Number of prefix audio patches to use for streaming mode

        Returns:
            Generator of Tuple containing:
                - Decoded audio tensor for the current step if ``streaming=True``, else final decoded audio tensor
                - Tensor of new text tokens
                - New audio features up to the current step as a List if ``streaming=True``, else as a concatenated Tensor
        """
        # Retry needs the full (non-streaming) result to measure the ratio, so it
        # cannot coexist with streaming.
        if retry_badcase and streaming:
            warnings.warn("Retry on bad cases is not supported in streaming mode, setting retry_badcase=False.")
            retry_badcase = False

        # Determine mode from cache
        if prompt_cache is None:
            mode = "zero_shot"
            text = target_text
        else:
            mode = prompt_cache.get("mode", "continuation")
            if mode in ("continuation", "ref_continuation"):
                # Continuation: the LM conditions on prompt transcript + target text.
                prompt_text = prompt_cache.get("prompt_text", "")
                text = prompt_text + target_text
            else:
                text = target_text

        # Tokenize and append the audio-start sentinel token.
        text_token = torch.LongTensor(self.text_tokenizer(text))
        text_token = torch.cat(
            [
                text_token,
                torch.tensor([self.audio_start_token], dtype=torch.int32, device=text_token.device),
            ],
            dim=-1,
        )

        target_text_token = torch.LongTensor(self.text_tokenizer(target_text))
        text_length = text_token.shape[0]

        # Build the interleaved (text, audio) sequence + complementary masks.
        # Layout per mode:
        #   zero_shot/continuation: [text | prompt audio]
        #   reference:              [ref prefix | text]
        #   ref_continuation:       [ref prefix | text | prompt audio]
        if mode in ("zero_shot", "continuation"):
            prompt_audio_feat = (
                prompt_cache["audio_feat"]
                if prompt_cache
                else torch.empty((0, self.patch_size, self.audio_vae.latent_dim), dtype=torch.float32)
            )
            audio_length = prompt_audio_feat.size(0)
            # Zero tokens / zero features fill the positions covered by the other modality.
            text_pad_token = torch.zeros(audio_length, dtype=torch.int32, device=text_token.device)
            text_pad_feat = torch.zeros(
                (text_length, self.patch_size, self.audio_vae.latent_dim),
                dtype=torch.float32,
                device=text_token.device,
            )
            text_token = torch.cat([text_token, text_pad_token])
            audio_feat = torch.cat([text_pad_feat, prompt_audio_feat], dim=0)
            text_mask = torch.cat(
                [torch.ones(text_length, dtype=torch.int32), torch.zeros(audio_length, dtype=torch.int32)]
            ).to(text_token.device)
            audio_mask = torch.cat(
                [torch.zeros(text_length, dtype=torch.int32), torch.ones(audio_length, dtype=torch.int32)]
            ).to(text_token.device)

        elif mode == "reference":
            ref_audio_feat = prompt_cache["ref_audio_feat"]
            ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_audio_feat, text_token.device)
            text_pad_feat = torch.zeros(
                (text_length, self.patch_size, self.audio_vae.latent_dim),
                dtype=torch.float32,
                device=text_token.device,
            )
            text_token = torch.cat([ref_tokens, text_token])
            audio_feat = torch.cat([ref_feats, text_pad_feat], dim=0)
            text_mask = torch.cat([ref_t_mask, torch.ones(text_length, dtype=torch.int32).to(text_token.device)])
            audio_mask = torch.cat([ref_a_mask, torch.zeros(text_length, dtype=torch.int32).to(text_token.device)])

        else:
            # ref_continuation mode
            ref_audio_feat = prompt_cache["ref_audio_feat"]
            prompt_audio_feat = prompt_cache["audio_feat"]
            prompt_audio_length = prompt_audio_feat.size(0)

            ref_tokens, ref_feats, ref_t_mask, ref_a_mask = self._make_ref_prefix(ref_audio_feat, text_token.device)

            prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32, device=text_token.device)
            text_pad_feat = torch.zeros(
                (text_length, self.patch_size, self.audio_vae.latent_dim),
                dtype=torch.float32,
                device=text_token.device,
            )

            text_token = torch.cat([ref_tokens, text_token, prompt_pad_token])
            audio_feat = torch.cat([ref_feats, text_pad_feat, prompt_audio_feat], dim=0)
            text_mask = torch.cat(
                [
                    ref_t_mask,
                    torch.ones(text_length, dtype=torch.int32).to(text_token.device),
                    torch.zeros(prompt_audio_length, dtype=torch.int32).to(text_token.device),
                ]
            )
            audio_mask = torch.cat(
                [
                    ref_a_mask,
                    torch.zeros(text_length, dtype=torch.int32).to(text_token.device),
                    torch.ones(prompt_audio_length, dtype=torch.int32).to(text_token.device),
                ]
            )

        # Add batch dim and move to the model device/dtype.
        text_token = text_token.unsqueeze(0).to(self.device)
        text_mask = text_mask.unsqueeze(0).to(self.device)
        audio_feat = audio_feat.unsqueeze(0).to(self.device).to(get_dtype(self.config.dtype))
        audio_mask = audio_mask.unsqueeze(0).to(self.device)

        # run inference
        target_text_length = len(self.text_tokenizer(target_text))
        retry_badcase_times = 0
        while retry_badcase_times < retry_badcase_max_times:
            inference_result = self._inference(
                text_token,
                text_mask,
                audio_feat,
                audio_mask,
                min_len=min_len,
                # Cap length by the badcase ratio so runaway generations stop early.
                max_len=min(int(target_text_length * retry_badcase_ratio_threshold + 10), max_len),
                inference_timesteps=inference_timesteps,
                cfg_value=cfg_value,
                streaming=streaming,
                streaming_prefix_len=streaming_prefix_len,
            )
            if streaming:
                # Samples produced per latent patch after VAE decoding + resampling.
                out_patch_len = self.patch_size * self.chunk_size * (self.sample_rate // self._encode_sample_rate)
                for latent_pred, pred_audio_feat in inference_result:
                    decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
                    # Only the newest patch is emitted; earlier patches were context.
                    decode_audio = decode_audio[..., -out_patch_len:].squeeze(1).cpu()
                    yield (decode_audio, target_text_token, pred_audio_feat)
                break
            else:
                latent_pred, pred_audio_feat = next(inference_result)
                if retry_badcase:
                    # An overly long audio-to-text ratio signals a degenerate sample.
                    if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
                        print(
                            f" Badcase detected, audio_text_ratio={pred_audio_feat.shape[0] / target_text_length}, retrying...",
                            file=sys.stderr,
                        )
                        retry_badcase_times += 1
                        continue
                    else:
                        break
                else:
                    break
        if not streaming:
            decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
            out_patch_len = self.patch_size * self.chunk_size * (self.sample_rate // self._encode_sample_rate)
            if mode in ("continuation", "ref_continuation"):
                # Drop the decoded prompt-context patches that were only prepended
                # for smooth decoding.
                decode_audio = decode_audio[..., out_patch_len * (streaming_prefix_len - 1) :].squeeze(1).cpu()
            else:
                decode_audio = decode_audio[..., :].squeeze(1).cpu()
            yield (decode_audio, target_text_token, pred_audio_feat)
948
+
949
+ def inference(self, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
950
+ return next(self._inference(*args, streaming=False, **kwargs))
951
+
952
+ def inference_streaming(self, *args, **kwargs) -> Generator[Tuple[torch.Tensor, List[torch.Tensor]], None, None]:
953
+ return self._inference(*args, streaming=True, **kwargs)
954
+
955
    @torch.inference_mode()
    def _inference(
        self,
        text: torch.Tensor,
        text_mask: torch.Tensor,
        feat: torch.Tensor,
        feat_mask: torch.Tensor,
        min_len: int = 2,
        max_len: int = 2000,
        inference_timesteps: int = 10,
        cfg_value: float = 2.0,
        streaming: bool = False,
        streaming_prefix_len: int = 4,
    ) -> Generator[Tuple[torch.Tensor, Union[torch.Tensor, List[torch.Tensor]]], None, None]:
        """Core inference method for audio generation.

        This is the main inference loop that generates audio features
        using the language model and diffusion transformer.

        Args:
            text: Input text tokens
            text_mask: Mask for text tokens
            feat: Input audio features
            feat_mask: Mask for audio features
            min_len: Minimum generation length
            max_len: Maximum generation length
            inference_timesteps: Number of diffusion steps
            cfg_value: Classifier-free guidance value
            streaming: Whether to yield each step latent feature or just the final result

        Returns:
            Generator of Tuple containing:
                - Predicted latent feature at the current step if ``streaming=True``, else final latent features
                - Predicted audio feature sequence so far as a List if ``streaming=True``, else as a concatenated Tensor
        """
        # feat is (batch, time, patch, latent_dim).
        B, T, P, D = feat.shape

        feat_embed = self.feat_encoder(feat)  # [b, t, h_feat]
        feat_embed = self.enc_to_lm_proj(feat_embed)

        # muP scaling of the token embeddings when the LM was trained with it.
        if self.config.lm_config.use_mup:
            scale_emb = self.config.lm_config.scale_emb
        else:
            scale_emb = 1.0

        # Per-position blend: each position is either a text embedding or an
        # audio-feature embedding (masks are complementary).
        text_embed = self.base_lm.embed_tokens(text) * scale_emb
        combined_embed = text_mask.unsqueeze(-1) * text_embed + feat_mask.unsqueeze(-1) * feat_embed

        prefix_feat_cond = feat[:, -1, ...]  # b, p, d
        pred_feat_seq = []  # b, t, p, d
        curr_embed = None

        # Prepare prompt context patches for streaming mode
        # - Continuation modes (feat_mask ends with 1): use the last (streaming_prefix_len - 1)
        #   trailing audio patches as initial context so the VAE can decode smoothly.
        # - Reference-only / zero-shot (feat_mask ends with 0): start from scratch.
        has_continuation_audio = feat_mask[0, -1].item() == 1
        if has_continuation_audio:
            audio_indices = feat_mask.squeeze(0).nonzero(as_tuple=True)[0]
            context_len = min(streaming_prefix_len - 1, len(audio_indices))
            last_audio_indices = audio_indices[-context_len:]
            pred_feat_seq = list(feat[:, last_audio_indices, :, :].split(1, dim=1))
        else:
            pred_feat_seq = []

        # Full prefix pass through the base LM; cache keys/values for the
        # incremental forward_step calls below.
        enc_outputs, kv_cache_tuple = self.base_lm(
            inputs_embeds=combined_embed,
            is_causal=True,
        )
        self.base_lm.kv_cache.fill_caches(kv_cache_tuple)

        # Quantize (FSQ) only the audio positions; text positions pass through.
        enc_outputs = self.fsq_layer(enc_outputs) * feat_mask.unsqueeze(-1) + enc_outputs * text_mask.unsqueeze(-1)
        lm_hidden = enc_outputs[:, -1, :]

        # Residual LM consumes the fused (LM output, audio embedding) stream.
        residual_enc_inputs = self.fusion_concat_proj(
            torch.cat((enc_outputs, feat_mask.unsqueeze(-1) * feat_embed), dim=-1)
        )
        residual_enc_outputs, residual_kv_cache_tuple = self.residual_lm(
            inputs_embeds=residual_enc_inputs,
            is_causal=True,
        )
        self.residual_lm.kv_cache.fill_caches(residual_kv_cache_tuple)
        residual_hidden = residual_enc_outputs[:, -1, :]

        # Autoregressive loop: one latent patch per step, denoised by the DiT.
        for i in tqdm(range(max_len)):
            dit_hidden_1 = self.lm_to_dit_proj(lm_hidden)  # [b, h_dit]
            dit_hidden_2 = self.res_to_dit_proj(residual_hidden)  # [b, h_dit]
            dit_hidden = torch.cat((dit_hidden_1, dit_hidden_2), dim=-1)

            # Diffusion decoder predicts the next patch, conditioned on the
            # previous one (prefix_feat_cond) and the LM state (mu).
            pred_feat = self.feat_decoder(
                mu=dit_hidden,
                patch_size=self.patch_size,
                cond=prefix_feat_cond.transpose(1, 2).contiguous(),
                n_timesteps=inference_timesteps,
                cfg_value=cfg_value,
            ).transpose(
                1, 2
            )  # [b, p, d]

            curr_embed = self.feat_encoder(pred_feat.unsqueeze(1))  # b, 1, c
            curr_embed = self.enc_to_lm_proj(curr_embed)

            pred_feat_seq.append(pred_feat.unsqueeze(1))  # b, 1, p, d
            prefix_feat_cond = pred_feat

            if streaming:
                # return the last three predicted latent features to provide enough context for smooth decoding
                pred_feat_chunk = torch.cat(pred_feat_seq[-streaming_prefix_len:], dim=1)
                feat_pred = rearrange(pred_feat_chunk, "b t p d -> b d (t p)", b=B, p=self.patch_size)

                yield feat_pred, pred_feat_seq

            # Stop classifier on the current LM state; min_len guards very short output.
            stop_flag = self.stop_head(self.stop_actn(self.stop_proj(lm_hidden))).argmax(dim=-1)[0].cpu().item()
            if i > min_len and stop_flag == 1:
                break

            # Incremental LM step with the freshly encoded patch.
            lm_hidden = self.base_lm.forward_step(
                curr_embed[:, 0, :], torch.tensor([self.base_lm.kv_cache.step()], device=curr_embed.device)
            ).clone()

            lm_hidden = self.fsq_layer(lm_hidden)
            curr_residual_input = self.fusion_concat_proj(torch.cat((lm_hidden, curr_embed[:, 0, :]), dim=-1))
            residual_hidden = self.residual_lm.forward_step(
                curr_residual_input, torch.tensor([self.residual_lm.kv_cache.step()], device=curr_embed.device)
            ).clone()

        if not streaming:
            pred_feat_seq = torch.cat(pred_feat_seq, dim=1)  # b, t, p, d
            feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size)
            yield feat_pred, pred_feat_seq.squeeze(0).cpu()
1085
+
1086
+ @classmethod
1087
+ def from_local(cls, path: str, optimize: bool = True, training: bool = False, lora_config: LoRAConfig = None):
1088
+ config = VoxCPMConfig.model_validate_json(open(os.path.join(path, "config.json")).read())
1089
+ tokenizer = LlamaTokenizerFast.from_pretrained(path)
1090
+ audio_vae_config = getattr(config, "audio_vae_config", None)
1091
+ audio_vae = AudioVAEV2(config=audio_vae_config) if audio_vae_config else AudioVAEV2()
1092
+ # Try to load AudioVAE from safetensors first, fallback to pytorch
1093
+ audiovae_safetensors_path = os.path.join(path, "audiovae.safetensors")
1094
+ audiovae_pth_path = os.path.join(path, "audiovae.pth")
1095
+ if os.path.exists(audiovae_safetensors_path) and SAFETENSORS_AVAILABLE:
1096
+ print(f"Loading AudioVAE from safetensors: {audiovae_safetensors_path}", file=sys.stderr)
1097
+ vae_state_dict = load_file(audiovae_safetensors_path, device="cpu")
1098
+ elif os.path.exists(audiovae_pth_path):
1099
+ print(f"Loading AudioVAE from pytorch: {audiovae_pth_path}", file=sys.stderr)
1100
+ checkpoint = torch.load(
1101
+ audiovae_pth_path,
1102
+ map_location="cpu",
1103
+ weights_only=True,
1104
+ )
1105
+ vae_state_dict = checkpoint.get("state_dict", checkpoint)
1106
+ else:
1107
+ raise FileNotFoundError(
1108
+ f"AudioVAE checkpoint not found. Expected either {audiovae_safetensors_path} or {audiovae_pth_path}"
1109
+ )
1110
+ model = cls(config, tokenizer, audio_vae, lora_config)
1111
+ if not training:
1112
+ lm_dtype = get_dtype(model.config.dtype)
1113
+ model = model.to(lm_dtype)
1114
+ else: # training mode
1115
+ for name, param in model.named_parameters():
1116
+ if "audio_vae" in name: # freeze VAE weights
1117
+ param.requires_grad = False
1118
+ continue
1119
+ if lora_config is not None:
1120
+ if "lora" not in name: # freeze non-LoRA weights
1121
+ param.requires_grad = False
1122
+ model.audio_vae = model.audio_vae.to(torch.float32)
1123
+
1124
+ # Try to load from safetensors first, fallback to pytorch_model.bin
1125
+ safetensors_path = os.path.join(path, "model.safetensors")
1126
+ pytorch_model_path = os.path.join(path, "pytorch_model.bin")
1127
+
1128
+ if os.path.exists(safetensors_path) and SAFETENSORS_AVAILABLE:
1129
+ print(f"Loading model from safetensors: {safetensors_path}", file=sys.stderr)
1130
+ model_state_dict = load_file(safetensors_path)
1131
+ elif os.path.exists(pytorch_model_path):
1132
+ print(f"Loading model from pytorch_model.bin: {pytorch_model_path}", file=sys.stderr)
1133
+ checkpoint = torch.load(
1134
+ pytorch_model_path,
1135
+ map_location="cpu",
1136
+ weights_only=True,
1137
+ )
1138
+ model_state_dict = checkpoint.get("state_dict", checkpoint)
1139
+ else:
1140
+ raise FileNotFoundError(f"Model file not found. Expected either {safetensors_path} or {pytorch_model_path}")
1141
+
1142
+ for kw, val in vae_state_dict.items():
1143
+ model_state_dict[f"audio_vae.{kw}"] = val
1144
+
1145
+ # LoRALinear keeps weight/bias compatible with nn.Linear but adds
1146
+ # lora_A/lora_B, which are absent from base pretrained checkpoints.
1147
+ model.load_state_dict(model_state_dict, strict=False)
1148
+ if training:
1149
+ return model
1150
+ return model.to(model.device).eval().optimize(disable=not optimize)
1151
+
1152
+ # ------------------------------------------------------------------ #
1153
+ # LoRA Weight Management
1154
+ # ------------------------------------------------------------------ #
1155
+ def _iter_lora_modules(self):
1156
+ """Iterate over all LoRA modules."""
1157
+ from ..modules.layers.lora import LoRALinear
1158
+
1159
+ for module in self.modules():
1160
+ if isinstance(module, LoRALinear):
1161
+ yield module
1162
+
1163
    def load_lora_weights(self, lora_path: str, device: str = None):
        """
        Load LoRA weights from file, supports calling after torch.compile.
        Uses named_parameters() to handle compile's _orig_mod wrapper.
        Supports both safetensors and pytorch formats.

        Args:
            lora_path: Checkpoint path (directory or .safetensors/.ckpt file)
            device: Target device, defaults to model's current device
        Returns:
            tuple: (loaded_keys, skipped_keys)
        """
        from pathlib import Path

        # NOTE(review): self.device may be a torch.device; safetensors'
        # load_file expects a device *string* — confirm callers pass str here.
        device = device or self.device
        lora_p = Path(lora_path)

        # Try safetensors first, then fallback to .ckpt
        if lora_p.is_dir():
            safetensors_file = lora_p / "lora_weights.safetensors"
            ckpt_file = lora_p / "lora_weights.ckpt"
        else:
            # An explicit file path selects its format by extension.
            safetensors_file = lora_p if lora_p.suffix == ".safetensors" else None
            ckpt_file = lora_p if lora_p.suffix in [".ckpt", ".pth"] else None

        # Load from safetensors if available
        if safetensors_file and safetensors_file.exists() and SAFETENSORS_AVAILABLE:
            state_dict = load_file(str(safetensors_file), device=device)
        elif ckpt_file and ckpt_file.exists():
            # SECURITY: weights_only=False unpickles arbitrary objects — only
            # load .ckpt/.pth files from trusted sources.
            ckpt = torch.load(ckpt_file, map_location=device, weights_only=False)
            state_dict = ckpt.get("state_dict", ckpt)
        else:
            raise FileNotFoundError(f"LoRA checkpoint not found. Expected either {safetensors_file} or {ckpt_file}")

        # Build param mapping (handle torch.compile's _orig_mod prefix)
        model_params = dict(self.named_parameters())
        key_mapping = {k.replace("._orig_mod.", "."): k for k in model_params if "._orig_mod." in k}

        loaded_keys, skipped_keys = [], []
        for key, value in state_dict.items():
            # Accept either the plain key or its compiled (_orig_mod) alias.
            target_key = key if key in model_params else key_mapping.get(key)
            if target_key:
                # In-place copy keeps optimizer/compile references intact.
                model_params[target_key].data.copy_(value.to(device))
                loaded_keys.append(key)
            else:
                skipped_keys.append(key)

        return loaded_keys, skipped_keys
1211
+
1212
+ def set_lora_enabled(self, enabled: bool):
1213
+ """Enable/disable all LoRA layers."""
1214
+ for module in self._iter_lora_modules():
1215
+ module.set_enabled(enabled)
1216
+
1217
+ def reset_lora_weights(self):
1218
+ """Reset all LoRA weights (A: kaiming, B: zeros), effectively unloading LoRA."""
1219
+ for module in self._iter_lora_modules():
1220
+ module.reset_lora_parameters()
1221
+
1222
+ def get_lora_state_dict(self) -> dict:
1223
+ """Get all LoRA parameters (lora_A/lora_B)."""
1224
+ return {name: param.data.clone() for name, param in self.named_parameters() if "lora_" in name}
voxcpm/modules/__init__.py ADDED
File without changes
voxcpm/modules/audiovae/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .audio_vae import AudioVAE, AudioVAEConfig
2
+ from .audio_vae_v2 import AudioVAE as AudioVAEV2, AudioVAEConfig as AudioVAEConfigV2
voxcpm/modules/audiovae/audio_vae.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import List
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn
7
+ import torch.nn.functional as F
8
+ from torch.nn.utils import weight_norm
9
+ from pydantic import BaseModel
10
+
11
+
12
def WNConv1d(*args, **kwargs):
    """Build a 1-D convolution wrapped in weight normalization."""
    conv = nn.Conv1d(*args, **kwargs)
    return weight_norm(conv)
14
+
15
+
16
def WNConvTranspose1d(*args, **kwargs):
    """Build a 1-D transposed convolution wrapped in weight normalization."""
    deconv = nn.ConvTranspose1d(*args, **kwargs)
    return weight_norm(deconv)
18
+
19
+
20
class CausalConv1d(nn.Conv1d):
    """Conv1d that pads only on the left, so no output depends on future samples.

    The ``padding`` argument is intercepted (not forwarded to ``nn.Conv1d``)
    and applied as ``2 * padding`` zeros on the left side in ``forward``.
    """

    def __init__(self, *args, padding: int = 0, **kwargs):
        super().__init__(*args, **kwargs)
        self.__padding = padding

    def forward(self, x):
        # All padding goes to the left; zero right padding keeps causality.
        padded = F.pad(x, (self.__padding * 2, 0))
        return super().forward(padded)
28
+
29
+
30
class CausalTransposeConv1d(nn.ConvTranspose1d):
    """Transposed Conv1d trimmed on the right so the mapping stays causal.

    ``padding``/``output_padding`` are intentionally NOT forwarded to the
    underlying ``nn.ConvTranspose1d``; instead the full-length output is
    computed and its trailing ``2 * padding - output_padding`` samples are
    cut off.
    """

    def __init__(self, *args, padding: int = 0, output_padding: int = 0, **kwargs):
        super().__init__(*args, **kwargs)
        self.__padding = padding
        self.__output_padding = output_padding

    def forward(self, x):
        out = super().forward(x)
        trim = self.__padding * 2 - self.__output_padding
        # Fix: the original slice `[..., :-(trim)]` returned an EMPTY tensor
        # when trim == 0 (e.g. default padding=0, output_padding=0), because
        # `[..., :-0]` is `[..., :0]`. Guard the no-trim case explicitly.
        return out[..., :-trim] if trim > 0 else out
38
+
39
+
40
def WNCausalConv1d(*args, **kwargs):
    """Build a causal Conv1d and wrap it in weight normalization."""
    causal_conv = CausalConv1d(*args, **kwargs)
    return weight_norm(causal_conv)
42
+
43
+
44
def WNCausalTransposeConv1d(*args, **kwargs):
    """Build a causal transposed Conv1d and wrap it in weight normalization."""
    causal_deconv = CausalTransposeConv1d(*args, **kwargs)
    return weight_norm(causal_deconv)
46
+
47
+
48
# Scripting this brings model speed up 1.4x
@torch.jit.script
def snake(x, alpha):
    # Snake activation: x + (1/alpha) * sin^2(alpha * x), applied elementwise
    # with a per-channel alpha that broadcasts over batch and time.
    shape = x.shape
    # Flatten everything after the channel dim so alpha (1, C, 1) broadcasts,
    # then restore the original shape afterwards.
    x = x.reshape(shape[0], shape[1], -1)
    # 1e-9 guards the reciprocal against alpha values at/near zero.
    x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
    x = x.reshape(shape)
    return x
56
+
57
+
58
class Snake1d(nn.Module):
    """Snake activation with one learnable alpha per channel."""

    def __init__(self, channels):
        super().__init__()
        # Shape (1, C, 1) so alpha broadcasts over batch and time.
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x):
        # Delegates to the TorchScript-compiled `snake` for speed.
        return snake(x, self.alpha)
65
+
66
+
67
def init_weights(m):
    """Initialize Conv1d weights with a truncated normal and zero the bias.

    Intended for use with ``module.apply(init_weights)``; non-Conv1d modules
    are left untouched.
    """
    if not isinstance(m, nn.Conv1d):
        return
    nn.init.trunc_normal_(m.weight, std=0.02)
    if m.bias is not None:
        nn.init.constant_(m.bias, 0)
72
+
73
+
74
class CausalResidualUnit(nn.Module):
    """Residual block: Snake -> dilated causal conv -> Snake -> 1x1 causal conv, plus skip.

    Args:
        dim: channel count (input and output are identical).
        dilation: dilation factor of the inner convolution.
        kernel: kernel size of the inner convolution.
        groups: conv groups (pass ``dim`` for a depthwise convolution).
    """

    def __init__(self, dim: int = 16, dilation: int = 1, kernel: int = 7, groups: int = 1):
        super().__init__()
        # Fix: padding was computed from a hard-coded 7 instead of the actual
        # `kernel` parameter, silently mis-padding any non-default kernel.
        # Identical for the default kernel=7 used throughout this file.
        pad = ((kernel - 1) * dilation) // 2
        self.block = nn.Sequential(
            Snake1d(dim),
            WNCausalConv1d(
                dim,
                dim,
                kernel_size=kernel,
                dilation=dilation,
                padding=pad,
                groups=groups,
            ),
            Snake1d(dim),
            WNCausalConv1d(dim, dim, kernel_size=1),
        )

    def forward(self, x):
        y = self.block(x)
        # The causal convs are configured to preserve length, so no center
        # cropping should ever be needed; the assert documents that invariant.
        pad = (x.shape[-1] - y.shape[-1]) // 2
        assert pad == 0
        if pad > 0:
            x = x[..., pad:-pad]
        return x + y
99
+
100
+
101
class CausalEncoderBlock(nn.Module):
    """Three increasingly dilated residual units followed by a strided downsampling conv."""

    def __init__(self, output_dim: int = 16, input_dim=None, stride: int = 1, groups=1):
        super().__init__()
        # By convention each encoder block doubles the channel count, so the
        # default input width is half the output width.
        input_dim = input_dim or output_dim // 2
        self.block = nn.Sequential(
            # Dilations 1/3/9 widen the receptive field geometrically.
            CausalResidualUnit(input_dim, dilation=1, groups=groups),
            CausalResidualUnit(input_dim, dilation=3, groups=groups),
            CausalResidualUnit(input_dim, dilation=9, groups=groups),
            Snake1d(input_dim),
            # Downsample by `stride` while projecting input_dim -> output_dim.
            WNCausalConv1d(
                input_dim,
                output_dim,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
            ),
        )

    def forward(self, x):
        return self.block(x)
121
+
122
+
123
class CausalEncoder(nn.Module):
    """Causal convolutional encoder producing Gaussian posterior parameters.

    A stem convolution is followed by one CausalEncoderBlock per stride
    (doubling channels at each downsampling stage), then two heads map the
    hidden state to `mu` and `logvar`.
    """

    def __init__(
        self,
        d_model: int = 64,
        latent_dim: int = 32,
        strides: list = [2, 4, 8, 8],
        depthwise: bool = False,
    ):
        super().__init__()
        # Stem convolution from the mono waveform to d_model channels.
        layers = [WNCausalConv1d(1, d_model, kernel_size=7, padding=3)]

        # Encoder blocks double channels while downsampling by each stride.
        for stride in strides:
            d_model *= 2
            groups = d_model // 2 if depthwise else 1
            layers.append(CausalEncoderBlock(output_dim=d_model, stride=stride, groups=groups))

        # FIX: removed a dead post-loop assignment
        # (`groups = d_model if depthwise else 1`) that was never read.
        self.block = nn.Sequential(*layers)

        # Two convolution heads for the Gaussian posterior (mu, logvar).
        self.fc_mu = WNCausalConv1d(d_model, latent_dim, kernel_size=3, padding=1)
        self.fc_logvar = WNCausalConv1d(d_model, latent_dim, kernel_size=3, padding=1)

        self.enc_dim = d_model

    def forward(self, x):
        """Encode a waveform; returns the hidden state plus mu/logvar."""
        hidden_state = self.block(x)
        return {
            "hidden_state": hidden_state,
            "mu": self.fc_mu(hidden_state),
            "logvar": self.fc_logvar(hidden_state),
        }
158
+
159
+
160
class NoiseBlock(nn.Module):
    """Adds input-conditioned Gaussian noise: x + randn * (1x1 conv of x)."""

    def __init__(self, dim):
        super().__init__()
        # Bias-free 1x1 conv produces the per-channel noise gate.
        self.linear = WNCausalConv1d(dim, dim, kernel_size=1, bias=False)

    def forward(self, x):
        batch, _, steps = x.shape
        # One shared noise channel, broadcast across feature channels.
        noise = torch.randn((batch, 1, steps), device=x.device, dtype=x.dtype)
        gate = self.linear(x)
        return x + noise * gate
172
+
173
+
174
class CausalDecoderBlock(nn.Module):
    """Strided transposed upsampling conv, optional noise injection, then
    three dilated residual units."""

    def __init__(
        self,
        input_dim: int = 16,
        output_dim: int = 8,
        stride: int = 1,
        groups=1,
        use_noise_block: bool = False,
    ):
        super().__init__()
        modules = [
            Snake1d(input_dim),
            WNCausalTransposeConv1d(
                input_dim,
                output_dim,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
                output_padding=stride % 2,
            ),
        ]
        if use_noise_block:
            modules.append(NoiseBlock(output_dim))
        for dilation in (1, 3, 9):
            modules.append(CausalResidualUnit(output_dim, dilation=dilation, groups=groups))
        self.block = nn.Sequential(*modules)

    def forward(self, x):
        return self.block(x)
208
+
209
+
210
class TransposeLastTwoDim(torch.nn.Module):
    """Swaps the last two dimensions of the input tensor."""

    def forward(self, x):
        return x.transpose(-1, -2)
213
+
214
+
215
class CausalDecoder(nn.Module):
    """Causal convolutional decoder: latent sequence -> waveform in [-1, 1].

    Mirrors CausalEncoder: a stem convolution, one upsampling
    CausalDecoderBlock per rate (halving channels each stage), then a final
    convolution squashed by tanh.
    """

    def __init__(
        self,
        input_channel,
        channels,
        rates,
        depthwise: bool = False,
        d_out: int = 1,
        use_noise_block: bool = False,
    ):
        super().__init__()

        # Add first conv layer (depthwise variant factorizes into a grouped
        # 7-tap conv followed by a 1x1 channel mixer).
        if depthwise:
            layers = [
                WNCausalConv1d(
                    input_channel,
                    input_channel,
                    kernel_size=7,
                    padding=3,
                    groups=input_channel,
                ),
                WNCausalConv1d(input_channel, channels, kernel_size=1),
            ]
        else:
            layers = [WNCausalConv1d(input_channel, channels, kernel_size=7, padding=3)]

        # Add upsampling + MRF blocks; channels halve at each stage.
        for i, stride in enumerate(rates):
            input_dim = channels // 2**i
            output_dim = channels // 2 ** (i + 1)
            groups = output_dim if depthwise else 1
            layers += [
                CausalDecoderBlock(
                    input_dim,
                    output_dim,
                    stride,
                    groups=groups,
                    use_noise_block=use_noise_block,
                )
            ]

        # Add final conv layer.
        # NOTE(review): `output_dim` here is the loop variable left over from
        # the last rate; an empty `rates` would raise NameError — presumably
        # never the case for this model.
        layers += [
            Snake1d(output_dim),
            WNCausalConv1d(output_dim, d_out, kernel_size=7, padding=3),
            nn.Tanh(),
        ]

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)
268
+
269
+
270
class AudioVAEConfig(BaseModel):
    """Hyperparameters for the causal audio VAE (16 kHz variant)."""

    encoder_dim: int = 128                     # stem width of the encoder
    encoder_rates: List[int] = [2, 5, 8, 8]    # per-stage downsampling strides
    latent_dim: int = 64                       # channels of the VAE latent
    decoder_dim: int = 1536                    # stem width of the decoder
    decoder_rates: List[int] = [8, 8, 5, 2]    # per-stage upsampling rates
    depthwise: bool = True                     # use grouped (depthwise) convolutions
    sample_rate: int = 16000                   # expected input sample rate
    use_noise_block: bool = False              # inject conditioned noise in decoder blocks
279
+
280
+
281
class AudioVAE(nn.Module):
    """Causal variational autoencoder over raw waveforms.

    Encodes mono audio into a latent sequence downsampled by
    prod(encoder_rates) and decodes it back to a waveform in [-1, 1].

    Args:
        config: AudioVAEConfig with all model hyperparameters; defaults are
            used when None is given.
    """

    def __init__(
        self,
        config: AudioVAEConfig = None,
    ):
        # Fall back to the default configuration when none is provided.
        if config is None:
            config = AudioVAEConfig()

        super().__init__()

        encoder_dim = config.encoder_dim
        encoder_rates = config.encoder_rates
        latent_dim = config.latent_dim
        decoder_dim = config.decoder_dim
        decoder_rates = config.decoder_rates
        depthwise = config.depthwise
        sample_rate = config.sample_rate
        use_noise_block = config.use_noise_block

        self.encoder_dim = encoder_dim
        self.encoder_rates = encoder_rates
        self.decoder_dim = decoder_dim
        self.decoder_rates = decoder_rates
        self.depthwise = depthwise

        self.use_noise_block = use_noise_block

        # Derive the latent width from the encoder when not specified.
        if latent_dim is None:
            latent_dim = encoder_dim * (2 ** len(encoder_rates))

        self.latent_dim = latent_dim
        # Total downsampling factor: input samples per latent frame.
        self.hop_length = np.prod(encoder_rates)
        self.encoder = CausalEncoder(
            encoder_dim,
            latent_dim,
            encoder_rates,
            depthwise=depthwise,
        )

        self.decoder = CausalDecoder(
            latent_dim,
            decoder_dim,
            decoder_rates,
            depthwise=depthwise,
            use_noise_block=use_noise_block,
        )
        self.sample_rate = sample_rate
        self.chunk_size = math.prod(encoder_rates)

    def preprocess(self, audio_data, sample_rate):
        """Right-pad the waveform so its length is a multiple of hop_length."""
        if sample_rate is None:
            sample_rate = self.sample_rate
        assert sample_rate == self.sample_rate
        pad_to = self.hop_length
        length = audio_data.shape[-1]
        right_pad = math.ceil(length / pad_to) * pad_to - length
        audio_data = nn.functional.pad(audio_data, (0, right_pad))

        return audio_data

    def decode(self, z: torch.Tensor):
        """Decode given latent codes and return audio data.

        Parameters
        ----------
        z : Tensor[B x D x T]
            Continuous latent representation of the input.

        Returns
        -------
        Tensor[B x 1 x length]
            Decoded audio data in [-1, 1].
        """
        return self.decoder(z)

    def encode(self, audio_data: torch.Tensor, sample_rate: int):
        """
        Args:
            audio_data: Tensor[B x 1 x T] (a 2-D tensor gets a channel dim added)
            sample_rate: int, must match self.sample_rate
        Returns:
            z: Tensor[B x D x T'] — the posterior mean `mu`
        """
        if audio_data.ndim == 2:
            audio_data = audio_data.unsqueeze(1)

        audio_data = self.preprocess(audio_data, sample_rate)
        return self.encoder(audio_data)["mu"]
voxcpm/modules/audiovae/audio_vae_v2.py ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import List, Optional
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn
7
+ import torch.nn.functional as F
8
+ from torch.nn.utils import weight_norm
9
+ from pydantic import BaseModel
10
+
11
+
12
def WNConv1d(*args, **kwargs):
    """nn.Conv1d wrapped with weight normalization."""
    conv = nn.Conv1d(*args, **kwargs)
    return weight_norm(conv)
14
+
15
+
16
def WNConvTranspose1d(*args, **kwargs):
    """nn.ConvTranspose1d wrapped with weight normalization."""
    tconv = nn.ConvTranspose1d(*args, **kwargs)
    return weight_norm(tconv)
18
+
19
+
20
class CausalConv1d(nn.Conv1d):
    """Conv1d that left-pads the input so outputs never see the future.

    The `padding` / `output_padding` keyword arguments are intercepted (NOT
    forwarded to nn.Conv1d); the causal left padding applied in forward() is
    padding * 2 - output_padding samples.
    """

    def __init__(self, *args, padding: int = 0, output_padding: int = 0, **kwargs):
        super().__init__(*args, **kwargs)
        # Precompute the left-pad amount once at construction time.
        self._left_pad = padding * 2 - output_padding

    def forward(self, x):
        return super().forward(F.pad(x, (self._left_pad, 0)))
29
+
30
+
31
class CausalTransposeConv1d(nn.ConvTranspose1d):
    """ConvTranspose1d trimmed on the right to keep the output causal.

    The `padding` / `output_padding` keyword arguments are intercepted (NOT
    forwarded to nn.ConvTranspose1d); after the full transposed convolution
    the output is trimmed by padding * 2 - output_padding samples on the
    right.
    """

    def __init__(self, *args, padding: int = 0, output_padding: int = 0, **kwargs):
        super().__init__(*args, **kwargs)
        self.__padding = padding
        self.__output_padding = output_padding

    def forward(self, x):
        out = super().forward(x)
        trim = self.__padding * 2 - self.__output_padding
        # BUG FIX: when trim == 0 the original slice `[..., :-(0)]` evaluated
        # to `[..., :0]` and silently returned an EMPTY tensor. Guard it so a
        # zero trim is a no-op (all existing call sites use trim > 0, so
        # their behavior is unchanged).
        return out[..., :-trim] if trim > 0 else out
39
+
40
+
41
def WNCausalConv1d(*args, **kwargs):
    """Build a CausalConv1d and wrap it with weight normalization."""
    conv = CausalConv1d(*args, **kwargs)
    return weight_norm(conv)
43
+
44
+
45
def WNCausalTransposeConv1d(*args, **kwargs):
    """Build a CausalTransposeConv1d and wrap it with weight normalization."""
    tconv = CausalTransposeConv1d(*args, **kwargs)
    return weight_norm(tconv)
47
+
48
+
49
# Scripting this brings model speed up 1.4x
@torch.jit.script
def snake(x, alpha):
    """Snake activation: x + sin^2(alpha * x) / alpha, applied elementwise."""
    original_shape = x.shape
    # Flatten trailing dims so alpha (1, C, 1) broadcasts over time.
    flat = x.reshape(original_shape[0], original_shape[1], -1)
    # 1e-9 keeps the reciprocal finite if alpha ever reaches zero.
    flat = flat + (alpha + 1e-9).reciprocal() * torch.sin(alpha * flat).pow(2)
    return flat.reshape(original_shape)
57
+
58
+
59
class Snake1d(nn.Module):
    """Snake activation module with a learnable per-channel alpha."""

    def __init__(self, channels):
        super().__init__()
        # One frequency parameter per channel, broadcast over batch and time.
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x):
        return snake(x, self.alpha)
66
+
67
+
68
+ def init_weights(m):
69
+ if isinstance(m, nn.Conv1d):
70
+ nn.init.trunc_normal_(m.weight, std=0.02)
71
+ if m.bias is not None:
72
+ nn.init.constant_(m.bias, 0)
73
+
74
+
75
class CausalResidualUnit(nn.Module):
    """Residual block: Snake -> dilated causal conv -> Snake -> 1x1 causal conv.

    All convolutions are causal (fully left-padded), so the block preserves
    temporal length and the residual add needs no cropping.
    """

    def __init__(self, dim: int = 16, dilation: int = 1, kernel: int = 7, groups: int = 1):
        super().__init__()
        # BUG FIX: padding was previously computed from a hard-coded kernel
        # size of 7 (`((7 - 1) * dilation) // 2`), silently ignoring the
        # `kernel` argument. Use the actual kernel size; identical for the
        # default kernel=7, so existing callers are unaffected.
        pad = ((kernel - 1) * dilation) // 2
        self.block = nn.Sequential(
            Snake1d(dim),
            WNCausalConv1d(
                dim,
                dim,
                kernel_size=kernel,
                dilation=dilation,
                padding=pad,
                groups=groups,
            ),
            Snake1d(dim),
            WNCausalConv1d(dim, dim, kernel_size=1),
        )

    def forward(self, x):
        """Apply the block and add the residual."""
        y = self.block(x)
        # Causal convolutions preserve length; the assert guards that
        # invariant (the crop branch below is then never taken).
        pad = (x.shape[-1] - y.shape[-1]) // 2
        assert pad == 0
        if pad > 0:
            x = x[..., pad:-pad]
        return x + y
100
+
101
+
102
class CausalEncoderBlock(nn.Module):
    """Three dilated residual units followed by a strided downsampling conv."""

    def __init__(self, output_dim: int = 16, input_dim=None, stride: int = 1, groups=1):
        super().__init__()
        # By convention each block doubles the channel count.
        input_dim = input_dim or output_dim // 2
        units = [
            CausalResidualUnit(input_dim, dilation=d, groups=groups)
            for d in (1, 3, 9)
        ]
        self.block = nn.Sequential(
            *units,
            Snake1d(input_dim),
            WNCausalConv1d(
                input_dim,
                output_dim,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
                output_padding=stride % 2,
            ),
        )

    def forward(self, x):
        return self.block(x)
123
+
124
+
125
class CausalEncoder(nn.Module):
    """Causal convolutional encoder producing Gaussian posterior parameters.

    A stem convolution is followed by one CausalEncoderBlock per stride
    (doubling channels at each downsampling stage), then two heads map the
    hidden state to `mu` and `logvar`.
    """

    def __init__(
        self,
        d_model: int = 64,
        latent_dim: int = 32,
        strides: list = [2, 4, 8, 8],
        depthwise: bool = False,
    ):
        super().__init__()
        # Stem convolution from the mono waveform to d_model channels.
        layers = [WNCausalConv1d(1, d_model, kernel_size=7, padding=3)]

        # Encoder blocks double channels while downsampling by each stride.
        for stride in strides:
            d_model *= 2
            groups = d_model // 2 if depthwise else 1
            layers.append(CausalEncoderBlock(output_dim=d_model, stride=stride, groups=groups))

        # FIX: removed a dead post-loop assignment
        # (`groups = d_model if depthwise else 1`) that was never read.
        self.block = nn.Sequential(*layers)

        # Two convolution heads for the Gaussian posterior (mu, logvar).
        self.fc_mu = WNCausalConv1d(d_model, latent_dim, kernel_size=3, padding=1)
        self.fc_logvar = WNCausalConv1d(d_model, latent_dim, kernel_size=3, padding=1)

        self.enc_dim = d_model

    def forward(self, x):
        """Encode a waveform; returns the hidden state plus mu/logvar."""
        hidden_state = self.block(x)
        return {
            "hidden_state": hidden_state,
            "mu": self.fc_mu(hidden_state),
            "logvar": self.fc_logvar(hidden_state),
        }
160
+
161
+
162
class NoiseBlock(nn.Module):
    """Adds input-conditioned Gaussian noise: x + randn * (1x1 conv of x)."""

    def __init__(self, dim):
        super().__init__()
        # Bias-free 1x1 conv produces the per-channel noise gate.
        self.linear = WNCausalConv1d(dim, dim, kernel_size=1, bias=False)

    def forward(self, x):
        batch, _, steps = x.shape
        # One shared noise channel, broadcast across feature channels.
        noise = torch.randn((batch, 1, steps), device=x.device, dtype=x.dtype)
        gate = self.linear(x)
        return x + noise * gate
174
+
175
+
176
class CausalDecoderBlock(nn.Module):
    """Strided transposed upsampling conv, optional noise injection, then
    three dilated residual units."""

    def __init__(
        self,
        input_dim: int = 16,
        output_dim: int = 8,
        stride: int = 1,
        groups=1,
        use_noise_block: bool = False,
    ):
        super().__init__()
        modules = [
            Snake1d(input_dim),
            WNCausalTransposeConv1d(
                input_dim,
                output_dim,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
                output_padding=stride % 2,
            ),
        ]
        if use_noise_block:
            modules.append(NoiseBlock(output_dim))
        for dilation in (1, 3, 9):
            modules.append(CausalResidualUnit(output_dim, dilation=dilation, groups=groups))
        self.block = nn.Sequential(*modules)
        # Recorded so CausalDecoder can size the sample-rate condition layers.
        self.input_channels = input_dim

    def forward(self, x):
        return self.block(x)
211
+
212
+
213
class TransposeLastTwoDim(torch.nn.Module):
    """Swaps the last two dimensions of the input tensor."""

    def forward(self, x):
        return x.transpose(-1, -2)
216
+
217
+
218
class SampleRateConditionLayer(nn.Module):
    """Conditions decoder features on a (bucketized) target sample rate.

    Supported cond_type values:
      - "scale_bias":      per-bucket channel-wise scale and bias, initialized
                           to the identity (scale=1, bias=0)
      - "scale_bias_init": same layout, normally-distributed init
      - "add":             additive per-bucket channel embedding
      - "concat":          concatenate a cond_dim embedding along channels
                           (requires out_layer=True to project back down)
    """

    def __init__(
        self,
        input_dim: int,
        sr_bin_buckets: int = None,
        cond_type: str = "scale_bias",
        cond_dim: int = 128,
        out_layer: bool = False,
    ):
        super().__init__()

        self.cond_type, out_layer_in_dim = cond_type, input_dim

        if cond_type == "scale_bias":
            # Identity init: the layer is a no-op until trained.
            self.scale_embed = nn.Embedding(sr_bin_buckets, input_dim)
            self.bias_embed = nn.Embedding(sr_bin_buckets, input_dim)
            nn.init.ones_(self.scale_embed.weight)
            nn.init.zeros_(self.bias_embed.weight)
        elif cond_type == "scale_bias_init":
            self.scale_embed = nn.Embedding(sr_bin_buckets, input_dim)
            self.bias_embed = nn.Embedding(sr_bin_buckets, input_dim)
            nn.init.normal_(self.scale_embed.weight, mean=1)
            nn.init.normal_(self.bias_embed.weight)
        elif cond_type == "add":
            self.cond_embed = nn.Embedding(sr_bin_buckets, input_dim)
            nn.init.normal_(self.cond_embed.weight)
        elif cond_type == "concat":
            self.cond_embed = nn.Embedding(sr_bin_buckets, cond_dim)
            assert out_layer, "out_layer must be True for concat cond_type"
            # The output projection must absorb the extra cond_dim channels.
            out_layer_in_dim = input_dim + cond_dim
        else:
            raise ValueError(f"Invalid cond_type: {cond_type}")

        if out_layer:
            # Projects (possibly concatenated) features back to input_dim.
            self.out_layer = nn.Sequential(
                Snake1d(out_layer_in_dim),
                WNCausalConv1d(out_layer_in_dim, input_dim, kernel_size=1),
            )
        else:
            self.out_layer = nn.Identity()

    def forward(self, x, sr_cond):
        # x: (B, C, T); sr_cond: integer bucket indices (one per batch item).
        if self.cond_type == "scale_bias" or self.cond_type == "scale_bias_init":
            x = x * self.scale_embed(sr_cond).unsqueeze(-1) + self.bias_embed(sr_cond).unsqueeze(-1)
        elif self.cond_type == "add":
            x = x + self.cond_embed(sr_cond).unsqueeze(-1)
        elif self.cond_type == "concat":
            # Broadcast the embedding across time before concatenating.
            x = torch.cat([x, self.cond_embed(sr_cond).unsqueeze(-1).repeat(1, 1, x.shape[-1])], dim=1)

        return self.out_layer(x)
268
+
269
+
270
class CausalDecoder(nn.Module):
    """Causal decoder with optional sample-rate conditioning.

    Same stem / upsampling / final-conv structure as the v1 decoder; when
    `sr_bin_boundaries` is given, one SampleRateConditionLayer is inserted in
    front of every CausalDecoderBlock and the layers are kept in an
    nn.ModuleList so the conditioning can be interleaved at forward time.
    """

    def __init__(
        self,
        input_channel,
        channels,
        rates,
        depthwise: bool = False,
        d_out: int = 1,
        use_noise_block: bool = False,
        sr_bin_boundaries: List[int] = None,
        cond_type: str = "scale_bias",
        cond_dim: int = 128,
        cond_out_layer: bool = False,
    ):
        super().__init__()

        # Add first conv layer (depthwise variant factorizes into a grouped
        # 7-tap conv followed by a 1x1 channel mixer).
        if depthwise:
            layers = [
                WNCausalConv1d(input_channel, input_channel, kernel_size=7, padding=3, groups=input_channel),
                WNCausalConv1d(input_channel, channels, kernel_size=1),
            ]
        else:
            layers = [WNCausalConv1d(input_channel, channels, kernel_size=7, padding=3)]

        # Add upsampling + MRF blocks; channels halve at each stage.
        for i, stride in enumerate(rates):
            input_dim = channels // 2**i
            output_dim = channels // 2 ** (i + 1)
            groups = output_dim if depthwise else 1
            layers += [
                CausalDecoderBlock(
                    input_dim,
                    output_dim,
                    stride,
                    groups=groups,
                    use_noise_block=use_noise_block,
                )
            ]

        # Add final conv layer.
        # NOTE(review): `output_dim` here is the loop variable from the last
        # rate; an empty `rates` would raise NameError — presumably never
        # the case for this model.
        layers += [
            Snake1d(output_dim),
            WNCausalConv1d(output_dim, d_out, kernel_size=7, padding=3),
            nn.Tanh(),
        ]

        if sr_bin_boundaries is None:
            # No conditioning: a plain sequential stack suffices.
            self.model = nn.Sequential(*layers)
            self.sr_bin_boundaries = None
        else:
            # ModuleList so conditioning layers can be interleaved in forward.
            self.model = nn.ModuleList(layers)

            self.register_buffer("sr_bin_boundaries", torch.tensor(sr_bin_boundaries, dtype=torch.int32))
            # N boundaries partition sample rates into N + 1 buckets.
            self.sr_bin_buckets = len(sr_bin_boundaries) + 1

            # One conditioning layer per decoder block; None placeholders keep
            # sr_cond_model aligned 1:1 with self.model for the forward zip.
            cond_layers = []
            for layer in self.model:
                if layer.__class__.__name__ == "CausalDecoderBlock":
                    cond_layers.append(
                        SampleRateConditionLayer(
                            input_dim=layer.input_channels,
                            sr_bin_buckets=self.sr_bin_buckets,
                            cond_type=cond_type,
                            cond_dim=cond_dim,
                            out_layer=cond_out_layer,
                        )
                    )
                else:
                    cond_layers.append(None)
            self.sr_cond_model = nn.ModuleList(cond_layers)

    def get_sr_idx(self, sr):
        """Map raw sample-rate values to bucket indices via the boundaries."""
        return torch.bucketize(sr, self.sr_bin_boundaries)

    def forward(self, x, sr_cond=None):
        if self.sr_bin_boundaries is not None:
            # assert sr_cond is not None
            sr_cond = self.get_sr_idx(sr_cond)

            # Interleave: condition the features right before each block.
            for layer, sr_cond_layer in zip(self.model, self.sr_cond_model):
                if sr_cond_layer is not None:
                    x = sr_cond_layer(x, sr_cond)
                x = layer(x)
            return x
        else:
            return self.model(x)
357
+
358
+
359
class AudioVAEConfig(BaseModel):
    """Hyperparameters for the sample-rate-conditioned audio VAE (v2).

    Encodes at `sample_rate` and decodes at a configurable output rate
    selected through bucketized sample-rate conditioning.
    """

    encoder_dim: int = 128
    encoder_rates: List[int] = [2, 5, 8, 8]         # per-stage downsampling strides
    latent_dim: int = 64
    decoder_dim: int = 2048
    decoder_rates: List[int] = [8, 6, 5, 2, 2, 2]   # per-stage upsampling rates
    depthwise: bool = True
    sample_rate: int = 16000                        # input (encoder) sample rate
    out_sample_rate: int = 48000                    # default decoder output sample rate
    use_noise_block: bool = False
    sr_bin_boundaries: Optional[List[int]] = [20000, 30000, 40000]  # bucket edges for sr conditioning
    cond_type: str = "scale_bias"                   # see SampleRateConditionLayer
    cond_dim: int = 128                             # embedding width for "concat" conditioning
    cond_out_layer: bool = False                    # add an output projection after conditioning
373
+
374
+
375
class AudioVAE(nn.Module):
    """Causal audio VAE with sample-rate-conditioned decoding (v2).

    Encodes mono audio at `sample_rate` into a latent sequence and decodes it
    at a target output sample rate chosen via `sr_cond`.

    Args:
        config: AudioVAEConfig with all model hyperparameters; defaults are
            used when None is given.
    """

    def __init__(
        self,
        config: AudioVAEConfig = None,
    ):
        # Fall back to the default configuration when none is provided.
        if config is None:
            config = AudioVAEConfig()

        super().__init__()

        encoder_dim = config.encoder_dim
        encoder_rates = config.encoder_rates
        latent_dim = config.latent_dim
        decoder_dim = config.decoder_dim
        decoder_rates = config.decoder_rates
        depthwise = config.depthwise
        sample_rate = config.sample_rate
        out_sample_rate = config.out_sample_rate
        use_noise_block = config.use_noise_block
        sr_bin_boundaries = config.sr_bin_boundaries
        cond_type = config.cond_type
        cond_dim = config.cond_dim
        cond_out_layer = config.cond_out_layer

        self.encoder_dim = encoder_dim
        self.encoder_rates = encoder_rates
        self.decoder_dim = decoder_dim
        self.decoder_rates = decoder_rates
        self.depthwise = depthwise

        self.use_noise_block = use_noise_block

        # Derive the latent width from the encoder when not specified.
        if latent_dim is None:
            latent_dim = encoder_dim * (2 ** len(encoder_rates))

        self.latent_dim = latent_dim
        # Total downsampling factor: input samples per latent frame.
        self.hop_length = np.prod(encoder_rates)
        self.encoder = CausalEncoder(
            encoder_dim,
            latent_dim,
            encoder_rates,
            depthwise=depthwise,
        )

        self.decoder = CausalDecoder(
            latent_dim,
            decoder_dim,
            decoder_rates,
            depthwise=depthwise,
            use_noise_block=use_noise_block,
            sr_bin_boundaries=sr_bin_boundaries,
            cond_type=cond_type,
            cond_dim=cond_dim,
            cond_out_layer=cond_out_layer,
        )
        self.sample_rate = sample_rate
        self.out_sample_rate = out_sample_rate
        self.sr_bin_boundaries = sr_bin_boundaries
        self.chunk_size = math.prod(encoder_rates)

    def preprocess(self, audio_data, sample_rate):
        """Right-pad the waveform so its length is a multiple of hop_length."""
        if sample_rate is None:
            sample_rate = self.sample_rate
        assert sample_rate == self.sample_rate
        pad_to = self.hop_length
        length = audio_data.shape[-1]
        right_pad = math.ceil(length / pad_to) * pad_to - length
        audio_data = nn.functional.pad(audio_data, (0, right_pad))

        return audio_data

    def decode(self, z: torch.Tensor, sr_cond: torch.Tensor = None):
        """Decode given latent codes and return audio data.

        Parameters
        ----------
        z : Tensor[B x D x T]
            Continuous latent representation of the input.
        sr_cond : Tensor, optional
            Target output sample rate(s); defaults to self.out_sample_rate
            when the decoder is sample-rate conditioned.

        Returns
        -------
        Tensor[B x 1 x length]
            Decoded audio data in [-1, 1].
        """
        if self.sr_bin_boundaries is not None:
            # use default output sample rate
            if sr_cond is None:
                sr_cond = torch.tensor([self.out_sample_rate], device=z.device, dtype=torch.int32)
        return self.decoder(z, sr_cond)

    def encode(self, audio_data: torch.Tensor, sample_rate: int):
        """
        Args:
            audio_data: Tensor[B x 1 x T] (a 2-D tensor gets a channel dim added)
            sample_rate: int, must match self.sample_rate
        Returns:
            z: Tensor[B x D x T'] — the posterior mean `mu`
        """
        if audio_data.ndim == 2:
            audio_data = audio_data.unsqueeze(1)

        audio_data = self.preprocess(audio_data, sample_rate)
        return self.encoder(audio_data)["mu"]
voxcpm/modules/layers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .scalar_quantization_layer import ScalarQuantizationLayer
voxcpm/modules/layers/lora.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+
9
class LoRALinear(nn.Module):
    """
    LoRA linear layer that owns `weight`/`bias` directly, keeping the same
    state_dict key layout as nn.Linear.

    state_dict layout:
        - weight: original weight (same as nn.Linear)
        - bias:   original bias   (same as nn.Linear)
        - lora_A: LoRA low-rank matrix A
        - lora_B: LoRA low-rank matrix B

    Benefit of this design: pretrained weights load without any key remapping.
    """

    def __init__(
        self,
        base: nn.Linear,
        r: int,
        alpha: float = 1.0,
        dropout: float = 0.0,
    ):
        super().__init__()
        assert isinstance(base, nn.Linear), "LoRALinear only supports wrapping nn.Linear."

        self.in_features = base.in_features
        self.out_features = base.out_features
        self.r = r
        self.alpha = alpha
        self._base_scaling = alpha / r if r > 0 else 0.0

        # Store scaling in a buffer so mutating its value does not trigger
        # torch.compile recompilation.
        # persistent=False keeps it out of the state_dict, avoiding
        # missing-key errors on load.
        self.register_buffer("scaling", torch.tensor(self._base_scaling), persistent=False)

        # Own weight and bias directly (transferred from the wrapped Linear).
        self.weight = base.weight
        self.bias = base.bias  # may be None

        # LoRA parameters
        if r > 0:
            self.lora_A = nn.Parameter(torch.zeros(r, self.in_features))
            self.lora_B = nn.Parameter(torch.zeros(self.out_features, r))
            # A: Kaiming init; B: zeros, so the LoRA delta starts at zero.
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)
        else:
            self.register_parameter("lora_A", None)
            self.register_parameter("lora_B", None)

        self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Base Linear computation.
        result = F.linear(x, self.weight, self.bias)
        if self.r <= 0 or self.lora_A is None:
            return result
        # LoRA: result + dropout(x @ A^T @ B^T) * scaling
        lora_out = F.linear(F.linear(x, self.lora_A), self.lora_B)
        return result + self.dropout(lora_out) * self.scaling

    def reset_lora_parameters(self):
        """Reset the LoRA parameters to their initial state."""
        if self.r > 0 and self.lora_A is not None:
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)

    def set_enabled(self, enabled: bool):
        """Enable/disable LoRA (via scaling, torch.compile friendly)."""
        # fill_ mutates the buffer in place and does not force recompilation.
        self.scaling.fill_(self._base_scaling if enabled else 0.0)

    @property
    def enabled(self) -> bool:
        # LoRA is effectively off whenever the scaling factor is zero.
        return self.scaling.item() != 0.0
81
+
82
+
83
+ def _get_parent_module(root: nn.Module, name: str) -> Optional[nn.Module]:
84
+ """
85
+ 根据类似 'layers.0.self_attn.q_proj' 的全名,返回 parent module(即 q_proj 的上一级)。
86
+ """
87
+ parts = name.split(".")
88
+ if len(parts) == 1:
89
+ return root
90
+ parent = root
91
+ for p in parts[:-1]:
92
+ if not hasattr(parent, p):
93
+ return None
94
+ parent = getattr(parent, p)
95
+ return parent
96
+
97
+
98
def apply_lora_to_named_linear_modules(
    root: nn.Module,
    *,
    target_submodule_names: list[str],
    r: int,
    alpha: float,
    dropout: float,
) -> None:
    """
    Inject LoRA into every nn.Linear under `root` whose attribute name is in
    `target_submodule_names`.

    For example, target_submodule_names=["q_proj", "v_proj"] replaces every
    nn.Linear named *.q_proj / *.v_proj with a LoRALinear.
    """
    wanted = set(target_submodule_names)
    # Snapshot the module list so replacements don't disturb iteration.
    for full_name, module in list(root.named_modules()):
        leaf_name = full_name.split(".")[-1]
        if not isinstance(module, nn.Linear) or leaf_name not in wanted:
            continue

        parent = _get_parent_module(root, full_name)
        if parent is None:
            continue

        # Swap the original Linear for a LoRA-wrapped one, in place.
        setattr(
            parent,
            leaf_name,
            LoRALinear(base=module, r=r, alpha=alpha, dropout=dropout),
        )
voxcpm/modules/layers/scalar_quantization_layer.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class ScalarQuantizationLayer(nn.Module):
    """Scalar quantization bottleneck: project -> tanh -> round to a uniform
    grid of 2*scale+1 levels -> project back.

    During training the rounding uses a straight-through estimator so
    gradients flow through the continuous values.
    """

    def __init__(self, in_dim, out_dim, latent_dim: int = 64, scale: int = 9):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.latent_dim = latent_dim
        self.scale = scale

        self.in_proj = nn.Linear(in_dim, latent_dim)
        self.out_proj = nn.Linear(latent_dim, out_dim)

    def forward(self, hidden):
        # tanh bounds the latent to [-1, 1] before quantization.
        hidden = torch.tanh(self.in_proj(hidden))
        quantized = torch.round(hidden * self.scale) / self.scale

        if self.training:
            # Straight-through estimator: forward uses quantized values,
            # backward sees the identity.
            hidden = hidden + (quantized - hidden).detach()
        else:
            hidden = quantized

        return self.out_proj(hidden)
voxcpm/modules/locdit/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .unified_cfm import UnifiedCFM, CfmConfig
2
+ from .local_dit import VoxCPMLocDiT
3
+ from .local_dit_v2 import VoxCPMLocDiT as VoxCPMLocDiTV2
voxcpm/modules/locdit/local_dit.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from ..minicpm4 import MiniCPMModel, MiniCPM4Config
3
+ import torch.nn as nn
4
+ import math
5
+
6
+
7
class SinusoidalPosEmb(torch.nn.Module):
    """Transformer-style sinusoidal embedding of scalar timesteps."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        # sin/cos halves must split the dimension evenly.
        assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"

    def forward(self, x, scale=1000):
        if x.ndim < 1:
            x = x.unsqueeze(0)
        half_dim = self.dim // 2
        # Geometric frequency ladder from 1 down to ~1/10000.
        step = math.log(10000) / (half_dim - 1)
        freqs = torch.exp(torch.arange(half_dim, dtype=x.dtype, device=x.device) * -step)
        angles = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
        return torch.cat((angles.sin(), angles.cos()), dim=-1)
23
+
24
+
25
class TimestepEmbedding(nn.Module):
    """Two-layer MLP (Linear -> SiLU -> Linear) over timestep embeddings."""

    def __init__(
        self,
        in_channels: int,
        time_embed_dim: int,
        out_dim: int = None,
    ):
        super().__init__()

        self.linear_1 = nn.Linear(in_channels, time_embed_dim, bias=True)
        self.act = nn.SiLU()
        # Output width defaults to the hidden width when out_dim is omitted.
        self.linear_2 = nn.Linear(
            time_embed_dim,
            out_dim if out_dim is not None else time_embed_dim,
            bias=True,
        )

    def forward(self, sample):
        return self.linear_2(self.act(self.linear_1(sample)))
48
+
49
+
50
class VoxCPMLocDiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    A MiniCPM transformer is run non-causally over a sequence assembled as
    [mu + timestep embedding, prefix condition frames, noisy target frames];
    only the positions corresponding to the target frames are projected back
    to latent channels.
    """

    def __init__(
        self,
        config: MiniCPM4Config,
        in_channels: int = 64,
    ):
        super().__init__()
        self.in_channels = in_channels
        # Output dimensionality matches the input latent channels.
        self.out_channels = in_channels
        self.config = config

        # Projections between latent channels and the transformer width.
        self.in_proj = nn.Linear(in_channels, config.hidden_size, bias=True)
        self.cond_proj = nn.Linear(in_channels, config.hidden_size, bias=True)
        self.out_proj = nn.Linear(config.hidden_size, self.out_channels, bias=True)

        # Timestep embeddings: t and dt get separate MLPs and are summed.
        self.time_embeddings = SinusoidalPosEmb(config.hidden_size)
        self.time_mlp = TimestepEmbedding(
            in_channels=config.hidden_size,
            time_embed_dim=config.hidden_size,
        )
        self.delta_time_mlp = TimestepEmbedding(
            in_channels=config.hidden_size,
            time_embed_dim=config.hidden_size,
        )

        # The backbone consumes pre-embedded vectors, so it must not own an
        # input token embedding table.
        assert config.vocab_size == 0, "vocab_size must be 0 for local DiT"
        self.decoder = MiniCPMModel(config)

    def forward(
        self,
        x: torch.Tensor,
        mu: torch.Tensor,
        t: torch.Tensor,
        cond: torch.Tensor,
        dt: torch.Tensor,
    ):
        """
        Forward pass of DiT.
        x: (N, C, T) tensor of inputs
        mu: (N, C) tensor of hidden embedding
        t: (N,) tensor of diffusion timesteps
        cond: (N, C, T') tensor of prefix conditions
        dt: (N,) used for mean velocity (may be supported in the future...)
        """
        # Channel-last for the transformer: (N, T, hidden).
        x = self.in_proj(x.transpose(1, 2).contiguous())

        cond = self.cond_proj(cond.transpose(1, 2).contiguous())
        prefix = cond.size(1)

        # Combine the two timestep embeddings additively.
        t = self.time_embeddings(t).to(x.dtype)
        t = self.time_mlp(t)
        dt = self.time_embeddings(dt).to(x.dtype)
        dt = self.delta_time_mlp(dt)
        t = t + dt

        # Sequence layout: [global token (mu + t), cond frames, noisy frames].
        x = torch.cat([(mu + t).unsqueeze(1), cond, x], dim=1)
        hidden, _ = self.decoder(x, is_causal=False)
        # Keep only the positions corresponding to the noisy target frames.
        hidden = hidden[:, prefix + 1 :, :]
        hidden = self.out_proj(hidden)

        return hidden.transpose(1, 2).contiguous()
voxcpm/modules/locdit/local_dit_v2.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from ..minicpm4 import MiniCPMModel, MiniCPM4Config
3
+ import torch.nn as nn
4
+ import math
5
+
6
+
7
class SinusoidalPosEmb(torch.nn.Module):
    """Classic transformer sinusoidal embedding for scalar timesteps/positions.

    The first half of the output dimension carries sines, the second half
    cosines, over a geometric frequency ladder with base 10000.
    """

    def __init__(self, dim):
        super().__init__()
        assert dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
        self.dim = dim

    def forward(self, x, scale=1000):
        # Promote a 0-d scalar to a one-element batch.
        if x.ndim < 1:
            x = x.unsqueeze(0)
        half = self.dim // 2
        step = math.log(10000) / (half - 1)
        freqs = torch.exp(torch.arange(half, dtype=x.dtype, device=x.device) * -step)
        angles = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
        return torch.cat((angles.sin(), angles.cos()), dim=-1)
23
+
24
+
25
class TimestepEmbedding(nn.Module):
    """Two-layer SiLU MLP that maps a sinusoidal timestep embedding to the model width.

    Args:
        in_channels: width of the incoming embedding.
        time_embed_dim: hidden (and default output) width.
        out_dim: optional override for the output width.
    """

    def __init__(
        self,
        in_channels: int,
        time_embed_dim: int,
        out_dim: int = None,
    ):
        super().__init__()
        # Output width falls back to the hidden width when not overridden.
        final_dim = time_embed_dim if out_dim is None else out_dim
        self.linear_1 = nn.Linear(in_channels, time_embed_dim, bias=True)
        self.act = nn.SiLU()
        self.linear_2 = nn.Linear(time_embed_dim, final_dim, bias=True)

    def forward(self, sample):
        return self.linear_2(self.act(self.linear_1(sample)))
48
+
49
+
50
class VoxCPMLocDiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    v2 variant: unlike v1 (which fuses the hidden embedding ``mu`` into the
    timestep token), this version reshapes ``mu`` into one or more standalone
    prefix tokens placed before the timestep token.
    """

    def __init__(
        self,
        config: MiniCPM4Config,
        in_channels: int = 64,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = in_channels
        self.config = config

        # Project feature channels into / out of the transformer width.
        self.in_proj = nn.Linear(in_channels, config.hidden_size, bias=True)
        self.cond_proj = nn.Linear(in_channels, config.hidden_size, bias=True)
        self.out_proj = nn.Linear(config.hidden_size, self.out_channels, bias=True)

        self.time_embeddings = SinusoidalPosEmb(config.hidden_size)
        self.time_mlp = TimestepEmbedding(
            in_channels=config.hidden_size,
            time_embed_dim=config.hidden_size,
        )
        # Separate embedding MLP for the r/t gap ("dt") used by mean-velocity training.
        self.delta_time_mlp = TimestepEmbedding(
            in_channels=config.hidden_size,
            time_embed_dim=config.hidden_size,
        )

        # The backbone consumes embeddings directly, never token ids.
        assert config.vocab_size == 0, "vocab_size must be 0 for local DiT"
        self.decoder = MiniCPMModel(config)

    def forward(
        self,
        x: torch.Tensor,
        mu: torch.Tensor,
        t: torch.Tensor,
        cond: torch.Tensor,
        dt: torch.Tensor,
    ):
        """
        Forward pass of DiT.
        x: (N, C, T) tensor of inputs
        mu: (N, C) tensor of hidden embedding
        t: (N,) tensor of diffusion timesteps
        cond: (N, C, T') tensor of prefix conditions
        dt: (N,) used for mean velocity (may be supported in the future...)
        """
        x = self.in_proj(x.transpose(1, 2).contiguous())

        cond = self.cond_proj(cond.transpose(1, 2).contiguous())
        prefix = cond.size(1)

        t = self.time_embeddings(t).to(x.dtype)
        t = self.time_mlp(t)
        dt = self.time_embeddings(dt).to(x.dtype)
        dt = self.delta_time_mlp(dt)
        # Fold the r/t-gap embedding into the timestep embedding.
        t = t + dt

        # Split mu into hidden-size-wide tokens; token layout is
        # [mu tokens..., timestep token, condition prefix, noisy input].
        mu = mu.view(x.size(0), -1, x.size(-1))
        x = torch.cat([mu, (t).unsqueeze(1), cond, x], dim=1)

        hidden, _ = self.decoder(x, is_causal=False)
        # Drop the mu / timestep / condition prefix; keep only input positions.
        hidden = hidden[:, prefix + mu.size(1) + 1 :, :]
        hidden = self.out_proj(hidden)

        return hidden.transpose(1, 2).contiguous()
+ return hidden.transpose(1, 2).contiguous()
voxcpm/modules/locdit/unified_cfm.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch.func import jvp
6
+ from pydantic import BaseModel
7
+
8
+ from .local_dit import VoxCPMLocDiT
9
+
10
+
11
class CfmConfig(BaseModel):
    """Hyperparameters for conditional flow matching (training and sampling)."""

    sigma_min: float = 1e-6  # minimum noise level of the flow
    solver: str = "euler"  # ODE solver used at inference
    t_scheduler: str = "log-norm"  # timestep sampler: "log-norm" or "uniform"
    training_cfg_rate: float = 0.1  # prob. of dropping mu during training (CFG)
    inference_cfg_rate: float = 1.0  # default guidance strength at inference
    reg_loss_type: str = "l1"
    # (start, end) schedule for the fraction of samples with r != t (mean-velocity mode).
    ratio_r_neq_t_range: Tuple[float, float] = (0.25, 0.75)
    # (start, end) schedule for the probability of noising the condition.
    noise_cond_prob_range: Tuple[float, float] = (0.0, 0.0)
    noise_cond_scale: float = 0.0  # stddev multiplier for condition noise
21
+
22
+
23
class UnifiedCFM(torch.nn.Module):
    """Conditional flow matching wrapper around a local DiT velocity estimator.

    Supports standard flow matching and, when ``mean_mode`` is True, a
    mean-velocity objective where the network predicts an average velocity
    over the interval (r, t) and the target involves a JVP of the estimator.
    """

    def __init__(
        self,
        in_channels: int,
        cfm_params: CfmConfig,
        estimator: VoxCPMLocDiT,
        mean_mode: bool = False,
    ):
        super().__init__()
        self.solver = cfm_params.solver
        self.sigma_min = cfm_params.sigma_min
        self.t_scheduler = cfm_params.t_scheduler
        self.training_cfg_rate = cfm_params.training_cfg_rate
        self.inference_cfg_rate = cfm_params.inference_cfg_rate
        self.reg_loss_type = cfm_params.reg_loss_type
        self.ratio_r_neq_t_range = cfm_params.ratio_r_neq_t_range
        self.noise_cond_prob_range = cfm_params.noise_cond_prob_range
        self.noise_cond_scale = cfm_params.noise_cond_scale

        self.in_channels = in_channels
        self.mean_mode = mean_mode

        self.estimator = estimator

    # ------------------------------------------------------------------ #
    # Inference
    # ------------------------------------------------------------------ #
    @torch.inference_mode()
    def forward(
        self,
        mu: torch.Tensor,
        n_timesteps: int,
        patch_size: int,
        cond: torch.Tensor,
        temperature: float = 1.0,
        cfg_value: float = 1.0,
        sway_sampling_coef: float = 1.0,
        use_cfg_zero_star: bool = True,
    ):
        """Sample a (B, C, patch_size) patch by integrating the flow ODE from noise.

        Args:
            mu: (B, C) hidden embedding conditioning the estimator.
            n_timesteps: number of Euler steps.
            patch_size: temporal length of the generated patch.
            cond: (B, C, T') prefix condition features.
            temperature: scale of the initial Gaussian noise.
            cfg_value: classifier-free guidance strength.
            sway_sampling_coef: coefficient of the sway timestep warping.
            use_cfg_zero_star: enable CFG-Zero* (optimized negative scaling and
                zeroed velocity on the first few steps).
        """
        b, _ = mu.shape
        t = patch_size
        z = torch.randn((b, self.in_channels, t), device=mu.device, dtype=mu.dtype) * temperature

        # Sway sampling: warp the uniform 1 -> 0 schedule so more steps are
        # spent near the clean-data end of the trajectory.
        t_span = torch.linspace(1, 0, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
        t_span = t_span + sway_sampling_coef * (torch.cos(torch.pi / 2 * t_span) - 1 + t_span)

        return self.solve_euler(
            x=z,
            t_span=t_span,
            mu=mu,
            cond=cond,
            cfg_value=cfg_value,
            use_cfg_zero_star=use_cfg_zero_star,
        )

    def optimized_scale(self, positive_flat: torch.Tensor, negative_flat: torch.Tensor):
        # CFG-Zero*: per-sample least-squares scale projecting the conditional
        # prediction onto the unconditional one.
        dot_product = torch.sum(positive_flat * negative_flat, dim=1, keepdim=True)
        squared_norm = torch.sum(negative_flat**2, dim=1, keepdim=True) + 1e-8
        st_star = dot_product / squared_norm
        return st_star

    def solve_euler(
        self,
        x: torch.Tensor,
        t_span: torch.Tensor,
        mu: torch.Tensor,
        cond: torch.Tensor,
        cfg_value: float = 1.0,
        use_cfg_zero_star: bool = True,
    ):
        """Integrate the probability-flow ODE from t = 1 (noise) down to t = 0."""
        t, _, dt = t_span[0], t_span[-1], t_span[0] - t_span[1]

        sol = []
        # CFG-Zero*: skip the model velocity for roughly the first 4% of steps.
        zero_init_steps = max(1, int(len(t_span) * 0.04))
        for step in range(1, len(t_span)):
            if use_cfg_zero_star and step <= zero_init_steps:
                dphi_dt = torch.zeros_like(x)
            else:
                # Classifier-Free Guidance inference introduced in VoiceBox
                # Batch the conditional pass (first half) and the unconditional
                # pass (mu zeroed, second half) through the estimator together.
                b = x.size(0)
                x_in = torch.zeros([2 * b, self.in_channels, x.size(2)], device=x.device, dtype=x.dtype)
                mu_in = torch.zeros([2 * b, mu.size(1)], device=x.device, dtype=x.dtype)
                t_in = torch.zeros([2 * b], device=x.device, dtype=x.dtype)
                dt_in = torch.zeros([2 * b], device=x.device, dtype=x.dtype)
                cond_in = torch.zeros([2 * b, self.in_channels, cond.size(2)], device=x.device, dtype=x.dtype)
                x_in[:b], x_in[b:] = x, x
                mu_in[:b] = mu
                t_in[:b], t_in[b:] = t.unsqueeze(0), t.unsqueeze(0)
                dt_in[:b], dt_in[b:] = dt.unsqueeze(0), dt.unsqueeze(0)
                # not used now
                if not self.mean_mode:
                    dt_in = torch.zeros_like(dt_in)
                cond_in[:b], cond_in[b:] = cond, cond

                dphi_dt = self.estimator(x_in, mu_in, t_in, cond_in, dt_in)
                dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)

                if use_cfg_zero_star:
                    positive_flat = dphi_dt.view(b, -1)
                    negative_flat = cfg_dphi_dt.view(b, -1)
                    st_star = self.optimized_scale(positive_flat, negative_flat)
                    st_star = st_star.view(b, *([1] * (len(dphi_dt.shape) - 1)))
                else:
                    st_star = 1.0

                dphi_dt = cfg_dphi_dt * st_star + cfg_value * (dphi_dt - cfg_dphi_dt * st_star)

            # Euler step toward t = 0 (velocity points from data to noise).
            x = x - dt * dphi_dt
            t = t - dt
            sol.append(x)
            if step < len(t_span) - 1:
                dt = t - t_span[step + 1]

        return sol[-1]

    # ------------------------------------------------------------------ #
    # Training loss
    # ------------------------------------------------------------------ #
    def adaptive_loss_weighting(
        self, losses: torch.Tensor, mask: torch.Tensor | None = None, p: float = 0.0, epsilon: float = 1e-3
    ):
        # Inverse-power weighting of per-position losses; p == 0 yields uniform
        # weights. Detached so the weights carry no gradients.
        weights = 1.0 / ((losses + epsilon).pow(p))
        if mask is not None:
            weights = weights * mask
        return weights.detach()

    def sample_r_t(self, x: torch.Tensor, mu: float = -0.4, sigma: float = 1.0, ratio_r_neq_t: float = 0.0):
        """Draw per-sample (r, t) pairs; with prob. ratio_r_neq_t, r <= t differ."""
        batch_size = x.shape[0]
        if self.t_scheduler == "log-norm":
            s_r = torch.randn(batch_size, device=x.device, dtype=x.dtype) * sigma + mu
            s_t = torch.randn(batch_size, device=x.device, dtype=x.dtype) * sigma + mu
            r = torch.sigmoid(s_r)
            t = torch.sigmoid(s_t)
        elif self.t_scheduler == "uniform":
            r = torch.rand(batch_size, device=x.device, dtype=x.dtype)
            t = torch.rand(batch_size, device=x.device, dtype=x.dtype)
        else:
            raise ValueError(f"Unsupported t_scheduler: {self.t_scheduler}")

        # Where mask is set keep an ordered pair (min, max); elsewhere r == t.
        mask = torch.rand(batch_size, device=x.device, dtype=x.dtype) < ratio_r_neq_t
        r, t = torch.where(
            mask,
            torch.stack([torch.min(r, t), torch.max(r, t)], dim=0),
            torch.stack([t, t], dim=0),
        )

        return r.squeeze(), t.squeeze()

    def compute_loss(
        self,
        x1: torch.Tensor,
        mu: torch.Tensor,
        cond: torch.Tensor | None = None,
        tgt_mask: torch.Tensor | None = None,
        progress: float = 0.0,
    ):
        """Flow-matching training loss.

        Args:
            x1: (B, C, T) clean target features.
            mu: (B, C) conditioning embedding.
            cond: optional (B, C, T) prefix condition; zeros when absent.
            tgt_mask: optional per-position validity mask.
            progress: training progress in [0, 1]; drives the r != t ratio and
                condition-noise schedules.
        """
        b, _, _ = x1.shape

        # Randomly drop mu per sample for classifier-free guidance training.
        if self.training_cfg_rate > 0:
            cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
            mu = mu * cfg_mask.view(-1, 1)

        if cond is None:
            cond = torch.zeros_like(x1)

        # Optionally corrupt the condition with Gaussian noise (scheduled prob.).
        noisy_mask = torch.rand(b, device=x1.device) > (
            1.0
            - (
                self.noise_cond_prob_range[0]
                + progress * (self.noise_cond_prob_range[1] - self.noise_cond_prob_range[0])
            )
        )
        cond = cond + noisy_mask.view(-1, 1, 1) * torch.randn_like(cond) * self.noise_cond_scale

        ratio_r_neq_t = (
            self.ratio_r_neq_t_range[0] + progress * (self.ratio_r_neq_t_range[1] - self.ratio_r_neq_t_range[0])
            if self.mean_mode
            else 0.0
        )

        r, t = self.sample_r_t(x1, ratio_r_neq_t=ratio_r_neq_t)
        r_ = r.detach().clone()
        t_ = t.detach().clone()
        z = torch.randn_like(x1)
        # Linear interpolation path: y = (1 - t) x1 + t z, with velocity v = z - x1.
        y = (1 - t_.view(-1, 1, 1)) * x1 + t_.view(-1, 1, 1) * z
        v = z - x1

        def model_fn(z_sample, r_sample, t_sample):
            return self.estimator(z_sample, mu, t_sample, cond, dt=t_sample - r_sample)

        if self.mean_mode:
            # Mean-velocity target: u = v - (t - r) * du/dt, du/dt via forward-mode JVP.
            v_r = torch.zeros_like(r)
            v_t = torch.ones_like(t)
            from torch.backends.cuda import sdp_kernel

            # JVP is not supported through flash / mem-efficient SDPA kernels;
            # fall back to the math implementation for this call.
            with sdp_kernel(enable_flash=False, enable_mem_efficient=False):
                u_pred, dudt = jvp(model_fn, (y, r, t), (v, v_r, v_t))
            u_tgt = v - (t_ - r_).view(-1, 1, 1) * dudt
        else:
            u_pred = model_fn(y, r, t)
            u_tgt = v

        losses = F.mse_loss(u_pred, u_tgt.detach(), reduction="none").mean(dim=1)
        if tgt_mask is not None:
            weights = self.adaptive_loss_weighting(losses, tgt_mask.squeeze(1))
            loss = (weights * losses).sum() / torch.sum(tgt_mask)
        else:
            loss = losses.mean()

        return loss
voxcpm/modules/locenc/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .local_encoder import VoxCPMLocEnc
voxcpm/modules/locenc/local_encoder.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from ..minicpm4 import MiniCPMModel, MiniCPM4Config
4
+ from einops import rearrange
5
+
6
+
7
class VoxCPMLocEnc(nn.Module):
    """Patch encoder: summarizes each length-P feature patch into one vector
    via a learned CLS token and a bidirectional MiniCPM encoder."""

    def __init__(self, config: MiniCPM4Config, input_dim: int = 64):
        super().__init__()
        self.config = config
        # Learned CLS token prepended to every patch; broadcast over (B, T).
        self.special_token = nn.Parameter(torch.randn(1, 1, 1, config.hidden_size))
        self.in_proj = nn.Linear(input_dim, config.hidden_size, bias=True)

        # The encoder consumes embeddings directly, never token ids.
        assert config.vocab_size == 0, "vocab_size must be 0 for local encoder"
        self.encoder = MiniCPMModel(config)

    def forward(self, x):
        """
        x: [B, T, P, D]
        """
        B, T, P, D = x.shape

        x = self.in_proj(x)
        special_tokens = self.special_token.expand(B, T, 1, -1)
        x = torch.cat([special_tokens, x], dim=2)
        # Fold (B, T) together so each patch is encoded independently.
        x = rearrange(x, "b t p c -> (b t) p c")
        outputs, _ = self.encoder(x, is_causal=False)
        # The CLS position (index 0) summarizes the patch.
        cls_output = outputs[:, 0, :]

        return rearrange(cls_output, "(b t) c -> b t c", b=B)
voxcpm/modules/minicpm4/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .config import MiniCPM4Config
2
+ from .model import MiniCPMModel
3
+ from .cache import StaticKVCache
voxcpm/modules/minicpm4/cache.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ import torch
3
+
4
+
5
class StaticKVCache:
    """Preallocated key/value cache for incremental transformer decoding.

    Storage layout: ``kv_cache[0]`` holds keys and ``kv_cache[1]`` holds values,
    each of shape (num_layers, batch, num_kv_heads, max_length, dim_kv_head).
    """

    def __init__(
        self,
        num_layers: int,
        num_kv_heads: int,
        dim_kv_head: int,
        batch_size: int,
        device: torch.device,
        dtype: torch.dtype,
        max_length: int = 8192,
    ):
        self.max_length = max_length
        self.num_layers = num_layers

        # One contiguous buffer for every layer's K and V.
        full_shape = (2, num_layers, batch_size, num_kv_heads, max_length, dim_kv_head)
        self.kv_cache = torch.zeros(full_shape, device=device, dtype=dtype)
        self.current_length = 0

    def get_layer_cache(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return (key_cache, value_cache) views for one layer; writes go through."""
        layer = self.kv_cache[:, layer_idx]
        return layer[0], layer[1]

    def step(self) -> int:
        """Reserve the next time slot and return its index."""
        if self.current_length >= self.max_length:
            raise ValueError("KV cache is full")

        slot = self.current_length
        self.current_length = slot + 1
        return slot

    def fill_caches(self, kv_caches: List[Tuple[torch.Tensor, torch.Tensor]]):
        """Seed the cache from per-layer (key, value) tensors of shape (B, H, T, D)."""
        prefill = kv_caches[0][0].size(2)
        self.current_length = prefill
        self.kv_cache.zero_()
        for layer_idx in range(self.num_layers):
            keys, values = kv_caches[layer_idx]
            self.kv_cache[0, layer_idx, :, :, :prefill, :] = keys
            self.kv_cache[1, layer_idx, :, :, :prefill, :] = values
voxcpm/modules/minicpm4/config.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Optional

from pydantic import BaseModel
3
+
4
+
5
class RopeScalingConfig(BaseModel):
    """LongRoPE-style rescaling parameters for rotary position embeddings."""

    type: str
    # Per-frequency rescale factors used beyond the original context length.
    long_factor: List[float]
    # Per-frequency rescale factors used within the original context length.
    short_factor: List[float]
    original_max_position_embeddings: int
10
+
11
+
12
class MiniCPM4Config(BaseModel):
    """Architecture hyperparameters for the MiniCPM4 transformer backbone."""

    bos_token_id: int
    eos_token_id: int
    hidden_size: int
    intermediate_size: int
    max_position_embeddings: int
    num_attention_heads: int
    num_hidden_layers: int
    num_key_value_heads: int
    rms_norm_eps: float
    rope_scaling: RopeScalingConfig
    # vocab_size == 0 means the model consumes embeddings, not token ids.
    vocab_size: int
    # muP-style scaling knobs (residual depth scaling, embedding scaling).
    use_mup: bool = True
    scale_emb: float
    dim_model_base: int
    scale_depth: float
    rope_theta: float
    # Optional override for the per-head K/V width; falls back to
    # hidden_size // num_attention_heads when None.
    # Fix: the field is nullable, so annotate it Optional[int] instead of a
    # bare `int` with a None default (pydantic v2 rejects an explicit None).
    kv_channels: Optional[int] = None
    no_rope: bool = False
voxcpm/modules/minicpm4/model.py ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .config import MiniCPM4Config
2
+ import torch
3
+ import torch.nn as nn
4
+ from typing import List, Tuple
5
+ import math
6
+ from .cache import StaticKVCache
7
+
8
+
9
def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
    """RMSNorm: divide *hidden* by the RMS of its last dim, then apply *weight*.

    The mean-square is accumulated in float32 for numerical stability, and the
    normalized result is cast back to the input dtype before the learned gain.
    """
    input_dtype = hidden.dtype
    mean_square = hidden.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
    normalized = (hidden * torch.rsqrt(mean_square + eps)).to(input_dtype)
    return normalized * weight
14
+
15
+
16
class MiniCPMRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        MiniCPMRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        # Learned per-channel gain, initialized to identity.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Delegates to the functional rms_layernorm (float32 accumulation).
        return rms_layernorm(hidden_states, self.weight, self.variance_epsilon)
27
+
28
+
29
def rotate_half(x):
    """Rotate the last-dim halves: (a, b) -> (-b, a), as used by RoPE."""
    first_half, second_half = torch.chunk(x, 2, dim=-1)
    return torch.cat((second_half.neg(), first_half), dim=-1)
33
+
34
+
35
def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
    """
    Apply rotary position embeddings to query/key tensors.

    Args:
        q: Tensor(batch_size, num_heads, seq_len, head_dim)
        k: Tensor(batch_size, num_key_value_heads, seq_len, head_dim)
        cos: Tensor(seq_len, head_dim)
        sin: Tensor(seq_len, head_dim)
    Returns:
        Tensor(batch_size, num_heads, seq_len, head_dim), Tensor(batch_size, num_key_value_heads, seq_len, head_dim)
    """
    # Rotate in float32 for precision, then restore the caller's dtype.
    input_dtype = q.dtype
    q32 = q.to(torch.float32)
    k32 = k.to(torch.float32)
    q_rotated = q32 * cos + rotate_half(q32) * sin
    k_rotated = k32 * cos + rotate_half(k32) * sin
    return q_rotated.to(input_dtype), k_rotated.to(input_dtype)
51
+
52
+
53
class MiniCPMLongRoPE(nn.Module):
    """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, config: MiniCPM4Config):
        super().__init__()
        self.config = config
        self.dim = config.kv_channels if config.kv_channels else config.hidden_size // config.num_attention_heads
        self.base = config.rope_theta
        self.max_position_embeddings = config.max_position_embeddings

        self.short_factor = config.rope_scaling.short_factor
        self.long_factor = config.rope_scaling.long_factor
        self.original_max_position_embeddings = config.rope_scaling.original_max_position_embeddings

        # LongRoPE attention-scaling factor derived from the context extension ratio.
        scale = self.max_position_embeddings / self.original_max_position_embeddings
        self.scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        self.max_seq_len_cached = 0

        self.register_buffer("cos_cached", torch.empty(0), persistent=False)
        self.register_buffer("sin_cached", torch.empty(0), persistent=False)

        # Precompute tables for the full supported context length up front.
        self._set_cos_sin_cache(seq_len=self.max_position_embeddings, device=self.inv_freq.device, dtype=torch.float32)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Precompute and cache the cos/sin tables for up to *seq_len* positions."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        # Pick per-frequency rescale factors depending on whether we exceed the
        # original training context length.
        if seq_len > self.original_max_position_embeddings:
            ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=device)
        else:
            ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=device)

        freqs = torch.mul(
            torch.outer(t, 1.0 / ext_factors).to(device=device), self.inv_freq.to(device=device).to(dtype)
        )

        # Duplicate frequencies so the table spans the full head dimension.
        emb = torch.cat((freqs, freqs), dim=-1)

        self.cos_cached = emb.cos().to(dtype) * self.scaling_factor
        self.sin_cached = emb.sin().to(dtype) * self.scaling_factor

    def forward(self, position_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            position_ids: Tensor(seq_len) or Tensor(batch_size, seq_len)
        Returns:
            Tensor(seq_len, head_dim), Tensor(seq_len, head_dim)
        """
        cos = self.cos_cached[position_ids]
        sin = self.sin_cached[position_ids]

        return cos, sin
110
+
111
+
112
class MiniCPMAttention(nn.Module):
    """Multi-head attention with grouped-query KV heads and optional RoPE."""

    def __init__(self, config: MiniCPM4Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        # Per-head width; kv_channels overrides the default hidden/heads split.
        self.head_dim = (
            config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
        )
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = 10000.0

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_emb: Tuple[torch.Tensor, torch.Tensor],
        is_causal: bool,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Full-sequence attention.

        Args:
            hidden_states: (batch, seq_len, hidden_size).
            position_emb: (cos, sin) RoPE tables, or None to skip rotation.
            is_causal: apply a causal mask when True.
        Returns:
            (attention output, this layer's (key, value) pair for cache prefill).
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (B, T, H*D) -> (B, H, T, D)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if position_emb is not None:
            cos, sin = position_emb
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        # ref: https://github.com/pytorch/pytorch/issues/163597
        # there is a bug in MPS for non-contiguous tensors, so we need to make them contiguous
        query_states = query_states.contiguous()
        key_states = key_states.contiguous()
        value_states = value_states.contiguous()
        # enable_gqa lets SDPA expand the fewer KV heads across query heads.
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            is_causal=is_causal,
            enable_gqa=True,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)

        attn_output = self.o_proj(attn_output)

        past_key_value = (key_states, value_states)
        return attn_output, past_key_value

    def forward_step(
        self,
        hidden_states: torch.Tensor,
        position_emb: Tuple[torch.Tensor, torch.Tensor],
        position_id: int,
        kv_cache: Tuple[torch.Tensor, torch.Tensor],
    ) -> torch.Tensor:
        """Single-token incremental attention against a preallocated KV cache.

        Args:
            hidden_states: (batch, hidden_size) — exactly one new position.
            position_emb: (cos, sin) tables indexed at position_id, or None.
            position_id: slot in the cache to write this step's K/V into.
            kv_cache: (key_cache, value_cache) views for this layer.
        """
        bsz, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, 1, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, 1, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, 1, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if position_emb is not None:
            cos, sin = position_emb
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        key_cache, value_cache = kv_cache

        # Write this step's K/V into the cache slot.
        # NOTE(review): key_states is (B, H, 1, D) while the indexed slot is
        # (B, H, D); this relies on broadcasting during assignment — confirm it
        # holds for the batch sizes used at inference.
        key_cache[:, :, position_id, :] = key_states
        value_cache[:, :, position_id, :] = value_states

        # Attend only to positions written so far (<= position_id).
        attn_mask = (torch.arange(key_cache.size(2), device=key_cache.device) <= position_id).view(1, 1, 1, -1)

        # ref: https://github.com/pytorch/pytorch/issues/163597
        # there is a bug in MPS for non-contiguous tensors, so we need to make them contiguous
        query_states = query_states.contiguous()
        key_cache = key_cache.contiguous()
        value_cache = value_cache.contiguous()
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_cache,
            value_cache,
            attn_mask=attn_mask,
            enable_gqa=True,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)

        return attn_output
219
+
220
+
221
class MiniCPMMLP(nn.Module):
    """SwiGLU feed-forward block: down_proj(SiLU(gate_proj(x)) * up_proj(x))."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x):
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
234
+
235
+
236
class MiniCPMDecoderLayer(nn.Module):
    """Pre-norm transformer block: self-attention + SwiGLU MLP, with optional
    muP residual scaling (scale_depth / sqrt(num_hidden_layers))."""

    def __init__(self, config: MiniCPM4Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = MiniCPMAttention(config=config, layer_idx=layer_idx)

        self.mlp = MiniCPMMLP(config)
        self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.scale_depth = config.scale_depth
        self.num_hidden_layers = config.num_hidden_layers
        self.use_mup = config.use_mup

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_emb: Tuple[torch.Tensor, torch.Tensor],
        is_causal: bool,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_emb: (cos, sin) RoPE tables, or None to skip rotation
            is_causal (`bool`): whether the attention mask is causal
        Returns:
            (layer output, this layer's (key, value) pair for cache prefill)
        """
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
        hidden_states, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            position_emb=position_emb,
            is_causal=is_causal,
        )

        # muP: scale each residual branch by scale_depth / sqrt(num_layers).
        if self.use_mup:
            hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
        else:
            hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)

        hidden_states = self.mlp(hidden_states)
        if self.use_mup:
            hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
        else:
            hidden_states = residual + hidden_states

        return hidden_states, present_key_value

    def forward_step(
        self,
        hidden_states: torch.Tensor,
        position_emb: Tuple[torch.Tensor, torch.Tensor],
        position_id: torch.Tensor,
        kv_cache: Tuple[torch.Tensor, torch.Tensor],
    ) -> torch.Tensor:
        """Single-token incremental variant of forward(); writes into kv_cache."""
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
        hidden_states = self.self_attn.forward_step(
            hidden_states=hidden_states,
            position_emb=position_emb,
            position_id=position_id,
            kv_cache=kv_cache,
        )

        if self.use_mup:
            hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
        else:
            hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)

        hidden_states = self.mlp(hidden_states)
        if self.use_mup:
            hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
        else:
            hidden_states = residual + hidden_states

        return hidden_states
321
+
322
+
323
class MiniCPMModel(nn.Module):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]

    Args:
        config: MiniCPMConfig
    """

    def __init__(self, config: MiniCPM4Config):
        super().__init__()
        self.vocab_size = config.vocab_size
        self.config = config

        # vocab_size == 0 means callers feed embeddings directly (no token ids),
        # so the embedding table degenerates to an identity mapping.
        if config.vocab_size > 0:
            self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        else:
            self.embed_tokens = nn.Identity()

        self.layers = nn.ModuleList(
            [MiniCPMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

        self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        if config.no_rope:
            self.rope_emb = None
        else:
            self.rope_emb = MiniCPMLongRoPE(config)

        # Populated by setup_cache() before incremental decoding via forward_step().
        self.kv_cache = None

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        is_causal: bool = True,
    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
        """
        Args:
            inputs_embeds: Tensor(batch_size, seq_length, hidden_size)
            is_causal: bool, whether the attention mask is causal
        Returns:
            hidden_states: Tensor(batch_size, seq_length, hidden_size)
            next_decoder_cache: List[(batch_size, num_heads, seq_length, head_dim), (batch_size, num_heads, seq_length, head_dim)]
        """
        if self.rope_emb is not None:
            position_ids = torch.arange(0, inputs_embeds.size(1), dtype=torch.long, device=inputs_embeds.device)
            position_emb = self.rope_emb(position_ids)
        else:
            position_emb = None
        hidden_states = inputs_embeds

        # Collect each layer's (K, V) so callers can prefill a StaticKVCache.
        next_decoder_cache = []

        for decoder_layer in self.layers:

            hidden_states, this_cache = decoder_layer(
                hidden_states,
                position_emb,
                is_causal,
            )
            next_decoder_cache.append(this_cache)
        hidden_states = self.norm(hidden_states)
        return hidden_states, next_decoder_cache

    def forward_step(
        self,
        inputs_embeds: torch.Tensor,
        position_id: torch.Tensor,
    ) -> torch.Tensor:
        """
        Incremental single-position decode using the preallocated KV cache.

        Args:
            inputs_embeds: Tensor(batch_size, hidden_size)
            position_id: cache slot / RoPE position of this step
        Returns:
            hidden_states: Tensor(batch_size, hidden_size)
        """
        assert self.kv_cache is not None, "KV cache is not setup"

        if self.rope_emb is not None:
            position_emb = self.rope_emb(position_id)
        else:
            position_emb = None
        hidden_states = inputs_embeds

        for i, decoder_layer in enumerate(self.layers):
            hidden_states = decoder_layer.forward_step(
                hidden_states,
                position_emb,
                position_id,
                self.kv_cache.get_layer_cache(i),
            )

        hidden_states = self.norm(hidden_states)
        return hidden_states

    def setup_cache(self, batch_size: int, max_length: int, device, dtype: torch.dtype):
        # Allocate a fresh static KV cache sized for incremental decoding;
        # head width mirrors MiniCPMAttention's kv_channels fallback.
        self.kv_cache = StaticKVCache(
            num_layers=self.config.num_hidden_layers,
            num_kv_heads=self.config.num_key_value_heads,
            dim_kv_head=(
                self.config.hidden_size // self.config.num_attention_heads
                if self.config.kv_channels is None
                else self.config.kv_channels
            ),
            batch_size=batch_size,
            device=device,
            dtype=dtype,
            max_length=max_length,
        )
voxcpm/training/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training utilities for VoxCPM fine-tuning.
3
+
4
+ This package mirrors the training mechanics used in the minicpm-audio
5
+ tooling while relying solely on local audio-text datasets managed via
6
+ the HuggingFace ``datasets`` library.
7
+ """
8
+
9
+ from .accelerator import Accelerator
10
+ from .tracker import TrainingTracker
11
+ from .data import (
12
+ load_audio_text_datasets,
13
+ HFVoxCPMDataset,
14
+ build_dataloader,
15
+ BatchProcessor,
16
+ )
17
+ from .state import TrainingState
18
+
19
+ __all__ = [
20
+ "Accelerator",
21
+ "TrainingTracker",
22
+ "HFVoxCPMDataset",
23
+ "BatchProcessor",
24
+ "TrainingState",
25
+ "load_audio_text_datasets",
26
+ "build_dataloader",
27
+ ]
voxcpm/training/accelerator.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import os
5
+ import random
6
+ import typing
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.distributed as dist
11
+ import torch.utils.data
12
+ from torch.nn.parallel import DistributedDataParallel
13
+
14
+
15
class Accelerator:
    """
    Simplified accelerator that mirrors the behaviour of the minicpm-audio
    training utilities. It initializes a distributed process group when
    ``torchrun`` is used and exposes helpers for AMP, gradient scaling and
    preparing models/dataloaders for DDP.
    """

    def __init__(self, amp: bool = False, seed: int = 42):
        # WORLD_SIZE / LOCAL_RANK are injected by torchrun; defaults give
        # single-process behaviour when launched directly.
        self.world_size = int(os.getenv("WORLD_SIZE", "1"))

        if self.world_size > 1 and not dist.is_initialized():
            dist.init_process_group("nccl", init_method="env://")

        self.rank = dist.get_rank() if dist.is_initialized() else 0
        self.local_rank = int(os.environ.get("LOCAL_RANK", "0"))
        self.amp = amp

        # Set random seed to ensure model initialization consistency
        self._set_seed(seed)

        # No-op stand-in so the training loop can call scaler methods
        # unconditionally even when AMP is disabled or CUDA is absent.
        class DummyScaler:
            def step(self, optimizer):
                optimizer.step()

            def scale(self, loss):
                return loss

            def unscale_(self, optimizer):
                return optimizer

            def update(self):
                pass

        self.scaler = torch.amp.GradScaler("cuda") if (amp and torch.cuda.is_available()) else DummyScaler()
        self.device_ctx = torch.cuda.device(self.local_rank) if torch.cuda.is_available() else None
        self._ddp_model = None  # For no_sync support

    def _set_seed(self, seed: int):
        """Set random seed to ensure model initialization consistency across multiple GPUs"""
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def __enter__(self):
        # Entering the accelerator pins the CUDA device for this process.
        if self.device_ctx is not None:
            self.device_ctx.__enter__()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self.device_ctx is not None:
            self.device_ctx.__exit__(exc_type, exc_value, traceback)

    def barrier(self):
        """Synchronize all processes"""
        if dist.is_initialized():
            dist.barrier()

    def all_reduce(self, tensor: torch.Tensor, op=dist.ReduceOp.AVG):
        """All-reduce tensor across processes"""
        # No-op (returns the tensor unchanged) in single-process runs.
        if dist.is_initialized():
            dist.all_reduce(tensor, op=op)
        return tensor

    # ------------------------------------------------------------------ #
    # Model helpers
    # ------------------------------------------------------------------ #
    def prepare_model(self, model: torch.nn.Module, **kwargs):
        """Move *model* to this process's device and DDP-wrap it when distributed."""
        if hasattr(model, "device"):  # make sure the matrix will be moved to the correct device
            model.device = self.device
        model = model.to(self.device)
        if self.world_size > 1:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
            model = DistributedDataParallel(model, device_ids=[self.local_rank], **kwargs)
            self._ddp_model = model  # Save DDP model reference for no_sync support
        return model

    @contextlib.contextmanager
    def no_sync(self):
        """
        Context manager to skip gradient synchronization during gradient accumulation.
        Only used outside the last micro-batch.
        """
        if self._ddp_model is not None:
            with self._ddp_model.no_sync():
                yield
        else:
            yield

    @property
    def device(self):
        # Preference order: CUDA (per local rank) > Apple MPS > CPU.
        if torch.cuda.is_available():
            return torch.device("cuda", self.local_rank)
        if torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    # ------------------------------------------------------------------ #
    # AMP helpers
    # ------------------------------------------------------------------ #
    def autocast(self, *args, **kwargs):
        """Autocast context honouring the accelerator's ``amp`` flag.

        NOTE(review): the device type is hard-coded to "cuda" — presumably
        intentional since AMP is gated on CUDA availability; confirm for MPS/CPU runs.
        """
        return torch.amp.autocast("cuda", enabled=self.amp, *args, **kwargs)

    def backward(self, loss: torch.Tensor):
        """Backward pass through the (possibly no-op) grad scaler."""
        self.scaler.scale(loss).backward()

    def step(self, optimizer: torch.optim.Optimizer):
        """Optimizer step through the (possibly no-op) grad scaler."""
        self.scaler.step(optimizer)

    def update(self):
        """Update the grad scaler's scale factor (no-op without AMP)."""
        self.scaler.update()

    # ------------------------------------------------------------------ #
    # Data helpers
    # ------------------------------------------------------------------ #
    def prepare_dataloader(
        self,
        dataset: typing.Iterable,
        *,
        batch_size: int,
        num_workers: int = 0,
        shuffle: bool = True,
        collate_fn=None,
        drop_last: bool = False,
    ) -> torch.utils.data.DataLoader:
        """Build a DataLoader, attaching a DistributedSampler in multi-process runs."""
        if self.world_size > 1:
            # The sampler owns shuffling in distributed mode; the DataLoader
            # itself must then be created with shuffle=False.
            sampler = torch.utils.data.distributed.DistributedSampler(
                dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle
            )
            shuffle = False
        else:
            sampler = None

        return torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle if sampler is None else False,
            sampler=sampler,
            num_workers=num_workers,
            collate_fn=collate_fn,
            drop_last=drop_last,
            pin_memory=True,
        )

    @staticmethod
    def unwrap(model: torch.nn.Module) -> torch.nn.Module:
        """Return the underlying module of a DDP-wrapped model (identity otherwise)."""
        return model.module if hasattr(model, "module") else model
voxcpm/training/config.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argbind
4
+ import yaml
5
+ from pathlib import Path
6
+ from typing import Dict, Any
7
+
8
+
9
def load_yaml_config(path: str | Path) -> Dict[str, Any]:
    """
    Read *path* as YAML and return its top-level mapping for argbind.

    Raises:
        ValueError: if the file's top level is not a mapping (e.g. a list,
            scalar, or an empty document).
    """
    path = Path(path)
    data = yaml.safe_load(path.read_text(encoding="utf-8"))
    if isinstance(data, dict):
        return data
    raise ValueError(f"Configuration file {path} must contain a top-level mapping.")
19
+
20
+
21
def parse_args_with_config(config_path: str | Path | None = None):
    """
    Helper to unify CLI arguments and YAML configuration.

    CLI arguments are parsed first; when *config_path* is given, the YAML
    values are parsed inside the CLI argbind scope and then merged into the
    CLI namespace. NOTE(review): because of ``cli_args.update(yaml_args)``,
    YAML-derived entries overwrite CLI duplicates — confirm that precedence
    is intended.

    Usage mirrors minicpm-audio:
        args = parse_args_with_config("conf/voxcpm/finetune.yml")
        with argbind.scope(args):
            ...
    """
    cli_args = argbind.parse_args()
    if config_path is None:
        return cli_args

    yaml_args = load_yaml_config(config_path)
    with argbind.scope(cli_args):
        # Re-parse with empty argv so only the YAML values contribute here.
        yaml_args = argbind.parse_args(yaml_args=yaml_args, argv=[])
    cli_args.update(yaml_args)
    return cli_args
voxcpm/training/data.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Dict, List, Optional, Tuple
3
+
4
+ import argbind
5
+ import torch
6
+ from datasets import Audio, Dataset, DatasetDict, load_dataset
7
+ from torch.utils.data import Dataset as TorchDataset
8
+
9
+ from ..model.voxcpm import VoxCPMConfig
10
+ from ..modules.audiovae import AudioVAE
11
+ from .packers import AudioFeatureProcessingPacker
12
+
13
+ DEFAULT_TEXT_COLUMN = "text"
14
+ DEFAULT_AUDIO_COLUMN = "audio"
15
+ DEFAULT_ID_COLUMN = "dataset_id"
16
+
17
+
18
@argbind.bind()
def load_audio_text_datasets(
    train_manifest: str,
    val_manifest: str = "",
    text_column: str = DEFAULT_TEXT_COLUMN,
    audio_column: str = DEFAULT_AUDIO_COLUMN,
    dataset_id_column: str = DEFAULT_ID_COLUMN,
    sample_rate: int = 16_000,
    num_proc: int = 1,
) -> Tuple[Dataset, Optional[Dataset]]:
    """
    Load JSON manifests into HuggingFace datasets with a normalized schema.

    Each returned dataset exposes the canonical columns ``text``, ``audio``
    (decoded on the fly at *sample_rate*) and ``dataset_id`` (filled with 0
    when the manifest has no such column).

    Args:
        train_manifest: path to the training manifest (JSON/JSON-lines).
        val_manifest: optional validation manifest; empty string skips it.
        text_column / audio_column / dataset_id_column: source column names,
            renamed to the canonical defaults when they differ.
        sample_rate: target sampling rate for audio decoding.
        num_proc: NOTE(review) accepted but never used below — confirm intent.

    Returns:
        (train_dataset, validation_dataset_or_None)
    """
    data_files = {"train": train_manifest}
    if val_manifest:
        data_files["validation"] = val_manifest

    dataset_dict: DatasetDict = load_dataset("json", data_files=data_files)

    def prepare(ds: Dataset) -> Dataset:
        # Normalize one split to the canonical column layout.
        if audio_column not in ds.column_names:
            raise ValueError(f"Expected '{audio_column}' column in manifest.")
        # We cast to Audio to ensure proper handling during training,
        # but for length calculation we might need raw path or duration if available.
        # HF datasets usually don't compute duration automatically for 'Audio' column.
        ds = ds.cast_column(audio_column, Audio(sampling_rate=sample_rate))
        if audio_column != DEFAULT_AUDIO_COLUMN:
            ds = ds.rename_column(audio_column, DEFAULT_AUDIO_COLUMN)
        if text_column != DEFAULT_TEXT_COLUMN:
            ds = ds.rename_column(text_column, DEFAULT_TEXT_COLUMN)
        if dataset_id_column and dataset_id_column in ds.column_names:
            if dataset_id_column != DEFAULT_ID_COLUMN:
                ds = ds.rename_column(dataset_id_column, DEFAULT_ID_COLUMN)
        else:
            # Manifest without dataset ids: treat everything as dataset 0.
            ds = ds.add_column(DEFAULT_ID_COLUMN, [0] * len(ds))
        return ds

    train_ds = prepare(dataset_dict["train"])
    val_ds = prepare(dataset_dict["validation"]) if "validation" in dataset_dict else None
    return train_ds, val_ds
55
+
56
+
57
+ def compute_sample_lengths(
58
+ ds: Dataset,
59
+ audio_vae_fps: int = 25,
60
+ patch_size: int = 1,
61
+ ) -> List[int]:
62
+ """
63
+ 预估每个样本经过 packer 之后的大致序列长度(text+audio),用于过滤超长样本。
64
+
65
+ 逻辑与 AudioFeatureProcessingPacker / AudioVAE 一致:
66
+ - 文本长度: len(text_ids)
67
+ - 音频长度:
68
+ duration(s) * audio_vae_fps -> 近似 VAE 帧数 t_vae
69
+ t_seq = ceil(t_vae / patch_size)
70
+ - 序列总长约为: text_len + t_seq + 2
71
+
72
+ Optimized: Use batch column access instead of iterating item by item.
73
+ """
74
+ # Batch access columns - much faster than per-item access
75
+ text_ids_list = ds["text_ids"]
76
+ text_lens = [len(t) for t in text_ids_list]
77
+
78
+ has_duration = "duration" in ds.column_names
79
+ if has_duration:
80
+ durations = ds["duration"]
81
+ else:
82
+ # Fallback: need to compute from audio (slow, but unavoidable without duration column)
83
+ durations = []
84
+ for i in range(len(ds)):
85
+ audio = ds[i][DEFAULT_AUDIO_COLUMN]
86
+ durations.append(len(audio["array"]) / float(audio["sampling_rate"]))
87
+
88
+ # Vectorized length computation
89
+ lengths = []
90
+ for text_len, duration in zip(text_lens, durations):
91
+ t_vae = math.ceil(float(duration) * audio_vae_fps)
92
+ t_seq = math.ceil(t_vae / patch_size)
93
+ total_len = text_len + t_seq + 2
94
+ lengths.append(total_len)
95
+
96
+ return lengths
97
+
98
+
99
class HFVoxCPMDataset(TorchDataset):
    """
    PyTorch-facing adapter over a tokenized HuggingFace dataset that yields
    plain-Python samples ready for ``collate_fn``.
    """

    def __init__(self, dataset: Dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx: int):
        row = self.dataset[idx]
        clip = row[DEFAULT_AUDIO_COLUMN]
        return {
            "text_ids": row["text_ids"],
            "audio_array": clip["array"],
            "audio_sampling_rate": clip["sampling_rate"],
            "dataset_id": row.get(DEFAULT_ID_COLUMN, 0),
            "is_prompt": row.get("is_prompt", False),
        }

    @staticmethod
    def pad_sequences(seqs: List[torch.Tensor], pad_value: float):
        """Right-pad 1-D tensors to a common length and stack into [B, T]."""
        if not seqs:
            return torch.empty(0)
        target = max(seq.shape[0] for seq in seqs)
        rows = []
        for seq in seqs:
            short_by = target - seq.shape[0]
            if short_by > 0:
                seq = torch.nn.functional.pad(seq, (0, short_by), value=pad_value)
            rows.append(seq)
        return torch.stack(rows)

    @classmethod
    def collate_fn(cls, batch: List[Dict]):
        """Pad text (-100) and audio (-100.0) and assemble the batch dict."""
        texts = cls.pad_sequences(
            [torch.tensor(sample["text_ids"], dtype=torch.int32) for sample in batch],
            pad_value=-100,
        )
        audios = cls.pad_sequences(
            [torch.tensor(sample["audio_array"], dtype=torch.float32) for sample in batch],
            pad_value=-100.0,
        )
        return {
            "text_tokens": texts,
            "audio_tokens": audios,
            # Only the TTS task (id 1) exists in this pipeline.
            "task_ids": torch.ones(texts.size(0), dtype=torch.int32),
            "dataset_ids": torch.tensor([sample["dataset_id"] for sample in batch], dtype=torch.int32),
            "is_prompts": [bool(sample.get("is_prompt", False)) for sample in batch],
        }
153
+
154
+
155
class BatchProcessor:
    """
    Thin adapter around ``AudioFeatureProcessingPacker`` so the training
    loop can hand over collated batches directly, mirroring the
    minicpm-audio mechanics.
    """

    def __init__(
        self,
        *,
        config: VoxCPMConfig,
        audio_vae: AudioVAE,
        dataset_cnt: int,
        device: torch.device,
    ):
        self.device = device
        self.dataset_cnt = dataset_cnt
        # Keep the VAE on the training device; the packer encodes audio with it.
        self.audio_vae = audio_vae
        self.audio_vae.to(device)
        self.packer = AudioFeatureProcessingPacker(
            dataset_cnt=dataset_cnt,
            max_len=config.max_length,
            patch_size=config.patch_size,
            feat_dim=config.feat_dim,
            audio_vae=self.audio_vae,
        )

    def __call__(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Move the collated tensors onto the device and pack them."""
        on_device = {
            key: batch[key].to(self.device)
            for key in ("audio_tokens", "text_tokens", "task_ids", "dataset_ids")
        }
        return self.packer(
            audio_tokens=on_device["audio_tokens"],
            text_tokens=on_device["text_tokens"],
            task_ids=on_device["task_ids"],
            dataset_ids=on_device["dataset_ids"],
            is_prompts=batch["is_prompts"],
        )
195
+
196
+
197
def build_dataloader(
    hf_dataset: Dataset,
    *,
    accelerator,
    batch_size: int,
    num_workers: int,
    drop_last: bool = False,
) -> torch.utils.data.DataLoader:
    """
    Wrap *hf_dataset* in :class:`HFVoxCPMDataset` and let the accelerator
    build a padding-based DataLoader (attaching a DistributedSampler when
    running multi-process).
    """
    wrapped = HFVoxCPMDataset(hf_dataset)
    return accelerator.prepare_dataloader(
        wrapped,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
        collate_fn=HFVoxCPMDataset.collate_fn,
        drop_last=drop_last,
    )
voxcpm/training/packers.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from einops import rearrange
6
+
7
+
8
class AudioFeatureProcessingPacker:
    """
    Adapted from the minicpm-audio training utilities. It converts raw text and
    audio tokens into the packed multimodal representation required by VoxCPM.
    """

    def __init__(self, dataset_cnt: int, max_len: int, patch_size: int, feat_dim: int, audio_vae: nn.Module):
        # Special text-token ids delimiting the audio region of a sequence.
        self.audio_start_id = 101
        self.audio_end_id = 102
        # unused now
        self.audio_prompt_start_id = 103
        self.audio_prompt_end_id = 104
        self.text_eos_token_id = 2

        self.patch_size = patch_size
        # Raw waveform samples covered by one packed audio patch.
        self.patch_len = audio_vae.hop_length * self.patch_size
        self.feat_dim = feat_dim
        self.dataset_cnt = max(dataset_cnt, 1)
        self.max_len = max_len

        self.audio_vae = audio_vae

        # Task dispatch tables (only the TTS task exists at the moment).
        self.process_functions = {"tts": self.process_tts_data}
        self.task_id_map = {"tts": 1}
        self.id_to_task = {idx: usage for usage, idx in self.task_id_map.items()}

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _first_pad_position(tokens: torch.Tensor):
        # Index of the first -100 padding element, or None when unpadded.
        positions = (tokens == -100).nonzero(as_tuple=True)
        if positions[0].numel() == 0:
            return None
        return int(positions[0][0])

    def unpad_text_tokens(self, tokens: torch.Tensor):
        # Strip trailing -100 padding from a 1-D text-token tensor.
        pad_pos = self._first_pad_position(tokens)
        return tokens if pad_pos is None else tokens[:pad_pos]

    def unpad_audio_tokens(self, tokens: torch.Tensor):
        # Same logic as unpad_text_tokens; kept separate for call-site clarity.
        pad_pos = self._first_pad_position(tokens)
        return tokens if pad_pos is None else tokens[:pad_pos]

    def encode_audio(self, wav: torch.Tensor):
        """
        Encode raw waveform into latent features using AudioVAE.

        AudioVAE.encode expects shape [B, 1, T'] and returns [B, D, T].
        We then transpose to [B, T, D] to match downstream expectations.
        """
        wav = wav.unsqueeze(0)  # [1, T]
        wav = wav.unsqueeze(1)  # [1, 1, T]
        wav_len = wav.size(-1)
        if wav_len % self.patch_len != 0:
            # Zero-pad so the waveform is an exact multiple of the patch length.
            padding_size = self.patch_len - wav_len % self.patch_len
            wav = torch.nn.functional.pad(wav, (0, padding_size))

        with torch.no_grad():
            z = self.audio_vae.encode(wav, self.audio_vae.sample_rate)  # [1, D, T']
        feat = z.transpose(1, 2)  # [1, T', D]
        return feat

    # ------------------------------------------------------------------ #
    # Main entry point
    # ------------------------------------------------------------------ #
    def __call__(
        self,
        audio_tokens: torch.Tensor,
        text_tokens: torch.Tensor,
        task_ids: torch.Tensor,
        dataset_ids: torch.Tensor,
        is_prompts: List[bool],
    ) -> Dict[str, torch.Tensor]:
        """
        Padding-based batching: each sample in the input batch is processed
        independently and then padded to a common length (capped by ``max_len``).
        The result tensors all have shape [B, T, ...].
        """
        device = audio_tokens.device
        max_dataset_id = int(dataset_ids.max().item()) if dataset_ids.numel() > 0 else -1
        dataset_cnt = max(self.dataset_cnt, max_dataset_id + 1)

        text_tokens_list: List[torch.Tensor] = []
        audio_feats_list: List[torch.Tensor] = []
        text_mask_list: List[torch.Tensor] = []
        audio_mask_list: List[torch.Tensor] = []
        loss_mask_list: List[torch.Tensor] = []
        labels_list: List[torch.Tensor] = []
        audio_task_ids_list: List[torch.Tensor] = []
        audio_dataset_ids_list: List[torch.Tensor] = []
        lengths: List[int] = []

        # Per-dataset bookkeeping of consumed audio seconds / text tokens.
        audio_duration_consumed = torch.zeros(dataset_cnt, dtype=torch.float32, device=device)
        text_token_consumed = torch.zeros(dataset_cnt, dtype=torch.float32, device=device)

        for audio_token, text_token, task_id, dataset_idx, is_prompt in zip(
            audio_tokens, text_tokens, task_ids.tolist(), dataset_ids.tolist(), is_prompts
        ):
            unpad_audio_token = self.unpad_audio_tokens(audio_token).to(torch.float32)
            unpad_text_token = self.unpad_text_tokens(text_token)
            usage = self.id_to_task[task_id]

            (
                packed_text,
                audio_feat,
                text_mask,
                audio_mask,
                loss_mask,
                labels,
                audio_duration,
                text_token_count,
            ) = self.process_functions[usage](unpad_audio_token, unpad_text_token, is_prompt)

            audio_duration_consumed[dataset_idx] += audio_duration
            text_token_consumed[dataset_idx] += text_token_count

            # Per-position task id (0 on non-audio positions).
            audio_task_id = torch.zeros_like(audio_mask)
            audio_task_id[audio_mask == 1] = self.task_id_map[usage]

            # Per-position dataset id, shifted by +1 so 0 means "not audio".
            audio_dataset_id = torch.zeros_like(audio_mask)
            audio_dataset_id[audio_mask == 1] = dataset_idx + 1

            text_tokens_list.append(packed_text)
            text_mask_list.append(text_mask)
            audio_feats_list.append(audio_feat)
            audio_mask_list.append(audio_mask)
            loss_mask_list.append(loss_mask)
            labels_list.append(labels)
            audio_task_ids_list.append(audio_task_id)
            audio_dataset_ids_list.append(audio_dataset_id)
            lengths.append(packed_text.shape[0])

        # Determine padded length per batch (cap by self.max_len)
        if lengths:
            max_len = min(self.max_len, max(lengths))
        else:
            max_len = self.max_len

        def pad_1d(x: torch.Tensor, pad_value: int = 0) -> torch.Tensor:
            # Right-pad (or truncate) a 1-D tensor to max_len.
            if x.size(0) >= max_len:
                return x[:max_len]
            pad = torch.full((max_len - x.size(0),), pad_value, dtype=x.dtype, device=x.device)
            return torch.cat([x, pad], dim=0)

        def pad_3d(x: torch.Tensor) -> torch.Tensor:
            # x: [T, P, D]
            if x.size(0) >= max_len:
                return x[:max_len]
            pad = torch.zeros((max_len - x.size(0),) + x.shape[1:], dtype=x.dtype, device=x.device)
            return torch.cat([x, pad], dim=0)

        if lengths:
            text_tokens_batch = torch.stack([pad_1d(t, pad_value=0) for t in text_tokens_list], dim=0)
            text_mask_batch = torch.stack([pad_1d(m, pad_value=0) for m in text_mask_list], dim=0)
            audio_feats_batch = torch.stack([pad_3d(f) for f in audio_feats_list], dim=0)
            audio_mask_batch = torch.stack([pad_1d(m, pad_value=0) for m in audio_mask_list], dim=0)
            loss_mask_batch = torch.stack([pad_1d(m, pad_value=0) for m in loss_mask_list], dim=0)
            labels_batch = torch.stack([pad_1d(lbl, pad_value=0) for lbl in labels_list], dim=0)
            audio_task_ids_batch = torch.stack([pad_1d(t, pad_value=0) for t in audio_task_ids_list], dim=0)
            audio_dataset_ids_batch = torch.stack([pad_1d(d, pad_value=0) for d in audio_dataset_ids_list], dim=0)

            # Position ids: [B, T], simple 0..L_i-1 then padded with 0
            position_ids_list = []
            for L in lengths:
                L_clip = min(L, max_len)
                pos = torch.arange(0, L_clip, device=device)
                if L_clip < max_len:
                    pad = torch.zeros(max_len - L_clip, dtype=pos.dtype, device=device)
                    pos = torch.cat([pos, pad], dim=0)
                position_ids_list.append(pos)
            position_ids = torch.stack(position_ids_list, dim=0)
        else:
            # Empty batch fallback (shouldn't really happen)
            text_tokens_batch = torch.zeros((0, self.max_len), dtype=torch.int32, device=device)
            text_mask_batch = torch.zeros_like(text_tokens_batch)
            audio_feats_batch = torch.zeros(
                (0, self.max_len, self.patch_size, self.feat_dim), dtype=torch.float32, device=device
            )
            audio_mask_batch = torch.zeros_like(text_tokens_batch)
            loss_mask_batch = torch.zeros_like(text_tokens_batch)
            labels_batch = torch.zeros_like(text_tokens_batch)
            audio_task_ids_batch = torch.zeros_like(text_tokens_batch)
            audio_dataset_ids_batch = torch.zeros_like(text_tokens_batch)
            position_ids = torch.zeros_like(text_tokens_batch)

        audio_duration_consumed = audio_duration_consumed.to(torch.long)
        text_token_consumed = text_token_consumed.to(torch.long)

        return {
            "text_tokens": text_tokens_batch,
            "audio_feats": audio_feats_batch,
            "text_mask": text_mask_batch,
            "audio_mask": audio_mask_batch,
            "loss_mask": loss_mask_batch,
            "position_ids": position_ids,
            "labels": labels_batch,
            "audio_task_ids": audio_task_ids_batch,
            "audio_dataset_ids": audio_dataset_ids_batch,
            "audio_duration_consumed": audio_duration_consumed,
            "text_token_consumed": text_token_consumed,
        }

    # ------------------------------------------------------------------ #
    # Feature extraction helpers
    # ------------------------------------------------------------------ #
    def extract_audio_feats(self, audio_data: torch.Tensor):
        """Encode audio, pad to a patch multiple, and group frames into [B, T, P, D]."""
        audio_feats = self.encode_audio(audio_data)
        if audio_feats.size(1) % self.patch_size != 0:
            audio_feats_ = audio_feats.transpose(1, 2)
            padding = nn.functional.pad(audio_feats_, (0, self.patch_size - audio_feats.size(1) % self.patch_size))
            audio_feats = padding.transpose(1, 2)

        # NOTE(review): 25 is presumably the VAE frame rate in fps — confirm
        # against the AudioVAE configuration.
        audio_duration = audio_feats.size(1) / 25
        audio_feats = rearrange(audio_feats, "b (t p) c -> b t p c", p=self.patch_size)
        return audio_feats, audio_duration

    def process_tts_data(self, audio_token: torch.Tensor, text_token: torch.Tensor, is_prompt: bool = False):
        """
        Pack one TTS sample into the interleaved layout:
        [text..., audio_start, audio..., audio_end] with matching
        text/audio/loss masks and an end-of-audio label at position -2.
        Prompt samples use the (currently unused) prompt marker ids and are
        excluded from the loss.
        """
        text_token_info = torch.cat(
            [
                text_token,
                torch.tensor(
                    [self.audio_prompt_start_id if is_prompt else self.audio_start_id],
                    dtype=torch.int32,
                    device=text_token.device,
                ),
            ],
            dim=-1,
        )
        text_token_count = len(text_token)
        text_length = text_token_info.shape[0]
        audio_feat_info, audio_duration = self.extract_audio_feats(audio_token)
        audio_feat_info = audio_feat_info.squeeze(0)
        audio_length = audio_feat_info.shape[0]

        # Text stream: real tokens + start marker, zeros under the audio
        # region, then the end marker.
        text_pad_token = torch.zeros(audio_length, dtype=torch.int32, device=text_token.device)
        text_token_info = torch.cat(
            [
                text_token_info,
                text_pad_token,
                torch.tensor(
                    [self.audio_prompt_end_id if is_prompt else self.audio_end_id],
                    dtype=torch.int32,
                    device=text_token.device,
                ),
            ]
        )
        # Audio stream: zero patches under the text region, real features,
        # and one zero patch under the trailing end marker.
        audio_pad_feat = torch.zeros(
            (text_length, self.patch_size, audio_feat_info.size(-1)),
            dtype=torch.float32,
            device=text_token.device,
        )
        audio_feat_info = torch.cat([audio_pad_feat, audio_feat_info, audio_pad_feat[0:1, ...]], dim=0)

        text_mask = (
            torch.cat([torch.ones(text_length), torch.zeros(audio_length), torch.ones(1)])
            .type(torch.int32)
            .to(text_token.device)
        )
        audio_mask = (
            torch.cat([torch.zeros(text_length), torch.ones(audio_length), torch.zeros(1)])
            .type(torch.int32)
            .to(text_token.device)
        )
        # Loss only on the audio region, and never on prompt samples.
        loss_mask = (
            torch.cat(
                [
                    torch.zeros(text_length),
                    torch.zeros(audio_length) if is_prompt else torch.ones(audio_length),
                    torch.zeros(1),
                ]
            )
            .type(torch.int32)
            .to(text_token.device)
        )

        # Binary stop labels: 1 marks the final audio position.
        labels = torch.zeros(text_length + audio_length + 1).type(torch.int32).to(text_token.device)
        labels[-2] = 1

        return (
            text_token_info,
            audio_feat_info,
            text_mask,
            audio_mask,
            loss_mask,
            labels,
            audio_duration,
            text_token_count,
        )
voxcpm/training/state.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
@dataclass
class TrainingState:
    """
    Container that mirrors the object returned in the minicpm-audio training
    loop. It holds persistent references to the model, optimizer, scheduler,
    dataloaders and tracker.
    """

    # The model being trained (possibly DDP-wrapped by the Accelerator).
    generator: object
    # Optimizer driving `generator`.
    optimizer: object
    # Learning-rate scheduler stepped alongside the optimizer.
    scheduler: object
    # DataLoader over the training split.
    train_loader: object
    # DataLoader over the validation split (presumably None when no
    # validation manifest was given — confirm at the construction site).
    val_loader: object
    # TrainingTracker used for step counting and logging.
    tracker: object
    # BatchProcessor that packs collated batches into model inputs.
    batch_processor: object
voxcpm/training/tracker.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import sys
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Dict, Optional
8
+
9
+
10
class TrainingTracker:
    """
    Lightweight tracker inspired by the minicpm-audio training workflow.

    Keeps the current global step, prints rank-aware messages, optionally
    mirrors scalar metrics to a TensorBoard-style writer, and appends every
    printed line to a logfile for later inspection.
    """

    def __init__(
        self,
        *,
        writer=None,
        log_file: Optional[str] = None,
        rank: int = 0,
    ):
        self.writer = writer
        self.log_file = Path(log_file) if log_file else None
        if self.log_file:
            self.log_file.parent.mkdir(parents=True, exist_ok=True)
        self.rank = rank
        self.step = 0
        # Timestamp of the previous metrics log, used to report the interval.
        self._last_log_time: float | None = None

    # ------------------------------------------------------------------ #
    # Logging helpers
    # ------------------------------------------------------------------ #
    def print(self, message: str):
        """Emit *message* on rank 0 only (stderr plus the optional logfile)."""
        if self.rank != 0:
            return
        print(message, flush=True, file=sys.stderr)
        if self.log_file:
            with self.log_file.open("a", encoding="utf-8") as handle:
                handle.write(message + "\n")

    def log_metrics(self, metrics: Dict[str, float], split: str):
        """Log a metrics dict for *split* at the current step (rank 0 only)."""
        if self.rank != 0:
            return
        now = time.time()
        dt_str = ""
        if self._last_log_time is not None:
            dt = now - self._last_log_time
            dt_str = f", log interval: {dt:.2f}s"
        self._last_log_time = now

        formatted = ", ".join(f"{k}: {v:.6f}" for k, v in metrics.items())
        self.print(f"[{split}] step {self.step}: {formatted}{dt_str}")
        if self.writer is not None:
            # Only forward plain scalars to the writer.
            for key, value in metrics.items():
                if isinstance(value, (int, float)):
                    self.writer.add_scalar(f"{split}/{key}", value, self.step)

    def done(self, split: str, message: str):
        """Announce the end of a phase for *split*."""
        self.print(f"[{split}] {message}")

    # ------------------------------------------------------------------ #
    # State dict
    # ------------------------------------------------------------------ #
    def state_dict(self):
        return {"step": self.step}

    def load_state_dict(self, state):
        self.step = int(state.get("step", 0))

    # ------------------------------------------------------------------ #
    # Context manager compatibility (for parity with minicpm-audio code)
    # ------------------------------------------------------------------ #
    @contextlib.contextmanager
    def live(self):
        yield
voxcpm/utils/text_normalize.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # some functions are copied from https://github.com/FunAudioLLM/CosyVoice/blob/main/cosyvoice/utils/frontend_utils.py
2
+ import re
3
+ import regex
4
+ import inflect
5
+ from wetext import Normalizer
6
+
7
+ chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]+")
8
+
9
+
10
def contains_chinese(text):
    """Return True if *text* contains at least one CJK unified ideograph."""
    return re.search(r"[\u4e00-\u9fff]", text) is not None
13
+
14
+
15
def replace_corner_mark(text):
    """Replace math/corner symbols with their spoken Chinese equivalents."""
    # All keys are single characters, so a one-pass translate is equivalent
    # to the chained str.replace calls.
    table = str.maketrans({
        "²": "平方",
        "³": "立方",
        "√": "根号",
        "≈": "约等于",
        "<": "小于",
    })
    return text.translate(table)
23
+
24
+
25
def remove_bracket(text):
    """Strip bracket/backtick/dash symbols that carry no spoken content."""
    # The original applied the backtick replacement twice; once is equivalent.
    for src, dst in (
        ("(", " "),
        (")", " "),
        ("【", " "),
        ("】", " "),
        ("`", ""),
        ("——", " "),
    ):
        text = text.replace(src, dst)
    return text
32
+
33
+
34
def spell_out_number(text: str, inflect_parser):
    """Replace each maximal run of digit characters with its spelled-out form.

    Args:
        text: Input string.
        inflect_parser: Object exposing ``number_to_words(str) -> str``.

    Returns:
        str: *text* with digit runs (per ``str.isdigit``) spelled out.
    """
    pieces = []
    run_start = None  # index where the current digit run began, if any
    for idx, ch in enumerate(text):
        if ch.isdigit():
            if run_start is None:
                run_start = idx
            continue
        # Non-digit: flush any pending digit run, then keep the character.
        if run_start is not None:
            pieces.append(inflect_parser.number_to_words(text[run_start:idx]))
            run_start = None
        pieces.append(ch)
    # Flush a digit run that reaches the end of the string.
    if run_start is not None:
        pieces.append(inflect_parser.number_to_words(text[run_start:]))
    return "".join(pieces)
52
+
53
+
54
# Split a paragraph into sentence-sized utterances:
# 1. each utterance is at most token_max_n units; a short trailing piece
#    (< merge_len) is merged into the previous utterance
# 2. length is counted in characters for zh and in tokens otherwise
# 3. split points are sentence-final punctuation (optionally commas)
def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False):
    """Split *text* into a list of utterances bounded by punctuation.

    Args:
        text: Input paragraph.
        tokenize: Callable used to measure length for non-zh text.
        lang: "zh" counts characters; anything else counts tokens.
        token_max_n: Soft maximum utterance length.
        token_min_n: Minimum accumulated length before a split is allowed.
        merge_len: A trailing piece shorter than this merges backwards.
        comma_split: Also split on (half/full-width) commas.

    Returns:
        list[str]: Utterances. NOTE(review): text after the final punctuation
        mark is silently dropped (matches the upstream CosyVoice helper) —
        confirm this is intended.
    """

    def calc_utt_length(_text: str):
        # zh length is the character count; otherwise tokenizer output length.
        if lang == "zh":
            return len(_text)
        return len(tokenize(_text))

    def should_merge(_text: str):
        if lang == "zh":
            return len(_text) < merge_len
        return len(tokenize(_text)) < merge_len

    if lang == "zh":
        pounc = ["。", "?", "!", ";", ":", "、", ".", "?", "!", ";"]
    else:
        pounc = [".", "?", "!", ";", ":"]
    if comma_split:
        pounc.extend([",", ","])

    st = 0
    utts = []
    for i, c in enumerate(text):
        if c in pounc:
            if len(text[st:i]) > 0:
                utts.append(text[st:i] + c)
            # Attach a closing quote right after the punctuation to the
            # utterance it terminates. Guard on ``utts`` being non-empty:
            # the original unconditionally popped, crashing with IndexError
            # when a quote followed punctuation before any utterance existed.
            if utts and i + 1 < len(text) and text[i + 1] in ['"', "”"]:
                utts[-1] = utts[-1] + text[i + 1]
                st = i + 2
            else:
                st = i + 1

    # Fallback: no punctuation at all — treat the whole text as one utterance.
    if len(utts) == 0:
        if lang == "zh":
            utts.append(text + "。")
        else:
            utts.append(text + ".")

    # Greedily pack utterances up to token_max_n, only cutting once the
    # accumulated chunk already exceeds token_min_n.
    final_utts = []
    cur_utt = ""
    for utt in utts:
        if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
            final_utts.append(cur_utt)
            cur_utt = ""
        cur_utt = cur_utt + utt
    if len(cur_utt) > 0:
        if should_merge(cur_utt) and len(final_utts) != 0:
            # Merge a short tail into the previous chunk.
            final_utts[-1] = final_utts[-1] + cur_utt
        else:
            final_utts.append(cur_utt)

    return final_utts
108
+
109
+
110
# remove blank between chinese characters
def replace_blank(text: str):
    """Drop spaces unless both neighbors are non-space ASCII characters.

    Keeps word-separating spaces in Latin text ("hello world") while removing
    spaces between CJK characters ("你好 世界" -> "你好世界").

    Fixes two defects in the original: a space at the end of the string
    raised IndexError (text[i + 1] out of range), and a space at the start
    wrapped around via text[i - 1] to the *last* character.
    """
    out_chars = []
    n = len(text)
    for i, c in enumerate(text):
        if c == " ":
            prev_ok = i > 0 and text[i - 1].isascii() and text[i - 1] != " "
            next_ok = i + 1 < n and text[i + 1].isascii() and text[i + 1] != " "
            if prev_ok and next_ok:
                out_chars.append(c)
        else:
            out_chars.append(c)
    return "".join(out_chars)
120
+
121
+
122
def clean_markdown(md_text: str) -> str:
    """Strip common Markdown/HTML markup, keeping the readable text."""
    # (pattern, replacement, flags) applied in order; order matters — e.g.
    # fenced code blocks must go before inline code, images before links.
    rules = (
        (r"```.*?```", "", re.DOTALL),         # fenced code blocks (multi-line)
        (r"`[^`]*`", "", 0),                   # inline code
        (r"!\[[^\]]*\]\([^\)]+\)", "", 0),     # images ![alt](url)
        (r"\[([^\]]+)\]\([^)]+\)", r"\1", 0),  # links [text](url) -> text
        (r"^(\s*)-\s+", r"\1", re.MULTILINE),  # unordered-list bullets
        (r"<[^>]+>", "", 0),                   # HTML tags
        (r"^#{1,6}\s*", "", re.MULTILINE),     # heading markers (#)
        (r"\n\s*\n", "\n", 0),                 # collapse blank lines
    )
    for pattern, repl, flags in rules:
        md_text = re.sub(pattern, repl, md_text, flags=flags)
    return md_text.strip()
149
+
150
+
151
def clean_text(text):
    """Normalize free text for TTS: drop Markdown, emoji and line breaks."""
    # Strip Markdown/HTML markup first.
    text = clean_markdown(text)
    # Remove emoji (with or without the VS-16 presentation selector).
    emoji_re = regex.compile(r"\p{Emoji_Presentation}|\p{Emoji}\uFE0F", flags=regex.UNICODE)
    text = emoji_re.sub("", text)
    # Collapse line breaks/tabs and unify CJK double quotes.
    for src, dst in (("\n", " "), ("\t", " "), ("“", '"'), ("”", '"')):
        text = text.replace(src, dst)
    return text
161
+
162
+
163
class TextNormalizer:
    """Text frontend: cleans and normalizes zh/en text for TTS synthesis."""

    def __init__(self, tokenizer=None):
        """Build wetext rule-based normalizers for zh/en plus an inflect engine.

        Args:
            tokenizer: Optional tokenizer; stored but not used in this class.
        """
        self.tokenizer = tokenizer
        # wetext text-normalization ("tn") models, one per language.
        self.zh_tn_model = Normalizer(lang="zh", operator="tn", remove_erhua=True)
        self.en_tn_model = Normalizer(lang="en", operator="tn")
        # inflect engine spells out Arabic numerals for English text.
        self.inflect_parser = inflect.engine()

    def normalize(self, text, split=False):
        """Clean and normalize *text*; language is auto-detected per call.

        NOTE(review): when ``split`` is True this method falls off the end
        and implicitly returns None — confirm whether a sentence-splitting
        path (e.g. via split_paragraph) was intended here.
        """
        # Strip Markdown syntax, emoji and line breaks.
        lang = "zh" if contains_chinese(text) else "en"
        text = clean_text(text)
        if lang == "zh":
            text = text.replace(
                "=", "等于"
            )  # fix: "550 + 320 等于 870 千卡。" was mis-normalized as "五百五十加三百二十等于八七十千卡."
            if re.search(r"([\d$%^*_+≥≤≠×÷?=])", text):  # avoid English hyphens being normalized as minus signs
                text = re.sub(r"(?<=[a-zA-Z0-9])-(?=\d)", " - ", text)  # fix: "x-2" was normalized as "x负2"
            text = self.zh_tn_model.normalize(text)
            text = replace_blank(text)
            text = replace_corner_mark(text)
            text = remove_bracket(text)
        else:
            text = self.en_tn_model.normalize(text)
            text = spell_out_number(text, self.inflect_parser)
        if split is False:
            return text
voxcpm/zipenhancer.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ZipEnhancer Module - Audio Denoising Enhancer
3
+
4
+ Provides on-demand import ZipEnhancer functionality for audio denoising processing.
5
+ Related dependencies are imported only when denoising functionality is needed.
6
+ """
7
+
8
+ import os
9
+ import tempfile
10
+ from typing import Optional
11
+ import torchaudio
12
+ from modelscope.pipelines import pipeline
13
+ from modelscope.utils.constant import Tasks
14
+
15
+
16
class ZipEnhancer:
    """ZipEnhancer audio denoising enhancer.

    Thin wrapper around the ModelScope acoustic-noise-suppression pipeline,
    with optional loudness normalization of the denoised output.
    """

    def __init__(self, model_path: str = "iic/speech_zipenhancer_ans_multiloss_16k_base"):
        """
        Initialize ZipEnhancer.

        Args:
            model_path: ModelScope model id or local path.
        """
        self.model_path = model_path
        self._pipeline = pipeline(Tasks.acoustic_noise_suppression, model=self.model_path)

    def _normalize_loudness(self, wav_path: str):
        """
        Normalize the file at *wav_path* in place to roughly -20 LUFS.

        Args:
            wav_path: Audio file path (overwritten with the normalized audio).
        """
        audio, sr = torchaudio.load(wav_path)
        loudness = torchaudio.functional.loudness(audio, sr)
        normalized_audio = torchaudio.functional.gain(audio, -20 - loudness)
        torchaudio.save(wav_path, normalized_audio, sr)

    def enhance(self, input_path: str, output_path: Optional[str] = None, normalize_loudness: bool = True) -> str:
        """
        Audio denoising enhancement.

        Args:
            input_path: Input audio file path.
            output_path: Output audio file path (optional, creates a temp file by default).
            normalize_loudness: Whether to perform loudness normalization.
        Returns:
            str: Output audio file path.
        Raises:
            FileNotFoundError: If *input_path* does not exist.
            RuntimeError: If denoising or normalization fails.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input audio file does not exist: {input_path}")
        # Track whether we created the output file ourselves so that failure
        # cleanup never deletes a caller-supplied path (the original code
        # unlinked output_path on failure even when the caller provided it).
        created_temp = output_path is None
        if created_temp:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                output_path = tmp_file.name
        try:
            # Perform the denoising pass.
            self._pipeline(input_path, output_path=output_path)
            # Optional loudness normalization of the denoised file.
            if normalize_loudness:
                self._normalize_loudness(output_path)
            return output_path
        except Exception as e:
            # Clean up only the temporary file we created.
            if created_temp and os.path.exists(output_path):
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
            # Chain the original exception so the root cause is preserved.
            raise RuntimeError(f"Audio denoising processing failed: {e}") from e