Add files using upload-large-folder tool
- Convert.py +71 -0
- __init__.py +1 -0
- config.json +4 -0
- modeling_omnivoice.py +1598 -0
- tokenizer.model +3 -0
Convert.py
ADDED
@@ -0,0 +1,71 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "accelerate>=1.13.0",
#     "flash-linear-attention>=0.4.2",
#     "hf-xet>=1.4.3",
#     "huggingface-hub>=1.8.0",
#     "onnx>=1.21.0",
#     "onnx-ir>=0.2.0",
#     "onnxruntime>=1.24.4",
#     "onnxruntime-genai>=0.13.1",
#     "optimum>=2.1.0",
#     "sentencepiece>=0.2.1",
#     "tiktoken>=0.12.0",
#     "torch>=2.11.0",
#     "transformers==5.7.0",
# ]
# ///

import argparse
from pathlib import Path
from huggingface_hub import snapshot_download
# from onnxruntime_genai.python.models.builder import create_model
from onnxruntime_genai.models.builder import create_model


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", required=False, default=None)
    parser.add_argument("--token", required=False)
    args = parser.parse_args()

    token = args.token if args.token else None

    pwd = Path.cwd()
    model_dir = pwd / "model"
    onnx_dir = pwd / "onnx"
    cache_dir = pwd / "cache"
    model_dir.mkdir(exist_ok=True)
    onnx_dir.mkdir(exist_ok=True)
    cache_dir.mkdir(exist_ok=True)

    # ===== STEP 1: DOWNLOAD (HF HUB + XET backend automatically used) =====
    print(">> Downloading model via huggingface_hub (Xet enabled if installed)...")

    # local_path = snapshot_download(
    #     repo_id=args.name,
    #     local_dir=str(model_dir),
    #     token=token
    #     # local_dir_use_symlinks=False  # important for ONNX tools
    # )

    # print(f"Model downloaded to: {local_path}")

    # ===== STEP 2: CONVERT USING ONNX GENAI BUILDER =====
    print(">> Converting to ONNX (GenAI format)...")

    create_model(
        model_name=args.name,
        input_path=str(model_dir),    # HF model directory
        output_dir=str(onnx_dir),     # ONNX output
        precision="fp16",             # fp32 | fp16 | int8 | int4 (if supported)
        execution_provider="cpu",     # cpu | cuda | dml
        cache_dir=str(cache_dir),     # optional cache
        extra_options={}
    )

    print("\n✅ Done")
    print(f"ONNX model at: {onnx_dir}")


if __name__ == "__main__":
    main()
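The `# /// script` header is PEP 723 inline metadata, so the script is presumably meant to be run with a PEP 723-aware runner, e.g. `uv run Convert.py --name <repo-id> --token <hf-token>`. Note that STEP 1 is commented out, so `model/` stays empty unless populated beforehand. A minimal sketch of the same flow with the download re-enabled (it simply mirrors the commented-out STEP 1 and the `create_model` call above; the helper name `convert` is ours):

from pathlib import Path
from huggingface_hub import snapshot_download
from onnxruntime_genai.models.builder import create_model

def convert(repo_id: str, token: str | None = None) -> None:
    model_dir, onnx_dir, cache_dir = Path("model"), Path("onnx"), Path("cache")
    for d in (model_dir, onnx_dir, cache_dir):
        d.mkdir(exist_ok=True)
    # Download the checkpoint into ./model so create_model reads local files.
    local_path = snapshot_download(repo_id=repo_id, local_dir=str(model_dir), token=token)
    print(f"Model downloaded to: {local_path}")
    create_model(
        model_name=repo_id,
        input_path=str(model_dir),
        output_dir=str(onnx_dir),
        precision="fp16",
        execution_provider="cpu",
        cache_dir=str(cache_dir),
        extra_options={},
    )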
__init__.py
ADDED
@@ -0,0 +1 @@
from .modeling_omnivoice import OmniVoice, OmniVoiceConfig
config.json
CHANGED
@@ -2,6 +2,10 @@
   "architectures": [
     "OmniVoice"
   ],
+  "auto_map": {
+    "AutoConfig": "modeling_omnivoice.OmniVoiceConfig",
+    "AutoModel": "modeling_omnivoice.OmniVoice"
+  },
   "audio_codebook_weights": [
     8,
     8,
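The new `auto_map` block is what lets the transformers Auto classes resolve this repository's custom code. A minimal loading sketch (the repo id is a placeholder; `trust_remote_code=True` is required because the model class ships inside the repo itself):

from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("<repo-id>", trust_remote_code=True)  # -> OmniVoiceConfig
model = AutoModel.from_pretrained("<repo-id>", trust_remote_code=True)    # -> OmniVoice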
modeling_omnivoice.py
ADDED
@@ -0,0 +1,1598 @@
#!/usr/bin/env python3
# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
#
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Core OmniVoice model implementation.

Defines the ``OmniVoice`` model class, generation config, and inference pipeline.
This is the main entry point for both inference and training:

- **Inference**: ``OmniVoice.from_pretrained()`` loads the model, then
  ``model.generate()`` supports voice cloning, voice design, and auto voice.
- **Training**: ``model.forward()`` computes the training loss; the model is
  built and used by ``omnivoice.training.builder`` and ``omnivoice.training.trainer``.
"""

import difflib
import logging
import math
import os
import re
from dataclasses import dataclass, fields
from functools import partial
from typing import Any, List, Optional, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

try:
    from torch.nn.attention.flex_attention import create_block_mask

    _flex_attention_available = True
except ImportError:
    _flex_attention_available = False
from transformers import (
    AutoFeatureExtractor,
    AutoModel,
    AutoTokenizer,
    HiggsAudioV2TokenizerModel,
    PretrainedConfig,
    PreTrainedModel,
)
from transformers.modeling_outputs import ModelOutput
from transformers.models.auto import CONFIG_MAPPING, AutoConfig

from omnivoice.utils.audio import (
    cross_fade_chunks,
    fade_and_pad_audio,
    load_audio,
    remove_silence,
    trim_long_audio,
)
from omnivoice.utils.duration import RuleDurationEstimator
from omnivoice.utils.lang_map import LANG_IDS, LANG_NAMES
from omnivoice.utils.text import add_punctuation, chunk_text_punctuation
from omnivoice.utils.voice_design import (
    _INSTRUCT_ALL_VALID,
    _INSTRUCT_EN_TO_ZH,
    _INSTRUCT_MUTUALLY_EXCLUSIVE,
    _INSTRUCT_VALID_EN,
    _INSTRUCT_VALID_ZH,
    _INSTRUCT_ZH_TO_EN,
    _ZH_RE,
)

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------


@dataclass
class VoiceClonePrompt:
    ref_audio_tokens: torch.Tensor  # (C, T)
    ref_text: str
    ref_rms: float


@dataclass
class OmniVoiceGenerationConfig:
    num_step: int = 32
    guidance_scale: float = 2.0
    t_shift: float = 0.1
    layer_penalty_factor: float = 5.0
    position_temperature: float = 5.0
    class_temperature: float = 0.0
    denoise: bool = True
    preprocess_prompt: bool = True
    postprocess_output: bool = True
    audio_chunk_duration: float = 15.0
    audio_chunk_threshold: float = 30.0

    @classmethod
    def from_dict(cls, kwargs_dict):
        valid_keys = {f.name for f in fields(cls)}
        filtered = {k: v for k, v in kwargs_dict.items() if k in valid_keys}
        return cls(**filtered)


@dataclass
class GenerationTask:
    batch_size: int
    texts: List[str]
    target_lens: List[int]
    langs: List[Optional[str]]
    instructs: List[Optional[str]]
    ref_texts: List[Optional[str]]
    ref_audio_tokens: List[Optional[torch.Tensor]]
    ref_rms: List[Optional[float]]
    speed: Optional[List[float]] = None

    def get_indices(self, config: OmniVoiceGenerationConfig, frame_rate: int):
        threshold = int(config.audio_chunk_threshold * frame_rate)
        short_idx = [i for i, l in enumerate(self.target_lens) if l <= threshold]
        long_idx = [i for i, l in enumerate(self.target_lens) if l > threshold]
        return short_idx, long_idx

    def slice_task(self, indices: List[int]):
        if not indices:
            return None
        return GenerationTask(
            batch_size=len(indices),
            texts=[self.texts[i] for i in indices],
            target_lens=[self.target_lens[i] for i in indices],
            langs=[self.langs[i] for i in indices],
            instructs=[self.instructs[i] for i in indices],
            ref_texts=[self.ref_texts[i] for i in indices],
            ref_audio_tokens=[self.ref_audio_tokens[i] for i in indices],
            ref_rms=[self.ref_rms[i] for i in indices],
            speed=[self.speed[i] for i in indices] if self.speed else None,
        )


@dataclass
class OmniVoiceModelOutput(ModelOutput):
    loss: Optional[torch.Tensor] = None
    logits: Optional[torch.Tensor] = None


# ---------------------------------------------------------------------------
# Config & Model
# ---------------------------------------------------------------------------

class OmniVoiceConfig(PretrainedConfig):
    model_type = "omnivoice"
    sub_configs = {"llm_config": AutoConfig}

    def __init__(
        self,
        audio_vocab_size: int = 1025,
        audio_mask_id: int = 1024,
        num_audio_codebook: int = 8,
        audio_codebook_weights: Optional[list[float]] = None,
        llm_config: Optional[Union[dict, PretrainedConfig]] = None,
        **kwargs,
    ):

        if isinstance(llm_config, dict):
            llm_config = CONFIG_MAPPING[llm_config["model_type"]](**llm_config)

        self.llm_config = llm_config

        super().__init__(**kwargs)
        self.audio_vocab_size = audio_vocab_size
        self.audio_mask_id = audio_mask_id
        self.num_audio_codebook = num_audio_codebook
        if audio_codebook_weights is None:
            audio_codebook_weights = [8, 8, 6, 6, 4, 4, 2, 2]
        self.audio_codebook_weights = audio_codebook_weights


def _resolve_model_path(name_or_path: str) -> str:
    if os.path.isdir(name_or_path):
        return name_or_path
    from huggingface_hub import snapshot_download

    return snapshot_download(name_or_path)


class OmniVoice(PreTrainedModel):
    _supports_flex_attn = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    config_class = OmniVoiceConfig

    def __init__(self, config: OmniVoiceConfig, llm: Optional[PreTrainedModel] = None):
        super().__init__(config)

        if llm is not None:
            # If an LLM instance is provided, use it directly
            # (skipping config-based init).
            self.llm = llm
        else:
            # Otherwise, initialize the LLM from the config.
            self.llm = AutoModel.from_config(self.config.llm_config)

        self.audio_embeddings = nn.Embedding(
            config.num_audio_codebook * config.audio_vocab_size,
            self.config.llm_config.hidden_size,
        )
        self.register_buffer(
            "codebook_layer_offsets",
            torch.arange(config.num_audio_codebook) * config.audio_vocab_size,
        )

        self.audio_heads = nn.Linear(
            self.config.llm_config.hidden_size,
            config.num_audio_codebook * config.audio_vocab_size,
            bias=False,
        )

        self.normalized_audio_codebook_weights = [
            w / sum(config.audio_codebook_weights)
            for w in config.audio_codebook_weights
        ]

        self.post_init()

        # Inference-only attributes (set by from_pretrained when not in train mode)
        self.text_tokenizer = None
        self.audio_tokenizer = None
        self.duration_estimator = None
        self.sampling_rate = None
        self._asr_pipe = None

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        train_mode = kwargs.pop("train", False)
        load_asr = kwargs.pop("load_asr", False)
        asr_model_name = kwargs.pop("asr_model_name", "openai/whisper-large-v3-turbo")

        # Suppress noisy INFO logs from transformers/huggingface_hub during loading
        _prev_disable = logging.root.manager.disable
        logging.disable(logging.INFO)

        try:
            # Resolve to local path first; download only if not already cached
            resolved_path = _resolve_model_path(pretrained_model_name_or_path)

            model = super().from_pretrained(resolved_path, *args, **kwargs)

            if not train_mode:
                model.text_tokenizer = AutoTokenizer.from_pretrained(resolved_path)

                audio_tokenizer_path = os.path.join(resolved_path, "audio_tokenizer")

                if not os.path.isdir(audio_tokenizer_path):
                    audio_tokenizer_path = _resolve_model_path(
                        "eustlb/higgs-audio-v2-tokenizer"
                    )

                # higgs-audio-v2-tokenizer does not support MPS
                # (output channels > 65536)
                tokenizer_device = (
                    "cpu" if str(model.device).startswith("mps") else model.device
                )
                model.audio_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(
                    audio_tokenizer_path, device_map=tokenizer_device
                )
                model.feature_extractor = AutoFeatureExtractor.from_pretrained(
                    audio_tokenizer_path
                )

                model.sampling_rate = model.feature_extractor.sampling_rate

                model.duration_estimator = RuleDurationEstimator()

                if load_asr:
                    model.load_asr_model(model_name=asr_model_name)
        finally:
            logging.disable(_prev_disable)

        return model

    # -------------------------------------------------------------------
    # ASR support (optional, for auto-transcription)
    # -------------------------------------------------------------------

    def load_asr_model(self, model_name: str = "openai/whisper-large-v3-turbo"):
        """Load a Whisper ASR model for reference audio transcription.

        Args:
            model_name: HuggingFace model name or local path for the Whisper model.
        """
        from transformers import pipeline as hf_pipeline

        logger.info("Loading ASR model %s ...", model_name)
        asr_dtype = (
            torch.float16 if str(self.device).startswith("cuda") else torch.float32
        )

        model_name = _resolve_model_path(model_name)

        self._asr_pipe = hf_pipeline(
            "automatic-speech-recognition",
            model=model_name,
            dtype=asr_dtype,
            device_map=self.device,
        )
        logger.info("ASR model loaded on %s.", self.device)

    @torch.inference_mode()
    def transcribe(
        self,
        audio: Union[str, tuple],
    ) -> str:
        """Transcribe audio using the loaded Whisper ASR model.

        Args:
            audio: File path or ``(waveform, sample_rate)`` tuple.
                Waveform can be a numpy array or torch.Tensor of shape
                ``(1, T)`` or ``(T,)``.

        Returns:
            Transcribed text.
        """
        if self._asr_pipe is None:
            raise RuntimeError(
                "ASR model is not loaded. Call model.load_asr_model() first."
            )

        if isinstance(audio, str):
            return self._asr_pipe(audio)["text"].strip()
        else:
            waveform, sr = audio
            if isinstance(waveform, torch.Tensor):
                waveform = waveform.cpu().numpy()
            waveform = np.squeeze(waveform)  # (1, T) or (T,) → (T,)
            audio_input = {
                "array": waveform,
                "sampling_rate": sr,
            }
            return self._asr_pipe(audio_input)["text"].strip()

    def get_input_embeddings(self):
        return self.llm.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.llm.set_input_embeddings(value)

    def _prepare_embed_inputs(
        self, input_ids: torch.Tensor, audio_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Prepares embeddings from input_ids of shape (batch_size, layers, seq_length).
        Embedding shape is (batch_size, seq_length, hidden_size).
        """
        text_embeds = self.get_input_embeddings()(input_ids[:, 0, :])

        # Apply shift to audio IDs based on codebook layer
        # audio_ids: [Batch, 8, Seq]
        # codebook_layer_offsets: [1, 8, 1]
        # Result: layer 0 ID + 0, layer 1 ID + 1025, layer 2 ID + 2050, ...
        shifted_ids = (
            input_ids * audio_mask.unsqueeze(1)
        ) + self.codebook_layer_offsets.view(1, -1, 1)

        # input: [Batch, 8, Seq] -> output: [Batch, Seq, Hidden]
        audio_embeds = self.audio_embeddings(shifted_ids).sum(dim=1)

        return torch.where(audio_mask.unsqueeze(-1), audio_embeds, text_embeds)

    def forward(
        self,
        input_ids: torch.LongTensor,
        audio_mask: torch.Tensor,
        labels: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        document_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ):

        inputs_embeds = self._prepare_embed_inputs(input_ids, audio_mask)

        if attention_mask is None and document_ids is not None:
            if not _flex_attention_available:
                raise RuntimeError(
                    "flex_attention is not available in the current environment. "
                    "If you do not need flex_attention, set "
                    '"attn_implementation": "sdpa" in your training config.'
                )
            attention_mask = create_block_mask(
                _get_packed_mask(
                    document_ids[0].to(inputs_embeds.device),
                ),
                B=None,
                H=None,
                Q_LEN=input_ids.size(-1),
                KV_LEN=input_ids.size(-1),
                _compile=True,
                device=inputs_embeds.device,
            )

        llm_outputs = self.llm(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            return_dict=True,
            position_ids=position_ids,
        )
        hidden_states = llm_outputs[0]

        loss = None

        # Shape: [B, S, C * Vocab]
        batch_size, seq_len, _ = hidden_states.shape
        logits_flat = self.audio_heads(hidden_states)
        # Shape: [B, S, C, Vocab] -> [B, C, S, Vocab]
        audio_logits = logits_flat.view(
            batch_size,
            seq_len,
            self.config.num_audio_codebook,
            self.config.audio_vocab_size,
        ).permute(0, 2, 1, 3)

        if labels is not None:

            # audio_logits.permute(0, 3, 1, 2):
            # [Batch, Layer, Seq, Vocab] -> [Batch, Vocab, Layer, Seq]
            # per_token_loss shape: [Batch, Layer, Seq]; -100 labels are ignored
            per_token_loss = torch.nn.functional.cross_entropy(
                audio_logits.permute(0, 3, 1, 2),
                labels,
                reduction="none",
                ignore_index=-100,
            )
            # valid_mask shape: [Batch, Layer, Seq]
            valid_mask = (labels != -100).float()

            # layer_means shape: [num_layers]
            layer_means = (per_token_loss * valid_mask).sum(
                dim=(0, 2)
            ) / valid_mask.sum(dim=(0, 2)).clamp(min=1.0)

            weights = torch.tensor(
                self.normalized_audio_codebook_weights, device=audio_logits.device
            )
            loss = (layer_means * weights).sum()

        return OmniVoiceModelOutput(
            loss=loss,
            logits=audio_logits,
        )

    def supported_language_ids(self) -> set[str]:
        """Return the set of supported language IDs."""
        return LANG_IDS

    def supported_language_names(self) -> set[str]:
        """Return the set of supported language names."""
        return LANG_NAMES

    # -------------------------------------------------------------------
    # Inference API
    # -------------------------------------------------------------------

    @torch.inference_mode()
    def generate(
        self,
        text: Union[str, list[str]],
        language: Union[str, list[str], None] = None,
        ref_text: Union[str, list[str], None] = None,
        ref_audio: Union[
            str,
            list[str],
            tuple[torch.Tensor, int],
            list[tuple[torch.Tensor, int]],
            None,
        ] = None,
        voice_clone_prompt: Union[
            VoiceClonePrompt, list[VoiceClonePrompt], None
        ] = None,
        instruct: Union[str, list[str], None] = None,
        duration: Union[float, list[Optional[float]], None] = None,
        speed: Union[float, list[Optional[float]], None] = None,
        generation_config: Optional[OmniVoiceGenerationConfig] = None,
        **kwargs,
    ) -> list[np.ndarray]:
        """Generate speech audio given text in various modes.

        Supports three modes:

        1. **Voice clone** — clone the voice style from the reference audio.
           Should provide ``voice_clone_prompt`` (from
           :meth:`create_voice_clone_prompt`) or ``ref_text`` + ``ref_audio``.
        2. **Voice design** — provide ``instruct`` text describing
           the desired voice style; no reference audio needed.
        3. **Auto** — provide neither; the model picks a voice itself.

        Args:
            text: Target text (single string or list for batch).
            language: Language name (e.g. ``"English"``) or code
                (e.g. ``"en"``). ``None`` for language-agnostic mode.
                Performance is slightly better if you specify the language.
            ref_text: Optional reference text for voice cloning mode.
            ref_audio: Optional reference audio for voice cloning mode.
                Can be a file path or a (waveform, sample_rate) tuple.
            voice_clone_prompt: Reusable prompt from :meth:`create_voice_clone_prompt`.
                If provided, it overrides ``ref_text`` and ``ref_audio``.
            instruct: Style instruction for voice design mode.
            duration: Fixed output duration in seconds. If a single float,
                applies to all items; if a list, one value per item.
                ``None`` (default) lets the model estimate duration from text.
                Overrides ``speed`` when both are provided.
            speed: Speaking speed factor. ``> 1.0`` for faster, ``< 1.0`` for
                slower. If a list, one value per item. ``None`` (default) uses
                the model's default estimation.
            generation_config: Explicit config object. If provided, takes
                precedence over ``**kwargs``.
            **kwargs: Generation config or its fields:
                denoise: Whether to prepend the ``<|denoise|>`` token.
                num_step: Number of iterative decoding steps.
                guidance_scale: Classifier-free guidance scale.
                t_shift: Time-step shift (smaller → emphasise low-SNR).
                postprocess_output: Post-process output (remove silence,
                    fade in/out, pad edges).
                layer_penalty_factor: Penalty encouraging earlier codebook
                    layers to unmask first.
                position_temperature: Temperature for position selection.
                class_temperature: Temperature for token sampling (0 = greedy).
                audio_chunk_duration: If > 0, split long text into chunks of
                    this duration (seconds) and generate chunk by chunk.
                audio_chunk_threshold: Only apply chunking if estimated audio
                    duration exceeds this threshold (seconds).

        Returns:
            ``audios``: a list of 1-D ``np.ndarray`` with shape ``(T,)`` and
            sampling rate consistent with the model's audio tokenizer
            (usually 24 000 Hz). Can be saved directly with
            ``soundfile.write("out.wav", audios[0], model.sampling_rate)``.
        """

        if self.audio_tokenizer is None or self.text_tokenizer is None:
            raise RuntimeError(
                "Model is not loaded with audio/text tokenizers. Make sure you "
                "loaded the model with OmniVoice.from_pretrained()."
            )
        gen_config = (
            generation_config
            if generation_config is not None
            else OmniVoiceGenerationConfig.from_dict(kwargs)
        )

        self.eval()

        full_task = self._preprocess_all(
            text=text,
            language=language,
            ref_text=ref_text,
            ref_audio=ref_audio,
            voice_clone_prompt=voice_clone_prompt,
            instruct=instruct,
            preprocess_prompt=gen_config.preprocess_prompt,
            speed=speed,
            duration=duration,
        )

        short_idx, long_idx = full_task.get_indices(
            gen_config, self.audio_tokenizer.config.frame_rate
        )

        results = [None] * full_task.batch_size

        if short_idx:
            short_task = full_task.slice_task(short_idx)
            short_results = self._generate_iterative(short_task, gen_config)
            for idx, res in zip(short_idx, short_results):
                results[idx] = res

        if long_idx:
            long_task = full_task.slice_task(long_idx)
            long_results = self._generate_chunked(long_task, gen_config)
            for idx, res in zip(long_idx, long_results):
                results[idx] = res

        generated_audios = []
        for i in range(full_task.batch_size):
            assert results[i] is not None, f"Result {i} was not generated"
            generated_audios.append(
                self._decode_and_post_process(
                    results[i], full_task.ref_rms[i], gen_config  # type: ignore[arg-type]
                )
            )

        return generated_audios

| 603 |
+
def create_voice_clone_prompt(
|
| 604 |
+
self,
|
| 605 |
+
ref_audio: Union[str, tuple[torch.Tensor, int]],
|
| 606 |
+
ref_text: Optional[str] = None,
|
| 607 |
+
preprocess_prompt: bool = True,
|
| 608 |
+
) -> VoiceClonePrompt:
|
| 609 |
+
"""Create a reusable voice clone prompt from reference audio.
|
| 610 |
+
|
| 611 |
+
Args:
|
| 612 |
+
ref_audio: File path (str) or ``(waveform, sample_rate)`` tuple.
|
| 613 |
+
waveform should be a 1-D or 2-D torch.Tensor (channels x samples).
|
| 614 |
+
ref_text: Transcript of the reference audio. If ``None``, the
|
| 615 |
+
ASR model will be used to auto-transcribe (must call
|
| 616 |
+
:meth:`load_asr_model` first).
|
| 617 |
+
preprocess_prompt: If ``True`` (default), apply silence removal and
|
| 618 |
+
trimming to the reference audio, add punctuation in the end
|
| 619 |
+
of reference text (if not already)
|
| 620 |
+
|
| 621 |
+
Returns:
|
| 622 |
+
A :class:`VoiceClonePrompt` that can be passed to :meth:`generate`.
|
| 623 |
+
"""
|
| 624 |
+
if self.audio_tokenizer is None:
|
| 625 |
+
raise RuntimeError(
|
| 626 |
+
"Audio tokenizer is not loaded. Make sure you loaded the model "
|
| 627 |
+
"with OmniVoice.from_pretrained()."
|
| 628 |
+
)
|
| 629 |
+
|
| 630 |
+
if isinstance(ref_audio, str):
|
| 631 |
+
ref_wav = load_audio(ref_audio, self.sampling_rate)
|
| 632 |
+
else:
|
| 633 |
+
waveform, sr = ref_audio
|
| 634 |
+
if isinstance(waveform, torch.Tensor):
|
| 635 |
+
waveform = waveform.cpu().numpy()
|
| 636 |
+
if waveform.ndim == 1:
|
| 637 |
+
waveform = waveform[np.newaxis, :]
|
| 638 |
+
if waveform.shape[0] > 1:
|
| 639 |
+
waveform = np.mean(waveform, axis=0, keepdims=True)
|
| 640 |
+
if sr != self.sampling_rate:
|
| 641 |
+
waveform = torchaudio.functional.resample(
|
| 642 |
+
torch.from_numpy(waveform),
|
| 643 |
+
orig_freq=sr,
|
| 644 |
+
new_freq=self.sampling_rate,
|
| 645 |
+
).numpy()
|
| 646 |
+
ref_wav = waveform
|
| 647 |
+
|
| 648 |
+
ref_rms = float(np.sqrt(np.mean(ref_wav**2)))
|
| 649 |
+
if 0 < ref_rms < 0.1:
|
| 650 |
+
ref_wav = ref_wav * 0.1 / ref_rms
|
| 651 |
+
|
| 652 |
+
if preprocess_prompt:
|
| 653 |
+
# Trim long reference audio (>20s) by splitting at the largest silence gap.
|
| 654 |
+
# Skip trimming when ref_text is user-provided, otherwise the
|
| 655 |
+
# trimmed audio will no longer match the full transcript.
|
| 656 |
+
if ref_text is None:
|
| 657 |
+
ref_wav = trim_long_audio(
|
| 658 |
+
ref_wav, self.sampling_rate, trim_threshold=20.0
|
| 659 |
+
)
|
| 660 |
+
ref_wav = remove_silence(
|
| 661 |
+
ref_wav,
|
| 662 |
+
self.sampling_rate,
|
| 663 |
+
mid_sil=200,
|
| 664 |
+
lead_sil=100,
|
| 665 |
+
trail_sil=200,
|
| 666 |
+
)
|
| 667 |
+
if ref_wav.shape[-1] == 0:
|
| 668 |
+
raise ValueError(
|
| 669 |
+
"Reference audio is empty after silence removal. "
|
| 670 |
+
"Try setting preprocess_prompt=False."
|
| 671 |
+
)
|
| 672 |
+
|
| 673 |
+
ref_duration = ref_wav.shape[-1] / self.sampling_rate
|
| 674 |
+
if ref_duration > 20.0:
|
| 675 |
+
logger.warning(
|
| 676 |
+
"Reference audio is %.1fs long (>20s). This may cause slower "
|
| 677 |
+
"generation, higher memory usage, and degraded voice cloning "
|
| 678 |
+
"quality. We recommend trimming it to 3-10s.",
|
| 679 |
+
ref_duration,
|
| 680 |
+
)
|
| 681 |
+
|
| 682 |
+
# Auto-transcribe if ref_text not provided
|
| 683 |
+
if ref_text is None:
|
| 684 |
+
if self._asr_pipe is None:
|
| 685 |
+
logger.info("ASR model not loaded yet, loading on-the-fly ...")
|
| 686 |
+
self.load_asr_model()
|
| 687 |
+
ref_text = self.transcribe((ref_wav, self.sampling_rate))
|
| 688 |
+
logger.debug("Auto-transcribed ref_text: %s", ref_text)
|
| 689 |
+
|
| 690 |
+
chunk_size = self.audio_tokenizer.config.hop_length
|
| 691 |
+
clip_size = int(ref_wav.shape[-1] % chunk_size)
|
| 692 |
+
ref_wav = ref_wav[:, :-clip_size] if clip_size > 0 else ref_wav
|
| 693 |
+
# numpy → torch at tokenizer boundary
|
| 694 |
+
ref_wav_tensor = torch.from_numpy(ref_wav).to(self.audio_tokenizer.device)
|
| 695 |
+
ref_audio_tokens = self.audio_tokenizer.encode(
|
| 696 |
+
ref_wav_tensor.unsqueeze(0),
|
| 697 |
+
).audio_codes.squeeze(
|
| 698 |
+
0
|
| 699 |
+
) # (C, T)
|
| 700 |
+
|
| 701 |
+
if preprocess_prompt:
|
| 702 |
+
ref_text = add_punctuation(ref_text)
|
| 703 |
+
|
| 704 |
+
return VoiceClonePrompt(
|
| 705 |
+
ref_audio_tokens=ref_audio_tokens,
|
| 706 |
+
ref_text=ref_text,
|
| 707 |
+
ref_rms=ref_rms,
|
| 708 |
+
)
|
| 709 |
+
|
| 710 |
+
def _decode_and_post_process(
|
| 711 |
+
self,
|
| 712 |
+
tokens: Union[torch.Tensor, List[torch.Tensor]],
|
| 713 |
+
rms: Union[float, None],
|
| 714 |
+
gen_config: OmniVoiceGenerationConfig,
|
| 715 |
+
) -> np.ndarray:
|
| 716 |
+
"""
|
| 717 |
+
Args:
|
| 718 |
+
tokens: Audio tokens — either a single tensor of shape
|
| 719 |
+
(num_codebooks, seq_len) or a list of chunk tensors.
|
| 720 |
+
rms: RMS of the reference audio for volume adjustment.
|
| 721 |
+
gen_config: Generation config for post-processing options.
|
| 722 |
+
Returns:
|
| 723 |
+
Decoded and post-processed audio array of shape (T,).
|
| 724 |
+
"""
|
| 725 |
+
tokenizer_device = self.audio_tokenizer.device
|
| 726 |
+
if isinstance(tokens, list):
|
| 727 |
+
chunk_audios = [
|
| 728 |
+
self.audio_tokenizer.decode(t.to(tokenizer_device).unsqueeze(0))
|
| 729 |
+
.audio_values[0]
|
| 730 |
+
.cpu()
|
| 731 |
+
.numpy()
|
| 732 |
+
for t in tokens
|
| 733 |
+
]
|
| 734 |
+
audio_waveform = cross_fade_chunks(chunk_audios, self.sampling_rate)
|
| 735 |
+
else:
|
| 736 |
+
audio_waveform = (
|
| 737 |
+
self.audio_tokenizer.decode(tokens.to(tokenizer_device).unsqueeze(0))
|
| 738 |
+
.audio_values[0]
|
| 739 |
+
.cpu()
|
| 740 |
+
.numpy()
|
| 741 |
+
)
|
| 742 |
+
|
| 743 |
+
audio_waveform = self._post_process_audio(
|
| 744 |
+
audio_waveform,
|
| 745 |
+
postprocess_output=gen_config.postprocess_output,
|
| 746 |
+
ref_rms=rms,
|
| 747 |
+
)
|
| 748 |
+
return audio_waveform.squeeze(0)
|
| 749 |
+
|
| 750 |
+
def _post_process_audio(
|
| 751 |
+
self,
|
| 752 |
+
generated_audio: np.ndarray,
|
| 753 |
+
postprocess_output: bool,
|
| 754 |
+
ref_rms: Union[float, None],
|
| 755 |
+
) -> np.ndarray:
|
| 756 |
+
"""Optionally remove long silences, adjust volume, and add edge padding.
|
| 757 |
+
|
| 758 |
+
Args:
|
| 759 |
+
generated_audio: Numpy array of shape (1, T).
|
| 760 |
+
postprocess_output: If True, remove long silences and apply fade/pad.
|
| 761 |
+
ref_rms: RMS of the reference audio for volume normalisation.
|
| 762 |
+
Returns:
|
| 763 |
+
Processed numpy array of shape (1, T).
|
| 764 |
+
"""
|
| 765 |
+
if postprocess_output:
|
| 766 |
+
generated_audio = remove_silence(
|
| 767 |
+
generated_audio,
|
| 768 |
+
self.sampling_rate,
|
| 769 |
+
mid_sil=500,
|
| 770 |
+
lead_sil=100,
|
| 771 |
+
trail_sil=100,
|
| 772 |
+
)
|
| 773 |
+
|
| 774 |
+
if ref_rms is not None and ref_rms < 0.1:
|
| 775 |
+
generated_audio = generated_audio * ref_rms / 0.1
|
| 776 |
+
elif ref_rms is None:
|
| 777 |
+
peak = np.abs(generated_audio).max()
|
| 778 |
+
if peak > 1e-6:
|
| 779 |
+
generated_audio = generated_audio / peak * 0.5
|
| 780 |
+
|
| 781 |
+
generated_audio = fade_and_pad_audio(
|
| 782 |
+
generated_audio,
|
| 783 |
+
sample_rate=self.sampling_rate,
|
| 784 |
+
)
|
| 785 |
+
return generated_audio
|
| 786 |
+
|
| 787 |
+
def _generate_chunked(
|
| 788 |
+
self, task: GenerationTask, gen_config: OmniVoiceGenerationConfig
|
| 789 |
+
) -> List[List[torch.Tensor]]:
|
| 790 |
+
"""Generate long audio by splitting text into chunks and batching.
|
| 791 |
+
|
| 792 |
+
Each item in the returned list corresponds to one input and contains
|
| 793 |
+
a list of audio token tensors — one per text chunk.
|
| 794 |
+
|
| 795 |
+
Args:
|
| 796 |
+
task: A :class:`GenerationTask` with one or more items whose
|
| 797 |
+
estimated audio exceeds ``audio_chunk_threshold``.
|
| 798 |
+
gen_config: Generation config (``audio_chunk_duration`` controls
|
| 799 |
+
chunk size).
|
| 800 |
+
Returns:
|
| 801 |
+
Per-item list of chunk token-tensor lists.
|
| 802 |
+
"""
|
| 803 |
+
# Chunk each item's text
|
| 804 |
+
all_chunks = []
|
| 805 |
+
for i in range(task.batch_size):
|
| 806 |
+
avg_tokens_per_char = task.target_lens[i] / len(task.texts[i])
|
| 807 |
+
text_chunk_len = int(
|
| 808 |
+
gen_config.audio_chunk_duration
|
| 809 |
+
* self.audio_tokenizer.config.frame_rate
|
| 810 |
+
/ avg_tokens_per_char
|
| 811 |
+
)
|
| 812 |
+
chunks = chunk_text_punctuation(
|
| 813 |
+
text=task.texts[i],
|
| 814 |
+
chunk_len=text_chunk_len,
|
| 815 |
+
min_chunk_len=3,
|
| 816 |
+
)
|
| 817 |
+
logger.debug(f"Item {i} chunked into {len(chunks)} pieces: {chunks}")
|
| 818 |
+
all_chunks.append(chunks)
|
| 819 |
+
|
| 820 |
+
has_ref = [t is not None for t in task.ref_audio_tokens]
|
| 821 |
+
assert all(has_ref) or not any(has_ref), (
|
| 822 |
+
"Chunked inference requires all items to either have or not have "
|
| 823 |
+
"ref_audio. Mixed ref/non-ref is not supported."
|
| 824 |
+
)
|
| 825 |
+
|
| 826 |
+
max_num_chunks = max(len(c) for c in all_chunks)
|
| 827 |
+
|
| 828 |
+
# chunk_results[item_idx] = list of generated token tensors per chunk
|
| 829 |
+
chunk_results = [[] for _ in range(task.batch_size)]
|
| 830 |
+
|
| 831 |
+
def _run_batch(indices, texts, ref_audios, ref_texts):
|
| 832 |
+
speed_list = task.speed
|
| 833 |
+
target_lens = [
|
| 834 |
+
self._estimate_target_tokens(
|
| 835 |
+
texts[j],
|
| 836 |
+
ref_texts[j],
|
| 837 |
+
ref_audios[j].size(-1) if ref_audios[j] is not None else None,
|
| 838 |
+
speed=speed_list[i] if speed_list else 1.0,
|
| 839 |
+
)
|
| 840 |
+
for j, i in enumerate(indices)
|
| 841 |
+
]
|
| 842 |
+
sub_task = GenerationTask(
|
| 843 |
+
batch_size=len(indices),
|
| 844 |
+
texts=texts,
|
| 845 |
+
target_lens=target_lens,
|
| 846 |
+
langs=[task.langs[i] for i in indices],
|
| 847 |
+
instructs=[task.instructs[i] for i in indices],
|
| 848 |
+
ref_texts=ref_texts,
|
| 849 |
+
ref_audio_tokens=ref_audios,
|
| 850 |
+
ref_rms=[task.ref_rms[i] for i in indices],
|
| 851 |
+
speed=[task.speed[i] for i in indices] if task.speed else None,
|
| 852 |
+
)
|
| 853 |
+
gen_tokens = self._generate_iterative(sub_task, gen_config)
|
| 854 |
+
for j, idx in enumerate(indices):
|
| 855 |
+
chunk_results[idx].append(gen_tokens[j])
|
| 856 |
+
|
| 857 |
+
if all(has_ref):
|
| 858 |
+
# All items have reference audio.
|
| 859 |
+
# We still sequentially generate chunks within each item, but we
|
| 860 |
+
# batch across items for the same chunk index. This allows to keep
|
| 861 |
+
# the VRAM usage manageable while still benefiting from batching.
|
| 862 |
+
for ci in range(max_num_chunks):
|
| 863 |
+
indices = [i for i in range(task.batch_size) if ci < len(all_chunks[i])]
|
| 864 |
+
if not indices:
|
| 865 |
+
continue
|
| 866 |
+
_run_batch(
|
| 867 |
+
indices,
|
| 868 |
+
texts=[all_chunks[i][ci] for i in indices],
|
| 869 |
+
ref_audios=[task.ref_audio_tokens[i] for i in indices],
|
| 870 |
+
ref_texts=[task.ref_texts[i] for i in indices],
|
| 871 |
+
)
|
| 872 |
+
else:
|
| 873 |
+
# No reference audio — generate chunk 0 for all items first,
|
| 874 |
+
# then use chunk 0 output as reference for all subsequent chunks.
|
| 875 |
+
indices_0 = [i for i in range(task.batch_size) if len(all_chunks[i]) > 0]
|
| 876 |
+
_run_batch(
|
| 877 |
+
indices_0,
|
| 878 |
+
texts=[all_chunks[i][0] for i in indices_0],
|
| 879 |
+
ref_audios=[None] * len(indices_0),
|
| 880 |
+
ref_texts=[None] * len(indices_0),
|
| 881 |
+
)
|
| 882 |
+
first_chunk_map = {idx: chunk_results[idx][0] for idx in indices_0}
|
| 883 |
+
|
| 884 |
+
# Batch all remaining chunks, using chunk 0 as fixed reference
|
| 885 |
+
for ci in range(1, max_num_chunks):
|
| 886 |
+
indices = [i for i in range(task.batch_size) if ci < len(all_chunks[i])]
|
| 887 |
+
if not indices:
|
| 888 |
+
continue
|
| 889 |
+
_run_batch(
|
| 890 |
+
indices,
|
| 891 |
+
texts=[all_chunks[i][ci] for i in indices],
|
| 892 |
+
ref_audios=[first_chunk_map[i] for i in indices],
|
| 893 |
+
ref_texts=[all_chunks[i][0] for i in indices],
|
| 894 |
+
)
|
| 895 |
+
|
| 896 |
+
return chunk_results
|
| 897 |
+
|
| 898 |
+
def _preprocess_all(
|
| 899 |
+
self,
|
| 900 |
+
text: Union[str, list[str]],
|
| 901 |
+
language: Union[str, list[str], None] = None,
|
| 902 |
+
ref_text: Union[str, list[str], None] = None,
|
| 903 |
+
ref_audio: Union[
|
| 904 |
+
str,
|
| 905 |
+
list[str],
|
| 906 |
+
tuple[torch.Tensor, int],
|
| 907 |
+
list[tuple[torch.Tensor, int]],
|
| 908 |
+
None,
|
| 909 |
+
] = None,
|
| 910 |
+
voice_clone_prompt: Union[
|
| 911 |
+
VoiceClonePrompt, list[VoiceClonePrompt], None
|
| 912 |
+
] = None,
|
| 913 |
+
instruct: Union[str, list[str], None] = None,
|
| 914 |
+
preprocess_prompt: bool = True,
|
| 915 |
+
speed: Union[float, list[Optional[float]], None] = None,
|
| 916 |
+
duration: Union[float, list[Optional[float]], None] = None,
|
| 917 |
+
) -> GenerationTask:
|
| 918 |
+
|
| 919 |
+
if isinstance(text, str):
|
| 920 |
+
text_list = [text]
|
| 921 |
+
else:
|
| 922 |
+
assert isinstance(
|
| 923 |
+
text, list
|
| 924 |
+
), "text should be a string or a list of strings"
|
| 925 |
+
text_list = text
|
| 926 |
+
batch_size = len(text_list)
|
| 927 |
+
|
| 928 |
+
language_list = self._ensure_list(language, batch_size)
|
| 929 |
+
language_list = [_resolve_language(lang) for lang in language_list]
|
| 930 |
+
instruct_list = self._ensure_list(instruct, batch_size)
|
| 931 |
+
for i, s in enumerate(instruct_list):
|
| 932 |
+
if s is None:
|
| 933 |
+
continue
|
| 934 |
+
use_zh = bool(text_list[i] and _ZH_RE.search(text_list[i]))
|
| 935 |
+
instruct_list[i] = _resolve_instruct(s, use_zh=use_zh)
|
| 936 |
+
|
| 937 |
+
if voice_clone_prompt is not None and (
|
| 938 |
+
ref_text is not None or ref_audio is not None
|
| 939 |
+
):
|
| 940 |
+
logger.warning(
|
| 941 |
+
"Both voice_clone_prompt and ref_text/ref_audio are provided. "
|
| 942 |
+
"ref_text/ref_audio will be ignored."
|
| 943 |
+
)
|
| 944 |
+
if voice_clone_prompt is None and ref_audio is not None:
|
| 945 |
+
# If voice_clone_prompt is not provided, create it from
|
| 946 |
+
# ref_audio (ref_text will be auto-transcribed if not given).
|
| 947 |
+
ref_text_list = self._ensure_list(ref_text, batch_size, auto_repeat=False)
|
| 948 |
+
ref_audio_list = self._ensure_list(ref_audio, batch_size, auto_repeat=False)
|
| 949 |
+
|
| 950 |
+
voice_clone_prompt = []
|
| 951 |
+
for i in range(len(ref_text_list)):
|
| 952 |
+
voice_clone_prompt.append(
|
| 953 |
+
self.create_voice_clone_prompt(
|
| 954 |
+
ref_audio=ref_audio_list[i],
|
| 955 |
+
ref_text=ref_text_list[i],
|
| 956 |
+
preprocess_prompt=preprocess_prompt,
|
| 957 |
+
)
|
| 958 |
+
)
|
| 959 |
+
|
| 960 |
+
voice_clone_prompt_list = self._ensure_list(voice_clone_prompt, batch_size)
|
| 961 |
+
if voice_clone_prompt_list[0] is not None:
|
| 962 |
+
ref_text_list = [vc.ref_text for vc in voice_clone_prompt_list]
|
| 963 |
+
ref_audio_tokens_list = [
|
| 964 |
+
vc.ref_audio_tokens for vc in voice_clone_prompt_list
|
| 965 |
+
]
|
| 966 |
+
ref_rms_list = [vc.ref_rms for vc in voice_clone_prompt_list]
|
| 967 |
+
else:
|
| 968 |
+
ref_text_list = [None] * batch_size
|
| 969 |
+
ref_audio_tokens_list = [None] * batch_size
|
| 970 |
+
ref_rms_list = [None] * batch_size
|
| 971 |
+
|
| 972 |
+
# Normalize speed/duration to per-item lists (may contain None).
|
| 973 |
+
if speed is not None:
|
| 974 |
+
if isinstance(speed, (int, float)):
|
| 975 |
+
user_speed = [float(speed)] * batch_size
|
| 976 |
+
else:
|
| 977 |
+
user_speed = list(speed)
|
| 978 |
+
else:
|
| 979 |
+
user_speed = None
|
| 980 |
+
|
| 981 |
+
if duration is not None:
|
| 982 |
+
if isinstance(duration, (int, float)):
|
| 983 |
+
durations = [float(duration)] * batch_size
|
| 984 |
+
else:
|
| 985 |
+
durations = list(duration)
|
| 986 |
+
else:
|
| 987 |
+
durations = None
|
| 988 |
+
|
| 989 |
+
num_target_tokens_list = []
|
| 990 |
+
for i in range(batch_size):
|
| 991 |
+
# duration[i] overrides speed for estimation: use speed=1.0
|
| 992 |
+
# to get the raw estimate, then override target_lens below.
|
| 993 |
+
has_dur = durations is not None and durations[i] is not None
|
| 994 |
+
item_speed = 1.0 if has_dur else (user_speed[i] if user_speed else 1.0)
|
| 995 |
+
est = self._estimate_target_tokens(
|
| 996 |
+
text_list[i],
|
| 997 |
+
ref_text_list[i],
|
| 998 |
+
ref_audio_tokens_list[i].size(-1)
|
| 999 |
+
if ref_audio_tokens_list[i] is not None
|
| 1000 |
+
else None,
|
| 1001 |
+
speed=item_speed,
|
| 1002 |
+
)
|
| 1003 |
+
num_target_tokens_list.append(est)
|
| 1004 |
+
|
| 1005 |
+
# Per-item duration overrides: set target_lens to exact frame count
|
| 1006 |
+
# and compute speed ratio so chunked generation scales proportionally.
|
| 1007 |
+
speed_list: Optional[List[float]] = None
|
| 1008 |
+
if durations is not None:
|
| 1009 |
+
frame_rate = self.audio_tokenizer.config.frame_rate
|
| 1010 |
+
speed_list = []
|
| 1011 |
+
for i in range(batch_size):
|
| 1012 |
+
if durations[i] is not None:
|
| 1013 |
+
target_tokens = max(1, int(durations[i] * frame_rate))
|
| 1014 |
+
est = num_target_tokens_list[i]
|
| 1015 |
+
speed_list.append(est / target_tokens if target_tokens > 0 else 1.0)
|
| 1016 |
+
num_target_tokens_list[i] = target_tokens
|
| 1017 |
+
else:
|
| 1018 |
+
s = user_speed[i] if user_speed else None
|
| 1019 |
+
speed_list.append(s if s is not None else 1.0)
|
| 1020 |
+
elif user_speed is not None:
|
| 1021 |
+
speed_list = [s if s is not None else 1.0 for s in user_speed]
|
| 1022 |
+
|
| 1023 |
+
return GenerationTask(
|
| 1024 |
+
batch_size=batch_size,
|
| 1025 |
+
texts=text_list,
|
| 1026 |
+
target_lens=num_target_tokens_list,
|
| 1027 |
+
langs=language_list,
|
| 1028 |
+
instructs=instruct_list,
|
| 1029 |
+
ref_texts=ref_text_list,
|
| 1030 |
+
ref_audio_tokens=ref_audio_tokens_list,
|
| 1031 |
+
ref_rms=ref_rms_list,
|
| 1032 |
+
speed=speed_list,
|
| 1033 |
+
)
|
| 1034 |
+
|
| 1035 |
+
def _estimate_target_tokens(self, text, ref_text, num_ref_audio_tokens, speed=1.0):
|
| 1036 |
+
"""Estimate number of target audio tokens."""
|
| 1037 |
+
if num_ref_audio_tokens is None or ref_text is None or len(ref_text) == 0:
|
| 1038 |
+
# Fall back to a simple heuristic
|
| 1039 |
+
ref_text = "Nice to meet you."
|
| 1040 |
+
num_ref_audio_tokens = 25
|
| 1041 |
+
|
| 1042 |
+
est = self.duration_estimator.estimate_duration(
|
| 1043 |
+
text, ref_text, num_ref_audio_tokens
|
| 1044 |
+
)
|
| 1045 |
+
if speed > 0 and speed != 1.0:
|
| 1046 |
+
est = est / speed
|
| 1047 |
+
return max(1, int(est))
|
| 1048 |
+
|
    def _ensure_list(
        self, x: Union[Any, List[Any]], batch_size: int, auto_repeat: bool = True
    ) -> List[Any]:
        x_list = x if isinstance(x, list) else [x]
        if len(x_list) not in (1, batch_size):
            raise ValueError(
                f"Expected 1 item or one per text ({batch_size}), "
                f"but got {len(x_list)}"
            )
        if auto_repeat and len(x_list) == 1 and batch_size is not None:
            x_list = x_list * batch_size
        return x_list
    def _prepare_inference_inputs(
        self,
        text: str,
        num_target_tokens: int,
        ref_text: Optional[str] = None,
        ref_audio_tokens: Optional[torch.Tensor] = None,
        lang: Optional[str] = None,
        instruct: Optional[str] = None,
        denoise: bool = True,
    ):
        """Prepare input_ids and audio masks for inference.

        Args:
            text: Target text to generate.
            num_target_tokens: Number of audio tokens to generate.
            ref_text: Optional reference text for voice cloning.
            ref_audio_tokens: Optional reference audio tokens for voice
                cloning, with shape (C, T).
            lang: Optional language ID.
            instruct: Optional style instruction for voice design.
            denoise: Whether to include the <|denoise|> token.
        """

        # Build style tokens: <|denoise|> + <|lang_start|>...<|lang_end|>
        # + <|instruct_start|>...<|instruct_end|>
        style_text = ""
        if denoise and ref_audio_tokens is not None:
            style_text += "<|denoise|>"
        lang_str = lang if lang else "None"
        instruct_str = instruct if instruct else "None"
        style_text += f"<|lang_start|>{lang_str}<|lang_end|>"
        style_text += f"<|instruct_start|>{instruct_str}<|instruct_end|>"

        style_tokens = (
            self.text_tokenizer(style_text, return_tensors="pt")
            .input_ids.repeat(self.config.num_audio_codebook, 1)
            .unsqueeze(0)
        ).to(self.device)  # [1, C, N1]

        # Build text tokens
        full_text = _combine_text(ref_text=ref_text, text=text)
        wrapped_text = f"<|text_start|>{full_text}<|text_end|>"
        text_tokens = (
            _tokenize_with_nonverbal_tags(wrapped_text, self.text_tokenizer)
            .repeat(self.config.num_audio_codebook, 1)
            .unsqueeze(0)
        ).to(self.device)  # [1, C, N2]

        # Target: all MASK
        target_audio_tokens = torch.full(
            (1, self.config.num_audio_codebook, num_target_tokens),
            self.config.audio_mask_id,
            dtype=torch.long,
            device=self.device,
        )

        # Conditional input
        parts = [style_tokens, text_tokens]
        if ref_audio_tokens is not None:
            parts.append(ref_audio_tokens.unsqueeze(0).to(self.device))
        parts.append(target_audio_tokens)
        cond_input_ids = torch.cat(parts, dim=2)

        cond_total_length = cond_input_ids.shape[2]
        cond_audio_start_idx = cond_total_length - num_target_tokens
        if ref_audio_tokens is not None:
            cond_audio_start_idx -= ref_audio_tokens.size(-1)

        cond_audio_mask = torch.zeros(
            1, cond_total_length, dtype=torch.bool, device=self.device
        )
        cond_audio_mask[0, cond_audio_start_idx:] = True

        return {
            "input_ids": cond_input_ids,
            "audio_mask": cond_audio_mask,
        }
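The conditional sequence assembled above is laid out as [style | text | ref_audio | MASK x T], with audio_mask covering the reference and target segments. A shape check of that layout; C=8 codebooks, the segment lengths, and the mask id below are all illustrative assumptions:

import torch

C, N1, N2, R, T = 8, 12, 40, 100, 150             # assumed segment lengths
mask_id = 4096                                     # placeholder for audio_mask_id
style = torch.zeros(1, C, N1, dtype=torch.long)
text = torch.zeros(1, C, N2, dtype=torch.long)
ref = torch.zeros(1, C, R, dtype=torch.long)
target = torch.full((1, C, T), mask_id, dtype=torch.long)

ids = torch.cat([style, text, ref, target], dim=2)
audio_mask = torch.zeros(1, ids.shape[2], dtype=torch.bool)
audio_mask[0, N1 + N2 :] = True                    # ref + target are audio positions
print(ids.shape, audio_mask.sum().item())          # torch.Size([1, 8, 302]) 250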
    def _generate_iterative(
        self, task: GenerationTask, gen_config: OmniVoiceGenerationConfig
    ) -> List[torch.Tensor]:
        """N-step iterative unmasking decoding.

        Args:
            task: A :class:`GenerationTask` containing batch texts, target
                lengths, languages, instructions, and optional reference data.
            gen_config: A :class:`OmniVoiceGenerationConfig` controlling
                decoding steps, guidance, temperatures, etc.
        Returns:
            List of generated audio token tensors of shape (C, T) (one per
            input text).
        """

        B = task.batch_size

        for i in range(B):
            logger.debug(
                "Item %d — text: %s | ref_text: %s | instruct: %s | lang: %s | target_tokens: %d",
                i,
                task.texts[i],
                task.ref_texts[i],
                task.instructs[i],
                task.langs[i],
                task.target_lens[i],
            )

        inputs_list = [
            self._prepare_inference_inputs(
                task.texts[i],
                task.target_lens[i],
                task.ref_texts[i],
                task.ref_audio_tokens[i],
                task.langs[i],
                task.instructs[i],
                gen_config.denoise,
            )
            for i in range(B)
        ]

        c_lens = [inp["input_ids"].size(2) for inp in inputs_list]
        max_c_len = max(c_lens)
        # Padding positions never attend to real tokens, so any id works here.
        pad_id = self.config.audio_mask_id

        batch_input_ids = torch.full(
            (2 * B, self.config.num_audio_codebook, max_c_len),
            pad_id,
            dtype=torch.long,
            device=self.device,
        )
        batch_audio_mask = torch.zeros(
            (2 * B, max_c_len), dtype=torch.bool, device=self.device
        )
        batch_attention_mask = torch.zeros(
            (2 * B, 1, max_c_len, max_c_len), dtype=torch.bool, device=self.device
        )

        for i, inp in enumerate(inputs_list):
            c_len, u_len = c_lens[i], task.target_lens[i]

            # Conditional rows (0 ~ B-1): full style + text + ref + target.
            batch_input_ids[i, :, :c_len] = inp["input_ids"]
            batch_audio_mask[i, :c_len] = inp["audio_mask"]
            batch_attention_mask[i, :, :c_len, :c_len] = True

            # Unconditional rows (B ~ 2B-1): target segment only, used for
            # classifier-free guidance.
            batch_input_ids[B + i, :, :u_len] = inp["input_ids"][..., -u_len:]
            batch_audio_mask[B + i, :u_len] = inp["audio_mask"][..., -u_len:]
            batch_attention_mask[B + i, :, :u_len, :u_len] = True
            if max_c_len > u_len:
                pad_diag = torch.arange(u_len, max_c_len, device=self.device)
                batch_attention_mask[B + i, :, pad_diag, pad_diag] = True

        tokens = torch.full(
            (B, self.config.num_audio_codebook, max(task.target_lens)),
            self.config.audio_mask_id,
            dtype=torch.long,
            device=self.device,
        )

        timesteps = _get_time_steps(
            t_start=0.0,
            t_end=1.0,
            num_step=gen_config.num_step,
            t_shift=gen_config.t_shift,
        ).tolist()
        schedules = []
        for t_len in task.target_lens:
            total_mask = t_len * self.config.num_audio_codebook
            rem = total_mask
            sched = []
            for step in range(gen_config.num_step):
                num = (
                    rem
                    if step == gen_config.num_step - 1
                    else min(
                        math.ceil(total_mask * (timesteps[step + 1] - timesteps[step])),
                        rem,
                    )
                )
                sched.append(int(num))
                rem -= int(num)
            schedules.append(sched)

        layer_ids = torch.arange(
            self.config.num_audio_codebook, device=self.device
        ).view(1, -1, 1)

        for step in range(gen_config.num_step):
            batch_logits = self(
                input_ids=batch_input_ids,
                audio_mask=batch_audio_mask,
                attention_mask=batch_attention_mask,
            ).logits.to(torch.float32)

            for i in range(B):
                k = schedules[i][step]
                if k <= 0:
                    continue

                c_len, t_len = c_lens[i], task.target_lens[i]

                # Extract the real target logits: [1, C, T, V]
                c_logits = batch_logits[i : i + 1, :, c_len - t_len : c_len, :]
                u_logits = batch_logits[B + i : B + i + 1, :, :t_len, :]

                pred_tokens, scores = self._predict_tokens_with_scoring(
                    c_logits, u_logits, gen_config
                )

                scores = scores - (layer_ids * gen_config.layer_penalty_factor)

                if gen_config.position_temperature > 0.0:
                    scores = _gumbel_sample(scores, gen_config.position_temperature)

                sample_tokens = tokens[i : i + 1, :, :t_len]
                scores.masked_fill_(
                    sample_tokens != self.config.audio_mask_id, -float("inf")
                )

                _, topk_idx = torch.topk(scores.flatten(), k)
                flat_tokens = sample_tokens.flatten()
                flat_tokens[topk_idx] = pred_tokens.flatten()[topk_idx]
                sample_tokens.copy_(flat_tokens.view_as(sample_tokens))

                # Write the updated slice back into the batched structures.
                tokens[i : i + 1, :, :t_len] = sample_tokens
                batch_input_ids[i : i + 1, :, c_len - t_len : c_len] = sample_tokens
                batch_input_ids[B + i : B + i + 1, :, :t_len] = sample_tokens

        return [tokens[i, :, : task.target_lens[i]] for i in range(B)]
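To see what the schedule loop above produces, here is the same computation stripped of the model context (all values illustrative). With uniform timesteps each step unmasks an equal share of the t_len x num_codebooks masked positions; t_shift > 1 front-loads the early steps.

import math

def unmask_schedule(t_len, num_codebooks, num_step, t_shift=1.0):
    ts = [i / num_step for i in range(num_step + 1)]
    ts = [t_shift * t / (1 + (t_shift - 1) * t) for t in ts]
    total = rem = t_len * num_codebooks
    sched = []
    for step in range(num_step):
        num = rem if step == num_step - 1 else min(
            math.ceil(total * (ts[step + 1] - ts[step])), rem
        )
        sched.append(num)
        rem -= num
    return sched

print(unmask_schedule(150, 8, 10))               # [120, 120, ..., 120]
print(unmask_schedule(150, 8, 10, t_shift=3.0))  # front-loaded: big early steps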
    def _predict_tokens_with_scoring(self, c_logits, u_logits, gen_config):
        if gen_config.guidance_scale != 0:
            c_log_probs = F.log_softmax(c_logits, dim=-1)
            u_log_probs = F.log_softmax(u_logits, dim=-1)
            log_probs = torch.log_softmax(
                c_log_probs + gen_config.guidance_scale * (c_log_probs - u_log_probs),
                dim=-1,
            )
        else:
            log_probs = F.log_softmax(c_logits, dim=-1)

        log_probs[..., self.config.audio_mask_id] = -float("inf")

        if gen_config.class_temperature > 0.0:
            filtered_probs = _filter_top_k(log_probs, ratio=0.1)
            pred_tokens = _gumbel_sample(
                filtered_probs, gen_config.class_temperature
            ).argmax(dim=-1)
        else:
            pred_tokens = log_probs.argmax(dim=-1)

        confidence_scores = log_probs.max(dim=-1)[0]

        return pred_tokens, confidence_scores
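The guidance branch above is standard classifier-free guidance in log-probability space: extrapolate from the unconditional toward the conditional distribution, then renormalise. A toy 3-class check:

import torch
import torch.nn.functional as F

c = torch.tensor([2.0, 1.0, 0.0])   # conditional logits (toy)
u = torch.tensor([1.0, 1.0, 1.0])   # unconditional logits (toy)
scale = 2.0

c_lp, u_lp = F.log_softmax(c, -1), F.log_softmax(u, -1)
guided = torch.log_softmax(c_lp + scale * (c_lp - u_lp), -1)
print(c_lp.exp())    # tensor([0.6652, 0.2447, 0.0900])
print(guided.exp())  # ~[0.950, 0.047, 0.002]: much more peaked on class 0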
# ---------------------------------------------------------------------------
# Standalone helpers
# ---------------------------------------------------------------------------

def _get_packed_mask(document_ids):
    return partial(_mask_mod_packed, document_ids)


def _mask_mod_packed(document_ids, b, h, q_idx, kv_idx):
    # Sequence-packing logic: a query may only attend to keys from the same
    # document. Padding positions carry doc_id -1, so they never match a real
    # document (they only match other padding positions, which are masked out
    # downstream).
    same_doc = document_ids[q_idx] == document_ids[kv_idx]
    return same_doc
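The (b, h, q_idx, kv_idx) signature matches PyTorch FlexAttention's mask_mod convention, so the partial returned by _get_packed_mask is presumably fed to create_block_mask during training (not shown in this file). The mask semantics can be checked directly:

import torch

doc_ids = torch.tensor([0, 0, 0, 1, 1, -1])  # two packed docs + one padding slot
mask_mod = _get_packed_mask(doc_ids)

# b and h are unused; only document membership matters.
print(mask_mod(0, 0, torch.tensor(0), torch.tensor(2)))  # tensor(True)  same doc
print(mask_mod(0, 0, torch.tensor(0), torch.tensor(4)))  # tensor(False) doc 0 vs 1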
def _resolve_language(language: Optional[str]) -> Union[str, None]:
    from omnivoice.utils.lang_map import LANG_IDS, LANG_NAME_TO_ID

    if language is None or language.lower() == "none":
        return None
    if language in LANG_IDS:
        return language
    key = language.lower()
    if key in LANG_NAME_TO_ID:
        return LANG_NAME_TO_ID[key]
    logger.warning(
        f"Language '{language}' is not recognized. "
        f"Please use a valid language ID (e.g., 'en', 'zh', 'ja', 'de') "
        f"or a full language name (e.g., 'English', 'Chinese', 'Japanese'). "
        f"See supported_language_ids() or supported_language_names() for details. "
        f"Falling back to None (language-agnostic mode)."
    )
    return None
def _resolve_instruct(
    instruct: Optional[str], use_zh: bool = False
) -> Union[str, None]:
    """Validate and normalise a voice-design instruct string.

    Supported instruct items (case-insensitive for English):

    English (comma + space separated):
        gender: male, female
        age: child, teenager, young adult, middle-aged, elderly
        pitch: very low pitch, low pitch, moderate pitch,
               high pitch, very high pitch
        style: whisper
        accent: american accent, british accent, australian accent, ...

    Chinese (full-width comma separated):
        gender: 男, 女
        age: 儿童, 少年, 青年, 中年, 老年
        pitch: 极低音调, 低音调, 中音调, 高音调, 极高音调
        style: 耳语
        dialect: 河南话, 陕西话, 四川话, 贵州话, 云南话,
                 桂林话, 济南话, 石家庄话, 甘肃话, 宁夏话,
                 青岛话, 东北话

    Minor issues (auto-fixed):
        - Wrong separator (half-width comma in Chinese instruct or
          full-width comma in English instruct)
        - Leading / trailing commas

    Major issues (raise ``ValueError``):
        - Unsupported or misspelled instruct items
        - Suggestions are offered for close matches

    Args:
        instruct: Raw instruct string, or ``None``.
        use_zh: If True, normalise all items to Chinese (used when the
            synthesis text contains Chinese and no accent is specified).

    Returns:
        Normalised instruct string, or ``None``.

    Raises:
        ValueError: if any instruct item is unsupported or misspelled.
    """
    if instruct is None:
        return None

    instruct_str = instruct.strip()
    if not instruct_str:
        return None

    # Split on both half-width and full-width commas
    raw_items = re.split(r"\s*[,，]\s*", instruct_str)
    raw_items = [x for x in raw_items if x]

    # Validate each item
    unknown = []
    normalised = []
    for raw in raw_items:
        n = raw.strip().lower()
        if n in _INSTRUCT_ALL_VALID:
            normalised.append(n)
        else:
            sug = difflib.get_close_matches(n, _INSTRUCT_ALL_VALID, n=1, cutoff=0.6)
            unknown.append((raw, n, sug[0] if sug else None))

    if unknown:
        lines = []
        for raw, n, sug in unknown:
            if sug:
                lines.append(f"  '{raw}' -> '{n}' (unsupported; did you mean '{sug}'?)")
            else:
                lines.append(f"  '{raw}' -> '{n}' (unsupported)")
        err = (
            f"Unsupported instruct items found in {instruct_str}:\n"
            + "\n".join(lines)
            + "\n\nValid English items: "
            + ", ".join(sorted(_INSTRUCT_VALID_EN))
            + "\nValid Chinese items: "
            + "，".join(sorted(_INSTRUCT_VALID_ZH))
            + "\n\nTip: Use only English or only Chinese instructs. "
            "English instructs should use comma + space (e.g. "
            "'male, indian accent'),\nChinese instructs should use full-width "
            "comma (e.g. '男，河南话')."
        )
        raise ValueError(err)

    # --- Language consistency: dialect forces Chinese, accent forces English ---
    has_dialect = any(n.endswith("话") for n in normalised)
    has_accent = any(" accent" in n for n in normalised)

    if has_dialect and has_accent:
        raise ValueError(
            "Cannot mix Chinese dialect and English accent in a single instruct. "
            "Dialects are for Chinese speech, accents for English speech."
        )

    if has_dialect:
        use_zh = True
    elif has_accent:
        use_zh = False

    # --- Unify to a single language ---
    if use_zh:
        normalised = [_INSTRUCT_EN_TO_ZH.get(n, n) for n in normalised]
    else:
        normalised = [_INSTRUCT_ZH_TO_EN.get(n, n) for n in normalised]

    # --- Category conflict check ---
    conflicts = []
    for cat in _INSTRUCT_MUTUALLY_EXCLUSIVE:
        hits = [n for n in normalised if n in cat]
        if len(hits) > 1:
            conflicts.append(hits)
    if conflicts:
        parts = []
        for group in conflicts:
            parts.append(" vs ".join(f"'{x}'" for x in group))
        raise ValueError(
            "Conflicting instruct items within the same category: "
            + "; ".join(parts)
            + ". Each category (gender, age, pitch, style, accent, dialect) "
            "allows at most one item."
        )

    # Determine the separator based on language
    has_zh = any(any("\u4e00" <= c <= "\u9fff" for c in n) for n in normalised)
    separator = "，" if has_zh else ", "

    return separator.join(normalised)
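Expected behaviour of the validator, following the rules in the docstring (the exact normalised strings depend on the _INSTRUCT_* mapping tables defined earlier in this file):

_resolve_instruct("female, low pitch")   # -> "female, low pitch"
_resolve_instruct("male, 河南话")         # dialect forces Chinese: -> "男，河南话"
_resolve_instruct("femal")               # ValueError: did you mean 'female'?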
def _filter_top_k(logits: torch.Tensor, ratio: float = 0.1) -> torch.Tensor:
    k = math.ceil(ratio * logits.shape[-1])
    val, ind = logits.topk(k, dim=-1)
    probs = torch.full_like(logits, float("-inf"))
    probs.scatter_(-1, ind, val)
    return probs
def _gumbel_sample(logits: torch.Tensor, temperature: float) -> torch.Tensor:
    # Gumbel-max trick: argmax over Gumbel-perturbed logits is a categorical
    # sample from softmax(logits / temperature).
    scaled_logits = logits / temperature
    u = torch.rand_like(scaled_logits)
    gumbel_noise = -torch.log(-torch.log(u + 1e-10) + 1e-10)
    return scaled_logits + gumbel_noise
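Together, the two helpers above implement top-k Gumbel sampling without an explicit torch.multinomial call: restrict to the top 10% of classes, perturb with Gumbel noise, take the argmax. A minimal sketch with arbitrary shapes:

import torch

logits = torch.randn(2, 50)                  # toy logits
filtered = _filter_top_k(logits, ratio=0.1)  # keep top 5 of 50 classes
samples = _gumbel_sample(filtered, 0.8).argmax(dim=-1)
print(samples.shape)                         # torch.Size([2])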
def _get_time_steps(
    t_start: float = 0.0,
    t_end: float = 1.0,
    num_step: int = 10,
    t_shift: float = 1.0,
    device: torch.device = torch.device("cpu"),
) -> torch.Tensor:
    timesteps = torch.linspace(t_start, t_end, num_step + 1).to(device)
    timesteps = t_shift * timesteps / (1 + (t_shift - 1) * timesteps)
    return timesteps
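The shift t' = s*t / (1 + (s - 1)*t) keeps the endpoints fixed at 0 and 1 while bending the interior; s > 1 enlarges the early intervals (more tokens unmasked up front), s < 1 does the opposite:

print(_get_time_steps(num_step=4, t_shift=1.0))  # tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])
print(_get_time_steps(num_step=4, t_shift=3.0))  # tensor([0.0000, 0.5000, 0.7500, 0.9000, 1.0000])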
_NONVERBAL_PATTERN = re.compile(
    r"\[(laughter|sigh|confirmation-en|question-en|question-ah|question-oh|"
    r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
    r"surprise-yo|dissatisfaction-hnn)\]"
)
def _tokenize_with_nonverbal_tags(text: str, tokenizer) -> torch.Tensor:
    """Tokenize text containing non-verbal tags, handling each tag independently.

    Non-verbal tags are tokenized standalone to guarantee consistent token
    IDs regardless of surrounding language context (Chinese, English, etc.).

    Args:
        text: Full text string potentially containing non-verbal tags.
        tokenizer: HuggingFace text tokenizer instance.
    Returns:
        Token IDs tensor of shape (1, seq_len).
    """
    parts = []
    last_end = 0
    for m in _NONVERBAL_PATTERN.finditer(text):
        if m.start() > last_end:
            segment = text[last_end : m.start()]
            ids = tokenizer(segment, add_special_tokens=False).input_ids
            if ids:
                parts.append(ids)
        tag_ids = tokenizer(m.group(), add_special_tokens=False).input_ids
        if tag_ids:
            parts.append(tag_ids)
        last_end = m.end()
    if last_end < len(text):
        segment = text[last_end:]
        ids = tokenizer(segment, add_special_tokens=False).input_ids
        if ids:
            parts.append(ids)

    if not parts:
        result = tokenizer(text, return_tensors="pt").input_ids
    else:
        combined = []
        for p in parts:
            combined.extend(p)
        result = torch.tensor([combined], dtype=torch.long)
    return result
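For example, a sentence with an embedded tag is tokenized as three independent segments and concatenated, so [laughter] maps to the same ids regardless of the surrounding language. The repo path below is a placeholder, not a real model id:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/omnivoice")  # placeholder path
ids = _tokenize_with_nonverbal_tags("That was great [laughter] see you!", tok)
print(ids.shape)  # (1, seq_len): text segment, tag, text segment, concatenated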
def _combine_text(text, ref_text: Optional[str] = None) -> str:
    # Combine with the reference text if given
    if ref_text:
        full_text = ref_text.strip() + " " + text.strip()
    else:
        full_text = text.strip()

    # Filter out newline / carriage-return characters
    full_text = re.sub(r"[\r\n]+", "", full_text)

    # Replace Chinese parentheses with English ones
    full_text = full_text.replace("\uff08", "(").replace("\uff09", ")")

    # Collapse consecutive spaces / tabs into a single space
    full_text = re.sub(r"[ \t]+", " ", full_text)

    # Remove spaces around Chinese characters
    chinese_range = r"[\u4e00-\u9fff]"
    pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
    full_text = re.sub(pattern, "", full_text)

    return full_text
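A quick check of the normalization rules above (spaces adjacent to Chinese characters are dropped, full-width parentheses are converted, the space between ")" and "hello" survives because neither neighbour is a Chinese character):

print(_combine_text("你好 ，世界（测试） hello world", ref_text="参考文本。"))
# -> "参考文本。你好，世界(测试) hello world"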
# ---------------------------------------------------------------------------
# Register with HuggingFace Auto classes
# ---------------------------------------------------------------------------

AutoConfig.register("omnivoice", OmniVoiceConfig)
AutoModel.register(OmniVoiceConfig, OmniVoice)
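With the registration above and the architectures entry added to config.json in this commit, the model should be loadable through the Auto classes. A hedged sketch; the repo id is a placeholder:

from transformers import AutoModel

model = AutoModel.from_pretrained("your-org/omnivoice", trust_remote_code=True)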
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c353ee1479b536bf414c1b247f5542b6607fb8ae91320e5af1781fee200fddff
+size 470897