Spaces:

XiaomiMiMo
/

MiMo-V2.5-ASR

Running on A100

App Files Files Community

MarkDaniel212 committed 30 days ago

Commit

2c4c098

verified ·

1 Parent(s): 51637ef

Initial Docker-based ASR demo (app.py + src + requirements)

Browse files

Files changed (14) hide show

.gitignore +4 -0
Dockerfile +49 -0
README.md +5 -5
app.py +181 -0
requirements.txt +11 -0
src/mimo_audio/mimo_audio.py +376 -0
src/mimo_audio/modeling_mimo_audio.py +835 -0
src/mimo_audio/process_speechdata.py +289 -0
src/mimo_audio/templates.py +54 -0
src/mimo_audio_tokenizer/__init__.py +6 -0
src/mimo_audio_tokenizer/configuration_audio_tokenizer.py +104 -0
src/mimo_audio_tokenizer/modeling_audio_tokenizer.py +857 -0
src/mimo_audio_tokenizer/modeling_rope_utils.py +878 -0
src/mimo_audio_tokenizer/quantization.py +480 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+assets/models/
+.venv/
+__pycache__/
+*.pyc

Dockerfile ADDED Viewed

	@@ -0,0 +1,49 @@

+# Base image: CUDA 12.3.2 with cuDNN 9 (devel variant) on Ubuntu 22.04.
+FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04
+# Keep apt from asking interactive questions during the build.
+ENV DEBIAN_FRONTEND=noninteractive
+# Compiler toolchain and development headers used by the CPython source build below,
+# plus git, curl, CA certificates and ffmpeg. The apt lists are removed in the same layer.
+RUN apt-get update && apt-get install -y \
+    wget \
+    build-essential \
+    libssl-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    libffi-dev \
+    libncursesw5-dev \
+    xz-utils \
+    tk-dev \
+    liblzma-dev \
+    git \
+    ca-certificates \
+    curl \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /tmp
+# Build Python 3.12.7 from source into /usr/local; `make altinstall` provides the
+# versioned `python3.12` binary used by every later step. Sources are deleted afterwards.
+RUN wget https://www.python.org/ftp/python/3.12.7/Python-3.12.7.tgz \
+    && tar -xvf Python-3.12.7.tgz \
+    && cd Python-3.12.7 \
+    && ./configure --enable-optimizations --prefix=/usr/local \
+    && make -j$(nproc) \
+    && make altinstall \
+    && cd .. \
+    && rm -rf Python-3.12.7 Python-3.12.7.tgz
+# Bootstrap pip for the freshly built interpreter and upgrade it.
+RUN python3.12 -m ensurepip --upgrade
+RUN python3.12 -m pip install --upgrade pip
+# Run everything from here on as a non-root user with uid 1000.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user
+# Put the user's ~/.local/bin first on PATH.
+ENV PATH="$HOME/.local/bin:$PATH"
+WORKDIR /app
+# Copy requirements.txt on its own and install it before copying the rest of the sources.
+COPY --chown=user ./requirements.txt requirements.txt
+RUN python3.12 -m pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . /app
+# Container entry point: start the demo defined in app.py.
+CMD ["python3.12", "app.py"]

README.md CHANGED Viewed

@@ -3,13 +3,13 @@ title: MiMo V2.5 ASR
 emoji: 🦀
 colorFrom: blue
 colorTo: green
-sdk: gradio
-sdk_version: 6.13.0
-python_version: '3.12'
-app_file: app.py
 pinned: false
 license: apache-2.0
 short_description: Leading ASR models from Xiaomi MiMo
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 emoji: 🦀
 colorFrom: blue
 colorTo: green
+sdk: docker
+app_port: 7898
 pinned: false
 license: apache-2.0
 short_description: Leading ASR models from Xiaomi MiMo
 ---
+MiMo-V2.5-ASR: Robust Speech Recognition across languages, dialects, and complex acoustic scenarios.
+Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>.

app.py ADDED Viewed

	@@ -0,0 +1,181 @@

+# Copyright 2025 Xiaomi Corporation.
+import os
+import time
+import gradio as gr
+import torch
+from huggingface_hub import snapshot_download
+from src.mimo_audio.mimo_audio import MimoAudio
+# Hugging Face Hub repo ids of the ASR model and of the audio tokenizer downloaded at startup.
+MODEL_REPO = "XiaomiMiMo/MiMo-V2.5-ASR"
+TOKENIZER_REPO = "XiaomiMiMo/MiMo-Audio-Tokenizer"
+# Local directory the snapshots are stored under; overridable via MIMO_DOWNLOAD_ROOT.
+DOWNLOAD_ROOT = os.environ.get("MIMO_DOWNLOAD_ROOT", "assets/models")
+# Maps the language choice shown in the UI to the tag string handed to the model
+# as `audio_tag`; "Auto" maps to an empty tag.
+LANGUAGE_TAGS = {
+    "Auto": "",
+    "Chinese": "<chinese>",
+    "English": "<english>",
+}
+def download_models():
+    """Download the model and tokenizer snapshots and return their local paths.
+
+    Each repo is stored under DOWNLOAD_ROOT in a directory named after the
+    repo id with "/" replaced by "_". The value of the HF_TOKEN environment
+    variable (or None when unset) is passed to `snapshot_download` as the token.
+
+    Returns:
+        A `(model_path, tokenizer_path)` tuple of local directories.
+    """
+    os.makedirs(DOWNLOAD_ROOT, exist_ok=True)
+    token = os.getenv("HF_TOKEN")
+    local_dirs = []
+    # Model first, tokenizer second - the same order the paths are returned in.
+    for repo_id in (MODEL_REPO, TOKENIZER_REPO):
+        target_dir = os.path.join(DOWNLOAD_ROOT, repo_id.replace("/", "_"))
+        print(f"[download] {repo_id} -> {target_dir}")
+        snapshot_download(repo_id=repo_id, token=token, local_dir=target_dir)
+        local_dirs.append(target_dir)
+    model_path, tokenizer_path = local_dirs
+    return model_path, tokenizer_path
+class ASRGenerator:
+    """Thin adapter that exposes a model's `asr_sft` method under the name `transcribe`."""
+    def __init__(self, model):
+        self.model = model
+    def transcribe(self, audio_path, audio_tag=""):
+        """Call `self.model.asr_sft(audio_path, audio_tag=audio_tag)` and return its result."""
+        run_asr = self.model.asr_sft
+        return run_asr(audio_path, audio_tag=audio_tag)
+class MiMoV25ASRInterface:
+    def __init__(self, model_path, tokenizer_path):
+        """Load the MiMo model from the given local paths and wrap it in an ASRGenerator."""
+        # The device is only logged here; MimoAudio is constructed without a device argument.
+        device = "cpu"
+        if torch.cuda.is_available():
+            device = "cuda"
+        for label, value in (
+            ("device", device),
+            ("model_path", model_path),
+            ("tokenizer_path", tokenizer_path),
+        ):
+            print(f"[init] {label}={value}")
+        self.model = MimoAudio(model_path, tokenizer_path)
+        self.asr_generator = ASRGenerator(self.model)
+        print("[init] model ready")
+    def transcribe(self, uploaded_audio, recorded_audio, language_choice):
+        """Transcribe the uploaded file, or the recording when no upload is given.
+
+        Args:
+            uploaded_audio: Path of the uploaded file, or None.
+            recorded_audio: Path of the microphone recording, or None.
+            language_choice: Key of LANGUAGE_TAGS; unknown keys fall back to an empty tag.
+
+        Returns:
+            A `(transcript, status_message)` tuple. On a missing input or on any
+            exception raised during transcription the transcript is "" and the
+            status message describes the error.
+        """
+        audio_path = uploaded_audio if uploaded_audio else recorded_audio
+        if audio_path is None:
+            return "", "❌ Error: Please upload an audio file or record from your microphone."
+        audio_tag = LANGUAGE_TAGS.get(language_choice, "")
+        try:
+            print("Performing ASR task:")
+            print(f"  Audio: {audio_path}")
+            print(f"  Language: {language_choice} (tag='{audio_tag}')")
+            started_at = time.time()
+            transcript = self.asr_generator.transcribe(audio_path, audio_tag=audio_tag)
+            elapsed = time.time() - started_at
+            status_lines = [
+                f"✅ Transcription completed in {elapsed:.2f}s",
+                f"🎵 Input audio: {os.path.basename(audio_path)}",
+                f"🌐 Language tag: {language_choice}",
+            ]
+            return transcript, "\n".join(status_lines)
+        except Exception as e:
+            # UI boundary: report the failure in the status box instead of raising.
+            error_msg = f"❌ Error during transcription: {e!s}"
+            print(error_msg)
+            return "", error_msg
+    def create_interface(self):
+        """Build and return the Gradio Blocks UI.
+
+        The left column holds the upload and microphone inputs, the language
+        selector and the transcribe button; the right column holds the
+        transcription and status boxes and a clear button.
+        """
+        intro_text = (
+            "Upload an audio file **or** record directly from your microphone. "
+            "Supports Chinese, English, Chinese dialects, code-switch, singing, "
+            "noisy environments, and multi-speaker scenarios."
+        )
+        language_info = (
+            "Auto: automatic language detection (recommended for "
+            "code-switched speech). Select Chinese or English to "
+            "bias the model toward that language."
+        )
+        def clear_all():
+            # Values for: upload, recording, language choice, transcript, status.
+            return None, None, "Auto", "", ""
+        with gr.Blocks(title="MiMo-V2.5-ASR Speech Recognition", theme=gr.themes.Soft()) as iface:
+            gr.Markdown("# 🎙️ MiMo-V2.5-ASR: Robust Speech Recognition")
+            gr.Markdown(intro_text)
+            with gr.Row():
+                with gr.Column():
+                    uploaded_audio = gr.Audio(
+                        label="Upload Audio File",
+                        type="filepath",
+                        sources=["upload"],
+                        interactive=True,
+                    )
+                    recorded_audio = gr.Audio(
+                        label="Or Record from Microphone",
+                        type="filepath",
+                        sources=["microphone"],
+                        interactive=True,
+                    )
+                    language_choice = gr.Radio(
+                        label="Language Tag",
+                        choices=list(LANGUAGE_TAGS.keys()),
+                        value="Auto",
+                        info=language_info,
+                    )
+                    transcribe_btn = gr.Button("🎧 Transcribe", variant="primary", size="lg")
+                with gr.Column():
+                    output_text = gr.Textbox(
+                        label="Transcription",
+                        lines=10,
+                        interactive=False,
+                        placeholder="Transcription result will appear here...",
+                        show_copy_button=True,
+                    )
+                    status = gr.Textbox(
+                        label="Status",
+                        lines=4,
+                        interactive=False,
+                        placeholder="Processing status will be shown here...",
+                    )
+                    with gr.Row():
+                        clear_btn = gr.Button("🗑️ Clear", size="sm")
+            transcribe_inputs = [uploaded_audio, recorded_audio, language_choice]
+            transcribe_outputs = [output_text, status]
+            transcribe_btn.click(
+                fn=self.transcribe,
+                inputs=transcribe_inputs,
+                outputs=transcribe_outputs,
+            )
+            # Clearing resets every input and both output boxes.
+            clear_btn.click(
+                fn=clear_all,
+                outputs=transcribe_inputs + transcribe_outputs,
+            )
+        return iface
+def main():
+    """Download the models, build the Gradio UI and serve it.
+
+    Host and port come from GRADIO_SERVER_NAME / GRADIO_SERVER_PORT and
+    default to 0.0.0.0 and 7898.
+    """
+    print("🚀 Launch MiMo-V2.5-ASR demo...")
+    model_path, tokenizer_path = download_models()
+    demo = MiMoV25ASRInterface(model_path, tokenizer_path).create_interface()
+    env = os.environ
+    host = env.get("GRADIO_SERVER_NAME", "0.0.0.0")
+    port = int(env.get("GRADIO_SERVER_PORT", "7898"))
+    print(f"🌐 Launch service - {host}:{port}")
+    demo.queue().launch(server_name=host, server_port=port)
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+accelerate==1.9.0
+librosa==0.11.0
+scipy==1.16.1
+torch @ https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl
+torchaudio @ https://download.pytorch.org/whl/cu126/torchaudio-2.6.0%2Bcu126-cp312-cp312-linux_x86_64.whl
+flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.2/flash_attn-2.8.2%2Bcu12torch2.6cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
+transformers==4.49.0
+triton==3.2.0
+gradio==5.46.1
+zhon==2.1.1
+huggingface_hub>=0.26.0

src/mimo_audio/mimo_audio.py ADDED Viewed

	@@ -0,0 +1,376 @@

+# Copyright 2025 Xiaomi Corporation.
+import time
+import random
+import torch
+import torchaudio
+from typing import Union
+from torchaudio.transforms import MelSpectrogram
+from transformers import (
+    AutoTokenizer,
+    GenerationConfig
+)
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from .process_speechdata import InputSegment
+from ..mimo_audio_tokenizer import MiMoAudioTokenizer
+from .templates import asr_en_templates, asr_zh_templates
+from .modeling_mimo_audio import (
+    MiMoAudioArguments,
+    MiMoAudioForCausalLM,
+    MiMoSampler,
+    MiMoStopper,
+)
+class MimoAudio:
+    def __init__(
+        self,
+        model_path: str,
+        mimo_audio_tokenizer_path: str,
+        device: str | None = None,
+    ) -> None:
+        """Load the text tokenizer, the MiMo-Audio language model and the audio tokenizer.
+
+        Args:
+            model_path: Path passed to `AutoTokenizer.from_pretrained` and
+                `MiMoAudioForCausalLM.from_pretrained`.
+            mimo_audio_tokenizer_path: Path passed to `MiMoAudioTokenizer.from_pretrained`.
+            device: Target device; when None, "cuda" is used if available, else "cpu".
+        """
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.path = model_path
+        self.mimo_audio_tokenizer_path = mimo_audio_tokenizer_path
+        self.tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
+            self.path
+        )
+        # NOTE(review): int(None) raises TypeError, so this requires the tokenizer to define a pad token.
+        self.padding_idx = int(self.tokenizer.pad_token_id)
+        # Control tokens this wrapper needs; any that are missing from the vocab are added.
+        special_tokens = [
+            "<|sosp|>",
+            "<|eosp|>",
+            "<|empty|>",
+            "<|Human|>",
+            "<|SpeechLM|>",
+            "<|sostm|>",
+            "<|eostm|>",
+            "<|eot|>",
+        ]
+        for token in special_tokens:
+            if token not in self.tokenizer.get_vocab():
+                print(f"Add special tokens {token} to tokenizer.vocab")
+                self.tokenizer.add_tokens([token], special_tokens=True)
+        # Cache the ids of the control tokens used when building prompts and stopping generation.
+        self.sosp_idx = self.tokenizer.convert_tokens_to_ids("<|sosp|>")
+        self.eosp_idx = self.tokenizer.convert_tokens_to_ids("<|eosp|>")
+        self.empty_token = self.tokenizer.convert_tokens_to_ids("<|empty|>")
+        self.sostm_idx = self.tokenizer.convert_tokens_to_ids("<|sostm|>")
+        self.eostm_idx = self.tokenizer.convert_tokens_to_ids("<|eostm|>")
+        self.eot_idx = self.tokenizer.convert_tokens_to_ids("<|eot|>")
+        self.im_start_idx = self.tokenizer.convert_tokens_to_ids("<|im_start|>")
+        self.im_end_idx = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
+        model_args = MiMoAudioArguments(
+            model_name_or_path=self.path,
+            sosp_idx=self.sosp_idx,
+            eosp_idx=self.eosp_idx,
+            empty_idx=self.empty_token,
+            sostm_idx=self.sostm_idx,
+            eostm_idx=self.eostm_idx,
+            eot_idx=self.eot_idx,
+        )
+        start_loading_time = time.monotonic()
+        # Load the language model in bfloat16 with all weights placed on self.device.
+        self.model = MiMoAudioForCausalLM.from_pretrained(
+            self.path,
+            args=model_args,
+            torch_dtype=torch.bfloat16,
+            device_map={"": self.device},
+        )
+        # Mirror the model settings that the preprocessing and decoding code reads.
+        self.group_size=self.model.config.group_size
+        self.audio_channels=self.model.config.audio_channels
+        self.delay_pattern = self.model.config.delay_pattern
+        self.vocab_size = self.model.config.vocab_size
+        self.speech_zeroemb_idx = self.model.speech_empty_ids
+        self.model.eval()
+        print(
+            f"Model loaded in {time.monotonic() - start_loading_time:.2f} seconds, device: {self.device}"
+        )
+        # Base keyword arguments for the GenerationConfig built in forward().
+        self.generate_kwargs = {
+            "max_length": 8192,
+            "eos_token_id": self.tokenizer.eos_token_id,
+            "pad_token_id": self.tokenizer.pad_token_id,
+        }
+        # Samplers returned by get_task_sampler() for tasks without a dedicated entry below.
+        self.default_global_sampler = MiMoSampler(
+            do_sample=True, temperature=0.6, top_k=50, top_p=0.95
+        )
+        self.default_local_sampler = MiMoSampler(
+            do_sample=True, temperature=0.9, top_k=50, top_p=0.95
+        )
+        # Per-task sampler overrides; for "asr" the global sampler has sampling disabled.
+        self.task_sampler_configs = {
+            "asr": {
+                "global": MiMoSampler(do_sample=False, temperature=1.0, top_p=1.0),
+                "local": MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
+            },
+        }
+        start_loading_mimo_audio_tokenizer_time = time.monotonic()
+        self.mimo_audio_tokenizer = MiMoAudioTokenizer.from_pretrained(self.mimo_audio_tokenizer_path)
+        # Put the audio tokenizer in eval mode, cast it to bfloat16 and move it to the target device.
+        self.mimo_audio_tokenizer.eval().bfloat16().to(self.device)
+        print(
+            f"MiMo-Audio Tokenizer loaded in {time.monotonic() - start_loading_mimo_audio_tokenizer_time:.2f} seconds, device: {self.device}"
+        )
+        # Mel spectrogram transform configured from the audio tokenizer's own settings
+        # (sample rate, FFT/window/hop sizes, frequency range, number of mel bins).
+        self.mel_transform = MelSpectrogram(
+            sample_rate=self.mimo_audio_tokenizer.config.sampling_rate,
+            n_fft=self.mimo_audio_tokenizer.config.nfft,
+            hop_length=self.mimo_audio_tokenizer.config.hop_length,
+            win_length=self.mimo_audio_tokenizer.config.window_size,
+            f_min=self.mimo_audio_tokenizer.config.fmin,
+            f_max=self.mimo_audio_tokenizer.config.fmax,
+            n_mels=self.mimo_audio_tokenizer.config.n_mels,
+            power=1.0,
+            center=True,
+        ).to(self.device)
+    def get_task_sampler(self, task_name):
+        """Return the {"global", "local"} sampler pair for `task_name`.
+
+        Tasks without an entry in `self.task_sampler_configs` get the default samplers.
+        """
+        try:
+            return self.task_sampler_configs[task_name]
+        except KeyError:
+            return {
+                "global": self.default_global_sampler,
+                "local": self.default_local_sampler,
+            }
+    def wav2mel(self, wav):
+        """Return the log of `self.mel_transform` applied to `wav`, floored at 1e-7 before the log.
+
+        A leading dimension is added before the transform and size-1 dimensions
+        are squeezed out of the result.
+        """
+        batched = wav[None, :]
+        floored = torch.clamp(self.mel_transform(batched), min=1e-7)
+        return floored.log().squeeze()
+    def resample_audio_if_needed(self, wav_tensor: torch.Tensor, original_sr: int):
+        """Resample `wav_tensor` to the audio tokenizer's sampling rate.
+
+        The input tensor is returned unchanged when `original_sr` already matches.
+        """
+        target_sr = self.mimo_audio_tokenizer.config.sampling_rate
+        if original_sr == target_sr:
+            return wav_tensor
+        return torchaudio.functional.resample(wav_tensor, original_sr, target_sr)
+    def group_by_length(self, features: torch.Tensor, lengths: torch.Tensor, max_length: int):
+        """Split consecutive sequences into groups whose summed length stays within `max_length`.
+
+        A sequence that is longer than `max_length` on its own still forms a group by itself.
+
+        Args:
+            features: Tensor whose first dimension is the concatenation of all sequences.
+            lengths: 1-D tensor with the length of each sequence.
+            max_length: Upper bound on the summed length of a group.
+
+        Returns:
+            `(feature_groups, len_groups)`: the per-group slices of `features` and of `lengths`.
+
+        Raises:
+            ValueError: If `features.size(0)` differs from `lengths.sum()`.
+        """
+        expected_rows = lengths.sum().item()
+        if features.size(0) != expected_rows:
+            raise ValueError(f"Feature size mismatch: {features.size(0)} vs {expected_rows}")
+        # Indices at which a new group starts, bracketed by 0 and len(lengths).
+        boundaries = [0]
+        running = 0
+        for idx, seq_len in enumerate(lengths.tolist()):
+            if running > 0 and running + seq_len > max_length:
+                boundaries.append(idx)
+                running = seq_len
+            else:
+                running += seq_len
+        boundaries.append(len(lengths))
+        # The hi > lo filter drops the empty span produced when `lengths` is empty.
+        group_sizes = [hi - lo for lo, hi in zip(boundaries, boundaries[1:]) if hi > lo]
+        len_groups = torch.split(lengths, group_sizes)
+        rows_per_group = [group.sum().item() for group in len_groups]
+        feature_groups = torch.split(features, rows_per_group)
+        return feature_groups, len_groups
+    def encode_batch(self, input_features: torch.Tensor, input_lens: torch.Tensor, max_length: int = 256000):
+        """Encode features with the audio tokenizer's encoder, one length-bounded group at a time.
+
+        The inputs are split with `group_by_length`, each group is encoded under
+        `torch.no_grad()`, and the resulting codes are concatenated along the last dimension.
+        """
+        feature_groups, len_groups = self.group_by_length(input_features, input_lens, max_length)
+        encode = self.mimo_audio_tokenizer.encoder.encode
+        encoded_parts = []
+        with torch.no_grad():
+            for group_features, group_lens in zip(feature_groups, len_groups):
+                codes, _ = encode(
+                    input_features=group_features.to(self.device),
+                    input_lens=group_lens.to(self.device),
+                    return_codes_only=True,
+                )
+                encoded_parts.append(codes)
+        return torch.cat(encoded_parts, dim=-1)
+    def preprocess_input(
+        self,
+        input: Union[str, torch.Tensor],
+    ):
+        """Convert an audio file path or waveform tensor into a flat tensor of audio codes.
+
+        A path is loaded with torchaudio, averaged over its first dimension when
+        2-D, and resampled to the tokenizer's rate; a tensor is used as given.
+        The waveform is tokenized in chunks of 30 * sampling_rate samples, the
+        first `audio_channels` code channels are kept, and the sequence is
+        padded to a multiple of `group_size` by repeating the last frame.
+        """
+        if not isinstance(input, torch.Tensor):
+            loaded, sr = torchaudio.load(input)
+            if loaded.ndim == 2:
+                loaded = loaded.mean(dim=0)
+            wav = self.resample_audio_if_needed(loaded, sr)
+        else:
+            wav = input
+        wav = wav.to(self.device)
+        tokenizer_config = self.mimo_audio_tokenizer.config
+        n_fft = tokenizer_config.nfft
+        chunk_samples = 30 * tokenizer_config.sampling_rate
+        total_samples = wav.shape[-1]
+        code_parts = []
+        start = 0
+        while start < total_samples:
+            end = min(start + chunk_samples, total_samples)
+            leftover = total_samples - end
+            # A trailing chunk shorter than n_fft would break the mel reflect
+            # padding, so it is absorbed into the current chunk instead.
+            if 0 < leftover < n_fft:
+                end = total_samples
+            chunk = wav[start:end]
+            # Only reachable when the whole audio is shorter than n_fft: zero-pad it.
+            shortfall = n_fft - chunk.shape[-1]
+            if shortfall > 0:
+                chunk = torch.nn.functional.pad(chunk, (0, shortfall))
+            mel = self.wav2mel(chunk).transpose(0, 1)  # (seq_len, n_mels)
+            code_parts.append(
+                self.encode_batch(
+                    input_features=mel,
+                    input_lens=torch.tensor([mel.size(0)]),
+                )
+            )
+            start = end
+        codes = torch.cat(code_parts, dim=-1).transpose(0, 1).detach().cpu()
+        audio_codes = codes[:, :self.audio_channels]
+        # Pad to a multiple of group_size by repeating the last frame.
+        overhang = audio_codes.shape[0] % self.group_size
+        if overhang != 0:
+            filler = audio_codes[-1:, :].repeat(self.group_size - overhang, 1)
+            audio_codes = torch.cat([audio_codes, filler], dim=0)
+        return audio_codes.reshape(-1)
+    def get_input_ids(self, prompt):
+        """Convert each prompt segment to ids, concatenate them along dim 1 and move the result to `self.device`."""
+        pieces = []
+        for segment in prompt:
+            pieces.append(
+                segment.to_input_id(self.tokenizer, self.group_size, self.audio_channels)
+            )
+        return torch.cat(pieces, dim=1).to(self.device)
+    def get_asr_sft_prompt(
+        self,
+        input: Union[None, str] = None,
+        audio_tag="",
+    ):
+        """Build the input ids of the ASR chat prompt for one audio input.
+
+        The prompt is a user turn holding the audio codes followed by a randomly
+        chosen instruction template, then an assistant turn opened with an empty
+        `<think>` block and `audio_tag`. The template pool is Chinese or English
+        when the tag names that language, otherwise both pools combined.
+        """
+        audio_tokenized = self.preprocess_input(input)
+        if '<chinese>' in audio_tag:
+            template_pool = asr_zh_templates
+        elif '<english>' in audio_tag:
+            template_pool = asr_en_templates
+        else:
+            template_pool = asr_zh_templates + asr_en_templates
+        template = random.choice(template_pool)
+        # Every segment shares the same zero-embedding ids.
+        shared = {
+            "speech_zeroemb_idx": self.speech_zeroemb_idx,
+            "text_zeroemb_idx": self.empty_token,
+        }
+        lm_prompt = [
+            InputSegment(text="<|im_start|>user\n", **shared),
+            InputSegment(audio=audio_tokenized, **shared),
+            InputSegment(text=template, **shared),
+            InputSegment(text="<|im_end|>\n", **shared),
+            InputSegment(text="<|im_start|>assistant\n", **shared),
+            InputSegment(text=f"<think>\n\n</think>\n{audio_tag}", **shared),
+        ]
+        return self.get_input_ids(lm_prompt)
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids,
+        stopping_criteria=None,
+        min_new_tokens=0,
+        max_new_tokens=8192,
+        task_name=None,
+    ):
+        """Generate from `input_ids` and return the decoded text channel.
+
+        The min/max lengths, counted in groups, are written onto every
+        MiMoStopper in `stopping_criteria`. After generation the prompt is
+        dropped, one text token per group is kept (minus the last one), and the
+        "<|empty|>", "<|eot|>" and "<|eostm|>" markers are removed from the
+        decoded string.
+        """
+        samplers = self.get_task_sampler(task_name)
+        generation_config = GenerationConfig(**self.generate_kwargs.copy())
+        row_width = self.audio_channels + 1
+        input_ids = input_ids.T.reshape(1, -1) # [B, flattened(T, audio_channels + 1)]
+        prompt_length = input_ids.shape[1] // row_width
+        prompt_groups = prompt_length // self.group_size
+        max_length = prompt_groups + max_new_tokens
+        min_length = prompt_groups + min_new_tokens
+        if stopping_criteria is not None:
+            for stopper in stopping_criteria:
+                if not isinstance(stopper, MiMoStopper):
+                    continue
+                stopper.max_length = max_length
+                stopper.min_length = min_length
+        generated_ids = self.model.generate(
+            input_ids,
+            generation_config,
+            stopping_criteria=stopping_criteria,
+            global_sampler=samplers["global"],
+            local_sampler=samplers["local"],
+        )
+        new_tokens = generated_ids.int().cpu().reshape(-1, row_width).T[:, prompt_length:]
+        text = new_tokens[0, ::self.group_size][:-1]
+        detokenized_text = self.tokenizer.decode(text, skip_special_tokens=False).strip()
+        for marker in ("<|empty|>", "<|eot|>", "<|eostm|>"):
+            detokenized_text = detokenized_text.replace(marker, "")
+        print("Text channel:\t", detokenized_text)
+        return detokenized_text
+    def asr_sft(self, audio, audio_tag=""):
+        """Transcribe `audio` and return the text with any language tag removed.
+
+        Generation stops on the tokenizer's EOS token or on `<|im_end|>`. The
+        result is stripped of surrounding whitespace only when a `<chinese>` or
+        `<english>` tag was found in it.
+        """
+        stopper = MiMoStopper(
+            stop_tokens=[self.tokenizer.eos_token_id, self.im_end_idx],
+            group_size=self.group_size,
+            audio_channels=self.audio_channels,
+        )
+        input_ids = self.get_asr_sft_prompt(audio, audio_tag)
+        result = self.forward(input_ids, stopping_criteria=[stopper], task_name="asr")
+        language_tags = ('<chinese>', '<english>')
+        if any(tag in result for tag in language_tags):
+            for tag in language_tags:
+                result = result.replace(tag, '')
+            result = result.strip()
+        return result

src/mimo_audio/modeling_mimo_audio.py ADDED Viewed

	@@ -0,0 +1,835 @@

+# Copyright 2025 Xiaomi Corporation.
+import copy
+import logging
+from dataclasses import dataclass
+from typing import List, Optional, Union, cast
+import torch
+import torch.distributed as dist
+from torch import nn
+from transformers import StoppingCriteria
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation.streamers import BaseStreamer
+from transformers.generation.utils import (
+    GenerateOutput,
+    GenerationConfig,
+    StoppingCriteriaList,
+    is_deepspeed_zero3_enabled,
+)
+from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+from transformers.models.qwen2.modeling_qwen2 import (
+    Qwen2Model,
+    Qwen2PreTrainedModel,
+)
+from transformers.utils import is_torchdynamo_compiling
+logger = logging.getLogger(__name__)
+class MiMoStopper(StoppingCriteria):
+    """Stopping criterion working on flattened ids, where one step spans
+    `(audio_channels + 1) * group_size` entries.
+
+    Generation is reported as done once the number of whole steps reaches
+    `max_length`, or once the entry `step` positions from the end of the first
+    row is a stop token and at least `min_length` steps exist.
+    """
+    def __init__(
+        self,
+        group_size: int,
+        audio_channels: int,
+        stop_tokens: list[int] | None = None,
+        max_length: int | None = None,
+        min_length: int | None = None,
+    ) -> None:
+        super().__init__()
+        self.group_size = group_size
+        self.audio_channels = audio_channels
+        self.step = (audio_channels + 1) * group_size
+        self.stop_token_ids = set(stop_tokens) if stop_tokens else set()
+        self.max_length = max_length
+        self.min_length = min_length if min_length else 0
+    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor):
+        """Return a bool tensor of shape (batch,) filled with the single done flag."""
+        cur_len = input_ids.shape[-1] // self.step
+        # A max_length of None (or 0) disables the length limit.
+        is_done = bool(self.max_length) and cur_len >= self.max_length
+        has_full_step = input_ids.shape[1] >= self.step
+        if self.stop_token_ids and has_full_step and cur_len >= self.min_length:
+            # Only the first row of the batch is inspected.
+            last_token = input_ids[0, -self.step].item()
+            is_done = is_done or last_token in self.stop_token_ids
+        return torch.full(
+            (input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool
+        )
+@dataclass
+class MiMoSampler:
+    """Sampling settings plus the logic that turns logits into token ids.
+
+    Attributes are optional; a filter whose setting is None is skipped.
+    """
+    # When truthy, `sample` draws from the filtered distribution; otherwise it takes the argmax.
+    do_sample: bool | None = None
+    # Divisor applied to the logits before filtering.
+    temperature: float | None = None
+    # Number of highest-scoring entries kept per row (applied when > 0).
+    top_k: int | None = None
+    # Nucleus probability mass kept per row (applied when in (0, 1]).
+    top_p: float | None = None
+    def process(self, scores: torch.Tensor):
+        """Apply temperature, top-k and top-p filtering to `scores` of shape (batch, vocab).
+
+        Filtered-out entries are set to -inf. The input tensor is not modified;
+        it is returned as-is when no filter is configured.
+        """
+        if self.temperature is not None:
+            scores = scores / self.temperature
+        if self.top_k is not None and self.top_k > 0:
+            top_k = min(self.top_k, scores.shape[-1])
+            # Keep a trailing dimension on the k-th best value so that each row is
+            # compared against its own threshold; a (batch,) threshold would be
+            # broadcast along the vocab axis and breaks for batch sizes above 1.
+            kth_best = torch.topk(scores, top_k)[0][..., -1, None]
+            scores = scores.masked_fill(scores < kth_best, float("-inf"))
+        if self.top_p is not None and 0.0 < self.top_p <= 1.0:
+            # Ascending sort: drop the low-probability tail whose cumulative mass
+            # is at most 1 - top_p.
+            sorted_logits, sorted_indices = torch.sort(scores)
+            cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
+            sorted_indices_to_remove = cumulative_probs <= (1 - self.top_p)
+            # Always keep the highest-scoring entry.
+            sorted_indices_to_remove[:, -1] = 0
+            indices_to_remove = sorted_indices_to_remove.scatter(
+                1, sorted_indices, sorted_indices_to_remove
+            )
+            scores = scores.masked_fill(indices_to_remove, float("-inf"))
+        return scores
+    def sample(self, scores: torch.Tensor, removed_tokens: list[int] | None = None):
+        """Pick one token id per row of `scores`.
+
+        Args:
+            scores: Logits of shape (batch, vocab); left unmodified.
+            removed_tokens: Token ids that must never be selected.
+
+        Returns:
+            A tensor of shape (batch,) with the chosen ids.
+        """
+        filtered = self.process(scores)
+        if removed_tokens:
+            # `process` hands back its input when no filter is configured; copy
+            # before masking so the caller's tensor is not overwritten.
+            if filtered is scores:
+                filtered = filtered.clone()
+            for t in removed_tokens:
+                filtered[:, t] = float("-inf")
+        if self.do_sample:
+            probs = filtered.softmax(dim=-1)
+            return torch.multinomial(probs, num_samples=1).squeeze(-1)
+        return torch.argmax(filtered, dim=-1)
+@dataclass
+class MiMoAudioOutput(ModelOutput):
+    """Model output container; every field is optional and defaults to None."""
+    text_logits: torch.FloatTensor | None = None
+    local_hidden_states: torch.FloatTensor | None = None
+    past_key_values: Cache | None = None
+    # NOTE(review): the string below is a bare expression statement, not a docstring;
+    # it presumably describes `local_hidden_states` - confirm.
+    """Downcast hidden states for local transformer generation"""
+@dataclass
+class MiMoAudioConfig(Qwen2Config):
+    """Qwen2Config extended with the speech settings used by MiMo-Audio.
+
+    The speech vocab sizes, zero-embedding ids and delay pattern may be given
+    as a single value or as a "-"-separated string with one entry per channel.
+
+    NOTE(review): `@dataclass` is applied although the class declares no
+    annotated fields - confirm that it is intended.
+    """
+    def __init__(
+        self,
+        *,
+        speech_vocab_size: str | int = "1025-1025-129-129-129-129-129-129",
+        speech_zeroemb_idx: str | int = "1024-1024-128-128-128-128-128-128",
+        delay_pattern: str = "0-1-2-3-4-5-6-7",
+        head_dim: int = 128,
+        group_size: int = 4,
+        audio_channels: int = 8,
+        local_dim: int = 1024,
+        local_layers: int = 16,
+        local_attn_heads: int = 64,
+        local_ffn_dim: int = 4096,
+        local_attn_dropout: float = 0.1,
+        input_local_layers: int = 6,
+        input_local_dim: int | None = None,
+        input_full_attention: bool | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.speech_vocab_size = speech_vocab_size
+        self.speech_zeroemb_idx = speech_zeroemb_idx
+        self.delay_pattern = delay_pattern
+        self.head_dim = head_dim
+        self.group_size = group_size
+        self.audio_channels = audio_channels
+        self.local_dim = local_dim
+        self.local_layers = local_layers
+        self.local_attn_heads = local_attn_heads
+        self.local_ffn_dim = local_ffn_dim
+        self.local_attn_dropout = local_attn_dropout
+        self.input_local_layers = input_local_layers
+        # The input-local width falls back to the local width when unset.
+        self.input_local_dim = input_local_dim if input_local_dim else local_dim
+        self.input_full_attention = input_full_attention
+    def _parse_maybe_list(self, value: str | int, length: int) -> List[int]:
+        """Parse "a-b-c" into [a, b, c]; repeat any other value `length` times."""
+        if not (isinstance(value, str) and "-" in value):
+            return [int(value)] * length
+        return list(map(int, value.split("-")))
+    def parsed_speech_empty_ids(self):
+        """Return `speech_zeroemb_idx` as a list of ints."""
+        return self._parse_maybe_list(self.speech_zeroemb_idx, self.audio_channels)
+    def parsed_speech_vocab_sizes(self):
+        """Return `speech_vocab_size` as a list of ints."""
+        return self._parse_maybe_list(self.speech_vocab_size, self.audio_channels)
+    def parsed_delay_pattern(self):
+        """Return `delay_pattern` as a list of ints."""
+        return self._parse_maybe_list(self.delay_pattern, self.audio_channels)
+    def _local_variant(self, *, hidden_size, num_layers, ffn_dim):
+        """Deep-copy this config and resize it for a local transformer."""
+        config = copy.deepcopy(self)
+        config.hidden_size = hidden_size
+        config.num_hidden_layers = num_layers
+        config.num_attention_heads = self.local_attn_heads
+        config.num_key_value_heads = self.local_attn_heads
+        config.head_dim = hidden_size // self.local_attn_heads
+        config.intermediate_size = ffn_dim
+        config.attention_dropout = self.local_attn_dropout
+        return config
+    def local_config(self):
+        """Return a copy of this config sized by the `local_*` settings."""
+        return self._local_variant(
+            hidden_size=self.local_dim,
+            num_layers=self.local_layers,
+            ffn_dim=self.local_ffn_dim,
+        )
+    def input_local_config(self):
+        """Return a copy sized by the `input_local_*` settings, with an FFN of 4x the hidden size."""
+        return self._local_variant(
+            hidden_size=self.input_local_dim,
+            num_layers=self.input_local_layers,
+            ffn_dim=self.input_local_dim * 4,
+        )
+@dataclass
+class MiMoAudioArguments:
+    """Model path and special-token ids handed to MiMoAudioForCausalLM."""
+    model_name_or_path: str
+    sosp_idx: int
+    eosp_idx: int
+    sostm_idx: int
+    eostm_idx: int
+    eot_idx: int
+    empty_idx: int
+    def to_dict(self):
+        """Return the fields as a plain dict, in declaration order."""
+        field_names = (
+            "model_name_or_path",
+            "sosp_idx",
+            "eosp_idx",
+            "sostm_idx",
+            "eostm_idx",
+            "eot_idx",
+            "empty_idx",
+        )
+        return {name: getattr(self, name) for name in field_names}
+class MiMoAudioForCausalLM(Qwen2PreTrainedModel):
+    def __init__(
+        self,
+        config: MiMoAudioConfig | Qwen2Config,
+        args: MiMoAudioArguments | dict,
+    ):
+        """Build the global backbone, the two local transformers and all projection layers.
+        Args:
+            config: Model configuration. A `Qwen2Config` instance is rebuilt as a
+                `MiMoAudioConfig` from its attributes.
+            args: Special-token ids and model path, either as `MiMoAudioArguments`
+                or as a dict of its fields.
+        """
+        super().__init__(config)
+        # Normalise both inputs to the project's own types before reading from them.
+        config = (
+            MiMoAudioConfig(**vars(config))
+            if isinstance(config, Qwen2Config)
+            else config
+        )
+        args = MiMoAudioArguments(**args) if isinstance(args, dict) else args
+        self.config = config
+        self.args = args
+        # Global (per-group) backbone and its text head.
+        self.model = Qwen2Model(config)
+        self.speech_vocab_sizes = config.parsed_speech_vocab_sizes()
+        self.speech_empty_ids = config.parsed_speech_empty_ids()
+        self.delay_pattern = config.parsed_delay_pattern()
+        self.group_size = config.group_size
+        self.audio_channels = config.audio_channels
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Local transformer used by `local_forward`; it is only ever fed
+        # `inputs_embeds`, so its own token embedding table is dropped.
+        self.local_config = config.local_config()
+        self.local_transformer = Qwen2Model(self.local_config)
+        self.local_transformer.embed_tokens = None
+        # Input local transformer used by `apply_input_local_transformer`
+        # (always built; likewise called with `inputs_embeds` only).
+        self.input_local_config = config.input_local_config()
+        self.input_local_transformer = Qwen2Model(self.input_local_config)
+        self.input_local_transformer.embed_tokens = None
+        # One output head per audio channel on top of the local transformer.
+        self.local_transformer_lm_heads = nn.ModuleList(
+            [
+                nn.Linear(
+                    self.local_config.hidden_size,
+                    self.speech_vocab_sizes[i],
+                    bias=False,
+                )
+                for i in range(self.audio_channels)
+            ]
+        )
+        # One embedding table per audio channel; the channel's empty id is its padding index.
+        self.speech_embeddings = nn.ModuleList(
+            [
+                nn.Embedding(
+                    self.speech_vocab_sizes[i],
+                    self.input_local_config.hidden_size,
+                    padding_idx=self.speech_empty_ids[i],
+                )
+                for i in range(self.audio_channels)
+            ]
+        )
+        # Bridge speech embeddings into the local transformer only when the widths differ.
+        if self.input_local_config.hidden_size != self.local_config.hidden_size:
+            self.speech_embeddings_to_local = nn.Linear(
+                self.input_local_config.hidden_size,
+                self.local_config.hidden_size,
+                bias=False,
+            )
+        else:
+            self.speech_embeddings_to_local = None
+        # Projects one flattened group of speech embeddings
+        # (group_size * input-local width) to the backbone width.
+        self.speech_group_downcast = nn.Linear(
+            self.input_local_config.hidden_size * config.group_size,
+            config.hidden_size,
+            bias=False,
+        )
+        # Projects backbone hidden states to the local transformer width.
+        self.hidden_states_downcast = nn.Linear(
+            config.hidden_size,
+            self.local_config.hidden_size,
+            bias=False,
+        )
+        # Initialize weights and apply final processing
+        self.post_init()
+    def apply_input_local_transformer(self, speech_embeddings: torch.Tensor):
+        B, T_groups, group_size, hidden_size = speech_embeddings.shape
+        # Process each group independently: [B*T//group_size, group_size, hidden_size]
+        input_embeddings = speech_embeddings.reshape(
+            B * T_groups, group_size, hidden_size
+        )
+        output: BaseModelOutputWithPast = self.input_local_transformer(
+            inputs_embeds=input_embeddings,
+            return_dict=True,
+            is_causal=not self.config.input_full_attention,  # for SDPA
+        )
+        encoded_embeddings = output.last_hidden_state
+        # Reshape back to original format
+        # [B*T//group_size, group_size, hidden_size] -> [B, T//group_size, group_size, hidden_size]
+        encoded_embeddings = encoded_embeddings.reshape(
+            B, T_groups, group_size, hidden_size
+        )
+        return encoded_embeddings
+    def _prepare_input_embeds(
+        self,
+        input_ids: torch.LongTensor,  # [B, audio_channels + 1, new_T]
+    ):
+        B = input_ids.shape[0]
+        input_ids = input_ids.int()
+        group_size = self.config.group_size
+        text_input_ids = input_ids[:, 0, ::group_size]
+        speech_input_ids = (
+            input_ids[:, 1:, :]
+            .view(B, self.audio_channels, -1, group_size)
+            .transpose(1, 2)
+        )  # [B, T//group_size, audio_channels, group_size]
+        is_speech = text_input_ids == self.args.empty_idx  # [B, T//group_size]
+        speech_embeds = torch.zeros(
+            (
+                B,
+                is_speech.shape[1],
+                group_size,
+                self.input_local_config.hidden_size,
+            ),
+            device=input_ids.device,
+            dtype=torch.bfloat16,
+        )
+        for idx in range(self.audio_channels):
+            cur_empty = self.speech_empty_ids[idx]
+            cur_embed = self.speech_embeddings[idx]
+            cur_speech_ids = speech_input_ids[:, :, idx, :]
+            cur_speech_embeds: torch.Tensor = cur_embed(cur_speech_ids)
+            # [B, T_groups, group_size, hidden_size]
+            cur_mask = cur_speech_ids == cur_empty
+            cur_speech_embeds.masked_fill_(cur_mask.unsqueeze(-1), 0.0)
+            speech_embeds += cur_speech_embeds
+        speech_embeds = speech_embeds * is_speech.unsqueeze(-1).unsqueeze(-1)
+        # Apply input local transformer if configured
+        speech_embeds = self.apply_input_local_transformer(speech_embeds)
+        speech_embeds = speech_embeds * is_speech.unsqueeze(-1).unsqueeze(-1)
+        T_groups = speech_embeds.shape[1]
+        speech_grouped_embeds: torch.Tensor = self.speech_group_downcast(
+            speech_embeds.view(B, T_groups, -1)
+        )  # [B, T_groups, hidden_size]
+        text_embeds: torch.Tensor = self.model.embed_tokens(text_input_ids)
+        text_zero_mask = text_input_ids == self.args.empty_idx
+        text_embeds.masked_fill_(text_zero_mask.unsqueeze(-1), 0.0)
+        return text_embeds + speech_grouped_embeds
+    def forward(
+        self,
+        input_ids: torch.LongTensor,  # [B, audio_channels + 1, new_T]
+        attention_mask: torch.Tensor,  # [B, T_group]
+        position_ids: torch.LongTensor,  # [B, new_T_group]
+        past_key_values: Cache | None = None,
+        cache_position: torch.LongTensor | None = None,  # [new_T_group]
+        **_kwargs,
+    ):
+        inputs_embeds = self._prepare_input_embeds(input_ids)
+        outputs: BaseModelOutputWithPast = self.model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=True,
+            return_dict=True,
+            cache_position=cache_position,
+        )
+        hidden_states = outputs.last_hidden_state  # [B, new_T_group, hidden_size]
+        text_logits: torch.Tensor = self.lm_head(
+            hidden_states[:, -1:, :]
+        )  # [B, 1, vocab_size]
+        shift_hidden_states: torch.Tensor = self.hidden_states_downcast(
+            hidden_states[:, -1:, :]
+        )  # [B, 1, hidden_size]
+        return MiMoAudioOutput(
+            text_logits=text_logits,
+            local_hidden_states=shift_hidden_states,
+            past_key_values=outputs.past_key_values,
+        )
+    def local_forward(
+        self,
+        local_embeds: torch.FloatTensor,  # [B, 1, hidden_size]
+        tokens_dtype: torch.dtype,
+        tokens_device: torch.device,
+        local_sampler: MiMoSampler | None = None,
+    ):
+        B = local_embeds.shape[0]
+        delay_iters = self.group_size + max(self.delay_pattern)
+        past_key_values = DynamicCache()
+        local_tokens = torch.zeros(
+            (B, self.group_size, self.audio_channels),
+            dtype=tokens_dtype,
+            device=tokens_device,
+        )
+        if local_sampler is None:
+            local_sampler = MiMoSampler()
+        for t in range(delay_iters):
+            output: BaseModelOutputWithPast = self.local_transformer(
+                inputs_embeds=local_embeds,
+                past_key_values=past_key_values,
+                return_dict=True,
+                use_cache=True,
+            )
+            hidden_state = output.last_hidden_state
+            past_key_values = output.past_key_values
+            local_embeds = torch.zeros_like(local_embeds)
+            for idx in range(self.audio_channels):
+                cur_start = self.delay_pattern[idx]
+                cur_end = cur_start + self.group_size
+                cur_empty = self.speech_empty_ids[idx]
+                if cur_start <= t < cur_end:
+                    cur_lm_head = self.local_transformer_lm_heads[idx]
+                    cur_scores: torch.Tensor = cur_lm_head(hidden_state)[:, -1, :]
+                    # [B, vocab_size]
+                    cur_token = local_sampler.sample(
+                        cur_scores,
+                        [cur_empty],
+                    )
+                    local_tokens[:, t - cur_start, idx] = cur_token
+                    cur_input_embed = self.speech_embeddings[idx](
+                        cur_token.unsqueeze(1)
+                    )
+                    if self.speech_embeddings_to_local is not None:
+                        cur_input_embed = self.speech_embeddings_to_local(
+                            cur_input_embed
+                        )
+                    local_embeds += cur_input_embed
+        return local_tokens  # [B, group_size, audio_channels]
+    def _prepare_attention_mask(
+        self, inputs: torch.Tensor, input_ids_length: int
+    ) -> torch.Tensor:
+        # No information for attention mask inference -> return default attention mask
+        return torch.ones(
+            (inputs.shape[0], input_ids_length),
+            dtype=torch.bool,
+            device=inputs.device,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ):
+        """
+        Prepare the model inputs for generation. It includes operations like creating missing `position_ids` and
+        slicing inputs given the existing cache.
+        The flat `input_ids` are first viewed as one column per group so that cache-based slicing works on whole
+        groups; the kept columns are unfolded back to `[B, audio_channels + 1, T * group_size]` before returning.
+        See the forward pass in the model documentation for expected arguments (different models might have different
+        requirements for e.g. `past_key_values`).
+        """
+        # 1. Handle BC:
+        model_inputs = {}
+        # Fold every group (group_size steps of text + speech ids) into a single column.
+        input_ids = input_ids.reshape(
+            input_ids.shape[0], -1, (self.audio_channels + 1) * self.config.group_size
+        ).transpose(1, 2)  # [B, (audio_channels + 1) * group_size, T]
+        # - some models don't have `Cache` support (which implies they don't expect `cache_position` in `forward`)
+        if self._supports_cache_class:
+            model_inputs["cache_position"] = cache_position
+        # - `cache_position` was not a mandatory input in `prepare_inputs_for_generation` for those models, and this
+        #   function may be called outside of `generate`. Handle most use cases by creating `cache_position` on the fly
+        #   (this alternative is not as robust as calling `generate` and letting it create `cache_position`)
+        elif cache_position is None:
+            past_length = (
+                past_key_values[0][0].shape[2] if past_key_values is not None else 0
+            )
+            cache_position = torch.arange(
+                past_length,
+                input_ids.shape[2],
+                dtype=torch.long,
+                device=input_ids.device,
+            )
+        # 2. Generic cache-dependent input preparation
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed groups
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case
+        if past_key_values is not None:
+            model_inputs["past_key_values"] = past_key_values
+            if (
+                inputs_embeds is not None or cache_position[-1] >= input_ids.shape[2]
+            ):  # Exception 1 or Exception 3
+                input_ids = input_ids[:, :, -cache_position.shape[0] :]
+            elif (
+                input_ids.shape[2] != cache_position.shape[0]
+            ):  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, :, cache_position]
+        # 3. Prepare base model inputs
+        input_ids_key = (
+            "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+        )
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if not self.config.is_encoder_decoder:
+            if inputs_embeds is not None and cache_position[0] == 0:
+                model_inputs[input_ids_key] = None
+                model_inputs["inputs_embeds"] = inputs_embeds
+            else:
+                # `clone` calls in this function ensure a consistent stride. See #32227
+                model_inputs[input_ids_key] = input_ids.clone(
+                    memory_format=torch.contiguous_format
+                )
+                model_inputs["inputs_embeds"] = None
+        else:
+            model_inputs[input_ids_key] = input_ids.clone(
+                memory_format=torch.contiguous_format
+            )
+        # 4. Create missing `position_ids` on the fly
+        if attention_mask is not None and kwargs.get("position_ids") is None:
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            kwargs["position_ids"] = (
+                position_ids  # placed in kwargs for further processing (see below)
+            )
+        # 5. Slice model inputs if it's an input that should have the same length as `input_ids`
+        #    (length here is the number of groups kept in step 2)
+        for model_input_name in ["position_ids", "token_type_ids"]:
+            model_input: torch.Tensor = kwargs.get(model_input_name)
+            if model_input is not None:
+                if past_key_values:
+                    model_input = model_input[:, -input_ids.shape[2] :]
+                    model_input = model_input.clone(
+                        memory_format=torch.contiguous_format
+                    )
+                model_inputs[model_input_name] = model_input
+        # 6. Pass the attention mask through unchanged
+        if attention_mask is not None:
+            model_inputs["attention_mask"] = attention_mask
+        # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                model_inputs[key] = value
+        # Unfold the per-group columns back into the layout `forward` expects.
+        if model_inputs[input_ids_key] is not None:
+            model_inputs[input_ids_key] = (
+                cast(torch.Tensor, model_inputs[input_ids_key])
+                .transpose(1, 2)
+                .reshape(input_ids.shape[0], -1, (self.audio_channels + 1))
+                .transpose(1, 2)
+            )  # [B, audio_channels + 1, T * group_size]
+        # 8. Remove unexpected `generate` inputs (TODO @joao: fix trainer and examples)
+        model_inputs.pop("labels", None)
+        return model_inputs
+    def _get_initial_cache_position(self, input_ids: torch.Tensor, model_kwargs: dict):
+        """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length"""
+        # `torch.compile`-friendly `torch.arange` from a shape -- the lines below are equivalent to `torch.arange`
+        if "inputs_embeds" in model_kwargs:
+            cache_position = (
+                torch.ones_like(
+                    model_kwargs["inputs_embeds"][0, :, 0], dtype=torch.int64
+                ).cumsum(0)
+                - 1
+            )
+        else:
+            cache_position = (
+                torch.ones(
+                    (
+                        input_ids.shape[1]
+                        // (self.audio_channels + 1)
+                        // self.config.group_size,
+                    ),
+                    dtype=torch.int64,
+                    device=input_ids.device,
+                ).cumsum(0)
+                - 1
+            )
+        past_length = 0
+        if model_kwargs.get("past_key_values") is not None:
+            cache = model_kwargs["past_key_values"]
+            past_length = 0
+            if not isinstance(cache, Cache):
+                past_length = cache[0][0].shape[2]
+            elif (
+                hasattr(cache, "get_seq_length") and cache.get_seq_length() is not None
+            ):
+                past_length = cache.get_seq_length()
+            # TODO(joao): this is not torch.compile-friendly, find a work-around. If the cache is not empty,
+            # end-to-end compilation will yield bad results because `cache_position` will be incorrect.
+            if not is_torchdynamo_compiling():
+                cache_position = cache_position[past_length:]
+        model_kwargs["cache_position"] = cache_position
+        return model_kwargs
+    @torch.inference_mode()
+    def generate(
+        self,
+        inputs: torch.Tensor | None = None,
+        generation_config: GenerationConfig | None = None,
+        stopping_criteria: StoppingCriteriaList | list | None = None,
+        streamer: BaseStreamer | None = None,
+        synced_gpus: bool | None = None,
+        global_sampler: MiMoSampler | None = None,
+        local_sampler: MiMoSampler | None = None,
+        warmup_run: bool | None = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        generation_config, model_kwargs = self._prepare_generation_config(
+            generation_config, **kwargs
+        )
+        self._validate_model_kwargs(model_kwargs.copy())
+        # 2. Set generation parameters if not already defined
+        if synced_gpus is None:
+            if is_deepspeed_zero3_enabled() and dist.get_world_size() > 1:
+                synced_gpus = True
+            else:
+                synced_gpus = False
+        # 3. Define model inputs
+        input_ids, _model_input_name, model_kwargs = self._prepare_model_inputs(
+            inputs, generation_config.bos_token_id, model_kwargs
+        )
+        input_ids_length = input_ids.shape[-1]
+        input_ids_length //= self.group_size * (self.audio_channels + 1)
+        if streamer is not None:
+            streamer.put(input_ids.cpu())
+        if "attention_mask" not in model_kwargs:
+            model_kwargs["attention_mask"] = self._prepare_attention_mask(
+                inputs, input_ids_length
+            )
+        device = input_ids.device
+        self._prepare_special_tokens(generation_config, True, device=device)
+        model_kwargs["use_cache"] = True
+        model_kwargs["past_key_values"] = DynamicCache()
+        prepared_stopping_criteria = StoppingCriteriaList(
+            stopping_criteria if stopping_criteria is not None else []
+        )
+        prepared_stopping_criteria.append(
+            MiMoStopper(
+                self.group_size,
+                self.audio_channels,
+                max_length=generation_config.max_length,
+            )
+        )
+        stance = "default" if warmup_run else "eager_on_recompile"
+        with torch.compiler.set_stance(stance):
+            return self.slm_sample(
+                input_ids,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                streamer=streamer,
+                global_sampler=global_sampler,
+                local_sampler=local_sampler,
+                **model_kwargs,
+            )
+    def slm_sample(
+        self,
+        input_ids: torch.LongTensor,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        streamer: BaseStreamer | None,
+        global_sampler: MiMoSampler | None = None,
+        local_sampler: MiMoSampler | None = None,
+        **model_kwargs,
+    ) -> torch.LongTensor:
+        max_length = generation_config.max_length
+        B, cur_len = input_ids.shape
+        cur_len //= self.group_size * (self.audio_channels + 1)
+        initial_len = cur_len
+        this_peer_finished = False
+        unfinished_sequences = torch.ones(B, dtype=torch.long, device=input_ids.device)
+        min_length = 0
+        stop_token_ids = set()
+        for criterion in stopping_criteria:
+            if isinstance(criterion, MiMoStopper):
+                if criterion.min_length is not None:
+                    min_length = max(min_length, criterion.min_length)
+                stop_token_ids.update(criterion.stop_token_ids)
+        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
+        while self._has_unfinished_sequences(
+            this_peer_finished,
+            synced_gpus,
+            device=input_ids.device,
+            cur_len=cur_len,
+            max_length=max_length,
+        ):
+            # prepare model inputs
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+            # forward pass to get next token
+            if (
+                cast(torch.Tensor, model_inputs["input_ids"]).shape[2]
+                != self.group_size
+            ):
+                # prefill run
+                with torch.compiler.set_stance("force_eager"):
+                    outputs: MiMoAudioOutput = self(**model_inputs)
+            else:
+                outputs: MiMoAudioOutput = self(**model_inputs)
+            if synced_gpus and this_peer_finished:
+                continue  # don't waste resources running the code we don't need
+            text_logits: torch.Tensor = outputs.text_logits[:, -1, :].clone()
+            # [B, vocab_size]
+            removed_tokens = None
+            if cur_len < min_length:
+                removed_tokens = list(stop_token_ids)
+            next_text_tokens = global_sampler.sample(text_logits, removed_tokens=removed_tokens)
+            # [B]
+            local_hidden_states = outputs.local_hidden_states
+            # Only Supports batch_size=1 here
+            if next_text_tokens[0] != self.args.empty_idx:
+                zero_embed_tensor = torch.tensor(
+                    self.speech_empty_ids,
+                    device=next_text_tokens.device,
+                    dtype=input_ids.dtype,
+                )
+                next_speech_tokens = zero_embed_tensor.view(
+                    1, 1, self.audio_channels
+                ).expand(B, self.config.group_size, -1)
+            else:
+                next_speech_tokens = self.local_forward(
+                    local_embeds=local_hidden_states,
+                    tokens_dtype=next_text_tokens.dtype,
+                    tokens_device=next_text_tokens.device,
+                    local_sampler=local_sampler,
+                )
+            next_text_tokens = next_text_tokens.reshape(B, 1, 1).expand(
+                -1, self.group_size, -1
+            )  # [B, group_size, 1]
+            # generate speech tokens
+            next_tokens = torch.cat(
+                (next_text_tokens, next_speech_tokens), dim=-1
+            ).reshape(B, -1)  # [B, group_size * (audio_channels + 1)]
+            input_ids = torch.cat(
+                [input_ids, next_tokens], dim=-1
+            )  # [B, T*group_size*vq]
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs,
+                model_kwargs,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+            )
+            unfinished_sequences = unfinished_sequences & ~stopping_criteria(
+                input_ids, None
+            )
+            this_peer_finished = unfinished_sequences.max() == 0
+            cur_len += 1
+            # This is needed to properly delete outputs.logits which may be very large for first iteration
+            # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
+            del outputs
+        if streamer is not None:
+            streamer.end()
+        input_ids = input_ids[:B]
+        return input_ids

src/mimo_audio/process_speechdata.py ADDED Viewed

	@@ -0,0 +1,289 @@

+#!/usr/bin/env python3
+# Copyright 2025 Xiaomi Corporation.
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+import torch
+import torch.nn.functional as F
+from typing import Tuple, Union, List
+class InputSegment:
+    def __init__(
+        self,
+        text: str = "",
+        audio: torch.Tensor = None,
+        tokenized_text: torch.Tensor = None,
+        speech_zeroemb_idx: Union[int, List[int]] = 1024,
+        text_zeroemb_idx: int = 152067,
+        add_sosp_eosp=True,
+    ) -> None:
+        has_text = text is not None
+        has_tokenized_text = tokenized_text is not None
+        assert has_text or has_tokenized_text, "Text or tokenized text must be provided"
+        self.audio = audio
+        self.text = text
+        self.tokenized_text = tokenized_text
+        self.speech_zeroemb_idx = speech_zeroemb_idx
+        self.text_zeroemb_idx = text_zeroemb_idx
+        self.add_sosp_eosp = add_sosp_eosp
+    @staticmethod
+    def insert_between(tensor, i, value=-1):
+        return torch.scatter(
+            torch.full(
+                (1, tensor.shape[1] + (tensor.shape[1] - 1) * i + i),
+                value,
+                dtype=tensor.dtype,
+            ),
+            1,
+            torch.arange(0, tensor.shape[1], dtype=torch.int64)[None] * (i + 1),
+            tensor,
+        )
+    def to_input_id(
+        self,
+        tokenizer,
+        group_size: int,
+        audio_channels: int = 8,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.audio is None:
+            if self.tokenized_text is None:
+                tokenized_text = tokenizer(
+                    self.text,
+                    return_tensors="pt",
+                    truncation=True,
+                    max_length=999999,
+                    padding=False,
+                    add_special_tokens=False,
+                )["input_ids"].int()
+            else:
+                tokenized_text = self.tokenized_text.unsqueeze(0)
+            if group_size > 1:
+                tokenized_text = self.insert_between(
+                    tokenized_text, group_size - 1, value=-100
+                )
+            if isinstance(self.speech_zeroemb_idx, list):
+                audio_part_input_id = torch.zeros((audio_channels, tokenized_text.shape[1]), dtype=torch.int)
+                for i, idx in enumerate(self.speech_zeroemb_idx):
+                    audio_part_input_id[i, :] = idx
+            else:
+                audio_part_input_id = torch.full(
+                    (audio_channels, tokenized_text.shape[1]), self.speech_zeroemb_idx, dtype=torch.int
+                )
+        else:
+            sosp_token = (
+                tokenizer.convert_tokens_to_ids("<|sosp|>")
+                if self.add_sosp_eosp
+                else None
+            )
+            eosp_token = (
+                tokenizer.convert_tokens_to_ids("<|eosp|>")
+                if self.add_sosp_eosp
+                else None
+            )
+            audio_part = self.audio.reshape(-1, audio_channels).T  # [audio_channels, seqlen]
+            assert (
+                audio_part.shape[1] % group_size == 0
+            ), f"Audio shape {audio_part.shape} is not divisible by group_size {group_size}"
+            text_len = audio_part.shape[1] // group_size
+            empty_token = self.text_zeroemb_idx
+            if empty_token is None:
+                empty_token = tokenizer.eod
+            tokenized_text = torch.full((1, text_len), empty_token, dtype=torch.int)
+            tokenized_text = (
+                torch.cat(
+                    [
+                        torch.tensor([[sosp_token]], dtype=torch.int),
+                        tokenized_text,
+                        torch.tensor([[eosp_token]], dtype=torch.int),
+                    ],
+                    dim=1,
+                )
+                if self.add_sosp_eosp
+                else tokenized_text
+            )
+            tokenized_text = self.insert_between(
+                tokenized_text, group_size - 1, value=-100
+            )
+            if self.add_sosp_eosp:
+                if isinstance(self.speech_zeroemb_idx, list):
+                    sosp_part = torch.zeros((audio_channels, group_size), dtype=torch.int)
+                    eosp_part = torch.zeros((audio_channels, group_size), dtype=torch.int)
+                    for i, idx in enumerate(self.speech_zeroemb_idx):
+                        sosp_part[i, :] = idx
+                        eosp_part[i, :] = idx
+                    audio_part_input_id = torch.cat([sosp_part, audio_part, eosp_part], dim=1)
+                else:
+                    audio_part_input_id = torch.cat(
+                        [
+                            torch.full((audio_channels, group_size), self.speech_zeroemb_idx, dtype=torch.int),
+                            audio_part,
+                            torch.full((audio_channels, group_size), self.speech_zeroemb_idx, dtype=torch.int),
+                        ],
+                        dim=1,
+                    )
+            else:
+                audio_part_input_id = audio_part
+        input_ids = torch.cat(
+            [tokenized_text, audio_part_input_id], dim=0
+        )  # [n_rvq + 1, seqlen]
+        return input_ids
+class StreamingInputSegment:
+    def __init__(
+        self,
+        text: str = "",
+        audio: torch.Tensor = None,
+        tokenized_text: torch.Tensor = None,
+        speech_zeroemb_idx: Union[int, List[int]] = 1024,
+        text_zeroemb_idx: int = 152067,
+        text_segment_size: int = 5,
+        audio_segment_size: int = 5,
+        tokenizer=None,
+        group_size=None,
+        audio_channels=None,
+    ) -> None:
+        has_text = text is not None
+        has_tokenized_text = tokenized_text is not None
+        assert has_text or has_tokenized_text, "Text or tokenized text must be provided"
+        self.audio = audio
+        self.text = text
+        self.tokenized_text = tokenized_text
+        self.speech_zeroemb_idx = speech_zeroemb_idx
+        self.text_zeroemb_idx = text_zeroemb_idx
+        self.text_segment_size = text_segment_size
+        self.audio_segment_size = audio_segment_size
+        self.tokenizer = tokenizer
+        self.group_size = group_size
+        self.audio_channels = audio_channels
+    def to_input_id(
+        self,
+        tokenizer,
+        group_size: int,
+        audio_channels: int = 8,
+    ):
+        if self.tokenized_text is None:
+            tokenized_text = tokenizer(
+                self.text,
+                return_tensors="pt",
+                truncation=True,
+                max_length=999999,
+                padding=False,
+                add_special_tokens=False,
+            )["input_ids"].int()  # [1, seqlen]
+        else:
+            tokenized_text = self.tokenized_text.unsqueeze(0)
+        tokenized_text = tokenized_text.squeeze(0)
+        text_segments = tokenized_text.split(self.text_segment_size, dim=0)
+        audio_segments = self.audio.split(self.audio_segment_size*group_size*audio_channels, dim=0)
+        tokenized_segments = []
+        tokenized_segments.append(
+            InputSegment(
+                text='<|sostm|>',
+                speech_zeroemb_idx=self.speech_zeroemb_idx,
+                text_zeroemb_idx=self.text_zeroemb_idx,
+            ),
+        )
+        eot_tokens = tokenizer(
+            "<|eot|>",
+            return_tensors="pt",
+            truncation=True,
+            max_length=999999,
+            padding=False,
+            add_special_tokens=False,
+        )["input_ids"][0].to(text_segments[-1])
+        text_segments = text_segments[:-1] + (torch.cat([text_segments[-1], eot_tokens], dim=0),)
+        length = min(len(text_segments), len(audio_segments))
+        for i in range(length):
+            text_segment = text_segments[i]
+            audio_segment = audio_segments[i]
+            tokenized_segments.append(
+                InputSegment(
+                    tokenized_text=text_segment,
+                    speech_zeroemb_idx=self.speech_zeroemb_idx,
+                    text_zeroemb_idx=self.text_zeroemb_idx,
+                ),
+            )
+            tokenized_segments.append(
+                InputSegment(
+                    audio=audio_segment,
+                    add_sosp_eosp=False,
+                    speech_zeroemb_idx=self.speech_zeroemb_idx,
+                    text_zeroemb_idx=self.text_zeroemb_idx,
+                ),
+            )
+        for j in range(length, len(text_segments)):
+            tokenized_segments.append(
+                InputSegment(
+                    tokenized_text=text_segments[j],
+                    speech_zeroemb_idx=self.speech_zeroemb_idx,
+                    text_zeroemb_idx=self.text_zeroemb_idx,
+                ),
+            )
+        for j in range(length, len(audio_segments)):
+            tokenized_segments.append(
+                InputSegment(
+                    audio=audio_segments[j],
+                    add_sosp_eosp=False,
+                    speech_zeroemb_idx=self.speech_zeroemb_idx,
+                    text_zeroemb_idx=self.text_zeroemb_idx,
+                ),
+            )
+        tokenized_segments.append(
+            InputSegment(
+                text="<|eostm|>",
+                speech_zeroemb_idx=self.speech_zeroemb_idx,
+                text_zeroemb_idx=self.text_zeroemb_idx,
+            ),
+        )
+        input_ids = [
+            seg.to_input_id(
+                self.tokenizer,
+                self.group_size,
+                self.audio_channels,
+            )
+            for seg in tokenized_segments
+        ]
+        input_ids = torch.cat(input_ids, dim=1).type(torch.int64)  # [n_rvq + 1, seqlen]
+        return input_ids

src/mimo_audio/templates.py ADDED Viewed

	@@ -0,0 +1,54 @@

+# Copyright 2025 Xiaomi Corporation.
+# Chinese instruction prompts that ask for an audio clip to be transcribed (ASR).
+asr_zh_templates = [
+    "请将这段语音转换为文字",
+    "帮我识别这个音频文件中的内容",
+    "把这段录音转成文本",
+    "请转录这段语音",
+    "将音频内容转换成文字格式",
+    "识别并转写这段语音",
+    "把语音内容写成文字",
+    "转录这个音频片段",
+    "将这段对话转换为文本",
+    "麻烦帮我把这段录音整理成详细的文字记录",
+  ]
+# English instruction prompts that ask for an audio clip to be transcribed (ASR).
+asr_en_templates = [
+    "Please transcribe this audio file",
+    "Convert this speech recording to text",
+    "Transcribe the following voice message",
+    "Turn this audio into readable text",
+    "Please convert the recording to written format",
+    "Transcribe what you hear in this audio",
+    "Convert this spoken content to text",
+    "Please write down what is said in this recording",
+    "Transcribe this voice recording",
+    "Could you please help me transcribe this important recording?",
+    "Would you mind converting this voice message into a readable text format?",
+    "I'd really appreciate it if you could turn this audio file into a written document",
+  ]
+# Chinese instruction prompts that ask for a text to be spoken aloud (TTS).
+tts_zh_templates = [
+    "请将这段文字转换为语音",
+    "帮我把这个文本读出来",
+    "将这些文字生成音频",
+    "请朗读这段内容",
+    "把这段话转换成语音文件",
+    "生成这段文字的语音版本",
+    "请用语音播报这些内容",
+    "将文本转换为可听的音频",
+    "帮我朗读这段文字",
+    "把这些内容念出来",
+  ]
+# English instruction prompts that ask for a text to be spoken aloud (TTS).
+tts_en_templates = [
+    "Please convert this text to speech",
+    "Turn this writing into audio",
+    "Generate speech from this text",
+    "Read this content out loud",
+    "Convert these words to voice",
+    "Create an audio version of this text",
+    "Please vocalize this content",
+    "Turn this text into audible format",
+    "Help me convert this writing to speech",
+    "Make this text into spoken audio",
+  ]

src/mimo_audio_tokenizer/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright 2025 Xiaomi Corporation.
+from .modeling_audio_tokenizer import MiMoAudioTokenizer, StreamingConfig, StreamingCache
+from .configuration_audio_tokenizer import MiMoAudioTokenizerConfig
+__all__ = ['MiMoAudioTokenizer', 'StreamingConfig', 'StreamingCache', 'MiMoAudioTokenizerConfig']

src/mimo_audio_tokenizer/configuration_audio_tokenizer.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# Copyright 2025 Xiaomi Corporation.
+from transformers import PretrainedConfig
+class MiMoAudioTokenizerConfig(PretrainedConfig):
+    model_type = "mimo_audio_tokenizer"
+    def __init__(
+        self,
+        max_audio_seconds: int = 1800,
+        stride_size: int = 2,
+        avg_pooler: int = 1,
+        d_model: int = 768,
+        scale_embedding: bool = True,
+        kernel_size: int = 3,
+        activation_function: str = "gelu",
+        encoder_layers: int = 8,
+        encoder_skip_layer_id: int = None,
+        encoder_attention_heads: int = 12,
+        encoder_ffn_dim: int = 3072,
+        encoder_causal: bool = False,
+        encoder_attn_window_size: list[int] = None,
+        decoder_layers: int = 8,
+        decoder_attention_heads: int = 12,
+        decoder_ffn_dim: int = 3072,
+        decoder_kernel_size: int = 3,
+        decoder_stride_size: int = 2,
+        decoder_causal: bool = True,
+        decoder_attn_window_size: list[int] = None,
+        nfft: int = 1024,
+        vocoder_dim: int = 512,
+        vocoder_intermediate_dim: int = 4096,
+        vocoder_num_layers: int = 30,
+        n_mels: int = 80,
+        sampling_rate: int = 24000,
+        hop_length: int = 240,
+        window_size: int = 1024,
+        vocoder_padding: str = "same",
+        fmin: int = 0,
+        fmax: int = None,
+        num_quantizers: int = 12,
+        codebook_size: list[int] = None,
+        threshold_ema_dead_code: int = 10,
+        position_embedding_type: str = "rope",
+        rope_theta: int = 10000,
+        rope_type: str = "default",
+        ln_type: str = "LayerNorm",
+        vocoder_attention_heads: int = 4,
+        vocoder_attn_window_size: list[int] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.max_audio_seconds = max_audio_seconds
+        self.stride_size = stride_size
+        self.avg_pooler = avg_pooler
+        self.d_model = d_model
+        self.scale_embedding = scale_embedding
+        self.kernel_size = kernel_size
+        self.activation_function = activation_function
+        self.encoder_layers = encoder_layers
+        self.encoder_skip_layer_id = encoder_skip_layer_id
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_causal = encoder_causal
+        self.encoder_attn_window_size = (
+            encoder_attn_window_size
+            if encoder_attn_window_size is not None
+            else [-1, -1]
+        )
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_kernel_size = decoder_kernel_size
+        self.decoder_stride_size = decoder_stride_size
+        self.decoder_causal = decoder_causal
+        self.decoder_attn_window_size = (
+            decoder_attn_window_size
+            if decoder_attn_window_size is not None
+            else [-1, -1]
+        )
+        self.nfft = nfft
+        self.vocoder_dim = vocoder_dim
+        self.vocoder_intermediate_dim = vocoder_intermediate_dim
+        self.vocoder_num_layers = vocoder_num_layers
+        self.n_mels = n_mels
+        self.sampling_rate = sampling_rate
+        self.hop_length = hop_length
+        self.window_size = window_size
+        self.vocoder_padding = vocoder_padding
+        self.fmin = fmin
+        self.fmax = fmax
+        self.num_quantizers = num_quantizers
+        self.codebook_size = codebook_size if codebook_size is not None else [1024]
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+        self.position_embedding_type = position_embedding_type
+        self.rope_theta = rope_theta
+        self.rope_type = rope_type
+        self.ln_type = ln_type
+        self.vocoder_attention_heads = vocoder_attention_heads
+        self.vocoder_attn_window_size = (
+            vocoder_attn_window_size
+            if vocoder_attn_window_size is not None
+            else [40, 10]
+        )

src/mimo_audio_tokenizer/modeling_audio_tokenizer.py ADDED Viewed

	@@ -0,0 +1,857 @@

+# Copyright 2025 Xiaomi Corporation.
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+from flash_attn import flash_attn_varlen_func
+from torch.nn import functional as F
+from transformers.activations import ACT2FN
+from transformers.modeling_utils import PreTrainedModel
+from .configuration_audio_tokenizer import MiMoAudioTokenizerConfig
+from .modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update, apply_rotary_pos_emb
+from .quantization import ResidualVectorQuantizer
+from dataclasses import dataclass, field
+from typing import List
+def get_sequence_mask(inputs, inputs_length):
+    if inputs.dim() == 3:
+        bsz, tgt_len, _ = inputs.size()
+    else:
+        bsz, tgt_len = inputs_length.shape[0], torch.max(inputs_length)
+    sequence_mask = torch.arange(0, tgt_len).to(inputs.device)
+    sequence_mask = torch.lt(sequence_mask, inputs_length.reshape(bsz, 1)).view(
+        bsz, tgt_len, 1
+    )
+    unpacking_index = torch.cumsum(sequence_mask.to(torch.int64).view(-1), dim=0) - 1
+    return sequence_mask, unpacking_index
+def unpack_hidden_states(
+    hidden_states, lengths, sequence_mask=None, unpacking_index=None
+):
+    bsz = lengths.shape[0]
+    if sequence_mask is None or unpacking_index is None:
+        sequence_mask, unpacking_index = get_sequence_mask(hidden_states, lengths)
+    hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(
+        bsz, torch.max(lengths), hidden_states.shape[-1]
+    )
+    hidden_states = torch.where(sequence_mask, hidden_states, 0)
+    return hidden_states
+def get_position_ids(lengths):
+    total_len = lengths.sum()
+    offset = torch.cat([torch.zeros(1).to(lengths), lengths[:-1].cumsum(dim=0)])
+    offset = torch.repeat_interleave(offset, lengths)
+    position_ids = torch.arange(0, total_len).to(offset) - offset
+    return position_ids
+@dataclass
+class StreamingConfig:
+    seg_point: int = field(default=60 * 25)
+    process_seg_point: bool = field(default=True)
+    left_overlap: int = field(default=10 * 25)
+    right_overlap: int = field(default=40)
+    seg_point_left_overlap: int = field(default=0)
+@dataclass
+class StreamingCache:
+    hidden_states: List[torch.Tensor] = field(default=None)
+    processed_lengths: List[int] = field(default=None)
+class ISTFT(nn.Module):
+    """
+    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
+    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
+    See issue: https://github.com/pytorch/pytorch/issues/62323
+    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
+    The NOLA constraint is met as we trim padded samples anyway.
+    Args:
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames.
+        win_length (int): The size of window frame and STFT filter.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+    def __init__(
+        self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"
+    ):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        window = torch.hann_window(win_length)
+        self.register_buffer("window", window)
+    def forward(self, spec: torch.Tensor) -> torch.Tensor:
+        """
+        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
+        Args:
+            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
+                            N is the number of frequency bins, and T is the number of time frames.
+        Returns:
+            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
+        """
+        if self.padding == "center":
+            # Fallback to pytorch native implementation
+            return torch.istft(
+                spec,
+                self.n_fft,
+                self.hop_length,
+                self.win_length,
+                self.window,
+                center=True,
+            )
+        elif self.padding == "same":
+            pad = (self.win_length - self.hop_length) // 2
+        else:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        assert spec.dim() == 3, "Expected a 3D tensor as input"
+        B, N, T = spec.shape
+        # Inverse FFT
+        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
+        ifft = ifft * self.window[None, :, None]
+        # Overlap and Add
+        output_size = (T - 1) * self.hop_length + self.win_length
+        y = torch.nn.functional.fold(
+            ifft,
+            output_size=(1, output_size),
+            kernel_size=(1, self.win_length),
+            stride=(1, self.hop_length),
+        )[:, 0, 0, pad:-pad]
+        # Window envelope
+        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
+        window_envelope = torch.nn.functional.fold(
+            window_sq,
+            output_size=(1, output_size),
+            kernel_size=(1, self.win_length),
+            stride=(1, self.hop_length),
+        ).squeeze()[pad:-pad]
+        # Normalize
+        assert (window_envelope > 1e-11).all()
+        y = y / window_envelope
+        return y
+class ISTFTHead(nn.Module):
+    """
+    ISTFT Head module for predicting STFT complex coefficients.
+    Args:
+        dim (int): Hidden dimension of the model.
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames, which should align with
+                          the resolution of the input features.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+    def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
+        super().__init__()
+        out_dim = n_fft + 2
+        self.out = torch.nn.Linear(dim, out_dim)
+        self.istft = ISTFT(
+            n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass of the ISTFTHead module.
+        Args:
+            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
+                        L is the sequence length, and H denotes the model dimension.
+        Returns:
+            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
+        """
+        x = self.out(x).transpose(1, 2)
+        mag, p = x.chunk(2, dim=1)
+        mag = torch.exp(mag)
+        mag = torch.clip(
+            mag, max=1e2
+        )  # safeguard to prevent excessively large magnitudes
+        # wrapping happens here. These two lines produce real and imaginary value
+        x = torch.cos(p)
+        y = torch.sin(p)
+        # recalculating phase here does not produce anything new
+        # only costs time
+        # phase = torch.atan2(y, x)
+        # S = mag * torch.exp(phase * 1j)
+        # better directly produce the complex value
+        original_dtype = x.dtype
+        S = mag.float() * (x.float() + 1j * y.float())
+        audio = self.istft(S)
+        audio = audio.to(original_dtype)
+        return audio
+class RotaryEmbedding(nn.Module):
+    def __init__(self, base, dim, max_seq_len, rope_type="default", device=None):
+        super().__init__()
+        self.max_seq_len = max_seq_len
+        self.rope_type = rope_type
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(
+            device=device, base=base, dim=dim
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+    @torch.no_grad()
+    @dynamic_rope_update
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[:, None].float().expand(-1, 1).to(x.device)
+        position_ids_expanded = position_ids[None, :].float()
+        device_type = (
+            x.device.type
+            if isinstance(x.device.type, str) and x.device.type != "mps"
+            else "cpu"
+        )
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (
+                inv_freq_expanded.float() @ position_ids_expanded.float()
+            ).transpose(0, 1)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+class RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+        return self.weight * hidden_states
+# Lookup from a normalization name (the config's `ln_type`) to the layer class to instantiate.
+LAYER_NORM = {"LayerNorm": nn.LayerNorm, "RMSNorm": RMSNorm}
+class Attention(nn.Module):
+    def __init__(self, embed_dim, num_heads, window_size=(-1, -1), causal=False):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.window_size = window_size
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+        self.causal = causal
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        seq_len: torch.Tensor,
+        rope_position_embeddings=None,
+    ):
+        bsz, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states).view(
+            bsz, self.num_heads, self.head_dim
+        )
+        key_states = self.k_proj(hidden_states).view(bsz, self.num_heads, self.head_dim)
+        value_states = self.v_proj(hidden_states).view(
+            bsz, self.num_heads, self.head_dim
+        )
+        if rope_position_embeddings is not None:
+            cos, sin = rope_position_embeddings
+            query_states = apply_rotary_pos_emb(query_states, cos, sin)
+            key_states = apply_rotary_pos_emb(key_states, cos, sin)
+        cu_len = F.pad(torch.cumsum(seq_len, dim=0), (1, 0), "constant", 0).to(
+            torch.int32
+        )
+        max_seqlen = torch.max(seq_len).to(torch.int32).detach()
+        attn_output = flash_attn_varlen_func(
+            query_states,
+            key_states,
+            value_states,
+            cu_len,
+            cu_len,
+            max_seqlen,
+            max_seqlen,
+            causal=self.causal,
+            window_size=self.window_size,
+        )
+        attn_output = attn_output.reshape(bsz, self.embed_dim)
+        attn_output = self.out_proj(attn_output)
+        return attn_output
+class TransformerLayer(nn.Module):
+    def __init__(
+        self,
+        act,
+        d_model,
+        encoder_attention_heads,
+        encoder_ffn_dim,
+        causal,
+        ln_type="LayerNorm",
+        attn_window_size=(-1, -1),
+    ):
+        super().__init__()
+        self.embed_dim = d_model
+        self.self_attn = Attention(
+            self.embed_dim, encoder_attention_heads, attn_window_size, causal
+        )
+        self.self_attn_layer_norm = LAYER_NORM[ln_type](self.embed_dim)
+        self.activation_fn = act
+        self.fc1 = nn.Linear(self.embed_dim, encoder_ffn_dim)
+        self.fc2 = nn.Linear(encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = LAYER_NORM[ln_type](self.embed_dim)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        seq_len: torch.Tensor,
+        rope_position_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states, seq_len, rope_position_embeddings=rope_position_embeddings
+        )
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = residual + hidden_states
+        if (
+            hidden_states.dtype == torch.float16
+            or hidden_states.dtype == torch.bfloat16
+        ) and (torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(
+                hidden_states, min=-clamp_value, max=clamp_value
+            )
+        return hidden_states
+class TransformerVocos(nn.Module):
+    def __init__(self, config: MiMoAudioTokenizerConfig):
+        super().__init__()
+        self.config = config
+        self.max_source_positions = (
+            self.config.max_audio_seconds
+            * self.config.sampling_rate
+            // self.config.hop_length
+        )
+        self.embeddings = nn.Linear(config.n_mels, config.vocoder_dim, bias=False)
+        self.poisition_embedding = RotaryEmbedding(
+            config.rope_theta,
+            config.vocoder_dim // config.vocoder_attention_heads,
+            self.max_source_positions,
+            self.config.rope_type,
+        )
+        self.layers = nn.ModuleList(
+            [
+                TransformerLayer(
+                    ACT2FN[self.config.activation_function],
+                    self.config.vocoder_dim,
+                    self.config.vocoder_attention_heads,
+                    self.config.vocoder_intermediate_dim,
+                    causal=False,
+                    ln_type=self.config.ln_type,
+                    attn_window_size=self.config.vocoder_attn_window_size,
+                )
+                for _ in range(self.config.vocoder_num_layers)
+            ]
+        )
+        self.layer_norm = LAYER_NORM[self.config.ln_type](self.config.vocoder_dim)
+        self.hop_size = self.config.hop_length
+        self.head = ISTFTHead(
+            self.config.vocoder_dim,
+            self.config.nfft,
+            self.config.hop_length,
+            self.config.vocoder_padding,
+        )
+    def forward(self, x: torch.Tensor, input_length):
+        x = x.transpose(1, 2)
+        attention_mask, unpacking_index = get_sequence_mask(x, input_length)
+        x = torch.masked_select(x, attention_mask).view(
+            torch.sum(input_length), self.config.n_mels
+        )
+        x = self.embeddings(x)
+        position_ids = torch.arange(0, x.size(0), device=x.device, dtype=torch.long)
+        rope_position_embeddings = self.poisition_embedding(x, position_ids)
+        for idx, layer in enumerate(self.layers):
+            x = layer(
+                x, input_length, rope_position_embeddings=rope_position_embeddings
+            )
+        x = self.layer_norm(x)
+        x = unpack_hidden_states(x, input_length, attention_mask, unpacking_index)
+        x = self.head(x)
+        output_length = input_length * self.hop_size
+        return x[:, None, :], output_length
+class AudioEncoder(nn.Module):
+    """Mel-feature encoder: two 1-D convolutions, a transformer stack run on
+    packed sequences, optional strided-conv down-sampling, and an optional
+    residual vector quantizer."""
+    def __init__(self, config: MiMoAudioTokenizerConfig):
+        super().__init__()
+        config._attn_implementation = "flash_attention_2"
+        self.config = config
+        # Number of hop-sized frames in `max_audio_seconds`, divided by the conv stride.
+        self.max_source_positions = (
+            config.max_audio_seconds * config.sampling_rate // config.hop_length
+        ) // config.stride_size
+        # NOTE(review): embed_scale is stored but not read anywhere in this class.
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+        self.skip_layer_idx = config.encoder_skip_layer_id
+        self.conv1 = nn.Conv1d(
+            config.n_mels, config.d_model, kernel_size=config.kernel_size, padding=1
+        )
+        self.conv2 = nn.Conv1d(
+            config.d_model,
+            config.d_model,
+            kernel_size=config.kernel_size,
+            stride=config.stride_size,
+            padding=1,
+        )
+        self.position_embedding = RotaryEmbedding(
+            config.rope_theta,
+            config.d_model // config.encoder_attention_heads,
+            self.max_source_positions,
+            config.rope_type,
+        )
+        self.layers = nn.ModuleList(
+            [
+                TransformerLayer(
+                    ACT2FN[config.activation_function],
+                    config.d_model,
+                    config.encoder_attention_heads,
+                    config.encoder_ffn_dim,
+                    causal=self.config.encoder_causal,
+                    ln_type=self.config.ln_type,
+                    attn_window_size=self.config.encoder_attn_window_size,
+                )
+                for _ in range(config.encoder_layers)
+            ]
+        )
+        self.layer_norm = LAYER_NORM[config.ln_type](config.d_model)
+        # Down-sampling is a conv with kernel == stride == avg_pooler; skipped when avg_pooler is 1.
+        if self.config.avg_pooler != 1:
+            self.down_sample_layer = nn.Sequential(
+                nn.Conv1d(
+                    config.d_model,
+                    config.d_model,
+                    config.avg_pooler,
+                    config.avg_pooler,
+                    bias=False,
+                ),
+                nn.GELU(),
+            )
+            self.down_sample_norm = LAYER_NORM[config.ln_type](config.d_model)
+        else:
+            self.down_sample_layer = None
+        # The quantizer is optional: num_quantizers == 0 disables it.
+        if self.config.num_quantizers != 0:
+            self.quantizer = ResidualVectorQuantizer(
+                dimension=self.config.d_model,
+                n_q=self.config.num_quantizers,
+                bins=self.config.codebook_size,
+                threshold_ema_dead_code=self.config.threshold_ema_dead_code,
+            )
+        else:
+            self.quantizer = None
+    def get_features(self, input_features, output_length):
+        """Run the conv front-end, transformer stack and optional down-sampling.
+        Args:
+            input_features: Padded features fed to ``conv1`` (channels on dim 1).
+            output_length: Valid length of every sequence after the convolutions.
+        Returns:
+            Tuple ``(hidden_states, output_length, attention_mask,
+            unpacking_index, tgt_len, bsz)`` where ``hidden_states`` is packed
+            (valid rows only) and the remaining items allow re-padding it.
+        """
+        # Match the dtype/device of the conv weights.
+        input_features = input_features.to(self.conv1.weight)
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+        # (batch, channels, time) -> (batch, time, channels)
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)
+        bsz, tgt_len, _ = inputs_embeds.size()
+        hidden_states = inputs_embeds
+        # Positions restart at 0 for every sequence in the packed layout.
+        position_ids = (
+            get_position_ids(output_length).long().to(input_features.device)
+        )
+        rope_position_embeddings = self.position_embedding(
+            input_features, position_ids
+        )
+        attention_mask, unpacking_index = get_sequence_mask(
+            hidden_states, output_length
+        )
+        # Pack: keep only the valid rows, giving (sum(output_length), d_model).
+        hidden_states = torch.masked_select(hidden_states, attention_mask).view(
+            torch.sum(output_length), self.config.d_model
+        )
+        # Stays 0.0 (a no-op when added below) unless a skip layer is configured.
+        skip_connect_hidden_states = 0.0
+        for idx, encoder_layer in enumerate(self.layers):
+            hidden_states = encoder_layer(
+                hidden_states,
+                output_length,
+                rope_position_embeddings=rope_position_embeddings,
+            )
+            # encoder_skip_layer_id is 1-based: capture the output of that layer.
+            if (self.skip_layer_idx is not None) and idx == self.skip_layer_idx - 1:
+                skip_connect_hidden_states = hidden_states.clone()
+        hidden_states += skip_connect_hidden_states
+        hidden_states = self.layer_norm(hidden_states)
+        if self.down_sample_layer is not None:
+            # Re-pad to (bsz, tgt_len, d_model) so the conv can run over time.
+            # NOTE(review): padded slots are not zeroed here (unlike in
+            # unpack_hidden_states), so they hold a copy of an earlier valid
+            # row when they enter the down-sampling conv — confirm intended.
+            hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(
+                bsz, tgt_len, self.config.d_model
+            )
+            # Zero-pad the time axis up to a multiple of avg_pooler.
+            if hidden_states.size(1) % self.config.avg_pooler:
+                pad_len = (
+                    self.config.avg_pooler
+                    - hidden_states.size(1) % self.config.avg_pooler
+                )
+                hidden_states = torch.nn.functional.pad(
+                    hidden_states, (0, 0, 0, pad_len), mode="constant", value=0.0
+                )
+                tgt_len += pad_len
+            tgt_len = tgt_len // self.config.avg_pooler
+            hidden_states = self.down_sample_layer(hidden_states.transpose(1, 2))
+            # Ceil-divide the valid lengths by avg_pooler.
+            output_length = (
+                output_length // self.config.avg_pooler
+                + (output_length % self.config.avg_pooler != 0).int()
+            )
+            hidden_states = hidden_states.transpose(1, 2)
+            attention_mask, unpacking_index = get_sequence_mask(
+                hidden_states, output_length
+            )
+            # Pack again using the down-sampled lengths.
+            hidden_states = torch.masked_select(hidden_states, attention_mask).view(
+                torch.sum(output_length), self.config.d_model
+            )
+            hidden_states = self.down_sample_norm(hidden_states)
+        return (
+            hidden_states,
+            output_length,
+            attention_mask,
+            unpacking_index,
+            tgt_len,
+            bsz,
+        )
+    def get_output_length(self, mel_len):
+        """Return the sequence length after ``conv1`` and ``conv2`` for ``mel_len`` input frames."""
+        # conv1: stride 1, padding 1 -> L + 2*1 - kernel_size + 1
+        tgt_len = mel_len + 3 - self.config.kernel_size
+        # conv2: padding 1, stride stride_size -> (L + 2*1 - kernel_size) // stride + 1
+        return (tgt_len + 2 - self.config.kernel_size) // self.config.stride_size + 1
+    @torch.no_grad()
+    def encode(
+        self,
+        input_features,
+        input_lens=None,
+        output_length=None,
+        return_codes_only=False,
+        n_q=None,
+        use_quantizer=True,
+    ):
+        """Encode features and optionally pass them through the quantizer.
+        Args:
+            input_features: Features unpacked with ``input_lens`` before
+                encoding; presumably packed (total_frames, n_mels) — TODO confirm.
+            input_lens: Valid length of every input sequence.
+            output_length: Encoder-side lengths; derived from ``input_lens``
+                via ``get_output_length`` when omitted.
+            return_codes_only: Return ``(codes, output_length)`` right after
+                quantization (only takes effect when the quantizer is used).
+            n_q: Forwarded to ``quantizer.encode``.
+            use_quantizer: Skip quantization when False.
+        Returns:
+            ``(hidden_states, hidden_states_packed, output_length, codes)``
+            where ``hidden_states`` is re-padded to (bsz, tgt_len, d_model)
+            with padded positions set to 0 and ``codes`` is None when the
+            quantizer is not used.
+        """
+        if output_length is None:
+            output_length = self.get_output_length(input_lens)
+        input_features = unpack_hidden_states(input_features, input_lens)
+        hidden_states, output_length, attention_mask, unpacking_index, tgt_len, bsz = (
+            self.get_features(
+                input_features=input_features.transpose(1, 2),
+                output_length=output_length,
+            )
+        )
+        dtype = hidden_states.dtype
+        if use_quantizer and self.quantizer is not None:
+            # Quantization runs in float32; note this casts the quantizer module in place.
+            self.quantizer.float()
+            codes = self.quantizer.encode(hidden_states.float(), n_q=n_q)
+            if return_codes_only:
+                return codes, output_length
+            hidden_states = self.quantizer.decode(codes)
+            hidden_states = hidden_states.to(dtype)
+        else:
+            codes = None
+        hidden_states_packed = hidden_states.clone()
+        # unpacking
+        hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(
+            bsz, tgt_len, self.config.d_model
+        )
+        hidden_states = torch.where(attention_mask, hidden_states, 0)
+        return hidden_states, hidden_states_packed, output_length, codes
+    @torch.no_grad()
+    def decode_vq(self, codes):
+        """Decode quantizer codes back to hidden states (quantizer cast to float32 first)."""
+        self.quantizer.float()
+        hidden_states = self.quantizer.decode(codes)
+        return hidden_states
+class CausalConvTranspose1d(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride):
+        super().__init__()
+        self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride)
+        self.norm = nn.GroupNorm(1, out_channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+    def forward(self, hidden_states, input_length, output_dim=None):
+        kernel_size = self.conv.kernel_size[0]
+        stride = self.conv.stride[0]
+        bsz = input_length.shape[0]
+        if output_dim is None:
+            output_dim = hidden_states.dim()
+        if hidden_states.dim() <= 2:  # unpack sequence to 3d
+            sequence_mask, unpacking_index = get_sequence_mask(
+                hidden_states, input_length
+            )
+            hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(
+                bsz, torch.max(input_length), self.in_channels
+            )
+            hidden_states = torch.where(sequence_mask, hidden_states, 0)
+        hidden_states = hidden_states.transpose(2, 1)  # (N, L, C) -> (N, C, L)
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.norm(hidden_states)
+        hidden_states = hidden_states.transpose(2, 1)  # (N, C, L) -> (N, L, C)
+        casual_padding_right = max(0, kernel_size - stride)
+        hidden_states = hidden_states[
+            :, : hidden_states.shape[1] - casual_padding_right, :
+        ]
+        output_length = (input_length - 1) * stride + kernel_size - casual_padding_right
+        sequence_mask, _ = get_sequence_mask(hidden_states, output_length)
+        if output_dim <= 2:
+            hidden_states = torch.masked_select(hidden_states, sequence_mask).view(
+                -1, self.out_channels
+            )
+        else:
+            hidden_states = torch.where(sequence_mask, hidden_states, 0)
+            hidden_states = hidden_states[:, : torch.max(output_length), :]
+        return hidden_states, output_length
+class AudioDecoder(nn.Module):
+    def __init__(self, config: MiMoAudioTokenizerConfig):
+        super().__init__()
+        self.config = config
+        self.max_source_positions = (
+            self.config.max_audio_seconds
+            * self.config.sampling_rate
+            // self.config.hop_length
+        )
+        if self.config.avg_pooler != 1:
+            self.dconv1 = CausalConvTranspose1d(
+                self.config.d_model,
+                self.config.d_model,
+                self.config.avg_pooler,
+                self.config.avg_pooler,
+            )
+        else:
+            self.dconv1 = None
+        self.position_embedding = RotaryEmbedding(
+            config.rope_theta,
+            config.d_model // config.decoder_attention_heads,
+            self.max_source_positions,
+            config.rope_type,
+        )
+        self.layers = nn.ModuleList(
+            [
+                TransformerLayer(
+                    ACT2FN[self.config.activation_function],
+                    self.config.d_model,
+                    self.config.decoder_attention_heads,
+                    self.config.decoder_ffn_dim,
+                    causal=self.config.decoder_causal,
+                    ln_type=self.config.ln_type,
+                    attn_window_size=self.config.decoder_attn_window_size,
+                )
+                for _ in range(self.config.decoder_layers)
+            ]
+        )
+        self.layer_norm = LAYER_NORM[config.ln_type](self.config.d_model)
+        self.dconv2 = CausalConvTranspose1d(
+            self.config.d_model,
+            self.config.n_mels,
+            self.config.decoder_kernel_size,
+            self.config.decoder_stride_size,
+        )
+        self.vocoder = TransformerVocos(config)
+    def forward(
+        self,
+        audio_embed,
+        input_length,
+    ):
+        assert audio_embed.shape[-1] == self.config.d_model
+        audio_embed = audio_embed.to(self.layer_norm.weight)
+        if self.dconv1 is not None:
+            audio_embed, output_length = self.dconv1(
+                audio_embed, input_length, output_dim=3
+            )
+            _, tgt_len, _ = audio_embed.size()
+        else:
+            output_length = input_length
+            tgt_len = audio_embed.size(0)
+        hidden_states = audio_embed
+        position_ids = (
+            get_position_ids(output_length).long().to(hidden_states.device)
+        )
+        rope_position_embeddings = self.position_embedding(
+            hidden_states, position_ids
+        )
+        # packing hidden states
+        attention_mask, _ = get_sequence_mask(hidden_states, output_length)
+        hidden_states = torch.masked_select(hidden_states, attention_mask).view(
+            torch.sum(output_length), self.config.d_model
+        )
+        for idx, encoder_layer in enumerate(self.layers):
+            hidden_states = encoder_layer(
+                hidden_states,
+                output_length,
+                rope_position_embeddings=rope_position_embeddings,
+            )
+        hidden_states = self.layer_norm(hidden_states)
+        coarse_mel, output_length = self.dconv2(
+            hidden_states, output_length, output_dim=3
+        )
+        recon_wav, wav_length = self.vocoder(
+            x=coarse_mel.transpose(1, 2),
+            input_length=output_length,
+        )
+        return recon_wav
+class MiMoAudioTokenizer(PreTrainedModel):
+    config_class = MiMoAudioTokenizerConfig
+    def __init__(self, config: MiMoAudioTokenizerConfig):
+        super().__init__(config)
+        self.config = config
+        self.sampling_rate = config.sampling_rate
+        self.encoder = AudioEncoder(config=config)
+        self.decoder = AudioDecoder(config=config)
+        self.downsample_rate = int(self.config.hop_length * 2 * self.config.avg_pooler)
+    def get_output_length(self, mel_len):
+        tgt_len = mel_len + 3 - self.config.kernel_size
+        return (tgt_len + 2 - self.config.kernel_size) // self.config.stride_size + 1
+    @torch.no_grad()
+    def encode(self, mels, input_lens, use_quantizer=True):
+        input_features = mels
+        encoder_output_length = self.get_output_length(input_lens)
+        hidden_states, hidden_states_packed, encoder_output_length, codes = (
+            self.encoder.encode(
+                input_features, input_lens=input_lens, use_quantizer=use_quantizer
+            )
+        )
+        return hidden_states, hidden_states_packed, encoder_output_length, codes
+    @torch.no_grad()
+    def decode(self, codes):
+        hidden_states = self.encoder.decode_vq(codes)
+        output = self.decoder(
+            hidden_states,
+            torch.tensor([hidden_states.size(0)], device=hidden_states.device),
+        )
+        return output
+    @torch.no_grad()
+    def streaming_decode(self, codes_chunks, chunk_input_lengths, history_cache=StreamingCache(), streaming_config=StreamingConfig(), last_chunk=False):
+        hidden_states = self.encoder.decode_vq(codes_chunks)
+        input_lengths = []
+        input_hidden_states = []
+        start_idx = 0
+        cache_hidden_states = []
+        for i, input_length in enumerate(chunk_input_lengths):
+            sample_hidden_states = hidden_states[start_idx:start_idx + input_length]
+            start_idx += input_length
+            if history_cache.hidden_states is not None:
+                sample_hidden_states = torch.cat([history_cache.hidden_states[i], sample_hidden_states], dim=0)
+                input_length += history_cache.hidden_states[i].size(0)
+            input_hidden_states.append(sample_hidden_states)
+            cache_hidden_states.append(sample_hidden_states.clone())
+            input_lengths.append(input_length)
+        input_hidden_states = torch.cat(input_hidden_states, dim=0)
+        input_lengths = torch.tensor(input_lengths, device=hidden_states.device)
+        output = self.decoder(input_hidden_states, input_lengths)
+        return_wavs = []
+        frames_per_token = self.config.avg_pooler * self.config.stride_size * self.config.hop_length
+        processed_lengths = []
+        for i, wav in enumerate(output):
+            wav = wav.float().detach().cpu()
+            start_idx = history_cache.processed_lengths[i] if history_cache.processed_lengths is not None else 0
+            if last_chunk:
+                return_wavs.append(wav[:, start_idx * frames_per_token:])
+                new_processed_length = input_lengths[i].item()
+            elif input_lengths[i].item() <= streaming_config.right_overlap:
+                return_wavs.append(None)
+                new_processed_length = 0
+            else:
+                end_idx = (input_lengths[i].item() - streaming_config.right_overlap)
+                wav = wav[:, start_idx * frames_per_token: end_idx * frames_per_token]
+                return_wavs.append(wav)
+                new_processed_length = end_idx
+                if input_lengths[i].item() > streaming_config.left_overlap:
+                    cache_hidden_states[i] = cache_hidden_states[i][-streaming_config.left_overlap:]
+                    new_processed_length -= (input_lengths[i].item() - streaming_config.left_overlap)
+            processed_lengths.append(new_processed_length)
+        history_cache.hidden_states = cache_hidden_states
+        history_cache.processed_lengths = processed_lengths
+        return return_wavs, history_cache

src/mimo_audio_tokenizer/modeling_rope_utils.py ADDED Viewed

	@@ -0,0 +1,878 @@

+# Copyright 2025 Xiaomi Corporation.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from functools import wraps
+from typing import Optional
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import is_torch_available, logging
+logger = logging.get_logger(__name__)
+if is_torch_available():
+    import torch
+def dynamic_rope_update(rope_forward):
+    """
+    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
+    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+    Args:
+        rope_forward (Callable):
+            The forward pass of the RoPE implementation.
+    Returns:
+        The decorated forward pass.
+    """
+    def longrope_frequency_update(self, position_ids, device):
+        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        seq_len = torch.max(position_ids) + 1
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = (
+                self.config.original_max_position_embeddings
+            )
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+        if seq_len > original_max_position_embeddings:
+            if not hasattr(self, "long_inv_freq"):
+                self.long_inv_freq, _ = self.rope_init_fn(
+                    self.config, device, seq_len=original_max_position_embeddings + 1
+                )
+            self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+        else:
+            # This .to() is needed if the model has been moved to a device after being initialized (because
+            # the buffer is automatically moved, but not the original copy)
+            self.original_inv_freq = self.original_inv_freq.to(device)
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+    def dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:  # growth
+            inv_freq, self.attention_scaling = self.rope_init_fn(
+                self.config, device, seq_len=seq_len
+            )
+            self.register_buffer(
+                "inv_freq", inv_freq, persistent=False
+            )  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+        if (
+            seq_len < self.original_max_seq_len
+            and self.max_seq_len_cached > self.original_max_seq_len
+        ):  # reset
+            # This .to() is needed if the model has been moved to a device after being initialized (because
+            # the buffer is automatically moved, but not the original copy)
+            self.original_inv_freq = self.original_inv_freq.to(device)
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+    @wraps(rope_forward)
+    def wrapper(self, x, position_ids):
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device)
+        return rope_forward(self, x, position_ids)
+    return wrapper
+def _compute_default_rope_parameters(
+    config: Optional[PretrainedConfig] = None,
+    device: Optional["torch.device"] = None,
+    seq_len: Optional[int] = None,
+    **rope_kwargs,
+) -> tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies according to the original RoPE implementation
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+        rope_kwargs (`Dict`, *optional*):
+            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+    """
+    if config is not None and len(rope_kwargs) > 0:
+        raise ValueError(
+            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
+            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
+        )
+    if len(rope_kwargs) > 0:
+        base = rope_kwargs["base"]
+        dim = rope_kwargs["dim"]
+    elif config is not None:
+        base = config.rope_theta
+        partial_rotary_factor = (
+            config.partial_rotary_factor
+            if hasattr(config, "partial_rotary_factor")
+            else 1.0
+        )
+        head_dim = (
+            getattr(config, "head_dim", None)
+            or config.hidden_size // config.num_attention_heads
+        )
+        dim = int(head_dim * partial_rotary_factor)
+    attention_factor = 1.0  # Unused in this type of RoPE
+    # Compute the inverse frequencies
+    inv_freq = 1.0 / (
+        base
+        ** (
+            torch.arange(0, dim, 2, dtype=torch.int64).to(
+                device=device, dtype=torch.float
+            )
+            / dim
+        )
+    )
+    return inv_freq, attention_factor
+def _compute_linear_scaling_rope_parameters(
+    config: Optional[PretrainedConfig] = None,
+    device: Optional["torch.device"] = None,
+    seq_len: Optional[int] = None,
+    **rope_kwargs,
+) -> tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+        rope_kwargs (`Dict`, *optional*):
+            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+    """
+    if config is not None and len(rope_kwargs) > 0:
+        raise ValueError(
+            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
+            f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
+        )
+    if len(rope_kwargs) > 0:
+        factor = rope_kwargs["factor"]
+    elif config is not None:
+        factor = config.rope_scaling["factor"]
+    # Gets the default RoPE parameters
+    inv_freq, attention_factor = _compute_default_rope_parameters(
+        config, device, seq_len, **rope_kwargs
+    )
+    # Then applies linear scaling to the frequencies.
+    # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
+    # applying scaling to the inverse frequencies is equivalent.
+    inv_freq /= factor
+    return inv_freq, attention_factor
+def _compute_dynamic_ntk_parameters(
+    config: Optional[PretrainedConfig] = None,
+    device: Optional["torch.device"] = None,
+    seq_len: Optional[int] = None,
+    **rope_kwargs,
+) -> tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
+        seq_len (`int`, *optional*):
+            The current sequence length, used to update the dynamic RoPE at inference time.
+        rope_kwargs (`Dict`, *optional*):
+            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+    """
+    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
+    if config is not None and len(rope_kwargs) > 0:
+        raise ValueError(
+            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
+            f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
+        )
+    if len(rope_kwargs) > 0:
+        base = rope_kwargs["base"]
+        dim = rope_kwargs["dim"]
+        max_position_embeddings = rope_kwargs["max_position_embeddings"]
+        factor = rope_kwargs["factor"]
+    elif config is not None:
+        base = config.rope_theta
+        partial_rotary_factor = (
+            config.partial_rotary_factor
+            if hasattr(config, "partial_rotary_factor")
+            else 1.0
+        )
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        dim = int(head_dim * partial_rotary_factor)
+        max_position_embeddings = config.max_position_embeddings
+        factor = config.rope_scaling["factor"]
+    attention_factor = 1.0  # Unused in this type of RoPE
+    # seq_len: default to max_position_embeddings, e.g. at init time
+    seq_len = (
+        seq_len
+        if seq_len is not None and seq_len > max_position_embeddings
+        else max_position_embeddings
+    )
+    # Compute the inverse frequencies
+    base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (
+        dim / (dim - 2)
+    )
+    inv_freq = 1.0 / (
+        base
+        ** (
+            torch.arange(0, dim, 2, dtype=torch.int64).to(
+                device=device, dtype=torch.float
+            )
+            / dim
+        )
+    )
+    return inv_freq, attention_factor
+def _compute_yarn_parameters(
+    config: PretrainedConfig,
+    device: "torch.device",
+    seq_len: Optional[int] = None,
+    **rope_kwargs,
+) -> tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with NTK scaling. Please refer to the
+    [original paper](https://huggingface.co/papers/2309.00071)
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+        rope_kwargs (`Dict`, *optional*):
+            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin.
+    """
+    # No need to keep BC with yarn, unreleased when this new pattern was created.
+    if len(rope_kwargs) > 0:
+        raise ValueError(
+            f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
+        )
+    base = config.rope_theta
+    partial_rotary_factor = (
+        config.partial_rotary_factor
+        if hasattr(config, "partial_rotary_factor")
+        else 1.0
+    )
+    head_dim = getattr(
+        config, "head_dim", config.hidden_size // config.num_attention_heads
+    )
+    dim = int(head_dim * partial_rotary_factor)
+    factor = config.rope_scaling["factor"]
+    attention_factor = config.rope_scaling.get("attention_factor")
+    mscale = config.rope_scaling.get("mscale")
+    mscale_all_dim = config.rope_scaling.get("mscale_all_dim")
+    # NOTE: DeekSeek-V3 (and potentially other models) modify `max_position_embeddings` and have a
+    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
+    # values to compute the default attention scaling factor, instead of using `factor`.
+    if "original_max_position_embeddings" in config.rope_scaling:
+        original_max_position_embeddings = config.rope_scaling[
+            "original_max_position_embeddings"
+        ]
+        factor = config.max_position_embeddings / original_max_position_embeddings
+    else:
+        original_max_position_embeddings = config.max_position_embeddings
+    def get_mscale(scale, mscale=1):
+        if scale <= 1:
+            return 1.0
+        return 0.1 * mscale * math.log(scale) + 1.0
+    # Sets the attention factor as suggested in the paper
+    if attention_factor is None:
+        if mscale and mscale_all_dim:
+            attention_factor = float(
+                get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim)
+            )
+        else:
+            attention_factor = get_mscale(factor)
+    # Optional config options
+    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
+    beta_fast = config.rope_scaling.get("beta_fast") or 32
+    beta_slow = config.rope_scaling.get("beta_slow") or 1
+    # Compute the inverse frequencies
+    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
+        """Inverse dimension formula to find the dimension based on the number of rotations"""
+        return (
+            dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
+        ) / (2 * math.log(base))
+    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
+        """Find dimension range bounds based on rotations"""
+        low = math.floor(
+            find_correction_dim(low_rot, dim, base, max_position_embeddings)
+        )
+        high = math.ceil(
+            find_correction_dim(high_rot, dim, base, max_position_embeddings)
+        )
+        return max(low, 0), min(high, dim - 1)
+    def linear_ramp_factor(min, max, dim):
+        if min == max:
+            max += 0.001  # Prevent singularity
+        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+        ramp_func = torch.clamp(linear_func, 0, 1)
+        return ramp_func
+    # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
+    # to expand the possible context length. In other words, interpolation = apply scaling factor.
+    pos_freqs = base ** (
+        torch.arange(0, dim, 2).to(device=device, dtype=torch.float) / dim
+    )
+    inv_freq_extrapolation = 1.0 / pos_freqs
+    inv_freq_interpolation = 1.0 / (factor * pos_freqs)
+    low, high = find_correction_range(
+        beta_fast, beta_slow, dim, base, original_max_position_embeddings
+    )
+    # Get n-dimensional rotational scaling corrected for extrapolation
+    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).to(
+        device=device, dtype=torch.float
+    )
+    inv_freq = (
+        inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
+        + inv_freq_extrapolation * inv_freq_extrapolation_factor
+    )
+    return inv_freq, attention_factor
+def _compute_longrope_parameters(
+    config: PretrainedConfig,
+    device: "torch.device",
+    seq_len: Optional[int] = None,
+    **rope_kwargs,
+) -> tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
+    [original implementation](https://github.com/microsoft/LongRoPE)
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
+        seq_len (`int`, *optional*):
+            The current sequence length.
+        rope_kwargs (`Dict`, *optional*):
+            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin.
+    """
+    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
+    # No need to keep BC with longrope, unreleased when this new pattern was created.
+    if len(rope_kwargs) > 0:
+        raise ValueError(
+            "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
+            f"{rope_kwargs}"
+        )
+    base = config.rope_theta
+    partial_rotary_factor = (
+        config.partial_rotary_factor
+        if hasattr(config, "partial_rotary_factor")
+        else 1.0
+    )
+    head_dim = getattr(
+        config, "head_dim", config.hidden_size // config.num_attention_heads
+    )
+    dim = int(head_dim * partial_rotary_factor)
+    long_factor = config.rope_scaling["long_factor"]
+    short_factor = config.rope_scaling["short_factor"]
+    factor = config.rope_scaling.get("factor")
+    attention_factor = config.rope_scaling.get("attention_factor")
+    # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
+    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
+    # values to compute the default attention scaling factor, instead of using `factor`.
+    if hasattr(config, "original_max_position_embeddings"):
+        original_max_position_embeddings = config.original_max_position_embeddings
+        factor = (
+            config.max_position_embeddings / config.original_max_position_embeddings
+        )
+    else:
+        original_max_position_embeddings = config.max_position_embeddings
+    # Sets the attention factor as suggested in the paper
+    if attention_factor is None:
+        if factor <= 1.0:
+            attention_factor = 1.0
+        else:
+            attention_factor = math.sqrt(
+                1 + math.log(factor) / math.log(original_max_position_embeddings)
+            )
+    # Compute the inverse frequencies -- scaled based on the target sequence length
+    if seq_len and seq_len > original_max_position_embeddings:
+        ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
+    else:
+        ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
+    inv_freq_shape = (
+        torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
+    )
+    inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)
+    return inv_freq, attention_factor
+def _compute_llama3_parameters(
+    config: PretrainedConfig,
+    device: "torch.device",
+    seq_len: Optional[int] = None,
+    **rope_kwargs,
+) -> tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies for llama 3.1.
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+        rope_kwargs (`Dict`, *optional*):
+            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin.
+    """
+    # Gets the default RoPE parameters
+    inv_freq, attention_factor = _compute_default_rope_parameters(
+        config, device, seq_len, **rope_kwargs
+    )
+    factor = config.rope_scaling["factor"]  # `8` in the original implementation
+    low_freq_factor = config.rope_scaling[
+        "low_freq_factor"
+    ]  # `1` in the original implementation
+    high_freq_factor = config.rope_scaling[
+        "high_freq_factor"
+    ]  # `4` in the original implementation
+    old_context_len = config.rope_scaling[
+        "original_max_position_embeddings"
+    ]  # `8192` in the original implementation
+    low_freq_wavelen = old_context_len / low_freq_factor
+    high_freq_wavelen = old_context_len / high_freq_factor
+    wavelen = 2 * math.pi / inv_freq
+    # wavelen < high_freq_wavelen: do nothing
+    # wavelen > low_freq_wavelen: divide by factor
+    inv_freq_llama = torch.where(
+        wavelen > low_freq_wavelen, inv_freq / factor, inv_freq
+    )
+    # otherwise: interpolate between the two, using a smooth factor
+    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (
+        high_freq_factor - low_freq_factor
+    )
+    smoothed_inv_freq = (
+        1 - smooth_factor
+    ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
+    is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
+    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
+    return inv_freq_llama, attention_factor
# Registry mapping the "rope_type" string field of a RoPE config to the function that computes the corresponding
# RoPE parameters from the model config. New {'rope_type': callable} pairs can be added to this dictionary to enable
# custom RoPE parameterizations, as long as the callable has the same signature as the functions listed here.
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
    "linear": _compute_linear_scaling_rope_parameters,
    "dynamic": _compute_dynamic_ntk_parameters,
    "yarn": _compute_yarn_parameters,
    "longrope": _compute_longrope_parameters,
    "llama3": _compute_llama3_parameters,
}
+def _check_received_keys(
+    rope_type: str,
+    received_keys: set,
+    required_keys: set,
+    optional_keys: Optional[set] = None,
+    ignore_keys: Optional[set] = None,
+):
+    """Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
+    # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present
+    if "type" in received_keys:
+        received_keys -= {"type"}
+        required_keys.add("rope_type")
+    # Some models need to store model-specific keys, and we don't want to throw warning at them
+    if ignore_keys is not None:
+        received_keys -= ignore_keys
+    missing_keys = required_keys - received_keys
+    if missing_keys:
+        raise KeyError(
+            f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}"
+        )
+    if optional_keys is not None:
+        unused_keys = received_keys - required_keys - optional_keys
+    else:
+        unused_keys = received_keys - required_keys
+    if unused_keys:
+        logger.warning(
+            f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}"
+        )
def _validate_default_rope_parameters(
    config: PretrainedConfig, ignore_keys: Optional[set] = None
):
    """Validate `config.rope_scaling` for the "default" RoPE type, where `rope_type` is the only required key."""
    scaling_cfg = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling_cfg.get("rope_type", scaling_cfg.get("type", None))
    _check_received_keys(
        rope_type,
        set(scaling_cfg.keys()),
        {"rope_type"},
        ignore_keys=ignore_keys,
    )
def _validate_linear_scaling_rope_parameters(
    config: PretrainedConfig, ignore_keys: Optional[set] = None
):
    """Validate `config.rope_scaling` for the "linear" RoPE type.

    `rope_type` and `factor` are required. A warning is logged when `factor` is not a float greater than or equal
    to 1.
    """
    scaling_cfg = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling_cfg.get("rope_type", scaling_cfg.get("type", None))
    _check_received_keys(
        rope_type,
        set(scaling_cfg.keys()),
        {"rope_type", "factor"},
        ignore_keys=ignore_keys,
    )
    factor = scaling_cfg["factor"]
    factor_is_invalid = (
        factor is None or not isinstance(factor, float) or factor < 1.0
    )
    if factor_is_invalid:
        logger.warning(
            f"`rope_scaling`'s factor field must be a float >= 1, got {factor}"
        )
def _validate_dynamic_scaling_rope_parameters(
    config: PretrainedConfig, ignore_keys: Optional[set] = None
):
    """Validate `config.rope_scaling` for the "dynamic" RoPE type.

    `rope_type` and `factor` are required; `original_max_position_embeddings` is accepted as an optional key.
    A warning is logged when `factor` is not a float greater than or equal to 1.
    """
    scaling_cfg = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling_cfg.get("rope_type", scaling_cfg.get("type", None))
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    _check_received_keys(
        rope_type,
        set(scaling_cfg.keys()),
        {"rope_type", "factor"},
        {"original_max_position_embeddings"},
        ignore_keys=ignore_keys,
    )
    factor = scaling_cfg["factor"]
    factor_is_invalid = (
        factor is None or not isinstance(factor, float) or factor < 1.0
    )
    if factor_is_invalid:
        logger.warning(
            f"`rope_scaling`'s factor field must be a float >= 1, got {factor}"
        )
def _validate_yarn_parameters(
    config: PretrainedConfig, ignore_keys: Optional[set] = None
):
    """Validate `config.rope_scaling` for the "yarn" RoPE type.

    `rope_type` and `factor` are required. `attention_factor`, `beta_fast`, `beta_slow`,
    `original_max_position_embeddings`, `mscale` and `mscale_all_dim` are optional. Invalid field values are
    reported through warnings.
    """
    scaling_cfg = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling_cfg.get("rope_type", scaling_cfg.get("type", None))
    _check_received_keys(
        rope_type,
        set(scaling_cfg.keys()),
        {"rope_type", "factor"},
        {
            "attention_factor",
            "beta_fast",
            "beta_slow",
            "original_max_position_embeddings",
            "mscale",
            "mscale_all_dim",
        },
        ignore_keys=ignore_keys,
    )

    factor = scaling_cfg["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(
            f"`rope_scaling`'s factor field must be a float >= 1, got {factor}"
        )

    attention_factor = scaling_cfg.get("attention_factor")
    if attention_factor is not None:
        if not isinstance(attention_factor, float) or attention_factor < 0:
            logger.warning(
                f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
            )

    beta_fast = scaling_cfg.get("beta_fast")
    beta_slow = scaling_cfg.get("beta_slow")
    if not (beta_fast is None or isinstance(beta_fast, float)):
        logger.warning(
            f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}"
        )
    if not (beta_slow is None or isinstance(beta_slow, float)):
        logger.warning(
            f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}"
        )

    # Missing (or falsy) betas fall back to 32 and 1 for the ordering check.
    effective_fast = beta_fast or 32
    effective_slow = beta_slow or 1
    if effective_fast < effective_slow:
        logger.warning(
            f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
            f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
        )
def _validate_longrope_parameters(
    config: PretrainedConfig, ignore_keys: Optional[set] = None
):
    """Validate `config.rope_scaling` for the "longrope" RoPE type.

    `rope_type`, `short_factor` and `long_factor` are required. `attention_factor`, `factor` and
    `original_max_position_embeddings` are optional. `short_factor` and `long_factor` must each be a list of
    numbers whose length is half of the rotary dimension. Invalid field values are reported through warnings.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration holding `rope_scaling`.
        ignore_keys (`set`, *optional*):
            Keys of `rope_scaling` that are excluded from the key check.
    """
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get(
        "rope_type", rope_scaling.get("type", None)
    )  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "short_factor", "long_factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(
        rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys
    )

    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
    # Only derive the head dimension from `hidden_size` when `head_dim` is absent or None, so that configs which
    # define `head_dim` directly do not need the other two attributes.
    head_dim = getattr(config, "head_dim", None)
    if head_dim is None:
        head_dim = config.hidden_size // config.num_attention_heads
    dim = int(head_dim * partial_rotary_factor)

    for field_name in ("short_factor", "long_factor"):
        values = rope_scaling.get(field_name)
        # The value is valid only if it is a list AND every element is a number.
        is_number_list = isinstance(values, list) and all(
            isinstance(x, (int, float)) for x in values
        )
        if not is_number_list:
            logger.warning(
                f"`rope_scaling`'s {field_name} field must be a list of numbers, got {values}"
            )
        # The length can only be checked on sized values (e.g. not on None).
        if hasattr(values, "__len__") and len(values) != dim // 2:
            logger.warning(
                f"`rope_scaling`'s {field_name} field must have length {dim // 2}, got {len(values)}"
            )

    # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
    # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
    # unique to longrope (= undesirable)
    if hasattr(config, "original_max_position_embeddings"):
        logger.warning_once(
            "This model has set a `original_max_position_embeddings` field, to be used together with "
            "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`"
            "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
            "as it is compatible with most model architectures."
        )
    else:
        factor = rope_scaling.get("factor")
        if factor is None:
            logger.warning("Missing required keys in `rope_scaling`: 'factor'")
        elif not isinstance(factor, float) or factor < 1.0:
            logger.warning(
                f"`rope_scaling`'s factor field must be a float >= 1, got {factor}"
            )
        attention_factor = rope_scaling.get("attention_factor")
        if attention_factor is not None:
            if not isinstance(attention_factor, float) or attention_factor < 0.0:
                logger.warning(
                    f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
                )
def _validate_llama3_parameters(
    config: PretrainedConfig, ignore_keys: Optional[set] = None
):
    """Validate `config.rope_scaling` for the "llama3" RoPE type.

    `rope_type`, `factor`, `original_max_position_embeddings`, `low_freq_factor` and `high_freq_factor` are all
    required. Invalid field values are reported through warnings; the ordering checks between two fields are only
    performed when both values are numeric, so a value that was already reported as invalid cannot make the
    comparison itself fail.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration holding `rope_scaling` and `max_position_embeddings`.
        ignore_keys (`set`, *optional*):
            Keys of `rope_scaling` that are excluded from the key check.
    """
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get(
        "rope_type", rope_scaling.get("type", None)
    )  # BC: "rope_type" was originally "type"
    required_keys = {
        "rope_type",
        "factor",
        "original_max_position_embeddings",
        "low_freq_factor",
        "high_freq_factor",
    }
    received_keys = set(rope_scaling.keys())
    _check_received_keys(
        rope_type, received_keys, required_keys, ignore_keys=ignore_keys
    )

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(
            f"`rope_scaling`'s factor field must be a float >= 1, got {factor}"
        )

    low_freq_factor = rope_scaling["low_freq_factor"]
    high_freq_factor = rope_scaling["high_freq_factor"]
    if low_freq_factor is None or not isinstance(low_freq_factor, float):
        logger.warning(
            f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}"
        )
    if high_freq_factor is None or not isinstance(high_freq_factor, float):
        logger.warning(
            f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}"
        )
    # Comparing None (or another non-number) would raise TypeError right after the warning above.
    if (
        isinstance(high_freq_factor, (int, float))
        and isinstance(low_freq_factor, (int, float))
        and high_freq_factor <= low_freq_factor
    ):
        logger.warning(
            "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
            f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
        )

    original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
    if original_max_position_embeddings is None or not isinstance(
        original_max_position_embeddings, int
    ):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
            f"{original_max_position_embeddings}"
        )
    max_position_embeddings = config.max_position_embeddings
    if (
        isinstance(original_max_position_embeddings, (int, float))
        and isinstance(max_position_embeddings, (int, float))
        and original_max_position_embeddings >= max_position_embeddings
    ):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
            f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
        )
# Registry mapping the "rope_type" string to the function that validates the matching `rope_scaling` config.
# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
ROPE_VALIDATION_FUNCTIONS = {
    "default": _validate_default_rope_parameters,
    "linear": _validate_linear_scaling_rope_parameters,
    "dynamic": _validate_dynamic_scaling_rope_parameters,
    "yarn": _validate_yarn_parameters,
    "longrope": _validate_longrope_parameters,
    "llama3": _validate_llama3_parameters,
}
def rope_config_validation(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """
    Validate the RoPE config arguments, given a `PretrainedConfig` object.

    Nothing is checked when the config has no `rope_scaling` attribute or when it is `None`. Otherwise the
    validation function registered in `ROPE_VALIDATION_FUNCTIONS` for the config's RoPE type is called; a warning
    is logged when no function is registered for that type.
    """
    # `rope_scaling` is not a default parameter in `PretrainedConfig`
    rope_scaling = getattr(config, "rope_scaling", None)
    if rope_scaling is None:
        return

    # BC: "rope_type" was originally "type"
    legacy_type = rope_scaling.get("type", "default")
    rope_type = rope_scaling.get("rope_type", legacy_type)
    validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
    if validation_fn is None:
        logger.warning(
            f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
        )
        return
    validation_fn(config, ignore_keys=ignore_keys)
def rotate_half(x):
    """Split the last dimension in two halves and return `(-second_half, first_half)` concatenated."""
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat((-back, front), dim=-1)
def apply_rotary_pos_emb(x, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to a single input tensor.
    Args:
        x (`torch.Tensor`): The input tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `x`. For
            example, if `cos` and `sin` have the shape [batch_size, seq_len, head_dim] and `x` has the shape
            [batch_size, heads, seq_len, head_dim], then `unsqueeze_dim=1` makes them broadcastable. Similarly, if
            `x` has the shape [batch_size, seq_len, heads, head_dim], then set `unsqueeze_dim=2`.
    Returns:
        `torch.Tensor`: `x` rotated using the Rotary Position Embedding, i.e.
        `x * cos + rotate_half(x) * sin`.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)
    rotated = rotate_half(x)
    return (x * cos_b) + (rotated * sin_b)

src/mimo_audio_tokenizer/quantization.py ADDED Viewed

	@@ -0,0 +1,480 @@

+# Copyright 2025 Xiaomi Corporation.
+import typing as tp
+from einops import rearrange, repeat
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.distributed as dist
def rank():
    """Return the rank reported by `torch.distributed`, or 0 when it is not initialized."""
    return dist.get_rank() if dist.is_initialized() else 0
def world_size():
    """Return the world size reported by `torch.distributed`, or 1 when it is not initialized."""
    return dist.get_world_size() if dist.is_initialized() else 1
def default(val: tp.Any, d: tp.Any) -> tp.Any:
    """Return `val`, falling back to `d` only when `val` is None (falsy values such as 0 are kept)."""
    if val is None:
        return d
    return val
def ema_inplace(moving_avg, new, decay: float):
    """Update `moving_avg` in place to `decay * moving_avg + (1 - decay) * new`.

    When `torch.distributed` is initialized, `new` is first sum-reduced in place across all processes.
    """
    if dist.is_initialized():
        dist.all_reduce(new, op=dist.ReduceOp.SUM)
    target = moving_avg.data
    target.mul_(decay)
    target.add_(new, alpha=(1 - decay))
def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
    """Return `(x + epsilon) / (x.sum() + n_categories * epsilon)`."""
    numerator = x + epsilon
    denominator = x.sum() + n_categories * epsilon
    return numerator / denominator
def uniform_init(*shape: int):
    """Create a tensor of the given shape and fill it with `nn.init.kaiming_uniform_`."""
    return nn.init.kaiming_uniform_(torch.empty(shape))
def sample_vectors(samples, num: int):
    """Pick `num` rows from `samples` at random.

    Rows are drawn without repetition when `samples` has at least `num` rows, and with repetition otherwise.
    When `torch.distributed` is initialized, the selection made by process 0 is broadcast to all processes.
    """
    device = samples.device
    num_samples = samples.shape[0]
    if num_samples < num:
        indices = torch.randint(0, num_samples, (num,), device=device)
    else:
        indices = torch.randperm(num_samples, device=device)[:num]
    selected_samples = samples[indices]
    if dist.is_initialized():
        dist.broadcast(selected_samples, src=0)
    return selected_samples
def kmeans(samples, num_clusters: int, num_iters: int = 10):
    """Run k-means on the rows of `samples`.

    The centroids start from `sample_vectors(samples, num_clusters)`. Each iteration assigns every sample to the
    centroid with the smallest squared Euclidean distance and recomputes the centroids as the mean of their
    assigned samples; a centroid with no assigned sample keeps its previous value. When `torch.distributed` is
    initialized, the per-cluster counts and sums are sum-reduced across processes.

    Returns:
        Tuple `(means, bins)`: the centroids and the number of samples assigned to each one in the last iteration.
    """
    feature_dim = samples.shape[-1]
    sample_dtype = samples.dtype
    means = sample_vectors(samples, num_clusters)
    for _ in range(num_iters):
        centroids_t = means.t()
        # Negated squared distances, so the closest centroid is the arg-max.
        neg_sq_dists = -(
            samples.pow(2).sum(1, keepdim=True)
            - 2 * samples @ centroids_t
            + centroids_t.pow(2).sum(0, keepdim=True)
        )
        buckets = neg_sq_dists.max(dim=-1).indices
        bins = torch.bincount(buckets, minlength=num_clusters)
        # Sum the samples of each cluster by scattering them onto their bucket row.
        scatter_index = repeat(buckets, "n -> n d", d=feature_dim)
        new_means = buckets.new_zeros(num_clusters, feature_dim, dtype=sample_dtype)
        new_means = new_means.scatter_add_(0, scatter_index, samples)
        if dist.is_initialized():
            dist.all_reduce(bins, op=dist.ReduceOp.SUM)
            dist.all_reduce(new_means, op=dist.ReduceOp.SUM)
        empty_clusters = bins == 0
        # Avoid dividing by zero for empty clusters; their result is discarded below.
        safe_counts = bins.masked_fill(empty_clusters, 1)
        new_means = new_means / safe_counts[..., None]
        means = torch.where(empty_clusters[..., None], means, new_means)
    return means, bins
class EuclideanCodebook(nn.Module):
    """Codebook with Euclidean distance.
    Args:
        dim (int): Dimension.
        codebook_size (int): Codebook size.
        kmeans_init (bool): Whether to use k-means to initialize the codebooks.
            If set to true, run the k-means algorithm on the first batch passed to `forward` and use
            the learned centroids as initialization.
        kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
        decay (float): Decay for exponential moving average over the codebooks.
        epsilon (float): Epsilon value for numerical stability.
        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
            that have an exponential moving average cluster size less than the specified threshold with
            randomly selected vector from the current batch.
    """

    def __init__(
        self,
        dim: int,
        codebook_size: int,
        kmeans_init: bool = False,
        kmeans_iters: int = 10,
        decay: float = 0.99,
        epsilon: float = 1e-5,
        threshold_ema_dead_code: int = 2,
    ):
        super().__init__()
        self.decay = decay
        # With k-means initialization the codebook starts as zeros and is filled later by `init_embed_`.
        init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = (
            uniform_init if not kmeans_init else torch.zeros
        )
        embed = init_fn(codebook_size, dim)
        self.codebook_size = codebook_size
        self.kmeans_iters = kmeans_iters
        self.epsilon = epsilon
        self.threshold_ema_dead_code = threshold_ema_dead_code
        # `inited` is a one-element flag tensor: already true when no k-means initialization is requested.
        self.register_buffer("inited", torch.Tensor([not kmeans_init]))
        self.register_buffer("cluster_size", torch.zeros(codebook_size))
        self.register_buffer("embed", embed)
        self.register_buffer("embed_avg", embed.clone())

    @torch.jit.ignore
    def init_embed_(self, data):
        """Initialize the codebook with k-means on `data`, unless it is already initialized."""
        if self.inited:
            return
        embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
        self.embed.data.copy_(embed)
        self.embed_avg.data.copy_(embed.clone())
        self.cluster_size.data.copy_(cluster_size)
        self.inited.data.copy_(torch.Tensor([True]))

    def replace_(self, samples, mask):
        """Overwrite the codebook rows selected by the boolean `mask` with vectors sampled from `samples`."""
        # `sample_vectors` declares an integer count, so convert the 0-dim tensor returned by `sum()`.
        replace_num = int(mask.sum())
        modified_codebook = self.embed.clone()
        modified_codebook[mask] = sample_vectors(samples, replace_num)
        self.embed.data.copy_(modified_codebook)

    def expire_codes_(self, batch_samples):
        """Replace codes whose `cluster_size` is below `threshold_ema_dead_code` with vectors from the batch.

        Does nothing when the threshold is 0 or when no code is below the threshold.
        """
        if self.threshold_ema_dead_code == 0:
            return
        expired_codes = self.cluster_size < self.threshold_ema_dead_code
        if not torch.any(expired_codes):
            return
        batch_samples = rearrange(batch_samples, "... d -> (...) d")
        self.replace_(batch_samples, mask=expired_codes)

    def preprocess(self, x):
        """Flatten all leading dimensions of `x` so that it becomes a 2D `(N, d)` tensor."""
        x = rearrange(x, "... d -> (...) d")
        return x

    def quantize(self, x):
        """Return, for each row of the 2D tensor `x`, the index of the closest codebook entry."""
        embed = self.embed.t()
        # Negated squared Euclidean distances, so the closest entry is the arg-max.
        neg_sq_dist = -(
            x.pow(2).sum(1, keepdim=True)
            - 2 * x @ embed
            + embed.pow(2).sum(0, keepdim=True)
        )
        embed_ind = neg_sq_dist.max(dim=-1).indices
        return embed_ind

    def postprocess_emb(self, embed_ind, shape):
        """Reshape the flat indices back to `shape` without its last dimension."""
        return embed_ind.view(*shape[:-1])

    def dequantize(self, embed_ind):
        """Look up the codebook vectors for the given indices."""
        quantize = F.embedding(embed_ind, self.embed)
        return quantize

    def encode(self, x):
        """Return the codebook indices for `x`, shaped like `x` without its last dimension."""
        shape = x.shape
        # pre-process
        x = self.preprocess(x)
        # quantize
        embed_ind = self.quantize(x)
        # post-process
        embed_ind = self.postprocess_emb(embed_ind, shape)
        return embed_ind

    def decode(self, embed_ind):
        """Return the codebook vectors for the given indices."""
        quantize = self.dequantize(embed_ind)
        return quantize

    def forward(self, x):
        """Quantize `x` and, in training mode, update the codebook statistics.

        Returns:
            Tuple `(quantize, embed_ind)`: the selected codebook vectors and their indices.
        """
        shape, dtype = x.shape, x.dtype
        x = self.preprocess(x)
        self.init_embed_(x)
        embed_ind = self.quantize(x)
        embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
        embed_ind = self.postprocess_emb(embed_ind, shape)
        quantize = self.dequantize(embed_ind)
        if self.training:
            # We do the expiry of code at that point as buffers are in sync
            # and all the workers will take the same decision.
            self.expire_codes_(x)
            ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
            embed_sum = x.t() @ embed_onehot
            ema_inplace(self.embed_avg, embed_sum.t().contiguous(), self.decay)
            # Smoothed cluster sizes, rescaled to keep the same total as `cluster_size`.
            cluster_size = (
                laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
                * self.cluster_size.sum()
            )
            embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
            self.embed.data.copy_(embed_normalized)
        return quantize, embed_ind
class VectorQuantization(nn.Module):
    """Vector quantization implementation.
    Currently supports only euclidean distance.
    Args:
        dim (int): Dimension
        codebook_size (int): Codebook size
        codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
        decay (float): Decay for exponential moving average over the codebooks.
        epsilon (float): Epsilon value for numerical stability.
        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
        kmeans_iters (int): Number of iterations used for kmeans initialization.
        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
            that have an exponential moving average cluster size less than the specified threshold with
            randomly selected vector from the current batch.
        commitment_weight (float): Weight for commitment loss.
    """

    def __init__(
        self,
        dim: int,
        codebook_size: int,
        codebook_dim: tp.Optional[int] = None,
        decay: float = 0.99,
        epsilon: float = 1e-5,
        kmeans_init: bool = True,
        kmeans_iters: int = 50,
        threshold_ema_dead_code: int = 2,
        commitment_weight: float = 1.0,
    ):
        super().__init__()
        _codebook_dim: int = default(codebook_dim, dim)
        # Linear projections are only needed when the codebook works in a different dimension than the input.
        if _codebook_dim != dim:
            self.project_in = nn.Linear(dim, _codebook_dim)
            self.project_out = nn.Linear(_codebook_dim, dim)
        else:
            self.project_in = nn.Identity()
            self.project_out = nn.Identity()
        self.epsilon = epsilon
        self.commitment_weight = commitment_weight
        self._codebook = EuclideanCodebook(
            dim=_codebook_dim,
            codebook_size=codebook_size,
            kmeans_init=kmeans_init,
            kmeans_iters=kmeans_iters,
            decay=decay,
            epsilon=epsilon,
            threshold_ema_dead_code=threshold_ema_dead_code,
        )
        self.codebook_size = codebook_size

    @property
    def codebook(self):
        """The embedding buffer of the underlying codebook."""
        return self._codebook.embed

    def encode(self, x):
        """Project `x` into the codebook space and return the codebook indices."""
        projected = self.project_in(x)
        return self._codebook.encode(projected)

    def decode(self, embed_ind):
        """Look up the codebook vectors for `embed_ind` and project them back to the input space."""
        looked_up = self._codebook.decode(embed_ind)
        return self.project_out(looked_up)

    def forward(self, x):
        """Quantize `x`.

        Returns:
            Tuple `(quantize, embed_ind, loss)`: the quantized tensor after the output projection, the codebook
            indices, and a one-element loss tensor that holds the weighted commitment loss in training mode
            (when `commitment_weight > 0`) and 0 otherwise.
        """
        device = x.device
        x = self.project_in(x)
        quantize, embed_ind = self._codebook(x)
        loss = torch.tensor([0.0], device=device, requires_grad=self.training)
        if self.training:
            # Straight-through estimator: the forward value is the quantized one, gradients flow to `x`.
            quantize = x + (quantize - x).detach()
            if self.commitment_weight > 0:
                commit_loss = F.mse_loss(quantize.detach(), x)
                loss = loss + commit_loss * self.commitment_weight
        quantize = self.project_out(quantize)
        return quantize, embed_ind, loss
class ResidualVectorQuantization(nn.Module):
    """Residual vector quantization implementation.
    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf

    Args:
        num_quantizers (int): Number of quantization layers.
        codebook_size (int or sequence of int): Codebook size of every layer, or one size per layer. A sequence
            shorter than `num_quantizers` is padded by repeating its last entry.
        **kwargs: Forwarded to every `VectorQuantization` layer.
    """

    def __init__(self, *, num_quantizers, codebook_size, **kwargs):
        super().__init__()
        if isinstance(codebook_size, int):
            sizes = [codebook_size] * num_quantizers
        else:
            # Copy first: padding must not modify the sequence owned by the caller (e.g. a config value),
            # and the copy also makes tuples usable.
            sizes = list(codebook_size)
            if len(sizes) < num_quantizers:
                sizes += [sizes[-1]] * (num_quantizers - len(sizes))
        self.layers = nn.ModuleList(
            [
                VectorQuantization(codebook_size=sizes[i], **kwargs)
                for i in range(num_quantizers)
            ]
        )

    def forward(
        self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None
    ):
        """Quantize `x` with the first `n_q` layers, each layer quantizing the residual left by the previous ones.

        Args:
            x: Input tensor.
            n_q: Number of layers to use. Defaults to all layers.
            layers: Indices of the layers after which the accumulated quantized output is also collected.

        Returns:
            Tuple `(quantized_out, out_indices, out_losses, out_quantized)`: the sum of the per-layer quantized
            outputs, the stacked per-layer indices, the stacked per-layer losses, and the list of accumulated
            outputs collected for the requested `layers`.
        """
        quantized_out = 0.0
        residual = x
        all_losses = []
        all_indices = []
        out_quantized = []
        n_q = n_q or len(self.layers)
        for i, layer in enumerate(self.layers[:n_q]):
            quantized, indices, loss = layer(residual)
            residual = residual - quantized
            quantized_out = quantized_out + quantized
            all_indices.append(indices)
            all_losses.append(loss)
            if layers and i in layers:
                out_quantized.append(quantized_out)
        out_losses = torch.stack(all_losses)
        out_indices = torch.stack(all_indices)
        return quantized_out, out_indices, out_losses, out_quantized

    def encode(
        self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None
    ) -> torch.Tensor:
        """Return the stacked indices produced by the layers in `self.layers[st:n_q]`.

        `st` defaults to 0 and `n_q` to the number of layers; note that `n_q` is used as the end of the slice.
        """
        residual = x
        all_indices = []
        n_q = len(self.layers) if n_q is None else n_q
        st = 0 if st is None else st
        for layer in self.layers[st:n_q]:
            indices = layer.encode(residual)
            quantized = layer.decode(indices)
            residual = residual - quantized
            all_indices.append(indices)
        out_indices = torch.stack(all_indices)
        return out_indices

    def decode(self, q_indices: torch.Tensor, st: int = 0) -> torch.Tensor:
        """Sum the decoded outputs of the layers, where `q_indices[i]` is decoded by layer `st + i`."""
        quantized_out = torch.tensor(0.0, device=q_indices.device)
        for i, indices in enumerate(q_indices):
            layer = self.layers[st + i]
            quantized = layer.decode(indices)
            quantized_out = quantized_out + quantized
        return quantized_out
class ResidualVectorQuantizer(nn.Module):
    """Residual Vector Quantizer.
    Args:
        dimension (int): Dimension of the codebooks.
        n_q (int): Number of residual vector quantizers used.
        bins (int or list): Codebook size.
        decay (float): Decay for exponential moving average over the codebooks.
        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
        kmeans_iters (int): Number of iterations used for kmeans initialization.
        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
            that have an exponential moving average cluster size less than the specified threshold with
            randomly selected vector from the current batch.
    """

    def __init__(
        self,
        dimension: int = 256,
        n_q: int = 8,
        bins: int | list = 1024,
        decay: float = 0.99,
        kmeans_init: bool = True,
        kmeans_iters: int = 50,
        threshold_ema_dead_code: int = 2,
    ):
        super().__init__()
        self.n_q = n_q
        self.dimension = dimension
        self.bins = bins
        self.decay = decay
        self.kmeans_init = kmeans_init
        self.kmeans_iters = kmeans_iters
        self.threshold_ema_dead_code = threshold_ema_dead_code
        self.vq = ResidualVectorQuantization(
            dim=dimension,
            codebook_size=bins,
            num_quantizers=n_q,
            decay=decay,
            kmeans_init=kmeans_init,
            kmeans_iters=kmeans_iters,
            threshold_ema_dead_code=threshold_ema_dead_code,
        )

    def forward(
        self,
        x: torch.Tensor,
        n_q: tp.Optional[int] = None,
        layers: tp.Optional[list] = None,
    ):
        """Residual vector quantization on the given input tensor.
        Args:
            x (torch.Tensor): Input tensor.
            n_q (int): Number of quantizers used to quantize. Default: All quantizers.
            layers (list): Layers whose accumulated quantized output should also be returned. Default: None.
        Returns:
            Tuple `(quantized, codes, commit_loss, quantized_list)`: the quantized representation, the indices
            of each quantizer, the mean of the per-quantizer losses, and the quantized outputs collected for
            the requested `layers`.
        """
        num_quantizers = n_q or self.n_q
        quantized, codes, commit_loss, quantized_list = self.vq(
            x, n_q=num_quantizers, layers=layers
        )
        mean_commit_loss = torch.mean(commit_loss)
        return quantized, codes, mean_commit_loss, quantized_list

    def encode(
        self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None
    ) -> torch.Tensor:
        """Encode the given input tensor and return the indices for each quantizer.
        Args:
            x (torch.Tensor): Input tensor.
            n_q (int): Number of quantizers used to quantize. Default: All quantizers.
            st (int): Layer from which to start encoding the input. Default: 0.
        """
        num_quantizers = n_q or self.n_q
        start = st or 0
        return self.vq.encode(x, n_q=num_quantizers, st=start)

    def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor:
        """Decode the given codes to the quantized representation.
        Args:
            codes (torch.Tensor): Input indices for each quantizer.
            st (int): Layer from which to start decoding the input codes. Default: 0.
        """
        return self.vq.decode(codes, st=st)