# Copyright 2025 Xiaomi Corporation. from transformers import PretrainedConfig class MiMoAudioTokenizerConfig(PretrainedConfig): model_type = "mimo_audio_tokenizer" def __init__( self, max_audio_seconds: int = 1800, stride_size: int = 2, avg_pooler: int = 1, d_model: int = 768, scale_embedding: bool = True, kernel_size: int = 3, activation_function: str = "gelu", encoder_layers: int = 8, encoder_skip_layer_id: int = None, encoder_attention_heads: int = 12, encoder_ffn_dim: int = 3072, encoder_causal: bool = False, encoder_attn_window_size: list[int] = None, decoder_layers: int = 8, decoder_attention_heads: int = 12, decoder_ffn_dim: int = 3072, decoder_kernel_size: int = 3, decoder_stride_size: int = 2, decoder_causal: bool = True, decoder_attn_window_size: list[int] = None, nfft: int = 1024, vocoder_dim: int = 512, vocoder_intermediate_dim: int = 4096, vocoder_num_layers: int = 30, n_mels: int = 80, sampling_rate: int = 24000, hop_length: int = 240, window_size: int = 1024, vocoder_padding: str = "same", fmin: int = 0, fmax: int = None, num_quantizers: int = 12, codebook_size: list[int] = None, threshold_ema_dead_code: int = 10, position_embedding_type: str = "rope", rope_theta: int = 10000, rope_type: str = "default", ln_type: str = "LayerNorm", vocoder_attention_heads: int = 4, vocoder_attn_window_size: list[int] = None, **kwargs, ): super().__init__(**kwargs) self.max_audio_seconds = max_audio_seconds self.stride_size = stride_size self.avg_pooler = avg_pooler self.d_model = d_model self.scale_embedding = scale_embedding self.kernel_size = kernel_size self.activation_function = activation_function self.encoder_layers = encoder_layers self.encoder_skip_layer_id = encoder_skip_layer_id self.encoder_attention_heads = encoder_attention_heads self.encoder_ffn_dim = encoder_ffn_dim self.encoder_causal = encoder_causal self.encoder_attn_window_size = ( encoder_attn_window_size if encoder_attn_window_size is not None else [-1, -1] ) self.decoder_layers = decoder_layers self.decoder_attention_heads = decoder_attention_heads self.decoder_ffn_dim = decoder_ffn_dim self.decoder_kernel_size = decoder_kernel_size self.decoder_stride_size = decoder_stride_size self.decoder_causal = decoder_causal self.decoder_attn_window_size = ( decoder_attn_window_size if decoder_attn_window_size is not None else [-1, -1] ) self.nfft = nfft self.vocoder_dim = vocoder_dim self.vocoder_intermediate_dim = vocoder_intermediate_dim self.vocoder_num_layers = vocoder_num_layers self.n_mels = n_mels self.sampling_rate = sampling_rate self.hop_length = hop_length self.window_size = window_size self.vocoder_padding = vocoder_padding self.fmin = fmin self.fmax = fmax self.num_quantizers = num_quantizers self.codebook_size = codebook_size if codebook_size is not None else [1024] self.threshold_ema_dead_code = threshold_ema_dead_code self.position_embedding_type = position_embedding_type self.rope_theta = rope_theta self.rope_type = rope_type self.ln_type = ln_type self.vocoder_attention_heads = vocoder_attention_heads self.vocoder_attn_window_size = ( vocoder_attn_window_size if vocoder_attn_window_size is not None else [40, 10] )