Spaces:
Running on A100
Running on A100
| # Copyright 2025 Xiaomi Corporation. | |
from typing import Optional

from transformers import PretrainedConfig
class MiMoAudioTokenizerConfig(PretrainedConfig):
    """Hyperparameter container for the ``mimo_audio_tokenizer`` model type.

    Every constructor argument is stored unchanged as an instance attribute
    of the same name, except for the list-valued arguments described below,
    which are replaced by a default list when ``None`` is passed. Any extra
    keyword arguments are forwarded to ``PretrainedConfig.__init__``.

    List-valued arguments and their ``None`` fallbacks:
        encoder_attn_window_size: defaults to ``[-1, -1]``.
        decoder_attn_window_size: defaults to ``[-1, -1]``.
        codebook_size: defaults to ``[1024]``.
        vocoder_attn_window_size: defaults to ``[40, 10]``.

    ``encoder_skip_layer_id`` and ``fmax`` may also be ``None``; they are
    stored as-is, so consumers of this config must handle ``None`` for them.
    """

    model_type = "mimo_audio_tokenizer"

    def __init__(
        self,
        max_audio_seconds: int = 1800,
        stride_size: int = 2,
        avg_pooler: int = 1,
        d_model: int = 768,
        scale_embedding: bool = True,
        kernel_size: int = 3,
        activation_function: str = "gelu",
        encoder_layers: int = 8,
        encoder_skip_layer_id: Optional[int] = None,
        encoder_attention_heads: int = 12,
        encoder_ffn_dim: int = 3072,
        encoder_causal: bool = False,
        encoder_attn_window_size: Optional[list[int]] = None,
        decoder_layers: int = 8,
        decoder_attention_heads: int = 12,
        decoder_ffn_dim: int = 3072,
        decoder_kernel_size: int = 3,
        decoder_stride_size: int = 2,
        decoder_causal: bool = True,
        decoder_attn_window_size: Optional[list[int]] = None,
        nfft: int = 1024,
        vocoder_dim: int = 512,
        vocoder_intermediate_dim: int = 4096,
        vocoder_num_layers: int = 30,
        n_mels: int = 80,
        sampling_rate: int = 24000,
        hop_length: int = 240,
        window_size: int = 1024,
        vocoder_padding: str = "same",
        fmin: int = 0,
        fmax: Optional[int] = None,
        num_quantizers: int = 12,
        codebook_size: Optional[list[int]] = None,
        threshold_ema_dead_code: int = 10,
        position_embedding_type: str = "rope",
        rope_theta: int = 10000,
        rope_type: str = "default",
        ln_type: str = "LayerNorm",
        vocoder_attention_heads: int = 4,
        vocoder_attn_window_size: Optional[list[int]] = None,
        **kwargs,
    ):
        # Base-class initialisation runs first, exactly as in the original,
        # so the attributes below are assigned after it has consumed kwargs.
        super().__init__(**kwargs)

        # Shared / front-end settings.
        self.max_audio_seconds = max_audio_seconds
        self.stride_size = stride_size
        self.avg_pooler = avg_pooler
        self.d_model = d_model
        self.scale_embedding = scale_embedding
        self.kernel_size = kernel_size
        self.activation_function = activation_function

        # Encoder settings.
        self.encoder_layers = encoder_layers
        self.encoder_skip_layer_id = encoder_skip_layer_id
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_causal = encoder_causal
        # None is the sentinel for "use the default"; the list literal is
        # built per call so instances never share one mutable default.
        # NOTE(review): -1 presumably means "no window limit" -- confirm
        # against the attention implementation.
        self.encoder_attn_window_size = (
            encoder_attn_window_size
            if encoder_attn_window_size is not None
            else [-1, -1]
        )

        # Decoder settings.
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_kernel_size = decoder_kernel_size
        self.decoder_stride_size = decoder_stride_size
        self.decoder_causal = decoder_causal
        self.decoder_attn_window_size = (
            decoder_attn_window_size
            if decoder_attn_window_size is not None
            else [-1, -1]
        )

        # Vocoder and spectrogram settings.
        self.nfft = nfft
        self.vocoder_dim = vocoder_dim
        self.vocoder_intermediate_dim = vocoder_intermediate_dim
        self.vocoder_num_layers = vocoder_num_layers
        self.n_mels = n_mels
        self.sampling_rate = sampling_rate
        self.hop_length = hop_length
        self.window_size = window_size
        self.vocoder_padding = vocoder_padding
        self.fmin = fmin
        self.fmax = fmax  # stored as given; may be None

        # Quantizer settings.
        self.num_quantizers = num_quantizers
        self.codebook_size = codebook_size if codebook_size is not None else [1024]
        self.threshold_ema_dead_code = threshold_ema_dead_code

        # Positional encoding and normalisation settings.
        self.position_embedding_type = position_embedding_type
        self.rope_theta = rope_theta
        self.rope_type = rope_type
        self.ln_type = ln_type

        # Vocoder attention settings.
        self.vocoder_attention_heads = vocoder_attention_heads
        self.vocoder_attn_window_size = (
            vocoder_attn_window_size
            if vocoder_attn_window_size is not None
            else [40, 10]
        )