| """ |
| WavTokenizer Configuration for HuggingFace Transformers |
| |
| This configuration class defines all the hyperparameters for WavTokenizer, |
| an acoustic discrete codec tokenizer for audio language modeling. |
| """ |
|
|
from typing import Optional

from transformers import PretrainedConfig
|
|
|
|
class WavTokenizerConfig(PretrainedConfig):
    """
    Configuration class for WavTokenizer model.

    WavTokenizer is a SOTA discrete acoustic codec model that compresses audio
    into discrete tokens (40 or 75 tokens per second) while maintaining high
    reconstruction quality.

    Args:
        sample_rate (`int`, *optional*, defaults to 24000):
            The sample rate of input audio.
        n_fft (`int`, *optional*, defaults to 1280):
            FFT size for STFT.
        hop_length (`int`, *optional*, defaults to 320):
            Hop length for STFT (determines frame rate: 24000/320 = 75 fps).
        n_mels (`int`, *optional*, defaults to 128):
            Number of mel filterbank channels.
        padding (`str`, *optional*, defaults to "center"):
            Padding mode for STFT ("center" or "same").

        feature_dim (`int`, *optional*, defaults to 512):
            Dimension of the feature backbone.
        encoder_dim (`int`, *optional*, defaults to 32):
            Dimension of encoder output.
        encoder_rates (`list[int]`, *optional*, defaults to [2, 4, 5, 8]):
            Downsampling rates for the encoder.
        latent_dim (`int`, *optional*):
            Dimension of the latent space (defaults to feature_dim).

        codebook_size (`int`, *optional*, defaults to 4096):
            Size of the VQ codebook.
        codebook_dim (`int`, *optional*, defaults to 512):
            Dimension of codebook vectors.
        num_quantizers (`int`, *optional*, defaults to 1):
            Number of residual vector quantizers.

        backbone_type (`str`, *optional*, defaults to "vocos"):
            Type of decoder backbone ("vocos").
        backbone_dim (`int`, *optional*, defaults to 768):
            Dimension of the decoder backbone.
        backbone_num_blocks (`int`, *optional*, defaults to 12):
            Number of ConvNeXt blocks in the backbone.
        backbone_intermediate_dim (`int`, *optional*, defaults to 2304):
            Intermediate dimension in ConvNeXt blocks.
        backbone_kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for depthwise convolutions.
        backbone_layer_scale_init_value (`float`, *optional*, defaults to 1e-6):
            Initial value for layer scale.

        head_type (`str`, *optional*, defaults to "istft"):
            Type of waveform synthesis head ("istft").
        head_dim (`int`, *optional*, defaults to 1025):
            Output dimension for the head. NOTE(review): documented elsewhere
            as n_fft // 2 + 1, but 1280 // 2 + 1 == 641; the default 1025
            corresponds to n_fft=2048 — confirm against the pretrained
            checkpoints before relying on this relationship.

        use_attention (`bool`, *optional*, defaults to True):
            Whether to use attention in the decoder.
        attention_dim (`int`, *optional*, defaults to 512):
            Dimension for attention layers.
        attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        attention_layers (`int`, *optional*, defaults to 1):
            Number of attention layers.
    """

    model_type = "wavtokenizer"

    def __init__(
        self,
        # --- Audio / STFT parameters ---
        sample_rate: int = 24000,
        n_fft: int = 1280,
        hop_length: int = 320,
        n_mels: int = 128,
        padding: str = "center",
        # --- Encoder parameters ---
        feature_dim: int = 512,
        encoder_dim: int = 32,
        encoder_rates: Optional[list] = None,
        latent_dim: Optional[int] = None,
        # --- Quantizer parameters ---
        codebook_size: int = 4096,
        codebook_dim: int = 512,
        num_quantizers: int = 1,
        # --- Decoder backbone parameters ---
        backbone_type: str = "vocos",
        backbone_dim: int = 768,
        backbone_num_blocks: int = 12,
        backbone_intermediate_dim: int = 2304,
        backbone_kernel_size: int = 7,
        backbone_layer_scale_init_value: float = 1e-6,
        # --- Waveform synthesis head parameters ---
        head_type: str = "istft",
        head_dim: int = 1025,
        # --- Decoder attention parameters ---
        use_attention: bool = True,
        attention_dim: int = 512,
        attention_heads: int = 8,
        attention_layers: int = 1,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Audio / STFT
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.padding = padding

        # Encoder
        self.feature_dim = feature_dim
        self.encoder_dim = encoder_dim
        # None default avoids a mutable default argument; [2, 4, 5, 8] is the
        # effective default documented in the class docstring.
        self.encoder_rates = encoder_rates if encoder_rates is not None else [2, 4, 5, 8]
        # Latent dimension falls back to the feature backbone dimension.
        self.latent_dim = latent_dim if latent_dim is not None else feature_dim

        # Quantizer
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers

        # Decoder backbone
        self.backbone_type = backbone_type
        self.backbone_dim = backbone_dim
        self.backbone_num_blocks = backbone_num_blocks
        self.backbone_intermediate_dim = backbone_intermediate_dim
        self.backbone_kernel_size = backbone_kernel_size
        self.backbone_layer_scale_init_value = backbone_layer_scale_init_value

        # Waveform synthesis head
        self.head_type = head_type
        self.head_dim = head_dim

        # Decoder attention
        self.use_attention = use_attention
        self.attention_dim = attention_dim
        self.attention_heads = attention_heads
        self.attention_layers = attention_layers

    @property
    def vocab_size(self) -> int:
        """Returns the vocabulary size (codebook size)."""
        return self.codebook_size

    @property
    def frame_rate(self) -> float:
        """Returns the frame rate (tokens per second): sample_rate / hop_length."""
        return self.sample_rate / self.hop_length
|
|