| from transformers import PretrainedConfig |
|
|
|
|
| class CLSPConfig(PretrainedConfig): |
| model_type = "clsp" |
|
|
| def __init__( |
| self, |
| feature_dim: int = 128, |
| output_downsampling_factor: int = 2, |
| downsampling_factor: str = "1,2,4,8,4,2,1", |
| num_encoder_layers: str = "1,2,3,4,1,1,1", |
| encoder_dim: str = "1280,1280,1280,1280,1280,1280,1280", |
| encoder_unmasked_dim: str = "768,768,768,768,768,768,768", |
| query_head_dim: str = "32", |
| pos_head_dim: str = "4", |
| value_head_dim: str = "12", |
| pos_dim: int = 48, |
| num_heads: str = "8,8,8,8,8,8,8", |
| feedforward_dim: str = "3840,3840,3840,3840,3840,3840,3840", |
| cnn_module_kernel: str = "31,31,15,15,15,31,31", |
| causal: bool = False, |
| chunk_size: str = "-1", |
| left_context_frames: str = "-1", |
| text_encoder_dim: int = 768, |
| joint_dim: int = 512, |
| **kwargs, |
| ): |
| super().__init__(**kwargs) |
| |
| self.feature_dim = feature_dim |
| self.output_downsampling_factor = output_downsampling_factor |
| self.downsampling_factor = downsampling_factor |
| self.num_encoder_layers = num_encoder_layers |
| self.encoder_dim = encoder_dim |
| self.encoder_unmasked_dim = encoder_unmasked_dim |
| self.query_head_dim = query_head_dim |
| self.pos_head_dim = pos_head_dim |
| self.value_head_dim = value_head_dim |
| self.pos_dim = pos_dim |
| self.num_heads = num_heads |
| self.feedforward_dim = feedforward_dim |
| self.cnn_module_kernel = cnn_module_kernel |
| self.causal = causal |
| self.chunk_size = chunk_size |
| self.left_context_frames = left_context_frames |
|
|
| self.text_encoder_dim = text_encoder_dim |
| self.joint_dim = joint_dim |
|
|