"""GLAP (Generalized Language Audio Pretraining) configuration.""" from transformers import PretrainedConfig class GlapConfig(PretrainedConfig): model_type = "glap" def __init__( self, # Audio encoder (Dasheng) audio_embed_dim: int = 768, audio_depth: int = 12, audio_num_heads: int = 12, patch_size: list = None, patch_stride: list = None, target_length: int = 1008, sample_rate: int = 16000, # Text encoder (SONAR) text_vocab_size: int = 256206, text_model_dim: int = 1024, text_num_layers: int = 24, text_num_heads: int = 16, text_ffn_inner_dim: int = 8192, text_max_seq_len: int = 514, text_pad_idx: int = 0, text_dropout_p: float = 0.1, # Projection embed_size: int = 1024, **kwargs, ): super().__init__(**kwargs) self.audio_embed_dim = audio_embed_dim self.audio_depth = audio_depth self.audio_num_heads = audio_num_heads self.patch_size = patch_size or [64, 4] self.patch_stride = patch_stride or [64, 4] self.target_length = target_length self.sample_rate = sample_rate self.text_vocab_size = text_vocab_size self.text_model_dim = text_model_dim self.text_num_layers = text_num_layers self.text_num_heads = text_num_heads self.text_ffn_inner_dim = text_ffn_inner_dim self.text_max_seq_len = text_max_seq_len self.text_pad_idx = text_pad_idx self.text_dropout_p = text_dropout_p self.embed_size = embed_size