from transformers import PretrainedConfig
from torchscale.architecture.config import EncoderConfig


class ViVQAConfig(PretrainedConfig):
    """HuggingFace config for the ViVQA model, wrapping torchscale's EncoderConfig."""

    model_type = "vivqa"

    def __init__(
        self,
        drop_path_rate: float = 0.0,
        mlp_ratio: float = 4.0,
        encoder_layers: int = 6,
        encoder_attention_heads: int = 6,
        multiway: bool = True,
        layernorm_embedding: bool = False,
        normalize_output: bool = True,
        no_output_layer: bool = True,
        encoder_embed_dim: int = 768,
        **kwargs,
    ):
        # Assemble the torchscale encoder config. Use the encoder_embed_dim
        # argument rather than a hardcoded 768 so the embedding width (and the
        # FFN width derived from it) remains configurable.
        args = EncoderConfig(
            multiway=multiway,
            layernorm_embedding=layernorm_embedding,
            normalize_output=normalize_output,
            no_output_layer=no_output_layer,
            drop_path_rate=drop_path_rate,
            encoder_embed_dim=encoder_embed_dim,
            encoder_attention_heads=encoder_attention_heads,
            encoder_ffn_embed_dim=int(encoder_embed_dim * mlp_ratio),
            encoder_layers=encoder_layers,
        )
        # Mirror every EncoderConfig field onto this config so that
        # PretrainedConfig serialization (to_dict / save_pretrained) sees them.
        for key, value in args.__dict__.items():
            setattr(self, key, value)

        super().__init__(**kwargs)
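

# Minimal usage sketch (illustrative values only; assumes `torchscale` and
# `transformers` are installed). The printed attributes exist because every
# EncoderConfig field is mirrored onto the config in __init__ above.
if __name__ == "__main__":
    config = ViVQAConfig(encoder_layers=12, encoder_attention_heads=12)
    print(config.encoder_embed_dim)      # 768 (default embedding width)
    print(config.encoder_ffn_embed_dim)  # 3072, i.e. 768 * mlp_ratio
    print(config.to_json_string())       # standard PretrainedConfig serialization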