| from transformers import PretrainedConfig |
| from typing import List |
|
|
| from transformers import Qwen2Config, CLIPVisionConfig |
|
|
class InfMLLMUnifiedHDChatConfig(PretrainedConfig):
    """Configuration for the InfMLLM unified HD chat multimodal model.

    Bundles the language-model config (Qwen2), the vision-encoder config
    (CLIP), LoRA fine-tuning options, and image/video encoder settings into
    a single ``PretrainedConfig`` so the composite model can be saved and
    loaded with the standard HuggingFace machinery.

    Args:
        vison_config (dict or CLIPVisionConfig, optional): Vision encoder
            config. NOTE(review): the parameter name is misspelled
            ("vison") but is kept as-is for backward compatibility with
            already-serialized configs — renaming it would break
            ``from_pretrained`` on existing checkpoints.
        lm_config (dict or Qwen2Config, optional): Language model config.
        lm_model (str): Path or identifier of the language model weights.
        lm_tokenizer (str): Path or identifier of the tokenizer.
        lora_modules (str): Comma-separated module names targeted by LoRA.
        lora_llm (bool): Whether to apply LoRA to the language model.
        lora_r (int): LoRA rank.
        lora_alpha (int): LoRA scaling alpha.
        lora_dropout (float): LoRA dropout probability.
        encoder_img (str): Image encoder identifier/path.
        image_size_img (int): Input image resolution for the image encoder.
        lora_encoder_img (bool): Whether to apply LoRA to the image encoder.
        hd_num (int): Number of high-definition image tiles/patches.
        encoder_video (str): Video encoder identifier/path.
        max_txt_len (int): Maximum text sequence length.
        conv_style (str): Conversation prompt template name.
        precision (str): Numeric precision, e.g. ``"bf16"``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    def __init__(
        self,
        vison_config=None,
        lm_config=None,
        lm_model="",
        lm_tokenizer="",
        lora_modules="",
        lora_llm=False,
        lora_r=128,
        lora_alpha=256,
        lora_dropout=0,

        encoder_img="",
        image_size_img=336,
        lora_encoder_img=False,
        hd_num=9,

        encoder_video="",

        max_txt_len=4096,
        conv_style='qwen-7b-chat',
        precision="bf16",
        **kwargs
    ):
        # Language model + LoRA settings.
        self.lm_model = lm_model
        self.lm_tokenizer = lm_tokenizer
        self.lora_modules = lora_modules
        self.lora_llm = lora_llm
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout

        # Image encoder settings.
        self.encoder_img = encoder_img
        self.image_size_img = image_size_img
        self.lora_encoder_img = lora_encoder_img
        self.hd_num = hd_num

        # Video encoder settings.
        self.encoder_video = encoder_video

        self.max_txt_len = max_txt_len
        self.conv_style = conv_style

        self.precision = precision

        # Rehydrate nested configs from plain dicts (the form they take
        # after JSON round-trips); isinstance (not type ==) so dict
        # subclasses produced by deserialization are handled too.
        if isinstance(vison_config, dict):
            self.vision_config = CLIPVisionConfig(**vison_config)
        else:
            self.vision_config = vison_config

        if isinstance(lm_config, dict):
            self.lm_config = Qwen2Config(**lm_config)
        else:
            self.lm_config = lm_config
        # Called last so explicitly-set attributes above are not
        # overwritten by remaining kwargs processing in the base class.
        super().__init__(**kwargs)
| |