| |
| |
| |
| |
|
|
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.utils import logging |
| from .configuration_nemotron_h import NemotronHConfig |
| from .configuration_radio import RADIOConfig |
|
|
| logger = logging.get_logger(__name__) |
|
|
class NemotronH_Nano_VL_V2_Config(PretrainedConfig):
    """Composite configuration for the NemotronH Nano VL V2 vision-language model.

    Bundles a RADIO vision-encoder configuration and a NemotronH
    language-model configuration together with the projector and image
    preprocessing options used to connect them.
    """

    model_type = 'NemotronH_Nano_VL_V2'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version='v1',
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        attn_implementation="flash_attention_2",
        video_pruning_rate: float = 0.0,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Sub-model configs: rebuild from serialized dicts when provided,
        # otherwise fall back to each config class's defaults.
        self.vision_config = RADIOConfig(**vision_config) if vision_config is not None else RADIOConfig()
        self.llm_config = NemotronHConfig(**llm_config) if llm_config is not None else NemotronHConfig()

        # Vision-to-LLM bridging / preprocessing options.
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.ps_version = ps_version
        self.image_tag_type = image_tag_type
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size
        self.video_pruning_rate = video_pruning_rate

        # Propagate the chosen attention implementation into both sub-configs;
        # the vision tower only gets flash attention when the name matches.
        self._attn_implementation = attn_implementation
        self.vision_config.use_flash_attn = (
            self._attn_implementation is not None
            and "flash_attention" in self._attn_implementation
        )
        self.llm_config._attn_implementation = self._attn_implementation
|
|