# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
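"""Configuration classes for NemotronH Nano Omni Reasoning V3.

Defines `SoundConfig` (Parakeet audio encoder + projection) and the composite
`NemotronH_Nano_Omni_Reasoning_V3_Config`, which bundles RADIO vision,
NemotronH language-model, and optional sound sub-configs.
"""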
from typing import Optional

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from .configuration_nemotron_h import NemotronHConfig
from .configuration_radio import RADIOConfig
logger = logging.get_logger(__name__)
class SoundConfig(PretrainedConfig):
"""Configuration for the sound/audio model (Parakeet encoder + projection)."""
model_type = "parakeet"
def __init__(
self,
# Parakeet encoder config
hidden_size: int = 1024,
num_attention_heads: int = 8,
num_hidden_layers: int = 24,
intermediate_size: int = 4096,
conv_kernel_size: int = 31,
feat_in: int = 80, # Mel features
subsampling_factor: int = 8,
# Projection config
projection_hidden_size: int = 20480,
projection_bias: bool = True,
# Audio processing
sampling_rate: int = 16000,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.conv_kernel_size = conv_kernel_size
self.feat_in = feat_in
self.subsampling_factor = subsampling_factor
self.projection_hidden_size = projection_hidden_size
self.projection_bias = projection_bias
self.sampling_rate = sampling_rate
class NemotronH_Nano_Omni_Reasoning_V3_Config(PretrainedConfig):
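    """Composite configuration for the NemotronH Nano Omni Reasoning V3 model.

    Bundles a RADIO vision sub-config (`vision_config`), a NemotronH language-model
    sub-config (`llm_config`, also exposed as `text_config` for vLLM), and an
    optional Parakeet sound sub-config (`sound_config`).

    Example (illustrative; sub-config values are whatever the checkpoint ships):

        >>> config = NemotronH_Nano_Omni_Reasoning_V3_Config(
        ...     sound_config={"feat_in": 128},
        ...     downsample_ratio=0.5,
        ... )
        >>> config.text_config is config.llm_config
        True
    """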
model_type = 'NemotronH_Nano_Omni_Reasoning_V3'
is_composition = True
def __init__(
self,
vision_config=None,
llm_config=None,
sound_config=None,
force_image_size=None,
downsample_ratio=0.5,
template=None,
ps_version='v1',
image_tag_type="internvl",
projector_hidden_size=4096,
vit_hidden_size=1280,
attn_implementation="flash_attention_2",
video_pruning_rate: float = 0.0,
video_temporal_patch_size: int = 2,
# Sound/audio settings
        sound_context_token_id: Optional[int] = None,
sound_context_token: str = "<audio>",
**kwargs
):
super().__init__(**kwargs)
if vision_config is not None:
self.vision_config = RADIOConfig(**vision_config)
else:
self.vision_config = RADIOConfig()
# Handle both cases: when loading from JSON (llm_config is dict) and when called internally by transformers (llm_config is None)
if llm_config is not None:
self.llm_config = NemotronHConfig(**llm_config)
else:
self.llm_config = NemotronHConfig()
# Sound/audio model configuration
if sound_config is not None:
self.sound_config = SoundConfig(**sound_config)
else:
self.sound_config = None # Sound model is optional
# Assign configuration values
self.force_image_size = force_image_size
self.downsample_ratio = downsample_ratio
self.template = template # TODO move out of here and into the tokenizer
self.ps_version = ps_version # Pixel shuffle version
self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
self.projector_hidden_size = projector_hidden_size
self.vit_hidden_size = vit_hidden_size
self.video_pruning_rate = video_pruning_rate
self.video_temporal_patch_size = video_temporal_patch_size
# Sound/audio token settings
self.sound_context_token_id = sound_context_token_id
self.sound_context_token = sound_context_token
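        # Propagate the requested attention implementation to the sub-configs:
        # the RADIO vision tower only enables flash attention when a
        # flash_attention backend is selected, while the NemotronH language
        # model receives the implementation name directly.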
self._attn_implementation = attn_implementation
self.vision_config.use_flash_attn = self._attn_implementation is not None and "flash_attention" in self._attn_implementation
self.llm_config._attn_implementation = self._attn_implementation
# vLLM's `NemotronH_Nano_VL_V2` implementation reads the language-model sub-config as
# `config.text_config`. Our HF config stores it as `config.llm_config`; expose an alias so the
# same config object loads under both loaders without having to duplicate the dict on disk.
@property
def text_config(self):
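        """Alias for `llm_config`; vLLM's loader looks up the LLM sub-config under this name."""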
return self.llm_config