File size: 4,872 Bytes

14189d7

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from .configuration_nemotron_h import NemotronHConfig
from .configuration_radio import RADIOConfig

logger = logging.get_logger(__name__)


class SoundConfig(PretrainedConfig):
    """Configuration for the sound/audio model (Parakeet encoder + projection)."""
    model_type = "parakeet"
    
    def __init__(
        self,
        # Parakeet encoder config
        hidden_size: int = 1024,
        num_attention_heads: int = 8,
        num_hidden_layers: int = 24,
        intermediate_size: int = 4096,
        conv_kernel_size: int = 31,
        feat_in: int = 80,  # Mel features
        subsampling_factor: int = 8,
        # Projection config  
        projection_hidden_size: int = 20480,
        projection_bias: bool = True,
        # Audio processing
        sampling_rate: int = 16000,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.conv_kernel_size = conv_kernel_size
        self.feat_in = feat_in
        self.subsampling_factor = subsampling_factor
        self.projection_hidden_size = projection_hidden_size
        self.projection_bias = projection_bias
        self.sampling_rate = sampling_rate


class NemotronH_Nano_Omni_Reasoning_V3_Config(PretrainedConfig):
    model_type = 'NemotronH_Nano_Omni_Reasoning_V3'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        sound_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version='v1',
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        attn_implementation="flash_attention_2",
        video_pruning_rate: float = 0.0,
        video_temporal_patch_size: int = 2,
        # Sound/audio settings
        sound_context_token_id: int = None,
        sound_context_token: str = "<audio>",
        **kwargs
    ):
        super().__init__(**kwargs)

        if vision_config is not None:
            self.vision_config = RADIOConfig(**vision_config)
        else:
            self.vision_config = RADIOConfig()

        # Handle both cases: when loading from JSON (llm_config is dict) and when called internally by transformers (llm_config is None)
        if llm_config is not None:
            self.llm_config = NemotronHConfig(**llm_config)
        else:
            self.llm_config = NemotronHConfig()

        # Sound/audio model configuration
        if sound_config is not None:
            self.sound_config = SoundConfig(**sound_config)
        else:
            self.sound_config = None  # Sound model is optional

        # Assign configuration values
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template  # TODO move out of here and into the tokenizer
        self.ps_version = ps_version  # Pixel shuffle version
        self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size
        self.video_pruning_rate = video_pruning_rate
        self.video_temporal_patch_size = video_temporal_patch_size
        
        # Sound/audio token settings
        self.sound_context_token_id = sound_context_token_id
        self.sound_context_token = sound_context_token

        self._attn_implementation = attn_implementation
        self.vision_config.use_flash_attn = self._attn_implementation is not None and "flash_attention" in self._attn_implementation
        self.llm_config._attn_implementation = self._attn_implementation

    # vLLM's `NemotronH_Nano_VL_V2` implementation reads the language-model sub-config as
    # `config.text_config`. Our HF config stores it as `config.llm_config`; expose an alias so the
    # same config object loads under both loaders without having to duplicate the dict on disk.
    @property
    def text_config(self):
        return self.llm_config