class SoundConfig(PretrainedConfig):
    """Configuration for the sound/audio model (Parakeet encoder + projection).

    Each constructor argument is stored unchanged on the instance under the
    same name. Any additional keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    Args:
        hidden_size: Hidden size of the Parakeet encoder.
        num_attention_heads: Number of attention heads in the encoder.
        num_hidden_layers: Number of hidden layers in the encoder.
        intermediate_size: Intermediate size of the encoder.
        conv_kernel_size: Convolution kernel size of the encoder.
        feat_in: Number of input (Mel) features.
        subsampling_factor: Subsampling factor of the encoder.
        projection_hidden_size: Hidden size of the projection.
        projection_bias: Bias flag of the projection.
        sampling_rate: Audio sampling rate (presumably in Hz -- confirm
            against the audio preprocessing code).
        **kwargs: Passed through to ``PretrainedConfig``.
    """

    model_type = "parakeet"

    def __init__(
        self,
        # Parakeet encoder config
        hidden_size: int = 1024,
        num_attention_heads: int = 8,
        num_hidden_layers: int = 24,
        intermediate_size: int = 4096,
        conv_kernel_size: int = 31,
        feat_in: int = 80,  # Mel features
        subsampling_factor: int = 8,
        # Projection config
        projection_hidden_size: int = 20480,
        projection_bias: bool = True,
        # Audio processing
        sampling_rate: int = 16000,
        **kwargs,
    ):
        # The base class consumes the generic options first, exactly as in the
        # original ordering, before the sound-specific fields are attached.
        super().__init__(**kwargs)

        # --- Parakeet encoder ---
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.conv_kernel_size = conv_kernel_size
        self.feat_in = feat_in
        self.subsampling_factor = subsampling_factor

        # --- Projection ---
        self.projection_hidden_size = projection_hidden_size
        self.projection_bias = projection_bias

        # --- Audio processing ---
        self.sampling_rate = sampling_rate
llm_config=None, sound_config=None, force_image_size=None, downsample_ratio=0.5, template=None, ps_version='v1', image_tag_type="internvl", projector_hidden_size=4096, vit_hidden_size=1280, attn_implementation="flash_attention_2", video_pruning_rate: float = 0.0, video_temporal_patch_size: int = 2, # Sound/audio settings sound_context_token_id: int = None, sound_context_token: str = "