# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Optional, Union, List import math import numpy as np import torch from transformers.feature_extraction_utils import BatchFeature from transformers.image_utils import ImageInput from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs from transformers.tokenization_utils_base import PreTokenizedInput, TextInput from transformers.video_utils import VideoInput # Audio input type - can be file paths, numpy arrays, or torch tensors AudioInput = Union[str, np.ndarray, torch.Tensor, List[str], List[np.ndarray], List[torch.Tensor]] class NemotronH_Nano_Omni_Reasoning_V3ImagesKwargs(ImagesKwargs): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] temporal_patch_size: Optional[int] merge_size: Optional[int] class NemotronH_Nano_Omni_Reasoning_V3AudioKwargs(ProcessingKwargs, total=False): sampling_rate: Optional[int] class NemotronH_Nano_Omni_Reasoning_V3ProcessorKwargs(ProcessingKwargs, total=False): images_kwargs: NemotronH_Nano_Omni_Reasoning_V3ImagesKwargs videos_kwargs: VideosKwargs audio_kwargs: NemotronH_Nano_Omni_Reasoning_V3AudioKwargs _defaults = { "text_kwargs": { "padding": False, }, } class NemotronH_Nano_Omni_Reasoning_V3Processor(ProcessorMixin): r""" Constructs a Nemotron-3-Nano-Omni-30B-A3B-Reasoning processor which wraps an image processor, audio feature extractor, and a tokenizer into a single processor. [`NemotronH_Nano_Omni_Reasoning_V3Processor`] offers all the functionalities of the image processor, audio processor, and tokenizer. See the [`~NemotronH_Nano_Omni_Reasoning_V3Processor.__call__`] and [`~NemotronH_Nano_Omni_Reasoning_V3Processor.decode`] for more information. Args: image_processor ([`AutoImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`AutoTokenizer`], *optional*): The tokenizer is a required input. chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. audio_sampling_rate (`int`, *optional*): Sampling rate for audio processing (default: 16000). audio_subsampling_factor (`int`, *optional*): Subsampling factor for audio encoder (default: 8). audio_hop_length (`int`, *optional*): Hop length in samples for feature extraction (default: 160). """ attributes = ["image_processor", "tokenizer"] image_processor_class = "AutoImageProcessor" video_processor_class = "AutoVideoProcessor" tokenizer_class = ("AutoTokenizer") def __init__( self, image_processor=None, tokenizer=None, chat_template=None, audio_sampling_rate: int = 16000, audio_subsampling_factor: int = 8, audio_hop_length: int = 160, video_temporal_patch_dim: int = 2, **kwargs ): # Number of frames collapsed into a single temporal patch by the model's `video_embedder`. # The `