# openPangu-VL-7B — videoprocessor_openpangu_vl.py
# Uploaded by wangrongsheng via huggingface_hub (commit 1688f96, verified)
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2025 The HuggingFace Inc. team
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
# Adapted from transformers/models/qwen2_vl/image_processing_qwen2_vl.py
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Union
import torch
from transformers.image_processing_utils import (
BatchFeature,
get_size_dict,
)
from transformers.image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
SizeDict,
get_image_size,
)
from transformers.processing_utils import Unpack, VideosKwargs
from transformers.utils import TensorType
from transformers.utils.import_utils import requires
from transformers.video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
from transformers.video_utils import group_videos_by_shape, reorder_videos
from transformers.image_utils import PILImageResampling
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from torchvision import transforms
from torchvision.transforms.v2 import functional as F
class OpenPanguVLVideoProcessorInitKwargs(VideosKwargs):
    """Init-time keyword arguments accepted by ``OpenPanguVLVideoProcessor``.

    The pixel-count fields bound the resolution chosen by ``smart_resize``;
    the ``any_res_*`` fields additionally budget pixels across all frames of
    a video (see ``OpenPanguVLVideoProcessor._preprocess``).
    """

    # Per-frame pixel-count bounds for resizing.
    min_pixels: Optional[int]
    max_pixels: Optional[int]
    # Spatial patch edge, temporal patch depth, and patch-merge factor.
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    merge_size: Optional[int]
    # Whether to derive the per-frame pixel budget from a whole-video budget.
    # Annotated Optional for consistency with the sibling fields: an omitted /
    # None value falls back to the processor's class-level default.
    any_res_dynamic_video_pixels: Optional[bool]
    # Whole-video total pixel budget (split across frames when dynamic).
    any_res_min_video_total_pixels: Optional[int]
    any_res_max_video_total_pixels: Optional[int]
    # Hard per-frame clamps applied after the budget split.
    any_res_min_frame_pixels: Optional[int]
    any_res_max_frame_pixels: Optional[int]
@requires(backends=("torchvision",))
class OpenPanguVLVideoProcessor(BaseVideoProcessor):
    """Video processor for the OpenPangu-VL model.

    Each frame is resized to a patch-aligned resolution selected by
    ``smart_resize``, rescaled to [0, 1], normalized with the OpenAI CLIP
    statistics, and flattened into patch feature vectors. The processor
    returns ``pixel_values_videos`` (patch sequences for all videos,
    concatenated along dim 0) and ``video_grid_thw`` (the temporal/height/
    width patch-grid size per video). Adapted from the Qwen2-VL image
    processor.
    """

    # Bicubic interpolation for frame resizing.
    resample = PILImageResampling.BICUBIC
    size = {"height": 448, "width": 448}
    do_resize = True
    do_rescale = True
    # 1/255 maps uint8 pixel values into [0, 1] before normalization.
    rescale_factor = 1 / 255
    do_normalize = True
    do_convert_rgb = True
    # Default per-frame pixel-count bounds fed to smart_resize; note that
    # _preprocess overwrites these per call from the any_res_* settings below.
    min_pixels = 56 * 56
    max_pixels = 28 * 28 * 1280
    # Spatial patch edge in pixels.
    patch_size = 14
    # Frames folded into one temporal patch (1 = no temporal merging;
    # _preprocess always uses this class value, ignoring per-call overrides).
    temporal_patch_size = 1
    # Edge length (in patches) of the patch groups kept adjacent for merging.
    merge_size = 2
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    # When True, split a whole-video pixel budget evenly across frames;
    # when False, use the fixed per-frame bounds directly.
    any_res_dynamic_video_pixels = True
    any_res_min_video_total_pixels = 448 * 448 * 32
    any_res_max_video_total_pixels = 448 * 448 * 32
    any_res_min_frame_pixels = 56 * 56
    any_res_max_frame_pixels = 28 * 28 * 1280
    valid_kwargs = OpenPanguVLVideoProcessorInitKwargs
    model_input_names = ["pixel_values_videos", "video_grid_thw"]
    # Output dtype of pixel_values_videos (cast applied at the very end).
    dtype = torch.bfloat16

    def __init__(self, **kwargs: Unpack[OpenPanguVLVideoProcessorInitKwargs]):
        """Forward all keyword arguments to BaseVideoProcessor unchanged."""
        super().__init__(**kwargs)

    def _preprocess(
        self,
        videos: List["torch.Tensor"],
        do_convert_rgb: bool,
        do_resize: bool,
        size: SizeDict,
        interpolation: Optional["F.InterpolationMode"],
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        return_tensors: Optional[Union[str, TensorType]] = None,
        patch_size: Optional[int] = None,
        temporal_patch_size: Optional[int] = None,
        merge_size: Optional[int] = None,
        **kwargs,
    ):
        """Resize, normalize, and patchify a batch of videos.

        Args:
            videos: List of video tensors; each is indexed by frame on dim 0
                (``video.shape[0]`` is used as the frame count) and appears to
                be laid out (num_frames, channels, height, width) — TODO
                confirm against BaseVideoProcessor's calling convention.
            do_resize / do_rescale / do_normalize / interpolation /
                rescale_factor / image_mean / image_std: standard processing
                flags as in the base class.
            return_tensors: Optional tensor type for the returned BatchFeature.
            patch_size / merge_size: patch geometry used to carve frames.
            temporal_patch_size: present for interface compatibility but
                ignored (see the NOTE below).

        Returns:
            BatchFeature with ``pixel_values_videos`` (bfloat16 patch features
            for all videos, concatenated along dim 0) and ``video_grid_thw``
            (tensor of per-video [grid_t, grid_h, grid_w]).
        """
        # NOTE(review): the temporal_patch_size argument is unconditionally
        # replaced by the class attribute (1), so per-call overrides are
        # ignored — confirm this is intentional.
        temporal_patch_size = OpenPanguVLVideoProcessor.temporal_patch_size
        # Recalculate the maximum and minimum resolution of a single frame.
        num_frames = sum(video.shape[0] for video in videos)
        # NOTE(review): self.min_pixels / self.max_pixels are mutated on every
        # call, which makes preprocessing stateful and not thread-safe.
        if not self.any_res_dynamic_video_pixels:
            self.min_pixels = self.any_res_min_frame_pixels
            self.max_pixels = self.any_res_max_frame_pixels
        else:
            # Dynamic video pixels: divide the whole-video pixel budget by the
            # frame count, then clamp into the per-frame bounds.
            self.min_pixels = max(min(self.any_res_min_video_total_pixels // num_frames, \
                self.any_res_max_frame_pixels), self.any_res_min_frame_pixels)
            self.max_pixels = max(min(self.any_res_max_video_total_pixels // num_frames, \
                self.any_res_max_frame_pixels), self.any_res_min_frame_pixels)
        # Group videos by size for batched resizing.
        grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
        resized_videos_grouped = {}
        for shape, stacked_videos in grouped_videos.items():
            height, width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)
            resized_height, resized_width = height, width
            if do_resize:
                # Pick a resolution divisible by patch_size * merge_size that
                # fits the pixel budget computed above.
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=patch_size * merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                stacked_videos = F.resize(
                    stacked_videos, size=(resized_height, resized_width), interpolation=interpolation
                )
            resized_videos_grouped[shape] = stacked_videos
        resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index)
        # Group videos by size for further processing.
        # Needed in case do_resize is False, or resize returns videos with different sizes.
        grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos)
        processed_videos_grouped = {}
        processed_video_grid_thw = {}
        for shape, stacked_videos in grouped_videos.items():
            resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)
            # Rescale into [0, 1] and normalize with the CLIP mean/std.
            stacked_videos = torch.mul(stacked_videos, rescale_factor)
            stacked_videos = transforms.Normalize(mean=image_mean, std=image_std)(stacked_videos)
            # Duplicate each frame temporal_patch_size times so the frame count
            # divides evenly into temporal patches (a no-op when it is 1).
            stacked_videos = torch.repeat_interleave(stacked_videos, repeats=temporal_patch_size, dim=1)
            batch_size, grid_t, channel = stacked_videos.shape[:3]
            # Patch-grid extents along the temporal, height, and width axes.
            grid_t, grid_h, grid_w = (
                grid_t // temporal_patch_size,
                resized_height // patch_size,
                resized_width // patch_size,
            )
            # Carve each frame into (merge_size x merge_size) groups of
            # (patch_size x patch_size) spatial patches.
            stacked_videos = stacked_videos.view(
                batch_size,
                grid_t,
                temporal_patch_size,
                channel,
                grid_h // merge_size,
                merge_size,
                patch_size,
                grid_w // merge_size,
                merge_size,
                patch_size,
            )
            # Bring merged patches adjacent in memory, then flatten each patch
            # into one feature vector of length C * t_patch * patch_size^2.
            stacked_videos = stacked_videos.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
            processed_stacked_videos = stacked_videos.reshape(
                batch_size,
                grid_t * grid_h * grid_w,
                channel * temporal_patch_size * patch_size * patch_size,
            )
            processed_videos_grouped[shape] = processed_stacked_videos
            # Every video in a same-shape group shares the same (t, h, w) grid.
            processed_video_grid_thw[shape] = [[grid_t, grid_h, grid_w]] * batch_size
        processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
        processed_video_grid_thw = reorder_videos(processed_video_grid_thw, grouped_videos_index)
        # Concatenate all videos' patch sequences and cast to the model dtype.
        pixel_values_videos = torch.cat(processed_videos, dim=0).to(OpenPanguVLVideoProcessor.dtype)
        video_grid_thw = torch.tensor(processed_video_grid_thw)
        return BatchFeature(
            data={"pixel_values_videos": pixel_values_videos, "video_grid_thw": video_grid_thw},
            tensor_type=return_tensors,
        )
# Explicit public API of this module.
__all__ = ["OpenPanguVLVideoProcessor"]