# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2025 The HuggingFace Inc. team
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
# Adapted from transformers/models/qwen2_vl/image_processing_qwen2_vl.py
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Union

import torch
from torchvision import transforms
from torchvision.transforms.v2 import functional as F
from transformers.image_processing_utils import (
    BatchFeature,
    get_size_dict,
)
from transformers.image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    PILImageResampling,
    SizeDict,
    get_image_size,
)
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.processing_utils import Unpack, VideosKwargs
from transformers.utils import TensorType
from transformers.utils.import_utils import requires
from transformers.video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
from transformers.video_utils import group_videos_by_shape, reorder_videos


class OpenPanguVLVideoProcessorInitKwargs(VideosKwargs):
    # Optional overrides of the class-level defaults on OpenPanguVLVideoProcessor;
    # see that class for the meaning of each attribute.
    min_pixels: Optional[int]
    max_pixels: Optional[int]
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    merge_size: Optional[int]
    any_res_dynamic_video_pixels: bool
    any_res_min_video_total_pixels: Optional[int]
    any_res_max_video_total_pixels: Optional[int]
    any_res_min_frame_pixels: Optional[int]
    any_res_max_frame_pixels: Optional[int]


@requires(backends=("torchvision",))
class OpenPanguVLVideoProcessor(BaseVideoProcessor):
    """Video processor for OpenPangu-VL.

    Frames are resized with ``smart_resize`` so both spatial dimensions are
    multiples of ``patch_size * merge_size``, rescaled and normalized with the
    OpenAI CLIP statistics, then flattened into patch sequences. The model
    inputs produced are ``pixel_values_videos`` (all patches of all videos
    concatenated along dim 0) and ``video_grid_thw`` (one
    ``[grid_t, grid_h, grid_w]`` row per video).
    """

    resample = PILImageResampling.BICUBIC
    size = {"height": 448, "width": 448}
    do_resize = True
    do_rescale = True
    rescale_factor = 1 / 255
    do_normalize = True
    do_convert_rgb = True
    min_pixels = 56 * 56
    max_pixels = 28 * 28 * 1280
    patch_size = 14
    temporal_patch_size = 1
    merge_size = 2
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    # "any-res" budgets: a clip-level total-pixel budget that _preprocess splits
    # evenly across frames, clamped to per-frame lower/upper bounds.
    any_res_dynamic_video_pixels = True
    any_res_min_video_total_pixels = 448 * 448 * 32
    any_res_max_video_total_pixels = 448 * 448 * 32
    any_res_min_frame_pixels = 56 * 56
    any_res_max_frame_pixels = 28 * 28 * 1280
    valid_kwargs = OpenPanguVLVideoProcessorInitKwargs
    model_input_names = ["pixel_values_videos", "video_grid_thw"]
    dtype = torch.bfloat16

    def __init__(self, **kwargs: Unpack[OpenPanguVLVideoProcessorInitKwargs]):
        super().__init__(**kwargs)

    def _preprocess(
        self,
        videos: List["torch.Tensor"],
        do_convert_rgb: bool,
        do_resize: bool,
        size: SizeDict,
        interpolation: Optional["F.InterpolationMode"],
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        return_tensors: Optional[Union[str, TensorType]] = None,
        patch_size: Optional[int] = None,
        temporal_patch_size: Optional[int] = None,
        merge_size: Optional[int] = None,
        **kwargs,
    ):
        """Resize, normalize and patchify a batch of videos.

        Args:
            videos: list of frame tensors, each ``(num_frames, channels, height, width)``.
            do_resize / do_rescale / do_normalize / image_mean / image_std / ...:
                standard BaseVideoProcessor preprocessing switches.
            return_tensors: tensor type forwarded to ``BatchFeature``.
            patch_size / merge_size: spatial patch size and patch-merge factor.
            temporal_patch_size: accepted for interface compatibility but
                overridden by the class attribute (see NOTE below).

        Returns:
            ``BatchFeature`` with ``pixel_values_videos`` of shape
            ``(total_patches, channel * temporal_patch_size * patch_size**2)``
            and ``video_grid_thw`` of shape ``(num_videos, 3)``.

        Raises:
            ValueError: if ``videos`` contains no frames.
        """
        # NOTE(review): the class attribute deliberately overrides the argument —
        # the frame filling below must stay in sync with it. Confirm no caller
        # relies on passing a different temporal_patch_size here.
        temporal_patch_size = OpenPanguVLVideoProcessor.temporal_patch_size

        # Derive per-frame pixel bounds from the whole-clip pixel budget.
        num_frames = sum(video.shape[0] for video in videos)
        if num_frames == 0:
            # Explicit error instead of a ZeroDivisionError in the budget split below.
            raise ValueError("`videos` must contain at least one frame.")
        if not self.any_res_dynamic_video_pixels:
            self.min_pixels = self.any_res_min_frame_pixels
            self.max_pixels = self.any_res_max_frame_pixels
        else:
            # Dynamic video pixels: split the clip-level budget evenly across
            # frames, clamped to [any_res_min_frame_pixels, any_res_max_frame_pixels].
            self.min_pixels = max(
                min(self.any_res_min_video_total_pixels // num_frames, self.any_res_max_frame_pixels),
                self.any_res_min_frame_pixels,
            )
            self.max_pixels = max(
                min(self.any_res_max_video_total_pixels // num_frames, self.any_res_max_frame_pixels),
                self.any_res_min_frame_pixels,
            )

        # Group videos by size for batched resizing.
        grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
        resized_videos_grouped = {}
        for shape, stacked_videos in grouped_videos.items():
            height, width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)
            resized_height, resized_width = height, width
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=patch_size * merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                stacked_videos = F.resize(
                    stacked_videos, size=(resized_height, resized_width), interpolation=interpolation
                )
            resized_videos_grouped[shape] = stacked_videos
        resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index)

        # Regroup for further processing — needed when do_resize is False, or
        # when resizing produced videos of different sizes.
        grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos)
        processed_videos_grouped = {}
        processed_video_grid_thw = {}
        # Hoisted out of the loop: mean/std are constant across all groups.
        normalize = transforms.Normalize(mean=image_mean, std=image_std)
        for shape, stacked_videos in grouped_videos.items():
            resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)

            # Rescale and normalize.
            stacked_videos = torch.mul(stacked_videos, rescale_factor)
            stacked_videos = normalize(stacked_videos)

            # Repeat frames so the temporal axis is divisible by
            # temporal_patch_size without mixing distinct frames in one block.
            stacked_videos = torch.repeat_interleave(stacked_videos, repeats=temporal_patch_size, dim=1)
            batch_size, grid_t, channel = stacked_videos.shape[:3]
            grid_t, grid_h, grid_w = (
                grid_t // temporal_patch_size,
                resized_height // patch_size,
                resized_width // patch_size,
            )
            # Split into (t, h, w) patch blocks so that merge_size x merge_size
            # spatial neighborhoods end up contiguous in the patch sequence.
            stacked_videos = stacked_videos.view(
                batch_size,
                grid_t,
                temporal_patch_size,
                channel,
                grid_h // merge_size,
                merge_size,
                patch_size,
                grid_w // merge_size,
                merge_size,
                patch_size,
            )
            stacked_videos = stacked_videos.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
            processed_stacked_videos = stacked_videos.reshape(
                batch_size,
                grid_t * grid_h * grid_w,
                channel * temporal_patch_size * patch_size * patch_size,
            )
            processed_videos_grouped[shape] = processed_stacked_videos
            processed_video_grid_thw[shape] = [[grid_t, grid_h, grid_w]] * batch_size

        processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
        processed_video_grid_thw = reorder_videos(processed_video_grid_thw, grouped_videos_index)
        # NOTE(review): cast deliberately uses the class-level dtype, not a
        # possible per-instance override — confirm this is intended.
        pixel_values_videos = torch.cat(processed_videos, dim=0).to(OpenPanguVLVideoProcessor.dtype)
        video_grid_thw = torch.tensor(processed_video_grid_thw)
        return BatchFeature(
            data={"pixel_values_videos": pixel_values_videos, "video_grid_thw": video_grid_thw},
            tensor_type=return_tensors,
        )


__all__ = ["OpenPanguVLVideoProcessor"]