SDAR-VL-Instruct-4B / modeling_llava_onevision.py

Upload model files: SDAR-VL-Instruct-4B

3a1fbb9 verified 4 months ago

70.5 kB

	# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
	# This file was automatically generated from src/transformers/models/llava_onevision/modular_llava_onevision.py.
	# Do NOT edit this file manually as any edits will be overwritten by the generation of
	# the file from the modular. If any change should be done, please apply the change to the
	# modular_llava_onevision.py file directly. One of our CI enforces this.
	# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
	# coding=utf-8
	# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import math
	from dataclasses import dataclass
	from typing import List, Optional, Tuple, Union

	import numpy as np
	import torch
	from torch import nn
	import torch.distributed as dist

	from transformers.activations import ACT2FN
	from transformers.generation import GenerationMixin
	from transformers.image_processing_utils import select_best_resolution
	from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
	from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
	from transformers.modeling_utils import PreTrainedModel
	from transformers.processing_utils import Unpack
	from transformers.utils import (
	LossKwargs,
	auto_docstring,
	can_return_tuple,
	is_torchdynamo_compiling,
	logging,
	)
	from transformers.models.auto import AutoModel
	from torch.nn.attention.flex_attention import create_block_mask
	from .configuration_llava_onevision import LlavaOnevisionConfig
	from .fused_linear_diffusion_cross_entropy import FusedLinearDiffusionCrossEntropyLoss

	logger = logging.get_logger(__name__)


	@dataclass
	class LlavaOnevisionModelOutputWithPast(BaseModelOutputWithPast):
	    """
	    Output container of the Llava-Onevision base model.

	    It carries the fields inherited from `BaseModelOutputWithPast` plus the projected vision features and a few
	    extra tensors declared by this file.

	    Args:
	        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Hidden states produced by the last layer of the model.
	        past_key_values (`tuple(tuple(torch.FloatTensor))`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            One tuple per layer (`config.n_layers` in total), each holding 2 tensors of shape
	            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

	            These are the cached self-attention keys and values; feed them back through the `past_key_values`
	            input to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One tensor of shape `(batch_size, sequence_length, hidden_size)` per layer output, preceded by the
	            embedding output when the model has an embedding layer.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            One tensor of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer: the
	            post-softmax attention weights used for the weighted average in the self-attention heads.
	        image_hidden_states (`torch.FloatTensor`, optional):
	            Tensor of size `(batch_size, num_images, sequence_length, hidden_size)`: image features produced by
	            the vision encoder after projecting its last hidden state.
	        video_hidden_states (`torch.FloatTensor`, optional):
	            Tensor of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`: video features
	            produced by the vision encoder after projecting its last hidden state.
	        logits_to_keep_half (`torch.BoolTensor`, optional):
	            Not described elsewhere in this file.
	            NOTE(review): presumably a boolean selector over positions whose logits are kept - confirm against
	            the forward pass that fills it.
	        logits_to_keep (`torch.BoolTensor`, optional):
	            Not described elsewhere in this file.
	            NOTE(review): presumably a boolean selector over positions whose logits are kept - confirm against
	            the forward pass that fills it.
	        p_mask (`torch.FloatTensor`, optional):
	            Not described elsewhere in this file.
	            NOTE(review): presumably the masking probabilities returned by `forward_add_noise_packed` - confirm.
	    """

	    image_hidden_states: Optional[torch.FloatTensor] = None
	    video_hidden_states: Optional[torch.FloatTensor] = None
	    logits_to_keep_half: Optional[torch.BoolTensor] = None
	    logits_to_keep: Optional[torch.BoolTensor] = None
	    p_mask: Optional[torch.FloatTensor] = None



	@dataclass
	class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
	    """
	    Output container of the LlavaOnevision causal (autoregressive) language model.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Language modeling loss (for next-token prediction).
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Scores of the language modeling head for every vocabulary token, before SoftMax.
	        past_key_values (`tuple(tuple(torch.FloatTensor))`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            One tuple per layer (`config.n_layers` in total), each holding 2 tensors of shape
	            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

	            These are the cached self-attention keys and values; feed them back through the `past_key_values`
	            input to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            One tensor of shape `(batch_size, sequence_length, hidden_size)` per layer output, preceded by the
	            embedding output when the model has an embedding layer.
	        attentions (`tuple(torch.FloatTensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            One tensor of shape `(batch_size, num_heads, sequence_length, sequence_length)` per layer: the
	            post-softmax attention weights used for the weighted average in the self-attention heads.
	        image_hidden_states (`torch.FloatTensor`, optional):
	            Tensor of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`: image features
	            produced by the vision encoder after projecting its last hidden state.
	        video_hidden_states (`torch.FloatTensor`, optional):
	            Tensor of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`: video features
	            produced by the vision encoder after projecting its last hidden state.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: Optional[torch.FloatTensor] = None
	    past_key_values: Optional[List[torch.FloatTensor]] = None
	    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
	    attentions: Optional[Tuple[torch.FloatTensor]] = None
	    image_hidden_states: Optional[torch.FloatTensor] = None
	    video_hidden_states: Optional[torch.FloatTensor] = None


	class LlavaOnevisionPooler(nn.Module):
	    """
	    Spatially pools a square grid of vision tokens.

	    The operator is selected by `config.spatial_pool_mode` (`"average"`, `"max"` or `"conv"`) and uses
	    `config.spatial_pool_stride` as both kernel size and stride. For `"conv"` the output channel count is
	    `config.spatial_pool_out_channels` when that attribute exists, else `config.vision_config.hidden_size`.

	    Raises:
	        ValueError: if `config.spatial_pool_mode` is not one of the three supported modes.
	    """

	    def __init__(self, config):
	        super().__init__()

	        pool_mode = config.spatial_pool_mode
	        pool_stride = config.spatial_pool_stride
	        conv_out_channels = getattr(config, "spatial_pool_out_channels", config.vision_config.hidden_size)
	        # Despite its name this is a patch count: (image_size // patch_size) squared.
	        patches_per_side = config.vision_config.image_size // config.vision_config.patch_size
	        self.image_size = patches_per_side**2

	        if pool_mode == "conv":
	            self.pool = nn.Conv2d(
	                in_channels=config.vision_config.hidden_size,
	                out_channels=conv_out_channels,
	                kernel_size=pool_stride,
	                stride=pool_stride,
	            )
	        elif pool_mode == "max":
	            self.pool = nn.MaxPool2d(kernel_size=pool_stride, stride=pool_stride)
	        elif pool_mode == "average":
	            self.pool = nn.AvgPool2d(kernel_size=pool_stride, stride=pool_stride)
	        else:
	            raise ValueError(f"Unknown pooling mode: {pool_mode}. Has to be one of [`average`, `max`, `conv`]")

	    def forward(self, image_features):
	        """
	        Pool `image_features` of shape `(batch, tokens, dim)` and return `(batch, pooled_tokens, dim)`.

	        The token axis is reshaped into a square grid, so `tokens` is expected to be a perfect square.
	        """
	        # Multiplying and then floor-dividing by the same non-zero value is a no-op; both steps are kept so the
	        # arithmetic matches the upstream implementation exactly.
	        grid_side = int(math.sqrt(image_features.shape[1] * self.image_size // self.image_size))
	        grid_side = int(grid_side * self.image_size // self.image_size)

	        batch_size, _, dim = image_features.shape
	        # (batch, tokens, dim) -> (batch, dim, side, side), the layout expected by the 2D pooling operators.
	        grid = image_features.view(batch_size, grid_side, grid_side, dim).permute(0, 3, 1, 2)
	        pooled = self.pool(grid)

	        # Back to a token sequence: (batch, dim, h, w) -> (batch, h * w, dim).
	        return pooled.flatten(2).transpose(1, 2).contiguous()


	class LlavaOnevisionMultiModalProjector(nn.Module):
	    """
	    Two-layer MLP that maps vision features to the text model's hidden size.

	    The input width is `vision_config.hidden_size` times the number of selected vision feature layers, because
	    the features of several layers are concatenated along the last dimension before projection.
	    """

	    def __init__(self, config: LlavaOnevisionConfig):
	        super().__init__()
	        feature_layers = config.vision_feature_layer
	        # A single int selects one layer; otherwise one slice of width hidden_size per listed layer.
	        layer_count = 1 if isinstance(feature_layers, int) else len(feature_layers)

	        text_width = config.text_config.hidden_size
	        use_bias = config.multimodal_projector_bias

	        self.linear_1 = nn.Linear(config.vision_config.hidden_size * layer_count, text_width, bias=use_bias)
	        self.act = ACT2FN[config.projector_hidden_act]
	        self.linear_2 = nn.Linear(text_width, text_width, bias=use_bias)

	    def forward(self, image_features):
	        """Apply linear -> activation -> linear to `image_features`."""
	        return self.linear_2(self.act(self.linear_1(image_features)))


	def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
	    """
	    Compute how many patches the best-fitting any-resolution grid has along each side.

	    Args:
	        image_size (`tuple`, `list`, `torch.Tensor` or `np.ndarray`):
	            Size of the input image. It is forwarded unchanged to `select_best_resolution`; tensors and arrays
	            are converted to a plain list first.
	            NOTE(review): callers in this file pass `(height, width)` rows of `image_sizes` - confirm the
	            expected order against `select_best_resolution`.
	        grid_pinpoints (`List`):
	            Candidate resolutions, each a tuple or list of the form `(height, width)`.
	        patch_size (`int`):
	            The size of each image patch.

	    Returns:
	        tuple: `(height // patch_size, width // patch_size)` of the selected resolution.

	    Raises:
	        TypeError: if `grid_pinpoints` is not a list, or `image_size` is none of list, tuple, tensor, ndarray.
	    """
	    if not isinstance(grid_pinpoints, list):
	        raise TypeError("grid_pinpoints should be a list of tuples or lists")

	    # Important: a tensor/array size must become a plain Python list, otherwise the computation goes wrong.
	    if isinstance(image_size, (torch.Tensor, np.ndarray)):
	        image_size = image_size.tolist()
	    elif not isinstance(image_size, (list, tuple)):
	        raise TypeError(
	            f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor"
	        )

	    best_height, best_width = select_best_resolution(image_size, grid_pinpoints)
	    return best_height // patch_size, best_width // patch_size


	def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
	    """
	    Count the patches an any-resolution image is split into, including the base patch.

	    Args:
	        image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
	            Size of the input image; forwarded to `select_best_resolution` after converting tensors and arrays
	            to a plain list.
	            NOTE(review): the original docstring was itself unsure whether this is `(height, width)` - confirm.
	        grid_pinpoints (`List`):
	            Candidate resolutions, each a tuple or list of the form `(height, width)`.
	        patch_size (`int`):
	            The size of each image patch.

	    Returns:
	        int: number of `patch_size`-strided tiles covering the selected resolution, plus one for the base patch.

	    Raises:
	        TypeError: if `grid_pinpoints` is not a list, or `image_size` is none of list, tuple, tensor, ndarray.
	    """
	    if not isinstance(grid_pinpoints, list):
	        raise TypeError("grid_pinpoints should be a list of tuples or lists")

	    # Important: a tensor/array size must become a plain Python list, otherwise the computation goes wrong.
	    if isinstance(image_size, (torch.Tensor, np.ndarray)):
	        image_size = image_size.tolist()
	    elif not isinstance(image_size, (list, tuple)):
	        raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}")

	    height, width = select_best_resolution(image_size, grid_pinpoints)

	    # One tile per stride step along each axis; partially covered tiles at the border count as well.
	    tiles_down = len(range(0, height, patch_size))
	    tiles_across = len(range(0, width, patch_size))

	    # The extra 1 accounts for the base patch.
	    return tiles_down * tiles_across + 1


	def unpad_image(tensor, original_size):
	    """
	    Crop away the symmetric padding of a padded and resized image tensor.

	    Args:
	        tensor (`torch.Tensor`):
	            The image tensor, assumed to be of shape (num_channels, height, width).
	        original_size (`tuple`, `list`, `torch.Tensor` or `np.ndarray`):
	            The original size of the image (height, width).

	    Returns:
	        `torch.Tensor`: The unpadded image tensor (a slice of `tensor`).

	    Raises:
	        TypeError: if `original_size` is none of list, tuple, tensor, ndarray.
	    """
	    if isinstance(original_size, (torch.Tensor, np.ndarray)):
	        original_size = original_size.tolist()
	    elif not isinstance(original_size, (list, tuple)):
	        raise TypeError(
	            f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
	        )

	    source_h, source_w = original_size
	    padded_h, padded_w = tensor.shape[1:]

	    if source_w / source_h > padded_w / padded_h:
	        # The original is relatively wider than the padded tensor: crop rows.
	        scale = padded_w / source_w
	        # Rounding to 7 decimals before truncating guards against float noise just below an integer.
	        content_h = int(round(source_h * scale, 7))
	        margin = (padded_h - content_h) // 2
	        return tensor[:, margin : padded_h - margin, :]

	    # Otherwise crop columns.
	    scale = padded_h / source_h
	    content_w = int(round(source_w * scale, 7))
	    margin = (padded_w - content_w) // 2
	    return tensor[:, :, margin : padded_w - margin]


	@auto_docstring
	class LlavaOnevisionPreTrainedModel(PreTrainedModel):
	    # Configuration class and capability flags declared for the `PreTrainedModel` base class.
	    config_class = LlavaOnevisionConfig
	    base_model_prefix = ""
	    supports_gradient_checkpointing = True
	    _no_split_modules = ["LlamaDecoderLayer"]
	    _skip_keys_device_placement = "past_key_values"
	    _supports_cache_class = True
	    _supports_flash_attn_2 = True
	    _supports_sdpa = True
	    _supports_quantized_cache = True
	    _supports_static_cache = True
	    _supports_attention_backend = True

	    def _init_weights(self, module):
	        # Initialise one sub-module: linear layers get N(0, std) weights and zero bias, the top-level
	        # `LlavaOnevisionModel` gets its `image_newline` drawn from N(0, 1 / sqrt(text hidden size)).
	        # Every other module type is left untouched.
	        text_std = self.config.get_text_config().initializer_range
	        std = getattr(self.config, "initializer_range", text_std)

	        if isinstance(module, nn.Linear):
	            module.weight.data.normal_(mean=0.0, std=std)
	            if module.bias is not None:
	                module.bias.data.zero_()
	            return

	        if isinstance(module, LlavaOnevisionModel):
	            newline_std = 1 / math.sqrt(self.config.text_config.hidden_size)
	            module.image_newline.data.normal_(mean=0.0, std=newline_std)


	def modify_padded_position_ids(position_ids: torch.Tensor) -> torch.Tensor:
	    """
	    Renumber the trailing padding of a packed 1D `position_ids` tensor.

	    Everything after the last non-zero entry is treated as padding and overwritten in place with
	    `0, 1, 2, ...`. When the tensor holds no non-zero entry at all, the whole tensor is overwritten.

	    Args:
	        position_ids: one-dimensional PyTorch tensor.

	    Returns:
	        The same tensor object, modified in place.
	    """
	    total_len = position_ids.size(0)
	    nonzero_at = torch.nonzero(position_ids != 0).squeeze()

	    # Padding begins right after the last non-zero entry, or at index 0 when there is none.
	    tail_start = nonzero_at.max().item() + 1 if nonzero_at.numel() > 0 else 0

	    tail_len = total_len - tail_start
	    if tail_len > 0:
	        position_ids[tail_start:] = torch.arange(
	            tail_len, device=position_ids.device, dtype=position_ids.dtype
	        )

	    return position_ids


	def modify_padded_position_ids_2d(position_ids: torch.LongTensor) -> torch.LongTensor:
	    """
	    Renumber the trailing padding of every row of a batch of packed `position_ids`, fully vectorised.

	    Each row is handled independently: everything after the row's last non-zero entry is replaced by
	    `0, 1, 2, ...`; a row without any non-zero entry is replaced entirely. The input is not modified.

	    Args:
	        position_ids: two-dimensional PyTorch tensor of shape (batch_size, sequence_length).

	    Returns:
	        A new tensor of shape (batch_size, sequence_length) holding the adjusted position ids.

	    Raises:
	        ValueError: if `position_ids` is not two-dimensional.
	    """
	    if position_ids.dim() != 2:
	        raise ValueError(f"Input tensor must be 2D, but got {position_ids.dim()} dimensions.")

	    num_rows, num_cols = position_ids.shape
	    dev = position_ids.device
	    dt = position_ids.dtype

	    columns = torch.arange(num_cols, device=dev, dtype=dt).expand(num_rows, -1)
	    is_nonzero = position_ids != 0

	    # Zeroing the column index wherever the entry is zero makes the row maximum the last non-zero column.
	    last_nonzero_col = (columns * is_nonzero).max(dim=1).values
	    row_has_nonzero = is_nonzero.any(dim=1)
	    tail_start = torch.where(row_has_nonzero, last_nonzero_col + 1, torch.tensor(0, device=dev, dtype=dt))
	    tail_start = tail_start.unsqueeze(1)

	    in_tail = columns >= tail_start
	    return torch.where(in_tail, columns - tail_start, position_ids)


	def calculate_token_nums(position_ids: torch.Tensor):
	    """
	    Compute the length of every packed sequence in each row of a batch.

	    A new sequence starts wherever the position id is 0. A row that does not begin with 0 is treated as
	    starting a sequence at index 0 as well, so the returned lengths of a row always add up to the row length
	    instead of silently dropping the leading tokens.

	    Args:
	        position_ids (torch.Tensor): 2D tensor of shape (batch_size, sequence_length),
	            e.g. tensor([[0,1,2,3,4,0,1,2,3,4,5,0,1,2,3,0,0,0]]).

	    Returns:
	        list[torch.Tensor]: one 1D integer tensor per batch row holding that row's sequence lengths,
	            e.g. [tensor([5, 6, 4, 1, 1, 1])] for the example above.

	    Raises:
	        ValueError: if `position_ids` is not two-dimensional.
	    """
	    if position_ids.dim() != 2:
	        raise ValueError(f"输入必须是 2D Tensor，但得到了 {position_ids.dim()}D")

	    all_lengths = []

	    # Rows are ragged (each may hold a different number of sequences), so the batch axis is looped in Python;
	    # the work inside the loop is vectorised.
	    for pids_row in position_ids:
	        seq_len = pids_row.shape[0]

	        # Sequence starts: every zero position id ...
	        starts_sequence = pids_row == 0
	        # ... plus index 0 itself, so tokens ahead of the first zero are still counted.
	        if seq_len > 0:
	            starts_sequence[0] = True
	        start_indices = torch.nonzero(starts_sequence).flatten()

	        # Append the row length as a final split point so the last sequence gets a length too.
	        split_points = torch.cat(
	            [start_indices, torch.tensor([seq_len], device=pids_row.device, dtype=start_indices.dtype)]
	        )

	        # Differences between neighbouring split points are the sequence lengths.
	        all_lengths.append(torch.diff(split_points))

	    return all_lengths


	# def forward_add_noise_packed(
	# inputs_embeds: torch.Tensor,
	# num_tokens: torch.Tensor,
	# prompt_mask: torch.Tensor,
	# mask_embed: torch.Tensor,
	# eps: float = 1e-3,
	# max_tries: int = 10,
	# ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
	# """
	# 为单个打包（packed）序列的 embedding 添加噪声，该序列的形状带有 batch 维度。

	# 函数为每个逻辑样本（在 inputs_embeds 中拼接）生成一个随机噪声率，
	# 并随机将一部分 token 的 embedding 替换为 mask_embed。
	# 这个过程会避开被 prompt_mask 标记的位置。

	# Args:
	# inputs_embeds (torch.Tensor): 输入的 embedding 张量，形状为 (1, total_tokens, embed_dim)。
	# num_tokens (torch.Tensor): 1D 张量，记录了每个逻辑样本的长度。
	# 例如 [len_sample1, len_sample2, ...]。
	# prompt_mask (torch.Tensor): 布尔型张量，形状为 (1, total_tokens)，
	# 值为 True 的位置表示是 prompt，不应添加噪声。
	# mask_embed (torch.Tensor): 用于替换的 mask embedding，形状为 (embed_dim,) 或 (1, embed_dim)。
	# eps (float): 微小值，用于防止噪声率 t 恰好为 0，确保 p_mask > 0。
	# max_tries (int): 为确保至少一个非 prompt token 被 mask，尝试的最大次数。

	# Returns:
	# Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
	# - noisy_embeds (torch.Tensor): 添加噪声后的 embedding 张量，形状为 (1, total_tokens, embed_dim)。
	# - final_masked_indices (torch.Tensor): 布尔型张量，标记了哪些位置被实际 mask 了，形状为 (1, total_tokens)。
	# - p_mask_per_sample (torch.Tensor): 每个逻辑样本实际使用的噪声率，形状为 (num_samples, )。
	# """
	# # 1. 验证和获取形状
	# bsz, total_tokens, embed_dim = inputs_embeds.shape
	# assert bsz == 1, f"此函数设计用于处理 bsz=1 的打包序列，但收到了 bsz={bsz}"

	# num_samples = len(num_tokens)
	# assert total_tokens == torch.sum(num_tokens), "num_tokens 之和与 inputs_embeds 的总长度不匹配"
	# assert prompt_mask.shape == (bsz, total_tokens), f"prompt_mask 形状不匹配, 期望 {(bsz, total_tokens)}, 得到 {prompt_mask.shape}"
	# assert mask_embed.dim() == 1 or mask_embed.shape[-1] == embed_dim, "mask_embed 形状不匹配"

	# device = inputs_embeds.device

	# # 调整 mask_embed 形状以便广播: (dim,) -> (1, 1, dim)
	# mask_embed = mask_embed.view(1, 1, embed_dim)

	# # --- 确定可以被 mask 的位置 ---
	# eligible_for_masking = ~prompt_mask

	# # 如果没有任何 token 可以被 mask，直接返回原始输入
	# if not eligible_for_masking.any():
	# return (
	# inputs_embeds,
	# torch.zeros_like(prompt_mask, dtype=torch.bool),
	# torch.full((num_samples,), eps, device=device)
	# )

	# # 2. 生成噪声率和 mask，尝试几次以确保至少 mask 一个 token
	# final_masked_indices = torch.zeros_like(prompt_mask, dtype=torch.bool)

	# for _ in range(max_tries):
	# # 为每个逻辑样本生成一个独立的随机噪声率 t in [0, 1]
	# t = torch.rand(num_samples, device=device) # shape: (num_samples,)
	# p_mask_per_sample = (1 - eps) * t + eps

	# # 将每个样本的噪声率扩展到其所有 token 上
	# p_mask_per_token_1d = torch.repeat_interleave(p_mask_per_sample, num_tokens) # shape: (total_tokens,)
	# p_mask_per_token = p_mask_per_token_1d.unsqueeze(0) # shape: (1, total_tokens)

	# # 生成随机数并根据 p_mask 创建初步的 mask
	# masked_indices = torch.rand_like(p_mask_per_token) < p_mask_per_token # shape: (1, total_tokens)

	# # 应用约束：只在允许的位置进行 mask
	# final_masked_indices = masked_indices & eligible_for_masking

	# if final_masked_indices.any():
	# break

	# # 3. 根据最终的 mask 生成带噪声的 embedding
	# # final_masked_indices 是 (1, total_tokens)，需要扩展到 (1, total_tokens, 1)
	# # 以便和 (1, total_tokens, embed_dim) 的张量在 torch.where 中正确广播
	# noisy_embeds = torch.where(
	# final_masked_indices.unsqueeze(-1),
	# mask_embed,
	# inputs_embeds
	# )

	# return noisy_embeds, final_masked_indices, p_mask_per_token[final_masked_indices]

	def forward_add_noise_packed(
	    inputs_embeds: torch.Tensor,
	    num_tokens_list: List[torch.Tensor],
	    prompt_mask: torch.Tensor,
	    mask_embed: torch.Tensor,
	    eps: float = 1e-3,
	    max_tries: int = 10,
	) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
	    """
	    Add masking noise to the embeddings of a batch of packed sequences.

	    For every logical sample (several may be concatenated inside one batch row) an independent masking
	    probability is drawn, and the embeddings of randomly chosen tokens are replaced by `mask_embed`.
	    Positions flagged in `prompt_mask` are never replaced.

	    Args:
	        inputs_embeds (torch.Tensor):
	            Input embeddings of shape (bsz, total_tokens, embed_dim).
	        num_tokens_list (List[torch.Tensor]):
	            List of length bsz; entry i holds the lengths of the logical samples packed into batch row i,
	            e.g. [tensor([len1, len2]), tensor([len3, len4, len5])]. Each entry must sum to total_tokens.
	        prompt_mask (torch.Tensor):
	            Boolean tensor of shape (bsz, total_tokens); True marks prompt positions that must stay clean.
	        mask_embed (torch.Tensor):
	            Replacement embedding with embed_dim elements, e.g. shape (embed_dim,) or (1, embed_dim).
	        eps (float):
	            Lower bound of the masking probability, so that it is never exactly 0.
	        max_tries (int):
	            Maximum number of draws per batch row when trying to mask at least one non-prompt token.

	    Returns:
	        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
	            - noisy_embeds: embeddings after masking, shape (bsz, total_tokens, embed_dim).
	            - final_masked_indices: boolean tensor of shape (bsz, total_tokens), True where a token was masked.
	            - p_mask: 1D tensor with the masking probability of each masked position (one entry per True in
	              `final_masked_indices`).
	    """
	    bsz, total_tokens, embed_dim = inputs_embeds.shape
	    device = inputs_embeds.device

	    assert len(num_tokens_list) == bsz, f"num_tokens_list 的长度 ({len(num_tokens_list)}) 必须等于 bsz ({bsz})"
	    assert prompt_mask.shape == (bsz, total_tokens), f"prompt_mask 形状不匹配, 期望 {(bsz, total_tokens)}, 得到 {prompt_mask.shape}"

	    # Reshape for broadcasting against (1, total_tokens, embed_dim).
	    mask_embed_view = mask_embed.view(1, 1, embed_dim)

	    # Probabilities recorded for rows where nothing was sampled. They must be (1, total_tokens) like the
	    # sampled ones, otherwise the concatenation and boolean indexing at the end fail.
	    fallback_p_mask = torch.full((1, total_tokens), eps, device=device)

	    noisy_embeds_list = []
	    final_masked_indices_list = []
	    p_masks_list = []

	    # Rows may pack a different number of samples, so each row is handled on its own.
	    for i in range(bsz):
	        current_embeds = inputs_embeds[i : i + 1]  # (1, total_tokens, embed_dim)
	        current_num_tokens = num_tokens_list[i]
	        current_prompt_mask = prompt_mask[i : i + 1]  # (1, total_tokens)

	        num_samples_in_item = len(current_num_tokens)
	        assert total_tokens == torch.sum(current_num_tokens), f"批次项 {i} 的 num_tokens 之和与总长度不匹配"

	        eligible_for_masking = ~current_prompt_mask

	        # Nothing may be masked in this row: keep it unchanged.
	        if not eligible_for_masking.any():
	            noisy_embeds_list.append(current_embeds)
	            final_masked_indices_list.append(torch.zeros_like(current_prompt_mask, dtype=torch.bool))
	            p_masks_list.append(fallback_p_mask)
	            continue

	        # Redraw until at least one eligible token is masked, or the attempts run out.
	        final_masked_indices_item = torch.zeros_like(current_prompt_mask, dtype=torch.bool)
	        p_mask_per_token = fallback_p_mask
	        for _ in range(max_tries):
	            # One probability in [eps, 1) per logical sample, spread over that sample's tokens.
	            t = torch.rand(num_samples_in_item, device=device)
	            p_mask_per_sample = (1 - eps) * t + eps
	            p_mask_per_token = torch.repeat_interleave(p_mask_per_sample, current_num_tokens).unsqueeze(0)

	            masked_indices = torch.rand_like(p_mask_per_token) < p_mask_per_token
	            final_masked_indices_item = masked_indices & eligible_for_masking

	            if final_masked_indices_item.any():
	                break

	        noisy_embeds_item = torch.where(
	            final_masked_indices_item.unsqueeze(-1),
	            mask_embed_view,
	            current_embeds,
	        )

	        noisy_embeds_list.append(noisy_embeds_item)
	        final_masked_indices_list.append(final_masked_indices_item)
	        p_masks_list.append(p_mask_per_token)

	    # Reassemble the per-row results into batch tensors.
	    final_noisy_embeds = torch.cat(noisy_embeds_list, dim=0)
	    final_masked_indices = torch.cat(final_masked_indices_list, dim=0)
	    p_mask = torch.cat(p_masks_list, dim=0)
	    return final_noisy_embeds, final_masked_indices, p_mask[final_masked_indices]


	def block_diff_mask(b, h, q_idx, kv_idx, block_size=None, n=None):
	    """
	    Constructs the specialized block diffusion attention mask for training
	    composed of three masks:
	    - Block Diagonal Mask (M_BD): Self-attention within noised blocks
	    - Offset Block Causal Mask (M_OBC): Cross-attention for conditional context
	    - Block Causal Mask (M_BC): Attention to update x0

	    Indices below `n` belong to the noised half (xt), indices from `n` upwards to the clean half (x0).

	    Args:
	        b, h: Batch and head indices (ignored for mask logic).
	        q_idx, kv_idx: Query and Key index tensors.
	        block_size: Defines the block structure. Must be provided despite the `None` default.
	        n: Length of one half; the first index that belongs to x0. Must be provided despite the `None` default.

	    Returns:
	        A boolean attention mask, True where the query may attend to the key.
	    """
	    # Whether each token belongs to x0 (True) or xt (False).
	    q_is_x0 = q_idx >= n
	    kv_is_x0 = kv_idx >= n

	    # Block index inside the token's own half.
	    block_q = torch.where(q_is_x0, (q_idx - n) // block_size, q_idx // block_size)
	    block_kv = torch.where(kv_is_x0, (kv_idx - n) // block_size, kv_idx // block_size)

	    # 1. Block Diagonal Mask (M_BD): same block and same half.
	    block_diagonal = (block_q == block_kv) & (q_is_x0 == kv_is_x0)

	    # 2. Offset Block-Causal Mask (M_OBC): xt queries see strictly earlier x0 blocks.
	    offset_block_causal = (block_q > block_kv) & kv_is_x0 & ~q_is_x0

	    # 3. Block-Causal Mask (M_BC): x0 queries see x0 blocks up to and including their own.
	    block_causal = (block_q >= block_kv) & kv_is_x0 & q_is_x0

	    # 4. Combine Masks
	    return block_diagonal | offset_block_causal | block_causal


	def block_attn_mask(num_tokens, block_size, device):
	    """
	    Build the block-diffusion attention mask for a batch of packed rows.

	    For every packed sample of length `num`, a `(2 * num, 2 * num)` mask is produced by `block_diff_mask`;
	    the masks of one row are placed on the diagonal of a block-diagonal matrix, and the rows are stacked.

	    Args:
	        num_tokens: per batch row, an iterable with the lengths of the samples packed into that row.
	        block_size: block size forwarded to `block_diff_mask`.
	        device: device on which the index tensors are created.

	    Returns:
	        Tensor with one block-diagonal mask per batch row, stacked along dim 0. Stacking requires every row
	        to yield a mask of the same size.
	    """

	    def _sample_mask(num):
	        # NOTE(review): the original comment here said the mask should be n x n rather than 2n x 2n
	        # everywhere, yet the indices below span 2 * num - confirm which is intended.
	        return block_diff_mask(
	            b=None,
	            h=None,
	            q_idx=torch.arange(num * 2, device=device)[:, None],
	            kv_idx=torch.arange(num * 2, device=device)[None, :],
	            block_size=block_size,
	            n=num,
	        )

	    row_masks = [
	        torch.block_diag(*[_sample_mask(num) for num in row_lengths])
	        for row_lengths in num_tokens
	    ]
	    return torch.stack(row_masks, dim=0)


	@auto_docstring(
	custom_intro="""
	The Llava-Next model which consists of a vision backbone and a language model without language modeling head.
	"""
	)
	class LlavaOnevisionModel(LlavaOnevisionPreTrainedModel):
	_checkpoint_conversion_mapping = {"language_model.model": "language_model"}

	def __init__(self, config):
	    # Builds the vision tower, the multimodal projector, the learnable `image_newline` embedding and the
	    # language model from `config`, then runs `post_init()`.
	    super().__init__(config)
	    self.vision_tower = AutoModel.from_config(config.vision_config)
	    self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config)

	    text_config = config.text_config
	    text_hidden_size = text_config.hidden_size
	    # Random init scaled by 1 / sqrt(hidden_size).
	    newline_std = 1 / math.sqrt(text_hidden_size)
	    self.image_newline = nn.Parameter(torch.randn(text_hidden_size, dtype=self.dtype) * newline_std)

	    self.vocab_size = text_config.vocab_size
	    if "auto_map" not in text_config.to_dict():
	        self.language_model = AutoModel.from_config(text_config)
	    else:
	        # The text config points at custom code, so it is built with `trust_remote_code=True` after
	        # copying the parent config's `_name_or_path` onto it.
	        logger.warning_once(
	            "The text_config of this model contains `auto_map` in its configuration. This might result in errors when using `from_pretrained` to load the model. Please make sure that the `auto_map` is correct."
	        )
	        text_config._name_or_path = config._name_or_path
	        self.language_model = AutoModel.from_config(text_config, trust_remote_code=True)

	    # Fall back to -1 when the config defines no pad token.
	    configured_pad = self.config.pad_token_id
	    self.pad_token_id = -1 if configured_pad is None else configured_pad
	    self.post_init()

	def get_input_embeddings(self):
	    """Return whatever the wrapped language model reports as its input embeddings."""
	    text_backbone = self.language_model
	    return text_backbone.get_input_embeddings()

	def set_input_embeddings(self, value):
	    """Forward `value` to the wrapped language model's `set_input_embeddings`."""
	    text_backbone = self.language_model
	    text_backbone.set_input_embeddings(value)

	def pack_image_features(self, image_features, image_sizes, image_newline=None, vision_aspect_ratio="anyres_max_9"):
	    """
	    Reshape, unpad and concatenate the per-image features into one tensor holding all visual vectors.

	    Args:
	        image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
	            Features of every patch of every image.
	        image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
	            Actual size of each image (H, W).
	        image_newline (`torch.Tensor` of shape `(embed_dim)`)
	            New line embedding vector; nothing is appended when it is `None`.
	        vision_aspect_ratio (`str`, optional, "anyres_max_9"):
	            Aspect ratio setting used when processing image features; its trailing number caps the number of
	            patches. The default value is "anyres_max_9".
	    Returns:
	        image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
	        feature_lens (`torch.LongTensor` of shape `(num_images,)`)
	            token length of each image in image_features
	    Raises:
	        ValueError: if a multi-patch image's base patch does not hold `(image_size // patch_size) ** 2` tokens.
	    """
	    packed = []
	    lengths = []
	    for idx, patches in enumerate(image_features):
	        if patches.shape[0] <= 1:
	            # Single patch: use it as is, optionally followed by one newline token.
	            tokens = patches[0]
	            if image_newline is not None:
	                tokens = torch.cat((tokens, image_newline[None].to(tokens)), dim=0)
	        else:
	            # First patch is the base image, the remaining ones are the high-resolution tiles.
	            base_tokens = patches[0]
	            tiles = patches[1:]

	            vision_config = self.config.vision_config
	            height = width = vision_config.image_size // vision_config.patch_size
	            if height * width != base_tokens.shape[0]:
	                raise ValueError("The number of patches is not consistent with the image size.")

	            grid_rows, grid_cols = get_anyres_image_grid_shape(
	                image_sizes[idx],
	                self.config.image_grid_pinpoints,
	                vision_config.image_size,
	            )
	            # (rows, cols, height, width, dim) -> (dim, rows * height, cols * width)
	            tiles = tiles.view(grid_rows, grid_cols, height, width, -1)
	            tiles = tiles.permute(4, 0, 2, 1, 3).contiguous()
	            tiles = tiles.flatten(1, 2).flatten(2, 3)
	            tiles = unpad_image(tiles, image_sizes[idx])

	            # `str.strip` removes a *set of characters* from both ends, which leaves the trailing number.
	            max_num_patches = int(vision_aspect_ratio.strip("anyres_max_"))
	            _, tiles_h, tiles_w = tiles.shape
	            ratio = math.sqrt(tiles_h * tiles_w / (max_num_patches * height**2))
	            if ratio > 1.1:
	                # Too many tokens for the configured cap: shrink the feature map bilinearly.
	                target_size = [int(tiles_h // ratio), int(tiles_w // ratio)]
	                tiles = nn.functional.interpolate(tiles[None], target_size, mode="bilinear")[0]

	            if image_newline is not None:
	                # Append one newline column to every row of the feature map.
	                newline_column = (
	                    image_newline[:, None, None]
	                    .expand(*tiles.shape[:-1], 1)
	                    .to(tiles.device, tiles.dtype)
	                )
	                tiles = torch.cat((tiles, newline_column), dim=-1)

	            # (dim, h, w) -> (h * w, dim), then prepend the base image tokens.
	            tiles = tiles.flatten(1, 2).transpose(0, 1)
	            tokens = torch.cat((base_tokens, tiles), dim=0)

	        packed.append(tokens)
	        lengths.append(tokens.size(0))

	    image_features = torch.cat(packed, dim=0)
	    feature_lens = torch.tensor(lengths, dtype=torch.long, device=image_features.device)
	    return image_features, feature_lens

	def get_image_features(
	    self,
	    pixel_values: torch.FloatTensor,
	    image_sizes: torch.Tensor,
	    vision_feature_layer: Optional[Union[int, List[int]]] = None,
	    vision_feature_select_strategy: Optional[str] = None,
	):
	    """
	    Run the vision tower on the image patches and apply the multimodal projection.

	    Args:
	        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_patches, channels, height, width)`)
	            The tensors corresponding to the input images. A 4-dimensional tensor of already concatenated
	            patches `(total_patches, channels, height, width)` is accepted as well.
	        image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
	            Actual size of each image (H, W).
	        vision_feature_layer (`Union[int, List[int]]`, optional):
	            Index of the vision tower hidden state to use. With several indices the corresponding hidden
	            states are concatenated along the last dimension. Defaults to `config.vision_feature_layer`.
	        vision_feature_select_strategy (`str`, optional):
	            Can be one of `"default"` or `"full"`. `"default"` drops the first token of every patch
	            (presumably the CLS token); any other value keeps all tokens. Defaults to
	            `config.vision_feature_select_strategy`.
	    Returns:
	        image_features (tuple of `torch.Tensor`): one tensor per image holding the projected features of all
	        its patches, of shape `(num_patches, image_length, embed_dim)`.
	    Raises:
	        ValueError: if `pixel_values` has neither 4 nor 5 dimensions.
	    """
	    if vision_feature_layer is None:
	        vision_feature_layer = self.config.vision_feature_layer
	    if vision_feature_select_strategy is None:
	        vision_feature_select_strategy = self.config.vision_feature_select_strategy

	    # The number of patches per image is inferred from the image sizes.
	    image_num_patches = [
	        image_size_to_num_patches(
	            image_size=size,
	            grid_pinpoints=self.config.image_grid_pinpoints,
	            patch_size=self.config.vision_config.image_size,
	        )
	        for size in image_sizes
	    ]

	    if pixel_values.dim() == 5:
	        # Stacked input: keep only the real patches of every image, then concatenate them.
	        kept_patches = [patches[:count] for patches, count in zip(pixel_values, image_num_patches)]
	        pixel_values = torch.cat(kept_patches, dim=0)
	    elif pixel_values.dim() != 4:
	        # Anything else must already be the concatenation of (num_patches, num_channels, height, width).
	        raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")

	    vision_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

	    # One layer: take its hidden states; several layers: concatenate them feature-wise.
	    if isinstance(vision_feature_layer, int):
	        selected = vision_outputs.hidden_states[vision_feature_layer]
	    else:
	        selected = torch.cat(
	            [vision_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer], dim=-1
	        )

	    if vision_feature_select_strategy == "default":
	        selected = selected[:, 1:]

	    projected = self.multi_modal_projector(selected)
	    return torch.split(projected, image_num_patches, dim=0)

	def _get_mask_embedding(self):
	device = self.get_input_embeddings().weight.device
	mask_token_tensor = torch.tensor(self.config.text_config.mask_token_id, device=device)
	return self.get_input_embeddings()(mask_token_tensor)

	def prepare_for_bd_training(self, inputs_embeds, position_ids, prompt_mask):
	    """
	    Build the doubled (noisy + clean) sequence used by the training branch of `forward`.

	    For every entry of `num_tokens[i]` the output sequence holds two consecutive runs of that
	    length: first the noised embeddings, then the clean ones. Both runs reuse the same position ids.

	    Args:
	        inputs_embeds: clean embeddings of shape `(bsz, seq_len, dim)`.
	        position_ids: position ids indexed per sample; presumably `(bsz, seq_len)` with packed
	            segments -- TODO confirm against `calculate_token_nums`.
	        prompt_mask: forwarded unchanged to `forward_add_noise_packed`.

	    Returns:
	        Tuple `(concat_inputs_embeds, concat_position_ids, flex_attention_mask_3d,
	        logits_to_keep_half, logits_to_keep, p_mask)`; the first two and `logits_to_keep` have a
	        sequence length of `2 * seq_len`.
	    """
	    device = inputs_embeds.device
	    bsz, seq_len, _ = inputs_embeds.shape
	    num_tokens = calculate_token_nums(position_ids)  # List[torch.Tensor]
	    noisy_inputs_embeds, logits_to_keep_half, p_mask = forward_add_noise_packed(
	        inputs_embeds=inputs_embeds,
	        num_tokens_list=num_tokens,
	        prompt_mask=prompt_mask,
	        mask_embed=self._get_mask_embedding(),
	    )

	    def _noisy_slot_mask(run_lengths):
	        # Alternating True/False per run (noisy, clean, noisy, clean, ...), expanded to token level.
	        alternating = (torch.arange(run_lengths.shape[0] * 2) % 2 == 0).to(device)
	        return alternating.repeat_interleave(run_lengths.repeat_interleave(2))

	    router_noisy_part = torch.stack([_noisy_slot_mask(num_tokens[i]) for i in range(bsz)], dim=0)

	    # Doubled buffers: (bsz, 2 * seq_len, dim) for embeddings, (bsz, 2 * seq_len) for the rest.
	    concat_inputs_embeds = inputs_embeds.repeat(1, 2, 1)
	    logits_to_keep = torch.zeros(bsz, 2 * seq_len, dtype=torch.bool, device=device)
	    concat_position_ids = torch.zeros(
	        bsz, 2 * seq_len, dtype=position_ids.dtype, device=position_ids.device
	    )
	    for i in range(bsz):
	        noisy_slots = router_noisy_part[i]
	        clean_slots = ~noisy_slots

	        concat_inputs_embeds[i, noisy_slots] = noisy_inputs_embeds[i]
	        concat_inputs_embeds[i, clean_slots] = inputs_embeds[i]

	        # Only noisy slots can be kept for the loss; clean slots stay False.
	        logits_to_keep[i, noisy_slots] = logits_to_keep_half[i]

	        concat_position_ids[i, noisy_slots] = position_ids[i]
	        concat_position_ids[i, clean_slots] = position_ids[i]

	    # Dense boolean mask turned into a flex-attention block mask.
	    attention_mask = block_attn_mask(num_tokens, self.config.text_config.block_size, device)

	    def _mask_mod(b, h, q_idx, kv_idx):
	        return attention_mask[b, q_idx, kv_idx]

	    flex_attention_mask_3d = create_block_mask(
	        _mask_mod,
	        B=attention_mask.size(0),
	        H=None,
	        Q_LEN=attention_mask.size(1),
	        KV_LEN=attention_mask.size(2),
	    )

	    return (
	        concat_inputs_embeds,
	        concat_position_ids,
	        flex_attention_mask_3d,
	        logits_to_keep_half,
	        logits_to_keep,
	        p_mask,
	    )


	@can_return_tuple
	@auto_docstring
	def forward(
	    self,
	    input_ids: torch.LongTensor = None,
	    pixel_values: torch.FloatTensor = None,
	    image_sizes: Optional[torch.LongTensor] = None,
	    pixel_values_videos: torch.FloatTensor = None,
	    image_sizes_videos: Optional[torch.LongTensor] = None,
	    attention_mask: Optional[torch.Tensor] = None,
	    prompt_mask: Optional[torch.Tensor] = None,
	    position_ids: Optional[torch.LongTensor] = None,
	    past_key_values: Optional[List[torch.FloatTensor]] = None,
	    inputs_embeds: Optional[torch.FloatTensor] = None,
	    vision_feature_layer: Optional[Union[int, List[int]]] = None,
	    vision_feature_select_strategy: Optional[str] = None,
	    vision_aspect_ratio: Optional[str] = None,
	    use_cache: Optional[bool] = None,
	    output_attentions: Optional[bool] = None,
	    output_hidden_states: Optional[bool] = None,
	    return_dict: Optional[bool] = None,
	    cache_position: Optional[torch.LongTensor] = None,
	    **kwargs: Unpack[FlashAttentionKwargs],
	) -> Union[Tuple, LlavaOnevisionModelOutputWithPast]:
	    r"""
	    pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, frames, num_channels, image_size, image_size)):
	        The tensors corresponding to the input videos. Pixel values can be obtained using
	        [`LlavaNextVideoProcessor`]. See [`LlavaNextVideoProcessor.__call__`] for details. [`LlavaProcessor`] uses
	        [`LlavaNextVideoProcessor`] for processing videos.
	    image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, optional):
	        The sizes of the videos in the batch, being (height, width) for each frame in the video.
	    vision_feature_select_strategy (`str`, optional, defaults to `"default"`):
	        The feature selection strategy used to select the vision feature from the vision backbone.
	        Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
	        If `"full"`, the full vision features are used.
	    vision_aspect_ratio (`str`, optional, defaults to `"anyres_max_9"`):
	        Aspect ratio used when processing image features. The default value is "anyres_max_9".
	    """
	    # Per-call overrides win; everything left as None falls back to the config.
	    if output_attentions is None:
	        output_attentions = self.config.output_attentions
	    if output_hidden_states is None:
	        output_hidden_states = self.config.output_hidden_states
	    if return_dict is None:
	        return_dict = self.config.use_return_dict
	    if vision_feature_layer is None:
	        vision_feature_layer = self.config.vision_feature_layer
	    if vision_feature_select_strategy is None:
	        vision_feature_select_strategy = self.config.vision_feature_select_strategy
	    if vision_aspect_ratio is None:
	        vision_aspect_ratio = self.config.vision_aspect_ratio

	    # Exactly one of the two text inputs must be present.
	    if (input_ids is None) == (inputs_embeds is None):
	        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

	    has_visual_input = pixel_values is not None or pixel_values_videos is not None
	    if has_visual_input and inputs_embeds is not None:
	        raise ValueError(
	            "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, "
	            "and must specify either one"
	        )

	    if inputs_embeds is None:
	        inputs_embeds = self.get_input_embeddings()(input_ids)

	    # Images go through the anyres pipeline and replace the image placeholder tokens.
	    image_features = None
	    if pixel_values is not None:
	        per_image_features = self.get_image_features(
	            pixel_values,
	            image_sizes,
	            vision_feature_layer=vision_feature_layer,
	            vision_feature_select_strategy=vision_feature_select_strategy,
	        )
	        image_features, _ = self.pack_image_features(
	            per_image_features,
	            image_sizes,
	            image_newline=self.image_newline,
	            vision_aspect_ratio=vision_aspect_ratio,
	        )

	        image_token_positions = input_ids == self.config.image_token_id
	        special_image_mask = (
	            image_token_positions.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
	        )
	        if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
	            n_image_tokens = image_token_positions.sum()
	            n_image_features = image_features.shape[0]
	            raise ValueError(
	                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
	            )
	        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
	        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

	    # Videos are embedded, pooled, terminated by a newline embedding and scattered likewise.
	    video_features = None
	    if pixel_values_videos is not None:
	        video_features = self.get_video_features(
	            pixel_values_videos,
	            vision_feature_layer=vision_feature_layer,
	            vision_feature_select_strategy=vision_feature_select_strategy,
	        )
	        if isinstance(video_features, tuple):
	            # One tensor per video: append one newline row to each, then concatenate.
	            newline_row = self.image_newline[None, :].to(video_features[0].device)
	            with_newline = [torch.cat((one_video, newline_row), dim=0) for one_video in video_features]
	            video_features = torch.cat(with_newline, dim=0)
	        else:
	            # Batched tensor: append one newline token per video along the sequence dim.
	            newline_rows = (
	                self.image_newline[None, None, :].repeat(video_features.shape[0], 1, 1).to(video_features.device)
	            )
	            video_features = torch.cat((video_features, newline_rows), dim=1)
	            video_features = video_features.flatten(0, 1)

	        video_token_positions = input_ids == self.config.video_token_id
	        special_video_mask = (
	            video_token_positions.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
	        )
	        if not is_torchdynamo_compiling() and inputs_embeds[special_video_mask].numel() != video_features.numel():
	            n_video_tokens = video_token_positions.sum()
	            n_video_features = video_features.shape[0]
	            raise ValueError(
	                f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
	            )
	        video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
	        inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features)

	    # Arguments shared by the training and inference calls of the language model.
	    shared_lm_kwargs = dict(
	        output_attentions=output_attentions,
	        output_hidden_states=output_hidden_states,
	        return_dict=True,
	        cache_position=cache_position,
	        **kwargs,
	    )

	    # These are only produced by the training branch.
	    logits_to_keep_half = logits_to_keep = p_mask = None
	    if self.training:
	        position_ids = modify_padded_position_ids_2d(position_ids)
	        (
	            concat_inputs_embeds,
	            concat_position_ids,
	            flex_attention_mask_3d,
	            logits_to_keep_half,
	            logits_to_keep,
	            p_mask,
	        ) = self.prepare_for_bd_training(inputs_embeds, position_ids, prompt_mask)
	        outputs = self.language_model(
	            attention_mask=flex_attention_mask_3d,
	            position_ids=concat_position_ids,
	            inputs_embeds=concat_inputs_embeds,
	            **shared_lm_kwargs,
	        )
	    else:
	        outputs = self.language_model(
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            use_cache=use_cache,
	            **shared_lm_kwargs,
	        )

	    return LlavaOnevisionModelOutputWithPast(
	        last_hidden_state=outputs.last_hidden_state,
	        logits_to_keep_half=logits_to_keep_half,
	        logits_to_keep=logits_to_keep,
	        p_mask=p_mask,
	        past_key_values=outputs.past_key_values,
	        hidden_states=outputs.hidden_states,
	        attentions=outputs.attentions,
	        image_hidden_states=image_features,
	        video_hidden_states=video_features,
	    )

	def get_video_features(
	    self,
	    pixel_values: torch.FloatTensor,
	    vision_feature_layer: Union[int, List[int]],
	    vision_feature_select_strategy: str,
	):
	    """
	    Obtains video hidden states from the vision tower, then applies multimodal projection and pooling.

	    Args:
	        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`,
	            or a list/tuple of `(num_frames_i, channels, height, width)` tensors):
	            The input videos. A list/tuple allows every video to have its own number of frames.
	        vision_feature_layer (`Union[int, List[int]]`):
	            The index of the layer to select the vision feature. If multiple indices are provided,
	            the vision feature of the corresponding indices will be concatenated to form the
	            vision features.
	        vision_feature_select_strategy (`str`):
	            `"default"` drops the first token of every frame sequence; any other value keeps the
	            sequence unchanged.

	    Returns:
	        For a batched tensor input: a tensor of shape `(batch_size, frames * tokens_per_frame, embed_dim)`.
	        For a list/tuple input: a tuple with one `(num_frames_i * tokens_per_frame, embed_dim)` tensor
	        per video.
	    """
	    # Accept tuples as well as lists: both are plain sequences of per-video tensors.
	    has_variable_frames = isinstance(pixel_values, (list, tuple))
	    if has_variable_frames:
	        frame_nums = [video.size(0) for video in pixel_values]
	        pixel_values = torch.cat(list(pixel_values), dim=0)  # Shape: (total_frames, C, H, W)
	    else:
	        # Every video has the same number of frames.
	        batch_size, frames, channels, height, width = pixel_values.shape
	        # `reshape` (unlike `view`) also works when the batch is not contiguous in memory.
	        pixel_values = pixel_values.reshape(batch_size * frames, channels, height, width)

	    tower_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
	    # If we have one vision feature layer, use the corresponding hidden states,
	    # otherwise select the hidden states of each feature layer and concatenate them.
	    if isinstance(vision_feature_layer, int):
	        selected_video_feature = tower_outputs.hidden_states[vision_feature_layer]
	    else:
	        hs_pool = [tower_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
	        selected_video_feature = torch.cat(hs_pool, dim=-1)

	    if vision_feature_select_strategy == "default":
	        selected_video_feature = selected_video_feature[:, 1:]

	    video_features = self.multi_modal_projector(selected_video_feature)
	    video_features = self.apply_pooling(video_features)

	    if has_variable_frames:
	        # Flatten all frames, then cut the token stream back into one chunk per video.
	        tokens_per_frame = video_features.shape[1]
	        video_features = video_features.flatten(0, 1)
	        video_tokens_lengths = [num_frames * tokens_per_frame for num_frames in frame_nums]
	        return torch.split(video_features, video_tokens_lengths, dim=0)

	    return video_features.reshape(batch_size, frames * video_features.shape[1], -1)

	def apply_pooling(self, image_features):
	    """
	    Downsample per-frame token grids to half their side length (rounded up) with bilinear interpolation.

	    Args:
	        image_features: tensor of shape `(batch_frames, seq_len, dim)`, where `seq_len` must equal
	            `(image_size // patch_size) ** 2` from the vision config for the reshape to succeed.

	    Returns:
	        Tensor of shape `(batch_frames, pooled_tokens, dim)`.
	    """
	    vision_cfg = self.config.vision_config
	    grid_side = vision_cfg.image_size // vision_cfg.patch_size
	    num_frames, _, hidden_dim = image_features.shape

	    # (frames, tokens, dim) -> (frames, dim, side, side), the layout `interpolate` expects.
	    feature_map = image_features.view(num_frames, grid_side, grid_side, -1)
	    feature_map = feature_map.permute(0, 3, 1, 2).contiguous()

	    target_size = [math.ceil(extent / 2) for extent in feature_map.shape[2:]]
	    pooled = nn.functional.interpolate(feature_map, size=target_size, mode="bilinear")

	    # Back to a token sequence: (frames, side', side', dim) -> (frames, side' * side', dim).
	    return pooled.permute(0, 2, 3, 1).view(num_frames, -1, hidden_dim)


	class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs):
	    """Extra keyword arguments of the causal-LM `forward`: `FlashAttentionKwargs` combined with `LossKwargs`."""


	@auto_docstring(
	    custom_intro="""
	    The LLAVA-NeXT model which consists of a vision backbone and a language model.
	    """
	)
	class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
	    # Maps legacy checkpoint key prefixes onto the current module layout.
	    _checkpoint_conversion_mapping = {
	        "^language_model.model": "model.language_model",
	        "^vision_tower": "model.vision_tower",
	        "^multi_modal_projector": "model.multi_modal_projector",
	        "^image_newline": "model.image_newline",
	        "^language_model.lm_head": "lm_head",
	    }
	    _tied_weights_keys = ["lm_head.weight"]

	    def __init__(self, config: LlavaOnevisionConfig):
	        super().__init__(config)
	        self.model = LlavaOnevisionModel(config)
	        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
	        self.post_init()

	    def get_input_embeddings(self):
	        return self.model.get_input_embeddings()

	    def set_input_embeddings(self, value):
	        self.model.set_input_embeddings(value)

	    def get_output_embeddings(self) -> nn.Module:
	        return self.lm_head

	    def set_output_embeddings(self, new_embeddings):
	        self.lm_head = new_embeddings

	    # Make modules available through the conditional class for BC
	    @property
	    def language_model(self):
	        return self.model.language_model

	    @property
	    def vision_tower(self):
	        return self.model.vision_tower

	    @property
	    def multi_modal_projector(self):
	        return self.model.multi_modal_projector

	    @can_return_tuple
	    @auto_docstring
	    def forward(
	        self,
	        input_ids: torch.LongTensor = None,
	        pixel_values: torch.FloatTensor = None,
	        image_sizes: Optional[torch.LongTensor] = None,
	        pixel_values_videos: torch.FloatTensor = None,
	        image_sizes_videos: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[List[torch.FloatTensor]] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        vision_feature_layer: Optional[Union[int, List[int]]] = None,
	        vision_feature_select_strategy: Optional[str] = None,
	        vision_aspect_ratio: Optional[str] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	        cache_position: Optional[torch.LongTensor] = None,
	        logits_to_keep: Union[int, torch.Tensor] = 0,
	        **kwargs: Unpack[KwargsForCausalLM],
	    ) -> Union[Tuple, LlavaOnevisionCausalLMOutputWithPast]:
	        r"""
	        pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, frames, num_channels, image_size, image_size)):
	            The tensors corresponding to the input videos. Pixel values can be obtained using
	            [`LlavaNextVideoProcessor`]. See [`LlavaNextVideoProcessor.__call__`] for details. [`LlavaProcessor`] uses
	            [`LlavaNextVideoProcessor`] for processing videos.
	        image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, optional):
	            The sizes of the videos in the batch, being (height, width) for each frame in the video.
	        vision_feature_select_strategy (`str`, optional, defaults to `"default"`):
	            The feature selection strategy used to select the vision feature from the vision backbone.
	            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
	            If `"full"`, the full vision features are used.
	        vision_aspect_ratio (`str`, optional, defaults to `"anyres_max_9"`):
	            Aspect ratio used when processing image features. The default value is "anyres_max_9".
	        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, optional):
	            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
	            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
	            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

	        Example:

	        ```python
	        >>> from PIL import Image
	        >>> import requests
	        >>> import torch
	        >>> from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration

	        >>> model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype="float16", device_map="cuda:0")
	        >>> processor = LlavaOnevisionProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")

	        >>> conversation = [
	        ...     {
	        ...       "role": "user",
	        ...       "content": [
	        ...           {"type": "text", "text": "What is shown in this image?"},
	        ...           {"type": "image"},
	        ...         ],
	        ...     },
	        ... ]
	        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

	        >>> image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
	        >>> raw_image = Image.open(requests.get(image_file, stream=True).raw)
	        >>> inputs = processor(text=prompt, images=raw_image, return_tensors='pt').to(0, torch.float16)

	        >>> output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
	        >>> processor.batch_decode(output, skip_special_tokens=True)[0]
	        "user\n\nWhat is shown in this image?\nassistant\ncat"
	        ```"""
	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_hidden_states = (
	            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	        )
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
	        vision_feature_layer = (
	            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
	        )
	        vision_feature_select_strategy = (
	            vision_feature_select_strategy
	            if vision_feature_select_strategy is not None
	            else self.config.vision_feature_select_strategy
	        )
	        vision_aspect_ratio = (
	            vision_aspect_ratio if vision_aspect_ratio is not None else self.config.vision_aspect_ratio
	        )
	        # Positions labelled -100 are passed to the base model as the prompt mask.
	        prompt_mask = (labels == -100) if labels is not None else None
	        outputs = self.model(
	            input_ids=input_ids,
	            pixel_values=pixel_values,
	            pixel_values_videos=pixel_values_videos,
	            image_sizes=image_sizes,
	            image_sizes_videos=image_sizes_videos,
	            vision_aspect_ratio=vision_aspect_ratio,
	            vision_feature_layer=vision_feature_layer,
	            vision_feature_select_strategy=vision_feature_select_strategy,
	            attention_mask=attention_mask,
	            prompt_mask=prompt_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=True,
	            cache_position=cache_position,
	            logits_to_keep=logits_to_keep,
	            **kwargs,
	        )

	        hidden_states = outputs[0]

	        loss = None
	        if self.training:
	            # Explicit check instead of `assert`, which is stripped when Python runs with -O.
	            if labels is None:
	                raise ValueError("Labels must be provided for training.")
	            # Select hidden states from the doubled sequence and labels from the original one,
	            # using the masks produced by the base model.
	            hidden_states = hidden_states[outputs.logits_to_keep].contiguous()
	            labels = labels[outputs.logits_to_keep_half].contiguous()
	            loss_fct = FusedLinearDiffusionCrossEntropyLoss(reduction='sum')
	            # The LM head weights are passed in; logits are not materialised in this branch.
	            loss = loss_fct(
	                x=hidden_states,
	                target=labels,
	                weight=self.lm_head.weight,
	                bias=self.lm_head.bias,
	                p_mask=outputs.p_mask,
	            )
	            loss = loss / labels.numel()
	            logits = None
	        else:
	            # Only compute necessary logits. `0`/`None` keeps every position (the previous behaviour);
	            # an int keeps the last `logits_to_keep` positions; a tensor indexes the sequence dim.
	            if logits_to_keep is None:
	                logits_to_keep = 0
	            slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
	            logits = self.lm_head(hidden_states[:, slice_indices, :])

	        return LlavaOnevisionCausalLMOutputWithPast(
	            loss=loss,
	            logits=logits,
	            past_key_values=outputs.past_key_values,
	            hidden_states=outputs.hidden_states,
	            attentions=outputs.attentions,
	            image_hidden_states=outputs.image_hidden_states,
	            video_hidden_states=outputs.video_hidden_states,
	        )

	    def prepare_inputs_for_generation(
	        self,
	        input_ids,
	        past_key_values=None,
	        inputs_embeds=None,
	        pixel_values=None,
	        image_sizes=None,
	        pixel_values_videos=None,
	        image_sizes_videos=None,
	        attention_mask=None,
	        cache_position=None,
	        logits_to_keep=None,
	        **kwargs,
	    ):
	        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

	        model_inputs = super().prepare_inputs_for_generation(
	            input_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            attention_mask=attention_mask,
	            cache_position=cache_position,
	            logits_to_keep=logits_to_keep,
	            **kwargs,
	        )

	        if cache_position[0] == 0:
	            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
	            # Otherwise we need pixel values to be passed to model
	            model_inputs["pixel_values"] = pixel_values
	            model_inputs["image_sizes"] = image_sizes
	            model_inputs["pixel_values_videos"] = pixel_values_videos
	            model_inputs["image_sizes_videos"] = image_sizes_videos

	        return model_inputs

	    @staticmethod
	    def _prepare_4d_causal_attention_mask_with_cache_position(
	        attention_mask: torch.Tensor,
	        sequence_length: int,
	        target_length: int,
	        dtype: torch.dtype,
	        cache_position: torch.Tensor,
	        batch_size: int,
	        **kwargs,
	    ):
	        """
	        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
	        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

	        Args:
	            attention_mask (`torch.Tensor`):
	                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
	                `(batch_size, 1, query_length, key_value_length)`.
	            sequence_length (`int`):
	                The sequence length being processed.
	            target_length (`int`):
	                The target length: when generating with static cache, the mask should be as long as the static cache,
	                to account for the 0 padding, the part of the cache that is not filled yet.
	            dtype (`torch.dtype`):
	                The dtype to use for the 4D attention mask.
	            cache_position (`torch.Tensor`):
	                Indices depicting the position of the input sequence tokens in the sequence.
	            batch_size (`int`):
	                Batch size.
	        """
	        if attention_mask is not None and attention_mask.dim() == 4:
	            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
	            causal_mask = attention_mask
	        else:
	            min_dtype = torch.finfo(dtype).min
	            causal_mask = torch.full(
	                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
	            )
	            if sequence_length != 1:
	                causal_mask = torch.triu(causal_mask, diagonal=1)
	            # Zero out (i.e. allow) every key position that is not after the query's cache position.
	            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
	            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
	            if attention_mask is not None:
	                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
	                mask_length = attention_mask.shape[-1]
	                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
	                    causal_mask.device
	                )
	                # A sum of 0 means "causally visible but padded": those positions get masked too.
	                padding_mask = padding_mask == 0
	                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
	                    padding_mask, min_dtype
	                )

	        return causal_mask


	# Names exported by a star-import of this module.
	__all__ = [
	    "LlavaOnevisionModel",
	    "LlavaOnevisionForConditionalGeneration",
	    "LlavaOnevisionPreTrainedModel",
	]