Make it compatible with transformers 5.3.0
#91
by Kaixuanliu - opened
- modeling_phi4mm.py +21 -10
- speech_conformer_encoder.py +3 -2
modeling_phi4mm.py
CHANGED
|
@@ -26,7 +26,7 @@ from torch import nn
|
|
| 26 |
from torch.nn import CrossEntropyLoss
|
| 27 |
|
| 28 |
from transformers.activations import ACT2FN
|
| 29 |
-
from transformers.cache_utils import Cache, DynamicCache,
|
| 30 |
from transformers.generation import GenerationMixin
|
| 31 |
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
|
| 32 |
from transformers.modeling_flash_attention_utils import _flash_attention_forward
|
|
@@ -41,7 +41,7 @@ from transformers.utils import (
|
|
| 41 |
add_code_sample_docstrings,
|
| 42 |
add_start_docstrings,
|
| 43 |
add_start_docstrings_to_model_forward,
|
| 44 |
-
|
| 45 |
logging,
|
| 46 |
replace_return_docstrings,
|
| 47 |
)
|
|
@@ -1134,7 +1134,7 @@ class Phi4MMAttention(nn.Module):
|
|
| 1134 |
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
|
| 1135 |
"with a layer index."
|
| 1136 |
)
|
| 1137 |
-
kv_seq_len += past_key_value.
|
| 1138 |
cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
|
| 1139 |
|
| 1140 |
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
|
@@ -1190,7 +1190,7 @@ class Phi4MMFlashAttention2(Phi4MMAttention):
|
|
| 1190 |
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
|
| 1191 |
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
|
| 1192 |
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
|
| 1193 |
-
self._flash_attn_uses_top_left_mask = not
|
| 1194 |
|
| 1195 |
def forward(
|
| 1196 |
self,
|
|
@@ -1229,7 +1229,7 @@ class Phi4MMFlashAttention2(Phi4MMAttention):
|
|
| 1229 |
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
|
| 1230 |
"with a layer index."
|
| 1231 |
)
|
| 1232 |
-
kv_seq_len += past_key_value.
|
| 1233 |
|
| 1234 |
# Because the input can be padded, the absolute sequence length depends on the max position id.
|
| 1235 |
rotary_seq_len = (
|
|
@@ -1351,7 +1351,7 @@ class Phi4MMSdpaAttention(Phi4MMAttention):
|
|
| 1351 |
|
| 1352 |
kv_seq_len = key_states.shape[-2]
|
| 1353 |
if past_key_value is not None:
|
| 1354 |
-
kv_seq_len += past_key_value.
|
| 1355 |
cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
|
| 1356 |
|
| 1357 |
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
|
@@ -1399,6 +1399,7 @@ class Phi4MMSdpaAttention(Phi4MMAttention):
|
|
| 1399 |
PHI4MM_ATTENTION_CLASSES = {
|
| 1400 |
"eager": Phi4MMAttention,
|
| 1401 |
"flash_attention_2": Phi4MMFlashAttention2,
|
|
|
|
| 1402 |
"sdpa": Phi4MMSdpaAttention,
|
| 1403 |
}
|
| 1404 |
|
|
@@ -1511,6 +1512,7 @@ class Phi4MMPreTrainedModel(PreTrainedModel):
|
|
| 1511 |
supports_gradient_checkpointing = True
|
| 1512 |
_no_split_modules = ["Phi4MMDecoderLayer"]
|
| 1513 |
_skip_keys_device_placement = "past_key_values"
|
|
|
|
| 1514 |
_supports_flash_attn_2 = True
|
| 1515 |
_supports_sdpa = True
|
| 1516 |
_supports_cache_class = True
|
|
@@ -1807,7 +1809,7 @@ class Phi4MMModel(Phi4MMPreTrainedModel):
|
|
| 1807 |
# to infer the attention mask.
|
| 1808 |
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
| 1809 |
using_static_cache = isinstance(past_key_values, StaticCache)
|
| 1810 |
-
using_sliding_window_cache =
|
| 1811 |
|
| 1812 |
# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
|
| 1813 |
if (
|
|
@@ -1913,7 +1915,7 @@ class Phi4MMModel(Phi4MMPreTrainedModel):
|
|
| 1913 |
if config.sliding_window is not None:
|
| 1914 |
# if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
|
| 1915 |
# the check is needed to verify is current checkpoint was trained with sliding window or not
|
| 1916 |
-
if
|
| 1917 |
sliding_attend_mask = torch.arange(target_length, device=device) <= (
|
| 1918 |
cache_position.reshape(-1, 1) - config.sliding_window
|
| 1919 |
)
|
|
@@ -1934,7 +1936,7 @@ class Phi4MMModel(Phi4MMPreTrainedModel):
|
|
| 1934 |
|
| 1935 |
|
| 1936 |
class Phi4MMForCausalLM(Phi4MMPreTrainedModel, GenerationMixin):
|
| 1937 |
-
_tied_weights_keys =
|
| 1938 |
|
| 1939 |
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi
|
| 1940 |
def __init__(self, config):
|
|
@@ -1949,6 +1951,12 @@ class Phi4MMForCausalLM(Phi4MMPreTrainedModel, GenerationMixin):
|
|
| 1949 |
# LoRA related settings
|
| 1950 |
assert getattr(config, "vision_lora", None) is not None
|
| 1951 |
from peft import LoraConfig, get_peft_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1952 |
vision_lora_config = LoraConfig(
|
| 1953 |
r=config.vision_lora['r'],
|
| 1954 |
lora_alpha=config.vision_lora['lora_alpha'],
|
|
@@ -2134,7 +2142,10 @@ class Phi4MMForCausalLM(Phi4MMPreTrainedModel, GenerationMixin):
|
|
| 2134 |
|
| 2135 |
hidden_states = outputs[0]
|
| 2136 |
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
| 2137 |
-
|
|
|
|
|
|
|
|
|
|
| 2138 |
|
| 2139 |
loss = None
|
| 2140 |
if labels is not None:
|
|
|
|
| 26 |
from torch.nn import CrossEntropyLoss
|
| 27 |
|
| 28 |
from transformers.activations import ACT2FN
|
| 29 |
+
from transformers.cache_utils import Cache, DynamicCache, StaticCache
|
| 30 |
from transformers.generation import GenerationMixin
|
| 31 |
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
|
| 32 |
from transformers.modeling_flash_attention_utils import _flash_attention_forward
|
|
|
|
| 41 |
add_code_sample_docstrings,
|
| 42 |
add_start_docstrings,
|
| 43 |
add_start_docstrings_to_model_forward,
|
| 44 |
+
is_flash_attn_greater_or_equal,
|
| 45 |
logging,
|
| 46 |
replace_return_docstrings,
|
| 47 |
)
|
|
|
|
| 1134 |
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
|
| 1135 |
"with a layer index."
|
| 1136 |
)
|
| 1137 |
+
kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
|
| 1138 |
cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
|
| 1139 |
|
| 1140 |
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
|
|
|
| 1190 |
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
|
| 1191 |
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
|
| 1192 |
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
|
| 1193 |
+
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal("2.1.0")
|
| 1194 |
|
| 1195 |
def forward(
|
| 1196 |
self,
|
|
|
|
| 1229 |
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
|
| 1230 |
"with a layer index."
|
| 1231 |
)
|
| 1232 |
+
kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
|
| 1233 |
|
| 1234 |
# Because the input can be padded, the absolute sequence length depends on the max position id.
|
| 1235 |
rotary_seq_len = (
|
|
|
|
| 1351 |
|
| 1352 |
kv_seq_len = key_states.shape[-2]
|
| 1353 |
if past_key_value is not None:
|
| 1354 |
+
kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
|
| 1355 |
cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
|
| 1356 |
|
| 1357 |
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
|
|
|
| 1399 |
PHI4MM_ATTENTION_CLASSES = {
|
| 1400 |
"eager": Phi4MMAttention,
|
| 1401 |
"flash_attention_2": Phi4MMFlashAttention2,
|
| 1402 |
+
"kernels-community/flash-attn2": Phi4MMFlashAttention2,
|
| 1403 |
"sdpa": Phi4MMSdpaAttention,
|
| 1404 |
}
|
| 1405 |
|
|
|
|
| 1512 |
supports_gradient_checkpointing = True
|
| 1513 |
_no_split_modules = ["Phi4MMDecoderLayer"]
|
| 1514 |
_skip_keys_device_placement = "past_key_values"
|
| 1515 |
+
_supports_flash_attn = True
|
| 1516 |
_supports_flash_attn_2 = True
|
| 1517 |
_supports_sdpa = True
|
| 1518 |
_supports_cache_class = True
|
|
|
|
| 1809 |
# to infer the attention mask.
|
| 1810 |
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
| 1811 |
using_static_cache = isinstance(past_key_values, StaticCache)
|
| 1812 |
+
using_sliding_window_cache = False # SlidingWindowCache removed in newer transformers
|
| 1813 |
|
| 1814 |
# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
|
| 1815 |
if (
|
|
|
|
| 1915 |
if config.sliding_window is not None:
|
| 1916 |
# if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
|
| 1917 |
# the check is needed to verify is current checkpoint was trained with sliding window or not
|
| 1918 |
+
if not using_sliding_window_cache or sequence_length > target_length:  # SlidingWindowCache removed; using_sliding_window_cache is always False
|
| 1919 |
sliding_attend_mask = torch.arange(target_length, device=device) <= (
|
| 1920 |
cache_position.reshape(-1, 1) - config.sliding_window
|
| 1921 |
)
|
|
|
|
| 1936 |
|
| 1937 |
|
| 1938 |
class Phi4MMForCausalLM(Phi4MMPreTrainedModel, GenerationMixin):
|
| 1939 |
+
_tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
|
| 1940 |
|
| 1941 |
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi
|
| 1942 |
def __init__(self, config):
|
|
|
|
| 1951 |
# LoRA related settings
|
| 1952 |
assert getattr(config, "vision_lora", None) is not None
|
| 1953 |
from peft import LoraConfig, get_peft_model
|
| 1954 |
+
|
| 1955 |
+
# Add a placeholder prepare_inputs_for_generation to satisfy PEFT's requirements
|
| 1956 |
+
# The actual method is defined on Phi4MMForCausalLM
|
| 1957 |
+
if not hasattr(self.model, 'prepare_inputs_for_generation'):
|
| 1958 |
+
self.model.prepare_inputs_for_generation = lambda *args, **kwargs: None
|
| 1959 |
+
|
| 1960 |
vision_lora_config = LoraConfig(
|
| 1961 |
r=config.vision_lora['r'],
|
| 1962 |
lora_alpha=config.vision_lora['lora_alpha'],
|
|
|
|
| 2142 |
|
| 2143 |
hidden_states = outputs[0]
|
| 2144 |
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
| 2145 |
+
if num_logits_to_keep is None or num_logits_to_keep == 0:
|
| 2146 |
+
logits = self.lm_head(hidden_states)
|
| 2147 |
+
else:
|
| 2148 |
+
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
| 2149 |
|
| 2150 |
loss = None
|
| 2151 |
if labels is not None:
|
speech_conformer_encoder.py
CHANGED
|
@@ -1423,7 +1423,8 @@ class NemoConvSubsampling(torch.nn.Module):
|
|
| 1423 |
raise ValueError(f"Not valid sub-sampling: {subsampling}!")
|
| 1424 |
|
| 1425 |
if subsampling in ["dw_striding", "striding"]:
|
| 1426 |
-
|
|
|
|
| 1427 |
out_length = calc_length(
|
| 1428 |
lengths=in_length,
|
| 1429 |
all_paddings=self._left_padding + self._right_padding,
|
|
@@ -1432,7 +1433,7 @@ class NemoConvSubsampling(torch.nn.Module):
|
|
| 1432 |
ceil_mode=self._ceil_mode,
|
| 1433 |
repeat_num=self._sampling_num,
|
| 1434 |
)
|
| 1435 |
-
self.out = torch.nn.Linear(conv_channels * int(out_length), feat_out)
|
| 1436 |
self.conv2d_subsampling = True
|
| 1437 |
elif subsampling in ["striding_conv1d", "dw_striding_conv1d"]:
|
| 1438 |
self.out = None
|
|
|
|
| 1423 |
raise ValueError(f"Not valid sub-sampling: {subsampling}!")
|
| 1424 |
|
| 1425 |
if subsampling in ["dw_striding", "striding"]:
|
| 1426 |
+
# Force CPU tensor to avoid meta tensor issues with device_map
|
| 1427 |
+
in_length = torch.tensor(feat_in, dtype=torch.float, device='cpu')
|
| 1428 |
out_length = calc_length(
|
| 1429 |
lengths=in_length,
|
| 1430 |
all_paddings=self._left_padding + self._right_padding,
|
|
|
|
| 1433 |
ceil_mode=self._ceil_mode,
|
| 1434 |
repeat_num=self._sampling_num,
|
| 1435 |
)
|
| 1436 |
+
self.out = torch.nn.Linear(conv_channels * int(out_length.item()), feat_out)
|
| 1437 |
self.conv2d_subsampling = True
|
| 1438 |
elif subsampling in ["striding_conv1d", "dw_striding_conv1d"]:
|
| 1439 |
self.out = None
|