Ubuntu committed on
Commit ·
d4fc678
1
Parent(s): 5e69ce3
Handling None type attention-mask
Browse files- modeling_hinvec.py +3 -0
modeling_hinvec.py
CHANGED
|
@@ -61,6 +61,9 @@ class BidirectionalMistralModel(MistralModel):
|
|
| 61 |
else:
|
| 62 |
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
|
| 63 |
|
|
|
|
|
|
|
|
|
|
| 64 |
if self.gradient_checkpointing and self.training and use_cache:
|
| 65 |
logger.warning_once(
|
| 66 |
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
|
|
|
|
| 61 |
else:
|
| 62 |
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
|
| 63 |
|
| 64 |
+
if attention_mask is None:
|
| 65 |
+
attention_mask = torch.ones(batch_size, seq_length, dtype=input_ids.dtype, device=input_ids.device)
|
| 66 |
+
|
| 67 |
if self.gradient_checkpointing and self.training and use_cache:
|
| 68 |
logger.warning_once(
|
| 69 |
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
|