| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import copy |
| from dataclasses import dataclass |
| from typing import Any, Callable, Optional, Union |
|
|
| import torch |
| from torch import nn |
|
|
| from transformers.activations import ACT2FN |
| from transformers.cache_utils import Cache, DynamicCache |
| from transformers.generation import GenerationMixin |
| from transformers.generation.configuration_utils import GenerationConfig |
| from transformers.generation.logits_process import ( |
| LogitsProcessorList, |
| RepetitionPenaltyLogitsProcessor, |
| TemperatureLogitsWarper, |
| TopKLogitsWarper, |
| TopPLogitsWarper, |
| ) |
| from transformers.generation.stopping_criteria import StoppingCriteriaList |
| from transformers.generation.streamers import BaseStreamer |
| from transformers.generation.utils import GenerateDecoderOnlyOutput |
| from transformers.integrations import use_kernel_forward_from_hub |
| from transformers.masking_utils import ( |
| create_causal_mask, |
| create_sliding_window_causal_mask, |
| ) |
| from transformers.modeling_flash_attention_utils import FlashAttentionKwargs |
| from transformers.modeling_layers import ( |
| GenericForQuestionAnswering, |
| GenericForSequenceClassification, |
| GenericForTokenClassification, |
| GradientCheckpointingLayer, |
| ) |
| from transformers.modeling_outputs import ( |
| ModelOutput, |
| ) |
| from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update |
| from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel |
| from transformers.processing_utils import Unpack |
| from transformers.pytorch_utils import isin_mps_friendly |
| from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple |
| from transformers.utils.deprecation import deprecate_kwarg |
| from transformers.utils.generic import check_model_inputs |
|
|
| from .configuration_moss_speech import MossSpeechConfig |
|
|
|
|
@dataclass
class MossSpeechModelOutputWithPast(ModelOutput):
    """Output container of the bare MossSpeech model: shared hidden states plus per-modality results.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states handed to the modality-specific blocks. `MossSpeechModel.forward` fills this with
            the output of the shared block, i.e. before any modality block or final norm is applied.
        past_key_values (*optional*):
            Key/value cache of the shared block. `MossSpeechModel.forward` only sets it when `use_cache`
            is truthy.
        hidden_states (`tuple`, *optional*):
            Not set explicitly by `MossSpeechModel.forward`; presumably filled by the output-recording
            machinery when intermediate hidden states are requested (to be confirmed).
        attentions (`tuple`, *optional*):
            Not set explicitly by `MossSpeechModel.forward`; presumably per-layer attention weights of shape
            `(batch_size, num_heads, sequence_length, sequence_length)` when requested (to be confirmed).
        last_hidden_state_dict (`dict[str, torch.Tensor]`, *optional*):
            Maps each requested modality name (e.g. "text", "audio") to the hidden states produced by that
            modality's block followed by its final norm.
        past_key_values_dict (`dict`, *optional*):
            Key/value caches per block, keyed by "shared" and by modality name. Only set when `use_cache`
            is truthy.
    """

    # NOTE: the field order is part of the interface (positional construction, tuple conversion).
    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[tuple] = None
    hidden_states: Optional[tuple] = None
    attentions: Optional[tuple] = None
    last_hidden_state_dict: Optional[dict[str, torch.Tensor]] = None
    past_key_values_dict: Optional[dict] = None
|
|
|
|
@dataclass
class MossSpeechCausalLMOutputWithPast(ModelOutput):
    """Output container of the MossSpeech causal LM, with separate text and audio results.

    Args:
        hidden_states (`dict[str, torch.Tensor]`, *optional*):
            Hidden states per modality. Keys are modality names (e.g. "text", "audio"); values are
            presumably of shape `(batch_size, sequence_length, hidden_size)`.
        past_key_values (*optional*):
            Cached key/value states that can be passed back in to speed up sequential decoding.
        attentions (`tuple`, *optional*):
            Per-layer attention weights, presumably of shape
            `(batch_size, num_heads, sequence_length, sequence_length)`.
        last_hidden_state_dict (`dict[str, torch.Tensor]`, *optional*):
            Final hidden states per modality.
        audio_loss (`torch.FloatTensor`, *optional*):
            Loss of the audio channel (presumably only available when labels are provided).
        audio_logits (`torch.FloatTensor`, *optional*):
            Prediction scores of the audio channel.
        text_loss (`torch.FloatTensor`, *optional*):
            Loss of the text channel (presumably only available when labels are provided).
        text_logits (`torch.FloatTensor`, *optional*):
            Prediction scores of the text channel.
        text_hidden_states (`torch.FloatTensor`, *optional*):
            Hidden states of the text channel.
        audio_hidden_states (`torch.FloatTensor`, *optional*):
            Hidden states of the audio channel.
        logits_all (`tuple`, *optional*):
            One logits tensor per channel. The generation code in this file takes the last position of each
            entry (`logits[:, -1, :]`) and treats index 1 as the audio channel.
        past_key_values_dict (`dict`, *optional*):
            Key/value caches per block; forwarded between decoding steps by the generation loop.
    """

    # NOTE: the field order is part of the interface (positional construction, tuple conversion).
    hidden_states: Optional[dict[str, torch.Tensor]] = None
    past_key_values: Optional[tuple] = None
    attentions: Optional[tuple] = None
    last_hidden_state_dict: Optional[dict[str, torch.Tensor]] = None
    audio_loss: Optional[torch.FloatTensor] = None
    audio_logits: Optional[torch.FloatTensor] = None
    text_loss: Optional[torch.FloatTensor] = None
    text_logits: Optional[torch.FloatTensor] = None
    text_hidden_states: Optional[torch.FloatTensor] = None
    audio_hidden_states: Optional[torch.FloatTensor] = None
    logits_all: Optional[tuple] = None
    past_key_values_dict: Optional[dict] = None
|
|
|
|
@use_kernel_forward_from_hub("RMSNorm")
class MossSpeechRMSNorm(nn.Module):
    """Root Mean Square LayerNorm used in MossSpeech.

    Normalizes over the last dimension in float32, casts back to the input dtype and then applies a
    learned per-feature scale.
    """

    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
        super().__init__()
        self.variance_epsilon = eps
        # Scale starts at one, so the freshly initialised module is a pure normalisation.
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        orig_dtype = hidden_states.dtype
        # Statistics are computed in float32 regardless of the input precision.
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(orig_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
|
|
|
|
class MossSpeechMLP(nn.Module):
    """Gated feed-forward block computing `down_proj(act_fn(gate_proj(x)) * up_proj(x))`.

    All three projections are bias-free; the activation is looked up in `ACT2FN` by `config.hidden_act`.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        hidden, inner = config.hidden_size, config.intermediate_size
        self.hidden_size = hidden
        self.intermediate_size = inner
        # Creation order (gate, up, down) is kept so parameter registration is unchanged.
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        gate = self.act_fn(self.gate_proj(x))
        return self.down_proj(gate * self.up_proj(x))
|
|
|
|
def rotate_half(x):
    """Rotate half the hidden dims of the input.

    Splits the last dimension at `size // 2` and returns the negated second part followed by the
    first part, concatenated along the last dimension.
    """
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat((-back, front), dim=-1)
|
|
|
|
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Apply Rotary Position Embeddings to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*): Deprecated and unused; accepted only so existing
            callers keep working.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension at which a singleton axis is inserted into `cos` and `sin` so they broadcast
            against `q` and `k`.

    Returns:
        `tuple(torch.Tensor)` comprising of the rotated query and key tensors.
    """
    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Same rotation for queries and keys: x * cos + rotate_half(x) * sin.
        return (states * cos) + (rotate_half(states) * sin)

    return _rotate(q), _rotate(k)
|
|
|
|
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat key/value heads to match attention heads.

    Equivalent to `torch.repeat_interleave(x, dim=1, repeats=n_rep)`, transforming
    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_attention_heads, seqlen, head_dim).
    When `n_rep == 1` the input tensor itself is returned.
    """
    # Unpacking first means a non-4D input is rejected even when no repetition is requested.
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the head axis.
    expanded = hidden_states.unsqueeze(2).expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return expanded.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
|
|
|
|
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    """Reference (non-fused) scaled dot-product attention.

    Args:
        module: Attention module; supplies `num_key_value_groups` and the `training` flag.
        query: Query states.
        key: Key states; heads are repeated `module.num_key_value_groups` times before use.
        value: Value states; repeated the same way as `key`.
        attention_mask: Optional additive mask. It is cropped along its last dimension to the key length
            and added to the scaled scores.
        scaling: Multiplier applied to the raw query-key scores.
        dropout: Dropout probability on the attention weights (active only while `module.training`).
        **kwargs: Accepted for interface compatibility with other attention backends; unused here.

    Returns:
        `(attn_output, attn_weights)`, where `attn_output` has dims 1 and 2 swapped relative to the
        attention-weighted values and is contiguous.
    """
    groups = module.num_key_value_groups
    expanded_keys = repeat_kv(key, groups)
    expanded_values = repeat_kv(value, groups)

    scores = torch.matmul(query, expanded_keys.transpose(2, 3)) * scaling
    if attention_mask is not None:
        scores = scores + attention_mask[:, :, :, : expanded_keys.shape[-2]]

    # Softmax is computed in float32 and cast back to the query dtype.
    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
    context = torch.matmul(probs, expanded_values).transpose(1, 2).contiguous()

    return context, probs
|
|
|
|
class MossSpeechAttention(nn.Module):
    """Multi-headed self-attention with grouped key/value heads, RMSNorm on per-head queries and keys,
    and rotary position embeddings.

    Layers whose entry in `config.layer_types` is "sliding_attention" forward `config.sliding_window`
    to the attention backend; every other layer forwards `None`.
    """

    def __init__(self, config: MossSpeechConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        # `head_dim` may be set explicitly on the config; otherwise it is derived from the hidden size.
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.scaling = self.head_dim**-0.5
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads

        hidden = config.hidden_size
        query_width = config.num_attention_heads * self.head_dim
        kv_width = config.num_key_value_heads * self.head_dim
        use_bias = config.attention_bias

        # Creation order (q, k, v, o, then the norms) is kept so parameter registration is unchanged.
        self.q_proj = nn.Linear(hidden, query_width, bias=use_bias)
        self.k_proj = nn.Linear(hidden, kv_width, bias=use_bias)
        self.v_proj = nn.Linear(hidden, kv_width, bias=use_bias)
        self.o_proj = nn.Linear(query_width, hidden, bias=use_bias)
        # The norms act on the head dimension only.
        self.q_norm = MossSpeechRMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = MossSpeechRMSNorm(self.head_dim, eps=config.rms_norm_eps)

        is_sliding = config.layer_types[layer_idx] == "sliding_attention"
        self.sliding_window = config.sliding_window if is_sliding else None

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Run self-attention over `hidden_states`.

        Args:
            hidden_states: Input whose last dimension is the hidden size.
            position_embeddings: `(cos, sin)` pair applied to queries and keys.
            attention_mask: Mask handed unchanged to the attention backend (may be `None`).
            past_key_values: Optional cache; when given, it is updated with this layer's keys/values under
                `self.layer_idx`, and the returned (cached + new) states are used for attention.
            cache_position: Forwarded to the cache update.
            **kwargs: Extra arguments forwarded to the attention backend.

        Returns:
            `(attn_output, attn_weights)` as produced by the backend, with `attn_output` projected back
            through `o_proj`.
        """
        leading_shape = hidden_states.shape[:-1]
        per_head_shape = (*leading_shape, -1, self.head_dim)

        # Project, split into heads, normalise q/k per head, then move the head axis in front of seq.
        query_states = self.q_norm(self.q_proj(hidden_states).view(per_head_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(per_head_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(per_head_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(
                key_states,
                value_states,
                self.layer_idx,
                {"sin": sin, "cos": cos, "cache_position": cache_position},
            )

        implementation = self.config._attn_implementation
        attention_interface: Callable = (
            eager_attention_forward if implementation == "eager" else ALL_ATTENTION_FUNCTIONS[implementation]
        )

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        # Merge the heads back into a single feature axis before the output projection.
        merged = attn_output.reshape(*leading_shape, -1).contiguous()
        return self.o_proj(merged), attn_weights
|
|
|
|
class MossSpeechDecoderLayer(GradientCheckpointingLayer):
    """Single pre-norm decoder layer: self-attention and a gated MLP, each wrapped in a residual connection."""

    def __init__(self, config: MossSpeechConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MossSpeechAttention(config=config, layer_idx=layer_idx)
        self.mlp = MossSpeechMLP(config)

        norm_eps = config.rms_norm_eps
        self.input_layernorm = MossSpeechRMSNorm(config.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = MossSpeechRMSNorm(config.hidden_size, eps=norm_eps)

        # "sliding_attention" or "full_attention", as listed in the config for this layer index.
        self.attention_type = config.layer_types[layer_idx]

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Apply the layer to `hidden_states` and return the updated hidden states.

        All arguments other than `hidden_states` are forwarded to the self-attention module; the attention
        weights it returns are discarded here.
        """
        # Attention sub-layer: normalise, attend, add back onto the un-normalised input.
        attn_output, _ = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        after_attention = hidden_states + attn_output

        # Feed-forward sub-layer with the same pre-norm + residual pattern.
        mlp_output = self.mlp(self.post_attention_layernorm(after_attention))
        return after_attention + mlp_output
|
|
|
|
class MossSpeechTransformerBlock(nn.Module):
    """A contiguous stack of decoder layers.

    Layer `i` of the block is created with layer index `start_idx + i`; that same index selects the
    layer's attention type from `config.layer_types` and therefore which causal mask it receives.
    """

    def __init__(self, config: MossSpeechConfig, start_idx: int, num_layers: int):
        super().__init__()
        self.config = config
        self.start_idx = start_idx
        self.num_layers = num_layers
        self.has_sliding_layers = "sliding_attention" in config.layer_types
        self.layers = nn.ModuleList(
            [MossSpeechDecoderLayer(config, layer_idx=start_idx + offset) for offset in range(num_layers)]
        )

    def _mask_for_layer(self, causal_masks: dict[str, torch.Tensor], layer_type: str) -> torch.Tensor:
        """Pick the mask for `layer_type`; anything other than "sliding_attention" uses the full mask."""
        mask_key = "sliding_attention" if layer_type == "sliding_attention" else "full_attention"
        return causal_masks[mask_key]

    def forward(
        self,
        hidden_states: torch.Tensor,
        *,
        causal_masks: dict[str, torch.Tensor],
        position_ids: torch.LongTensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        past_key_values: Optional[Cache],
        use_cache: bool,
        cache_position: torch.LongTensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Run `hidden_states` through every layer of the block in order and return the result.

        `causal_masks` maps "full_attention" (and "sliding_attention" when needed) to the corresponding
        mask; all remaining arguments are forwarded unchanged to each decoder layer.
        """
        for layer_idx, decoder_layer in enumerate(self.layers, start=self.start_idx):
            layer_mask = self._mask_for_layer(causal_masks, self.config.layer_types[layer_idx])
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=layer_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )
        return hidden_states
|
|
|
|
@auto_docstring
class MossSpeechPreTrainedModel(PreTrainedModel):
    # Base class of the MossSpeech models; it only declares the capability flags read by the
    # `PreTrainedModel` machinery.
    config: MossSpeechConfig
    base_model_prefix = "model"

    # Training / device-placement behaviour.
    supports_gradient_checkpointing = True
    _no_split_modules = ["MossSpeechDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]

    # Attention backends this architecture declares support for.
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    _can_compile_fullgraph = True

    # Module classes associated with each optional recorded output.
    _can_record_outputs = {
        "hidden_states": MossSpeechDecoderLayer,
        "attentions": MossSpeechAttention,
    }
|
|
|
|
class MossSpeechRotaryEmbedding(nn.Module):
    """Computes the rotary `(cos, sin)` tables for a batch of position ids.

    The inverse frequencies and the attention scaling factor come from the `ROPE_INIT_FUNCTIONS` entry
    selected by the config's `rope_scaling` ("default" when no scaling dict is configured).
    """

    inv_freq: torch.Tensor

    def __init__(self, config: MossSpeechConfig, device=None):
        super().__init__()

        rope_scaling = getattr(config, "rope_scaling", None)
        if isinstance(rope_scaling, dict):
            # Prefer the "rope_type" key and fall back to the "type" key.
            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
        else:
            self.rope_type = "default"

        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        # Non-persistent: the buffer is rebuilt from the config rather than stored in checkpoints.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`.

        `x` is only used for its device and dtype. `position_ids` is indexed as a 2-D tensor whose first
        dimension is the batch.
        """
        batch = position_ids.shape[0]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        # Remember the longest sequence seen so far.
        # NOTE(review): `.item()` forces a host synchronisation on every call and looks at odds with
        # `_can_compile_fullgraph = True` on the model base class; kept as-is to preserve behaviour.
        seq_len = position_ids_expanded.max().item() + 1
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = int(seq_len)

        # Autocast is disabled below so the angles are computed in float32; "mps" (and any non-string
        # device type) is mapped to "cpu" for the autocast context.
        device_type = x.device.type
        if not isinstance(device_type, str) or device_type == "mps":
            device_type = "cpu"

        with torch.autocast(device_type=device_type, enabled=False):
            angles = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            # Duplicate the angles so the table spans both halves of the rotary dimension.
            table = torch.cat((angles, angles), dim=-1)
            cos = table.cos() * self.attention_scaling
            sin = table.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
|
|
|
class MossSpeechModel(MossSpeechPreTrainedModel):
    """The bare MossSpeech decoder-only Transformer with shared and modality-specific blocks.

    Inputs are embedded and passed through `shared_block`; the shared output is then fed independently
    to the block and final RMSNorm of every requested modality ("text" and/or "audio"). The model returns
    the shared hidden states together with a dictionary of per-modality hidden states.
    """

    def __init__(self, config: MossSpeechConfig):
        super().__init__(config)
        # Fall back to the modality pad id when the config defines no regular pad token.
        pad_idx = (
            config.pad_token_id if getattr(config, "pad_token_id", None) is not None else config.modality_pad_token_id
        )
        self.padding_idx = pad_idx
        self.vocab_size = config.vocab_size

        # Separate embedding tables for the text and audio vocabularies.
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
        self.audio_embed = nn.Embedding(config.audio_vocab_size, config.hidden_size)
        self.rotary_emb = MossSpeechRotaryEmbedding(config)
        self.gradient_checkpointing = False
        self.has_sliding_layers = "sliding_attention" in config.layer_types

        self.shared_block = MossSpeechTransformerBlock(config, start_idx=0, num_layers=config.num_shared_layers)

        # Each modality block keeps its own cache, so its layer indices start again at 0.
        self.text_block = MossSpeechTransformerBlock(
            config,
            start_idx=0,
            num_layers=config.num_modality_layers,
        )
        self.audio_block = MossSpeechTransformerBlock(
            config,
            start_idx=0,
            num_layers=config.num_modality_layers,
        )

        self.text_norm = MossSpeechRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.audio_norm = MossSpeechRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Name -> module lookups used for routing by modality.
        self.modality_blocks = nn.ModuleDict(
            {
                "text": self.text_block,
                "audio": self.audio_block,
            }
        )
        self.modality_norms = nn.ModuleDict(
            {
                "text": self.text_norm,
                "audio": self.audio_norm,
            }
        )

        self.post_init()

    def _route_block(self, modality: str):
        """Return the Transformer block for the given modality.

        Raises:
            ValueError: If `modality` is not a registered modality name.
        """
        if modality not in self.modality_blocks:
            raise ValueError(f"Unknown modality: {modality}")
        return self.modality_blocks[modality]

    def _route_final_layer_norm(self, modality: str):
        """Return the final layer normalization for the given modality.

        Raises:
            ValueError: If `modality` is not a registered modality name.
        """
        if modality not in self.modality_norms:
            raise ValueError(f"Unknown modality: {modality}")
        return self.modality_norms[modality]

    def _embed_inputs(self, input_ids: torch.LongTensor) -> torch.Tensor:
        """Embed plain `(batch, seq)` text ids or packed `(batch, 2, seq)` text/audio ids.

        For packed inputs, row 0 holds text ids and row 1 audio ids. A position is embedded with the text
        table unless its text id equals `config.modality_pad_token_id`, in which case the audio embedding
        is used instead.

        Raises:
            ValueError: If the input has an unsupported shape, or packed inputs are used while
                `modality_pad_token_id` is unset or zero.
        """
        if input_ids.dim() == 2:
            return self.embed_tokens(input_ids)
        if input_ids.dim() != 3 or input_ids.shape[1] != 2:
            raise ValueError("invalid inputs for embedding construction")

        text_ids = input_ids[:, 0, :]
        audio_ids_packed = input_ids[:, 1, :]

        # A missing or `None` pad id is reported with the same ValueError as a zero one, instead of
        # failing inside `int(None)` with an unrelated TypeError.
        raw_pad_id = getattr(self.config, "modality_pad_token_id", None)
        pad_id = 0 if raw_pad_id is None else int(raw_pad_id)
        if pad_id == 0:
            raise ValueError("Expected a non-zero modality_pad_token_id for packed inputs")

        # Swap the pad id for index 0 before the text lookup; those positions are multiplied by zero
        # in the blend below, so the substituted embedding never reaches the output.
        text_ids_safe = text_ids.masked_fill(text_ids == pad_id, 0)

        text_embeds = self.embed_tokens(text_ids_safe)
        audio_embeds = self.audio_embed(audio_ids_packed)
        selection_mask = (text_ids != pad_id).unsqueeze(-1).to(dtype=text_embeds.dtype)
        return text_embeds * selection_mask + audio_embeds * (1 - selection_mask)

    def _new_block_cache(self, block: MossSpeechTransformerBlock) -> DynamicCache:
        """Create an empty cache whose per-layer types are exactly those of the layers in `block`."""
        cache_config = copy.deepcopy(self.config)
        first = block.start_idx
        cache_config.layer_types = cache_config.layer_types[first : first + block.num_layers]
        return DynamicCache(config=cache_config)

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        modalities: list[str],
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[dict] = None,
        past_key_values_dict: Optional[dict] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MossSpeechModelOutputWithPast:
        r"""
        Args:
            modalities (List[str]): Modalities to compute in this forward pass.
            past_key_values_dict (Optional[dict]): KV cache per block when using `use_cache=True`.
        """
        # Exactly one of `input_ids` / `inputs_embeds` must be given, so `input_ids` is guaranteed to be
        # set whenever `inputs_embeds` is missing.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self._embed_inputs(input_ids)

        # Work on a private copy so the caller's dictionary is never mutated.
        past_key_values_dict = {} if past_key_values_dict is None else dict(past_key_values_dict)

        # `past_key_values` may be a single cache (for the shared block) or a dict of caches.
        shared_cache: Optional[Cache] = None
        if isinstance(past_key_values, dict):
            past_key_values_dict.update(past_key_values)
            shared_cache = past_key_values.get("shared")
        elif isinstance(past_key_values, Cache):
            shared_cache = past_key_values
        elif past_key_values is not None:
            raise TypeError("past_key_values must be either a Cache instance, a dict of caches, or None")

        if shared_cache is None:
            shared_cache = past_key_values_dict.get("shared")

        if use_cache:
            if shared_cache is None:
                shared_cache = self._new_block_cache(self.shared_block)
            past_key_values_dict["shared"] = shared_cache

            # Every modality gets a cache, including those not computed in this call. The cache layer
            # types are derived from the block itself so that sliding/full cache layers always match the
            # attention type each decoder layer actually uses.
            for modality, block in self.modality_blocks.items():
                if not isinstance(past_key_values_dict.get(modality), Cache):
                    past_key_values_dict[modality] = self._new_block_cache(block)
        else:
            shared_cache = None

        if cache_position is None:
            past_seen_tokens = shared_cache.get_seq_length() if isinstance(shared_cache, Cache) else 0
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # A dict passed as `attention_mask` is taken as an already prepared mask mapping.
        if isinstance(attention_mask, dict):
            causal_mask_mapping = attention_mask
        else:
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": shared_cache if isinstance(shared_cache, Cache) else None,
                "position_ids": position_ids,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            if self.has_sliding_layers:
                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)

        hidden_states = inputs_embeds

        # Rotary tables are computed once and shared by all blocks.
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        hidden_states = self.shared_block(
            hidden_states,
            causal_masks=causal_mask_mapping,
            position_ids=position_ids,
            position_embeddings=position_embeddings,
            past_key_values=shared_cache if use_cache else None,
            use_cache=bool(use_cache),
            cache_position=cache_position,
            **kwargs,
        )

        # Each requested modality runs its own block and final norm on the shared output.
        last_hidden_state_dict = {}
        for modality in modalities:
            mod_block = self._route_block(modality)
            mod_norm = self._route_final_layer_norm(modality)
            mod_cache = past_key_values_dict.get(modality) if use_cache else None

            mod_hidden_states = mod_block(
                hidden_states,
                causal_masks=causal_mask_mapping,
                position_ids=position_ids,
                position_embeddings=position_embeddings,
                past_key_values=mod_cache,
                use_cache=bool(use_cache),
                cache_position=cache_position,
                **kwargs,
            )
            last_hidden_state_dict[modality] = mod_norm(mod_hidden_states)

        return MossSpeechModelOutputWithPast(
            last_hidden_state=hidden_states,
            last_hidden_state_dict=last_hidden_state_dict,
            past_key_values=shared_cache if use_cache else None,
            past_key_values_dict=past_key_values_dict if use_cache else None,
        )
|
|
|
|
| class MossSpeechGenerationMixin(GenerationMixin): |
| """Generation mixin for MossSpeech model with two-channel (text/audio) support.""" |
|
|
def _setup_processors(self, generation_config: GenerationConfig, modalities: int) -> list[LogitsProcessorList]:
    """Setup per-channel logits processors based on the generation config.

    Returns one `LogitsProcessorList` per channel (`modalities` of them). When `generation_config` has a
    `layers` attribute, its i-th entry configures the i-th channel; entries beyond the number of channels
    are ignored. For each entry, the options that are not `None` add a processor, always in the order
    repetition penalty, temperature, top-k, top-p.
    """
    per_channel = [LogitsProcessorList() for _ in range(modalities)]
    if not hasattr(generation_config, "layers"):
        return per_channel

    # (option key, processor class, constructor keyword), in the order processors are appended.
    processor_specs = (
        ("repetition_penalty", RepetitionPenaltyLogitsProcessor, "penalty"),
        ("temperature", TemperatureLogitsWarper, "temperature"),
        ("top_k", TopKLogitsWarper, "top_k"),
        ("top_p", TopPLogitsWarper, "top_p"),
    )

    for channel_processors, layer_config in zip(per_channel, generation_config.layers):
        for option, processor_cls, keyword in processor_specs:
            value = layer_config.get(option)
            if value is not None:
                channel_processors.append(processor_cls(**{keyword: value}))

    return per_channel
|
|
def _generate_next_tokens_with_scores(
    self,
    logits_all: tuple[torch.Tensor, ...],
    input_ids: torch.LongTensor,
    realprocessor: list[LogitsProcessorList],
    do_samples: list[bool],
    generation_config: GenerationConfig,
    generating_length: int,
) -> tuple[torch.LongTensor, tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]]:
    """Generate next tokens for all channels with scores and logits.

    Args:
        logits_all: One logits tensor per channel; only the last position (`[:, -1, :]`) is used.
            Channel 1 is treated as the audio channel.
        input_ids: Tokens generated so far, indexed as `(batch, sequence, channel)`.
        realprocessor: One logits-processor list per channel.
        do_samples: Per channel, whether to sample (`True`) or take the argmax (`False`).
        generation_config: Read for `min_new_tokens`; a missing or `None` value means no minimum.
        generating_length: Number of decoding steps taken so far, including the current one.

    Returns:
        `(next_tokens, next_token_scores, next_token_logits)`: the chosen tokens stacked along a trailing
        channel dimension, the processed per-channel scores, and the per-channel float32 logits after
        the audio masking below.
    """
    # Last audio-channel id that may ever be generated; everything above it is always masked out.
    # While the minimum length has not been reached, this id itself is masked as well
    # (presumably it is the token that ends the audio stream; confirm against the tokenizer).
    audio_last_valid_id = 16384

    # Fresh float32 copies on the input device, so the in-place masking below never touches model outputs.
    next_token_logits = tuple(logits[:, -1, :].clone().float().to(input_ids.device) for logits in logits_all)

    audio_logits = next_token_logits[1]
    audio_logits[:, audio_last_valid_id + 1 :] = -torch.inf

    # `GenerationConfig.min_new_tokens` defaults to None, so the attribute usually exists without holding
    # a number; comparing an int against None would raise a TypeError.
    min_new_tokens = getattr(generation_config, "min_new_tokens", None)
    if min_new_tokens is not None and generating_length < min_new_tokens:
        audio_logits[:, audio_last_valid_id] = -torch.inf

    # Each channel's processors see only that channel's token history.
    next_token_scores = tuple(
        realprocessor[i](input_ids[:, :, i], logits) for i, logits in enumerate(next_token_logits)
    )

    next_tokens = []
    for i, channel_score in enumerate(next_token_scores):
        if do_samples[i]:
            probabilities = nn.functional.softmax(channel_score, dim=-1)
            channel_ntk = torch.multinomial(probabilities, num_samples=1).squeeze(1)
        else:
            channel_ntk = torch.argmax(channel_score, dim=-1)
        next_tokens.append(channel_ntk)

    return torch.stack(next_tokens, dim=-1), next_token_scores, next_token_logits
|
|
| def _process_multi_modality_tokens( |
| self, |
| next_tokens: torch.LongTensor, |
| current_modality: torch.Tensor, |
| modality_pad_token: Union[int, torch.Tensor], |
| ) -> torch.LongTensor: |
| """Process tokens for MossSpeech generation.""" |
|
|
| mask = current_modality == 1 |
| if mask.any(): |
| pad_value = modality_pad_token.item() if torch.is_tensor(modality_pad_token) else modality_pad_token |
| next_tokens[mask, 0] = int(pad_value) |
|
|
| return next_tokens |
|
|
| def _sample( |
| self, |
| input_ids: torch.LongTensor, |
| logits_processor: LogitsProcessorList, |
| stopping_criteria: StoppingCriteriaList, |
| generation_config: GenerationConfig, |
| synced_gpus: bool, |
| streamer: Optional[BaseStreamer], |
| **model_kwargs, |
| ) -> Union[GenerateDecoderOnlyOutput, torch.LongTensor]: |
| """Sampling implementation for MossSpeech with text and audio modalities.""" |
|
|
| |
| modality_pad_token = generation_config._pad_token_tensor |
| if modality_pad_token is None: |
| pad_fallback = getattr(self.config, "modality_pad_token_id", None) |
| if pad_fallback is None: |
| pad_fallback = getattr(self.config, "pad_token_id", None) |
| if pad_fallback is None: |
| raise ValueError( |
| "MossSpeech generation requires a pad token id; please set it on the config or generation config." |
| ) |
| modality_pad_token = torch.tensor( |
| pad_fallback, |
| device=input_ids.device, |
| dtype=input_ids.dtype, |
| ) |
| else: |
| modality_pad_token = modality_pad_token.to(device=input_ids.device, dtype=input_ids.dtype) |
|
|
| audio_pad_token_id = getattr(self.config, "audio_pad_token_id", None) |
| sosp_token_id = getattr(self.config, "sosp_token_id", None) |
| eosp_token_id = getattr(self.config, "eosp_token_id", None) |
|
|
| output_attentions = generation_config.output_attentions |
| output_hidden_states = generation_config.output_hidden_states |
| output_scores = generation_config.output_scores |
| output_logits = generation_config.output_logits |
| return_dict_in_generate = generation_config.return_dict_in_generate |
| global_do_sample = generation_config.do_sample |
|
|
| scores = () if (return_dict_in_generate and output_scores) else None |
| raw_logits = () if (return_dict_in_generate and output_logits) else None |
| decoder_attentions = () if (return_dict_in_generate and output_attentions) else None |
| decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None |
|
|
| batch_size, cur_len, input_modalities = input_ids.shape |
| this_peer_finished = False |
| unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) |
|
|
| model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs) |
|
|
| if hasattr(generation_config, "do_samples") and generation_config.do_samples is not None: |
| per_modality_do_sample = generation_config.do_samples |
| logits_processors = self._setup_processors(generation_config, input_modalities) |
| else: |
| per_modality_do_sample = [global_do_sample for _ in range(input_modalities)] |
| logits_processors = [logits_processor for _ in range(input_modalities)] |
|
|
| current_modality = torch.zeros((batch_size,), dtype=torch.long, device=input_ids.device) |
|
|
| |
| if self.config.modality_pad_token_id is not None: |
| audio_mask = input_ids[:, -1, 0] == self.config.modality_pad_token_id |
| current_modality[audio_mask] = 1 |
| if audio_pad_token_id is not None: |
| text_mask = input_ids[:, -1, 1] == audio_pad_token_id |
| current_modality[text_mask] = 0 |
|
|
| generating_length = 0 |
| while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): |
| generating_length += 1 |
| model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) |
| if output_attentions: |
| model_inputs["output_attentions"] = output_attentions |
| if output_hidden_states: |
| model_inputs["output_hidden_states"] = output_hidden_states |
| if "past_key_values_dict" in model_kwargs: |
| model_inputs["past_key_values_dict"] = model_kwargs["past_key_values_dict"] |
|
|
| if sosp_token_id is not None: |
| text_mode_mask = current_modality == 0 |
| text_to_audio_mask = text_mode_mask & (input_ids[:, -1, 0] == sosp_token_id) |
| current_modality[text_to_audio_mask] = 1 |
| if eosp_token_id is not None: |
| audio_mode_mask = current_modality == 1 |
| audio_to_text_mask = audio_mode_mask & (input_ids[:, -1, 1] == eosp_token_id) |
| current_modality[audio_to_text_mask] = 0 |
|
|
| outputs = self(**model_inputs, return_dict=True) |
| model_kwargs = self._update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False) |
| if outputs.past_key_values_dict is not None: |
| model_kwargs["past_key_values_dict"] = outputs.past_key_values_dict |
|
|
| if synced_gpus and this_peer_finished: |
| continue |
|
|
| next_tokens, next_token_scores, next_token_logits = self._generate_next_tokens_with_scores( |
| outputs.logits_all, |
| input_ids, |
| logits_processors, |
| per_modality_do_sample, |
| generation_config, |
| generating_length, |
| ) |
| next_tokens = self._process_multi_modality_tokens( |
| next_tokens, |
| current_modality, |
| modality_pad_token, |
| ) |
|
|
| input_ids = torch.cat([input_ids, next_tokens[:, None, :]], dim=1) |
| if streamer is not None: |
| streamer.put(next_tokens[:, 0].cpu()) |
|
|
| stopping = stopping_criteria(input_ids[:, :, 0], scores) |
|
|
| unfinished_sequences = unfinished_sequences & ~stopping |
| this_peer_finished = unfinished_sequences.max() == 0 |
|
|
| if return_dict_in_generate: |
| if output_scores: |
| scores += (next_token_scores,) |
| if output_logits: |
| raw_logits += (next_token_logits,) |
| if output_attentions: |
| decoder_attentions += (outputs.attentions,) |
| if output_hidden_states: |
| decoder_hidden_states += (outputs.hidden_states,) |
|
|
| cur_len += 1 |
|
|
| if streamer is not None: |
| streamer.end() |
|
|
| if return_dict_in_generate: |
| return GenerateDecoderOnlyOutput( |
| sequences=input_ids, |
| scores=scores, |
| logits=raw_logits, |
| attentions=decoder_attentions, |
| hidden_states=decoder_hidden_states, |
| past_key_values=model_kwargs.get("past_key_values"), |
| ) |
|
|
| return input_ids |
|
|
def generate(
    self,
    input_ids: Optional[torch.Tensor] = None,
    output_only: bool = True,
    **kwargs,
):
    """Run the parent ``generate`` and optionally strip the prompt from the result.

    Args:
        input_ids: Prompt token ids. When provided it must be 3-D; its second
            dimension is taken as the prompt length and the generated sequences
            are sliced along that dimension.
        output_only: When True, the first ``input_ids.shape[1]`` positions are
            removed from the returned sequences so only newly generated
            positions remain.
        **kwargs: Forwarded unchanged to the parent ``generate``.

    Returns:
        The parent's return value. If it is a tensor, the (optionally sliced)
        tensor is returned; otherwise its ``"sequences"`` entry is replaced by
        the sliced tensor and the same output object is returned.

    Raises:
        ValueError: If ``input_ids`` is provided but is not a 3-D tensor.
    """
    if input_ids is not None and input_ids.dim() != 3:
        raise ValueError(
            f"`input_ids` must be a 3-D tensor, but got a tensor with {input_ids.dim()} dimension(s)."
        )
    # Without explicit input ids there is no prompt to strip from the output.
    prompt_len = 0 if input_ids is None else input_ids.shape[1]

    outputs = super().generate(input_ids, **kwargs)
    if not output_only or prompt_len == 0:
        return outputs

    # Decide from the returned object rather than from `kwargs`: the
    # `return_dict_in_generate` flag may also be set through a generation
    # config, in which case it never shows up in `kwargs`.
    if isinstance(outputs, torch.Tensor):
        return outputs[:, prompt_len:, :]

    outputs["sequences"] = outputs["sequences"][:, prompt_len:, :]
    return outputs
|
|
| def _prepare_attention_mask_for_generation( |
| self, |
| inputs_tensor: torch.Tensor, |
| generation_config: GenerationConfig, |
| model_kwargs: dict[str, Any], |
| ) -> torch.LongTensor: |
| pad_token_id = generation_config._pad_token_tensor |
| eos_token_id = generation_config._eos_token_tensor |
|
|
| |
| if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0: |
| inputs_tensor = model_kwargs["input_ids"] |
|
|
| |
| if len(inputs_tensor.shape) == 3 and inputs_tensor.shape[1] == 2: |
| |
| default_attention_mask = torch.ones( |
| (inputs_tensor.shape[0], inputs_tensor.shape[2]), |
| dtype=torch.long, |
| device=inputs_tensor.device, |
| ) |
| else: |
| |
| default_attention_mask = torch.ones( |
| inputs_tensor.shape[:2], |
| dtype=torch.long, |
| device=inputs_tensor.device, |
| ) |
| if pad_token_id is None: |
| return default_attention_mask |
|
|
| is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long] |
| if not is_input_ids: |
| return default_attention_mask |
|
|
| is_pad_token_in_inputs = (pad_token_id is not None) and ( |
| isin_mps_friendly(elements=inputs_tensor, test_elements=pad_token_id).any() |
| ) |
| is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~( |
| isin_mps_friendly(elements=eos_token_id, test_elements=pad_token_id).any() |
| ) |
| can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id |
| attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long() |
|
|
| attention_mask = ( |
| attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask |
| ) |
| return attention_mask |
|
|
|
|
@auto_docstring
class MossSpeechForCausalLM(MossSpeechPreTrainedModel, MossSpeechGenerationMixin):
    # Causal LM wrapper around `MossSpeechModel` with one bias-free output head
    # per modality: a text head over `config.vocab_size` and an audio head over
    # `config.audio_vocab_size`.
    _tied_weights_keys = ["text_lm_head.weight", "audio_lm_head.weight"]
    _tp_plan = {"text_lm_head": "colwise_rep", "audio_lm_head": "colwise_rep"}
    _pp_plan = {
        "text_lm_head": (["hidden_states"], ["logits"]),
        "audio_lm_head": (["hidden_states"], ["logits"]),
    }

    def __init__(self, config):
        super().__init__(config)
        self.model = MossSpeechModel(config)
        self.vocab_size = config.vocab_size
        self.text_lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.audio_lm_head = nn.Linear(config.hidden_size, config.audio_vocab_size, bias=False)
        # Lookup table by modality name; both heads are also plain attributes
        # above, which is what registers them as submodules.
        self.modality_lm_head = {
            "text": self.text_lm_head,
            "audio": self.audio_lm_head,
        }

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[torch.Tensor] = None,
        past_key_values_dict: Optional[dict] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MossSpeechCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, 2, sequence_length)` or `(batch_size, 2*sequence_length)`, *optional*):
            Labels for computing the language modeling losses. Row 0 along the second dimension holds the text
            labels (indices in `[0, ..., config.vocab_size]` or -100) and row 1 holds the audio labels (indices in
            `[0, ..., config.audio_vocab_size]` or -100). Tokens with indices set to `-100` are ignored (masked).
            A 2-D tensor is reshaped to `(batch_size, 2, -1)` before use.

        past_key_values_dict (Optional[dict]): KV cache for each block.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MossSpeechForCausalLM

        >>> model = MossSpeechForCausalLM.from_pretrained("Qwen/MossSpeech-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/MossSpeech-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```
        """
        # `input_ids` may legitimately be None (e.g. when `inputs_embeds` is
        # given), so only reshape/transpose when ids are actually present.
        if input_ids is not None:
            if input_ids.dim() == 2:
                input_ids = input_ids.reshape(input_ids.shape[0], 2, -1)
            # Swap dims 1 and 2 before handing the ids to the backbone.
            input_ids = input_ids.transpose(1, 2)

        outputs: MossSpeechModelOutputWithPast = self.model(
            modalities=["text", "audio"],
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            past_key_values_dict=past_key_values_dict,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        text_hidden_states = outputs.last_hidden_state_dict["text"]
        audio_hidden_states = outputs.last_hidden_state_dict["audio"]

        # Only compute necessary logits: an int keeps the last `logits_to_keep`
        # positions (0 keeps all, since `slice(0, None)` is the full range); a
        # tensor is used directly as an index.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        text_logits = self.modality_lm_head["text"](text_hidden_states[:, slice_indices, :])
        audio_logits = self.modality_lm_head["audio"](audio_hidden_states[:, slice_indices, :])

        text_loss = None
        audio_loss = None
        if labels is not None:
            if labels.dim() == 2:
                # `reshape` is not in-place: the result has to be rebound,
                # otherwise the 3-D indexing below fails on flat labels.
                labels = labels.reshape(labels.shape[0], 2, -1)
            text_labels = labels[:, 0, :]
            audio_labels = labels[:, 1, :]
            text_loss = self.loss_function(
                logits=text_logits,
                labels=text_labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )
            # The audio head projects to `audio_vocab_size`, so the loss must
            # be told that size rather than the text vocabulary size.
            audio_loss = self.loss_function(
                logits=audio_logits,
                labels=audio_labels,
                vocab_size=self.config.audio_vocab_size,
                **kwargs,
            )

        return MossSpeechCausalLMOutputWithPast(
            text_loss=text_loss,
            audio_loss=audio_loss,
            text_logits=text_logits,
            audio_logits=audio_logits,
            logits_all=(text_logits, audio_logits),
            past_key_values=outputs.past_key_values,
            past_key_values_dict=outputs.past_key_values_dict,
            text_hidden_states=text_hidden_states,
            audio_hidden_states=audio_hidden_states,
            attentions=outputs.attentions,
        )
|
|
|
|
class MossSpeechForSequenceClassification(GenericForSequenceClassification, MossSpeechPreTrainedModel):
    """Sequence classification model; adds nothing beyond its two base classes."""
|
|
|
|
class MossSpeechForTokenClassification(GenericForTokenClassification, MossSpeechPreTrainedModel):
    """Token classification model; adds nothing beyond its two base classes."""
|
|
|
|
class MossSpeechForQuestionAnswering(GenericForQuestionAnswering, MossSpeechPreTrainedModel):
    """Question answering model; only overrides the base-model prefix."""

    # NOTE(review): the causal LM in this file stores its backbone under
    # `model`, while this prefix is "transformer" — confirm this is intended.
    base_model_prefix = "transformer"
|
|
|
|
# Names exported by `from <module> import *`.
__all__ = [
    "MossSpeechForCausalLM",
    "MossSpeechForQuestionAnswering",
    "MossSpeechPreTrainedModel",
    "MossSpeechModel",
    "MossSpeechForSequenceClassification",
    "MossSpeechForTokenClassification",
]
|
|