klemenk committed on
Commit
ee0c1d2
·
verified ·
1 Parent(s): f47bc71

Upload AuriStream base model code

Browse files
configuration_auristream.py CHANGED
@@ -50,9 +50,6 @@ class AuriStreamConfig(PretrainedConfig):
50
  input_conv_kernel_size: int = 0,
51
  **kwargs,
52
  ):
53
- # Force no weight tying
54
- kwargs["tie_word_embeddings"] = False
55
-
56
  self.vocab_size = vocab_size
57
  self.n_embd = n_embd
58
  self.n_layer = n_layer
 
50
  input_conv_kernel_size: int = 0,
51
  **kwargs,
52
  ):
 
 
 
53
  self.vocab_size = vocab_size
54
  self.n_embd = n_embd
55
  self.n_layer = n_layer
modeling_auristream.py CHANGED
@@ -8,14 +8,22 @@ https://huggingface.co/TuKoResearch/WavCochCausalV8192
8
  """
9
 
10
  import math
11
- from typing import Optional, List
 
 
 
12
 
13
  import torch
14
  import torch.nn as nn
15
  from torch.nn import functional as F
16
 
17
  from transformers import PreTrainedModel
18
- from transformers.modeling_outputs import CausalLMOutput, BaseModelOutput
 
 
 
 
 
19
 
20
  from .configuration_auristream import AuriStreamConfig
21
 
@@ -67,7 +75,7 @@ def apply_rotary_emb(x, cos, sin):
67
  x2 = x[..., d:]
68
  y1 = x1 * cos + x2 * sin
69
  y2 = x1 * (-sin) + x2 * cos
70
- return torch.cat([y1, y2], dim=3)
71
 
72
 
73
  class CausalSelfAttention(nn.Module):
@@ -225,6 +233,11 @@ class AuriStreamPreTrainedModel(PreTrainedModel):
225
  base_model_prefix = "model"
226
  supports_gradient_checkpointing = True
227
  _no_split_modules = ["Block"]
 
 
 
 
 
228
 
229
  def _init_weights(self, module):
230
  if isinstance(module, nn.Linear):
@@ -240,8 +253,7 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
240
  AuriStream speech language model.
241
 
242
  A GPT-like transformer model for cochlear token prediction with optional
243
- multi-token prediction (MTP) heads for improved representation learning and
244
- novel inference capabilities.
245
 
246
  Developed by Greta Tuckute and Klemen Kotar.
247
  """
@@ -267,11 +279,10 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
267
  else:
268
  self.future_heads = None
269
 
270
- # "Standard" LM output head
271
  self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
272
 
273
- # Initialize weights
274
- self.apply(self._init_weights)
275
  # Apply special scaled init to residual projections
276
  for pn, p in self.named_parameters():
277
  if pn.endswith('c_proj.weight'):
@@ -291,11 +302,8 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
291
  self,
292
  input_ids: Optional[torch.LongTensor] = None,
293
  labels: Optional[torch.LongTensor] = None,
294
- output_logits: Optional[bool] = False,
295
  output_hidden_states: Optional[bool] = False,
296
  return_dict: Optional[bool] = True,
297
- up_until_layer: Optional[int] = None,
298
- normalize_embeddings: Optional[str] = None,
299
  # Legacy arguments for compatibility
300
  seq: Optional[torch.LongTensor] = None,
301
  tgt: Optional[torch.LongTensor] = None,
@@ -306,27 +314,13 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
306
  Args:
307
  input_ids: Input token IDs of shape (batch_size, seq_len)
308
  labels: Target token IDs for computing loss
309
- output_logits: Whether to return all logits (including from future heads).
310
- The first element corresponds to the standard next-token head (prediction of i+1);
311
- subsequent elements correspond to future heads predicting tokens i+2, i+3, etc.
312
- output_hidden_states: Whether to return all hidden states, including the input
313
- embedding state and final pre-ln_f state. Matches HuggingFace GPT-style.
314
- return_dict: Whether to return a dict or tuple. If True, return a CausalLMOutput dict,
315
- otherwise return a tuple.
316
- up_until_layer: If set, stop the forward pass after this transformer block
317
- (inclusive) and return intermediate activations. Useful for saving compute.
318
- normalize_embeddings: 'l2' or 'learned' to normalize hidden states
319
- seq: Legacy argument (alias for input_ids for backward compatibility)
320
- tgt: Legacy argument (alias for labels for backward compatibility)
321
 
322
  Returns:
323
- If return_dict is True:
324
- CausalLMOutput with fields:
325
- • loss (optional): Scalar training loss
326
- • logits: Tensor or list of tensors of prediction logits
327
- • hidden_states (optional): Tuple of hidden states
328
- Otherwise:
329
- Tuple of (logits or list of logits, loss).
330
  """
331
  # Handle legacy arguments
332
  if seq is not None:
@@ -338,64 +332,25 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
338
  tok_emb = self.wte(input_ids)
339
  x = self.drop(tok_emb)
340
 
341
- # Collect hidden states
342
  all_hidden_states = []
343
 
344
  # Forward through transformer blocks
345
- for block_idx, block in enumerate(self.h):
346
- all_hidden_states.append(x)
347
- if up_until_layer is not None and block_idx == up_until_layer:
348
- break
349
  x = block(x)
350
 
351
- # Append final pre-ln_f state if we didn't exit early
352
- if up_until_layer is None or block_idx == len(self.h) - 1:
353
  all_hidden_states.append(x)
354
 
355
- # Normalize hidden states if requested
356
- hs_to_return = all_hidden_states
357
- if output_hidden_states and normalize_embeddings is not None:
358
- if normalize_embeddings == 'l2': # Preserve direction, get rid of magnitude
359
- hs_to_return = [F.normalize(h, p=2, dim=-1) for h in all_hidden_states] # Dim -1 is the hidden state dim;
360
- # after normalization torch.norm(h_norm, p=2, dim=-1) will be 1. I.e. for every token, the hidden state dim norm is 1.
361
- elif normalize_embeddings == 'learned': # We use the learned RMSNorm (first one; used to prepare embeddings for attn)
362
- # I.e. these are the representations on which the model computes.
363
- hs_to_return = []
364
- L = len(self.h)
365
- for i, h in enumerate(all_hidden_states):
366
- if i < L:
367
- hs_to_return.append(self.h[i].norm1(h))
368
- else:
369
- hs_to_return.append(self.ln_f(h)) # Final layer norm (after the main blocks, before LM head(s))
370
-
371
- # If only hidden states requested (not logits), return early
372
- if output_hidden_states and not output_logits and labels is None:
373
- return BaseModelOutput(
374
- last_hidden_state=x,
375
- hidden_states=hs_to_return,
376
- )
377
-
378
  # Final layer norm and output head
379
  x = self.ln_f(x)
380
  logits = self.lm_head(x)
381
 
382
- # Collect all logits if requested
383
- all_logits = [logits] if output_logits else None
384
-
385
- # Compute future head logits
386
- # lm_head is the first "standard" lm head which predicts token i+1 (as all GPT models have)
387
- # self.future_heads holds all the other "MTP" future prediction heads, so self.future_heads
388
- # corresponds to the head that predicts token i+2 - aka the "second head"
389
- if self.future_heads is not None:
390
- for i, head in enumerate(self.future_heads):
391
- future_logits = head(x[:, :-(i + 1)])
392
- if output_logits:
393
- all_logits.append(future_logits)
394
-
395
  # Compute loss if labels provided
396
  loss = None
397
  if labels is not None:
398
- # compute loss from the first "standard" lm head
399
  loss = F.cross_entropy(
400
  logits.reshape(-1, self.config.vocab_size),
401
  labels.reshape(-1),
@@ -404,21 +359,21 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
404
  # Multi-token prediction loss
405
  if self.future_heads is not None:
406
  for i, head in enumerate(self.future_heads):
407
- future_logits = head(x[:, :-(i + 1)])
408
  loss = loss + F.cross_entropy(
409
  future_logits.reshape(-1, self.config.vocab_size),
410
- labels[:, (i + 1):].reshape(-1),
411
  )
412
 
413
  if not return_dict:
414
  if labels is not None:
415
- return (all_logits if output_logits else logits), loss
416
- return (all_logits if output_logits else logits), None
417
 
418
  return CausalLMOutput(
419
  loss=loss,
420
- logits=all_logits if output_logits else logits,
421
- hidden_states=hs_to_return if output_hidden_states else None,
422
  )
423
 
424
  def sample_logits(
@@ -489,7 +444,6 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
489
  torch.manual_seed(seed)
490
 
491
  all_logits = []
492
- device = seq.device
493
  b, t = seq.size()
494
 
495
  # Encode conditioning sequence into KV cache
 
8
  """
9
 
10
  import math
11
+ import os
12
+ from typing import Optional
13
+
14
+ os.environ.setdefault("USE_TORCH_XLA", "0")
15
 
16
  import torch
17
  import torch.nn as nn
18
  from torch.nn import functional as F
19
 
20
  from transformers import PreTrainedModel
21
+ from transformers.modeling_outputs import CausalLMOutput
22
+ import transformers.modeling_utils as transformers_modeling_utils
23
+ import transformers.utils.import_utils as transformers_import_utils
24
+
25
+ transformers_import_utils.is_torch_xla_available = lambda *args, **kwargs: False
26
+ transformers_modeling_utils.is_torch_xla_available = lambda *args, **kwargs: False
27
 
28
  from .configuration_auristream import AuriStreamConfig
29
 
 
75
  x2 = x[..., d:]
76
  y1 = x1 * cos + x2 * sin
77
  y2 = x1 * (-sin) + x2 * cos
78
+ return torch.cat([y1, y2], dim=3).type_as(x)
79
 
80
 
81
  class CausalSelfAttention(nn.Module):
 
233
  base_model_prefix = "model"
234
  supports_gradient_checkpointing = True
235
  _no_split_modules = ["Block"]
236
+
237
+ def __init__(self, config: AuriStreamConfig):
238
+ super().__init__(config)
239
+ if not hasattr(self, "all_tied_weights_keys"):
240
+ self.all_tied_weights_keys = {}
241
 
242
  def _init_weights(self, module):
243
  if isinstance(module, nn.Linear):
 
253
  AuriStream speech language model.
254
 
255
  A GPT-like transformer model for cochlear token prediction with optional
256
+ multi-token prediction (MTP) heads for speculative decoding.
 
257
 
258
  Developed by Greta Tuckute and Klemen Kotar.
259
  """
 
279
  else:
280
  self.future_heads = None
281
 
282
+ # Output head
283
  self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
284
 
285
+ self.post_init()
 
286
  # Apply special scaled init to residual projections
287
  for pn, p in self.named_parameters():
288
  if pn.endswith('c_proj.weight'):
 
302
  self,
303
  input_ids: Optional[torch.LongTensor] = None,
304
  labels: Optional[torch.LongTensor] = None,
 
305
  output_hidden_states: Optional[bool] = False,
306
  return_dict: Optional[bool] = True,
 
 
307
  # Legacy arguments for compatibility
308
  seq: Optional[torch.LongTensor] = None,
309
  tgt: Optional[torch.LongTensor] = None,
 
314
  Args:
315
  input_ids: Input token IDs of shape (batch_size, seq_len)
316
  labels: Target token IDs for computing loss
317
+ output_hidden_states: Whether to return all hidden states
318
+ return_dict: Whether to return a dict or tuple
319
+ seq: Legacy argument (alias for input_ids)
320
+ tgt: Legacy argument (alias for labels)
 
 
 
 
 
 
 
 
321
 
322
  Returns:
323
+ CausalLMOutput with logits and optional loss
 
 
 
 
 
 
324
  """
325
  # Handle legacy arguments
326
  if seq is not None:
 
332
  tok_emb = self.wte(input_ids)
333
  x = self.drop(tok_emb)
334
 
335
+ # Collect hidden states if requested
336
  all_hidden_states = []
337
 
338
  # Forward through transformer blocks
339
+ for block in self.h:
340
+ if output_hidden_states:
341
+ all_hidden_states.append(x)
 
342
  x = block(x)
343
 
344
+ if output_hidden_states:
 
345
  all_hidden_states.append(x)
346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  # Final layer norm and output head
348
  x = self.ln_f(x)
349
  logits = self.lm_head(x)
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  # Compute loss if labels provided
352
  loss = None
353
  if labels is not None:
 
354
  loss = F.cross_entropy(
355
  logits.reshape(-1, self.config.vocab_size),
356
  labels.reshape(-1),
 
359
  # Multi-token prediction loss
360
  if self.future_heads is not None:
361
  for i, head in enumerate(self.future_heads):
362
+ future_logits = head(x[:, :-(i+1)])
363
  loss = loss + F.cross_entropy(
364
  future_logits.reshape(-1, self.config.vocab_size),
365
+ labels[:, (i+1):].reshape(-1),
366
  )
367
 
368
  if not return_dict:
369
  if labels is not None:
370
+ return logits, loss
371
+ return logits, None
372
 
373
  return CausalLMOutput(
374
  loss=loss,
375
+ logits=logits,
376
+ hidden_states=all_hidden_states if output_hidden_states else None,
377
  )
378
 
379
  def sample_logits(
 
444
  torch.manual_seed(seed)
445
 
446
  all_logits = []
 
447
  b, t = seq.size()
448
 
449
  # Encode conditioning sequence into KV cache