KitsuVp committed on
Commit
94c09c1
·
verified ·
1 Parent(s): ccb4fe6

Update modeling_neollm.py

Browse files
Files changed (1) hide show
  1. modeling_neollm.py +18 -4
modeling_neollm.py CHANGED
@@ -853,15 +853,24 @@ class LeviathanGenerator(nn.Module):
853
  # head_proj_weight [M*d_seed, d_seed] β€” los pesos del cabezal m
854
  # son las filas [m*d_seed : (m+1)*d_seed].
855
  proj_w = self.head_proj_weight[m * d : (m + 1) * d] # [d_seed, d_seed]
856
- zh = F.linear(z.float(), proj_w) # [N, d_seed]
 
 
 
 
 
 
 
857
 
858
  # ── LayerNorm manual por cabezal ──────────────────────────────────
859
  # Equivalente a nn.LayerNorm(d_seed) con parΓ‘metros independientes
860
  # head_norm_weight[m] y head_norm_bias[m].
 
 
861
  mean = zh.mean(dim=-1, keepdim=True)
862
  var = zh.var(dim=-1, keepdim=True, unbiased=False)
863
  zh = (zh - mean) / (var + self.head_norm_eps).sqrt()
864
- zh = zh * self.head_norm_weight[m] + self.head_norm_bias[m]
865
 
866
  # ── Sigmoid(x/2) β†’ coordenada latente en [0,1]^d_seed ────────────
867
  zh = torch.sigmoid(zh / 2.0).clamp(0.0, 1.0) # [N, d_seed]
@@ -4230,7 +4239,12 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
4230
  output_attentions if output_attentions is not None
4231
  else self.config.output_attentions
4232
  )
4233
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
 
 
 
 
4234
 
4235
  if (input_ids is None) ^ (inputs_embeds is not None):
4236
  raise ValueError("Specify exactly one of input_ids or inputs_embeds")
@@ -4723,4 +4737,4 @@ __all__ = [
4723
 
4724
  AutoConfig.register("neollm", NeoLLMConfig)
4725
  AutoModel.register(NeoLLMConfig, NeoLLMModel)
4726
- AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
 
853
  # head_proj_weight [M*d_seed, d_seed] β€” los pesos del cabezal m
854
  # son las filas [m*d_seed : (m+1)*d_seed].
855
  proj_w = self.head_proj_weight[m * d : (m + 1) * d] # [d_seed, d_seed]
856
+ # Keep the matmul in the parameter dtype so eager inference matches
857
+ # mixed-precision training, then promote to float32 for the reduction-
858
+ # heavy normalization and KHRONOS path below.
859
+ zh = F.linear(
860
+ z.to(dtype=proj_w.dtype, device=proj_w.device),
861
+ proj_w,
862
+ ) # [N, d_seed]
863
+ zh = zh.float()
864
 
865
  # ── LayerNorm manual por cabezal ──────────────────────────────────
866
  # Equivalente a nn.LayerNorm(d_seed) con parΓ‘metros independientes
867
  # head_norm_weight[m] y head_norm_bias[m].
868
+ norm_w = self.head_norm_weight[m].float()
869
+ norm_b = self.head_norm_bias[m].float()
870
  mean = zh.mean(dim=-1, keepdim=True)
871
  var = zh.var(dim=-1, keepdim=True, unbiased=False)
872
  zh = (zh - mean) / (var + self.head_norm_eps).sqrt()
873
+ zh = zh * norm_w + norm_b
874
 
875
  # ── Sigmoid(x/2) β†’ coordenada latente en [0,1]^d_seed ────────────
876
  zh = torch.sigmoid(zh / 2.0).clamp(0.0, 1.0) # [N, d_seed]
 
4239
  output_attentions if output_attentions is not None
4240
  else self.config.output_attentions
4241
  )
4242
+ if return_dict is None:
4243
+ cfg_dict = vars(self.config)
4244
+ return_dict = cfg_dict.get(
4245
+ "return_dict",
4246
+ cfg_dict.get("use_return_dict", True),
4247
+ )
4248
 
4249
  if (input_ids is None) ^ (inputs_embeds is not None):
4250
  raise ValueError("Specify exactly one of input_ids or inputs_embeds")
 
4737
 
4738
  AutoConfig.register("neollm", NeoLLMConfig)
4739
  AutoModel.register(NeoLLMConfig, NeoLLMModel)
4740
+ AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)