Update modeling_neollm.py
Browse files- modeling_neollm.py +18 -4
modeling_neollm.py
CHANGED
|
@@ -853,15 +853,24 @@ class LeviathanGenerator(nn.Module):
|
|
| 853 |
# head_proj_weight [M*d_seed, d_seed] → los pesos del cabezal m
|
| 854 |
# son las filas [m*d_seed : (m+1)*d_seed].
|
| 855 |
proj_w = self.head_proj_weight[m * d : (m + 1) * d] # [d_seed, d_seed]
|
| 856 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 857 |
|
| 858 |
# ── LayerNorm manual por cabezal ──────────────────────────────────
|
| 859 |
# Equivalente a nn.LayerNorm(d_seed) con parámetros independientes
|
| 860 |
# head_norm_weight[m] y head_norm_bias[m].
|
|
|
|
|
|
|
| 861 |
mean = zh.mean(dim=-1, keepdim=True)
|
| 862 |
var = zh.var(dim=-1, keepdim=True, unbiased=False)
|
| 863 |
zh = (zh - mean) / (var + self.head_norm_eps).sqrt()
|
| 864 |
-
zh = zh *
|
| 865 |
|
| 866 |
# ── Sigmoid(x/2) → coordenada latente en [0,1]^d_seed ────────────
|
| 867 |
zh = torch.sigmoid(zh / 2.0).clamp(0.0, 1.0) # [N, d_seed]
|
|
@@ -4230,7 +4239,12 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 4230 |
output_attentions if output_attentions is not None
|
| 4231 |
else self.config.output_attentions
|
| 4232 |
)
|
| 4233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4234 |
|
| 4235 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 4236 |
raise ValueError("Specify exactly one of input_ids or inputs_embeds")
|
|
@@ -4723,4 +4737,4 @@ __all__ = [
|
|
| 4723 |
|
| 4724 |
AutoConfig.register("neollm", NeoLLMConfig)
|
| 4725 |
AutoModel.register(NeoLLMConfig, NeoLLMModel)
|
| 4726 |
-
AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
|
|
|
|
| 853 |
# head_proj_weight [M*d_seed, d_seed] → los pesos del cabezal m
|
| 854 |
# son las filas [m*d_seed : (m+1)*d_seed].
|
| 855 |
proj_w = self.head_proj_weight[m * d : (m + 1) * d] # [d_seed, d_seed]
|
| 856 |
+
# Keep the matmul in the parameter dtype so eager inference matches
|
| 857 |
+
# mixed-precision training, then promote to float32 for the reduction-
|
| 858 |
+
# heavy normalization and KHRONOS path below.
|
| 859 |
+
zh = F.linear(
|
| 860 |
+
z.to(dtype=proj_w.dtype, device=proj_w.device),
|
| 861 |
+
proj_w,
|
| 862 |
+
) # [N, d_seed]
|
| 863 |
+
zh = zh.float()
|
| 864 |
|
| 865 |
# ── LayerNorm manual por cabezal ──────────────────────────────────
|
| 866 |
# Equivalente a nn.LayerNorm(d_seed) con parámetros independientes
|
| 867 |
# head_norm_weight[m] y head_norm_bias[m].
|
| 868 |
+
norm_w = self.head_norm_weight[m].float()
|
| 869 |
+
norm_b = self.head_norm_bias[m].float()
|
| 870 |
mean = zh.mean(dim=-1, keepdim=True)
|
| 871 |
var = zh.var(dim=-1, keepdim=True, unbiased=False)
|
| 872 |
zh = (zh - mean) / (var + self.head_norm_eps).sqrt()
|
| 873 |
+
zh = zh * norm_w + norm_b
|
| 874 |
|
| 875 |
# ── Sigmoid(x/2) → coordenada latente en [0,1]^d_seed ────────────
|
| 876 |
zh = torch.sigmoid(zh / 2.0).clamp(0.0, 1.0) # [N, d_seed]
|
|
|
|
| 4239 |
output_attentions if output_attentions is not None
|
| 4240 |
else self.config.output_attentions
|
| 4241 |
)
|
| 4242 |
+
if return_dict is None:
|
| 4243 |
+
cfg_dict = vars(self.config)
|
| 4244 |
+
return_dict = cfg_dict.get(
|
| 4245 |
+
"return_dict",
|
| 4246 |
+
cfg_dict.get("use_return_dict", True),
|
| 4247 |
+
)
|
| 4248 |
|
| 4249 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 4250 |
raise ValueError("Specify exactly one of input_ids or inputs_embeds")
|
|
|
|
| 4737 |
|
| 4738 |
AutoConfig.register("neollm", NeoLLMConfig)
|
| 4739 |
AutoModel.register(NeoLLMConfig, NeoLLMModel)
|
| 4740 |
+
AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
|