Update configuration_neollm.py
configuration_neollm.py (+9 -9)
@@ -495,7 +495,7 @@ class NeoLLMConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size=
+        vocab_size=64402,
         hidden_size=512,
         intermediate_size=1536,
         num_hidden_layers=12,
@@ -514,17 +514,17 @@ class NeoLLMConfig(PretrainedConfig):
         head_dim=64,
         use_momentum_attention=True,
         momentum_gamma=0.10,
-        use_mea_attention=
+        use_mea_attention=False,
         mea_component_key_value_heads=None,
         mea_groupnorm_eps=1e-6,
-        use_lucid_attention=
+        use_lucid_attention=False,
         lucid_attention_eps=1e-6,
         use_affine_scaled_attention=True,
         affine_momentum=0.9,
         use_xsa=True,
         xsa_eps=1e-6,
         # ── Directional Routing (Taylor, 2026) ────────────────────────────
-        use_directional_routing=
+        use_directional_routing=False,
         directional_routing_k=4,
         directional_routing_temp=3.0,
         # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
@@ -534,13 +534,13 @@ class NeoLLMConfig(PretrainedConfig):
         fan_ratio_ffn=0.0625,
         dropout_rate=0.1,
         # ── Leviathan continuous token generator ──────────────────────────
-        use_token_generator=
+        use_token_generator=True,
         generator_d_seed=128,
         generator_num_modes=8,
         generator_num_knots=32,
         generator_spline_degree=2,
         generator_k=3,
-        generator_krank=
+        generator_krank=32,
         # ── Leviathan-JTok-M token-indexed modulation ─────────────────────
         use_jtokm=False,
         jtokm_num_experts=4,
@@ -568,9 +568,9 @@ class NeoLLMConfig(PretrainedConfig):
         versatile_gumbel_temp_decay=0.99984,
         versatile_aux_loss_weight=1e-5,
         # ── LAuReL: Learned Augmented Residual Layer (Menghani et al., 2025) ─
-        use_laurel=
-        use_laurel_rw=
-        use_laurel_lr=
+        use_laurel=False,
+        use_laurel_rw=False,
+        use_laurel_lr=False,
         laurel_lr_rank=32,
         **kwargs,
     ):
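For context, a minimal sketch of what these defaults mean in practice, assuming NeoLLMConfig is importable from this module and assigns each keyword argument to an attribute of the same name, as PretrainedConfig subclasses conventionally do. The field names and values are taken from the + side of the diff; the import path and the override example at the end are illustrative, not part of this commit:

# Minimal sketch; assumes configuration_neollm.py is on the import path.
from configuration_neollm import NeoLLMConfig

config = NeoLLMConfig()  # no arguments: picks up the defaults set in this commit
assert config.vocab_size == 64402
assert config.use_mea_attention is False        # MEA attention off by default
assert config.use_lucid_attention is False      # Lucid attention off by default
assert config.use_directional_routing is False  # Directional Routing off by default
assert config.use_token_generator is True       # Leviathan generator on by default
assert config.generator_krank == 32
assert config.use_laurel is False               # all three LAuReL switches off
assert config.use_laurel_rw is False
assert config.use_laurel_lr is False

# Any default can still be overridden per instance, e.g. to opt in to LAuReL:
laurel_config = NeoLLMConfig(use_laurel=True, use_laurel_lr=True, laurel_lr_rank=64)

With the boolean switches defaulting to False, a bare NeoLLMConfig() exercises only the paths enabled above (momentum, affine-scaled and XSA attention, and the token generator); the remaining experimental paths are opt-in per instance.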