KitsuVp committed on
Commit
68b34cf
·
verified ·
1 Parent(s): d5dfbc7

Update configuration_neollm.py

Browse files
Files changed (1) hide show
  1. configuration_neollm.py +9 -9
configuration_neollm.py CHANGED
@@ -495,7 +495,7 @@ class NeoLLMConfig(PretrainedConfig):
495
 
496
  def __init__(
497
  self,
498
- vocab_size=200005,
499
  hidden_size=512,
500
  intermediate_size=1536,
501
  num_hidden_layers=12,
@@ -514,17 +514,17 @@ class NeoLLMConfig(PretrainedConfig):
514
  head_dim=64,
515
  use_momentum_attention=True,
516
  momentum_gamma=0.10,
517
- use_mea_attention=True,
518
  mea_component_key_value_heads=None,
519
  mea_groupnorm_eps=1e-6,
520
- use_lucid_attention=True,
521
  lucid_attention_eps=1e-6,
522
  use_affine_scaled_attention=True,
523
  affine_momentum=0.9,
524
  use_xsa=True,
525
  xsa_eps=1e-6,
526
  # ── Directional Routing (Taylor, 2026) ────────────────────────────
527
- use_directional_routing=True,
528
  directional_routing_k=4,
529
  directional_routing_temp=3.0,
530
  # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
@@ -534,13 +534,13 @@ class NeoLLMConfig(PretrainedConfig):
534
  fan_ratio_ffn=0.0625,
535
  dropout_rate=0.1,
536
  # ── Leviathan continuous token generator ──────────────────────────
537
- use_token_generator=False,
538
  generator_d_seed=128,
539
  generator_num_modes=8,
540
  generator_num_knots=32,
541
  generator_spline_degree=2,
542
  generator_k=3,
543
- generator_krank=16,
544
  # ── Leviathan-JTok-M token-indexed modulation ─────────────────────
545
  use_jtokm=False,
546
  jtokm_num_experts=4,
@@ -568,9 +568,9 @@ class NeoLLMConfig(PretrainedConfig):
568
  versatile_gumbel_temp_decay=0.99984,
569
  versatile_aux_loss_weight=1e-5,
570
  # ── LAuReL: Learned Augmented Residual Layer (Menghani et al., 2025) ─
571
- use_laurel=True,
572
- use_laurel_rw=True,
573
- use_laurel_lr=True,
574
  laurel_lr_rank=32,
575
  **kwargs,
576
  ):
 
495
 
496
  def __init__(
497
  self,
498
+ vocab_size=64402,
499
  hidden_size=512,
500
  intermediate_size=1536,
501
  num_hidden_layers=12,
 
514
  head_dim=64,
515
  use_momentum_attention=True,
516
  momentum_gamma=0.10,
517
+ use_mea_attention=False,
518
  mea_component_key_value_heads=None,
519
  mea_groupnorm_eps=1e-6,
520
+ use_lucid_attention=False,
521
  lucid_attention_eps=1e-6,
522
  use_affine_scaled_attention=True,
523
  affine_momentum=0.9,
524
  use_xsa=True,
525
  xsa_eps=1e-6,
526
  # ── Directional Routing (Taylor, 2026) ────────────────────────────
527
+ use_directional_routing=False,
528
  directional_routing_k=4,
529
  directional_routing_temp=3.0,
530
  # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
 
534
  fan_ratio_ffn=0.0625,
535
  dropout_rate=0.1,
536
  # ── Leviathan continuous token generator ──────────────────────────
537
+ use_token_generator=True,
538
  generator_d_seed=128,
539
  generator_num_modes=8,
540
  generator_num_knots=32,
541
  generator_spline_degree=2,
542
  generator_k=3,
543
+ generator_krank=32,
544
  # ── Leviathan-JTok-M token-indexed modulation ─────────────────────
545
  use_jtokm=False,
546
  jtokm_num_experts=4,
 
568
  versatile_gumbel_temp_decay=0.99984,
569
  versatile_aux_loss_weight=1e-5,
570
  # ── LAuReL: Learned Augmented Residual Layer (Menghani et al., 2025) ─
571
+ use_laurel=False,
572
+ use_laurel_rw=False,
573
+ use_laurel_lr=False,
574
  laurel_lr_rank=32,
575
  **kwargs,
576
  ):