{
"affine_momentum": 0.9,
"architectures": [
"NeoLLMForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.1,
"attn_res_num_blocks": 4,
"auto_map": {
"AutoConfig": "configuration_neollm.NeoLLMConfig",
"AutoModel": "modeling_neollm.NeoLLMModel",
"AutoModelForCausalLM": "modeling_neollm.NeoLLMForCausalLM"
},
"bos_token_id": 1,
"directional_routing_k": 4,
"directional_routing_temp": 3.0,
"dropout_rate": 0.1,
"dtype": "bfloat16",
"eos_token_id": 7,
"fan_ratio": 0.125,
"fan_ratio_ffn": 0.0625,
"generator_d_seed": 128,
"generator_k": 3,
"generator_krank": 32,
"generator_num_knots": 32,
"generator_num_modes": 8,
"generator_spline_degree": 2,
"head_dim": 64,
"hidden_act": "xielu",
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 1536,
"jtokm_aux_loss_weight": 0.0001,
"jtokm_norm_eps": 1e-06,
"jtokm_num_experts": 4,
"jtokm_num_modes": 4,
"jtokm_top_k": 2,
"laurel_lr_rank": 32,
"lucid_attention_eps": 1e-06,
"max_position_embeddings": 512,
"mea_component_key_value_heads": 2,
"mea_groupnorm_eps": 1e-06,
"model_type": "neollm",
"momentum_gamma": 0.1,
"num_attention_heads": 8,
"num_hidden_layers": 12,
"num_key_value_heads": 2,
"pad_token_id": 0,
"partial_rotary_factor": 0.25,
"polynorm_exclusive": false,
"repo_d_p": 64,
"repo_start_layer": 4,
"rms_norm_eps": 1e-06,
"rope_parameters": {
"partial_rotary_factor": 0.25,
"rope_theta": 10000.0,
"rope_type": "default"
},
"rope_theta": 10000.0,
"tie_word_embeddings": true,
"transformers_version": "5.5.3",
"use_affine_scaled_attention": true,
"use_attn_res": false,
"use_cache": false,
"use_directional_routing": false,
"use_hadamard_o_proj": true,
"use_jtokm": false,
"use_laurel": false,
"use_laurel_lr": false,
"use_laurel_rw": false,
"use_lucid_attention": false,
"use_mea_attention": false,
"use_momentum_attention": true,
"use_repo": true,
"use_spelling_bee_embeddings": false,
"use_token_generator": false,
"use_versatile_ffn": false,
"use_xsa": true,
"versatile_active_experts": 2,
"versatile_aux_loss_weight": 1e-05,
"versatile_gumbel_temp_decay": 0.99984,
"versatile_gumbel_temp_end": 0.1,
"versatile_gumbel_temp_start": 5.0,
"versatile_max_depth": 2,
"versatile_total_experts": 4,
"vocab_size": 64402,
"xsa_eps": 1e-06
}