{ "sequence_len": 8192, "vocab_size": 131072, "n_layer": 16, "n_head": 8, "n_kv_head": 8, "n_embd": 1024, "moe_num_experts": 8, "moe_top_k": 2, "moe_layer_interval": 3, "moe_group_size": 4, "moe_expert_intermediate_size": 1792, "moe_adjugate_intermediate_size": 0, "moe_adjugate_scale": 0.05, "moe_router_aux_loss_coef": 0.015, "moe_router_bias_lr": 0.001, "moe_activation_checkpoint": true, "moe_capacity_factor": 0.75, "rotary_scaling_type": "yarn", "rotary_scale_factor": 4.0, "residual_scale": -1.0, "attn_dropout": 0.01, "label_smoothing": 0.0, "z_loss_weight": 0.0, "use_flash_attention": true, "domain_router_dim": 32, "num_domain_tags": 128, "domain_router_features": { "dataset": { "capacity": 128, "mode": "one_hot" }, "quality": { "capacity": 32, "mode": "one_hot" }, "specialty": { "capacity": 64, "mode": "one_hot" }, "modality": { "capacity": 32, "mode": "one_hot" }, "language": { "capacity": 32, "mode": "one_hot" }, "origin": { "capacity": 8, "mode": "one_hot" } } }