Minh2508
/

Decode

Text Generation

Mixture of Experts

mixture-of-experts

Model card Files Files and versions

Decode / config.json

Minh2508's picture

Update config.json

8a2f1d8 verified 15 days ago

history blame contribute delete

982 Bytes

	{
	"model_type": "afmoe",
	"architectures": [
	"MOE"
	],
	"profile_name": "1b_3e_8l_t4x2",
	"vocab_size": 200024,
	"text_embed_dim": 1024,
	"vision_embed_dim": 1024,
	"hidden_dim": 1024,
	"ffn_dim": 6144,
	"num_layers": 8,
	"num_heads": 16,
	"num_kv_heads": 4,
	"num_experts": 3,
	"top_k": 2,
	"max_position_embeddings": 16384,
	"router_aux_loss_coef": 0.01,
	"share_experts_across_layers": false,
	"gradient_checkpointing": true,
	"num_agents": 4,
	"moe_capacity_factor": 1.0,
	"moe_hierarchy_groups": 1,
	"moe_hierarchy_top_k": 1,
	"num_shared_experts": 0,
	"load_balancing_mode": "aux_free",
	"router_bias_update_rate": 0.01,
	"kv_latent_dim": 128,
	"kv_cache_dtype": "int4",
	"rope_training_context": 16384,
	"rope_ntk_alpha": 1.0,
	"rope_yarn_scale": 1.0,
	"ring_attention_chunk_size": 0,
	"prefill_chunk_size": 256,
	"use_q_former_projector": true,
	"q_former_queries": 8,
	"q_former_layers": 1,
	"tokenizer_name": "ai-tokenizer:GPT-5"
	}