{
  "model": {
    "max_batch_size": 16,
    "max_seq_len": 512,
    "dtype": "fp32",
    "scale_fmt": null,
    "vocab_size": 32768,
    "dim": 512,
    "inter_dim": 4096,
    "moe_inter_dim": 512,
    "n_layers": 16,
    "n_dense_layers": 3,
    "n_heads": 12,
    "n_routed_experts": 4,
    "n_shared_experts": 1,
    "n_activated_experts": 2,
    "route_scale": 1.0,
    "use_routing_bias": true,
    "q_lora_rank": 0,
    "kv_lora_rank": 256,
    "qk_nope_head_dim": 64,
    "qk_rope_head_dim": 32,
    "v_head_dim": 64,
    "original_seq_len": 4096,
    "rope_theta": 10000.0,
    "rope_factor": 40,
    "beta_fast": 32,
    "beta_slow": 1,
    "mscale": 1.0,
    "tokenizer_name": "turkish"
  },
  "training": {
    "learning_rate": 3e-5,
    "weight_decay": 0.1,
    "beta1": 0.9,
    "beta2": 0.95,
    "grad_clip": 1.0,
    "warmup_steps": 1000,
    "total_steps": 100000,
    "use_checkpointing": false,
    "expert_rotation_steps": 5000,
    "gradient_accumulation_steps": 8,
    "eval_every": 1000,
    "save_every": 5000,
    "save_dir": "./checkpoints",
    "log_every": 100,
    "dtype": "fp32",
    "compile": false,
    "max_val_batches": 50,
    "val_batch_size_multiplier": 4,
    "train_all_experts": false
  },
  "data": {
    "train_file": "./data/train.txt",
    "val_file": "./data/val.txt",
    "stride": 512
  },
  "logging": {
    "use_wandb": true,
    "project_name": "sequential-moe",
    "run_name": "moe-12gb-gpu"
  }
}