{
  "checkpoint_dir": "/workspace/adaptive_model/checkpoints",
  "tokenizer_dir": "/workspace/adaptive_model/tokenizer",
  "run_name": "phase2_deepseek_exclusive",
  "hf_repo_id": "moudook/adaptive-model",
  "hf_push_every_n": 3,
  "dataset_names": [
    "a-m-team/AM-DeepSeek-R1-Distilled-1.4M"
  ],
  "local_paths": {},
  "vocab_size": 32000,
  "hidden_dim": 2048,
  "num_layers": 24,
  "num_heads": 16,
  "num_kv_heads": 4,
  "intermediate_dim": 5504,
  "max_seq_len": 2048,
  "dtype": "bfloat16",
  "learning_rate": 0.0001,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "grad_clip": 1.0,
  "warmup_steps": 200,
  "total_steps": 100000,
  "batch_size": 64,
  "grad_accum": 2,
  "save_every": 500,
  "log_every": 10,
  "keep_last_n": 3,
  "use_wandb": false,
  "wandb_project": "adaptive-model",
  "device": "cuda",
  "seed": 42,
  "prefetch_batches": 8
}