{
  "checkpoint_dir": "/workspace/adaptive_model/checkpoints",
  "tokenizer_dir": "/workspace/adaptive_model/tokenizer",
  "run_name": "phase2_deepseek_exclusive",
  "hf_repo_id": "moudook/adaptive-model",
  "hf_push_every_n": 3,
  "dataset_names": [
    "a-m-team/AM-DeepSeek-R1-Distilled-1.4M"
  ],
  "local_paths": {},
  "vocab_size": 32000,
  "hidden_dim": 2048,
  "num_layers": 24,
  "num_heads": 16,
  "num_kv_heads": 4,
  "intermediate_dim": 5504,
  "max_seq_len": 2048,
  "dtype": "bfloat16",
  "learning_rate": 0.0001,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "grad_clip": 1.0,
  "warmup_steps": 200,
  "total_steps": 100000,
  "batch_size": 64,
  "grad_accum": 2,
  "save_every": 500,
  "log_every": 10,
  "keep_last_n": 3,
  "use_wandb": false,
  "wandb_project": "adaptive-model",
  "device": "cuda",
  "seed": 42,
  "prefetch_batches": 8
}