{
  "checkpoint_dir": "/workspace/adaptive_model/checkpoints",
  "tokenizer_dir": "/workspace/adaptive_model/tokenizer",
  "run_name": "phase2_deepseek_exclusive",
  "hf_repo_id": "moudook/adaptive-model",
  "hf_push_every_n": 3,
  "dataset_names": [
    "a-m-team/AM-DeepSeek-R1-Distilled-1.4M"
  ],
  "local_paths": {},
  "vocab_size": 32000,
  "hidden_dim": 2048,
  "num_layers": 24,
  "num_heads": 16,
  "num_kv_heads": 4,
  "intermediate_dim": 5504,
  "max_seq_len": 2048,
  "dtype": "bfloat16",
  "learning_rate": 0.0001,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "grad_clip": 1.0,
  "warmup_steps": 200,
  "total_steps": 100000,
  "batch_size": 64,
  "grad_accum": 2,
  "save_every": 500,
  "log_every": 10,
  "keep_last_n": 3,
  "use_wandb": false,
  "wandb_project": "adaptive-model",
  "device": "cuda",
  "seed": 42,
  "prefetch_batches": 8
}