---
# ─────────────────────────────────────────────────────────────────────────────
# LUNA 100M — Training Configuration
# Single source of truth for all hyperparameters.
#
# auto_config: true  → All batch/LR/worker settings are auto-detected from
#                      available VRAM / RAM / CPU at runtime. Your values below
#                      are used as FALLBACKS only if detection fails.
#
# auto_config: false → Every value below is used as-is. Nothing is overridden.
#                      Use this when you've already benchmarked and want
#                      repeatable, fixed runs.
# ─────────────────────────────────────────────────────────────────────────────

auto_config: true  # ← flip to false to lock everything below

# ── Data ──────────────────────────────────────────────────────────────────────
data_path: "Base/data/litdata_pretrain_final"  # local default; overridden by --data_path
out_dir: "out/pretrain/luna-100m"
tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m"

# ── Model (fixed for LUNA-100M — do not change) ───────────────────────────────
model:
  vocab_size: 50304  # ceil(50277/128)*128 — pythia tokenizer with EOS padding
  seq_len: 1024
  n_layer: 10
  n_embd: 768
  n_head: 12

# ── Training budget ───────────────────────────────────────────────────────────
train:
  max_tokens: 4515286950  # full dataset (verified from index.json, 270 chunks)
  lr_warmup_steps: 500    # [AUTO] scaled to 5% of total_steps if auto_config
  save_interval: 1000     # save checkpoint every N optimizer steps
  log_interval: 10        # print log every N steps
  max_norm: 1.0           # gradient clip norm

# ── Optimiser ─────────────────────────────────────────────────────────────────
optimizer:
  lr: 0.0006       # 6e-4 [AUTO] scaled by sqrt(global_batch/120) if auto_config
  min_lr: 0.00006  # 6e-5
  weight_decay: 0.1
  betas: [0.9, 0.95]
  eps: 1.0e-8

# ── Batch sizing ──────────────────────────────────────────────────────────────
# When auto_config: true  → micro_batch and grad_accum are ignored; the script
#                           probes VRAM and fills it to ~82% saturation, then
#                           computes grad_accum to hit global_batch.
# When auto_config: false → micro_batch × grad_accum must equal global_batch.
batch:
  global_batch: 120  # target total samples per optimizer step
  micro_batch: 12    # [MANUAL] samples per GPU forward pass (ignored when auto)
  grad_accum: 10     # [MANUAL] accumulation steps (ignored when auto)

# ── DataLoader ────────────────────────────────────────────────────────────────
# When auto_config: true  → num_workers auto = cpu_cores // 2, capped by RAM
# When auto_config: false → num_workers used as-is
dataloader:
  num_workers: -1   # -1 = auto; set to 0 to disable multiprocessing
  pin_memory: true  # [AUTO] disabled if RAM < 16GB

# ── Hardware / precision ──────────────────────────────────────────────────────
# When auto_config: true  → precision detected from GPU compute capability
# When auto_config: false → use the value below
hardware:
  precision: "bf16"  # bf16 | fp16 | fp32
  compile: true      # torch.compile (requires Triton — Linux/cloud only)