---
# ─────────────────────────────────────────────────────────────────────────────
# LUNA 100M — Training Configuration
# Single source of truth for all hyperparameters.
#
# auto_config: true  → All batch/LR/worker settings are auto-detected from
#                      available VRAM / RAM / CPU at runtime. Your values below
#                      are used as FALLBACKS only if detection fails.
#
# auto_config: false → Every value below is used as-is. Nothing is overridden.
#                      Use this when you've already benchmarked and want
#                      repeatable, fixed runs.
# ─────────────────────────────────────────────────────────────────────────────

auto_config: true  # ← flip to false to lock everything below

# ── Data ──────────────────────────────────────────────────────────────────────
data_path: "Base/data/litdata_pretrain_final"  # local default; overridden by --data_path
out_dir: "out/pretrain/luna-100m"
tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m"

# ── Model (fixed for LUNA-100M — do not change) ───────────────────────────────
model:
  vocab_size: 50304  # ceil(50277/128)*128 — pythia tokenizer with EOS padding
  seq_len: 1024
  n_layer: 10
  n_embd: 768
  n_head: 12

# ── Training budget ───────────────────────────────────────────────────────────
train:
  max_tokens: 4515286950  # full dataset (verified from index.json, 270 chunks)
  lr_warmup_steps: 500    # [AUTO] scaled to 5% of total_steps if auto_config
  save_interval: 1000     # save checkpoint every N optimizer steps
  log_interval: 10        # print log every N steps
  max_norm: 1.0           # gradient clip norm

# ── Optimiser ─────────────────────────────────────────────────────────────────
optimizer:
  lr: 0.0006       # 6e-4 [AUTO] scaled by sqrt(global_batch/120) if auto_config
  min_lr: 0.00006  # 6e-5
  weight_decay: 0.1
  betas: [0.9, 0.95]
  eps: 1.0e-8

# ── Batch sizing ──────────────────────────────────────────────────────────────
# When auto_config: true  → micro_batch and grad_accum are ignored; the script
#                           probes VRAM and fills it to ~82% saturation, then
#                           computes grad_accum to hit global_batch.
# When auto_config: false → micro_batch × grad_accum must equal global_batch.
batch:
  global_batch: 120  # target total samples per optimizer step
  micro_batch: 12    # [MANUAL] samples per GPU forward pass (ignored when auto)
  grad_accum: 10     # [MANUAL] accumulation steps (ignored when auto)

# ── DataLoader ────────────────────────────────────────────────────────────────
# When auto_config: true  → num_workers auto = cpu_cores // 2, capped by RAM
# When auto_config: false → num_workers used as-is
dataloader:
  num_workers: -1   # -1 = auto; set to 0 to disable multiprocessing
  pin_memory: true  # [AUTO] disabled if RAM < 16GB

# ── Hardware / precision ──────────────────────────────────────────────────────
# When auto_config: true  → precision detected from GPU compute capability
# When auto_config: false → use the value below
hardware:
  precision: "bf16"  # bf16 | fp16 | fp32
  compile: true      # torch.compile (requires Triton — Linux/cloud only)