# Source: LUNA-Training / train_config.yaml
# Uploaded with huggingface_hub by ASTERIZER (commit 060646a, verified)
# ─────────────────────────────────────────────────────────────────────────────
# LUNA 100M — Training Configuration
# Single source of truth for all hyperparameters.
#
# auto_config: true  → All batch/LR/worker settings are auto-detected from
#                      available VRAM / RAM / CPU at runtime. Your values below
#                      are used as FALLBACKS only if detection fails.
#
# auto_config: false → Every value below is used as-is. Nothing is overridden.
#                      Use this when you've already benchmarked and want
#                      repeatable, fixed runs.
# ─────────────────────────────────────────────────────────────────────────────
auto_config: true # ← flip to false to lock everything below
# ── Data ──────────────────────────────────────────────────────────────────────
data_path: "Base/data/litdata_pretrain_final" # local default; overridden by --data_path
out_dir: "out/pretrain/luna-100m" # run outputs (checkpoints) are written here
tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m" # pythia tokenizer files
# ── Model (fixed for LUNA-100M — do not change) ───────────────────────────────
# NOTE(review): children re-indented under `model:` — in the flat original,
# `model:` parsed as null and these keys leaked to the top level.
model:
  vocab_size: 50304 # ceil(50277/128)*128 — pythia tokenizer with EOS padding
  seq_len: 1024     # context length in tokens
  n_layer: 10       # transformer layers
  n_embd: 768       # hidden width
  n_head: 12        # attention heads
# ── Training budget ───────────────────────────────────────────────────────────
# NOTE(review): children re-indented under `train:` — in the flat original,
# `train:` parsed as null and these keys leaked to the top level.
train:
  max_tokens: 4515286950 # full dataset (verified from index.json, 270 chunks)
  lr_warmup_steps: 500   # [AUTO] scaled to 5% of total_steps if auto_config
  save_interval: 1000    # save checkpoint every N optimizer steps
  log_interval: 10       # print log every N steps
  max_norm: 1.0          # gradient clip norm
# ── Optimiser ─────────────────────────────────────────────────────────────────
# NOTE(review): children re-indented under `optimizer:` — in the flat original,
# `optimizer:` parsed as null and these keys leaked to the top level.
optimizer:
  lr: 0.0006         # 6e-4 [AUTO] scaled by sqrt(global_batch/120) if auto_config
  min_lr: 0.00006    # 6e-5
  weight_decay: 0.1
  betas: [0.9, 0.95] # flow style is fine for a short leaf list
  eps: 1.0e-8
# ── Batch sizing ──────────────────────────────────────────────────────────────
# When auto_config: true  → micro_batch and grad_accum are ignored; the script
#                           probes VRAM and fills it to ~82% saturation, then
#                           computes grad_accum to hit global_batch.
# When auto_config: false → micro_batch × grad_accum must equal global_batch.
# NOTE(review): children re-indented under `batch:` — in the flat original,
# `batch:` parsed as null and these keys leaked to the top level.
batch:
  global_batch: 120 # target total samples per optimizer step
  micro_batch: 12   # [MANUAL] samples per GPU forward pass (ignored when auto)
  grad_accum: 10    # [MANUAL] accumulation steps (ignored when auto); 12 × 10 = 120
# ── DataLoader ────────────────────────────────────────────────────────────────
# When auto_config: true  → num_workers auto = cpu_cores // 2, capped by RAM
# When auto_config: false → num_workers used as-is
# NOTE(review): children re-indented under `dataloader:` — in the flat original,
# `dataloader:` parsed as null and these keys leaked to the top level.
dataloader:
  num_workers: -1  # -1 = auto; set to 0 to disable multiprocessing
  pin_memory: true # [AUTO] disabled if RAM < 16GB
# ── Hardware / precision ──────────────────────────────────────────────────────
# When auto_config: true  → precision detected from GPU compute capability
# When auto_config: false → use the value below
# NOTE(review): children re-indented under `hardware:` — in the flat original,
# `hardware:` parsed as null and these keys leaked to the top level.
hardware:
  precision: "bf16" # bf16 | fp16 | fp32
  compile: true     # torch.compile (requires Triton — Linux/cloud only)