# Source: LUNA-Training / train_config.yaml
# Uploaded with huggingface_hub by ASTERIZER (commit 060646a, verified)
# ─────────────────────────────────────────────────────────────────────────────
# LUNA 100M — Training Configuration
# Single source of truth for all hyperparameters.
#
# auto_config: true  → All batch/LR/worker settings are auto-detected from
#                      available VRAM / RAM / CPU at runtime. Your values below
#                      are used as FALLBACKS only if detection fails.
#
# auto_config: false → Every value below is used as-is. Nothing is overridden.
#                      Use this when you've already benchmarked and want
#                      repeatable, fixed runs.
# ─────────────────────────────────────────────────────────────────────────────
auto_config: true # ← flip to false to lock everything below
# ── Data ──────────────────────────────────────────────────────────────────────
data_path: "Base/data/litdata_pretrain_final" # local default; overridden by --data_path
out_dir: "out/pretrain/luna-100m" # run outputs (checkpoints) are written here
tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m" # pythia tokenizer files
# ── Model (fixed for LUNA-100M — do not change) ───────────────────────────────
# NOTE(review): children re-indented under `model:` — in the flat original,
# `model:` parsed as null and these keys leaked to the top level.
model:
  vocab_size: 50304 # ceil(50277/128)*128 — pythia tokenizer with EOS padding
  seq_len: 1024     # context length in tokens
  n_layer: 10       # transformer layers
  n_embd: 768       # hidden width
  n_head: 12        # attention heads
# ── Training budget ───────────────────────────────────────────────────────────
# NOTE(review): children re-indented under `train:` — in the flat original,
# `train:` parsed as null and these keys leaked to the top level.
train:
  max_tokens: 4515286950 # full dataset (verified from index.json, 270 chunks)
  lr_warmup_steps: 500   # [AUTO] scaled to 5% of total_steps if auto_config
  save_interval: 1000    # save checkpoint every N optimizer steps
  log_interval: 10       # print log every N steps
  max_norm: 1.0          # gradient clip norm
# ── Optimiser ─────────────────────────────────────────────────────────────────
# NOTE(review): children re-indented under `optimizer:` — in the flat original,
# `optimizer:` parsed as null and these keys leaked to the top level.
optimizer:
  lr: 0.0006         # 6e-4 [AUTO] scaled by sqrt(global_batch/120) if auto_config
  min_lr: 0.00006    # 6e-5
  weight_decay: 0.1
  betas: [0.9, 0.95] # flow style is fine for a short leaf list
  eps: 1.0e-8
# ── Batch sizing ──────────────────────────────────────────────────────────────
# When auto_config: true  → micro_batch and grad_accum are ignored; the script
#                           probes VRAM and fills it to ~82% saturation, then
#                           computes grad_accum to hit global_batch.
# When auto_config: false → micro_batch × grad_accum must equal global_batch.
# NOTE(review): children re-indented under `batch:` — in the flat original,
# `batch:` parsed as null and these keys leaked to the top level.
batch:
  global_batch: 120 # target total samples per optimizer step
  micro_batch: 12   # [MANUAL] samples per GPU forward pass (ignored when auto)
  grad_accum: 10    # [MANUAL] accumulation steps (ignored when auto); 12 × 10 = 120
# ── DataLoader ────────────────────────────────────────────────────────────────
# When auto_config: true  → num_workers auto = cpu_cores // 2, capped by RAM
# When auto_config: false → num_workers used as-is
# NOTE(review): children re-indented under `dataloader:` — in the flat original,
# `dataloader:` parsed as null and these keys leaked to the top level.
dataloader:
  num_workers: -1  # -1 = auto; set to 0 to disable multiprocessing
  pin_memory: true # [AUTO] disabled if RAM < 16GB
# ── Hardware / precision ──────────────────────────────────────────────────────
# When auto_config: true  → precision detected from GPU compute capability
# When auto_config: false → use the value below
# NOTE(review): children re-indented under `hardware:` — in the flat original,
# `hardware:` parsed as null and these keys leaked to the top level.
hardware:
  precision: "bf16" # bf16 | fp16 | fp32
  compile: true     # torch.compile (requires Triton — Linux/cloud only)