| """
|
| LUNA 100M — Config-Driven Dynamic Training Script
|
| ==================================================
|
| Reads train_config.yaml for all hyperparameters.
|
|
|
| auto_config: true -> hardware probed; batch/lr/workers set automatically
|
| auto_config: false -> every value in config used exactly as-is
|
|
|
| Usage:
|
| python train.py # uses train_config.yaml defaults
|
| python train.py --config train_config.yaml # explicit config path
|
| python train.py --data_path /mnt/data/litdata_final # override data path only
|
| python train.py --max_tokens 10000000 # short smoke-test run
|
| """
|
|
|
| import os
|
| import gc
|
| import sys
|
| import math
|
| import time
|
| import json
|
| import argparse
|
| import yaml
|
| import psutil
|
| import torch
|
| import torch.nn as nn
|
| import torch.nn.functional as F
|
| from torch.amp import autocast, GradScaler
|
| from pathlib import Path
|
|
|
|
|
# Default the CUDA allocator config to expandable segments. A value that is
# already present in the environment is left untouched. This runs at import
# time, i.e. before any of the CUDA calls made further down in this file.
if "PYTORCH_CUDA_ALLOC_CONF" not in os.environ:
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
|
|
|
|
|
|
|
|
class RotaryEmbedding(nn.Module):
    """Precomputed cos/sin tables for rotary position embeddings.

    Args:
        dim: Number of channels the rotation covers. The tables are built by
            concatenating ``dim // 2`` frequencies with themselves, so they
            are ``dim`` wide when ``dim`` is even.
        max_seq_len: Number of positions (rows) to precompute.

    Buffers (all registered, so they follow ``.to(device)`` and appear in the
    state dict):
        inv_freq:   ``1 / 10000 ** (2i / dim)`` for ``i`` in ``range(dim // 2)``.
        cos_cached: cosine table, one row per position.
        sin_cached: sine table, one row per position.
    """

    def __init__(self, dim, max_seq_len=1024):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        positions = torch.arange(max_seq_len).float()
        # Outer product: row p holds p * inv_freq.
        freqs = torch.einsum("i,j->ij", positions, inv_freq)
        # Duplicate the angles so both halves used by rotate_half share them.
        emb = torch.cat([freqs, freqs], dim=-1)
        self.register_buffer("cos_cached", emb.cos())
        self.register_buffer("sin_cached", emb.sin())

    def forward(self, seq_len):
        """Return ``(cos, sin)`` tables for the first ``seq_len`` positions.

        Raises:
            ValueError: if ``seq_len`` exceeds the number of cached positions.
                Slicing would otherwise silently return a shorter table and
                fail later with an opaque broadcasting error.
        """
        cached_positions = self.cos_cached.size(0)
        if seq_len > cached_positions:
            raise ValueError(
                f"seq_len={seq_len} exceeds the precomputed rotary cache "
                f"({cached_positions} positions)"
            )
        return self.cos_cached[:seq_len], self.sin_cached[:seq_len]
|
|
|
|
|
def rotate_half(x):
    """Split the last axis in two chunks ``(a, b)`` and return ``(-b, a)``."""
    first, second = torch.chunk(x, 2, dim=-1)
    return torch.cat((second.neg(), first), dim=-1)
|
|
|
|
|
def apply_rotary(x, cos, sin):
    """Rotate ``x`` by the angles encoded in ``cos`` / ``sin``.

    Two leading singleton axes are added to ``cos`` and ``sin`` so they
    broadcast against ``x``. As called from the attention layer in this file,
    ``x`` is ``(batch, heads, seq, rot_dim)`` and the tables are
    ``(seq, rot_dim)``.
    """
    cos_b = cos[None, None]
    sin_b = sin[None, None]
    rotated = rotate_half(x)
    return x * cos_b + rotated * sin_b
|
|
|
|
|
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with partial rotary embeddings.

    Args:
        n_embd: Model width; must be divisible by ``n_head``.
        n_head: Number of attention heads.
        block_size: Maximum sequence length (size of the rotary cache).
        rotary_pct: Fraction of each head's channels that is rotated.

    Raises:
        ValueError: if ``n_embd`` is not a multiple of ``n_head``.
    """

    def __init__(self, n_embd, n_head, block_size, rotary_pct=0.25):
        super().__init__()
        if n_embd % n_head != 0:
            # Previously this only surfaced as a reshape error in forward().
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
            )
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        rot_dim = int(self.head_dim * rotary_pct)
        # The rotary tables are built from channel pairs. An odd width gives
        # tables one column wider than the slice they are applied to, which
        # crashes in forward(), so round down to the nearest even number.
        self.rot_dim = rot_dim - (rot_dim % 2)
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=True)
        self.c_proj = nn.Linear(n_embd, n_embd, bias=True)
        self.rotary = RotaryEmbedding(self.rot_dim, block_size)

    def _rotate_leading(self, tensor, cos, sin):
        """Apply rotary embedding to the first ``rot_dim`` channels only."""
        rotated = apply_rotary(tensor[..., :self.rot_dim], cos, sin)
        return torch.cat([rotated, tensor[..., self.rot_dim:]], dim=-1)

    def forward(self, x):
        """Map ``x`` of shape ``(B, T, C)`` to an output of the same shape."""
        B, T, C = x.size()
        # (B, T, 3*C) -> (3, B, n_head, T, head_dim)
        qkv = self.c_attn(x).reshape(B, T, 3, self.n_head, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        cos, sin = self.rotary(T)
        q = self._rotate_leading(q, cos, sin)
        k = self._rotate_leading(k, cos, sin)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        # (B, n_head, T, head_dim) -> (B, T, C)
        return self.c_proj(y.transpose(1, 2).contiguous().view(B, T, C))
|
|
|
|
|
class MLP(nn.Module):
    """Position-wise feed-forward layer: Linear -> GELU -> Linear.

    The hidden layer is four times as wide as ``n_embd``.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        self.fc = nn.Linear(n_embd, hidden_width, bias=True)
        self.gelu = nn.GELU()
        self.proj = nn.Linear(hidden_width, n_embd, bias=True)

    def forward(self, x):
        """Expand, apply GELU, and project back to ``n_embd`` channels."""
        hidden = self.fc(x)
        hidden = self.gelu(hidden)
        return self.proj(hidden)
|
|
|
|
|
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual."""

    def __init__(self, n_embd, n_head, block_size):
        super().__init__()
        # Registration order is kept as-is: it fixes the parameter order that
        # the state dict and any saved optimizer state rely on.
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = MLP(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual branches."""
        attn_out = self.attn(self.ln1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln2(x))
        return x + mlp_out
|
|
|
|
|
class LUNAModel(nn.Module):
    """Decoder-only transformer language model with tied input/output embeddings.

    Args:
        vocab_size: Number of token ids.
        block_size: Maximum sequence length (passed to every block).
        n_layer: Number of transformer blocks.
        n_embd: Model width.
        n_head: Attention heads per block.
    """

    def __init__(self, vocab_size, block_size, n_layer, n_embd, n_head):
        super().__init__()
        self.wte = nn.Embedding(vocab_size, n_embd)
        self.blocks = nn.ModuleList([Block(n_embd, n_head, block_size) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
        # Weight tying: the output projection shares the embedding matrix.
        self.lm_head.weight = self.wte.weight
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; intended for use with ``nn.Module.apply``.

        Linear and Embedding weights are drawn from N(0, 0.02) and Linear
        biases are zeroed. LayerNorm is reset to weight=1, bias=0 so that
        re-applying this function restores a fully fresh model. Without that,
        LayerNorm kept whatever values earlier optimizer steps left behind.
        """
        if isinstance(m, (nn.Linear, nn.Embedding)):
            m.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                m.bias.data.zero_()
        elif isinstance(m, nn.LayerNorm):
            if m.weight is not None:
                m.weight.data.fill_(1.0)
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, idx, targets=None, return_logits=True):
        """Run the model on token ids ``idx``.

        Args:
            idx: Integer token ids fed to the embedding.
            targets: Optional target ids; flattened with ``view(-1)`` and
                compared against the flattened logits with cross-entropy.
            return_logits: When False the first return value is ``None``.
                The logits are still computed, because the loss needs them.

        Returns:
            ``(logits, loss)``; ``loss`` is ``None`` when ``targets`` is None.
        """
        x = self.wte(idx)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        if not return_logits:
            logits = None
        return logits, loss

    @property
    def num_params(self):
        """Number of unique parameters.

        ``parameters()`` already yields the tied embedding/output matrix only
        once, so no manual correction is needed. Subtracting the embedding
        size on top of that would drop the embedding from the count entirely.
        """
        return sum(p.numel() for p in self.parameters())
|
|
|
|
|
|
|
|
|
class LitDataDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenised binary chunk files.

    ``data_path`` must contain an ``index.json`` whose ``"chunks"`` list gives
    each chunk's ``"filename"`` and its token count ``"dim"``. Each chunk is
    cut into consecutive windows of ``block_size + 1`` tokens; a window yields
    an ``(input, target)`` pair shifted by one token. Tokens left over at the
    end of a chunk (fewer than ``block_size + 1``) are not used.

    Args:
        data_path: Directory holding ``index.json`` and the chunk files.
        block_size: Number of tokens in each returned input/target tensor.
    """

    # Upper bound on decoded chunks kept in memory per dataset instance.
    _MAX_CACHED_CHUNKS = 4

    def __init__(self, data_path: str, block_size: int = 1024):
        self.block_size = block_size
        self.data_path = Path(data_path)
        with open(self.data_path / "index.json", encoding="utf-8") as f:
            index = json.load(f)
        self.chunks_meta = index["chunks"]
        # _cum_blocks[i] = number of blocks in chunks 0..i (inclusive).
        self._cum_blocks = []
        total = 0
        for meta in self.chunks_meta:
            total += meta["dim"] // (block_size + 1)
            self._cum_blocks.append(total)
        self.total_blocks = total
        self._chunk_cache = {}

    def _load_chunk(self, chunk_idx: int):
        """Return the int32 token tensor of chunk ``chunk_idx`` (cached)."""
        cached = self._chunk_cache.get(chunk_idx)
        if cached is not None:
            return cached
        import struct
        import numpy as np
        meta = self.chunks_meta[chunk_idx]
        with open(self.data_path / meta["filename"], "rb") as f:
            raw = f.read()
        # The file starts with a little-endian uint32 item count; the header
        # occupies (count + 2) 32-bit words in total and the int32 token
        # payload follows. NOTE(review): presumably the litdata chunk layout
        # (count + offsets table) - confirm against the writer.
        num_items = struct.unpack_from("<I", raw, 0)[0]
        header_bytes = (num_items + 2) * 4
        # memoryview avoids copying the payload before numpy's own copy.
        payload = memoryview(raw)[header_bytes:]
        tokens = torch.from_numpy(np.frombuffer(payload, dtype=np.int32).copy())
        if len(self._chunk_cache) >= self._MAX_CACHED_CHUNKS:
            # Dicts keep insertion order, so this evicts the oldest entry.
            del self._chunk_cache[next(iter(self._chunk_cache))]
        self._chunk_cache[chunk_idx] = tokens
        return tokens

    def __len__(self):
        return self.total_blocks

    def __getitem__(self, idx):
        """Return ``(input, target)`` int64 tensors for block ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is out of range. Previously such indices
                silently returned zero-padded data read from chunk 0.
        """
        from bisect import bisect_right

        requested = idx
        if idx < 0:
            idx += self.total_blocks
        if not 0 <= idx < self.total_blocks:
            raise IndexError(
                f"block index {requested} out of range for dataset of "
                f"{self.total_blocks} blocks"
            )
        # First chunk whose cumulative block count exceeds idx.
        chunk_idx = bisect_right(self._cum_blocks, idx)
        prev = self._cum_blocks[chunk_idx - 1] if chunk_idx > 0 else 0
        tokens = self._load_chunk(chunk_idx)
        window = self.block_size + 1
        start = (idx - prev) * window
        chunk = tokens[start:start + window]
        if len(chunk) < window:
            # The file held fewer tokens than index.json promised: zero-pad.
            padded = torch.zeros(window, dtype=torch.int32)
            padded[:len(chunk)] = chunk
            chunk = padded
        chunk = chunk.long()
        return chunk[:self.block_size], chunk[1:self.block_size + 1]
|
|
|
|
|
|
|
|
|
def probe_hardware():
    """Collect a description of the host and (if present) CUDA device 0.

    Returns a dict with keys ``cpu_cores``, ``ram_gb``, ``device``,
    ``gpu_name``, ``vram_gb``, ``sm_major``, ``precision`` and ``dtype``.

    Side effect: when the GPU's compute-capability major version is 8 or
    higher, TF32 is enabled for CUDA matmul and cuDNN.
    """
    gib = 1024**3
    info = {
        "cpu_cores": os.cpu_count() or 4,
        "ram_gb": psutil.virtual_memory().total / gib,
    }

    if not torch.cuda.is_available():
        info["device"] = "cpu"
        info["gpu_name"] = "CPU"
        info["vram_gb"] = 0
        info["sm_major"] = 0
        info["precision"] = "fp32"
        info["dtype"] = torch.float32
        return info

    props = torch.cuda.get_device_properties(0)
    info["device"] = "cuda"
    info["gpu_name"] = props.name
    info["vram_gb"] = props.total_memory / gib
    info["sm_major"] = props.major

    if props.major < 8:
        info["precision"] = "fp16"
        info["dtype"] = torch.float16
        return info

    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    info["precision"] = "bf16"
    info["dtype"] = torch.bfloat16
    return info
|
|
|
|
|
def probe_max_batch(model, device, dtype, seq_len, vocab_size, max_search=4096, grad_accum_sim=4):
    """Binary-search the largest micro-batch that survives a full training step.

    Each trial runs ``grad_accum_sim`` forward+backward passes on random token
    ids followed by an AdamW step, so gradients and optimizer state are
    included in the memory footprint. The result is scaled by 0.70 as a
    safety margin.

    Args:
        model: Module called as ``model(x, t, return_logits=False)`` and
            returning ``(_, loss)``.
        device: Device the random batches are created on.
        dtype: Autocast dtype used for the forward pass.
        seq_len: Sequence length of the probe batches.
        vocab_size: Exclusive upper bound for the random token ids.
        max_search: Largest micro-batch size to consider.
        grad_accum_sim: Accumulation steps simulated per trial.

    Returns:
        ``max(1, int(best * 0.70))`` where ``best`` is the largest batch that
        did not run out of memory (1 if every trial failed).

    The model's weights are snapshotted to CPU before probing and restored
    afterwards, so the probe's optimizer steps leave no trace in the model.
    """
    oom_type = getattr(torch.cuda, "OutOfMemoryError", None)

    def _is_oom(err):
        # Older PyTorch raises a plain RuntimeError whose text mentions OOM.
        if oom_type is not None and isinstance(err, oom_type):
            return True
        return "out of memory" in str(err).lower()

    def _trial(batch, opt):
        """One simulated optimizer step at micro-batch size ``batch``."""
        for _ in range(grad_accum_sim):
            x = torch.randint(0, vocab_size, (batch, seq_len), device=device)
            t = torch.randint(0, vocab_size, (batch, seq_len), device=device)
            with autocast(device_type="cuda", dtype=dtype):
                _, loss = model(x, t, return_logits=False)
                loss = loss / grad_accum_sim
            loss.backward()
            del x, t, loss
        opt.step()

    # CPU copy, so the snapshot does not eat into the VRAM being measured.
    snapshot = {
        name: tensor.detach().to("cpu", copy=True)
        for name, tensor in model.state_dict().items()
    }
    tmp_opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
    lo, hi, best = 1, max_search, 1
    try:
        while lo <= hi:
            mid = (lo + hi) // 2
            gc.collect()
            torch.cuda.empty_cache()
            tmp_opt.zero_grad(set_to_none=True)

            fits = True
            try:
                _trial(mid, tmp_opt)
            except RuntimeError as err:
                if not _is_oom(err):
                    raise
                fits = False

            # Recover *outside* the except block: while the handler is active
            # the traceback keeps the failed trial's tensors alive, so freeing
            # memory inside it is largely ineffective.
            tmp_opt.zero_grad(set_to_none=True)
            gc.collect()
            torch.cuda.empty_cache()

            if fits:
                best = mid
                lo = mid + 1
            else:
                hi = mid - 1
    finally:
        del tmp_opt
        gc.collect()
        torch.cuda.empty_cache()
        model.load_state_dict(snapshot)

    safe = max(1, int(best * 0.70))
    print(f" Probe found max_batch={best}, using {safe} (70% safety, tested with {grad_accum_sim} accum steps)")
    return safe
|
|
|
|
|
|
|
|
|
def cosine_lr(step, warmup, total, lr_max, lr_min):
    """Learning rate for ``step``: linear warmup, then cosine decay.

    Args:
        step: Zero-based optimizer step.
        warmup: Number of warmup steps; step ``warmup - 1`` reaches ``lr_max``.
        total: Step at which the decay reaches ``lr_min``.
        lr_max: Peak learning rate.
        lr_min: Final learning rate.

    Returns:
        ``lr_max * (step + 1) / warmup`` during warmup, otherwise the cosine
        interpolation between ``lr_max`` and ``lr_min``. For ``step`` beyond
        ``total`` the rate stays at ``lr_min``.
    """
    if step < warmup:
        return lr_max * (step + 1) / warmup
    # Clamp progress to 1. Left unclamped, the cosine turns around past
    # ``total`` and the learning rate climbs back towards lr_max.
    progress = min(1.0, (step - warmup) / max(1, total - warmup))
    return lr_min + 0.5 * (1 + math.cos(math.pi * progress)) * (lr_max - lr_min)
|
|
|
|
|
|
|
|
|
def load_config(config_path: str) -> dict:
    """Read the YAML config at ``config_path`` and flatten it into one dict.

    Top-level keys fall back to built-in defaults. Every key listed under the
    ``model``, ``train``, ``optimizer``, ``batch``, ``dataloader`` and
    ``hardware`` sections is required, and a missing one raises ``KeyError``.
    ``betas`` is converted to a tuple.
    """
    with open(config_path, encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    top_level_defaults = (
        ("auto_config", True),
        ("data_path", "Base/data/litdata_pretrain_final"),
        ("out_dir", "out/pretrain/luna-100m"),
        ("tokenizer_dir", "Base/checkpoints/EleutherAI/pythia-160m"),
    )
    required_by_section = (
        ("model", ("vocab_size", "seq_len", "n_layer", "n_embd", "n_head")),
        ("train", ("max_tokens", "lr_warmup_steps", "save_interval",
                   "log_interval", "max_norm")),
        ("optimizer", ("lr", "min_lr", "weight_decay", "betas", "eps")),
        ("batch", ("global_batch", "micro_batch", "grad_accum")),
        ("dataloader", ("num_workers", "pin_memory")),
        ("hardware", ("precision", "compile")),
    )

    cfg = {key: raw.get(key, default) for key, default in top_level_defaults}
    for section, keys in required_by_section:
        for key in keys:
            value = raw[section][key]
            # YAML yields a list for betas; it is stored as a tuple.
            cfg[key] = tuple(value) if key == "betas" else value
    return cfg
|
|
|
|
|
def apply_cli_overrides(cfg: dict, cli_args: argparse.Namespace) -> dict:
    """Copy every explicitly supplied CLI value into ``cfg`` (in place).

    Arguments left at ``None`` are treated as "not provided" and skipped;
    ``config`` (the path of the config file itself) is never copied.
    Returns the same ``cfg`` object.
    """
    provided = {
        name: value
        for name, value in vars(cli_args).items()
        if name != "config" and value is not None
    }
    cfg.update(provided)
    return cfg
|
|
|
|
|
def resolve_auto(cfg: dict, hw: dict) -> dict:
    """Adapt ``cfg`` to the probed hardware ``hw`` when auto_config is on.

    With ``auto_config`` false, ``cfg`` is returned untouched apart from the
    injected ``_hw`` entry. Otherwise the following are overwritten in place:
    ``precision`` / ``_dtype`` (taken from ``hw``), ``num_workers``,
    ``pin_memory``, ``_auto_warmup`` (flag set to True), and ``lr`` /
    ``min_lr`` (scaled by ``sqrt(global_batch / 120)``). Model architecture
    and ``max_tokens`` are not touched.

    Returns the same ``cfg`` object with ``_hw`` set to ``hw``.
    """
    if not cfg["auto_config"]:
        print(" [CONFIG] auto_config=false -- using manual values as-is")
        cfg["_hw"] = hw
        return cfg

    print(" [CONFIG] auto_config=true -- tuning settings to this hardware")

    cfg["precision"] = hw["precision"]
    cfg["_dtype"] = hw["dtype"]

    # Workers: half the cores, capped by a RAM budget of 25% of system memory
    # at 268 (MB, presumably) per worker.
    cores = hw["cpu_cores"]
    ram_cap = max(0, int(hw["ram_gb"] * 0.25 * 1024 / 268))
    workers = min(cores // 2, ram_cap, cores)
    cfg["num_workers"] = 0 if workers == -1 else workers

    cfg["pin_memory"] = hw["ram_gb"] > 16 and hw["device"] == "cuda"

    cfg["_auto_warmup"] = True

    # Square-root learning-rate scaling relative to a global batch of 120.
    lr_scale = math.sqrt(cfg["global_batch"] / 120)
    cfg["lr"] = cfg["lr"] * lr_scale
    cfg["min_lr"] = cfg["min_lr"] * lr_scale

    cfg["_hw"] = hw
    return cfg
|
|
|
|
|
|
|
|
|
SEP = "=" * 72


def train(cfg: dict):
    """Run the training job described by ``cfg``.

    ``cfg`` is the flat dict produced by the config loader after hardware
    resolution; it must contain ``_hw`` (the probed hardware dict).

    Steps: build the model, optionally probe the largest micro-batch (auto
    mode on CUDA), build the dataset/loader and optimizer, resume from
    ``<out_dir>/latest.pt`` if present, then train with gradient accumulation,
    cosine LR schedule and gradient clipping. Checkpoints are written every
    ``save_interval`` steps and at the last step; the final weights (and the
    tokenizer directory, if it exists) go to ``<out_dir>/final``.

    Raises:
        ValueError: if the dataset holds fewer blocks than one micro-batch,
            so the loader (``drop_last=True``) would never yield a batch.
    """
    hw = cfg["_hw"]
    device = torch.device(hw["device"])
    on_cuda = device.type == "cuda"

    if on_cuda:
        torch.cuda.empty_cache()
        gc.collect()
        free_gb = (torch.cuda.get_device_properties(0).total_memory
                   - torch.cuda.memory_allocated()) / 1024**3
        print(f" GPU free before model load: {free_gb:.1f} GB")

    # Autocast dtype: probed in auto mode, taken from the config otherwise.
    if cfg["auto_config"]:
        dtype = hw.get("dtype", torch.float32)
    else:
        dtype = {"bf16": torch.bfloat16, "fp16": torch.float16,
                 "fp32": torch.float32}.get(cfg["precision"], torch.float32)

    print(SEP)
    print(" LUNA 100M - Training")
    print(SEP)
    mode = "AUTO" if cfg["auto_config"] else "MANUAL"
    print(f" Config mode : {mode}")
    print(f" GPU : {hw['gpu_name']} ({hw['vram_gb']:.1f} GB)")
    print(f" RAM : {hw['ram_gb']:.1f} GB CPU: {hw['cpu_cores']} cores")
    print(f" Precision : {cfg['precision']} dtype={dtype}")
    print(f" Workers : {cfg['num_workers']} pin_memory={cfg['pin_memory']}")

    print("\n Building LUNA-100M...")
    model = LUNAModel(
        vocab_size=cfg["vocab_size"],
        block_size=cfg["seq_len"],
        n_layer=cfg["n_layer"],
        n_embd=cfg["n_embd"],
        n_head=cfg["n_head"],
    ).to(device)

    # The ``compile`` config value is currently not acted upon.
    print(" torch.compile: disabled (not needed for 100M params)")
    print(f" Parameters: {model.num_params:,} (unique)")

    # ---- batch geometry ---------------------------------------------------
    if cfg["auto_config"] and on_cuda:
        print("\n Probing max micro_batch_size (VRAM search)...")
        max_mbs = probe_max_batch(
            model, device, dtype, cfg["seq_len"], cfg["vocab_size"]
        )
        # The probe ran optimizer steps on the model: start from fresh weights.
        model.apply(model._init_weights)
        torch.cuda.empty_cache()
        gc.collect()

        grad_accum = max(1, math.ceil(cfg["global_batch"] / max_mbs))
        effective_batch = max_mbs * grad_accum
        print(f" AUTO -> micro_batch={max_mbs}, grad_accum={grad_accum}, "
              f"effective_batch={effective_batch}")
    else:
        max_mbs = cfg["micro_batch"]
        grad_accum = cfg["grad_accum"]
        effective_batch = max_mbs * grad_accum
        print(f"\n MANUAL -> micro_batch={max_mbs}, grad_accum={grad_accum}, "
              f"effective_batch={effective_batch}")

    tokens_per_step = effective_batch * cfg["seq_len"]
    print(f" Tokens/step : {tokens_per_step:,}")

    # ---- data ---------------------------------------------------------------
    print(f"\n Dataset: {cfg['data_path']}")
    dataset = LitDataDataset(cfg["data_path"], block_size=cfg["seq_len"])
    print(f" Blocks : {len(dataset):,} ({len(dataset) * cfg['seq_len']:,} tokens)")

    use_workers = cfg["num_workers"] > 0
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=max_mbs,
        shuffle=True,
        num_workers=cfg["num_workers"],
        pin_memory=cfg["pin_memory"],
        drop_last=True,
        prefetch_factor=4 if use_workers else None,
        persistent_workers=use_workers,
    )
    if len(loader) == 0:
        # Otherwise the first batch fetch dies with a bare StopIteration.
        raise ValueError(
            f"dataset has {len(dataset)} blocks, fewer than one micro-batch "
            f"of {max_mbs}; nothing to train on"
        )

    # ---- optimizer ----------------------------------------------------------
    adamw_kwargs = dict(
        lr=cfg["lr"], weight_decay=cfg["weight_decay"],
        betas=cfg["betas"], eps=cfg["eps"],
    )
    try:
        optimizer = torch.optim.AdamW(model.parameters(), fused=True, **adamw_kwargs)
    except (TypeError, RuntimeError):
        # TypeError: this PyTorch has no ``fused`` argument.
        # RuntimeError: fused kernels are unsupported for these parameters
        # (e.g. CPU tensors on older releases).
        optimizer = torch.optim.AdamW(model.parameters(), **adamw_kwargs)

    # Loss scaling is only needed for fp16.
    use_scaler = dtype == torch.float16
    scaler = GradScaler(enabled=use_scaler)

    # ---- schedule -----------------------------------------------------------
    total_steps = max(1, cfg["max_tokens"] // tokens_per_step)
    if cfg["auto_config"] and cfg.get("_auto_warmup"):
        warmup_steps = max(50, min(500, total_steps // 20))
    else:
        warmup_steps = min(cfg["lr_warmup_steps"], total_steps)

    out_dir = Path(cfg["out_dir"])
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n max_tokens : {cfg['max_tokens']:,}")
    print(f" total_steps : {total_steps:,}")
    print(f" warmup_steps : {warmup_steps}")
    print(f" lr : {cfg['lr']:.2e} -> {cfg['min_lr']:.2e}")
    print(f" save every : {cfg['save_interval']} steps")
    print(f" out_dir : {out_dir}")
    print(SEP)

    # ---- resume -------------------------------------------------------------
    start_step = 0
    tokens_seen = 0
    ckpt_path = out_dir / "latest.pt"
    if ckpt_path.exists():
        print(f"\n Resuming from {ckpt_path}...")
        ckpt = torch.load(ckpt_path, map_location=device, weights_only=True)
        model.load_state_dict(ckpt["model"])
        optimizer.load_state_dict(ckpt["optimizer"])
        start_step = ckpt["step"]
        # Prefer the stored count: the batch geometry may have changed since.
        tokens_seen = ckpt.get("tokens_seen", start_step * tokens_per_step)
        print(f" Resumed at step {start_step}")

    model.train()
    data_iter = iter(loader)

    def get_batch():
        """Next micro-batch, restarting the loader when an epoch ends."""
        nonlocal data_iter
        try:
            return next(data_iter)
        except StopIteration:
            data_iter = iter(loader)
            return next(data_iter)

    def unwrapped_model():
        """The underlying module if ``model`` is a torch.compile wrapper."""
        return getattr(model, "_orig_mod", model)

    def save_checkpoint(at_step):
        """Write the per-step weights and atomically refresh ``latest.pt``."""
        step_dir = out_dir / f"step-{at_step:08d}"
        step_dir.mkdir(parents=True, exist_ok=True)
        model_state = unwrapped_model().state_dict()
        torch.save(model_state, step_dir / "lit_model.pth")
        # Write to a temp file first, then rename. An interrupted save must
        # not leave a truncated latest.pt, which would break every resume.
        tmp_path = out_dir / "latest.pt.tmp"
        torch.save({"step": at_step, "model": model_state,
                    "optimizer": optimizer.state_dict(),
                    "tokens_seen": tokens_seen},
                   tmp_path)
        os.replace(tmp_path, ckpt_path)
        print(f" Saved -> {step_dir}")

    run_t0 = time.perf_counter()
    step = start_step

    print(f"\n Starting training (step {start_step} -> {total_steps})...")

    # ---- main loop ----------------------------------------------------------
    while step < total_steps:
        t0 = time.perf_counter()
        lr_now = cosine_lr(step, warmup_steps, total_steps, cfg["lr"], cfg["min_lr"])
        for pg in optimizer.param_groups:
            pg["lr"] = lr_now

        optimizer.zero_grad(set_to_none=True)
        # Accumulate on-device; calling .item() per micro-batch would force a
        # host/device sync inside the accumulation loop.
        loss_sum = torch.zeros((), device=device)

        for _ in range(grad_accum):
            x, t = get_batch()
            x = x.to(device, non_blocking=True)
            t = t.to(device, non_blocking=True)
            with autocast(device_type=device.type, dtype=dtype, enabled=on_cuda):
                _, loss = model(x, t, return_logits=False)
                loss = loss / grad_accum
            scaler.scale(loss).backward()
            loss_sum += loss.detach().float()

        # Gradients must be unscaled before clipping by norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), cfg["max_norm"])
        scaler.step(optimizer)
        scaler.update()

        if on_cuda:
            # Make the wall-clock measurement cover the whole step.
            torch.cuda.synchronize()

        dt = time.perf_counter() - t0
        step += 1
        tokens_seen += tokens_per_step

        if step % cfg["log_interval"] == 0 or step <= 2:
            total_loss = loss_sum.item()
            tps = tokens_per_step / dt
            steps_left = total_steps - step
            eta_h = steps_left * dt / 3600
            vram = torch.cuda.max_memory_allocated() / 1024**3 if on_cuda else 0
            print(f" step {step:6d}/{total_steps} | loss {total_loss:.4f} | "
                  f"lr {lr_now:.2e} | {tps:,.0f} tok/s | VRAM {vram:.1f}GB | ETA {eta_h:.1f}h")

        if step % cfg["save_interval"] == 0 or step == total_steps:
            save_checkpoint(step)

    # ---- final export -------------------------------------------------------
    final_dir = out_dir / "final"
    final_dir.mkdir(parents=True, exist_ok=True)
    torch.save(unwrapped_model().state_dict(), final_dir / "lit_model.pth")

    import shutil
    tok_src = Path(cfg["tokenizer_dir"])
    if tok_src.exists():
        shutil.copytree(tok_src, final_dir / "tokenizer", dirs_exist_ok=True)

    total_h = (time.perf_counter() - run_t0) / 3600
    print(SEP)
    print(f" Done! {total_h:.2f} h -> {final_dir}")
    print(SEP)
|
|
|
|
|
|
|
|
|
def parse_args():
    """Parse the command line.

    ``--config`` selects the YAML file (default ``train_config.yaml``). Every
    other option defaults to ``None``, meaning "not given on the command
    line". ``--auto_config`` is True for ``1`` / ``true`` / ``yes``
    (case-insensitive) and False for any other text.
    """
    def _as_bool(text):
        return text.lower() in ("1", "true", "yes")

    parser = argparse.ArgumentParser(description="LUNA 100M Trainer")
    parser.add_argument("--config", type=str, default="train_config.yaml",
                        help="Path to train_config.yaml")

    # Optional overrides for individual config values.
    override_flags = (
        ("--data_path", str),
        ("--out_dir", str),
        ("--max_tokens", int),
        ("--micro_batch", int),
        ("--global_batch", int),
        ("--lr", float),
        ("--num_workers", int),
        ("--save_interval", int),
        ("--log_interval", int),
    )
    for flag, caster in override_flags:
        parser.add_argument(flag, type=caster, default=None)

    parser.add_argument("--auto_config", type=_as_bool, default=None,
                        help="Override auto_config (true/false)")
    return parser.parse_args()
|
|
|
|
|
def main():
    """Script entry point: config -> CLI overrides -> hardware probe -> train."""
    cli = parse_args()
    settings = apply_cli_overrides(load_config(cli.config), cli)
    hardware = probe_hardware()
    train(resolve_auto(settings, hardware))


if __name__ == "__main__":
    main()
|
|
|