#!/usr/bin/env python3
"""
Chimera 5.2 — CPU-first training script.
Highlights vs the previous version:
* MeZO optimiser uses a single deterministic seed per step, samples each
parameter's perturbation direction *on demand* via per-parameter seeds and
drops the heavy direction cache. This brings the memory cost of MeZO back
down to "1× model" exactly as advertised.
* AdamW path uses fused parameter groups and shares the same loss closure as
MeZO so accumulation and logging are identical between modes.
* Logging never references an undefined ``lr`` (the previous draft could print
  it before the first AdamW accumulation boundary had assigned it).
* Gradient checkpointing falls back to ``use_reentrant=False`` (the modern,
faster path).
* Tokeniser/dataset loading is unchanged, but a positive ``--max_tokens`` now
  streams tokens into a pre-allocated buffer instead of growing a Python list.
Recommended commands::
# MeZO smoke test on TinyStories
python train.py --scale tiny --seq_len 64 --max_steps 20 --optimizer mezo
# AdamW with grad checkpointing + bf16
python train.py --scale small --seq_len 256 --max_steps 1000 \\
--optimizer adamw --grad_checkpoint --bf16
"""
from __future__ import annotations
import argparse
import json
import math
import os
import sys
import time
# CPU threading must be configured *before* importing torch.
def _setup_cpu_runtime() -> None:
n_cpus = os.cpu_count() or 4
os.environ.setdefault("OMP_NUM_THREADS", str(n_cpus))
os.environ.setdefault("MKL_NUM_THREADS", str(n_cpus))
os.environ.setdefault("KMP_AFFINITY", "granularity=fine,compact,1,0")
os.environ.setdefault("KMP_BLOCKTIME", "1")
os.environ.setdefault("MALLOC_CONF", "background_thread:true,metadata_thp:auto")
_setup_cpu_runtime()
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from chimera import Chimera51ForCausalLM
from chimera.quantization import BitLinear
torch.set_num_threads(int(os.environ.get("OMP_NUM_THREADS", os.cpu_count() or 4)))
try:
torch.set_num_interop_threads(int(os.environ.get("CHIMERA_INTEROP_THREADS", "1")))
except RuntimeError:
pass
# Optional Intel Extension for PyTorch.
HAS_IPEX = False
try: # pragma: no cover - optional dependency.
import intel_extension_for_pytorch as ipex # noqa: F401
HAS_IPEX = True
except Exception:
pass
# ---------------------------------------------------------------------------
# MeZO optimiser
# ---------------------------------------------------------------------------
class MeZOOptimizer:
"""Memory-Efficient Zeroth-Order optimiser (Princeton MeZO).
Each step runs *two* forward passes around ``θ`` and uses the resulting
loss difference to estimate a projected gradient. No backward pass and
no per-parameter optimiser state — memory cost is exactly ``1× model``.
For BitLinear layers we mask perturbations to currently non-zero ternary
positions, so ``~1/3`` of the weights skip both perturbation and update.
"""
def __init__(self, model: nn.Module, lr: float = 1e-4, eps: float = 1e-3,
weight_decay: float = 0.0, momentum: float = 0.0,
direction: str = "rademacher"):
self.model = model
self.lr = float(lr)
self.eps = float(eps)
self.wd = float(weight_decay)
self.momentum = float(momentum)
if direction not in ("rademacher", "gaussian"):
raise ValueError(f"unknown direction: {direction!r}")
self.direction = direction
# Collect trainable parameters once and deduplicate tied weights.
self._bitlinear_modules: list[tuple[str, BitLinear]] = []
self._dense_params: list[tuple[str, torch.Tensor]] = []
seen: set[int] = set()
for name, module in model.named_modules():
if isinstance(module, BitLinear):
self._bitlinear_modules.append((name, module))
seen.add(id(module.weight))
if module.bias is not None:
seen.add(id(module.bias))
for name, p in model.named_parameters():
if p.requires_grad and id(p) not in seen:
self._dense_params.append((name, p))
seen.add(id(p))
# Optional momentum buffer — only allocated when momentum > 0.
self._momentum: dict[int, torch.Tensor] = {}
if self.momentum > 0:
for _, p in self._dense_params:
self._momentum[id(p)] = torch.zeros_like(p.data)
for _, m in self._bitlinear_modules:
self._momentum[id(m.weight)] = torch.zeros_like(m.weight.data)
# Snapshot ternary non-zero masks once per step.
self._step_masks: dict[int, torch.Tensor] = {}
# ------------------------------------------------------------------
# Direction sampling — deterministic per (step seed, parameter index).
# ------------------------------------------------------------------
def _direction(self, p: torch.Tensor, seed: int) -> torch.Tensor:
gen = torch.Generator(device="cpu")
gen.manual_seed(int(seed) & 0x7FFF_FFFF_FFFF_FFFF)
if self.direction == "gaussian":
return torch.randn(p.shape, dtype=p.dtype, device="cpu",
generator=gen).to(p.device)
z = torch.empty(p.shape, dtype=p.dtype, device="cpu")
z.bernoulli_(0.5, generator=gen).mul_(2).sub_(1)
return z.to(p.device)
    def _walk_params(self):
        """Yield ``(seed_offset, param, mask_or_None)`` for every trainable tensor.

        Parameters are yielded directly (never their ``.data`` views, whose
        Python ids change on every access) so that ``id(p)`` lookups into the
        momentum buffers and ternary masks stay stable.
        """
        offset = 0
        for _, module in self._bitlinear_modules:
            yield offset, module.weight, self._step_masks.get(id(module.weight))
            offset += 1
            if module.bias is not None:
                yield offset, module.bias, None
                offset += 1
        for _, p in self._dense_params:
            yield offset, p, None
            offset += 1
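    # Per-parameter seeds are ``base_seed + offset * 1_000_003``; spacing the
    # offsets by a large prime keeps neighbouring generator seeds distinct, and
    # because _perturb and _update re-derive the same (seed, offset) pairs, the
    # direction z never has to be cached between the two phases of a step.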
def _perturb(self, base_seed: int, scale: float) -> None:
for off, p, mask in self._walk_params():
z = self._direction(p, base_seed + off * 1_000_003)
if mask is not None:
z = z * mask.to(dtype=z.dtype, device=z.device)
p.add_(z, alpha=scale)
# Mark BitLinear caches stale.
for _, m in self._bitlinear_modules:
m.invalidate_packed()
def _update(self, base_seed: int, projected_grad: float) -> None:
for off, p, mask in self._walk_params():
z = self._direction(p, base_seed + off * 1_000_003)
if mask is not None:
z = z * mask.to(dtype=z.dtype, device=z.device)
buf = self._momentum.get(id(p))
if buf is not None:
buf.mul_(self.momentum).add_(z, alpha=projected_grad)
p.add_(buf, alpha=-self.lr)
else:
p.add_(z, alpha=-self.lr * projected_grad)
if self.wd > 0:
p.mul_(1 - self.lr * self.wd)
for _, m in self._bitlinear_modules:
m.invalidate_packed()
@torch.no_grad()
def step(self, loss_fn, batch) -> float:
"""Run one MeZO step (two forward passes) and return the mean loss."""
seed = int(torch.randint(0, 2**31, (1,)).item())
# Snapshot ternary non-zero masks once for this step.
self._step_masks = {
id(m.weight): m.ternary_nonzero_mask().detach()
for _, m in self._bitlinear_modules
}
# Forward at θ + εz.
self._perturb(seed, +self.eps)
loss_pos = float(loss_fn(batch).item())
# Net displacement: θ + εz - 2εz = θ - εz.
self._perturb(seed, -2.0 * self.eps)
loss_neg = float(loss_fn(batch).item())
# Restore θ.
self._perturb(seed, +self.eps)
projected_grad = (loss_pos - loss_neg) / (2.0 * self.eps)
self._update(seed, projected_grad)
self._step_masks = {}
return 0.5 * (loss_pos + loss_neg)
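    # Usage sketch (mirrors the MeZO branch of the training loop in train()):
    #   opt = MeZOOptimizer(model, lr=1e-5, eps=1e-3, momentum=0.9,
    #                       weight_decay=0.1)
    #   mean_loss = opt.step(compute_loss, batch)  # two forwards, one update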
# ---------------------------------------------------------------------------
# Dataset & tokenisation helpers.
# ---------------------------------------------------------------------------
class TokenDataset(Dataset):
def __init__(self, chunks: torch.Tensor):
self.chunks = chunks
def __len__(self) -> int:
return self.chunks.size(0)
def __getitem__(self, idx: int) -> dict:
c = self.chunks[idx]
return {"input_ids": c, "labels": c}
def _matches_category_filter(ex: dict, filters: list[str]) -> bool:
cat = ex.get("category", "") or ""
if not cat:
return False
cat_lower = cat.lower()
return any(f.lower() in cat_lower for f in filters)
def _format_example(ex: dict, tok, text_column: str = "auto",
include_reasoning: bool = False) -> str:
if text_column == "auto":
for cand in ("messages", "text", "content", "conversation"):
if cand in ex:
text_column = cand
break
else:
text_column = ""
if text_column == "messages" and "messages" in ex:
msgs = ex["messages"]
if include_reasoning and isinstance(msgs, list):
new_msgs = []
for m in msgs:
if isinstance(m, dict) and m.get("role") == "assistant" and "reasoning" in m:
new_msgs.append({
"role": "assistant",
"content": (f"<|thinking|>\n{m['reasoning']}\n<|/thinking|>\n"
f"{m.get('content', '')}"),
})
else:
new_msgs.append(m)
msgs = new_msgs
return tok.apply_chat_template(msgs)
if text_column and text_column in ex:
val = ex[text_column]
if isinstance(val, str):
return val
if isinstance(val, list) and val and isinstance(val[0], dict):
return tok.apply_chat_template(val)
return str(val)
return str(ex)
def build_dataset(seq_len: int, max_samples: int | None = None,
                  max_tokens: int | None = None,
                  split: str = "train",
                  dataset_name: str = "roneneldan/TinyStories",
                  dataset_config: str | None = None, text_column: str = "auto",
                  category_filter: str | None = None,
                  include_reasoning: bool = False):
from datasets import load_dataset
from chimera import ChimeraTokenizer
print(f"[DATA] Loading {dataset_name} ({split})...")
load_kwargs = {"split": split, "streaming": True}
if dataset_config:
load_kwargs["name"] = dataset_config
ds = load_dataset(dataset_name, **load_kwargs)
tok = ChimeraTokenizer(pretrained="o200k_base")
cat_filters = ([c.strip() for c in category_filter.split(",") if c.strip()]
if category_filter else None)
if cat_filters:
print(f"[DATA] Filtering categories: {cat_filters}")
if max_tokens is not None:
token_budget = int(max_tokens)
elif max_samples is not None:
token_budget = int(max_samples) * (seq_len + 1)
else:
token_budget = None
if token_budget is None or token_budget <= 0:
        # Fallback: list-based collection (no fixed token budget). Counters are
        # tracked here too so the summary print below works for both branches.
        all_ids: list[int] = []
        processed = skipped = 0
        target = (max_samples * (seq_len + 1)) if max_samples else float("inf")
        for ex in ds:
            if cat_filters and not _matches_category_filter(ex, cat_filters):
                skipped += 1
                continue
            text = _format_example(ex, tok, text_column, include_reasoning)
            if not text or not text.strip():
                skipped += 1
                continue
            ids = tok.encode(text, add_special_tokens=False)
            ids.append(tok.eos_token_id)
            all_ids.extend(ids)
            processed += 1
            if len(all_ids) >= target:
                break
        all_ids = torch.tensor(all_ids, dtype=torch.long)
else:
# Pre-allocated token buffer.
buffer = torch.empty(token_budget, dtype=torch.long)
buf_idx = 0
processed = skipped = 0
for ex in ds:
if cat_filters and not _matches_category_filter(ex, cat_filters):
skipped += 1
continue
text = _format_example(ex, tok, text_column, include_reasoning)
if not text or not text.strip():
skipped += 1
continue
ids = tok.encode(text, add_special_tokens=False)
ids.append(tok.eos_token_id)
n = len(ids)
if buf_idx + n > token_budget:
n = token_budget - buf_idx
if n <= 0:
break
ids = ids[:n]
if n > 0:
buffer[buf_idx:buf_idx + n] = torch.tensor(ids, dtype=torch.long)
buf_idx += n
processed += 1
if buf_idx >= token_budget:
break
if (processed % 10_000) == 0:
print(f" {processed:,} examples, {buf_idx:,} tokens...")
all_ids = buffer[:buf_idx]
print(f"[DATA] Processed {processed:,} examples, skipped {skipped:,}.")
if all_ids.numel() == 0:
raise ValueError("No data matched filters.")
n = all_ids.numel() // (seq_len + 1)
if max_samples:
n = min(n, max_samples)
chunks = all_ids[:n * (seq_len + 1)].view(n, seq_len + 1)
print(f"[DATA] {n:,} chunks × {seq_len} tokens = {n * seq_len:,} total")
return TokenDataset(chunks), tok
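# Usage sketch (hypothetical numbers; any streaming HF dataset should work):
#   ds, tok = build_dataset(seq_len=64, max_tokens=1_000_000)
#   ds[0]["input_ids"].shape  # -> torch.Size([65]), i.e. seq_len + 1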
# ---------------------------------------------------------------------------
# Learning-rate schedule.
# ---------------------------------------------------------------------------
def cosine_lr(step: int, warmup: int, total: int, max_lr: float, min_lr: float
) -> float:
if warmup > 0 and step < warmup:
return max_lr * (step + 1) / warmup
if step >= total:
return min_lr
p = (step - warmup) / max(1, total - warmup)
return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * p))
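# Worked example: warmup=100, total=1000, max_lr=1e-3, min_lr=1e-4 gives
#   cosine_lr(0, ...)    -> 1e-5   (linear ramp: max_lr * 1/100)
#   cosine_lr(99, ...)   -> 1e-3   (end of warmup)
#   cosine_lr(550, ...)  -> 5.5e-4 (midpoint of the cosine)
#   cosine_lr(1000, ...) -> 1e-4   (floor)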
# ---------------------------------------------------------------------------
# Main loop.
# ---------------------------------------------------------------------------
_SCALE_PRESETS = {
"tiny": dict(hidden_size=256, intermediate_size=512, num_heads=4, head_dim=48),
"small": dict(hidden_size=512, intermediate_size=1024, num_heads=8, head_dim=48),
"medium": dict(hidden_size=1024, intermediate_size=2048, num_heads=8, head_dim=96),
}
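# ``--scale full`` is also accepted by the CLI; it has no preset here, so the
# model dimensions come straight from config.json.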
def train(args) -> None:
with open(args.config) as f:
config = json.load(f)
if args.scale in _SCALE_PRESETS:
config.update(_SCALE_PRESETS[args.scale])
config["num_hidden_layers"] = int(config.get("num_hidden_layers", 28))
config["vocab_size"] = config.get("vocab_size", 200073)
config.setdefault("gated_deltanet", {})["chunk_size"] = min(args.seq_len, 64)
config.setdefault("xlstm", {})["memory_size_per_head"] = [config["head_dim"], config["head_dim"]]
config.setdefault("titans", {}).update({
"memory_depth": 2, "persistent_memory_slots": 16,
"local_window_size": min(args.seq_len, 256),
})
moe_cfg = config.setdefault("backbone", {}).setdefault("moe", {})
moe_cfg.setdefault("layers", [3, 7, 11, 15, 19, 23, 27])
moe_cfg.setdefault("moe_intermediate_size", config["intermediate_size"] // 4)
moe_cfg.setdefault("n_routed_experts", 8)
moe_cfg.setdefault("n_shared_experts", 1)
moe_cfg.setdefault("num_experts_per_tok", 2)
config.setdefault("looping", {}).update({
"enabled": True, "prelude": [0, 3], "loop": [4, 23], "coda": [24, 27],
"loop_range": [1, 3], "loop_default": 2,
})
config.setdefault("span_inference", {})["enabled"] = True
config.setdefault("grammar", {})["enabled"] = True
config.setdefault("entropy_valve", {})["enabled"] = True
config.setdefault("debt_ledger", {})["enabled"] = True
config.setdefault("multimodal", {})["enabled"] = False
use_mezo = (args.optimizer == "mezo")
use_bf16 = bool(args.bf16)
use_compile = bool(args.compile)
print("=" * 60)
print(f"CHIMERA 5.2 TRAINING — scale={args.scale}, "
f"optimizer={'MeZO' if use_mezo else 'AdamW'}, bf16={use_bf16}")
print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} "
f"vocab={config['vocab_size']} seq_len={args.seq_len} steps={args.max_steps}")
print(f"Threads: {torch.get_num_threads()} IPEX={HAS_IPEX}")
print("=" * 60)
model = Chimera51ForCausalLM(config)
counts = model.count_parameters()
print(f"Params: total={counts['total']:,} ternary={counts['ternary']:,}")
if args.grad_checkpoint and not use_mezo:
model.enable_gradient_checkpointing()
print("[OPT] Gradient checkpointing ON")
if HAS_IPEX and not use_mezo:
adamw = torch.optim.AdamW(model.parameters(), lr=args.lr)
model, adamw = ipex.optimize(
model, optimizer=adamw,
dtype=torch.bfloat16 if use_bf16 else torch.float32, level="O1")
print("[OPT] IPEX optimisation applied (level O1)")
else:
adamw = None
if use_compile:
print("[OPT] Compiling model with torch.compile (inductor)...")
model = torch.compile(model, backend="inductor", mode="default", dynamic=True)
dataset, tok = build_dataset(
args.seq_len, max_samples=args.max_samples, max_tokens=args.max_tokens,
split=args.dataset_split, dataset_name=args.dataset_name,
dataset_config=args.dataset_config, text_column=args.text_column,
category_filter=args.category_filter,
include_reasoning=args.include_reasoning,
)
loader = DataLoader(
dataset, batch_size=args.batch_size, shuffle=True,
num_workers=args.num_workers, drop_last=True,
persistent_workers=args.num_workers > 0,
prefetch_factor=2 if args.num_workers > 0 else None,
)
if use_mezo:
optimizer = MeZOOptimizer(
model, lr=args.lr * 0.01, eps=1e-3,
weight_decay=0.1, momentum=0.9, direction=args.mezo_direction,
)
else:
no_decay = {"A_log", "dt_bias", "norm", "bias", "embed", "energy_weights"}
decay_params, no_decay_params = [], []
for n, p in model.named_parameters():
if not p.requires_grad:
continue
if any(tag in n for tag in no_decay):
no_decay_params.append(p)
else:
decay_params.append(p)
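        # When IPEX already fused an AdamW above we must keep that instance;
        # otherwise build a fresh one with decay/no-decay parameter groups.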
if adamw is None:
optimizer = torch.optim.AdamW(
[{"params": decay_params, "weight_decay": 0.1},
{"params": no_decay_params, "weight_decay": 0.0}],
lr=args.lr, betas=(0.9, 0.95))
else:
optimizer = adamw
def compute_loss(batch) -> torch.Tensor:
ids = batch["input_ids"][:, :-1]
labels = batch["labels"][:, 1:]
if use_bf16:
with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
out = model(ids, labels=labels)
else:
out = model(ids, labels=labels)
return out.loss
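    # The same closure serves both optimisers: MeZO calls it twice per step
    # (forward-only, under no_grad) while the AdamW path calls it once and
    # backpropagates through the returned loss.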
os.makedirs(args.output_dir, exist_ok=True)
log_path = os.path.join(args.output_dir, "log.jsonl")
log_f = open(log_path, "w", encoding="utf-8")
model.train()
step = 0
cur_lr = args.lr
total_loss = 0.0
best_loss = float("inf")
toks = 0
t0 = time.time()
data_iter = iter(loader)
warmup = min(args.warmup, max(1, args.max_steps // 10))
if not use_mezo:
optimizer.zero_grad(set_to_none=True)
print(f"\n{'=' * 60}\nTraining starts\n{'=' * 60}\n")
while step < args.max_steps:
try:
batch = next(data_iter)
except StopIteration:
data_iter = iter(loader)
batch = next(data_iter)
if use_mezo:
cur_lr = cosine_lr(step, warmup, args.max_steps,
args.lr * 0.01, args.lr * 0.001)
optimizer.lr = cur_lr
loss_val = optimizer.step(compute_loss, batch)
total_loss += loss_val
else:
loss = compute_loss(batch)
(loss / args.grad_accum).backward()
total_loss += float(loss.item())
if (step + 1) % args.grad_accum == 0:
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
cur_lr = cosine_lr(step, warmup, args.max_steps,
args.lr, args.lr * 0.1)
for pg in optimizer.param_groups:
pg["lr"] = cur_lr
optimizer.step()
optimizer.zero_grad(set_to_none=True)
toks += batch["input_ids"][:, :-1].numel()
step += 1
if step % args.log_every == 0:
dt = time.time() - t0
avg = total_loss / args.log_every
ppl = math.exp(min(avg, 20))
tps = toks / dt if dt > 0 else 0
            eta_h = (args.max_steps - step) * dt / args.log_every / 3600 if dt > 0 else 0.0
log_f.write(json.dumps({
"step": step, "loss": round(avg, 4), "ppl": round(ppl, 2),
"lr": cur_lr, "tok/s": round(tps),
"optimizer": "mezo" if use_mezo else "adamw",
}) + "\n")
log_f.flush()
print(f" step {step:>6}/{args.max_steps} | loss {avg:.4f} | "
f"ppl {ppl:>8.2f} | lr {cur_lr:.2e} | "
f"{tps:.0f} tok/s | ETA {eta_h:.1f}h")
best_loss = min(best_loss, avg)
total_loss = 0.0
toks = 0
t0 = time.time()
if step % args.save_every == 0:
ckpt_dir = os.path.join(args.output_dir, f"ckpt-{step}")
os.makedirs(ckpt_dir, exist_ok=True)
raw = getattr(model, "_orig_mod", model)
torch.save({
"model": raw.state_dict(), "config": config,
"step": step, "optimizer": args.optimizer,
}, os.path.join(ckpt_dir, "ckpt.pt"))
print(f" [SAVE] {ckpt_dir}")
final_dir = os.path.join(args.output_dir, "final")
os.makedirs(final_dir, exist_ok=True)
raw = getattr(model, "_orig_mod", model)
torch.save({
"model": raw.state_dict(), "config": config,
"step": step, "best_loss": best_loss,
}, os.path.join(final_dir, "model.pt"))
with open(os.path.join(final_dir, "config.json"), "w", encoding="utf-8") as fh:
json.dump(config, fh, indent=2)
log_f.close()
print(f"\n{'=' * 60}")
print(f"DONE — best loss {best_loss:.4f}, ppl {math.exp(min(best_loss, 20)):.2f}")
print(f"Saved to {final_dir}")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _build_argparser() -> argparse.ArgumentParser:
p = argparse.ArgumentParser(description="Chimera 5.2 CPU-first training")
p.add_argument("--config", default="config.json")
p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"])
p.add_argument("--seq_len", type=int, default=256)
p.add_argument("--optimizer", default="mezo", choices=["mezo", "adamw"])
p.add_argument("--batch_size", type=int, default=2)
p.add_argument("--grad_accum", type=int, default=8)
p.add_argument("--lr", type=float, default=1e-3)
p.add_argument("--warmup", type=int, default=200)
p.add_argument("--max_steps", type=int, default=5000)
p.add_argument("--max_samples", type=int, default=None)
p.add_argument("--max_tokens", type=int, default=None)
p.add_argument("--bf16", action="store_true", default=True)
p.add_argument("--no-bf16", dest="bf16", action="store_false")
p.add_argument("--compile", action="store_true", default=False)
p.add_argument("--grad_checkpoint", action="store_true", default=True)
p.add_argument("--no-grad-checkpoint", dest="grad_checkpoint", action="store_false")
p.add_argument("--mezo_direction", choices=["rademacher", "gaussian"],
default="rademacher")
p.add_argument("--dataset_name", default="roneneldan/TinyStories")
p.add_argument("--dataset_config", default=None)
p.add_argument("--dataset_split", default="train")
p.add_argument("--text_column", default="auto")
p.add_argument("--category_filter", default=None)
p.add_argument("--include_reasoning", action="store_true", default=False)
p.add_argument("--num_workers", type=int, default=2)
p.add_argument("--log_every", type=int, default=10)
p.add_argument("--save_every", type=int, default=1000)
p.add_argument("--output_dir", default="./chimera_output")
return p
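# Example invocation (hypothetical category names; every flag is defined above):
#   python train.py --scale tiny --max_tokens 2000000 \
#       --category_filter "math,code" --include_reasoning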
if __name__ == "__main__":
args = _build_argparser().parse_args()
train(args)