"""
Chimera 5.1 β€” Training Script (CPU-Optimized)
==================================================
Optimizations implemented:
1. MeZO (Memory-Efficient Zeroth-Order) optimizer β€” eliminates backward pass entirely
- 2Γ— forward only, no activation storage, no gradient computation
- arxiv:2305.17333
2. BFloat16 autocast on CPU β€” 2-4Γ— faster matmuls on AVX-512/AMX hardware
3. torch.compile with Inductor backend β€” fused ops, reduced Python overhead
4. Gradient checkpointing (for AdamW mode) β€” trades compute for memory
5. Optimal CPU threading β€” KMP_AFFINITY, OMP tuning, NUMA-aware
6. Persistent DataLoader workers β€” no worker restart overhead
7. Intel IPEX integration (optional) β€” auto-detected
8. Cosine LR with warmup
9. Standard AdamW with backprop as fallback mode
10. Generic dataset loading β€” supports any HF dataset, messages/text columns, category filtering
Usage:
# MeZO mode (recommended for CPU β€” no backward pass):
python train.py --optimizer mezo --scale tiny --seq_len 64 --max_steps 100
# AdamW mode (standard backprop with gradient checkpointing + bf16):
python train.py --optimizer adamw --scale tiny --seq_len 64 --max_steps 100
# Full run with custom dataset and category filter:
python train.py --optimizer mezo --scale tiny --seq_len 64 --max_steps 10000 \
--dataset_name Roman1111111/claude-sonnet-4.6-120000x \
--dataset_split train --text_column messages \
--category_filter "C++,organic chemistry"
"""
import os
import sys
import json
import time
import math
import argparse
# ─── CPU Threading Setup (MUST be before torch import) ───
def _setup_cpu_threading():
"""Configure optimal CPU threading for training."""
n_cpus = os.cpu_count() or 4
# Use all physical cores for compute
os.environ.setdefault('OMP_NUM_THREADS', str(n_cpus))
os.environ.setdefault('MKL_NUM_THREADS', str(n_cpus))
# Compact thread affinity: pack threads on adjacent cores
os.environ.setdefault('KMP_AFFINITY', 'granularity=fine,compact,1,0')
# Short blocktime: allow threads to sleep quickly (reduces power, same perf)
os.environ.setdefault('KMP_BLOCKTIME', '1')
# jemalloc background thread for faster allocation
os.environ.setdefault('MALLOC_CONF', 'background_thread:true,metadata_thp:auto')
_setup_cpu_threading()
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from chimera import Chimera51ForCausalLM
from chimera.quantization import BitLinear
# Configure PyTorch threading
torch.set_num_threads(int(os.environ.get('OMP_NUM_THREADS', os.cpu_count() or 4)))
try:
torch.set_num_interop_threads(int(os.environ.get('CHIMERA_INTEROP_THREADS', '1')))
except RuntimeError:
pass
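# Thread settings can be overridden from the environment, since setdefault()
# and os.environ.get() above respect pre-set values, e.g.:
#     OMP_NUM_THREADS=8 CHIMERA_INTEROP_THREADS=2 python train.py --optimizer mezo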
# ─── Optional: Intel Extension for PyTorch ───
HAS_IPEX = False
try:
import intel_extension_for_pytorch as ipex
HAS_IPEX = True
print("[IPEX] Intel Extension for PyTorch detected β€” will use optimized kernels")
except ImportError:
pass
# ─────────────────────────────────────────────────
# MeZO Optimizer - Ternary-Aware (arXiv:2305.17333)
# ─────────────────────────────────────────────────
class MeZOOptimizer:
"""Ternary-Aware Memory-Efficient Zeroth-Order Optimizer.
Eliminates the backward pass entirely:
- 2 forward passes per step (ΞΈ+Ξ΅z and ΞΈ-Ξ΅z)
- Memory = model size only (no activations, no gradients, no optimizer states)
- Gradient estimated via finite differences
TERNARY OPTIMIZATION: For BitLinear layers, perturbation and update
skip zero-weight positions (~33% of weights), saving ~33% of the
perturbation and update compute. Uses C++ kernel when available.
"""
def __init__(self, model, lr=1e-4, eps=1e-3, weight_decay=0.0,
momentum=0.0, direction="rademacher", cache_directions=True):
self.model = model
self.lr = lr
self.eps = eps
self.wd = weight_decay
self.momentum = momentum
self.direction = direction
self.cache_directions = cache_directions
# Collect trainable parameters once and deduplicate tied weights. The
# embedding and tied lm_head can share storage; updating both silently
# doubles the effective LR and wastes CPU.
self._bitlinear_params = []
self._other_params = []
found_params = set()
def add_other(name, param):
if param.requires_grad and id(param) not in found_params:
self._other_params.append((name, param))
found_params.add(id(param))
for name, module in model.named_modules():
if isinstance(module, BitLinear):
self._bitlinear_params.append((name, module))
for p in module.parameters(recurse=False):
found_params.add(id(p))
elif isinstance(module, (nn.Linear, nn.Embedding)):
for pn, p in module.named_parameters(recurse=False):
add_other(f"{name}.{pn}", p)
# Also collect params not in any submodule we found.
for name, p in model.named_parameters():
add_other(name, p)
self._mezo_masks = {}
self._direction_cache = {}
# Momentum buffer
if momentum > 0:
self._momentum_buffer = {}
for n, p in model.named_parameters():
if p.requires_grad:
self._momentum_buffer[n] = torch.zeros_like(p.data)
def _sample_direction(self, p: torch.Tensor, seed: int) -> torch.Tensor:
        gen = torch.Generator(device=p.device)
gen.manual_seed(int(seed) & 0x7FFFFFFFFFFFFFFF)
if self.direction == "gaussian":
return torch.randn(p.shape, dtype=p.dtype, device=p.device, generator=gen)
        # Rademacher ±1 is a valid ZO direction, much cheaper to sample than
        # Gaussian on CPU and avoids transcendental RNG work.
z = torch.empty(p.shape, dtype=p.dtype, device=p.device)
z.bernoulli_(0.5, generator=gen).mul_(2).sub_(1)
return z
def _direction_for(self, name: str, p: torch.Tensor, seed: int, mask=None) -> torch.Tensor:
if self.cache_directions and name in self._direction_cache:
return self._direction_cache[name]
z = self._sample_direction(p, seed)
if mask is not None:
z.mul_(mask.to(device=p.device, dtype=z.dtype))
if self.cache_directions:
self._direction_cache[name] = z
return z
def _perturb_params(self, seed: int, scale: float):
"""Ternary-aware perturbation with cached deterministic directions."""
sub_seed = seed
for name, module in self._bitlinear_params:
mask = self._mezo_masks.get(name)
if mask is None:
mask = module.ternary_nonzero_mask()
z = self._direction_for(f"{name}.weight", module.weight.data, sub_seed, mask=mask)
module.weight.data.add_(z, alpha=scale)
module.invalidate_packed()
sub_seed += 1000003
for i, (name, p) in enumerate(self._other_params):
z = self._direction_for(name, p.data, seed + 500000007 + i * 1000003)
p.data.add_(z, alpha=scale)
def _update_params(self, seed: int, projected_grad: float):
"""Ternary-aware parameter update using the same cached directions."""
sub_seed = seed
for name, module in self._bitlinear_params:
z = self._direction_for(f"{name}.weight", module.weight.data, sub_seed,
mask=self._mezo_masks.get(name))
if self.momentum > 0 and f"{name}.weight" in self._momentum_buffer:
buf = self._momentum_buffer[f"{name}.weight"]
buf.mul_(self.momentum).add_(z, alpha=projected_grad)
module.weight.data.add_(buf, alpha=-self.lr)
else:
module.weight.data.add_(z, alpha=-self.lr * projected_grad)
if self.wd > 0:
module.weight.data.mul_(1 - self.lr * self.wd)
module.invalidate_packed()
sub_seed += 1000003
for i, (name, p) in enumerate(self._other_params):
z = self._direction_for(name, p.data, seed + 500000007 + i * 1000003)
if self.momentum > 0 and name in self._momentum_buffer:
buf = self._momentum_buffer[name]
buf.mul_(self.momentum).add_(z, alpha=projected_grad)
p.data.add_(buf, alpha=-self.lr)
else:
p.data.add_(z, alpha=-self.lr * projected_grad)
if self.wd > 0:
p.data.mul_(1 - self.lr * self.wd)
@torch.no_grad()
def step(self, loss_fn, batch) -> float:
"""Single MeZO step: 2 forward passes, no backward.
Returns: loss estimate (average of pos/neg)
"""
seed = torch.randint(0, 2**31, (1,)).item()
        # Snapshot sparse masks once from θ. The same mask and direction are reused
        # for +eps, -eps, reset and update, reducing MeZO RNG from 4× model-size
        # samples/step to 1× while preserving the finite-difference direction.
self._mezo_masks = {name: module.ternary_nonzero_mask().detach()
for name, module in self._bitlinear_params}
self._direction_cache = {}
        # Forward at θ + εz
self._perturb_params(seed, self.eps)
loss_pos = loss_fn(batch).item()
        # Forward at θ - εz (net: θ + εz - 2εz = θ - εz)
self._perturb_params(seed, -2 * self.eps)
loss_neg = loss_fn(batch).item()
        # Reset to θ (net: θ - εz + εz = θ)
self._perturb_params(seed, self.eps)
# Projected gradient
projected_grad = (loss_pos - loss_neg) / (2 * self.eps)
# Update parameters (sparse for BitLinear, dense for others)
self._update_params(seed, projected_grad)
# Invalidate packed caches (weights changed)
for _, module in self._bitlinear_params:
module.invalidate_packed()
self._mezo_masks = {}
self._direction_cache = {}
return (loss_pos + loss_neg) / 2
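# Minimal usage sketch (mirrors compute_loss() and the MeZO branch in train()
# below; `model` and `batch` are assumed to exist):
#     opt = MeZOOptimizer(model, lr=1e-5, eps=1e-3, momentum=0.9)
#     loss_fn = lambda b: model(b["input_ids"][:, :-1], labels=b["labels"][:, 1:])[0]
#     loss_estimate = opt.step(loss_fn, batch)  # two forward passes, no backward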
# ─────────────────────────────────────────────────
# Dataset
# ─────────────────────────────────────────────────
class TokenDataset(Dataset):
def __init__(self, chunks: torch.Tensor):
self.chunks = chunks
def __len__(self) -> int:
return len(self.chunks)
def __getitem__(self, idx: int) -> dict:
return {"input_ids": self.chunks[idx], "labels": self.chunks[idx]}
def _matches_category_filter(ex: dict, filters: list) -> bool:
"""Check if example matches any of the requested category substrings."""
cat = ex.get("category", "")
if not cat:
return False
cat_lower = cat.lower()
return any(f.lower() in cat_lower for f in filters)
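# Example: filters=["C++", "organic chemistry"] matches an example whose
# category is "Advanced C++ Templates" (case-insensitive substring match).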
def _format_example(ex: dict, tok, text_column: str = "auto", include_reasoning: bool = False) -> str:
"""Convert an example dict to a single text string for tokenization."""
# Auto-detect text column
if text_column == "auto":
if "messages" in ex:
text_column = "messages"
elif "text" in ex:
text_column = "text"
elif "content" in ex:
text_column = "content"
elif "conversation" in ex:
text_column = "conversation"
else:
text_column = None
if text_column == "messages" and "messages" in ex:
msgs = ex["messages"]
# Inject reasoning into assistant messages if requested
if include_reasoning and isinstance(msgs, list):
msgs = []
for m in ex["messages"]:
if isinstance(m, dict) and m.get("role") == "assistant" and "reasoning" in m:
content = f"<|thinking|>\n{m['reasoning']}\n<|/thinking|>\n{m.get('content', '')}"
msgs.append({"role": "assistant", "content": content})
else:
msgs.append(m)
return tok.apply_chat_template(msgs)
if text_column and text_column in ex:
val = ex[text_column]
if isinstance(val, str):
return val
# Some datasets store conversation as list of dicts even in 'text' col
if isinstance(val, list) and len(val) > 0 and isinstance(val[0], dict):
return tok.apply_chat_template(val)
return str(val)
# Fallback: stringify the whole example
return str(ex)
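# Example of the reasoning injection above: with include_reasoning=True, an
# assistant message {"role": "assistant", "content": "42", "reasoning": "6*7"}
# becomes content "<|thinking|>\n6*7\n<|/thinking|>\n42" before
# tok.apply_chat_template() renders the conversation.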
def build_dataset(seq_len: int, max_samples=None, max_tokens=None, split: str = "train",
dataset_name: str = "roneneldan/TinyStories",
dataset_config: str = None,
text_column: str = "auto",
category_filter: str = None,
include_reasoning: bool = False):
"""Build dataset from any HuggingFace dataset with splintr tokenizer.
Supports:
- Generic text columns ('text', 'content', etc.)
- Messages/chat format (auto-detected, uses apply_chat_template)
- Category filtering (comma-separated substrings)
- Streaming for huge datasets
- Pre-allocated token buffer to avoid OOM on billion-token datasets
"""
from datasets import load_dataset
from chimera import ChimeraTokenizer
print(f"[DATA] Loading {dataset_name} ({split})...")
load_kwargs = {"split": split, "streaming": True}
if dataset_config:
load_kwargs["name"] = dataset_config
ds = load_dataset(dataset_name, **load_kwargs)
print(f"[DATA] Loading tokenizer (splintr o200k_base)...")
tok = ChimeraTokenizer(pretrained="o200k_base")
# Parse category filters
cat_filters = None
if category_filter:
cat_filters = [c.strip() for c in category_filter.split(",") if c.strip()]
print(f"[DATA] Filtering categories: {cat_filters}")
# Determine token budget
if max_tokens is not None:
token_budget = max_tokens
elif max_samples is not None:
token_budget = max_samples * (seq_len + 1)
else:
token_budget = None
processed = 0
skipped = 0
if token_budget is not None and token_budget > 0:
        # Pre-allocated flat buffer - avoids Python list overhead (~28 bytes/token)
buffer = torch.empty(token_budget, dtype=torch.long)
buf_idx = 0
for i, ex in enumerate(ds):
if cat_filters and not _matches_category_filter(ex, cat_filters):
skipped += 1
continue
text = _format_example(ex, tok, text_column, include_reasoning)
if not text or not text.strip():
skipped += 1
continue
ids = tok.encode(text, add_special_tokens=False)
ids.append(tok.eos_token_id)
n_ids = len(ids)
# Truncate if we would exceed the buffer
if buf_idx + n_ids > token_budget:
n_ids = token_budget - buf_idx
if n_ids <= 0:
break
ids = ids[:n_ids]
if n_ids > 0:
buffer[buf_idx:buf_idx + n_ids] = torch.tensor(ids, dtype=torch.long)
buf_idx += n_ids
processed += 1
if buf_idx >= token_budget:
break
            if processed % 10000 == 0:
print(f" {processed:,} examples, {buf_idx:,} tokens...")
all_ids = buffer[:buf_idx]
else:
# Fallback: old list approach for unbounded collection
all_ids = []
target = max_samples * (seq_len + 1) if max_samples else float('inf')
for i, ex in enumerate(ds):
if cat_filters and not _matches_category_filter(ex, cat_filters):
skipped += 1
continue
text = _format_example(ex, tok, text_column, include_reasoning)
if not text or not text.strip():
skipped += 1
continue
all_ids.extend(tok.encode(text, add_special_tokens=False))
all_ids.append(tok.eos_token_id)
processed += 1
if len(all_ids) >= target:
break
            if processed % 10000 == 0:
print(f" {processed:,} examples, {len(all_ids):,} tokens...")
all_ids = torch.tensor(all_ids, dtype=torch.long)
print(f"[DATA] Processed {processed:,} examples, skipped {skipped:,} (category/text mismatch)")
if len(all_ids) == 0:
raise ValueError(
f"No data matched filters. dataset={dataset_name}, "
f"category_filter={category_filter}, text_column={text_column}"
)
n = len(all_ids) // (seq_len + 1)
if max_samples:
n = min(n, max_samples)
chunks = all_ids[:n * (seq_len + 1)].view(n, seq_len + 1)
print(f"[DATA] {n:,} chunks Γ— {seq_len} tokens = {n * seq_len:,} total")
return TokenDataset(chunks), tok
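# Note on chunk layout: each chunk stores seq_len + 1 tokens; compute_loss()
# in train() consumes it shifted by one (input_ids = chunk[:, :-1],
# labels = chunk[:, 1:]), so every position predicts its successor token.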
# ─────────────────────────────────────────────────
# LR Schedule
# ─────────────────────────────────────────────────
def cosine_lr(step: int, warmup: int, total: int,
max_lr: float, min_lr: float) -> float:
if step < warmup:
return max_lr * (step + 1) / warmup
if step >= total:
return min_lr
p = (step - warmup) / (total - warmup)
return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * p))
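# Worked example with the AdamW-mode defaults below (warmup=200, total=5000,
# max_lr=1e-3, min_lr=1e-4):
#     step 0    -> 1e-3 * 1/200 = 5e-6                         (warmup start)
#     step 199  -> 1e-3                                         (warmup done)
#     step 2600 -> 1e-4 + 0.5*(9e-4)*(1 + cos(π/2)) = 5.5e-4    (cosine midpoint)
#     step 5000 -> 1e-4                                         (floor)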
# ─────────────────────────────────────────────────
# Main Training Loop
# ─────────────────────────────────────────────────
def train(args):
with open(args.config) as f:
config = json.load(f)
# ─── Scale overrides ───
if args.scale == "tiny":
config['hidden_size'] = 256
config['intermediate_size'] = 512
config['num_hidden_layers'] = 28
config['num_heads'] = 4
config['head_dim'] = 48
elif args.scale == "small":
config['hidden_size'] = 512
config['intermediate_size'] = 1024
config['num_hidden_layers'] = 28
config['num_heads'] = 8
config['head_dim'] = 48
elif args.scale == "medium":
config['hidden_size'] = 1024
config['intermediate_size'] = 2048
config['num_hidden_layers'] = 28
config['num_heads'] = 8
config['head_dim'] = 96
config['vocab_size'] = 200073
config.setdefault('gated_deltanet', {})['chunk_size'] = min(args.seq_len, 64)
config.setdefault('xlstm', {})['memory_size_per_head'] = [config['head_dim'], config['head_dim']]
config.setdefault('titans', {}).update({
'memory_depth': 2, 'persistent_memory_slots': 16,
'local_window_size': min(args.seq_len, 256)
})
moe_cfg = config.setdefault('backbone', {}).setdefault('moe', {})
moe_cfg.update({
'layers': [3, 7, 11, 15, 19, 23, 27],
'moe_intermediate_size': config['intermediate_size'] // 4,
'n_routed_experts': 8, 'n_shared_experts': 1, 'num_experts_per_tok': 2
})
config.setdefault('looping', {}).update({
'enabled': True, 'prelude': [0, 3], 'loop': [4, 23], 'coda': [24, 27],
'loop_range': [1, 3], 'loop_default': 2, 'adaptive_exit_threshold': 0.01
})
config.setdefault('span_inference', {})['enabled'] = True
config.setdefault('grammar', {})['enabled'] = True
config.setdefault('entropy_valve', {})['enabled'] = True
config.setdefault('debt_ledger', {}).update({
'enabled': True, 'obligations': ['close_bracket', 'close_string'],
'max_outstanding': 32, 'pressure_weight': 0.3
})
config.setdefault('self_evolution', {}).update({
'tier1': {
'ttt': {'enabled': True, 'target_layers': [13, 23], 'inner_lr': 0.0003,
'momentum': 0.9, 'chunk_size': 256, 'reset_decay': 0.95},
'memory_growth': {'enabled': True, 'pool_size_fixed': True}
},
'tier2': {
'meta_guidelines': {'enabled': True, 'max': 64},
'episodic_cases': {'enabled': True, 'max_cases': 256, 'case_bytes': 512},
'self_feedback': {'enabled': True, 'confidence_threshold': 0.6,
'max_refinement_rounds': 1}
},
'tier3': {'loop_depth_learning': {'enabled': True}},
'safety': {'freeze_threshold': 0.05},
})
config.setdefault('semantic_memory', {}).update({
'vector_bits': 1024, 'capacity': 1000, 'pool_size_fixed': True
})
config.setdefault('multimodal', {})['enabled'] = False
# ─── Print configuration ───
use_mezo = args.optimizer == 'mezo'
use_bf16 = args.bf16 and torch.cpu.is_available()
use_compile = args.compile
print("=" * 60)
print("CHIMERA 5.1 TRAINING β€” CPU-OPTIMIZED")
print("=" * 60)
print(f"Scale: {args.scale} (h={config['hidden_size']})")
print(f"Layers: {config['num_hidden_layers']}")
print(f"Seq len: {args.seq_len}")
print(f"Steps: {args.max_steps}")
print(f"Optimizer: {'MeZO (no backward)' if use_mezo else 'AdamW (backprop)'}")
print(f"BFloat16: {use_bf16}")
print(f"torch.compile:{use_compile}")
print(f"Grad ckpt: {args.grad_checkpoint and not use_mezo}")
print(f"Device: CPU ({torch.get_num_threads()} threads)")
print(f"IPEX: {HAS_IPEX}")
print(f"Tokenizer: splintr o200k_base ({config['vocab_size']} tokens)")
print(f"Dataset: {args.dataset_name} / {args.dataset_split}")
if args.dataset_config:
print(f"Dataset config: {args.dataset_config}")
if args.category_filter:
print(f"Category filter: {args.category_filter}")
if args.include_reasoning:
print("Reasoning: INCLUDED (<|thinking|> ... <|/thinking|>)")
# ─── Build model ───
model = Chimera51ForCausalLM(config)
p = model.count_parameters()
print(f"Params: {p['total']:,} (ternary: {p['ternary']:,})")
    if use_mezo:
        mem_mb = p['total'] * 4 * 2 / 1024 ** 2  # fp32 params + cached perturbation directions (2× model)
        print(f"Memory: ~{mem_mb:.0f} MB (MeZO: 2× model only)")
    else:
        mem_mb = p['total'] * 16 / 1024 ** 2  # 4 B params + 4 B grads + 8 B Adam moments (fp32)
        print(f"Memory: ~{mem_mb:.0f} MB (AdamW: params + grads + states)")
# ─── Gradient checkpointing (AdamW mode only) ───
if args.grad_checkpoint and not use_mezo:
model.enable_gradient_checkpointing()
print("[OPT] Gradient checkpointing enabled")
# ─── IPEX optimization ───
if HAS_IPEX and not use_mezo:
optimizer_for_ipex = torch.optim.AdamW(model.parameters(), lr=args.lr)
model, optimizer_for_ipex = ipex.optimize(
model, optimizer=optimizer_for_ipex,
dtype=torch.bfloat16 if use_bf16 else torch.float32,
level='O1'
)
print("[OPT] IPEX optimization applied (level O1)")
# ─── torch.compile ───
if use_compile:
print("[OPT] Compiling model with torch.compile (inductor)...")
model = torch.compile(model, backend="inductor", mode="default",
dynamic=True)
print("[OPT] Compilation deferred (will compile on first forward pass)")
# ─── Dataset ───
dataset, tok = build_dataset(
args.seq_len,
max_samples=args.max_samples,
max_tokens=args.max_tokens,
split=args.dataset_split,
dataset_name=args.dataset_name,
dataset_config=args.dataset_config,
text_column=args.text_column,
category_filter=args.category_filter,
include_reasoning=args.include_reasoning,
)
loader = DataLoader(
dataset,
batch_size=args.batch_size,
shuffle=True,
num_workers=args.num_workers,
drop_last=True,
persistent_workers=args.num_workers > 0, # Keep workers alive between epochs
prefetch_factor=2 if args.num_workers > 0 else None,
)
# ─── Optimizer ───
if use_mezo:
optimizer = MeZOOptimizer(
model,
lr=args.lr * 0.01, # MeZO needs much smaller LR
eps=1e-3,
weight_decay=0.1,
momentum=0.9,
direction=args.mezo_direction,
cache_directions=args.mezo_direction_cache,
)
else:
no_decay = {"A_log", "dt_bias", "norm", "bias", "embed", "energy_weights"}
param_groups = [
{"params": [p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay) and p.requires_grad],
"weight_decay": 0.1},
{"params": [p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay) and p.requires_grad],
"weight_decay": 0.0},
]
        if HAS_IPEX:
            # Created during ipex.optimize above; note this path applies uniform
            # weight decay rather than the no_decay grouping built above.
            optimizer = optimizer_for_ipex
else:
optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
# ─── Loss function (shared) ───
def compute_loss(batch):
ids = batch["input_ids"][:, :-1]
labels = batch["labels"][:, 1:]
if use_bf16:
with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
loss, _ = model(ids, labels=labels)
else:
loss, _ = model(ids, labels=labels)
return loss
# ─── Training loop ───
os.makedirs(args.output_dir, exist_ok=True)
log_f = open(os.path.join(args.output_dir, "log.jsonl"), "w")
model.train()
    step = 0
    total_loss = 0.0
    best = float('inf')
    lr = args.lr  # defined up front so the first log entry is valid before any optimizer update
    t0 = time.time()
    toks = 0
data_iter = iter(loader)
warmup = min(args.warmup, args.max_steps // 10)
if not use_mezo:
optimizer.zero_grad()
print(f"\n{'=' * 60}")
print(f"Starting training...")
print(f"{'=' * 60}\n")
while step < args.max_steps:
# Get batch
try:
batch = next(data_iter)
except StopIteration:
data_iter = iter(loader)
batch = next(data_iter)
# ─── MeZO step (no backward) ───
if use_mezo:
# Update LR
lr = cosine_lr(step, warmup, args.max_steps,
args.lr * 0.01, args.lr * 0.001)
optimizer.lr = lr
loss_val = optimizer.step(compute_loss, batch)
total_loss += loss_val
toks += batch["input_ids"][:, :-1].numel()
# ─── AdamW step (standard backprop) ───
else:
loss = compute_loss(batch)
(loss / args.grad_accum).backward()
total_loss += loss.item()
toks += batch["input_ids"][:, :-1].numel()
if (step + 1) % args.grad_accum == 0:
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
lr = cosine_lr(step, warmup, args.max_steps, args.lr, args.lr * 0.1)
for pg in optimizer.param_groups:
pg['lr'] = lr
optimizer.step()
optimizer.zero_grad()
step += 1
# ─── Logging ───
if step % args.log_every == 0:
dt = time.time() - t0
avg = total_loss / args.log_every
ppl = math.exp(min(avg, 20))
tps = toks / dt if dt > 0 else 0
            # t0/toks reset every interval, so derive the step rate from the
            # last log_every steps rather than the cumulative step count.
            steps_per_sec = args.log_every / dt if dt > 0 else 0
            eta = (args.max_steps - step) / steps_per_sec / 3600 if steps_per_sec > 0 else 0
entry = {
"step": step, "loss": round(avg, 4), "ppl": round(ppl, 2),
"lr": round(lr, 8), "tok/s": round(tps), "eta_h": round(eta, 1),
"optimizer": "mezo" if use_mezo else "adamw",
}
print(f" step {step:>6}/{args.max_steps} | loss {avg:.4f} | "
f"ppl {ppl:>8.2f} | {tps:.0f} tok/s | ETA {eta:.1f}h")
log_f.write(json.dumps(entry) + "\n")
log_f.flush()
if avg < best:
best = avg
total_loss = 0.0
toks = 0
t0 = time.time()
# ─── Checkpoint ───
if step % args.save_every == 0:
path = os.path.join(args.output_dir, f"ckpt-{step}")
os.makedirs(path, exist_ok=True)
# Save raw model (unwrap compile if needed)
raw_model = model._orig_mod if hasattr(model, '_orig_mod') else model
torch.save({
"model": raw_model.state_dict(),
"config": config,
"step": step,
"optimizer": args.optimizer,
}, os.path.join(path, "ckpt.pt"))
print(f" [SAVE] {path}")
# ─── Final save ───
path = os.path.join(args.output_dir, "final")
os.makedirs(path, exist_ok=True)
raw_model = model._orig_mod if hasattr(model, '_orig_mod') else model
torch.save({
"model": raw_model.state_dict(),
"config": config,
"step": step,
"best_loss": best,
}, os.path.join(path, "model.pt"))
    with open(os.path.join(path, "config.json"), "w") as f:
        json.dump(config, f, indent=2)
print(f"\n{'=' * 60}")
print(f"DONE β€” Best loss: {best:.4f}, PPL: {math.exp(min(best, 20)):.2f}")
print(f"Optimizer: {'MeZO (no backward)' if use_mezo else 'AdamW'}")
print(f"Saved: {path}")
log_f.close()
if __name__ == "__main__":
p = argparse.ArgumentParser(description="Chimera 5.1 CPU-Optimized Training")
# Model
p.add_argument("--config", default="config.json")
p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"])
p.add_argument("--seq_len", type=int, default=256)
# Training
p.add_argument("--optimizer", default="mezo", choices=["mezo", "adamw"],
help="mezo: no backward pass (CPU-optimal). adamw: standard backprop.")
p.add_argument("--batch_size", type=int, default=2)
p.add_argument("--grad_accum", type=int, default=8)
p.add_argument("--lr", type=float, default=1e-3)
p.add_argument("--warmup", type=int, default=200)
p.add_argument("--max_steps", type=int, default=5000)
p.add_argument("--max_samples", type=int, default=None,
help="Maximum number of chunks to generate")
p.add_argument("--max_tokens", type=int, default=None,
help="Maximum total tokens to collect (pre-allocated buffer, prevents OOM on huge datasets)")
# CPU Optimizations
p.add_argument("--bf16", action="store_true", default=True,
help="Enable BFloat16 autocast on CPU (default: True)")
p.add_argument("--no-bf16", dest="bf16", action="store_false")
p.add_argument("--compile", action="store_true", default=False,
help="Enable torch.compile with Inductor backend")
p.add_argument("--grad_checkpoint", action="store_true", default=True,
help="Enable gradient checkpointing (AdamW mode only)")
p.add_argument("--no-grad-checkpoint", dest="grad_checkpoint", action="store_false")
p.add_argument("--mezo_direction", choices=["rademacher", "gaussian"],
default="rademacher",
help="ZO perturbation distribution; rademacher is fastest on CPU")
p.add_argument("--no-mezo-direction-cache", dest="mezo_direction_cache",
action="store_false", default=True,
help="Regenerate directions instead of caching them for the step")
# Data β€” fully configurable
p.add_argument("--dataset_name", default="roneneldan/TinyStories",
help="HuggingFace dataset name (e.g. Roman1111111/claude-sonnet-4.6-120000x)")
p.add_argument("--dataset_config", default=None,
help="Dataset config/subset name")
p.add_argument("--dataset_split", default="train",
help="Dataset split to use")
p.add_argument("--text_column", default="auto",
help="Column containing text. 'auto' detects 'messages'/'text'/'content'/'conversation'")
p.add_argument("--category_filter", default=None,
help="Comma-separated category substrings to filter on (e.g. 'C++,python,math')")
p.add_argument("--include_reasoning", action="store_true", default=False,
help="Include reasoning/thinking content from assistant messages as <|thinking|>...<|/thinking|>")
# Logging / Output
p.add_argument("--num_workers", type=int, default=4)
p.add_argument("--log_every", type=int, default=10)
p.add_argument("--save_every", type=int, default=1000)
p.add_argument("--output_dir", default="./chimera_output")
train(p.parse_args())