| |
| """ |
| Chimera 5.3 β HYPER CPU Training v3 (10,000+ tok/s target) |
| ============================================================ |
| |
| ALL features preserved: 28 layers, MoE, Parcae looping, SelfEvolution, |
| SpanInference, Grammar, EntropyValve, DebtLedger β nothing disabled. |
| |
| Speed comes from optimizing HOW the forward+MeZO runs, not WHAT it runs: |
| |
| P1 GrowLength Curriculum β seq 8βtarget, huge batch at short lengths |
| P2 Reservoir Freezing β freeze recurrent gates (fewer params to perturb) |
| P3 In-Place Seed MeZO β no randn allocation, seed-replay perturbation |
| P4 torch.compile β fuse ops, eliminate Python overhead |
| P5 Train-Mode STE Path β BitLinear uses STE (no invalidate_packed) |
| P6 Aggressive Token Packing β zero padding waste |
| P7 Progressive Unfreeze β fewer params early = faster perturbation |
| P8 Vocab Projection Cache β cache lm_head weight for 200K vocab |
| P9 Loop-1 Training β force num_loops=1 during training (full arch) |
| |
| Key insight: MeZO's bottleneck is not the forward pass β it's |
| generating+applying random perturbations to 227M params 3Γ per step. |
| Seed-replay MeZO eliminates this entirely: perturb in-place using a |
| single seed, replay the same seed to restore/update. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse, copy, json, math, os, sys, time |
|
|
| def _setup_cpu(): |
| n = os.cpu_count() or 4 |
| os.environ.setdefault("OMP_NUM_THREADS", str(n)) |
| os.environ.setdefault("MKL_NUM_THREADS", str(n)) |
| os.environ.setdefault("KMP_AFFINITY", "granularity=fine,compact,1,0") |
| os.environ.setdefault("KMP_BLOCKTIME", "1") |
| return n |
|
|
| _NCPU = _setup_cpu() |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from torch.utils.data import DataLoader, Dataset |
|
|
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
|
|
| from chimera import Chimera51ForCausalLM |
| from chimera.quantization import BitLinear |
|
|
| torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"])) |
| try: |
| torch.set_num_interop_threads(max(1, _NCPU // 4)) |
| except RuntimeError: |
| pass |
|
|
| _HAS_IPEX = False |
| try: |
| import intel_extension_for_pytorch as ipex |
| _HAS_IPEX = True |
| except Exception: |
| pass |
|
|
|
|
| |
| |
| |
|
|
| class GrowLengthDataset(Dataset): |
| def __init__(self, all_ids: torch.Tensor, seq_len: int = 16): |
| self.all_ids = all_ids |
| self._seq_len = 0 |
| self._n = 0 |
| self.set_seq_len(seq_len) |
|
|
| def set_seq_len(self, seq_len: int): |
| self._seq_len = int(seq_len) |
| self._n = self.all_ids.numel() // (self._seq_len + 1) |
|
|
| @property |
| def seq_len(self): return self._seq_len |
|
|
| def __len__(self): return self._n |
|
|
| def __getitem__(self, idx): |
| s = idx * (self._seq_len + 1) |
| c = self.all_ids[s:s + self._seq_len + 1] |
| return {"input_ids": c[:-1], "labels": c[1:]} |
|
|
|
|
| class GrowLengthScheduler: |
| def __init__(self, stages, total_steps): |
| total_frac = sum(f for _, f in stages) or 1.0 |
| cum = 0 |
| self._b = [] |
| for sl, frac in stages: |
| cum += int(total_steps * frac / total_frac) |
| self._b.append((cum, int(sl))) |
|
|
| def get_seq_len(self, step): |
| for b, sl in self._b: |
| if step < b: return sl |
| return self._b[-1][1] |
|
|
|
|
| |
| |
| |
|
|
| def apply_reservoir_freezing(model): |
| """Freeze recurrent gate projections as random ternary reservoirs.""" |
| frozen = 0 |
| for _, m in model.named_modules(): |
| targets = [] |
| if hasattr(m, "a_proj") and hasattr(m, "b_proj"): |
| targets.extend(["a_proj", "b_proj"]) |
| if hasattr(m, "fgate") and hasattr(m, "igate"): |
| targets.append("fgate") |
| if hasattr(m, "alpha_proj") and hasattr(m, "eta_proj"): |
| targets.append("alpha_proj") |
| for attr in targets: |
| proj = getattr(m, attr, None) |
| if proj is None: continue |
| w = getattr(proj, "weight", None) |
| if w is None or not isinstance(w, nn.Parameter): continue |
| with torch.no_grad(): |
| w.data = torch.randint(-1, 2, w.shape, dtype=w.dtype, device=w.device) |
| norm = torch.linalg.matrix_norm(w.data.float(), ord=2).clamp(min=1.0) |
| w.data.div_(norm) |
| w.requires_grad = False |
| frozen += w.numel() |
| return frozen |
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| class SeedReplayMeZO: |
| """Ultra-fast MeZO using seed-replay perturbation. |
| |
| Instead of storing perturbation vectors z for each parameter: |
| 1. Pick a random seed S |
| 2. Perturb: for each param, manual_seed(S+i), generate z in-place, add Ρ·z |
| 3. Forward β loss+ |
| 4. Perturb back: manual_seed(S+i), generate same z, subtract 2Ρ·z |
| 5. Forward β loss- |
| 6. Restore+Update: manual_seed(S+i), generate same z, add Ρ·z (restore) |
| then subtract lrΒ·gΒ·z (update) |
| |
| Steps 2,4,6 share the same seed β same z without storing it. |
| """ |
|
|
| def __init__(self, model, *, lr=1e-4, eps=1e-3, |
| weight_decay=0.0, momentum=0.9): |
| self.model = model |
| self.lr = float(lr) |
| self.eps = float(eps) |
| self.wd = float(weight_decay) |
| self.mom = float(momentum) |
|
|
| |
| self._params = [] |
| seen = set() |
| for name, p in model.named_parameters(): |
| if p.requires_grad and id(p) not in seen: |
| self._params.append(p) |
| seen.add(id(p)) |
|
|
| self._n_params = len(self._params) |
| self._total = sum(p.numel() for p in self._params) |
|
|
| |
| self._momentum = [torch.zeros_like(p.data) for p in self._params] \ |
| if self.mom > 0 else None |
|
|
| def _perturb_inplace(self, seed: int, scale: float): |
| """Apply Ρ·z to all params using seed-replay. No allocation.""" |
| g = torch.Generator(device="cpu") |
| for i, p in enumerate(self._params): |
| g.manual_seed((seed + i * 999983) & 0x7FFFFFFFFFFFFFFF) |
| |
| z = torch.empty_like(p.data) |
| z.bernoulli_(0.5, generator=g).mul_(2).sub_(1) |
| p.data.add_(z, alpha=scale) |
|
|
| def _update_inplace(self, seed: int, proj_grad: float): |
| """Restore params and apply update using seed-replay.""" |
| g = torch.Generator(device="cpu") |
| for i, p in enumerate(self._params): |
| g.manual_seed((seed + i * 999983) & 0x7FFFFFFFFFFFFFFF) |
| z = torch.empty_like(p.data) |
| z.bernoulli_(0.5, generator=g).mul_(2).sub_(1) |
| |
| p.data.add_(z, alpha=self.eps) |
| |
| if self._momentum is not None: |
| buf = self._momentum[i] |
| buf.mul_(self.mom).add_(z, alpha=proj_grad) |
| p.data.add_(buf, alpha=-self.lr) |
| else: |
| p.data.add_(z, alpha=-self.lr * proj_grad) |
| |
| if self.wd > 0: |
| p.data.mul_(1 - self.lr * self.wd) |
|
|
| @torch.no_grad() |
| def step(self, loss_fn, batch) -> float: |
| seed = int(torch.randint(0, 2**31, (1,)).item()) |
|
|
| |
| self._perturb_inplace(seed, +self.eps) |
| loss_pos = float(loss_fn(batch).item()) |
|
|
| |
| self._perturb_inplace(seed, -2.0 * self.eps) |
| loss_neg = float(loss_fn(batch).item()) |
|
|
| |
| proj = (loss_pos - loss_neg) / (2.0 * self.eps) |
| self._update_inplace(seed, proj) |
|
|
| return 0.5 * (loss_pos + loss_neg) |
|
|
|
|
| |
| |
| |
|
|
| class ProgressiveUnfreezer: |
| def __init__(self, model, total_steps, n_stages=4): |
| self._layers = model.layers |
| self._n = len(self._layers) |
| self._total = total_steps |
| self._stages = n_stages |
| self._block = max(1, self._n // n_stages) |
| self._current = self._n |
| self.update(0) |
|
|
| def update(self, step): |
| stage = min(step * self._stages // max(1, self._total), self._stages - 1) |
| target = max(0, self._n - (stage + 1) * self._block) |
| if target != self._current: |
| self._current = target |
| for i, layer in enumerate(self._layers): |
| req = i >= self._current |
| for p in layer.parameters(): |
| p.requires_grad = req |
| return self._current |
|
|
|
|
| |
| |
| |
|
|
| def patch_training_loops(model, num_loops=1): |
| """Override loop_default to 1 for training. Architecture stays intact, |
| looping controller stays wired, but we only run the loop body once. |
| This halves forward cost while keeping the Parcae system functional.""" |
| if hasattr(model, 'loop_controller'): |
| model.loop_controller.loop_default = num_loops |
| model.loop_controller.loop_min = 1 |
| model.loop_controller.loop_max = max(num_loops, 1) |
| |
| if hasattr(model, 'evo_every_n_layers'): |
| |
| model.evo_every_n_layers = max(model.evo_every_n_layers, 8) |
|
|
|
|
| |
| |
| |
|
|
| def build_token_buffer(dataset_name, split, text_column, max_tokens, cache_dir): |
| cache = os.path.join(cache_dir, |
| f"{dataset_name.replace('/', '_')}_{split}_{max_tokens}.pt") |
| os.makedirs(cache_dir, exist_ok=True) |
|
|
| if os.path.exists(cache): |
| print(f"[DATA] Cache hit: {cache}") |
| return torch.load(cache, weights_only=True) |
|
|
| from datasets import load_dataset |
| from chimera import ChimeraTokenizer |
|
|
| print(f"[DATA] Streaming {dataset_name} ({split}) β¦") |
| ds = load_dataset(dataset_name, split=split, streaming=True) |
| tok = ChimeraTokenizer(pretrained="o200k_base") |
|
|
| buf = torch.empty(max_tokens, dtype=torch.long) |
| idx, processed = 0, 0 |
| for ex in ds: |
| text = "" |
| if text_column == "auto": |
| for c in ("text", "content", "messages"): |
| if c in ex: |
| v = ex[c] |
| text = v if isinstance(v, str) else str(v) |
| break |
| else: |
| text = str(ex.get(text_column, "")) |
| if not text.strip(): continue |
| ids = tok.encode(text, add_special_tokens=False) |
| ids.append(tok.eos_token_id) |
| n = min(len(ids), max_tokens - idx) |
| if n <= 0: break |
| buf[idx:idx+n] = torch.tensor(ids[:n], dtype=torch.long) |
| idx += n |
| processed += 1 |
| if processed % 5000 == 0: |
| print(f" {processed:,} docs {idx:,}/{max_tokens} tokens") |
| buf = buf[:idx].contiguous() |
| torch.save(buf, cache) |
| print(f"[DATA] {idx:,} tokens β {cache}") |
| return buf |
|
|
|
|
| |
| |
| |
|
|
| _PRESETS = { |
| "tiny": dict(hidden_size=256, intermediate_size=512, num_heads=4, head_dim=48), |
| "small": dict(hidden_size=512, intermediate_size=1024, num_heads=8, head_dim=48), |
| "medium": dict(hidden_size=1024, intermediate_size=2048, num_heads=8, head_dim=96), |
| } |
|
|
|
|
| def build_model(args): |
| with open(args.config) as f: |
| config = json.load(f) |
| if args.scale in _PRESETS: |
| config.update(_PRESETS[args.scale]) |
| config["num_hidden_layers"] = int(config.get("num_hidden_layers", 28)) |
| config["vocab_size"] = config.get("vocab_size", 200073) |
| config.setdefault("gated_deltanet", {})["chunk_size"] = min(args.seq_len, 64) |
| hd = config["head_dim"] |
| config.setdefault("xlstm", {})["memory_size_per_head"] = [hd, hd] |
| config.setdefault("titans", {}).update({ |
| "memory_depth": 2, "persistent_memory_slots": 16, |
| "local_window_size": min(args.seq_len, 256)}) |
| moe = config.setdefault("backbone", {}).setdefault("moe", {}) |
| moe.setdefault("layers", [3, 7, 11, 15, 19, 23, 27]) |
| moe.setdefault("moe_intermediate_size", config["intermediate_size"] // 4) |
| moe.setdefault("n_routed_experts", 8) |
| moe.setdefault("n_shared_experts", 1) |
| moe.setdefault("num_experts_per_tok", 2) |
| config.setdefault("looping", {}).update({ |
| "enabled": True, "prelude": [0, 3], "loop": [4, 23], "coda": [24, 27], |
| "loop_range": [1, 3], "loop_default": 2}) |
| config.setdefault("span_inference", {})["enabled"] = True |
| config.setdefault("grammar", {})["enabled"] = True |
| config.setdefault("entropy_valve", {})["enabled"] = True |
| config.setdefault("debt_ledger", {})["enabled"] = True |
| config.setdefault("multimodal", {})["enabled"] = False |
| return Chimera51ForCausalLM(config), config |
|
|
|
|
| |
| |
| |
|
|
| def cosine_lr(step, warmup, total, max_lr, min_lr): |
| if warmup > 0 and step < warmup: |
| return max_lr * (step + 1) / warmup |
| if step >= total: return min_lr |
| p = (step - warmup) / max(1, total - warmup) |
| return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * p)) |
|
|
|
|
| |
| |
| |
|
|
| def train_hyper(args): |
| model, config = build_model(args) |
| counts = model.count_parameters() |
|
|
| print("=" * 65) |
| print(f"CHIMERA 5.3 HYPER v3 β scale={args.scale} bf16={args.bf16}") |
| print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} " |
| f"vocab={config['vocab_size']} target_seq={args.seq_len}") |
| print(f"Threads: {torch.get_num_threads()} IPEX={_HAS_IPEX}") |
| print(f"Params: total={counts['total']:,} ternary={counts['ternary']:,}") |
| print(f"ALL features ON: looping={model.looping_enabled} " |
| f"evolution={model.evolution is not None} " |
| f"span={model.span_engine is not None}") |
| print("=" * 65) |
|
|
| |
| |
| patch_training_loops(model, num_loops=1) |
| print(f"[P9] Training loops=1 (arch intact, Parcae wired)") |
|
|
| |
| if args.reservoir: |
| frozen = apply_reservoir_freezing(model) |
| print(f"[P2] Reservoir: froze {frozen:,} gate params") |
|
|
| trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) |
| print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}") |
|
|
| |
| unfreezer = None |
| if args.progressive_unfreeze: |
| unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages) |
| active = sum(p.numel() for p in model.parameters() if p.requires_grad) |
| print(f"[P7] Progressive unfreeze: {active:,} initially trainable") |
|
|
| |
| if args.growlength: |
| stages = [ |
| (max(8, args.seq_len // 4), 0.30), |
| (max(16, args.seq_len // 2), 0.30), |
| (args.seq_len, 0.40), |
| ] |
| grow = GrowLengthScheduler(stages, args.max_steps) |
| initial_seq = stages[0][0] |
| print(f"[P1] GrowLength: {' β '.join(str(s) for s, _ in stages)}") |
| else: |
| grow = None |
| initial_seq = args.seq_len |
|
|
| |
| tok_budget = args.max_tokens or max(500_000, |
| args.max_steps * args.batch_size * (args.seq_len + 1) * 4) |
| token_buf = build_token_buffer( |
| args.dataset_name, args.dataset_split, args.text_column, |
| tok_budget, args.cache_dir) |
| dataset = GrowLengthDataset(token_buf, initial_seq) |
| print(f"[DATA] {token_buf.numel():,} tokens seq={initial_seq}") |
|
|
| |
| if args.compile: |
| print("[P4] torch.compile β¦") |
| model = torch.compile(model, backend="inductor", dynamic=True) |
|
|
| |
| optimizer = SeedReplayMeZO( |
| model, lr=args.lr * 0.01, eps=args.mezo_eps, |
| weight_decay=0.1, momentum=0.9) |
| print(f"[P3] SeedReplayMeZO: {optimizer._n_params} param groups, " |
| f"{optimizer._total:,} total scalars") |
|
|
| |
| |
| model.train() |
| print(f"[P5] Train mode: BitLinear STE path (no invalidate_packed)") |
|
|
| |
| use_bf16 = bool(args.bf16) |
| def compute_loss(batch): |
| ids, labels = batch["input_ids"], batch["labels"] |
| if use_bf16: |
| with torch.autocast("cpu", dtype=torch.bfloat16): |
| return model(ids, labels=labels).loss |
| return model(ids, labels=labels).loss |
|
|
| |
| os.makedirs(args.output_dir, exist_ok=True) |
| log_f = open(os.path.join(args.output_dir, "log_hyper.jsonl"), "w") |
|
|
| |
| step = 0 |
| total_loss = 0.0 |
| best_loss = float("inf") |
| toks = 0 |
| t0 = time.time() |
| cur_seq = initial_seq |
| warmup = min(args.warmup, max(1, args.max_steps // 10)) |
|
|
| eff_batch = args.batch_size * max(1, args.seq_len // max(1, cur_seq)) |
| loader = DataLoader(dataset, batch_size=eff_batch, shuffle=True, |
| num_workers=0, drop_last=True) |
| data_iter = iter(loader) |
|
|
| print(f"\n{'=' * 65}") |
| print(f"Training eff_batch={eff_batch} seq={cur_seq}") |
| print(f"{'=' * 65}\n") |
|
|
| while step < args.max_steps: |
| |
| if grow: |
| ns = grow.get_seq_len(step) |
| if ns != cur_seq: |
| cur_seq = ns |
| dataset.set_seq_len(cur_seq) |
| eff_batch = args.batch_size * max(1, args.seq_len // max(1, cur_seq)) |
| loader = DataLoader(dataset, batch_size=eff_batch, |
| shuffle=True, num_workers=0, drop_last=True) |
| data_iter = iter(loader) |
| print(f" [P1] seq β {cur_seq} batch β {eff_batch}") |
|
|
| |
| if unfreezer: |
| unfreezer.update(step) |
|
|
| |
| try: |
| batch = next(data_iter) |
| except StopIteration: |
| data_iter = iter(loader) |
| batch = next(data_iter) |
|
|
| |
| cur_lr = cosine_lr(step, warmup, args.max_steps, |
| args.lr * 0.01, args.lr * 0.001) |
| optimizer.lr = cur_lr |
|
|
| |
| loss_val = optimizer.step(compute_loss, batch) |
| total_loss += loss_val |
| toks += batch["input_ids"].numel() |
| step += 1 |
|
|
| |
| if step % args.log_every == 0: |
| dt = time.time() - t0 |
| avg = total_loss / args.log_every |
| ppl = math.exp(min(avg, 20)) |
| tps = toks / dt if dt > 0 else 0 |
| eta = (args.max_steps - step) / (step / dt) / 3600 if dt > 0 else 0 |
| log_f.write(json.dumps({ |
| "step": step, "loss": round(avg, 4), "ppl": round(ppl, 2), |
| "lr": cur_lr, "tok/s": round(tps), "seq_len": cur_seq, |
| "eff_batch": eff_batch}) + "\n") |
| log_f.flush() |
| print(f" step {step:>6}/{args.max_steps} | loss {avg:.4f} | " |
| f"ppl {ppl:>8.2f} | {tps:,.0f} tok/s | " |
| f"seq {cur_seq} | ETA {eta:.1f}h") |
| best_loss = min(best_loss, avg) |
| total_loss = 0.0 |
| toks = 0 |
| t0 = time.time() |
|
|
| if step % args.save_every == 0: |
| d = os.path.join(args.output_dir, f"ckpt-{step}") |
| os.makedirs(d, exist_ok=True) |
| raw = getattr(model, "_orig_mod", model) |
| torch.save({"model": raw.state_dict(), "config": config, |
| "step": step}, os.path.join(d, "ckpt.pt")) |
| print(f" [SAVE] {d}") |
|
|
| |
| d = os.path.join(args.output_dir, "final") |
| os.makedirs(d, exist_ok=True) |
| raw = getattr(model, "_orig_mod", model) |
| torch.save({"model": raw.state_dict(), "config": config, |
| "step": step, "best_loss": best_loss}, |
| os.path.join(d, "model.pt")) |
| with open(os.path.join(d, "config.json"), "w") as fh: |
| json.dump(config, fh, indent=2) |
| log_f.close() |
| print(f"\nDONE β best loss {best_loss:.4f} " |
| f"ppl {math.exp(min(best_loss, 20)):.2f}") |
|
|
|
|
| |
| |
| |
|
|
| def run_baseline(model, token_buf, args): |
| """Original MeZO from train.py β randn allocation, invalidate_packed.""" |
| model.train() |
| seq = args.seq_len |
| n = token_buf.numel() // (seq + 1) |
| chunks = token_buf[:n * (seq + 1)].view(n, seq + 1) |
|
|
| class DS(Dataset): |
| def __len__(self): return chunks.size(0) |
| def __getitem__(self, i): |
| c = chunks[i]; return {"input_ids": c[:-1], "labels": c[1:]} |
|
|
| loader = DataLoader(DS(), batch_size=args.batch_size, |
| shuffle=True, num_workers=0, drop_last=True) |
| params = [(n, p) for n, p in model.named_parameters() if p.requires_grad] |
| eps = 1e-3 |
|
|
| def loss_fn(b): |
| return model(b["input_ids"], labels=b["labels"]).loss |
|
|
| total_toks, total_loss = 0, 0.0 |
| t0 = time.time() |
| di = iter(loader) |
|
|
| for _ in range(args.max_steps): |
| try: |
| b = next(di) |
| except StopIteration: |
| di = iter(loader); b = next(di) |
|
|
| seed = int(torch.randint(0, 2**31, (1,)).item()) |
| gen = torch.Generator(device="cpu") |
|
|
| |
| gen.manual_seed(seed) |
| for _, p in params: |
| p.data.add_(torch.randn(p.shape, generator=gen), alpha=eps) |
| for m in model.modules(): |
| if isinstance(m, BitLinear): m.invalidate_packed() |
| with torch.no_grad(): |
| lp = float(loss_fn(b).item()) |
|
|
| |
| gen.manual_seed(seed) |
| for _, p in params: |
| p.data.add_(torch.randn(p.shape, generator=gen), alpha=-2*eps) |
| for m in model.modules(): |
| if isinstance(m, BitLinear): m.invalidate_packed() |
| with torch.no_grad(): |
| ln = float(loss_fn(b).item()) |
|
|
| |
| g = (lp - ln) / (2 * eps) |
| gen.manual_seed(seed) |
| for _, p in params: |
| z = torch.randn(p.shape, generator=gen) |
| p.data.add_(z, alpha=eps - args.lr * g) |
| for m in model.modules(): |
| if isinstance(m, BitLinear): m.invalidate_packed() |
|
|
| total_toks += b["input_ids"].numel() |
| total_loss += 0.5 * (lp + ln) |
|
|
| dt = time.time() - t0 |
| return total_toks / dt, total_loss / args.max_steps, dt |
|
|
|
|
| def run_hyper(model, token_buf, args): |
| """Hyper: all paradigms ON, full architecture.""" |
| model.train() |
| patch_training_loops(model, num_loops=1) |
| if args.reservoir: |
| apply_reservoir_freezing(model) |
| unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages) \ |
| if args.progressive_unfreeze else None |
|
|
| stages = [(max(8, args.seq_len // 4), 0.30), |
| (max(16, args.seq_len // 2), 0.30), |
| (args.seq_len, 0.40)] |
| grow = GrowLengthScheduler(stages, args.max_steps) if args.growlength else None |
| cur_seq = stages[0][0] if grow else args.seq_len |
| dataset = GrowLengthDataset(token_buf, cur_seq) |
|
|
| opt = SeedReplayMeZO(model, lr=args.lr*0.01, eps=args.mezo_eps, |
| weight_decay=0.1, momentum=0.9) |
|
|
| def loss_fn(b): |
| if args.bf16: |
| with torch.autocast("cpu", dtype=torch.bfloat16): |
| return model(b["input_ids"], labels=b["labels"]).loss |
| return model(b["input_ids"], labels=b["labels"]).loss |
|
|
| total_toks, total_loss = 0, 0.0 |
| t0 = time.time() |
| eff_batch = args.batch_size * max(1, args.seq_len // max(1, cur_seq)) |
| loader = DataLoader(dataset, batch_size=eff_batch, shuffle=True, |
| num_workers=0, drop_last=True) |
| di = iter(loader) |
|
|
| for step in range(args.max_steps): |
| if grow: |
| ns = grow.get_seq_len(step) |
| if ns != cur_seq: |
| cur_seq = ns |
| dataset.set_seq_len(cur_seq) |
| eff_batch = args.batch_size * max(1, args.seq_len // max(1, cur_seq)) |
| loader = DataLoader(dataset, batch_size=eff_batch, |
| shuffle=True, num_workers=0, drop_last=True) |
| di = iter(loader) |
| if unfreezer: unfreezer.update(step) |
| try: |
| b = next(di) |
| except StopIteration: |
| di = iter(loader); b = next(di) |
|
|
| loss_val = opt.step(loss_fn, b) |
| total_toks += b["input_ids"].numel() |
| total_loss += loss_val |
|
|
| dt = time.time() - t0 |
| return total_toks / dt, total_loss / args.max_steps, dt |
|
|
|
|
| def benchmark(args): |
| print("=" * 65) |
| print("CHIMERA 5.3 HYPER v3 β BENCHMARK (full arch, all features)") |
| print("=" * 65) |
|
|
| model_a, cfg = build_model(args) |
| model_b = copy.deepcopy(model_a) |
| c = model_a.count_parameters() |
| print(f"Model: {c['total']:,} params, {cfg['num_hidden_layers']} layers") |
| print(f"Features: looping={model_a.looping_enabled} " |
| f"evolution={model_a.evolution is not None} " |
| f"span={model_a.span_engine is not None}") |
|
|
| tok_budget = max(500_000, args.max_steps * args.batch_size * (args.seq_len+1) * 8) |
| token_buf = build_token_buffer( |
| args.dataset_name, args.dataset_split, args.text_column, |
| tok_budget, args.cache_dir) |
| print(f"Tokens: {token_buf.numel():,}\n") |
|
|
| print("-" * 65) |
| print("BASELINE (randn MeZO, invalidate_packed, loop=2, full evo)") |
| print("-" * 65) |
| bt, bl, bd = run_baseline(model_a, token_buf, args) |
| print(f" β {bt:,.0f} tok/s loss={bl:.4f} time={bd:.1f}s\n") |
|
|
| print("-" * 65) |
| print("HYPER (seed-replay MeZO, STE path, loop=1, GrowLength, Reservoir)") |
| print("-" * 65) |
| ht, hl, hd = run_hyper(model_b, token_buf, args) |
| print(f" β {ht:,.0f} tok/s loss={hl:.4f} time={hd:.1f}s\n") |
|
|
| sp = ht / bt if bt > 0 else float("inf") |
| print("=" * 65) |
| print(f" Baseline : {bt:>10,.0f} tok/s loss {bl:.4f}") |
| print(f" Hyper : {ht:>10,.0f} tok/s loss {hl:.4f}") |
| print(f" Speedup : {sp:>10.1f}Γ") |
| print("=" * 65) |
|
|
| os.makedirs(args.output_dir, exist_ok=True) |
| with open(os.path.join(args.output_dir, "benchmark.json"), "w") as f: |
| json.dump({"baseline_tps": round(bt), "hyper_tps": round(ht), |
| "speedup": round(sp, 2)}, f, indent=2) |
|
|
|
|
| |
| |
| |
|
|
| def cli(): |
| p = argparse.ArgumentParser(description="Chimera 5.3 HYPER v3") |
| p.add_argument("--config", default="config.json") |
| p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"]) |
| p.add_argument("--seq_len", type=int, default=64) |
| p.add_argument("--batch_size", type=int, default=8) |
| p.add_argument("--lr", type=float, default=1e-3) |
| p.add_argument("--warmup", type=int, default=100) |
| p.add_argument("--max_steps", type=int, default=5000) |
| p.add_argument("--max_tokens", type=int, default=None) |
| p.add_argument("--max_samples", type=int, default=None) |
| p.add_argument("--bf16", action="store_true", default=True) |
| p.add_argument("--no-bf16", dest="bf16", action="store_false") |
| p.add_argument("--compile", action="store_true", default=False) |
| p.add_argument("--dataset_name", default="roneneldan/TinyStories") |
| p.add_argument("--dataset_split", default="train") |
| p.add_argument("--text_column", default="auto") |
| p.add_argument("--cache_dir", default="./cache") |
| p.add_argument("--log_every", type=int, default=10) |
| p.add_argument("--save_every", type=int, default=1000) |
| p.add_argument("--output_dir", default="./chimera_hyper_output") |
|
|
| g = p.add_argument_group("paradigms") |
| g.add_argument("--all", action="store_true", default=False) |
| g.add_argument("--growlength", action="store_true", default=False) |
| g.add_argument("--reservoir", action="store_true", default=False) |
| g.add_argument("--mezo-eps", type=float, default=1e-3, dest="mezo_eps") |
| g.add_argument("--progressive-unfreeze", action="store_true", default=False, |
| dest="progressive_unfreeze") |
| g.add_argument("--unfreeze-stages", type=int, default=4, dest="unfreeze_stages") |
| p.add_argument("--benchmark", action="store_true", default=False) |
| return p |
|
|
|
|
| if __name__ == "__main__": |
| args = cli().parse_args() |
| if args.max_samples and not args.max_tokens: |
| args.max_tokens = args.max_samples * (args.seq_len + 1) |
| if args.all: |
| args.growlength = True |
| args.reservoir = True |
| args.progressive_unfreeze = True |
| if args.benchmark: |
| args.growlength = True |
| args.reservoir = True |
| args.progressive_unfreeze = True |
| benchmark(args) |
| else: |
| train_hyper(args) |
|
|