#!/usr/bin/env python3
"""
Chimera 5.3 — HYPER CPU Training v4.2
=======================================

All Chimera features active: 28 layers (GD/XM/TM/SK pattern), MoE,
Parcae looping (locked to 1 for 300-step runs), SelfEvolution,
SpanInference, Grammar, EntropyValve, DebtLedger.

Training paradigms:
    P1  GrowLength Curriculum    — seq 16→32→64→128, 4-stage front-loaded
    P2  Reservoir Freezing       — freeze recurrent gates as random ternary
    P5  STE + Muon               — BitNet-paper training with NS-orthogonalized momentum
    P6  Aggressive Token Packing — zero padding waste (implicit in GrowLengthDataset)
    P10 Progressive Looping      — locked to loops=1 for 300-step throughput
    P11 NaN-safe training        — skip + recover on gradient explosion
    P15 Token Triage             — focus on the top-50% most informative tokens
    P16 Plateau Breaker          — adaptive LR burst (patience=60)
    P18 Grokfast-EMA             — amplify slow gradients (alpha=0.95, lambda=1.5)

v4.2 — Memory-safe batch sizing for vocab=200073 on 32 GB RAM.
"""
from __future__ import annotations

import argparse
import os


def _setup_cpu():
    """Pin OpenMP/MKL thread settings to the core count before torch is imported."""
    n = os.cpu_count() or 4
    os.environ.setdefault("OMP_NUM_THREADS", str(n))
    os.environ.setdefault("MKL_NUM_THREADS", str(n))
    os.environ.setdefault("KMP_AFFINITY", "granularity=fine,compact,1,0")
    os.environ.setdefault("KMP_BLOCKTIME", "0")
    return n


_NCPU = _setup_cpu()

import torch

from chimera.paths import DEFAULT_CONFIG_PATH
from chimera.training import (
    GrowLengthDataset,
    GrowLengthScheduler,
    ProgressiveUnfreezer,
    apply_reservoir_freezing,
    benchmark_hyper,
    build_model_from_args,
    build_token_buffer,
    patch_training_loops,
    train_hyper_loop,
)

torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))
try:
    torch.set_num_interop_threads(max(1, _NCPU // 4))
except RuntimeError:
    # Interop thread count can only be set once per process.
    pass

_HAS_IPEX = False
try:
    import intel_extension_for_pytorch as ipex  # noqa: F401
    _HAS_IPEX = True
except Exception:
    pass


def build_model(args):
    return build_model_from_args(args)


def train_hyper(args):
    model, config = build_model(args)
    counts = model.count_parameters()

    print("=" * 65)
    print(f"CHIMERA 5.3 HYPER v4.2 — scale={args.scale} bf16={args.bf16}")
    print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} "
          f"vocab={config['vocab_size']} target_seq={args.seq_len}")
    print(f"Threads: {torch.get_num_threads()} IPEX={_HAS_IPEX}")
    print(f"Params: total={counts['total']:,} ternary={counts['ternary']:,}")
    print(f"ALL features ON: looping={model.looping_enabled} "
          f"evolution={model.evolution is not None} "
          f"span={model.span_engine is not None}")
    print("=" * 65)

    # ── Parcae: lock to 1 loop for throughput
    patch_training_loops(model, num_loops=1)
    print("[P10] Looping locked to 1 (throughput > refinement at 300 steps)")

    # ── P2: Reservoir Freezing
    if args.reservoir:
        frozen = apply_reservoir_freezing(model)
        print(f"[P2] Reservoir: froze {frozen:,} gate params")
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}")

    # ── P7: Progressive Unfreezing (OFF by default)
    unfreezer = None
    if args.progressive_unfreeze:
        unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages)
        active = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"[P7] Progressive unfreeze: {active:,} initially trainable")
    else:
        print("[P7] Progressive unfreeze: OFF (all layers train from start)")

    # ── P1: GrowLength — 4-stage front-loaded schedule ──
    if args.growlength:
        # v4.2: batch sizes are base × (target_seq / stage_seq), then
        # capped by _safe_batch() in loops.py to stay under 2 GB of logits.
        #
        # With base batch=4, target_seq=128, vocab=200073:
        #   Stage 1: seq=16,  desired=32 → logits=0.41GB ✓
        #   Stage 2: seq=32,  desired=16 → logits=0.41GB ✓
        #   Stage 3: seq=64,  desired=8  → logits=0.41GB ✓
        #   Stage 4: seq=128, desired=4  → logits=0.41GB ✓
        stages = [
            (16, 0.10),             # 30 steps at seq=16
            (32, 0.15),             # 45 steps at seq=32
            (64, 0.25),             # 75 steps at seq=64
            (args.seq_len, 0.50),   # 150 steps at seq=128
        ]
        grow = GrowLengthScheduler(stages, args.max_steps)
        initial_seq = stages[0][0]
        print(f"[P1] GrowLength 4-stage: {' → '.join(str(s) for s, _ in stages)}")
    else:
        grow = None
        initial_seq = args.seq_len

    # ── Data ──
    tok_budget = args.max_tokens or max(
        500_000, args.max_steps * args.batch_size * (args.seq_len + 1) * 4)
    token_buf = build_token_buffer(
        args.dataset_name, args.dataset_split, args.text_column,
        tok_budget, args.cache_dir)
    dataset = GrowLengthDataset(token_buf, initial_seq)
    print(f"[DATA] {token_buf.numel():,} tokens seq={initial_seq}")

    train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)


def cli():
    p = argparse.ArgumentParser(description="Chimera 5.3 HYPER v4.2")
    p.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
    p.add_argument("--scale", default="tiny",
                   choices=["tiny", "small", "medium", "full"])
    p.add_argument("--seq_len", type=int, default=128)
    p.add_argument("--batch_size", type=int, default=4)  # base batch; GrowLength scales up
    p.add_argument("--lr", type=float, default=1.2e-2)
    p.add_argument("--warmup", type=int, default=30)
    p.add_argument("--max_steps", type=int, default=300)
    p.add_argument("--max_tokens", type=int, default=None)
    p.add_argument("--max_samples", type=int, default=None)
    p.add_argument("--bf16", action="store_true", default=True)
    p.add_argument("--no-bf16", dest="bf16", action="store_false")
    p.add_argument("--compile", action="store_true", default=False)
    p.add_argument("--dataset_name", default="roneneldan/TinyStories")
    p.add_argument("--dataset_split", default="train")
    p.add_argument("--text_column", default="auto")
    p.add_argument("--cache_dir", default="./cache")
    p.add_argument("--log_every", type=int, default=10)
    p.add_argument("--save_every", type=int, default=100)
    p.add_argument("--output_dir", default="./chimera_hyper_output")

    g = p.add_argument_group("paradigms")
    g.add_argument("--all", action="store_true", default=False)
    g.add_argument("--growlength", action="store_true", default=False)
    g.add_argument("--reservoir", action="store_true", default=False)
    g.add_argument("--mezo-eps", type=float, default=1e-3, dest="mezo_eps")
    g.add_argument("--progressive-unfreeze", action="store_true", default=False,
                   dest="progressive_unfreeze")
    g.add_argument("--unfreeze-stages", type=int, default=4, dest="unfreeze_stages")

    p.add_argument("--benchmark", action="store_true", default=False)
    return p


if __name__ == "__main__":
    args = cli().parse_args()
    if args.max_samples and not args.max_tokens:
        args.max_tokens = args.max_samples * (args.seq_len + 1)
    if args.all:
        args.growlength = True
        args.reservoir = True
    if args.benchmark:
        # Benchmark runs with both curriculum features forced on.
        args.growlength = True
        args.reservoir = True
        benchmark_hyper(args)
    else:
        train_hyper(args)
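

# ──────────────────────────────────────────────────────────────────────
# Illustrative sketch only (never called from this script): the v4.2
# batch-sizing rule documented in train_hyper() above. The desired batch is
# base * (target_seq / stage_seq), then capped so the fp32 logits tensor stays
# under a memory budget. The authoritative version is _safe_batch() in
# chimera/training/loops.py; the 2 GB cap and fp32 logits here are the
# assumptions quoted in the comments above, and this function name is ours.
def _stage_batch_sketch(base_batch: int, target_seq: int, stage_seq: int,
                        vocab: int, logits_cap_gb: float = 2.0) -> int:
    desired = base_batch * (target_seq // stage_seq)       # e.g. 4 * (128 // 16) = 32
    bytes_per_sample = stage_seq * vocab * 4               # fp32 logits per sample
    cap = int(logits_cap_gb * 1024 ** 3) // bytes_per_sample
    return max(1, min(desired, cap))

# With base_batch=4, target_seq=128 and vocab=200073, every stage works out to
# 512 tokens per micro-batch (32x16, 16x32, 8x64, 4x128), i.e. ~0.41 GB of logits.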
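

# ──────────────────────────────────────────────────────────────────────
# Illustrative sketch only (never called from this script): the P18
# Grokfast-EMA filter named in the module docstring. The real filter lives
# inside chimera.training's optimizer step; this is only a minimal version of
# the published Grokfast-EMA update using the alpha/lambda values quoted above.
# `ema_grads` is a dict the caller keeps alive across optimizer steps.
def _grokfast_ema_sketch(model, ema_grads, alpha=0.95, lamb=1.5):
    for name, p in model.named_parameters():
        if p.grad is None:
            continue
        g = p.grad.detach()
        ema = ema_grads.get(name)
        # EMA of the gradient: mu_t = alpha * mu_{t-1} + (1 - alpha) * g_t
        ema = g.clone() if ema is None else alpha * ema + (1.0 - alpha) * g
        ema_grads[name] = ema
        # Amplify the slow component: g_t <- g_t + lambda * mu_t
        p.grad.add_(ema, alpha=lamb)
    return ema_grads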