#!/usr/bin/env python3 """ Chimera 5.3 — HYPER CPU Training v4 ===================================== All ch1mera features active: 28 layers (GD/XM/TM/SK pattern), MoE, Parcae looping (progressive 1→2→3), SelfEvolution, SpanInference, Grammar, EntropyValve, DebtLedger. Training paradigms: P1 GrowLength Curriculum — seq 8→target, huge batch at short seq P2 Reservoir Freezing — freeze recurrent gates as random ternary P5 STE + AdamW — BitNet-paper training (replaces MeZO) P6 Aggressive Token Packing — zero padding waste P10 Progressive Looping — Parcae loops 1→2→3 during training P11 NaN-safe training — skip + recover on gradient explosion """ from __future__ import annotations import argparse import os def _setup_cpu(): n = os.cpu_count() or 4 os.environ.setdefault("OMP_NUM_THREADS", str(n)) os.environ.setdefault("MKL_NUM_THREADS", str(n)) os.environ.setdefault("KMP_AFFINITY", "granularity=fine,compact,1,0") os.environ.setdefault("KMP_BLOCKTIME", "1") return n _NCPU = _setup_cpu() import torch from chimera.paths import DEFAULT_CONFIG_PATH from chimera.training import ( GrowLengthDataset, GrowLengthScheduler, ProgressiveUnfreezer, apply_reservoir_freezing, benchmark_hyper, build_model_from_args, build_token_buffer, patch_training_loops, train_hyper_loop, ) torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"])) try: torch.set_num_interop_threads(max(1, _NCPU // 4)) except RuntimeError: pass _HAS_IPEX = False try: import intel_extension_for_pytorch as ipex _HAS_IPEX = True except Exception: pass def build_model(args): return build_model_from_args(args) def train_hyper(args): model, config = build_model(args) counts = model.count_parameters() print("=" * 65) print(f"CHIMERA 5.3 HYPER v4 — scale={args.scale} bf16={args.bf16}") print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} " f"vocab={config['vocab_size']} target_seq={args.seq_len}") print(f"Threads: {torch.get_num_threads()} IPEX={_HAS_IPEX}") print(f"Params: total={counts['total']:,} ternary={counts['ternary']:,}") print(f"ALL features ON: looping={model.looping_enabled} " f"evolution={model.evolution is not None} " f"span={model.span_engine is not None}") print("=" * 65) # ── Parcae: start at 1 loop, progressive scheduler will increase to 2→3 patch_training_loops(model, num_loops=1) print(f"[P10] Progressive looping enabled (1→2→3)") # ── P2: Reservoir Freezing if args.reservoir: frozen = apply_reservoir_freezing(model) print(f"[P2] Reservoir: froze {frozen:,} gate params") trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}") # ── P7: Progressive Unfreezing (OFF by default — counterproductive with backprop) unfreezer = None if args.progressive_unfreeze: unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages) active = sum(p.numel() for p in model.parameters() if p.requires_grad) print(f"[P7] Progressive unfreeze: {active:,} initially trainable") else: print(f"[P7] Progressive unfreeze: OFF (all layers train from start)") # ── P1: GrowLength if args.growlength: stages = [ (max(8, args.seq_len // 4), 0.30), (max(16, args.seq_len // 2), 0.30), (args.seq_len, 0.40), ] grow = GrowLengthScheduler(stages, args.max_steps) initial_seq = stages[0][0] print(f"[P1] GrowLength: {' → '.join(str(s) for s, _ in stages)}") else: grow = None initial_seq = args.seq_len # ── Data tok_budget = args.max_tokens or max(500_000, args.max_steps * args.batch_size * (args.seq_len + 1) * 4) token_buf = build_token_buffer( args.dataset_name, args.dataset_split, args.text_column, tok_budget, args.cache_dir) dataset = GrowLengthDataset(token_buf, initial_seq) print(f"[DATA] {token_buf.numel():,} tokens seq={initial_seq}") train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer) def cli(): p = argparse.ArgumentParser(description="Chimera 5.3 HYPER v4") p.add_argument("--config", default=str(DEFAULT_CONFIG_PATH)) p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"]) p.add_argument("--seq_len", type=int, default=64) p.add_argument("--batch_size", type=int, default=8) p.add_argument("--lr", type=float, default=1.5e-3) # ← BitNet-interpolated default p.add_argument("--warmup", type=int, default=750) # ← BitNet paper-exact p.add_argument("--max_steps", type=int, default=5000) p.add_argument("--max_tokens", type=int, default=None) p.add_argument("--max_samples", type=int, default=None) p.add_argument("--bf16", action="store_true", default=True) p.add_argument("--no-bf16", dest="bf16", action="store_false") p.add_argument("--compile", action="store_true", default=False) p.add_argument("--dataset_name", default="roneneldan/TinyStories") p.add_argument("--dataset_split", default="train") p.add_argument("--text_column", default="auto") p.add_argument("--cache_dir", default="./cache") p.add_argument("--log_every", type=int, default=10) p.add_argument("--save_every", type=int, default=1000) p.add_argument("--output_dir", default="./chimera_hyper_output") g = p.add_argument_group("paradigms") g.add_argument("--all", action="store_true", default=False) g.add_argument("--growlength", action="store_true", default=False) g.add_argument("--reservoir", action="store_true", default=False) g.add_argument("--mezo-eps", type=float, default=1e-3, dest="mezo_eps") # Progressive unfreeze: OFF by default (counterproductive with backprop) g.add_argument("--progressive-unfreeze", action="store_true", default=False, dest="progressive_unfreeze") g.add_argument("--unfreeze-stages", type=int, default=4, dest="unfreeze_stages") p.add_argument("--benchmark", action="store_true", default=False) return p if __name__ == "__main__": args = cli().parse_args() if args.max_samples and not args.max_tokens: args.max_tokens = args.max_samples * (args.seq_len + 1) if args.all: args.growlength = True args.reservoir = True # NOTE: progressive_unfreeze deliberately NOT set by --all # It was designed for MeZO and is counterproductive with STE+AdamW if args.benchmark: args.growlength = True args.reservoir = True benchmark_hyper(args) else: train_hyper(args)