| |
| """ |
| Chimera 5.3 β HYPER CPU Training v4.2 |
| ======================================= |
| |
| All ch1mera features active: 28 layers (GD/XM/TM/SK pattern), MoE, |
| Parcae looping (locked to 1 for 300-step runs), SelfEvolution, |
| SpanInference, Grammar, EntropyValve, DebtLedger. |
| |
| Training paradigms: |
| P1 GrowLength Curriculum β seq 16β32β64β128, 4-stage front-loaded |
| P2 Reservoir Freezing β freeze recurrent gates as random ternary |
| P5 STE + Muon β BitNet-paper training with NS-orthogonalized momentum |
| P6 Aggressive Token Packing β zero padding waste (implicit in GrowLengthDataset) |
| P10 Progressive Looping β locked to loops=1 for 300-step throughput |
| P11 NaN-safe training β skip + recover on gradient explosion |
| P15 Token Triage β focus on top-50% informative tokens |
| P16 Plateau Breaker β adaptive LR burst (patience=60) |
| P18 Grokfast-EMA β amplify slow grads (alpha=0.95, lambda=1.5) |
| |
| v4.2 β Memory-safe batch sizing for vocab=200073 on 32 GB RAM. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import os |
|
|
| def _setup_cpu(): |
| n = os.cpu_count() or 4 |
| os.environ.setdefault("OMP_NUM_THREADS", str(n)) |
| os.environ.setdefault("MKL_NUM_THREADS", str(n)) |
| os.environ.setdefault("KMP_AFFINITY", "granularity=fine,compact,1,0") |
| os.environ.setdefault("KMP_BLOCKTIME", "0") |
| return n |
|
|
| _NCPU = _setup_cpu() |
|
|
| import torch |
|
|
| from chimera.paths import DEFAULT_CONFIG_PATH |
| from chimera.training import ( |
| GrowLengthDataset, |
| GrowLengthScheduler, |
| ProgressiveUnfreezer, |
| apply_reservoir_freezing, |
| benchmark_hyper, |
| build_model_from_args, |
| build_token_buffer, |
| patch_training_loops, |
| train_hyper_loop, |
| ) |
|
|
# Mirror the env-var thread count (set in _setup_cpu) into torch's intra-op pool.
torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))
try:
    # Inter-op pool: about a quarter of the cores, at least 1.  torch raises
    # RuntimeError if this is called after parallel work has already started;
    # in that case we keep torch's default.
    torch.set_num_interop_threads(max(1, _NCPU // 4))
except RuntimeError:
    pass
|
|
# Optional Intel Extension for PyTorch: best-effort probe, flag only.
_HAS_IPEX = False
try:
    import intel_extension_for_pytorch as ipex  # noqa: F401 -- imported for its side effects / availability check
    _HAS_IPEX = True
except Exception:
    # IPEX is optional; any import failure simply leaves the flag False.
    pass
|
|
|
|
def build_model(args):
    """Thin wrapper over :func:`build_model_from_args`; kept for CLI symmetry.

    Returns whatever the underlying builder returns (a ``(model, config)``
    pair, per the caller in ``train_hyper``).
    """
    built = build_model_from_args(args)
    return built
|
|
|
|
def train_hyper(args):
    """Build the model, enable the requested training paradigms, run the loop.

    Paradigms driven by *args*: P10 looping lock (always on), P2 reservoir
    freezing (``--reservoir``), P7 progressive unfreezing
    (``--progressive-unfreeze``), P1 GrowLength curriculum (``--growlength``).
    Delegates the actual optimization to :func:`train_hyper_loop`.
    """
    model, config = build_model(args)
    counts = model.count_parameters()

    print("=" * 65)
    # NOTE(review): banner previously contained a mojibake 'β' where an
    # em dash was intended; restored here.
    print(f"CHIMERA 5.3 HYPER v4.2 — scale={args.scale} bf16={args.bf16}")
    print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} "
          f"vocab={config['vocab_size']} target_seq={args.seq_len}")
    print(f"Threads: {torch.get_num_threads()} IPEX={_HAS_IPEX}")
    print(f"Params: total={counts['total']:,} ternary={counts['ternary']:,}")
    print(f"ALL features ON: looping={model.looping_enabled} "
          f"evolution={model.evolution is not None} "
          f"span={model.span_engine is not None}")
    print("=" * 65)

    # P10: lock Parcae looping to a single pass -- at a 300-step budget,
    # raw throughput beats iterative refinement.
    patch_training_loops(model, num_loops=1)
    # (was an f-string with no placeholders -- ruff F541)
    print("[P10] Looping locked to 1 (throughput > refinement at 300 steps)")

    # P2: freeze recurrent gate parameters as a random ternary reservoir.
    if args.reservoir:
        frozen = apply_reservoir_freezing(model)
        print(f"[P2] Reservoir: froze {frozen:,} gate params")

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}")

    # P7: optionally unfreeze layers in stages across the run.
    unfreezer = None
    if args.progressive_unfreeze:
        unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages)
        active = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"[P7] Progressive unfreeze: {active:,} initially trainable")
    else:
        # (was an f-string with no placeholders -- ruff F541)
        print("[P7] Progressive unfreeze: OFF (all layers train from start)")

    # P1: GrowLength curriculum -- short sequences first, front-loaded stages
    # of (seq_len, fraction_of_steps).
    if args.growlength:
        stages = [
            (16, 0.10),
            (32, 0.15),
            (64, 0.25),
            (args.seq_len, 0.50),
        ]
        grow = GrowLengthScheduler(stages, args.max_steps)
        initial_seq = stages[0][0]
        # separator was mojibake 'β'; the docstring's "16→32→64→128" shows
        # it was an arrow
        print(f"[P1] GrowLength 4-stage: {' → '.join(str(s) for s, _ in stages)}")
    else:
        grow = None
        initial_seq = args.seq_len

    # Token budget: explicit --max_tokens wins; otherwise size for the whole
    # run (steps * batch * (seq+1)) with 4x headroom, floored at 500k tokens.
    tok_budget = args.max_tokens or max(
        500_000, args.max_steps * args.batch_size * (args.seq_len + 1) * 4)
    token_buf = build_token_buffer(
        args.dataset_name, args.dataset_split, args.text_column,
        tok_budget, args.cache_dir)
    dataset = GrowLengthDataset(token_buf, initial_seq)
    print(f"[DATA] {token_buf.numel():,} tokens seq={initial_seq}")

    train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
|
|
|
|
def cli():
    """Construct (but do not run) the argument parser for the HYPER trainer."""
    parser = argparse.ArgumentParser(description="Chimera 5.3 HYPER v4.2")

    # -- model / run shape ----------------------------------------------
    parser.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
    parser.add_argument("--scale", default="tiny",
                        choices=["tiny", "small", "medium", "full"])
    parser.add_argument("--seq_len", type=int, default=128)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--lr", type=float, default=1.2e-2)
    parser.add_argument("--warmup", type=int, default=30)
    parser.add_argument("--max_steps", type=int, default=300)
    parser.add_argument("--max_tokens", type=int, default=None)
    parser.add_argument("--max_samples", type=int, default=None)
    # bf16 defaults ON; --no-bf16 writes False to the same destination.
    parser.add_argument("--bf16", action="store_true", default=True)
    parser.add_argument("--no-bf16", dest="bf16", action="store_false")
    parser.add_argument("--compile", action="store_true", default=False)

    # -- data -----------------------------------------------------------
    parser.add_argument("--dataset_name", default="roneneldan/TinyStories")
    parser.add_argument("--dataset_split", default="train")
    parser.add_argument("--text_column", default="auto")
    parser.add_argument("--cache_dir", default="./cache")

    # -- logging / checkpointing ----------------------------------------
    parser.add_argument("--log_every", type=int, default=10)
    parser.add_argument("--save_every", type=int, default=100)
    parser.add_argument("--output_dir", default="./chimera_hyper_output")

    # -- training paradigms ---------------------------------------------
    paradigms = parser.add_argument_group("paradigms")
    paradigms.add_argument("--all", action="store_true", default=False)
    paradigms.add_argument("--growlength", action="store_true", default=False)
    paradigms.add_argument("--reservoir", action="store_true", default=False)
    paradigms.add_argument("--mezo-eps", type=float, default=1e-3,
                           dest="mezo_eps")
    paradigms.add_argument("--progressive-unfreeze", action="store_true",
                           default=False, dest="progressive_unfreeze")
    paradigms.add_argument("--unfreeze-stages", type=int, default=4,
                           dest="unfreeze_stages")
    parser.add_argument("--benchmark", action="store_true", default=False)
    return parser
|
|
|
|
| if __name__ == "__main__": |
| args = cli().parse_args() |
| if args.max_samples and not args.max_tokens: |
| args.max_tokens = args.max_samples * (args.seq_len + 1) |
| if args.all: |
| args.growlength = True |
| args.reservoir = True |
| if args.benchmark: |
| args.growlength = True |
| args.reservoir = True |
| benchmark_hyper(args) |
| else: |
| train_hyper(args) |
|
|