#!/usr/bin/env python3
"""
Chimera 5.3 - HYPER CPU Training v4.2
=======================================
All Chimera features active: 28 layers (GD/XM/TM/SK pattern), MoE,
Parcae looping (locked to 1 for 300-step runs), SelfEvolution,
SpanInference, Grammar, EntropyValve, DebtLedger.
Training paradigms:
P1 GrowLength Curriculum - seq 16→32→64→128, 4-stage front-loaded
P2 Reservoir Freezing - freeze recurrent gates as random ternary
P5 STE + Muon - BitNet-paper training with NS-orthogonalized momentum
P6 Aggressive Token Packing - zero padding waste (implicit in GrowLengthDataset)
P10 Progressive Looping - locked to loops=1 for 300-step throughput
P11 NaN-safe training - skip + recover on gradient explosion
P15 Token Triage - focus on top-50% informative tokens
P16 Plateau Breaker - adaptive LR burst (patience=60)
P18 Grokfast-EMA - amplify slow grads (alpha=0.95, lambda=1.5)
v4.2 - Memory-safe batch sizing for vocab=200073 on 32 GB RAM.
"""
from __future__ import annotations
import argparse
import os
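# Thread-pool environment variables are set before torch is imported so the
# OpenMP/MKL runtimes pick them up: OMP_NUM_THREADS / MKL_NUM_THREADS size the
# pools to every available core, KMP_AFFINITY pins threads to cores (Intel
# OpenMP), and KMP_BLOCKTIME=0 stops workers from spin-waiting between ops.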
def _setup_cpu():
n = os.cpu_count() or 4
os.environ.setdefault("OMP_NUM_THREADS", str(n))
os.environ.setdefault("MKL_NUM_THREADS", str(n))
os.environ.setdefault("KMP_AFFINITY", "granularity=fine,compact,1,0")
os.environ.setdefault("KMP_BLOCKTIME", "0")
return n
_NCPU = _setup_cpu()
import torch
from chimera.paths import DEFAULT_CONFIG_PATH
from chimera.training import (
GrowLengthDataset,
GrowLengthScheduler,
ProgressiveUnfreezer,
apply_reservoir_freezing,
benchmark_hyper,
build_model_from_args,
build_token_buffer,
patch_training_loops,
train_hyper_loop,
)
torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))
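# Inter-op parallelism gets a smaller pool (about a quarter of the cores).
# set_num_interop_threads raises RuntimeError if the inter-op pool has already
# been initialized, so that case is silently ignored.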
try:
torch.set_num_interop_threads(max(1, _NCPU // 4))
except RuntimeError:
pass
_HAS_IPEX = False
try:
import intel_extension_for_pytorch as ipex
_HAS_IPEX = True
except Exception:
pass
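# intel_extension_for_pytorch (IPEX) is optional; only its availability is
# recorded here and reported in the training banner.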
def build_model(args):
return build_model_from_args(args)
def train_hyper(args):
model, config = build_model(args)
counts = model.count_parameters()
print("=" * 65)
print(f"CHIMERA 5.3 HYPER v4.2 β scale={args.scale} bf16={args.bf16}")
print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} "
f"vocab={config['vocab_size']} target_seq={args.seq_len}")
print(f"Threads: {torch.get_num_threads()} IPEX={_HAS_IPEX}")
print(f"Params: total={counts['total']:,} ternary={counts['ternary']:,}")
print(f"ALL features ON: looping={model.looping_enabled} "
f"evolution={model.evolution is not None} "
f"span={model.span_engine is not None}")
print("=" * 65)
    # ── Parcae: lock to 1 loop for throughput
patch_training_loops(model, num_loops=1)
print(f"[P10] Looping locked to 1 (throughput > refinement at 300 steps)")
    # ── P2: Reservoir Freezing
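    # Echo-state-style reservoir (P2): recurrent gate weights keep their random
    # ternary initialization and are excluded from gradient updates.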
if args.reservoir:
frozen = apply_reservoir_freezing(model)
print(f"[P2] Reservoir: froze {frozen:,} gate params")
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}")
    # ── P7: Progressive Unfreezing (OFF)
unfreezer = None
if args.progressive_unfreeze:
unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages)
active = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"[P7] Progressive unfreeze: {active:,} initially trainable")
else:
print(f"[P7] Progressive unfreeze: OFF (all layers train from start)")
    # ── P1: GrowLength - 4-stage front-loaded schedule ──
if args.growlength:
        # v4.2: batch sizes are base × (target_seq / stage_seq), then
# capped by _safe_batch() in loops.py to stay under 2 GB logits.
#
# With base batch=4, target_seq=128, vocab=200073:
        # Stage 1: seq=16,  desired=32 → logits=0.41GB ✓
        # Stage 2: seq=32,  desired=16 → logits=0.41GB ✓
        # Stage 3: seq=64,  desired=8  → logits=0.41GB ✓
        # Stage 4: seq=128, desired=4  → logits=0.41GB ✓
stages = [
(16, 0.10), # 30 steps at seq=16
(32, 0.15), # 45 steps at seq=32
(64, 0.25), # 75 steps at seq=64
(args.seq_len, 0.50), # 150 steps at seq=128
]
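        # Stage fractions sum to 1.0; the per-stage step counts in the comments
        # above assume the default max_steps=300.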
grow = GrowLengthScheduler(stages, args.max_steps)
initial_seq = stages[0][0]
print(f"[P1] GrowLength 4-stage: {' β '.join(str(s) for s, _ in stages)}")
else:
grow = None
initial_seq = args.seq_len
    # ── Data ──
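    # Token budget: max_steps batches of batch_size * (seq_len + 1) tokens,
    # with a 4x safety factor, floored at 500k tokens.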
tok_budget = args.max_tokens or max(500_000,
args.max_steps * args.batch_size * (args.seq_len + 1) * 4)
token_buf = build_token_buffer(
args.dataset_name, args.dataset_split, args.text_column,
tok_budget, args.cache_dir)
dataset = GrowLengthDataset(token_buf, initial_seq)
print(f"[DATA] {token_buf.numel():,} tokens seq={initial_seq}")
train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
def cli():
p = argparse.ArgumentParser(description="Chimera 5.3 HYPER v4.2")
p.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"])
p.add_argument("--seq_len", type=int, default=128)
p.add_argument("--batch_size", type=int, default=4) # base batch; GrowLength scales up
p.add_argument("--lr", type=float, default=1.2e-2)
p.add_argument("--warmup", type=int, default=30)
p.add_argument("--max_steps", type=int, default=300)
p.add_argument("--max_tokens", type=int, default=None)
p.add_argument("--max_samples", type=int, default=None)
p.add_argument("--bf16", action="store_true", default=True)
p.add_argument("--no-bf16", dest="bf16", action="store_false")
p.add_argument("--compile", action="store_true", default=False)
p.add_argument("--dataset_name", default="roneneldan/TinyStories")
p.add_argument("--dataset_split", default="train")
p.add_argument("--text_column", default="auto")
p.add_argument("--cache_dir", default="./cache")
p.add_argument("--log_every", type=int, default=10)
p.add_argument("--save_every", type=int, default=100)
p.add_argument("--output_dir", default="./chimera_hyper_output")
g = p.add_argument_group("paradigms")
g.add_argument("--all", action="store_true", default=False)
g.add_argument("--growlength", action="store_true", default=False)
g.add_argument("--reservoir", action="store_true", default=False)
g.add_argument("--mezo-eps", type=float, default=1e-3, dest="mezo_eps")
g.add_argument("--progressive-unfreeze", action="store_true", default=False,
dest="progressive_unfreeze")
g.add_argument("--unfreeze-stages", type=int, default=4, dest="unfreeze_stages")
p.add_argument("--benchmark", action="store_true", default=False)
return p
if __name__ == "__main__":
args = cli().parse_args()
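    # --max_samples is a convenience alias: convert it to a token budget of
    # (seq_len + 1) tokens per sample unless --max_tokens was given explicitly.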
if args.max_samples and not args.max_tokens:
args.max_tokens = args.max_samples * (args.seq_len + 1)
if args.all:
args.growlength = True
args.reservoir = True
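    # Benchmark mode always exercises the GrowLength + Reservoir configuration.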
if args.benchmark:
args.growlength = True
args.reservoir = True
benchmark_hyper(args)
else:
train_hyper(args)