fix: --all no longer enables progressive_unfreeze (counterproductive with backprop)

train_hyper.py (+26 -40)
@@ -1,27 +1,19 @@
 #!/usr/bin/env python3
 """
-Chimera 5.3 – HYPER CPU Training
-=====================================
+Chimera 5.3 – HYPER CPU Training v4
+=====================================
 
-
-
+All Chimera features active: 28 layers (GD/XM/TM/SK pattern), MoE,
+Parcae looping (progressive 1→2→3), SelfEvolution, SpanInference,
+Grammar, EntropyValve, DebtLedger.
 
-
-
-
-
-P3 In-Place Seed MeZO – no randn allocation, seed-replay perturbation
-P4 torch.compile – fuse ops, eliminate Python overhead
-P5 Train-Mode STE Path – BitLinear uses STE (no invalidate_packed)
+Training paradigms:
+P1 GrowLength Curriculum – seq 8→target, huge batch at short seq
+P2 Reservoir Freezing – freeze recurrent gates as random ternary
+P5 STE + AdamW – BitNet-paper training (replaces MeZO)
 P6 Aggressive Token Packing – zero padding waste
-
-
-P9 Loop-1 Training – force num_loops=1 during training (full arch)
-
-Key insight: MeZO's bottleneck is not the forward pass – it's
-generating+applying random perturbations to 227M params 3× per step.
-Seed-replay MeZO eliminates this entirely: perturb in-place using a
-single seed, replay the same seed to restore/update.
+P10 Progressive Looping – Parcae loops 1→2→3 during training
+P11 NaN-safe training – skip + recover on gradient explosion
 """
 
 from __future__ import annotations
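The deleted docstring is the clearest description of the seed-replay trick this commit retires, so it is worth pinning down. A minimal sketch of the pattern, assuming a plain CPU model and a hypothetical mezo_step helper (this is an illustration of the idea, not the code being deleted, which lives elsewhere in the repo):

import torch

def mezo_step(model, closure, eps=1e-3, lr=1e-6):
    # One seed drives every perturbation; replaying it regenerates the same
    # z for each parameter, so no z tensors are ever allocated or stored.
    seed = torch.seed()

    def replay(scale):
        gen = torch.Generator().manual_seed(seed)
        for p in model.parameters():
            if p.requires_grad:
                z = torch.randn(p.shape, generator=gen, dtype=p.dtype)
                p.data.add_(scale * z)

    replay(+eps)                              # theta + eps*z
    loss_plus = closure().item()
    replay(-2 * eps)                          # theta - eps*z
    loss_minus = closure().item()
    replay(+eps)                              # restore theta exactly
    g = (loss_plus - loss_minus) / (2 * eps)  # projected gradient estimate
    replay(-lr * g)                           # SPSA-style update: theta -= lr*g*z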
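The new P5 line replaces that machinery with straight-through-estimator training. As a rough illustration of what "BitLinear uses STE" means, here is absmean ternary quantization in the style of BitNet b1.58; the repo's actual BitLinear internals are not part of this diff:

import torch
import torch.nn as nn
import torch.nn.functional as F

def ternary_ste(w: torch.Tensor) -> torch.Tensor:
    # Forward: round to {-1, 0, +1} times an absmean scale.
    # Backward: (w_q - w).detach() carries no gradient, so grads flow
    # to the latent fp weight w unchanged (the straight-through estimator).
    scale = w.abs().mean().clamp(min=1e-5)
    w_q = (w / scale).round().clamp(-1, 1) * scale
    return w + (w_q - w).detach()

class BitLinearSTE(nn.Linear):
    def forward(self, x):
        return F.linear(x, ternary_ste(self.weight), self.bias)

The latent fp weights are then trained with an ordinary AdamW over model.parameters(), which is what makes a gradient-based paradigm like P7 behave differently than it did under MeZO.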
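P11 is likewise only named here; the loop itself lives in train_hyper_loop, outside this diff. One plausible reading of "skip + recover", shown as an assumption rather than the repo's actual logic:

import torch

def nan_safe_step(model, optimizer, loss) -> bool:
    # Skip the whole update if the loss exploded; otherwise clip, and only
    # step when the gradient norm is finite. Either way, clear grads and go on.
    if not torch.isfinite(loss):
        optimizer.zero_grad(set_to_none=True)
        return False
    loss.backward()
    gnorm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    stepped = bool(torch.isfinite(gnorm))
    if stepped:
        optimizer.step()
    optimizer.zero_grad(set_to_none=True)
    return stepped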
@@ -72,16 +64,12 @@ def build_model(args):
     return build_model_from_args(args)
 
 
-# ───────────────────────────────────────────────────────────────────────────
-# MAIN HYPER TRAIN
-# ───────────────────────────────────────────────────────────────────────────
-
 def train_hyper(args):
     model, config = build_model(args)
     counts = model.count_parameters()
 
     print("=" * 65)
-    print(f"CHIMERA 5.3 HYPER
+    print(f"CHIMERA 5.3 HYPER v4 – scale={args.scale} bf16={args.bf16}")
     print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} "
           f"vocab={config['vocab_size']} target_seq={args.seq_len}")
     print(f"Threads: {torch.get_num_threads()} IPEX={_HAS_IPEX}")
@@ -91,12 +79,11 @@
           f"span={model.span_engine is not None}")
     print("=" * 65)
 
-    # ──
-    # Architecture intact, but save 1 full pass through layers 4-23
+    # ── Parcae: start at 1 loop, progressive scheduler will increase to 2→3
     patch_training_loops(model, num_loops=1)
-    print(f"[
+    print(f"[P10] Progressive looping enabled (1→2→3)")
 
-    # ── P2: Reservoir Freezing
+    # ── P2: Reservoir Freezing
     if args.reservoir:
        frozen = apply_reservoir_freezing(model)
        print(f"[P2] Reservoir: froze {frozen:,} gate params")
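patch_training_loops(model, num_loops=1) only sets the starting point; the 1→2→3 scheduler itself is not visible in this hunk. A thirds-of-training schedule would be consistent with the [P10] message, though this is an assumption, not the repo's scheduler:

def loops_for_step(step: int, max_steps: int) -> int:
    # First third of training: 1 loop; middle third: 2; final third: all 3.
    frac = step / max(1, max_steps)
    return 1 if frac < 1 / 3 else (2 if frac < 2 / 3 else 3)

# Inside the training loop (sketch):
#   patch_training_loops(model, num_loops=loops_for_step(step, args.max_steps))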
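apply_reservoir_freezing's internals are also outside the diff. Per the docstring ("freeze recurrent gates as random ternary"), in reservoir/echo-state style it would re-initialize recurrent gate weights to random ternary values and freeze them. A sketch under that assumption; the "gate" name filter is hypothetical:

import torch

def apply_reservoir_freezing_sketch(model, name_filter="gate") -> int:
    frozen = 0
    for name, p in model.named_parameters():
        if name_filter in name:
            with torch.no_grad():
                # Random ternary init in {-1, 0, +1}, echo-state style.
                p.copy_(torch.randint(-1, 2, p.shape).to(p.dtype))
            p.requires_grad_(False)   # excluded from optimizer updates
            frozen += p.numel()
    return frozen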
@@ -104,14 +91,16 @@
     trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
     print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}")
 
-    # ── P7: Progressive Unfreezing
+    # ── P7: Progressive Unfreezing (OFF by default – counterproductive with backprop)
     unfreezer = None
     if args.progressive_unfreeze:
         unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages)
         active = sum(p.numel() for p in model.parameters() if p.requires_grad)
         print(f"[P7] Progressive unfreeze: {active:,} initially trainable")
+    else:
+        print(f"[P7] Progressive unfreeze: OFF (all layers train from start)")
 
-    # ── P1: GrowLength
+    # ── P1: GrowLength
     if args.growlength:
         stages = [
             (max(8, args.seq_len // 4), 0.30),
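Only the ProgressiveUnfreezer(model, max_steps, stages) call is visible here. A minimal implementation consistent with that signature might unfreeze equal-sized layer groups as training progresses; this assumes a model.layers container, which is not shown in the diff:

class ProgressiveUnfreezerSketch:
    # Assumes `model.layers` is an ordered container of transformer blocks.
    def __init__(self, model, max_steps: int, stages: int):
        self.layers = list(model.layers)
        self.max_steps, self.stages = max_steps, stages
        self._activate(1)                       # stage 1 trainable from step 0

    def _activate(self, stage: int):
        cutoff = len(self.layers) * stage // self.stages
        for i, layer in enumerate(self.layers):
            for p in layer.parameters():
                p.requires_grad_(i < cutoff)

    def step(self, step: int):                  # call once per optimizer step
        stage = min(self.stages,
                    1 + step * self.stages // max(1, self.max_steps))
        self._activate(stage)

This also shows why the commit demotes P7: under MeZO, keeping most of the depth frozen shrank the space being perturbed, but under STE+AdamW it merely withholds gradient updates from layers that could already be learning.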
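For concreteness, the one GrowLength stage entry visible in this hunk evaluates as follows with the default --seq_len 64 (the later stages fall outside the hunk):

seq_len = 64
stage0 = (max(8, seq_len // 4), 0.30)   # -> (16, 0.30): first 30% of steps
                                        #    train at sequence length 16

Per the docstring's "huge batch at short seq", the short early sequences are what make room for much larger effective batches at the start of the curriculum.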
@@ -125,7 +114,7 @@
     grow = None
     initial_seq = args.seq_len
 
-    # ── Data
+    # ── Data
     tok_budget = args.max_tokens or max(500_000,
         args.max_steps * args.batch_size * (args.seq_len + 1) * 4)
     token_buf = build_token_buffer(
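With the CLI defaults (max_steps=5000, batch_size=8, seq_len=64, max_tokens unset), the budget works out as below. The 4x multiplier is not explained in the diff; it reads as headroom for token packing and the GrowLength curriculum:

tok_budget = max(500_000, 5000 * 8 * (64 + 1) * 4)
# = max(500_000, 10_400_000) = 10,400,000 tokens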
@@ -137,18 +126,14 @@
     train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
 
 
-# ───────────────────────────────────────────────────────────────────────────
-# CLI
-# ───────────────────────────────────────────────────────────────────────────
-
 def cli():
-    p = argparse.ArgumentParser(description="Chimera 5.3 HYPER
+    p = argparse.ArgumentParser(description="Chimera 5.3 HYPER v4")
     p.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
     p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"])
     p.add_argument("--seq_len", type=int, default=64)
     p.add_argument("--batch_size", type=int, default=8)
-    p.add_argument("--lr", type=float, default=
-    p.add_argument("--warmup", type=int, default=
+    p.add_argument("--lr", type=float, default=1.5e-3)  # ← BitNet-interpolated default
+    p.add_argument("--warmup", type=int, default=750)   # ← BitNet paper-exact
     p.add_argument("--max_steps", type=int, default=5000)
     p.add_argument("--max_tokens", type=int, default=None)
     p.add_argument("--max_samples", type=int, default=None)
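The LR scheduler itself is outside this diff, so the new defaults are best read as inputs to a standard warmup rule. A linear ramp consistent with --warmup 750 and --lr 1.5e-3, as an assumption about the scheduler's shape rather than the repo's code:

def lr_at(step: int, base_lr: float = 1.5e-3, warmup: int = 750) -> float:
    # Ramp linearly from 0 to base_lr over the first `warmup` steps.
    return base_lr * min(1.0, step / max(1, warmup))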
@@ -168,6 +153,7 @@ def cli():
     g.add_argument("--growlength", action="store_true", default=False)
     g.add_argument("--reservoir", action="store_true", default=False)
     g.add_argument("--mezo-eps", type=float, default=1e-3, dest="mezo_eps")
+    # Progressive unfreeze: OFF by default (counterproductive with backprop)
     g.add_argument("--progressive-unfreeze", action="store_true", default=False,
                    dest="progressive_unfreeze")
     g.add_argument("--unfreeze-stages", type=int, default=4, dest="unfreeze_stages")
@@ -182,11 +168,11 @@ if __name__ == "__main__":
     if args.all:
         args.growlength = True
         args.reservoir = True
-        args.progressive_unfreeze = True
+        # NOTE: progressive_unfreeze deliberately NOT set by --all
+        # It was designed for MeZO and is counterproductive with STE+AdamW
     if args.benchmark:
         args.growlength = True
         args.reservoir = True
-        args.progressive_unfreeze = True
         benchmark_hyper(args)
     else:
         train_hyper(args)
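After this change, --all enables GrowLength and reservoir freezing but leaves every layer trainable from step 0; progressive unfreezing must be requested explicitly. Hypothetical invocations, with flag names as defined in cli():

python train_hyper.py --all --scale tiny --max_steps 5000
python train_hyper.py --all --progressive-unfreeze --unfreeze-stages 4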