#!/usr/bin/env python3
"""
Chimera 5.3 -- HYPER CPU Training v4.2
======================================

All Chimera features active: 28 layers (GD/XM/TM/SK pattern), MoE,
Parcae looping (locked to 1 for 300-step runs), SelfEvolution,
SpanInference, Grammar, EntropyValve, DebtLedger.

Training paradigms:
 P1  GrowLength Curriculum     -- seq 16→32→64→128, 4-stage front-loaded
 P2  Reservoir Freezing        -- freeze recurrent gates as random ternary
 P5  STE + Muon                -- BitNet-paper training with NS-orthogonalized momentum
 P6  Aggressive Token Packing  -- zero padding waste (implicit in GrowLengthDataset)
 P10 Progressive Looping       -- locked to loops=1 for 300-step throughput
 P11 NaN-safe training         -- skip + recover on gradient explosion
 P15 Token Triage              -- focus on top-50% informative tokens
 P16 Plateau Breaker           -- adaptive LR burst (patience=60)
 P18 Grokfast-EMA              -- amplify slow grads (alpha=0.95, lambda=1.5)

v4.2 -- Memory-safe batch sizing for vocab=200073 on 32 GB RAM.
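
Typical CPU run (illustrative -- the script name below is assumed, not part of
the package):

    python train_hyper.py --all --scale tiny --max_steps 300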
"""

from __future__ import annotations

import argparse
import os

def _setup_cpu():
    n = os.cpu_count() or 4
    os.environ.setdefault("OMP_NUM_THREADS", str(n))
    os.environ.setdefault("MKL_NUM_THREADS", str(n))
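    # Bind OpenMP threads with compact affinity and have them sleep immediately
    # after each parallel region (KMP_BLOCKTIME=0) instead of spin-waiting.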
    os.environ.setdefault("KMP_AFFINITY", "granularity=fine,compact,1,0")
    os.environ.setdefault("KMP_BLOCKTIME", "0")
    return n

_NCPU = _setup_cpu()

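# NOTE: torch is imported only after _setup_cpu() so the OMP/MKL/KMP environment
# variables above are visible to the native thread pools at library load time.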
import torch

from chimera.paths import DEFAULT_CONFIG_PATH
from chimera.training import (
    GrowLengthDataset,
    GrowLengthScheduler,
    ProgressiveUnfreezer,
    apply_reservoir_freezing,
    benchmark_hyper,
    build_model_from_args,
    build_token_buffer,
    patch_training_loops,
    train_hyper_loop,
)

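# Intra-op threads follow OMP_NUM_THREADS; inter-op parallelism is kept small,
# and the call is guarded because set_num_interop_threads raises RuntimeError
# once parallel work has already started.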
torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))
try:
    torch.set_num_interop_threads(max(1, _NCPU // 4))
except RuntimeError:
    pass

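# Intel Extension for PyTorch is optional: this script only records whether it
# imports and reports the flag in the startup banner.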
_HAS_IPEX = False
try:
    import intel_extension_for_pytorch as ipex
    _HAS_IPEX = True
except Exception:
    pass


def build_model(args):
    return build_model_from_args(args)


def train_hyper(args):
    model, config = build_model(args)
    counts = model.count_parameters()

    print("=" * 65)
    print(f"CHIMERA 5.3 HYPER v4.2 -- scale={args.scale}  bf16={args.bf16}")
    print(f"Layers={config['num_hidden_layers']}  hidden={config['hidden_size']}  "
          f"vocab={config['vocab_size']}  target_seq={args.seq_len}")
    print(f"Threads: {torch.get_num_threads()}  IPEX={_HAS_IPEX}")
    print(f"Params: total={counts['total']:,}  ternary={counts['ternary']:,}")
    print(f"ALL features ON: looping={model.looping_enabled} "
          f"evolution={model.evolution is not None} "
          f"span={model.span_engine is not None}")
    print("=" * 65)

    # ── Parcae: lock to 1 loop for throughput
    patch_training_loops(model, num_loops=1)
    print(f"[P10] Looping locked to 1 (throughput > refinement at 300 steps)")

    # ── P2: Reservoir Freezing
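    # (Presumably this fixes the recurrent gate weights at their random ternary
    #  init so they act as an untrained reservoir; only the rest keeps gradients.)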
    if args.reservoir:
        frozen = apply_reservoir_freezing(model)
        print(f"[P2] Reservoir: froze {frozen:,} gate params")

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}")

    # ── P7: Progressive Unfreezing (OFF)
    unfreezer = None
    if args.progressive_unfreeze:
        unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages)
        active = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"[P7] Progressive unfreeze: {active:,} initially trainable")
    else:
        print(f"[P7] Progressive unfreeze: OFF (all layers train from start)")

    # ── P1: GrowLength -- 4-stage front-loaded schedule ──
    if args.growlength:
        # v4.2: batch sizes are base × (target_seq / stage_seq), then
        # capped by _safe_batch() in loops.py to stay under 2 GB logits.
        #
        # With base batch=4, target_seq=128, vocab=200073:
        #   Stage 1: seq=16,  desired=32 → logits=0.41GB ✓
        #   Stage 2: seq=32,  desired=16 → logits=0.41GB ✓
        #   Stage 3: seq=64,  desired=8  → logits=0.41GB ✓
        #   Stage 4: seq=128, desired=4  → logits=0.41GB ✓
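        # (Logits memory ≈ batch × seq × vocab × 4 B, assuming fp32 logits:
        #  32 × 16 × 200073 × 4 B ≈ 0.41 GB per stage.)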
        stages = [
            (16,           0.10),  # 30 steps  at seq=16
            (32,           0.15),  # 45 steps  at seq=32
            (64,           0.25),  # 75 steps  at seq=64
            (args.seq_len, 0.50),  # 150 steps at seq=128
        ]
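        # Step fractions sum to 1.0 of max_steps (300 steps → 30/45/75/150).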
        grow = GrowLengthScheduler(stages, args.max_steps)
        initial_seq = stages[0][0]
        print(f"[P1] GrowLength 4-stage: {' → '.join(str(s) for s, _ in stages)}")
    else:
        grow = None
        initial_seq = args.seq_len

    # ── Data ──
    tok_budget = args.max_tokens or max(500_000,
        args.max_steps * args.batch_size * (args.seq_len + 1) * 4)
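    # With the defaults this is max(500_000, 300 × 4 × 129 × 4) ≈ 619k tokens;
    # the trailing ×4 leaves headroom, e.g. for the larger GrowLength batches
    # used at shorter sequence lengths.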
    token_buf = build_token_buffer(
        args.dataset_name, args.dataset_split, args.text_column,
        tok_budget, args.cache_dir)
    dataset = GrowLengthDataset(token_buf, initial_seq)
    print(f"[DATA] {token_buf.numel():,} tokens  seq={initial_seq}")

    train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)


def cli():
    p = argparse.ArgumentParser(description="Chimera 5.3 HYPER v4.2")
    p.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
    p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"])
    p.add_argument("--seq_len", type=int, default=128)
    p.add_argument("--batch_size", type=int, default=4)      # base batch; GrowLength scales up
    p.add_argument("--lr", type=float, default=1.2e-2)
    p.add_argument("--warmup", type=int, default=30)
    p.add_argument("--max_steps", type=int, default=300)
    p.add_argument("--max_tokens", type=int, default=None)
    p.add_argument("--max_samples", type=int, default=None)
    p.add_argument("--bf16", action="store_true", default=True)
    p.add_argument("--no-bf16", dest="bf16", action="store_false")
    p.add_argument("--compile", action="store_true", default=False)
    p.add_argument("--dataset_name", default="roneneldan/TinyStories")
    p.add_argument("--dataset_split", default="train")
    p.add_argument("--text_column", default="auto")
    p.add_argument("--cache_dir", default="./cache")
    p.add_argument("--log_every", type=int, default=10)
    p.add_argument("--save_every", type=int, default=100)
    p.add_argument("--output_dir", default="./chimera_hyper_output")

    g = p.add_argument_group("paradigms")
    g.add_argument("--all", action="store_true", default=False)
    g.add_argument("--growlength", action="store_true", default=False)
    g.add_argument("--reservoir", action="store_true", default=False)
    g.add_argument("--mezo-eps", type=float, default=1e-3, dest="mezo_eps")
    g.add_argument("--progressive-unfreeze", action="store_true", default=False,
                   dest="progressive_unfreeze")
    g.add_argument("--unfreeze-stages", type=int, default=4, dest="unfreeze_stages")
    p.add_argument("--benchmark", action="store_true", default=False)
    return p


if __name__ == "__main__":
    args = cli().parse_args()
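    # A --max_samples budget is converted to tokens; each packed sample is
    # assumed to consume seq_len + 1 tokens (inputs plus shifted targets).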
    if args.max_samples and not args.max_tokens:
        args.max_tokens = args.max_samples * (args.seq_len + 1)
    if args.all:
        args.growlength = True
        args.reservoir = True
    if args.benchmark:
        args.growlength = True
        args.reservoir = True
        benchmark_hyper(args)
    else:
        train_hyper(args)