fix: loops.py: use chimera_turbo v8 defaults (wd=0.01, warmup=750, β2=0.98) instead of hardcoded values
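For reference, the v8 defaults this commit leans on (lr=1.5e-3, wd=0.01, betas=(0.9, 0.98), warmup=750) amount to roughly the following optimizer/schedule setup. This is a minimal sketch only, assuming chimera_turbo.apply() wires up a standard AdamW with linear warmup into cosine decay; the helper's real internals are not part of this diff, and V8_DEFAULTS / build_v8_optimizer below are illustrative names, not chimera_turbo API.

import math
import torch

# Illustrative only: these names are not part of chimera_turbo; they just spell
# out the hyperparameters cited in the commit message.
V8_DEFAULTS = dict(lr=1.5e-3, weight_decay=0.01, betas=(0.9, 0.98), warmup_steps=750)

def build_v8_optimizer(model, max_steps, lr=None):
    cfg = dict(V8_DEFAULTS)
    if lr is not None:  # the CLI --lr value still takes precedence, as in the diff
        cfg["lr"] = lr
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=cfg["lr"],
        betas=cfg["betas"],
        weight_decay=cfg["weight_decay"],
    )
    warmup = min(cfg["warmup_steps"], max_steps)

    def lr_lambda(step):
        if step < warmup:  # linear warmup from 0 up to the peak lr
            return step / max(1, warmup)
        t = (step - warmup) / max(1, max_steps - warmup)  # cosine decay to 0 afterwards
        return 0.5 * (1.0 + math.cos(math.pi * min(t, 1.0)))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    return optimizer, scheduler

In train_hyper_loop, only the learning rate is still pinned from the CLI (default 1e-3); weight decay and warmup now come from chimera_turbo's own defaults.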
chimera/training/loops.py  (+18 -24)
@@ -13,7 +13,7 @@ from .common import cosine_lr, save_final_checkpoint, save_training_checkpoint


 def train_fast_loop(args, model, config, loader, compute_loss) -> str:
-    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.…
+    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.98))
     os.makedirs(args.output_dir, exist_ok=True)
     log_f = open(os.path.join(args.output_dir, "log.jsonl"), "w", encoding="utf-8")

@@ -147,19 +147,21 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us


 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
-    …
-    …
+    use_compile = getattr(args, "compile", False)
+
+    # Let chimera_turbo.apply() use its v8 BitNet-aligned defaults:
+    # lr=1.5e-3, wd=0.01, β=(0.9,0.98), warmup=750
+    # Only override lr if the user explicitly set it via CLI
     model, optimizer, scheduler = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
-        lr=args.lr,
-        weight_decay…
-        warmup_steps=min(500, args.max_steps // 10),
+        lr=args.lr,  # CLI default is 1e-3; chimera_turbo default is 1.5e-3
+        # weight_decay and warmup_steps use chimera_turbo defaults (0.01, 750)
         use_compile=use_compile,
         use_ipex=True,
     )
     model.train()
-    print(f"[P5] Train mode: BitLinear STE…
+    print(f"[P5] Train mode: BitLinear STE (clamp-aware, NaN-safe)")
     use_bf16 = bool(args.bf16)

     os.makedirs(args.output_dir, exist_ok=True)
@@ -200,37 +202,29 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
         data_iter = iter(loader)
         batch = next(data_iter)

-        # grad_accum_steps=1: DataLoader already provides eff_batch items.
         loss_val = chimera_turbo.training_step(
-            model,
-            batch,
-            optimizer,
-            scheduler,
-            grad_accum_steps=1,
-            step=step,
+            model, batch, optimizer, scheduler,
+            grad_accum_steps=1, step=step,
             autocast_dtype=torch.bfloat16 if use_bf16 else None,
         )

         cur_lr = optimizer.param_groups[0]["lr"]
-        total_loss += loss_val
+        total_loss += loss_val if math.isfinite(loss_val) else 0.0
         toks += batch["input_ids"].numel()
         step += 1

         if step % args.log_every == 0:
             dt = time.time() - t0
             avg = total_loss / args.log_every
-            ppl = math.exp(min(avg, 20))
+            ppl = math.exp(min(avg, 20)) if math.isfinite(avg) else float("nan")
             tps = toks / dt if dt > 0 else 0
             eta = (args.max_steps - step) / (step / dt) / 3600 if dt > 0 else 0
             log_f.write(
                 json.dumps({
-                    "step": step,
-                    "…
-                    "…
-                    "…
-                    "tok/s": round(tps),
-                    "seq_len": cur_seq,
-                    "eff_batch": eff_batch,
+                    "step": step, "loss": round(avg, 4) if math.isfinite(avg) else None,
+                    "ppl": round(ppl, 2) if math.isfinite(ppl) else None,
+                    "lr": round(cur_lr, 6), "tok/s": round(tps),
+                    "seq_len": cur_seq, "eff_batch": eff_batch,
                 }) + "\n"
             )
             log_f.flush()
@@ -238,7 +232,7 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
                 f" step {step:>6}/{args.max_steps} | loss {avg:.4f} | ppl {ppl:>8.2f} "
                 f"| lr {cur_lr:.2e} | {tps:,.0f} tok/s | seq {cur_seq} | ETA {eta:.1f}h"
             )
-            best_loss = min(best_loss, avg)
+            best_loss = min(best_loss, avg) if math.isfinite(avg) else best_loss
             total_loss = 0.0
             toks = 0
             t0 = time.time()