fix: re-enable torch.compile in train_hyper_loop (STE graph breaks fixed)
chimera/training/loops.py
@@ -147,17 +147,19 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us
 
 
 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
+    # use_compile=True now works: STE uses detach() trick = zero graph breaks
+    use_compile = getattr(args, "compile", True)
     model, optimizer, scheduler = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
         lr=args.lr,
         weight_decay=0.05,
         warmup_steps=min(500, args.max_steps // 10),
-        use_compile=False,
+        use_compile=use_compile,
         use_ipex=True,
     )
     model.train()
-    print(f"[P5] Train mode: BitLinear STE path (
+    print(f"[P5] Train mode: BitLinear STE path (detach trick, compile-friendly)")
     use_bf16 = bool(args.bf16)
 
     os.makedirs(args.output_dir, exist_ok=True)
@@ -199,7 +201,6 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
         batch = next(data_iter)
 
         # grad_accum_steps=1: DataLoader already provides eff_batch items.
-        # The effective batch IS eff_batch. No need to accumulate further.
         loss_val = chimera_turbo.training_step(
             model,
             batch,