fix: re-enable torch.compile in train_hyper_loop (STE graph breaks fixed)
chimera/training/loops.py
@@ -147,17 +147,19 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us
 
 
 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
+    # use_compile=True now works: STE uses detach() trick = zero graph breaks
+    use_compile = getattr(args, "compile", True)
     model, optimizer, scheduler = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
         lr=args.lr,
         weight_decay=0.05,
         warmup_steps=min(500, args.max_steps // 10),
-        use_compile=False,
+        use_compile=use_compile,
         use_ipex=True,
     )
     model.train()
-    print(f"[P5] Train mode: BitLinear STE path (
+    print(f"[P5] Train mode: BitLinear STE path (detach trick, compile-friendly)")
     use_bf16 = bool(args.bf16)
 
     os.makedirs(args.output_dir, exist_ok=True)
@@ -199,7 +201,6 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
         batch = next(data_iter)
 
         # grad_accum_steps=1: DataLoader already provides eff_batch items.
-        # The effective batch IS eff_batch. No need to accumulate further.
         loss_val = chimera_turbo.training_step(
             model,
             batch,