| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import torch |
| import torch.nn.functional as F |
| from torch.utils.data import DataLoader |
| from datasets import load_dataset |
| from transformers import T5EncoderModel, T5Tokenizer, CLIPTextModel, CLIPTokenizer |
| from huggingface_hub import HfApi, hf_hub_download |
| from safetensors.torch import save_file, load_file |
| from torch.utils.tensorboard import SummaryWriter |
| from tqdm.auto import tqdm |
| import numpy as np |
| import math |
| import os |
| import json |
| from datetime import datetime |
|
|
| |
| |
| |
| |
# --- Runtime performance flags -------------------------------------------
# Allow TF32 on Ampere+ GPUs: big matmul/conv speedup at slightly reduced
# mantissa precision (acceptable for diffusion training).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Let cuDNN autotune conv algorithms; wins when input shapes are static.
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision('high')

# Silence the TF32 advisory warnings triggered by the flags above.
import warnings
warnings.filterwarnings('ignore', message='.*TF32.*')
|
|
| |
| |
| |
# --- Training hyperparameters ---------------------------------------------
BATCH_SIZE = 128
GRAD_ACCUM = 1            # optimizer step every GRAD_ACCUM micro-batches
LR = 1e-4
EPOCHS = 10
MAX_SEQ = 128             # T5 token sequence length
MIN_SNR = 5.0             # gamma for min-SNR loss weighting
SHIFT = 3.0               # Flux timestep-shift factor
DEVICE = "cuda"
# Prefer bf16 (no loss scaling needed); fall back to fp16 on older GPUs.
DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# --- Checkpoint/upload/sample/log cadence (in optimizer steps) ------------
HF_REPO = "AbstractPhil/tiny-flux"
SAVE_EVERY = 1000
UPLOAD_EVERY = 1000
SAMPLE_EVERY = 500
LOG_EVERY = 10

# Resume source: "none", "hub", "hub:step_N", or "local:<path>".
LOAD_TARGET = "hub:step_24000"
# Optional manual override of the resume step counter (None = keep inferred).
RESUME_STEP = None

# --- Output directories ---------------------------------------------------
CHECKPOINT_DIR = "./tiny_flux_checkpoints"
LOG_DIR = "./tiny_flux_logs"
SAMPLE_DIR = "./tiny_flux_samples"

os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(SAMPLE_DIR, exist_ok=True)
|
|
| |
| |
| |
| print("Setting up HuggingFace Hub...") |
| api = HfApi() |
| try: |
| api.create_repo(repo_id=HF_REPO, exist_ok=True, repo_type="model") |
| print(f"✓ Repo ready: {HF_REPO}") |
| except Exception as e: |
| print(f"Note: {e}") |
|
|
| |
| |
| |
| run_name = datetime.now().strftime("%Y%m%d_%H%M%S") |
| writer = SummaryWriter(log_dir=os.path.join(LOG_DIR, run_name)) |
| print(f"✓ Tensorboard: {LOG_DIR}/{run_name}") |
|
|
| |
| |
| |
| print("\nLoading dataset...") |
| ds = load_dataset("AbstractPhil/flux-schnell-teacher-latents", "train_3_512", split="train") |
| print(f"Samples: {len(ds)}") |
|
|
| |
| |
| |
| print("\nLoading flan-t5-base (768 dim)...") |
| t5_tok = T5Tokenizer.from_pretrained("google/flan-t5-base") |
| t5_enc = T5EncoderModel.from_pretrained("google/flan-t5-base", torch_dtype=DTYPE).to(DEVICE).eval() |
|
|
| print("Loading CLIP-L...") |
| clip_tok = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") |
| clip_enc = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=DTYPE).to(DEVICE).eval() |
|
|
| for p in t5_enc.parameters(): p.requires_grad = False |
| for p in clip_enc.parameters(): p.requires_grad = False |
|
|
| |
| |
| |
| print("Loading Flux VAE for samples...") |
| from diffusers import AutoencoderKL |
|
|
| vae = AutoencoderKL.from_pretrained( |
| "black-forest-labs/FLUX.1-schnell", |
| subfolder="vae", |
| torch_dtype=DTYPE |
| ).to(DEVICE).eval() |
| for p in vae.parameters(): p.requires_grad = False |
|
|
| |
| |
| |
@torch.inference_mode()
def encode_prompts_batched(prompts: list) -> tuple:
    """Run both text encoders over a whole batch of prompts at once.

    Returns a tuple (t5_sequence_embeddings, clip_pooled_embeddings),
    both living on DEVICE. Batching here is far faster than per-prompt calls.
    """
    # T5 branch: full token sequence, padded/truncated to MAX_SEQ.
    t5_tokens = t5_tok(
        prompts,
        max_length=MAX_SEQ,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(DEVICE)
    seq_embeds = t5_enc(
        input_ids=t5_tokens.input_ids,
        attention_mask=t5_tokens.attention_mask,
    ).last_hidden_state

    # CLIP branch: only the pooled projection is consumed downstream
    # (77 is CLIP's fixed context length).
    clip_tokens = clip_tok(
        prompts,
        max_length=77,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(DEVICE)
    pooled = clip_enc(
        input_ids=clip_tokens.input_ids,
        attention_mask=clip_tokens.attention_mask,
    ).pooler_output

    return seq_embeds, pooled
|
|
|
|
@torch.inference_mode()
def encode_prompt(prompt: str) -> tuple:
    """Encode one prompt via the batched encoder (batch of size 1).

    Kept as a thin compatibility wrapper; returns the same (t5, pooled)
    tuple shape as the batched call, with a leading batch dim of 1.
    """
    single = [prompt]
    return encode_prompts_batched(single)
|
|
|
|
| |
| |
| |
| print("\nPre-encoding prompts...") |
| PRECOMPUTE_ENCODINGS = True |
| ENCODING_CACHE_DIR = "./encoding_cache" |
| os.makedirs(ENCODING_CACHE_DIR, exist_ok=True) |
|
|
| |
| cache_file = os.path.join(ENCODING_CACHE_DIR, f"encodings_{len(ds)}_t5base_clipL.pt") |
|
|
| if PRECOMPUTE_ENCODINGS: |
| if os.path.exists(cache_file): |
| |
| print(f"Loading cached encodings from {cache_file}...") |
| cached = torch.load(cache_file, weights_only=True) |
| all_t5_embeds = cached["t5_embeds"] |
| all_clip_pooled = cached["clip_pooled"] |
| print(f"✓ Loaded cached encodings") |
| else: |
| |
| print("Encoding prompts (will cache for future runs)...") |
| all_prompts = ds["prompt"] |
| |
| encode_batch_size = 64 |
| all_t5_embeds = [] |
| all_clip_pooled = [] |
| |
| for i in tqdm(range(0, len(all_prompts), encode_batch_size), desc="Encoding"): |
| batch_prompts = all_prompts[i:i+encode_batch_size] |
| t5_out, clip_out = encode_prompts_batched(batch_prompts) |
| all_t5_embeds.append(t5_out.cpu()) |
| all_clip_pooled.append(clip_out.cpu()) |
| |
| all_t5_embeds = torch.cat(all_t5_embeds, dim=0) |
| all_clip_pooled = torch.cat(all_clip_pooled, dim=0) |
| |
| |
| torch.save({ |
| "t5_embeds": all_t5_embeds, |
| "clip_pooled": all_clip_pooled, |
| }, cache_file) |
| print(f"✓ Saved encoding cache to {cache_file}") |
| |
| print(f" T5 embeds: {all_t5_embeds.shape}") |
| print(f" CLIP pooled: {all_clip_pooled.shape}") |
|
|
|
|
| |
| |
| |
def flux_shift(t, s=SHIFT):
    """Flux timestep shift: t -> s*t / (1 + (s-1)*t).

    For s > 1 this biases sampled timesteps toward the high-signal end of
    the schedule; s = 1 is the identity.
    """
    numerator = s * t
    denominator = 1 + (s - 1) * t
    return numerator / denominator
|
|
|
|
def min_snr_weight(t, gamma=MIN_SNR):
    """Min-SNR-gamma loss weight: min(SNR(t), gamma) / SNR(t).

    Caps the contribution of high-SNR (late) timesteps so no region of the
    schedule dominates the loss. For x_t = (1-t)*noise + t*data the SNR is
    (t / (1-t))^2; both clamps guard against division blow-ups at t -> 1
    and t -> 0 respectively.
    """
    snr = (t / (1 - t).clamp(min=1e-5)).pow(2)
    capped = snr.clamp(max=gamma)
    return capped / snr.clamp(min=1e-5)
|
|
|
|
| |
| |
| |
@torch.inference_mode()
def generate_samples(model, prompts, num_steps=20, guidance_scale=3.5, H=64, W=64):
    """Generate sample images by Euler-integrating the learned velocity field.

    H/W are the latent grid sizes (tokens); C=16 is the Flux latent channel
    count. Puts the model into eval mode for sampling and back to train
    before returning a (B, 3, H*8, W*8)-ish image batch in [0, 1].
    """
    model.eval()
    B = len(prompts)
    C = 16  # Flux latent channels

    # Text conditioning via the same frozen encoders used in training.
    t5_embeds, clip_pooleds = encode_prompts_batched(prompts)
    t5_embeds = t5_embeds.to(DTYPE)
    clip_pooleds = clip_pooleds.to(DTYPE)

    # Start from pure noise in token layout (B, H*W, C). Under this script's
    # convention, t=0 is noise and t=1 is data.
    x = torch.randn(B, H * W, C, device=DEVICE, dtype=DTYPE)

    # Positional ids for the image tokens.
    img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)

    # Timestep grid warped by the same flux_shift used in training, so the
    # sampler visits timesteps the model actually saw.
    t_linear = torch.linspace(0, 1, num_steps + 1, device=DEVICE, dtype=DTYPE)
    timesteps = flux_shift(t_linear, s=SHIFT)

    # Euler integration of dx/dt = v(x, t) from t=0 (noise) to t=1 (data).
    for i in range(num_steps):
        t_curr = timesteps[i]
        t_next = timesteps[i + 1]
        dt = t_next - t_curr

        t_batch = t_curr.expand(B).to(DTYPE)
        # Guidance is fed to the model as a conditioning input (distilled
        # guidance); despite the name "v_cond" there is no separate
        # unconditional pass / classifier-free-guidance mix here.
        guidance = torch.full((B,), guidance_scale, device=DEVICE, dtype=DTYPE)

        v_cond = model(
            hidden_states=x,
            encoder_hidden_states=t5_embeds,
            pooled_projections=clip_pooleds,
            timestep=t_batch,
            img_ids=img_ids,
            guidance=guidance,
        )

        x = x + v_cond * dt

    # Tokens back to (B, C, H, W) latents, then decode with the frozen VAE.
    # NOTE(review): Flux's VAE config also defines a shift_factor; if the
    # teacher latents were stored as scaling_factor * (z - shift_factor),
    # decoding should add shift_factor back after the division — confirm
    # against how the dataset latents were produced.
    latents = x.reshape(B, H, W, C).permute(0, 3, 1, 2)
    latents = latents / vae.config.scaling_factor
    images = vae.decode(latents.to(vae.dtype)).sample
    images = (images / 2 + 0.5).clamp(0, 1)

    model.train()
    return images
|
|
|
|
def save_samples(images, prompts, step, save_dir):
    """Write individual sample PNGs to disk and log a grid to tensorboard."""
    from torchvision.utils import make_grid, save_image

    # One PNG per sample, named by step, index, and a filesystem-safe slug.
    for idx, (image, text) in enumerate(zip(images, prompts)):
        slug = text[:50].replace(" ", "_").replace("/", "-")
        destination = os.path.join(save_dir, f"step{step}_{idx}_{slug}.png")
        save_image(image, destination)

    # Log a 2-wide grid plus the prompts to tensorboard for quick review.
    grid = make_grid(images, nrow=2, normalize=False)
    writer.add_image("samples", grid, step)
    writer.add_text("sample_prompts", "\n".join(prompts), step)
    print(f" ✓ Saved {len(images)} samples")
|
|
|
|
| |
| |
| |
def collate_preencoded(batch):
    """Collate a batch by looking up cached text embeddings per sample index.

    Everything stays on CPU; the training loop moves tensors to DEVICE.
    """
    sample_indices = [item["__index__"] for item in batch]
    latent_stack = torch.stack(
        [torch.tensor(np.array(item["latent"]), dtype=DTYPE) for item in batch]
    )

    # Index into the module-level embedding caches built at startup.
    return {
        "latents": latent_stack,
        "t5_embeds": all_t5_embeds[sample_indices].to(DTYPE),
        "clip_pooled": all_clip_pooled[sample_indices].to(DTYPE),
    }
|
|
|
|
def collate_online(batch):
    """Collate a batch, encoding prompts on the fly (no cache).

    The encoders run on GPU, so this must execute in the main process
    (num_workers=0); results are moved back to CPU for the loader.
    """
    texts = [item["prompt"] for item in batch]
    latent_stack = torch.stack(
        [torch.tensor(np.array(item["latent"]), dtype=DTYPE) for item in batch]
    )

    seq_embeds, pooled = encode_prompts_batched(texts)

    return {
        "latents": latent_stack,
        "t5_embeds": seq_embeds.cpu().to(DTYPE),
        "clip_pooled": pooled.cpu().to(DTYPE),
    }
|
|
|
|
| |
class IndexedDataset:
    """Dataset wrapper that injects each sample's position as "__index__".

    Used to look up pre-encoded embeddings by index in the collate function,
    sidestepping an expensive ds.map() over the whole dataset.
    """

    def __init__(self, ds):
        self.ds = ds

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        # Shallow-copy so the underlying record is never mutated.
        sample = dict(self.ds[idx])
        sample["__index__"] = idx
        return sample
|
|
| |
# Choose the collate path. Cached embeddings allow DataLoader worker
# processes (pure CPU indexing); online encoding touches the GPU and must
# stay in the main process (num_workers=0).
if PRECOMPUTE_ENCODINGS:
    ds = IndexedDataset(ds)
    collate_fn = collate_preencoded
    num_workers = 2
else:
    collate_fn = collate_online
    num_workers = 0
|
|
|
|
| |
| |
| |
def load_weights(path):
    """Load a state dict from a .safetensors or .pt file.

    Handles three cases: plain safetensors files, full .pt training
    checkpoints (weights nested under "model"/"state_dict"), and unknown
    extensions (tries safetensors first, then pickle). Any "_orig_mod."
    prefix added by torch.compile is stripped so the weights load into an
    uncompiled model.

    Returns a plain {name: tensor} dict.
    """
    if path.endswith(".safetensors"):
        state_dict = load_file(path)
    elif path.endswith(".pt"):
        # NOTE: weights_only=False deserializes arbitrary pickle — only
        # load checkpoints from trusted sources.
        ckpt = torch.load(path, map_location=DEVICE, weights_only=False)
        if isinstance(ckpt, dict):
            state_dict = ckpt.get("model", ckpt.get("state_dict", ckpt))
        else:
            state_dict = ckpt
    else:
        # Unknown extension: try safetensors, fall back to torch pickle.
        # (was a bare `except:`, which also swallowed KeyboardInterrupt)
        try:
            state_dict = load_file(path)
        except Exception:
            state_dict = torch.load(path, map_location=DEVICE, weights_only=False)

    # Strip the torch.compile wrapper prefix (leading occurrence only, so a
    # key that merely contains the substring elsewhere is left intact).
    if isinstance(state_dict, dict) and any(k.startswith("_orig_mod.") for k in state_dict):
        print(" Stripping torch.compile prefix...")
        state_dict = {k.replace("_orig_mod.", "", 1): v for k, v in state_dict.items()}

    return state_dict
|
|
|
|
def save_checkpoint(model, optimizer, scheduler, step, epoch, loss, path):
    """Persist model weights (.safetensors) plus trainer state (.pt).

    Weights are stored separately in safetensors format for safe sharing;
    optimizer/scheduler state and bookkeeping go into the .pt file.
    Returns the path of the safetensors weights file.
    """
    parent = os.path.dirname(path)
    os.makedirs(parent if parent else ".", exist_ok=True)

    # Normalize keys: drop the torch.compile "_orig_mod." wrapper prefix so
    # the weights load into a plain (uncompiled) model.
    state_dict = model.state_dict()
    if any(key.startswith("_orig_mod.") for key in state_dict):
        state_dict = {key.replace("_orig_mod.", ""): val for key, val in state_dict.items()}

    weights_path = path.replace(".pt", ".safetensors")
    save_file(state_dict, weights_path)

    trainer_state = {
        "step": step,
        "epoch": epoch,
        "loss": loss,
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
    }
    torch.save(trainer_state, path)

    print(f" ✓ Saved checkpoint: step {step}")
    return weights_path
|
|
|
|
def upload_checkpoint(weights_path, step, config):
    """Best-effort push of one checkpoint plus config.json to the HF Hub.

    Failures are logged and swallowed so a network blip never kills training.
    """
    config_path = os.path.join(CHECKPOINT_DIR, "config.json")
    try:
        # Step checkpoints live under checkpoints/, leaving the repo's
        # top-level model.safetensors reserved for the best model.
        api.upload_file(
            path_or_fileobj=weights_path,
            path_in_repo=f"checkpoints/step_{step}.safetensors",
            repo_id=HF_REPO,
            commit_message=f"Checkpoint step {step}",
        )

        # Refresh config.json alongside the weights.
        with open(config_path, "w") as f:
            json.dump(config.__dict__, f, indent=2)
        api.upload_file(path_or_fileobj=config_path, path_in_repo="config.json", repo_id=HF_REPO)

        print(f" ✓ Uploaded step {step} to {HF_REPO}")
    except Exception as e:
        print(f" ⚠ Upload failed: {e}")
|
|
|
|
def load_checkpoint(model, optimizer, scheduler, target):
    """Load model weights according to a checkpoint target specifier.

    target forms:
      None / "none"   -> fresh start
      "hub"           -> top-level model.safetensors from HF_REPO
      "hub:step_N"    -> checkpoints/step_N.{safetensors,pt} from HF_REPO
      "local:<path>"  -> local weights file

    Returns (start_step, start_epoch). NOTE(review): only model weights are
    restored — the optimizer/scheduler arguments are accepted but never
    loaded, so resumed runs restart optimizer state from scratch.
    """
    # Buffers that are recomputed at model init and intentionally absent
    # from saved weights; missing entries for these are expected.
    expected_missing = {'time_in.sin_basis', 'guidance_in.sin_basis',
                        'rope.freqs_0', 'rope.freqs_1', 'rope.freqs_2'}

    def _apply(weights):
        # strict=False so the precomputed buffers above may be missing.
        missing, unexpected = model.load_state_dict(weights, strict=False)
        if missing:
            surprise = set(missing) - expected_missing
            if surprise:
                print(f" ⚠ Unexpected missing keys: {surprise}")
            else:
                print(f" ✓ Missing only precomputed buffers (OK)")

    if target == "none" or target is None:
        print("Starting fresh (no checkpoint)")
        return 0, 0

    if target == "hub" or (isinstance(target, str) and target.startswith("hub:")):
        start_step = 0
        try:
            if target == "hub":
                weights_path = hf_hub_download(repo_id=HF_REPO, filename="model.safetensors")
            else:
                step_name = target.split(":")[1]
                try:
                    weights_path = hf_hub_download(repo_id=HF_REPO, filename=f"checkpoints/{step_name}.safetensors")
                except Exception:
                    # Older checkpoints were uploaded as .pt files.
                    weights_path = hf_hub_download(repo_id=HF_REPO, filename=f"checkpoints/{step_name}.pt")
                # "step_24000" -> resume counter 24000.
                start_step = int(step_name.split("_")[-1]) if "_" in step_name else 0

            _apply(load_weights(weights_path))
            print(f"✓ Loaded from hub: {target}")
            return start_step, 0
        except Exception as e:
            print(f"Hub load failed: {e}")
            return 0, 0

    if isinstance(target, str) and target.startswith("local:"):
        path = target.split(":", 1)[1]
        _apply(load_weights(path))
        print(f"✓ Loaded from local: {path}")
        return 0, 0

    print("No checkpoint found, starting fresh")
    return 0, 0
|
|
|
|
| |
| |
| |
# DataLoader: pinned memory enables async host->GPU copies in the train
# loop; persistent workers avoid re-spawning processes every epoch.
loader = DataLoader(
    ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=True,
    persistent_workers=(num_workers > 0),
    prefetch_factor=2 if num_workers > 0 else None,
)
|
|
| |
| |
| |
# --- Student model (TinyFlux / TinyFluxConfig defined elsewhere) ----------
config = TinyFluxConfig()
model = TinyFlux(config).to(DEVICE).to(DTYPE)
print(f"\nParams: {sum(p.numel() for p in model.parameters()):,}")

# Fused AdamW executes the parameter update in a single CUDA kernel.
opt = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    betas=(0.9, 0.99),
    weight_decay=0.01,
    fused=True,
)
|
|
total_steps = len(loader) * EPOCHS // GRAD_ACCUM
# Linear warmup for up to 500 steps, but never more than 10% of training.
warmup = min(500, total_steps // 10)


def lr_fn(step):
    """LambdaLR multiplier: linear warmup, then cosine decay to zero.

    Progress through the cosine phase is clamped to [0, 1] so the multiplier
    can never go negative or cycle back up if the scheduler is stepped past
    total_steps (possible after resuming a partially-trained run). The
    max(..., 1) guards avoid division by zero for degenerate tiny runs.
    """
    if step < warmup:
        return step / max(warmup, 1)
    decay_span = max(total_steps - warmup, 1)
    progress = min((step - warmup) / decay_span, 1.0)
    return 0.5 * (1 + math.cos(math.pi * progress))


sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_fn)
|
|
| |
| |
| |
| print(f"\nLoad target: {LOAD_TARGET}") |
| start_step, start_epoch = load_checkpoint(model, opt, sched, LOAD_TARGET) |
|
|
| if RESUME_STEP is not None: |
| print(f"Overriding start_step: {start_step} -> {RESUME_STEP}") |
| start_step = RESUME_STEP |
|
|
| |
| |
| |
| model = torch.compile(model, mode="default") |
|
|
| |
| writer.add_text("config", json.dumps(config.__dict__, indent=2), 0) |
| writer.add_text("training_config", json.dumps({ |
| "batch_size": BATCH_SIZE, |
| "grad_accum": GRAD_ACCUM, |
| "lr": LR, |
| "epochs": EPOCHS, |
| "min_snr": MIN_SNR, |
| "shift": SHIFT, |
| "optimizations": ["TF32", "fused_adamw", "precomputed_encodings", "flash_attention", "torch.compile"] |
| }, indent=2), 0) |
|
|
| |
| SAMPLE_PROMPTS = [ |
| "a photo of a cat sitting on a windowsill", |
| "a beautiful sunset over mountains", |
| "a portrait of a woman with red hair", |
| "a futuristic cityscape at night", |
| ] |
|
|
| |
| |
| |
| print(f"\nTraining {EPOCHS} epochs, {total_steps} total steps") |
| print(f"Resuming from step {start_step}, epoch {start_epoch}") |
| print(f"Save: {SAVE_EVERY}, Upload: {UPLOAD_EVERY}, Sample: {SAMPLE_EVERY}, Log: {LOG_EVERY}") |
| print("Optimizations: TF32, fused AdamW, pre-encoded prompts, Flash Attention, torch.compile") |
|
|
| model.train() |
| step = start_step |
| best = float("inf") |
|
|
| |
| _cached_img_ids = None |
|
|
# --- Main training loop ---------------------------------------------------
for ep in range(start_epoch, EPOCHS):
    ep_loss = 0
    ep_batches = 0
    pbar = tqdm(loader, desc=f"E{ep + 1}")

    for i, batch in enumerate(pbar):
        # Async host->GPU copies (tensors were pinned by the DataLoader).
        latents = batch["latents"].to(DEVICE, non_blocking=True)
        t5 = batch["t5_embeds"].to(DEVICE, non_blocking=True)
        clip = batch["clip_pooled"].to(DEVICE, non_blocking=True)

        B, C, H, W = latents.shape

        # (B, C, H, W) latents -> (B, H*W, C) token-sequence layout.
        data = latents.permute(0, 2, 3, 1).reshape(B, H * W, C)
        noise = torch.randn_like(data)

        # Logit-normal timestep sampling, warped by the Flux shift. Under
        # this script's convention t=0 is pure noise, t=1 is pure data;
        # the clamp keeps t strictly inside (0, 1).
        t = torch.sigmoid(torch.randn(B, device=DEVICE))
        t = flux_shift(t, s=SHIFT).to(DTYPE).clamp(1e-4, 1 - 1e-4)

        # Rectified-flow interpolation and its (constant) velocity target.
        t_expanded = t.view(B, 1, 1)
        x_t = (1 - t_expanded) * noise + t_expanded * data

        v_target = data - noise

        # Positional ids for image tokens (rebuilt every step; see the
        # unused _cached_img_ids above).
        img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)

        # Random guidance value in [1, 5): trains the model's distilled
        # guidance conditioning across a range of strengths.
        guidance = torch.rand(B, device=DEVICE, dtype=DTYPE) * 4 + 1

        with torch.autocast("cuda", dtype=DTYPE):
            v_pred = model(
                hidden_states=x_t,
                encoder_hidden_states=t5,
                pooled_projections=clip,
                timestep=t,
                img_ids=img_ids,
                guidance=guidance,
            )

        # Per-sample MSE over tokens/channels, weighted by min-SNR, then
        # averaged and scaled for gradient accumulation.
        loss_raw = F.mse_loss(v_pred, v_target, reduction="none").mean(dim=[1, 2])
        snr_weights = min_snr_weight(t)
        loss = (loss_raw * snr_weights).mean() / GRAD_ACCUM
        loss.backward()

        # Optimizer step once every GRAD_ACCUM micro-batches.
        if (i + 1) % GRAD_ACCUM == 0:
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            sched.step()
            opt.zero_grad(set_to_none=True)
            step += 1

            if step % LOG_EVERY == 0:
                writer.add_scalar("train/loss", loss.item() * GRAD_ACCUM, step)
                writer.add_scalar("train/lr", sched.get_last_lr()[0], step)
                writer.add_scalar("train/grad_norm", grad_norm.item(), step)
                writer.add_scalar("train/t_mean", t.mean().item(), step)

            if step % SAMPLE_EVERY == 0:
                print(f"\n Generating samples at step {step}...")
                images = generate_samples(model, SAMPLE_PROMPTS, num_steps=20)
                save_samples(images, SAMPLE_PROMPTS, step, SAMPLE_DIR)

            if step % SAVE_EVERY == 0:
                ckpt_path = os.path.join(CHECKPOINT_DIR, f"step_{step}.pt")
                weights_path = save_checkpoint(model, opt, sched, step, ep, loss.item(), ckpt_path)

            # NOTE(review): relies on the SAVE_EVERY branch having just set
            # weights_path; if UPLOAD_EVERY is ever not a multiple of
            # SAVE_EVERY this raises NameError (or uploads a stale file).
            if step % UPLOAD_EVERY == 0:
                upload_checkpoint(weights_path, step, config)

        ep_loss += loss.item() * GRAD_ACCUM
        ep_batches += 1
        pbar.set_postfix(loss=f"{loss.item() * GRAD_ACCUM:.4f}", lr=f"{sched.get_last_lr()[0]:.1e}", step=step)

    # Epoch summary; max(..., 1) guards an empty loader.
    avg = ep_loss / max(ep_batches, 1)
    print(f"Epoch {ep + 1} loss: {avg:.4f}")
    writer.add_scalar("train/epoch_loss", avg, ep + 1)

    # Track best epoch-average loss; push it as the repo's main model.
    if avg < best:
        best = avg
        best_path = os.path.join(CHECKPOINT_DIR, "best.pt")
        weights_path = save_checkpoint(model, opt, sched, step, ep, avg, best_path)

        try:
            api.upload_file(
                path_or_fileobj=weights_path,
                path_in_repo="model.safetensors",
                repo_id=HF_REPO,
                commit_message=f"Best model (epoch {ep + 1}, loss {avg:.4f})",
            )
            print(f" ✓ Uploaded best to {HF_REPO}")
        except Exception as e:
            print(f" ⚠ Upload failed: {e}")
|
|
| |
| |
| |
| print("\nSaving final model...") |
| final_path = os.path.join(CHECKPOINT_DIR, "final.pt") |
| weights_path = save_checkpoint(model, opt, sched, step, EPOCHS, best, final_path) |
|
|
| print("Generating final samples...") |
| images = generate_samples(model, SAMPLE_PROMPTS, num_steps=20) |
| save_samples(images, SAMPLE_PROMPTS, step, SAMPLE_DIR) |
|
|
| try: |
| api.upload_file(path_or_fileobj=weights_path, path_in_repo="model.safetensors", repo_id=HF_REPO) |
| config_path = os.path.join(CHECKPOINT_DIR, "config.json") |
| with open(config_path, "w") as f: |
| json.dump(config.__dict__, f, indent=2) |
| api.upload_file(path_or_fileobj=config_path, path_in_repo="config.json", repo_id=HF_REPO) |
| print(f"\n✓ Training complete! https://huggingface.co/{HF_REPO}") |
| except Exception as e: |
| print(f"\n⚠ Final upload failed: {e}") |
|
|
| writer.close() |
| print(f"Best loss: {best:.4f}") |