# geolip-conduit-experiments / svae_cadence.py
# (Hugging Face Hub page header: uploaded by AbstractPhil,
#  commit e37d8c5 "Update svae_cadence.py", verified)
"""
SVAE v2 Conduit Trainer β€” Prototype
=====================================
Train PatchSVAEv2 from random init on noise.
The decoder MUST reconstruct from decomposed spectral + conduit bundles.
No M_hat shortcut. Every conduit element is load-bearing.
Readouts per epoch:
Standard: MSE, S profile, erank, s_delta, CV
Conduit: friction stats, settle distribution, char_coeff profile,
per-mode reconstruction contribution
Should converge rapidly or fail β€” we'll know within 10 epochs.
Usage:
python train_v2_conduit.py
"""
import os
import math
import time
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
# Best-effort Hugging Face login when running inside Google Colab: pull the
# HF_TOKEN secret from Colab userdata and authenticate. Outside Colab (or if
# the secret is missing) the import/get raises and we deliberately continue
# unauthenticated — uploads later will fail gracefully in save_checkpoint.
try:
    from google.colab import userdata
    os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
    from huggingface_hub import login
    login(token=os.environ["HF_TOKEN"])
except Exception:
    pass
from geolip_svae.model import cv_of, extract_patches, stitch_patches
from geolip_svae.model_v2 import PatchSVAEv2
# ═══════════════════════════════════════════════════════════════
# CONFIG
# ═══════════════════════════════════════════════════════════════
HF_REPO = 'AbstractPhil/geolip-SVAE'           # Hub repo receiving checkpoints + logs
VERSION = 'version2_v2_conduit_proto_2'        # run tag; prefixes every uploaded path
LOCAL_DIR = f'/content/{VERSION}_checkpoints'  # Colab-local checkpoint directory
LOG_PATH = os.path.join(LOCAL_DIR, 'training_log.json')
CFG = dict(
    # Architecture (inherited from Fresnel v50)
    V=16, D=4, ps=4, hidden=384, depth=4, n_cross=2,
    stage_hidden=128, stage_V=64,
    # Training
    img_size=64,
    batch_size=256,
    lr=3e-4,
    epochs=50,
    ds_size=1280000,   # samples drawn per training epoch
    val_size=10000,    # validation dataset size
    # CV soft hand: recon loss is up-weighted when the measured coefficient
    # of variation sits near target_cv, and a penalty grows as it drifts.
    target_cv=0.2915,  # target CV — NOTE(review): provenance of this constant unclear; confirm
    cv_weight=0.3,     # scale of the CV-drift penalty term
    boost=0.5,         # max extra recon weight when CV is at target
    sigma=0.15,        # Gaussian width of the CV proximity kernel
    # Checkpointing
    save_every=5,          # numbered checkpoint every N epochs
    val_per_type_every=5,  # per-noise-type MSE readout every N epochs
)
# ═══════════════════════════════════════════════════════════════
# NOISE DATASET (16 types, same as Freckles)
# ═══════════════════════════════════════════════════════════════
# Index → human-readable name for the 16 noise families produced by _gen_noise.
# The dataset selects the family as idx % 16, so every family appears equally often.
NOISE_NAMES = {
    0: 'gaussian', 1: 'uniform', 2: 'uniform_sc', 3: 'poisson',
    4: 'pink', 5: 'brown', 6: 'salt_pepper', 7: 'sparse',
    8: 'block', 9: 'gradient', 10: 'checker', 11: 'mixed',
    12: 'structural', 13: 'cauchy', 14: 'exponential', 15: 'laplace',
}
def _pink(shape):
w = torch.randn(shape)
S = torch.fft.rfft2(w)
h, ww = shape[-2], shape[-1]
fy = torch.fft.fftfreq(h).unsqueeze(-1).expand(-1, ww // 2 + 1)
fx = torch.fft.rfftfreq(ww).unsqueeze(0).expand(h, -1)
return torch.fft.irfft2(S / torch.sqrt(fx**2 + fy**2).clamp(min=1e-8), s=(h, ww))
def _brown(shape):
w = torch.randn(shape)
S = torch.fft.rfft2(w)
h, ww = shape[-2], shape[-1]
fy = torch.fft.fftfreq(h).unsqueeze(-1).expand(-1, ww // 2 + 1)
fx = torch.fft.rfftfreq(ww).unsqueeze(0).expand(h, -1)
return torch.fft.irfft2(S / (fx**2 + fy**2).clamp(min=1e-8), s=(h, ww))
def _gen_noise(noise_type, s, rng):
if noise_type == 0: return torch.randn(3, s, s)
elif noise_type == 1: return torch.rand(3, s, s) * 2 - 1
elif noise_type == 2: return (torch.rand(3, s, s) - 0.5) * 4
elif noise_type == 3:
lam = rng.uniform(0.5, 20.0)
return torch.poisson(torch.full((3, s, s), lam)) / lam - 1.0
elif noise_type == 4:
img = _pink((3, s, s)); return img / (img.std() + 1e-8)
elif noise_type == 5:
img = _brown((3, s, s)); return img / (img.std() + 1e-8)
elif noise_type == 6:
return torch.where(torch.rand(3, s, s) > 0.5,
torch.ones(3, s, s) * 2, -torch.ones(3, s, s) * 2) + torch.randn(3, s, s) * 0.1
elif noise_type == 7:
return torch.randn(3, s, s) * (torch.rand(3, s, s) > 0.9).float() * 3
elif noise_type == 8:
b = rng.randint(2, max(3, s // 2))
sm = torch.randn(3, s // b + 1, s // b + 1)
return F.interpolate(sm.unsqueeze(0), size=s, mode='nearest').squeeze(0)
elif noise_type == 9:
gy = torch.linspace(-2, 2, s).unsqueeze(1).expand(s, s)
gx = torch.linspace(-2, 2, s).unsqueeze(0).expand(s, s)
a = rng.uniform(0, 2 * math.pi)
return (math.cos(a) * gx + math.sin(a) * gy).unsqueeze(0).expand(3, -1, -1) + torch.randn(3, s, s) * 0.5
elif noise_type == 10:
cs = rng.randint(2, max(3, s // 2))
cy = torch.arange(s) // cs; cx = torch.arange(s) // cs
return ((cy.unsqueeze(1) + cx.unsqueeze(0)) % 2).float().unsqueeze(0).expand(3, -1, -1) * 2 - 1 + torch.randn(3, s, s) * 0.3
elif noise_type == 11:
alpha = rng.uniform(0.2, 0.8)
return alpha * torch.randn(3, s, s) + (1 - alpha) * (torch.rand(3, s, s) * 2 - 1)
elif noise_type == 12:
img = torch.zeros(3, s, s); h2 = s // 2
img[:, :h2, :h2] = torch.randn(3, h2, h2)
img[:, :h2, h2:] = torch.rand(3, h2, h2) * 2 - 1
img[:, h2:, :h2] = _pink((3, h2, h2)) / 2
img[:, h2:, h2:] = torch.where(torch.rand(3, h2, h2) > 0.5,
torch.ones(3, h2, h2), -torch.ones(3, h2, h2))
return img
elif noise_type == 13:
return torch.tan(math.pi * (torch.rand(3, s, s) - 0.5)).clamp(-3, 3)
elif noise_type == 14:
return torch.empty(3, s, s).exponential_(1.0) - 1.0
elif noise_type == 15:
u = torch.rand(3, s, s) - 0.5
return -torch.sign(u) * torch.log1p(-2 * u.abs())
return torch.randn(3, s, s)
class OmegaNoiseDataset(torch.utils.data.Dataset):
    """Synthetic noise dataset: idx % 16 picks the family from NOISE_NAMES.

    Samples are generated on the fly, clamped to [-4, 4]. Both the numpy and
    torch RNGs are periodically reseeded from os.urandom so long runs do not
    cycle through identical samples.
    """

    def __init__(self, size=1280000, img_size=64):
        self.size = size
        self.img_size = img_size
        self._rng = np.random.RandomState(42)
        self._call_count = 0

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        self._call_count += 1
        # Every 1000 draws, reseed numpy (scalar params) and torch (tensors).
        if self._call_count % 1000 == 0:
            fresh_seed = int.from_bytes(os.urandom(4), 'big')
            self._rng = np.random.RandomState(fresh_seed)
            torch.manual_seed(int.from_bytes(os.urandom(4), 'big'))
        family = idx % 16
        sample = _gen_noise(family, self.img_size, self._rng).clamp(-4, 4)
        return sample.float(), family
def eval_per_type(model, img_size, device, n_per=32):
    """Compute reconstruction MSE separately for each of the 16 noise types.

    Uses a fixed-seed RandomState(99) so the evaluation batches are identical
    across epochs. Leaves the model in eval mode.
    Returns: dict {noise_type int -> mse float}.
    """
    rng = np.random.RandomState(99)
    model.eval()
    per_type = {}
    with torch.no_grad():
        for family in range(16):
            batch = [_gen_noise(family, img_size, rng).clamp(-4, 4)
                     for _ in range(n_per)]
            imgs = torch.stack(batch).to(device)
            recon = model(imgs)['recon']
            per_type[family] = F.mse_loss(recon, imgs).item()
    return per_type
# ═══════════════════════════════════════════════════════════════
# CONDUIT READOUTS
# ═══════════════════════════════════════════════════════════════
def conduit_readout(model, images, device):
    """Extract and summarize conduit telemetry from a batch.

    Runs a no-grad forward pass, then reads ``model.last_conduit_packet`` and
    reduces each field to JSON-serializable summary statistics (floats and
    per-mode lists) for logging.

    Args:
        model: PatchSVAEv2-style model that populates ``last_conduit_packet``
            during forward. # assumes packet fields flatten to B*N*D elements — TODO confirm
        images: image batch; moved onto ``device`` before the forward pass.
        device: torch device for inference.

    Returns:
        dict of scalar floats and per-mode float lists.
    """
    model.eval()
    with torch.no_grad():
        out = model(images.to(device))
        packet = model.last_conduit_packet
        S = out['svd']['S_orig']  # (B, N, D): per-patch singular values
        B, N, D = S.shape
        # Conduit fields arrive flat; reshape to align with S's (B, N, D) layout.
        friction = packet.friction.reshape(B, N, D)
        settle = packet.settle.reshape(B, N, D)
        char_coeffs = packet.char_coeffs.reshape(B, N, D)
        ext_order = packet.extraction_order.reshape(B, N, D)  # NOTE(review): unused below
        refine_res = packet.refinement_residual.reshape(B, N)
        # Log-friction for readable stats (raw friction can span orders of magnitude)
        log_fric = torch.log1p(friction)
        stats = {
            'S_mean': S.mean(dim=(0, 1)).cpu().tolist(),          # per-mode mean singular value
            'S_std': S.std(dim=(0, 1)).cpu().tolist(),
            'friction_mean': friction.mean().item(),
            'friction_max': friction.max().item(),
            'friction_std': friction.std().item(),
            'log_fric_mean': log_fric.mean(dim=(0, 1)).cpu().tolist(),
            'log_fric_std': log_fric.std(dim=(0, 1)).cpu().tolist(),
            'settle_mean': settle.mean(dim=(0, 1)).cpu().tolist(),
            'settle_frac_gt2': (settle > 2).float().mean().item(),  # fraction of slow-settling modes
            'char_coeffs_mean': char_coeffs.mean(dim=(0, 1)).cpu().tolist(),
            'refine_res_mean': refine_res.mean().item(),
            'refine_res_max': refine_res.max().item(),
        }
        # Per-mode friction spatial CV: how unevenly friction is distributed
        # across the N patches of each image, averaged over the batch.
        for d in range(D):
            per_img = friction[:, :, d].reshape(B, -1)
            cvs = per_img.std(dim=1) / (per_img.mean(dim=1) + 1e-8)
            stats[f'friction_spatial_cv_mode{d}'] = cvs.mean().item()
        return stats
def print_conduit_readout(stats, D=4):
    """Render one conduit telemetry dict (from conduit_readout) to stdout."""
    def joined(values, spec):
        # Comma-join a list of floats with a fixed format spec.
        return ', '.join(format(v, spec) for v in values)

    print(f" S: [{joined(stats['S_mean'], '.3f')}]")
    print(f" S_std: [{joined(stats['S_std'], '.4f')}]")
    print(f" log_fric: [{joined(stats['log_fric_mean'], '.3f')}] "
          f"Β± [{joined(stats['log_fric_std'], '.3f')}]")
    print(f" fric_raw: mean={stats['friction_mean']:.1f} max={stats['friction_max']:.0f}")
    print(f" settle: [{joined(stats['settle_mean'], '.2f')}] "
          f"(>{2}: {stats['settle_frac_gt2']:.1%})")
    print(f" char_c: [{joined(stats['char_coeffs_mean'], '.4f')}]")
    print(f" refine: mean={stats['refine_res_mean']:.2e} max={stats['refine_res_max']:.2e}")
    spatial_cvs = [stats.get(f'friction_spatial_cv_mode{d}', 0) for d in range(D)]
    print(f" fric_cv: [{joined(spatial_cvs, '.4f')}]")
# ═══════════════════════════════════════════════════════════════
# LOGGING
# ═══════════════════════════════════════════════════════════════
def load_log():
    """Return the persisted training log, or a fresh skeleton for this VERSION."""
    if not os.path.exists(LOG_PATH):
        return {'version': VERSION, 'entries': []}
    with open(LOG_PATH) as fh:
        return json.load(fh)
def save_log(log):
    """Persist the training log as indented JSON, creating LOCAL_DIR if needed."""
    log_dir = os.path.dirname(LOG_PATH)
    os.makedirs(log_dir, exist_ok=True)
    payload = json.dumps(log, indent=2)
    with open(LOG_PATH, 'w') as fh:
        fh.write(payload)
# ═══════════════════════════════════════════════════════════════
# SAVE & PUSH
# ═══════════════════════════════════════════════════════════════
def save_checkpoint(model, opt, sched, epoch, val_mse, log,
                    path, is_best=False):
    """Save a full training checkpoint locally, then best-effort push it to HF Hub.

    The checkpoint bundles the model/optimizer/scheduler state plus the
    architecture config needed to rebuild the model. The Hub push (numbered
    checkpoint, optional best.pt alias, and the JSON log) is wrapped in a
    try/except so training never dies on a network failure.

    Args:
        model / opt / sched: objects whose state_dicts are serialized.
        epoch: 1-based epoch number stored in the checkpoint.
        val_mse: validation MSE recorded alongside (used in commit messages).
        log: training log dict, saved via save_log before upload.
        path: local .pt destination; parent dirs are created.
        is_best: when True, also upload the file as <VERSION>/checkpoints/best.pt.
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)
    ckpt = {
        'config': {
            'V': CFG['V'], 'D': CFG['D'], 'patch_size': CFG['ps'],
            'hidden': CFG['hidden'], 'depth': CFG['depth'],
            'n_cross_layers': CFG['n_cross'],
            # .get defaults only apply if the key is absent from CFG
            'stage_hidden': CFG.get('stage_hidden', 128),
            'stage_V': CFG.get('stage_V', 16),
            'img_size': CFG['img_size'],
            'model_type': 'v2',
        },
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': opt.state_dict(),
        'scheduler_state_dict': sched.state_dict(),
        'epoch': epoch,
        'val_mse': val_mse,
    }
    torch.save(ckpt, path)
    size_mb = os.path.getsize(path) / (1024 * 1024)
    print(f" πŸ’Ύ {path} ({size_mb:.1f}MB, ep{epoch}, MSE={val_mse:.6f})")
    # Best-effort Hub sync: numbered checkpoint, optional best alias, then log.
    try:
        from huggingface_hub import HfApi
        api = HfApi()
        api.upload_file(
            path_or_fileobj=path,
            path_in_repo=f'{VERSION}/checkpoints/{os.path.basename(path)}',
            repo_id=HF_REPO, repo_type='model',
            commit_message=f'{VERSION} ep{epoch} mse={val_mse:.6f}')
        if is_best:
            api.upload_file(
                path_or_fileobj=path,
                path_in_repo=f'{VERSION}/checkpoints/best.pt',
                repo_id=HF_REPO, repo_type='model',
                commit_message=f'{VERSION} BEST ep{epoch} mse={val_mse:.6f}')
        save_log(log)
        api.upload_file(
            path_or_fileobj=LOG_PATH,
            path_in_repo=f'{VERSION}/training_log.json',
            repo_id=HF_REPO, repo_type='model',
            commit_message=f'{VERSION} log ep{epoch}')
        print(f" ☁️ Pushed ep{epoch}")
    except Exception as e:
        # Push failures are non-fatal; the local checkpoint already exists.
        print(f" ⚠️ Push failed: {e}")
# ═══════════════════════════════════════════════════════════════
# TRAINING
# ═══════════════════════════════════════════════════════════════
def train():
    """Train PatchSVAEv2 from random init on the 16-type noise dataset.

    Per epoch: train with MSE plus a CV "soft hand" weighting, run validation,
    snapshot spectral geometry (erank, s_delta) and conduit telemetry, append
    a log entry, and checkpoint (best.pt on improvement, numbered every
    CFG['save_every'] epochs).

    Returns:
        The trained model.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    os.makedirs(LOCAL_DIR, exist_ok=True)
    print("\n" + "=" * 70)
    print(f"SVAE v2 CONDUIT TRAINER β€” {VERSION}")
    print("=" * 70)
    # ── Fresh v2 model from random init ──
    D = CFG['D']
    model = PatchSVAEv2(
        V=CFG['V'], D=D, ps=CFG['ps'],
        hidden=CFG['hidden'], depth=CFG['depth'],
        n_cross=CFG['n_cross'],
        stage_hidden=CFG.get('stage_hidden', 128),
        stage_V=CFG.get('stage_V', 16),
    ).to(device)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"\n Fresh PatchSVAEv2 from random init")
    print(f" Total params: {n_params:,}")
    # ── Data ──
    print(f"\n Dataset: 16 noise types, {CFG['ds_size']:,} samples/epoch")
    print(f" Image size: {CFG['img_size']}Γ—{CFG['img_size']}")
    print(f" Batch size: {CFG['batch_size']}")
    train_ds = OmegaNoiseDataset(size=CFG['ds_size'], img_size=CFG['img_size'])
    val_ds = OmegaNoiseDataset(size=CFG['val_size'], img_size=CFG['img_size'])
    train_loader = torch.utils.data.DataLoader(
        train_ds, batch_size=CFG['batch_size'], shuffle=True,
        num_workers=4, pin_memory=True, drop_last=True)
    val_loader = torch.utils.data.DataLoader(
        val_ds, batch_size=CFG['batch_size'], shuffle=False,
        num_workers=4, pin_memory=True)
    # ── Optimizer (all params β€” full model training) ──
    opt = torch.optim.Adam(model.parameters(), lr=CFG['lr'])
    sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=CFG['epochs'])
    # CV soft hand hyper-parameters (see CFG for semantics)
    target_cv = CFG['target_cv']
    cv_weight = CFG['cv_weight']
    boost = CFG['boost']
    sigma = CFG['sigma']
    # Log
    log = load_log()
    best_mse = float('inf')
    # ── Initial conduit readout (random-init baseline for the telemetry) ──
    print(f"\n Initial conduit profile:")
    sample_batch = next(iter(val_loader))[0][:64]
    init_stats = conduit_readout(model, sample_batch, device)
    print_conduit_readout(init_stats, D)
    # ── Initial MSE (will be terrible β€” decoder is random) ──
    model.eval()
    init_mse = 0
    init_n = 0
    with torch.no_grad():
        for imgs, _ in val_loader:
            imgs = imgs.to(device)
            out = model(imgs)
            init_mse += F.mse_loss(out['recon'], imgs).item() * len(imgs)
            init_n += len(imgs)
            # Cap the baseline estimate at ~10 batches to keep startup fast
            if init_n >= 2560:
                break
    init_mse /= init_n
    print(f"\n Initial MSE (random decoder): {init_mse:.4f}")
    print("=" * 70)
    # ═══════════════════════════════════════════════════════════
    # TRAINING LOOP
    # ═══════════════════════════════════════════════════════════
    for epoch in range(1, CFG['epochs'] + 1):
        model.train()
        total_loss, total_recon, n = 0, 0, 0
        last_cv = target_cv  # seed with target until first measurement
        t0 = time.time()
        pbar = tqdm(train_loader,
                    desc=f"Ep {epoch}/{CFG['epochs']}",
                    bar_format='{l_bar}{bar:20}{r_bar}')
        for batch_idx, (images, _) in enumerate(pbar):
            images = images.to(device)
            opt.zero_grad()
            out = model(images)
            recon_loss = F.mse_loss(out['recon'], images)
            # CV soft hand: re-measure CV every 50 batches on one patch matrix
            with torch.no_grad():
                if batch_idx % 50 == 0:
                    cur_cv = cv_of(out['svd']['M'][0, 0])
                    if cur_cv > 0:
                        last_cv = cur_cv
            delta = last_cv - target_cv
            prox = math.exp(-delta**2 / (2 * sigma**2))  # 1 at target, →0 far away
            recon_w = 1.0 + boost * prox
            # NOTE(review): last_cv is measured under no_grad, so the cv_pen
            # term is a constant w.r.t. autograd — only recon_w affects
            # gradients. Confirm this is intended ("soft hand", not a loss).
            cv_pen = cv_weight * (1.0 - prox)
            loss = recon_w * recon_loss + cv_pen * (last_cv - target_cv)**2
            loss.backward()
            # Clip only the cross-attention block — presumably the unstable
            # part of the model; verify against model_v2. Other params unclipped.
            torch.nn.utils.clip_grad_norm_(
                model.cross_attn.parameters(), max_norm=0.5)
            opt.step()
            total_loss += loss.item() * len(images)
            total_recon += recon_loss.item() * len(images)
            n += len(images)
            pbar.set_postfix_str(f"mse={recon_loss.item():.4f} cv={last_cv:.3f}")
        sched.step()  # cosine LR decay, stepped once per epoch
        epoch_time = time.time() - t0
        # ── Validation ──
        model.eval()
        val_mse, val_n = 0, 0
        with torch.no_grad():
            for imgs, _ in val_loader:
                imgs = imgs.to(device)
                out = model(imgs)
                val_mse += F.mse_loss(out['recon'], imgs).item() * len(imgs)
                val_n += len(imgs)
        val_mse /= val_n
        # ── Geometry snapshot ──
        with torch.no_grad():
            sample = next(iter(val_loader))[0][:64].to(device)
            out = model(sample)
            S_mean = out['svd']['S_orig'].mean(dim=(0, 1))
            S_coord = out['svd']['S'].mean(dim=(0, 1))  # NOTE(review): unused below
            erank = model.effective_rank(
                out['svd']['S'].reshape(-1, D)).mean().item()
            s_delta = model.s_delta(out['svd']['S_orig'], out['svd']['S'])
        # ── Conduit readout ──
        # NOTE(review): sample is moved back to CPU only to be re-moved to
        # device inside conduit_readout — a round trip, but harmless.
        cond_stats = conduit_readout(model, sample.cpu(), device)
        # ── Print ──
        is_best = val_mse < best_mse
        if is_best:
            best_mse = val_mse
        print(f"\n ep{epoch:3d} | recon={total_recon/n:.4f} val={val_mse:.4f} "
              f"{'β˜… BEST' if is_best else ''} | "
              f"er={erank:.2f} Sd={s_delta:.4f} cv={last_cv:.3f} | {epoch_time:.0f}s")
        print_conduit_readout(cond_stats, D)
        # ── Per-type eval (always on the first 3 epochs, then periodically) ──
        if epoch % CFG['val_per_type_every'] == 0 or epoch <= 3:
            type_mse = eval_per_type(model, CFG['img_size'], device)
            type_str = " ".join(
                f"{NOISE_NAMES[t][:4]}={v:.3f}" for t, v in sorted(type_mse.items()))
            print(f" types: {type_str}")
        # ── Log entry ──
        # NOTE(review): assumes cv_of / s_delta return plain Python floats;
        # if they return 0-dim tensors, json serialization in save_log fails.
        log['entries'].append({
            'epoch': epoch,
            'train_mse': total_recon / n,
            'val_mse': val_mse,
            'cv': last_cv,
            'erank': erank,
            's_delta': s_delta,
            'S_mean': S_mean.cpu().tolist(),
            'conduit': cond_stats,
            'epoch_time': epoch_time,
            'lr': opt.param_groups[0]['lr'],
        })
        # ── Checkpoint ──
        if is_best:
            save_checkpoint(model, opt, sched, epoch, val_mse, log,
                            os.path.join(LOCAL_DIR, 'best.pt'),
                            is_best=True)
        if epoch % CFG['save_every'] == 0:
            save_checkpoint(model, opt, sched, epoch, val_mse, log,
                            os.path.join(LOCAL_DIR, f'epoch_{epoch:04d}.pt'))
    # ═══════════════════════════════════════════════════════════
    # DONE
    # ═══════════════════════════════════════════════════════════
    print(f"\n{'=' * 70}")
    print(f"v2 CONDUIT TRAINING COMPLETE β€” {VERSION}")
    print(f" Best MSE: {best_mse:.6f}")
    print(f" Epochs: {CFG['epochs']}")
    print(f" Params: {n_params:,}")
    print(f"{'=' * 70}")
    return model
if __name__ == "__main__":
torch.set_float32_matmul_precision('high')
train()