# Prisma / graft_g2lu.py — y3i12, initial commit (56e82ec)
"""
G²LU Gate Grafting: Surgically upgrade pretrained SwiGLU models to G²LU.
Takes any HuggingFace model with SwiGLU (gate_proj + up_proj), freezes everything
except gate weights, adds W4 for nested gating, and trains with alignment + LM loss.
This is grafting applied to the gate mechanism — the same methodology validated for
full layer replacement, now targeting the minimum surgical unit.
Usage:
python -m circuits.train --arch graft_g2lu --pretrained meta-llama/Llama-3.2-1B \
--align-weight 1.0 --graft-warmup 500 --data hf:Bingsu/openwebtext_20p ...
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
class G2LU_MLP(nn.Module):
    """SwiGLU → G²LU upgrade for a single MLP block.

    Keeps the pretrained projections — gate_proj (W3), up_proj (W1),
    down_proj (W2) — untouched (the parent wrapper freezes them) and adds one
    new zero-initialized linear layer ``w4`` that nests inside the pretrained
    gate:

        gate = silu(W4(x) * silu(W3(x)))

    Because W4's output is multiplied by the frozen silu(W3(x)), its gradients
    are scaled by the pretrained gate activations, so the new gate learns
    within the feature subspace the base model already selects. ``_alpha``
    blends old and new gates (0 = original SwiGLU, 1 = full G²LU) for a smooth
    warmup; ``_align_loss`` holds the latest per-forward MSE between the
    blended gate and the original gate, collected by the parent wrapper.
    """

    def __init__(self, original_mlp: nn.Module):
        super().__init__()
        # Hold references to the pretrained projections; the parent freezes them.
        self.gate_proj = original_mlp.gate_proj  # W3 — pretrained gate
        self.up_proj = original_mlp.up_proj      # W1 — pretrained up projection
        self.down_proj = original_mlp.down_proj  # W2 — pretrained down projection
        # New trainable gate W4: mirrors gate_proj's shape, bias, dtype, device.
        self.w4 = nn.Linear(
            self.gate_proj.in_features,
            self.gate_proj.out_features,
            bias=self.gate_proj.bias is not None,
            dtype=self.gate_proj.weight.dtype,
            device=self.gate_proj.weight.device,
        )
        # Zero init: at alpha=0 the module reproduces the original SwiGLU exactly.
        for p in self.w4.parameters():
            nn.init.zeros_(p)
        self._alpha = 0.0        # blend: 0 = pure SwiGLU, 1 = full G²LU
        self._align_loss = None  # most recent per-layer alignment loss

    def forward(self, x):
        prior = F.silu(self.gate_proj(x))    # pretrained gate — structural prior
        nested = F.silu(self.w4(x) * prior)  # G²LU gate, modulated BY the prior
        # Warmup blend: smooth transition SwiGLU → G²LU as alpha ramps to 1.
        if self._alpha >= 1.0:
            gate = nested
        else:
            gate = (1.0 - self._alpha) * prior + self._alpha * nested
        # Alignment target is the original gate, detached so it acts as a constant.
        self._align_loss = F.mse_loss(gate, prior.detach())
        return self.down_proj(gate * self.up_proj(x))
class G2LU_GraftedModel(nn.Module):
    """Full model wrapper that upgrades a pretrained HF model's MLPs to G²LU.
    Interface matches CircuitTransformer: forward(input_ids, labels=labels) returns
    {"loss", "logits", "align_loss"}.
    """
    def __init__(
        self,
        pretrained_name: str,
        align_weight: float = 1.0,
        warmup_steps: int = 500,
        device: str = "cuda",
        dtype=torch.bfloat16,
    ):
        """Load the pretrained model, replace its SwiGLU MLPs with G²LU
        wrappers, and freeze everything except the new W4 projections.

        Args:
            pretrained_name: HF hub id passed to AutoModelForCausalLM.
            align_weight: multiplier on the alignment loss added in forward().
            warmup_steps: steps over which the SwiGLU→G²LU blend ramps to 1.
            device: device the whole model is moved to after the surgery.
            dtype: load dtype for from_pretrained.
                NOTE(review): the `dtype=` kwarg requires a recent transformers
                release; older versions spell it `torch_dtype=` — confirm the
                pinned version.
        """
        super().__init__()
        self.pretrained_name = pretrained_name
        self.align_weight = align_weight
        self.warmup_steps = warmup_steps
        self._current_step = 0
        # Load pretrained HF model (local import keeps transformers optional
        # until a graft is actually constructed).
        from transformers import AutoModelForCausalLM
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_name,
            dtype=dtype,
            trust_remote_code=True,
        )
        # Discover and replace MLPs (must happen BEFORE the freeze below so the
        # new w4 layers exist when we flip requires_grad flags).
        self.g2lu_mlps = []
        self._replace_mlps()
        # Freeze everything, then selectively unfreeze W4 only.
        for param in self.model.parameters():
            param.requires_grad = False
        for g2lu in self.g2lu_mlps:
            for param in g2lu.w4.parameters():
                param.requires_grad = True
        self.model.to(device)
        # Print summary
        total_params = sum(p.numel() for p in self.model.parameters())
        trainable = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"G²LU Graft: {pretrained_name}")
        print(f"  Layers upgraded: {len(self.g2lu_mlps)}")
        print(f"  Total params: {total_params:,} ({total_params/1e6:.1f}M)")
        print(f"  Trainable params: {trainable:,} ({trainable/1e6:.1f}M, {100*trainable/total_params:.1f}%)")
        print(f"  Align weight: {align_weight}, Warmup: {warmup_steps} steps")

    def _replace_mlps(self):
        """Walk the model tree and replace SwiGLU MLPs with G²LU wrappers."""
        # Try common decoder layer paths (Llama-style, GPT-NeoX-style, GPT-2-style).
        layers = None
        for attr_path in ["model.layers", "gpt_neox.layers", "transformer.h"]:
            obj = self.model
            try:
                for attr in attr_path.split("."):
                    obj = getattr(obj, attr)
                layers = obj
                break
            except AttributeError:
                continue
        if layers is None:
            raise ValueError(
                f"Could not find decoder layers in {type(self.model).__name__}. "
                f"Tried: model.layers, gpt_neox.layers, transformer.h"
            )
        for i, layer in enumerate(layers):
            # Try common MLP attribute names.
            mlp = None
            mlp_attr = None
            for attr in ["mlp", "feed_forward"]:
                if hasattr(layer, attr):
                    mlp = getattr(layer, attr)
                    mlp_attr = attr
                    break
            if mlp is None:
                continue
            # Check for SwiGLU signature (gate_proj + up_proj); layers without
            # it (e.g. non-gated MLPs) are left untouched.
            if hasattr(mlp, "gate_proj") and hasattr(mlp, "up_proj"):
                g2lu = G2LU_MLP(mlp)
                setattr(layer, mlp_attr, g2lu)
                self.g2lu_mlps.append(g2lu)
        if not self.g2lu_mlps:
            raise ValueError(
                "No SwiGLU MLPs found (need gate_proj + up_proj attributes). "
                "This model may not use gated linear units."
            )

    def set_step(self, step: int):
        """Update blend alpha across all G²LU MLPs."""
        self._current_step = step
        # Linear ramp: alpha goes 0 → 1 over warmup_steps, then saturates at 1.
        alpha = min(step / max(self.warmup_steps, 1), 1.0)
        for g2lu in self.g2lu_mlps:
            g2lu._alpha = alpha

    def trainable_parameters(self):
        """Yield only unfrozen parameters (for optimizer and grad clipping)."""
        for param in self.model.parameters():
            if param.requires_grad:
                yield param

    def collect_align_loss(self):
        """Average per-layer alignment losses recorded by the last forward()."""
        losses = [g2lu._align_loss for g2lu in self.g2lu_mlps if g2lu._align_loss is not None]
        if not losses:
            # NOTE(review): this fallback lives on the CPU default device; if a
            # caller adds it to a CUDA loss, confirm the device promotion is ok.
            return torch.tensor(0.0)
        return torch.stack(losses).mean()

    def forward(self, input_ids, labels=None, **kwargs):
        """Run the wrapped HF model and attach the grafting losses.

        Returns a dict with "logits", "align_loss", and "loss"; when labels are
        given, "loss" is lm_loss + align_weight * align_loss and "lm_loss" is
        also included. Without labels, "loss" is the alignment loss alone.
        """
        outputs = self.model(input_ids=input_ids, labels=labels, **kwargs)
        result = {"logits": outputs.logits}
        align_loss = self.collect_align_loss()
        result["align_loss"] = align_loss
        if labels is not None:
            # Combine LM loss + alignment loss.
            result["loss"] = outputs.loss + self.align_weight * align_loss
            result["lm_loss"] = outputs.loss
        else:
            result["loss"] = align_loss
        return result

    def generate(self, input_ids, **kwargs):
        """Delegate to HF model's .generate()."""
        return self.model.generate(input_ids=input_ids, **kwargs)
def save_g2lu_checkpoint(
model: G2LU_GraftedModel,
optimizer: torch.optim.Optimizer,
step: int,
epoch: int,
loss: float,
path: str,
epoch_step: int = 0,
best_val_loss: float | None = None,
scaler=None,
tokenizer_name: str = None,
):
"""Delta save: only trainable params + metadata."""
# Extract only requires_grad params
raw = model.model if not hasattr(model, '_orig_mod') else model._orig_mod.model
# Handle torch.compile wrapper
if hasattr(model, '_orig_mod'):
g2lu_model = model._orig_mod
else:
g2lu_model = model
delta_sd = {}
full_sd = g2lu_model.model.state_dict()
for name, param in g2lu_model.model.named_parameters():
if param.requires_grad:
# Strip _orig_mod. prefix if present
clean_name = name.removeprefix("_orig_mod.")
delta_sd[clean_name] = full_sd.get(name, param.data).clone()
# Also save the w4 weights explicitly (they're part of the replaced modules)
for name, val in full_sd.items():
clean_name = name.removeprefix("_orig_mod.")
if ".w4." in clean_name and clean_name not in delta_sd:
delta_sd[clean_name] = val.clone()
checkpoint = {
"model": delta_sd,
"optimizer": optimizer.state_dict(),
"step": step,
"epoch": epoch,
"epoch_step": epoch_step,
"loss": loss,
"model_type": "graft_g2lu",
"pretrained_name": g2lu_model.pretrained_name,
"align_weight": g2lu_model.align_weight,
"warmup_steps": g2lu_model.warmup_steps,
"tokenizer_name": tokenizer_name or g2lu_model.pretrained_name,
}
if best_val_loss is not None:
checkpoint["best_val_loss"] = best_val_loss
if scaler is not None:
checkpoint["scaler"] = scaler.state_dict()
torch.save(checkpoint, path)
def load_g2lu_model(checkpoint_path: str, device: str = "cuda", dtype=torch.bfloat16):
    """Delta load: rebuild the grafted model from its base + saved W4 deltas."""
    # NOTE: weights_only=False unpickles arbitrary objects — only load
    # checkpoints from trusted sources.
    ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    warmup = ckpt.get("warmup_steps", 500)
    # Recreate the grafted model from the recorded base checkpoint + settings.
    model = G2LU_GraftedModel(
        pretrained_name=ckpt["pretrained_name"],
        align_weight=ckpt.get("align_weight", 1.0),
        warmup_steps=warmup,
        device=device,
        dtype=dtype,
    )
    # Normalize key names (drop any torch.compile prefix) and apply the deltas.
    deltas = {k.removeprefix("_orig_mod."): v for k, v in ckpt["model"].items()}
    _, unexpected = model.model.load_state_dict(deltas, strict=False)
    if unexpected:
        print(f"  Warning: unexpected keys in delta checkpoint: {unexpected[:5]}...")
    # Past the warmup horizon alpha is 1.0 — full G²LU for inference.
    model.set_step(warmup + 1)
    return model