AbstractPhil
/

geolip-hypersphere-experiments

TensorBoard

Model card Files Files and versions

xet

Metrics Training metrics Community

AbstractPhil committed on Mar 20

Commit

e50ad6c

verified ·

1 Parent(s): cc4f091

Update constellation.py

Browse files

Files changed (1) hide show

constellation.py +467 -292

constellation.py CHANGED Viewed

@@ -1,39 +1,32 @@
 """
-Constellation — Unified Geometric Observer + Interpreter
-==========================================================
-Configurable implementation covering all validated constellation forms.
-PROVEN RESULTS:
-  Form 1 (Core):   91.5% CIFAR-10 @ 1.6M params, CV=0.2045
-  Form 5 (Relay):  cos_to_orig=0.994 @ depth 16, 8.4× faster than attn @ 131K
-  Hybrid:          88.0% CIFAR-10 @ 23.5M (conv encoder + constellation)
-  Scattering v1:   81.9% CIFAR-10 @ 17M (frozen scattering + constellation)
-UNIVERSAL RULES (empirically validated):
-  - SquaredReLU in all constellation paths, never GELU
-  - Patchwork: Linear(in, in*2) → SquaredReLU → LN → Linear(in*2, out)
-  - Gate init: -3.0 (sigmoid ≈ 0.047) for relay/residual forms
-  - SLERP: acos in fp32, everything else in compute dtype
-  - Adam, NO weight decay — geometry IS regularization
-  - InfoNCE is alignment FORCE, Procrustes is REGULARIZER
-  - CV loss on the BOTTLENECK, weight 0.001 or below
-  - Anchor dropout (30%) prevents collapse in high-anchor configs
-FORMS:
-  Constellation       — observation + interpretation, configurable
-  ConstellationRelay  — per-token geometric layer with gated residual
 Usage:
-    from constellation import Constellation, ConstellationRelay
-    # Form 1 (Core): single vector per image
-    c = Constellation(n_anchors=16, dim=16, n_directions=8,
-                      d_comp=64, n_phases=3)
-    output = c(directions)  # (B, 8, 16) → ConstellationOutput
-    # Form 5 (Relay): per-token processing
-    r = ConstellationRelay(dim=256, patch_dim=16, n_anchors=16)
-    out = r(tokens)  # (B, S, 256) → (B, S, 256)
 """
 import torch
@@ -41,32 +34,58 @@ import torch.nn as nn
 import torch.nn.functional as F
 import math
 from dataclasses import dataclass
-from typing import Optional
 # ══════════════════════════════════════════════════════════════════
-# ACTIVATION
 # ══════════════════════════════════════════════════════════════════
 class SquaredReLU(nn.Module):
-    """x → ReLU(x)². Proven superior to GELU in all constellation paths."""
     def forward(self, x):
         return F.relu(x) ** 2
 # ══════════════════════════════════════════════════════════════════
 # ANCHOR INITIALIZATION
 # ══════════════════════════════════════════════════════════════════
 def init_anchors_xavier(n, d):
-    """Xavier normal → normalize. Near-orthogonal in high-d. Used in Core."""
     w = torch.empty(n, d)
     nn.init.xavier_normal_(w)
     return F.normalize(w, dim=-1)
 def init_anchors_orthogonal(n, d):
-    """QR decomposition → exact orthonormal basis. Used when n <= d."""
     if n <= d:
         M = torch.randn(d, n)
         Q, _ = torch.linalg.qr(M)
@@ -80,7 +99,7 @@ def init_anchors_orthogonal(n, d):
 def init_anchors_repulsion(n, d, iters=200, lr=0.05):
-    """QR + iterative repulsion for even coverage beyond d anchors."""
     vecs = init_anchors_orthogonal(n, d)
     vecs = F.normalize(vecs, dim=-1)
     for _ in range(iters):
@@ -99,213 +118,330 @@ INIT_METHODS = {
 # ══════════════════════════════════════════════════════════════════
-# OUTPUT
 # ══════════════════════════════════════════════════════════════════
-@dataclass
-class ConstellationOutput:
-    """Full output from constellation forward pass."""
-    embedding: torch.Tensor       # (B, pw_dim) — interpreted observation
-    cosines: torch.Tensor         # (B, N, A) or (B, N, A*phases)
-    distances: torch.Tensor       # (B, N, A) or (B, N, A*phases)
-    nearest: torch.Tensor         # (B, N) — collapsed anchor assignment
-    directions: torch.Tensor      # (B, N, D) — input directions on S^(D-1)
-    tri_flat: torch.Tensor        # (B, tri_dim) — flattened triangulation
 # ══════════════════════════════════════════════════════════════════
-# CONSTELLATION — observation + interpretation
 # ══════════════════════════════════════════════════════════════════
-class Constellation(nn.Module):
-    """Geometric observer with anchor-aligned interpretation.
-    Anchors on S^(D-1) observe input directions via triangulation.
-    Compartments interpret per-anchor observations.
-    SLERP phases provide multi-scale angular measurement.
-    All coupled through gradient flow.
     Args:
-        n_anchors: reference directions on S^(D-1)
-        dim: anchor/direction dimensionality
-        n_directions: input directions per sample
         d_comp: hidden dim per compartment
-        n_phases: SLERP interpolation phases (1=static, 3=proven default)
-        anchor_init: 'xavier', 'orthogonal', or 'repulsion'
-        anchor_dropout: fraction of anchors to drop during training (0.3 for soup)
-        compartment: 'aligned' (one per anchor) or 'flat' (single patchwork)
     """
     def __init__(
         self,
-        n_anchors: int,
-        dim: int,
-        n_directions: int,
-        d_comp: int = 64,
-        n_phases: int = 3,
-        anchor_init: str = 'xavier',
-        anchor_dropout: float = 0.0,
-        compartment: str = 'aligned',
     ):
         super().__init__()
-        self.n_anchors = n_anchors
         self.dim = dim
-        self.n_directions = n_directions
-        self.d_comp = d_comp
-        self.n_phases = n_phases
-        self.anchor_dropout = anchor_dropout
-        self.compartment_type = compartment
-        # Anchors: home (frozen) + current (learned)
-        init_fn = INIT_METHODS[anchor_init]
-        home = init_fn(n_anchors, dim)
-        self.register_buffer('home', home)
-        self.anchors = nn.Parameter(home.clone())
-        # Triangulation dimensions
-        if compartment == 'aligned':
-            # tri: (B, N, A * phases) → each compartment reads its anchor's column
-            self.tri_dim = n_directions * n_anchors * n_phases
-            self.embedding_dim = n_anchors * d_comp
-            # One compartment per anchor — reads tri[:, :, k] across all phases
-            # Input: n_directions * n_phases values per anchor
-            comp_in = n_directions * n_phases
-            self.compartments = nn.ModuleList([
-                nn.Sequential(
-                    nn.Linear(comp_in, d_comp * 2),
-                    SquaredReLU(),
-                    nn.Linear(d_comp * 2, d_comp),
-                    nn.LayerNorm(d_comp),
-                ) for _ in range(n_anchors)
-            ])
-        elif compartment == 'flat':
-            # tri: (B, tri_dim) → single patchwork MLP
-            self.tri_dim = n_directions * n_anchors * n_phases
-            self.embedding_dim = dim
-            self.patchwork = nn.Sequential(
-                nn.Linear(self.tri_dim, self.tri_dim * 2),
-                SquaredReLU(),
-                nn.LayerNorm(self.tri_dim * 2),
-                nn.Linear(self.tri_dim * 2, dim),
-            )
-        else:
-            raise ValueError(f"Unknown compartment type: {compartment}")
-        self._init_weights()
-    def _init_weights(self):
-        for m in self.modules():
-            if isinstance(m, nn.Linear):
-                nn.init.trunc_normal_(m.weight, std=0.02)
-                if m.bias is not None:
-                    nn.init.zeros_(m.bias)
-            elif isinstance(m, nn.LayerNorm):
-                nn.init.ones_(m.weight)
-                nn.init.zeros_(m.bias)
-    def drift(self):
-        """Geodesic distance between home and learned anchor positions."""
-        h = F.normalize(self.home.float(), dim=-1)
-        c = F.normalize(self.anchors.float(), dim=-1)
-        return torch.acos((h * c).sum(-1).clamp(-1 + 1e-6, 1 - 1e-6))
-    def at_phase(self, t):
-        """SLERP between home and learned positions at phase t ∈ [0, 1]."""
-        h = F.normalize(self.home.float(), dim=-1)
-        c = F.normalize(self.anchors.float(), dim=-1)
-        omega = self.drift().unsqueeze(-1)  # (A, 1)
-        so = omega.sin().clamp(min=1e-6)
-        return torch.sin((1 - t) * omega) / so * h + torch.sin(t * omega) / so * c
-    def _triangulate(self, directions, anchors):
-        """(B, N, D) × (A, D) → (B, N, A) cosines and distances."""
-        cos = torch.einsum('bnd,ad->bna', directions, anchors)
-        return cos, 1.0 - cos
-    def forward(self, directions: torch.Tensor) -> ConstellationOutput:
-        """Observe and interpret.
         Args:
-            directions: (B, N, D) — L2-normalized to S^(D-1)
         Returns:
-            ConstellationOutput
         """
-        B, N, D = directions.shape
-        # Multi-phase triangulation
-        phases = torch.linspace(0, 1, self.n_phases, device=directions.device).tolist()
-        all_cos = []
-        all_dist = []
-        for t in phases:
-            anchors_t = F.normalize(self.at_phase(t), dim=-1).to(directions.dtype)
-            # Anchor dropout during training
-            if self.training and self.anchor_dropout > 0:
-                mask = torch.rand(anchors_t.shape[0], device=anchors_t.device) > self.anchor_dropout
-                if mask.sum() < 2:
-                    mask[:2] = True
-                anchors_t = anchors_t[mask]
-            cos, dist = self._triangulate(directions, anchors_t)
-            all_cos.append(cos)
-            all_dist.append(dist)
-        # Stack phases: (B, N, A*phases) if no dropout, variable if dropout
-        cos_cat = torch.cat(all_cos, dim=-1)
-        dist_cat = torch.cat(all_dist, dim=-1)
-        # Nearest anchor (from phase 0, no dropout)
-        anchors_0 = F.normalize(self.at_phase(0.0), dim=-1).to(directions.dtype)
-        cos_0 = torch.einsum('bnd,ad->bna', directions, anchors_0)
-        nearest = cos_0.max(dim=-1).indices
-        # Interpret
-        if self.compartment_type == 'aligned' and not (self.training and self.anchor_dropout > 0):
-            # dist_cat: (B, N, A * n_phases)
-            # Reshape to (B, N, n_phases, A) then (B, A, N * n_phases)
-            A = self.n_anchors
-            dist_reshape = dist_cat.reshape(B, N, self.n_phases, A)
-            # For compartment k: gather distances to anchor k across all directions and phases
-            # dist_reshape[:, :, :, k] → (B, N, n_phases) → flatten → (B, N*n_phases)
-            parts = []
-            for k in range(A):
-                comp_input = dist_reshape[:, :, :, k].reshape(B, N * self.n_phases)
-                parts.append(self.compartments[k](comp_input))
-            embedding = torch.cat(parts, dim=-1)  # (B, A * d_comp)
-        elif self.compartment_type == 'flat' or (self.training and self.anchor_dropout > 0):
-            tri_flat = dist_cat.reshape(B, -1)
-            if self.compartment_type == 'flat':
-                embedding = self.patchwork(tri_flat)
-            else:
-                # Fallback for aligned + dropout: pad and use compartments
-                # This is a training-only path
-                embedding = torch.zeros(B, self.embedding_dim,
-                                        device=directions.device, dtype=directions.dtype)
-                # Use flat mean as fallback during dropout
-                for k in range(self.n_anchors):
-                    comp_in_size = self.n_directions * self.n_phases
-                    if tri_flat.shape[1] >= comp_in_size:
-                        chunk = tri_flat[:, :comp_in_size]
-                    else:
-                        chunk = F.pad(tri_flat, (0, comp_in_size - tri_flat.shape[1]))
-                    embedding[:, k * self.d_comp:(k + 1) * self.d_comp] = self.compartments[k](chunk)
-        else:
-            tri_flat = dist_cat.reshape(B, -1)
-            embedding = self.patchwork(tri_flat)
-        tri_flat = dist_cat.reshape(B, -1)
-        return ConstellationOutput(
-            embedding=embedding,
-            cosines=cos_cat,
-            distances=dist_cat,
-            nearest=nearest,
-            directions=directions,
-            tri_flat=tri_flat,
-        )
 # ══════════════════════════════════════════════════════════════════
@@ -313,67 +449,51 @@ class Constellation(nn.Module):
 # ══════════════════════════════════════════════════════════════════
 class ConstellationRelay(nn.Module):
-    """Per-token geometric processing layer with gated residual.
-    Replaces attention as a per-token processing layer.
-    O(S) complexity. No cross-token interaction.
-    Preserves 99.4% cosine similarity to input at depth 16.
     Pipeline:
-      LayerNorm → chunk D into patches → L2 norm per patch
-      → Constellation observation + interpretation
-      → Project back to D → gated residual
     Args:
-        dim: token dimension (must be divisible by patch_dim)
-        patch_dim: dimension per patch subspace (default 16)
-        n_anchors: anchors per patch subspace
         d_comp: hidden dim per compartment
-        n_phases: SLERP phases
-        gate_init: initial gate bias (default -3.0 → sigmoid ≈ 0.047)
         anchor_init: initialization method
     """
     def __init__(
         self,
-        dim: int,
-        patch_dim: int = 16,
-        n_anchors: int = 16,
-        d_comp: int = 64,
-        n_phases: int = 3,
-        gate_init: float = -3.0,
-        anchor_init: str = 'xavier',
     ):
         super().__init__()
-        assert dim % patch_dim == 0
         self.dim = dim
-        self.patch_dim = patch_dim
-        self.n_patches = dim // patch_dim
         self.norm = nn.LayerNorm(dim)
-        # Constellation operates on (B*S, n_patches, patch_dim)
         self.constellation = Constellation(
-            n_anchors=n_anchors,
-            dim=patch_dim,
-            n_directions=self.n_patches,
-            d_comp=d_comp,
-            n_phases=n_phases,
-            anchor_init=anchor_init,
-            compartment='aligned',
-        )
-        # Project constellation embedding back to token dim
-        self.proj = nn.Linear(self.constellation.embedding_dim, dim)
-        # Gated residual — init at -3.0 so gate starts near 0
         self.gate = nn.Parameter(torch.full((dim,), gate_init))
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        x: (B, S, D) or (B, D)
-        Returns: same shape as input
-        """
         squeeze = False
         if x.dim() == 2:
             x = x.unsqueeze(1)
@@ -382,21 +502,14 @@ class ConstellationRelay(nn.Module):
         B, S, D = x.shape
         residual = x
-        # Normalize
         h = self.norm(x)
-        # Chunk into patches and normalize to S^(patch_dim-1)
-        h_flat = h.reshape(B * S, self.n_patches, self.patch_dim)
         h_flat = F.normalize(h_flat, dim=-1)
-        # Constellation: observe + interpret
-        output = self.constellation(h_flat)
-        # Project back to token dim
-        update = self.proj(output.embedding)  # (B*S, D)
-        update = update.reshape(B, S, D)
-        # Gated residual
         g = torch.sigmoid(self.gate)
         out = residual + g * update
@@ -406,11 +519,11 @@ class ConstellationRelay(nn.Module):
 # ══════════════════════════════════════════════════════════════════
-# GEOMETRIC OPS — measurement tools
 # ══════════════════════════════════════════════════════════════════
 class GeometricOps:
-    """Static geometric utilities for constellation monitoring and loss."""
     @staticmethod
     def cayley_menger_vol2(points):
@@ -428,6 +541,7 @@ class GeometricOps:
         return sign * torch.linalg.det(cm.float()).to(points.dtype) / ((2 ** k) * (fact ** 2))
     @staticmethod
     def cv_metric(emb, n_samples=200, n_points=5):
         """Non-differentiable CV for monitoring. Target band: 0.20–0.23."""
         vols = []
@@ -442,35 +556,96 @@ class GeometricOps:
         return (vols_t.std() / (vols_t.mean() + 1e-8)).item()
     @staticmethod
-    def cv_loss(emb, target=0.22, n_samples=100, n_points=5):
-        """Differentiable CV loss. Weight: 0.001 or below."""
         vols = []
         for _ in range(n_samples):
-            idx = torch.randperm(min(emb.shape[0], 512))[:n_points]
-            v2 = GeometricOps.cayley_menger_vol2(emb[idx].unsqueeze(0))
-            if v2[0] > 1e-20:
-                vols.append(v2[0].sqrt())
         if len(vols) < 5:
             return torch.tensor(0.0, device=emb.device)
-        vols_t = torch.stack(vols)
-        cv = vols_t.std() / (vols_t.mean() + 1e-8)
         return (cv - target).pow(2)
     @staticmethod
     def anchor_spread_loss(anchors, target_cos=0.0):
-        """Repulsion loss keeping anchors spread on the sphere."""
         a = F.normalize(anchors, dim=-1)
         sim = a @ a.T
         mask = ~torch.eye(a.shape[0], dtype=torch.bool, device=a.device)
         return F.relu(sim[mask] - target_cos).mean()
     @staticmethod
-    def diagnostics(output: ConstellationOutput, n_anchors: int) -> dict:
-        """Compute diagnostic metrics."""
-        diag = {}
-        diag['n_active'] = output.nearest.flatten().unique().numel()
-        counts = torch.bincount(output.nearest.flatten(), minlength=n_anchors).float()
-        diag['anchor_util_std'] = counts.std().item()
-        diag['nearest_cos'] = output.cosines[:, :, :n_anchors].max(dim=-1).values.mean().item()
-        diag['mean_tri'] = output.distances.mean().item()
-        return diag

 """
+Constellation — Geometric Observer + Interpreter
+===================================================
+Aligned to the proven GeoLIP Core trainer (91.2% CIFAR-10 @ 1.65M params).
+Architecture:
+  emb @ anchors.T → 64 distances → 8 round-robin compartments → cat(pw, emb) → classifier
+Key mechanisms:
+  - Round-robin compartments: 8 groups of 8 anchors, diverse measurements per group
+  - cat(patchwork, embedding): classifier sees both interpreted distances AND raw position
+  - Anchor push: direct centroid placement every N batches (self-distillation across time)
+  - Attraction loss: pulls embeddings toward nearest anchor
+  - InfoNCE on two views: alignment force
+  - Simple triangulation: emb @ anchors.T, no SLERP, no phases
+Classes:
+  Constellation      — triangulation against anchors on S^(d-1)
+  Patchwork          — round-robin compartmentalized interpretation
+  ConstellationCore  — full pipeline: constellation + patchwork + classifier
+  GeometricOps       — CV, spread, Cayley-Menger utilities
+  GeometricAutograd  — Form 12 manifold-aware gradient correction
 Usage:
+    from constellation import ConstellationCore
+    model = ConstellationCore(num_classes=10, dim=192, n_anchors=64)
+    emb = F.normalize(features, dim=-1)  # (B, dim) encoder features, L2-normalized
+    out = model(emb)  # dict: logits, embedding, triangulation, nearest, patchwork
+    loss, ld = model.compute_loss(out, targets, output_aug=out2)
 """
 import torch
 import torch.nn.functional as F
 import math
 from dataclasses import dataclass
+from typing import Optional, Dict, Any
 # ══════════════════════════════════════════════════════════════════
+# ACTIVATIONS
 # ══════════════════════════════════════════════════════════════════
 class SquaredReLU(nn.Module):
+    """Squared rectifier: negative inputs map to 0, positive inputs to x²."""
+    def forward(self, x):
+        rectified = F.relu(x)
+        return rectified ** 2
+class StarReLU(nn.Module):
+    """Squared ReLU followed by a learnable affine map: ReLU(x)² * scale + bias.
+    scale and bias are single-element parameters initialised to 0.8944 and
+    -0.4472 respectively.
+    """
+    def __init__(self):
+        super().__init__()
+        self.scale = nn.Parameter(torch.full((1,), 0.8944))
+        self.bias = nn.Parameter(torch.full((1,), -0.4472))
+    def forward(self, x):
+        squared = F.relu(x) ** 2
+        return squared * self.scale + self.bias
+# Registry: activation name → zero-argument factory returning a fresh module.
+ACTIVATIONS = dict(
+    squared_relu=SquaredReLU,
+    star_relu=StarReLU,
+    gelu=lambda: nn.GELU(),
+    relu=lambda: nn.ReLU(),
+    sigmoid=lambda: nn.Sigmoid(),
+)
+def make_activation(name='squared_relu'):
+    """Instantiate the activation registered under ``name``.
+    Raises:
+        ValueError: if ``name`` is not a key of ACTIVATIONS.
+    """
+    try:
+        factory = ACTIVATIONS[name]
+    except KeyError:
+        raise ValueError(f"Unknown activation '{name}'. Choose from: {list(ACTIVATIONS.keys())}") from None
+    return factory()
 # ══════════════════════════════════════════════════════════════════
 # ANCHOR INITIALIZATION
 # ══════════════════════════════════════════════════════════════════
 def init_anchors_xavier(n, d):
+    """Draw an (n, d) Xavier-normal matrix and L2-normalize each row.
+    Rows are near-orthogonal in high-d.
+    """
+    raw = nn.init.xavier_normal_(torch.empty(n, d))
+    return F.normalize(raw, dim=-1)
 def init_anchors_orthogonal(n, d):
+    """QR decomposition → exact orthonormal basis when n <= d."""
     if n <= d:
         M = torch.randn(d, n)
         Q, _ = torch.linalg.qr(M)
 def init_anchors_repulsion(n, d, iters=200, lr=0.05):
+    """QR + iterative repulsion for even coverage. Used in proven Core."""
     vecs = init_anchors_orthogonal(n, d)
     vecs = F.normalize(vecs, dim=-1)
     for _ in range(iters):
 # ══════════════════════════════════════════════════════════════════
+# CONSTELLATION — triangulation on S^(d-1)
 # ══════════════════════════════════════════════════════════════════
+class Constellation(nn.Module):
+    """Learnable anchor set on S^(d-1) used to triangulate embeddings.
+    Triangulation is one matmul: emb @ anchors.T gives cosines and
+    1 - cosine is reported as the distance.
+    No SLERP, no phases, no home/learned split.
+    Args:
+        n_anchors: number of reference points on S^(d-1)
+        dim: dimensionality of the sphere
+        anchor_drop: fraction to drop during training (0.15 proven)
+        anchor_init: 'repulsion', 'xavier', or 'orthogonal'
+    """
+    def __init__(self, n_anchors, dim, anchor_drop=0.0, anchor_init='repulsion'):
+        super().__init__()
+        self.n_anchors = n_anchors
+        self.dim = dim
+        self.anchor_drop = anchor_drop
+        # Anchors start from the selected initializer and are trained afterwards.
+        self.anchors = nn.Parameter(INIT_METHODS[anchor_init](n_anchors, dim))
+    def triangulate(self, emb, training=False):
+        """Measure emb against the anchors.
+        Args:
+            emb: (B, D) L2-normalized embeddings
+            training: when True and anchor_drop > 0, a random subset of the
+                anchors is dropped for this call
+        Returns:
+            (tri, nearest): tri holds 1 - cosine to every kept anchor, so it
+            is (B, A) without dropout and narrower with it; nearest is (B,)
+            and always indexes into the full anchor set
+        """
+        unit = F.normalize(self.anchors, dim=-1)
+        if not (training and self.anchor_drop > 0):
+            cos = emb @ unit.T
+            return 1.0 - cos, cos.max(dim=-1).indices
+        keep = torch.rand(unit.shape[0], device=unit.device) > self.anchor_drop
+        if keep.sum() < 2:
+            # Never triangulate against fewer than two anchors.
+            keep[:2] = True
+        kept_ids = keep.nonzero(as_tuple=True)[0]
+        cos = emb @ unit.index_select(0, kept_ids).T
+        # Translate positions inside the kept subset back to global anchor ids.
+        return 1.0 - cos, kept_ids[cos.max(dim=-1).indices]
+    def forward(self, emb, training=False):
+        """Alias for triangulate(); the dropout switch is the explicit argument."""
+        return self.triangulate(emb, training=training)
 # ══════════════════════════════════════════════════════════════════
+# PATCHWORK — round-robin compartmentalized interpretation
 # ══════════════════════════════════════════════════════════════════
+class Patchwork(nn.Module):
+    """Round-robin compartments reading diverse anchor subsets.
+    Anchor k is routed to compartment (k % n_comp), so 64 anchors and
+    8 compartments means each compartment reads 8 anchors.
+    Each compartment: Linear(anchors_per, d_comp*2) → act → Linear → LN → d_comp
+    Args:
+        n_anchors: total anchors (must be divisible by n_comp)
+        n_comp: number of compartments
+        d_comp: output dim per compartment
+        activation: activation function name
+    Raises:
+        ValueError: if n_comp is not positive or n_anchors is not divisible
+            by n_comp
+    """
+    def __init__(self, n_anchors, n_comp=8, d_comp=64, activation='squared_relu'):
+        super().__init__()
+        # Fail fast: every compartment's first Linear is sized n_anchors // n_comp,
+        # but round-robin routing gives some compartments one extra column when
+        # the division is inexact, which would only surface as a shape error
+        # inside forward().
+        if n_comp <= 0 or n_anchors % n_comp != 0:
+            raise ValueError(
+                f"n_anchors ({n_anchors}) must be divisible by a positive "
+                f"n_comp ({n_comp})")
+        self.n_comp = n_comp
+        self.d_comp = d_comp
+        self.output_dim = n_comp * d_comp
+        # Round-robin assignment: anchor k → compartment (k % n_comp)
+        self.register_buffer('asgn', torch.arange(n_anchors) % n_comp)
+        anchors_per = n_anchors // n_comp
+        self.comps = nn.ModuleList([
+            nn.Sequential(
+                nn.Linear(anchors_per, d_comp * 2),
+                make_activation(activation),
+                nn.Linear(d_comp * 2, d_comp),
+                nn.LayerNorm(d_comp),
+            ) for _ in range(n_comp)
+        ])
+    def forward(self, tri):
+        """tri: (B, n_anchors) → (B, n_comp * d_comp)
+        Compartment k reads the columns of tri whose assignment equals k;
+        the per-compartment outputs are concatenated in compartment order.
+        """
+        return torch.cat([
+            self.comps[k](tri[:, self.asgn == k])
+            for k in range(self.n_comp)
+        ], dim=-1)
+# ══════════════════════════════════════════════════════════════════
+# CONSTELLATION CORE — full pipeline
+# ══════════════════════════════════════════════════════════════════
+class ConstellationCore(nn.Module):
+    """Constellation + Patchwork + Classifier.
+    Operates on embeddings that are already L2-normalized; no encoder is
+    built here.
+    Forward returns dict with all outputs for downstream consumers.
+    Classifier reads cat(patchwork, embedding).
     Args:
+        num_classes: classification targets
+        dim: embedding dimension (encoder output)
+        n_anchors: anchors on S^(dim-1)
+        n_comp: patchwork compartments
         d_comp: hidden dim per compartment
+        anchor_drop: training dropout rate for anchors
+        anchor_init: initialization method
+        activation: activation for patchwork compartments and the classifier
+        cv_target: target CV for geometric loss
+        infonce_temp: temperature for InfoNCE
     """
     def __init__(
         self,
+        num_classes=10,
+        dim=192,
+        n_anchors=64,
+        n_comp=8,
+        d_comp=64,
+        anchor_drop=0.15,
+        anchor_init='repulsion',
+        activation='squared_relu',
+        cv_target=0.22,
+        infonce_temp=0.07,
     ):
         super().__init__()
+        self.num_classes = num_classes
         self.dim = dim
+        self.cv_target = cv_target
+        self.infonce_temp = infonce_temp
+        # Snapshot of the constructor arguments by name; 'self' and any
+        # underscore-prefixed locals are filtered out.
+        self.config = {k: v for k, v in locals().items()
+                       if k != 'self' and not k.startswith('_')}
+        self.constellation = Constellation(
+            n_anchors, dim, anchor_drop, anchor_init)
+        self.patchwork = Patchwork(
+            n_anchors, n_comp, d_comp, activation)
+        pw_dim = self.patchwork.output_dim
+        # Classifier reads cat(patchwork, embedding)
+        self.classifier = nn.Sequential(
+            nn.Linear(pw_dim + dim, pw_dim),
+            make_activation(activation),
+            nn.LayerNorm(pw_dim),
+            nn.Dropout(0.1),
+            nn.Linear(pw_dim, num_classes),
+        )
+    def forward(self, emb_normalized):
+        """Forward pass on L2-normalized embeddings.
         Args:
+            emb_normalized: (B, D) already on S^(d-1)
         Returns:
+            dict with: logits, embedding, triangulation, nearest, patchwork.
+            'triangulation' and 'patchwork' always come from the full anchor
+            set; in training mode only 'nearest' is recomputed under anchor
+            dropout.
         """
+        emb = emb_normalized
+        # Full triangulation for patchwork
+        tri, nearest = self.constellation.triangulate(emb, training=False)
+        pw = self.patchwork(tri)
+        # Dropout version for nearest tracking only
+        if self.training:
+            _, nearest = self.constellation.triangulate(emb, training=True)
+        # Classifier sees BOTH patchwork interpretation AND raw position
+        logits = self.classifier(torch.cat([pw, emb], dim=-1))
+        return {
+            'logits': logits,
+            'embedding': emb,
+            'triangulation': tri,
+            'nearest': nearest,
+            'patchwork': pw,
+        }
+    def compute_loss(self, output, targets, output_aug=None):
+        """Compute all losses.
+        total = ce + 1.0 * nce + 0.5 * attract + 0.01 * cv + 0.001 * spread,
+        where the nce term is present only when output_aug is given.
+        Args:
+            output: dict from forward()
+            targets: (B,) class indices
+            output_aug: optional dict from forward() on second view
+        Returns:
+            (total_loss, loss_dict); loss_dict holds the individual loss
+            terms under 'ce', 'nce', 'attract', 'cv', 'spread', 'total' and
+            Python-float metrics under 'acc', 'nce_acc', 'nearest_cos'
+        """
+        ld = {}
+        emb = output['embedding']
+        B = emb.shape[0]
+        # CE classification
+        l_ce = F.cross_entropy(output['logits'], targets)
+        ld['ce'] = l_ce
+        ld['acc'] = (output['logits'].argmax(-1) == targets).float().mean().item()
+        # InfoNCE between augmented views
+        if output_aug is not None:
+            emb_aug = output_aug['embedding']
+            # Row i of the similarity matrix should match column i: the
+            # positive for each sample is its own second view.
+            labels_nce = torch.arange(B, device=emb.device)
+            sim = emb @ emb_aug.T / self.infonce_temp
+            l_nce = F.cross_entropy(sim, labels_nce)
+            nce_acc = (sim.argmax(1) == labels_nce).float().mean().item()
+            ld['nce'] = l_nce
+            ld['nce_acc'] = nce_acc
+        # Anchor attraction: pull embeddings toward nearest anchor
+        anchors_n = F.normalize(self.constellation.anchors, dim=-1)
+        cos_to_anchors = emb @ anchors_n.T
+        nearest_cos = cos_to_anchors.max(dim=1).values
+        l_attract = (1.0 - nearest_cos).mean()
+        ld['attract'] = l_attract
+        ld['nearest_cos'] = nearest_cos.mean().item()
+        # CV on embeddings
+        l_cv = GeometricOps.cv_loss(emb, target=self.cv_target)
+        ld['cv'] = l_cv
+        # Anchor spread
+        l_spread = GeometricOps.anchor_spread_loss(self.constellation.anchors)
+        ld['spread'] = l_spread
+        # Total
+        loss = (l_ce
+                + ld.get('nce', 0.0) * 1.0
+                + l_attract * 0.5
+                + l_cv * 0.01
+                + l_spread * 0.001)
+        ld['total'] = loss
+        return loss, ld
+    @torch.no_grad()
+    def push_anchors_to_centroids(self, emb_buffer, label_buffer, lr=0.1):
+        """Push anchors toward class centroids — self-distillation across time.
+        Runs without gradient tracking and writes the new positions into the
+        anchor parameter's .data in place.
+        Phase 1: Compute class centroids from labels
+        Phase 2: Greedy-assign anchors to classes in descending cosine order;
+                 during this pass a class accepts at most
+                 (n_anchors // n_classes) + 1 anchors, and any anchor still
+                 unassigned afterwards goes to its most similar centroid
+        Phase 3: Move each anchor toward its class centroid by a linear blend
+                 followed by re-normalization (not a true SLERP); anchors after
+                 the first in a class aim at a perpendicularly perturbed copy
+                 of the centroid so co-class anchors don't collapse
+        Args:
+            emb_buffer: (N, D) accumulated embeddings
+            label_buffer: (N,) class labels
+            lr: blend rate toward centroid
+        Returns:
+            number of anchors moved: every anchor is updated, so this is the
+            anchor count, or 0 when no centroid could be computed
+        """
+        anchors = self.constellation.anchors.data
+        n_a = anchors.shape[0]
+        emb_n = F.normalize(emb_buffer, dim=-1)
+        device = anchors.device
+        # Phase 1: class centroids
+        classes = label_buffer.unique()
+        n_cls = classes.shape[0]
+        centroids = []
+        for c in classes:
+            mask = label_buffer == c
+            if mask.sum() > 0:
+                centroids.append(
+                    F.normalize(emb_n[mask].mean(0, keepdim=True), dim=-1))
+        if len(centroids) == 0:
+            return 0
+        centroids = torch.cat(centroids, dim=0)
+        # Phase 2: greedy anchor-to-class assignment
+        anchors_n = F.normalize(anchors, dim=-1)
+        cos = anchors_n @ centroids.T
+        anchors_per_class = n_a // n_cls
+        assigned_class = torch.full((n_a,), -1, dtype=torch.long, device=device)
+        class_count = torch.zeros(n_cls, dtype=torch.long, device=device)
+        # Visit (anchor, class) pairs from most to least similar.
+        _, flat_idx = cos.flatten().sort(descending=True)
+        for idx in flat_idx:
+            # Recover the row (anchor) and column (class) of the flattened index.
+            a = (idx // n_cls).item()
+            c = (idx % n_cls).item()
+            if assigned_class[a] >= 0:
+                continue
+            if class_count[c] >= anchors_per_class + 1:
+                continue
+            assigned_class[a] = c
+            class_count[c] += 1
+            if (assigned_class >= 0).all():
+                break
+        # Unassigned leftovers
+        unassigned = (assigned_class < 0).nonzero(as_tuple=True)[0]
+        if len(unassigned) > 0:
+            leftover_cos = anchors_n[unassigned] @ centroids.T
+            assigned_class[unassigned] = leftover_cos.argmax(dim=1)
+        # Phase 3: push with perpendicular perturbation
+        moved = 0
+        for a in range(n_a):
+            c = assigned_class[a].item()
+            target = centroids[c]
+            # How many lower-indexed anchors share this class.
+            rank_in_class = (assigned_class[:a] == c).sum().item()
+            if anchors_per_class > 1 and rank_in_class > 0:
+                noise = torch.randn_like(target) * 0.05
+                # Remove the component along the centroid so the noise is
+                # perpendicular to it.
+                noise = noise - (noise * target).sum() * target
+                target = F.normalize(
+                    (target + noise).unsqueeze(0), dim=-1).squeeze(0)
+            # Linear blend toward the target, then project back onto the sphere.
+            anchors[a] = F.normalize(
+                (anchors_n[a] + lr * (target - anchors_n[a])).unsqueeze(0),
+                dim=-1).squeeze(0)
+            moved += 1
+        return moved
 # ══════════════════════════════════════════════════════════════════
 # ══════════════════════════════════════════════════════════════════
 class ConstellationRelay(nn.Module):
     """Per-token geometric processing with gated residual.

     O(S) complexity. Preserves 99.4% cos similarity at depth 16.

     Pipeline:
       LayerNorm → L2 normalize → triangulate → patchwork → project → gated residual

     Args:
         dim: token dimension
         n_anchors: anchors on S^(dim-1)
         n_comp: patchwork compartments
         d_comp: hidden dim per compartment
         gate_init: initial gate bias (-3.0 → sigmoid ≈ 0.047)
         anchor_init: initialization method (forwarded to ``Constellation``)
         activation: activation function name (forwarded to ``Patchwork``)
     """

     def __init__(
         self,
         dim,
         n_anchors=16,
         n_comp=8,
         d_comp=64,
         gate_init=-3.0,
         anchor_init='repulsion',
         activation='squared_relu',
     ):
         super().__init__()
         self.dim = dim
         self.norm = nn.LayerNorm(dim)
         self.constellation = Constellation(
             n_anchors, dim, anchor_init=anchor_init)
         self.patchwork = Patchwork(
             n_anchors, n_comp, d_comp, activation)
         # Project patchwork back to token dim
         self.proj = nn.Linear(self.patchwork.output_dim, dim)
         # Gated residual: one learnable gate logit per channel
         self.gate = nn.Parameter(torch.full((dim,), gate_init))

     def forward(self, x):
         """x: (B, S, D) or (B, D) → same shape."""
         # A 2-D input is treated as a length-1 sequence and squeezed back
         # before returning, so the output shape always matches the input.
         squeeze = x.dim() == 2
         if squeeze:
             x = x.unsqueeze(1)
         B, S, D = x.shape
         residual = x

         # Every token is processed independently: flatten (B, S) into one
         # batch axis, then put each token on the unit sphere.
         h = self.norm(x)
         h_flat = F.normalize(h.reshape(B * S, D), dim=-1)

         tri, _ = self.constellation.triangulate(h_flat)
         pw = self.patchwork(tri)
         update = self.proj(pw).reshape(B, S, D)

         # Per-channel sigmoid gate scales the update added to the residual.
         g = torch.sigmoid(self.gate)
         out = residual + g * update
         if squeeze:
             out = out.squeeze(1)
         return out
 # ══════════════════════════════════════════════════════════════════
+# GEOMETRIC OPS
 # ══════════════════════════════════════════════════════════════════
 class GeometricOps:
+    """Static geometric utilities."""
     @staticmethod
     def cayley_menger_vol2(points):
         # Returns sign * det(cm) / (2**k * fact**2); the determinant is
         # evaluated in float32 and cast back to points.dtype.
         # NOTE(review): `sign`, `cm`, `k` and `fact` are not assigned in the
         # lines visible here — the body appears to be elided in this view.
         # Confirm against the full file before relying on this summary.
         return sign * torch.linalg.det(cm.float()).to(points.dtype) / ((2 ** k) * (fact ** 2))
     @staticmethod
+    @torch.no_grad()
     def cv_metric(emb, n_samples=200, n_points=5):
         """Non-differentiable CV for monitoring. Target band: 0.20–0.23."""
         # NOTE(review): `vols_t` is not assigned in the lines visible here —
         # the sampling loop appears to be elided in this view; confirm
         # against the full file.
         vols = []
         # Coefficient of variation (std / mean) as a Python float; the 1e-8
         # term keeps the division finite when the mean is zero.
         return (vols_t.std() / (vols_t.mean() + 1e-8)).item()
     @staticmethod
+    def cv_loss(emb, target=0.22, n_samples=64, n_points=5):
+        """Differentiable CV loss. Weight: 0.01 or below."""
+        B = emb.shape[0]
+        if B < n_points:
+            return torch.tensor(0.0, device=emb.device)
         vols = []
         for _ in range(n_samples):
+            idx = torch.randperm(min(B, 512), device=emb.device)[:n_points]
+            pts = emb[idx].unsqueeze(0)
+            gram = torch.bmm(pts, pts.transpose(1, 2))
+            norms = torch.diagonal(gram, dim1=1, dim2=2)
+            d2 = norms.unsqueeze(2) + norms.unsqueeze(1) - 2 * gram
+            d2 = F.relu(d2)
+            N = n_points
+            cm = torch.zeros(1, N + 1, N + 1, device=emb.device, dtype=emb.dtype)
+            cm[:, 0, 1:] = 1; cm[:, 1:, 0] = 1; cm[:, 1:, 1:] = d2
+            k = N - 1
+            pf = ((-1.0) ** (k + 1)) / ((2.0 ** k) * (math.factorial(k) ** 2))
+            v2 = pf * torch.linalg.det(cm.float())
+            if v2[0].item() > 1e-20:
+                vols.append(v2[0].to(emb.dtype).sqrt())
         if len(vols) < 5:
             return torch.tensor(0.0, device=emb.device)
+        vt = torch.stack(vols)
+        cv = vt.std() / (vt.mean() + 1e-8)
         return (cv - target).pow(2)
     @staticmethod
     def anchor_spread_loss(anchors, target_cos=0.0):
+        """Repulsion loss keeping anchors spread."""
         a = F.normalize(anchors, dim=-1)
         sim = a @ a.T
         mask = ~torch.eye(a.shape[0], dtype=torch.bool, device=a.device)
         return F.relu(sim[mask] - target_cos).mean()
     @staticmethod
+    def diagnostics(constellation, emb):
+        """Compute health metrics from a constellation and embeddings."""
+        tri, nearest = constellation.triangulate(emb, training=False)
+        n_active = nearest.unique().numel()
+        anchors_n = F.normalize(constellation.anchors, dim=-1)
+        cos_to_anchors = emb @ anchors_n.T
+        nearest_cos = cos_to_anchors.max(dim=1).values.mean().item()
+        counts = torch.bincount(nearest, minlength=constellation.n_anchors).float()
+        return {
+            'n_active': n_active,
+            'nearest_cos': nearest_cos,
+            'anchor_util_std': counts.std().item(),
+            'anchor_util_min': counts.min().item(),
+            'anchor_util_max': counts.max().item(),
+        }
+# ══════════════════════════════════════════════════════════════════
+# GEOMETRIC AUTOGRAD — Form 12
+# ══════════════════════════════════════════════════════════════════
+class GeometricAutograd(torch.autograd.Function):
+    """Manifold-aware gradient correction on S^(D-1).
+    Forward: identity.
+    Backward: tangential projection + separation from nearest anchor.
+    Proven settings: tang=0.01, sep=1.0
+    """
+    @staticmethod
+    def forward(ctx, emb, anchors, tang_strength, sep_strength):
+        ctx.save_for_backward(emb, anchors)
+        ctx.tang = tang_strength
+        ctx.sep = sep_strength
+        return emb
+    @staticmethod
+    def backward(ctx, grad):
+        emb, anchors = ctx.saved_tensors
+        tang = ctx.tang
+        sep = ctx.sep
+        dot = (grad * emb).sum(dim=-1, keepdim=True)
+        radial = dot * emb
+        tangential = grad - radial
+        corrected = tangential + (1.0 - tang) * radial
+        if sep > 0:
+            anchors_n = F.normalize(anchors.detach(), dim=-1)
+            cos_to_anchors = emb @ anchors_n.T
+            nearest_idx = cos_to_anchors.argmax(dim=-1)
+            nearest = anchors_n[nearest_idx]
+            toward = (corrected * nearest).sum(dim=-1, keepdim=True)
+            corrected = corrected - sep * F.relu(toward) * nearest
+        return corrected, None, None, None