"""
GeoLIP Losses & Regularization
=================================
Every loss and metric in the GeoLIP pipeline, with uniform interfaces.

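All loss functions: (inputs) → scalar tensor (differentiable); most also
return a float diagnostic (accuracy, entropy, ...) as a second value.
All metrics: (inputs) → float or dict of floats for monitoring
(cayley_menger_vol2 returns a (B,) tensor instead).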

CV functions default to batched computation (141x speedup).
Set batched=False for sequential fallback.

Loss Spectrum (3 domains):
  EXTERNAL:   ce_loss, nce_loss (embedding-level)
  GEOMETRIC:  nce_loss (patchwork), bridge_loss
  INTERNAL:   assign_bce_loss, assign_nce_loss, nce_loss (triangulation),
              attraction_loss, cv_loss, spread_loss

Metrics:
  cv_metric, cv_multi_scale, cayley_menger_vol2, knn_accuracy

Compound:
  three_domain_loss — the full cooperative loss from InternalConstellationCore

Usage:
    from geolip_losses import cv_loss, cv_metric, nce_loss, three_domain_loss

Author: AbstractPhil + Claude Opus 4.6
License: Apache 2.0
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


# ══════════════════════════════════════════════════════════════════
# CV — Coefficient of Variation of Pentachoron Volumes
# ══════════════════════════════════════════════════════════════════

def _batch_pentachoron_volumes(emb, n_samples=200, n_points=5):
    """Compute simplex volumes for random point subsets in one batched pass.

    Each sample draws ``n_points`` distinct rows from the first
    ``min(N, 512)`` rows of ``emb`` and evaluates the simplex volume through
    the Cayley-Menger determinant. There is no Python loop over samples.

    Args:
        emb: (N, D) embeddings.
        n_samples: number of random simplices to draw.
        n_points: vertices per simplex (5 = pentachoron).

    Returns:
        (n_valid,) tensor of volumes in ``emb.dtype``. Samples whose squared
        volume is not above 1e-20 (degenerate or numerically negative) are
        dropped. The result is empty when fewer than ``n_points`` rows are
        available, since no simplex can be formed.
    """
    N, _ = emb.shape  # unpacking also enforces a 2-D input
    device, dtype = emb.device, emb.dtype
    pool = min(N, 512)

    # Too few candidate rows: without this guard the (n_points x n_points)
    # distance block below would not fit the Cayley-Menger matrix and the
    # assignment would raise a shape-mismatch error.
    if pool < n_points:
        return torch.empty(0, device=device, dtype=dtype)

    # Batched "randperm": argsort of i.i.d. uniforms yields a random
    # permutation per row; keep the first n_points entries of each.
    indices = torch.rand(n_samples, pool, device=device).argsort(dim=1)[:, :n_points]
    pts = emb[:pool][indices]  # (n_samples, n_points, D)

    # Pairwise squared distances from the Gram matrix; relu clips the tiny
    # negative values produced by round-off.
    gram = torch.bmm(pts, pts.transpose(1, 2))
    norms = torch.diagonal(gram, dim1=1, dim2=2)
    d2 = F.relu(norms.unsqueeze(2) + norms.unsqueeze(1) - 2 * gram)

    # Bordered Cayley-Menger matrix: zero corner, ones on the first
    # row/column, squared distances in the remaining block.
    M = n_points + 1
    cm = torch.zeros(n_samples, M, M, device=device, dtype=dtype)
    cm[:, 0, 1:] = 1.0
    cm[:, 1:, 0] = 1.0
    cm[:, 1:, 1:] = d2

    # vol^2 = (-1)^(k+1) / (2^k * (k!)^2) * det(CM) for a k-simplex.
    k = n_points - 1
    pf = ((-1.0) ** (k + 1)) / ((2.0 ** k) * (math.factorial(k) ** 2))
    dets = pf * torch.linalg.det(cm.float())  # determinant taken in float32

    valid = dets > 1e-20
    return dets[valid].to(dtype).sqrt()


def _sequential_pentachoron_volumes(emb, n_samples=200, n_points=5):
    """Sequential fallback for simplex-volume sampling: one det call per sample.

    Args:
        emb: (N, D) embeddings; vertices are drawn from the first
            ``min(N, 512)`` rows.
        n_samples: number of random simplices to draw.
        n_points: vertices per simplex (5 = pentachoron).

    Returns:
        (n_valid,) tensor of volumes in ``emb.dtype``. An empty tensor is
        returned when fewer than ``n_points`` rows are available or when
        fewer than 5 samples have a squared volume above 1e-20.
    """
    N = emb.shape[0]
    device, dtype = emb.device, emb.dtype
    pool = min(N, 512)

    # Too few rows to form one simplex: the distance block would not fit the
    # Cayley-Menger matrix below and the assignment would raise.
    if pool < n_points:
        return torch.tensor([], device=device, dtype=dtype)

    # Loop-invariant constants.
    M = n_points + 1
    k = n_points - 1
    # vol^2 = (-1)^(k+1) / (2^k * (k!)^2) * det(CM) for a k-simplex.
    pf = ((-1.0) ** (k + 1)) / ((2.0 ** k) * (math.factorial(k) ** 2))

    vols = []
    for _ in range(n_samples):
        idx = torch.randperm(pool, device=device)[:n_points]
        pts = emb[idx].unsqueeze(0)  # (1, n_points, D)
        gram = torch.bmm(pts, pts.transpose(1, 2))
        norms = torch.diagonal(gram, dim1=1, dim2=2)
        # relu clips small negative squared distances caused by round-off
        d2 = F.relu(norms.unsqueeze(2) + norms.unsqueeze(1) - 2 * gram)
        cm = torch.zeros(1, M, M, device=device, dtype=dtype)
        cm[:, 0, 1:] = 1
        cm[:, 1:, 0] = 1
        cm[:, 1:, 1:] = d2
        v2 = pf * torch.linalg.det(cm.float())
        # Keep only strictly positive squared volumes.
        if v2[0].item() > 1e-20:
            vols.append(v2[0].to(dtype).sqrt())

    # Too few valid samples for a meaningful statistic.
    if len(vols) < 5:
        return torch.tensor([], device=device, dtype=dtype)
    return torch.stack(vols)


def cv_loss(emb, target=0.22, n_samples=64, n_points=5, batched=True):
    """Squared deviation of the simplex-volume CV from a target value.

    CV is the coefficient of variation (std / mean) of the volumes of
    randomly sampled simplices built from rows of ``emb``.

    Args:
        emb: (N, D) embeddings (documented by the module as L2-normalized).
        target: desired CV value (default 0.22).
        n_samples: number of simplices to sample.
        n_points: vertices per simplex.
        batched: choose the batched sampler (True) or the sequential one.

    Returns:
        Scalar tensor ``(CV - target)^2``. When ``emb`` has fewer than
        ``n_points`` rows, or fewer than 5 valid volumes were sampled, a
        fresh zero tensor with ``requires_grad=True`` is returned instead.
    """
    def _zero():
        # Placeholder loss for the "not enough data" cases.
        return torch.tensor(0.0, device=emb.device, requires_grad=True)

    if emb.shape[0] < n_points:
        return _zero()

    sampler = (_batch_pentachoron_volumes if batched
               else _sequential_pentachoron_volumes)
    volumes = sampler(emb, n_samples, n_points)
    if volumes.shape[0] < 5:
        return _zero()

    # Epsilon keeps the ratio finite if the mean volume collapses to zero.
    coeff_var = volumes.std() / (volumes.mean() + 1e-8)
    return (coeff_var - target).pow(2)


def cv_metric(emb, n_samples=200, n_points=5, batched=True):
    """Non-differentiable simplex-volume CV for monitoring.

    The module docstring gives 0.20-0.23 as the target band.

    Args:
        emb: (N, D) embeddings.
        n_samples: number of simplices to sample.
        n_points: vertices per simplex.
        batched: use the batched sampler (True) or the sequential fallback.

    Returns:
        float: std / mean of the sampled simplex volumes, or 0.0 when ``emb``
        has fewer than ``n_points`` rows or fewer than 10 valid volumes were
        obtained.
    """
    # Same guard as cv_loss: a simplex cannot be built from too few rows, and
    # the samplers are not guaranteed to cope with that on their own.
    if emb.shape[0] < n_points:
        return 0.0

    with torch.no_grad():
        if batched:
            vols = _batch_pentachoron_volumes(emb, n_samples, n_points)
        else:
            vols = _sequential_pentachoron_volumes(emb, n_samples, n_points)
        if vols.shape[0] < 10:
            return 0.0
        # Epsilon keeps the ratio finite if the mean volume collapses to zero.
        return (vols.std() / (vols.mean() + 1e-8)).item()


def cv_multi_scale(emb, scales=(3, 4, 5, 6, 7, 8), n_samples=100, batched=True):
    """Simplex-volume CV at several simplex sizes.

    The original author's note gives [0.18, 0.25] as the healthy range at
    every scale.

    Args:
        emb: (N, D) embeddings.
        scales: iterable of vertex counts per simplex.
        n_samples: simplices sampled per scale.
        batched: use the batched sampler (True) or the sequential fallback.

    Returns:
        dict mapping each vertex count to its CV rounded to 4 decimals, or to
        ``None`` when ``emb`` has fewer rows than that vertex count or fewer
        than 10 valid volumes were obtained.
    """
    results = {}
    n_rows = emb.shape[0]
    with torch.no_grad():
        for n_pts in scales:
            # A simplex with more vertices than available rows cannot be
            # sampled; report "no value" instead of letting the sampler fail.
            if n_rows < n_pts:
                results[n_pts] = None
                continue
            if batched:
                vols = _batch_pentachoron_volumes(emb, n_samples, n_pts)
            else:
                vols = _sequential_pentachoron_volumes(emb, n_samples, n_pts)
            if vols.shape[0] >= 10:
                results[n_pts] = round((vols.std() / (vols.mean() + 1e-8)).item(), 4)
            else:
                results[n_pts] = None
    return results


def cayley_menger_vol2(points):
    """Squared simplex volume via the Cayley-Menger determinant.

    Args:
        points: (B, N, D) tensor holding B simplices with N vertices each.

    Returns:
        (B,) tensor of squared volumes in ``points.dtype``. The determinant
        itself is evaluated in float32 and cast back.
    """
    batch, n_vertices, _ = points.shape

    # Pairwise squared distances from the Gram matrix; relu clips round-off
    # negatives.
    inner = torch.bmm(points, points.transpose(1, 2))
    sq_norm = torch.diagonal(inner, dim1=1, dim2=2)
    sq_dist = F.relu(sq_norm.unsqueeze(2) + sq_norm.unsqueeze(1) - 2 * inner)

    # Bordered matrix: zero corner, ones along the first row/column.
    size = n_vertices + 1
    bordered = torch.zeros(batch, size, size,
                           device=points.device, dtype=points.dtype)
    bordered[:, 0, 1:] = 1
    bordered[:, 1:, 0] = 1
    bordered[:, 1:, 1:] = sq_dist

    # vol^2 = (-1)^(k+1) * det / (2^k * (k!)^2) for a k-simplex.
    simplex_dim = n_vertices - 1
    sign = (-1.0) ** (simplex_dim + 1)
    denom = (2 ** simplex_dim) * (math.factorial(simplex_dim) ** 2)
    det = torch.linalg.det(bordered.float()).to(points.dtype)
    return sign * det / denom


# ══════════════════════════════════════════════════════════════════
# NCE — InfoNCE contrastive loss
# ══════════════════════════════════════════════════════════════════

def nce_loss(z1, z2, temperature=0.07, normalize=True):
    """InfoNCE between two views, with matching rows as positives.

    Rows of ``z1`` act as queries against all rows of ``z2``; row i of ``z1``
    is paired with row i of ``z2``. Only this z1 -> z2 direction is
    evaluated (the reverse direction is not added).

    Args:
        z1, z2: (B, D) embeddings from two views.
        temperature: divisor applied to the similarity matrix
            (lower = sharper softmax).
        normalize: L2-normalize both inputs along the last dim first.

    Returns:
        (loss, acc): scalar cross-entropy tensor and the float fraction of
        rows whose highest-similarity column is the paired one.
    """
    if normalize:
        z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)

    logits = (z1 @ z2.T) / temperature
    positives = torch.arange(z1.shape[0], device=z1.device)

    loss = F.cross_entropy(logits, positives)
    hits = logits.argmax(1) == positives
    return loss, hits.float().mean().item()


# ══════════════════════════════════════════════════════════════════
# CLASSIFICATION
# ══════════════════════════════════════════════════════════════════

def ce_loss(logits, targets):
    """Cross-entropy classification loss with top-1 accuracy.

    Args:
        logits: (B, C) raw class scores.
        targets: (B,) integer class indices.

    Returns:
        (loss, acc): scalar cross-entropy tensor and float top-1 accuracy.
    """
    loss = F.cross_entropy(logits, targets)
    correct = logits.argmax(-1) == targets
    accuracy = correct.float().mean().item()
    return loss, accuracy


def ce_loss_paired(logits1, logits2, targets):
    """Cross-entropy averaged over two views that share the same targets.

    Args:
        logits1, logits2: (B, C) raw class scores for view 1 and view 2.
        targets: (B,) integer class indices.

    Returns:
        (loss, acc): mean of the two cross-entropy losses, and the float
        top-1 accuracy computed from view 1 only.
    """
    per_view = [F.cross_entropy(view, targets) for view in (logits1, logits2)]
    view1_correct = logits1.argmax(-1) == targets
    accuracy = view1_correct.float().mean().item()
    return (per_view[0] + per_view[1]) / 2, accuracy


# ══════════════════════════════════════════════════════════════════
# BRIDGE — patchwork predicts constellation's assignment
# ══════════════════════════════════════════════════════════════════

def bridge_loss(bridge_logits, assign_targets, detach_targets=True):
    """Soft-target cross-entropy from bridge logits to a soft assignment.

    With ``detach_targets`` set, no gradient flows into ``assign_targets``,
    so the loss only trains whatever produced ``bridge_logits``.

    Args:
        bridge_logits: (B, A) raw logits from the bridge head.
        assign_targets: (B, A) soft assignment used as the target
            distribution.
        detach_targets: cut the targets out of the autograd graph
            (default True).

    Returns:
        (loss, acc): scalar loss tensor and the float fraction of rows where
        the argmax of the logits equals the argmax of the targets.
    """
    soft_targets = assign_targets.detach() if detach_targets else assign_targets

    log_probs = F.log_softmax(bridge_logits, dim=-1)
    loss = -(soft_targets * log_probs).sum(-1).mean()

    agree = bridge_logits.argmax(-1) == soft_targets.argmax(-1)
    return loss, agree.float().mean().item()


def bridge_loss_paired(bridge1, bridge2, assign1, assign2, detach_targets=True):
    """Bridge loss averaged over two views.

    Args:
        bridge1, bridge2: bridge-head logits for view 1 and view 2.
        assign1, assign2: soft assignment targets for view 1 and view 2.
        detach_targets: forwarded to ``bridge_loss`` for both views.

    Returns:
        (loss, acc): mean of the two per-view losses, and the float accuracy
        reported for view 1 only.
    """
    loss_view1, acc_view1 = bridge_loss(bridge1, assign1, detach_targets)
    loss_view2 = bridge_loss(bridge2, assign2, detach_targets)[0]
    mean_loss = (loss_view1 + loss_view2) / 2
    return mean_loss, acc_view1


# ══════════════════════════════════════════════════════════════════
# ASSIGNMENT — internal constellation self-organization
# ══════════════════════════════════════════════════════════════════

def assign_bce_loss(soft_assign, cos_to_anchors):
    """Assignment crispness: BCE of the soft assignment against a one-hot
    target placed at the anchor with the highest cosine similarity.

    Args:
        soft_assign: (B, A) soft assignment probabilities.
        cos_to_anchors: (B, A) cosine similarities to the anchors.

    Returns:
        (loss, entropy): scalar BCE tensor (computed in float32 with CUDA
        autocast disabled) and the float mean per-row entropy of
        ``soft_assign``.
    """
    winner = cos_to_anchors.argmax(dim=-1).unsqueeze(1)
    one_hot = torch.zeros_like(soft_assign).scatter_(1, winner, 1.0)

    # BCE is evaluated in full precision; probabilities are clamped away
    # from 0 and 1 so the logs stay finite.
    with torch.amp.autocast("cuda", enabled=False):
        probs = soft_assign.float().clamp(1e-7, 1 - 1e-7)
        loss = F.binary_cross_entropy(probs, one_hot.float(), reduction='mean')

    log_p = soft_assign.clamp(min=1e-8).log()
    entropy = -(soft_assign * log_p).sum(-1).mean().item()
    return loss, entropy


def assign_nce_loss(assign1, assign2, temperature=0.1):
    """Assignment consistency: NCE between the soft assignments of two views.

    The similarity is the raw dot product of assignment rows (no
    normalization), divided by ``temperature``; row i of ``assign1`` is
    paired with row i of ``assign2``.

    Args:
        assign1, assign2: (B, A) soft assignments from two views.
        temperature: divisor applied to the similarity matrix.

    Returns:
        (loss, acc): scalar cross-entropy tensor and the float fraction of
        rows whose best-matching column is the paired one.
    """
    logits = (assign1 @ assign2.T) / temperature
    positives = torch.arange(assign1.shape[0], device=assign1.device)

    loss = F.cross_entropy(logits, positives)
    matched = logits.argmax(1) == positives
    return loss, matched.float().mean().item()


# ══════════════════════════════════════════════════════════════════
# ATTRACTION — embeddings near their anchors
# ══════════════════════════════════════════════════════════════════

def attraction_loss(cos_to_anchors):
    """Pull each embedding toward its most similar anchor.

    Args:
        cos_to_anchors: (B, A) cosine similarities to the anchors.

    Returns:
        (loss, mean_cos): scalar tensor ``mean(1 - max_a cos)`` and the float
        mean of the per-row maximum cosine.
    """
    best_cos, _ = cos_to_anchors.max(dim=1)
    gap = 1.0 - best_cos
    return gap.mean(), best_cos.mean().item()


# ══════════════════════════════════════════════════════════════════
# SPREAD — anchor repulsion
# ══════════════════════════════════════════════════════════════════

def spread_loss(anchors, target_cos=0.0):
    """Repulsion loss that penalizes anchor pairs that are too similar.

    Anchors are L2-normalized, then every off-diagonal cosine similarity
    above ``target_cos`` contributes its excess to the mean.

    Args:
        anchors: (A, D) anchor parameters.
        target_cos: cosine threshold (0.0 = orthogonal target).

    Returns:
        Scalar tensor: mean of ``relu(cos - target_cos)`` over all ordered
        pairs of distinct anchors. With fewer than two anchors there are no
        pairs, and a zero that is still attached to the autograd graph is
        returned.
    """
    a = F.normalize(anchors, dim=-1)

    # With 0 or 1 anchors the off-diagonal selection is empty and .mean()
    # would yield NaN, poisoning any total loss this term is added to.
    if a.shape[0] < 2:
        return a.sum() * 0.0

    sim = a @ a.T
    # Exclude self-similarities on the diagonal.
    mask = ~torch.eye(a.shape[0], dtype=torch.bool, device=a.device)
    return F.relu(sim[mask] - target_cos).mean()


# ══════════════════════════════════════════════════════════════════
# kNN — non-differentiable validation metric
# ══════════════════════════════════════════════════════════════════

@torch.no_grad()
def knn_accuracy(embeddings, targets, k=1):
    """Leave-one-out k-NN classification accuracy in embedding space.

    Similarity is the dot product between rows, which equals cosine
    similarity when the rows are L2-normalized. A sample is never counted as
    its own neighbour.

    Args:
        embeddings: (N, D) floating-point embeddings.
        targets: (N,) class labels.
        k: number of neighbours (1 for simple NN). Values larger than
            ``N - 1`` are clamped to ``N - 1``.

    Returns:
        float accuracy; 0.0 when fewer than two samples are given, since no
        neighbour exists.
    """
    n = embeddings.shape[0]
    if n < 2:
        return 0.0

    sim = embeddings @ embeddings.T
    # Mask self-similarity with -inf rather than -1: with -1 a sample could
    # still pick itself whenever every other similarity is <= -1
    # (unnormalized inputs, or exactly antipodal unit vectors).
    sim.fill_diagonal_(float("-inf"))

    # At most n - 1 real neighbours exist; a larger k would make topk raise
    # or pull the masked self entry into the vote.
    k = min(k, n - 1)

    if k == 1:
        nn_idx = sim.argmax(dim=1)
        return (targets[nn_idx] == targets).float().mean().item()

    _, topk_idx = sim.topk(k, dim=1)
    nn_labels = targets[topk_idx]  # (N, k)
    # Majority vote over the neighbour labels.
    pred = nn_labels.mode(dim=1).values
    return (pred == targets).float().mean().item()


# ══════════════════════════════════════════════════════════════════
# THREE-DOMAIN COMPOUND LOSS
# ══════════════════════════════════════════════════════════════════

def three_domain_loss(output, targets, constellation, cv_target=0.22,
                      infonce_temp=0.07, assign_temp=0.1,
                      w_ce=1.0, w_nce_emb=0.5,
                      w_nce_pw=1.0, w_bridge=1.0,
                      w_assign=0.5, w_assign_nce=0.25,
                      w_nce_tri=0.5, w_attract=0.25,
                      w_cv=0.01, w_spread=0.01,
                      cv_batched=True, tri_temp=0.1):
    """Full three-domain cooperative loss.

    EXTERNAL:   CE + embedding NCE
    GEOMETRIC:  patchwork NCE + bridge
    INTERNAL:   assign BCE + assign NCE + tri NCE + attraction + CV + spread

    The assignment BCE, attraction, CV and kNN terms are computed from
    view 1 only (``assign1`` / ``cos1`` / ``embedding``).

    Args:
        output: dict providing the keys read below: 'embedding',
            'embedding_aug', 'logits', 'logits_aug', 'patchwork1',
            'patchwork1_aug', 'bridge1', 'bridge2', 'assign1', 'assign2',
            'cos1', 'tri1', 'tri2' (per the original note, produced by
            InternalConstellationCore.forward_paired()).
        targets: (B,) class labels.
        constellation: object exposing an ``anchors`` tensor for the spread
            term.
        cv_target: CV loss target.
        infonce_temp: embedding NCE temperature.
        assign_temp: temperature for the assignment NCE and patchwork NCE.
        w_*: per-term weights.
        cv_batched: use batched CV (default True).
        tri_temp: triangulation NCE temperature (default 0.1, the value that
            was previously hard-coded).

    Returns:
        total_loss: scalar tensor.
        ld: dict with per-term loss tensors, float diagnostics, float domain
            subtotals ('loss_external', 'loss_geometric', 'loss_internal'),
            the total loss tensor under 'total', and raw per-term floats
            under 't_*' keys.
    """
    ld = {}
    emb1, emb2 = output['embedding'], output['embedding_aug']

    # ── EXTERNAL ──
    l_ce, acc = ce_loss_paired(output['logits'], output['logits_aug'], targets)
    ld['ce'], ld['acc'] = l_ce, acc

    # Embeddings are passed through without re-normalization here.
    l_nce_emb, nce_emb_acc = nce_loss(emb1, emb2, infonce_temp, normalize=False)
    ld['nce_emb'], ld['nce_emb_acc'] = l_nce_emb, nce_emb_acc

    # ── GEOMETRIC ──
    l_nce_pw, nce_pw_acc = nce_loss(output['patchwork1'], output['patchwork1_aug'],
                                     assign_temp, normalize=True)
    ld['nce_pw'], ld['nce_pw_acc'] = l_nce_pw, nce_pw_acc

    l_bridge, bridge_acc = bridge_loss_paired(
        output['bridge1'], output['bridge2'],
        output['assign1'], output['assign2'])
    ld['bridge'], ld['bridge_acc'] = l_bridge, bridge_acc

    # ── INTERNAL ──
    l_assign, assign_ent = assign_bce_loss(output['assign1'], output['cos1'])
    ld['assign'], ld['assign_entropy'] = l_assign, assign_ent

    l_assign_nce, assign_nce_acc = assign_nce_loss(
        output['assign1'], output['assign2'], assign_temp)
    ld['assign_nce'], ld['assign_nce_acc'] = l_assign_nce, assign_nce_acc

    l_nce_tri, nce_tri_acc = nce_loss(output['tri1'], output['tri2'],
                                      tri_temp, normalize=True)
    ld['nce_tri'], ld['nce_tri_acc'] = l_nce_tri, nce_tri_acc

    l_attract, nearest_cos = attraction_loss(output['cos1'])
    ld['attract'], ld['nearest_cos'] = l_attract, nearest_cos

    l_cv = cv_loss(emb1, target=cv_target, batched=cv_batched)
    ld['cv'] = l_cv

    l_spread = spread_loss(constellation.anchors)
    ld['spread'] = l_spread

    # ── kNN (non-differentiable) ──
    ld['knn_acc'] = knn_accuracy(emb1, targets)

    # ── TOTAL ──
    loss_external = w_ce * l_ce + w_nce_emb * l_nce_emb
    loss_geometric = w_nce_pw * l_nce_pw + w_bridge * l_bridge
    loss_internal = (w_assign * l_assign + w_assign_nce * l_assign_nce
                     + w_nce_tri * l_nce_tri + w_attract * l_attract
                     + w_cv * l_cv + w_spread * l_spread)

    loss = loss_external + loss_geometric + loss_internal

    ld['loss_external'] = loss_external.item()
    ld['loss_geometric'] = loss_geometric.item()
    ld['loss_internal'] = loss_internal.item()
    ld['total'] = loss

    # Per-term raw values for analysis (one float per loss term).
    ld['t_ce'] = l_ce.item()
    ld['t_nce_emb'] = l_nce_emb.item()
    ld['t_nce_pw'] = l_nce_pw.item()
    ld['t_bridge'] = l_bridge.item()
    ld['t_assign'] = l_assign.item()
    ld['t_assign_nce'] = l_assign_nce.item()
    ld['t_nce_tri'] = l_nce_tri.item()
    ld['t_attract'] = l_attract.item()
    ld['t_cv'] = l_cv.item()
    ld['t_spread'] = l_spread.item()

    return loss, ld