Tags: Other · PyTorch · 3d-reconstruction · wireframe · building · point-cloud · s23dr · cvpr-2026
jacklangerman committed (verified) · commit f4487da · 1 parent: 4ddee35

Upload folder using huggingface_hub

checkpoint.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc38a61ff512948b1dc92a30129d6efdd093f507948fc5b538050c4a38bfbf6c
+ size 106460054
s23dr_2026_example/__init__.py ADDED
File without changes
s23dr_2026_example/attention.py ADDED
@@ -0,0 +1,226 @@
+ # custom_transformer.py
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ # =============================================================================
+ # Core Efficient Multihead Attention using Scaled Dot Product Attention (SDPA)
+ # =============================================================================
+
+ class MultiHeadSDPA(nn.Module):
+     """
+     Multi-head cross-attention using torch.nn.functional.scaled_dot_product_attention
+     without causal masking. Suitable for set inputs and cross-attention.
+
+     If qk_norm=True, L2-normalizes Q and K per-head before the dot product,
+     then scales by a learned per-head temperature (log_scale). This caps logit
+     magnitude to [-1, +1] * exp(log_scale), preventing attention entropy
+     collapse at large head_dim.
+     """
+     def __init__(self, d_model: int, num_heads: int, kv_heads: int | None = None,
+                  qk_norm: bool = False, qk_norm_type: str = "l2"):
+         super().__init__()
+         assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.kv_heads = kv_heads or num_heads
+         assert self.num_heads % self.kv_heads == 0, "kv_heads must divide num_heads"
+
+         self.head_dim = d_model // num_heads
+         self.qk_norm = qk_norm
+         self.qk_norm_type = qk_norm_type
+
+         # Input projection layers
+         self.q_proj = nn.Linear(d_model, d_model, bias=False)
+         self.k_proj = nn.Linear(d_model, self.kv_heads * self.head_dim, bias=False)
+         self.v_proj = nn.Linear(d_model, self.kv_heads * self.head_dim, bias=False)
+
+         # Output projection
+         self.out_proj = nn.Linear(d_model, d_model, bias=False)
+         nn.init.zeros_(self.out_proj.weight)
+
+         if qk_norm:
+             import math
+             if qk_norm_type == "rms":
+                 # Standard QK-norm (Qwen3/Gemma3 style): RMSNorm on Q and K,
+                 # no learned temperature. SDPA's 1/sqrt(d) scaling is sufficient
+                 # because RMSNorm preserves the expected logit variance.
+                 pass  # no extra parameters needed
+             else:
+                 # L2 + learned temperature (nGPT/ViT-22B style):
+                 # L2 projects to unit sphere, needs learned scale to compensate.
+                 self.log_scale = nn.Parameter(
+                     torch.full((num_heads,), math.log(math.sqrt(self.head_dim))))
+
+     def forward(
+         self,
+         query: torch.Tensor,
+         key: torch.Tensor,
+         key_padding_mask: torch.Tensor | None = None,
+     ) -> torch.Tensor:
+         # Project
+         q = self.q_proj(query)
+         k = self.k_proj(key)
+         v = self.v_proj(key)
+
+         B, Tq, _ = q.shape
+         _, Tk, _ = k.shape
+
+         q = q.view(B, Tq, self.num_heads, self.head_dim).transpose(1, 2)
+         k = k.view(B, Tk, self.kv_heads, self.head_dim).transpose(1, 2)
+         v = v.view(B, Tk, self.kv_heads, self.head_dim).transpose(1, 2)
+
+         if self.kv_heads != self.num_heads:
+             repeat = self.num_heads // self.kv_heads
+             k = k.repeat_interleave(repeat, dim=1)
+             v = v.repeat_interleave(repeat, dim=1)
+
+         if self.qk_norm:
+             if self.qk_norm_type == "rms":
+                 # RMSNorm (Qwen3/Gemma3 style): no learned temperature needed.
+                 # After RMSNorm, logit variance matches standard SDPA naturally.
+                 q = q * torch.rsqrt(q.square().mean(dim=-1, keepdim=True) + 1e-6)
+                 k = k * torch.rsqrt(k.square().mean(dim=-1, keepdim=True) + 1e-6)
+                 attn_mask = None
+                 if key_padding_mask is not None:
+                     attn_mask = ~key_padding_mask[:, None, None, :].to(dtype=torch.bool)
+                 attn_out = F.scaled_dot_product_attention(
+                     q, k, v, attn_mask=attn_mask, dropout_p=0.0, is_causal=False,
+                 )
+             else:
+                 # L2 + learned temperature (nGPT/ViT-22B style)
+                 q = F.normalize(q, dim=-1)
+                 k = F.normalize(k, dim=-1)
+                 scale = self.log_scale.exp().view(1, -1, 1, 1)
+                 q = q * scale
+                 attn_mask = None
+                 if key_padding_mask is not None:
+                     attn_mask = ~key_padding_mask[:, None, None, :].to(dtype=torch.bool)
+                 attn_out = F.scaled_dot_product_attention(
+                     q, k, v, attn_mask=attn_mask, dropout_p=0.0, is_causal=False,
+                     scale=1.0,
+                 )
+         else:
+             attn_mask = None
+             if key_padding_mask is not None:
+                 attn_mask = ~key_padding_mask[:, None, None, :].to(dtype=torch.bool)
+             attn_out = F.scaled_dot_product_attention(
+                 q, k, v, attn_mask=attn_mask, dropout_p=0.0, is_causal=False
+             )
+
+         attn_out = attn_out.transpose(1, 2).reshape(B, Tq, self.d_model)
+         return self.out_proj(attn_out)
+
+
+ # =============================================================================
+ # Transformer Feed-Forward Block
+ # =============================================================================
+
+ def _get_activation(name: str):
+     """Look up activation function by name. Supports 'relu_sq' for ReLU^2."""
+     if name == "relu_sq":
+         return lambda x: F.relu(x).square()
+     return getattr(F, name)
+
+
+ class FeedForward(nn.Module):
+     """
+     Position-wise MLP block: linear -> activation -> linear.
+     Supports 'gelu', 'relu', 'relu_sq', etc.
+     """
+     def __init__(self, d_model: int, dim_ff: int, activation: str = "gelu"):
+         super().__init__()
+         self.linear1 = nn.Linear(d_model, dim_ff)
+         self.linear2 = nn.Linear(dim_ff, d_model)
+         nn.init.zeros_(self.linear2.weight)
+         nn.init.zeros_(self.linear2.bias)
+         self.activation = _get_activation(activation)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.linear1(x)
+         return self.linear2(self.activation(x))
+
+
+ # =============================================================================
+ # Custom Transformer Block
+ # =============================================================================
+
+ class TransformerBlock(nn.Module):
+     """
+     Single transformer block combining:
+     - multi-head SDPA (non-causal)
+     - layernorm + residual
+     - feed-forward MLP + residual
+     """
+     def __init__(
+         self,
+         d_model: int,
+         num_heads: int,
+         dim_ff: int,
+         dropout: float = 0.0,
+         activation: str = "gelu",
+         kv_heads: int | None = None,
+     ):
+         super().__init__()
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+
+         self.attn = MultiHeadSDPA(d_model, num_heads, kv_heads=kv_heads)
+         self.dropout1 = nn.Dropout(dropout)
+         self.ffn = FeedForward(d_model, dim_ff, activation=activation)
+         self.dropout2 = nn.Dropout(dropout)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         memory: torch.Tensor,
+         memory_key_padding_mask: torch.Tensor | None = None,
+     ) -> torch.Tensor:
+         res = x
+         x = self.norm1(x)
+         x = self.attn(x, memory, key_padding_mask=memory_key_padding_mask)
+         x = res + self.dropout1(x)
+
+         res = x
+         x = self.norm2(x)
+         x = self.ffn(x)
+         return res + self.dropout2(x)
+
+
+ class TransformerDecoderSets(nn.Module):
+     """
+     A stack of TransformerBlock layers for set-to-set
+     modeling without causal masks.
+     """
+     def __init__(
+         self,
+         d_model: int,
+         num_heads: int,
+         dim_ff: int,
+         num_layers: int,
+         dropout: float = 0.0,
+         activation: str = "gelu",
+         kv_heads: int | None = None,
+     ):
+         super().__init__()
+         self.layers = nn.ModuleList([
+             TransformerBlock(
+                 d_model,
+                 num_heads,
+                 dim_ff,
+                 dropout=dropout,
+                 activation=activation,
+                 kv_heads=kv_heads,
+             )
+             for _ in range(num_layers)
+         ])
+
+     def forward(
+         self,
+         tgt: torch.Tensor,
+         memory: torch.Tensor,
+         memory_key_padding_mask: torch.Tensor | None = None,
+     ) -> torch.Tensor:
+         for layer in self.layers:
+             tgt = layer(tgt, memory, memory_key_padding_mask=memory_key_padding_mask)
+         return tgt
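
Editor's note: a minimal smoke test for the module above (my sketch, not part of the upload). It assumes only what the code itself defines, plus the key-padding convention implied by the `~key_padding_mask` inversion in `forward`: True marks padded key positions.

    import torch
    from s23dr_2026_example.attention import TransformerDecoderSets

    decoder = TransformerDecoderSets(d_model=256, num_heads=8, dim_ff=1024,
                                     num_layers=4, kv_heads=2)  # GQA: 8 query heads share 2 KV heads

    tgt = torch.randn(2, 64, 256)       # e.g. 64 learned queries per scene
    memory = torch.randn(2, 2048, 256)  # e.g. 2048 point tokens per scene
    pad = torch.zeros(2, 2048, dtype=torch.bool)
    pad[:, 1500:] = True                # True = padded memory position (masked out)

    out = decoder(tgt, memory, memory_key_padding_mask=pad)
    assert out.shape == (2, 64, 256)

Note that out_proj and the FFN's second linear are zero-initialized, so at initialization each block is an identity mapping on tgt; training gradually opens up the residual branches.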
s23dr_2026_example/bad_samples.txt ADDED
@@ -0,0 +1,156 @@
+ 14b1872e960
+ 1807ef90db4
+ 180e6a67e87
+ 1ad5c6bd31f
+ 1c3f939ad93
+ 1ede4c0d52f
+ 214f17d9cc4
+ 22256d88df9
+ 24a92a8de6d
+ 24b4e984bad
+ 2565978cf53
+ 2a71f1a2072
+ 2d44c1fade6
+ 2ebed43823a
+ 33982551420
+ 3b480496f82
+ 412a2bdf7a4
+ 44343bbabbb
+ 4a0b3f04cbd
+ 4a7fa170826
+ 4b7dc027214
+ 4e0dc2c9b18
+ 5172a516c8b
+ 529e8f15cd2
+ 56fc6f6f163
+ 575963ce814
+ 578ec40a278
+ 5a0c07c575a
+ 5d521223c26
+ 6148b5c9461
+ 631eb6d7c03
+ 655a14f8a75
+ 66502d7ee6f
+ 6da76fc6687
+ 777eaaad0ca
+ 7a4e2909d68
+ 7c5c9baf483
+ 80806dfd75e
+ 81a4ead431d
+ 833152dd554
+ 85797868c0f
+ 86460ad8181
+ 86783a6bee4
+ 95193322d7a
+ 99a9d056200
+ 9b1d4eeaab9
+ 9ff759f2e4c
+ acbd243da16
+ b9b275710c0
+ beceaa9bb7c
+ c243d079286
+ c5c7337d2cb
+ cdf6f2d3b35
+ cfe370f1c87
+ d4a72aea80c
+ d655f066cd3
+ d79e8d9455c
+ d7d6c5be76e
+ dc30ae4b93b
+ de9495f7ca3
+ e1901819c72
+ e1d88c1a6b1
+ e5d3eb0a617
+ ec11d3cdcf6
+ ecb21fad0ad
+ ee55d8c6493
+ ee7e6d4dee1
+ 008052054aa
+ 03ecb7d3cf3
+ 0555a655534
+ 099cad230c6
+ 0d061ae23f0
+ 10741a421c0
+ 110d5e407b9
+ 128a7fb415a
+ 13177736b26
+ 1635d73bf7d
+ 18a760de9ea
+ 18d90d03e95
+ 209627a5c1a
+ 21e3cd4b7b8
+ 22f5499200d
+ 266eb64de68
+ 269235f770b
+ 2758490e558
+ 2a203cf5d35
+ 2a878ec47ab
+ 2cb43eb2201
+ 393298e282b
+ 395abe6aac7
+ 3d19c7a4ca3
+ 44e2b719b1e
+ 45039819fcc
+ 4cb4ff01619
+ 4e5eb5712fa
+ 4e988765a6d
+ 5077bf42714
+ 55ed69b2622
+ 5ae3b651a37
+ 5ca1edeed4c
+ 5daa76b1c7f
+ 5fdd11dfae5
+ 6078cf180c2
+ 6682b309e9c
+ 6c02d2038c0
+ 71c595506c8
+ 73c8f960c18
+ 74ccc8fd057
+ 7a34156a798
+ 7ac7af9f59c
+ 7f2ec0ea179
+ 823b837b36c
+ 82d7600f9a3
+ 848161a2900
+ 88cedf129eb
+ 8dec106b6a6
+ 8e335d08ca4
+ 8ecf7c58193
+ 8fa55008beb
+ 90e09de2301
+ 9197acc0b9d
+ 954c25e876c
+ 98517d5563d
+ 99e717a0148
+ 9a0c0635bd7
+ 9ad436b7b3d
+ 9be351cbf14
+ 9e2a2e51798
+ a84a7ea9220
+ aa8cb84d3eb
+ b07977292da
+ b3e33456f0b
+ b7823de373e
+ bac379382d9
+ bd2d9bf67a3
+ c14584a84cd
+ c497170c970
+ cd8e767612b
+ d17917bb279
+ d42b9d432a9
+ d53d8857a85
+ d6808cf3d98
+ d6f509d1dd9
+ d7abd08e643
+ d83493bf974
+ d87293651ee
+ da9d4ac9e8e
+ daa1702791a
+ dcb12411c14
+ de9ab9cdd5b
+ df906c58a3c
+ e3870649eb5
+ ea90aed9b98
+ ecaa81b9711
+ efc1238665b
+ c5a65219daf
s23dr_2026_example/cache_scenes.py ADDED
@@ -0,0 +1,373 @@
+ #!/usr/bin/env python3
+ """Cache compact scenes from HoHo22k shards to training-ready .pt files.
+
+ Runs build_compact_scene + precomputes group_id, semantic class, and
+ normalization so training only needs fast sampling + GPU forward.
+
+ Usage:
+     python cache_scenes.py --data-dir data/ --out-dir cache/train
+     python cache_scenes.py --streaming --out-dir cache/train --limit 5000
+     python cache_scenes.py --data-dir data/ --out-dir cache/train --workers 4
+
+ Cache format per file (.pt):
+     xyz:         float32 [P, 3]  all points in world space
+     source:      uint8   [P]     0=colmap, 1=depth
+     group_id:    int8    [P]     priority tier 0-4, -1=excluded
+     class_id:    uint8   [P]     one-hot class index (0-12), see SEMANTIC_CLASSES
+     visible_src: uint8   [P]     for visualization (1=gestalt, 2=ade)
+     visible_id:  int16   [P]     for visualization (class id within space)
+     center:      float32 [3]     smart normalization center
+     scale:       float32 scalar  smart normalization scale
+     gt_vertices: float32 [V, 3]  ground truth wireframe vertices
+     gt_edges:    int32   [E, 2]  ground truth wireframe edge indices
+ """
+ from __future__ import annotations
+
+ import sys
+ from pathlib import Path as _Path
+ if __package__ is None or __package__ == "":
+     _here = _Path(__file__).resolve().parent
+     if str(_here.parent) not in sys.path:
+         sys.path.insert(0, str(_here.parent))
+     __package__ = _here.name
+
+ import argparse
+ import time
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+
+ from .point_fusion import (
+     FuserConfig, build_compact_scene,
+     GEST_ID_TO_NAME, ADE_ID_TO_NAME, NUM_GEST,
+ )
+
+ # ---------------------------------------------------------------------------
+ # Semantic class encoding: 11 structural + 1 other_house + 1 non_house = 13
+ # ---------------------------------------------------------------------------
+
+ # Each structural gestalt class gets its own one-hot bit.
+ STRUCTURAL_CLASSES = (
+     "apex", "eave_end_point", "flashing_end_point",  # point classes (tier 0)
+     "rake", "ridge", "eave", "hip", "valley",        # roof edges (tier 1)
+     "flashing", "step_flashing",
+     "roof",                                          # roof face (tier 2)
+ )
+ # Index 11 = other house part (door, window, siding, etc.)
+ # Index 12 = non-house / ADE / unlabeled
+ NUM_SEMANTIC_CLASSES = len(STRUCTURAL_CLASSES) + 2  # 13
+
+ # Priority tiers (same as tokenizer.py)
+ _GEST_NAME_TO_ID = {n: i for i, n in enumerate(GEST_ID_TO_NAME)}
+ _POINT_IDS = {_GEST_NAME_TO_ID[n] for n in ("apex", "eave_end_point", "flashing_end_point") if n in _GEST_NAME_TO_ID}
+ _EDGE_IDS = {_GEST_NAME_TO_ID[n] for n in ("rake", "ridge", "eave", "hip", "valley", "flashing", "step_flashing") if n in _GEST_NAME_TO_ID}
+ _FACE_IDS = {_GEST_NAME_TO_ID[n] for n in ("roof",) if n in _GEST_NAME_TO_ID}
+ _HOUSE_IDS = {_GEST_NAME_TO_ID[n] for n in (
+     "apex", "eave_end_point", "flashing_end_point",
+     "rake", "ridge", "eave", "hip", "valley", "flashing", "step_flashing",
+     "roof", "door", "garage", "window", "shutter", "fascia", "soffit",
+     "horizontal_siding", "vertical_siding", "brick", "concrete",
+     "other_wall", "trim", "post", "ground_line",
+ ) if n in _GEST_NAME_TO_ID}
+
+ _ADE_NAME_TO_ID = {n.lower(): i for i, n in enumerate(ADE_ID_TO_NAME)}
+ _ADE_HOUSE_IDS = {_ADE_NAME_TO_ID[n] for n in ("building;edifice", "house", "wall", "windowpane;window", "door;double;door") if n in _ADE_NAME_TO_ID}
+
+ _UNCLS_ID = _GEST_NAME_TO_ID.get("unclassified", -1)
+
+ # Map structural gestalt names to one-hot index
+ _STRUCTURAL_ONEHOT = {}
+ for idx, name in enumerate(STRUCTURAL_CLASSES):
+     gid = _GEST_NAME_TO_ID.get(name)
+     if gid is not None:
+         _STRUCTURAL_ONEHOT[gid] = idx
+
+
+ def _compute_group_and_class(visible_src, visible_id, behind_id, source):
+     """Compute priority group_id and semantic class_id per point (vectorized).
+
+     Args:
+         visible_src: uint8 [P] -- 0=unlabeled, 1=gestalt, 2=ade
+         visible_id:  int16 [P] -- class id within gestalt or ade space
+         behind_id:   int16 [P] -- behind-gestalt id (-1 if none)
+         source:      uint8 [P] -- 0=colmap, 1=depth
+
+     Returns:
+         group_id: int8  [P] -- priority tier 0-4, -1 for excluded (unclassified)
+         class_id: uint8 [P] -- one-hot class index 0-12
+     """
+     P = len(visible_src)
+     vsrc = visible_src.astype(np.int32)
+     vid = visible_id.astype(np.int32)
+     bid = behind_id.astype(np.int32)
+
+     # Effective gestalt id: prefer visible gestalt, fall back to behind
+     gest_id = np.full(P, -1, dtype=np.int32)
+     has_vis_gest = (vsrc == 1) & (vid >= 0)
+     has_behind = (bid >= 0) & ~has_vis_gest
+     gest_id[has_vis_gest] = vid[has_vis_gest]
+     gest_id[has_behind] = bid[has_behind]
+
+     # Exclude unclassified points
+     if _UNCLS_ID >= 0:
+         is_uncls = ((vsrc == 1) & (vid == _UNCLS_ID)) | (bid == _UNCLS_ID)
+         gest_id[is_uncls] = -1  # force excluded
+
+     # Build lookup arrays for gestalt id -> group and gestalt id -> class
+     max_gid = NUM_GEST
+     gid_to_group = np.full(max_gid, 4, dtype=np.int8)  # default: tier 4
+     gid_to_class = np.full(max_gid, NUM_SEMANTIC_CLASSES - 1, dtype=np.uint8)  # default: non-house
+
+     for gid in _POINT_IDS:
+         gid_to_group[gid] = 0
+     for gid in _EDGE_IDS:
+         gid_to_group[gid] = 1
+     for gid in _FACE_IDS:
+         gid_to_group[gid] = 2
+     for gid in _HOUSE_IDS - _POINT_IDS - _EDGE_IDS - _FACE_IDS:
+         gid_to_group[gid] = 3
+     for gid, onehot_idx in _STRUCTURAL_ONEHOT.items():
+         gid_to_class[gid] = onehot_idx
+     for gid in _HOUSE_IDS - set(_STRUCTURAL_ONEHOT.keys()):
+         gid_to_class[gid] = len(STRUCTURAL_CLASSES)  # other_house
+
+     # Apply lookup for points with valid gestalt ids
+     has_gest = gest_id >= 0
+     group_id = np.full(P, 4, dtype=np.int8)  # default: tier 4
+     class_id = np.full(P, NUM_SEMANTIC_CLASSES - 1, dtype=np.uint8)  # default: non-house
+
+     group_id[has_gest] = gid_to_group[gest_id[has_gest]]
+     class_id[has_gest] = gid_to_class[gest_id[has_gest]]
+
+     # ADE house points (no gestalt) get tier 3 + class_id = other_house
+     ade_house_arr = np.array(sorted(_ADE_HOUSE_IDS), dtype=np.int32)
+     is_ade_house = ~has_gest & (vsrc == 2) & (vid >= 0) & np.isin(vid, ade_house_arr)
+     group_id[is_ade_house] = 3
+     class_id[is_ade_house] = len(STRUCTURAL_CLASSES)  # other_house (index 11)
+
+     # Mark excluded points (unclassified) as -1
+     if _UNCLS_ID >= 0:
+         group_id[is_uncls] = -1
+         class_id[is_uncls] = NUM_SEMANTIC_CLASSES - 1
+
+     return group_id, class_id
+
+
+ def _compute_smart_center_scale(xyz, source, mad_k=2.5, percentile=95.0,
+                                 max_points=8000):
+     """Compute normalization center and scale from depth points with MAD filter."""
+     depth_mask = source == 1
+     ref = xyz[depth_mask] if depth_mask.any() else xyz
+     if ref.shape[0] == 0:
+         center = xyz.mean(axis=0)
+         scale = max(np.linalg.norm(xyz - center, axis=1).max(), 1e-6)
+         return center.astype(np.float32), np.float32(scale)
+
+     if ref.shape[0] > max_points:
+         idx = np.random.choice(ref.shape[0], max_points, replace=False)
+         ref = ref[idx]
+
+     center0 = np.median(ref, axis=0)
+     dist = np.linalg.norm(ref - center0, axis=1)
+     med = np.median(dist)
+     mad = max(np.median(np.abs(dist - med)), 1e-6)
+     inliers = dist <= (med + mad_k * mad)
+     if inliers.any():
+         ref = ref[inliers]
+
+     # Percentile bounding box
+     lo_f = (100.0 - percentile) * 0.5 / 100.0
+     sorted_v = np.sort(ref, axis=0)
+     n = sorted_v.shape[0]
+     lo_idx = max(0, min(n - 1, int(lo_f * (n - 1))))
+     hi_idx = max(0, min(n - 1, int((1.0 - lo_f) * (n - 1))))
+     low = sorted_v[lo_idx]
+     high = sorted_v[hi_idx]
+
+     center = 0.5 * (low + high)
+     scale = max(np.sqrt(((high - low) ** 2).sum()), 1e-6)
+     return center.astype(np.float32), np.float32(scale)
+
+
+ def _process_one(sample, cfg):
+     """Process a single HF sample into a cache dict. Returns (order_id, dict) or None."""
+     rng = np.random.RandomState()  # worker-local rng
+
+     n_edges = len(sample.get("wf_edges", []))
+     if n_edges == 0 or n_edges > 64:
+         return None
+
+     scene = build_compact_scene(sample, cfg, rng=rng)
+     if scene is None:
+         return None
+
+     gt_v = scene.get("gt_vertices")
+     gt_e = scene.get("gt_edges")
+     if gt_v is None or gt_e is None or len(gt_e) == 0:
+         return None
+
+     xyz = scene["xyz"]
+     source = scene["source"]
+     visible_src = scene["visible_src"]
+     visible_id = scene["visible_id"]
+     behind_id = scene["behind_gest_id"]
+
+     group_id, class_id = _compute_group_and_class(
+         visible_src, visible_id, behind_id, source
+     )
+
+     center, scale = _compute_smart_center_scale(xyz, source)
+
+     order_id = sample.get("order_id", "unknown")
+
+     return order_id, {
+         "xyz": xyz.astype(np.float32),
+         "source": source.astype(np.uint8),
+         "group_id": group_id,
+         "class_id": class_id,
+         "behind_gest_id": behind_id.astype(np.int16),
+         "visible_src": visible_src.astype(np.uint8),
+         "visible_id": visible_id.astype(np.int16),
+         "n_views_voted": scene["n_views_voted"],
+         "vote_frac": scene["vote_frac"],
+         "center": center,
+         "scale": scale,
+         "gt_vertices": gt_v.astype(np.float32),
+         "gt_edges": gt_e.astype(np.int32),
+     }
+
+
+ def main():
+     p = argparse.ArgumentParser(description="Cache compact scenes from HoHo22k")
+     g = p.add_mutually_exclusive_group(required=True)
+     g.add_argument("--data-dir", help="Local dir with shards")
+     g.add_argument("--streaming", action="store_true", help="Stream from HuggingFace")
+     p.add_argument("--out-dir", required=True, help="Output directory for .pt files")
+     p.add_argument("--limit", type=int, default=0)
+     p.add_argument("--depth-per-view", type=int, default=8000)
+     p.add_argument("--workers", type=int, default=0,
+                    help="Parallel workers (0=sequential)")
+     p.add_argument("--skip-existing", action="store_true",
+                    help="Skip samples whose .pt already exists in out-dir")
+     p.add_argument("--shard-start", type=int, default=0,
+                    help="First shard index (for parallel launches)")
+     p.add_argument("--shard-stride", type=int, default=1,
+                    help="Stride between shards (e.g. 8 means take every 8th shard)")
+     args = p.parse_args()
+
+     out_dir = Path(args.out_dir)
+     out_dir.mkdir(parents=True, exist_ok=True)
+     existing_ids = set(p.stem for p in out_dir.glob("*.pt")) if args.skip_existing else set()
+
+     # Load dataset
+     from datasets import load_dataset
+     if args.streaming:
+         ds = load_dataset(
+             "usm3d/hoho22k_2026_trainval",
+             streaming=True, trust_remote_code=True, split="train",
+         )
+     else:
+         data_root = Path(args.data_dir).resolve()
+         tars = []
+         for candidate in [data_root / "data" / "train", data_root / "train", data_root]:
+             if candidate.exists():
+                 tars = sorted(str(p) for p in candidate.glob("*.tar"))
+                 if tars:
+                     break
+         loader = None
+         for c in [data_root / "hoho22k_2026_trainval.py"]:
+             if c.exists():
+                 loader = c
+                 break
+         if loader is None:
+             found = list(data_root.rglob("hoho22k_2026_trainval.py"))
+             loader = found[0] if found else None
+         if loader is None:
+             raise FileNotFoundError("Cannot find loader script")
+         # Shard-level parallelism: each process handles a slice of tars
+         if args.shard_stride > 1:
+             tars = tars[args.shard_start::args.shard_stride]
+             print(f"Shard slice: start={args.shard_start} stride={args.shard_stride} -> {len(tars)} shards")
+         ds = load_dataset(str(loader), data_files={"train": tars},
+                           streaming=True, trust_remote_code=True, split="train")
+
+     cfg = FuserConfig(depth_points_per_view=args.depth_per_view)
+
+     saved = 0
+     skipped = 0
+     t_start = time.perf_counter()
+
+     if args.workers > 0:
+         # Parallel: collect samples into batches, process in worker pool.
+         # Note: HF streaming datasets can't be shared across workers, so we
+         # iterate in the main thread and dispatch processing to workers.
+         with ProcessPoolExecutor(max_workers=args.workers) as pool:
+             futures = {}
+             for i, sample in enumerate(ds):
+                 if args.limit > 0 and i >= args.limit:
+                     break
+                 oid = sample.get("order_id", "unknown")
+                 if oid in existing_ids:
+                     skipped += 1
+                     continue
+                 future = pool.submit(_process_one, sample, cfg)
+                 futures[future] = i
+
+                 # Drain completed futures to bound memory
+                 if len(futures) >= args.workers * 4:
+                     done = [f for f in futures if f.done()]
+                     for f in done:
+                         result = f.result()
+                         del futures[f]
+                         if result is None:
+                             skipped += 1
+                             continue
+                         order_id, data = result
+                         torch.save(data, out_dir / f"{order_id}.pt")
+                         saved += 1
+                         if saved % 50 == 0:
+                             elapsed = time.perf_counter() - t_start
+                             print(f"Saved {saved} (skipped {skipped}) "
+                                   f"[{saved / elapsed:.1f} samples/s]")
+
+             # Drain remaining
+             for f in as_completed(futures):
+                 result = f.result()
+                 if result is None:
+                     skipped += 1
+                     continue
+                 order_id, data = result
+                 torch.save(data, out_dir / f"{order_id}.pt")
+                 saved += 1
+     else:
+         # Sequential
+         for i, sample in enumerate(ds):
+             if args.limit > 0 and i >= args.limit:
+                 break
+             oid = sample.get("order_id", "unknown")
+             if oid in existing_ids:
+                 skipped += 1
+                 continue
+
+             result = _process_one(sample, cfg)
+             if result is None:
+                 skipped += 1
+                 continue
+             order_id, data = result
+             torch.save(data, out_dir / f"{order_id}.pt")
+             saved += 1
+
+             if saved % 50 == 0:
+                 elapsed = time.perf_counter() - t_start
+                 print(f"Saved {saved} (skipped {skipped}) "
+                       f"[{saved / elapsed:.1f} samples/s]")
+
+     elapsed = time.perf_counter() - t_start
+     print(f"Done. Saved {saved}, skipped {skipped} in {elapsed:.0f}s "
+           f"({saved / elapsed:.1f} samples/s)")
+
+
+ if __name__ == "__main__":
+     main()
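
Editor's note: a hedged sketch of a consumer of the cache format documented in the module docstring above (my example, not part of the upload; the path is illustrative, field names and dtypes follow the docstring):

    import torch

    scene = torch.load("cache/train/example.pt")    # illustrative path
    xyz = torch.as_tensor(scene["xyz"])             # [P, 3] world-space points
    keep = torch.as_tensor(scene["group_id"]) >= 0  # drop excluded (-1) points

    # Smart normalization: shift by the precomputed center, divide by scale.
    center = torch.as_tensor(scene["center"])
    xyz_norm = (xyz[keep] - center) / float(scene["scale"])

    gt_v = torch.as_tensor(scene["gt_vertices"])    # [V, 3]
    gt_e = torch.as_tensor(scene["gt_edges"])       # [E, 2] indices into gt_v
    print(xyz_norm.shape, gt_v.shape, gt_e.shape)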
s23dr_2026_example/color_mappings.py ADDED
@@ -0,0 +1,209 @@
+ gestalt_color_mapping = {
+     "unclassified": (215, 62, 138),
+     "apex": (235, 88, 48),
+     "eave_end_point": (248, 130, 228),
+     "flashing_end_point": (71, 11, 161),
+     "ridge": (214, 251, 248),
+     "rake": (13, 94, 47),
+     "eave": (54, 243, 63),
+     "post": (187, 123, 236),
+     "ground_line": (136, 206, 14),
+     "flashing": (162, 162, 32),
+     "step_flashing": (169, 255, 219),
+     "hip": (8, 89, 52),
+     "valley": (85, 27, 65),
+     "roof": (215, 232, 179),
+     "door": (110, 52, 23),
+     "garage": (50, 233, 171),
+     "window": (230, 249, 40),
+     "shutter": (122, 4, 233),
+     "fascia": (95, 230, 240),
+     "soffit": (2, 102, 197),
+     "horizontal_siding": (131, 88, 59),
+     "vertical_siding": (110, 187, 198),
+     "brick": (171, 252, 7),
+     "concrete": (32, 47, 246),
+     "other_wall": (112, 61, 240),
+     "trim": (151, 206, 58),
+     "unknown": (127, 127, 127),
+     "transition_line": (0, 0, 0),
+ }
+
+ ade20k_color_mapping = {
+     'wall': (120, 120, 120),
+     'building;edifice': (180, 120, 120),
+     'sky': (6, 230, 230),
+     'floor;flooring': (80, 50, 50),
+     'tree': (4, 200, 3),
+     'ceiling': (120, 120, 80),
+     'road;route': (140, 140, 140),
+     'bed': (204, 5, 255),
+     'windowpane;window': (230, 230, 230),
+     'grass': (4, 250, 7),
+     'cabinet': (224, 5, 255),
+     'sidewalk;pavement': (235, 255, 7),
+     'person;individual;someone;somebody;mortal;soul': (150, 5, 61),
+     'earth;ground': (120, 120, 70),
+     'door;double;door': (8, 255, 51),
+     'table': (255, 6, 82),
+     'mountain;mount': (143, 255, 140),
+     'plant;flora;plant;life': (204, 255, 4),
+     'curtain;drape;drapery;mantle;pall': (255, 51, 7),
+     'chair': (204, 70, 3),
+     'car;auto;automobile;machine;motorcar': (0, 102, 200),
+     'water': (61, 230, 250),
+     'painting;picture': (255, 6, 51),
+     'sofa;couch;lounge': (11, 102, 255),
+     'shelf': (255, 7, 71),
+     'house': (255, 9, 224),
+     'sea': (9, 7, 230),
+     'mirror': (220, 220, 220),
+     'rug;carpet;carpeting': (255, 9, 92),
+     'field': (112, 9, 255),
+     'armchair': (8, 255, 214),
+     'seat': (7, 255, 224),
+     'fence;fencing': (255, 184, 6),
+     'desk': (10, 255, 71),
+     'rock;stone': (255, 41, 10),
+     'wardrobe;closet;press': (7, 255, 255),
+     'lamp': (224, 255, 8),
+     'bathtub;bathing;tub;bath;tub': (102, 8, 255),
+     'railing;rail': (255, 61, 6),
+     'cushion': (255, 194, 7),
+     'base;pedestal;stand': (255, 122, 8),
+     'box': (0, 255, 20),
+     'column;pillar': (255, 8, 41),
+     'signboard;sign': (255, 5, 153),
+     'chest;of;drawers;chest;bureau;dresser': (6, 51, 255),
+     'counter': (235, 12, 255),
+     'sand': (160, 150, 20),
+     'sink': (0, 163, 255),
+     'skyscraper': (140, 140, 140),
+     'fireplace;hearth;open;fireplace': (250, 10, 15),
+     'refrigerator;icebox': (20, 255, 0),
+     'grandstand;covered;stand': (31, 255, 0),
+     'path': (255, 31, 0),
+     'stairs;steps': (255, 224, 0),
+     'runway': (153, 255, 0),
+     'case;display;case;showcase;vitrine': (0, 0, 255),
+     'pool;table;billiard;table;snooker;table': (255, 71, 0),
+     'pillow': (0, 235, 255),
+     'screen;door;screen': (0, 173, 255),
+     'stairway;staircase': (31, 0, 255),
+     'river': (11, 200, 200),
+     'bridge;span': (255, 82, 0),
+     'bookcase': (0, 255, 245),
+     'blind;screen': (0, 61, 255),
+     'coffee;table;cocktail;table': (0, 255, 112),
+     'toilet;can;commode;crapper;pot;potty;stool;throne': (0, 255, 133),
+     'flower': (255, 0, 0),
+     'book': (255, 163, 0),
+     'hill': (255, 102, 0),
+     'bench': (194, 255, 0),
+     'countertop': (0, 143, 255),
+     'stove;kitchen;stove;range;kitchen;range;cooking;stove': (51, 255, 0),
+     'palm;palm;tree': (0, 82, 255),
+     'kitchen;island': (0, 255, 41),
+     'computer;computing;machine;computing;device;data;processor;electronic;computer;information;processing;system': (0, 255, 173),
+     'swivel;chair': (10, 0, 255),
+     'boat': (173, 255, 0),
+     'bar': (0, 255, 153),
+     'arcade;machine': (255, 92, 0),
+     'hovel;hut;hutch;shack;shanty': (255, 0, 255),
+     'bus;autobus;coach;charabanc;double-decker;jitney;motorbus;motorcoach;omnibus;passenger;vehicle': (255, 0, 245),
+     'towel': (255, 0, 102),
+     'light;light;source': (255, 173, 0),
+     'truck;motortruck': (255, 0, 20),
+     'tower': (255, 184, 184),
+     'chandelier;pendant;pendent': (0, 31, 255),
+     'awning;sunshade;sunblind': (0, 255, 61),
+     'streetlight;street;lamp': (0, 71, 255),
+     'booth;cubicle;stall;kiosk': (255, 0, 204),
+     'television;television;receiver;television;set;tv;tv;set;idiot;box;boob;tube;telly;goggle;box': (0, 255, 194),
+     'airplane;aeroplane;plane': (0, 255, 82),
+     'dirt;track': (0, 10, 255),
+     'apparel;wearing;apparel;dress;clothes': (0, 112, 255),
+     'pole': (51, 0, 255),
+     'land;ground;soil': (0, 194, 255),
+     'bannister;banister;balustrade;balusters;handrail': (0, 122, 255),
+     'escalator;moving;staircase;moving;stairway': (0, 255, 163),
+     'ottoman;pouf;pouffe;puff;hassock': (255, 153, 0),
+     'bottle': (0, 255, 10),
+     'buffet;counter;sideboard': (255, 112, 0),
+     'poster;posting;placard;notice;bill;card': (143, 255, 0),
+     'stage': (82, 0, 255),
+     'van': (163, 255, 0),
+     'ship': (255, 235, 0),
+     'fountain': (8, 184, 170),
+     'conveyer;belt;conveyor;belt;conveyer;conveyor;transporter': (133, 0, 255),
+     'canopy': (0, 255, 92),
+     'washer;automatic;washer;washing;machine': (184, 0, 255),
+     'plaything;toy': (255, 0, 31),
+     'swimming;pool;swimming;bath;natatorium': (0, 184, 255),
+     'stool': (0, 214, 255),
+     'barrel;cask': (255, 0, 112),
+     'basket;handbasket': (92, 255, 0),
+     'waterfall;falls': (0, 224, 255),
+     'tent;collapsible;shelter': (112, 224, 255),
+     'bag': (70, 184, 160),
+     'minibike;motorbike': (163, 0, 255),
+     'cradle': (153, 0, 255),
+     'oven': (71, 255, 0),
+     'ball': (255, 0, 163),
+     'food;solid;food': (255, 204, 0),
+     'step;stair': (255, 0, 143),
+     'tank;storage;tank': (0, 255, 235),
+     'trade;name;brand;name;brand;marque': (133, 255, 0),
+     'microwave;microwave;oven': (255, 0, 235),
+     'pot;flowerpot': (245, 0, 255),
+     'animal;animate;being;beast;brute;creature;fauna': (255, 0, 122),
+     'bicycle;bike;wheel;cycle': (255, 245, 0),
+     'lake': (10, 190, 212),
+     'dishwasher;dish;washer;dishwashing;machine': (214, 255, 0),
+     'screen;silver;screen;projection;screen': (0, 204, 255),
+     'blanket;cover': (20, 0, 255),
+     'sculpture': (255, 255, 0),
+     'hood;exhaust;hood': (0, 153, 255),
+     'sconce': (0, 41, 255),
+     'vase': (0, 255, 204),
+     'traffic;light;traffic;signal;stoplight': (41, 0, 255),
+     'tray': (41, 255, 0),
+     'ashcan;trash;can;garbage;can;wastebin;ash;bin;ash-bin;ashbin;dustbin;trash;barrel;trash;bin': (173, 0, 255),
+     'fan': (0, 245, 255),
+     'pier;wharf;wharfage;dock': (71, 0, 255),
+     'crt;screen': (122, 0, 255),
+     'plate': (0, 255, 184),
+     'monitor;monitoring;device': (0, 92, 255),
+     'bulletin;board;notice;board': (184, 255, 0),
+     'shower': (0, 133, 255),
+     'radiator': (255, 214, 0),
+     'glass;drinking;glass': (25, 194, 194),
+     'clock': (102, 255, 0),
+     'flag': (92, 0, 255),
+ }
+
+
+ EDGE_CLASSES = {
+     'cornice_return': 0,
+     'cornice_strip': 1,
+     'eave': 2,
+     'flashing': 3,
+     'hip': 4,
+     'rake': 5,
+     'ridge': 6,
+     'step_flashing': 7,
+     'transition_line': 8,
+     'valley': 9,
+ }
+ EDGE_CLASSES_BY_ID = {v: k for k, v in EDGE_CLASSES.items()}
+
+ edge_color_mapping = {
+     'cornice_return': (215, 62, 138),
+     'cornice_strip': (235, 88, 48),
+     'eave': (54, 243, 63),
+     'flashing': (162, 162, 32),
+     'hip': (8, 89, 52),
+     'rake': (13, 94, 47),
+     'ridge': (214, 251, 248),
+     'step_flashing': (169, 255, 219),
+     'transition_line': (200, 0, 50),
+     'valley': (85, 27, 65),
+ }
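
Editor's note: one common downstream use of these mappings is as an id-indexed lookup table for colorizing per-point labels. A small sketch (mine, not part of the upload), assuming gestalt class ids follow the insertion order of gestalt_color_mapping, i.e. the GEST_ID_TO_NAME convention referenced in cache_scenes.py:

    import numpy as np
    from s23dr_2026_example.color_mappings import gestalt_color_mapping

    # [C, 3] uint8 LUT in dict insertion order (assumed to match class ids)
    lut = np.array(list(gestalt_color_mapping.values()), dtype=np.uint8)

    def colorize(class_ids: np.ndarray) -> np.ndarray:
        """Map per-point gestalt ids [P] to RGB [P, 3]; out-of-range ids clamp."""
        return lut[np.clip(class_ids, 0, len(lut) - 1)]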
s23dr_2026_example/data.py ADDED
@@ -0,0 +1,237 @@
+ """Data loading for pre-sampled HF datasets.
+
+ Expects pre-sampled npz blobs with xyz_norm [2048, 3] (not full PCD).
+ Use make_sampled_cache.py to produce these from full point clouds.
+ """
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+
+ from .tokenizer import EdgeDepthSequenceConfig
+
+ # Default token budget (must match make_sampled_cache.py)
+ SEQ_LEN = 2048
+ COLMAP_POINTS = 1536
+ DEPTH_POINTS = 512
+
+
+ # ---------------------------------------------------------------------------
+ # Datasets
+ # ---------------------------------------------------------------------------
+
+ def _load_bad_sample_ids():
+     """Load the set of known-bad sample IDs (misaligned GT, extreme scale)."""
+     bad_file = Path(__file__).parent / "bad_samples.txt"
+     if not bad_file.exists():
+         return set()
+     return set(line.strip() for line in bad_file.read_text().splitlines() if line.strip())
+
+
+ class HFCachedDataset(torch.utils.data.Dataset):
+     """Load pre-sampled HuggingFace dataset into memory."""
+
+     def __init__(self, hf_dataset, aug_rotate=False, aug_jitter=0.0,
+                  aug_drop=0.0, aug_flip=False):
+         import io as _io
+         bad_ids = _load_bad_sample_ids()
+         print(f"Pre-decoding {len(hf_dataset)} samples into memory...")
+         self.samples = []
+         self.order_ids = []
+         n_skipped = 0
+         for i, sample in enumerate(hf_dataset):
+             if sample["order_id"] in bad_ids:
+                 n_skipped += 1
+                 continue
+             d = dict(np.load(_io.BytesIO(sample["data"])))
+             if "xyz_norm" not in d:
+                 raise ValueError(
+                     f"Sample {sample['order_id']} missing 'xyz_norm' -- this looks like "
+                     f"a full PCD dataset, not pre-sampled. Use make_sampled_cache.py first.")
+             self.samples.append(d)
+             self.order_ids.append(sample["order_id"])
+             if (i + 1) % 2000 == 0:
+                 print(f"  {i+1}/{len(hf_dataset)}...")
+         print(f"  Done. {len(self.samples)} samples in memory"
+               f" ({n_skipped} bad samples filtered).")
+         self.aug_rotate = aug_rotate
+         self.aug_jitter = aug_jitter
+         self.aug_drop = aug_drop
+         self.aug_flip = aug_flip
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         out = _process_sample(self.samples[idx], self.aug_rotate,
+                               self.aug_jitter, self.aug_drop, self.aug_flip)
+         out["sample_id"] = self.order_ids[idx]
+         return out
+
+
+ def _process_sample(d, aug_rotate, aug_jitter=0.0, aug_drop=0.0, aug_flip=False):
+     """Process a pre-sampled npz dict into training tensors.
+
+     Args:
+         aug_rotate: random yaw rotation
+         aug_jitter: std of Gaussian noise added to point positions (0=disabled)
+         aug_drop:   fraction of points to randomly drop (0=disabled)
+         aug_flip:   random mirror along X axis (50% chance)
+     """
+     xyz_norm = d["xyz_norm"].copy()
+     gt_seg = d["gt_segments"].copy()
+     mask = d["mask"].copy()
+
+     if aug_rotate:
+         theta = np.random.rand() * 2 * np.pi
+         cos_t, sin_t = np.cos(theta), np.sin(theta)
+         x, z = xyz_norm[:, 0].copy(), xyz_norm[:, 2].copy()
+         xyz_norm[:, 0] = x * cos_t - z * sin_t
+         xyz_norm[:, 2] = x * sin_t + z * cos_t
+         for ep in range(2):
+             sx, sz = gt_seg[:, ep, 0].copy(), gt_seg[:, ep, 2].copy()
+             gt_seg[:, ep, 0] = sx * cos_t - sz * sin_t
+             gt_seg[:, ep, 2] = sx * sin_t + sz * cos_t
+
+     if aug_flip and np.random.rand() < 0.5:
+         xyz_norm[:, 0] = -xyz_norm[:, 0]
+         gt_seg[:, :, 0] = -gt_seg[:, :, 0]
+
+     if aug_jitter > 0:
+         valid = mask.astype(bool)
+         xyz_norm[valid] += np.random.randn(valid.sum(), 3).astype(np.float32) * aug_jitter
+
+     if aug_drop > 0:
+         valid_idx = np.where(mask)[0]
+         n_drop = int(len(valid_idx) * aug_drop)
+         if n_drop > 0:
+             drop_idx = np.random.choice(valid_idx, n_drop, replace=False)
+             mask[drop_idx] = False
+
+     result = {
+         "xyz_norm": torch.as_tensor(xyz_norm, dtype=torch.float32),
+         "class_id": torch.as_tensor(d["class_id"], dtype=torch.long),
+         "source": torch.as_tensor(d["source"], dtype=torch.long),
+         "mask": torch.as_tensor(mask),
+         "gt_segments": torch.as_tensor(gt_seg, dtype=torch.float32),
+         "scale": torch.tensor(float(d["scale"]), dtype=torch.float32),
+         "center": torch.as_tensor(d["center"], dtype=torch.float32),
+         "gt_vertices": d["gt_vertices"],
+         "gt_edges": d["gt_edges"],
+         "visible_src": torch.as_tensor(d["visible_src"], dtype=torch.long),
+         "visible_id": torch.as_tensor(d["visible_id"], dtype=torch.long),
+     }
+     if "behind" in d:
+         result["behind"] = torch.as_tensor(
+             np.clip(np.asarray(d["behind"], dtype=np.int16), 0, None), dtype=torch.long)
+     if "n_views_voted" in d:
+         result["n_views_voted"] = torch.as_tensor(d["n_views_voted"], dtype=torch.float32)
+     if "vote_frac" in d:
+         result["vote_frac"] = torch.as_tensor(d["vote_frac"], dtype=torch.float32)
+     if "gt_edge_classes" in d:
+         result["gt_edge_classes"] = torch.as_tensor(
+             np.asarray(d["gt_edge_classes"], dtype=np.int64), dtype=torch.long)
+     return result
+
+
+ # ---------------------------------------------------------------------------
+ # Collation + DataLoader
+ # ---------------------------------------------------------------------------
+
+ def collate(batch):
+     """Stack samples into batched tensors."""
+     out = {
+         "xyz_norm": torch.stack([d["xyz_norm"] for d in batch]),
+         "class_id": torch.stack([d["class_id"] for d in batch]),
+         "source": torch.stack([d["source"] for d in batch]),
+         "mask": torch.stack([d["mask"] for d in batch]),
+         "gt_segments": [d["gt_segments"] for d in batch],
+         "scales": torch.stack([d["scale"] for d in batch]),
+         "meta": batch,
+     }
+     # Optional fields: check ALL samples, not just batch[0].
+     # If any sample has it, all must have it (no mixed data versions).
+     for field in ("behind", "n_views_voted", "vote_frac"):
+         if any(field in d for d in batch):
+             missing = [i for i, d in enumerate(batch) if field not in d]
+             if missing:
+                 raise KeyError(
+                     f"Field '{field}' present in some batch samples but missing in "
+                     f"{len(missing)}/{len(batch)}. Mixed data versions in cache?")
+             out[field] = torch.stack([d[field] for d in batch])
+     # gt_edge_classes: variable length per sample (like gt_segments), keep as list
+     if any("gt_edge_classes" in d for d in batch):
+         missing = [i for i, d in enumerate(batch) if "gt_edge_classes" not in d]
+         if missing:
+             raise KeyError(
+                 f"Field 'gt_edge_classes' present in some batch samples but missing in "
+                 f"{len(missing)}/{len(batch)}. Mixed data versions in cache?")
+         out["gt_edge_classes"] = [d["gt_edge_classes"] for d in batch]
+     return out
+
+
+ def build_loader(cache_dir, batch_size, aug_rotate=False, aug_jitter=0.0,
+                  aug_drop=0.0, aug_flip=False):
+     """Create a DataLoader from HF dataset.
+
+     cache_dir should be 'hf://repo/name:split' format.
+     """
+     if not cache_dir.startswith("hf://"):
+         raise ValueError(
+             f"cache_dir must be 'hf://repo:split' format, got: {cache_dir}. "
+             f"Local .pt caches are no longer supported in the training path.")
+     parts = cache_dir[5:].split(":")
+     repo = parts[0]
+     split = parts[1] if len(parts) > 1 else "train"
+     from datasets import load_dataset
+     hf_ds = load_dataset(repo, split=split)
+     ds = HFCachedDataset(hf_ds, aug_rotate=aug_rotate, aug_jitter=aug_jitter,
+                          aug_drop=aug_drop, aug_flip=aug_flip)
+     loader = torch.utils.data.DataLoader(
+         ds, batch_size=batch_size, shuffle=True,
+         num_workers=0, collate_fn=collate,
+     )
+     print(f"Dataset: {len(ds)} scenes, batch_size={batch_size}")
+     return loader
+
+
+ # ---------------------------------------------------------------------------
+ # Token building (GPU)
+ # ---------------------------------------------------------------------------
+
+ def build_tokens(batch, model, device):
+     """Apply Fourier features + learned embeddings on GPU."""
+     xyz = batch["xyz_norm"].to(device)
+     cid = batch["class_id"].to(device)
+     src = batch["source"].to(device)
+     masks = batch["mask"].to(device)
+     gt = [g.to(device) for g in batch["gt_segments"]]
+     scales = batch["scales"]
+
+     B, T, _ = xyz.shape
+     tok = model.tokenizer
+     fourier = tok.pos_enc(xyz.reshape(-1, 3)).reshape(B, T, -1) \
+         if tok.pos_enc is not None else xyz.new_zeros(B, T, 0)
+     parts = [xyz, fourier, tok.label_emb(cid), tok.src_emb(src.clamp(0, 1))]
+     if tok.behind_emb_dim > 0:
+         if "behind" in batch:
+             beh = batch["behind"].to(device)
+         else:
+             # Data doesn't have behind -- use zeros (embed index 0).
+             # This is intentional for eval on old data; for training,
+             # fail fast by requiring the field (checked in _process_sample).
+             beh = xyz.new_zeros(B, T, dtype=torch.long)
+         parts.append(tok.behind_emb(beh))
+     if tok.use_vote_features:
+         if "n_views_voted" not in batch or "vote_frac" not in batch:
+             raise KeyError(
+                 "Model expects vote features (--vote-features) but data is missing "
+                 "'n_views_voted'/'vote_frac'. Use v2 dataset or regenerate cache.")
+         # Normalize to ~zero mean, unit variance (dataset stats: nv~2.7+/-1.0, vf~0.5+/-0.25)
+         nv = ((batch["n_views_voted"].to(device).float() - 2.7) / 1.0).unsqueeze(-1)
+         vf = ((batch["vote_frac"].to(device).float() - 0.5) / 0.25).unsqueeze(-1)
+         parts.extend([nv, vf])
+     tokens = torch.cat(parts, dim=-1)
+     return tokens, masks, gt, scales, batch["meta"]
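
Editor's note: to show how the pieces above fit together, a hedged end-to-end sketch (mine, not part of the upload). The hf:// repo name is a placeholder, and the stub model only exposes the .tokenizer attributes that build_tokens actually reads:

    import torch
    import torch.nn as nn
    from s23dr_2026_example.data import build_loader, build_tokens

    class _StubTokenizer(nn.Module):
        """Minimal stand-in exposing the attributes build_tokens reads."""
        def __init__(self, n_classes=13, emb=8):
            super().__init__()
            self.pos_enc = None             # skip Fourier features in this sketch
            self.label_emb = nn.Embedding(n_classes, emb)
            self.src_emb = nn.Embedding(2, emb)
            self.behind_emb_dim = 0         # no behind-embedding branch
            self.use_vote_features = False

    class _StubModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.tokenizer = _StubTokenizer()

    model = _StubModel()
    loader = build_loader("hf://usm3d/some-presampled-cache:train",  # placeholder repo
                          batch_size=4, aug_rotate=True, aug_flip=True)

    batch = next(iter(loader))
    tokens, masks, gt, scales, meta = build_tokens(batch, model, "cpu")
    # tokens = [xyz | class-emb | source-emb] -> [4, 2048, 3 + 8 + 8]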
s23dr_2026_example/losses.py ADDED
@@ -0,0 +1,311 @@
+ """Loss computation for wireframe prediction."""
+ from __future__ import annotations
+
+ import torch
+
+ from .varifold import varifold_loss_batch
+ from .sinkhorn import batched_sinkhorn_loss
+ from .soft_hss_loss import batched_sinkhorn_vertex_f1, batched_soft_hss_v2
+
+ # Varifold config
+ VARIANT = "simpson3"
+ SIGMAS = [0.5, 1.0, 2.0]  # meters (divided by per-scene scale at runtime)
+ ALPHAS = [0.2, 0.6, 0.2]
+ LEN_POW = 1.0
+ VARIFOLD_CROSS_ONLY = False  # Set to True to drop self-energy (avoids O(S^2) blowup)
+
+ # Sinkhorn config (note: near-zero gradients at eps=0.05, effectively disabled)
+ SINKHORN_EPS = 0.05
+ SINKHORN_ITERS = 10
+
+ # Distance thresholds in meters (divided by per-scene scale at runtime)
+ VERTEX_THRESH_M = 0.5  # vertex match threshold (mirrors real HSS)
+ TUBE_RADIUS_M = 0.5    # tube IoU radius (mirrors real HSS)
+
+ # Sinkhorn dustbin cost: controls the OT "not matching" penalty.
+ # Like tau, this is an OT behavior parameter, NOT a physical distance.
+ # Must be comparable to typical matching costs in normalized space (~0.1).
+ # Do NOT divide by scale.
+ SINKHORN_DUSTBIN = 0.1
+
+ # Sigmoid temperature: controls gradient smoothness, NOT a distance threshold.
+ # Must stay large enough in normalized space to provide useful gradients.
+ # Do NOT divide by scale (unlike the thresholds above).
+ SIGMOID_TAU = 0.05
+
+ MAX_GT = 64  # fixed pad size for compile-friendly shapes
+
+ # Precomputed constants (created once on first call)
+ _loss_constants = {}
+
+
+ def _get_loss_constants(device, dtype):
+     key = (device, dtype)
+     if key not in _loss_constants:
+         _loss_constants[key] = {
+             "sigmas": torch.tensor(SIGMAS, device=device, dtype=dtype),
+             "alphas": torch.tensor(ALPHAS, device=device, dtype=dtype),
+         }
+     return _loss_constants[key]
+
+
+ def pad_gt_fixed(gt_list, device, dtype):
+     """Pad GT segments to fixed MAX_GT for compile-friendly shapes."""
+     B = len(gt_list)
+     gt_pad = torch.zeros((B, MAX_GT, 2, 3), device=device, dtype=dtype)
+     gt_mask = torch.zeros((B, MAX_GT), device=device, dtype=torch.bool)
+     gt_lengths = torch.zeros(B, device=device, dtype=dtype)
+     for i, g in enumerate(gt_list):
+         n = g.shape[0]
+         if n > 0:
+             gt_pad[i, :n] = g
+             gt_mask[i, :n] = True
+             gt_lengths[i] = torch.linalg.norm(g[:, 1] - g[:, 0], dim=-1).sum()
+     return gt_pad, gt_mask, gt_lengths
+
+
+ def _loss_inner(pred_segments, gt_pad, gt_mask, gt_lengths, scales,
+                 sigmas, alphas, varifold_w, vertex_f1_w):
+     """Pure tensor loss -- no Python control flow, no boolean indexing."""
+     has_gt = (gt_lengths > 0).float()
+
+     sigmas_eff = sigmas / scales[:, None]
+     loss_batch = varifold_loss_batch(
+         pred_segments, gt_pad, gt_mask=gt_mask,
+         variant=VARIANT, sigmas=sigmas_eff, alpha=alphas, len_pow=LEN_POW,
+         cross_only=VARIFOLD_CROSS_ONLY,
+     )
+     v = loss_batch / gt_lengths.clamp(min=1.0)
+     v = (v * has_gt).sum() / has_gt.sum().clamp(min=1.0)
+
+     thresh = VERTEX_THRESH_M / scales
+     f1 = batched_sinkhorn_vertex_f1(
+         pred_segments, gt_pad, gt_mask, thresh=thresh, tau=SIGMOID_TAU)
+     f1 = (f1 * has_gt).sum() / has_gt.sum().clamp(min=1.0)
+
+     total = varifold_w * v + vertex_f1_w * f1
+     return total, v, f1
+
+
+ # Will be replaced with compiled version on CUDA
+ _loss_fn = _loss_inner
+
+
+ def _conf_match_loss(pred_segments, gt_pad, gt_mask, conf_logits, scales):
+     """Auxiliary BCE loss: train conf to predict whether each segment matches GT.
+
+     Computes per-segment min-distance to GT, creates soft match target via
+     sigmoid thresholding, and returns BCE(sigmoid(conf), target).
+     """
+     B, S = pred_segments.shape[:2]
+     # Decoupled cost: midpoint + direction + length (same as sinkhorn)
+     p0, p1 = pred_segments[:, :, 0], pred_segments[:, :, 1]
+     g0, g1 = gt_pad[:, :, 0], gt_pad[:, :, 1]
+     mid_p, half_p = 0.5 * (p0 + p1), 0.5 * (p1 - p0)
+     mid_g, half_g = 0.5 * (g0 + g1), 0.5 * (g1 - g0)
+     d_mid = torch.linalg.norm(mid_p.unsqueeze(2) - mid_g.unsqueeze(1), dim=-1)
+     len_p = torch.linalg.norm(half_p, dim=-1, keepdim=True).clamp(min=1e-6)
+     len_g = torch.linalg.norm(half_g, dim=-1, keepdim=True).clamp(min=1e-6)
+     dir_p = half_p / len_p
+     dir_g = half_g / len_g
+     cos_angle = (dir_p.unsqueeze(2) * dir_g.unsqueeze(1)).sum(dim=-1)
+     d_dir = 1.0 - cos_angle.abs()
+     d_len = (len_p.unsqueeze(2) - len_g.unsqueeze(1)).squeeze(-1).abs()
+     cost = d_mid + d_dir + d_len  # [B, S, M]
+
+     # Mask invalid GT with high cost
+     cost = torch.where(gt_mask.unsqueeze(1), cost, cost.new_tensor(1e6))
+     min_dist = cost.min(dim=2).values  # [B, S]
+
+     # Soft target: sigmoid((thresh - dist) / tau), in normalized space
+     thresh = VERTEX_THRESH_M / scales  # [B]
+     target = torch.sigmoid((thresh[:, None] - min_dist) / SIGMOID_TAU)
+
+     return torch.nn.functional.binary_cross_entropy_with_logits(
+         conf_logits, target.detach(), reduction="mean")
+
+
+ def compute_loss(pred_segments, gt_list, scales, device,
+                  varifold_w, sinkhorn_w, vertex_f1_w=0.0, soft_hss_w=0.0,
+                  endpoint_w=0.0,
+                  conf_logits=None, conf_weight=0.0, conf_mode="match",
+                  sinkhorn_eps=None, sinkhorn_iters=None,
+                  sinkhorn_dustbin=None, conf_clamp_min=None):
+     """Combined loss with fixed-size GT padding.
+
+     conf_mode: "match" = BCE matching supervision, "sinkhorn" = conf-weighted sinkhorn.
+     """
+     if conf_logits is not None and conf_clamp_min is not None:
+         conf_logits = conf_logits.clamp(min=conf_clamp_min)
+     gt_pad, gt_mask, gt_lengths = pad_gt_fixed(gt_list, device, pred_segments.dtype)
+     c = _get_loss_constants(device, pred_segments.dtype)
+
+     total, v, f1 = _loss_fn(
+         pred_segments, gt_pad, gt_mask, gt_lengths, scales,
+         c["sigmas"], c["alphas"], varifold_w, vertex_f1_w)
+
+     terms = {}
+     if varifold_w > 0:
+         terms["varifold"] = v.detach()
+     if vertex_f1_w > 0:
+         terms["vertex_f1"] = f1.detach()
+
+     if sinkhorn_w > 0:
+         has_gt = (gt_lengths > 0).float()
+         if conf_logits is not None and conf_mode == "sinkhorn":
+             pred_mass = torch.sigmoid(conf_logits)
+         elif conf_logits is not None and conf_mode == "sinkhorn_detach":
+             pred_mass = torch.sigmoid(conf_logits.detach())
+         else:
+             pred_mass = None
+         eps = sinkhorn_eps if sinkhorn_eps is not None else SINKHORN_EPS
+         iters = sinkhorn_iters if sinkhorn_iters is not None else SINKHORN_ITERS
+         dustbin = sinkhorn_dustbin if sinkhorn_dustbin is not None else SINKHORN_DUSTBIN
+         S = pred_segments.shape[1]
+         sink_per = batched_sinkhorn_loss(
+             pred_segments, gt_pad, gt_mask,
+             eps, iters, dustbin,
+             pred_mass=pred_mass,
+         ) / (gt_lengths.clamp(min=1.0) * S)
+         s = (sink_per * has_gt).sum() / has_gt.sum().clamp(min=1.0)
+         total = total + sinkhorn_w * s
+         terms["sinkhorn"] = s.detach()
+
+     if soft_hss_w > 0:
+         has_gt = (gt_lengths > 0).float()
+         vert_thresh = VERTEX_THRESH_M / scales
+         edge_thresh = TUBE_RADIUS_M / scales
+         hss_loss = batched_soft_hss_v2(
+             pred_segments, gt_pad, gt_mask,
+             vert_thresh=vert_thresh, edge_thresh=edge_thresh, tau=SIGMOID_TAU)
+         hs = (hss_loss * has_gt).sum() / has_gt.sum().clamp(min=1.0)
+         total = total + soft_hss_w * hs
+         terms["soft_hss"] = hs.detach()
+
+     if conf_logits is not None and conf_weight > 0:
+         if conf_mode == "match":
+             # Explicit BCE supervision from nearest-GT distances
+             cl = _conf_match_loss(pred_segments, gt_pad, gt_mask, conf_logits, scales)
+             total = total + conf_weight * cl
+             terms["conf"] = cl.detach()
+         elif conf_mode in ("sinkhorn", "sinkhorn_detach"):
+             # Conf trained through sinkhorn transport gradients (via pred_mass).
+             # sinkhorn_detach: pred_mass uses detached conf, so OT can't push conf negative.
+             # Add count regularizer to prevent all-zero conf collapse.
+             # Normalized by S so magnitude doesn't depend on segment count.
+             conf_w = torch.sigmoid(conf_logits)
+             S = conf_logits.shape[1]
+             gt_counts = gt_mask.sum(dim=1).float()
+             conf_sum = conf_w.sum(dim=1)
+             reg = (((conf_sum - gt_counts) / S) ** 2).mean()
+             total = total + conf_weight * reg
+             terms["conf_reg"] = reg.detach()
+         elif conf_mode == "varifold":
+             # Conf-weighted varifold: weight each pred segment's contribution
+             # by sigmoid(conf). Low-conf segments contribute less to the loss.
+             # Needs regularizer to prevent all-zero conf collapse.
+             has_gt = (gt_lengths > 0).float()
+             conf_w = torch.sigmoid(conf_logits)  # [B, S]
+             sigmas_eff = c["sigmas"] / scales[:, None]
+             vf_conf = varifold_loss_batch(
+                 pred_segments, gt_pad, gt_mask=gt_mask,
+                 variant=VARIANT, sigmas=sigmas_eff, alpha=c["alphas"],
+                 len_pow=LEN_POW, pred_weights=conf_w,
+             )
+             vc = (vf_conf / gt_lengths.clamp(min=1.0))
+             vc = (vc * has_gt).sum() / has_gt.sum().clamp(min=1.0)
+             # Regularizer: penalize total conf being far from n_gt.
+             # Normalized by S so magnitude doesn't depend on segment count.
+             S = conf_logits.shape[1]
+             gt_counts = gt_mask.sum(dim=1).float()  # [B]
+             conf_sum = conf_w.sum(dim=1)  # [B]
+             reg = (((conf_sum - gt_counts) / S) ** 2).mean()
+             total = total + conf_weight * vc + 0.01 * reg
+             terms["conf_vf"] = vc.detach()
+             terms["conf_reg"] = reg.detach()
+         else:
+             raise ValueError(f"Unknown conf_mode: {conf_mode}")
+
+     if endpoint_w > 0:
+         has_gt = (gt_lengths > 0).float()
+         eps_ep = sinkhorn_eps if sinkhorn_eps is not None else SINKHORN_EPS
+         iters_ep = sinkhorn_iters if sinkhorn_iters is not None else SINKHORN_ITERS
+         dustbin_ep = sinkhorn_dustbin if sinkhorn_dustbin is not None else SINKHORN_DUSTBIN
+         B, S = pred_segments.shape[:2]
+         M = gt_pad.shape[1]
+
+         # Compute hard assignment via sinkhorn (detached; matching is not trained)
+         with torch.no_grad():
+             pred_mass_ep = torch.sigmoid(conf_logits) if conf_logits is not None else None
+             sink_loss_for_assign = batched_sinkhorn_loss(
+                 pred_segments, gt_pad, gt_mask, eps_ep, iters_ep, dustbin_ep,
+                 pred_mass=pred_mass_ep)
+             # Re-run sinkhorn to get transport matrix for assignment
+             # (reuse the cost computation from batched_sinkhorn_loss internals)
+             p0, p1 = pred_segments[:, :, 0], pred_segments[:, :, 1]
+             g0, g1 = gt_pad[:, :, 0], gt_pad[:, :, 1]
+             mid_p, half_p = 0.5 * (p0 + p1), 0.5 * (p1 - p0)
+             mid_g, half_g = 0.5 * (g0 + g1), 0.5 * (g1 - g0)
+             d_mid = torch.linalg.norm(mid_p.unsqueeze(2) - mid_g.unsqueeze(1), dim=-1)
+             len_p = torch.linalg.norm(half_p, dim=-1, keepdim=True).clamp(min=1e-6)
+             len_g = torch.linalg.norm(half_g, dim=-1, keepdim=True).clamp(min=1e-6)
+             dir_p, dir_g = half_p / len_p, half_g / len_g
+             cos_a = (dir_p.unsqueeze(2) * dir_g.unsqueeze(1)).sum(dim=-1)
+             d_dir = 1.0 - cos_a.abs()
+             d_len = (len_p.unsqueeze(2) - len_g.unsqueeze(1)).squeeze(-1).abs()
+             cost = d_mid + d_dir + d_len
+             dc = torch.as_tensor(dustbin_ep, device=cost.device, dtype=cost.dtype)
+             cost = torch.where(gt_mask.unsqueeze(1), cost, dc * 10.0)
+             cost_pad = dc.expand(B, S + 1, M + 1).clone()
+             cost_pad[:, :S, :M] = cost
+             cost_pad[:, -1, -1] = 0.0
+             gt_counts = gt_mask.sum(dim=1).float()
+             if pred_mass_ep is not None:
+                 pm = pred_mass_ep.clamp(min=0.0)
+                 a = torch.cat([pm, (gt_counts - pm.sum(1)).clamp(min=0).unsqueeze(1)], dim=1)
+                 b_val = torch.zeros(B, M + 1, device=cost.device, dtype=cost.dtype)
+                 b_val[:, :M] = gt_mask.float()
+                 b_val[:, -1] = (pm.sum(1) - gt_counts).clamp(min=0)
+             else:
+                 n = float(S)
+                 denom = n + gt_counts
+                 a = (1.0 / denom).unsqueeze(1).expand(B, S + 1).clone()
+                 a[:, -1] = gt_counts / denom
+                 b_val = (1.0 / denom).unsqueeze(1).expand(B, M + 1).clone()
+                 b_val[:, -1] = n / denom
+                 b_val[:, :M] = b_val[:, :M] * gt_mask.float()
+             log_a = torch.log(a + 1e-9)
+             log_b = torch.log(b_val + 1e-9)
+             log_k = -cost_pad / eps_ep
+             log_u = torch.zeros_like(a)
+             log_v = torch.zeros_like(b_val)
+             for _ in range(iters_ep):
+                 log_u = log_a - torch.logsumexp(log_k + log_v.unsqueeze(1), dim=2)
+                 log_v = log_b - torch.logsumexp(log_k + log_u.unsqueeze(2), dim=1)
+             transport = torch.exp(log_u.unsqueeze(2) + log_v.unsqueeze(1) + log_k)
+             assignment = transport[:, :S, :M+1].argmax(dim=2)
+             assignment[assignment >= M] = -1
+
+         # Everything below is WITH gradients (assignment is detached but pred_segments is live)
+         matched = (assignment >= 0)  # [B, S]
+         n_matched = matched.float().sum().clamp(min=1.0)
+         assign_safe = assignment.clamp(min=0)
+         gt_matched = gt_pad[
+             torch.arange(B, device=device)[:, None].expand(B, S),
+             assign_safe]  # [B, S, 2, 3]
+
+         # Symmetric endpoint distance
+         ref_ep1 = pred_segments[:, :, 0]
+         ref_ep2 = pred_segments[:, :, 1]
+         gt_ep1 = gt_matched[:, :, 0]
+         gt_ep2 = gt_matched[:, :, 1]
+         dist_fwd = (ref_ep1 - gt_ep1).norm(dim=-1) + (ref_ep2 - gt_ep2).norm(dim=-1)
+         dist_rev = (ref_ep1 - gt_ep2).norm(dim=-1) + (ref_ep2 - gt_ep1).norm(dim=-1)
+         ep_dist = torch.min(dist_fwd, dist_rev)
+
+         # Normalize by GT total length * S (same scale as sinkhorn)
+         ep_loss = (ep_dist * matched.float()).sum() / n_matched
+ ep_loss = (ep_dist * matched.float()).sum() / n_matched
308
+ total = total + endpoint_w * ep_loss
309
+ terms["endpoint"] = ep_loss.detach()
310
+
311
+ return total, terms
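
For intuition, the symmetric endpoint distance used in the endpoint term above reduces to the following standalone computation (a minimal sketch under the same assumed [B, S, 2, 3] shapes; `symmetric_endpoint_distance` is an illustrative helper, not part of the file):

    import torch

    def symmetric_endpoint_distance(pred: torch.Tensor, gt: torch.Tensor) -> torch.Tensor:
        """Order-invariant endpoint distance for already-matched segment pairs [B, S, 2, 3]."""
        fwd = (pred[:, :, 0] - gt[:, :, 0]).norm(dim=-1) + (pred[:, :, 1] - gt[:, :, 1]).norm(dim=-1)
        rev = (pred[:, :, 0] - gt[:, :, 1]).norm(dim=-1) + (pred[:, :, 1] - gt[:, :, 0]).norm(dim=-1)
        return torch.min(fwd, rev)  # [B, S]

    # Reversing a segment end-for-end leaves the distance at zero:
    seg = torch.randn(1, 4, 2, 3)
    assert torch.allclose(symmetric_endpoint_distance(seg, seg.flip(dims=[2])),
                          torch.zeros(1, 4), atol=1e-6)

Taking the min over the forward and reversed pairings is what makes the loss invariant to the arbitrary endpoint ordering of both predicted and ground-truth segments.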
s23dr_2026_example/make_sampled_cache.py ADDED
@@ -0,0 +1,260 @@
+ #!/usr/bin/env python3
+ """Convert full point cloud cache to pre-sampled 2048-point npz files.
+
+ Reads from either local .pt files or the HF dataset, priority-samples
+ 2048 points, normalizes, and saves as compact npz files (~50KB each).
+
+ Usage:
+     # From local cache:
+     python make_sampled_cache.py --in-dir /workspace/cache/v2 --out-dir /workspace/cache/sampled
+
+     # From HF dataset:
+     python make_sampled_cache.py --hf-repo usm3d/s23dr-2026-cached_full_pcd --out-dir /workspace/cache/sampled
+
+     # Specify split:
+     python make_sampled_cache.py --hf-repo usm3d/s23dr-2026-cached_full_pcd --split validation --out-dir /workspace/cache/sampled_val
+
+     # With edge classifications (from extract_edge_classes.py):
+     python make_sampled_cache.py --hf-repo usm3d/s23dr-2026-cached_full_pcd --out-dir /workspace/cache/sampled \
+         --edge-classes edge_classifications.npz
+
+ Note: uses a fixed seed so each scene gets one deterministic sample of 2048
+ points. This means no sampling augmentation across epochs -- every epoch sees
+ the same points. Fine for now; better augmentation can be added later.
+ """
+ from __future__ import annotations
+
+ import sys
+ from pathlib import Path as _Path
+ if __package__ is None or __package__ == "":
+     _here = _Path(__file__).resolve().parent
+     if str(_here.parent) not in sys.path:
+         sys.path.insert(0, str(_here.parent))
+     __package__ = _here.name
+
+ import argparse
+ import io
+ import time
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+
+
+ # Priority sampling (same logic as train.py)
+ def _priority_sample(source, group_id, seq_len, colmap_quota, depth_quota):
+     def pick(src_id, quota):
+         base = source == src_id
+         picked, remaining = [], quota
+         for tier in range(5):
+             if remaining <= 0:
+                 break
+             pool = np.where(base & (group_id == tier))[0]
+             if len(pool) == 0:
+                 continue
+             np.random.shuffle(pool)
+             take = min(remaining, len(pool))
+             picked.append(pool[:take])
+             remaining -= take
+         if remaining > 0:
+             pool = np.where(base & (group_id >= 0))[0]
+             if len(pool) > 0:
+                 np.random.shuffle(pool)
+                 picked.append(pool[:min(remaining, len(pool))])
+                 remaining -= min(remaining, len(pool))
+         return np.concatenate(picked) if picked else np.array([], dtype=np.int64), remaining
+
+     idx_c, rem_c = pick(0, colmap_quota)
+     idx_d, rem_d = pick(1, depth_quota)
+
+     if rem_c > 0:
+         extra = np.setdiff1d(np.where((source == 1) & (group_id >= 0))[0], idx_d)
+         np.random.shuffle(extra)
+         idx_d = np.concatenate([idx_d, extra[:rem_c]])
+     if rem_d > 0:
+         extra = np.setdiff1d(np.where((source == 0) & (group_id >= 0))[0], idx_c)
+         np.random.shuffle(extra)
+         idx_c = np.concatenate([idx_c, extra[:rem_d]])
+
+     indices = np.concatenate([idx_c, idx_d])
+     num_valid = len(indices)
+     if num_valid < seq_len:
+         if num_valid == 0:
+             return np.zeros(seq_len, dtype=np.int64), np.zeros(seq_len, dtype=bool)
+         indices = np.concatenate([indices, np.full(seq_len - num_valid, indices[-1])])
+     mask = np.zeros(seq_len, dtype=bool)
+     mask[:num_valid] = True
+     return indices[:seq_len], mask
+
+
+ def process_sample(xyz, source, group_id, class_id, vis_src, vis_id,
+                    center, scale, gt_v, gt_e, behind=None,
+                    n_views_voted=None, vote_frac=None,
+                    gt_edge_classes=None,
+                    seq_len=2048, colmap_q=1536, depth_q=512):
+     """Sample and normalize one scene. Returns dict of numpy arrays."""
+     indices, mask = _priority_sample(source, group_id, seq_len, colmap_q, depth_q)
+     xyz_norm = ((xyz[indices] - center) / scale).astype(np.float32)
+     gt_seg = np.stack([gt_v[gt_e[:, 0]], gt_v[gt_e[:, 1]]], axis=1)
+     gt_seg_norm = ((gt_seg - center) / scale).astype(np.float32)
+
+     result = {
+         "xyz_norm": xyz_norm,
+         "class_id": class_id[indices].astype(np.uint8),
+         "source": source[indices].astype(np.uint8),
+         "mask": mask,
+         "gt_segments": gt_seg_norm,
+         "scale": np.float32(scale),
+         "center": center.astype(np.float32),
+         "gt_vertices": gt_v.astype(np.float32),
+         "gt_edges": gt_e.astype(np.int32),
+         "visible_src": vis_src[indices].astype(np.uint8),
+         "visible_id": vis_id[indices].astype(np.int16),
+     }
+     if behind is not None:
+         result["behind"] = behind[indices].astype(np.int16)
+     if n_views_voted is not None:
+         result["n_views_voted"] = n_views_voted[indices].astype(np.uint8)
+     if vote_frac is not None:
+         result["vote_frac"] = vote_frac[indices].astype(np.float32)
+     if gt_edge_classes is not None:
+         if len(gt_edge_classes) != len(gt_e):
+             raise ValueError(
+                 f"gt_edge_classes length {len(gt_edge_classes)} != "
+                 f"gt_edges length {len(gt_e)}")
+         result["gt_edge_classes"] = gt_edge_classes.astype(np.int64)
+     return result
+
+
+ def _load_edge_classes(path):
+     """Load edge classifications lookup from npz file."""
+     if path is None:
+         return None
+     path = Path(path)
+     if not path.exists():
+         raise FileNotFoundError(f"Edge classifications file not found: {path}")
+     data = np.load(str(path), allow_pickle=False)
+     lookup = {k: data[k] for k in data.files}
+     print(f"Loaded edge classifications for {len(lookup)} orders from {path}")
+     return lookup
+
+
+ def main():
+     p = argparse.ArgumentParser()
+     g = p.add_mutually_exclusive_group(required=True)
+     g.add_argument("--in-dir", help="Local directory of .pt files")
+     g.add_argument("--hf-repo", help="HuggingFace dataset repo (e.g. usm3d/s23dr-2026-cached_full_pcd)")
+     p.add_argument("--split", default="train", help="HF dataset split")
+     p.add_argument("--out-dir", required=True)
+     p.add_argument("--edge-classes", default=None,
+                    help="Path to edge_classifications.npz from extract_edge_classes.py")
+     p.add_argument("--seq-len", type=int, default=2048)
+     p.add_argument("--colmap-quota", type=int, default=1536)
+     p.add_argument("--depth-quota", type=int, default=512)
+     p.add_argument("--seed", type=int, default=7)
+     args = p.parse_args()
+
+     out_dir = Path(args.out_dir)
+     out_dir.mkdir(parents=True, exist_ok=True)
+     np.random.seed(args.seed)
+
+     edge_cls_lookup = _load_edge_classes(args.edge_classes)
+     n_edge_matched, n_edge_missing = 0, 0
+
+     t_start = time.perf_counter()
+     done = 0
+
+     if args.in_dir:
+         # Local .pt files
+         files = sorted(Path(args.in_dir).glob("*.pt"))
+         print(f"Converting {len(files)} local .pt files...")
+         for f in files:
+             out_f = out_dir / (f.stem + ".npz")
+             if out_f.exists():
+                 done += 1
+                 continue
+             d = torch.load(f, weights_only=False)
+             behind = np.asarray(d["behind_gest_id"], np.int16) if "behind_gest_id" in d else None
+             n_vv = np.asarray(d["n_views_voted"], np.uint8) if "n_views_voted" in d else None
+             vf = np.asarray(d["vote_frac"], np.float32) if "vote_frac" in d else None
+             gt_ec = None
+             if edge_cls_lookup is not None:
+                 order_id = f.stem
+                 if order_id in edge_cls_lookup:
+                     gt_ec = edge_cls_lookup[order_id]
+                     n_edge_matched += 1
+                 else:
+                     n_edge_missing += 1
+             result = process_sample(
+                 np.asarray(d["xyz"], np.float32),
+                 np.asarray(d["source"], np.uint8),
+                 np.asarray(d["group_id"], np.int8),
+                 np.asarray(d["class_id"], np.uint8),
+                 np.asarray(d["visible_src"], np.uint8),
+                 np.asarray(d["visible_id"], np.int16),
+                 np.asarray(d["center"], np.float32),
+                 float(d["scale"]),
+                 np.asarray(d["gt_vertices"], np.float32),
+                 np.asarray(d["gt_edges"], np.int32),
+                 behind=behind, n_views_voted=n_vv, vote_frac=vf,
+                 gt_edge_classes=gt_ec,
+                 seq_len=args.seq_len, colmap_q=args.colmap_quota, depth_q=args.depth_quota,
+             )
+             np.savez(out_f, **result)
+             done += 1
+             if done % 2000 == 0:
+                 print(f"  {done}/{len(files)} [{done/(time.perf_counter()-t_start):.0f}/s]")
+     else:
+         # HF dataset
+         from datasets import load_dataset
+         print(f"Loading {args.hf_repo} split={args.split}...")
+         ds = load_dataset(args.hf_repo, split=args.split)
+         print(f"Converting {len(ds)} samples...")
+         for i, sample in enumerate(ds):
+             order_id = sample["order_id"]
+             out_f = out_dir / f"{order_id}.npz"
+             if out_f.exists():
+                 done += 1
+                 continue
+             arrays = np.load(io.BytesIO(sample["data"]))
+             behind = arrays["behind_gest_id"] if "behind_gest_id" in arrays else None
+             n_vv = arrays["n_views_voted"] if "n_views_voted" in arrays else None
+             vf = arrays["vote_frac"] if "vote_frac" in arrays else None
+             gt_ec = None
+             if edge_cls_lookup is not None:
+                 if order_id in edge_cls_lookup:
+                     gt_ec = edge_cls_lookup[order_id]
+                     n_edge_matched += 1
+                 else:
+                     n_edge_missing += 1
+             result = process_sample(
+                 arrays["xyz"], arrays["source"], arrays["group_id"],
+                 arrays["class_id"], arrays["visible_src"], arrays["visible_id"],
+                 arrays["center"], float(arrays["scale"]),
+                 arrays["gt_vertices"], arrays["gt_edges"],
+                 behind=behind, n_views_voted=n_vv, vote_frac=vf,
+                 gt_edge_classes=gt_ec,
+                 seq_len=args.seq_len, colmap_q=args.colmap_quota, depth_q=args.depth_quota,
+             )
+             np.savez(out_f, **result)
+             done += 1
+             if done % 2000 == 0:
+                 print(f"  {done}/{len(ds)} [{done/(time.perf_counter()-t_start):.0f}/s]")
+
+     elapsed = time.perf_counter() - t_start
+     print(f"Done: {done} files in {elapsed:.0f}s ({done/max(1, elapsed):.0f}/s)")
+
+     if edge_cls_lookup is not None:
+         print(f"Edge classifications: {n_edge_matched} matched, {n_edge_missing} missing")
+
+     # Report sizes
+     import os
+     npz_files = list(out_dir.glob("*.npz"))
+     if npz_files:
+         sizes = [os.path.getsize(f) for f in npz_files[:100]]
+         print(f"Avg file size: {np.mean(sizes)/1024:.0f}KB")
+         print(f"Est total: {np.mean(sizes)*len(npz_files)/1e9:.1f}GB")
+
+
+ if __name__ == "__main__":
+     main()
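
A quick smoke test for the priority sampler above (an illustrative sketch with synthetic inputs, not part of the original file): with 2000 COLMAP points and 1000 depth points spread over the five tiers, the 1536/512 quotas fill a 2048-token sequence exactly and no padding is needed.

    import numpy as np

    np.random.seed(0)
    source = np.concatenate([np.zeros(2000, np.uint8), np.ones(1000, np.uint8)])  # 0=colmap, 1=depth
    group_id = np.random.randint(0, 5, size=3000).astype(np.int8)                 # priority tiers 0-4

    indices, mask = _priority_sample(source, group_id, 2048, 1536, 512)
    assert indices.shape == (2048,) and mask.all()   # sequence filled, no padding
    assert (source[indices[:1536]] == 0).all()       # the COLMAP quota is placed first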
s23dr_2026_example/model.py ADDED
@@ -0,0 +1,696 @@
+ """
+ Perceiver-based transformer for 3D roof wireframe prediction.
+
+ Architecture overview:
+
+     Input tokens [B, T, D]
+         |
+         v
+     input_proj: Linear -> GELU -> Linear -> LayerNorm  =>  [B, T, hidden]
+         |
+         v
+     Perceiver latent bottleneck (N PerceiverLatentLayers):
+         Learnable latent embeddings [L, hidden] are broadcast to batch.
+         Each layer: cross-attn(latents <- tokens) -> self-attn(latents) -> FFN
+         Output: latents [B, L, hidden]
+         |
+         v
+     Segment decoder (M SegmentDecoderLayers):
+         Learnable query embeddings [S, hidden] are broadcast to batch.
+         Each layer: cross-attn(queries <- latents) -> self-attn(queries) -> FFN
+         Output: queries [B, S, hidden]
+         |
+         v
+     segment_head: Linear -> 6D -> (midpoint, half_vector)
+         + query_offsets (learnable per-query bias)
+         endpoints = midpoint +/- half_vector  ->  [B, S, 2, 3]
+ """
+
+ import torch
+ import torch.nn as nn
+
+ from .attention import MultiHeadSDPA, FeedForward
+
+
+ # ---------------------------------------------------------------------------
+ # Building blocks
+ # ---------------------------------------------------------------------------
+
+ class AttnResidual(nn.Module):
+     """Pre-norm attention + residual + dropout."""
+
+     def __init__(
+         self,
+         d_model: int,
+         num_heads: int,
+         dropout: float = 0.0,
+         kv_heads: int | None = None,
+         norm_class=None,
+         qk_norm: bool = False,
+         qk_norm_type: str = "l2",
+     ):
+         super().__init__()
+         norm_class = norm_class or nn.LayerNorm
+         self.norm = norm_class(d_model)
+         self.attn = MultiHeadSDPA(d_model, num_heads, kv_heads=kv_heads, qk_norm=qk_norm, qk_norm_type=qk_norm_type)
+         self.drop = nn.Dropout(dropout)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         memory: torch.Tensor,
+         memory_key_padding_mask: torch.Tensor | None = None,
+     ) -> torch.Tensor:
+         res = x
+         x = self.norm(x)
+         x = self.attn(x, memory, key_padding_mask=memory_key_padding_mask)
+         return res + self.drop(x)
+
+
+ class FFNResidual(nn.Module):
+     """Pre-norm feed-forward + residual + dropout."""
+
+     def __init__(
+         self,
+         d_model: int,
+         dim_ff: int,
+         dropout: float = 0.0,
+         activation: str = "gelu",
+         norm_class=None,
+     ):
+         super().__init__()
+         norm_class = norm_class or nn.LayerNorm
+         self.norm = norm_class(d_model)
+         self.ffn = FeedForward(d_model, dim_ff, activation=activation)
+         self.drop = nn.Dropout(dropout)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         res = x
+         x = self.norm(x)
+         x = self.ffn(x)
+         return res + self.drop(x)
+
+
+ # ---------------------------------------------------------------------------
+ # Perceiver encoder layer
+ # ---------------------------------------------------------------------------
+
+ class PerceiverLatentLayer(nn.Module):
+     """Single Perceiver latent layer.
+
+     If use_cross=True:  cross-attn(latents <- points) -> self-attn -> FFN
+     If use_cross=False: self-attn -> FFN (saves compute in deep stacks)
+     """
+
+     def __init__(
+         self,
+         d_model: int,
+         num_heads: int,
+         dim_ff: int,
+         dropout: float = 0.0,
+         activation: str = "gelu",
+         kv_heads_cross: int | None = None,
+         kv_heads_self: int | None = None,
+         use_cross: bool = True,
+         norm_class=None,
+         qk_norm: bool = False,
+         qk_norm_type: str = "l2",
+     ):
+         super().__init__()
+         self.use_cross = use_cross
+         if use_cross:
+             self.cross = AttnResidual(d_model, num_heads, dropout, kv_heads=kv_heads_cross, norm_class=norm_class, qk_norm=qk_norm, qk_norm_type=qk_norm_type)
+         self.self_attn = AttnResidual(d_model, num_heads, dropout, kv_heads=kv_heads_self, norm_class=norm_class, qk_norm=qk_norm, qk_norm_type=qk_norm_type)
+         self.ffn = FFNResidual(d_model, dim_ff, dropout, activation=activation, norm_class=norm_class)
+
+     def forward(
+         self,
+         latents: torch.Tensor,
+         points: torch.Tensor,
+         points_key_padding_mask: torch.Tensor | None = None,
+     ) -> torch.Tensor:
+         if self.use_cross:
+             latents = self.cross(latents, points, memory_key_padding_mask=points_key_padding_mask)
+         latents = self.self_attn(latents, latents)
+         latents = self.ffn(latents)
+         return latents
+
+
+ # ---------------------------------------------------------------------------
+ # Segment decoder layer
+ # ---------------------------------------------------------------------------
+
+ class SegmentDecoderLayer(nn.Module):
+     """Single segment decoder layer.
+
+     cross-attn(queries <- latents) -> [cross-attn(queries <- inputs)] -> self-attn(queries) -> FFN
+
+     If input_xattn=True, adds a second cross-attention that attends directly
+     to the projected input tokens (bypassing the latent bottleneck). This gives
+     queries access to fine-grained point-level detail for vertex precision.
+     """
+
+     def __init__(
+         self,
+         d_model: int,
+         num_heads: int,
+         dim_ff: int,
+         dropout: float = 0.0,
+         activation: str = "gelu",
+         kv_heads_cross: int | None = None,
+         kv_heads_self: int | None = None,
+         norm_class=None,
+         input_xattn: bool = False,
+         qk_norm: bool = False,
+         qk_norm_type: str = "l2",
+     ):
+         super().__init__()
+         self.cross = AttnResidual(d_model, num_heads, dropout, kv_heads=kv_heads_cross, norm_class=norm_class, qk_norm=qk_norm, qk_norm_type=qk_norm_type)
+         self.input_xattn = input_xattn
+         if input_xattn:
+             self.cross_input = AttnResidual(d_model, num_heads, dropout, kv_heads=kv_heads_cross, norm_class=norm_class, qk_norm=qk_norm, qk_norm_type=qk_norm_type)
+         self.self_attn = AttnResidual(d_model, num_heads, dropout, kv_heads=kv_heads_self, norm_class=norm_class, qk_norm=qk_norm, qk_norm_type=qk_norm_type)
+         self.ffn = FFNResidual(d_model, dim_ff, dropout, activation=activation, norm_class=norm_class)
+
+     def forward(
+         self,
+         queries: torch.Tensor,
+         latents: torch.Tensor,
+         src: torch.Tensor | None = None,
+         src_key_padding_mask: torch.Tensor | None = None,
+     ) -> torch.Tensor:
+         queries = self.cross(queries, latents)
+         if self.input_xattn and src is not None:
+             queries = self.cross_input(queries, src, memory_key_padding_mask=src_key_padding_mask)
+         queries = self.self_attn(queries, queries)
+         queries = self.ffn(queries)
+         return queries
+
+
+ # ---------------------------------------------------------------------------
+ # Full model
+ # ---------------------------------------------------------------------------
+
+ class TokenTransformerSegments(nn.Module):
+     """Perceiver transformer that predicts 3D roof wireframe segments.
+
+     Takes point-cloud tokens and outputs segment endpoints as [B, S, 2, 3]
+     where S is the number of segments and each segment has two 3D endpoints.
+
+     Args:
+         segments: Number of predicted segments (S).
+         in_dim: Dimensionality of input tokens.
+         hidden: Internal hidden dimension throughout the model.
+         num_heads: Number of attention heads.
+         kv_heads_cross: Grouped-query heads for cross-attention (None = standard MHA).
+         kv_heads_self: Grouped-query heads for self-attention (None = standard MHA).
+         dim_feedforward: FFN intermediate dimension.
+         dropout: Dropout rate applied after attention and FFN.
+         latent_tokens: Number of learnable latent embeddings (L) in the bottleneck.
+         latent_layers: Number of PerceiverLatentLayers (N).
+         decoder_layers: Number of SegmentDecoderLayers (M).
+     """
+
+     def __init__(
+         self,
+         segments: int = 32,
+         in_dim: int = 128,
+         hidden: int = 128,
+         num_heads: int = 4,
+         kv_heads_cross: int | None = 2,
+         kv_heads_self: int | None = 0,
+         dim_feedforward: int = 256,
+         dropout: float = 0.01,
+         latent_tokens: int = 64,
+         latent_layers: int = 2,
+         decoder_layers: int = 2,
+         cross_attn_interval: int = 1,
+         norm_class=None,
+         activation: str = "gelu",
+         segment_conf: bool = False,
+         pre_encoder_layers: int = 0,
+         segment_param: str = "midpoint_halfvec",
+         length_floor: float = 0.0,
+         decoder_input_xattn: bool = False,
+         qk_norm: bool = False,
+         qk_norm_type: str = "l2",
+     ):
+         super().__init__()
+         self.segments = segments
+         self.out_vertices = segments * 2
+         self.segment_param = segment_param
+         self.length_floor = length_floor
+         self.decoder_input_xattn = decoder_input_xattn
+         norm_class = norm_class or nn.LayerNorm
+
+         # Treat 0 as "use standard MHA"
+         if kv_heads_cross is not None and kv_heads_cross <= 0:
+             kv_heads_cross = None
+         if kv_heads_self is not None and kv_heads_self <= 0:
+             kv_heads_self = None
+
+         # -- Input projection --
+         self.input_proj = nn.Sequential(
+             nn.Linear(in_dim, dim_feedforward),
+             nn.GELU(),
+             nn.Linear(dim_feedforward, hidden),
+             norm_class(hidden),
+         )
+
+         # -- Optional pre-encoder: self-attention on full token sequence --
+         if pre_encoder_layers > 0:
+             self.pre_encoder = nn.ModuleList([
+                 SelfAttentionEncoderLayer(
+                     d_model=hidden,
+                     num_heads=num_heads,
+                     dim_ff=dim_feedforward,
+                     dropout=dropout,
+                     activation=activation,
+                     kv_heads=kv_heads_self,
+                     norm_class=norm_class,
+                     qk_norm=qk_norm, qk_norm_type=qk_norm_type,
+                 )
+                 for _ in range(pre_encoder_layers)
+             ])
+         else:
+             self.pre_encoder = None
+
+         # -- Perceiver latent bottleneck --
+         self.latent_embed = nn.Embedding(latent_tokens, hidden)
+         N = latent_layers
+         self.latent_layers = nn.ModuleList([
+             PerceiverLatentLayer(
+                 d_model=hidden,
+                 num_heads=num_heads,
+                 dim_ff=dim_feedforward,
+                 dropout=dropout,
+                 activation=activation,
+                 kv_heads_cross=kv_heads_cross,
+                 kv_heads_self=kv_heads_self,
+                 use_cross=(i == 0) or (i == N - 1) or (i % cross_attn_interval == 0),
+                 norm_class=norm_class,
+                 qk_norm=qk_norm, qk_norm_type=qk_norm_type,
+             )
+             for i in range(N)
+         ])
+
+         # -- Segment decoder --
+         self.query_embed = nn.Embedding(segments, hidden)
+         self.decoder_layers = nn.ModuleList([
+             SegmentDecoderLayer(
+                 d_model=hidden,
+                 num_heads=num_heads,
+                 dim_ff=dim_feedforward,
+                 dropout=dropout,
+                 activation=activation,
+                 kv_heads_cross=kv_heads_cross,
+                 kv_heads_self=kv_heads_self,
+                 norm_class=norm_class,
+                 input_xattn=decoder_input_xattn,
+                 qk_norm=qk_norm, qk_norm_type=qk_norm_type,
+             )
+             for _ in range(decoder_layers)
+         ])
+
+         # -- Output head --
+         if segment_param == "midpoint_dir_len":
+             self.segment_head = nn.Linear(hidden, 7)  # mid(3) + dir(3) + len(1)
+         else:
+             self.segment_head = nn.Linear(hidden, 6)  # mid(3) + half(3)
+         self.query_offsets = nn.Parameter(torch.zeros(segments, 2, 3))
+
+         nn.init.trunc_normal_(self.segment_head.weight, mean=0.0, std=1e-3)
+         if self.segment_head.bias is not None:
+             nn.init.zeros_(self.segment_head.bias)
+         if segment_param == "midpoint_dir_len":
+             # softplus(0.5) * 0.1 ≈ 0.097 default length in normalized space
+             self.segment_head.bias.data[6] = 0.5
+         nn.init.normal_(self.query_offsets, mean=0.0, std=0.05)
+
+         # -- Optional confidence head --
+         self.segment_conf = segment_conf
+         if segment_conf:
+             self.conf_head = nn.Linear(hidden, 1)
+             nn.init.zeros_(self.conf_head.bias)
+
+     def forward(
+         self,
+         tokens: torch.Tensor,
+         mask: torch.Tensor | None = None,
+     ) -> dict[str, torch.Tensor | list]:
+         """
+         Args:
+             tokens: Input point-cloud tokens [B, T, in_dim].
+             mask: Boolean validity mask [B, T]. True = valid token.
+
+         Returns:
+             Dict with keys:
+                 "vertices": [B, S*2, 3] flattened endpoints.
+                 "segments": [B, S, 2, 3] segment endpoints.
+                 "edges": Per-batch list of (start, end) index pairs into vertices.
+                 "conf": [B, S] logits (only if segment_conf=True).
+                 "src", "pad_mask", "queries": intermediate activations for
+                     auxiliary heads and losses.
+         """
+         B = tokens.shape[0]
+
+         # Project input tokens
+         src = self.input_proj(tokens)  # [B, T, hidden]
+
+         # Padding mask (True where padded) for cross-attention
+         pad_mask = ~mask.bool() if mask is not None else None
+
+         # Optional pre-encoder: self-attention on full token sequence
+         if self.pre_encoder is not None:
+             for layer in self.pre_encoder:
+                 src = layer(src, key_padding_mask=pad_mask)
+
+         # Perceiver latent bottleneck
+         latents = self.latent_embed.weight.unsqueeze(0).expand(B, -1, -1)
+         for layer in self.latent_layers:
+             latents = layer(latents, src, points_key_padding_mask=pad_mask)
+
+         # Segment decoder
+         queries = self.query_embed.weight.unsqueeze(0).expand(B, -1, -1)
+         for layer in self.decoder_layers:
+             queries = layer(queries, latents,
+                             src=src if self.decoder_input_xattn else None,
+                             src_key_padding_mask=pad_mask if self.decoder_input_xattn else None)
+
+         # Predict segments -> endpoints
+         if self.segment_param == "midpoint_dir_len":
+             raw = self.segment_head(queries)  # [B, S, 7]
+             mid = raw[:, :, :3] + self.query_offsets[:, 0, :].unsqueeze(0)
+             direction = torch.nn.functional.normalize(raw[:, :, 3:6], dim=-1)
+             length = torch.nn.functional.softplus(raw[:, :, 6:7]) * 0.1
+             half = direction * length * 0.5
+         else:
+             raw = self.segment_head(queries).view(B, self.segments, 2, 3)
+             raw = raw + self.query_offsets.unsqueeze(0)
+             mid, half = raw[:, :, 0], raw[:, :, 1]
+         seg_params = torch.stack([mid - half, mid + half], dim=2)
+
+         vertices = seg_params.reshape(B, self.out_vertices, 3)
+         edges = [[(2 * i, 2 * i + 1) for i in range(self.segments)] for _ in range(B)]
+
+         out = {"vertices": vertices, "segments": seg_params, "edges": edges,
+                "src": src, "pad_mask": pad_mask, "queries": queries}
+         if self.segment_conf:
+             out["conf"] = self.conf_head(queries).squeeze(-1)  # [B, S]
+         return out
+
+
+ # ---------------------------------------------------------------------------
+ # Encoder-only layer (self-attention on full token sequence)
+ # ---------------------------------------------------------------------------
+
+ class SelfAttentionEncoderLayer(nn.Module):
+     """Single self-attention layer: self-attn(tokens) -> FFN."""
+
+     def __init__(
+         self,
+         d_model: int,
+         num_heads: int,
+         dim_ff: int,
+         dropout: float = 0.0,
+         activation: str = "gelu",
+         kv_heads: int | None = None,
+         norm_class=None,
+         qk_norm: bool = False,
+         qk_norm_type: str = "l2",
+     ):
+         super().__init__()
+         self.self_attn = AttnResidual(d_model, num_heads, dropout, kv_heads=kv_heads, norm_class=norm_class, qk_norm=qk_norm, qk_norm_type=qk_norm_type)
+         self.ffn = FFNResidual(d_model, dim_ff, dropout, activation=activation, norm_class=norm_class)
+
+     def forward(self, x: torch.Tensor, key_padding_mask: torch.Tensor | None = None) -> torch.Tensor:
+         x = self.self_attn(x, x, memory_key_padding_mask=key_padding_mask)
+         x = self.ffn(x)
+         return x
+
+
+ # ---------------------------------------------------------------------------
+ # Vanilla transformer: self-attention encoder + segment query decoder
+ # ---------------------------------------------------------------------------
+
+ class TransformerSegments(nn.Module):
+     """Standard transformer encoder + cross-attention segment decoder.
+
+     Architecture:
+         Input tokens [B, T, D]
+             |
+             v
+         input_proj: Linear -> GELU -> Linear -> Norm  =>  [B, T, hidden]
+             |
+             v
+         N SelfAttentionEncoderLayers (self-attn over all T tokens)
+             |
+             v
+         Segment decoder (same as Perceiver version):
+             M SegmentDecoderLayers (queries cross-attend to encoded tokens)
+             |
+             v
+         segment_head -> endpoints [B, S, 2, 3] (midpoint_halfvec or midpoint_dir_len)
+     """
+
+     def __init__(
+         self,
+         segments: int = 32,
+         in_dim: int = 128,
+         hidden: int = 128,
+         num_heads: int = 4,
+         kv_heads_cross: int | None = 2,
+         kv_heads_self: int | None = 0,
+         dim_feedforward: int = 256,
+         dropout: float = 0.01,
+         encoder_layers: int = 4,
+         decoder_layers: int = 2,
+         norm_class=None,
+         activation: str = "gelu",
+         segment_conf: bool = False,
+         segment_param: str = "midpoint_halfvec",
+         length_floor: float = 0.0,
+         decoder_input_xattn: bool = False,
+         qk_norm: bool = False,
+         qk_norm_type: str = "l2",
+     ):
+         super().__init__()
+         self.segments = segments
+         self.out_vertices = segments * 2
+         self.segment_param = segment_param
+         self.length_floor = length_floor
+         norm_class = norm_class or nn.LayerNorm
+
+         if kv_heads_cross is not None and kv_heads_cross <= 0:
+             kv_heads_cross = None
+         if kv_heads_self is not None and kv_heads_self <= 0:
+             kv_heads_self = None
+
+         # -- Input projection --
+         self.input_proj = nn.Sequential(
+             nn.Linear(in_dim, dim_feedforward),
+             nn.GELU(),
+             nn.Linear(dim_feedforward, hidden),
+             norm_class(hidden),
+         )
+
+         # -- Self-attention encoder --
+         self.encoder_layers = nn.ModuleList([
+             SelfAttentionEncoderLayer(
+                 d_model=hidden,
+                 num_heads=num_heads,
+                 dim_ff=dim_feedforward,
+                 dropout=dropout,
+                 activation=activation,
+                 kv_heads=kv_heads_self,
+                 norm_class=norm_class,
+                 qk_norm=qk_norm, qk_norm_type=qk_norm_type,
+             )
+             for _ in range(encoder_layers)
+         ])
+
+         # -- Segment decoder (same structure as Perceiver version) --
+         # Note: for the transformer arch, decoder_input_xattn is ignored because
+         # the decoder already cross-attends to the full encoded token sequence.
+         self.query_embed = nn.Embedding(segments, hidden)
+         self.decoder_layers = nn.ModuleList([
+             SegmentDecoderLayer(
+                 d_model=hidden,
+                 num_heads=num_heads,
+                 dim_ff=dim_feedforward,
+                 dropout=dropout,
+                 activation=activation,
+                 kv_heads_cross=kv_heads_cross,
+                 kv_heads_self=kv_heads_self,
+                 norm_class=norm_class,
+                 qk_norm=qk_norm, qk_norm_type=qk_norm_type,
+             )
+             for _ in range(decoder_layers)
+         ])
+
+         # -- Output head (shared logic with Perceiver version) --
+         if segment_param == "midpoint_dir_len":
+             self.segment_head = nn.Linear(hidden, 7)  # mid(3) + dir(3) + len(1)
+         else:
+             self.segment_head = nn.Linear(hidden, 6)  # mid(3) + half(3)
+         self.query_offsets = nn.Parameter(torch.zeros(segments, 2, 3))
+
+         nn.init.trunc_normal_(self.segment_head.weight, mean=0.0, std=1e-3)
+         if self.segment_head.bias is not None:
+             nn.init.zeros_(self.segment_head.bias)
+         if segment_param == "midpoint_dir_len":
+             # softplus(-2.2) * 0.1 ≈ 0.01 default length in normalized space
+             self.segment_head.bias.data[6] = -2.2
+         nn.init.normal_(self.query_offsets, mean=0.0, std=0.05)
+
+         self.segment_conf = segment_conf
+         if segment_conf:
+             self.conf_head = nn.Linear(hidden, 1)
+             nn.init.zeros_(self.conf_head.bias)
+
+     def forward(
+         self,
+         tokens: torch.Tensor,
+         mask: torch.Tensor | None = None,
+     ) -> dict[str, torch.Tensor | list]:
+         B = tokens.shape[0]
+
+         src = self.input_proj(tokens)
+         pad_mask = ~mask.bool() if mask is not None else None
+
+         # Encode: self-attention over all tokens
+         for layer in self.encoder_layers:
+             src = layer(src, key_padding_mask=pad_mask)
+
+         # Decode: segment queries cross-attend to encoded tokens
+         queries = self.query_embed.weight.unsqueeze(0).expand(B, -1, -1)
+         for layer in self.decoder_layers:
+             queries = layer(queries, src)
+
+         # Predict segments -> endpoints
+         if self.segment_param == "midpoint_dir_len":
+             raw = self.segment_head(queries)  # [B, S, 7]
+             mid = raw[:, :, :3] + self.query_offsets[:, 0, :].unsqueeze(0)
+             direction = torch.nn.functional.normalize(raw[:, :, 3:6], dim=-1)
+             length = torch.nn.functional.softplus(raw[:, :, 6:7]) * 0.1
+             half = direction * length * 0.5
+         else:
+             raw = self.segment_head(queries).view(B, self.segments, 2, 3)
+             raw = raw + self.query_offsets.unsqueeze(0)
+             mid, half = raw[:, :, 0], raw[:, :, 1]
+         seg_params = torch.stack([mid - half, mid + half], dim=2)
+
+         vertices = seg_params.reshape(B, self.out_vertices, 3)
+         edges = [[(2 * i, 2 * i + 1) for i in range(self.segments)] for _ in range(B)]
+
+         out = {"vertices": vertices, "segments": seg_params, "edges": edges}
+         if self.segment_conf:
+             out["conf"] = self.conf_head(queries).squeeze(-1)
+         return out
+
+
+ # ---------------------------------------------------------------------------
+ # End-to-end model: tokenizer embeddings + transformer/perceiver
+ # ---------------------------------------------------------------------------
+
+ class EdgeDepthSegmentsModel(nn.Module):
+     """Tokenizer embeddings + transformer for 3D roof wireframes.
+
+     Supports two architectures via the `arch` parameter:
+       - "perceiver": Perceiver latent bottleneck (default, O(L*T) attention)
+       - "transformer": Standard self-attention encoder (O(T^2) attention)
+
+     Both share the same decoder, output head, and tokenizer.
+     """
+
+     def __init__(
+         self,
+         seq_cfg,
+         segments: int = 32,
+         hidden: int = 128,
+         num_heads: int = 4,
+         kv_heads_cross: int | None = 2,
+         kv_heads_self: int | None = 0,
+         dim_feedforward: int = 256,
+         dropout: float = 0.1,
+         latent_tokens: int = 64,
+         latent_layers: int = 1,
+         decoder_layers: int = 2,
+         label_emb_dim: int = 16,
+         src_emb_dim: int = 2,
+         behind_emb_dim: int = 8,
+         fourier_seed: int = 0,
+         cross_attn_interval: int = 1,
+         norm_class=None,
+         activation: str = "gelu",
+         segment_conf: bool = False,
+         use_vote_features: bool = False,
+         arch: str = "perceiver",
+         encoder_layers: int = 4,
+         pre_encoder_layers: int = 0,
+         segment_param: str = "midpoint_halfvec",
+         length_floor: float = 0.0,
+         decoder_input_xattn: bool = False,
+         qk_norm: bool = False,
+         qk_norm_type: str = "l2",
+         learnable_fourier: bool = False,
+     ):
+         super().__init__()
+         self.seq_cfg = seq_cfg
+
+         from .tokenizer import EdgeDepthSequenceBuilder
+         self.tokenizer = EdgeDepthSequenceBuilder(
+             seq_cfg,
+             label_emb_dim=label_emb_dim,
+             src_emb_dim=src_emb_dim,
+             behind_emb_dim=behind_emb_dim,
+             fourier_seed=fourier_seed,
+             use_vote_features=use_vote_features,
+             learnable_fourier=learnable_fourier,
+         )
+
+         if arch == "transformer":
+             self.segmenter = TransformerSegments(
+                 segments=segments,
+                 in_dim=self.tokenizer.out_dim,
+                 hidden=hidden,
+                 num_heads=num_heads,
+                 kv_heads_cross=kv_heads_cross,
+                 kv_heads_self=kv_heads_self,
+                 dim_feedforward=dim_feedforward,
+                 dropout=dropout,
+                 encoder_layers=encoder_layers,
+                 decoder_layers=decoder_layers,
+                 norm_class=norm_class,
+                 activation=activation,
+                 segment_conf=segment_conf,
+                 segment_param=segment_param,
+                 length_floor=length_floor,
+                 decoder_input_xattn=decoder_input_xattn,
+                 qk_norm=qk_norm, qk_norm_type=qk_norm_type,
+             )
+         else:
+             self.segmenter = TokenTransformerSegments(
+                 segments=segments,
+                 in_dim=self.tokenizer.out_dim,
+                 hidden=hidden,
+                 num_heads=num_heads,
+                 kv_heads_cross=kv_heads_cross,
+                 kv_heads_self=kv_heads_self,
+                 dim_feedforward=dim_feedforward,
+                 dropout=dropout,
+                 latent_tokens=latent_tokens,
+                 latent_layers=latent_layers,
+                 decoder_layers=decoder_layers,
+                 cross_attn_interval=cross_attn_interval,
+                 norm_class=norm_class,
+                 activation=activation,
+                 segment_conf=segment_conf,
+                 pre_encoder_layers=pre_encoder_layers,
+                 segment_param=segment_param,
+                 length_floor=length_floor,
+                 decoder_input_xattn=decoder_input_xattn,
+                 qk_norm=qk_norm, qk_norm_type=qk_norm_type,
+             )
+
+     def forward_tokens(self, tokens: torch.Tensor, mask: torch.Tensor):
+         """Run the segmenter on pre-built token tensors."""
+         return self.segmenter(tokens, mask)
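
As a quick shape sanity check (an illustrative sketch, not a training script -- random tokens stand in for the tokenizer output the model assumes):

    import torch

    model = TokenTransformerSegments(segments=32, in_dim=128, hidden=128, num_heads=4,
                                     latent_tokens=64, latent_layers=2,
                                     decoder_layers=2, segment_conf=True)
    tokens = torch.randn(2, 2048, 128)
    mask = torch.ones(2, 2048, dtype=torch.bool)

    out = model(tokens, mask)
    assert out["segments"].shape == (2, 32, 2, 3)   # S segments, two 3D endpoints each
    assert out["vertices"].shape == (2, 64, 3)      # S*2 flattened endpoints
    assert out["conf"].shape == (2, 32)             # per-segment confidence logits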
s23dr_2026_example/point_fusion.py ADDED
@@ -0,0 +1,554 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ point_fusion.py
3
+
4
+ Simplified semantic point fusion for the 2026 dataset format.
5
+
6
+ Takes per-view (ADE segmap, Gestalt segmap, depth) + sparse COLMAP point cloud
7
+ from the usm3d/hoho22k_2026_trainval dataset and builds a compact, house-centric
8
+ semantic point representation suitable for downstream wireframe prediction.
9
+
10
+ Key differences from the 2025 pipeline:
11
+ - COLMAP is a ZIP of text files (cameras.txt, images.txt, points3D.txt)
12
+ - Depth is millimeter I;16 PNG (depth_scale=0.001 converts to meters)
13
+ - Views flagged with pose_only_in_colmap=True have zeroed K/R/t and must be
14
+ skipped for depth unprojection and projection
15
+ - Images arrive as PIL Images, not byte arrays
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import zipfile
21
+ from dataclasses import dataclass
22
+ from io import BytesIO
23
+ from typing import Dict, List, Optional, Tuple
24
+
25
+ import cv2
26
+ import numpy as np
27
+ from scipy.stats import mode as scipy_mode
28
+
29
+ from .color_mappings import ade20k_color_mapping, gestalt_color_mapping
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Color packing helpers
33
+ # ---------------------------------------------------------------------------
34
+
35
+ def _pack_rgb_u32(rgb: np.ndarray) -> np.ndarray:
36
+ """Pack uint8 RGB (..., 3) into uint32 codes."""
37
+ rgb = rgb.astype(np.uint32, copy=False)
38
+ return (rgb[..., 0] << 16) | (rgb[..., 1] << 8) | rgb[..., 2]
39
+
40
+
41
+ def _build_rgbcode_maps(color_mapping):
42
+ """Return (rgbcode_to_id, id_to_name) for a color mapping dict."""
43
+ names = list(color_mapping.keys())
44
+ rgbs = np.array([color_mapping[n] for n in names], dtype=np.uint8)
45
+ codes = _pack_rgb_u32(rgbs.reshape(-1, 1, 3)).reshape(-1)
46
+ rgbcode_to_id = {int(c): i for i, c in enumerate(codes)}
47
+ return rgbcode_to_id, names
48
+
49
+
50
+ def _name_to_packed_rgb(name, mapping):
51
+ """Case-insensitive lookup returning a packed RGB code, or None."""
52
+ for key in mapping:
53
+ if key.lower() == name.lower():
54
+ rgb = np.array(mapping[key], np.uint8).reshape(1, 1, 3)
55
+ return int(_pack_rgb_u32(rgb).reshape(()))
56
+ return None
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Label mapping constants
60
+ # ---------------------------------------------------------------------------
61
+
62
+ ADE_RGBCODE_TO_ID, ADE_ID_TO_NAME = _build_rgbcode_maps(ade20k_color_mapping)
63
+ GEST_RGBCODE_TO_ID, GEST_ID_TO_NAME = _build_rgbcode_maps(gestalt_color_mapping)
64
+ NUM_ADE = len(ADE_ID_TO_NAME)
65
+ NUM_GEST = len(GEST_ID_TO_NAME)
66
+
67
+ GEST_INVALID_NAMES = ("unclassified", "unknown", "transition_line")
68
+ GEST_INVALID_CODES = set(
69
+ int(_pack_rgb_u32(np.array(gestalt_color_mapping[n], np.uint8).reshape(1, 1, 3)).reshape(()))
70
+ for n in GEST_INVALID_NAMES if n in gestalt_color_mapping
71
+ )
72
+
73
+ # ADE classes whose surfaces are "see-through" for label fusion: when a point
74
+ # projects onto one of these, we use the Gestalt label behind it instead.
75
+ ADE_TRANSPARENT_NAMES = (
76
+ "wall", "building;edifice", "floor;flooring", "ceiling",
77
+ "windowpane;window", "door;double;door", "house", "skyscraper",
78
+ "screen;door;screen", "blind;screen", "hovel;hut;hutch;shack;shanty",
79
+ "tower", "booth;cubicle;stall;kiosk",
80
+ )
81
+
82
+ # ADE classes kept as "occluders/add-ons" when overlapping the house silhouette.
83
+ ADE_OCCLUDER_ALLOWLIST_NAMES = (
84
+ "tree", "person;individual;someone;somebody;mortal;soul",
85
+ "car;auto;automobile;machine;motorcar", "truck;motortruck", "van",
86
+ "fence;fencing", "railing;rail",
87
+ "bannister;banister;balustrade;balusters;handrail",
88
+ "stairs;steps", "stairway;staircase", "step;stair", "pole",
89
+ "streetlight;street;lamp", "signboard;sign", "awning;sunshade;sunblind",
90
+ "plant;flora;plant;life", "pot;flowerpot",
91
+ )
92
+
93
+ # Precomputed arrays for the default name lists (avoids re-lookup every call).
94
+ _DEFAULT_ADE_TRANSPARENT_CODES = np.array(
95
+ [c for n in ADE_TRANSPARENT_NAMES
96
+ if (c := _name_to_packed_rgb(n, ade20k_color_mapping)) is not None],
97
+ dtype=np.uint32,
98
+ )
99
+ _DEFAULT_ADE_OCCLUDER_IDS = np.array(
100
+ sorted({ADE_RGBCODE_TO_ID[c]
101
+ for n in ADE_OCCLUDER_ALLOWLIST_NAMES
102
+ if (c := _name_to_packed_rgb(n, ade20k_color_mapping)) is not None
103
+ and c in ADE_RGBCODE_TO_ID}),
104
+ dtype=np.int32,
105
+ )
106
+
107
+ # ---------------------------------------------------------------------------
108
+ # Config
109
+ # ---------------------------------------------------------------------------
110
+
111
+ @dataclass(frozen=True)
112
+ class FuserConfig:
113
+ """Simplified fusion configuration (no depth calibration fields)."""
114
+ depth_points_per_view: int = 20_000 # depth samples per view
115
+ depth_scale: float = 0.001 # mm -> meters
116
+ depth_clip_percentile: float = 99.5 # drop extreme outliers
117
+ house_mask_dilate_px: int = 5 # dilate gestalt mask
118
+ min_support_views: int = 1 # min views for a kept point
119
+ ade_transparent_classes: Tuple[str, ...] = ADE_TRANSPARENT_NAMES
120
+ ade_occluder_allowlist: Tuple[str, ...] = ADE_OCCLUDER_ALLOWLIST_NAMES
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # Geometry: projection + depth unprojection
124
+ # ---------------------------------------------------------------------------
125
+
126
+ def project_world_points(points_world, K, R, t):
127
+ """Project (N,3) world points to pixel (u,v) with validity mask."""
128
+ pts = points_world.astype(np.float32, copy=False)
129
+ cam = (R @ pts.T + t).T # (N, 3)
130
+ z = cam[:, 2]
131
+ valid = z > 1e-6
132
+ inv_z = np.zeros_like(z)
133
+ inv_z[valid] = 1.0 / z[valid]
134
+ x = cam[:, 0] * inv_z
135
+ y = cam[:, 1] * inv_z
136
+ u = K[0, 0] * x + K[0, 2]
137
+ v = K[1, 1] * y + K[1, 2]
138
+ return u, v, valid
139
+
140
+
141
+ def unproject_depth_to_world(depth, K, R, t, num_points, sample_mask=None, rng=None):
142
+ """Convert a depth map + camera params to (M, 3) world points, M <= num_points."""
143
+ if rng is None:
144
+ rng = np.random.default_rng()
145
+ d = np.asarray(depth, dtype=np.float32)
146
+ if d.ndim != 2:
147
+ return np.zeros((0, 3), dtype=np.float32)
148
+
149
+ valid = np.isfinite(d) & (d > 1e-6)
150
+ if sample_mask is not None:
151
+ mask = np.asarray(sample_mask, dtype=bool)
152
+ if mask.shape != d.shape:
153
+ return np.zeros((0, 3), dtype=np.float32)
154
+ valid &= mask
155
+
156
+ ys, xs = np.where(valid)
157
+ if ys.size == 0:
158
+ return np.zeros((0, 3), dtype=np.float32)
159
+
160
+ idx = rng.choice(ys.size, size=min(num_points, ys.size), replace=False)
161
+ y = ys[idx].astype(np.float32)
162
+ x = xs[idx].astype(np.float32)
163
+ z = d[ys[idx], xs[idx]].astype(np.float32)
164
+
165
+ fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
166
+ cam_pts = np.stack([(x - cx) * z / fx, (y - cy) * z / fy, z], axis=0)
167
+ # cam = R * world + t => world = R^T * (cam - t)
168
+ world = (R.T @ (cam_pts - t)).T
169
+ return world.astype(np.float32, copy=False)
170
+
171
+
172
+ def clean_depth(depth, clip_percentile):
173
+ """Clip extreme depth values."""
174
+ d = np.asarray(depth, dtype=np.float32)
175
+ d = np.where(np.isfinite(d), d, 0.0)
176
+ d[d <= 0] = 0.0
177
+ if clip_percentile is not None and clip_percentile > 0 and np.any(d > 0):
178
+ hi = float(np.percentile(d[d > 0], clip_percentile))
179
+ d = np.clip(d, 0.0, hi)
180
+ return d
181
+
182
+
183
+ def dilate_mask(mask, radius_px):
184
+ """Binary dilation via cv2. mask: (H, W) bool."""
185
+ if radius_px <= 0:
186
+ return mask
187
+ k = 2 * radius_px + 1
188
+ kernel = np.ones((k, k), np.uint8)
189
+ return cv2.dilate(mask.astype(np.uint8), kernel) > 0
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # COLMAP extraction (2026 format)
193
+ # ---------------------------------------------------------------------------
194
+
195
+ def extract_colmap_points_2026(sample):
196
+ """Extract (N, 3) float32 COLMAP world points from a 2026-format sample.
197
+
198
+ sample['colmap'] must be a ZIP archive containing points3D.txt.
199
+ Fails fast if that file is missing (it is always present in the 2026 format).
200
+ """
201
+ colmap_blob = sample.get("colmap")
202
+ if colmap_blob is None:
203
+ return np.zeros((0, 3), dtype=np.float32)
204
+ if not isinstance(colmap_blob, (bytes, bytearray, memoryview)):
205
+ return np.zeros((0, 3), dtype=np.float32)
206
+
207
+ try:
208
+ with zipfile.ZipFile(BytesIO(colmap_blob)) as zf:
209
+ if "points3D.txt" not in set(zf.namelist()):
210
+ raise FileNotFoundError(
211
+ "COLMAP ZIP is missing points3D.txt -- "
212
+ "this is required in the 2026 dataset format")
213
+ with zf.open("points3D.txt") as f:
214
+ text = f.read().decode("utf-8", errors="ignore")
215
+ # Format: POINT3D_ID X Y Z R G B ERROR TRACK[]
216
+ # Filter comment/blank lines, parse columns 1-3 (X,Y,Z)
217
+ from io import StringIO
218
+ clean = "\n".join(l for l in text.split("\n") if l and not l.startswith("#"))
219
+ if not clean:
220
+ return np.zeros((0, 3), dtype=np.float32)
221
+ return np.loadtxt(StringIO(clean), dtype=np.float32, usecols=(1, 2, 3))
222
+ except zipfile.BadZipFile:
223
+ pass
224
+ return np.zeros((0, 3), dtype=np.float32)
225
+
226
+ # ---------------------------------------------------------------------------
227
+ # Label helpers
228
+ # ---------------------------------------------------------------------------
229
+
230
+ def _codes_from_image(img):
231
+ """Convert a PIL Image or numpy array to a (H, W) uint32 packed-RGB map."""
232
+ arr = np.asarray(img)
233
+ if arr.ndim == 2:
234
+ arr = np.stack([arr, arr, arr], axis=-1)
235
+ arr = arr[..., :3]
236
+ if arr.dtype != np.uint8:
237
+ arr = np.clip(arr, 0, 255).astype(np.uint8)
238
+ return _pack_rgb_u32(arr)
239
+
240
+
241
+ def _row_majority(values):
242
+ """Row-wise majority vote on (P, V) int array; -1 means "no vote".
243
+ Returns (P,) with the most frequent non-negative value per row, or -1.
244
+
245
+ Masks -1 entries before voting so that abstentions don't outvote
246
+ actual labels (which happens when a point is visible in only 1-2 views).
247
+ """
248
+ P, V = values.shape
249
+ result = np.full(P, -1, dtype=values.dtype)
250
+
251
+ # For each row, find the most frequent non-negative value.
252
+ # Vectorized approach: flatten valid entries per row using argmax on counts.
253
+ # Since values are typically small non-negative ints (0-200), we can use
254
+ # a simple max-of-first-valid approach for speed when V is small.
255
+ for vi in range(V):
256
+ # For rows still unset, take the first valid vote
257
+ col = values[:, vi]
258
+ unset = result == -1
259
+ has_val = col >= 0
260
+ update = unset & has_val
261
+ result[update] = col[update]
262
+
263
+ # Now refine: if a row has multiple different valid votes, pick the mode.
264
+ # Check if any row has conflicting votes across views.
265
+ has_any = np.any(values >= 0, axis=1)
266
+ n_valid = np.sum(values >= 0, axis=1)
267
+ needs_vote = has_any & (n_valid > 1)
268
+
269
+ if np.any(needs_vote):
270
+ for i in np.where(needs_vote)[0]:
271
+ valid = values[i][values[i] >= 0]
272
+ # Use numpy bincount for speed (values are small non-neg ints)
273
+ counts = np.bincount(valid.astype(np.intp))
274
+ result[i] = counts.argmax()
275
+
276
+ return result
277
+
278
+ # ---------------------------------------------------------------------------
279
+ # Semantic fusion: house-centric, occluder-aware
280
+ # ---------------------------------------------------------------------------
281
+
282
+ def _fuse_labels_for_points(
283
+ points_world, Ks, Rs, ts, ade_images, gestalt_images,
284
+ ade_transparent_codes, ade_occluder_allowed_ids,
285
+ min_support_views, valid_view_mask=None,
286
+ ):
287
+ """Multi-view semantic label fusion with majority voting.
288
+
289
+ For each 3D point, project into every valid view:
290
+ - ADE "envelope" class -> use the Gestalt label behind it.
291
+ - ADE non-envelope -> keep if on the occluder allowlist.
292
+ Then majority-vote across views.
293
+
294
+ Returns dict: keep, visible_src, visible_id, behind_gest_id, support
295
+ """
296
+ P = points_world.shape[0]
297
+ V = min(len(Ks), len(Rs), len(ts), len(ade_images), len(gestalt_images))
298
+ empty = {
299
+ "keep": np.zeros(P, dtype=bool),
300
+ "visible_src": np.zeros(P, np.uint8),
301
+ "visible_id": np.full(P, -1, np.int16),
302
+ "behind_gest_id": np.full(P, -1, np.int16),
303
+ "support": np.zeros(P, np.uint8),
304
+ }
305
+     if P == 0 or V == 0:
+         return empty
+
+     # Per-view labels. src: 1=gestalt, 2=ade; -1 = no contribution.
+     visible_src_pv = np.full((P, V), -1, dtype=np.int8)
+     visible_id_pv = np.full((P, V), -1, dtype=np.int32)
+     behind_id_pv = np.full((P, V), -1, dtype=np.int32)
+     support = np.zeros(P, dtype=np.int32)
+
+     ade_allowed_set = set(ade_occluder_allowed_ids.tolist())
+     ade_transparent_u32 = ade_transparent_codes.astype(np.uint32, copy=False)
+     gest_invalid_arr = np.array(list(GEST_INVALID_CODES), dtype=np.uint32)
+
+     for vi in range(V):
+         if valid_view_mask is not None and not valid_view_mask[vi]:
+             continue
+
+         K = np.asarray(Ks[vi], np.float32)
+         R = np.asarray(Rs[vi], np.float32)
+         t = np.asarray(ts[vi], np.float32).reshape(3, 1)
+
+         ade_codes_img = _codes_from_image(ade_images[vi])
+         gest_codes_img = _codes_from_image(gestalt_images[vi])
+         H, W = ade_codes_img.shape
+
+         u, v, valid = project_world_points(points_world, K, R, t)
+         in_img = valid & (u >= 0) & (u < W) & (v >= 0) & (v < H)
+         if not np.any(in_img):
+             continue
+
+         ui = np.clip(np.round(u[in_img]).astype(np.int32), 0, W - 1)
+         vi_pix = np.clip(np.round(v[in_img]).astype(np.int32), 0, H - 1)
+         ade_codes = ade_codes_img[vi_pix, ui]
+         gest_codes = gest_codes_img[vi_pix, ui]
+
+         in_house = ~np.isin(gest_codes, gest_invalid_arr)
+         if not np.any(in_house):
+             continue
+
+         idx = np.where(in_img)[0][in_house]
+         ade_codes_h = ade_codes[in_house]
+         gest_codes_h = gest_codes[in_house]
+
+         behind_local = np.array(
+             [GEST_RGBCODE_TO_ID.get(int(c), -1) for c in gest_codes_h],
+             dtype=np.int32)
+         behind_id_pv[idx, vi] = behind_local
+
+         ade_is_transparent = np.isin(ade_codes_h, ade_transparent_u32)
+
+         # Case A: ADE is envelope -- use Gestalt label.
+         mask_a = ade_is_transparent & (behind_local >= 0)
+         if np.any(mask_a):
+             visible_src_pv[idx[mask_a], vi] = 1
+             visible_id_pv[idx[mask_a], vi] = behind_local[mask_a]
+
+         # Case B: ADE is non-envelope -- use ADE label (allowlist-filtered).
+         mask_b = ~ade_is_transparent
+         if np.any(mask_b):
+             ade_local = np.array(
+                 [ADE_RGBCODE_TO_ID.get(int(c), -1) for c in ade_codes_h[mask_b]],
+                 dtype=np.int32)
+             on_allowlist = np.array(
+                 [int(a) in ade_allowed_set for a in ade_local], dtype=bool
+             ) & (ade_local >= 0)
+             if np.any(on_allowlist):
+                 visible_src_pv[idx[mask_b][on_allowlist], vi] = 2
+                 visible_id_pv[idx[mask_b][on_allowlist], vi] = ade_local[on_allowlist]
+
+         support[idx] += 1
+
+     # ---- Aggregate across views via majority vote ----
+     keep = (support >= min_support_views) & np.any(visible_src_pv >= 0, axis=1)
+
+     # Combine (src, id) into a single key for voting, then split back.
+     # src in {1,2} and id in [0, ~150], so stride=100k avoids collisions.
+     VIS_STRIDE = 100_000
+     vis_key = np.where(
+         visible_src_pv >= 0,
+         visible_src_pv.astype(np.int64) * VIS_STRIDE + visible_id_pv.astype(np.int64),
+         -1)
+     voted_key = _row_majority(vis_key)
+     voted_behind = _row_majority(behind_id_pv)
+
+     final_src = np.zeros(P, dtype=np.uint8)
+     final_id = np.full(P, -1, dtype=np.int16)
+     ok = voted_key >= 0
+     if np.any(ok):
+         final_src[ok] = (voted_key[ok] // VIS_STRIDE).astype(np.uint8)
+         final_id[ok] = (voted_key[ok] % VIS_STRIDE).astype(np.int16)
+
+     # ---- Vote confidence metadata ----
+     n_views_voted = np.sum(visible_src_pv >= 0, axis=1).astype(np.uint8)
+
+     # Fraction of voting views that agreed with the majority label
+     vote_frac = np.zeros(P, dtype=np.float32)
+     if np.any(ok):
+         for i in np.where(ok)[0]:
+             votes = vis_key[i][vis_key[i] >= 0]
+             if len(votes) > 0:
+                 vote_frac[i] = (votes == voted_key[i]).sum() / len(votes)
+
+     return {
+         "keep": keep,
+         "visible_src": final_src,
+         "visible_id": final_id,
+         "behind_gest_id": voted_behind.astype(np.int16),
+         "support": support.astype(np.uint8),
+         "n_views_voted": n_views_voted,
+         "vote_frac": vote_frac,
+     }
+
+ # ---------------------------------------------------------------------------
+ # Compact scene builder (2026 dataset format)
+ # ---------------------------------------------------------------------------
+
+ def _resolve_ade_codes(cfg):
+     """Return (transparent_codes, occluder_ids) for the given config.
+     Uses precomputed module-level arrays when the config has default names.
+     """
+     if cfg.ade_transparent_classes == ADE_TRANSPARENT_NAMES:
+         transparent = _DEFAULT_ADE_TRANSPARENT_CODES
+     else:
+         transparent = np.array(
+             [c for n in cfg.ade_transparent_classes
+              if (c := _name_to_packed_rgb(n, ade20k_color_mapping)) is not None],
+             dtype=np.uint32)
+
+     if cfg.ade_occluder_allowlist == ADE_OCCLUDER_ALLOWLIST_NAMES:
+         occluder_ids = _DEFAULT_ADE_OCCLUDER_IDS
+     else:
+         occluder_ids = np.array(
+             sorted({ADE_RGBCODE_TO_ID[c]
+                     for n in cfg.ade_occluder_allowlist
+                     if (c := _name_to_packed_rgb(n, ade20k_color_mapping)) is not None
+                     and c in ADE_RGBCODE_TO_ID}),
+             dtype=np.int32)
+     return transparent, occluder_ids
+
+
+ def _parse_gt_array(sample, key, dtype, expected_cols):
+     """Parse an optional ground-truth array from the sample dict."""
+     raw = sample.get(key)
+     if raw is None:
+         return None
+     arr = np.asarray(raw, dtype=dtype)
+     if arr.ndim == 2 and arr.shape[1] == expected_cols:
+         return arr
+     return None
+
+
+ def build_compact_scene(sample, cfg, rng):
+     """Build a compact semantic point representation from a HuggingFace sample.
+
+     Expected sample keys: K, R, t, ade, gestalt, depth, colmap,
+     pose_only_in_colmap, wf_vertices (opt), wf_edges (opt), __key__ (opt).
+
+     Returns dict (xyz, source, visible_src, visible_id, behind_gest_id,
+     gt_vertices, gt_edges, sample_id) or None if no points survive fusion.
+     """
+     Ks = sample.get("K") or []
+     Rs = sample.get("R") or []
+     ts = sample.get("t") or []
+     ade_imgs = sample.get("ade") or []
+     gest_imgs = sample.get("gestalt") or []
+     depths = sample.get("depth") or []
+     pose_flags = sample.get("pose_only_in_colmap") or []
+
+     V = min(len(Ks), len(Rs), len(ts), len(ade_imgs), len(gest_imgs))
+     if V == 0:
+         return None
+
+     valid_view = [not (vi < len(pose_flags) and pose_flags[vi]) for vi in range(V)]
+     if not any(valid_view):
+         return None
+
+     # ---- COLMAP points ----
+     colmap_pts = extract_colmap_points_2026(sample)
+
+     # ---- Precompute house masks (from Gestalt), optionally dilated ----
+     gest_invalid_arr = np.array(list(GEST_INVALID_CODES), dtype=np.uint32)
+     house_masks = []
+     for vi in range(V):
+         if not valid_view[vi]:
+             house_masks.append(None)
+             continue
+         mask = ~np.isin(_codes_from_image(gest_imgs[vi]), gest_invalid_arr)
+         if cfg.house_mask_dilate_px > 0:
+             mask = dilate_mask(mask, cfg.house_mask_dilate_px)
+         house_masks.append(mask)
+
+     # ---- Sample depth points per view ----
+     depth_points_all = []
+     for vi in range(min(V, len(depths))):
+         if not valid_view[vi] or depths[vi] is None:
+             continue
+         d = clean_depth(
+             np.asarray(depths[vi], dtype=np.float32) * cfg.depth_scale,
+             cfg.depth_clip_percentile)
+         pts = unproject_depth_to_world(
+             depth=d,
+             K=np.asarray(Ks[vi], np.float32),
+             R=np.asarray(Rs[vi], np.float32),
+             t=np.asarray(ts[vi], np.float32).reshape(3, 1),
+             num_points=cfg.depth_points_per_view,
+             sample_mask=house_masks[vi], rng=rng)
+         if pts.shape[0]:
+             depth_points_all.append(pts)
+
+     # ---- Combine COLMAP + depth points ----
+     pts_list, src_list = [], []
+     if colmap_pts.shape[0]:
+         pts_list.append(colmap_pts)
+         src_list.append(np.zeros(colmap_pts.shape[0], dtype=np.uint8))  # 0=colmap
+     if depth_points_all:
+         all_depth = np.concatenate(depth_points_all, axis=0)
+         pts_list.append(all_depth)
+         src_list.append(np.ones(all_depth.shape[0], dtype=np.uint8))  # 1=depth
+     if not pts_list:
+         return None
+
+     points_world = np.concatenate(pts_list, axis=0).astype(np.float32, copy=False)
+     point_source = np.concatenate(src_list, axis=0).astype(np.uint8, copy=False)
+
+     # ---- Fuse semantic labels ----
+     ade_transparent_arr, ade_allow_ids = _resolve_ade_codes(cfg)
+     fused = _fuse_labels_for_points(
+         points_world=points_world, Ks=Ks, Rs=Rs, ts=ts,
+         ade_images=ade_imgs, gestalt_images=gest_imgs,
+         ade_transparent_codes=ade_transparent_arr,
+         ade_occluder_allowed_ids=ade_allow_ids,
+         min_support_views=cfg.min_support_views,
+         valid_view_mask=valid_view)
+
+     keep = fused["keep"]
+     if not np.any(keep):
+         return None
+
+     return {
+         "xyz": points_world[keep],
+         "source": point_source[keep],               # 0=colmap, 1=monodepth
+         "visible_src": fused["visible_src"][keep],  # 1=gestalt, 2=ade
+         "visible_id": fused["visible_id"][keep],
+         "behind_gest_id": fused["behind_gest_id"][keep],
+         "n_views_voted": fused["n_views_voted"][keep],
+         "vote_frac": fused["vote_frac"][keep],
+         "gt_vertices": _parse_gt_array(sample, "wf_vertices", np.float32, 3),
+         "gt_edges": _parse_gt_array(sample, "wf_edges", np.int64, 2),
+         "sample_id": sample.get("__key__", None),
+     }
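The stride-packed vote key is the one non-obvious trick in the fusion step, so here is a minimal standalone sketch (not part of the commit) of how it round-trips. The toy row_majority below mirrors _row_majority only so the snippet runs on its own; the input values are hypothetical.

import numpy as np

VIS_STRIDE = 100_000  # same stride as in point_fusion.py

def row_majority(values):
    # Per-row majority vote over non-negative entries; -1 where a row has no votes.
    out = np.full(values.shape[0], -1, dtype=np.int64)
    for i in range(values.shape[0]):
        valid = values[i][values[i] >= 0]
        if valid.size:
            out[i] = np.bincount(valid.astype(np.intp)).argmax()
    return out

# Two points seen in three views: (src, id) packed into one integer key.
src = np.array([[1, 1, 2], [2, -1, 2]])   # 1=gestalt, 2=ade, -1=no vote
cid = np.array([[4, 4, 9], [7, -1, 7]])
key = np.where(src >= 0, src.astype(np.int64) * VIS_STRIDE + cid, -1)

voted = row_majority(key)
print(voted // VIS_STRIDE, voted % VIS_STRIDE)  # [1 2] [4 7]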
s23dr_2026_example/postprocess_v2.py ADDED
@@ -0,0 +1,39 @@
+ """Post-processing functions for segment predictions."""
+ import numpy as np
+
+
+ def snap_to_point_cloud(vertices, xyz, class_id, snap_radius=0.5,
+                         target_classes=None):
+     """Snap vertices to nearby point cloud clusters of specific semantic classes."""
+     if target_classes is None:
+         target_classes = [1, 2]  # apex, eave_end_point
+
+     snapped = vertices.copy()
+     mask = np.isin(class_id, target_classes)
+
+     if mask.sum() < 2:
+         return snapped
+
+     target_pts = xyz[mask]
+
+     for i, v in enumerate(vertices):
+         dists = np.linalg.norm(target_pts - v, axis=-1)
+         close = dists < snap_radius
+         if close.sum() >= 2:
+             snapped[i] = target_pts[close].mean(axis=0)
+
+     return snapped
+
+
+ def snap_horizontal(vertices, edges, max_slope=0.05):
+     """Snap near-horizontal edges to be exactly horizontal."""
+     verts = vertices.copy()
+     for a, b in edges:
+         a, b = int(a), int(b)
+         dy = abs(verts[a, 1] - verts[b, 1])
+         dxz = np.sqrt((verts[a, 0] - verts[b, 0])**2 + (verts[a, 2] - verts[b, 2])**2)
+         if dxz > 0.1 and dy / dxz < max_slope:
+             avg_y = 0.5 * (verts[a, 1] + verts[b, 1])
+             verts[a, 1] = avg_y
+             verts[b, 1] = avg_y
+     return verts
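A hypothetical usage sketch for the two snapping passes (toy coordinates; class ids 1/2 follow the defaults above, and axis 1 is treated as vertical, matching snap_horizontal):

import numpy as np

vertices = np.array([[0.0, 3.0, 0.0], [4.0, 3.05, 0.0]], dtype=np.float32)
edges = np.array([[0, 1]])

xyz = np.array([[0.1, 3.1, 0.0], [-0.1, 2.9, 0.1], [9.0, 0.0, 0.0]], dtype=np.float32)
class_id = np.array([1, 1, 3])  # two apex points, one unrelated class

snapped = snap_to_point_cloud(vertices, xyz, class_id, snap_radius=0.5)
# vertex 0 snaps to the mean of the two apex points; vertex 1 is out of range
flat = snap_horizontal(snapped, edges, max_slope=0.05)
# dy/dxz ~= 0.0125 < 0.05, so both endpoints receive the averaged height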
s23dr_2026_example/segment_postprocess.py ADDED
@@ -0,0 +1,60 @@
+ from __future__ import annotations
+
+ import numpy as np
+
+
+ def merge_vertices(vertices: np.ndarray, edges: np.ndarray, thresh: float):
+     verts = np.asarray(vertices, dtype=np.float32)
+     edges = np.asarray(edges, dtype=np.int64)
+     if verts.size == 0 or edges.size == 0:
+         return verts, edges
+
+     n = verts.shape[0]
+     parent = np.arange(n, dtype=np.int64)
+
+     def find(i):
+         while parent[i] != i:
+             parent[i] = parent[parent[i]]
+             i = parent[i]
+         return i
+
+     def union(i, j):
+         ri = find(i)
+         rj = find(j)
+         if ri != rj:
+             parent[rj] = ri
+
+     for i in range(n):
+         vi = verts[i]
+         for j in range(i + 1, n):
+             if np.linalg.norm(vi - verts[j]) <= thresh:
+                 union(i, j)
+
+     clusters = {}
+     for i in range(n):
+         root = find(i)
+         clusters.setdefault(root, []).append(i)
+
+     new_vertices = []
+     mapping = {}
+     for new_idx, idxs in enumerate(clusters.values()):
+         pts = verts[idxs]
+         center = pts.mean(axis=0)
+         new_vertices.append(center)
+         for i in idxs:
+             mapping[i] = new_idx
+
+     new_edges = []
+     seen = set()
+     for a, b in edges:
+         na = mapping.get(int(a), int(a))
+         nb = mapping.get(int(b), int(b))
+         if na == nb:
+             continue
+         key = (na, nb) if na <= nb else (nb, na)
+         if key in seen:
+             continue
+         seen.add(key)
+         new_edges.append([na, nb])
+
+     return np.asarray(new_vertices, dtype=np.float32), np.asarray(new_edges, dtype=np.int64)
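A quick sanity sketch on toy data: two near-duplicate corners collapse into one vertex, and the edge that became a duplicate is dropped.

import numpy as np

verts = np.array([[0.0, 0.0, 0.0], [0.02, 0.0, 0.0], [1.0, 0.0, 0.0]], dtype=np.float32)
edges = np.array([[0, 2], [1, 2]])

new_v, new_e = merge_vertices(verts, edges, thresh=0.05)
print(new_v.shape, new_e.shape)  # (2, 3) (1, 2) -- duplicate corner and edge merged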
s23dr_2026_example/sinkhorn.py ADDED
@@ -0,0 +1,181 @@
+ """Sinkhorn optimal transport loss for segment matching.
+
+ Note: at eps=0.05, Sinkhorn gradients are near-zero (~1e-7 norm) for
+ typical matrix sizes, so the loss value is tracked but does not
+ meaningfully train the model; the default is sinkhorn_weight=0.0.
+ See worklog.md for details.
+
+ Future: schedule eps from large (1.0) to small (0.05) during training
+ to get useful gradients early and precise matching late.
+ """
+ import torch
+
+
+ def segment_pair_cost(pred_segments: torch.Tensor, gt_segments: torch.Tensor) -> torch.Tensor:
+     """Cost between pred and GT segments: midpoint + direction + length (decoupled).
+     pred_segments: [N, 2, 3], gt_segments: [M, 2, 3] -> [N, M]
+     """
+     p0, p1 = pred_segments[:, 0], pred_segments[:, 1]
+     g0, g1 = gt_segments[:, 0], gt_segments[:, 1]
+     mid_p, half_p = 0.5 * (p0 + p1), 0.5 * (p1 - p0)
+     mid_g, half_g = 0.5 * (g0 + g1), 0.5 * (g1 - g0)
+     d_mid = torch.cdist(mid_p, mid_g)
+     len_p = torch.linalg.norm(half_p, dim=-1, keepdim=True).clamp(min=1e-6)
+     len_g = torch.linalg.norm(half_g, dim=-1, keepdim=True).clamp(min=1e-6)
+     dir_p = half_p / len_p
+     dir_g = half_g / len_g
+     cos_angle = (dir_p[:, None, :] * dir_g[None, :, :]).sum(dim=-1)
+     d_dir = 1.0 - cos_angle.abs()
+     d_len = (len_p[:, None, :] - len_g[None, :, :]).squeeze(-1).abs()
+     return d_mid + d_dir + d_len
+
+
+ def batched_sinkhorn_loss(
+     pred_segments: torch.Tensor,
+     gt_pad: torch.Tensor,
+     gt_mask: torch.Tensor,
+     eps: float,
+     iters: int,
+     dustbin_cost: float | torch.Tensor,
+     pred_mass: torch.Tensor | None = None,
+ ) -> torch.Tensor:
+     """Batched sinkhorn segment matching loss.
+
+     Args:
+         pred_segments: [B, S, 2, 3] predicted segments
+         gt_pad: [B, M, 2, 3] padded GT segments
+         gt_mask: [B, M] bool mask (True = valid GT segment)
+         eps: sinkhorn regularization
+         iters: sinkhorn iterations
+         dustbin_cost: cost for unmatched segments (scalar or [B])
+         pred_mass: [B, S] per-segment mass weights (e.g. sigmoid(conf)).
+             If None, uniform masses are used.
+
+     Returns:
+         [B] per-sample sinkhorn transport cost
+     """
+     B, S, _, _ = pred_segments.shape
+     M = gt_pad.shape[1]
+
+     # Allow per-sample dustbin cost
+     dc = torch.as_tensor(dustbin_cost, device=pred_segments.device, dtype=pred_segments.dtype)
+     if dc.dim() == 0:
+         dc = dc.expand(B)
+
+     # Compute cost matrices [B, S, M] in midpoint-halfvec space.
+     # Decouples position from direction: mid gradient is pure position,
+     # half gradient is pure direction/length. Sign-invariance on half
+     # handles segment direction ambiguity cleanly.
+     p0 = pred_segments[:, :, 0]  # [B, S, 3]
+     p1 = pred_segments[:, :, 1]  # [B, S, 3]
+     g0 = gt_pad[:, :, 0]         # [B, M, 3]
+     g1 = gt_pad[:, :, 1]         # [B, M, 3]
+
+     mid_pred = 0.5 * (p0 + p1)   # [B, S, 3]
+     half_pred = 0.5 * (p1 - p0)  # [B, S, 3]
+     mid_gt = 0.5 * (g0 + g1)     # [B, M, 3]
+     half_gt = 0.5 * (g1 - g0)    # [B, M, 3]
+
+     # Midpoint distance [B, S, M]
+     d_mid = torch.linalg.norm(
+         mid_pred.unsqueeze(2) - mid_gt.unsqueeze(1), dim=-1)
+
+     # Decoupled direction + length distance (sign-invariant for direction ambiguity)
+     len_pred = torch.linalg.norm(half_pred, dim=-1, keepdim=True).clamp(min=1e-6)  # [B, S, 1]
+     len_gt = torch.linalg.norm(half_gt, dim=-1, keepdim=True).clamp(min=1e-6)      # [B, M, 1]
+     dir_pred = half_pred / len_pred  # [B, S, 3]
+     dir_gt = half_gt / len_gt        # [B, M, 3]
+
+     # Direction distance: 1 - |cos(angle)|, sign-invariant [B, S, M]
+     cos_angle = (dir_pred.unsqueeze(2) * dir_gt.unsqueeze(1)).sum(dim=-1)  # [B, S, M]
+     d_dir = 1.0 - cos_angle.abs()
+
+     # Length distance [B, S, M]
+     d_len = (len_pred.unsqueeze(2) - len_gt.unsqueeze(1)).squeeze(-1).abs()
+
+     cost = d_mid + d_dir + d_len  # [B, S, M]
+
+     # Mask invalid GT segments with high cost so they go to dustbin
+     cost = torch.where(gt_mask.unsqueeze(1), cost, dc[:, None, None] * 10.0)
+
+     # Pad with dustbin row and column: [B, S+1, M+1]
+     cost_pad = dc[:, None, None].expand(B, S + 1, M + 1).clone()
+     cost_pad[:, :S, :M] = cost
+     cost_pad[:, -1, -1] = 0.0
+
+     # Masses
+     gt_counts = gt_mask.sum(dim=1).float()  # [B]
+
+     if pred_mass is not None:
+         # Confidence-weighted masses (matches learned_v2 approach).
+         # sigmoid(conf) gives per-segment mass; dustbin masses balance the totals.
+         # No normalization -- sum(a) == sum(b) == max(sum_pred, sum_gt).
+         pm = pred_mass.clamp(min=0.0)                      # [B, S]
+         sum_pred = pm.sum(dim=1)                           # [B]
+         sum_gt = gt_counts                                 # [B]
+         pred_dustbin = (sum_gt - sum_pred).clamp(min=0.0)  # [B]
+         gt_dustbin = (sum_pred - sum_gt).clamp(min=0.0)    # [B]
+         a = torch.cat([pm, pred_dustbin.unsqueeze(1)], dim=1)  # [B, S+1]
+         b_val = torch.zeros(B, M + 1, device=cost.device, dtype=cost.dtype)
+         b_val[:, :M] = gt_mask.float()  # 1.0 per valid GT segment
+         b_val[:, -1] = gt_dustbin
+     else:
+         # Uniform masses (normalized)
+         n = float(S)
+         denom = n + gt_counts  # [B]
+         a = (1.0 / denom).unsqueeze(1).expand(B, S + 1).clone()  # [B, S+1]
+         a[:, -1] = gt_counts / denom
+         b_val = (1.0 / denom).unsqueeze(1).expand(B, M + 1).clone()  # [B, M+1]
+         b_val[:, -1] = n / denom
+         # Zero out mass for invalid GT
+         b_val[:, :M] = b_val[:, :M] * gt_mask.float()
+
+     # Log-domain sinkhorn
+     log_a = torch.log(a + 1e-9)
+     log_b = torch.log(b_val + 1e-9)
+     log_k = -cost_pad / eps
+
+     log_u = torch.zeros_like(a)
+     log_v = torch.zeros_like(b_val)
+
+     for _ in range(iters):
+         log_u = log_a - torch.logsumexp(log_k + log_v.unsqueeze(1), dim=2)
+         log_v = log_b - torch.logsumexp(log_k + log_u.unsqueeze(2), dim=1)
+
+     transport = torch.exp(log_u.unsqueeze(2) + log_v.unsqueeze(1) + log_k)
+     return (transport * cost_pad).sum(dim=(1, 2))  # [B]
+
+
+ # Keep the per-sample version for compatibility
+ def sinkhorn_segment_loss(
+     pred_segments: torch.Tensor,
+     gt_segments: torch.Tensor,
+     eps: float,
+     iters: int,
+     dustbin_cost: float,
+     pred_mass: torch.Tensor | None = None,
+ ) -> torch.Tensor:
+     if pred_segments.numel() == 0 or gt_segments.numel() == 0:
+         return pred_segments.new_tensor(dustbin_cost)
+     cost = segment_pair_cost(pred_segments, gt_segments)
+     n, m = cost.shape
+     if n == 0 or m == 0:
+         return cost.new_tensor(dustbin_cost)
+     cost_pad = torch.full((n + 1, m + 1), dustbin_cost, device=cost.device, dtype=cost.dtype)
+     cost_pad[:n, :m] = cost
+     cost_pad[-1, -1] = 0.0
+     denom = float(n + m)
+     a = torch.full((n + 1,), 1.0 / denom, device=cost.device, dtype=cost.dtype)
+     b = torch.full((m + 1,), 1.0 / denom, device=cost.device, dtype=cost.dtype)
+     a[-1] = m / denom
+     b[-1] = n / denom
+     log_a = torch.log(a + 1e-9)
+     log_b = torch.log(b + 1e-9)
+     log_k = -cost_pad / eps
+     log_u = torch.zeros_like(a)
+     log_v = torch.zeros_like(b)
+     for _ in range(iters):
+         log_u = log_a - torch.logsumexp(log_k + log_v[None, :], dim=1)
+         log_v = log_b - torch.logsumexp(log_k + log_u[:, None], dim=0)
+     transport = torch.exp(log_u[:, None] + log_v[None, :] + log_k)
+     return torch.sum(transport * cost_pad)
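To make the mass and dustbin mechanics concrete, here is a smoke-test sketch on random tensors, together with one possible shape for the eps schedule suggested in the module docstring (the schedule function is an illustrative assumption, not something this commit implements):

import torch

def eps_schedule(step, total, eps_hi=1.0, eps_lo=0.05):
    # Hypothetical linear schedule: large eps early for smooth gradients,
    # small eps late for sharper matching.
    frac = min(step / max(total, 1), 1.0)
    return eps_hi + frac * (eps_lo - eps_hi)

B, S, M = 2, 8, 5
pred = torch.randn(B, S, 2, 3, requires_grad=True)
gt = torch.randn(B, M, 2, 3)
mask = torch.ones(B, M, dtype=torch.bool)

loss = batched_sinkhorn_loss(pred, gt, mask, eps=eps_schedule(0, 1000),
                             iters=30, dustbin_cost=2.0)
loss.sum().backward()
print(loss.shape, pred.grad.abs().mean())  # torch.Size([2]) and a nonzero gradient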
s23dr_2026_example/soft_hss_loss.py ADDED
@@ -0,0 +1,507 @@
+ import torch
+
+
+ def _softmin(values: torch.Tensor, dim: int, tau: float) -> torch.Tensor:
+     tau_t = torch.as_tensor(tau, device=values.device, dtype=values.dtype).clamp_min(1e-8)
+     return -tau_t * torch.logsumexp(-values / tau_t, dim=dim)
+
+
+ def point_segment_distance_squared(
+     points: torch.Tensor,
+     seg_a: torch.Tensor,
+     seg_b: torch.Tensor,
+     eps: float = 1e-9,
+ ) -> torch.Tensor:
+     """
+     points: (P,3)
+     seg_a/seg_b: (S,3)
+     returns dist2: (P,S)
+     """
+     ab = seg_b - seg_a  # (S,3)
+     ab2 = (ab * ab).sum(dim=-1).clamp_min(eps)  # (S,)
+     ap = points[:, None, :] - seg_a[None, :, :]  # (P,S,3)
+     t = (ap * ab[None, :, :]).sum(dim=-1) / ab2[None, :]  # (P,S)
+     t = t.clamp(0.0, 1.0)
+     closest = seg_a[None, :, :] + t[:, :, None] * ab[None, :, :]
+     diff = points[:, None, :] - closest
+     return (diff * diff).sum(dim=-1)
+
+
+ def distance_to_segments(
+     points: torch.Tensor,
+     segments: torch.Tensor,
+     eps: float = 1e-9,
+ ) -> torch.Tensor:
+     """
+     points: (P,3)
+     segments: (S,2,3)
+     returns min distance: (P,)
+     """
+     a = segments[:, 0]
+     b = segments[:, 1]
+     dist2 = point_segment_distance_squared(points, a, b, eps=eps)
+     return torch.sqrt(dist2.min(dim=1).values + eps)
+
+
+ def soft_vertex_f1(
+     pred_vertices: torch.Tensor,
+     gt_vertices: torch.Tensor,
+     thresh: float,
+     tau: float = 0.05,
+     softmin_tau: float = 0.05,
+     eps: float = 1e-8,
+ ) -> torch.Tensor:
+     """
+     Soft surrogate for the Hungarian-thresholded corner F1 used by HSS.
+
+     Uses (soft) nearest-neighbor distances and a sigmoid threshold.
+     """
+     if pred_vertices.numel() == 0 or gt_vertices.numel() == 0:
+         return torch.zeros((), device=pred_vertices.device, dtype=pred_vertices.dtype)
+
+     pred = pred_vertices
+     gt = gt_vertices
+
+     diff = pred[:, None, :] - gt[None, :, :]
+     dist = torch.sqrt((diff * diff).sum(dim=-1) + eps)  # (P,G)
+
+     d_pred = _softmin(dist, dim=1, tau=softmin_tau)  # (P,)
+     d_gt = _softmin(dist, dim=0, tau=softmin_tau)    # (G,)
+
+     tau_t = torch.as_tensor(tau, device=dist.device, dtype=dist.dtype).clamp_min(1e-8)
+     thresh_t = torch.as_tensor(thresh, device=dist.device, dtype=dist.dtype)
+     p_match = torch.sigmoid((thresh_t - d_pred) / tau_t).mean()
+     r_match = torch.sigmoid((thresh_t - d_gt) / tau_t).mean()
+     return 2.0 * p_match * r_match / (p_match + r_match + eps)
+
+
+ def soft_tube_iou_mc(
+     pred_segments: torch.Tensor,
+     gt_segments: torch.Tensor,
+     radius: float,
+     n_samples: int = 4096,
+     tau: float = 0.05,
+     seed: int = 0,
+     eps: float = 1e-8,
+ ) -> torch.Tensor:
+     """
+     Soft surrogate for volumetric tube IoU (edge_thresh in HSS).
+
+     Samples points uniformly in a padded bbox around {pred,gt} endpoints.
+     Occupancy is sigmoid((radius - d(x, segments))/tau).
+     IoU is approximated by mean(min(occ_p, occ_g)) / mean(max(occ_p, occ_g)).
+     """
+     if pred_segments.numel() == 0 or gt_segments.numel() == 0:
+         return torch.zeros((), device=pred_segments.device, dtype=pred_segments.dtype)
+
+     pts_all = torch.cat([pred_segments.reshape(-1, 3), gt_segments.reshape(-1, 3)], dim=0)
+     pad = torch.as_tensor(radius, device=pts_all.device, dtype=pts_all.dtype)
+     lo = pts_all.min(dim=0).values - pad
+     hi = pts_all.max(dim=0).values + pad
+
+     gen = torch.Generator(device=pts_all.device)
+     gen.manual_seed(int(seed))
+     u = torch.rand((int(n_samples), 3), generator=gen, device=pts_all.device, dtype=pts_all.dtype)
+     x = lo[None, :] + u * (hi - lo)[None, :]
+
+     d_p = distance_to_segments(x, pred_segments, eps=eps)
+     d_g = distance_to_segments(x, gt_segments, eps=eps)
+
+     tau_t = torch.as_tensor(tau, device=pts_all.device, dtype=pts_all.dtype).clamp_min(1e-8)
+     rad_t = torch.as_tensor(radius, device=pts_all.device, dtype=pts_all.dtype)
+     occ_p = torch.sigmoid((rad_t - d_p) / tau_t)
+     occ_g = torch.sigmoid((rad_t - d_g) / tau_t)
+
+     inter = torch.minimum(occ_p, occ_g).mean()
+     union = torch.maximum(occ_p, occ_g).mean().clamp_min(eps)
+     return inter / union
+
+
+ def soft_hss(
+     pred_segments: torch.Tensor,
+     gt_segments: torch.Tensor,
+     gt_vertices: torch.Tensor,
+     vert_thresh: float = 0.5,
+     edge_thresh: float = 0.5,
+     tau: float = 0.05,
+     softmin_tau: float = 0.05,
+     n_samples: int = 4096,
+     seed: int = 0,
+     eps: float = 1e-8,
+ ):
+     """
+     Returns (soft_hss, soft_f1, soft_iou), all scalars in [0,1] (approximately).
+     """
+     pred_vertices = pred_segments.reshape(-1, 3)
+     f1 = soft_vertex_f1(pred_vertices, gt_vertices, thresh=vert_thresh, tau=tau, softmin_tau=softmin_tau, eps=eps)
+     iou = soft_tube_iou_mc(
+         pred_segments,
+         gt_segments,
+         radius=edge_thresh,
+         n_samples=n_samples,
+         tau=tau,
+         seed=seed,
+         eps=eps,
+     )
+     denom = (f1 + iou).clamp_min(eps)
+     hss = 2.0 * f1 * iou / denom
+     return hss, f1, iou
+
+
+ # ---------------------------------------------------------------------------
+ # Improved: Sinkhorn-matched vertex F1
+ # ---------------------------------------------------------------------------
+ #
+ # The original soft_vertex_f1 uses independent softmin nearest-neighbor
+ # distances, which allows multiple predicted vertices to claim the same GT
+ # vertex. This inflates precision and fails to penalize duplicate vertices --
+ # the exact failure mode that requires merge_vertices post-processing.
+ #
+ # This version uses Sinkhorn optimal transport to find a soft one-to-one
+ # assignment between predicted and GT vertices, then computes precision and
+ # recall from the matched distances. This is a better surrogate for the
+ # Hungarian matching used by the real HSS metric.
+
+
+ def sinkhorn_vertex_f1(
+     pred_vertices: torch.Tensor,
+     gt_vertices: torch.Tensor,
+     thresh: float = 0.5,
+     tau: float = 0.05,
+     eps_sinkhorn: float = 0.05,
+     iters: int = 20,
+     eps: float = 1e-8,
+ ) -> torch.Tensor:
+     """Soft vertex F1 using Sinkhorn matching (better aligned with real HSS).
+
+     Instead of independent nearest-neighbor distances (which allow double-
+     claiming), this uses optimal transport to find a soft one-to-one assignment
+     between predicted and GT vertices.
+
+     Returns a differentiable scalar in [0, 1].
+     """
+     if pred_vertices.numel() == 0 or gt_vertices.numel() == 0:
+         return torch.zeros((), device=pred_vertices.device, dtype=pred_vertices.dtype)
+
+     P = pred_vertices.shape[0]
+     G = gt_vertices.shape[0]
+
+     # Pairwise distance matrix (P, G)
+     dist = torch.cdist(pred_vertices, gt_vertices)
+
+     # Sinkhorn with dustbin: (P+1) x (G+1)
+     # Dustbin cost = thresh (unmatched vertices are "at threshold distance")
+     dustbin = thresh
+     cost_pad = torch.full((P + 1, G + 1), dustbin, device=dist.device, dtype=dist.dtype)
+     cost_pad[:P, :G] = dist
+     cost_pad[-1, -1] = 0.0
+
+     # Uniform masses with dustbin slack
+     denom = float(P + G)
+     a = torch.full((P + 1,), 1.0 / denom, device=dist.device, dtype=dist.dtype)
+     b = torch.full((G + 1,), 1.0 / denom, device=dist.device, dtype=dist.dtype)
+     a[-1] = G / denom  # pred dustbin absorbs unmatched GT
+     b[-1] = P / denom  # GT dustbin absorbs unmatched pred
+
+     # Log-domain Sinkhorn
+     log_a = torch.log(a + 1e-9)
+     log_b = torch.log(b + 1e-9)
+     log_k = -cost_pad / max(eps_sinkhorn, 1e-6)
+     log_u = torch.zeros_like(a)
+     log_v = torch.zeros_like(b)
+     for _ in range(iters):
+         log_u = log_a - torch.logsumexp(log_k + log_v[None, :], dim=1)
+         log_v = log_b - torch.logsumexp(log_k + log_u[:, None], dim=0)
+
+     # Transport plan (P+1, G+1)
+     transport = torch.exp(log_u[:, None] + log_v[None, :] + log_k)
+
+     # Extract the non-dustbin transport (P, G) -- these are the soft assignments
+     T = transport[:P, :G]
+
+     # For each predicted vertex, its matched distance is the transport-weighted
+     # average distance to GT vertices.
+     # Normalize rows to sum to 1 (how much of this pred is matched vs dustbin)
+     row_sums = T.sum(dim=1).clamp_min(eps)
+     matched_dist_pred = (T * dist).sum(dim=1) / row_sums  # (P,)
+     match_weight_pred = row_sums * denom  # how much of this pred is matched (0-1 ish)
+
+     # Same for GT vertices (column perspective)
+     col_sums = T.sum(dim=0).clamp_min(eps)
+     matched_dist_gt = (T * dist).sum(dim=0) / col_sums  # (G,)
+     match_weight_gt = col_sums * denom
+
+     # Soft precision: fraction of pred vertices that are matched AND within threshold
+     tau_t = torch.as_tensor(tau, device=dist.device, dtype=dist.dtype).clamp_min(1e-8)
+     thresh_t = torch.as_tensor(thresh, device=dist.device, dtype=dist.dtype)
+
+     prec_per = match_weight_pred * torch.sigmoid((thresh_t - matched_dist_pred) / tau_t)
+     precision = prec_per.mean()
+
+     # Soft recall: fraction of GT vertices that are matched AND within threshold
+     rec_per = match_weight_gt * torch.sigmoid((thresh_t - matched_dist_gt) / tau_t)
+     recall = rec_per.mean()
+
+     return 2.0 * precision * recall / (precision + recall + eps)
+
+
+ # ---------------------------------------------------------------------------
+ # Improved: Segment-sampled tube IoU
+ # ---------------------------------------------------------------------------
+ #
+ # The original soft_tube_iou_mc samples random points in the bounding box,
+ # wasting most samples in empty space. This version samples along the segments
+ # themselves, concentrating gradient signal where it matters.
+
+
+ def _sample_along_segments(segments: torch.Tensor, n_per_seg: int = 64) -> torch.Tensor:
+     """Sample n_per_seg points uniformly along each segment.
+
+     segments: (S, 2, 3)
+     returns: (S * n_per_seg, 3)
+     """
+     t = torch.linspace(0, 1, n_per_seg, device=segments.device, dtype=segments.dtype)
+     # (S, 1, 3) + (1, N, 1) * (S, 1, 3) -> (S, N, 3)
+     a = segments[:, 0:1, :]
+     b = segments[:, 1:2, :]
+     pts = a + t[None, :, None] * (b - a)
+     return pts.reshape(-1, 3)
+
+
+ def segment_sampled_tube_iou(
+     pred_segments: torch.Tensor,
+     gt_segments: torch.Tensor,
+     radius: float = 0.5,
+     n_per_seg: int = 64,
+     tau: float = 0.05,
+     eps: float = 1e-8,
+ ) -> torch.Tensor:
+     """Soft tube IoU by sampling along segments instead of in the bounding box.
+
+     Samples points along predicted and GT segments, then checks what fraction
+     of each set falls within radius of the other. More sample-efficient than
+     bbox Monte Carlo and gives better gradients.
+
+     Returns a differentiable scalar in [0, 1].
+     """
+     if pred_segments.numel() == 0 or gt_segments.numel() == 0:
+         return torch.zeros((), device=pred_segments.device, dtype=pred_segments.dtype)
+
+     pred_pts = _sample_along_segments(pred_segments, n_per_seg)
+     gt_pts = _sample_along_segments(gt_segments, n_per_seg)
+
+     tau_t = torch.as_tensor(tau, device=pred_pts.device, dtype=pred_pts.dtype).clamp_min(1e-8)
+     rad_t = torch.as_tensor(radius, device=pred_pts.device, dtype=pred_pts.dtype)
+
+     # Precision: fraction of pred points within radius of any GT segment
+     d_pred = distance_to_segments(pred_pts, gt_segments, eps=eps)
+     prec = torch.sigmoid((rad_t - d_pred) / tau_t).mean()
+
+     # Recall: fraction of GT points within radius of any pred segment
+     d_gt = distance_to_segments(gt_pts, pred_segments, eps=eps)
+     rec = torch.sigmoid((rad_t - d_gt) / tau_t).mean()
+
+     # Soft IoU from precision and recall:
+     # IoU = intersection/union = (P*R) / (P + R - P*R) for occupancy overlap
+     return prec * rec / (prec + rec - prec * rec + eps)
+
+
+ def soft_hss_v2(
+     pred_segments: torch.Tensor,
+     gt_segments: torch.Tensor,
+     gt_vertices: torch.Tensor,
+     vert_thresh: float = 0.5,
+     edge_thresh: float = 0.5,
+     tau: float = 0.05,
+     sinkhorn_eps: float = 0.05,
+     sinkhorn_iters: int = 20,
+     n_per_seg: int = 64,
+     eps: float = 1e-8,
+ ):
+     """Improved soft HSS using Sinkhorn vertex matching + segment-sampled IoU.
+
+     Returns (soft_hss, soft_f1, soft_iou).
+     """
+     pred_vertices = pred_segments.reshape(-1, 3)
+     f1 = sinkhorn_vertex_f1(
+         pred_vertices, gt_vertices,
+         thresh=vert_thresh, tau=tau,
+         eps_sinkhorn=sinkhorn_eps, iters=sinkhorn_iters, eps=eps,
+     )
+     iou = segment_sampled_tube_iou(
+         pred_segments, gt_segments,
+         radius=edge_thresh, n_per_seg=n_per_seg, tau=tau, eps=eps,
+     )
+     denom = (f1 + iou).clamp_min(eps)
+     hss = 2.0 * f1 * iou / denom
+     return hss, f1, iou
+
+
+ # ---------------------------------------------------------------------------
+ # Batched versions for training speed
+ # ---------------------------------------------------------------------------
+
+
+ def batched_sinkhorn_vertex_f1(
+     pred_segments: torch.Tensor,
+     gt_pad: torch.Tensor,
+     gt_mask: torch.Tensor,
+     thresh: float | torch.Tensor = 0.5,
+     tau: float | torch.Tensor = 0.05,
+     eps_sinkhorn: float = 0.05,
+     iters: int = 10,
+     eps: float = 1e-8,
+ ) -> torch.Tensor:
+     """Batched Sinkhorn vertex F1 loss.
+
+     Args:
+         pred_segments: [B, S, 2, 3] predicted segments
+         gt_pad: [B, M, 2, 3] padded GT segments
+         gt_mask: [B, M] bool mask (True = valid GT segment)
+         thresh: distance threshold for a vertex match (scalar or [B])
+         tau: sigmoid temperature (scalar or [B])
+     Returns:
+         [B] per-sample (1 - F1) loss
+     """
+     B, S = pred_segments.shape[:2]
+     M = gt_pad.shape[1]
+     P = S * 2  # pred vertices (both endpoints)
+
+     # Allow per-sample thresh and tau ([B] tensors or scalars)
+     thresh_t = torch.as_tensor(thresh, device=pred_segments.device, dtype=pred_segments.dtype)
+     if thresh_t.dim() == 0:
+         thresh_t = thresh_t.expand(B)
+     tau_t = torch.as_tensor(tau, device=pred_segments.device, dtype=pred_segments.dtype)
+     if tau_t.dim() == 0:
+         tau_t = tau_t.expand(B)
+     tau_t = tau_t.clamp_min(1e-8)
+
+     pred_verts = pred_segments.reshape(B, P, 3)
+     gt_verts = gt_pad.reshape(B, M * 2, 3)  # will mask invalid ones
+
+     # Build GT vertex mask: each valid segment contributes 2 vertices
+     gt_vert_mask = gt_mask.unsqueeze(2).expand(B, M, 2).reshape(B, M * 2)
+     G = M * 2
+
+     # Pairwise distances [B, P, G]
+     dist = torch.linalg.norm(
+         pred_verts.unsqueeze(2) - gt_verts.unsqueeze(1), dim=-1)
+
+     # Mask invalid GT with high distance
+     dist = torch.where(gt_vert_mask.unsqueeze(1), dist, thresh_t[:, None, None] * 10.0)
+
+     # Sinkhorn matching: [B, P+1, G+1]
+     cost_pad = thresh_t[:, None, None].expand(B, P + 1, G + 1).clone()
+     cost_pad[:, :P, :G] = dist
+     cost_pad[:, -1, -1] = 0.0
+
+     gt_counts = gt_vert_mask.sum(dim=1).float()  # [B]
+     n = float(P)
+     denom = n + gt_counts  # [B]
+
+     a = (1.0 / denom).unsqueeze(1).expand(B, P + 1).clone()
+     a[:, -1] = gt_counts / denom
+     b = (1.0 / denom).unsqueeze(1).expand(B, G + 1).clone()
+     b[:, -1] = n / denom
+     b[:, :G] = b[:, :G] * gt_vert_mask.float()
+
+     log_a = torch.log(a + 1e-9)
+     log_b = torch.log(b + 1e-9)
+     log_k = -cost_pad / max(eps_sinkhorn, 1e-6)
+     log_u = torch.zeros_like(a)
+     log_v = torch.zeros_like(b)
+
+     for _ in range(iters):
+         log_u = log_a - torch.logsumexp(log_k + log_v.unsqueeze(1), dim=2)
+         log_v = log_b - torch.logsumexp(log_k + log_u.unsqueeze(2), dim=1)
+
+     transport = torch.exp(log_u.unsqueeze(2) + log_v.unsqueeze(1) + log_k)
+     T = transport[:, :P, :G]  # [B, P, G]
+
+     # Matched distances
+     row_sums = T.sum(dim=2).clamp_min(eps)
+     matched_d_pred = (T * dist).sum(dim=2) / row_sums  # [B, P]
+     w_pred = row_sums * denom.unsqueeze(1)
+
+     col_sums = T.sum(dim=1).clamp_min(eps)
+     matched_d_gt = (T * dist).sum(dim=1) / col_sums  # [B, G]
+     w_gt = col_sums * denom.unsqueeze(1)
+
+     precision = (w_pred * torch.sigmoid((thresh_t[:, None] - matched_d_pred) / tau_t[:, None])).mean(dim=1)
+     recall_raw = w_gt * torch.sigmoid((thresh_t[:, None] - matched_d_gt) / tau_t[:, None])
+     # Mask invalid GT vertices in recall
+     recall = (recall_raw * gt_vert_mask.float()).sum(dim=1) / gt_counts.clamp_min(1.0)
+
+     f1 = 2.0 * precision * recall / (precision + recall + eps)
+     return 1.0 - f1  # return loss (1 - F1)
+
+
+ def batched_segment_sampled_iou(
+     pred_segments: torch.Tensor,
+     gt_pad: torch.Tensor,
+     gt_mask: torch.Tensor,
+     radius: float | torch.Tensor = 0.5,
+     n_per_seg: int = 32,
+     tau: float | torch.Tensor = 0.05,
+     eps: float = 1e-8,
+ ) -> torch.Tensor:
+     """Batched segment-sampled tube IoU loss.
+
+     Returns [B] per-sample (1 - IoU) loss.
+     """
+     B, S = pred_segments.shape[:2]
+     M = gt_pad.shape[1]
+
+     # Allow per-sample radius and tau ([B] tensors or scalars)
+     rad_t = torch.as_tensor(radius, device=pred_segments.device, dtype=pred_segments.dtype)
+     if rad_t.dim() == 0:
+         rad_t = rad_t.expand(B)
+     tau_t = torch.as_tensor(tau, device=pred_segments.device, dtype=pred_segments.dtype)
+     if tau_t.dim() == 0:
+         tau_t = tau_t.expand(B)
+     tau_t = tau_t.clamp_min(1e-8)
+
+     # Sample points along segments
+     t = torch.linspace(0, 1, n_per_seg, device=pred_segments.device, dtype=pred_segments.dtype)
+
+     # Pred points: [B, S*n_per_seg, 3]
+     pa = pred_segments[:, :, 0:1, :]  # [B, S, 1, 3]
+     pb = pred_segments[:, :, 1:2, :]
+     pred_pts = (pa + t[None, None, :, None] * (pb - pa)).reshape(B, S * n_per_seg, 3)
+
+     # GT points: [B, M*n_per_seg, 3]
+     ga = gt_pad[:, :, 0:1, :]
+     gb = gt_pad[:, :, 1:2, :]
+     gt_pts = (ga + t[None, None, :, None] * (gb - ga)).reshape(B, M * n_per_seg, 3)
+
+     # For each pred point, the minimum distance over all sampled GT points
+     d_pred_to_gt = torch.cdist(pred_pts, gt_pts)  # [B, S*n, M*n]
+     d_pred = d_pred_to_gt.min(dim=2).values  # [B, S*n]
+     prec = torch.sigmoid((rad_t[:, None] - d_pred) / tau_t[:, None]).mean(dim=1)  # [B]
+
+     d_gt_to_pred = d_pred_to_gt.min(dim=1).values  # [B, M*n]
+     # Mask invalid GT points
+     gt_pt_mask = gt_mask.unsqueeze(2).expand(B, M, n_per_seg).reshape(B, M * n_per_seg)
+     rec_raw = torch.sigmoid((rad_t[:, None] - d_gt_to_pred) / tau_t[:, None])
+     rec = (rec_raw * gt_pt_mask.float()).sum(dim=1) / gt_pt_mask.float().sum(dim=1).clamp_min(1.0)
+
+     iou = prec * rec / (prec + rec - prec * rec + eps)
+     return 1.0 - iou  # return loss
+
+
+ def batched_soft_hss_v2(pred_segments, gt_pad, gt_mask,
+                         vert_thresh=0.5, edge_thresh=0.5, tau=0.05,
+                         sinkhorn_iters=10, n_per_seg=32):
+     """Batched soft HSS loss. Returns [B] per-sample (1 - HSS)."""
+     f1_loss = batched_sinkhorn_vertex_f1(
+         pred_segments, gt_pad, gt_mask,
+         thresh=vert_thresh, tau=tau, iters=sinkhorn_iters)
+     iou_loss = batched_segment_sampled_iou(
+         pred_segments, gt_pad, gt_mask,
+         radius=edge_thresh, n_per_seg=n_per_seg, tau=tau)
+     f1 = 1.0 - f1_loss
+     iou = 1.0 - iou_loss
+     hss = 2.0 * f1 * iou / (f1 + iou + 1e-8)
+     return 1.0 - hss
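A usage sketch for the batched v2 loss on random tensors, with the module defaults (untuned values, for illustration only):

import torch

B, S, M = 2, 16, 6
pred = torch.randn(B, S, 2, 3, requires_grad=True)
gt = torch.randn(B, M, 2, 3)
mask = torch.ones(B, M, dtype=torch.bool)

loss = batched_soft_hss_v2(pred, gt, mask,
                           vert_thresh=0.5, edge_thresh=0.5,
                           tau=0.05, sinkhorn_iters=10, n_per_seg=32)
loss.mean().backward()  # [B] -> scalar; gradients flow back to pred
print(loss)             # each entry is 1 - soft HSS, roughly in [0, 1]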
s23dr_2026_example/tokenizer.py ADDED
@@ -0,0 +1,88 @@
+ """Tokenizer: learned embeddings + Fourier features for the point cloud tokens.
+
+ The EdgeDepthSequenceBuilder holds the learned embedding tables (label, source,
+ behind) and the random Fourier positional encoding. At training time,
+ build_tokens() in data.py applies these to pre-sampled point indices on GPU.
+ """
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Tuple
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+
+ from .point_fusion import NUM_ADE, NUM_GEST
+
+
+ # -- Config --
+
+ @dataclass(frozen=True)
+ class EdgeDepthSequenceConfig:
+     seq_len: int = 2048
+     colmap_points: int = 1280
+     depth_points: int = 768
+     use_fourier: bool = True
+     fourier_dim: int = 32
+     fourier_scale: float = 10.0
+
+
+ # -- Fourier positional encoding --
+
+ class FourierFeatures(nn.Module):
+     def __init__(self, in_dim: int = 3, fourier_dim: int = 64,
+                  scale: float = 10.0, seed: int = 0,
+                  learnable: bool = False):
+         super().__init__()
+         gen = torch.Generator()
+         gen.manual_seed(seed)
+         B = torch.randn(fourier_dim, in_dim, generator=gen) * scale
+         if learnable:
+             self.B = nn.Parameter(B)
+         else:
+             self.register_buffer("B", B, persistent=True)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         proj = (2.0 * np.pi) * (x @ self.B.t())
+         return torch.cat([torch.sin(proj), torch.cos(proj)], dim=-1)
+
+
+ # -- Sequence builder (holds embeddings) --
+
+ class EdgeDepthSequenceBuilder(nn.Module):
+     """Holds learned embeddings for point cloud tokenization.
+
+     Used by the model at training time: build_tokens() calls
+     self.label_emb(class_id), self.src_emb(source), etc.
+     """
+
+     def __init__(self, cfg: EdgeDepthSequenceConfig, label_emb_dim: int = 16,
+                  src_emb_dim: int = 2, behind_emb_dim: int = 8,
+                  fourier_seed: int = 0, use_vote_features: bool = False,
+                  learnable_fourier: bool = False):
+         super().__init__()
+         self.cfg = cfg
+
+         self.num_labels = 13  # 11 structural + other_house + non_house
+         self.label_emb = nn.Embedding(self.num_labels, label_emb_dim)
+         self.src_emb = nn.Embedding(2, src_emb_dim)
+         self.behind_emb_dim = behind_emb_dim
+         if behind_emb_dim > 0:
+             self.behind_emb = nn.Embedding(NUM_GEST + 1, behind_emb_dim)
+
+         # Fourier positional encoding
+         if cfg.use_fourier:
+             self.pos_enc = FourierFeatures(
+                 in_dim=3, fourier_dim=cfg.fourier_dim,
+                 scale=cfg.fourier_scale, seed=fourier_seed,
+                 learnable=learnable_fourier,
+             )
+             pos_dim = 3 + 2 * cfg.fourier_dim
+         else:
+             self.pos_enc = None
+             pos_dim = 3
+
+         vote_dim = 2 if use_vote_features else 0  # n_views_voted + vote_frac
+         self.use_vote_features = use_vote_features
+         self.out_dim = pos_dim + label_emb_dim + src_emb_dim + behind_emb_dim + vote_dim
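To make the out_dim arithmetic concrete: with the default config a token concatenates xyz (3), Fourier sin/cos (2x32), label (16), source (2) and behind (8) features, i.e. 93 dims. A small sketch, assuming the package import of NUM_GEST resolves:

import torch

cfg = EdgeDepthSequenceConfig()
builder = EdgeDepthSequenceBuilder(cfg)
print(builder.out_dim)  # 3 + 2*32 + 16 + 2 + 8 = 93

pe = FourierFeatures(in_dim=3, fourier_dim=32, scale=10.0)
print(pe(torch.rand(5, 3)).shape)  # torch.Size([5, 64]) -- sin and cos halves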
s23dr_2026_example/varifold.py ADDED
@@ -0,0 +1,196 @@
+ import torch
+
+ from .wire_varifold_kernels import (
+     loss_semi_lobatto3,
+     loss_semi_lobatto3_mix,
+     loss_semi_lobatto3_mix_simple,
+     loss_simpson3,
+     loss_simpson3_batch,
+     loss_simpson3_mix,
+     loss_simpson3_mix_batch,
+     loss_simpson3_lenpow,
+     loss_simpson3_lenpow_mix,
+     loss_semi_legendre,
+ )
+
+
+ def edges_to_segments(vertices, edges) -> torch.Tensor:
+     verts = torch.as_tensor(vertices, dtype=torch.float32)
+     idx = torch.as_tensor(edges, dtype=torch.long)
+     return torch.stack([verts[idx[:, 0]], verts[idx[:, 1]]], dim=1)
+
+
+ def segments_to_vertices_edges(segments: torch.Tensor):
+     segs = torch.as_tensor(segments, dtype=torch.float32)
+     vertices = segs.reshape(-1, 3)
+     edges = [(2 * i, 2 * i + 1) for i in range(segs.shape[0])]
+     return vertices, edges
+
+
+ def varifold_loss(
+     pred_segments: torch.Tensor,
+     gt_segments: torch.Tensor,
+     sigma: float = 0.1,
+     variant: str = "semi_lobatto3",
+     t_nodes01: torch.Tensor | None = None,
+     t_w: torch.Tensor | None = None,
+     sigmas: torch.Tensor | None = None,
+     alpha: torch.Tensor | None = None,
+     normalize_alpha: bool = True,
+     len_pow: float | None = None,
+ ) -> torch.Tensor:
+     p_pred, q_pred = pred_segments[:, 0], pred_segments[:, 1]
+     p_gt, q_gt = gt_segments[:, 0], gt_segments[:, 1]
+
+     if variant == "semi_lobatto3":
+         return loss_semi_lobatto3(p_pred, q_pred, p_gt, q_gt, sigma)
+     if variant == "semi_lobatto3_mix":
+         if sigmas is None or alpha is None:
+             raise ValueError("sigmas and alpha are required for semi_lobatto3_mix")
+         return loss_semi_lobatto3_mix(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, normalize_alpha)
+     if variant == "semi_lobatto3_mix_simple":
+         if sigmas is None or alpha is None:
+             raise ValueError("sigmas and alpha are required for semi_lobatto3_mix_simple")
+         return loss_semi_lobatto3_mix_simple(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, normalize_alpha)
+     if variant == "simpson3":
+         if sigmas is not None or alpha is not None:
+             if sigmas is None or alpha is None:
+                 raise ValueError("sigmas and alpha are required for simpson3 mix")
+             return loss_simpson3_mix(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, normalize_alpha)
+         return loss_simpson3(p_pred, q_pred, p_gt, q_gt, sigma)
+     if variant == "simpson3_lenpow":
+         if len_pow is None:
+             len_pow = 1.0
+         if sigmas is not None or alpha is not None:
+             if sigmas is None or alpha is None:
+                 raise ValueError("sigmas and alpha are required for simpson3_lenpow mix")
+             return loss_simpson3_lenpow_mix(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, len_pow, normalize_alpha)
+         return loss_simpson3_lenpow(p_pred, q_pred, p_gt, q_gt, sigma, len_pow)
+     if variant == "semi_legendre":
+         return loss_semi_legendre(p_pred, q_pred, p_gt, q_gt, sigma, t_nodes01, t_w)
+     if variant in ("centers", "segments_varifold", "semi_lobatto1"):
+         return varifold_loss_centers(pred_segments, gt_segments, sigma)
+     raise ValueError(f"Unknown varifold variant: {variant}")
+
+
+ def varifold_loss_batch(
+     pred_segments: torch.Tensor,
+     gt_segments: torch.Tensor,
+     *,
+     sigma: float = 0.1,
+     variant: str = "semi_lobatto3",
+     t_nodes01: torch.Tensor | None = None,
+     t_w: torch.Tensor | None = None,
+     sigmas: torch.Tensor | None = None,
+     alpha: torch.Tensor | None = None,
+     normalize_alpha: bool = True,
+     len_pow: float | None = None,
+     gt_mask: torch.Tensor | None = None,
+     pred_weights: torch.Tensor | None = None,
+     cross_only: bool = False,
+ ) -> torch.Tensor:
+     if pred_segments.dim() != 4 or gt_segments.dim() != 4:
+         raise ValueError("pred_segments and gt_segments must be (B, N, 2, 3)")
+     p_pred, q_pred = pred_segments[:, :, 0], pred_segments[:, :, 1]
+     p_gt, q_gt = gt_segments[:, :, 0], gt_segments[:, :, 1]
+
+     w_gt = None
+     if gt_mask is not None:
+         w_gt = gt_mask.to(device=pred_segments.device, dtype=pred_segments.dtype)
+
+     w_pred = None
+     if pred_weights is not None:
+         w_pred = pred_weights.to(device=pred_segments.device, dtype=pred_segments.dtype)
+
+     if variant == "simpson3":
+         if sigmas is not None or alpha is not None:
+             if sigmas is None or alpha is None:
+                 raise ValueError("sigmas and alpha are required for simpson3 mix")
+             return loss_simpson3_mix_batch(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, w_gt=w_gt, w_pred=w_pred, normalize_alpha=normalize_alpha, cross_only=cross_only)
+         return loss_simpson3_batch(p_pred, q_pred, p_gt, q_gt, sigma, w_gt=w_gt, w_pred=w_pred)
+
+     # Fallback to per-sample loop for unsupported variants.
+     losses = []
+     sigmas_t = None
+     if sigmas is not None:
+         sigmas_t = torch.as_tensor(sigmas, device=pred_segments.device, dtype=pred_segments.dtype)
+     for idx in range(pred_segments.shape[0]):
+         gt_b = gt_segments[idx]
+         if gt_mask is not None:
+             gt_b = gt_b[gt_mask[idx]]
+         sigmas_i = sigmas
+         if sigmas_t is not None and sigmas_t.ndim == 2:
+             sigmas_i = sigmas_t[idx]
+         losses.append(
+             varifold_loss(
+                 pred_segments[idx],
+                 gt_b,
+                 sigma=sigma,
+                 variant=variant,
+                 t_nodes01=t_nodes01,
+                 t_w=t_w,
+                 sigmas=sigmas_i,
+                 alpha=alpha,
+                 normalize_alpha=normalize_alpha,
+                 len_pow=len_pow,
+             )
+         )
+     return torch.stack(losses, dim=0)
+
+
+ def varifold_loss_centers(
+     pred_segments: torch.Tensor,
+     gt_segments: torch.Tensor,
+     sigma: float = 0.1,
+     normalize_weights: bool = True,
+ ) -> torch.Tensor:
+     eps = 1e-8
+     a_p, b_p = pred_segments[:, 0], pred_segments[:, 1]
+     a_g, b_g = gt_segments[:, 0], gt_segments[:, 1]
+
+     v_p = b_p - a_p
+     v_g = b_g - a_g
+     len_p = torch.linalg.norm(v_p, dim=-1)
+     len_g = torch.linalg.norm(v_g, dim=-1)
+
+     x_p = 0.5 * (a_p + b_p)
+     x_g = 0.5 * (a_g + b_g)
+
+     u_p = v_p / (len_p[:, None] + eps)
+     u_g = v_g / (len_g[:, None] + eps)
+
+     w_p = len_p
+     w_g = len_g
+     if normalize_weights:
+         w_p = w_p / (w_p.sum() + eps)
+         w_g = w_g / (w_g.sum() + eps)
+
+     diff_pp = x_p[:, None, :] - x_p[None, :, :]
+     diff_gg = x_g[:, None, :] - x_g[None, :, :]
+     diff_pg = x_p[:, None, :] - x_g[None, :, :]
+     d_pp = (diff_pp * diff_pp).sum(dim=-1)
+     d_gg = (diff_gg * diff_gg).sum(dim=-1)
+     d_pg = (diff_pg * diff_pg).sum(dim=-1)
+
+     inv2s2 = 1.0 / (2.0 * sigma * sigma)
+     k_pp = torch.exp(-d_pp * inv2s2)
+     k_gg = torch.exp(-d_gg * inv2s2)
+     k_pg = torch.exp(-d_pg * inv2s2)
+
+     dot_pp = (u_p[:, None, :] * u_p[None, :, :]).sum(dim=-1)
+     dot_gg = (u_g[:, None, :] * u_g[None, :, :]).sum(dim=-1)
+     dot_pg = (u_p[:, None, :] * u_g[None, :, :]).sum(dim=-1)
+
+     k_pp = k_pp * (dot_pp * dot_pp)
+     k_gg = k_gg * (dot_gg * dot_gg)
+     k_pg = k_pg * (dot_pg * dot_pg)
+
+     wp_row = w_p[:, None]
+     wp_col = w_p[None, :]
+     wg_row = w_g[:, None]
+     wg_col = w_g[None, :]
+
+     a_pp = (wp_row * wp_col * k_pp).sum(dim=-1).sum(dim=-1)
+     a_gg = (wg_row * wg_col * k_gg).sum(dim=-1).sum(dim=-1)
+     a_pg = (w_p[:, None] * w_g[None, :] * k_pg).sum(dim=-1).sum(dim=-1)
+     return a_pp + a_gg - 2.0 * a_pg
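A small sanity sketch for the centers variant on toy tensors: the expression a_pp + a_gg - 2*a_pg is a squared kernel distance, so it vanishes when the prediction equals the ground truth and grows as segments drift.

import torch

gt = torch.tensor([[[0., 0., 0.], [1., 0., 0.]],
                   [[0., 0., 0.], [0., 1., 0.]]])
print(varifold_loss_centers(gt, gt, sigma=0.1).item())        # ~0.0
print(varifold_loss_centers(gt + 0.2, gt, sigma=0.1).item())  # > 0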
s23dr_2026_example/wire_varifold_kernels.py ADDED
@@ -0,0 +1,461 @@
1
+ import math
2
+ import torch
3
+
4
+ # -----------------------------
5
+ # Helpers
6
+ # -----------------------------
7
+ def segment_geom(p: torch.Tensor, q: torch.Tensor, eps: float = 1e-9):
8
+ """
9
+ p,q: (...,3)
10
+ returns d, a, ell, u:
11
+ d = q - p
12
+ a = ||d||^2
13
+ ell = sqrt(a + eps^2)
14
+ u = d / ell
15
+ """
16
+ d = q - p
17
+ a = (d * d).sum(dim=-1)
18
+ eps_val = eps
19
+ if p.dtype in (torch.float16, torch.bfloat16):
20
+ eps_val = max(eps, float(torch.finfo(p.dtype).eps))
21
+ ell = torch.sqrt(a + eps_val * eps_val)
22
+ u = d / ell.unsqueeze(-1)
23
+ return d, a, ell, u
24
+
25
+ def sample_points(p: torch.Tensor, q: torch.Tensor, nodes01: torch.Tensor):
26
+ # (...,3) + (K,) -> (...,K,3)
27
+ d = q - p
28
+ nodes = nodes01.to(device=p.device, dtype=p.dtype)
29
+ shape = [1] * (p.dim() - 1) + [nodes.shape[0], 1]
30
+ nodes = nodes.view(*shape)
31
+ return p.unsqueeze(-2) + nodes * d.unsqueeze(-2)
32
+
33
+
34
+ # Fixed Lobatto-3 / Simpson nodes+weights on [0,1]
35
+ LOBATTO3_NODES = torch.tensor([0.0, 0.5, 1.0])
36
+ # LOBATTO3_W = torch.tensor([1.0/6.0, 4.0/6.0, 1.0/6.0])
37
+ LOBATTO3_W = torch.tensor([1/3, 1/3, 1/3])
38
+ LOBATTO3_W2 = LOBATTO3_W[:, None] * LOBATTO3_W[None, :] # (3,3)
39
+
40
+
41
+ def _prepare_mix_weights(sigmas, alpha, device, dtype, normalize_alpha: bool):
42
+ sigmas_t = torch.as_tensor(sigmas, device=device, dtype=dtype).clamp_min(1e-6)
43
+ alpha_t = torch.as_tensor(alpha, device=device, dtype=dtype)
44
+ if normalize_alpha:
45
+ alpha_t = alpha_t / alpha_t.sum().clamp_min(1e-12)
46
+ return sigmas_t, alpha_t
47
+
48
+ # -----------------------------
49
+ # 1) Simpson-3 on both segments (3x3 product rule)
50
+ # -----------------------------
51
+ def _prep_weight(w, n: int, b: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor | None:
52
+ if w is None:
53
+ return None
54
+ w = torch.as_tensor(w, device=device, dtype=dtype)
55
+ if w.dim() == 1:
56
+ if w.shape[0] != n:
57
+ raise ValueError(f"weight length {w.shape[0]} != {n}")
58
+ w = w.unsqueeze(0).expand(b, -1)
59
+ elif w.dim() == 2:
60
+ if w.shape[0] != b or w.shape[1] != n:
61
+ raise ValueError(f"weight shape {tuple(w.shape)} != ({b}, {n})")
62
+ else:
63
+ raise ValueError("weights must be 1D or 2D")
64
+ return w
65
+
66
+
67
+ def cross_simpson3(
68
+ pA,
69
+ qA,
70
+ pB,
71
+ qB,
72
+ sigma: float | torch.Tensor,
73
+ wA: torch.Tensor | None = None,
74
+ wB: torch.Tensor | None = None,
75
+ ):
76
+ device, dtype = pA.device, pA.dtype
77
+ batched = pA.dim() == 3
78
+ if not batched:
79
+ pA = pA.unsqueeze(0)
80
+ qA = qA.unsqueeze(0)
81
+ pB = pB.unsqueeze(0)
82
+ qB = qB.unsqueeze(0)
83
+ nodes = LOBATTO3_NODES.to(device=device, dtype=dtype)
84
+ w2 = LOBATTO3_W2.to(device=device, dtype=dtype)
85
+
86
+ bsz, nA, _ = pA.shape
87
+ nB = pB.shape[1]
88
+ wA = _prep_weight(wA, nA, bsz, device, dtype)
89
+ wB = _prep_weight(wB, nB, bsz, device, dtype)
90
+
91
+ _, _, ellA, uA = segment_geom(pA, qA)
92
+ _, _, ellB, uB = segment_geom(pB, qB)
93
+
94
+ XA = sample_points(pA, qA, nodes) # (B,N,3,3)
95
+ YB = sample_points(pB, qB, nodes) # (B,M,3,3)
96
+
97
+ # angular + length factors: (N,M)
98
+ ang = torch.matmul(uA, uB.transpose(-1, -2)).pow(2)
99
+ lenfac = ellA[:, :, None] * ellB[:, None, :]
100
+ if wA is not None or wB is not None:
101
+ if wA is None:
102
+ wA = torch.ones((bsz, nA), device=device, dtype=dtype)
103
+ if wB is None:
104
+ wB = torch.ones((bsz, nB), device=device, dtype=dtype)
105
+ lenfac = lenfac * (wA[:, :, None] * wB[:, None, :])
106
+
107
+ # spatial: build (N,M,3,3) kernel via broadcasting
108
+ diff = XA[:, :, None, :, None, :] - YB[:, None, :, None, :, :] # (B,N,M,3,3,3)
109
+ r2 = (diff * diff).sum(dim=-1) # (B,N,M,3,3)
110
+ sigma_t = torch.as_tensor(sigma, device=device, dtype=dtype)
111
+ if sigma_t.ndim == 0:
112
+ inv2s2 = 1.0 / (2.0 * sigma_t * sigma_t)
113
+ else:
114
+ if sigma_t.shape[0] != bsz:
115
+ raise ValueError(f"sigma batch {sigma_t.shape[0]} != {bsz}")
116
+ inv2s2 = (1.0 / (2.0 * sigma_t * sigma_t)).view(bsz, 1, 1, 1, 1)
117
+ K = torch.exp(-r2 * inv2s2) # (B,N,M,3,3)
118
+
119
+ spatial = (K * w2).sum(dim=-1).sum(dim=-1) # (B,N,M)
120
+ out = (ang * lenfac * spatial).sum(dim=-1).sum(dim=-1) # (B,)
121
+ return out[0] if not batched else out
122
+
123
+
124
+ def cross_simpson3_lenpow(
+     pA,
+     qA,
+     pB,
+     qB,
+     sigma: float | torch.Tensor,
+     len_pow: float,
+     wA: torch.Tensor | None = None,
+     wB: torch.Tensor | None = None,
+ ):
+     device, dtype = pA.device, pA.dtype
+     batched = pA.dim() == 3
+     if not batched:
+         pA = pA.unsqueeze(0)
+         qA = qA.unsqueeze(0)
+         pB = pB.unsqueeze(0)
+         qB = qB.unsqueeze(0)
+     nodes = LOBATTO3_NODES.to(device=device, dtype=dtype)
+     w2 = LOBATTO3_W2.to(device=device, dtype=dtype)
+
+     bsz, nA, _ = pA.shape
+     nB = pB.shape[1]
+     wA = _prep_weight(wA, nA, bsz, device, dtype)
+     wB = _prep_weight(wB, nB, bsz, device, dtype)
+
+     _, _, ellA, uA = segment_geom(pA, qA)
+     _, _, ellB, uB = segment_geom(pB, qB)
+
+     XA = sample_points(pA, qA, nodes)  # (B,N,3,3)
+     YB = sample_points(pB, qB, nodes)  # (B,M,3,3)
+
+     ang = torch.matmul(uA, uB.transpose(-1, -2)).pow(2)
+     lenfac = (ellA[:, :, None] * ellB[:, None, :]).pow(len_pow)
+     if wA is not None or wB is not None:
+         if wA is None:
+             wA = torch.ones((bsz, nA), device=device, dtype=dtype)
+         if wB is None:
+             wB = torch.ones((bsz, nB), device=device, dtype=dtype)
+         lenfac = lenfac * (wA[:, :, None] * wB[:, None, :])
+
+     diff = XA[:, :, None, :, None, :] - YB[:, None, :, None, :, :]  # (B,N,M,3,3,3)
+     r2 = (diff * diff).sum(dim=-1)  # (B,N,M,3,3)
+     sigma_t = torch.as_tensor(sigma, device=device, dtype=dtype)
+     if sigma_t.ndim == 0:
+         inv2s2 = 1.0 / (2.0 * sigma_t * sigma_t)
+     else:
+         if sigma_t.shape[0] != bsz:
+             raise ValueError(f"sigma batch {sigma_t.shape[0]} != {bsz}")
+         inv2s2 = (1.0 / (2.0 * sigma_t * sigma_t)).view(bsz, 1, 1, 1, 1)
+     K = torch.exp(-r2 * inv2s2)  # (B,N,M,3,3)
+
+     spatial = (K * w2).sum(dim=-1).sum(dim=-1)  # (B,N,M)
+     out = (ang * lenfac * spatial).sum(dim=-1).sum(dim=-1)  # (B,)
+     return out[0] if not batched else out
+
+
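+ # Example (illustrative): len_pow compresses the dynamic range of the
+ # ell_A * ell_B length weighting; len_pow=1.0 recovers cross_simpson3.
+ #   v = cross_simpson3_lenpow(pA, qA, pB, qB, sigma=0.25, len_pow=0.5)
+
+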
+ # -----------------------------
+ # 2/3) Semi-analytic in s, quadrature in t
+ #   - Lobatto-3 (endpoints + midpoint)
+ #   - Gauss-Legendre Q (nodes/weights passed in)
+ # -----------------------------
+ def cross_semi_analytic(pA, qA, pB, qB, sigma: float, t_nodes01: torch.Tensor, t_w: torch.Tensor):
+     """
+     Gaussian k_x. Integrate s exactly along A, integrate t numerically along B.
+     t_nodes01, t_w: (Q,) nodes/weights on [0,1] (constants you pass in)
+     """
+     device, dtype = pA.device, pA.dtype
+     t = t_nodes01.to(device=device, dtype=dtype)  # (Q,)
+     w = t_w.to(device=device, dtype=dtype)  # (Q,)
+
+     dA, aA, ellA, uA = segment_geom(pA, qA)
+     dB, _, ellB, uB = segment_geom(pB, qB)
+
+     # (N,M) factors
+     ang = (uA @ uB.t()).pow(2)
+     lenfac = ellA[:, None] * ellB[None, :]
+
+     # r0: (N,M,3)
+     r0 = pA[:, None, :] - pB[None, :, :]
+
+     # r(t): (N,M,Q,3)
+     r = r0[:, :, None, :] - t[None, None, :, None] * dB[None, :, None, :]
+
+     # beta, r2: (N,M,Q)
+     beta = (r * dA[:, None, None, :]).sum(dim=-1)
+     r2 = (r * r).sum(dim=-1)
+
+     # semi-analytic constants per A segment: shapes broadcast to (N,1,1)
+     a = aA.clamp_min(1e-12)
+     inv_a = (1.0 / a).view(-1, 1, 1)
+     denom = (torch.sqrt(2.0 * a) * sigma).view(-1, 1, 1)
+     pref = (math.sqrt(math.pi) * sigma / torch.sqrt(2.0 * a)).view(-1, 1, 1)
+
+     # J(t): (N,M,Q)
+     exp_term = torch.exp(-(r2 - (beta * beta) * inv_a) / (2.0 * sigma * sigma))
+     erf1 = torch.special.erf((a.view(-1, 1, 1) + beta) / denom)
+     erf0 = torch.special.erf(beta / denom)
+     J = pref * (erf1 - erf0) * exp_term
+
+     # integrate over t: (N,M)
+     spatial = (J * w.view(1, 1, -1)).sum(dim=-1)
+     return (ang * lenfac * spatial).sum(dim=-1).sum(dim=-1)
+
+
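+ # The closed form used above (sketch): with x(s) = pA + s*dA on A and
+ # y = pB + t*dB fixed on B, write r = x(0) - y, a = |dA|^2, beta = <r, dA>.
+ # Completing the square in s,
+ #   int_0^1 exp(-|r + s*dA|^2 / (2 sigma^2)) ds
+ #     = sqrt(pi) * sigma / sqrt(2a)
+ #       * exp(-(|r|^2 - beta^2 / a) / (2 sigma^2))
+ #       * [erf((a + beta) / (sqrt(2a) sigma)) - erf(beta / (sqrt(2a) sigma))],
+ # which is exactly pref * exp_term * (erf1 - erf0).
+
+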
+ def cross_semi_lobatto3(pA, qA, pB, qB, sigma: float):
+     device, dtype = pA.device, pA.dtype
+     t = LOBATTO3_NODES.to(device=device, dtype=dtype)
+     w = LOBATTO3_W.to(device=device, dtype=dtype)
+     return cross_semi_analytic(pA, qA, pB, qB, sigma, t, w)
+
+
+ def cross_semi_lobatto3_mix(
+     pA,
+     qA,
+     pB,
+     qB,
+     sigmas,
+     alpha,
+     normalize_alpha: bool = True,
+ ):
+     """
+     Semi-analytic in s (along A), Lobatto-3 in t (along B), with a sigma mixture.
+     """
+     device, dtype = pA.device, pA.dtype
+     t_nodes = LOBATTO3_NODES.to(device=device, dtype=dtype)
+     t_w = LOBATTO3_W.to(device=device, dtype=dtype)
+
+     sigmas_t, alpha_t = _prepare_mix_weights(sigmas, alpha, device, dtype, normalize_alpha)
+
+     dA, aA, ellA, uA = segment_geom(pA, qA)
+     dB, _, ellB, uB = segment_geom(pB, qB)
+
+     ang = (uA @ uB.t()).pow(2)
+     lenfac = ellA[:, None] * ellB[None, :]
+
+     r0 = pA[:, None, :] - pB[None, :, :]
+
+     a = aA.clamp_min(1e-12)
+     inv_a = (1.0 / a).view(-1, 1)
+     sqrt_2a = torch.sqrt(2.0 * a).clamp_min(1e-12)
+
+     denom = (sqrt_2a[:, None] * sigmas_t[None, :]).clamp_min(1e-12)
+     pref = math.sqrt(math.pi) * sigmas_t[None, :] / sqrt_2a[:, None]
+     inv2s2 = (1.0 / (2.0 * sigmas_t * sigmas_t)).view(1, 1, -1)
+
+     denom_nmS = denom[:, None, :]
+     pref_nmS = pref[:, None, :]
+     alpha_nmS = alpha_t.view(1, 1, -1)
+     a_nm1 = a[:, None, None]
+
+     spatial = torch.zeros((pA.shape[0], pB.shape[0]), device=device, dtype=dtype)
+     for tk, wk in zip(t_nodes, t_w):
+         r = r0 - tk * dB[None, :, :]
+         beta = (r * dA[:, None, :]).sum(dim=-1)
+         r2 = (r * r).sum(dim=-1)
+         core = r2 - (beta * beta) * inv_a
+
+         exp_term = torch.exp(-core[:, :, None] * inv2s2)
+         erf1 = torch.special.erf((a_nm1 + beta[:, :, None]) / denom_nmS)
+         erf0 = torch.special.erf(beta[:, :, None] / denom_nmS)
+         J = pref_nmS * (erf1 - erf0) * exp_term
+         spatial = spatial + wk * (J * alpha_nmS).sum(dim=-1)
+
+     return (ang * lenfac * spatial).sum(dim=-1).sum(dim=-1)
+
+
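+ # Example (illustrative): a coarse-to-fine sigma mixture, equally weighted.
+ #   v = cross_semi_lobatto3_mix(pA, qA, pB, qB,
+ #                               sigmas=[0.5, 0.25, 0.1], alpha=[1.0, 1.0, 1.0])
+ # With normalize_alpha=True this equals the mean over the three bandwidths.
+
+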
+ # -----------------------------
+ # Full losses (self_pred + self_gt - 2*cross; the GT self-energy is constant
+ # w.r.t. the predictions, so most variants below drop it)
+ # -----------------------------
+
+
+ def loss_simpson3(p_pred, q_pred, p_gt, q_gt, sigma: float):
+     s_pred = cross_simpson3(p_pred, q_pred, p_pred, q_pred, sigma)
+     cross = cross_simpson3(p_pred, q_pred, p_gt, q_gt, sigma)
+     # s_gt (the GT self-energy) is constant in the predictions, so it is dropped.
+     return s_pred - 2.0 * cross
+
+
+ def loss_simpson3_lenpow(p_pred, q_pred, p_gt, q_gt, sigma: float, len_pow: float):
+     s_pred = cross_simpson3_lenpow(p_pred, q_pred, p_pred, q_pred, sigma, len_pow)
+     cross = cross_simpson3_lenpow(p_pred, q_pred, p_gt, q_gt, sigma, len_pow)
+     # s_gt dropped (constant in the predictions).
+     return s_pred - 2.0 * cross
+
+
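+ # Example (illustrative): gradient descent of free segments onto a GT wireframe
+ # (p_gt, q_gt), using the varifold pull as the only objective.
+ #   p = torch.randn(32, 3, requires_grad=True)
+ #   q = torch.randn(32, 3, requires_grad=True)
+ #   opt = torch.optim.Adam([p, q], lr=1e-2)
+ #   for _ in range(200):
+ #       opt.zero_grad()
+ #       loss_simpson3(p, q, p_gt, q_gt, sigma=0.25).backward()
+ #       opt.step()
+
+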
+ def loss_simpson3_mix(
+     p_pred,
+     q_pred,
+     p_gt,
+     q_gt,
+     sigmas,
+     alpha,
+     normalize_alpha: bool = True,
+ ):
+     device, dtype = p_pred.device, p_pred.dtype
+     sigmas_t, alpha_t = _prepare_mix_weights(sigmas, alpha, device, dtype, normalize_alpha)
+     losses = [loss_simpson3(p_pred, q_pred, p_gt, q_gt, s) for s in sigmas_t]
+     return (torch.stack(losses) * alpha_t).sum()
+
+
+ def loss_simpson3_batch(
+     p_pred: torch.Tensor,
+     q_pred: torch.Tensor,
+     p_gt: torch.Tensor,
+     q_gt: torch.Tensor,
+     sigma: float | torch.Tensor,
+     w_gt: torch.Tensor | None = None,
+     w_pred: torch.Tensor | None = None,
+     cross_only: bool = False,
+ ) -> torch.Tensor:
+     cross = cross_simpson3(p_pred, q_pred, p_gt, q_gt, sigma, wA=w_pred, wB=w_gt)
+     if cross_only:
+         # No self-energy: avoids the O(S^2) blowup; Sinkhorn handles repulsion.
+         return -2.0 * cross
+     s_pred = cross_simpson3(p_pred, q_pred, p_pred, q_pred, sigma, wA=w_pred, wB=w_pred)
+     return s_pred - 2.0 * cross
+
+
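+ # Example (illustrative): attraction-only batch loss; with cross_only=True the
+ # O(S^2) predicted self-energy is skipped and repulsion is left to the matcher.
+ #   l = loss_simpson3_batch(p_pred, q_pred, p_gt, q_gt, sigma=0.2,
+ #                           w_gt=w_gt, cross_only=True)  # (B,) per-scene values
+
+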
+ def loss_simpson3_mix_batch(
+     p_pred: torch.Tensor,
+     q_pred: torch.Tensor,
+     p_gt: torch.Tensor,
+     q_gt: torch.Tensor,
+     sigmas,
+     alpha,
+     w_gt: torch.Tensor | None = None,
+     w_pred: torch.Tensor | None = None,
+     normalize_alpha: bool = True,
+     cross_only: bool = False,
+ ) -> torch.Tensor:
+     device, dtype = p_pred.device, p_pred.dtype
+     sigmas_t = torch.as_tensor(sigmas, device=device, dtype=dtype).clamp_min(1e-6)
+     alpha_t = torch.as_tensor(alpha, device=device, dtype=dtype)
+     if normalize_alpha:
+         alpha_t = alpha_t / alpha_t.sum().clamp_min(1e-12)
+     if sigmas_t.ndim == 1:
+         losses = [
+             loss_simpson3_batch(p_pred, q_pred, p_gt, q_gt, s,
+                                 w_gt=w_gt, w_pred=w_pred, cross_only=cross_only)
+             for s in sigmas_t
+         ]
+         return (torch.stack(losses, dim=0) * alpha_t[:, None]).sum(dim=0)
+     if sigmas_t.ndim == 2:
+         losses = [
+             loss_simpson3_batch(p_pred, q_pred, p_gt, q_gt, sigmas_t[:, i],
+                                 w_gt=w_gt, w_pred=w_pred, cross_only=cross_only)
+             for i in range(sigmas_t.shape[1])
+         ]
+         return (torch.stack(losses, dim=0) * alpha_t[:, None]).sum(dim=0)
+     raise ValueError("sigmas must be 1D or 2D for batch loss")
+
+
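+ # Example (illustrative): a shared 1D sigma mixture vs. per-sample 2D sigmas.
+ #   loss_simpson3_mix_batch(pp, qp, pg, qg,
+ #                           sigmas=[0.4, 0.2, 0.1], alpha=[0.5, 0.3, 0.2])  # (S,)
+ #   loss_simpson3_mix_batch(pp, qp, pg, qg,
+ #                           sigmas=sigma_bS, alpha=[0.5, 0.3, 0.2])         # (B, S)
+
+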
+ def loss_simpson3_lenpow_mix(
+     p_pred,
+     q_pred,
+     p_gt,
+     q_gt,
+     sigmas,
+     alpha,
+     len_pow: float,
+     normalize_alpha: bool = True,
+ ):
+     device, dtype = p_pred.device, p_pred.dtype
+     sigmas_t, alpha_t = _prepare_mix_weights(sigmas, alpha, device, dtype, normalize_alpha)
+     losses = [loss_simpson3_lenpow(p_pred, q_pred, p_gt, q_gt, s, len_pow) for s in sigmas_t]
+     return (torch.stack(losses) * alpha_t).sum()
+
+
+ def loss_semi_lobatto3(p_pred, q_pred, p_gt, q_gt, sigma: float):
+     s_pred = cross_semi_lobatto3(p_pred, q_pred, p_pred, q_pred, sigma)
+     cross = cross_semi_lobatto3(p_pred, q_pred, p_gt, q_gt, sigma)
+     # s_gt dropped (constant in the predictions).
+     return s_pred - 2.0 * cross
+
+
+ def loss_semi_lobatto3_mix(
+     p_pred,
+     q_pred,
+     p_gt,
+     q_gt,
+     sigmas,
+     alpha,
+     normalize_alpha: bool = True,
+ ):
+     s_pred = cross_semi_lobatto3_mix(p_pred, q_pred, p_pred, q_pred, sigmas, alpha, normalize_alpha)
+     cross = cross_semi_lobatto3_mix(p_pred, q_pred, p_gt, q_gt, sigmas, alpha, normalize_alpha)
+     # s_gt dropped (constant in the predictions).
+     return s_pred - 2.0 * cross
+
+
+ def loss_semi_lobatto3_mix_simple(
+     p_pred,
+     q_pred,
+     p_gt,
+     q_gt,
+     sigmas,
+     alpha,
+     normalize_alpha: bool = True,
+ ):
+     device, dtype = p_pred.device, p_pred.dtype
+     sigmas_t, alpha_t = _prepare_mix_weights(sigmas, alpha, device, dtype, normalize_alpha)
+     losses = [loss_semi_lobatto3(p_pred, q_pred, p_gt, q_gt, s) for s in sigmas_t]
+     return (torch.stack(losses) * alpha_t).sum()
+
+
+ def loss_semi_legendre(p_pred, q_pred, p_gt, q_gt, sigma: float, t_nodes01, t_w):
+     s_pred = cross_semi_analytic(p_pred, q_pred, p_pred, q_pred, sigma, t_nodes01, t_w)
+     s_gt = cross_semi_analytic(p_gt, q_gt, p_gt, q_gt, sigma, t_nodes01, t_w)
+     cross = cross_semi_analytic(p_pred, q_pred, p_gt, q_gt, sigma, t_nodes01, t_w)
+     # Full squared varifold distance (this variant keeps the constant s_gt term).
+     return s_pred + s_gt - 2.0 * cross
+
+
+ # -----------------------------
+ # torch.compile usage
+ # -----------------------------
+ # For Legendre: generate nodes/weights ONCE outside compile and pass them in.
+ # Example:
+ #   import numpy as np
+ #   x, w = np.polynomial.legendre.leggauss(Q)
+ #   t_nodes = torch.tensor(0.5 * (x + 1.0), device=device, dtype=dtype)
+ #   t_w = torch.tensor(0.5 * w, device=device, dtype=dtype)
+ #
+ #   compiled_loss = torch.compile(loss_semi_lobatto3, fullgraph=True)
+ #   compiled_loss_leg = torch.compile(
+ #       lambda pp, qp, pg, qg, s: loss_semi_legendre(pp, qp, pg, qg, s, t_nodes, t_w),
+ #       fullgraph=True)
script.py ADDED
@@ -0,0 +1,322 @@
+ """S23DR 2026 submission: learned wireframe prediction from fused point clouds.
2
+
3
+ Pipeline: raw sample -> point fusion -> priority sample 2048 -> model -> post-process -> wireframe
4
+ """
5
+ from pathlib import Path
6
+ from tqdm import tqdm
7
+ import json
8
+ import os
9
+ import sys
10
+ import time
11
+
12
+ import numpy as np
13
+ import torch
14
+
15
+
16
+ def empty_solution():
17
+ return np.zeros((2, 3)), [(0, 1)]
18
+
19
+
20
+ # ---------------------------------------------------------------------------
+ # Point fusion + sampling (from cache_scenes.py / make_sampled_cache.py)
+ # ---------------------------------------------------------------------------
+
+ # Add our package to path
+ SCRIPT_DIR = Path(__file__).resolve().parent
+ sys.path.insert(0, str(SCRIPT_DIR))
+
+ from s23dr_2026_example.point_fusion import build_compact_scene, FuserConfig
+ from s23dr_2026_example.cache_scenes import (
+     _compute_group_and_class, _compute_smart_center_scale,
+ )
+ from s23dr_2026_example.make_sampled_cache import _priority_sample
+
+ # Tokenizer / model imports
+ from s23dr_2026_example.tokenizer import EdgeDepthSequenceConfig
+ from s23dr_2026_example.model import EdgeDepthSegmentsModel
+ from s23dr_2026_example.segment_postprocess import merge_vertices
+ from s23dr_2026_example.varifold import segments_to_vertices_edges
+ from s23dr_2026_example.postprocess_v2 import snap_to_point_cloud, snap_horizontal
+
+ # Inference constants; the quotas partition the sequence (1536 + 512 = 2048).
+ SEQ_LEN = 2048
+ COLMAP_QUOTA = 1536
+ DEPTH_QUOTA = 512
+ CONF_THRESH = 0.7
+ MERGE_THRESH = 0.4
+ SNAP_RADIUS = 0.5
+
+
+ def fuse_and_sample(sample, cfg, rng):
+     """Run point fusion + priority sampling on a raw dataset sample.
+
+     Returns a dict with xyz_norm, class_id, source, mask, center, scale, etc.,
+     ready for model inference. Returns None if fusion fails.
+     """
+     try:
+         scene = build_compact_scene(sample, cfg, rng)
+     except Exception as e:
+         print(f"  Fusion failed: {e}")
+         return None
+
+     xyz = scene["xyz"]
+     source = scene["source"]
+
+     if len(xyz) < 10:
+         return None
+
+     # Compute group_id and class_id (same as cache_scenes.py)
+     behind_id = scene.get("behind_gest_id", np.full(len(xyz), -1, dtype=np.int16))
+     group_id, class_id = _compute_group_and_class(
+         scene["visible_src"], scene["visible_id"], behind_id, source)
+
+     # Normalize
+     center, scale = _compute_smart_center_scale(xyz, source)
+
+     # Priority sample
+     indices, mask = _priority_sample(source, group_id, SEQ_LEN, COLMAP_QUOTA, DEPTH_QUOTA)
+
+     xyz_norm = (xyz[indices] - center) / scale
+
+     result = {
+         "xyz_norm": xyz_norm.astype(np.float32),
+         "class_id": class_id[indices].astype(np.int64),
+         "source": source[indices].astype(np.int64),
+         "mask": mask,
+         "center": center.astype(np.float32),
+         "scale": np.float32(scale),
+     }
+
+     # Optional fields
+     if "behind_gest_id" in scene:
+         behind = np.clip(scene["behind_gest_id"][indices].astype(np.int16), 0, None)
+         result["behind"] = behind.astype(np.int64)
+     if "n_views_voted" in scene:
+         result["n_views_voted"] = scene["n_views_voted"][indices].astype(np.float32)
+     if "vote_frac" in scene:
+         result["vote_frac"] = scene["vote_frac"][indices].astype(np.float32)
+
+     # Visible src/id for snap post-processing
+     result["visible_src"] = scene["visible_src"][indices].astype(np.int64)
+     result["visible_id"] = scene["visible_id"][indices].astype(np.int64)
+
+     return result
+
+
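+ # Example (illustrative; assumes _priority_sample pads/subsamples to SEQ_LEN
+ # points, with `mask` marking the valid entries):
+ #   fused = fuse_and_sample(sample, FuserConfig(), np.random.RandomState(0))
+ #   if fused is not None:
+ #       xyz_norm, mask = fused["xyz_norm"], fused["mask"]
+
+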
+ def load_model(checkpoint_path, device):
+     """Load model from checkpoint."""
+     ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)
+     args = ckpt.get("args", {})
+
+     norm_class = torch.nn.RMSNorm if args.get("rms_norm") else None
+     seq_cfg = EdgeDepthSequenceConfig(
+         seq_len=SEQ_LEN, colmap_points=COLMAP_QUOTA, depth_points=DEPTH_QUOTA)
+
+     model = EdgeDepthSegmentsModel(
+         seq_cfg=seq_cfg,
+         segments=args.get("segments", 64),
+         hidden=args.get("hidden", 256),
+         num_heads=args.get("num_heads", 4),
+         kv_heads_cross=args.get("kv_heads_cross", 2),
+         kv_heads_self=args.get("kv_heads_self", 2),
+         dim_feedforward=args.get("ff", 1024),
+         dropout=args.get("dropout", 0.1),
+         latent_tokens=args.get("latent_tokens", 256),
+         latent_layers=args.get("latent_layers", 7),
+         decoder_layers=args.get("decoder_layers", 3),
+         cross_attn_interval=args.get("cross_attn_interval", 4),
+         norm_class=norm_class,
+         activation=args.get("activation", "gelu"),
+         segment_conf=args.get("segment_conf", True),
+         behind_emb_dim=args.get("behind_emb_dim", 8),
+         use_vote_features=args.get("vote_features", True),
+         arch=args.get("arch", "perceiver"),
+         encoder_layers=args.get("encoder_layers", 4),
+         pre_encoder_layers=args.get("pre_encoder_layers", 0),
+         segment_param=args.get("segment_param", "midpoint_dir_len"),
+         qk_norm=args.get("qk_norm", True),
+     ).to(device)
+
+     # Handle torch.compile _orig_mod prefix
+     state = ckpt["model"]
+     fixed = {k.replace("segmenter._orig_mod.", "segmenter."): v
+              for k, v in state.items()}
+     model.load_state_dict(fixed, strict=True)
+     model.eval()
+     return model
+
+
+ def build_tokens_single(sample_dict, model, device):
+     """Build token tensor for a single sample (no DataLoader)."""
+     xyz = torch.as_tensor(sample_dict["xyz_norm"], dtype=torch.float32).unsqueeze(0).to(device)
+     cid = torch.as_tensor(sample_dict["class_id"], dtype=torch.long).unsqueeze(0).to(device)
+     src = torch.as_tensor(sample_dict["source"], dtype=torch.long).unsqueeze(0).to(device)
+     masks = torch.as_tensor(sample_dict["mask"], dtype=torch.bool).unsqueeze(0).to(device)
+
+     B, T, _ = xyz.shape
+     tok = model.tokenizer
+     fourier = (tok.pos_enc(xyz.reshape(-1, 3)).reshape(B, T, -1)
+                if tok.pos_enc is not None else xyz.new_zeros(B, T, 0))
+     parts = [xyz, fourier, tok.label_emb(cid), tok.src_emb(src.clamp(0, 1))]
+
+     if tok.behind_emb_dim > 0:
+         if "behind" in sample_dict:
+             beh = torch.as_tensor(sample_dict["behind"], dtype=torch.long).unsqueeze(0).to(device)
+         else:
+             beh = xyz.new_zeros(B, T, dtype=torch.long)
+         parts.append(tok.behind_emb(beh))
+
+     if tok.use_vote_features:
+         if "n_views_voted" in sample_dict and "vote_frac" in sample_dict:
+             nv = torch.as_tensor(sample_dict["n_views_voted"], dtype=torch.float32).unsqueeze(0).to(device)
+             vf = torch.as_tensor(sample_dict["vote_frac"], dtype=torch.float32).unsqueeze(0).to(device)
+             # Center/scale the vote features with fixed constants
+             nv = ((nv - 2.7) / 1.0).unsqueeze(-1)
+             vf = ((vf - 0.5) / 0.25).unsqueeze(-1)
+             parts.extend([nv, vf])
+         else:
+             parts.extend([xyz.new_zeros(B, T, 1), xyz.new_zeros(B, T, 1)])
+
+     tokens = torch.cat(parts, dim=-1)
+     return tokens, masks
+
+
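+ # Token layout (illustrative; exact widths come from the tokenizer config):
+ #   [xyz(3) | fourier | label_emb(class) | src_emb | behind_emb? | nv(1) | vf(1)]
+ # concatenated along the channel axis, one token per fused point.
+
+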
+ def predict_sample(sample_dict, model, device):
+     """Run model inference + post-processing on a fused sample.
+
+     Returns (vertices, edges) in world space.
+     """
+     tokens, masks = build_tokens_single(sample_dict, model, device)
+     scale = float(sample_dict["scale"])
+     center = sample_dict["center"]
+
+     with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16,
+                                          enabled=(device.type == 'cuda')):
+         out = model.forward_tokens(tokens, masks)
+
+     segs = out["segments"][0].float().cpu()
+     conf = torch.sigmoid(out["conf"][0].float()).cpu().numpy() if "conf" in out else None
+
+     # Confidence filter
+     if conf is not None:
+         keep = conf > CONF_THRESH
+         segs = segs[keep]
+         if len(segs) < 1:
+             return empty_solution()
+
+     # To world space
+     segs_world = segs.numpy() * scale + center
+
+     # Vertices + edges from segments
+     pv, pe = segments_to_vertices_edges(torch.tensor(segs_world))
+     pv, pe = pv.numpy(), np.array(pe, dtype=np.int32)
+
+     # Merge
+     pv, pe = merge_vertices(pv, pe, MERGE_THRESH)
+
+     # Snap to point cloud
+     xyz_norm = sample_dict["xyz_norm"]
+     mask = sample_dict["mask"]
+     cid = sample_dict["class_id"]
+     xyz_world = xyz_norm[mask] * scale + center
+     cid_valid = cid[mask]
+     pv = snap_to_point_cloud(pv, xyz_world, cid_valid, snap_radius=SNAP_RADIUS)
+
+     # Horizontal snap
+     pv = snap_horizontal(pv, pe)
+
+     if len(pv) < 2 or len(pe) < 1:
+         return empty_solution()
+
+     edges = [(int(a), int(b)) for a, b in pe]
+     return pv, edges
+
+
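+ # Post-processing order (as implemented above): confidence filter -> un-normalize ->
+ # segments_to_vertices_edges -> merge_vertices(MERGE_THRESH) ->
+ # snap_to_point_cloud(SNAP_RADIUS) -> snap_horizontal.
+
+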
+ # ---------------------------------------------------------------------------
+ # Main
+ # ---------------------------------------------------------------------------
+
+ if __name__ == "__main__":
+     t_start = time.time()
+
+     # Load params
+     param_path = Path("params.json")
+     with param_path.open() as f:
+         params = json.load(f)
+     print(f"Competition: {params.get('competition_id', '?')}")
+     print(f"Dataset: {params.get('dataset', '?')}")
+
+     # Load test data
+     data_path = Path("/tmp/data")
+     if not data_path.exists():
+         from huggingface_hub import snapshot_download
+         snapshot_download(
+             repo_id=params["dataset"],
+             local_dir="/tmp/data",
+             repo_type="dataset",
+         )
+
+     from datasets import load_dataset
+     data_files = {
+         "validation": [str(p) for p in data_path.rglob("*public*/**/*.tar")],
+         "test": [str(p) for p in data_path.rglob("*private*/**/*.tar")],
+     }
+     print(f"Data files: {data_files}")
+     dataset = load_dataset(
+         str(data_path / "hoho22k_2026_test_x_anon.py"),
+         data_files=data_files,
+         trust_remote_code=True,
+         writer_batch_size=100,
+     )
+     print(f"Loaded: {dataset}")
+
+     # Load model
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"Device: {device}")
+     checkpoint_path = SCRIPT_DIR / "checkpoint.pt"
+     model = load_model(checkpoint_path, device)
+     print(f"Model loaded: {sum(p.numel() for p in model.parameters()):,} params")
+
+     # Point fusion config
+     cfg = FuserConfig()
+     rng = np.random.RandomState(2718)
+
+     # Process all samples
+     solution = []
+     total_samples = sum(len(dataset[s]) for s in dataset)
+     processed = 0
+
+     for subset_name in dataset:
+         print(f"\nProcessing {subset_name} ({len(dataset[subset_name])} samples)...")
+
+         for sample in tqdm(dataset[subset_name], desc=subset_name):
+             order_id = sample["order_id"]
+
+             # Fuse + sample
+             fused = fuse_and_sample(sample, cfg, rng)
+             if fused is None:
+                 pred_v, pred_e = empty_solution()
+             else:
+                 try:
+                     pred_v, pred_e = predict_sample(fused, model, device)
+                 except Exception as e:
+                     print(f"  Predict failed for {order_id}: {e}")
+                     pred_v, pred_e = empty_solution()
+
+             solution.append({
+                 "order_id": order_id,
+                 "wf_vertices": pred_v.tolist() if isinstance(pred_v, np.ndarray) else pred_v,
+                 "wf_edges": [(int(a), int(b)) for a, b in pred_e],
+             })
+             processed += 1
+
+             if processed % 50 == 0:
+                 elapsed = time.time() - t_start
+                 rate = elapsed / processed
+                 remaining = (total_samples - processed) * rate
+                 print(f"  [{processed}/{total_samples}] "
+                       f"{elapsed:.0f}s elapsed, ~{remaining:.0f}s remaining")
+
+     # Save
+     with open("submission.json", "w") as f:
+         json.dump(solution, f)
+
+     elapsed = time.time() - t_start
+     print(f"\nDone. {processed} samples in {elapsed:.0f}s ({elapsed/max(processed, 1):.1f}s/sample)")
+     print(f"Saved submission.json ({len(solution)} entries)")