"""
rig_yolo.py — Rig a humanoid mesh using YOLO-pose joint detection.

Instead of estimating T-pose rotations (which failed), detect where joints
actually ARE in the mesh's current pose and use those positions as the bind pose.

Pipeline:
  1. Render front view (azimuth=-90, same camera as triposg_app.py views)
  2. YOLOv8x-pose → COCO-17 2D keypoints
  3. Unproject to 3D in original mesh coordinate space
  4. Map COCO-17 → SMPL-24 (interpolate spine, collar, hand, foot joints)
  5. LBS weights: proximity-based (k=4 nearest joints per vertex)
  6. Export rigged GLB — bind pose = current pose

Usage:
    python rig_yolo.py --body /tmp/triposg_textured.glb \
                       --out  /tmp/rig_out/rigged.glb \
                       [--debug_dir /tmp/rig_debug]
"""

import os, sys, argparse, warnings
warnings.filterwarnings('ignore')

import numpy as np
import cv2
import trimesh
from scipy.spatial import cKDTree

sys.path.insert(0, '/root/MV-Adapter')

# ── Camera constants — MUST match triposg_app.py ──────────────────────────────
ORTHO_LEFT, ORTHO_RIGHT = -0.55, 0.55
ORTHO_BOT,  ORTHO_TOP   = -0.55, 0.55
RENDER_W, RENDER_H      = 768, 1024
FRONT_AZ                = -90       # azimuth that gives front view
# Orthographic proj scale: 2/(right-left) = 1.818...
PROJ_SCALE = 2.0 / (ORTHO_RIGHT - ORTHO_LEFT)

SMPL_PARENTS = [-1,0,0,0,1,2,3,4,5,6,7,8,9,9,9,
                12,13,14,16,17,18,19,20,21]
SMPL_JOINT_NAMES = [
    'pelvis','left_hip','right_hip','spine1',
    'left_knee','right_knee','spine2',
    'left_ankle','right_ankle','spine3',
    'left_foot','right_foot','neck',
    'left_collar','right_collar','head',
    'left_shoulder','right_shoulder',
    'left_elbow','right_elbow',
    'left_wrist','right_wrist',
    'left_hand','right_hand',
]

# COCO-17 order
COCO_NAMES = ['nose','L_eye','R_eye','L_ear','R_ear',
              'L_shoulder','R_shoulder','L_elbow','R_elbow','L_wrist','R_wrist',
              'L_hip','R_hip','L_knee','R_knee','L_ankle','R_ankle']


# ── Step 0: Load mesh directly from GLB (correct UV channel) ─────────────────

def load_mesh_from_gltf(body_glb):
    """
    Load mesh from GLB using pygltflib, reading the UV channel the material
    actually references (TEXCOORD_0 or TEXCOORD_1).
    Returns: verts (N,3) float64, faces (F,3) int32,
             uv (N,2) float32 or None, texture_pil PIL.Image or None
    """
    import pygltflib
    from PIL import Image as PILImage
    import io

    gltf = pygltflib.GLTF2().load(body_glb)
    blob = gltf.binary_blob()

    # componentType → (numpy dtype, bytes per element)
    _DTYPE = {5120: np.int8, 5121: np.uint8, 5122: np.int16,
              5123: np.uint16, 5125: np.uint32, 5126: np.float32}
    _NCOMP = {'SCALAR': 1, 'VEC2': 2, 'VEC3': 3, 'VEC4': 4, 'MAT4': 16}

    def read_accessor(idx):
        if idx is None:
            return None
        acc = gltf.accessors[idx]
        bv  = gltf.bufferViews[acc.bufferView]
        dtype  = _DTYPE[acc.componentType]
        n_comp = _NCOMP[acc.type]
        bv_off  = bv.byteOffset  or 0
        acc_off = acc.byteOffset or 0
        elem_bytes = np.dtype(dtype).itemsize * n_comp
        stride = bv.byteStride if (bv.byteStride and bv.byteStride != elem_bytes) else elem_bytes

        if stride == elem_bytes:
            start = bv_off + acc_off
            size  = acc.count * elem_bytes
            arr   = np.frombuffer(blob[start:start + size], dtype=dtype)
        else:
            # interleaved buffer
            rows = []
            for i in range(acc.count):
                start = bv_off + acc_off + i * stride
                rows.append(np.frombuffer(blob[start:start + elem_bytes], dtype=dtype))
            arr = np.concatenate(rows)

        return arr.reshape(acc.count, n_comp) if n_comp > 1 else arr

    # ── Find which texCoord index the material references ──────────────────────
    texcoord_idx = 0
    if gltf.materials:
        pbr = gltf.materials[0].pbrMetallicRoughness
        if pbr and pbr.baseColorTexture:
            texcoord_idx = getattr(pbr.baseColorTexture, 'texCoord', 0) or 0
    print(f'  material uses TEXCOORD_{texcoord_idx}')

    # ── Read primitive ─────────────────────────────────────────────────────────
    prim  = gltf.meshes[0].primitives[0]
    attrs = prim.attributes

    verts = read_accessor(attrs.POSITION).astype(np.float64)

    idx_data = read_accessor(prim.indices).flatten()
    faces = idx_data.reshape(-1, 3).astype(np.int32)

    # Read the correct UV channel; fall back to TEXCOORD_0
    uv_acc_idx = getattr(attrs, f'TEXCOORD_{texcoord_idx}', None)
    if uv_acc_idx is None and texcoord_idx != 0:
        uv_acc_idx = getattr(attrs, 'TEXCOORD_0', None)
    uv_raw = read_accessor(uv_acc_idx)
    uv = uv_raw.astype(np.float32) if uv_raw is not None else None

    print(f'  verts={len(verts)}  faces={len(faces)}  uv={len(uv) if uv is not None else None}')

    # ── Extract embedded texture ───────────────────────────────────────────────
    texture_pil = None
    try:
        pbr = gltf.materials[0].pbrMetallicRoughness
        if pbr and pbr.baseColorTexture is not None:
            tex_idx = pbr.baseColorTexture.index
            if tex_idx is not None and tex_idx < len(gltf.textures):
                src_idx = gltf.textures[tex_idx].source
                if src_idx is not None and src_idx < len(gltf.images):
                    img_obj = gltf.images[src_idx]
                    if img_obj.bufferView is not None:
                        bv = gltf.bufferViews[img_obj.bufferView]
                        bv_off = bv.byteOffset or 0
                        img_bytes = blob[bv_off:bv_off + bv.byteLength]
                        texture_pil = PILImage.open(io.BytesIO(img_bytes)).convert('RGBA')
                        print(f'  texture: {texture_pil.size}')
    except Exception as e:
        print(f'  texture extraction failed: {e}')

    return verts, faces, uv, texture_pil


# ── Step 1: Render front view ─────────────────────────────────────────────────

def render_front(body_glb, debug_dir=None):
    """
    Render front view using MV-Adapter.
    Returns (img_bgr, scale_factor) where scale_factor = max_abs / 0.5
    (used to convert std-space back to original mesh space).
    """
    from mvadapter.utils.mesh_utils import (
        NVDiffRastContextWrapper, load_mesh, get_orthogonal_camera, render,
    )
    ctx = NVDiffRastContextWrapper(device='cuda', context_type='cuda')
    mesh_mv, _offset, scale_factor = load_mesh(
        body_glb, rescale=True, return_transform=True, device='cuda')
    camera = get_orthogonal_camera(
        elevation_deg=[0], distance=[1.8],
        left=ORTHO_LEFT, right=ORTHO_RIGHT,
        bottom=ORTHO_BOT, top=ORTHO_TOP,
        azimuth_deg=[FRONT_AZ], device='cuda')
    out = render(ctx, mesh_mv, camera,
                 height=RENDER_H, width=RENDER_W,
                 render_attr=True, render_depth=False, render_normal=False,
                 attr_background=0.5)
    img_np  = (out.attr[0].cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
    img_bgr = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
    if debug_dir:
        cv2.imwrite(os.path.join(debug_dir, 'front_render.png'), img_bgr)
    print(f'  render: {RENDER_W}x{RENDER_H}, scale_factor={scale_factor:.4f}')
    return img_bgr, scale_factor


# ── Step 2: YOLO-pose keypoints ───────────────────────────────────────────────

def detect_keypoints(img_bgr, debug_dir=None):
    """
    Run YOLOv8x-pose on the rendered image.
    Returns (17, 3) array: [pixel_x, pixel_y, confidence] for COCO-17 joints.
    Picks the largest detected bounding box (the character body).
    """
    from ultralytics import YOLO
    model = YOLO('yolov8x-pose.pt')
    results = model(img_bgr, verbose=False)

    if not results or results[0].keypoints is None or len(results[0].boxes) == 0:
        raise RuntimeError('YOLO: no person detected in front render')

    r = results[0]
    boxes = r.boxes.xyxy.cpu().numpy()
    areas = (boxes[:,2]-boxes[:,0]) * (boxes[:,3]-boxes[:,1])
    idx   = int(areas.argmax())

    kp_xy   = r.keypoints[idx].xy[0].cpu().numpy()    # (17, 2) pixel
    kp_conf = r.keypoints[idx].conf[0].cpu().numpy()  # (17,) confidence
    kp      = np.concatenate([kp_xy, kp_conf[:,None]], axis=1)  # (17, 3)

    print('  YOLO detections: %d boxes, using largest' % len(boxes))
    for i, name in enumerate(COCO_NAMES):
        if kp_conf[i] > 0.3:
            print('    [%d] %-14s  px=(%.0f, %.0f)  conf=%.2f' % (
                i, name, kp_xy[i,0], kp_xy[i,1], kp_conf[i]))

    if debug_dir:
        vis = img_bgr.copy()
        for i in range(17):
            if kp_conf[i] > 0.3:
                x, y = int(kp_xy[i,0]), int(kp_xy[i,1])
                cv2.circle(vis, (x, y), 6, (0, 255, 0), -1)
                cv2.putText(vis, COCO_NAMES[i][:4], (x+4, y-4),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,255,0), 1)
        cv2.imwrite(os.path.join(debug_dir, 'yolo_keypoints.png'), vis)

    return kp


# ── Step 3: Unproject 2D → 3D ────────────────────────────────────────────────

def unproject_to_3d(kp_2d_conf, scale_factor, mesh_verts_orig):
    """
    Convert COCO-17 pixel positions to 3D positions in original mesh space.

    MV-Adapter orthographic camera at azimuth=-90 maps:
      pixel_x  →  orig_x   (character lateral axis)
      pixel_y  →  orig_y   (character height axis, flipped from pixel)
      orig_z   estimated from k-nearest mesh vertices in image space

    Forward projection (for reference):
      std_x = orig_x / scale_factor
      NDC_x = PROJ_SCALE * std_x
      pixel_x = (NDC_x + 1) / 2 * W

      std_z = orig_y / scale_factor        (mesh Y ↔ std Z ↔ image vertical)
      NDC_y = -PROJ_SCALE * std_z          (Y-flipped by proj matrix)
      pixel_y = (NDC_y + 1) / 2 * H

    Inverse:
      orig_x = (2*px/W - 1) / PROJ_SCALE * scale_factor
      orig_y = -(2*py/H - 1) / PROJ_SCALE * scale_factor
    """
    W, H = RENDER_W, RENDER_H

    # Project all mesh vertices to image space (for Z lookup)
    verts_px_x = ((mesh_verts_orig[:,0] / scale_factor * PROJ_SCALE) + 1.0) / 2.0 * W
    verts_px_y = ((-mesh_verts_orig[:,1] / scale_factor * PROJ_SCALE) + 1.0) / 2.0 * H

    joints_3d = np.full((17, 3), np.nan)
    for i in range(17):
        px, py, conf = kp_2d_conf[i]
        if conf < 0.15 or px < 1 or py < 1:
            continue

        orig_x = (2.0*px/W - 1.0) / PROJ_SCALE * scale_factor
        orig_y = -(2.0*py/H - 1.0) / PROJ_SCALE * scale_factor

        # Z: median of k-nearest mesh vertices in image space
        dist_2d = np.hypot(verts_px_x - px, verts_px_y - py)
        k = 30
        near_idx = np.argpartition(dist_2d, k-1)[:k]
        orig_z   = float(np.median(mesh_verts_orig[near_idx, 2]))

        joints_3d[i] = [orig_x, orig_y, orig_z]

    return joints_3d


# ── Step 4: COCO-17 → SMPL-24 ────────────────────────────────────────────────

def coco17_to_smpl24(coco_3d, mesh_verts):
    """
    Build 24 SMPL joint positions from COCO-17 detections.
    Spine / collar / hand / foot joints are interpolated.
    Low-confidence (NaN) COCO joints fall back to mesh geometry.
    """
    def lerp(a, b, t):
        return a + t * (b - a)

    def valid(i):
        return not np.any(np.isnan(coco_3d[i]))

    # Fill NaN joints from mesh geometry (centroid fallback)
    c = coco_3d.copy()
    centroid = mesh_verts.mean(axis=0)
    for i in range(17):
        if not valid(i):
            c[i] = centroid

    # Key anchor points
    L_shoulder = c[5]
    R_shoulder = c[6]
    L_hip      = c[11]
    R_hip      = c[12]

    pelvis = lerp(L_hip, R_hip, 0.5)
    mid_shoulder = lerp(L_shoulder, R_shoulder, 0.5)
    # Neck: midpoint of shoulders, raised slightly (~ collar bone level)
    neck   = mid_shoulder + np.array([0.0, 0.04 * (mid_shoulder[1] - pelvis[1]), 0.0])

    J = np.zeros((24, 3), dtype=np.float64)

    J[0]  = pelvis                         # pelvis
    J[1]  = L_hip                          # left_hip
    J[2]  = R_hip                          # right_hip
    J[3]  = lerp(pelvis, neck, 0.25)       # spine1
    J[4]  = c[13]                          # left_knee
    J[5]  = c[14]                          # right_knee
    J[6]  = lerp(pelvis, neck, 0.5)        # spine2
    J[7]  = c[15]                          # left_ankle
    J[8]  = c[16]                          # right_ankle
    J[9]  = lerp(pelvis, neck, 0.75)       # spine3
    J[12] = neck                           # neck

    # Feet: project ankle downward toward mesh floor
    mesh_floor_y = mesh_verts[:,1].min()
    foot_y = mesh_floor_y + 0.02 * (c[15][1] - mesh_floor_y)  # 2% above floor
    J[10] = np.array([c[15][0], foot_y, c[15][2]])  # left_foot
    J[11] = np.array([c[16][0], foot_y, c[16][2]])  # right_foot

    J[13] = lerp(neck, L_shoulder, 0.5)   # left_collar
    J[14] = lerp(neck, R_shoulder, 0.5)   # right_collar
    J[15] = c[0]                           # head (nose as proxy)
    J[16] = L_shoulder                    # left_shoulder
    J[17] = R_shoulder                    # right_shoulder
    J[18] = c[7]                           # left_elbow
    J[19] = c[8]                           # right_elbow
    J[20] = c[9]                           # left_wrist
    J[21] = c[10]                          # right_wrist

    # Hands: extrapolate one step beyond wrist in elbow→wrist direction
    for side, (elbow_i, wrist_i, hand_i) in enumerate([(7,9,22), (8,10,23)]):
        elbow = c[elbow_i]; wrist = c[wrist_i]
        bone  = wrist - elbow
        blen  = np.linalg.norm(bone)
        if blen > 1e-3:
            J[hand_i] = wrist + bone / blen * 0.05
        else:
            J[hand_i] = wrist

    print('  SMPL-24 joints:')
    print('    pelvis   : (%.3f, %.3f, %.3f)' % tuple(J[0]))
    print('    L_hip    : (%.3f, %.3f, %.3f)' % tuple(J[1]))
    print('    R_hip    : (%.3f, %.3f, %.3f)' % tuple(J[2]))
    print('    neck     : (%.3f, %.3f, %.3f)' % tuple(J[12]))
    print('    L_shoulder: (%.3f, %.3f, %.3f)' % tuple(J[16]))
    print('    R_shoulder: (%.3f, %.3f, %.3f)' % tuple(J[17]))
    print('    head     : (%.3f, %.3f, %.3f)' % tuple(J[15]))

    return J.astype(np.float32)


# ── Step 5: LBS skinning weights ─────────────────────────────────────────────

def compute_skinning_weights(mesh_verts, joints, k=4):
    """
    Proximity-based LBS weights: each vertex gets k-nearest joint weights
    via inverse-distance weighting.
    Returns (N, 24) float32 full weight matrix.
    """
    N = len(mesh_verts)
    tree = cKDTree(joints)
    dists, idxs = tree.query(mesh_verts, k=k, workers=-1)

    # Clamp minimum distance to avoid division by zero
    inv_d = 1.0 / np.maximum(dists, 1e-6)
    inv_d /= inv_d.sum(axis=1, keepdims=True)

    W_full = np.zeros((N, 24), dtype=np.float32)
    for ki in range(k):
        W_full[np.arange(N), idxs[:, ki]] += inv_d[:, ki].astype(np.float32)

    # Normalize (should already be normalized, but just in case)
    row_sum = W_full.sum(axis=1, keepdims=True)
    W_full /= np.where(row_sum > 0, row_sum, 1.0)

    print('  weights: max_joint=%d  mean_support=%.2f joints/vert' % (
        W_full.argmax(axis=1).max(),
        (W_full > 0.01).sum(axis=1).mean()))

    return W_full


# ── Skeleton mesh builder ─────────────────────────────────────────────────────

def make_skeleton_mesh(joints, radius=0.008):
    """
    Build a mesh of hexagonal-prism cylinders connecting parent→child joints.
    Returns (verts, faces) as float32 / int32 numpy arrays.
    """
    SEG = 6  # hexagonal cross-section
    angles = np.linspace(0, 2 * np.pi, SEG, endpoint=False)
    circle = np.stack([np.cos(angles), np.sin(angles)], axis=1)  # (SEG, 2)

    all_verts, all_faces = [], []
    vert_offset = 0

    for i, parent in enumerate(SMPL_PARENTS):
        if parent == -1:
            continue
        p0 = joints[parent].astype(np.float64)
        p1 = joints[i].astype(np.float64)
        bone_vec = p1 - p0
        length = np.linalg.norm(bone_vec)
        if length < 1e-4:
            continue

        z_axis = bone_vec / length
        ref = np.array([0., 1., 0.]) if abs(z_axis[1]) < 0.9 else np.array([1., 0., 0.])
        x_axis = np.cross(ref, z_axis)
        x_axis /= np.linalg.norm(x_axis)
        y_axis = np.cross(z_axis, x_axis)

        # Bottom ring at p0, top ring at p1
        offsets = radius * (circle[:, 0:1] * x_axis + circle[:, 1:2] * y_axis)
        bottom  = p0 + offsets   # (SEG, 3)
        top     = p1 + offsets   # (SEG, 3)

        all_verts.append(np.vstack([bottom, top]).astype(np.float32))

        for j in range(SEG):
            j1 = (j + 1) % SEG
            b0, b1 = vert_offset + j,       vert_offset + j1
            t0, t1 = vert_offset + SEG + j, vert_offset + SEG + j1
            all_faces.extend([[b0, b1, t0], [b1, t1, t0]])

        vert_offset += 2 * SEG

    if not all_verts:
        return np.zeros((0, 3), np.float32), np.zeros((0, 3), np.int32)

    return np.vstack(all_verts), np.array(all_faces, dtype=np.int32)


# ── Step 6: Export rigged GLB ─────────────────────────────────────────────────

def export_rigged_glb(verts, faces, uv, texture_pil, joints, skin_weights,
                      out_path, skel_verts=None, skel_faces=None):
    """
    Export skinned GLB using pygltflib.
    bind pose = current pose (joints at detected positions).
    IBM[j] = Translation(-J_world[j])  (pure offset, no rotation).

    If skel_verts/skel_faces are provided, a second mesh (bright green skeleton
    sticks) is embedded alongside the body mesh.
    """
    import pygltflib
    from pygltflib import (GLTF2, Scene, Node, Mesh, Primitive, Accessor,
                            BufferView, Buffer, Material, Texture,
                            Image as GImage, Sampler, Skin, Asset)
    from pygltflib import (ARRAY_BUFFER, ELEMENT_ARRAY_BUFFER, FLOAT,
                            UNSIGNED_INT, UNSIGNED_SHORT, LINEAR,
                            LINEAR_MIPMAP_LINEAR, REPEAT, SCALAR, VEC2,
                            VEC3, VEC4, MAT4)

    gltf  = GLTF2()
    gltf.asset = Asset(version='2.0', generator='rig_yolo.py')
    blobs = []

    def _add(data, comp, acc_type, target=None):
        b   = data.tobytes()
        pad = (4 - len(b) % 4) % 4
        off = sum(len(x) for x in blobs)
        blobs.append(b + b'\x00' * pad)
        bv = len(gltf.bufferViews)
        gltf.bufferViews.append(BufferView(
            buffer=0, byteOffset=off, byteLength=len(b), target=target))
        ac = len(gltf.accessors)
        flat = data.flatten()
        gltf.accessors.append(Accessor(
            bufferView=bv, byteOffset=0, componentType=comp,
            type=acc_type, count=len(data),
            min=[float(flat.min())], max=[float(flat.max())]))
        return ac

    # Geometry
    pos_acc = _add(verts.astype(np.float32), FLOAT, VEC3, ARRAY_BUFFER)

    v0, v1, v2 = verts[faces[:,0]], verts[faces[:,1]], verts[faces[:,2]]
    fn = np.cross(v1-v0, v2-v0)
    fn /= (np.linalg.norm(fn, axis=1, keepdims=True) + 1e-8)
    vn = np.zeros_like(verts)
    for i in range(3):
        np.add.at(vn, faces[:,i], fn)
    vn /= (np.linalg.norm(vn, axis=1, keepdims=True) + 1e-8)
    nor_acc = _add(vn.astype(np.float32), FLOAT, VEC3, ARRAY_BUFFER)

    if uv is None:
        uv = np.zeros((len(verts), 2), np.float32)
    uv_acc  = _add(uv.astype(np.float32), FLOAT, VEC2, ARRAY_BUFFER)
    idx_acc = _add(faces.astype(np.uint32).flatten(), UNSIGNED_INT, SCALAR,
                   ELEMENT_ARRAY_BUFFER)

    # Skinning: top-4 joints per vertex
    top4_idx = np.argsort(-skin_weights, axis=1)[:, :4].astype(np.uint16)
    top4_w   = np.take_along_axis(skin_weights, top4_idx.astype(np.int64), axis=1)
    top4_w   = top4_w.astype(np.float32)
    top4_w  /= top4_w.sum(axis=1, keepdims=True).clip(1e-8, None)
    j_acc   = _add(top4_idx, UNSIGNED_SHORT, VEC4, ARRAY_BUFFER)
    w_acc   = _add(top4_w,   FLOAT,          VEC4, ARRAY_BUFFER)

    # Texture
    if texture_pil is not None:
        import io
        buf = io.BytesIO()
        texture_pil.save(buf, format='PNG')
        ib  = buf.getvalue()
        off = sum(len(x) for x in blobs)
        pad = (4 - len(ib) % 4) % 4
        blobs.append(ib + b'\x00' * pad)
        gltf.bufferViews.append(
            BufferView(buffer=0, byteOffset=off, byteLength=len(ib)))
        gltf.images.append(
            GImage(mimeType='image/png', bufferView=len(gltf.bufferViews)-1))
        gltf.samplers.append(
            Sampler(magFilter=LINEAR, minFilter=LINEAR_MIPMAP_LINEAR,
                    wrapS=REPEAT, wrapT=REPEAT))
        gltf.textures.append(Texture(sampler=0, source=0))
        gltf.materials.append(Material(
            name='body',
            pbrMetallicRoughness={
                'baseColorTexture': {'index': 0},
                'metallicFactor': 0.0,
                'roughnessFactor': 0.8},
            doubleSided=True))
    else:
        gltf.materials.append(Material(name='body', doubleSided=True))

    body_prim = Primitive(
        attributes={'POSITION': pos_acc, 'NORMAL': nor_acc,
                    'TEXCOORD_0': uv_acc, 'JOINTS_0': j_acc, 'WEIGHTS_0': w_acc},
        indices=idx_acc, material=0)
    gltf.meshes.append(Mesh(name='body', primitives=[body_prim]))

    # ── Optional skeleton mesh ─────────────────────────────────────────────────
    skel_mesh_idx = None
    if skel_verts is not None and len(skel_verts) > 0:
        sv = skel_verts.astype(np.float32)
        sf = skel_faces.astype(np.int32)

        sv0, sv1, sv2 = sv[sf[:,0]], sv[sf[:,1]], sv[sf[:,2]]
        sfn = np.cross(sv1-sv0, sv2-sv0)
        sfn /= (np.linalg.norm(sfn, axis=1, keepdims=True) + 1e-8)
        svn = np.zeros_like(sv)
        for i in range(3):
            np.add.at(svn, sf[:,i], sfn)
        svn /= (np.linalg.norm(svn, axis=1, keepdims=True) + 1e-8)

        s_pos_acc = _add(sv,                  FLOAT,        VEC3, ARRAY_BUFFER)
        s_nor_acc = _add(svn.astype(np.float32), FLOAT,     VEC3, ARRAY_BUFFER)
        s_idx_acc = _add(sf.astype(np.uint32).flatten(), UNSIGNED_INT, SCALAR,
                         ELEMENT_ARRAY_BUFFER)

        # Lime-green unlit material for skeleton sticks
        mat_idx = len(gltf.materials)
        gltf.materials.append(Material(
            name='skeleton',
            pbrMetallicRoughness={
                'baseColorFactor': [0.2, 1.0, 0.3, 1.0],
                'metallicFactor': 0.0,
                'roughnessFactor': 0.5},
            doubleSided=True))

        skel_mesh_idx = len(gltf.meshes)
        skel_prim = Primitive(
            attributes={'POSITION': s_pos_acc, 'NORMAL': s_nor_acc},
            indices=s_idx_acc, material=mat_idx)
        gltf.meshes.append(Mesh(name='skeleton', primitives=[skel_prim]))

    # ── Skeleton nodes ─────────────────────────────────────────────────────────
    jnodes = []
    for i, (name, parent) in enumerate(zip(SMPL_JOINT_NAMES, SMPL_PARENTS)):
        t = joints[i].tolist() if parent == -1 else (joints[i] - joints[parent]).tolist()
        n = Node(name=name, translation=t, children=[])
        jnodes.append(len(gltf.nodes))
        gltf.nodes.append(n)
    for i, p in enumerate(SMPL_PARENTS):
        if p != -1:
            gltf.nodes[jnodes[p]].children.append(jnodes[i])

    # Inverse bind matrices: IBM[j] = Translation(-J_world[j])
    # glTF MAT4 is column-major; numpy .tobytes() is row-major.
    # glTF reads the numpy buffer as the TRANSPOSE of what numpy stores.
    # So we set the translation in the last ROW of the numpy matrix — glTF
    # reads that as the last COLUMN (translation column) of a 4x4 mat.
    ibms = np.stack([np.eye(4, dtype=np.float32) for _ in range(len(joints))])
    for i in range(len(joints)):
        ibms[i, 3, :3] = -joints[i]
    ibm_acc = _add(ibms.astype(np.float32), FLOAT, MAT4)

    skin_idx = len(gltf.skins)
    gltf.skins.append(Skin(
        name='smpl_skin', skeleton=jnodes[0],
        joints=jnodes, inverseBindMatrices=ibm_acc))

    mesh_node = len(gltf.nodes)
    gltf.nodes.append(Node(name='body_mesh', mesh=0, skin=skin_idx))

    root_children = [jnodes[0], mesh_node]

    if skel_mesh_idx is not None:
        skel_node_idx = len(gltf.nodes)
        gltf.nodes.append(Node(name='skeleton_mesh', mesh=skel_mesh_idx))
        root_children.append(skel_node_idx)

    root_node = len(gltf.nodes)
    gltf.nodes.append(Node(name='root', children=root_children))
    gltf.scenes.append(Scene(name='Scene', nodes=[root_node]))
    gltf.scene = 0

    bin_data = b''.join(blobs)
    gltf.buffers.append(Buffer(byteLength=len(bin_data)))
    gltf.set_binary_blob(bin_data)
    gltf.save_binary(out_path)
    print('  rigged GLB -> %s  (%d KB)' % (out_path, os.path.getsize(out_path) // 1024))


# ── Main ──────────────────────────────────────────────────────────────────────

def rig_yolo(body_glb, out_glb, debug_dir=None):
    """
    Rig body_glb and write to out_glb.
    Returns (out_glb, out_skel_glb) where out_skel_glb includes visible
    skeleton bone sticks alongside the body mesh.
    """
    os.makedirs(os.path.dirname(out_glb) or '.', exist_ok=True)
    if debug_dir:
        os.makedirs(debug_dir, exist_ok=True)

    print('[rig_yolo] Rendering front view ...')
    img_bgr, scale_factor = render_front(body_glb, debug_dir)

    print('[rig_yolo] Running YOLO-pose ...')
    kp = detect_keypoints(img_bgr, debug_dir)

    print('[rig_yolo] Loading original mesh (pygltflib, correct UV channel) ...')
    verts, faces, uv, texture_pil = load_mesh_from_gltf(body_glb)

    print('[rig_yolo] Unprojecting YOLO keypoints to 3D ...')
    coco_3d = unproject_to_3d(kp, scale_factor, verts)

    print('[rig_yolo] Building SMPL-24 skeleton ...')
    joints = coco17_to_smpl24(coco_3d, verts)

    print('[rig_yolo] Computing skinning weights ...')
    skin_weights = compute_skinning_weights(verts, joints, k=4)

    print('[rig_yolo] Exporting rigged GLB (no skeleton) ...')
    export_rigged_glb(verts, faces, uv, texture_pil, joints, skin_weights, out_glb)

    print('[rig_yolo] Building skeleton mesh ...')
    skel_verts, skel_faces = make_skeleton_mesh(joints)
    out_skel_glb = out_glb.replace('.glb', '_skel.glb')
    print('[rig_yolo] Exporting rigged GLB (with skeleton) ...')
    export_rigged_glb(verts, faces, uv, texture_pil, joints, skin_weights,
                      out_skel_glb, skel_verts=skel_verts, skel_faces=skel_faces)

    print('[rig_yolo] Done.')
    return out_glb, out_skel_glb


if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument('--body',      required=True, help='Input textured GLB')
    ap.add_argument('--out',       required=True, help='Output rigged GLB')
    ap.add_argument('--debug_dir', default=None,  help='Save debug renders here')
    args = ap.parse_args()
    rigged, rigged_skel = rig_yolo(args.body, args.out, args.debug_dir)
    print('Rigged:        ', rigged)
    print('Rigged + skel: ', rigged_skel)