"""
tpose_smpl.py -- T-pose a humanoid GLB via inverse Linear Blend Skinning.

Pipeline:
  1. Render front view and run HMR2 -> SMPL body_pose + betas
  2. Read rigged.glb: mesh verts (rig world space), skinning weights, T-pose joints
  3. Compute FK transforms in rig world space using HMR2 body_pose
  4. Apply inverse LBS: v_tpose = (Sum_j W_j * A_j)^-1 * v_posed
  5. Map T-posed verts back to original mesh coordinate space, preserve UV/texture
  6. Optionally export SKEL bone mesh in T-pose

Usage:
    python tpose_smpl.py --body /tmp/triposg_textured.glb \
                         --rig  /tmp/rig_out/rigged.glb \
                         --out  /tmp/tposed_surface.glb \
                         [--skel_out /tmp/tposed_bones.glb] \
                         [--debug_dir /tmp/tpose_debug]
"""

import os, sys, argparse, struct, json, warnings
warnings.filterwarnings('ignore')

import numpy as np
import cv2
import torch
import trimesh
from trimesh.visual.texture import TextureVisuals
from trimesh.visual.material import PBRMaterial
from scipy.spatial.transform import Rotation as R

sys.path.insert(0, '/root/MV-Adapter')
SMPL_NEUTRAL = '/root/body_models/smpl/SMPL_NEUTRAL.pkl'
SKEL_DIR     = '/root/body_models/skel'

SMPL_PARENTS = [-1, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9,
                12, 13, 14, 16, 17, 18, 19, 20, 21]


# ---- Step 1: Render front view -----------------------------------------------

def render_front(body_glb, H=1024, W=768, device='cuda'):
    from mvadapter.utils.mesh_utils import (
        NVDiffRastContextWrapper, load_mesh, get_orthogonal_camera, render,
    )
    ctx     = NVDiffRastContextWrapper(device=device, context_type='cuda')
    mesh_mv = load_mesh(body_glb, rescale=True, device=device)
    camera  = get_orthogonal_camera(
        elevation_deg=[0], distance=[1.8],
        left=-0.55, right=0.55, bottom=-0.55, top=0.55,
        azimuth_deg=[-90], device=device,
    )
    out = render(ctx, mesh_mv, camera, height=H, width=W,
                 render_attr=True, render_depth=False, render_normal=False,
                 attr_background=0.5)
    img_np = (out.attr[0].cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
    return cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)


# ---- Step 2: HMR2 pose estimation --------------------------------------------

def run_hmr2(img_bgr, device='cuda'):
    from pathlib import Path
    from hmr2.configs import CACHE_DIR_4DHUMANS
    from hmr2.models import load_hmr2, DEFAULT_CHECKPOINT, download_models
    from hmr2.utils import recursive_to
    from hmr2.datasets.vitdet_dataset import ViTDetDataset
    from hmr2.utils.utils_detectron2 import DefaultPredictor_Lazy
    from detectron2.config import LazyConfig
    import hmr2 as hmr2_pkg

    download_models(CACHE_DIR_4DHUMANS)
    model, model_cfg = load_hmr2(DEFAULT_CHECKPOINT)
    model = model.to(device).eval()

    cfg_path = Path(hmr2_pkg.__file__).parent / 'configs' / 'cascade_mask_rcnn_vitdet_h_75ep.py'
    det_cfg  = LazyConfig.load(str(cfg_path))
    det_cfg.train.init_checkpoint = (
        'https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/cascade_mask_rcnn_vitdet_h'
        '/f328730692/model_final_f05665.pkl'
    )
    for i in range(3):
        det_cfg.model.roi_heads.box_predictors[i].test_score_thresh = 0.25
    detector = DefaultPredictor_Lazy(det_cfg)

    det_out   = detector(img_bgr)
    instances = det_out['instances']
    valid     = (instances.pred_classes == 0) & (instances.scores > 0.5)
    boxes     = instances.pred_boxes.tensor[valid].cpu().numpy()
    if len(boxes) == 0:
        raise RuntimeError('HMR2: no person detected in render')

    areas = (boxes[:,2]-boxes[:,0]) * (boxes[:,3]-boxes[:,1])
    boxes = boxes[areas.argmax():areas.argmax()+1]

    dataset    = ViTDetDataset(model_cfg, img_bgr, boxes)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
    for batch in dataloader:
        batch = recursive_to(batch, device)
        with torch.no_grad():
            out = model(batch)
        sp = out['pred_smpl_params']
        return {
            'body_pose': sp['body_pose'][0].cpu(),    # (23, 3, 3)
            'betas':     sp['betas'][0].cpu(),         # (10,)
        }


# ---- Step 3: Read all data from rigged.glb -----------------------------------

def read_rigged_glb(rig_glb):
    """
    Returns dict with:
      verts        : (N, 3) mesh vertices in rig world space
      j_idx        : (N, 4) joint indices
      w_arr        : (N, 4) skinning weights
      J_bind       : (24, 3) T-pose joint world positions
    """
    with open(rig_glb, 'rb') as fh:
        raw = fh.read()
    ch_len, _ = struct.unpack_from('<II', raw, 12)
    gltf = json.loads(raw[20:20+ch_len])
    bin_data = raw[20+ch_len+8:]

    def _read(acc_i):
        acc = gltf['accessors'][acc_i]
        bv  = gltf['bufferViews'][acc['bufferView']]
        off = bv.get('byteOffset', 0) + acc.get('byteOffset', 0)
        cnt = acc['count']
        n   = {'SCALAR':1,'VEC2':2,'VEC3':3,'VEC4':4,'MAT4':16}[acc['type']]
        fmt = {5121:'B',5123:'H',5125:'I',5126:'f'}[acc['componentType']]
        nb  = {'B':1,'H':2,'I':4,'f':4}[fmt]
        return np.frombuffer(bin_data[off:off+cnt*n*nb],
                             dtype=np.dtype(fmt)).reshape(cnt, n)

    prim  = gltf['meshes'][0]['primitives'][0]['attributes']
    verts = _read(prim['POSITION']).astype(np.float64)   # (N, 3)
    j_idx = _read(prim['JOINTS_0']).astype(int)          # (N, 4)
    w_arr = _read(prim['WEIGHTS_0']).astype(np.float64)  # (N, 4)
    row_sum = w_arr.sum(axis=1, keepdims=True)
    w_arr /= np.where(row_sum > 0, row_sum, 1.0)

    # Read T-pose joint world positions by accumulating node translations
    nodes   = gltf['nodes']
    skin    = gltf['skins'][0]
    j_nodes = skin['joints']                             # [0, 1, ..., 23]
    J_bind  = np.zeros((24, 3), dtype=np.float64)
    for ji, ni in enumerate(j_nodes):
        t_local = np.array(nodes[ni].get('translation', [0, 0, 0]))
        p = SMPL_PARENTS[ji]
        J_bind[ji] = (J_bind[p] if p >= 0 else np.zeros(3)) + t_local

    print('  Rig verts: %d  Y: [%.3f, %.3f]  X: [%.3f, %.3f]' % (
        len(verts),
        verts[:,1].min(), verts[:,1].max(),
        verts[:,0].min(), verts[:,0].max()))
    print('  J_bind pelvis: (%.3f, %.3f, %.3f)  L_shoulder: (%.3f, %.3f, %.3f)' % (
        *J_bind[0], *J_bind[16]))
    return {'verts': verts, 'j_idx': j_idx, 'w_arr': w_arr, 'J_bind': J_bind}


# ---- Step 4: FK in rig world space -> A matrices -----------------------------

_FLIP_X = np.diag([-1.0, 1.0, 1.0])   # X-axis mirror matrix


def _adapt_rotmat_to_flipped_x(R_smpl):
    """
    Convert an SO(3) rotation matrix from SMPL convention (left=+X)
    to rig convention (left=-X).  F @ R @ F  where F = diag(-1,1,1).
    """
    return _FLIP_X @ R_smpl @ _FLIP_X


def compute_rig_fk_transforms(J_bind, body_pose_rotmats):
    """
    Compute A_j = G_j_posed * IBM_j in rig world space.
    A_j maps T-pose -> posed, so A_j^{-1} maps posed -> T-pose.

    HMR2 returns rotations in SMPL convention (left shoulder at +X).
    The rig uses the opposite convention (left shoulder at -X).
    We convert by conjugating with the X-flip matrix before building FK.

    J_bind          : (24, 3) T-pose joint world positions from rig
    body_pose_rotmats: (23, 3, 3) HMR2 body pose rotation matrices (joints 1-23)
    Returns A: (24, 4, 4)
    """
    G = [None] * 24
    for j in range(24):
        p = SMPL_PARENTS[j]
        # Convert rotation from SMPL (+X=left) to rig (-X=left) convention
        R_smpl = body_pose_rotmats[j-1].numpy() if j >= 1 else np.eye(3)
        R_j    = _adapt_rotmat_to_flipped_x(R_smpl)

        if p < 0:
            t_j = J_bind[j]           # root: absolute world position
        else:
            t_j = J_bind[j] - J_bind[p]

        L = np.eye(4, dtype=np.float64)
        L[:3, :3] = R_j
        L[:3, 3]  = t_j

        G[j] = L if p < 0 else G[p] @ L

    G = np.stack(G)

    A = np.zeros((24, 4, 4), dtype=np.float64)
    for j in range(24):
        IBM = np.eye(4, dtype=np.float64)
        IBM[:3, 3] = -J_bind[j]
        A[j] = G[j] @ IBM

    return A


# ---- Step 5: Inverse LBS -----------------------------------------------------

def inverse_lbs(verts, j_idx, w_arr, A):
    """
    v_tpose = (Sum_j W_j * A_j)^{-1} * v_posed
    All inputs in rig world space.
    Returns (N, 3) T-posed vertices.
    """
    N = len(verts)
    # Blend forward transforms
    T_fwd = np.zeros((N, 4, 4), dtype=np.float64)
    for k in range(4):
        ji   = j_idx[:, k]
        w    = w_arr[:, k]
        mask = w > 1e-6
        if mask.any():
            T_fwd[mask] += w[mask, None, None] * A[ji[mask]]

    T_inv = np.linalg.inv(T_fwd)
    v_h   = np.concatenate([verts, np.ones((N, 1))], axis=1)
    v_tp  = np.einsum('nij,nj->ni', T_inv, v_h)[:, :3]

    disp  = np.linalg.norm(v_tp - verts, axis=1)
    print('  inverse LBS: mean_disp=%.4f  max_disp=%.4f' % (disp.mean(), disp.max()))
    return v_tp


# ---- Step 6: Map T-posed rig verts back to original mesh space ---------------

def rig_to_original_space(rig_verts_tposed, rig_verts_original, orig_mesh_verts):
    """
    Rig verts are a scaled + translated version of the original mesh verts.
    Recover the (scale, offset) from the mapping:
      rig_vert = orig_vert * scale + offset

    Estimates scale from height ratio, offset from floor alignment.
    Returns T-posed vertices in original mesh coordinate space.
    """
    rig_h  = rig_verts_original[:, 1].max() - rig_verts_original[:, 1].min()
    orig_h = orig_mesh_verts[:, 1].max()    - orig_mesh_verts[:, 1].min()
    scale  = rig_h / max(orig_h, 1e-6)

    # The rig aligns: orig * scale, then v[:,1] -= v[:,1].min() (floor at 0)
    # and v[:,0] += smpl_joints[0,0] - cx; v[:,2] += smpl_joints[0,2] - cz
    # We can recover offset from comparing means/floors
    # offset = rig_floor_Y - (orig_floor_Y * scale)
    rig_floor  = rig_verts_original[:, 1].min()
    orig_floor = orig_mesh_verts[:, 1].min()
    y_offset   = rig_floor - orig_floor * scale

    # X, Z: center offset
    rig_cx  = (rig_verts_original[:, 0].max() + rig_verts_original[:, 0].min()) * 0.5
    orig_cx = (orig_mesh_verts[:, 0].max()    + orig_mesh_verts[:, 0].min())    * 0.5
    x_offset = rig_cx - orig_cx * scale

    rig_cz  = (rig_verts_original[:, 2].max() + rig_verts_original[:, 2].min()) * 0.5
    orig_cz = (orig_mesh_verts[:, 2].max()    + orig_mesh_verts[:, 2].min())    * 0.5
    z_offset = rig_cz - orig_cz * scale

    print('  rig->orig: scale=%.4f  offset=[%.3f, %.3f, %.3f]' % (scale, x_offset, y_offset, z_offset))

    # Invert: orig_vert = (rig_vert - offset) / scale
    # For T-posed verts: they're in rig space but T-posed, so same inversion
    tposed_orig = np.zeros_like(rig_verts_tposed)
    tposed_orig[:, 0] = (rig_verts_tposed[:, 0] - x_offset) / scale
    tposed_orig[:, 1] = (rig_verts_tposed[:, 1] - y_offset) / scale
    tposed_orig[:, 2] = (rig_verts_tposed[:, 2] - z_offset) / scale
    return tposed_orig


# ---- SKEL bone geometry ------------------------------------------------------

def export_skel_bones(betas, out_path, gender='male'):
    try:
        from skel.skel_model import SKEL
    except ImportError:
        print('  [skel] Not installed')
        return None
    skel_file = os.path.join(SKEL_DIR, 'skel_%s.pkl' % gender)
    if not os.path.exists(skel_file):
        print('  [skel] Weights not found: %s' % skel_file)
        return None
    try:
        skel_model = SKEL(gender=gender, model_path=SKEL_DIR)
        betas_t    = betas.unsqueeze(0)[:, :10]
        poses_zero = torch.zeros(1, 46)
        trans_zero = torch.zeros(1, 3)
        with torch.no_grad():
            out = skel_model(poses=poses_zero, betas=betas_t, trans=trans_zero, skelmesh=True)
        bone_verts = out.skel_verts[0].numpy()
        bone_faces = skel_model.skel_f.numpy()
        mesh = trimesh.Trimesh(vertices=bone_verts, faces=bone_faces, process=False)
        mesh.export(out_path)
        print('  [skel] Bone mesh -> %s  (%d verts)' % (out_path, len(bone_verts)))
        return out_path
    except Exception as e:
        print('  [skel] Export failed: %s' % e)
        return None


# ---- Main --------------------------------------------------------------------

def tpose_smpl(body_glb, out_glb, rig_glb=None, debug_dir=None, skel_out=None):
    device = 'cuda'

    if not rig_glb or not os.path.exists(rig_glb):
        raise RuntimeError('--rig is required: provide the rigged.glb from the Rig step.')

    print('[tpose_smpl] Rendering front view ...')
    img_bgr = render_front(body_glb, device=device)
    if debug_dir:
        cv2.imwrite(os.path.join(debug_dir, 'tpose_render.png'), img_bgr)

    print('[tpose_smpl] Running HMR2 pose estimation ...')
    hmr2_out = run_hmr2(img_bgr, device=device)
    print('  betas: %s' % hmr2_out['betas'].numpy().round(3))

    print('[tpose_smpl] Reading rigged GLB (rig world space) ...')
    rig_data = read_rigged_glb(rig_glb)

    print('[tpose_smpl] Loading original mesh for UV/texture ...')
    scene = trimesh.load(body_glb)
    if isinstance(scene, trimesh.Scene):
        geom_name = list(scene.geometry.keys())[0]
        orig_mesh  = scene.geometry[geom_name]
    else:
        orig_mesh = scene; geom_name = None

    orig_verts = np.array(orig_mesh.vertices, dtype=np.float64)
    uvs        = np.array(orig_mesh.visual.uv, dtype=np.float64)
    orig_tex   = orig_mesh.visual.material.baseColorTexture
    print('  Orig mesh: %d verts  Y: [%.3f, %.3f]  X: [%.3f, %.3f]' % (
        len(orig_verts),
        orig_verts[:,1].min(), orig_verts[:,1].max(),
        orig_verts[:,0].min(), orig_verts[:,0].max()))

    print('[tpose_smpl] Computing FK transforms in rig world space ...')
    body_pose_rotmats = hmr2_out['body_pose']   # (23, 3, 3)
    A = compute_rig_fk_transforms(rig_data['J_bind'], body_pose_rotmats)

    # Verify zero-pose gives identity (sanity check)
    A_zero = compute_rig_fk_transforms(rig_data['J_bind'],
                                        torch.zeros(23, 3, 3) + torch.eye(3))
    v_test = rig_data['verts'][:3]
    v_h = np.concatenate([v_test, np.ones((3,1))], axis=1)
    T_fwd_test = np.zeros((3, 4, 4))
    for k in range(4):
        ji = rig_data['j_idx'][:3, k]; w = rig_data['w_arr'][:3, k]
        T_fwd_test += w[:, None, None] * A_zero[ji]
    identity_err = np.abs(T_fwd_test - np.eye(4)).max()
    print('  zero-pose identity check: max_err=%.6f (expect ~0)' % identity_err)

    print('[tpose_smpl] Applying inverse LBS ...')
    rig_verts_tposed = inverse_lbs(
        rig_data['verts'], rig_data['j_idx'], rig_data['w_arr'], A)

    print('[tpose_smpl] T-posed rig verts: Y: [%.3f, %.3f]  X: [%.3f, %.3f]' % (
        rig_verts_tposed[:,1].min(), rig_verts_tposed[:,1].max(),
        rig_verts_tposed[:,0].min(), rig_verts_tposed[:,0].max()))

    print('[tpose_smpl] Mapping back to original mesh coordinate space ...')
    tposed_orig = rig_to_original_space(
        rig_verts_tposed, rig_data['verts'], orig_verts)

    print('[tpose_smpl] T-posed orig: Y: [%.3f, %.3f]  X: [%.3f, %.3f]' % (
        tposed_orig[:,1].min(), tposed_orig[:,1].max(),
        tposed_orig[:,0].min(), tposed_orig[:,0].max()))

    orig_mesh.vertices = tposed_orig
    orig_mesh.visual = TextureVisuals(uv=uvs,
                                      material=PBRMaterial(baseColorTexture=orig_tex))

    if geom_name and isinstance(scene, trimesh.Scene):
        scene.geometry[geom_name] = orig_mesh
        scene.export(out_glb)
    else:
        orig_mesh.export(out_glb)

    print('[tpose_smpl] Saved: %s  (%d KB)' % (out_glb, os.path.getsize(out_glb)//1024))

    if skel_out:
        print('[tpose_smpl] Exporting SKEL bone geometry ...')
        export_skel_bones(hmr2_out['betas'], skel_out)

    return out_glb


if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument('--body',      required=True)
    ap.add_argument('--out',       required=True)
    ap.add_argument('--rig',       required=True, help='Rigged GLB from rig step')
    ap.add_argument('--skel_out',  default=None,  help='SKEL BSM bone mesh output')
    ap.add_argument('--debug_dir', default=None)
    args = ap.parse_args()
    os.makedirs(args.debug_dir, exist_ok=True) if args.debug_dir else None
    tpose_smpl(args.body, args.out, rig_glb=args.rig,
               debug_dir=args.debug_dir, skel_out=args.skel_out)