"""
pshuman_local.py
================
Run PSHuman multi-view diffusion in-process on ZeroGPU.
Generates 6 colour views + 6 normal views from a single portrait image.

Full mesh reconstruction (SMPL fitting, pytorch3d/kaolin texture projection)
is intentionally skipped — those deps have no Python 3.13 wheels.

Model : pengHTYX/PSHuman_Unclip_768_6views
Repo  : https://github.com/pengHTYX/PSHuman (cloned to /tmp/pshuman-src)

The preprocessing replaces PSHuman's mediapipe-based face-crop helper with a
self-contained approach (insightface if available, else top-centre crop) so no
additional exotic deps are needed.
"""
from __future__ import annotations

import os
import sys
import subprocess
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image

# ── Constants ─────────────────────────────────────────────────────────────────
# Directory the PSHuman repo is cloned into; also inserted into sys.path so the
# repo's `mvdiffusion` package can be imported.
_PSHUMAN_SRC   = Path("/tmp/pshuman-src")
# Directory the model snapshot is downloaded to and the pipeline is loaded from.
_PSHUMAN_CKPT  = Path("/tmp/pshuman-ckpts")
# Git URL passed to `git clone`.
_PSHUMAN_REPO  = "https://github.com/pengHTYX/PSHuman.git"
# Hugging Face Hub repo id passed to `snapshot_download`.
_PSHUMAN_HF_ID = "pengHTYX/PSHuman_Unclip_768_6views"

_CANVAS_SIZE   = 768    # model input resolution
_CROP_SIZE     = 740    # subject bounding-box target size within canvas
_NUM_VIEWS     = 7      # 6 body views + 1 face-crop view (model expectation)

# Cached pipeline (survives across ZeroGPU calls when the Space is warm).
# Set by load_pipeline() on first use; None until then.
_pipeline = None

# ── Setup helpers ─────────────────────────────────────────────────────────────

def _ensure_repo() -> None:
    """
    Make sure a patched PSHuman checkout exists at ``_PSHUMAN_SRC``.

    Does nothing when ``_PSHUMAN_SRC`` already exists.  Otherwise the repo is
    cloned into a staging directory next to the target, the files under
    ``<parent of this file's directory>/patches/pshuman`` are copied over it
    (same relative paths), and only then is the staging directory renamed to
    ``_PSHUMAN_SRC``.  Because the existence check above is the only
    completeness test, publishing the directory last guarantees that an
    interrupted clone or a failed patch copy is never mistaken for a usable
    checkout on the next call.

    Raises
    ------
    subprocess.CalledProcessError
        If ``git clone`` exits with a non-zero status.
    OSError
        If copying a patch file or the final rename fails.
    """
    if _PSHUMAN_SRC.exists():
        return

    import shutil as _shutil

    print("[pshuman] Cloning PSHuman repo…")
    _PSHUMAN_SRC.parent.mkdir(parents=True, exist_ok=True)
    # Stage beside the target so the final rename stays on one filesystem.
    _staging = Path(
        tempfile.mkdtemp(
            prefix=f"{_PSHUMAN_SRC.name}-",
            dir=str(_PSHUMAN_SRC.parent),
        )
    )
    try:
        # git accepts an existing *empty* directory as the clone destination.
        subprocess.run(
            ["git", "clone", "--depth=1", _PSHUMAN_REPO, str(_staging)],
            check=True,
        )
        # Apply pre-patched files (e.g. transformers>=5.0 compatibility fixes)
        _patches = Path(__file__).parent.parent / "patches" / "pshuman"
        for _pf in _patches.rglob("*"):
            if _pf.is_file():
                _dest = _staging / _pf.relative_to(_patches)
                _dest.parent.mkdir(parents=True, exist_ok=True)
                _shutil.copy2(str(_pf), str(_dest))
        # Publish only after clone + patches both succeeded.
        _staging.rename(_PSHUMAN_SRC)
    finally:
        # After a successful rename the staging path is gone and this is a
        # no-op; on any failure it removes the half-built checkout.
        if _staging.exists():
            _shutil.rmtree(_staging, ignore_errors=True)
    print("[pshuman] Repo cloned + patches applied.")


def _ensure_sys_path() -> None:
    """Ensure the PSHuman checkout exists and is first on ``sys.path``.

    The checkout directory is prepended only when it is not already listed,
    so repeated calls do not add duplicate entries.
    """
    _ensure_repo()
    repo_dir = str(_PSHUMAN_SRC)
    if repo_dir in sys.path:
        return
    sys.path.insert(0, repo_dir)


def _ensure_ckpt() -> None:
    """
    Make sure the model snapshot is present at ``_PSHUMAN_CKPT``.

    Does nothing when ``_PSHUMAN_CKPT`` already exists.  Otherwise the
    snapshot of ``_PSHUMAN_HF_ID`` is downloaded into a sibling
    ``<name>.partial`` directory, which is renamed to ``_PSHUMAN_CKPT`` only
    after ``snapshot_download`` has returned.  The existence check above is
    the only completeness test, so an interrupted download must never leave
    the final directory behind.

    The staging directory is deliberately left in place on failure so a later
    call can reuse whatever was already fetched.
    NOTE(review): this relies on ``snapshot_download`` skipping files already
    present in ``local_dir`` — confirm for the installed huggingface_hub
    version; if it does not, the retry simply downloads everything again.

    Raises
    ------
    Exception
        Whatever ``snapshot_download`` raises (network / auth errors).
    OSError
        If the final rename fails.
    """
    if _PSHUMAN_CKPT.exists():
        return

    from huggingface_hub import snapshot_download

    print("[pshuman] Downloading model weights…")
    _partial = _PSHUMAN_CKPT.with_name(_PSHUMAN_CKPT.name + ".partial")
    snapshot_download(repo_id=_PSHUMAN_HF_ID, local_dir=str(_partial))
    # Publish only once the download call has completed without raising.
    _partial.rename(_PSHUMAN_CKPT)
    print("[pshuman] Weights downloaded.")


def load_pipeline(device: str = "cuda"):
    """
    Load (and cache) the PSHuman StableUnCLIPImg2ImgPipeline.

    On the first call the repo and checkpoint are ensured, the pipeline is
    built from ``_PSHUMAN_CKPT`` in float16, moved to *device* and stored in
    the module-level ``_pipeline``.  Later calls move the cached pipeline to
    *device* and return it without reloading.

    Parameters
    ----------
    device : Target device passed to ``pipe.to`` (default ``"cuda"``).

    Returns
    -------
    The pipeline object (same instance on every call once cached).
    """
    global _pipeline
    if _pipeline is not None:
        _pipeline.to(device)
        return _pipeline

    _ensure_sys_path()
    _ensure_ckpt()

    # Imported lazily: the package only becomes importable after
    # _ensure_sys_path() has put the PSHuman checkout on sys.path.
    from mvdiffusion.pipelines.pipeline_mvdiffusion_unclip import (
        StableUnCLIPImg2ImgPipeline,
    )

    print("[pshuman] Loading pipeline…")
    pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
        str(_PSHUMAN_CKPT),
        torch_dtype=torch.float16,
    )
    try:
        pipe.unet.enable_xformers_memory_efficient_attention()
    except Exception as exc:
        # xformers is an optional optimisation — keep going without it, but
        # say why so unexpected memory use / slowness can be diagnosed.
        print(f"[pshuman] xformers attention not enabled ({exc!r}); continuing without it.")
    pipe.to(device)
    _pipeline = pipe
    print("[pshuman] Pipeline ready.")
    return pipe


# ── Image preprocessing ───────────────────────────────────────────────────────

def _preprocess_subject(image: Image.Image) -> Image.Image:
    """
    Replicate PSHuman's load_image preprocessing:
      1. Ensure RGBA.  Transparency already carried by the input (RGBA, LA,
         PA, palette transparency) is kept; modes without alpha become fully
         opaque, i.e. the whole image counts as foreground.
      2. Crop to the bounding box of the non-zero alpha pixels (the whole
         image is kept when every pixel is transparent).
      3. Scale the longest side to _CROP_SIZE (the short side never drops
         below 1 px).
      4. Paste centred on a white _CANVAS_SIZE × _CANVAS_SIZE canvas, using
         the alpha channel as the paste mask.

    Returns an RGB image (_CANVAS_SIZE × _CANVAS_SIZE).

    Raises
    ------
    ValueError
        If *image* has zero width or height.
    """
    # convert() preserves existing transparency and yields alpha = 255 for
    # modes that have none, so no manual alpha channel is needed.
    rgba = image.convert("RGBA")

    # Bounding box of non-transparent pixels; None when alpha is all zero.
    bbox = rgba.getchannel("A").getbbox()
    if bbox is not None:
        rgba = rgba.crop(bbox)

    # Scale longest side → _CROP_SIZE
    cw, ch = rgba.size
    if cw == 0 or ch == 0:
        raise ValueError(f"cannot preprocess an empty image (size {cw}x{ch})")
    scale = _CROP_SIZE / max(cw, ch)
    # Clamp to 1 px: for very elongated subjects round() can reach 0, which
    # resize() rejects.
    new_w = max(1, round(cw * scale))
    new_h = max(1, round(ch * scale))
    rgba = rgba.resize((new_w, new_h), Image.LANCZOS)

    # Paste on white canvas
    canvas = Image.new("RGB", (_CANVAS_SIZE, _CANVAS_SIZE), (255, 255, 255))
    x_off = (_CANVAS_SIZE - new_w) // 2
    y_off = (_CANVAS_SIZE - new_h) // 2
    canvas.paste(rgba.convert("RGB"), (x_off, y_off), rgba.getchannel("A"))

    return canvas


def _get_face_crop(image: Image.Image) -> Image.Image:
    """
    Return a 256×256 face crop from *image* (768×768 preprocessed subject).

    Tries insightface first and uses the largest detected face; falls back to
    a top-centre heuristic (top 40 % of the height, middle 70 % of the width)
    when insightface is not installed, fails, or finds no usable face.
    """
    size = 256
    # ── insightface ────────────────────────────────────────────────────────
    try:
        from insightface.app import FaceAnalysis

        # NOTE: the detector is rebuilt on every call, as before.
        detector = FaceAnalysis(allowed_modules=["detection"])
        detector.prepare(
            ctx_id=0 if torch.cuda.is_available() else -1,
            det_size=(320, 320),
        )
        # insightface follows the OpenCV convention and expects BGR channel
        # order, so reverse the RGB channels before detection.
        rgb = np.asarray(image.convert("RGB"))
        faces = detector.get(np.ascontiguousarray(rgb[:, :, ::-1]))
        if faces:
            # Prefer the largest box rather than whichever came first.
            best = max(
                faces,
                key=lambda f: float((f.bbox[2] - f.bbox[0]) * (f.bbox[3] - f.bbox[1])),
            )
            b = best.bbox.astype(int)
            x0, y0 = max(0, int(b[0])), max(0, int(b[1]))
            x1, y1 = min(image.width, int(b[2])), min(image.height, int(b[3]))
            # A box clamped to nothing cannot be resized — use the heuristic.
            if x1 > x0 and y1 > y0:
                return image.crop((x0, y0, x1, y1)).resize((size, size), Image.LANCZOS)
    except ImportError:
        pass  # insightface is optional
    except Exception as exc:
        # Detection is best-effort; report the reason and fall back.
        print(f"[pshuman] insightface face detection failed ({exc!r}); using heuristic crop.")

    # ── heuristic: top-centre 40 % of image ────────────────────────────────
    w, h = image.size
    face_h = max(1, int(h * 0.40))   # keep at least one row for tiny inputs
    margin = int(w * 0.15)
    return image.crop((margin, 0, w - margin, face_h)).resize((size, size), Image.LANCZOS)


def _to_tensor(img: Image.Image) -> torch.Tensor:
    """Convert a PIL image to a float32 tensor.

    The image is forced to RGB, scaled from 0–255 to [0, 1], and the axes are
    reordered from (H, W, 3) to (3, H, W).
    """
    rgb = img.convert("RGB")
    pixels = np.array(rgb, dtype=np.float32)
    normalised = pixels / 255.0
    hwc = torch.from_numpy(normalised)
    return hwc.permute(2, 0, 1)  # channels first


def _to_pil(t: torch.Tensor) -> Image.Image:
    """
    Float tensor (3, H, W) in [0, 1] → PIL RGB.

    Values outside [0, 1] are clamped.  Scaling to 0–255 rounds to the nearest
    integer: plain truncation maps a value that sits a hair below an integer
    (e.g. an 8-bit pixel that went through float16) to the next lower level,
    which darkens images that are merely passed through.
    """
    # detach(): .numpy() refuses tensors that still require grad.
    scaled = t.detach().float().clamp(0.0, 1.0) * 255.0
    arr = scaled.round().permute(1, 2, 0).cpu().numpy().astype(np.uint8)
    # permute() yields a strided view; hand PIL a plain C-ordered buffer.
    return Image.fromarray(np.ascontiguousarray(arr))


# ── Main inference ─────────────────────────────────────────────────────────────

def run_pshuman_diffusion(
    image: Image.Image,
    device: str = "cuda",
    seed: int = 42,
    guidance_scale: float = 3.0,
    num_inference_steps: int = 40,
    remove_bg_output: bool = True,
) -> Tuple[List[Image.Image], List[Image.Image]]:
    """
    Run PSHuman multi-view diffusion on a single person image.

    Parameters
    ----------
    image              : Input PIL image (RGBA with bg removed, or RGB)
    device             : 'cuda' or 'cpu'
    seed               : RNG seed for reproducibility
    guidance_scale     : CFG scale (default 3.0, per PSHuman paper)
    num_inference_steps: Diffusion steps (default 40)
    remove_bg_output   : Strip background from output colour views via rembg.
                         If rembg is missing or fails, the raw colour views
                         are returned instead.

    Returns
    -------
    colour_views : List[PIL.Image] — 6 colour images
                   [front, front-right, right, back, left, front-left]
    normal_views : List[PIL.Image] — 6 matching normal maps

    Notes
    -----
    The input is mirrored before inference and every output view is mirrored
    back at the end.
    NOTE(review): mirroring a normal map only moves pixels; whether the
    horizontal normal component should also be negated depends on how the
    normals are encoded — confirm with downstream consumers.
    """
    from einops import rearrange

    _ensure_sys_path()
    _ensure_ckpt()

    # ── 1. Preprocessing ──────────────────────────────────────────────────────
    # PSHuman was trained on right-facing subjects → flip input left-right
    flipped = image.transpose(Image.FLIP_LEFT_RIGHT)
    subject = _preprocess_subject(flipped)   # 768×768 RGB
    face    = _get_face_crop(subject)        # 256×256 RGB

    body_t  = _to_tensor(subject)            # (3, 768, 768)
    face_t  = _to_tensor(face.resize((_CANVAS_SIZE, _CANVAS_SIZE), Image.LANCZOS))  # (3, 768, 768)

    # Stack: first (Nv-1) slots = body image, last slot = face crop
    imgs_in = torch.stack(
        [body_t] * (_NUM_VIEWS - 1) + [face_t], dim=0
    ).float().unsqueeze(0)                   # (1, 7, 3, 768, 768)

    # ── 2. Prompt embeddings ─────────────────────────────────────────────────
    embeds_dir = _PSHUMAN_SRC / "mvdiffusion" / "data" / "fixed_prompt_embeds_7view"
    # weights_only=True: these files come from a cloned repo and are used as
    # plain tensors below, so refuse to unpickle arbitrary objects.
    normal_embeds = torch.load(
        embeds_dir / "normal_embeds.pt", map_location="cpu", weights_only=True
    )
    color_embeds = torch.load(
        embeds_dir / "clr_embeds.pt", map_location="cpu", weights_only=True
    )

    # Shapes from repo: (1, Nv, N, C) or (Nv, N, C) — normalise to (1, Nv, N, C)
    if normal_embeds.dim() == 3:
        normal_embeds = normal_embeds.unsqueeze(0)
    if color_embeds.dim() == 3:
        color_embeds = color_embeds.unsqueeze(0)

    # ── 3. Batch construction (duplicate for CFG) ─────────────────────────────
    # imgs_in: (2, 7, 3, 768, 768) → flat (14, 3, 768, 768)
    imgs_2x   = torch.cat([imgs_in, imgs_in], dim=0).to(device, dtype=torch.float16)
    imgs_flat = rearrange(imgs_2x, "B Nv C H W -> (B Nv) C H W")

    # prompt_embeds: (2, 7, N, C) → flat (14, N, C); normal half first, colour half second
    p_emb = torch.cat([normal_embeds, color_embeds], dim=0).to(device, dtype=torch.float16)
    p_emb = rearrange(p_emb, "B Nv N C -> (B Nv) N C")

    # ── 4. Diffusion ──────────────────────────────────────────────────────────
    pipe = load_pipeline(device)
    gen  = torch.Generator(device=device).manual_seed(seed)

    # Autocast only does useful work on CUDA; switch it off explicitly for any
    # other device instead of relying on torch to warn and disable it.
    use_autocast = torch.device(device).type == "cuda"
    with torch.autocast("cuda", enabled=use_autocast):
        out_images = pipe(
            imgs_flat,
            None,                           # image_embeds slot
            prompt_embeds=p_emb,
            generator=gen,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            eta=1.0,
            output_type="pt",
            num_images_per_prompt=1,
        ).images                            # (14, 3, H, W)  float [0,1]

    # ── 5. Split normals / colours ────────────────────────────────────────────
    bsz        = out_images.shape[0] // 2  # = 7
    normals_pt = out_images[:bsz].clone()  # (7, 3, H, W)
    colors_pt  = out_images[bsz:].clone()  # (7, 3, H, W)

    # View 0 colour = input (PSHuman convention — not generated)
    colors_pt[0] = imgs_flat[0].to(out_images.device)

    # Views 3 & 4 are generated horizontally mirrored → flip back
    for j in (3, 4):
        normals_pt[j] = torch.flip(normals_pt[j], dims=[2])
        colors_pt[j]  = torch.flip(colors_pt[j],  dims=[2])

    # Paste the face-normal view (view 6, resized to 256 px) into rows 0–255,
    # columns 256–511 of normals[0] — i.e. the top-centre of a 768-wide image.
    # NOTE(review): the paste location is fixed, while _get_face_crop takes its
    # crop from a detector box or a wider top-centre region, so the patch may
    # not line up with the face in view 0 — confirm against upstream PSHuman.
    face_nrm = F.interpolate(
        normals_pt[6].unsqueeze(0), size=(256, 256),
        mode="bilinear", align_corners=False,
    ).squeeze(0)
    normals_pt[0][:, :256, 256:512] = face_nrm

    # ── 6. Convert to PIL (body views 0–5 only, skip face-crop view 6) ───────
    colour_views = [_to_pil(colors_pt[j])  for j in range(6)]
    normal_views = [_to_pil(normals_pt[j]) for j in range(6)]

    # All outputs are in PSHuman's flipped (right-facing) space.
    # Flip back so they match the user's original image orientation.
    colour_views = [v.transpose(Image.FLIP_LEFT_RIGHT) for v in colour_views]
    normal_views = [v.transpose(Image.FLIP_LEFT_RIGHT) for v in normal_views]

    if remove_bg_output:
        try:
            from rembg import remove as _rembg
            # Rebind only after every view succeeded, so a mid-list failure
            # never leaves a mix of processed and raw images.
            colour_views = [_rembg(v) for v in colour_views]
        except ImportError:
            pass  # rembg optional — return raw outputs if unavailable
        except Exception as exc:
            # Still best-effort, but make the degraded output visible.
            print(f"[pshuman] rembg background removal failed ({exc!r}); returning raw colour views.")

    return colour_views, normal_views