""" pshuman_local.py ================ Run PSHuman multi-view diffusion in-process on ZeroGPU. Generates 6 colour views + 6 normal views from a single portrait image. Full mesh reconstruction (SMPL fitting, pytorch3d/kaolin texture projection) is intentionally skipped — those deps have no Python 3.13 wheels. Model : pengHTYX/PSHuman_Unclip_768_6views Repo : https://github.com/pengHTYX/PSHuman (cloned to /tmp/pshuman-src) The preprocessing replaces PSHuman's mediapipe-based face-crop helper with a self-contained approach (insightface if available, else top-centre crop) so no additional exotic deps are needed. """ from __future__ import annotations import os import sys import subprocess import tempfile from pathlib import Path from typing import List, Optional, Tuple import numpy as np import torch import torch.nn.functional as F from PIL import Image # ── Constants ───────────────────────────────────────────────────────────────── _PSHUMAN_SRC = Path("/tmp/pshuman-src") _PSHUMAN_CKPT = Path("/tmp/pshuman-ckpts") _PSHUMAN_REPO = "https://github.com/pengHTYX/PSHuman.git" _PSHUMAN_HF_ID = "pengHTYX/PSHuman_Unclip_768_6views" _CANVAS_SIZE = 768 # model input resolution _CROP_SIZE = 740 # subject bounding-box target size within canvas _NUM_VIEWS = 7 # 6 body views + 1 face-crop view (model expectation) # Cached pipeline (survives across ZeroGPU calls when the Space is warm) _pipeline = None # ── Setup helpers ───────────────────────────────────────────────────────────── def _ensure_repo() -> None: if _PSHUMAN_SRC.exists(): return print("[pshuman] Cloning PSHuman repo…") subprocess.run( ["git", "clone", "--depth=1", _PSHUMAN_REPO, str(_PSHUMAN_SRC)], check=True, ) # Apply pre-patched files (e.g. transformers>=5.0 compatibility fixes) import shutil as _shutil _patches = Path(__file__).parent.parent / "patches" / "pshuman" for _pf in _patches.rglob("*"): if _pf.is_file(): _dest = _PSHUMAN_SRC / _pf.relative_to(_patches) _dest.parent.mkdir(parents=True, exist_ok=True) _shutil.copy2(str(_pf), str(_dest)) print("[pshuman] Repo cloned + patches applied.") def _ensure_sys_path() -> None: _ensure_repo() src = str(_PSHUMAN_SRC) if src not in sys.path: sys.path.insert(0, src) def _ensure_ckpt() -> None: if _PSHUMAN_CKPT.exists(): return from huggingface_hub import snapshot_download print("[pshuman] Downloading model weights…") snapshot_download(repo_id=_PSHUMAN_HF_ID, local_dir=str(_PSHUMAN_CKPT)) print("[pshuman] Weights downloaded.") def load_pipeline(device: str = "cuda"): """Load (and cache) the PSHuman StableUnCLIPImg2ImgPipeline.""" global _pipeline if _pipeline is not None: _pipeline.to(device) return _pipeline _ensure_sys_path() _ensure_ckpt() from mvdiffusion.pipelines.pipeline_mvdiffusion_unclip import ( StableUnCLIPImg2ImgPipeline, ) print("[pshuman] Loading pipeline…") pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( str(_PSHUMAN_CKPT), torch_dtype=torch.float16, ) try: pipe.unet.enable_xformers_memory_efficient_attention() except Exception: pass pipe.to(device) _pipeline = pipe print("[pshuman] Pipeline ready.") return pipe # ── Image preprocessing ─────────────────────────────────────────────────────── def _preprocess_subject(image: Image.Image) -> Image.Image: """ Replicate PSHuman's load_image preprocessing: 1. Ensure RGBA (background = white if no alpha). 2. Crop to the bounding box of the alpha channel. 3. Scale the longest side to _CROP_SIZE. 4. Paste centred on a white _CANVAS_SIZE × _CANVAS_SIZE canvas. Returns an RGB image (768 × 768). """ if image.mode != "RGBA": # No alpha → treat entire image as foreground rgba = image.convert("RGBA") alpha = np.ones((rgba.size[1], rgba.size[0]), dtype=np.uint8) * 255 rgba.putalpha(Image.fromarray(alpha)) image = rgba else: image = image.copy() arr = np.array(image) # H W 4 a = arr[:, :, 3] # Bounding box of non-transparent pixels rows = np.any(a > 0, axis=1) cols = np.any(a > 0, axis=0) if rows.any(): r0, r1 = np.where(rows)[0][[0, -1]] c0, c1 = np.where(cols)[0][[0, -1]] crop = image.crop((c0, r0, c1 + 1, r1 + 1)) else: crop = image # Scale longest side → _CROP_SIZE cw, ch = crop.size scale = _CROP_SIZE / max(cw, ch) new_w = round(cw * scale) new_h = round(ch * scale) crop = crop.resize((new_w, new_h), Image.LANCZOS) # Paste on white canvas canvas = Image.new("RGB", (_CANVAS_SIZE, _CANVAS_SIZE), (255, 255, 255)) x_off = (_CANVAS_SIZE - new_w) // 2 y_off = (_CANVAS_SIZE - new_h) // 2 if crop.mode == "RGBA": canvas.paste(crop.convert("RGB"), (x_off, y_off), crop.split()[3]) else: canvas.paste(crop.convert("RGB"), (x_off, y_off)) return canvas def _get_face_crop(image: Image.Image) -> Image.Image: """ Return a 256×256 face crop from *image* (768×768 preprocessed subject). Tries insightface first; falls back to top-centre heuristic. """ size = 256 # ── insightface ──────────────────────────────────────────────────────── try: import insightface from insightface.app import FaceAnalysis _fa = FaceAnalysis(allowed_modules=["detection"]) _fa.prepare(ctx_id=0 if torch.cuda.is_available() else -1, det_size=(320, 320)) faces = _fa.get(np.array(image)) if faces: b = faces[0].bbox.astype(int) x0, y0, x1, y1 = max(0, b[0]), max(0, b[1]), min(image.width, b[2]), min(image.height, b[3]) face_img = image.crop((x0, y0, x1, y1)).resize((size, size), Image.LANCZOS) return face_img except Exception: pass # ── heuristic: top-centre 40 % of image ──────────────────────────────── w, h = image.size face_h = int(h * 0.40) margin = int(w * 0.15) face_img = image.crop((margin, 0, w - margin, face_h)).resize((size, size), Image.LANCZOS) return face_img def _to_tensor(img: Image.Image) -> torch.Tensor: """PIL RGB → float32 tensor (3, H, W) in [0, 1].""" arr = np.array(img.convert("RGB"), dtype=np.float32) / 255.0 return torch.from_numpy(arr).permute(2, 0, 1) # (3, H, W) def _to_pil(t: torch.Tensor) -> Image.Image: """Float tensor (3, H, W) in [0, 1] → PIL RGB.""" arr = (t.float().clamp(0.0, 1.0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8) return Image.fromarray(arr) # ── Main inference ───────────────────────────────────────────────────────────── def run_pshuman_diffusion( image: Image.Image, device: str = "cuda", seed: int = 42, guidance_scale: float = 3.0, num_inference_steps: int = 40, remove_bg_output: bool = True, ) -> Tuple[List[Image.Image], List[Image.Image]]: """ Run PSHuman multi-view diffusion on a single person image. Parameters ---------- image : Input PIL image (RGBA with bg removed, or RGB) device : 'cuda' or 'cpu' seed : RNG seed for reproducibility guidance_scale : CFG scale (default 3.0, per PSHuman paper) num_inference_steps: Diffusion steps (default 40) remove_bg_output : Strip background from output colour views via rembg Returns ------- colour_views : List[PIL.Image] — 6 colour images [front, front-right, right, back, left, front-left] normal_views : List[PIL.Image] — 6 matching normal maps """ from einops import rearrange _ensure_sys_path() _ensure_ckpt() # ── 1. Preprocessing ────────────────────────────────────────────────────── # PSHuman was trained on right-facing subjects → flip input left-right flipped = image.transpose(Image.FLIP_LEFT_RIGHT) subject = _preprocess_subject(flipped) # 768×768 RGB face = _get_face_crop(subject) # 256×256 RGB body_t = _to_tensor(subject) # (3, 768, 768) face_t = _to_tensor(face.resize((_CANVAS_SIZE, _CANVAS_SIZE), Image.LANCZOS)) # (3, 768, 768) # Stack: first (Nv-1) slots = body image, last slot = face crop imgs_in = torch.stack( [body_t] * (_NUM_VIEWS - 1) + [face_t], dim=0 ).float().unsqueeze(0) # (1, 7, 3, 768, 768) # ── 2. Prompt embeddings ───────────────────────────────────────────────── embeds_dir = _PSHUMAN_SRC / "mvdiffusion" / "data" / "fixed_prompt_embeds_7view" normal_embeds = torch.load(embeds_dir / "normal_embeds.pt", map_location="cpu") color_embeds = torch.load(embeds_dir / "clr_embeds.pt", map_location="cpu") # Shapes from repo: (1, Nv, N, C) or (Nv, N, C) — normalise to (1, Nv, N, C) if normal_embeds.dim() == 3: normal_embeds = normal_embeds.unsqueeze(0) if color_embeds.dim() == 3: color_embeds = color_embeds.unsqueeze(0) # ── 3. Batch construction (duplicate for CFG) ───────────────────────────── # imgs_in: (2, 7, 3, 768, 768) → flat (14, 3, 768, 768) imgs_2x = torch.cat([imgs_in, imgs_in], dim=0).to(device, dtype=torch.float16) imgs_flat = rearrange(imgs_2x, "B Nv C H W -> (B Nv) C H W") # prompt_embeds: (2, 7, N, C) → flat (14, N, C) p_emb = torch.cat([normal_embeds, color_embeds], dim=0).to(device, dtype=torch.float16) p_emb = rearrange(p_emb, "B Nv N C -> (B Nv) N C") # ── 4. Diffusion ────────────────────────────────────────────────────────── pipe = load_pipeline(device) gen = torch.Generator(device=device).manual_seed(seed) with torch.autocast("cuda"): out_images = pipe( imgs_flat, None, # image_embeds slot prompt_embeds=p_emb, generator=gen, guidance_scale=guidance_scale, num_inference_steps=num_inference_steps, eta=1.0, output_type="pt", num_images_per_prompt=1, ).images # (14, 3, H, W) float [0,1] # ── 5. Split normals / colours ──────────────────────────────────────────── bsz = out_images.shape[0] // 2 # = 7 normals_pt = out_images[:bsz].clone() # (7, 3, H, W) colors_pt = out_images[bsz:].clone() # (7, 3, H, W) # View 0 colour = input (PSHuman convention — not generated) colors_pt[0] = imgs_flat[0].to(out_images.device) # Views 3 & 4 are generated horizontally mirrored → flip back for j in (3, 4): normals_pt[j] = torch.flip(normals_pt[j], dims=[2]) colors_pt[j] = torch.flip(colors_pt[j], dims=[2]) # Paste the face-normal crop (view 6, 256px) into the top-right of normals[0] face_nrm = F.interpolate( normals_pt[6].unsqueeze(0), size=(256, 256), mode="bilinear", align_corners=False, ).squeeze(0) normals_pt[0][:, :256, 256:512] = face_nrm # ── 6. Convert to PIL (body views 0–5 only, skip face-crop view 6) ─────── colour_views = [_to_pil(colors_pt[j]) for j in range(6)] normal_views = [_to_pil(normals_pt[j]) for j in range(6)] # All outputs are in PSHuman's flipped (right-facing) space. # Flip back so they match the user's original image orientation. colour_views = [v.transpose(Image.FLIP_LEFT_RIGHT) for v in colour_views] normal_views = [v.transpose(Image.FLIP_LEFT_RIGHT) for v in normal_views] if remove_bg_output: try: from rembg import remove as _rembg colour_views = [_rembg(v) for v in colour_views] except Exception: pass # rembg optional — return raw outputs if unavailable return colour_views, normal_views