Spaces:
Running on Zero
Running on Zero
Integrate PSHuman locally — multi-view diffusion in-process, no remote calls
- pipeline/pshuman_local.py: clones pengHTYX/PSHuman, downloads
pengHTYX/PSHuman_Unclip_768_6views, runs StableUnCLIPImg2ImgPipeline
directly (bypasses inference.py to avoid pytorch3d/kaolin imports).
Uses our own preprocessing (no mediapipe): insightface face-crop or
top-centre heuristic fallback. Returns 6 colour + 6 normal PIL views.
- app.py: gradio_pshuman_face now calls pshuman_local.run_pshuman_diffusion
with @spaces.GPU(duration=180). PSHuman tab shows colour/normal galleries.
Removes remote service URL input entirely.
- requirements.txt: add rembg (bg removal for outputs), icecream (PSHuman dep)
- app.py +41 -85
- pipeline/pshuman_local.py +316 -0
- requirements.txt +2 -0
app.py
CHANGED
|
@@ -1525,69 +1525,40 @@ def gradio_animate(
|
|
| 1525 |
return None, f"Error:\n{traceback.format_exc()}", None
|
| 1526 |
|
| 1527 |
|
| 1528 |
-
# ── PSHuman
|
| 1529 |
|
|
|
|
| 1530 |
def gradio_pshuman_face(
|
| 1531 |
input_image,
|
| 1532 |
-
rigged_glb_path,
|
| 1533 |
-
weight_threshold: float,
|
| 1534 |
-
retract_mm: float,
|
| 1535 |
-
pshuman_url: str,
|
| 1536 |
progress=gr.Progress(),
|
| 1537 |
):
|
| 1538 |
"""
|
| 1539 |
-
|
| 1540 |
-
|
| 1541 |
-
|
| 1542 |
-
|
| 1543 |
-
|
| 1544 |
-
|
| 1545 |
-
must point to an externally-deployed PSHuman endpoint (PSHUMAN_URL env var
|
| 1546 |
-
or user-provided URL in the UI). Local localhost will not work on ZeroGPU.
|
| 1547 |
"""
|
| 1548 |
try:
|
| 1549 |
if input_image is None:
|
| 1550 |
-
return None, "Upload a portrait image first."
|
| 1551 |
-
rigged = rigged_glb_path
|
| 1552 |
-
if not rigged or not os.path.exists(str(rigged)):
|
| 1553 |
-
return None, "No rigged GLB found — run the Rig step first.", None
|
| 1554 |
-
|
| 1555 |
-
work_dir = tempfile.mkdtemp(prefix="pshuman_transplant_")
|
| 1556 |
-
img_path = os.path.join(work_dir, "portrait.png")
|
| 1557 |
-
if isinstance(input_image, np.ndarray):
|
| 1558 |
-
Image.fromarray(input_image).save(img_path)
|
| 1559 |
-
else:
|
| 1560 |
-
input_image.save(img_path)
|
| 1561 |
-
|
| 1562 |
-
# pipeline/ is already in sys.path via PIPELINE_DIR insertion at startup
|
| 1563 |
-
# ── Step 1: PSHuman inference ──────────────────────────────────────────
|
| 1564 |
-
progress(0.05, desc="Step 1/2: Running PSHuman (generates multi-view face)...")
|
| 1565 |
-
from pipeline.pshuman_client import generate_pshuman_mesh
|
| 1566 |
-
face_obj = os.path.join(work_dir, "pshuman_face.obj")
|
| 1567 |
-
generate_pshuman_mesh(
|
| 1568 |
-
image_path = img_path,
|
| 1569 |
-
output_path = face_obj,
|
| 1570 |
-
service_url = pshuman_url.strip() or "http://localhost:7862",
|
| 1571 |
-
)
|
| 1572 |
|
| 1573 |
-
|
| 1574 |
-
|
| 1575 |
-
|
| 1576 |
-
|
| 1577 |
-
|
| 1578 |
-
|
| 1579 |
-
|
| 1580 |
-
|
| 1581 |
-
|
| 1582 |
-
weight_threshold = float(weight_threshold),
|
| 1583 |
-
retract_amount = float(retract_mm) / 1000.0, # mm → metres
|
| 1584 |
-
)
|
| 1585 |
|
| 1586 |
progress(1.0, desc="Done!")
|
| 1587 |
-
return
|
| 1588 |
|
| 1589 |
except Exception:
|
| 1590 |
-
return None, f"Error:\n{traceback.format_exc()}"
|
| 1591 |
|
| 1592 |
|
| 1593 |
# ── Full pipeline ─────────────────────────────────────────────────────────────
|
|
@@ -1867,53 +1838,38 @@ with gr.Blocks(title="Image2Model", theme=gr.themes.Soft()) as demo:
|
|
| 1867 |
# ════════════════════════════════════════════════════════════════════
|
| 1868 |
with gr.Tab("PSHuman Face"):
|
| 1869 |
gr.Markdown(
|
| 1870 |
-
"### PSHuman
|
| 1871 |
-
"Generates
|
| 1872 |
-
"
|
| 1873 |
-
"
|
| 1874 |
-
"**
|
| 1875 |
-
"
|
|
|
|
| 1876 |
)
|
| 1877 |
with gr.Row():
|
| 1878 |
with gr.Column(scale=1):
|
| 1879 |
pshuman_img_input = gr.Image(
|
| 1880 |
-
label="Portrait image
|
| 1881 |
type="pil",
|
| 1882 |
)
|
| 1883 |
-
|
| 1884 |
-
pshuman_weight_thresh = gr.Slider(
|
| 1885 |
-
minimum=0.1, maximum=0.9, value=0.35, step=0.05,
|
| 1886 |
-
label="Head bone weight threshold",
|
| 1887 |
-
info="Vertices with head-bone weight above this get replaced",
|
| 1888 |
-
)
|
| 1889 |
-
pshuman_retract_mm = gr.Slider(
|
| 1890 |
-
minimum=0.0, maximum=20.0, value=4.0, step=0.5,
|
| 1891 |
-
label="Face retract (mm)",
|
| 1892 |
-
info="How far to push original face verts inward to avoid z-fighting",
|
| 1893 |
-
)
|
| 1894 |
-
pshuman_service_url = gr.Textbox(
|
| 1895 |
-
label="PSHuman service URL",
|
| 1896 |
-
value=os.environ.get("PSHUMAN_URL", "http://localhost:7862"),
|
| 1897 |
-
info="pshuman_app.py Gradio endpoint (deployed separately)",
|
| 1898 |
-
)
|
| 1899 |
-
pshuman_btn = gr.Button("Generate HD Face", variant="primary")
|
| 1900 |
|
| 1901 |
with gr.Column(scale=2):
|
| 1902 |
-
pshuman_status
|
| 1903 |
-
|
| 1904 |
-
|
| 1905 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1906 |
|
| 1907 |
pshuman_btn.click(
|
| 1908 |
fn=gradio_pshuman_face,
|
| 1909 |
-
inputs=[
|
| 1910 |
-
|
| 1911 |
-
rigged_glb_state,
|
| 1912 |
-
pshuman_weight_thresh,
|
| 1913 |
-
pshuman_retract_mm,
|
| 1914 |
-
pshuman_service_url,
|
| 1915 |
-
],
|
| 1916 |
-
outputs=[pshuman_glb_dl, pshuman_status, pshuman_model_3d],
|
| 1917 |
)
|
| 1918 |
|
| 1919 |
# ════════════════════════════════════════════════════════════════════
|
|
|
|
| 1525 |
return None, f"Error:\n{traceback.format_exc()}", None
|
| 1526 |
|
| 1527 |
|
| 1528 |
+
# ── PSHuman Multi-View ────────────────────────────────────────────────────────
|
| 1529 |
|
| 1530 |
+
@spaces.GPU(duration=180)
def gradio_pshuman_face(
    input_image,
    progress=gr.Progress(),
):
    """
    Run PSHuman multi-view diffusion locally (in-process).
    Returns 6 colour views + 6 normal-map views of the person.

    Full mesh reconstruction (pytorch3d / kaolin / torch_scatter) is skipped —
    those packages have no Python 3.13 wheels. The generated views can be used
    directly for inspection or fed into the face-swap step.

    Returns a 3-tuple (colour_views, normal_views, status_message); on any
    failure the first two slots are None and the status carries the traceback,
    matching the Gallery/Gallery/Textbox outputs wired up in the UI.
    """
    try:
        if input_image is None:
            return None, None, "Upload a portrait image first."

        # Normalise the Gradio input to a PIL image. NOTE(review): the
        # ndarray branch skips the RGBA conversion the PIL branch applies —
        # presumably fine because _preprocess_subject handles both modes,
        # but confirm against pipeline.pshuman_local.
        img = (Image.fromarray(input_image) if isinstance(input_image, np.ndarray)
               else input_image.convert("RGBA") if input_image.mode != "RGBA"
               else input_image)

        progress(0.1, desc="Loading PSHuman pipeline…")
        # Imported lazily so the heavy PSHuman setup (repo clone, weight
        # download) only happens inside the GPU-allocated call.
        from pipeline.pshuman_local import run_pshuman_diffusion

        progress(0.2, desc="Running multi-view diffusion (40 steps × 7 views)…")
        colour_views, normal_views = run_pshuman_diffusion(img, device="cuda")

        progress(1.0, desc="Done!")
        return colour_views, normal_views, "Multi-view generation complete."

    except Exception:
        # Surface the full traceback in the status textbox instead of crashing.
        return None, None, f"Error:\n{traceback.format_exc()}"
|
| 1562 |
|
| 1563 |
|
| 1564 |
# ── Full pipeline ─────────────────────────────────────────────────────────────
|
|
|
|
| 1838 |
# ════════════════════════════════════════════════════════════════════
|
| 1839 |
with gr.Tab("PSHuman Face"):
|
| 1840 |
gr.Markdown(
|
| 1841 |
+
"### PSHuman Multi-View (local)\n"
|
| 1842 |
+
"Generates 6 colour + 6 normal-map views of a person using "
|
| 1843 |
+
"[PSHuman](https://github.com/pengHTYX/PSHuman) "
|
| 1844 |
+
"(StableUnCLIP fine-tuned on multi-view human images).\n\n"
|
| 1845 |
+
"**Pipeline:** portrait → multi-view diffusion (in-process) → "
|
| 1846 |
+
"6 × colour + 6 × normal views\n\n"
|
| 1847 |
+
"**Views:** front · front-right · right · back · left · front-left"
|
| 1848 |
)
|
| 1849 |
with gr.Row():
|
| 1850 |
with gr.Column(scale=1):
|
| 1851 |
pshuman_img_input = gr.Image(
|
| 1852 |
+
label="Portrait image",
|
| 1853 |
type="pil",
|
| 1854 |
)
|
| 1855 |
+
pshuman_btn = gr.Button("Generate Views", variant="primary")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1856 |
|
| 1857 |
with gr.Column(scale=2):
|
| 1858 |
+
pshuman_status = gr.Textbox(
|
| 1859 |
+
label="Status", lines=2, interactive=False)
|
| 1860 |
+
pshuman_colour_gallery = gr.Gallery(
|
| 1861 |
+
label="Colour views (front → front-right → right → back → left → front-left)",
|
| 1862 |
+
columns=3, rows=2, height=420,
|
| 1863 |
+
)
|
| 1864 |
+
pshuman_normal_gallery = gr.Gallery(
|
| 1865 |
+
label="Normal maps",
|
| 1866 |
+
columns=3, rows=2, height=420,
|
| 1867 |
+
)
|
| 1868 |
|
| 1869 |
pshuman_btn.click(
|
| 1870 |
fn=gradio_pshuman_face,
|
| 1871 |
+
inputs=[pshuman_img_input],
|
| 1872 |
+
outputs=[pshuman_colour_gallery, pshuman_normal_gallery, pshuman_status],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1873 |
)
|
| 1874 |
|
| 1875 |
# ════════════════════════════════════════════════════════════════════
|
pipeline/pshuman_local.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
pshuman_local.py
|
| 3 |
+
================
|
| 4 |
+
Run PSHuman multi-view diffusion in-process on ZeroGPU.
|
| 5 |
+
Generates 6 colour views + 6 normal views from a single portrait image.
|
| 6 |
+
|
| 7 |
+
Full mesh reconstruction (SMPL fitting, pytorch3d/kaolin texture projection)
|
| 8 |
+
is intentionally skipped — those deps have no Python 3.13 wheels.
|
| 9 |
+
|
| 10 |
+
Model : pengHTYX/PSHuman_Unclip_768_6views
|
| 11 |
+
Repo : https://github.com/pengHTYX/PSHuman (cloned to /tmp/pshuman-src)
|
| 12 |
+
|
| 13 |
+
The preprocessing replaces PSHuman's mediapipe-based face-crop helper with a
|
| 14 |
+
self-contained approach (insightface if available, else top-centre crop) so no
|
| 15 |
+
additional exotic deps are needed.
|
| 16 |
+
"""
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import subprocess
|
| 22 |
+
import tempfile
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
from typing import List, Optional, Tuple
|
| 25 |
+
|
| 26 |
+
import numpy as np
|
| 27 |
+
import torch
|
| 28 |
+
import torch.nn.functional as F
|
| 29 |
+
from PIL import Image
|
| 30 |
+
|
| 31 |
+
# ── Constants ─────────────────────────────────────────────────────────────────
|
| 32 |
+
_PSHUMAN_SRC = Path("/tmp/pshuman-src")
|
| 33 |
+
_PSHUMAN_CKPT = Path("/tmp/pshuman-ckpts")
|
| 34 |
+
_PSHUMAN_REPO = "https://github.com/pengHTYX/PSHuman.git"
|
| 35 |
+
_PSHUMAN_HF_ID = "pengHTYX/PSHuman_Unclip_768_6views"
|
| 36 |
+
|
| 37 |
+
_CANVAS_SIZE = 768 # model input resolution
|
| 38 |
+
_CROP_SIZE = 740 # subject bounding-box target size within canvas
|
| 39 |
+
_NUM_VIEWS = 7 # 6 body views + 1 face-crop view (model expectation)
|
| 40 |
+
|
| 41 |
+
# Cached pipeline (survives across ZeroGPU calls when the Space is warm)
|
| 42 |
+
_pipeline = None
|
| 43 |
+
|
| 44 |
+
# ── Setup helpers ─────────────────────────────────────────────────────────────
|
| 45 |
+
|
| 46 |
+
def _ensure_repo() -> None:
    """Clone the PSHuman source repo into /tmp (idempotent).

    Clones into a temporary sibling directory and renames it into place only
    on success: previously an interrupted clone left a partial
    ``/tmp/pshuman-src`` that the ``exists()`` check treated as complete on
    every subsequent call.
    """
    import shutil  # local: only needed on the cold-start path

    if _PSHUMAN_SRC.exists():
        return
    print("[pshuman] Cloning PSHuman repo…")
    staging = _PSHUMAN_SRC.with_suffix(".partial")
    if staging.exists():
        # Leftover from a previously interrupted clone — start fresh.
        shutil.rmtree(staging)
    subprocess.run(
        ["git", "clone", "--depth=1", _PSHUMAN_REPO, str(staging)],
        check=True,
    )
    # Atomic rename: the canonical path only ever holds a complete checkout.
    staging.rename(_PSHUMAN_SRC)
    print("[pshuman] Repo cloned.")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _ensure_sys_path() -> None:
    """Make the cloned PSHuman repo importable (cloning it first if needed)."""
    _ensure_repo()
    repo_root = str(_PSHUMAN_SRC)
    if repo_root in sys.path:
        return
    sys.path.insert(0, repo_root)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _ensure_ckpt() -> None:
    """Download the PSHuman diffusion weights (idempotent).

    A ``.complete`` marker file distinguishes a finished snapshot from a
    directory left behind by an interrupted download: previously a partial
    download passed the ``exists()`` check and was silently reused forever.
    If the directory exists without the marker, ``snapshot_download`` is
    called again — it resumes/verifies rather than re-fetching everything.
    """
    marker = _PSHUMAN_CKPT / ".complete"
    if marker.exists():
        return
    from huggingface_hub import snapshot_download
    print("[pshuman] Downloading model weights…")
    snapshot_download(repo_id=_PSHUMAN_HF_ID, local_dir=str(_PSHUMAN_CKPT))
    marker.touch()
    print("[pshuman] Weights downloaded.")
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def load_pipeline(device: str = "cuda"):
    """Load (and cache) the PSHuman StableUnCLIPImg2ImgPipeline.

    The pipeline is kept in the module-level ``_pipeline`` global so warm
    ZeroGPU calls skip the slow ``from_pretrained`` load; on reuse the cached
    object is simply moved to *device*.

    Parameters
    ----------
    device : torch device string ('cuda' or 'cpu')

    Returns
    -------
    The ready-to-call StableUnCLIPImg2ImgPipeline on *device*.
    """
    global _pipeline
    if _pipeline is not None:
        # Cache hit: just make sure it lives on the requested device.
        _pipeline.to(device)
        return _pipeline

    _ensure_sys_path()
    _ensure_ckpt()

    # Imported lazily: the mvdiffusion package only exists after the repo
    # clone performed by _ensure_sys_path().
    from mvdiffusion.pipelines.pipeline_mvdiffusion_unclip import (
        StableUnCLIPImg2ImgPipeline,
    )

    print("[pshuman] Loading pipeline…")
    pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
        str(_PSHUMAN_CKPT),
        torch_dtype=torch.float16,  # fp16 halves VRAM; matches released weights
    )
    try:
        # Optional speed-up — xformers may be absent on this image.
        pipe.unet.enable_xformers_memory_efficient_attention()
    except Exception:
        pass
    pipe.to(device)
    _pipeline = pipe
    print("[pshuman] Pipeline ready.")
    return pipe
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# ── Image preprocessing ───────────────────────────────────────────────────────
|
| 103 |
+
|
| 104 |
+
def _preprocess_subject(image: Image.Image) -> Image.Image:
    """
    Replicate PSHuman's load_image preprocessing:
      1. Ensure RGBA (background = white if no alpha).
      2. Crop to the bounding box of the alpha channel.
      3. Scale the longest side to _CROP_SIZE.
      4. Paste centred on a white _CANVAS_SIZE × _CANVAS_SIZE canvas.

    Returns an RGB image (768 × 768).
    """
    if image.mode != "RGBA":
        # No alpha → treat entire image as foreground
        rgba = image.convert("RGBA")
        alpha = np.ones((rgba.size[1], rgba.size[0]), dtype=np.uint8) * 255
        rgba.putalpha(Image.fromarray(alpha))
        image = rgba
    else:
        image = image.copy()

    arr = np.array(image)  # H W 4
    a = arr[:, :, 3]

    # Bounding box of non-transparent pixels
    rows = np.any(a > 0, axis=1)
    cols = np.any(a > 0, axis=0)
    if rows.any():
        r0, r1 = np.where(rows)[0][[0, -1]]
        c0, c1 = np.where(cols)[0][[0, -1]]
        crop = image.crop((c0, r0, c1 + 1, r1 + 1))
    else:
        # Fully transparent input: keep the whole image rather than crash
        crop = image

    # Scale longest side → _CROP_SIZE. Clamp to ≥1 px: an extremely thin
    # alpha crop could otherwise round to a zero dimension, which
    # Image.resize rejects with a ValueError.
    cw, ch = crop.size
    scale = _CROP_SIZE / max(cw, ch)
    new_w = max(1, round(cw * scale))
    new_h = max(1, round(ch * scale))
    crop = crop.resize((new_w, new_h), Image.LANCZOS)

    # Paste on white canvas, using the alpha channel as mask when present
    canvas = Image.new("RGB", (_CANVAS_SIZE, _CANVAS_SIZE), (255, 255, 255))
    x_off = (_CANVAS_SIZE - new_w) // 2
    y_off = (_CANVAS_SIZE - new_h) // 2
    if crop.mode == "RGBA":
        canvas.paste(crop.convert("RGB"), (x_off, y_off), crop.split()[3])
    else:
        canvas.paste(crop.convert("RGB"), (x_off, y_off))

    return canvas
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _get_face_crop(image: Image.Image) -> Image.Image:
    """
    Return a 256×256 face crop from *image* (768×768 preprocessed subject).
    Tries insightface first; falls back to a top-centre heuristic.

    The insightface detector is built once and cached on the function object:
    the original constructed and prepare()d a fresh FaceAnalysis on every
    call, reloading the detection model from disk each time.
    """
    size = 256
    # ── insightface ────────────────────────────────────────────────────────
    try:
        fa = getattr(_get_face_crop, "_analyzer", None)
        if fa is None:
            from insightface.app import FaceAnalysis
            fa = FaceAnalysis(allowed_modules=["detection"])
            fa.prepare(ctx_id=0 if torch.cuda.is_available() else -1, det_size=(320, 320))
            _get_face_crop._analyzer = fa  # cache for subsequent calls
        faces = fa.get(np.array(image))
        if faces:
            b = faces[0].bbox.astype(int)
            # Clamp the detected bbox to the image bounds before cropping
            x0, y0 = max(0, b[0]), max(0, b[1])
            x1, y1 = min(image.width, b[2]), min(image.height, b[3])
            return image.crop((x0, y0, x1, y1)).resize((size, size), Image.LANCZOS)
    except Exception:
        pass  # insightface missing or detection failed → heuristic below

    # ── heuristic: top-centre 40 % of image ────────────────────────────────
    w, h = image.size
    face_h = int(h * 0.40)
    margin = int(w * 0.15)
    return image.crop((margin, 0, w - margin, face_h)).resize((size, size), Image.LANCZOS)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def _to_tensor(img: Image.Image) -> torch.Tensor:
    """Convert a PIL image to a float32 tensor of shape (3, H, W) in [0, 1]."""
    rgb = np.asarray(img.convert("RGB"), dtype=np.float32) / 255.0
    chw = np.ascontiguousarray(np.transpose(rgb, (2, 0, 1)))
    return torch.from_numpy(chw)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def _to_pil(t: torch.Tensor) -> Image.Image:
    """Convert a float tensor (3, H, W) in [0, 1] back to a PIL RGB image."""
    hwc = t.float().clamp(0.0, 1.0).permute(1, 2, 0).cpu().numpy()
    as_bytes = (hwc * 255).astype(np.uint8)
    return Image.fromarray(as_bytes)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
# ── Main inference ─────────────────────────────────────────────────────────────
|
| 197 |
+
|
| 198 |
+
def run_pshuman_diffusion(
    image: Image.Image,
    device: str = "cuda",
    seed: int = 42,
    guidance_scale: float = 3.0,
    num_inference_steps: int = 40,
    remove_bg_output: bool = True,
) -> Tuple[List[Image.Image], List[Image.Image]]:
    """
    Run PSHuman multi-view diffusion on a single person image.

    Parameters
    ----------
    image : Input PIL image (RGBA with bg removed, or RGB)
    device : 'cuda' or 'cpu'
    seed : RNG seed for reproducibility
    guidance_scale : CFG scale (default 3.0, per PSHuman paper)
    num_inference_steps: Diffusion steps (default 40)
    remove_bg_output : Strip background from output colour views via rembg

    Returns
    -------
    colour_views : List[PIL.Image] — 6 colour images
                   [front, front-right, right, back, left, front-left]
    normal_views : List[PIL.Image] — 6 matching normal maps
    """
    from einops import rearrange

    _ensure_sys_path()
    _ensure_ckpt()

    # ── 1. Preprocessing ──────────────────────────────────────────────────────
    # PSHuman was trained on right-facing subjects → flip input left-right
    flipped = image.transpose(Image.FLIP_LEFT_RIGHT)
    subject = _preprocess_subject(flipped)   # 768×768 RGB
    face = _get_face_crop(subject)           # 256×256 RGB

    body_t = _to_tensor(subject)             # (3, 768, 768)
    # The face crop is upscaled to full canvas size so all 7 slots share shape
    face_t = _to_tensor(face.resize((_CANVAS_SIZE, _CANVAS_SIZE), Image.LANCZOS))  # (3, 768, 768)

    # Stack: first (Nv-1) slots = body image, last slot = face crop
    imgs_in = torch.stack(
        [body_t] * (_NUM_VIEWS - 1) + [face_t], dim=0
    ).float().unsqueeze(0)                   # (1, 7, 3, 768, 768)

    # ── 2. Prompt embeddings ─────────────────────────────────────────────────
    # Fixed per-view text embeddings shipped with the PSHuman repo — one set
    # conditions the normal branch, one the colour branch.
    embeds_dir = _PSHUMAN_SRC / "mvdiffusion" / "data" / "fixed_prompt_embeds_7view"
    normal_embeds = torch.load(embeds_dir / "normal_embeds.pt", map_location="cpu")
    color_embeds = torch.load(embeds_dir / "clr_embeds.pt", map_location="cpu")

    # Shapes from repo: (1, Nv, N, C) or (Nv, N, C) — normalise to (1, Nv, N, C)
    if normal_embeds.dim() == 3:
        normal_embeds = normal_embeds.unsqueeze(0)
    if color_embeds.dim() == 3:
        color_embeds = color_embeds.unsqueeze(0)

    # ── 3. Batch construction ─────────────────────────────────────────────────
    # The image batch is duplicated so half runs with normal embeddings and
    # half with colour embeddings in a single forward pass.
    # imgs_in: (2, 7, 3, 768, 768) → flat (14, 3, 768, 768)
    imgs_2x = torch.cat([imgs_in, imgs_in], dim=0).to(device, dtype=torch.float16)
    imgs_flat = rearrange(imgs_2x, "B Nv C H W -> (B Nv) C H W")

    # prompt_embeds: (2, 7, N, C) → flat (14, N, C); order matches imgs_flat
    # (normals first, colours second)
    p_emb = torch.cat([normal_embeds, color_embeds], dim=0).to(device, dtype=torch.float16)
    p_emb = rearrange(p_emb, "B Nv N C -> (B Nv) N C")

    # ── 4. Diffusion ──────────────────────────────────────────────────────────
    pipe = load_pipeline(device)
    gen = torch.Generator(device=device).manual_seed(seed)

    with torch.autocast("cuda"):
        out_images = pipe(
            imgs_flat,
            None,  # image_embeds slot (positional in the PSHuman pipeline)
            prompt_embeds=p_emb,
            generator=gen,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            eta=1.0,
            output_type="pt",
            num_images_per_prompt=1,
        ).images  # (14, 3, H, W) float [0,1]

    # ── 5. Split normals / colours ────────────────────────────────────────────
    # First half of the batch ran under normal embeddings, second under colour.
    bsz = out_images.shape[0] // 2           # = 7
    normals_pt = out_images[:bsz].clone()    # (7, 3, H, W)
    colors_pt = out_images[bsz:].clone()     # (7, 3, H, W)

    # View 0 colour = input (PSHuman convention — not generated)
    colors_pt[0] = imgs_flat[0].to(out_images.device)

    # Views 3 & 4 are generated horizontally mirrored → flip back
    for j in (3, 4):
        normals_pt[j] = torch.flip(normals_pt[j], dims=[2])
        colors_pt[j] = torch.flip(colors_pt[j], dims=[2])

    # Paste the face-normal crop (view 6, downsampled to 256px) into the
    # top-centre of normals[0] (rows 0–255, cols 256–511 of 768).
    face_nrm = F.interpolate(
        normals_pt[6].unsqueeze(0), size=(256, 256),
        mode="bilinear", align_corners=False,
    ).squeeze(0)
    normals_pt[0][:, :256, 256:512] = face_nrm

    # ── 6. Convert to PIL (body views 0–5 only, skip face-crop view 6) ───────
    colour_views = [_to_pil(colors_pt[j]) for j in range(6)]
    normal_views = [_to_pil(normals_pt[j]) for j in range(6)]

    # All outputs are in PSHuman's flipped (right-facing) space.
    # Flip back so they match the user's original image orientation.
    colour_views = [v.transpose(Image.FLIP_LEFT_RIGHT) for v in colour_views]
    normal_views = [v.transpose(Image.FLIP_LEFT_RIGHT) for v in normal_views]

    if remove_bg_output:
        try:
            from rembg import remove as _rembg
            colour_views = [_rembg(v) for v in colour_views]
        except Exception:
            pass  # rembg optional — return raw outputs if unavailable

    return colour_views, normal_views
|
requirements.txt
CHANGED
|
@@ -68,6 +68,8 @@ scikit-learn
|
|
| 68 |
pandas
|
| 69 |
|
| 70 |
# Utils
|
|
|
|
|
|
|
| 71 |
easydict
|
| 72 |
omegaconf
|
| 73 |
yacs
|
|
|
|
| 68 |
pandas
|
| 69 |
|
| 70 |
# Utils
|
| 71 |
+
rembg
|
| 72 |
+
icecream
|
| 73 |
easydict
|
| 74 |
omegaconf
|
| 75 |
yacs
|