123123aa123 committed
Commit 2a9fea7 · verified · 1 Parent(s): 9c8a5bf

Update app.py

Files changed (1): app.py (+82 −76)

app.py CHANGED
@@ -136,7 +136,7 @@ def load_models():
      #wan_pipe.to(device)
      #wan_pipe.to(dtype=torch.bfloat16)
  
- 
  # =========================
  # Renderer
  # =========================
@@ -266,20 +266,21 @@ def build_estimate_rel(x, y, z, phi, theta):
  # =========================
  
  @spaces.GPU
- def infer(image, prompt, seed):
- 
-     load_models()
  
      img = image.convert("RGB")
- 
      TARGET_H, TARGET_W = img.size[1], img.size[0]
      TARGET_H = TARGET_H // 32 * 32
      TARGET_W = TARGET_W // 32 * 32
- 
      img = img.resize((TARGET_W, TARGET_H), Image.BICUBIC)
  
      all_steps = generate_all_motions_from_prompt(prompt, num_frames=81)
- 
      cam_idx = list(range(81))
      traj = [build_estimate_rel(*all_steps[idx]) for idx in cam_idx]
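Both the removed `infer` and its replacements later in this diff snap the working resolution down to the nearest multiple of 32 before resizing, presumably because the downstream VGGT/Wan pipelines need dimensions divisible by 32. A minimal, self-contained sketch of that rounding step (names mirror the diff; nothing else is assumed):

```python
from PIL import Image

def snap_to_multiple_of_32(img: Image.Image) -> Image.Image:
    # Floor-divide then multiply to round each side down to a multiple of 32.
    target_w = img.width // 32 * 32
    target_h = img.height // 32 * 32
    return img.resize((target_w, target_h), Image.BICUBIC)

# e.g. a 1000x750 input becomes 992x736
print(snap_to_multiple_of_32(Image.new("RGB", (1000, 750))).size)
```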
@@ -287,16 +288,11 @@ def infer(image, prompt, seed):
      first_frame = load_and_preprocess_images(first_frame)
      first_frame = first_frame.to(device)
  
- 
- 
      with torch.no_grad():
          with torch.cuda.amp.autocast(dtype=dtype):
              predictions = vggt_model(first_frame)
- 
      extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], first_frame.shape[-2:])
- 
      first_frame_world_points = predictions["world_points"][0][0]
- 
      focals = intrinsic[0][0][:2, :2].diag().unsqueeze(0).to(device)
      principal_points = intrinsic[0][0][:2, 2].unsqueeze(0).to(device)
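`pose_encoding_to_extri_intri` yields a per-view 3×3 intrinsic matrix, and the two lines above read the focal lengths off its diagonal and the principal point from its last column. A standalone illustration of that slicing with a made-up matrix:

```python
import torch

# A made-up pinhole intrinsic matrix K = [[fx, 0, cx], [0, fy, cy], [0, 0, 1]].
K = torch.tensor([[500.0,   0.0, 320.0],
                  [  0.0, 480.0, 240.0],
                  [  0.0,   0.0,   1.0]])

focals = K[:2, :2].diag().unsqueeze(0)       # shape (1, 2): (fx, fy)
principal_point = K[:2, 2].unsqueeze(0)      # shape (1, 2): (cx, cy)

print(focals)           # tensor([[500., 480.]])
print(principal_point)  # tensor([[320., 240.]])
```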
@@ -304,24 +300,20 @@ def infer(image, prompt, seed):
      raw_image = raw_image.transpose(1, 2, 0)
  
      render_results_list = []
- 
- 
      for estimate_rel in traj:
          estimate_rel = torch.from_numpy(estimate_rel).float().to(device)
          relative_c2ws = estimate_rel.unsqueeze(0)
          R, T = relative_c2ws[:, :3, :3], relative_c2ws[:, :3, 3:]
          R = torch.stack([-R[:, :, 0], -R[:, :, 1], R[:, :, 2]], 2)
          new_c2w = torch.cat([R, T], 2)
- 
          w2c = torch.linalg.inv(torch.cat(
              (new_c2w, torch.Tensor([[[0, 0, 0, 1]]]).to(device).repeat(new_c2w.shape[0], 1, 1)),
              1
          ))
          R_new, T_new = w2c[:, :3, :3].permute(0, 2, 1), w2c[:, :3, 3]
  
- 
          image_size = (first_frame.shape[-2:],)
- 
          cameras = PerspectiveCameras(
              focal_length=focals,
              principal_point=principal_points,
@@ -331,7 +323,7 @@ def infer(image, prompt, seed):
              T=T_new,
              device=device
          )
- 
          masks = None
          render_results, viewmask = run_render(
              [first_frame_world_points],
@@ -342,55 +334,60 @@ def infer(image, prompt, seed):
              1,
              device=device
          )
- 
  
          render_result = (render_results[-1].detach().cpu().numpy() * 255).astype(np.uint8)
- 
          if len(render_result.shape) == 2:
              render_result = cv2.cvtColor(render_result, cv2.COLOR_GRAY2RGB)
          elif render_result.shape[-1] == 4:
              render_result = render_result[..., :3]
- 
          render_results_list.append(render_result)
  
- 
      raw_image = first_frame[0].cpu().numpy()
      raw_image = raw_image.transpose(1, 2, 0)
- 
      raw_image = (raw_image * 255).clip(0, 255).astype(np.uint8)
- 
      render_results_list[0] = raw_image
  
-     frame_indices = np.linspace(
-         0,
-         80,
-         25
-     ).round().astype(int)
- 
      frames = []
      for idx in frame_indices:
          frame = render_results_list[idx]
          frame = Image.fromarray(frame)
          frames.append(frame)
- 
- 
      last = frames[-1]
      for _ in range(4):
          frames.append(last)
  
-     # TARGET_H, TARGET_W = 704, 1248
- 
      def resize_pil(img):
          return img.resize((TARGET_W, TARGET_H), Image.BICUBIC)
  
      frames = [resize_pil(f) for f in frames]
-     image = resize_pil(image)
  
      # ===== Wan =====
      video = wan_pipe(
          prompt="Ensure the consistency of the video",
          negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
-         src_video=frames,
          input_image=image,
          height=TARGET_H,
          width=TARGET_W,
@@ -403,58 +400,67 @@ def infer(image, prompt, seed):
  
      video_frames = list(video)
      last_frame = np.array(video_frames[-1])
- 
-     pcd_last = frames[-1]
- 
-     return Image.fromarray(last_frame), pcd_last
- 
  
  # =========================
  # Gradio UI
  # =========================
  with gr.Blocks() as demo:
- 
      # ===== Title + description =====
-     gr.Markdown("""
-     <div style="line-height:1.2; font-size:16px">
- 
-     <b>UniGeo: Unifying Geometric Guidance for Camera-Controllable Image Editing via Video Models</b><br>
  
-     <hr style="margin:6px 0;">
- 
-     <b>Input Requirement / 输入要求</b><br>
-     The input image is recommended to have width height due to VGGT and Wan model constraints.<br>
-     由于 VGGT 与 Wan 模型限制建议输入图像满足 宽 ≥ 高
- 
-     <hr style="margin:6px 0;">
- 
-     <b>Usage Guide / 使用说明</b><br>
-     You can input one or multiple camera commands separated by semicolons, such as “Camera pans left by 15 degrees” or “Camera moves left by 0.27; Camera pans right by 26 degrees”. The motion scale is normalized by VGGT, and the final point cloud is provided to help adjust motion parameters.<br>
-     支持输入一条或多条相机控制指令(使用分号分隔),例如“Camera pans left by 15 degrees”或“Camera moves left by 0.27; Camera pans right by 26 degrees”。所有运动数值由 VGGT 统一尺度建模,最终提供的点云结果可用于辅助调整相机运动参数。
  
-     </div>
-     """)
  
-     # ===== Input / output images =====
      with gr.Row():
-         inp = gr.Image(type="pil", label="Input Image")
-         out = gr.Image(type="numpy", label="Output Image")
- 
-     # ===== Prompt + seed =====
      with gr.Row():
-         txt = gr.Textbox(label="Camera Prompt")
-         seed_inp = gr.Number(value=0, label="Seed", precision=0)
- 
-     run_btn = gr.Button("Run")
- 
-     # ===== Point-cloud output =====
-     pcd_out = gr.Image(type="pil", label="Final Frame Point Cloud")
  
-     # ===== Bindings =====
-     run_btn.click(
-         fn=infer,
-         inputs=[inp, txt, seed_inp],
-         outputs=[out, pcd_out]
      )
  
  if __name__ == "__main__":
 
      #wan_pipe.to(device)
      #wan_pipe.to(dtype=torch.bfloat16)
  
+ load_models()
  # =========================
  # Renderer
  # =========================
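The behavioural change on this side of the diff is that `load_models()` now runs once at module import, when the Space starts, instead of inside every GPU call as the deleted `infer` did. A minimal sketch of that load-once pattern; the placeholder model dictionary is hypothetical and only stands in for the real VGGT/Wan pipelines:

```python
import spaces  # huggingface `spaces` package, provides the GPU decorator

_MODELS = {}

def load_models():
    # Idempotent: pay the weight-loading cost only once per process.
    if not _MODELS:
        _MODELS["pipe"] = lambda x: x  # hypothetical placeholder for the real pipelines

load_models()  # runs at import time, i.e. when the Space boots

@spaces.GPU
def handler(x):
    # The GPU-decorated handler can assume the weights are already loaded.
    return _MODELS["pipe"](x)
```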
 
  # =========================
  
  @spaces.GPU
+ def generate_pcd(image, prompt):
+ 
+     if image is None:
+         raise gr.Error("Please upload an input image!")
+     if not prompt:
+         raise gr.Error("Please enter camera control prompts!")
+ 
      img = image.convert("RGB")
      TARGET_H, TARGET_W = img.size[1], img.size[0]
      TARGET_H = TARGET_H // 32 * 32
      TARGET_W = TARGET_W // 32 * 32
      img = img.resize((TARGET_W, TARGET_H), Image.BICUBIC)
  
      all_steps = generate_all_motions_from_prompt(prompt, num_frames=81)
      cam_idx = list(range(81))
      traj = [build_estimate_rel(*all_steps[idx]) for idx in cam_idx]
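`generate_all_motions_from_prompt` is not part of this diff; all that is visible is that it maps the camera prompt to one `(x, y, z, phi, theta)` step per frame, which `build_estimate_rel` then turns into a relative pose. Purely as an illustration of that shape of output — assuming a single "Camera pans left by N degrees" command ramped linearly over the 81 frames — a toy stand-in could look like this:

```python
import re
import numpy as np

def toy_motions_from_prompt(prompt: str, num_frames: int = 81):
    """Hypothetical stand-in for generate_all_motions_from_prompt."""
    match = re.search(r"pans left by ([\d.]+) degrees", prompt)
    total_deg = float(match.group(1)) if match else 0.0
    # Ramp the pan angle linearly; translation and pitch stay at zero.
    phis = np.linspace(0.0, np.deg2rad(total_deg), num_frames)
    return [(0.0, 0.0, 0.0, float(phi), 0.0) for phi in phis]

steps = toy_motions_from_prompt("Camera pans left by 15 degrees")
print(len(steps), steps[-1])  # 81 steps; the last one carries the full 15-degree pan
```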
  
      first_frame = load_and_preprocess_images(first_frame)
      first_frame = first_frame.to(device)
  
      with torch.no_grad():
          with torch.cuda.amp.autocast(dtype=dtype):
              predictions = vggt_model(first_frame)
  
      extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], first_frame.shape[-2:])
      first_frame_world_points = predictions["world_points"][0][0]
      focals = intrinsic[0][0][:2, :2].diag().unsqueeze(0).to(device)
      principal_points = intrinsic[0][0][:2, 2].unsqueeze(0).to(device)
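The VGGT forward pass runs under `torch.no_grad()` and `torch.cuda.amp.autocast(dtype=dtype)`. That autocast spelling still works but is deprecated in recent PyTorch releases in favour of the device-agnostic `torch.amp.autocast`; an equivalent sketch (the tiny linear layer is only a stand-in, and a CUDA device is assumed):

```python
import torch

model = torch.nn.Linear(4, 2).cuda()
x = torch.randn(1, 4, device="cuda")

with torch.no_grad():
    # Modern spelling of the mixed-precision context used in the diff.
    with torch.amp.autocast("cuda", dtype=torch.bfloat16):
        y = model(x)

print(y.dtype)  # torch.bfloat16 for matmul-style ops inside autocast
```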
  
      raw_image = raw_image.transpose(1, 2, 0)
  
      render_results_list = []
      for estimate_rel in traj:
          estimate_rel = torch.from_numpy(estimate_rel).float().to(device)
          relative_c2ws = estimate_rel.unsqueeze(0)
          R, T = relative_c2ws[:, :3, :3], relative_c2ws[:, :3, 3:]
          R = torch.stack([-R[:, :, 0], -R[:, :, 1], R[:, :, 2]], 2)
          new_c2w = torch.cat([R, T], 2)
+ 
          w2c = torch.linalg.inv(torch.cat(
              (new_c2w, torch.Tensor([[[0, 0, 0, 1]]]).to(device).repeat(new_c2w.shape[0], 1, 1)),
              1
          ))
          R_new, T_new = w2c[:, :3, :3].permute(0, 2, 1), w2c[:, :3, 3]
  
          image_size = (first_frame.shape[-2:],)
          cameras = PerspectiveCameras(
              focal_length=focals,
              principal_point=principal_points,
  
              T=T_new,
              device=device
          )
+ 
          masks = None
          render_results, viewmask = run_render(
              [first_frame_world_points],
  
              1,
              device=device
          )
  
          render_result = (render_results[-1].detach().cpu().numpy() * 255).astype(np.uint8)
          if len(render_result.shape) == 2:
              render_result = cv2.cvtColor(render_result, cv2.COLOR_GRAY2RGB)
          elif render_result.shape[-1] == 4:
              render_result = render_result[..., :3]
          render_results_list.append(render_result)
  
      raw_image = first_frame[0].cpu().numpy()
      raw_image = raw_image.transpose(1, 2, 0)
      raw_image = (raw_image * 255).clip(0, 255).astype(np.uint8)
      render_results_list[0] = raw_image
  
+     frame_indices = np.linspace(0, 80, 25).round().astype(int)
      frames = []
      for idx in frame_indices:
          frame = render_results_list[idx]
          frame = Image.fromarray(frame)
          frames.append(frame)
+ 
      last = frames[-1]
      for _ in range(4):
          frames.append(last)
  
      def resize_pil(img):
          return img.resize((TARGET_W, TARGET_H), Image.BICUBIC)
  
      frames = [resize_pil(f) for f in frames]
+     pcd_last = frames[-1]
+ 
+     # Return the last point-cloud frame for display in the UI, and pass the full frame list to the hidden state variable.
+     return pcd_last, frames
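Inside the render loop above, each relative camera-to-world pose has its x and y axes flipped, is padded with a homogeneous `[0, 0, 0, 1]` row, inverted to world-to-camera, and its rotation transposed, because PyTorch3D's `PerspectiveCameras` uses a row-vector convention. A single-pose version of the same conversion, with an identity pose as dummy input:

```python
import torch

def c2w_to_pytorch3d_RT(c2w: torch.Tensor):
    """c2w: (3, 4) camera-to-world pose. Returns (R, T) for PerspectiveCameras."""
    R, T = c2w[:3, :3], c2w[:3, 3:]
    # Negate the x and y axes (column flip), as in the loop above.
    R = torch.stack([-R[:, 0], -R[:, 1], R[:, 2]], dim=1)
    c2w_fixed = torch.cat([R, T], dim=1)
    # Pad to 4x4 with a homogeneous row, then invert to get world-to-camera.
    bottom = torch.tensor([[0.0, 0.0, 0.0, 1.0]])
    w2c = torch.linalg.inv(torch.cat([c2w_fixed, bottom], dim=0))
    # PyTorch3D expects row-vector rotations, hence the transpose.
    return w2c[:3, :3].T, w2c[:3, 3]

R, T = c2w_to_pytorch3d_RT(torch.eye(4)[:3])
print(R.shape, T.shape)  # torch.Size([3, 3]) torch.Size([3])
```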
+ 
+ @spaces.GPU
+ def generate_final(image, frames, seed):
+     if not frames:
+         raise gr.Error("Please generate the point cloud first!")
+ 
+     img = image.convert("RGB")
+     TARGET_H, TARGET_W = img.size[1], img.size[0]
+     TARGET_H = TARGET_H // 32 * 32
+     TARGET_W = TARGET_W // 32 * 32
+ 
+     def resize_pil(img_to_resize):
+         return img_to_resize.resize((TARGET_W, TARGET_H), Image.BICUBIC)
+ 
+     image = resize_pil(img)
  
      # ===== Wan =====
      video = wan_pipe(
          prompt="Ensure the consistency of the video",
          negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
+         src_video=frames,  # use the frames passed along from the previous step
          input_image=image,
          height=TARGET_H,
          width=TARGET_W,
  
      video_frames = list(video)
      last_frame = np.array(video_frames[-1])
+     return Image.fromarray(last_frame)
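`generate_final` still receives `seed` from the UI, but the tail of the `wan_pipe(...)` call falls outside this hunk, so how the seed is consumed is not visible here. A common way to make such a seed effective in diffusers-style pipelines is a `torch.Generator`; the sketch below is only an assumption about that wiring, not necessarily what `wan_pipe` does:

```python
import torch

def make_generator(seed: int, device: str = "cpu") -> torch.Generator:
    # A fixed seed makes the pipeline's sampling reproducible.
    return torch.Generator(device=device).manual_seed(int(seed))

# Hypothetical wiring (argument name assumed, not confirmed by this diff):
# video = wan_pipe(..., generator=make_generator(seed, device="cuda"))
print(torch.rand(2, generator=make_generator(0)))
```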
 
 
 
 
  
  # =========================
  # Gradio UI
  # =========================
  with gr.Blocks() as demo:
      # ===== Title + description =====
+     gr.HTML("""<div style="line-height:1.4; font-size:15px">
+     <b style="font-size:18px">UniGeo: Unifying Geometric Guidance for Camera-Controllable Image Editing via Video Models</b><br>
+ 
+     <hr style="margin:8px 0;">
+     <b>Input Requirement / 输入要求</b><br>
+     The input image is recommended to have width ≥ height due to VGGT and Wan model constraints.<br>
+     由于 VGGT 与 Wan 模型限制,建议输入图像满足 宽 ≥ 高。<br>
+ 
+     <hr style="margin:8px 0;">
+     <b>Usage Guide / 使用说明</b>
+     <ul style="margin-top: 4px; padding-left: 20px;">
+     <li style="margin-bottom: 4px;"><b>Command Format / 指令格式:</b> You can input one or multiple camera commands separated by semicolons (e.g., “Camera pans left by 15 degrees” or “Camera moves left by 0.27; Camera pans right by 26 degrees”).<br>
+     支持输入一条或多条相机控制指令,使用分号分隔(例如“Camera pans left by 15 degrees”或“Camera moves left by 0.27; Camera pans right by 26 degrees”)。</li>
  
+     <li style="margin-bottom: 4px;"><b>Scale & Adjustment / 尺度与调整:</b> The motion scale is normalized by VGGT, and the final point cloud is provided to help adjust motion parameters.<br>
+     所有运动数值由 VGGT 统一尺度建模,最终提供的点云结果可用于辅助调整相机运动参数。</li>
+ 
+     <li><b>First Run / 首次运行:</b> Please note that the first execution will take slightly longer as the models are being loaded onto the GPU.<br>
+     首次运行需要将模型权重加载到显存,耗时会稍微久一点,请耐心等待。</li>
+     </ul>
+     </div>""")
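The usage guide above says multiple camera commands can be chained with semicolons. The actual splitting lives inside `generate_all_motions_from_prompt`, which this diff does not show; the step itself is as simple as:

```python
prompt = "Camera moves left by 0.27; Camera pans right by 26 degrees"

# Split on semicolons and drop surrounding whitespace / empty pieces.
commands = [part.strip() for part in prompt.split(";") if part.strip()]
print(commands)
# ['Camera moves left by 0.27', 'Camera pans right by 26 degrees']
```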
 
 
 
 
  
+     # Hidden state variable used to pass the generated guidance frames between the two steps
+     frames_state = gr.State([])
  
+     gr.Markdown("### Step 1: Point Cloud Preview / 步骤一:点云预览与调节")
      with gr.Row():
+         with gr.Column():
+             inp = gr.Image(type="pil", label="Input Image")
+             txt = gr.Textbox(label="Camera Prompt")
+             btn_pcd = gr.Button("Generate Point Cloud (生成点云)")
+         with gr.Column():
+             pcd_out = gr.Image(type="pil", label="Final Frame Point Cloud (预览结果)")
+ 
+     gr.Markdown("### Step 2: Final Result Generation / 步骤二:生成最终结果")
      with gr.Row():
+         with gr.Column():
+             seed_inp = gr.Number(value=0, label="Seed", precision=0)
+             btn_final = gr.Button("Generate Final Result (生成编辑结果)", variant="primary")
+         with gr.Column():
+             out = gr.Image(type="numpy", label="Output Image")
+ 
+     # ===== Step 1 binding: only generates the point cloud and caches the video frames =====
+     btn_pcd.click(
+         fn=generate_pcd,
+         inputs=[inp, txt],
+         outputs=[pcd_out, frames_state]  # update the point-cloud preview and cache the frames sequence
+     )
  
+     # ===== Step 2 binding: reads the cached frames and generates the final image =====
+     btn_final.click(
+         fn=generate_final,
+         inputs=[inp, frames_state, seed_inp],
+         outputs=[out]
      )
  
  if __name__ == "__main__":
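The two bindings rely on `frames_state`, a `gr.State`, to hand the rendered guidance frames from the step-1 handler to the step-2 handler without re-rendering them. A minimal, model-free sketch of that two-button pattern (component names here are illustrative, not the ones used above):

```python
import gradio as gr

def step_one(text):
    frames = [f"{text}-{i}" for i in range(3)]  # stand-in for rendered guidance frames
    return frames[-1], frames                   # preview for the UI + value for the State

def step_two(frames):
    if not frames:
        raise gr.Error("Run step 1 first!")
    return f"final result built from {len(frames)} cached frames"

with gr.Blocks() as demo:
    cache = gr.State([])                        # per-session cache shared between clicks
    inp = gr.Textbox(label="Input")
    preview = gr.Textbox(label="Step 1 preview")
    result = gr.Textbox(label="Step 2 result")
    gr.Button("Step 1").click(step_one, inputs=[inp], outputs=[preview, cache])
    gr.Button("Step 2").click(step_two, inputs=[cache], outputs=[result])

# demo.launch()
```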