Spaces:
Running on Zero
Running on Zero
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import os | |
| import cv2 | |
| import numpy as np | |
| import requests | |
| import trimesh | |
| from matplotlib import colormaps | |
| from scipy.spatial.transform import Rotation | |
| def predictions_to_glb( | |
| predictions: dict, | |
| conf_thres: float = 20.0, | |
| mask_black_bg: bool = False, | |
| mask_white_bg: bool = False, | |
| show_cam: bool = True, | |
| mask_sky: bool = False, | |
| target_dir: str | None = None, | |
| max_points: int = 300000, | |
| filter_depth_edges: bool = True, | |
| depth_edge_rtol: float = 0.03, | |
| ) -> trimesh.Scene: | |
| """Convert VGGT-Omega camera/depth predictions to a GLB scene.""" | |
| if not isinstance(predictions, dict): | |
| raise ValueError("predictions must be a dictionary") | |
| conf_thres = max(2.0, float(conf_thres)) | |
| points = predictions["world_points_from_depth"] | |
| conf = predictions["depth_conf"] | |
| if filter_depth_edges and "depth" in predictions: | |
| conf = conf.copy() | |
| conf[depth_edge(predictions["depth"][..., 0], rtol=depth_edge_rtol)] = 0.0 | |
| images = predictions["images"] | |
| camera_matrices = predictions["extrinsic"] | |
| if mask_sky and target_dir is not None: | |
| conf = apply_sky_mask(conf, target_dir) | |
| vertices = points.reshape(-1, 3) | |
| colors = _images_to_rgb(images).reshape(-1, 3) | |
| colors = (colors * 255).clip(0, 255).astype(np.uint8) | |
| conf = conf.reshape(-1) | |
| mask = np.isfinite(vertices).all(axis=1) & np.isfinite(conf) | |
| if conf_thres > 0 and np.any(mask): | |
| conf_threshold = np.percentile(conf[mask], conf_thres) | |
| mask &= conf >= conf_threshold | |
| mask &= conf > 1e-5 | |
| if mask_black_bg: | |
| mask &= colors.sum(axis=1) >= 16 | |
| if mask_white_bg: | |
| mask &= ~((colors[:, 0] > 240) & (colors[:, 1] > 240) & (colors[:, 2] > 240)) | |
| vertices = vertices[mask] | |
| colors = colors[mask] | |
| vertices, colors = _limit_points(vertices, colors, max_points) | |
| if vertices.size == 0: | |
| vertices = np.array([[0.0, 0.0, 0.0]], dtype=np.float32) | |
| colors = np.array([[255, 255, 255]], dtype=np.uint8) | |
| scene_scale = 1.0 | |
| else: | |
| lower = np.percentile(vertices, 5, axis=0) | |
| upper = np.percentile(vertices, 95, axis=0) | |
| scene_scale = float(np.linalg.norm(upper - lower)) | |
| if scene_scale <= 0: | |
| scene_scale = 1.0 | |
| scene = trimesh.Scene() | |
| scene.add_geometry(trimesh.PointCloud(vertices=vertices, colors=colors)) | |
| extrinsics = np.zeros((len(camera_matrices), 4, 4), dtype=np.float64) | |
| extrinsics[:, :3, :4] = camera_matrices | |
| extrinsics[:, 3, 3] = 1.0 | |
| if show_cam: | |
| colormap = colormaps.get_cmap("gist_rainbow") | |
| for i, world_to_camera in enumerate(extrinsics): | |
| camera_to_world = np.linalg.inv(world_to_camera) | |
| rgba = colormap(i / max(len(extrinsics), 1)) | |
| color = tuple(int(255 * x) for x in rgba[:3]) | |
| integrate_camera_into_scene(scene, camera_to_world, color, scene_scale) | |
| return apply_scene_alignment(scene, extrinsics) | |
| def _images_to_rgb(images: np.ndarray) -> np.ndarray: | |
| if images.ndim == 4 and images.shape[1] == 3: | |
| return np.transpose(images, (0, 2, 3, 1)) | |
| return images | |
| def _limit_points(vertices: np.ndarray, colors: np.ndarray, max_points: int) -> tuple[np.ndarray, np.ndarray]: | |
| if max_points <= 0 or len(vertices) <= max_points: | |
| return vertices, colors | |
| indices = np.linspace(0, len(vertices) - 1, max_points).astype(np.int64) | |
| return vertices[indices], colors[indices] | |
| def depth_edge(depth: np.ndarray, rtol: float = 0.03, kernel_size: int = 3) -> np.ndarray: | |
| depth = np.asarray(depth) | |
| original_shape = depth.shape | |
| depth = depth.reshape(-1, *original_shape[-2:]) | |
| pad = kernel_size // 2 | |
| padded = np.pad(depth, ((0, 0), (pad, pad), (pad, pad)), mode="edge") | |
| depth_max = np.full_like(depth, -np.inf) | |
| depth_min = np.full_like(depth, np.inf) | |
| for y in range(kernel_size): | |
| for x in range(kernel_size): | |
| window = padded[:, y : y + depth.shape[-2], x : x + depth.shape[-1]] | |
| depth_max = np.maximum(depth_max, window) | |
| depth_min = np.minimum(depth_min, window) | |
| relative_jump = (depth_max - depth_min) / np.maximum(np.abs(depth), 1e-6) | |
| return (relative_jump > rtol).reshape(original_shape) | |
| def integrate_camera_into_scene(scene: trimesh.Scene, transform: np.ndarray, face_colors: tuple, scene_scale: float): | |
| cam_width = scene_scale * 0.05 | |
| cam_height = scene_scale * 0.1 | |
| rot_45_degree = np.eye(4) | |
| rot_45_degree[:3, :3] = Rotation.from_euler("z", 45, degrees=True).as_matrix() | |
| rot_45_degree[2, 3] = -cam_height | |
| complete_transform = transform @ get_opengl_conversion_matrix() @ rot_45_degree | |
| camera_cone_shape = trimesh.creation.cone(cam_width, cam_height, sections=4) | |
| slight_rotation = np.eye(4) | |
| slight_rotation[:3, :3] = Rotation.from_euler("z", 2, degrees=True).as_matrix() | |
| vertices = np.concatenate( | |
| [ | |
| camera_cone_shape.vertices, | |
| 0.95 * camera_cone_shape.vertices, | |
| transform_points(slight_rotation, camera_cone_shape.vertices), | |
| ] | |
| ) | |
| vertices = transform_points(complete_transform, vertices) | |
| camera_mesh = trimesh.Trimesh(vertices=vertices, faces=compute_camera_faces(camera_cone_shape)) | |
| camera_mesh.visual.face_colors[:, :3] = face_colors | |
| scene.add_geometry(camera_mesh) | |
| def apply_scene_alignment(scene: trimesh.Scene, extrinsics: np.ndarray) -> trimesh.Scene: | |
| opengl_conversion_matrix = get_opengl_conversion_matrix() | |
| scene.apply_transform(np.linalg.inv(extrinsics[0]) @ opengl_conversion_matrix) | |
| return scene | |
| def get_opengl_conversion_matrix() -> np.ndarray: | |
| matrix = np.identity(4) | |
| matrix[1, 1] = -1 | |
| matrix[2, 2] = -1 | |
| return matrix | |
| def transform_points(transformation: np.ndarray, points: np.ndarray, dim: int | None = None) -> np.ndarray: | |
| points = np.asarray(points) | |
| initial_shape = points.shape[:-1] | |
| dim = dim or points.shape[-1] | |
| transformation = transformation.swapaxes(-1, -2) | |
| points = points @ transformation[..., :-1, :] + transformation[..., -1:, :] | |
| return points[..., :dim].reshape(*initial_shape, dim) | |
| def compute_camera_faces(cone_shape: trimesh.Trimesh) -> np.ndarray: | |
| faces = [] | |
| num_vertices = len(cone_shape.vertices) | |
| for face in cone_shape.faces: | |
| if 0 in face: | |
| continue | |
| v1, v2, v3 = face | |
| v1_offset, v2_offset, v3_offset = face + num_vertices | |
| v1_offset_2, v2_offset_2, v3_offset_2 = face + 2 * num_vertices | |
| faces.extend( | |
| [ | |
| (v1, v2, v2_offset), | |
| (v1, v1_offset, v3), | |
| (v3_offset, v2, v3), | |
| (v1, v2, v2_offset_2), | |
| (v1, v1_offset_2, v3), | |
| (v3_offset_2, v2, v3), | |
| ] | |
| ) | |
| faces += [(v3, v2, v1) for v1, v2, v3 in faces] | |
| return np.array(faces) | |
| def apply_sky_mask(conf: np.ndarray, target_dir: str) -> np.ndarray: | |
| image_dir = os.path.join(target_dir, "images") | |
| image_names = sorted(os.listdir(image_dir)) | |
| height, width = conf.shape[-2:] | |
| masks = [] | |
| skyseg_session = None | |
| for image_name in image_names: | |
| image_path = os.path.join(image_dir, image_name) | |
| mask_path = os.path.join(target_dir, "sky_masks", image_name) | |
| if os.path.exists(mask_path): | |
| sky_mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE) | |
| else: | |
| if not os.path.exists("skyseg.onnx"): | |
| download_file_from_url( | |
| "https://huggingface.co/JianyuanWang/skyseg/resolve/main/skyseg.onnx", | |
| "skyseg.onnx", | |
| ) | |
| if skyseg_session is None: | |
| import onnxruntime | |
| skyseg_session = onnxruntime.InferenceSession("skyseg.onnx") | |
| sky_mask = segment_sky(image_path, skyseg_session, mask_path) | |
| if sky_mask.shape != (height, width): | |
| sky_mask = cv2.resize(sky_mask, (width, height)) | |
| masks.append(sky_mask) | |
| return conf * (np.array(masks) > 0.1).astype(np.float32) | |
| def segment_sky(image_path: str, onnx_session, mask_filename: str) -> np.ndarray: | |
| image = cv2.imread(image_path) | |
| result_map = run_skyseg(onnx_session, [320, 320], image) | |
| result_map = cv2.resize(result_map, (image.shape[1], image.shape[0])) | |
| output_mask = np.zeros_like(result_map) | |
| output_mask[result_map < 32] = 255 | |
| os.makedirs(os.path.dirname(mask_filename), exist_ok=True) | |
| cv2.imwrite(mask_filename, output_mask) | |
| return output_mask | |
| def run_skyseg(onnx_session, input_size: list[int], image: np.ndarray) -> np.ndarray: | |
| image = cv2.resize(image, dsize=(input_size[0], input_size[1])) | |
| image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) | |
| image = np.array(image, dtype=np.float32) | |
| image = (image / 255 - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225] | |
| image = image.transpose(2, 0, 1) | |
| image = image.reshape(-1, 3, input_size[0], input_size[1]).astype("float32") | |
| input_name = onnx_session.get_inputs()[0].name | |
| output_name = onnx_session.get_outputs()[0].name | |
| result = onnx_session.run([output_name], {input_name: image}) | |
| result = np.array(result).squeeze() | |
| result_min = np.min(result) | |
| result_max = np.max(result) | |
| if result_max > result_min: | |
| result = (result - result_min) / (result_max - result_min) | |
| else: | |
| result = np.zeros_like(result) | |
| return (result * 255).astype("uint8") | |
| def download_file_from_url(url: str, filename: str) -> None: | |
| tmp_filename = f"{filename}.tmp" | |
| response = requests.get(url, stream=True) | |
| response.raise_for_status() | |
| with open(tmp_filename, "wb") as f: | |
| for chunk in response.iter_content(chunk_size=8192): | |
| f.write(chunk) | |
| os.replace(tmp_filename, filename) | |