Spaces:

facebook
/

vggt-omega

Running on Zero

App Files Files Community

vggt-omega / visual_util.py

JianyuanWang

coordinate

a00efd4 9 days ago

raw

history blame contribute delete

10 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.

	import os

	import cv2
	import numpy as np
	import requests
	import trimesh
	from matplotlib import colormaps
	from scipy.spatial.transform import Rotation


	def predictions_to_glb(
	    predictions: dict,
	    conf_thres: float = 20.0,
	    mask_black_bg: bool = False,
	    mask_white_bg: bool = False,
	    show_cam: bool = True,
	    mask_sky: bool = False,
	    target_dir: str | None = None,
	    max_points: int = 300000,
	    filter_depth_edges: bool = True,
	    depth_edge_rtol: float = 0.03,
	) -> trimesh.Scene:
	    """Build a trimesh scene (point cloud plus camera markers) from predictions.

	    Args:
	        predictions: Dictionary read for the keys ``"world_points_from_depth"``,
	            ``"depth_conf"``, ``"images"`` and ``"extrinsic"``; ``"depth"`` is
	            used as well when present and ``filter_depth_edges`` is set.
	        conf_thres: Percentile of the finite confidences used as the cut-off;
	            it is raised to at least 2.0 before use.
	        mask_black_bg: Drop points whose 8-bit RGB channels sum to less than 16.
	        mask_white_bg: Drop points whose 8-bit RGB channels all exceed 240.
	        show_cam: Add one camera marker per extrinsic matrix.
	        mask_sky: Pass the confidences through ``apply_sky_mask`` (only when
	            ``target_dir`` is given).
	        target_dir: Directory handed to ``apply_sky_mask``.
	        max_points: Upper bound forwarded to ``_limit_points``.
	        filter_depth_edges: Zero the confidence of pixels flagged by
	            ``depth_edge``.
	        depth_edge_rtol: ``rtol`` forwarded to ``depth_edge``.

	    Returns:
	        The scene after ``apply_scene_alignment`` has been applied.

	    Raises:
	        ValueError: If ``predictions`` is not a dictionary.
	    """
	    if not isinstance(predictions, dict):
	        raise ValueError("predictions must be a dictionary")

	    # The percentile cut-off is never allowed below 2.
	    conf_thres = max(2.0, float(conf_thres))

	    world_points = predictions["world_points_from_depth"]
	    confidence = predictions["depth_conf"]
	    if filter_depth_edges and "depth" in predictions:
	        # Work on a copy so the caller's confidence array is left untouched.
	        confidence = confidence.copy()
	        edge_pixels = depth_edge(predictions["depth"][..., 0], rtol=depth_edge_rtol)
	        confidence[edge_pixels] = 0.0
	    images = predictions["images"]
	    camera_matrices = predictions["extrinsic"]

	    if mask_sky and target_dir is not None:
	        confidence = apply_sky_mask(confidence, target_dir)

	    # Flatten everything to one row per pixel.
	    flat_points = world_points.reshape(-1, 3)
	    flat_conf = confidence.reshape(-1)
	    # Colours are scaled by 255, so images are presumably in [0, 1] - TODO confirm.
	    flat_colors = _images_to_rgb(images).reshape(-1, 3)
	    flat_colors = (flat_colors * 255).clip(0, 255).astype(np.uint8)

	    keep = np.isfinite(flat_points).all(axis=1) & np.isfinite(flat_conf)
	    if conf_thres > 0 and np.any(keep):
	        cutoff = np.percentile(flat_conf[keep], conf_thres)
	        keep &= flat_conf >= cutoff
	    keep &= flat_conf > 1e-5

	    if mask_black_bg:
	        keep &= flat_colors.sum(axis=1) >= 16
	    if mask_white_bg:
	        keep &= ~(flat_colors > 240).all(axis=1)

	    flat_points, flat_colors = _limit_points(flat_points[keep], flat_colors[keep], max_points)

	    if flat_points.size == 0:
	        # Nothing survived the filters: fall back to a single white point at the origin.
	        flat_points = np.array([[0.0, 0.0, 0.0]], dtype=np.float32)
	        flat_colors = np.array([[255, 255, 255]], dtype=np.uint8)
	        scene_scale = 1.0
	    else:
	        # Extent between the 5th and 95th percentile along each axis.
	        low = np.percentile(flat_points, 5, axis=0)
	        high = np.percentile(flat_points, 95, axis=0)
	        scene_scale = float(np.linalg.norm(high - low))
	        if scene_scale <= 0:
	            scene_scale = 1.0

	    scene = trimesh.Scene()
	    scene.add_geometry(trimesh.PointCloud(vertices=flat_points, colors=flat_colors))

	    # Promote the 3x4 camera matrices to 4x4 by appending the row [0, 0, 0, 1].
	    num_cameras = len(camera_matrices)
	    extrinsics = np.zeros((num_cameras, 4, 4), dtype=np.float64)
	    extrinsics[:, 3, 3] = 1.0
	    extrinsics[:, :3, :4] = camera_matrices

	    if show_cam:
	        colormap = colormaps.get_cmap("gist_rainbow")
	        color_steps = max(len(extrinsics), 1)
	        for index, world_to_camera in enumerate(extrinsics):
	            red, green, blue = colormap(index / color_steps)[:3]
	            marker_color = (int(255 * red), int(255 * green), int(255 * blue))
	            integrate_camera_into_scene(
	                scene,
	                np.linalg.inv(world_to_camera),
	                marker_color,
	                scene_scale,
	            )

	    return apply_scene_alignment(scene, extrinsics)


	def _images_to_rgb(images: np.ndarray) -> np.ndarray:
	    """Move the channel axis last when a 4-D array has 3 entries on axis 1.

	    Any other input is returned unchanged.
	    """
	    channels_first = images.ndim == 4 and images.shape[1] == 3
	    if not channels_first:
	        return images
	    # (N, 3, H, W) -> (N, H, W, 3)
	    return np.transpose(images, (0, 2, 3, 1))


	def _limit_points(vertices: np.ndarray, colors: np.ndarray, max_points: int) -> tuple[np.ndarray, np.ndarray]:
	    """Subsample vertices and colors to at most ``max_points`` evenly spaced rows.

	    The inputs are returned as-is when ``max_points`` is not positive or the
	    point count is already within the limit.
	    """
	    total = len(vertices)
	    if 0 < max_points < total:
	        # Evenly spaced row indices from the first to the last point.
	        picks = np.linspace(0, total - 1, max_points).astype(np.int64)
	        return vertices[picks], colors[picks]
	    return vertices, colors


	def depth_edge(depth: np.ndarray, rtol: float = 0.03, kernel_size: int = 3) -> np.ndarray:
	    """Flag pixels whose local depth range is large relative to their own depth.

	    For every pixel the max and min depth inside a ``kernel_size`` square
	    window (edge-padded at the borders) are compared; the pixel is flagged
	    when ``(max - min) / max(|depth|, 1e-6)`` exceeds ``rtol``.

	    Args:
	        depth: Array whose last two axes are treated as the image axes.
	        rtol: Relative threshold on the local depth range.
	        kernel_size: Side length of the square window.

	    Returns:
	        Boolean array with the same shape as ``depth``.
	    """
	    depth = np.asarray(depth)
	    full_shape = depth.shape
	    # Collapse all leading axes so each map is handled as (index, H, W).
	    maps = depth.reshape(-1, *full_shape[-2:])
	    rows, cols = maps.shape[-2], maps.shape[-1]

	    border = kernel_size // 2
	    padded = np.pad(maps, ((0, 0), (border, border), (border, border)), mode="edge")

	    local_max = np.full_like(maps, -np.inf)
	    local_min = np.full_like(maps, np.inf)
	    offsets = [(dy, dx) for dy in range(kernel_size) for dx in range(kernel_size)]
	    for dy, dx in offsets:
	        # Shifted view of the padded maps covering one window position.
	        shifted = padded[:, dy : dy + rows, dx : dx + cols]
	        local_max = np.maximum(local_max, shifted)
	        local_min = np.minimum(local_min, shifted)

	    # Guard the division against zero depth.
	    denominator = np.maximum(np.abs(maps), 1e-6)
	    relative_jump = (local_max - local_min) / denominator
	    return (relative_jump > rtol).reshape(full_shape)


	def integrate_camera_into_scene(scene: trimesh.Scene, transform: np.ndarray, face_colors: tuple, scene_scale: float):
	    """Add a cone-based camera marker to ``scene``.

	    Args:
	        scene: Scene that receives the marker mesh.
	        transform: 4x4 matrix applied to the marker after the OpenGL
	            conversion and the local cone placement.
	        face_colors: RGB values written to every face of the marker.
	        scene_scale: Scene extent; the cone radius is 5% and its height 10%
	            of this value.
	    """
	    cone_radius = scene_scale * 0.05
	    cone_height = scene_scale * 0.1

	    def z_rotation(angle_deg: float) -> np.ndarray:
	        # 4x4 matrix rotating about the z axis by the given angle in degrees.
	        pose = np.eye(4)
	        pose[:3, :3] = Rotation.from_euler("z", angle_deg, degrees=True).as_matrix()
	        return pose

	    # Local placement: rotate the 4-sided cone by 45 degrees and shift it
	    # along -z by its own height.
	    local_pose = z_rotation(45)
	    local_pose[2, 3] = -cone_height
	    marker_pose = transform @ get_opengl_conversion_matrix() @ local_pose

	    cone = trimesh.creation.cone(cone_radius, cone_height, sections=4)
	    cone_vertices = cone.vertices

	    # Three copies of the cone vertices: original, shrunk to 95%, and rotated
	    # by 2 degrees. compute_camera_faces connects them into the marker faces.
	    stacked = np.concatenate(
	        [
	            cone_vertices,
	            0.95 * cone_vertices,
	            transform_points(z_rotation(2), cone_vertices),
	        ]
	    )
	    placed = transform_points(marker_pose, stacked)

	    marker = trimesh.Trimesh(vertices=placed, faces=compute_camera_faces(cone))
	    marker.visual.face_colors[:, :3] = face_colors
	    scene.add_geometry(marker)


	def apply_scene_alignment(scene: trimesh.Scene, extrinsics: np.ndarray) -> trimesh.Scene:
	    """Transform ``scene`` by the inverse of the first extrinsic and the OpenGL flip.

	    The scene is modified through ``scene.apply_transform`` and returned.
	    """
	    first_extrinsic_inverse = np.linalg.inv(extrinsics[0])
	    alignment = first_extrinsic_inverse @ get_opengl_conversion_matrix()
	    scene.apply_transform(alignment)
	    return scene


	def get_opengl_conversion_matrix() -> np.ndarray:
	    """Return a 4x4 identity matrix with the Y and Z diagonal entries negated."""
	    flip = np.identity(4)
	    # Negate entries (1, 1) and (2, 2) in one indexed assignment.
	    flip[[1, 2], [1, 2]] = -1
	    return flip


	def transform_points(transformation: np.ndarray, points: np.ndarray, dim: int \| None = None) -> np.ndarray:
	points = np.asarray(points)
	initial_shape = points.shape[:-1]
	dim = dim or points.shape[-1]
	transformation = transformation.swapaxes(-1, -2)
	points = points @ transformation[..., :-1, :] + transformation[..., -1:, :]
	return points[..., :dim].reshape(*initial_shape, dim)


	def compute_camera_faces(cone_shape: trimesh.Trimesh) -> np.ndarray:
	faces = []
	num_vertices = len(cone_shape.vertices)

	for face in cone_shape.faces:
	if 0 in face:
	continue
	v1, v2, v3 = face
	v1_offset, v2_offset, v3_offset = face + num_vertices
	v1_offset_2, v2_offset_2, v3_offset_2 = face + 2 * num_vertices

	faces.extend(
	[
	(v1, v2, v2_offset),
	(v1, v1_offset, v3),
	(v3_offset, v2, v3),
	(v1, v2, v2_offset_2),
	(v1, v1_offset_2, v3),
	(v3_offset_2, v2, v3),
	]
	)

	faces += [(v3, v2, v1) for v1, v2, v3 in faces]
	return np.array(faces)


	def apply_sky_mask(conf: np.ndarray, target_dir: str) -> np.ndarray:
	    """Zero the confidence of pixels rejected by the per-image sky masks.

	    For every file in ``<target_dir>/images`` (sorted by name) the mask is read
	    from ``<target_dir>/sky_masks`` when a file of the same name exists there;
	    otherwise it is produced with ``segment_sky``. The ``skyseg.onnx`` model is
	    downloaded into the current working directory if it is missing.

	    Args:
	        conf: Confidence array; its last two axes give the mask height/width.
	        target_dir: Directory containing the ``images`` folder.

	    Returns:
	        ``conf`` multiplied by a 0/1 float32 mask that is 1 where the mask
	        value exceeds 0.1.
	    """
	    image_dir = os.path.join(target_dir, "images")
	    image_names = sorted(os.listdir(image_dir))
	    height, width = conf.shape[-2:]
	    mask_dir = os.path.join(target_dir, "sky_masks")

	    session = None  # Created lazily, only if some mask has to be computed.
	    frame_masks = []
	    for name in image_names:
	        mask_path = os.path.join(mask_dir, name)
	        if os.path.exists(mask_path):
	            frame_mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
	        else:
	            if not os.path.exists("skyseg.onnx"):
	                download_file_from_url(
	                    "https://huggingface.co/JianyuanWang/skyseg/resolve/main/skyseg.onnx",
	                    "skyseg.onnx",
	                )
	            if session is None:
	                import onnxruntime

	                session = onnxruntime.InferenceSession("skyseg.onnx")
	            frame_mask = segment_sky(os.path.join(image_dir, name), session, mask_path)

	        # Bring the mask to the resolution of the confidence maps.
	        if frame_mask.shape != (height, width):
	            frame_mask = cv2.resize(frame_mask, (width, height))
	        frame_masks.append(frame_mask)

	    keep = (np.array(frame_masks) > 0.1).astype(np.float32)
	    return conf * keep


	def segment_sky(image_path: str, onnx_session, mask_filename: str) -> np.ndarray:
	    """Segment one image with the sky model, save the binary mask and return it.

	    Args:
	        image_path: Path of the image to segment.
	        onnx_session: Inference session forwarded to ``run_skyseg``.
	        mask_filename: Path the mask image is written to; its parent
	            directory is created when needed.

	    Returns:
	        Mask at the image resolution: 255 where the model response is below
	        32, 0 elsewhere.

	    Raises:
	        ValueError: If the image cannot be read.
	    """
	    image = cv2.imread(image_path)
	    if image is None:
	        # cv2.imread signals a missing or undecodable file by returning None;
	        # fail here with a clear message instead of deep inside cv2.resize.
	        raise ValueError(f"could not read image: {image_path}")

	    result_map = run_skyseg(onnx_session, [320, 320], image)
	    # Back to the original image resolution (cv2 expects (width, height)).
	    result_map = cv2.resize(result_map, (image.shape[1], image.shape[0]))

	    output_mask = np.zeros_like(result_map)
	    output_mask[result_map < 32] = 255

	    # os.makedirs("") raises, so only create a directory when the path has one.
	    mask_dir = os.path.dirname(mask_filename)
	    if mask_dir:
	        os.makedirs(mask_dir, exist_ok=True)
	    cv2.imwrite(mask_filename, output_mask)
	    return output_mask


	def run_skyseg(onnx_session, input_size: list[int], image: np.ndarray) -> np.ndarray:
	    """Run the sky-segmentation model on one BGR image.

	    Args:
	        onnx_session: Session providing ``get_inputs``, ``get_outputs`` and
	            ``run``; its first input and first output are used.
	        input_size: Network input size as ``[width, height]`` (the order
	            ``cv2.resize`` expects for ``dsize``).
	        image: Image in BGR channel order, as returned by ``cv2.imread``.

	    Returns:
	        The model output min-max scaled to 0..255 as ``uint8``; all zeros
	        when the output is constant.
	    """
	    width, height = input_size[0], input_size[1]
	    image = cv2.resize(image, dsize=(width, height))
	    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
	    image = np.array(image, dtype=np.float32)
	    # Per-channel mean/std normalisation of the 0..1 scaled RGB image.
	    image = (image / 255 - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
	    # HWC -> CHW, then add the batch axis. Adding an axis (rather than
	    # reshaping to (-1, 3, input_size[0], input_size[1])) keeps the pixel
	    # layout intact when width and height differ.
	    image = np.ascontiguousarray(image.transpose(2, 0, 1)[np.newaxis], dtype=np.float32)

	    input_name = onnx_session.get_inputs()[0].name
	    output_name = onnx_session.get_outputs()[0].name
	    result = onnx_session.run([output_name], {input_name: image})
	    result = np.array(result).squeeze()

	    result_min = np.min(result)
	    result_max = np.max(result)
	    if result_max > result_min:
	        result = (result - result_min) / (result_max - result_min)
	    else:
	        # Constant response: avoid dividing by zero.
	        result = np.zeros_like(result)
	    return (result * 255).astype("uint8")


	def download_file_from_url(
	    url: str,
	    filename: str,
	    timeout: float | tuple[float, float] | None = 30.0,
	) -> None:
	    """Download ``url`` to ``filename`` via a temporary ``.tmp`` file.

	    The data is streamed to ``<filename>.tmp`` and moved into place with
	    ``os.replace`` only after the download has finished, so an interrupted
	    download never leaves a truncated ``filename`` behind.

	    Args:
	        url: Address to fetch.
	        filename: Destination path.
	        timeout: Timeout in seconds passed to ``requests.get`` (a single
	            value or a ``(connect, read)`` pair); ``None`` waits forever.

	    Raises:
	        requests.RequestException: On connection problems, timeouts or a
	            non-success HTTP status.
	        OSError: If the file cannot be written or moved.
	    """
	    tmp_filename = f"{filename}.tmp"
	    try:
	        # The context manager releases the connection even if writing fails.
	        with requests.get(url, stream=True, timeout=timeout) as response:
	            response.raise_for_status()
	            with open(tmp_filename, "wb") as f:
	                for chunk in response.iter_content(chunk_size=8192):
	                    if chunk:  # skip keep-alive chunks
	                        f.write(chunk)
	        os.replace(tmp_filename, filename)
	    except BaseException:
	        # Do not leave a partial download lying around.
	        try:
	            os.remove(tmp_filename)
	        except FileNotFoundError:
	            pass
	        raise