zeyuren2002

Add files using upload-large-folder tool

4165f20 verified 12 days ago

7.71 kB

	# Reference: https://github.com/EnVision-Research/Lotus
	# Strictly follows official `infer.py`:
	# from pipeline import LotusGPipeline, LotusDPipeline
	# pipeline = LotusXPipeline.from_pretrained(args.pretrained_model_name_or_path, torch_dtype=dtype)
	# # image in [-1, 1] tensor, shape (1, 3, H, W)
	# task_emb = torch.tensor([1, 0]).float().unsqueeze(0)
	# task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)], dim=-1)
	# pred = pipeline(rgb_in=image, prompt='', num_inference_steps=1,
	# timesteps=[args.timestep], task_emb=task_emb,
	# processing_res=processing_res, match_input_res=match_input_res,
	# resample_method=resample_method).images[0]
	# if args.task_name == 'depth':
	# output_npy = pred.mean(axis=-1)
	#
	# Default released depth checkpoints (per README):
	# jingheya/lotus-depth-g-v1-0 (generation, depth)
	# jingheya/lotus-depth-d-v1-0 (regression, depth)
	# jingheya/lotus-depth-g-v2-1-disparity (generation, disparity)
	# jingheya/lotus-depth-d-v2-0-disparity (regression, disparity)
	# Output key depends on whether the checkpoint predicts depth or disparity.

	import os
	import sys
	from typing import *
	from pathlib import Path

	import click
	import torch
	import torch.nn.functional as F
	import numpy as np

	from moge.test.baseline import MGEBaselineInterface


	class Baseline(MGEBaselineInterface):
	    """Lotus (EnVision-Research/Lotus) wrapped as a MoGe evaluation baseline.

	    The wrapper loads one of the two Lotus pipelines (generation ``G`` or
	    regression ``D``), runs a single denoising step per image and returns the
	    prediction under the key that matches what the checkpoint predicts:
	    ``depth_affine_invariant`` or ``disparity_affine_invariant``.
	    """

	    def __init__(self, repo_path: str, pretrained: str, mode: str, task_name: str,
	                 disparity: bool, timestep: int, processing_res: Optional[int],
	                 half_precision: bool, seed: Optional[int], device: Union[torch.device, str]):
	        """Load the Lotus pipeline and store the inference settings.

	        Args:
	            repo_path: Path to a local clone of the Lotus repository.
	            pretrained: Hugging Face checkpoint name or local directory.
	            mode: ``'generation'`` (LotusGPipeline) or ``'regression'`` (LotusDPipeline).
	            task_name: Task to run; only ``'depth'`` is supported by ``infer``.
	            disparity: True if the checkpoint predicts disparity instead of depth.
	            timestep: Single diffusion timestep passed to the pipeline.
	            processing_res: Processing resolution forwarded to the pipeline (may be None).
	            half_precision: Load and run the pipeline in float16 instead of float32.
	            seed: Seed for the pipeline's random generator; None disables seeding.
	            device: Device the pipeline is moved to.

	        Raises:
	            FileNotFoundError: If ``repo_path`` does not exist.
	            ValueError: If ``mode`` is not one of the two supported values.
	        """
	        import warnings

	        repo_path = os.path.abspath(repo_path)
	        if not Path(repo_path).exists():
	            raise FileNotFoundError(
	                f"Cannot find Lotus repo at {repo_path}. Clone https://github.com/EnVision-Research/Lotus."
	            )

	        # The released `*-disparity` checkpoints predict disparity. Without the
	        # `disparity` flag their output would be labelled as depth, so make the
	        # likely misconfiguration visible instead of silently mislabelling it.
	        if not disparity and 'disparity' in str(pretrained).lower():
	            warnings.warn(
	                f"Checkpoint '{pretrained}' looks like a disparity checkpoint but the "
	                f"disparity flag is not set; predictions will be emitted as "
	                f"'depth_affine_invariant'.",
	                stacklevel=2,
	            )

	        # Lotus' pipeline / utils packages are at the repo root. Force the repo to
	        # the FRONT of sys.path even if it is already listed further down, because
	        # the re-import below relies on it being resolved first.
	        if repo_path in sys.path:
	            sys.path.remove(repo_path)
	        sys.path.insert(0, repo_path)
	        # MoGe's dataloader imports a different top-level package also named `pipeline`
	        # (from EasternJournalist/pipeline). It is already cached in sys.modules by the
	        # time we reach here, so `from pipeline import LotusGPipeline` would resolve to
	        # the wrong module. Evict the cached entry so Python re-resolves against
	        # Lotus' repo (which is first on sys.path).
	        sys.modules.pop('pipeline', None)
	        from pipeline import LotusGPipeline, LotusDPipeline

	        device = torch.device(device)
	        dtype = torch.float16 if half_precision else torch.float32

	        pipeline_classes = {
	            'generation': LotusGPipeline,
	            'regression': LotusDPipeline,
	        }
	        if mode not in pipeline_classes:
	            raise ValueError(f"Invalid mode: {mode}")
	        pipeline = pipeline_classes[mode].from_pretrained(pretrained, torch_dtype=dtype)
	        pipeline = pipeline.to(device)
	        pipeline.set_progress_bar_config(disable=True)

	        self.pipeline = pipeline
	        self.device = device
	        self.dtype = dtype
	        self.mode = mode
	        self.task_name = task_name
	        self.disparity = disparity
	        self.timestep = timestep
	        self.processing_res = processing_res
	        # A seeded generator makes runs reproducible; None leaves seeding to the pipeline.
	        self.generator = torch.Generator(device=device).manual_seed(seed) if seed is not None else None

	    # The click options below mirror the constructor arguments one-to-one; `load`
	    # simply forwards the parsed values to `Baseline(...)`.
	    @click.command()
	    @click.option('--repo', 'repo_path', type=click.Path(), default='../Lotus',
	                  help='Path to the EnVision-Research/Lotus repository.')
	    @click.option('--pretrained', type=str, default='jingheya/lotus-depth-d-v2-0-disparity',
	                  help='HF checkpoint name or local dir. README default disparity v2 is recommended.')
	    @click.option('--mode', type=click.Choice(['generation', 'regression']), default='regression',
	                  help='Which Lotus pipeline (G/generation or D/regression).')
	    @click.option('--task_name', type=click.Choice(['depth', 'normal']), default='depth')
	    @click.option('--disparity', is_flag=True,
	                  help='Set if the checkpoint predicts disparity (e.g. *-disparity ckpts).')
	    @click.option('--timestep', type=int, default=999)
	    @click.option('--processing_res', type=int, default=None,
	                  help='Pipeline processing resolution. None uses default in checkpoint.')
	    @click.option('--fp16', 'half_precision', is_flag=True, help='Run in half precision.')
	    @click.option('--seed', type=int, default=None, help='Reproducibility seed (Lotus eval.sh uses 42).')
	    @click.option('--device', type=str, default='cuda')
	    @staticmethod
	    def load(repo_path: str, pretrained: str, mode: str, task_name: str, disparity: bool,
	             timestep: int, processing_res: Optional[int], half_precision: bool,
	             seed: Optional[int], device: str = 'cuda'):
	        """Build a Lotus baseline from command-line options."""
	        return Baseline(repo_path, pretrained, mode, task_name, disparity, timestep,
	                        processing_res, half_precision, seed, device)

	    @torch.inference_mode()
	    def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
	        """Predict affine-invariant depth or disparity for a single image.

	        Args:
	            image: Tensor of shape (C, H, W) or (1, C, H, W). The normalization
	                below assumes values in [0, 1].
	            intrinsics: Accepted for interface compatibility; not used.

	        Returns:
	            A dict with exactly one entry, ``'disparity_affine_invariant'`` when
	            the baseline was configured with ``disparity=True`` and
	            ``'depth_affine_invariant'`` otherwise. The value has shape (H, W)
	            for an unbatched input and (1, H, W) for a batched one.

	        Raises:
	            ValueError: If the batch size is not 1.
	            NotImplementedError: If the configured task is not ``'depth'``.
	        """
	        omit_batch = image.ndim == 3
	        if omit_batch:
	            image = image.unsqueeze(0)
	        # Explicit raise instead of `assert`: assertions vanish under `python -O`,
	        # which would silently drop every image after the first.
	        if image.shape[0] != 1:
	            raise ValueError("Lotus baseline only supports batch size 1")
	        # Fail before the (expensive) forward pass rather than after it.
	        if self.task_name != 'depth':
	            raise NotImplementedError("Normal task is not exposed by this baseline.")
	        _, _, H, W = image.shape

	        # infer.py converts uint8 [0,255] to [-1,1] via `/127.5 - 1.0`. MoGe gives [0,1] floats,
	        # so the equivalent normalization is `image * 2 - 1`.
	        rgb_in = (image.to(self.device, dtype=self.dtype) * 2.0 - 1.0)

	        # Task embedding as built in the official infer.py: [sin(e), cos(e)] of e = [1, 0].
	        task_emb = torch.tensor([1, 0], device=self.device, dtype=self.dtype).unsqueeze(0)
	        task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)], dim=-1)

	        pred = self.pipeline(
	            rgb_in=rgb_in,
	            prompt='',
	            num_inference_steps=1,
	            generator=self.generator,
	            output_type='np',
	            timesteps=[self.timestep],
	            task_emb=task_emb,
	            processing_res=self.processing_res,
	            match_input_res=True,
	            resample_method='bilinear',
	        ).images[0]

	        # Per infer.py: depth uses mean over channels; pred is HxWx3 in [0, 1].
	        arr = pred.mean(axis=-1)
	        depth_or_disp = torch.from_numpy(np.ascontiguousarray(arr)).to(self.device).float()
	        # Safety net in case the pipeline did not return the input resolution.
	        if depth_or_disp.shape != (H, W):
	            depth_or_disp = F.interpolate(depth_or_disp[None, None], size=(H, W),
	                                          mode='bilinear', align_corners=False)[0, 0]

	        # Lotus disparity ckpts: model physically predicts disparity in [0, 1]. Emit
	        # ONLY `disparity_affine_invariant`. We previously synthesized `depth_affine_invariant`
	        # via 1/disp, but this is numerically unstable near disparity=0 — the resulting
	        # depth-space affine alignment is dominated by inverted-small-disparity outliers,
	        # not by the model's actual depth quality. Cross-comparison with depth-emitting
	        # models happens via MoGe's fall-through to `disparity_affine_invariant` (1/depth),
	        # which IS numerically stable.
	        # Lotus depth ckpt: directly affine-invariant depth.
	        key = 'disparity_affine_invariant' if self.disparity else 'depth_affine_invariant'
	        if not omit_batch:
	            # Restore the batch dimension the caller passed in.
	            depth_or_disp = depth_or_disp.unsqueeze(0)
	        return {key: depth_or_disp}