Instructions to use zeyuren2002/EvalMDE with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use zeyuren2002/EvalMDE with Diffusers:
pip install -U diffusers transformers accelerate
```python
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("zeyuren2002/EvalMDE", dtype=torch.bfloat16, device_map="cuda")

prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt).images[0]
```
- Notebooks
- Google Colab
- Kaggle
| # Reference: https://github.com/EnVision-Research/Lotus | |
| # Strictly follows official `infer.py`: | |
| # from pipeline import LotusGPipeline, LotusDPipeline | |
| # pipeline = LotusXPipeline.from_pretrained(args.pretrained_model_name_or_path, torch_dtype=dtype) | |
| # # image in [-1, 1] tensor, shape (1, 3, H, W) | |
| # task_emb = torch.tensor([1, 0]).float().unsqueeze(0) | |
| # task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)], dim=-1) | |
| # pred = pipeline(rgb_in=image, prompt='', num_inference_steps=1, | |
| # timesteps=[args.timestep], task_emb=task_emb, | |
| # processing_res=processing_res, match_input_res=match_input_res, | |
| # resample_method=resample_method).images[0] | |
| # if args.task_name == 'depth': | |
| # output_npy = pred.mean(axis=-1) | |
| # | |
| # Default released depth checkpoints (per README): | |
| # jingheya/lotus-depth-g-v1-0 (generation, depth) | |
| # jingheya/lotus-depth-d-v1-0 (regression, depth) | |
| # jingheya/lotus-depth-g-v2-1-disparity (generation, disparity) | |
| # jingheya/lotus-depth-d-v2-0-disparity (regression, disparity) | |
| # Output key depends on whether the checkpoint predicts depth or disparity. | |
| import os | |
| import sys | |
| from typing import * | |
| from pathlib import Path | |
| import click | |
| import torch | |
| import torch.nn.functional as F | |
| import numpy as np | |
| from moge.test.baseline import MGEBaselineInterface | |
class Baseline(MGEBaselineInterface):
    """Baseline adapter that runs a Lotus pipeline in a single inference step.

    The Lotus repository is imported from a local checkout (``repo_path``);
    either ``LotusGPipeline`` (``mode='generation'``) or ``LotusDPipeline``
    (``mode='regression'``) is loaded from ``pretrained``. ``infer`` returns
    the prediction under ``disparity_affine_invariant`` or
    ``depth_affine_invariant`` depending on the ``disparity`` flag.
    """

    def __init__(self, repo_path: str, pretrained: str, mode: str, task_name: str,
                 disparity: bool, timestep: int, processing_res: Optional[int],
                 half_precision: bool, seed: Optional[int], device: Union[torch.device, str]):
        """Load the Lotus pipeline and store the inference settings.

        Args:
            repo_path: Path to a local clone of the Lotus repository; it is
                prepended to ``sys.path`` so its ``pipeline`` module can be imported.
            pretrained: Checkpoint identifier forwarded to ``from_pretrained``.
            mode: ``'generation'`` (LotusGPipeline) or ``'regression'`` (LotusDPipeline).
            task_name: Task selector; ``infer`` only supports ``'depth'``.
            disparity: If true, ``infer`` emits ``disparity_affine_invariant``;
                otherwise ``depth_affine_invariant``.
            timestep: Single timestep passed to the pipeline as ``timesteps=[timestep]``.
            processing_res: Forwarded unchanged to the pipeline's ``processing_res``.
            half_precision: Use ``torch.float16`` when true, else ``torch.float32``.
            seed: Seed for a ``torch.Generator`` created on ``device``; ``None``
                means no generator is passed to the pipeline.
            device: Device the pipeline is moved to.

        Raises:
            FileNotFoundError: If ``repo_path`` does not exist.
            ValueError: If ``mode`` is not one of the two supported values.
        """
        repo_path = os.path.abspath(repo_path)
        if not Path(repo_path).exists():
            raise FileNotFoundError(
                f"Cannot find Lotus repo at {repo_path}. Clone https://github.com/EnVision-Research/Lotus."
            )

        # Lotus' pipeline / utils packages are at the repo root.
        if repo_path not in sys.path:
            sys.path.insert(0, repo_path)

        # MoGe's dataloader imports a different top-level package also named `pipeline`
        # (from EasternJournalist/pipeline). It is already cached in sys.modules by the
        # time we reach here, so `from pipeline import LotusGPipeline` would resolve to
        # the wrong module. Evict the cached entry so Python re-resolves against
        # Lotus' repo (which is first on sys.path).
        sys.modules.pop('pipeline', None)
        from pipeline import LotusGPipeline, LotusDPipeline

        device = torch.device(device)
        dtype = torch.float16 if half_precision else torch.float32

        if mode == 'generation':
            pipeline = LotusGPipeline.from_pretrained(pretrained, torch_dtype=dtype)
        elif mode == 'regression':
            pipeline = LotusDPipeline.from_pretrained(pretrained, torch_dtype=dtype)
        else:
            raise ValueError(f"Invalid mode: {mode}")
        pipeline = pipeline.to(device)
        pipeline.set_progress_bar_config(disable=True)

        self.pipeline = pipeline
        self.device = device
        self.dtype = dtype
        self.mode = mode
        self.task_name = task_name
        self.disparity = disparity
        self.timestep = timestep
        self.processing_res = processing_res
        # The same generator object is reused by every `infer` call.
        self.generator = torch.Generator(device=device).manual_seed(seed) if seed is not None else None

    # NOTE(review): `click` is imported by this file but no CLI decorators are
    # visible on `load` — confirm whether command/option decorators belong here.
    @staticmethod
    def load(repo_path: str, pretrained: str, mode: str, task_name: str, disparity: bool,
             timestep: int, processing_res: Optional[int], half_precision: bool,
             seed: Optional[int], device: str = 'cuda'):
        """Construct a ``Baseline`` from the given settings (``device`` defaults to ``'cuda'``).

        Declared as a static method because it takes no ``self``; this keeps
        ``Baseline.load(...)`` working and also allows calling it on an instance.
        """
        return Baseline(repo_path, pretrained, mode, task_name, disparity, timestep,
                        processing_res, half_precision, seed, device)

    def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Run one Lotus forward pass on a single image.

        Args:
            image: Tensor with 3 dims, or 4 dims with a leading batch of size 1.
                Values are expected to be floats in [0, 1]; they are rescaled
                to [-1, 1] before being fed to the pipeline.
            intrinsics: Accepted for interface compatibility; not used.

        Returns:
            A dict with exactly one entry, ``disparity_affine_invariant`` when
            ``self.disparity`` is true and ``depth_affine_invariant`` otherwise.
            The map has the input's spatial size, with a leading batch dim
            only when the input had one.

        Raises:
            AssertionError: If the batch size is not 1.
            NotImplementedError: If ``self.task_name`` is not ``'depth'``.
        """
        omit_batch = image.ndim == 3
        if omit_batch:
            image = image.unsqueeze(0)
        # Explicit raise (instead of `assert`) so the check survives `python -O`;
        # the exception type and message are kept for backward compatibility.
        if image.shape[0] != 1:
            raise AssertionError("Lotus baseline only supports batch size 1")
        # Reject unsupported tasks before spending a full pipeline forward pass.
        if self.task_name != 'depth':
            raise NotImplementedError("Normal task is not exposed by this baseline.")
        _, _, H, W = image.shape

        # infer.py converts uint8 [0,255] to [-1,1] via `/127.5 - 1.0`. MoGe gives [0,1] floats,
        # so the equivalent normalization is `image * 2 - 1`.
        rgb_in = image.to(self.device, dtype=self.dtype) * 2.0 - 1.0

        # Build the sin/cos task embedding in float32 (as the reference snippet does
        # with `.float()`), then cast to the pipeline dtype.
        task_emb = torch.tensor([1, 0], device=self.device).float().unsqueeze(0)
        task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)], dim=-1).to(self.dtype)

        pred = self.pipeline(
            rgb_in=rgb_in,
            prompt='',
            num_inference_steps=1,
            generator=self.generator,
            output_type='np',
            timesteps=[self.timestep],
            task_emb=task_emb,
            processing_res=self.processing_res,
            match_input_res=True,
            resample_method='bilinear',
        ).images[0]

        # Per infer.py: depth uses mean over channels; pred is HxWx3 in [0, 1].
        arr = pred.mean(axis=-1)

        depth_or_disp = torch.from_numpy(np.ascontiguousarray(arr)).to(self.device).float()
        # Safety net in case the pipeline output does not match the input resolution.
        if depth_or_disp.shape != (H, W):
            depth_or_disp = F.interpolate(depth_or_disp[None, None], size=(H, W),
                                          mode='bilinear', align_corners=False)[0, 0]

        # Lotus disparity ckpts: model physically predicts disparity in [0, 1]. Emit
        # ONLY `disparity_affine_invariant`. We previously synthesized `depth_affine_invariant`
        # via 1/disp, but this is numerically unstable near disparity=0 — the resulting
        # depth-space affine alignment is dominated by inverted-small-disparity outliers,
        # not by the model's actual depth quality. Cross-comparison with depth-emitting
        # models happens via MoGe's fall-through to `disparity_affine_invariant` (1/depth),
        # which IS numerically stable.
        if self.disparity:
            result = {'disparity_affine_invariant': depth_or_disp}
        else:
            # Lotus depth ckpt: directly affine-invariant depth.
            result = {'depth_affine_invariant': depth_or_disp}

        # Restore the batch dimension only if the caller supplied one.
        if not omit_batch:
            result = {key: value.unsqueeze(0) for key, value in result.items()}
        return result