Instructions for using zeyuren2002/EvalMDE with supported libraries, notebooks, and local apps.
How to use zeyuren2002/EvalMDE with Diffusers:

```
pip install -U diffusers transformers accelerate
```

```python
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for Apple devices
pipe = DiffusionPipeline.from_pretrained(
    "zeyuren2002/EvalMDE", dtype=torch.bfloat16, device_map="cuda"
)

prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt).images[0]
```
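`pipe(prompt).images` is a list of PIL images, so the result can be written to disk with `image.save("output.png")`.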
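The script below wraps FE2E (https://github.com/AMAP-ML/FE2E) as a depth-estimation baseline behind the `MGEBaselineInterface` from `moge.test.baseline`, running the official `ImageGenerator` on a single GPU instead of the distributed `evaluation.py` entrypoint.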
```python
# Reference: https://github.com/AMAP-ML/FE2E
# Strictly follows the official `infer/inference.py::ImageGenerator` API.
# The distributed entrypoint `evaluation.py` is bypassed; we use the same
# `ImageGenerator` class on a single GPU. README usage (for depth):
#   python -u evaluation.py \
#       --model_path ./pretrain --lora ./lora/LDRN.safetensors --single_denoise \
#       --prompt_type empty --norm_type ln --task_name depth ...
#
# Important calling convention (from inference.py):
#   ImageGenerator.__init__ requires an `args` namespace with at least:
#     args.prompt_type (='empty' to skip Qwen),
#     args.single_denoise (sets num_steps = 1 via FE2E's parse_args),
#     args.empty_prompt_cache (path to latent/no_info.npz).
#   `generate_image(prompt, negative_prompt, ref_images=PIL_or_tensor, num_steps,
#                   cfg_guidance, seed, ..., args=args)` returns (images_list, Lpred, Rpred).
#   - Lpred: float tensor in [0, 1] (mul .5 + .5 applied); corresponds to the *edited*
#     frame (= depth output for FE2E's depth LoRA).
#   - Rpred: float tensor in [-1, 1]; the reconstructed reference RGB.
#
# Output key: `depth_affine_invariant` (Lpred mean over channels).
# NEEDS_VERIFICATION: Lpred vs Rpred meaning is inferred from generate_image semantics
# (Lpred = left/first frame = denoised target = depth) and confirmed by inner_evaluation.py
# using `Lpred` for depth metrics. Switch via `--use-rpred` if a sanity run shows the
# depth is actually carried by Rpred.
import os
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Dict, Optional, Union

import click
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image

from moge.test.baseline import MGEBaselineInterface

class Baseline(MGEBaselineInterface):

    def __init__(self, repo_path: str, model_path: str, lora_path: str,
                 qwen2vl_path: Optional[str], empty_prompt_cache: Optional[str],
                 num_steps: int, cfg_guidance: float, size_level: int,
                 prompt_type: str, single_denoise: bool, seed: int,
                 quantized: bool, offload: bool, use_rpred: bool,
                 device: Union[torch.device, str]):
        repo_path = os.path.abspath(repo_path)
        if not Path(repo_path).exists():
            raise FileNotFoundError(
                f"Cannot find FE2E repo at {repo_path}. Clone https://github.com/AMAP-ML/FE2E."
            )
        if repo_path not in sys.path:
            sys.path.insert(0, repo_path)
        # FE2E is not packaged, so its modules only become importable after the
        # repo root is added to sys.path; hence the deferred imports here.
        from infer.inference import ImageGenerator
        from infer.seed_all import seed_all

        seed_all(seed)
        def _resolve(p):
            return p if os.path.isabs(p) else os.path.join(repo_path, p)

        model_path = _resolve(model_path)
        lora_path = _resolve(lora_path)
        if qwen2vl_path is not None:
            qwen2vl_path = _resolve(qwen2vl_path)
        else:
            qwen2vl_path = os.path.join(repo_path, "Qwen")  # FE2E DEFAULT_QWEN_DIR
        if empty_prompt_cache is not None:
            empty_prompt_cache = _resolve(empty_prompt_cache)
        else:
            empty_prompt_cache = os.path.join(repo_path, "latent", "no_info.npz")

        # ImageGenerator reads several attrs off args (prompt_type, single_denoise,
        # empty_prompt_cache).
        ig_args = SimpleNamespace(
            prompt_type=prompt_type,
            single_denoise=single_denoise,
            empty_prompt_cache=empty_prompt_cache,
        )
        device = torch.device(device)
        ae_path = os.path.join(model_path, "vae.safetensors")
        dit_basename = "step1x-edit-i1258-FP8.safetensors" if quantized else "step1x-edit-i1258.safetensors"
        dit_path = os.path.join(model_path, dit_basename)
        # Fail fast if any required checkpoint or cache file is missing.
        for p in (ae_path, dit_path, lora_path, empty_prompt_cache):
            if not os.path.exists(p):
                raise FileNotFoundError(f"Missing required FE2E artifact: {p}")
        self.image_gen = ImageGenerator(
            ae_path=ae_path,
            dit_path=dit_path,
            qwen2vl_model_path=qwen2vl_path,
            max_length=640,
            quantized=quantized,
            offload=offload,
            lora=lora_path,
            device=str(device),
            args=ig_args,
        )

        self.device = device
        self.num_steps = 1 if single_denoise else num_steps
        self.cfg_guidance = cfg_guidance
        self.size_level = size_level
        self.seed = seed
        self.ig_args = ig_args
        self.use_rpred = use_rpred
    # No `self` parameter: this is a factory, so it must be a static method.
    @staticmethod
    def load(repo_path: str, model_path: str, lora_path: str,
             qwen2vl_path: Optional[str], empty_prompt_cache: Optional[str],
             num_steps: int, cfg_guidance: float, size_level: int,
             prompt_type: str, single_denoise: bool, seed: int,
             quantized: bool, offload: bool, use_rpred: bool,
             device: str = 'cuda') -> 'Baseline':
        return Baseline(repo_path, model_path, lora_path, qwen2vl_path,
                        empty_prompt_cache, num_steps, cfg_guidance, size_level,
                        prompt_type, single_denoise, seed, quantized, offload,
                        use_rpred, device)
    def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        omit_batch = image.ndim == 3
        if omit_batch:
            image = image.unsqueeze(0)
        assert image.shape[0] == 1, "FE2E baseline only supports batch size 1"
        _, _, H, W = image.shape

        # generate_image accepts PIL or torch.Tensor for ref_images; convert the
        # [0, 1] float tensor to a uint8 PIL image.
        arr = (image[0].cpu().permute(1, 2, 0).clamp(0, 1).numpy() * 255).astype(np.uint8)
        pil = Image.fromarray(arr)

        images_list, Lpred, Rpred = self.image_gen.generate_image(
            prompt='',
            negative_prompt='',
            ref_images=pil,
            num_samples=1,
            num_steps=self.num_steps,
            cfg_guidance=self.cfg_guidance,
            seed=self.seed,
            show_progress=False,
            size_level=self.size_level,
            args=self.ig_args,
        )

        # Lpred: [1, 3, h', w'] in [0, 1]; Rpred: [1, 3, h', w'] in [-1, 1].
        if self.use_rpred:
            pred = Rpred.clamp(-1, 1).mul(0.5).add(0.5)  # map [-1, 1] -> [0, 1]
        else:
            pred = Lpred  # already in [0, 1]

        # Mean over the channel dim to get scalar depth (same convention as
        # Marigold / Lotus / DepthMaster). The eval pipeline aligns affine afterwards.
        depth = pred[0].mean(dim=0).to(self.device).float()
        if depth.shape != (H, W):
            depth = F.interpolate(depth[None, None], size=(H, W),
                                  mode='bilinear', align_corners=False)[0, 0]

        # FE2E predicts affine-invariant depth via Step1X-Edit + LDRN LoRA (Wang et al., CVPR 2026).
        # Emit only this physical key.
        result = {'depth_affine_invariant': depth}
        if not omit_batch:
            result['depth_affine_invariant'] = result['depth_affine_invariant'].unsqueeze(0)
        return result
```
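For completeness, here is a minimal driver sketch showing how the baseline might be invoked on a single image. All paths and hyperparameter values below (`./FE2E`, `./pretrain`, `./lora/LDRN.safetensors`, `cfg_guidance`, `size_level`) are assumptions extrapolated from the README usage quoted in the header comments, not values confirmed against the FE2E repo; adjust them to your checkout.

```python
# Assumes the Baseline class above is in scope (same module or imported from it).
# All paths and hyperparameters here are illustrative assumptions.
import numpy as np
import torch
from PIL import Image

baseline = Baseline.load(
    repo_path='./FE2E',                   # clone of https://github.com/AMAP-ML/FE2E
    model_path='./pretrain',              # holds vae.safetensors + step1x-edit DiT
    lora_path='./lora/LDRN.safetensors',  # depth LoRA from the README
    qwen2vl_path=None,                    # falls back to <repo>/Qwen
    empty_prompt_cache=None,              # falls back to <repo>/latent/no_info.npz
    num_steps=28, cfg_guidance=1.0, size_level=512,
    prompt_type='empty', single_denoise=True, seed=42,
    quantized=False, offload=False, use_rpred=False,
    device='cuda',
)

# Load an RGB image as a [3, H, W] float tensor in [0, 1].
img = torch.from_numpy(
    np.asarray(Image.open('example.jpg').convert('RGB'))
).permute(2, 0, 1).float() / 255.0

pred = baseline.infer(img)
depth = pred['depth_affine_invariant']  # [H, W] affine-invariant depth map
```

Note that with `single_denoise=True`, `__init__` overrides `num_steps` to 1 (mirroring the README's `--single_denoise` flag), so the `num_steps` argument above only matters for multi-step denoising.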