lens / app.py
multimodalart
Lazy-cache examples; default inference params; rename to Lens
d6e9e07
"""Gradio demo for Microsoft Lens (RL) and Lens-Turbo (4-step distilled).
Both pipelines are preloaded at import time and share a single GPT-OSS text
encoder to fit ZeroGPU memory. ZeroGPU hijacks CUDA on `import spaces`, so we
do the heavy load at module scope, not inside a `@spaces.GPU` function.
"""
from __future__ import annotations
import os
import random
import spaces
import torch
import gradio as gr
from lens import LensGptOssEncoder, LensPipeline
from lens.resolution import SUPPORTED_ASPECT_RATIOS, SUPPORTED_BASE_RESOLUTIONS
# Single dtype used for the text encoder and both pipelines.
DTYPE = torch.bfloat16
TURBO_REPO = "microsoft/Lens-Turbo"
LENS_REPO = "microsoft/Lens"

# ---------------------------------------------------------------------------
# Global preload: one shared text encoder first, then the two DiT pipelines
# that reuse it.
# ---------------------------------------------------------------------------
text_encoder_kwargs = dict(subfolder="text_encoder", dtype=DTYPE)
try:
    from transformers import Mxfp4Config

    # Keep GPT-OSS in MXFP4 — ZeroGPU runs H200 (Hopper), which supports the
    # native kernels and saves ~25 GB vs. dequantized bf16.
    text_encoder_kwargs["quantization_config"] = Mxfp4Config(dequantize=False)
except ImportError:
    # Best effort: without Mxfp4Config the encoder is loaded with no explicit
    # quantization config.
    pass

# The encoder is loaded from the Turbo repo and handed to both pipelines.
text_encoder = LensGptOssEncoder.from_pretrained(TURBO_REPO, **text_encoder_kwargs)


def _load_pipeline(repo_id):
    """Load the pipeline from `repo_id` with the shared encoder and move it to CUDA."""
    loaded = LensPipeline.from_pretrained(
        repo_id, text_encoder=text_encoder, torch_dtype=DTYPE
    )
    return loaded.to("cuda")


turbo_pipe = _load_pipeline(TURBO_REPO)
lens_pipe = _load_pipeline(LENS_REPO)

# UI label -> pipeline; insertion order is also the order shown in the UI.
PIPES = {
    "Lens-Turbo (4 steps)": turbo_pipe,
    "Lens (20 steps, RL)": lens_pipe,
}
MODEL_CHOICES = list(PIPES)
# Upper bound for both the seed slider and randomly drawn seeds.
MAX_SEED = 2**31 - 1
def model_defaults(model_name: str) -> tuple[int, float]:
    """Return the default ``(steps, guidance_scale)`` for a model label.

    Labels containing the (case-sensitive) substring ``"Turbo"`` get the
    4-step / 1.0 defaults; every other label gets 20 steps / 5.0.
    """
    is_turbo = "Turbo" in model_name
    return (4, 1.0) if is_turbo else (20, 5.0)
@spaces.GPU(duration=120)
def generate(
    prompt: str,
    model_name: str = MODEL_CHOICES[0],
    base_resolution: int = 1024,
    aspect_ratio: str = "1:1",
    steps: int | None = None,
    cfg: float | None = None,
    seed: int = 0,
    randomize_seed: bool = True,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate one image with the selected pipeline.

    Args:
        prompt: Text prompt; surrounding whitespace is stripped before use.
        model_name: Key into ``PIPES`` selecting which pipeline to run.
        base_resolution: Forwarded to the pipeline as an ``int``.
        aspect_ratio: Forwarded to the pipeline unchanged.
        steps: Number of inference steps; ``None`` uses the model's default
            from ``model_defaults``.
        cfg: Guidance scale; ``None`` uses the model's default from
            ``model_defaults``.
        seed: Seed to use when ``randomize_seed`` is false.
        randomize_seed: When true, ignore ``seed`` and draw a fresh one from
            ``[0, MAX_SEED]``.
        progress: Gradio progress tracker. Not referenced in the body; it is
            declared in the signature with ``track_tqdm=True``.

    Returns:
        A ``(image, seed)`` tuple: the first image produced by the pipeline
        and the seed that was actually used.

    Raises:
        gr.Error: If the prompt is empty/blank or ``model_name`` is not a
            known model.
    """
    clean_prompt = prompt.strip() if prompt else ""
    if not clean_prompt:
        raise gr.Error("Please enter a prompt.")

    # An unknown label would otherwise surface as a bare KeyError; report it
    # the same way as the other input validation error.
    pipe = PIPES.get(model_name)
    if pipe is None:
        raise gr.Error(f"Unknown model: {model_name!r}.")

    default_steps, default_cfg = model_defaults(model_name)
    steps = default_steps if steps is None else int(steps)
    cfg = default_cfg if cfg is None else float(cfg)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    seed = int(seed)

    # Seed a generator on the pipeline's execution device so the returned
    # seed can be reused to reproduce the image.
    generator = torch.Generator(device=pipe._execution_device).manual_seed(seed)
    out = pipe(
        prompt=clean_prompt,
        base_resolution=int(base_resolution),
        aspect_ratio=aspect_ratio,
        num_inference_steps=steps,
        guidance_scale=cfg,
        num_images_per_prompt=1,
        generator=generator,
    )
    return out.images[0], seed
# Custom stylesheet passed to gr.Blocks: caps the width of the element with
# id "col-container" and centers it horizontally.
CSS = """
#col-container { max-width: 1100px; margin: 0 auto; }
"""
with gr.Blocks(theme=gr.themes.Citrus(), css=CSS, title="Lens / Lens-Turbo") as demo:
    # Initial slider positions follow the defaults of the initially selected model.
    initial_steps, initial_cfg = model_defaults(MODEL_CHOICES[0])

    with gr.Column(elem_id="col-container"):
        gr.Markdown(
            """
# Microsoft Lens
3.8B foundational text-to-image model. Switch between **Lens-Turbo**
(4-step distilled, fast) and **Lens** (20-step RL-tuned, higher
quality).
[Paper](https://arxiv.org/abs/2605.21573) · [Code](https://github.com/microsoft/Lens) · [Lens](https://huggingface.co/microsoft/Lens) · [Lens-Turbo](https://huggingface.co/microsoft/Lens-Turbo)
"""
        )

        with gr.Row():
            # Left column: prompt, model choice and advanced controls.
            with gr.Column(scale=3):
                prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="A cinematic mountain lake at sunrise, soft golden light, mist rising off the water",
                    lines=3,
                )
                with gr.Row():
                    model = gr.Radio(
                        choices=MODEL_CHOICES,
                        value=MODEL_CHOICES[0],
                        label="Model",
                    )
                    run_btn = gr.Button("Generate", variant="primary")

                # Collapsed by default.
                with gr.Accordion("Advanced", open=False):
                    with gr.Row():
                        base_res = gr.Radio(
                            choices=list(SUPPORTED_BASE_RESOLUTIONS),
                            value=1024,
                            label="Base resolution",
                        )
                        aspect = gr.Dropdown(
                            choices=list(SUPPORTED_ASPECT_RATIOS),
                            value="1:1",
                            label="Aspect ratio (W:H)",
                        )
                    with gr.Row():
                        steps = gr.Slider(
                            1, 50, value=initial_steps, step=1, label="Steps"
                        )
                        cfg = gr.Slider(
                            1.0,
                            10.0,
                            value=initial_cfg,
                            step=0.1,
                            label="Guidance scale",
                        )
                    with gr.Row():
                        seed = gr.Slider(0, MAX_SEED, value=0, step=1, label="Seed")
                        randomize = gr.Checkbox(value=True, label="Randomize seed")

            # Right column: generated image and the seed that produced it.
            with gr.Column(scale=4):
                image = gr.Image(label="Output", type="pil", height=640)
                used_seed = gr.Number(label="Seed used", interactive=False)

        # Examples supply only the prompt and model, so `generate` runs them
        # with its own defaults for every other parameter.
        example_rows = [
            ["A generous portion of classic British fish and chips on white paper, golden crispy beer-battered cod, thick-cut chips, lemon wedge, mushy peas, wooden pub table, overhead shot", MODEL_CHOICES[0]],
            ["A crystal dragon soaring through an aurora borealis sky, transparent faceted body refracting green and purple light, ice trail from its wings, high fantasy digital art", MODEL_CHOICES[0]],
            ["Aerial view of Yuanyang rice terraces at sunrise, cascading water-filled paddies reflecting pink sky, morning mist between layers, drone photography", MODEL_CHOICES[1]],
            ["A green iguana basking on a moss-covered log in a tropical rainforest, every scale rendered sharply, dewdrops on its skin, National Geographic style", MODEL_CHOICES[1]],
        ]
        gr.Examples(
            examples=example_rows,
            inputs=[prompt, model],
            outputs=[image, used_seed],
            fn=generate,
            cache_examples=True,
            cache_mode="lazy",
        )

    def _sync_defaults(model_name):
        """Reset the steps and guidance sliders to the chosen model's defaults."""
        new_steps, new_cfg = model_defaults(model_name)
        return gr.update(value=new_steps), gr.update(value=new_cfg)

    model.change(_sync_defaults, inputs=model, outputs=[steps, cfg])

    # The button click and pressing Enter in the prompt box run the same call.
    generate_inputs = [prompt, model, base_res, aspect, steps, cfg, seed, randomize]
    generate_outputs = [image, used_seed]
    for register in (run_btn.click, prompt.submit):
        register(generate, inputs=generate_inputs, outputs=generate_outputs)

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()