"""Gradio demo for Microsoft Lens (RL) and Lens-Turbo (4-step distilled).

Both pipelines are preloaded at import time and share a single GPT-OSS text
encoder to fit ZeroGPU memory. ZeroGPU hijacks CUDA on `import spaces`, so we
do the heavy load at module scope, not inside a `@spaces.GPU` function.
"""

from __future__ import annotations

import logging
import os
import random

# NOTE: `spaces` is deliberately kept ahead of `torch`, as in the original
# import order (see the module docstring about ZeroGPU and `import spaces`).
import spaces
import torch
import gradio as gr

from lens import LensGptOssEncoder, LensPipeline
from lens.resolution import SUPPORTED_ASPECT_RATIOS, SUPPORTED_BASE_RESOLUTIONS

logger = logging.getLogger(__name__)

DTYPE = torch.bfloat16
TURBO_REPO = "microsoft/Lens-Turbo"
LENS_REPO = "microsoft/Lens"

# ---------------------------------------------------------------------------
# Global preload: shared text encoder, then both DiT pipelines.
# ---------------------------------------------------------------------------
text_encoder_kwargs = {"subfolder": "text_encoder", "dtype": DTYPE}
try:
    from transformers import Mxfp4Config
except ImportError:
    # Best effort: still try to load, but make the fallback visible in the
    # logs instead of silently dropping the quantization config.
    logger.warning(
        "transformers.Mxfp4Config is not available; loading the text encoder "
        "without an explicit MXFP4 quantization config."
    )
else:
    # Keep GPT-OSS in MXFP4 — ZeroGPU runs H200 (Hopper), which supports the
    # native kernels and saves ~25 GB vs. dequantized bf16.
    text_encoder_kwargs["quantization_config"] = Mxfp4Config(dequantize=False)

# A single encoder instance is passed to both pipelines so it is loaded once.
text_encoder = LensGptOssEncoder.from_pretrained(TURBO_REPO, **text_encoder_kwargs)
turbo_pipe = LensPipeline.from_pretrained(
    TURBO_REPO, text_encoder=text_encoder, torch_dtype=DTYPE
).to("cuda")
lens_pipe = LensPipeline.from_pretrained(
    LENS_REPO, text_encoder=text_encoder, torch_dtype=DTYPE
).to("cuda")

PIPES = {"Lens-Turbo (4 steps)": turbo_pipe, "Lens (20 steps, RL)": lens_pipe}
MODEL_CHOICES = list(PIPES.keys())
MAX_SEED = 2**31 - 1


def model_defaults(model_name: str) -> tuple[int, float]:
    """Return the default ``(steps, guidance_scale)`` pair for *model_name*.

    Any name containing ``"Turbo"`` gets ``(4, 1.0)``; every other name gets
    ``(20, 5.0)``.
    """
    if "Turbo" in model_name:
        return 4, 1.0
    return 20, 5.0


@spaces.GPU(duration=120)
def generate(
    prompt: str,
    model_name: str = MODEL_CHOICES[0],
    base_resolution: int = 1024,
    aspect_ratio: str = "1:1",
    steps: int | None = None,
    cfg: float | None = None,
    seed: int = 0,
    randomize_seed: bool = True,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate one image with the selected pipeline.

    Args:
        prompt: Text prompt; surrounding whitespace is stripped before use.
        model_name: Key into ``PIPES`` selecting which pipeline to run.
        base_resolution: Forwarded to the pipeline as ``base_resolution``.
        aspect_ratio: Forwarded to the pipeline as ``aspect_ratio``.
        steps: Number of inference steps; ``None`` uses the model default.
        cfg: Guidance scale; ``None`` uses the model default.
        seed: Seed to use when ``randomize_seed`` is false.
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]`` and
            ignore ``seed``.
        progress: Gradio progress tracker (declared so tqdm output is
            mirrored in the UI); not used directly in the body.

    Returns:
        A ``(image, seed)`` tuple: the first image of the pipeline output and
        the seed that was actually used.

    Raises:
        gr.Error: If the prompt is empty/blank or ``model_name`` is unknown.
    """
    cleaned_prompt = prompt.strip() if prompt else ""
    if not cleaned_prompt:
        raise gr.Error("Please enter a prompt.")

    pipe = PIPES.get(model_name)
    if pipe is None:
        # Surface a readable UI error instead of a bare KeyError.
        raise gr.Error(f"Unknown model: {model_name!r}")

    # Examples call this function with only (prompt, model), so steps/cfg may
    # be None and must fall back to the per-model defaults.
    default_steps, default_cfg = model_defaults(model_name)
    steps = default_steps if steps is None else int(steps)
    cfg = default_cfg if cfg is None else float(cfg)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    seed = int(seed)  # slider values may arrive as floats

    generator = torch.Generator(device=pipe._execution_device).manual_seed(seed)
    out = pipe(
        prompt=cleaned_prompt,
        base_resolution=int(base_resolution),
        aspect_ratio=aspect_ratio,
        num_inference_steps=steps,
        guidance_scale=cfg,
        num_images_per_prompt=1,
        generator=generator,
    )
    return out.images[0], seed


CSS = """
#col-container { max-width: 1100px; margin: 0 auto; }
"""

# Kept at column 0 so the Markdown renders the same whether or not the
# component dedents its input.
INTRO_MD = """
# Microsoft Lens
3.8B foundational text-to-image model. Switch between **Lens-Turbo** (4-step distilled, fast) and **Lens** (20-step RL-tuned, higher quality).

[Paper](https://arxiv.org/abs/2605.21573) · [Code](https://github.com/microsoft/Lens) · [Lens](https://huggingface.co/microsoft/Lens) · [Lens-Turbo](https://huggingface.co/microsoft/Lens-Turbo)
"""

with gr.Blocks(theme=gr.themes.Citrus(), css=CSS, title="Lens / Lens-Turbo") as demo:
    # Initial slider values follow the initially selected model so they cannot
    # drift from `model_defaults`.
    initial_steps, initial_cfg = model_defaults(MODEL_CHOICES[0])

    with gr.Column(elem_id="col-container"):
        gr.Markdown(INTRO_MD)

        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="A cinematic mountain lake at sunrise, soft golden light, mist rising off the water",
                    lines=3,
                )
                with gr.Row():
                    model = gr.Radio(
                        choices=MODEL_CHOICES,
                        value=MODEL_CHOICES[0],
                        label="Model",
                    )
                run_btn = gr.Button("Generate", variant="primary")

                with gr.Accordion("Advanced", open=False):
                    with gr.Row():
                        base_res = gr.Radio(
                            choices=list(SUPPORTED_BASE_RESOLUTIONS),
                            value=1024,
                            label="Base resolution",
                        )
                        aspect = gr.Dropdown(
                            choices=list(SUPPORTED_ASPECT_RATIOS),
                            value="1:1",
                            label="Aspect ratio (W:H)",
                        )
                    with gr.Row():
                        steps = gr.Slider(1, 50, value=initial_steps, step=1, label="Steps")
                        cfg = gr.Slider(1.0, 10.0, value=initial_cfg, step=0.1, label="Guidance scale")
                    with gr.Row():
                        seed = gr.Slider(0, MAX_SEED, value=0, step=1, label="Seed")
                        randomize = gr.Checkbox(value=True, label="Randomize seed")

            with gr.Column(scale=4):
                image = gr.Image(label="Output", type="pil", height=640)
                used_seed = gr.Number(label="Seed used", interactive=False)

        # Examples pass only (prompt, model); `generate` fills in the rest
        # from its parameter defaults.
        gr.Examples(
            examples=[
                ["A generous portion of classic British fish and chips on white paper, golden crispy beer-battered cod, thick-cut chips, lemon wedge, mushy peas, wooden pub table, overhead shot", MODEL_CHOICES[0]],
                ["A crystal dragon soaring through an aurora borealis sky, transparent faceted body refracting green and purple light, ice trail from its wings, high fantasy digital art", MODEL_CHOICES[0]],
                ["Aerial view of Yuanyang rice terraces at sunrise, cascading water-filled paddies reflecting pink sky, morning mist between layers, drone photography", MODEL_CHOICES[1]],
                ["A green iguana basking on a moss-covered log in a tropical rainforest, every scale rendered sharply, dewdrops on its skin, National Geographic style", MODEL_CHOICES[1]],
            ],
            inputs=[prompt, model],
            outputs=[image, used_seed],
            fn=generate,
            cache_examples=True,
            cache_mode="lazy",
        )

    def _sync_defaults(model_name):
        """Reset the Steps and Guidance sliders to the chosen model's defaults."""
        default_steps, default_cfg = model_defaults(model_name)
        return gr.update(value=default_steps), gr.update(value=default_cfg)

    model.change(_sync_defaults, inputs=model, outputs=[steps, cfg])

    # Button click and Enter-in-textbox trigger the same call, so the wiring
    # is defined once and shared.
    generate_inputs = [prompt, model, base_res, aspect, steps, cfg, seed, randomize]
    generate_outputs = [image, used_seed]
    run_btn.click(generate, inputs=generate_inputs, outputs=generate_outputs)
    prompt.submit(generate, inputs=generate_inputs, outputs=generate_outputs)


if __name__ == "__main__":
    demo.launch()