import gradio as gr
import torch
from PIL import Image
import sys
import os


# nanoVLM's model code lives in its GitHub repository rather than on PyPI,
# so clone it once and make it importable.
REPO_URL = "https://github.com/huggingface/nanoVLM.git"
LOCAL_REPO_DIR = "nanoVLM"

if not os.path.isdir(LOCAL_REPO_DIR):
    from git import Repo  # GitPython
    # A shallow clone into its own directory keeps setup simple and avoids
    # cloning into a non-empty working directory.
    Repo.clone_from(REPO_URL, LOCAL_REPO_DIR, depth=1)

sys.path.insert(0, os.path.abspath(LOCAL_REPO_DIR))

from models.vision_language_model import VisionLanguageModel
from data.processors import get_tokenizer, get_image_processor  # ship with the nanoVLM repo

# Load the published 222M checkpoint plus the matching tokenizer and image
# processor, both resolved from the checkpoint's config (mirroring the
# repo's generate.py).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-222M").to(device)
model.eval()

tokenizer = get_tokenizer(model.cfg.lm_tokenizer)
image_processor = get_image_processor(model.cfg.vit_img_size)


def predict(img: Image.Image, prompt: str = "What is in this picture?") -> str:
    # Wrap the prompt in the question/answer template nanoVLM is trained
    # with, then encode the text and image as single-item batches.
    template = f"Question: {prompt} Answer:"
    tokens = tokenizer.batch_encode_plus([template], return_tensors="pt")["input_ids"].to(device)
    img_tensor = image_processor(img.convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        generated = model.generate(tokens, img_tensor, max_new_tokens=50)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
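
# A minimal sanity check outside the UI, as a sketch ("example.jpg" is only
# an illustrative placeholder path, not a file that ships with the repo):
#
#     print(predict(Image.open("example.jpg"), "What is in this picture?"))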

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=1, placeholder="Prompt (e.g. 'What is in this picture?')", label="Prompt"),
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="nanoVLM-222M Vision-Language Demo",
    description="A minimal Gradio app for image captioning and VQA with nanoVLM-222M.",
)

if __name__ == "__main__":
    demo.launch()
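    # On Hugging Face Spaces the launch() call above is picked up as-is;
    # when running locally, launch(share=True) (a standard Gradio option)
    # additionally gives a temporary public URL.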