import gradio as gr
import torch
from PIL import Image
import sys
import os


# nanoVLM's model code lives in its GitHub repository rather than on PyPI,
# so clone it once and make it importable.
REPO_URL = "https://github.com/huggingface/nanoVLM.git"
LOCAL_REPO_DIR = "nanoVLM"

if not os.path.isdir(LOCAL_REPO_DIR):
    from git import Repo  # GitPython
    # A shallow clone into its own directory keeps setup simple and avoids
    # cloning into a non-empty working directory.
    Repo.clone_from(REPO_URL, LOCAL_REPO_DIR, depth=1)

sys.path.insert(0, os.path.abspath(LOCAL_REPO_DIR))

from models.vision_language_model import VisionLanguageModel
from data.processors import get_tokenizer, get_image_processor  # ship with the nanoVLM repo

# Load the published 222M checkpoint plus the matching tokenizer and image
# processor, both resolved from the checkpoint's config (mirroring the
# repo's generate.py).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-222M").to(device)
model.eval()

tokenizer = get_tokenizer(model.cfg.lm_tokenizer)
image_processor = get_image_processor(model.cfg.vit_img_size)


def predict(img: Image.Image, prompt: str = "What is in this picture?") -> str:
    # Wrap the prompt in the question/answer template nanoVLM is trained
    # with, then encode the text and image as single-item batches.
    template = f"Question: {prompt} Answer:"
    tokens = tokenizer.batch_encode_plus([template], return_tensors="pt")["input_ids"].to(device)
    img_tensor = image_processor(img.convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        generated = model.generate(tokens, img_tensor, max_new_tokens=50)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
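
# A minimal sanity check outside the UI, as a sketch ("example.jpg" is only
# an illustrative placeholder path, not a file that ships with the repo):
#
#     print(predict(Image.open("example.jpg"), "What is in this picture?"))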

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=1, placeholder="Prompt (e.g. 'What is in this picture?')", label="Prompt"),
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="nanoVLM-222M Vision-Language Demo",
    description="A minimal Gradio app for image captioning and VQA with nanoVLM-222M.",
)

if __name__ == "__main__":
    demo.launch()
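    # On Hugging Face Spaces the launch() call above is picked up as-is;
    # when running locally, launch(share=True) (a standard Gradio option)
    # additionally gives a temporary public URL.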