"""Gradio demo that extracts invoice fields with a fine-tuned SmolVLM2 model.

At import time the script loads the processor and base model on CPU and
applies the PEFT adapter. It then exposes a single-function Gradio interface
that accepts a PDF or image upload and returns the model's answer as text
(pretty-printed JSON when the answer can be parsed).
"""
import json
import os
import tempfile

import gradio as gr
import torch
from pdf2image import convert_from_path
from peft import PeftModel
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

BASE_MODEL = "HuggingFaceTB/SmolVLM2-500M-Instruct"
ADAPTER = "honesdev/smolvlm2-500-invoice-extractor-v3"
PROMPT = "Extract the invoice details and return a JSON object only. Return null for any missing fields."

# Rasterisation resolution used when a PDF page is converted to an image.
PDF_DPI = 150
# Upper bound on the number of tokens generated per invoice.
MAX_NEW_TOKENS = 512

print("Loading processor...")
processor = AutoProcessor.from_pretrained(BASE_MODEL)

print("Loading base model...")
model = AutoModelForImageTextToText.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,  # CPU requires float32
    device_map="cpu",
)

print("Loading adapter...")
model = PeftModel.from_pretrained(model, ADAPTER)
model.eval()
print("Model ready ✓")


def _load_image(path: str) -> Image.Image:
    """Return the invoice at *path* as an RGB image.

    For a ``.pdf`` path only the first page is rasterised; any other path is
    opened as an image file.

    Raises:
        ValueError: If the PDF yields no pages.
    """
    if path.lower().endswith(".pdf"):
        # Only page one is used, so do not rasterise the whole document.
        pages = convert_from_path(path, dpi=PDF_DPI, first_page=1, last_page=1)
        if not pages:
            raise ValueError("The uploaded PDF contains no pages.")
        return pages[0].convert("RGB")

    # convert() returns a new, fully loaded image, so the underlying file
    # can be closed as soon as the conversion is done.
    with Image.open(path) as img:
        return img.convert("RGB")


def _format_response(raw: str) -> str:
    """Pretty-print *raw* as JSON when possible, else return it stripped.

    The whole text is tried first. If that fails, the outermost ``{...}``
    span is tried, which covers answers wrapped in Markdown code fences or
    surrounded by extra words.
    """
    text = raw.strip()
    candidates = [text]

    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        embedded = text[start:end + 1]
        if embedded != text:
            candidates.append(embedded)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # ensure_ascii=False keeps names and currency symbols readable
        # instead of emitting \uXXXX escapes.
        return json.dumps(parsed, indent=2, ensure_ascii=False)

    return text


def extract_invoice(file) -> str:
    """Extract invoice fields from an uploaded PDF or image.

    Args:
        file: The upload handed over by the Gradio ``File`` component, or
            ``None`` when nothing was uploaded. Either a path string or an
            object exposing the path as ``.name`` is accepted.

    Returns:
        Pretty-printed JSON when the model's answer parses as JSON, the
        stripped raw answer otherwise, or a short message when there is
        nothing to process.
    """
    if file is None:
        return "Please upload an invoice image or PDF."

    # Accept both tempfile-like objects (path in .name) and plain path strings.
    path = getattr(file, "name", file)

    try:
        image = _load_image(path)
    except ValueError as exc:
        return str(exc)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": PROMPT},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = processor(
        text=text,
        images=[image],
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
        )

    # The generated sequence starts with the prompt tokens; keep only what
    # the model produced after them.
    prompt_length = inputs["input_ids"].shape[1]
    generated = outputs[0][prompt_length:]
    response = processor.decode(generated, skip_special_tokens=True)

    return _format_response(response)


# Markdown shown under the title; paragraphs are separated by blank lines.
DESCRIPTION = "\n\n".join(
    [
        "Upload an invoice as a PDF or image. The model extracts key fields automatically.",
        "**Extracts:** vendor name · customer name · bank account · issue date · due date · currency · line items · total amount",
        "**Model:** SmolVLM2-500M fine-tuned on 4924 invoices (synthetic + real)",
        "⚠️ Running on CPU — extraction takes 30-60 seconds per invoice.",
    ]
)

demo = gr.Interface(
    fn=extract_invoice,
    inputs=gr.File(
        label="Upload Invoice (PDF or Image)",
        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
    ),
    outputs=gr.Textbox(
        label="Extracted Fields (JSON)",
        lines=25,
    ),
    title="🧾 Invoice Extractor",
    description=DESCRIPTION,
    theme=gr.themes.Soft(),
)

# Start the web server only when run as a script, so importing this module
# (e.g. to reuse extract_invoice) does not block on launch().
if __name__ == "__main__":
    demo.launch()