| import gradio as gr |
| from PIL import Image |
| import torch |
| from transformers import AutoProcessor, AutoModelForCausalLM, AutoConfig |
|
|
| |
| |
| |
# Hugging Face model id for Microsoft's Florence-2 vision-language model.
model_id = 'microsoft/Florence-2-large'
# Prefer GPU inference when CUDA is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): Florence-2 ships custom modeling code (trust_remote_code) that
# newer transformers versions probe for `forced_bos_token_id`; patch the
# attribute in when the downloaded config lacks it so loading does not fail.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
if not hasattr(config, 'forced_bos_token_id'):
    config.forced_bos_token_id = None

# Load the model with the (possibly patched) config, move it to the chosen
# device, and switch to eval mode — this script only runs inference.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    trust_remote_code=True
).to(device).eval()

# The processor handles both the text prompt tokenization and image
# preprocessing (resizing/normalization) for Florence-2.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
|
|
def run_ocr(image):
    """Run Florence-2 OCR on an uploaded image and return the extracted text.

    Args:
        image: A ``PIL.Image.Image`` from the Gradio image component, or
            ``None`` when the user clicks Convert without uploading anything.

    Returns:
        str: The recognized text, or a warning message when ``image`` is None.
    """
    if image is None:
        return "⚠️ Please upload an image."

    # Florence-2's processor expects a 3-channel RGB image. Gradio delivers
    # whatever mode the upload had (RGBA PNGs, grayscale scans, palette
    # images), so normalize before preprocessing to avoid channel errors.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Florence-2 task token selecting plain OCR (no region boxes).
    prompt = "<OCR>"

    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)

    # Deterministic beam-search decoding; gradients are never needed here.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            do_sample=False,
            num_beams=3
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Florence-2 post-processing returns a dict keyed by the task token,
    # e.g. {"<OCR>": "recognized text"}.
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=prompt,
        image_size=(image.width, image.height)
    )

    return parsed_answer[prompt]
|
|
| |
| |
| |
# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 🖋️ Handwritten Note to Text (Florence-2)")

    # Side-by-side layout: image upload on the left, recognized text on the right.
    with gr.Row():
        image_input = gr.Image(type="pil")
        text_output = gr.Textbox(label="Extracted Text", lines=10)

    convert_button = gr.Button("Convert to Text", variant="primary")
    convert_button.click(fn=run_ocr, inputs=image_input, outputs=text_output)


if __name__ == "__main__":
    demo.launch()