Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 18, 2025

Commit

33cd763

verified ·

1 Parent(s): b413167

update app

Browse files

Files changed (1) hide show

app.py +69 -66

app.py CHANGED Viewed

@@ -15,23 +15,15 @@ from PIL import Image, ImageOps
 import requests
 from transformers import (
-    Qwen2VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
-    AutoModelForImageTextToText,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
-from docling_core.types.doc import DoclingDocument, DocTagsDocument
-import re
-import ast
-import html
 # --- Theme and CSS Definition ---
@@ -107,44 +99,50 @@ css = """
 """
 # Constants for text generation
-MAX_MAX_NEW_TOKENS = 4096
-DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-# Check for CUDA availability
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load Nanonets-OCR2-3B
-MODEL_ID_3B = "nanonets/Nanonets-OCR2-3B"
-processor_3b = AutoProcessor.from_pretrained(MODEL_ID_3B, trust_remote_code=True)
-model_3b = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_3B,
-    torch_dtype=torch.float16,
     trust_remote_code=True,
 ).to(device).eval()
-# Load Nanonets-OCR2-1.5B-exp
-MODEL_ID_1_5B = "nanonets/Nanonets-OCR2-1.5B-exp"
-processor_1_5b = AutoProcessor.from_pretrained(MODEL_ID_1_5B, trust_remote_code=True)
-model_1_5b = AutoModelForImageTextToText.from_pretrained(
-    MODEL_ID_1_5B,
-    torch_dtype=torch.float16,
-    trust_remote_code=True,
-    attn_implementation="flash_attention_2"
-).to(device).eval()
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """Generation function for image input."""
     if model_name == "Nanonets-OCR2-3B":
-        processor, model = processor_3b, model_3b
-    elif model_name == "Nanonets-OCR2-1.5B-exp":
-        processor, model = processor_1_5b, model_1_5b
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -152,18 +150,19 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     if image is None:
         yield "Please upload an image.", "Please upload an image."
         return
     images = [image]
     messages = [
         {
             "role": "user",
-            "content": [{"type": "image"}] + [{"type": "text", "text": text}]
         }
     ]
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
@@ -175,38 +174,49 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
-        buffer += new_text.replace("<|im_end|>", "")
         yield buffer, buffer
 # Define examples for image inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
     ["OCR the image", "images/2.jpg"],
-    ["Convert this page to docling", "images/1.png"],
-    ["Convert this page to docling", "images/3.png"],
-    ["Convert chart to OTSL.", "images/4.png"],
-    ["Convert code to text", "images/5.jpg"],
-    ["Convert this table to OTSL.", "images/6.jpg"],
-    ["Convert formula to late.", "images/7.jpg"],
 ]
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
-    gr.Markdown("# **Multimodal OCR3**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
-            # Image Inference Components
-            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-            image_upload = gr.Image(type="pil", label="Upload Image", height=290)
-            image_submit = gr.Button("Submit", variant="primary")
-            gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -216,19 +226,12 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
-            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
-            with gr.Accordion("(Result.md)", open=True):
-                formatted_output = gr.Markdown(label="(Result.md)")
-            model_choice = gr.Radio(
-                choices=["Nanonets-OCR2-3B", "Nanonets-OCR2-1.5B-exp"],
-                label="Select Model",
-                value="Nanonets-OCR2-3B"
-            )
-    image_submit.click(
         fn=generate_image,
-        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output, formatted_output]
     )

 import requests
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
+    AutoModelForCausalLM,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
+from huggingface_hub import snapshot_download
 # --- Theme and CSS Definition ---
 """
 # Constants for text generation
+MAX_MAX_NEW_TOKENS = 5120
+DEFAULT_MAX_NEW_TOKENS = 3072
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load Nanonets-OCR2-3B
+MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_M,
     trust_remote_code=True,
+    torch_dtype=torch.float16
 ).to(device).eval()
+# Load Dots.OCR
+MODEL_ID_D = "rednote-hilab/dots.ocr"
+model_path_d = "./models/dots-ocr-local"
+snapshot_download(
+    repo_id=MODEL_ID_D,
+    local_dir=model_path_d,
+    local_dir_use_symlinks=False,
+)
+model_d = AutoModelForCausalLM.from_pretrained(
+    model_path_d,
+    attn_implementation="flash_attention_2" if "cuda" in device.type else "eager",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    trust_remote_code=True
+)
+processor_d = AutoProcessor.from_pretrained(
+    model_path_d,
+    trust_remote_code=True
+)
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
+                   max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
+    """Generate responses for image input using the selected model."""
     if model_name == "Nanonets-OCR2-3B":
+        processor, model = processor_m, model_m
+    elif model_name == "Dots.OCR":
+        processor, model = processor_d, model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
     if image is None:
         yield "Please upload an image.", "Please upload an image."
         return
     images = [image]
     messages = [
         {
             "role": "user",
+            "content": [{"type": "image"}] * len(images) + [
+                {"type": "text", "text": text}
+            ]
         }
     ]
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
     }
+    # For Dots.OCR, explicitly pass the tokenizer's end-of-sequence token id to generate()
+    if "dots.ocr" in model.config.name_or_path.lower():
+        generation_kwargs["eos_token_id"] = processor.tokenizer.eos_token_id
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
+        buffer += new_text.replace("<|im_end|>", "").replace("</s>", "")
         yield buffer, buffer
+    # The formatted output is the same as the raw output in this version
+    yield buffer, buffer
 # Define examples for image inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
     ["OCR the image", "images/2.jpg"],
+    ["Convert this page to markdown", "images/1.png"],
 ]
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
+    gr.Markdown("# **Multimodal Image OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
+            model_choice = gr.Radio(
+                choices=["Nanonets-OCR2-3B", "Dots.OCR"],
+                label="Select Model",
+                value="Nanonets-OCR-s"
+            )
+            query_input = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+            image_upload = gr.Image(type="pil", label="Upload Image", height=320)
+            submit_button = gr.Button("Submit", variant="primary")
+            gr.Examples(examples=image_examples, inputs=[query_input, image_upload])
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
+            raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=18, show_copy_button=True)
+            formatted_output = gr.Markdown(label="Formatted Output (Result.md)")
+    submit_button.click(
         fn=generate_image,
+        inputs=[model_choice, query_input, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output, formatted_output]
     )