Spaces:

prithivMLmods
/

Multimodal-OCR3

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 20, 2025

Commit

2174e9c

verified ·

1 Parent(s): d7c41ec

Update app.py

Browse files

Files changed (1) hide show

app.py +3 -11

app.py CHANGED Viewed

@@ -18,7 +18,6 @@ from transformers import (
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
@@ -103,7 +102,6 @@ model_path_d_local = snapshot_download(
     local_dir_use_symlinks=False
 )
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
@@ -125,15 +123,12 @@ if os.path.exists(config_file_path):
 sys.path.append(model_path_d_local)
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Nanonets-OCR2-3B
 MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -174,7 +169,6 @@ model_p = AutoModelForCausalLM.from_pretrained(
     torch_dtype=torch.bfloat16
 ).to(device).eval()
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
@@ -235,14 +229,12 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
         yield buffer, buffer
 image_examples = [
-    ["Reconstruct the doc [table] as it is.", "images/0.png"],
-    ["Describe the image!", "images/8.png"],
-    ["OCR the image", "images/2.jpg"],
 ]
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **Multimodal OCR3**", elem_id="main-title")
     with gr.Row():

 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
     local_dir_use_symlinks=False
 )
 config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
 if os.path.exists(config_file_path):
 sys.path.append(model_path_d_local)
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Nanonets-OCR2-3B
 MODEL_ID_M = "nanonets/Nanonets-OCR2-3B"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
     torch_dtype=torch.bfloat16
 ).to(device).eval()
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
         buffer += new_text.replace("<|im_end|>", "").replace("<end_of_utterance>", "")
         yield buffer, buffer
 image_examples = [
+    ["Perform OCR on the image.", "examples/1.jpg"],
+    ["Phrase the document [page].", "examples/2.jpg"],
+    ["OCR and reconstruct the table perfectly.", "examples/3.jpg"],
 ]
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **Multimodal OCR3**", elem_id="main-title")
     with gr.Row():