Spaces:

OmarAbualrob
/

ocr-api

Paused

App Files Files Community

OmarAbualrob committed on Jul 29, 2025

Commit

523b7a6

verified ·

1 Parent(s): 499d6a4

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -77

app.py CHANGED Viewed

@@ -1,110 +1,109 @@
 from fastapi import FastAPI, File, UploadFile, HTTPException
-from fastapi.responses import JSONResponse
-from transformers import AutoProcessor, AutoModelForCausalLM
 from PIL import Image
-import torch
-import io
-import logging
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# --- 1. Initialize FastAPI App ---
-app = FastAPI(title="Mixed-Content OCR API", description="An API to extract text from images containing both printed and handwritten text.")
-# --- 2. Load the Model and Processor (at startup) ---
-# This is a critical step. We load the model only once when the app starts.
-# This prevents reloading the model on every API call, which would be very slow.
 try:
-    logger.info("Loading model and processor...")
-    # Use the large model for better accuracy
-    model_id = "microsoft/Florence-2-large"
-    # NOTE: We need to trust remote code for Florence-2
-    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
-    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
-    logger.info("Model and processor loaded successfully.")
 except Exception as e:
-    logger.error(f"Error loading model: {e}")
-    # If the model fails to load, the API is not usable. We can't proceed.
     model = None
     processor = None
-# --- 3. Define the OCR Task Function ---
-def run_ocr(image: Image.Image) -> str:
     """
-    Performs OCR on a given PIL Image using the Florence-2 model.
     """
-    if model is None or processor is None:
-        raise RuntimeError("Model is not available. Check logs for loading errors.")
     # Ensure image is in RGB format
     if image.mode != "RGB":
         image = image.convert("RGB")
-    # Define the task prompt
-    prompt = "<OCR>"
     # Preprocess the image and prompt
-    inputs = processor(text=prompt, images=image, return_tensors="pt")
     # Generate text from the image
-    # Note: max_new_tokens can be adjusted based on expected text length
-    os.environ["DISABLE_FLASH_ATTN"] = "1"
     generated_ids = model.generate(
         input_ids=inputs["input_ids"],
         pixel_values=inputs["pixel_values"],
-        max_new_tokens=4096, # Increased token limit for long documents
-        do_sample=False, # Use greedy decoding for deterministic output
-        num_beams=3
     )
-    # Decode the generated IDs to a string
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-    # Post-process the output to get the clean text
-    # The model's output for OCR is typically in the format: <OCR>extracted_text</s>
-    parsed_text = processor.post_process_generation(generated_text, task="<OCR>", image_size=(image.width, image.height))
-    return parsed_text.get("<OCR>", "Error: Could not parse OCR output.")
-# --- 4. Create the API Endpoint ---
-@app.post("/ocr", summary="Extract Text from Image")
-async def perform_ocr(file: UploadFile = File(..., description="Image file to perform OCR on.")):
     """
-    Takes an image file, extracts both printed and handwritten text,
-    and returns it as a JSON object.
     """
-    if model is None:
-        raise HTTPException(status_code=503, detail="Model is not loaded or unavailable.")
-    # Validate file type
-    if not file.content_type.startswith("image/"):
-        raise HTTPException(status_code=400, detail="Invalid file type. Please upload an image.")
     try:
-        # Read the image content from the uploaded file
         contents = await file.read()
         image = Image.open(io.BytesIO(contents))
-        # Run the OCR task
-        logger.info("Running OCR on the uploaded image...")
-        extracted_text = run_ocr(image)
-        logger.info("OCR completed successfully.")
-        # Return the result
-        return JSONResponse(
-            content={"filename": file.filename, "text": extracted_text}
-        )
-    except Exception as e:
-        logger.error(f"An error occurred during OCR processing: {e}")
-        raise HTTPException(status_code=500, detail=f"An internal error occurred: {str(e)}")
-@app.get("/", summary="Health Check")
-def read_root():
-    """
-    A simple health check endpoint to confirm the API is running.
-    """
-    return {"status": "ok", "model_loaded": model is not None}

+import io
+import torch
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+# --- 1. SCRIPT SETUP ---
+# Set up device (use GPU if available, otherwise CPU)
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"--- Running on {DEVICE} ---")
+# Define model and processor IDs from Hugging Face Hub
+MODEL_ID = "microsoft/Florence-2-large"
+# For better performance, you can use the float16 version if your hardware supports it
+# MODEL_ID = "microsoft/Florence-2-large-ft"
+# --- 2. LOAD MODEL AND PROCESSOR ---
+# Load the model and processor from Hugging Face
+# trust_remote_code=True is required for Florence-2
+# torch_dtype=torch.float16 is used for faster inference and lower memory on GPUs
 try:
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16).to(DEVICE)
+    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+    print("--- Model and processor loaded successfully ---")
 except Exception as e:
+    print(f"--- Error loading model: {e} ---")
     model = None
     processor = None
+# --- 3. FASTAPI APP INITIALIZATION ---
+app = FastAPI(
+    title="Florence-2 OCR API",
+    description="An API for extracting text from images using Microsoft's Florence-2-large model. "
+                "Handles both printed and handwritten text.",
+    version="1.0.0"
+)
+# --- 4. HELPER FUNCTION ---
+def run_florence2_ocr(image: Image.Image):
     """
+    Runs the Florence-2 model to perform OCR on a given image.
+    Args:
+        image (Image.Image): The input image in PIL format.
+    Returns:
+        str: The extracted text.
     """
+    if not model or not processor:
+        raise HTTPException(status_code=503, detail="Model is not available. Please check server logs.")
+    # The task prompt for OCR
+    task_prompt = "<OCR>"
     # Ensure image is in RGB format
     if image.mode != "RGB":
         image = image.convert("RGB")
     # Preprocess the image and prompt
+    inputs = processor(text=task_prompt, images=image, return_tensors="pt").to(DEVICE)
+    # Move inputs to float16 if the model is in float16
+    if model.dtype == torch.float16:
+      inputs = inputs.to(torch.float16)
     # Generate text from the image
     generated_ids = model.generate(
         input_ids=inputs["input_ids"],
         pixel_values=inputs["pixel_values"],
+        max_new_tokens=2048,  # Increased token limit for dense text
+        num_beams=3,
+        do_sample=False # Use greedy decoding for more deterministic results
     )
+    # Decode the generated IDs to text
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    # Parse the output to get only the OCR result
+    # The model's output format is typically "<OCR>extracted_text</s>"
+    # We remove the prompt and the end-of-sequence token
+    parsed_text = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
+    return parsed_text.get('<OCR>', "Error: Could not parse OCR output.")
+# --- 5. API ENDPOINTS ---
+@app.get("/", summary="Root Endpoint", description="Returns a welcome message.")
+def read_root():
+    return {"message": "Welcome to the Florence-2 OCR API. Go to /docs for usage."}
+@app.post("/ocr", summary="Extract Text from Image", description="Upload an image file to extract text. Supports both computer and handwritten text.")
+async def extract_text_from_image(file: UploadFile = File(..., description="Image file to process.")):
     """
+    Endpoint to perform OCR on an uploaded image.
     """
+    # Read image content from the uploaded file
     try:
         contents = await file.read()
         image = Image.open(io.BytesIO(contents))
+    except Exception:
+        raise HTTPException(status_code=400, detail="Invalid image file. Could not open image.")
+    # Run the OCR model
+    try:
+        extracted_text = run_florence2_ocr(image)
+        return {"filename": file.filename, "extracted_text": extracted_text}
+    except Exception as e:
+        print(f"Error during model inference: {e}")
+        raise HTTPException(status_code=500, detail=f"An error occurred during processing: {str(e)}")