Spaces:
Runtime error
Runtime error
Update ocr_agent.py
Browse files- ocr_agent.py +55 -109
ocr_agent.py
CHANGED
|
@@ -1,124 +1,70 @@
|
|
| 1 |
-
# ocr_agent.py
|
| 2 |
-
import re
|
| 3 |
-
import json
|
| 4 |
import os
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
import pdfplumber
|
| 8 |
-
except ImportError:
|
| 9 |
-
pdfplumber = None
|
| 10 |
-
|
| 11 |
-
try:
|
| 12 |
-
from pdf2image import convert_from_path
|
| 13 |
-
import pytesseract
|
| 14 |
-
except ImportError:
|
| 15 |
-
convert_from_path = None
|
| 16 |
-
pytesseract = None
|
| 17 |
-
|
| 18 |
|
| 19 |
class OcrAgent:
|
| 20 |
def __init__(self, data_dir="data"):
|
| 21 |
self.data_dir = data_dir
|
| 22 |
os.makedirs(self.data_dir, exist_ok=True)
|
| 23 |
|
| 24 |
-
def
|
| 25 |
-
"""Extract text using pdfplumber."""
|
| 26 |
-
text = ""
|
| 27 |
-
with pdfplumber.open(pdf_path) as pdf:
|
| 28 |
-
for page in pdf.pages:
|
| 29 |
-
text += page.extract_text() or ""
|
| 30 |
-
return text
|
| 31 |
-
|
| 32 |
-
def _extract_tesseract(self, pdf_path: str) -> str:
|
| 33 |
-
"""Fallback: OCR via pdf2image + Tesseract."""
|
| 34 |
-
if not (convert_from_path and pytesseract):
|
| 35 |
-
return ""
|
| 36 |
-
text = ""
|
| 37 |
-
pages = convert_from_path(pdf_path, dpi=300)
|
| 38 |
-
for page in pages:
|
| 39 |
-
text += pytesseract.image_to_string(page, lang="eng+msa") + "\n"
|
| 40 |
-
return text
|
| 41 |
-
|
| 42 |
-
def _clean_text(self, raw: str) -> str:
|
| 43 |
-
"""Remove watermarks, scanner marks, and noise."""
|
| 44 |
-
lines = []
|
| 45 |
-
for line in raw.splitlines():
|
| 46 |
-
l = line.strip()
|
| 47 |
-
if not l:
|
| 48 |
-
continue
|
| 49 |
-
# Remove watermarks
|
| 50 |
-
if "bmspm.net" in l.lower():
|
| 51 |
-
continue
|
| 52 |
-
if "camscanner" in l.lower():
|
| 53 |
-
continue
|
| 54 |
-
# Remove page numbers (single integers)
|
| 55 |
-
if re.match(r"^\d+$", l):
|
| 56 |
-
continue
|
| 57 |
-
lines.append(l)
|
| 58 |
-
return "\n".join(lines)
|
| 59 |
-
|
| 60 |
-
def extract_text(self, pdf_path: str) -> str:
|
| 61 |
-
"""Extract and clean text from PDF."""
|
| 62 |
-
text = ""
|
| 63 |
-
try:
|
| 64 |
-
if pdfplumber:
|
| 65 |
-
text = self._extract_pdfplumber(pdf_path)
|
| 66 |
-
if not text or len(text.strip()) < 120:
|
| 67 |
-
text = self._extract_tesseract(pdf_path)
|
| 68 |
-
except Exception:
|
| 69 |
-
if convert_from_path and pytesseract:
|
| 70 |
-
text = self._extract_tesseract(pdf_path)
|
| 71 |
-
return self._clean_text(text)
|
| 72 |
-
|
| 73 |
-
def parse_questions(self, cleaned_text: str, subject: str, year: str):
|
| 74 |
"""
|
| 75 |
-
|
| 76 |
-
|
| 77 |
"""
|
|
|
|
| 78 |
questions = []
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
"
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
| 107 |
return questions
|
| 108 |
|
| 109 |
-
def
|
| 110 |
-
"""
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import fitz # PyMuPDF
|
| 3 |
+
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
class OcrAgent:
    """Turn each page of a PDF into one HTML question block.

    Every page is treated as a single question: its text is split into a
    question stem plus A-D answer choices, and any images embedded in the
    page are written to ``data_dir`` as PNG files and referenced from the
    generated HTML.
    """

    # The question stem ends where the first answer-choice line begins.
    _CHOICE_BREAK = re.compile(r"\n[A-D]\.")
    # An answer choice is a whole line that starts with "A." .. "D.".
    # Anchoring at line start keeps a mid-sentence "C. " from being
    # mistaken for a choice.
    _CHOICE_LINE = re.compile(r"^[A-D]\.\s?.*", re.MULTILINE)

    def __init__(self, data_dir="data"):
        """Remember ``data_dir`` and create it if it does not exist yet."""
        self.data_dir = data_dir
        os.makedirs(self.data_dir, exist_ok=True)

    def extract_questions(self, pdf_path, subject, year):
        """Extract one formatted question block per page of a PDF.

        Args:
            pdf_path: Path of the PDF file to open.
            subject: Label used as the first part of saved image file names.
            year: Label used as the second part of saved image file names.

        Returns:
            A list of HTML strings, one per page, in page order. Questions
            are numbered from 1.
        """
        questions = []
        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_path) as doc:
            for q_num, page in enumerate(doc, start=1):
                text = page.get_text("text").strip()
                img_paths = self._save_page_images(
                    doc, page, f"{subject}_{year}_q{q_num}"
                )
                q_text, choices = self._split_question_text(text)
                questions.append(
                    self.format_question_block(
                        q_num, q_text, choices, img_paths
                    )
                )
        return questions

    def _save_page_images(self, doc, page, name_prefix):
        """Write every image on ``page`` to ``data_dir``; return the paths."""
        saved = []
        for idx, img in enumerate(page.get_images(full=True), start=1):
            pix = fitz.Pixmap(doc, img[0])  # img[0] is the image xref
            # PNG output only supports grayscale/RGB, so convert pixmaps
            # with more colour components (e.g. CMYK) to RGB first.
            if pix.n - pix.alpha > 3:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            target = os.path.join(self.data_dir, f"{name_prefix}_{idx}.png")
            pix.save(target)
            saved.append(target)
        return saved

    @classmethod
    def _split_question_text(cls, text):
        """Split page text into ``(question_stem, choice_lines)``."""
        stem = cls._CHOICE_BREAK.split(text, maxsplit=1)[0]
        if len(stem) == len(text):
            # No choice line found: the whole page text is the question.
            return text, []
        # Only look for choices after the stem so that nothing from the
        # question itself is reported as an answer option.
        return stem.strip(), cls._CHOICE_LINE.findall(text[len(stem):])

    def format_question_block(self, q_num, q_text, choices, img_paths):
        """Format one question as HTML with optional images and choices.

        Args:
            q_num: Question number shown in the ``Q<n>.`` heading.
            q_text: Plain-text question stem; HTML-escaped before use.
            choices: Plain-text answer choices; rendered as a ``<ul>`` list
                when non-empty, each one HTML-escaped.
            img_paths: Paths of images to embed, resolved relative to
                ``data_dir``.

        Returns:
            The HTML fragment for the question as a single string.
        """
        # Imported locally so the method carries its own dependency.
        import html

        parts = [f"<b>Q{q_num}.</b> {html.escape(q_text, quote=False)}<br>"]

        for img_path in img_paths:
            rel_path = os.path.relpath(img_path, self.data_dir)
            # NOTE(review): the "data/" URL prefix is fixed and does not
            # follow self.data_dir - confirm against how images are served.
            parts.append(
                f'<img src="data/{html.escape(rel_path)}" '
                f'style="max-width:400px;"><br>'
            )

        if choices:
            parts.append("<ul>")
            parts.extend(
                f"<li>{html.escape(choice, quote=False)}</li>"
                for choice in choices
            )
            parts.append("</ul>")

        return "".join(parts)
|
| 67 |
+
|
| 68 |
|
| 69 |
|
| 70 |
|