Spaces:
Runtime error
Runtime error
Update ocr_agent.py
Browse files- ocr_agent.py +88 -91
ocr_agent.py
CHANGED
|
@@ -1,88 +1,64 @@
|
|
| 1 |
# ocr_agent.py
|
| 2 |
-
import os
|
| 3 |
import re
|
| 4 |
import json
|
|
|
|
| 5 |
|
| 6 |
try:
|
| 7 |
import pdfplumber
|
| 8 |
-
except
|
| 9 |
pdfplumber = None
|
| 10 |
|
| 11 |
try:
|
| 12 |
from pdf2image import convert_from_path
|
| 13 |
import pytesseract
|
| 14 |
-
except
|
| 15 |
convert_from_path = None
|
| 16 |
pytesseract = None
|
| 17 |
|
| 18 |
|
| 19 |
-
def _split_blocks_by_number(text: str):
|
| 20 |
-
# split on lines starting with "1. " or similar
|
| 21 |
-
return re.split(r"\n(?=\s*\d+\.)", text)
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def _extract_choices_from_block(block: str):
|
| 25 |
-
block = block.strip()
|
| 26 |
-
# look for A) or A. markers
|
| 27 |
-
m = re.search(r"\bA[\)\.]\s*", block)
|
| 28 |
-
if m:
|
| 29 |
-
start = m.start()
|
| 30 |
-
qtext = block[:start].strip()
|
| 31 |
-
opts_text = block[start:].strip()
|
| 32 |
-
items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts_text)
|
| 33 |
-
choices = []
|
| 34 |
-
for it in items:
|
| 35 |
-
it = it.strip()
|
| 36 |
-
if not it:
|
| 37 |
-
continue
|
| 38 |
-
it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
|
| 39 |
-
choices.append(it2)
|
| 40 |
-
if choices:
|
| 41 |
-
return qtext, choices
|
| 42 |
-
|
| 43 |
-
# fallback: lines style
|
| 44 |
-
lines = block.splitlines()
|
| 45 |
-
q_lines = []
|
| 46 |
-
choices = []
|
| 47 |
-
started = False
|
| 48 |
-
for ln in lines:
|
| 49 |
-
ln = ln.strip()
|
| 50 |
-
if re.match(r'^[A-D][\)\.]\s*', ln):
|
| 51 |
-
started = True
|
| 52 |
-
cl = re.sub(r'^[A-D][\)\.]\s*', '', ln).strip()
|
| 53 |
-
choices.append(cl)
|
| 54 |
-
else:
|
| 55 |
-
if not started:
|
| 56 |
-
q_lines.append(ln)
|
| 57 |
-
else:
|
| 58 |
-
if choices:
|
| 59 |
-
choices[-1] += " " + ln
|
| 60 |
-
return " ".join(q_lines).strip(), choices
|
| 61 |
-
|
| 62 |
-
|
| 63 |
class OcrAgent:
|
| 64 |
-
def __init__(self,
|
| 65 |
-
self.
|
|
|
|
| 66 |
|
| 67 |
-
def _extract_pdfplumber(self,
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
if convert_from_path
|
| 78 |
return ""
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
for
|
| 82 |
-
|
| 83 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
def extract_text(self, pdf_path: str) -> str:
|
|
|
|
| 86 |
text = ""
|
| 87 |
try:
|
| 88 |
if pdfplumber:
|
|
@@ -90,39 +66,60 @@ class OcrAgent:
|
|
| 90 |
if not text or len(text.strip()) < 120:
|
| 91 |
text = self._extract_tesseract(pdf_path)
|
| 92 |
except Exception:
|
| 93 |
-
|
| 94 |
-
|
|
|
|
| 95 |
|
| 96 |
-
def
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
| 98 |
questions = []
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
| 102 |
continue
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
return questions
|
| 108 |
|
| 109 |
-
def
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
os.makedirs(out_dir, exist_ok=True)
|
| 118 |
-
with open(qfile, "w", encoding="utf-8") as f:
|
| 119 |
-
json.dump(questions, f, indent=2, ensure_ascii=False)
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
| 124 |
|
| 125 |
-
return qfile, scheme_file
|
| 126 |
|
| 127 |
|
| 128 |
|
|
|
|
| 1 |
# ocr_agent.py
|
|
|
|
| 2 |
import re
|
| 3 |
import json
|
| 4 |
+
import os
|
| 5 |
|
| 6 |
try:
|
| 7 |
import pdfplumber
|
| 8 |
+
except ImportError:
|
| 9 |
pdfplumber = None
|
| 10 |
|
| 11 |
try:
|
| 12 |
from pdf2image import convert_from_path
|
| 13 |
import pytesseract
|
| 14 |
+
except ImportError:
|
| 15 |
convert_from_path = None
|
| 16 |
pytesseract = None
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
class OcrAgent:
|
| 20 |
+
def __init__(self, data_dir="data"):
|
| 21 |
+
self.data_dir = data_dir
|
| 22 |
+
os.makedirs(self.data_dir, exist_ok=True)
|
| 23 |
|
| 24 |
+
def _extract_pdfplumber(self, pdf_path: str) -> str:
    """Extract the text of every page with pdfplumber.

    Page texts are joined with a newline so that the last line of one page
    cannot fuse with the first line of the next; downstream cleaning and
    question splitting operate on line boundaries.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, one page after another, separated by
        newlines. Pages that yield no text contribute an empty string.

    Note:
        Uses the module-level ``pdfplumber`` name, which is ``None`` when the
        optional import failed; callers must check before calling.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` covers pages for which extract_text() gives a falsy result.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
|
| 31 |
+
|
| 32 |
+
def _extract_tesseract(
    self, pdf_path: str, *, dpi: int = 300, lang: str = "eng+msa"
) -> str:
    """Fallback extraction: rasterise the PDF and OCR each page image.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Resolution passed to ``convert_from_path`` when rendering pages.
        lang: Language specification passed to
            ``pytesseract.image_to_string``.

    Returns:
        The recognised text with a newline appended after every page, or an
        empty string when pdf2image / pytesseract are not available.
    """
    # Both optional imports are set to None together when either is missing.
    if not (convert_from_path and pytesseract):
        return ""
    page_images = convert_from_path(pdf_path, dpi=dpi)
    return "".join(
        pytesseract.image_to_string(image, lang=lang) + "\n"
        for image in page_images
    )
|
| 41 |
+
|
| 42 |
+
def _clean_text(self, raw: str) -> str:
|
| 43 |
+
"""Remove watermarks, scanner marks, and noise."""
|
| 44 |
+
lines = []
|
| 45 |
+
for line in raw.splitlines():
|
| 46 |
+
l = line.strip()
|
| 47 |
+
if not l:
|
| 48 |
+
continue
|
| 49 |
+
# Remove watermarks
|
| 50 |
+
if "bmspm.net" in l.lower():
|
| 51 |
+
continue
|
| 52 |
+
if "camscanner" in l.lower():
|
| 53 |
+
continue
|
| 54 |
+
# Remove page numbers (single integers)
|
| 55 |
+
if re.match(r"^\d+$", l):
|
| 56 |
+
continue
|
| 57 |
+
lines.append(l)
|
| 58 |
+
return "\n".join(lines)
|
| 59 |
|
| 60 |
def extract_text(self, pdf_path: str) -> str:
|
| 61 |
+
"""Extract and clean text from PDF."""
|
| 62 |
text = ""
|
| 63 |
try:
|
| 64 |
if pdfplumber:
|
|
|
|
| 66 |
if not text or len(text.strip()) < 120:
|
| 67 |
text = self._extract_tesseract(pdf_path)
|
| 68 |
except Exception:
|
| 69 |
+
if convert_from_path and pytesseract:
|
| 70 |
+
text = self._extract_tesseract(pdf_path)
|
| 71 |
+
return self._clean_text(text)
|
| 72 |
|
| 73 |
+
def parse_questions(self, cleaned_text: str, subject: str, year: str):
    """Convert cleaned text into a list of structured question dicts.

    The text is split at every newline followed by ``<digits>.``. Within a
    block, lines starting with ``A)``-``D)`` open a choice. Any other line
    is treated as a wrapped continuation: it extends the question text
    while no choice has been seen yet, and the most recent choice
    afterwards, so multi-line stems and choices are not lost.

    This is still a naive, line-based heuristic.

    Args:
        cleaned_text: Text to parse, one logical line per physical line.
        subject: Stored verbatim under the ``"subject"`` key.
        year: Stored verbatim under the ``"year"`` key.

    Returns:
        A list of dicts with the keys ``id`` (sequential, starting at 1000),
        ``text``, ``choices`` (markers kept), ``topics`` (always empty),
        ``source``, ``subject`` and ``year``.
    """
    questions = []
    q_id = 1000
    for block in re.split(r"\n(?=\d+\.)", cleaned_text):
        stripped = (part.strip() for part in block.strip().split("\n"))
        lines = [part for part in stripped if part]
        if not lines:
            continue

        # The first line always belongs to the question text.
        stem = [lines[0]]
        choices = []
        for line in lines[1:]:
            if re.match(r"^[A-D]\)", line):
                choices.append(line)
            elif choices:
                # Wrapped remainder of the choice opened last.
                choices[-1] += " " + line
            else:
                # Wrapped remainder of the question text.
                stem.append(line)

        questions.append({
            "id": q_id,
            "text": " ".join(stem),
            "choices": choices,
            "topics": [],
            "source": "pastpaper",
            "subject": subject,
            "year": year,
        })
        q_id += 1
    return questions
|
| 108 |
|
| 109 |
+
def save_questions(self, questions, subject: str, year: str):
|
| 110 |
+
"""Save questions into a JSON file like spm_2018_bm.json."""
|
| 111 |
+
filename = f"spm_{year}_{subject.lower()}.json"
|
| 112 |
+
out_path = os.path.join(self.data_dir, filename)
|
| 113 |
+
with open(out_path, "w", encoding="utf-8") as f:
|
| 114 |
+
json.dump(questions, f, ensure_ascii=False, indent=2)
|
| 115 |
+
return out_path
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
+
def process_pdf(self, pdf_path: str, subject: str, year: str):
    """Run the whole pipeline for one PDF and return the output path.

    Calls ``extract_text``, feeds its result to ``parse_questions`` and
    hands the parsed questions to ``save_questions``.

    Args:
        pdf_path: Path of the PDF file to process.
        subject: Forwarded to parsing and saving.
        year: Forwarded to parsing and saving.

    Returns:
        Whatever ``save_questions`` returns (the path of the JSON file).
    """
    return self.save_questions(
        self.parse_questions(self.extract_text(pdf_path), subject, year),
        subject,
        year,
    )
|
| 122 |
|
|
|
|
| 123 |
|
| 124 |
|
| 125 |
|