Benny-Tang committed on
Commit
e14bed5
·
verified ·
1 Parent(s): 25efed5

Update ocr_agent.py

Browse files
Files changed (1) hide show
  1. ocr_agent.py +55 -109
ocr_agent.py CHANGED
@@ -1,124 +1,70 @@
1
- # ocr_agent.py
2
- import re
3
- import json
4
  import os
5
-
6
- try:
7
- import pdfplumber
8
- except ImportError:
9
- pdfplumber = None
10
-
11
- try:
12
- from pdf2image import convert_from_path
13
- import pytesseract
14
- except ImportError:
15
- convert_from_path = None
16
- pytesseract = None
17
-
18
 
19
  class OcrAgent:
20
  def __init__(self, data_dir="data"):
21
  self.data_dir = data_dir
22
  os.makedirs(self.data_dir, exist_ok=True)
23
 
24
- def _extract_pdfplumber(self, pdf_path: str) -> str:
25
- """Extract text using pdfplumber."""
26
- text = ""
27
- with pdfplumber.open(pdf_path) as pdf:
28
- for page in pdf.pages:
29
- text += page.extract_text() or ""
30
- return text
31
-
32
- def _extract_tesseract(self, pdf_path: str) -> str:
33
- """Fallback: OCR via pdf2image + Tesseract."""
34
- if not (convert_from_path and pytesseract):
35
- return ""
36
- text = ""
37
- pages = convert_from_path(pdf_path, dpi=300)
38
- for page in pages:
39
- text += pytesseract.image_to_string(page, lang="eng+msa") + "\n"
40
- return text
41
-
42
- def _clean_text(self, raw: str) -> str:
43
- """Remove watermarks, scanner marks, and noise."""
44
- lines = []
45
- for line in raw.splitlines():
46
- l = line.strip()
47
- if not l:
48
- continue
49
- # Remove watermarks
50
- if "bmspm.net" in l.lower():
51
- continue
52
- if "camscanner" in l.lower():
53
- continue
54
- # Remove page numbers (single integers)
55
- if re.match(r"^\d+$", l):
56
- continue
57
- lines.append(l)
58
- return "\n".join(lines)
59
-
60
- def extract_text(self, pdf_path: str) -> str:
61
- """Extract and clean text from PDF."""
62
- text = ""
63
- try:
64
- if pdfplumber:
65
- text = self._extract_pdfplumber(pdf_path)
66
- if not text or len(text.strip()) < 120:
67
- text = self._extract_tesseract(pdf_path)
68
- except Exception:
69
- if convert_from_path and pytesseract:
70
- text = self._extract_tesseract(pdf_path)
71
- return self._clean_text(text)
72
-
73
- def parse_questions(self, cleaned_text: str, subject: str, year: str):
74
  """
75
- Convert extracted text into structured question JSON.
76
- Very naive parsing for now.
77
  """
 
78
  questions = []
79
- blocks = re.split(r"\n(?=\d+\.)", cleaned_text)
80
-
81
- q_id = 1000
82
- for block in blocks:
83
- block = block.strip()
84
- if not block:
85
- continue
86
- # First line is question
87
- parts = block.split("\n")
88
- q_text = parts[0]
89
-
90
- # Remaining lines treated as choices
91
- choices = []
92
- for c in parts[1:]:
93
- c = c.strip()
94
- if re.match(r"^[A-D]\)", c):
95
- choices.append(c)
96
-
97
- questions.append({
98
- "id": q_id,
99
- "text": q_text,
100
- "choices": choices,
101
- "topics": [],
102
- "source": "pastpaper",
103
- "subject": subject,
104
- "year": year
105
- })
106
- q_id += 1
 
 
 
107
  return questions
108
 
109
- def save_questions(self, questions, subject: str, year: str):
110
- """Save questions into a JSON file like spm_2018_bm.json."""
111
- filename = f"spm_{year}_{subject.lower()}.json"
112
- out_path = os.path.join(self.data_dir, filename)
113
- with open(out_path, "w", encoding="utf-8") as f:
114
- json.dump(questions, f, ensure_ascii=False, indent=2)
115
- return out_path
116
-
117
- def process_pdf(self, pdf_path: str, subject: str, year: str):
118
- """Main pipeline: extract → clean → parse → save."""
119
- raw_text = self.extract_text(pdf_path)
120
- questions = self.parse_questions(raw_text, subject, year)
121
- return self.save_questions(questions, subject, year)
 
 
 
 
 
122
 
123
 
124
 
 
 
 
 
1
  import os
2
+ import fitz # PyMuPDF
3
+ import re
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  class OcrAgent:
6
  def __init__(self, data_dir="data"):
7
  self.data_dir = data_dir
8
  os.makedirs(self.data_dir, exist_ok=True)
9
 
10
+ def extract_questions(self, pdf_path, subject, year):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  """
12
+ Extracts questions from PDF with text + visual support.
13
+ Returns a list of formatted question blocks (HTML with text + images).
14
  """
15
+ doc = fitz.open(pdf_path)
16
  questions = []
17
+ q_counter = 1
18
+
19
+ for page_num, page in enumerate(doc, start=1):
20
+ text = page.get_text("text").strip()
21
+ images = page.get_images(full=True)
22
+
23
+ # Save images if present
24
+ img_paths = []
25
+ for i, img in enumerate(images, start=1):
26
+ xref = img[0]
27
+ pix = fitz.Pixmap(doc, xref)
28
+ img_filename = f"{subject}_{year}_q{q_counter}_{i}.png"
29
+ img_path = os.path.join(self.data_dir, img_filename)
30
+ pix.save(img_path)
31
+ img_paths.append(img_path)
32
+
33
+ # Split text into question + choices
34
+ match = re.split(r"\n[A-D]\.", text)
35
+ if len(match) > 1:
36
+ q_text = match[0].strip()
37
+ choices = re.findall(r"[A-D]\.\s?.*", text)
38
+ else:
39
+ q_text = text
40
+ choices = []
41
+
42
+ formatted = self.format_question_block(
43
+ q_counter, q_text, choices, img_paths
44
+ )
45
+ questions.append(formatted)
46
+ q_counter += 1
47
+
48
  return questions
49
 
50
+ def format_question_block(self, q_num, q_text, choices, img_paths):
51
+ """
52
+ Format one question into HTML with optional images and choices.
53
+ """
54
+ block = f"<b>Q{q_num}.</b> {q_text}<br>"
55
+
56
+ for img_path in img_paths:
57
+ rel_path = os.path.relpath(img_path, self.data_dir)
58
+ block += f'<img src="data/{rel_path}" style="max-width:400px;"><br>'
59
+
60
+ if choices:
61
+ block += "<ul>"
62
+ for choice in choices:
63
+ block += f"<li>{choice}</li>"
64
+ block += "</ul>"
65
+
66
+ return block
67
+
68
 
69
 
70