Benny-Tang committed on
Commit
b21249a
·
verified ·
1 Parent(s): 61f8c8e

Update ocr_agent.py

Browse files
Files changed (1) hide show
  1. ocr_agent.py +163 -23
ocr_agent.py CHANGED
@@ -1,32 +1,172 @@
1
- import fitz # PyMuPDF
2
- import json
3
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
 
6
  class OcrAgent:
7
- def extract_questions(self, pdf_path, output_path):
8
- doc = fitz.open(pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  questions = []
10
- qid = 1000
11
- for page in doc:
12
- text = page.get_text("text")
13
- for line in text.splitlines():
14
- if line.strip().endswith("?") or line.strip().startswith("Q"):
15
- questions.append(
16
- {
17
- "id": qid,
18
- "text": line.strip(),
19
- "choices": ["A", "B", "C", "D"],
20
- "topics": [],
21
- "correct_answer": None,
22
- }
23
- )
24
- qid += 1
25
-
26
- os.makedirs(os.path.dirname(output_path), exist_ok=True)
27
- with open(output_path, "w", encoding="utf-8") as f:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  json.dump(questions, f, indent=2, ensure_ascii=False)
29
- print(f"✅ Extracted {len(questions)} questions → {output_path}")
 
 
 
 
 
 
 
30
 
31
 
32
 
 
 
 
1
  import os
2
+ import re
3
+ import json
4
+
5
+ # optional heavy deps
6
+ try:
7
+ import pdfplumber
8
+ except Exception:
9
+ pdfplumber = None
10
+
11
+ try:
12
+ from pdf2image import convert_from_path
13
+ from PIL import Image
14
+ import pytesseract
15
+ except Exception:
16
+ convert_from_path = None
17
+ pytesseract = None
18
+
19
+
20
+ def _normalize_choice_text(s: str) -> str:
21
+ return s.strip()
22
+
23
+
24
+ def _split_blocks_by_question_number(text: str):
25
+ # split on lines starting with "1. " "2. " etc.
26
+ parts = re.split(r"\n(?=\s*\d+\.)", text)
27
+ return parts
28
+
29
+
30
def _extract_choices_from_block(block: str):
    """Split one question block into its stem and answer choices (best effort).

    Options are recognised by a marker made of a letter A-D followed by ")"
    or ".", e.g. "A) ..." or "B. ...". Two layouts are tried in turn:

    1. Everything from the first "A" marker onward is cut at each marker,
       which handles options written inline or on separate lines.
    2. If that yields fewer than two choices, the block is scanned line by
       line for lines that start with a marker; non-empty lines after the
       first option are appended to the preceding choice.

    Args:
        block: Text of one question with its leading number already removed.

    Returns:
        A ``(question_text, choices)`` tuple. ``choices`` is an empty list
        when no options are found, in which case ``question_text`` is the
        stripped block unchanged.
    """
    block = block.strip()

    # A marker must start a whitespace-delimited token. Without the lookbehind
    # the "A." in "U.S.A." or the "D." in "Ph.D." would be taken for an option.
    first = re.search(r"(?<!\S)A[).]\s*", block)
    if first:
        stem = block[:first.start()]
        # The options text begins with the "A" marker, so the split yields an
        # empty first piece; every later piece is the text of one choice.
        pieces = re.split(r"(?<!\S)[A-D][).]\s*", block[first.start():])[1:]
        choices = [_normalize_choice_text(piece) for piece in pieces]
        if len(choices) >= 2:
            # Collapse line breaks in the stem, as the line-based path does.
            return " ".join(stem.split()), choices

    # Fallback: options on their own lines (also covers a missing "A" marker).
    stem_lines = []
    choices = []
    for line in block.splitlines():
        line = line.strip()
        marker = re.match(r"[A-D][).]\s*", line)
        if marker:
            choices.append(_normalize_choice_text(line[marker.end():]))
        elif not choices:
            stem_lines.append(line)
        elif line:
            # Wrapped continuation of the previous choice; blank lines are
            # skipped so they do not add trailing spaces.
            choices[-1] = (choices[-1] + " " + line).strip()

    if choices:
        return " ".join(" ".join(stem_lines).split()), choices

    # No options found: return the block as a question without choices.
    return block, []
78
 
79
 
80
class OcrAgent:
    """Extract exam questions from a PDF and save them as JSON files.

    Text is read with pdfplumber when that module was imported successfully.
    When it yields little or no text, Tesseract OCR (pdf2image + pytesseract)
    is used instead. Both backends are optional module-level imports that are
    ``None`` when unavailable.
    """

    # pdfplumber output shorter than this (after stripping) triggers OCR.
    _MIN_TEXT_CHARS = 100

    # Leading question number such as "7." but not the decimal "7.5".
    _QUESTION_NUMBER = re.compile(r"\s*(\d+)\.(?!\d)\s*")

    def __init__(self, tesseract_lang="eng"):
        """Store the language code passed to Tesseract as ``lang``."""
        self.tesseract_lang = tesseract_lang

    def _extract_text_pdfplumber(self, pdf_path: str) -> str:
        """Return the text of every page, joined by blank lines.

        Pages for which pdfplumber returns no text contribute an empty string.

        Raises:
            RuntimeError: If pdfplumber is not available.
        """
        if pdfplumber is None:
            raise RuntimeError("pdfplumber not available")
        with pdfplumber.open(pdf_path) as pdf:
            return "\n\n".join(page.extract_text() or "" for page in pdf.pages)

    def _extract_text_tesseract(self, pdf_path: str) -> str:
        """Render each page to an image at 200 dpi and OCR it with Tesseract.

        Returns:
            The recognised text of all pages, joined by blank lines.

        Raises:
            RuntimeError: If pdf2image or pytesseract is not available.
        """
        if convert_from_path is None or pytesseract is None:
            raise RuntimeError("pdf2image/pytesseract not available")
        images = convert_from_path(pdf_path, dpi=200)
        return "\n\n".join(
            pytesseract.image_to_string(img, lang=self.tesseract_lang)
            for img in images
        )

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of a PDF, preferring pdfplumber over OCR.

        pdfplumber is tried first. OCR runs when pdfplumber is unavailable,
        fails, or returns fewer than ``_MIN_TEXT_CHARS`` stripped characters.
        When OCR is not installed, whatever pdfplumber produced is returned.

        Raises:
            RuntimeError: If neither backend is installed, so the caller does
                not mistake a missing dependency for an empty document.
            Exception: Whatever pdfplumber raised when OCR is not installed,
                or whatever the OCR pass raised.
        """
        ocr_available = convert_from_path is not None and pytesseract is not None
        if pdfplumber is None and not ocr_available:
            raise RuntimeError(
                "no PDF text backend available: "
                "install pdfplumber or pdf2image + pytesseract"
            )

        text = ""
        if pdfplumber is not None:
            try:
                text = self._extract_text_pdfplumber(pdf_path)
            except Exception:
                # Any pdfplumber failure is tolerated only if OCR can take over.
                if not ocr_available:
                    raise

        # OCR sits outside the try block so a failing OCR pass is reported
        # once instead of being retried by the error handler.
        if len(text.strip()) < self._MIN_TEXT_CHARS and ocr_available:
            text = self._extract_text_tesseract(pdf_path)
        return text

    @classmethod
    def _drop_preamble(cls, blocks: list) -> list:
        """Drop leading text that comes before question number 1.

        The first block is removed only when it has no question number and
        the block after it is numbered 1. Otherwise the list is returned
        unchanged, so an unnumbered first question followed by "2." is kept.
        """
        if len(blocks) < 2 or cls._QUESTION_NUMBER.match(blocks[0]):
            return blocks
        following = cls._QUESTION_NUMBER.match(blocks[1])
        if following and int(following.group(1)) == 1:
            return blocks[1:]
        return blocks

    def parse_questions_from_text(self, raw_text: str) -> list:
        """Parse raw text into question dicts using numbering heuristics.

        The text is split at numbered lines, text ahead of question 1 is
        discarded, and each remaining block is split into stem and choices.

        Args:
            raw_text: Text of the whole document; ``None`` or "" gives ``[]``.

        Returns:
            A list of dicts with the keys ``text``, ``choices``,
            ``question_type`` ("mcq" when choices were found, otherwise
            "short_answer"), ``topics`` (empty list) and ``difficulty`` (3).
        """
        pieces = (
            piece.strip()
            for piece in _split_blocks_by_question_number(raw_text or "")
        )
        # Removing the preamble keeps list positions aligned with the printed
        # question numbers, which the scheme file is keyed by.
        blocks = self._drop_preamble([piece for piece in pieces if piece])

        questions = []
        for blk in blocks:
            # Remove the leading question number if present.
            number = self._QUESTION_NUMBER.match(blk)
            body = blk[number.end():] if number else blk
            qtext, choices = _extract_choices_from_block(body)
            questions.append({
                "text": qtext,
                "choices": choices,
                "question_type": "mcq" if choices else "short_answer",
                "topics": [],
                "difficulty": 3,
            })
        return questions

    def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
        """Extract questions from a PDF and write two JSON files.

        Files written (``subject_token`` is lower-cased in the names):
          - ``{out_dir}/spm_{year}_{subject_token}.json``: list of question
            objects.
          - ``{out_dir}/spm_{year}_{subject_token}_scheme.json``: placeholder
            mapping of 1-based question index to null ("1": null, ...).

        Args:
            pdf_path: Path of the PDF to read.
            year: Year used in the file names.
            subject_token: Subject identifier used in the file names.
            out_dir: Target directory, created if missing. An empty string
                writes to the current directory.

        Returns:
            ``(questions_path, scheme_path)``.
        """
        text = self.extract_text(pdf_path)
        questions = self.parse_questions_from_text(text)

        subject_token = subject_token.lower()
        q_filename = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
        scheme_filename = os.path.join(out_dir, f"spm_{year}_{subject_token}_scheme.json")

        # os.makedirs("") raises, so an empty out_dir needs no directory call.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(q_filename, "w", encoding="utf-8") as f:
            json.dump(questions, f, indent=2, ensure_ascii=False)

        # Placeholder answer scheme keyed by 1-based question index.
        scheme_map = {str(number): None for number in range(1, len(questions) + 1)}
        with open(scheme_filename, "w", encoding="utf-8") as f:
            json.dump(scheme_map, f, indent=2, ensure_ascii=False)

        return q_filename, scheme_filename
169
+
170
 
171
 
172