Benny-Tang committed on
Commit
dd23511
·
verified ·
1 Parent(s): 1662836

Update ocr_agent.py

Browse files
Files changed (1) hide show
  1. ocr_agent.py +41 -85
ocr_agent.py CHANGED
@@ -1,8 +1,8 @@
 
1
  import os
2
  import re
3
  import json
4
 
5
- # optional heavy deps
6
  try:
7
  import pdfplumber
8
  except Exception:
@@ -10,162 +10,118 @@ except Exception:
10
 
11
  try:
12
  from pdf2image import convert_from_path
13
- from PIL import Image
14
  import pytesseract
15
  except Exception:
16
  convert_from_path = None
17
  pytesseract = None
18
 
19
 
20
- def _normalize_choice_text(s: str) -> str:
21
- return s.strip()
22
-
23
-
24
- def _split_blocks_by_question_number(text: str):
25
- # split on lines starting with "1. " "2. " etc.
26
  parts = re.split(r"\n(?=\s*\d+\.)", text)
27
  return parts
28
 
29
 
30
- def _extract_choices_from_block(block: str):
31
- # tries to find A) B) C) D) style or A. B. C. D.
32
- # returns (question_text, [choices]) best-effort
33
- # normalize newlines into spaces inside each piece
34
- block = block.strip()
35
- # find where options start (search for "A)" or "A.")
36
  m = re.search(r"\bA[\)\.]\s*", block)
37
  if m:
38
  start = m.start()
39
  qtext = block[:start].strip()
40
- opts_text = block[start:].strip()
41
- # split by "A)", "B)", etc.
42
- items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts_text)
43
  choices = []
44
  for it in items:
45
  it = it.strip()
46
  if not it:
47
  continue
48
- # remove leading "A)"/"A."
49
- it2 = re.sub(r'^[A-D][\)\.]\s*', '', it)
50
- choices.append(_normalize_choice_text(it2))
51
- if len(choices) >= 2:
52
- return qtext, choices
53
 
54
- # fallback: maybe choices are on separate lines starting with "A. "
55
  lines = block.splitlines()
56
  q_lines = []
57
  choices = []
58
- choices_started = False
59
  for ln in lines:
60
  ln = ln.strip()
61
  if re.match(r'^[A-D][\)\.]\s*', ln):
62
- choices_started = True
63
- cl = re.sub(r'^[A-D][\)\.]\s*', '', ln)
64
- choices.append(_normalize_choice_text(cl))
65
  else:
66
- if not choices_started:
67
  q_lines.append(ln)
68
  else:
69
- # continuation of last choice?
70
  if choices:
71
  choices[-1] += " " + ln
72
-
73
- if choices:
74
- return " ".join(q_lines).strip(), choices
75
-
76
- # no choices found
77
- return block.strip(), []
78
 
79
 
80
  class OcrAgent:
81
  def __init__(self, tesseract_lang="eng"):
82
- self.tesseract_lang = tesseract_lang
83
 
84
- def _extract_text_pdfplumber(self, pdf_path: str) -> str:
85
  if pdfplumber is None:
86
- raise RuntimeError("pdfplumber not available")
87
  texts = []
88
- with pdfplumber.open(pdf_path) as pdf:
89
- for page in pdf.pages:
90
- t = page.extract_text() or ""
91
- texts.append(t)
92
  return "\n\n".join(texts)
93
 
94
- def _extract_text_tesseract(self, pdf_path: str) -> str:
95
  if convert_from_path is None or pytesseract is None:
96
- raise RuntimeError("pdf2image/pytesseract not available")
97
- images = convert_from_path(pdf_path, dpi=200)
98
  texts = []
99
  for img in images:
100
- t = pytesseract.image_to_string(img, lang=self.tesseract_lang)
101
- texts.append(t)
102
  return "\n\n".join(texts)
103
 
104
  def extract_text(self, pdf_path: str) -> str:
105
- # try pdfplumber first (best for digital PDFs)
106
  text = ""
107
  try:
108
  if pdfplumber:
109
- text = self._extract_text_pdfplumber(pdf_path)
110
- if not text or len(text.strip()) < 100:
111
- # fallback to tesseract OCR
112
- if convert_from_path and pytesseract:
113
- text = self._extract_text_tesseract(pdf_path)
114
  except Exception:
115
- # try fallback if any error
116
- if convert_from_path and pytesseract:
117
- text = self._extract_text_tesseract(pdf_path)
118
- else:
119
- raise
120
  return text
121
 
122
- def parse_questions_from_text(self, raw_text: str) -> list:
123
- # heuristic parser – splits by numbered questions and attempts to extract choices
124
- blocks = _split_blocks_by_question_number(raw_text)
125
  questions = []
126
  for blk in blocks:
127
  blk = blk.strip()
128
  if not blk:
129
  continue
130
- # remove leading number if present
131
  blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
132
- qtext, choices = _extract_choices_from_block(blk2)
133
  qtype = "mcq" if choices else "short_answer"
134
- questions.append({
135
- "text": qtext,
136
- "choices": choices,
137
- "question_type": qtype,
138
- "topics": [],
139
- "difficulty": 3
140
- })
141
  return questions
142
 
143
  def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
144
- """
145
- Extract questions from PDF and save:
146
- - data/spm_{year}_{subject_token}.json (list of question objects)
147
- - data/spm_{year}_{subject_token}_scheme.json (mapping "1": null, "2": null, ...)
148
- Returns (questions_path, scheme_path)
149
- """
150
  text = self.extract_text(pdf_path)
151
- questions = self.parse_questions_from_text(text)
152
 
153
- # filenames lower-case
154
  subject_token = subject_token.lower()
155
- q_filename = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
156
- scheme_filename = os.path.join(out_dir, f"spm_{year}_{subject_token}_scheme.json")
157
 
158
- # write questions list
159
  os.makedirs(out_dir, exist_ok=True)
160
- with open(q_filename, "w", encoding="utf-8") as f:
161
  json.dump(questions, f, indent=2, ensure_ascii=False)
162
 
163
- # create a scheme placeholder mapping by index (1-based) -> None
164
  scheme_map = {str(i + 1): None for i in range(len(questions))}
165
- with open(scheme_filename, "w", encoding="utf-8") as f:
166
  json.dump(scheme_map, f, indent=2, ensure_ascii=False)
167
 
168
- return q_filename, scheme_filename
 
169
 
170
 
171
 
 
1
+ # ocr_agent.py
2
  import os
3
  import re
4
  import json
5
 
 
6
  try:
7
  import pdfplumber
8
  except Exception:
 
10
 
11
  try:
12
  from pdf2image import convert_from_path
 
13
  import pytesseract
14
  except Exception:
15
  convert_from_path = None
16
  pytesseract = None
17
 
18
 
19
+ def _split_blocks_by_num(text):
 
 
 
 
 
20
  parts = re.split(r"\n(?=\s*\d+\.)", text)
21
  return parts
22
 
23
 
24
+ def _extract_choices(block):
 
 
 
 
 
25
  m = re.search(r"\bA[\)\.]\s*", block)
26
  if m:
27
  start = m.start()
28
  qtext = block[:start].strip()
29
+ opts = block[start:].strip()
30
+ items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts)
 
31
  choices = []
32
  for it in items:
33
  it = it.strip()
34
  if not it:
35
  continue
36
+ it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
37
+ choices.append(it2)
38
+ return qtext, choices
 
 
39
 
40
+ # fallback line by line
41
  lines = block.splitlines()
42
  q_lines = []
43
  choices = []
44
+ started = False
45
  for ln in lines:
46
  ln = ln.strip()
47
  if re.match(r'^[A-D][\)\.]\s*', ln):
48
+ started = True
49
+ cl = re.sub(r'^[A-D][\)\.]\s*', '', ln).strip()
50
+ choices.append(cl)
51
  else:
52
+ if not started:
53
  q_lines.append(ln)
54
  else:
 
55
  if choices:
56
  choices[-1] += " " + ln
57
+ return " ".join(q_lines).strip(), choices
 
 
 
 
 
58
 
59
 
60
class OcrAgent:
    """Extract exam questions from a PDF and store them as JSON files.

    Text is taken from the PDF's embedded text layer via pdfplumber when
    possible, with a Tesseract OCR pass as the fallback. Both backends are
    optional: the helpers return an empty string when the corresponding
    module-level name (``pdfplumber``, ``convert_from_path``,
    ``pytesseract``) is ``None``.
    """

    # Embedded text shorter than this (after stripping) is treated as
    # "probably a scanned PDF" and triggers the OCR fallback.
    MIN_TEXT_CHARS = 120

    def __init__(self, tesseract_lang="eng"):
        """Remember the language code passed to Tesseract as ``lang``."""
        self.lang = tesseract_lang

    def _extract_pdfplumber(self, path: str) -> str:
        """Return the embedded text of every page, or "" without pdfplumber."""
        if pdfplumber is None:
            return ""
        with pdfplumber.open(path) as pdf:
            # extract_text() may return None for a page; treat that as "".
            return "\n\n".join(page.extract_text() or "" for page in pdf.pages)

    def _extract_tesseract(self, path: str) -> str:
        """Rasterise the PDF at 200 dpi and OCR each page, or "" if unavailable."""
        if convert_from_path is None or pytesseract is None:
            return ""
        images = convert_from_path(path, dpi=200)
        return "\n\n".join(
            pytesseract.image_to_string(img, lang=self.lang) for img in images
        )

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of ``pdf_path``, preferring the embedded text layer.

        OCR runs only when pdfplumber fails or yields fewer than
        ``MIN_TEXT_CHARS`` characters. If OCR is unavailable or produces
        nothing, whatever pdfplumber did extract is returned instead of being
        discarded. Errors raised by the OCR pass itself propagate to the
        caller.
        """
        try:
            text = self._extract_pdfplumber(pdf_path)
        except Exception:
            # Best effort: a broken or unsupported text layer should not
            # prevent the OCR attempt.
            text = ""

        if len(text.strip()) >= self.MIN_TEXT_CHARS:
            return text

        # Single OCR attempt; it is expensive, so never retry it from an
        # exception handler.
        ocr_text = self._extract_tesseract(pdf_path)
        return ocr_text if ocr_text.strip() else text

    def parse_questions(self, raw_text: str):
        """Heuristically turn raw text into a list of question dicts.

        Each dict has the keys ``text``, ``choices``, ``question_type``
        ("mcq" when choices were found, otherwise "short_answer"), ``topics``
        (empty list) and ``difficulty`` (always 3).
        """
        # NOTE(review): text that precedes the first numbered question is also
        # returned as a block by the splitter, so a page header ends up as the
        # first entry — confirm whether callers expect that.
        questions = []
        for blk in _split_blocks_by_num(raw_text):
            blk = blk.strip()
            if not blk:
                continue
            # Drop the leading "12. " so only the question body is parsed.
            body = re.sub(r'^\s*\d+\.\s*', '', blk)
            qtext, choices = _extract_choices(body)
            qtype = "mcq" if choices else "short_answer"
            questions.append({
                "text": qtext,
                "choices": choices,
                "question_type": qtype,
                "topics": [],
                "difficulty": 3,
            })
        return questions

    def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
        """Extract questions from a PDF and save them under ``out_dir``.

        Writes two files (``subject_token`` is lower-cased first):

        - ``spm_{year}_{subject_token}.json``: the list of question dicts.
        - ``spm_{year}_{subject_token}_scheme.json``: a placeholder answer
          scheme mapping "1", "2", ... (1-based question index) to ``null``.

        ``out_dir`` is created if it does not exist.

        Returns:
            ``(questions_path, scheme_path)``.
        """
        text = self.extract_text(pdf_path)
        questions = self.parse_questions(text)

        subject_token = subject_token.lower()
        qfile = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
        scheme_file = os.path.join(out_dir, f"spm_{year}_{subject_token}_scheme.json")

        os.makedirs(out_dir, exist_ok=True)
        with open(qfile, "w", encoding="utf-8") as f:
            json.dump(questions, f, indent=2, ensure_ascii=False)

        scheme_map = {str(i + 1): None for i in range(len(questions))}
        with open(scheme_file, "w", encoding="utf-8") as f:
            json.dump(scheme_map, f, indent=2, ensure_ascii=False)

        return qfile, scheme_file
124
+
125
 
126
 
127