Benny-Tang committed on
Commit
855ae47
·
verified ·
1 Parent(s): 346cec5

Update ocr_agent.py

Browse files
Files changed (1) hide show
  1. ocr_agent.py +17 -14
ocr_agent.py CHANGED
@@ -16,18 +16,20 @@ except Exception:
16
  pytesseract = None
17
 
18
 
19
- def _split_blocks_by_num(text):
20
- parts = re.split(r"\n(?=\s*\d+\.)", text)
21
- return parts
22
 
23
 
24
- def _extract_choices(block):
 
 
25
  m = re.search(r"\bA[\)\.]\s*", block)
26
  if m:
27
  start = m.start()
28
  qtext = block[:start].strip()
29
- opts = block[start:].strip()
30
- items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts)
31
  choices = []
32
  for it in items:
33
  it = it.strip()
@@ -35,9 +37,10 @@ def _extract_choices(block):
35
  continue
36
  it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
37
  choices.append(it2)
38
- return qtext, choices
 
39
 
40
- # fallback line by line
41
  lines = block.splitlines()
42
  q_lines = []
43
  choices = []
@@ -85,28 +88,27 @@ class OcrAgent:
85
  if pdfplumber:
86
  text = self._extract_pdfplumber(pdf_path)
87
  if not text or len(text.strip()) < 120:
88
- # fallback
89
  text = self._extract_tesseract(pdf_path)
90
  except Exception:
91
- text = self._extract_tesseract(pdf_path) if convert_from_path and pytesseract else ""
92
  return text
93
 
94
- def parse_questions(self, raw_text: str):
95
- blocks = _split_blocks_by_num(raw_text)
96
  questions = []
97
  for blk in blocks:
98
  blk = blk.strip()
99
  if not blk:
100
  continue
101
  blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
102
- qtext, choices = _extract_choices(blk2)
103
  qtype = "mcq" if choices else "short_answer"
104
  questions.append({"text": qtext, "choices": choices, "question_type": qtype, "topics": [], "difficulty": 3})
105
  return questions
106
 
107
  def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
108
  text = self.extract_text(pdf_path)
109
- questions = self.parse_questions(text)
110
 
111
  subject_token = subject_token.lower()
112
  qfile = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
@@ -126,3 +128,4 @@ class OcrAgent:
126
 
127
 
128
 
 
 
16
  pytesseract = None
17
 
18
 
19
+ def _split_blocks_by_number(text: str):
20
+ # split on lines starting with "1. " or similar
21
+ return re.split(r"\n(?=\s*\d+\.)", text)
22
 
23
 
24
+ def _extract_choices_from_block(block: str):
25
+ block = block.strip()
26
+ # look for A) or A. markers
27
  m = re.search(r"\bA[\)\.]\s*", block)
28
  if m:
29
  start = m.start()
30
  qtext = block[:start].strip()
31
+ opts_text = block[start:].strip()
32
+ items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts_text)
33
  choices = []
34
  for it in items:
35
  it = it.strip()
 
37
  continue
38
  it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
39
  choices.append(it2)
40
+ if choices:
41
+ return qtext, choices
42
 
43
+ # fallback: lines style
44
  lines = block.splitlines()
45
  q_lines = []
46
  choices = []
 
88
  if pdfplumber:
89
  text = self._extract_pdfplumber(pdf_path)
90
  if not text or len(text.strip()) < 120:
 
91
  text = self._extract_tesseract(pdf_path)
92
  except Exception:
93
+ text = self._extract_tesseract(pdf_path) if (convert_from_path and pytesseract) else ""
94
  return text
95
 
96
+ def parse_questions_from_text(self, raw_text: str):
97
+ blocks = _split_blocks_by_number(raw_text)
98
  questions = []
99
  for blk in blocks:
100
  blk = blk.strip()
101
  if not blk:
102
  continue
103
  blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
104
+ qtext, choices = _extract_choices_from_block(blk2)
105
  qtype = "mcq" if choices else "short_answer"
106
  questions.append({"text": qtext, "choices": choices, "question_type": qtype, "topics": [], "difficulty": 3})
107
  return questions
108
 
109
  def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
110
  text = self.extract_text(pdf_path)
111
+ questions = self.parse_questions_from_text(text)
112
 
113
  subject_token = subject_token.lower()
114
  qfile = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
 
128
 
129
 
130
 
131
+