Benny-Tang committed on
Commit
25efed5
·
verified ·
1 Parent(s): 9eec4e7

Update ocr_agent.py

Browse files
Files changed (1) hide show
  1. ocr_agent.py +88 -91
ocr_agent.py CHANGED
@@ -1,88 +1,64 @@
1
  # ocr_agent.py
2
- import os
3
  import re
4
  import json
 
5
 
6
  try:
7
  import pdfplumber
8
- except Exception:
9
  pdfplumber = None
10
 
11
  try:
12
  from pdf2image import convert_from_path
13
  import pytesseract
14
- except Exception:
15
  convert_from_path = None
16
  pytesseract = None
17
 
18
 
19
- def _split_blocks_by_number(text: str):
20
- # split on lines starting with "1. " or similar
21
- return re.split(r"\n(?=\s*\d+\.)", text)
22
-
23
-
24
- def _extract_choices_from_block(block: str):
25
- block = block.strip()
26
- # look for A) or A. markers
27
- m = re.search(r"\bA[\)\.]\s*", block)
28
- if m:
29
- start = m.start()
30
- qtext = block[:start].strip()
31
- opts_text = block[start:].strip()
32
- items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts_text)
33
- choices = []
34
- for it in items:
35
- it = it.strip()
36
- if not it:
37
- continue
38
- it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
39
- choices.append(it2)
40
- if choices:
41
- return qtext, choices
42
-
43
- # fallback: lines style
44
- lines = block.splitlines()
45
- q_lines = []
46
- choices = []
47
- started = False
48
- for ln in lines:
49
- ln = ln.strip()
50
- if re.match(r'^[A-D][\)\.]\s*', ln):
51
- started = True
52
- cl = re.sub(r'^[A-D][\)\.]\s*', '', ln).strip()
53
- choices.append(cl)
54
- else:
55
- if not started:
56
- q_lines.append(ln)
57
- else:
58
- if choices:
59
- choices[-1] += " " + ln
60
- return " ".join(q_lines).strip(), choices
61
-
62
-
63
  class OcrAgent:
64
- def __init__(self, tesseract_lang="eng"):
65
- self.lang = tesseract_lang
 
66
 
67
- def _extract_pdfplumber(self, path: str) -> str:
68
- if pdfplumber is None:
69
- return ""
70
- texts = []
71
- with pdfplumber.open(path) as pdf:
72
- for p in pdf.pages:
73
- texts.append(p.extract_text() or "")
74
- return "\n\n".join(texts)
75
-
76
- def _extract_tesseract(self, path: str) -> str:
77
- if convert_from_path is None or pytesseract is None:
78
  return ""
79
- images = convert_from_path(path, dpi=200)
80
- texts = []
81
- for img in images:
82
- texts.append(pytesseract.image_to_string(img, lang=self.lang))
83
- return "\n\n".join(texts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  def extract_text(self, pdf_path: str) -> str:
 
86
  text = ""
87
  try:
88
  if pdfplumber:
@@ -90,39 +66,60 @@ class OcrAgent:
90
  if not text or len(text.strip()) < 120:
91
  text = self._extract_tesseract(pdf_path)
92
  except Exception:
93
- text = self._extract_tesseract(pdf_path) if (convert_from_path and pytesseract) else ""
94
- return text
 
95
 
96
- def parse_questions_from_text(self, raw_text: str):
97
- blocks = _split_blocks_by_number(raw_text)
 
 
 
98
  questions = []
99
- for blk in blocks:
100
- blk = blk.strip()
101
- if not blk:
 
 
 
102
  continue
103
- blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
104
- qtext, choices = _extract_choices_from_block(blk2)
105
- qtype = "mcq" if choices else "short_answer"
106
- questions.append({"text": qtext, "choices": choices, "question_type": qtype, "topics": [], "difficulty": 3})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  return questions
108
 
109
- def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
110
- text = self.extract_text(pdf_path)
111
- questions = self.parse_questions_from_text(text)
112
-
113
- subject_token = subject_token.lower()
114
- qfile = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
115
- scheme_file = os.path.join(out_dir, f"spm_{year}_{subject_token}_scheme.json")
116
-
117
- os.makedirs(out_dir, exist_ok=True)
118
- with open(qfile, "w", encoding="utf-8") as f:
119
- json.dump(questions, f, indent=2, ensure_ascii=False)
120
 
121
- scheme_map = {str(i + 1): None for i in range(len(questions))}
122
- with open(scheme_file, "w", encoding="utf-8") as f:
123
- json.dump(scheme_map, f, indent=2, ensure_ascii=False)
 
 
124
 
125
- return qfile, scheme_file
126
 
127
 
128
 
 
1
  # ocr_agent.py
 
2
  import re
3
  import json
4
+ import os
5
 
6
  try:
7
  import pdfplumber
8
+ except ImportError:
9
  pdfplumber = None
10
 
11
  try:
12
  from pdf2image import convert_from_path
13
  import pytesseract
14
+ except ImportError:
15
  convert_from_path = None
16
  pytesseract = None
17
 
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  class OcrAgent:
20
+ def __init__(self, data_dir="data"):
21
+ self.data_dir = data_dir
22
+ os.makedirs(self.data_dir, exist_ok=True)
23
 
24
+ def _extract_pdfplumber(self, pdf_path: str) -> str:
25
+ """Extract text using pdfplumber."""
26
+ text = ""
27
+ with pdfplumber.open(pdf_path) as pdf:
28
+ for page in pdf.pages:
29
+ text += page.extract_text() or ""
30
+ return text
31
+
32
+ def _extract_tesseract(self, pdf_path: str) -> str:
33
+ """Fallback: OCR via pdf2image + Tesseract."""
34
+ if not (convert_from_path and pytesseract):
35
  return ""
36
+ text = ""
37
+ pages = convert_from_path(pdf_path, dpi=300)
38
+ for page in pages:
39
+ text += pytesseract.image_to_string(page, lang="eng+msa") + "\n"
40
+ return text
41
+
42
+ def _clean_text(self, raw: str) -> str:
43
+ """Remove watermarks, scanner marks, and noise."""
44
+ lines = []
45
+ for line in raw.splitlines():
46
+ l = line.strip()
47
+ if not l:
48
+ continue
49
+ # Remove watermarks
50
+ if "bmspm.net" in l.lower():
51
+ continue
52
+ if "camscanner" in l.lower():
53
+ continue
54
+ # Remove page numbers (single integers)
55
+ if re.match(r"^\d+$", l):
56
+ continue
57
+ lines.append(l)
58
+ return "\n".join(lines)
59
 
60
  def extract_text(self, pdf_path: str) -> str:
61
+ """Extract and clean text from PDF."""
62
  text = ""
63
  try:
64
  if pdfplumber:
 
66
  if not text or len(text.strip()) < 120:
67
  text = self._extract_tesseract(pdf_path)
68
  except Exception:
69
+ if convert_from_path and pytesseract:
70
+ text = self._extract_tesseract(pdf_path)
71
+ return self._clean_text(text)
72
 
73
+ def parse_questions(self, cleaned_text: str, subject: str, year: str):
74
+ """
75
+ Convert extracted text into structured question JSON.
76
+ Very naive parsing for now.
77
+ """
78
  questions = []
79
+ blocks = re.split(r"\n(?=\d+\.)", cleaned_text)
80
+
81
+ q_id = 1000
82
+ for block in blocks:
83
+ block = block.strip()
84
+ if not block:
85
  continue
86
+ # First line is question
87
+ parts = block.split("\n")
88
+ q_text = parts[0]
89
+
90
+ # Remaining lines treated as choices
91
+ choices = []
92
+ for c in parts[1:]:
93
+ c = c.strip()
94
+ if re.match(r"^[A-D]\)", c):
95
+ choices.append(c)
96
+
97
+ questions.append({
98
+ "id": q_id,
99
+ "text": q_text,
100
+ "choices": choices,
101
+ "topics": [],
102
+ "source": "pastpaper",
103
+ "subject": subject,
104
+ "year": year
105
+ })
106
+ q_id += 1
107
  return questions
108
 
109
+ def save_questions(self, questions, subject: str, year: str):
110
+ """Save questions into a JSON file like spm_2018_bm.json."""
111
+ filename = f"spm_{year}_{subject.lower()}.json"
112
+ out_path = os.path.join(self.data_dir, filename)
113
+ with open(out_path, "w", encoding="utf-8") as f:
114
+ json.dump(questions, f, ensure_ascii=False, indent=2)
115
+ return out_path
 
 
 
 
116
 
117
+ def process_pdf(self, pdf_path: str, subject: str, year: str):
118
+ """Main pipeline: extract → clean → parse → save."""
119
+ raw_text = self.extract_text(pdf_path)
120
+ questions = self.parse_questions(raw_text, subject, year)
121
+ return self.save_questions(questions, subject, year)
122
 
 
123
 
124
 
125