Benny-Tang committed on
Commit
467a83a
·
verified ·
1 Parent(s): cc73727

Update ocr_agent.py

Browse files
Files changed (1) hide show
  1. ocr_agent.py +138 -63
ocr_agent.py CHANGED
@@ -1,72 +1,147 @@
1
- import fitz # PyMuPDF
2
  import os
3
- import json
4
  import re
5
- from PIL import Image
6
  import io
 
 
7
  import pytesseract
 
8
 
9
# Output locations: DATA_DIR holds the generated JSON files, IMAGES_DIR
# (a sub-directory of it) holds the images pulled out of the PDFs.
DATA_DIR = "data"
IMAGES_DIR = os.path.join(DATA_DIR, "images")

# Make sure both directories exist as soon as the module is imported.
for _required_dir in (DATA_DIR, IMAGES_DIR):
    os.makedirs(_required_dir, exist_ok=True)
15
-
16
class OcrAgent:
    """Extracts text + images from PDF SPM past papers."""

    def extract_questions(self, pdf_path, subject, year, paper=2):
        """Extract question chunks and embedded images from one PDF.

        For each page the text layer is read. If the page has no text, every
        embedded image on it is OCR'd with Tesseract (``eng+msa``) and the
        result is used as the page text. Every embedded image is written to
        IMAGES_DIR and an ``[Image included: <path>]`` marker is appended to
        the page text. The text is then split wherever a line starts with a
        number followed by whitespace; chunks shorter than 20 characters are
        dropped.

        The collected questions are also written as JSON to
        ``spm_{year}_{subject.lower()}_paper{paper}.json`` inside DATA_DIR.

        Args:
            pdf_path: Path of the PDF, opened with PyMuPDF.
            subject: Subject name. Used verbatim in image file names and
                lower-cased in the ``source`` tag and JSON file name.
            year: Year of the paper; used in file names and ``source``.
            paper: Paper number (defaults to 2).

        Returns:
            A list of dicts with the keys ``id``, ``text``, ``choices``,
            ``topics`` and ``source``. Ids start at 1000 and increase by one
            for every chunk that is kept.
        """
        questions = []
        qid = 1000
        source = f"spm_{year}_{subject.lower()}_paper{paper}"

        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                text = page.get_text("text")
                # Extract every embedded image once; both the OCR fallback
                # and the save step below reuse the same bytes.
                extracted = [
                    doc.extract_image(img[0])
                    for img in page.get_images(full=True)
                ]

                # OCR fallback if the page has no text layer
                if not text.strip():
                    for base_image in extracted:
                        with Image.open(io.BytesIO(base_image["image"])) as img_obj:
                            ocr_text = pytesseract.image_to_string(img_obj, lang="eng+msa")
                        text += "\n" + ocr_text

                # Save images and leave a marker in the page text
                for img_index, base_image in enumerate(extracted):
                    ext = base_image["ext"]
                    img_path = os.path.join(
                        IMAGES_DIR,
                        f"{year}_{subject}_p{paper}_{page_num+1}_{img_index}.{ext}",
                    )
                    with open(img_path, "wb") as f:
                        f.write(base_image["image"])
                    text += f"\n[Image included: {img_path}]"

                # Split text into questions. `(?:^|\n)` also matches a number
                # at the very start of the page, so the first question loses
                # its number prefix just like every later one.
                for chunk in re.split(r"(?:^|\n)\d+\s", text):
                    chunk = chunk.strip()
                    if len(chunk) < 20:
                        continue
                    questions.append({
                        "id": qid,
                        "text": chunk,
                        "choices": [],
                        "topics": [],
                        "source": source,
                    })
                    qid += 1

        # Save to JSON
        out_file = os.path.join(DATA_DIR, f"{source}.json")
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(questions, f, ensure_ascii=False, indent=2)

        return questions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
 
72
 
 
1
+ # ocr_agent.py
2
  import os
 
3
  import re
4
+ import json
5
  import io
6
+ import fitz # PyMuPDF
7
+ from PIL import Image
8
  import pytesseract
9
+ from datetime import datetime
10
 
11
# Output locations: parsed question JSON goes into the "processed" folder
# inside DATA_DIR, extracted images go into MEDIA_DIR.
DATA_DIR = "data"
MEDIA_DIR = "media"
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")

# Make sure both output directories exist as soon as the module is imported.
for _output_dir in (PROCESSED_DIR, MEDIA_DIR):
    os.makedirs(_output_dir, exist_ok=True)
16
+
17
def sanitize_text(s):
    """Collapse every run of whitespace in *s* to one space and trim the ends."""
    # Splitting on whitespace runs and dropping the empty edge pieces is the
    # same as replacing each run with a single space and then stripping.
    words = [piece for piece in re.split(r'\s+', s) if piece]
    return " ".join(words)
19
+
20
def parse_mcq_from_text(block_text):
    """Parse one block of text into a question and its multiple-choice options.

    A line counts as a choice when it starts with a letter A-D (either case)
    followed by ``.`` or ``)`` and whitespace, or by `` - `` (dash surrounded
    by whitespace). The question text is everything before the first choice
    line. Lines after the first choice that do not themselves start with a
    choice label are ignored.

    Args:
        block_text: Raw text of a single question block.

    Returns:
        A dict ``{"text": str, "choices": list[str], "answer": None}``.
        When no choice line is found, ``text`` is the whole block with
        whitespace normalised and ``choices`` is empty.
    """
    # One pattern is used both to detect a choice line and to strip its
    # label, so every label form that is recognised is also removed
    # (including the "A - text" form).
    choice_label = re.compile(r'^[A-D](?:[\.\)]\s+|\s+-\s+)', re.IGNORECASE)

    lines = [ln.strip() for ln in block_text.splitlines() if ln.strip()]
    choice_indices = [i for i, ln in enumerate(lines) if choice_label.match(ln)]
    if not choice_indices:
        # no obvious choices - return whole block as question text
        return {"text": sanitize_text(block_text), "choices": [], "answer": None}

    # question text = lines up to the first choice line
    qtext = " ".join(lines[:choice_indices[0]])
    choices = [choice_label.sub('', lines[i]).strip() for i in choice_indices]

    return {"text": sanitize_text(qtext), "choices": choices, "answer": None}
48
+
49
def _page_text(page, page_num):
    """Return the page's text layer, or OCR of the rendered page when it is empty."""
    text = page.get_text("text") or ""
    if text.strip():
        return text

    import logging  # local import: only needed to report an OCR failure

    # No text layer: render the whole page at 200 dpi and OCR the picture.
    pix = page.get_pixmap(dpi=200)
    img_bytes = pix.tobytes()
    try:
        with Image.open(io.BytesIO(img_bytes)) as img:
            return pytesseract.image_to_string(img, lang="eng+msa")
    except Exception:
        # OCR stays best-effort (a bad page must not abort the whole PDF),
        # but the failure is logged instead of being silently discarded.
        logging.getLogger(__name__).warning(
            "OCR failed on page %d; treating the page as empty",
            page_num + 1,
            exc_info=True,
        )
        return ""


def _save_page_images(doc, page, year, subject, paper, page_num):
    """Write the page's embedded images into MEDIA_DIR and return their paths."""
    image_paths = []
    for img_index, imginfo in enumerate(page.get_images(full=True)):
        base_image = doc.extract_image(imginfo[0])
        ext = base_image.get("ext", "png")
        img_name = f"{year}_{subject}_p{paper}_pg{page_num+1}_{img_index}.{ext}"
        img_path = os.path.join(MEDIA_DIR, img_name)
        with open(img_path, "wb") as f:
            f.write(base_image["image"])
        image_paths.append(img_path)
    return image_paths


def _split_numbered_blocks(text):
    """Split page text into one block per numbered question ('1.' or '1)')."""
    # `(?:^|\n)` lets a number at the very start of the page count as a
    # question start; requiring a newline there would push the first
    # question into the discarded prefix.
    parts = re.split(r'(?:^|\n)\s*(\d+)[\.\)]\s*', text)
    if len(parts) <= 1:
        # no question numbers found: treat the whole page as a single block
        return [text]
    # parts = [prefix, num1, block1, num2, block2, ...]; keep only the blocks
    return parts[2::2]


def extract_questions(pdf_path, subject, year, paper=2):
    """Extract questions and embedded images from a PDF.

    Each page's text is taken from its text layer, or from Tesseract OCR
    (``eng+msa``) of the rendered page when the text layer is empty. Embedded
    images are saved into MEDIA_DIR. The page text is split on lines that
    start with a question number (``1.`` / ``1)``) and every block is parsed
    with ``parse_mcq_from_text``; a page without question numbers becomes a
    single block. Text before the first question number on a page is dropped.

    The result is also written as JSON to
    ``spm_{year}_{subject.lower()}_paper{paper}.json`` inside PROCESSED_DIR.

    Args:
        pdf_path: Path of the PDF, opened with PyMuPDF.
        subject: Subject name; stored on each record and used in file names.
        year: Year of the paper; stored on each record and used in file names.
        paper: Paper number (defaults to 2).

    Returns:
        A list of question dicts. Each holds the keys produced by
        ``parse_mcq_from_text`` plus ``id``, ``subject``, ``paper``, ``year``,
        ``image`` (first image saved from the same page, or None) and
        ``source`` (base name of the PDF).
    """
    questions = []
    # Ids count up from the current Unix timestamp, one per question.
    # NOTE(review): two runs started within a few seconds of each other can
    # produce overlapping ids - confirm whether callers rely on uniqueness.
    qid_base = int(datetime.now().timestamp())
    source = os.path.basename(pdf_path)

    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            text = _page_text(page, page_num)
            image_paths = _save_page_images(doc, page, year, subject, paper, page_num)
            # attach first diagram of page (best-effort)
            first_image = image_paths[0] if image_paths else None

            for block in _split_numbered_blocks(text):
                parsed = parse_mcq_from_text(block)
                parsed["id"] = qid_base + len(questions)
                parsed["subject"] = subject
                parsed["paper"] = paper
                parsed["year"] = year
                parsed["image"] = first_image
                parsed["source"] = source
                questions.append(parsed)

    # Save processed JSON
    out_fname = os.path.join(PROCESSED_DIR, f"spm_{year}_{subject.lower()}_paper{paper}.json")
    with open(out_fname, "w", encoding="utf-8") as f:
        json.dump(questions, f, ensure_ascii=False, indent=2)

    return questions
132
+
133
# Command-line entry point: extract a single PDF and report where the JSON went.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(
        description="Extract questions & images from a PDF into JSON"
    )
    cli.add_argument("--pdf", required=True, help="Path to PDF")
    cli.add_argument("--subject", required=True, help="Subject short name e.g. BM, English")
    cli.add_argument("--year", required=True, type=int, help="Year e.g. 2019")
    cli.add_argument("--paper", default=2, type=int, help="Paper number (1 or 2)")
    opts = cli.parse_args()

    extracted = extract_questions(opts.pdf, opts.subject, opts.year, opts.paper)

    # Rebuild the output path the same way extract_questions names its file.
    json_name = f"spm_{opts.year}_{opts.subject.lower()}_paper{opts.paper}.json"
    print(f"Extracted {len(extracted)} questions -> {os.path.join(PROCESSED_DIR, json_name)}")
144
+
145
 
146
 
147