Spaces:

Benny-Tang
/

exam-simulator

Runtime error

App Files Files Community

Benny-Tang committed on Sep 15, 2025

Commit

467a83a

verified ·

1 Parent(s): cc73727

Update ocr_agent.py

Browse files

Files changed (1) hide show

ocr_agent.py +138 -63

ocr_agent.py CHANGED Viewed

@@ -1,72 +1,147 @@
-import fitz  # PyMuPDF
 import os
-import json
 import re
-from PIL import Image
 import io
 import pytesseract
 DATA_DIR = "data"
-IMAGES_DIR = os.path.join(DATA_DIR, "images")
-# Ensure dirs exist
-os.makedirs(DATA_DIR, exist_ok=True)
-os.makedirs(IMAGES_DIR, exist_ok=True)
class OcrAgent:
    """Extracts text + images from PDF SPM past papers."""

    def extract_questions(self, pdf_path, subject, year, paper=2):
        """Split a PDF into question records and save them as JSON.

        For every page the embedded text is read; when the page has no text
        but does contain images, each image is OCR'd and the result appended.
        Every embedded image is written to IMAGES_DIR and referenced in the
        page text with an "[Image included: ...]" marker. The page text is
        then split on newline + number + whitespace, and chunks shorter than
        20 characters are discarded.

        Args:
            pdf_path: Path of the PDF to open with PyMuPDF.
            subject: Subject name; used verbatim in image file names and
                lower-cased in the "source" field and JSON file name.
            year: Year used in file names and the "source" field.
            paper: Paper number used in file names and the "source" field.

        Returns:
            The list of question dicts (ids start at 1000), which is also
            written to DATA_DIR/spm_{year}_{subject.lower()}_paper{paper}.json.
        """
        source = f"spm_{year}_{subject.lower()}_paper{paper}"
        questions = []
        qid = 1000

        doc = fitz.open(pdf_path)
        try:
            for page_num, page in enumerate(doc):
                text = page.get_text("text")

                # Extract each embedded image once; the result is reused by
                # both the OCR fallback and the save step below.
                extracted = [
                    doc.extract_image(img[0])
                    for img in page.get_images(full=True)
                ]

                # OCR fallback if the page carries no text layer.
                if not text.strip() and extracted:
                    for base_image in extracted:
                        img_obj = Image.open(io.BytesIO(base_image["image"]))
                        ocr_text = pytesseract.image_to_string(img_obj, lang="eng+msa")
                        text += "\n" + ocr_text

                # Save images and leave a marker in the text for each one.
                for img_index, base_image in enumerate(extracted):
                    ext = base_image["ext"]
                    img_path = os.path.join(
                        IMAGES_DIR,
                        f"{year}_{subject}_p{paper}_{page_num+1}_{img_index}.{ext}",
                    )
                    with open(img_path, "wb") as f:
                        f.write(base_image["image"])
                    text += f"\n[Image included: {img_path}]"

                # Split text into questions; very short chunks are dropped.
                for chunk in re.split(r"\n\d+\s", text):
                    chunk = chunk.strip()
                    if len(chunk) < 20:
                        continue
                    questions.append({
                        "id": qid,
                        "text": chunk,
                        "choices": [],
                        "topics": [],
                        "source": source,
                    })
                    qid += 1
        finally:
            # Release the PDF handle even if a page fails mid-way.
            doc.close()

        # Save to JSON
        out_file = os.path.join(DATA_DIR, f"{source}.json")
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(questions, f, ensure_ascii=False, indent=2)
        return questions

+# ocr_agent.py
 import os
 import re
+import json
 import io
+import fitz  # PyMuPDF
+from PIL import Image
 import pytesseract
+from datetime import datetime
 DATA_DIR = "data"
+PROCESSED_DIR = os.path.join(DATA_DIR, "processed")
+MEDIA_DIR = "media"
+os.makedirs(PROCESSED_DIR, exist_ok=True)
+os.makedirs(MEDIA_DIR, exist_ok=True)
# A choice line starts with a letter A-D followed either by "." or ")" and
# whitespace ("A. foo", "B) bar") or by whitespace-dash-whitespace ("C - baz").
# The same pattern is used to detect a choice and to strip its label, so the
# two steps can never disagree about what counts as a label.
_CHOICE_PREFIX_RE = re.compile(r'^[A-D](?:[.)]\s+|\s+-\s+)', re.IGNORECASE)
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def sanitize_text(s):
    """Collapse every whitespace run in *s* to one space and trim both ends."""
    return _WHITESPACE_RUN_RE.sub(' ', s).strip()


def parse_mcq_from_text(block_text):
    """Parse one text block into a question with optional choices.

    Lines are stripped and blank lines ignored. Every line that starts with
    a choice label (see ``_CHOICE_PREFIX_RE``; matching is case-insensitive)
    becomes one entry in ``choices`` with the label removed. The question
    text is everything before the first choice line.

    Args:
        block_text: Raw text of a single question block.

    Returns:
        ``{"text": str, "choices": list[str], "answer": None}``. When no
        choice line is found, ``text`` is the whole block (whitespace
        normalised) and ``choices`` is empty.
    """
    lines = [ln.strip() for ln in block_text.splitlines() if ln.strip()]

    first_choice_i = None
    choices = []
    for i, ln in enumerate(lines):
        label = _CHOICE_PREFIX_RE.match(ln)
        if label is None:
            # NOTE: non-choice lines after the first choice (e.g. a wrapped
            # option) are not collected, same as before.
            continue
        if first_choice_i is None:
            first_choice_i = i
        # Drop the matched label ("A. ", "B) ", "C - ") and keep the rest.
        choices.append(ln[label.end():].strip())

    if first_choice_i is None:
        # No obvious choices: return the whole block as question text.
        return {"text": sanitize_text(block_text), "choices": [], "answer": None}

    qtext = " ".join(lines[:first_choice_i])
    return {"text": sanitize_text(qtext), "choices": choices, "answer": None}
def extract_questions(pdf_path, subject, year, paper=2):
    """Extract questions and embedded images from a PDF and save them as JSON.

    For each page: read the text layer (falling back to OCR of the rendered
    page when there is none), write every embedded image to MEDIA_DIR, split
    the text into numbered question blocks, and parse each block with
    ``parse_mcq_from_text``. A page with no recognisable question numbers is
    treated as a single block.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        subject: Subject name; stored on each question, used verbatim in
            image file names and lower-cased in the JSON file name.
        year: Year stored on each question and used in file names.
        paper: Paper number stored on each question and used in file names.

    Returns:
        The list of question dicts. Each has the keys produced by
        ``parse_mcq_from_text`` plus ``id``, ``subject``, ``paper``, ``year``,
        ``image`` (first image saved for the page, or None) and ``source``
        (base name of the PDF). The same list is written to
        PROCESSED_DIR/spm_{year}_{subject.lower()}_paper{paper}.json.
    """
    source_name = os.path.basename(pdf_path)
    questions = []
    # Ids are consecutive integers starting at the current Unix timestamp.
    qid_base = int(datetime.now().timestamp())

    doc = fitz.open(pdf_path)
    try:
        for page_index, page in enumerate(doc):
            page_no = page_index + 1
            text = page.get_text("text") or ""
            if not text.strip():
                # No text layer: OCR the rendered page instead.
                text = _ocr_page_text(page, page_no, pdf_path)

            image_paths = _save_page_images(
                doc, page, f"{year}_{subject}_p{paper}_pg{page_no}"
            )
            # Best effort: every question on the page gets its first diagram.
            first_image = image_paths[0] if image_paths else None

            for block in _split_question_blocks(text):
                parsed = parse_mcq_from_text(block)
                parsed["id"] = qid_base + len(questions)
                parsed["subject"] = subject
                parsed["paper"] = paper
                parsed["year"] = year
                parsed["image"] = first_image
                parsed["source"] = source_name
                questions.append(parsed)
    finally:
        # Release the PDF handle even when a page fails mid-way.
        doc.close()

    # Save processed JSON
    out_fname = os.path.join(PROCESSED_DIR, f"spm_{year}_{subject.lower()}_paper{paper}.json")
    with open(out_fname, "w", encoding="utf-8") as f:
        json.dump(questions, f, ensure_ascii=False, indent=2)
    return questions


def _ocr_page_text(page, page_no, pdf_path):
    """Return OCR text for *page* rendered at 200 dpi, or "" when OCR fails."""
    import logging

    pix = page.get_pixmap(dpi=200)
    # Presumably PNG-encoded (PyMuPDF's default output) - confirm against docs.
    img_bytes = pix.tobytes()
    try:
        img = Image.open(io.BytesIO(img_bytes))
        return pytesseract.image_to_string(img, lang="eng+msa")
    except Exception:
        # OCR stays best-effort, but the failure is recorded rather than
        # swallowed silently, so a missing tesseract install is diagnosable.
        logging.getLogger(__name__).warning(
            "OCR failed for page %d of %s", page_no, pdf_path, exc_info=True
        )
        return ""


def _save_page_images(doc, page, name_prefix):
    """Write the page's embedded images to MEDIA_DIR and return their paths."""
    saved = []
    for img_index, imginfo in enumerate(page.get_images(full=True)):
        base_image = doc.extract_image(imginfo[0])
        ext = base_image.get("ext", "png")
        img_path = os.path.join(MEDIA_DIR, f"{name_prefix}_{img_index}.{ext}")
        with open(img_path, "wb") as f:
            f.write(base_image["image"])
        saved.append(img_path)
    return saved


def _split_question_blocks(text):
    """Split page text into one block per question number ("1." / "1)").

    A number counts only when it is the first non-blank thing on a line,
    including the very first line of the page. Returns ``[text]`` unchanged
    when no question number is found. Text before the first number is
    discarded.
    """
    # re.split with one capture group yields: prefix, num1, block1, num2, ...
    parts = re.split(r'(?:\A|\n)\s*(\d+)[\.\)]\s*', text)
    if len(parts) <= 1:
        return [text]
    return parts[2::2]
# Command-line entry point: extract one PDF and report where the JSON was written.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="Extract questions & images from a PDF into JSON")
    # (flag, add_argument keyword options), in the order they appear in --help.
    argument_table = (
        ("--pdf", {"required": True, "help": "Path to PDF"}),
        ("--subject", {"required": True, "help": "Subject short name e.g. BM, English"}),
        ("--year", {"required": True, "type": int, "help": "Year e.g. 2019"}),
        ("--paper", {"default": 2, "type": int, "help": "Paper number (1 or 2)"}),
    )
    for flag, options in argument_table:
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    extracted = extract_questions(opts.pdf, opts.subject, opts.year, opts.paper)
    # Mirrors the output file name built inside extract_questions.
    json_name = f"spm_{opts.year}_{opts.subject.lower()}_paper{opts.paper}.json"
    out_path = os.path.join(PROCESSED_DIR, json_name)
    print(f"Extracted {len(extracted)} questions -> {out_path}")