Spaces:

Benny-Tang
/

exam-simulator

Runtime error

App Files Files Community

Benny-Tang committed on Sep 14, 2025

Commit

b21249a

verified ·

1 Parent(s): 61f8c8e

Update ocr_agent.py

Browse files

Files changed (1) hide show

ocr_agent.py +163 -23

ocr_agent.py CHANGED Viewed

@@ -1,32 +1,172 @@
-import fitz  # PyMuPDF
-import json
 import os
 class OcrAgent:
-    def extract_questions(self, pdf_path, output_path):
-        doc = fitz.open(pdf_path)
         questions = []
-        qid = 1000
-        for page in doc:
-            text = page.get_text("text")
-            for line in text.splitlines():
-                if line.strip().endswith("?") or line.strip().startswith("Q"):
-                    questions.append(
-                        {
-                            "id": qid,
-                            "text": line.strip(),
-                            "choices": ["A", "B", "C", "D"],
-                            "topics": [],
-                            "correct_answer": None,
-                        }
-                    )
-                    qid += 1
-        os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        with open(output_path, "w", encoding="utf-8") as f:
             json.dump(questions, f, indent=2, ensure_ascii=False)
-        print(f"✅ Extracted {len(questions)} questions → {output_path}")

 import os
+import re
+import json
+# optional heavy deps
+try:
+    import pdfplumber
+except Exception:
+    pdfplumber = None
+try:
+    from pdf2image import convert_from_path
+    from PIL import Image
+    import pytesseract
+except Exception:
+    convert_from_path = None
+    pytesseract = None
+def _normalize_choice_text(s: str) -> str:
+    """Return option text *s* trimmed, with inner whitespace runs collapsed.
+
+    An option that wraps over several lines in the extracted text comes back
+    as a single line, so stored choices never contain raw newlines.
+    """
+    # str.split() without arguments splits on any whitespace run (spaces,
+    # tabs, newlines) and drops empty pieces, so joining with one space both
+    # trims the ends and flattens line breaks.
+    return " ".join(s.split())
+def _split_blocks_by_question_number(text: str) -> list:
+    """Split *text* into blocks, cutting before each line that opens with "N.".
+
+    The newline in front of a numbered line is consumed by the split; the
+    number itself stays at the head of its block.  Text before the first
+    numbered line is returned as the first block.
+
+    Returns:
+        A list of strings; ``[text]`` when no numbered line is found.
+    """
+    # (?!\d) keeps a wrapped line that merely starts with a decimal value
+    # ("3.14 cm ...") from being mistaken for the start of question 3.
+    return re.split(r"\n(?=\s*\d+\.(?!\d))", text)
+def _extract_choices_from_block(block: str):
+    """Split one question block into its stem and its lettered answer options.
+
+    Options are recognised by the markers ``A)``/``A.`` through ``D)``/``D.``.
+    This is a best-effort heuristic: a capital A-D followed by ")" or "."
+    inside an option's own text (for example "vitamin C.") is still treated
+    as the start of a new option.
+
+    Args:
+        block: Text of a single question.
+
+    Returns:
+        A ``(question_text, choices)`` tuple.  ``choices`` is ``[]`` when no
+        option markers are found; in that case ``question_text`` is the
+        stripped block with its line breaks left intact.
+    """
+    block = block.strip()
+    # Prefer an "A" marker that opens a line: the stem may itself contain
+    # "A." (e.g. "Refer to Diagram A.") and must not be cut there.  Only when
+    # no line starts with the marker fall back to the first inline one, which
+    # covers options printed on the same line as the stem.
+    m = re.search(r"^[ \t]*A[\)\.]", block, re.MULTILINE) or re.search(r"\bA[\)\.]\s*", block)
+    if m:
+        start = m.start()
+        # Collapse line breaks so a wrapped stem reads as one line, matching
+        # the space-joined stem produced by the line-based fallback below.
+        qtext = " ".join(block[:start].split())
+        opts_text = block[start:].strip()
+        # Zero-width split: cut in front of every option marker, keeping the
+        # marker attached to the piece that follows it.
+        items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts_text)
+        choices = []
+        for it in items:
+            it = it.strip()
+            if not it:
+                continue
+            # remove leading "A)"/"A."
+            it2 = re.sub(r'^[A-D][\)\.]\s*', '', it)
+            choices.append(_normalize_choice_text(it2))
+        if len(choices) >= 2:
+            return qtext, choices
+    # Fallback: walk the block line by line; reached when no usable "A"
+    # marker exists (e.g. only "B."/"C." lines survived extraction).
+    q_lines = []
+    choice_parts = []  # one list of line fragments per option
+    for ln in block.splitlines():
+        ln = ln.strip()
+        if not ln:
+            # Blank lines carry no text; skipping them avoids stray spaces.
+            continue
+        if re.match(r'^[A-D][\)\.]\s*', ln):
+            choice_parts.append([re.sub(r'^[A-D][\)\.]\s*', '', ln)])
+        elif choice_parts:
+            # continuation of the most recent option
+            choice_parts[-1].append(ln)
+        else:
+            q_lines.append(ln)
+    if choice_parts:
+        # Join before normalising so a marker that sits alone on its line
+        # does not leave a leading space on the option text.
+        choices = [_normalize_choice_text(" ".join(parts)) for parts in choice_parts]
+        return " ".join(q_lines), choices
+    # no choices found
+    return block, []
 class OcrAgent:
+    """Pull question text out of an exam PDF and save it as JSON.
+
+    Text extraction tries pdfplumber first and falls back to Tesseract OCR
+    (through pdf2image) when that yields little or no text.  Both back ends
+    are optional module-level imports and may be unavailable at runtime.
+    """
+
+    def __init__(self, tesseract_lang="eng", dpi=200, min_text_chars=100):
+        """Store extraction settings.
+
+        Args:
+            tesseract_lang: Value passed to pytesseract as ``lang``.
+            dpi: Resolution used when rasterising PDF pages for OCR.
+            min_text_chars: When pdfplumber returns fewer stripped characters
+                than this, OCR is attempted as well.
+        """
+        self.tesseract_lang = tesseract_lang
+        self.dpi = dpi
+        self.min_text_chars = min_text_chars
+
+    def _extract_text_pdfplumber(self, pdf_path: str) -> str:
+        """Return the text of every page, joined by blank lines, via pdfplumber.
+
+        Raises:
+            RuntimeError: If pdfplumber could not be imported.
+        """
+        if pdfplumber is None:
+            raise RuntimeError("pdfplumber not available")
+        with pdfplumber.open(pdf_path) as pdf:
+            # A page without extractable text contributes an empty string.
+            return "\n\n".join(page.extract_text() or "" for page in pdf.pages)
+
+    def _extract_text_tesseract(self, pdf_path: str) -> str:
+        """Rasterise each page and return the OCR text, joined by blank lines.
+
+        Raises:
+            RuntimeError: If pdf2image or pytesseract could not be imported.
+        """
+        if convert_from_path is None or pytesseract is None:
+            raise RuntimeError("pdf2image/pytesseract not available")
+        images = convert_from_path(pdf_path, dpi=self.dpi)
+        return "\n\n".join(
+            pytesseract.image_to_string(img, lang=self.tesseract_lang) for img in images
+        )
+
+    def extract_text(self, pdf_path: str) -> str:
+        """Return the text of *pdf_path*, using OCR when direct extraction falls short.
+
+        pdfplumber is tried first.  OCR is attempted when pdfplumber is
+        missing, fails, or returns fewer than ``min_text_chars`` stripped
+        characters.  If no back end is installed an empty string is returned.
+
+        Raises:
+            Exception: Whatever pdfplumber raised, when it fails and no OCR
+                back end is available; any error raised by the OCR pass.
+        """
+        ocr_available = bool(convert_from_path and pytesseract)
+        text = ""
+        if pdfplumber:
+            try:
+                text = self._extract_text_pdfplumber(pdf_path)
+            except Exception:
+                # Best effort: a file pdfplumber cannot read may still be
+                # OCR-able, so only give up when there is no OCR fallback.
+                if not ocr_available:
+                    raise
+        extracted = len(text.strip())
+        if not ocr_available or (extracted and extracted >= self.min_text_chars):
+            return text
+        # OCR runs outside the try block above so that an OCR failure
+        # propagates once instead of triggering a second full OCR pass.
+        ocr_text = self._extract_text_tesseract(pdf_path)
+        # Keep the short pdfplumber text if OCR recovered even less.
+        if len(ocr_text.strip()) > extracted:
+            return ocr_text
+        return text
+
+    def parse_questions_from_text(self, raw_text: str) -> list:
+        """Turn raw exam text into a list of question dicts (heuristic).
+
+        The text is split at numbered lines; each block has its leading
+        number removed and its answer options extracted.
+
+        Returns:
+            A list of dicts with keys ``text``, ``choices``, ``question_type``
+            ("mcq" when options were found, else "short_answer"), ``topics``
+            and ``difficulty``.
+        """
+        blocks = [b.strip() for b in _split_blocks_by_question_number(raw_text)]
+        blocks = [b for b in blocks if b]
+        # The splitter only cuts in front of numbered lines, so only the first
+        # block can lack a number.  When numbered blocks follow it, it is text
+        # printed before question 1 (title, instructions); keeping it would
+        # add a bogus question and shift every index-keyed answer.
+        if len(blocks) > 1 and not re.match(r'\d+\.', blocks[0]):
+            blocks = blocks[1:]
         questions = []
+        for blk in blocks:
+            # remove leading number if present
+            blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
+            qtext, choices = _extract_choices_from_block(blk2)
+            qtype = "mcq" if choices else "short_answer"
+            questions.append({
+                "text": qtext,
+                "choices": choices,
+                "question_type": qtype,
+                "topics": [],
+                "difficulty": 3
+            })
+        return questions
+
+    def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
+        """
+        Extract questions from PDF and save:
+          - {out_dir}/spm_{year}_{subject_token}.json   (list of question objects)
+          - {out_dir}/spm_{year}_{subject_token}_scheme.json  (mapping "1": null, "2": null, ...)
+        subject_token is lower-cased before it is used in the file names.
+        Returns (questions_path, scheme_path)
+        """
+        text = self.extract_text(pdf_path)
+        questions = self.parse_questions_from_text(text)
+        # filenames lower-case
+        subject_token = subject_token.lower()
+        q_filename = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
+        scheme_filename = os.path.join(out_dir, f"spm_{year}_{subject_token}_scheme.json")
+        # os.makedirs("") raises, so an empty out_dir means "current directory".
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        # write questions list
+        with open(q_filename, "w", encoding="utf-8") as f:
             json.dump(questions, f, indent=2, ensure_ascii=False)
+        # create a scheme placeholder mapping by index (1-based) -> None
+        scheme_map = {str(i): None for i in range(1, len(questions) + 1)}
+        with open(scheme_filename, "w", encoding="utf-8") as f:
+            json.dump(scheme_map, f, indent=2, ensure_ascii=False)
+        return q_filename, scheme_filename