Spaces:

Benny-Tang
/

exam-simulator

Runtime error

App Files Community

Benny-Tang committed on Sep 14, 2025

Commit

dd23511

verified ·

1 Parent(s): 1662836

Update ocr_agent.py

Browse files

Files changed (1) hide show

ocr_agent.py +41 -85

ocr_agent.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import os
 import re
 import json
-# optional heavy deps
 try:
     import pdfplumber
 except Exception:
@@ -10,162 +10,118 @@ except Exception:
 try:
     from pdf2image import convert_from_path
-    from PIL import Image
     import pytesseract
 except Exception:
     convert_from_path = None
     pytesseract = None
-def _normalize_choice_text(s: str) -> str:
-    return s.strip()
-def _split_blocks_by_question_number(text: str):
-    # split on lines starting with "1. " "2. " etc.
     parts = re.split(r"\n(?=\s*\d+\.)", text)
     return parts
-def _extract_choices_from_block(block: str):
-    # tries to find A) B) C) D) style or A. B. C. D.
-    # returns (question_text, [choices]) best-effort
-    # normalize newlines into spaces inside each piece
-    block = block.strip()
-    # find where options start (search for "A)" or "A.")
     m = re.search(r"\bA[\)\.]\s*", block)
     if m:
         start = m.start()
         qtext = block[:start].strip()
-        opts_text = block[start:].strip()
-        # split by "A)", "B)", etc.
-        items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts_text)
         choices = []
         for it in items:
             it = it.strip()
             if not it:
                 continue
-            # remove leading "A)"/"A."
-            it2 = re.sub(r'^[A-D][\)\.]\s*', '', it)
-            choices.append(_normalize_choice_text(it2))
-        if len(choices) >= 2:
-            return qtext, choices
-    # fallback: maybe choices are on separate lines starting with "A. "
     lines = block.splitlines()
     q_lines = []
     choices = []
-    choices_started = False
     for ln in lines:
         ln = ln.strip()
         if re.match(r'^[A-D][\)\.]\s*', ln):
-            choices_started = True
-            cl = re.sub(r'^[A-D][\)\.]\s*', '', ln)
-            choices.append(_normalize_choice_text(cl))
         else:
-            if not choices_started:
                 q_lines.append(ln)
             else:
-                # continuation of last choice?
                 if choices:
                     choices[-1] += " " + ln
-    if choices:
-        return " ".join(q_lines).strip(), choices
-    # no choices found
-    return block.strip(), []
 class OcrAgent:
     def __init__(self, tesseract_lang="eng"):
-        self.tesseract_lang = tesseract_lang
-    def _extract_text_pdfplumber(self, pdf_path: str) -> str:
         if pdfplumber is None:
-            raise RuntimeError("pdfplumber not available")
         texts = []
-        with pdfplumber.open(pdf_path) as pdf:
-            for page in pdf.pages:
-                t = page.extract_text() or ""
-                texts.append(t)
         return "\n\n".join(texts)
-    def _extract_text_tesseract(self, pdf_path: str) -> str:
         if convert_from_path is None or pytesseract is None:
-            raise RuntimeError("pdf2image/pytesseract not available")
-        images = convert_from_path(pdf_path, dpi=200)
         texts = []
         for img in images:
-            t = pytesseract.image_to_string(img, lang=self.tesseract_lang)
-            texts.append(t)
         return "\n\n".join(texts)
     def extract_text(self, pdf_path: str) -> str:
-        # try pdfplumber first (best for digital PDFs)
         text = ""
         try:
             if pdfplumber:
-                text = self._extract_text_pdfplumber(pdf_path)
-            if not text or len(text.strip()) < 100:
-                # fallback to tesseract OCR
-                if convert_from_path and pytesseract:
-                    text = self._extract_text_tesseract(pdf_path)
         except Exception:
-            # try fallback if any error
-            if convert_from_path and pytesseract:
-                text = self._extract_text_tesseract(pdf_path)
-            else:
-                raise
         return text
-    def parse_questions_from_text(self, raw_text: str) -> list:
-        # heuristic parser – splits by numbered questions and attempts to extract choices
-        blocks = _split_blocks_by_question_number(raw_text)
         questions = []
         for blk in blocks:
             blk = blk.strip()
             if not blk:
                 continue
-            # remove leading number if present
             blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
-            qtext, choices = _extract_choices_from_block(blk2)
             qtype = "mcq" if choices else "short_answer"
-            questions.append({
-                "text": qtext,
-                "choices": choices,
-                "question_type": qtype,
-                "topics": [],
-                "difficulty": 3
-            })
         return questions
     def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
-        """
-        Extract questions from PDF and save:
-          - data/spm_{year}_{subject_token}.json   (list of question objects)
-          - data/spm_{year}_{subject_token}_scheme.json  (mapping "1": null, "2": null, ...)
-        Returns (questions_path, scheme_path)
-        """
         text = self.extract_text(pdf_path)
-        questions = self.parse_questions_from_text(text)
-        # filenames lower-case
         subject_token = subject_token.lower()
-        q_filename = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
-        scheme_filename = os.path.join(out_dir, f"spm_{year}_{subject_token}_scheme.json")
-        # write questions list
         os.makedirs(out_dir, exist_ok=True)
-        with open(q_filename, "w", encoding="utf-8") as f:
             json.dump(questions, f, indent=2, ensure_ascii=False)
-        # create a scheme placeholder mapping by index (1-based) -> None
         scheme_map = {str(i + 1): None for i in range(len(questions))}
-        with open(scheme_filename, "w", encoding="utf-8") as f:
             json.dump(scheme_map, f, indent=2, ensure_ascii=False)
-        return q_filename, scheme_filename

+# ocr_agent.py
 import os
 import re
 import json
 try:
     import pdfplumber
 except Exception:
 try:
     from pdf2image import convert_from_path
     import pytesseract
 except Exception:
     convert_from_path = None
     pytesseract = None
+def _split_blocks_by_num(text):
     parts = re.split(r"\n(?=\s*\d+\.)", text)
     return parts
+def _extract_choices(block):
     m = re.search(r"\bA[\)\.]\s*", block)
     if m:
         start = m.start()
         qtext = block[:start].strip()
+        opts = block[start:].strip()
+        items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts)
         choices = []
         for it in items:
             it = it.strip()
             if not it:
                 continue
+            it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
+            choices.append(it2)
+        return qtext, choices
+    # fallback line by line
     lines = block.splitlines()
     q_lines = []
     choices = []
+    started = False
     for ln in lines:
         ln = ln.strip()
         if re.match(r'^[A-D][\)\.]\s*', ln):
+            started = True
+            cl = re.sub(r'^[A-D][\)\.]\s*', '', ln).strip()
+            choices.append(cl)
         else:
+            if not started:
                 q_lines.append(ln)
             else:
                 if choices:
                     choices[-1] += " " + ln
+    return " ".join(q_lines).strip(), choices
 class OcrAgent:
     def __init__(self, tesseract_lang="eng"):
+        self.lang = tesseract_lang
+    def _extract_pdfplumber(self, path: str) -> str:
         if pdfplumber is None:
+            return ""
         texts = []
+        with pdfplumber.open(path) as pdf:
+            for p in pdf.pages:
+                texts.append(p.extract_text() or "")
         return "\n\n".join(texts)
+    def _extract_tesseract(self, path: str) -> str:
         if convert_from_path is None or pytesseract is None:
+            return ""
+        images = convert_from_path(path, dpi=200)
         texts = []
         for img in images:
+            texts.append(pytesseract.image_to_string(img, lang=self.lang))
         return "\n\n".join(texts)
     def extract_text(self, pdf_path: str) -> str:
         text = ""
         try:
             if pdfplumber:
+                text = self._extract_pdfplumber(pdf_path)
+            if not text or len(text.strip()) < 120:
+                # fallback
+                text = self._extract_tesseract(pdf_path)
         except Exception:
+            text = self._extract_tesseract(pdf_path) if convert_from_path and pytesseract else ""
         return text
+    def parse_questions(self, raw_text: str):
+        blocks = _split_blocks_by_num(raw_text)
         questions = []
         for blk in blocks:
             blk = blk.strip()
             if not blk:
                 continue
             blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
+            qtext, choices = _extract_choices(blk2)
             qtype = "mcq" if choices else "short_answer"
+            questions.append({"text": qtext, "choices": choices, "question_type": qtype, "topics": [], "difficulty": 3})
         return questions
     def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
         text = self.extract_text(pdf_path)
+        questions = self.parse_questions(text)
         subject_token = subject_token.lower()
+        qfile = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
+        scheme_file = os.path.join(out_dir, f"spm_{year}_{subject_token}_scheme.json")
         os.makedirs(out_dir, exist_ok=True)
+        with open(qfile, "w", encoding="utf-8") as f:
             json.dump(questions, f, indent=2, ensure_ascii=False)
         scheme_map = {str(i + 1): None for i in range(len(questions))}
+        with open(scheme_file, "w", encoding="utf-8") as f:
             json.dump(scheme_map, f, indent=2, ensure_ascii=False)
+        return qfile, scheme_file