Spaces:

Benny-Tang
/

exam-simulator

Runtime error

App Files Files Community

Benny-Tang committed on Sep 14, 2025

Commit

855ae47

verified ·

1 Parent(s): 346cec5

Update ocr_agent.py

Browse files

Files changed (1) hide show

ocr_agent.py +17 -14

ocr_agent.py CHANGED Viewed

@@ -16,18 +16,20 @@ except Exception:
     pytesseract = None
-def _split_blocks_by_num(text):
-    parts = re.split(r"\n(?=\s*\d+\.)", text)
-    return parts
-def _extract_choices(block):
     m = re.search(r"\bA[\)\.]\s*", block)
     if m:
         start = m.start()
         qtext = block[:start].strip()
-        opts = block[start:].strip()
-        items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts)
         choices = []
         for it in items:
             it = it.strip()
@@ -35,9 +37,10 @@ def _extract_choices(block):
                 continue
             it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
             choices.append(it2)
-        return qtext, choices
-    # fallback line by line
     lines = block.splitlines()
     q_lines = []
     choices = []
@@ -85,28 +88,27 @@ class OcrAgent:
             if pdfplumber:
                 text = self._extract_pdfplumber(pdf_path)
             if not text or len(text.strip()) < 120:
-                # fallback
                 text = self._extract_tesseract(pdf_path)
         except Exception:
-            text = self._extract_tesseract(pdf_path) if convert_from_path and pytesseract else ""
         return text
-    def parse_questions(self, raw_text: str):
-        blocks = _split_blocks_by_num(raw_text)
         questions = []
         for blk in blocks:
             blk = blk.strip()
             if not blk:
                 continue
             blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
-            qtext, choices = _extract_choices(blk2)
             qtype = "mcq" if choices else "short_answer"
             questions.append({"text": qtext, "choices": choices, "question_type": qtype, "topics": [], "difficulty": 3})
         return questions
     def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
         text = self.extract_text(pdf_path)
-        questions = self.parse_questions(text)
         subject_token = subject_token.lower()
         qfile = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
@@ -126,3 +128,4 @@ class OcrAgent:

     pytesseract = None
+def _split_blocks_by_number(text: str):
+    # split on lines starting with "1. " or similar
+    return re.split(r"\n(?=\s*\d+\.)", text)
+def _extract_choices_from_block(block: str):
+    block = block.strip()
+    # look for A) or A. markers
     m = re.search(r"\bA[\)\.]\s*", block)
     if m:
         start = m.start()
         qtext = block[:start].strip()
+        opts_text = block[start:].strip()
+        items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts_text)
         choices = []
         for it in items:
             it = it.strip()
                 continue
             it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
             choices.append(it2)
+        if choices:
+            return qtext, choices
+    # fallback: lines style
     lines = block.splitlines()
     q_lines = []
     choices = []
             if pdfplumber:
                 text = self._extract_pdfplumber(pdf_path)
             if not text or len(text.strip()) < 120:
                 text = self._extract_tesseract(pdf_path)
         except Exception:
+            text = self._extract_tesseract(pdf_path) if (convert_from_path and pytesseract) else ""
         return text
+    def parse_questions_from_text(self, raw_text: str):
+        blocks = _split_blocks_by_number(raw_text)
         questions = []
         for blk in blocks:
             blk = blk.strip()
             if not blk:
                 continue
             blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
+            qtext, choices = _extract_choices_from_block(blk2)
             qtype = "mcq" if choices else "short_answer"
             questions.append({"text": qtext, "choices": choices, "question_type": qtype, "topics": [], "difficulty": 3})
         return questions
     def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
         text = self.extract_text(pdf_path)
+        questions = self.parse_questions_from_text(text)
         subject_token = subject_token.lower()
         qfile = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")