Spaces:

Benny-Tang
/

exam-simulator

Runtime error

App Files Files Community

Benny-Tang committed on Sep 14, 2025

Commit

25efed5

verified ·

1 Parent(s): 9eec4e7

Update ocr_agent.py

Browse files

Files changed (1) hide show

ocr_agent.py +88 -91

ocr_agent.py CHANGED Viewed

@@ -1,88 +1,64 @@
 # ocr_agent.py
-import os
 import re
 import json
 try:
     import pdfplumber
-except Exception:
     pdfplumber = None
 try:
     from pdf2image import convert_from_path
     import pytesseract
-except Exception:
     convert_from_path = None
     pytesseract = None
-def _split_blocks_by_number(text: str):
-    # split on lines starting with "1. " or similar
-    return re.split(r"\n(?=\s*\d+\.)", text)
-def _extract_choices_from_block(block: str):
-    block = block.strip()
-    # look for A) or A. markers
-    m = re.search(r"\bA[\)\.]\s*", block)
-    if m:
-        start = m.start()
-        qtext = block[:start].strip()
-        opts_text = block[start:].strip()
-        items = re.split(r'(?=\b[A-D][\)\.]\s*)', opts_text)
-        choices = []
-        for it in items:
-            it = it.strip()
-            if not it:
-                continue
-            it2 = re.sub(r'^[A-D][\)\.]\s*', '', it).strip()
-            choices.append(it2)
-        if choices:
-            return qtext, choices
-    # fallback: lines style
-    lines = block.splitlines()
-    q_lines = []
-    choices = []
-    started = False
-    for ln in lines:
-        ln = ln.strip()
-        if re.match(r'^[A-D][\)\.]\s*', ln):
-            started = True
-            cl = re.sub(r'^[A-D][\)\.]\s*', '', ln).strip()
-            choices.append(cl)
-        else:
-            if not started:
-                q_lines.append(ln)
-            else:
-                if choices:
-                    choices[-1] += " " + ln
-    return " ".join(q_lines).strip(), choices
 class OcrAgent:
-    def __init__(self, tesseract_lang="eng"):
-        self.lang = tesseract_lang
-    def _extract_pdfplumber(self, path: str) -> str:
-        if pdfplumber is None:
-            return ""
-        texts = []
-        with pdfplumber.open(path) as pdf:
-            for p in pdf.pages:
-                texts.append(p.extract_text() or "")
-        return "\n\n".join(texts)
-    def _extract_tesseract(self, path: str) -> str:
-        if convert_from_path is None or pytesseract is None:
             return ""
-        images = convert_from_path(path, dpi=200)
-        texts = []
-        for img in images:
-            texts.append(pytesseract.image_to_string(img, lang=self.lang))
-        return "\n\n".join(texts)
     def extract_text(self, pdf_path: str) -> str:
         text = ""
         try:
             if pdfplumber:
@@ -90,39 +66,60 @@ class OcrAgent:
             if not text or len(text.strip()) < 120:
                 text = self._extract_tesseract(pdf_path)
         except Exception:
-            text = self._extract_tesseract(pdf_path) if (convert_from_path and pytesseract) else ""
-        return text
-    def parse_questions_from_text(self, raw_text: str):
-        blocks = _split_blocks_by_number(raw_text)
         questions = []
-        for blk in blocks:
-            blk = blk.strip()
-            if not blk:
                 continue
-            blk2 = re.sub(r'^\s*\d+\.\s*', '', blk)
-            qtext, choices = _extract_choices_from_block(blk2)
-            qtype = "mcq" if choices else "short_answer"
-            questions.append({"text": qtext, "choices": choices, "question_type": qtype, "topics": [], "difficulty": 3})
         return questions
-    def extract_questions_to_files(self, pdf_path: str, year: str, subject_token: str, out_dir: str = "data"):
-        text = self.extract_text(pdf_path)
-        questions = self.parse_questions_from_text(text)
-        subject_token = subject_token.lower()
-        qfile = os.path.join(out_dir, f"spm_{year}_{subject_token}.json")
-        scheme_file = os.path.join(out_dir, f"spm_{year}_{subject_token}_scheme.json")
-        os.makedirs(out_dir, exist_ok=True)
-        with open(qfile, "w", encoding="utf-8") as f:
-            json.dump(questions, f, indent=2, ensure_ascii=False)
-        scheme_map = {str(i + 1): None for i in range(len(questions))}
-        with open(scheme_file, "w", encoding="utf-8") as f:
-            json.dump(scheme_map, f, indent=2, ensure_ascii=False)
-        return qfile, scheme_file

 # ocr_agent.py
 import re
 import json
+import os
 try:
     import pdfplumber
+except ImportError:
     pdfplumber = None
 try:
     from pdf2image import convert_from_path
     import pytesseract
+except ImportError:
     convert_from_path = None
     pytesseract = None
 class OcrAgent:
+    def __init__(self, data_dir="data"):
+        self.data_dir = data_dir
+        os.makedirs(self.data_dir, exist_ok=True)
+    def _extract_pdfplumber(self, pdf_path: str) -> str:
+        """Extract text using pdfplumber."""
+        text = ""
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages:
+                text += page.extract_text() or ""
+        return text
+    def _extract_tesseract(self, pdf_path: str) -> str:
+        """Fallback: OCR via pdf2image + Tesseract."""
+        if not (convert_from_path and pytesseract):
             return ""
+        text = ""
+        pages = convert_from_path(pdf_path, dpi=300)
+        for page in pages:
+            text += pytesseract.image_to_string(page, lang="eng+msa") + "\n"
+        return text
+    def _clean_text(self, raw: str) -> str:
+        """Remove watermarks, scanner marks, and noise."""
+        lines = []
+        for line in raw.splitlines():
+            l = line.strip()
+            if not l:
+                continue
+            # Remove watermarks
+            if "bmspm.net" in l.lower():
+                continue
+            if "camscanner" in l.lower():
+                continue
+            # Remove page numbers (single integers)
+            if re.match(r"^\d+$", l):
+                continue
+            lines.append(l)
+        return "\n".join(lines)
     def extract_text(self, pdf_path: str) -> str:
+        """Extract and clean text from PDF."""
         text = ""
         try:
             if pdfplumber:
             if not text or len(text.strip()) < 120:
                 text = self._extract_tesseract(pdf_path)
         except Exception:
+            if convert_from_path and pytesseract:
+                text = self._extract_tesseract(pdf_path)
+        return self._clean_text(text)
+    def parse_questions(self, cleaned_text: str, subject: str, year: str):
+        """
+        Convert extracted text into structured question JSON.
+        Very naive parsing for now.
+        """
         questions = []
+        blocks = re.split(r"\n(?=\d+\.)", cleaned_text)
+        q_id = 1000
+        for block in blocks:
+            block = block.strip()
+            if not block:
                 continue
+            # First line is question
+            parts = block.split("\n")
+            q_text = parts[0]
+            # Remaining lines treated as choices
+            choices = []
+            for c in parts[1:]:
+                c = c.strip()
+                if re.match(r"^[A-D]\)", c):
+                    choices.append(c)
+            questions.append({
+                "id": q_id,
+                "text": q_text,
+                "choices": choices,
+                "topics": [],
+                "source": "pastpaper",
+                "subject": subject,
+                "year": year
+            })
+            q_id += 1
         return questions
+    def save_questions(self, questions, subject: str, year: str):
+        """Save questions into a JSON file like spm_2018_bm.json."""
+        filename = f"spm_{year}_{subject.lower()}.json"
+        out_path = os.path.join(self.data_dir, filename)
+        with open(out_path, "w", encoding="utf-8") as f:
+            json.dump(questions, f, ensure_ascii=False, indent=2)
+        return out_path
+    def process_pdf(self, pdf_path: str, subject: str, year: str):
+        """Main pipeline: extract → clean → parse → save."""
+        raw_text = self.extract_text(pdf_path)
+        questions = self.parse_questions(raw_text, subject, year)
+        return self.save_questions(questions, subject, year)