Spaces:

Benny-Tang
/

exam-simulator

Runtime error

App Files Files Community

Benny-Tang committed on Sep 14, 2025

Commit

e14bed5

verified ·

1 Parent(s): 25efed5

Update ocr_agent.py

Browse files

Files changed (1) hide show

ocr_agent.py +55 -109

ocr_agent.py CHANGED Viewed

@@ -1,124 +1,70 @@
-# ocr_agent.py
-import re
-import json
 import os
-try:
-    import pdfplumber
-except ImportError:
-    pdfplumber = None
-try:
-    from pdf2image import convert_from_path
-    import pytesseract
-except ImportError:
-    convert_from_path = None
-    pytesseract = None
 class OcrAgent:
     def __init__(self, data_dir="data"):
         self.data_dir = data_dir
         os.makedirs(self.data_dir, exist_ok=True)
-    def _extract_pdfplumber(self, pdf_path: str) -> str:
-        """Extract text using pdfplumber."""
-        text = ""
-        with pdfplumber.open(pdf_path) as pdf:
-            for page in pdf.pages:
-                text += page.extract_text() or ""
-        return text
-    def _extract_tesseract(self, pdf_path: str) -> str:
-        """Fallback: OCR via pdf2image + Tesseract."""
-        if not (convert_from_path and pytesseract):
-            return ""
-        text = ""
-        pages = convert_from_path(pdf_path, dpi=300)
-        for page in pages:
-            text += pytesseract.image_to_string(page, lang="eng+msa") + "\n"
-        return text
-    def _clean_text(self, raw: str) -> str:
-        """Remove watermarks, scanner marks, and noise."""
-        lines = []
-        for line in raw.splitlines():
-            l = line.strip()
-            if not l:
-                continue
-            # Remove watermarks
-            if "bmspm.net" in l.lower():
-                continue
-            if "camscanner" in l.lower():
-                continue
-            # Remove page numbers (single integers)
-            if re.match(r"^\d+$", l):
-                continue
-            lines.append(l)
-        return "\n".join(lines)
-    def extract_text(self, pdf_path: str) -> str:
-        """Extract and clean text from PDF."""
-        text = ""
-        try:
-            if pdfplumber:
-                text = self._extract_pdfplumber(pdf_path)
-            if not text or len(text.strip()) < 120:
-                text = self._extract_tesseract(pdf_path)
-        except Exception:
-            if convert_from_path and pytesseract:
-                text = self._extract_tesseract(pdf_path)
-        return self._clean_text(text)
-    def parse_questions(self, cleaned_text: str, subject: str, year: str):
         """
-        Convert extracted text into structured question JSON.
-        Very naive parsing for now.
         """
         questions = []
-        blocks = re.split(r"\n(?=\d+\.)", cleaned_text)
-        q_id = 1000
-        for block in blocks:
-            block = block.strip()
-            if not block:
-                continue
-            # First line is question
-            parts = block.split("\n")
-            q_text = parts[0]
-            # Remaining lines treated as choices
-            choices = []
-            for c in parts[1:]:
-                c = c.strip()
-                if re.match(r"^[A-D]\)", c):
-                    choices.append(c)
-            questions.append({
-                "id": q_id,
-                "text": q_text,
-                "choices": choices,
-                "topics": [],
-                "source": "pastpaper",
-                "subject": subject,
-                "year": year
-            })
-            q_id += 1
         return questions
-    def save_questions(self, questions, subject: str, year: str):
-        """Save questions into a JSON file like spm_2018_bm.json."""
-        filename = f"spm_{year}_{subject.lower()}.json"
-        out_path = os.path.join(self.data_dir, filename)
-        with open(out_path, "w", encoding="utf-8") as f:
-            json.dump(questions, f, ensure_ascii=False, indent=2)
-        return out_path
-    def process_pdf(self, pdf_path: str, subject: str, year: str):
-        """Main pipeline: extract → clean → parse → save."""
-        raw_text = self.extract_text(pdf_path)
-        questions = self.parse_questions(raw_text, subject, year)
-        return self.save_questions(questions, subject, year)

 import os
+import fitz  # PyMuPDF
+import re
 class OcrAgent:
     def __init__(self, data_dir="data"):
         self.data_dir = data_dir
         os.makedirs(self.data_dir, exist_ok=True)
+    def extract_questions(self, pdf_path, subject, year):
         """
+        Extracts questions from PDF with text + visual support.
+        Returns a list of formatted question blocks (HTML with text + images).
         """
+        doc = fitz.open(pdf_path)
         questions = []
+        q_counter = 1
+        for page_num, page in enumerate(doc, start=1):
+            text = page.get_text("text").strip()
+            images = page.get_images(full=True)
+            # Save images if present
+            img_paths = []
+            for i, img in enumerate(images, start=1):
+                xref = img[0]
+                pix = fitz.Pixmap(doc, xref)
+                img_filename = f"{subject}_{year}_q{q_counter}_{i}.png"
+                img_path = os.path.join(self.data_dir, img_filename)
+                pix.save(img_path)
+                img_paths.append(img_path)
+            # Split text into question + choices
+            match = re.split(r"\n[A-D]\.", text)
+            if len(match) > 1:
+                q_text = match[0].strip()
+                choices = re.findall(r"[A-D]\.\s?.*", text)
+            else:
+                q_text = text
+                choices = []
+            formatted = self.format_question_block(
+                q_counter, q_text, choices, img_paths
+            )
+            questions.append(formatted)
+            q_counter += 1
         return questions
+    def format_question_block(self, q_num, q_text, choices, img_paths):
+        """
+        Format one question into HTML with optional images and choices.
+        """
+        block = f"<b>Q{q_num}.</b> {q_text}<br>"
+        for img_path in img_paths:
+            rel_path = os.path.relpath(img_path, self.data_dir)
+            block += f'<img src="data/{rel_path}" style="max-width:400px;"><br>'
+        if choices:
+            block += "<ul>"
+            for choice in choices:
+                block += f"<li>{choice}</li>"
+            block += "</ul>"
+        return block