Spaces:

Benny-Tang
/

exam-simulator

Runtime error

App Files Files Community

Benny-Tang committed on Sep 14, 2025

Commit

069219d

verified ·

1 Parent(s): fc1f560

Update ocr_agent.py

Browse files

Files changed (1) hide show

ocr_agent.py +23 -46

ocr_agent.py CHANGED Viewed

@@ -1,55 +1,32 @@
-import os
-import json
 import fitz  # PyMuPDF
-class OcrAgent:
-    def __init__(self):
-        pass
-    def extract_from_pdf(self, pdf_path):
-        """Extract text from a PDF using PyMuPDF."""
-        text = ""
         doc = fitz.open(pdf_path)
-        for page in doc:
-            text += page.get_text()
-        return text
-    def clean_text(self, raw_text):
-        """Basic text cleanup (stub)."""
-        return raw_text.replace("\n", " ").strip()
-    def text_to_json(self, text, subject, year, output_dir="data"):
-        """
-        Save extracted questions into JSON format and
-        auto-create a blank scheme JSON with null answers.
-        """
-        os.makedirs(output_dir, exist_ok=True)
-        base_name = f"spm_{year}_{subject.lower()}"
-        questions_file = os.path.join(output_dir, f"{base_name}.json")
-        scheme_file = os.path.join(output_dir, f"{base_name}_scheme.json")
-        # For now, split text into fake MCQs (stub)
         questions = []
-        for i, chunk in enumerate(text.split(".")[:10], start=1):
-            q = {
-                "id": int(f"{year}{i:03}"),
-                "text": chunk.strip() if chunk.strip() else f"Question {i} placeholder",
-                "choices": ["A", "B", "C", "D"],
-                "subject": subject,
-                "year": year
-            }
-            questions.append(q)
-        # Write questions.json
-        with open(questions_file, "w", encoding="utf-8") as f:
             json.dump(questions, f, indent=2, ensure_ascii=False)
-        # Create blank scheme.json with null answers
-        scheme_data = {str(q["id"]): {"correct_answer": None} for q in questions}
-        with open(scheme_file, "w", encoding="utf-8") as f:
-            json.dump(scheme_data, f, indent=2, ensure_ascii=False)
-        return questions_file

 import fitz  # PyMuPDF
+import json
+import os
+class OcrAgent:
+    """Extract question-like lines from a PDF and save them as JSON."""
+
+    def extract_questions(self, pdf_path, output_path):
+        """Collect candidate question lines from a PDF and write them to JSON.
+
+        A line is kept when, after stripping surrounding whitespace, it ends
+        with "?" or starts with "Q". Each kept line becomes a record with a
+        sequential ``id`` starting at 1000, placeholder ``choices`` A-D, an
+        empty ``topics`` list and ``correct_answer`` set to ``None``.
+
+        Args:
+            pdf_path: Path of the PDF, passed to ``fitz.open``.
+            output_path: Destination JSON file. Its parent directory is
+                created when the path names one.
+        """
         doc = fitz.open(pdf_path)
         questions = []
+        qid = 1000
+        try:
+            for page in doc:
+                text = page.get_text("text")
+                for line in text.splitlines():
+                    stripped = line.strip()
+                    if stripped.endswith("?") or stripped.startswith("Q"):
+                        questions.append(
+                            {
+                                "id": qid,
+                                "text": stripped,
+                                "choices": ["A", "B", "C", "D"],
+                                "topics": [],
+                                "correct_answer": None,
+                            }
+                        )
+                        qid += 1
+        finally:
+            # Release the PDF handle even if text extraction fails.
+            doc.close()
+        # dirname is "" for a bare filename, and os.makedirs("") raises
+        # FileNotFoundError, so only create a directory when one is named.
+        out_dir = os.path.dirname(output_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        with open(output_path, "w", encoding="utf-8") as f:
             json.dump(questions, f, indent=2, ensure_ascii=False)
+        print(f"✅ Extracted {len(questions)} questions → {output_path}")