Spaces:

Benny-Tang
/

exam-simulator

Runtime error

App Files Files Community

Benny-Tang committed on Sep 14, 2025

Commit

bf49bb2

verified ·

1 Parent(s): dc8d23a

Update ocr_agent.py

Browse files

Files changed (1) hide show

ocr_agent.py +69 -277

ocr_agent.py CHANGED Viewed

@@ -1,285 +1,77 @@
-"""
-ocr_agent.py
-Handles converting scanned/digital SPM PDF papers into structured JSON question lists.
-Capabilities:
-- Try pdfplumber (best for digital PDFs with selectable text)
-- Fallback to pytesseract + pdf2image for scanned PDFs
-- Parse raw extracted text into question objects (MCQ-centric)
-- Optional "naturalize" pass using GLM-4.5 to rewrite/clean question text
-Outputs a list of question dicts suitable for merge_questions.py:
-[
-  {
-    "question_type": "mcq",
-    "text": "...",
-    "choices": ["A", "B", "C", "D"],
-    "topics": [],
-    "difficulty": 3
-  },
-  ...
-]
-"""
 import os
-import re
 import json
-import logging
-from typing import List, Dict
-# try import optional heavy deps; functions will check existence
-try:
-    import pdfplumber
-except Exception:
-    pdfplumber = None
-try:
-    from pdf2image import convert_from_path
-    from PIL import Image
-    import pytesseract
-except Exception:
-    convert_from_path = None
-    Image = None
-    pytesseract = None
-# optional GLM cleaning
-import requests
-GLM_API_URL = "https://api.your-glm-provider.com/v1/chat/completions"
-GLM_API_KEY = os.getenv("ZHIPUAI_API_KEY")
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-def extract_text_pdfplumber(pdf_path: str) -> str:
-    """Extract text using pdfplumber (works well for digital PDFs)."""
-    if pdfplumber is None:
-        raise RuntimeError("pdfplumber is not installed")
-    texts = []
-    with pdfplumber.open(pdf_path) as pdf:
-        for page in pdf.pages:
-            t = page.extract_text()
-            if t:
-                texts.append(t)
-    return "\n\n".join(texts)
-def extract_text_tesseract(pdf_path: str, dpi: int = 200, fmt="jpeg") -> str:
-    """Fallback OCR using pytesseract via pdf2image (for scanned PDFs)."""
-    if convert_from_path is None or pytesseract is None:
-        raise RuntimeError("pdf2image/pytesseract not available")
-    texts = []
-    # convert each page to image
-    images = convert_from_path(pdf_path, dpi=dpi)
-    for img in images:
-        text = pytesseract.image_to_string(img, lang='eng+msa')  # english + malay if Tesseract lang installed
-        texts.append(text)
-    return "\n\n".join(texts)
-def try_extract_text(pdf_path: str) -> str:
-    """Try pdfplumber first, fallback to tesseract. Returns raw extracted text."""
-    logger.info("Attempting pdfplumber extraction...")
-    if pdfplumber:
-        try:
-            text = extract_text_pdfplumber(pdf_path)
-            # heuristics: if extracted text is short, it's probably scanned — fall back
-            if len(text.strip()) >= 200:
-                logger.info("pdfplumber extraction looks OK (length=%d)", len(text))
-                return text
-            else:
-                logger.info("pdfplumber produced short text; falling back to OCR")
-        except Exception as e:
-            logger.warning("pdfplumber extraction failed: %s", e)
-    # fallback
-    logger.info("Attempting pytesseract extraction...")
-    if pytesseract and convert_from_path:
-        try:
-            text = extract_text_tesseract(pdf_path)
-            logger.info("pytesseract extraction done (length=%d)", len(text))
-            return text
-        except Exception as e:
-            logger.error("pytesseract extraction failed: %s", e)
-            raise
-    else:
-        raise RuntimeError("No available PDF/text extraction method (pdfplumber or pytesseract required).")
-# --- parsing heuristics --- #
-_RE_Q_SPLIT = re.compile(r'\n\s*\d+\.\s+', flags=re.MULTILINE)  # split on numbered questions like "1. "
-_RE_OPTION_LINE = re.compile(r'^[A-D][\).\s]+', flags=re.MULTILINE)
-_RE_FIND_OPTIONS = re.compile(r'(?:A[\).\s].*?)(?:B[\).\s].*?)(?:C[\).\s].*?)(?:D[\).\s].*?)', re.S)
-def parse_mcq_blocks(raw_text: str) -> List[Dict]:
-    """
-    Attempt to parse MCQ questions from raw_text.
-    Strategy:
-    - Normalize line breaks.
-    - Split by question numbers (1., 2., etc.)
-    - In each block try to find A/B/C/D option markers and separate choices.
-    - Return list of question dicts. Best-effort; may require human review for tricky PDFs.
-    """
-    text = raw_text.replace('\r\n', '\n').replace('\r', '\n')
-    # ensure leading "1. " if not present (some PDFs may use different style)
-    parts = re.split(r'\n(?=\d+\.\s)', "\n" + text)  # keeps the numbers as part of each block
-    questions = []
-    for part in parts:
-        part = part.strip()
-        if not part:
-            continue
-        # find the question number at start
-        m = re.match(r'^\d+\.\s*(.*)', part, flags=re.S)
-        if m:
-            body = m.group(1).strip()
-        else:
-            body = part
-        # attempt to extract choices
-        # search for A) / A. / A space markers
-        # find options by locating ' A ' ' B ' ' C ' ' D ' lines
-        # try different heuristics
-        options = []
-        # heuristic 1: find pattern A) ... B) ... C) ... D)
-        opt_match = re.search(r'(A[\)\.\s].*?)(?=B[\)\.\s])', body, flags=re.S)
-        if opt_match:
-            # use robust method: find all options by A B C D markers
-            # replace newlines inside options with spaces, then split by markers
-            raw = body
-            # find start of options (first 'A' marker)
-            start = re.search(r'\bA[\)\.\s]', raw)
-            if start:
-                q_text = raw[:start.start()].strip()
-                options_text = raw[start.start():].strip()
-                # split by A/B/C/D markers
-                items = re.split(r'(?=\b[A-D][\)\.]\s*)', options_text)
-                choices = []
-                for it in items:
-                    it = it.strip()
-                    if not it:
-                        continue
-                    # remove leading "A) " or "A. "
-                    it2 = re.sub(r'^[A-D][\)\.]\s*', '', it)
-                    choices.append(it2.strip().replace('\n', ' '))
-                if len(choices) >= 2:
-                    questions.append({
-                        "question_type": "mcq",
-                        "text": q_text,
-                        "choices": choices,
-                        "topics": [],
-                        "difficulty": 3
-                    })
-                    continue
-        # heuristic 2: lines with A) style
-        lines = body.split('\n')
-        choice_lines = [ln for ln in lines if re.match(r'^\s*[A-D][\)\.]\s*', ln)]
-        if len(choice_lines) >= 2:
-            # gather contiguous lines starting where first option appears
-            first_idx = next(i for i, ln in enumerate(lines) if re.match(r'^\s*[A-D][\)\.]\s*', ln))
-            q_text = ' '.join([ln.strip() for ln in lines[:first_idx]])
-            choices = []
-            for ln in lines[first_idx:]:
-                m = re.match(r'^\s*([A-D])[)\.]\s*(.*)', ln)
-                if m:
-                    choices.append(m.group(2).strip())
-            if choices:
-                questions.append({
-                    "question_type": "mcq",
-                    "text": q_text,
-                    "choices": choices,
-                    "topics": [],
-                    "difficulty": 3
-                })
                 continue
-        # fallback: treat entire block as a short-answer or descriptive question
-        questions.append({
-            "question_type": "short_answer",
-            "text": body.strip(),
-            "choices": [],
-            "topics": [],
-            "difficulty": 3
-        })
-    return questions
-# --- GLM-based naturalizer (optional) --- #
-def glm_naturalize_question(q_text: str, choices: List[str]=None) -> Dict:
-    """
-    Use GLM-4.5 to 'clean' and naturalize a single question.
-    Returns dict with keys: text, choices (possibly unchanged), note (optional).
-    NOTE: this uses your GLM API key and incurs cost.
-    """
-    if not GLM_API_KEY:
-        # no API key — return original
-        return {"text": q_text, "choices": choices or []}
-    system_prompt = "You are a helpful editor who rewrites exam questions to be clear, natural, concise, and exam-appropriate. Do not change the meaning."
-    user_prompt = f"Question: {q_text}\n\nChoices: {json.dumps(choices or [])}\n\nReturn JSON: {{'text': '...', 'choices': [...]}}, no extra commentary."
-    headers = {"Authorization": f"Bearer {GLM_API_KEY}", "Content-Type": "application/json"}
-    payload = {
-        "model": "glm-4.5",
-        "messages": [
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt}
-        ],
-        "temperature": 0.2,
-        "max_tokens": 300
-    }
-    try:
-        r = requests.post(GLM_API_URL, headers=headers, json=payload, timeout=30)
-        r.raise_for_status()
-        data = r.json()
-        raw = data["choices"][0]["message"]["content"]
-        # try extract JSON
-        m = re.search(r"(\{[\s\S]*\})", raw)
-        if m:
-            cleaned = json.loads(m.group(1).replace("'", '"'))
-            # ensure choices exist
-            return {"text": cleaned.get("text", q_text), "choices": cleaned.get("choices", choices or [])}
-    except Exception as e:
-        logger.warning("GLM naturalize failed: %s", e)
-    return {"text": q_text, "choices": choices or []}
-# --- top-level conversion function --- #
-def pdf_to_questions(pdf_path: str, year: int = None, subject: str = None, naturalize: bool = False) -> List[Dict]:
-    """
-    Convert a PDF path to a list of question dicts.
-    If naturalize=True and GLM key present, will call GLM to rewrite extracted questions.
-    """
-    raw = try_extract_text(pdf_path)
-    parsed = parse_mcq_blocks(raw)
-    if naturalize:
-        cleaned = []
-        for q in parsed:
-            try:
-                res = glm_naturalize_question(q["text"], q.get("choices", []))
-                q["text"] = res["text"]
-                q["choices"] = res["choices"]
-            except Exception as e:
-                logger.warning("naturalize failed for question: %s", e)
-            cleaned.append(q)
-        parsed = cleaned
-    # attach year/subject info placeholders (merge script will assign final subject key)
-    for q in parsed:
-        if year:
-            q.setdefault("year", year)
-        if subject:
-            q.setdefault("subject", subject)
-    return parsed
-def pdf_to_json_file(pdf_path: str, out_json_path: str, year: int = None, subject: str = None, naturalize: bool = False):
-    qs = pdf_to_questions(pdf_path, year=year, subject=subject, naturalize=naturalize)
-    # Write basic JSON array (questions without ids)
-    with open(out_json_path, "w", encoding="utf-8") as f:
-        json.dump(qs, f, indent=2, ensure_ascii=False)
-    logger.info("Wrote %d questions to %s", len(qs), out_json_path)
-    return out_json_path

 import os
 import json
+import pytesseract
+from PIL import Image
+import pdfplumber
class OcrAgent:
    """Extract text from images/PDFs and convert it into a simple question JSON file.

    OCR is performed with ``pytesseract``; PDFs are read with ``pdfplumber``.
    Both, together with ``PIL.Image``, are expected to be imported at module
    level.
    """

    # Letters recognised as answer-option markers ("A.", "B.", ...).
    _CHOICE_LETTERS = ("A", "B", "C", "D")

    def __init__(self, language="eng"):
        """Remember the language code passed to every ``pytesseract`` call."""
        self.language = language

    def extract_from_image(self, image_path):
        """Run OCR on one image file and return the recognised text."""
        # The context manager closes the image file once OCR has finished,
        # instead of leaving the handle open until garbage collection.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img, lang=self.language)

    def extract_from_pdf(self, pdf_path):
        """Return the text of every page of a PDF, joined with newlines.

        A page's native text is used when ``extract_text()`` yields something
        non-blank; otherwise the page is rendered at 300 dpi and OCR'd, which
        covers scanned pages.
        """
        text_blocks = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                # None, "" and whitespace-only text all mean "nothing usable".
                if not text or not text.strip():
                    page_image = page.to_image(resolution=300).original
                    text = pytesseract.image_to_string(page_image, lang=self.language)
                text_blocks.append(text)
        return "\n".join(text_blocks)

    def clean_text(self, raw_text):
        """Strip every line, drop blank ones, and join the rest with single spaces.

        NOTE: the result is one single line. ``text_to_json`` relies on line
        breaks to find questions and options, so it must be given the
        newline-preserving text, not the output of this method.
        """
        stripped_lines = (line.strip() for line in raw_text.splitlines())
        return " ".join(line for line in stripped_lines if line)

    @staticmethod
    def _question_body(line):
        """Return the text following a leading ``<number>.`` marker, else None.

        ``line`` must already be stripped. Any number of digits is accepted
        (so question 100 and beyond work), whitespace may separate the number
        from the dot, and a dot directly followed by a digit is treated as a
        decimal such as "3.5 cm" rather than a question marker.
        """
        digit_count = len(line) - len(line.lstrip("0123456789"))
        if digit_count == 0:
            return None
        rest = line[digit_count:].lstrip()
        if not rest.startswith("."):
            return None
        body = rest[1:]
        if body[:1].isdigit():
            return None
        return body.strip()

    @classmethod
    def _parse_questions(cls, raw_text):
        """Parse numbered questions with lettered options out of ``raw_text``.

        Returns a list of dicts with keys ``text``, ``choices`` and ``topics``
        (always ``["general"]``). Lines before the first question are ignored.
        """
        questions = []
        current = None
        for raw_line in raw_text.splitlines():
            line = raw_line.strip()
            if not line:
                continue

            body = cls._question_body(line)
            if body is not None:
                # A new question closes the one being collected.
                if current is not None:
                    questions.append(current)
                current = {"text": body, "choices": [], "topics": ["general"]}
                continue

            if current is None:
                # Preamble (headers, instructions) before question 1.
                continue

            # Slicing keeps a one-character line such as "A" from raising
            # IndexError the way line[1] would.
            if line[0] in cls._CHOICE_LETTERS and line[1:2] == ".":
                current["choices"].append(line[2:].strip())
            elif current["choices"]:
                # Once options have started, a wrapped line continues the
                # last option rather than the question stem.
                current["choices"][-1] = (current["choices"][-1] + " " + line).strip()
            else:
                # strip() avoids a leading space when the marker line was a
                # bare "1." with the stem on the following line.
                current["text"] = (current["text"] + " " + line).strip()

        if current is not None:
            questions.append(current)
        return questions

    def text_to_json(self, raw_text, subject="BM", year="2018", output_dir="data"):
        """Parse questions from ``raw_text`` and write them to a JSON file.

        ``raw_text`` must keep its line breaks and is expected to look like::

            1. Question text
               A. option
               B. option
               ...

        The questions are written as a JSON array to
        ``{output_dir}/spm_{year}_{subject}.json`` (the directory is created
        if missing) and that path is returned.
        """
        questions = self._parse_questions(raw_text)

        os.makedirs(output_dir, exist_ok=True)
        filename = f"{output_dir}/spm_{year}_{subject}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(questions, f, indent=2, ensure_ascii=False)
        return filename