Benny-Tang committed on
Commit
8f96aba
·
verified ·
1 Parent(s): 069219d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +217 -145
app.py CHANGED
@@ -1,210 +1,281 @@
1
  import os
 
2
  import json
3
  import random
 
4
  import gradio as gr
 
5
  from agents import AnalyzerAgent, CoachAgent, PredictiveAgent
6
  from ocr_agent import OcrAgent
7
- import subprocess
8
 
9
  # Paths
10
- QUESTION_BANK_PATH = "questions.json"
11
  DATA_DIR = "data"
 
 
 
 
12
 
13
- # Initialize agents
14
  analyzer = AnalyzerAgent()
15
  coach_agent = CoachAgent()
16
  predictor = PredictiveAgent()
17
  ocr_agent = OcrAgent()
18
 
19
- # Load question bank
20
  def load_question_bank():
21
- if not os.path.exists(QUESTION_BANK_PATH):
 
 
 
 
 
 
 
22
  return []
23
- with open(QUESTION_BANK_PATH, "r", encoding="utf-8") as f:
24
- try:
25
- return json.load(f)
26
- except json.JSONDecodeError:
27
- return []
28
 
29
  QUESTION_BANK = load_question_bank()
30
 
31
 
32
- # Merge JSONs after OCR extraction
33
  def merge_questions():
 
34
  try:
35
  subprocess.run(["python", "merge_questions.py"], check=True)
36
  global QUESTION_BANK
37
  QUESTION_BANK = load_question_bank()
38
- return True, "Merge successful."
39
  except subprocess.CalledProcessError as e:
40
  return False, f"Merge failed: {e}"
41
 
42
 
43
- # Extract questions via OCR and save as JSON
44
- def process_pdf(file, subject, year):
45
- if not file:
46
- return "No file uploaded."
47
- output_path = os.path.join(DATA_DIR, f"spm_{year}_{subject.lower()}.json")
48
- os.makedirs(DATA_DIR, exist_ok=True)
49
- ocr_agent.extract_questions(file, output_path)
50
- ok, msg = merge_questions()
51
- return f"OCR extracted data → {output_path}, merge status: {msg}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
 
 
 
 
 
53
 
54
- # Select questions for simulation
55
- def start_exam(level, subject, num_questions, include_predicted=False):
56
- if not QUESTION_BANK:
57
- return [], "Question bank is empty. Upload PDFs first."
58
 
59
- # Filter by subject
60
- pool = [q for q in QUESTION_BANK if q.get("subject") == subject]
 
 
 
 
 
 
 
61
 
62
- # Add predicted if chosen
63
  if include_predicted:
64
- predicted = predictor.predict(subject=subject, level=level, count=num_questions // 2)
65
- pool.extend(predicted)
66
-
67
- if not pool:
68
- return [], f"No questions available for {subject}."
69
-
70
- selected = random.sample(pool, min(num_questions, len(pool)))
71
- return selected, f"Loaded {len(selected)} questions."
72
-
73
-
74
- # Submit exam
75
- def submit_exam(answers, exam_data, level, subject):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  if not exam_data:
77
- return "No questions found.", {}, {}, {}, gr.update(visible=False), gr.update(visible=True)
78
 
79
  correct = 0
80
- graded = 0 # only grade if answer key exists
81
  per_question = {}
82
 
83
  for q in exam_data:
84
- qid = str(q["id"])
85
- user_ans = answers.get(qid)
 
 
86
  correct_ans = None
87
-
88
- if q["id"] < 900000: # past paper
89
- orig = next((item for item in QUESTION_BANK if item["id"] == q["id"]), None)
90
- correct_ans = orig.get("correct_answer") if orig else None
91
- else: # predicted
92
- correct_ans = q.get("correct_answer")
93
-
94
- per_question[qid] = {"user": user_ans, "correct": correct_ans, "topics": q.get("topics", [])}
95
-
96
- if correct_ans is not None: # only grade if valid answer key exists
 
 
 
 
 
 
 
 
 
97
  graded += 1
98
- if user_ans and str(user_ans).strip() == str(correct_ans).strip():
 
99
  correct += 1
100
 
101
- score = round(100 * correct / graded, 2) if graded > 0 else "N/A (no answer keys)"
102
 
103
  analysis = analyzer.analyze(per_question)
104
- coach = coach_agent.coach(analysis, level, subject)
105
- predictions_summary = predictor.summary(level, subject)
106
 
107
  return (
108
  f"Your Score: {score}%",
109
  analysis,
110
  coach,
111
- predictions_summary,
112
  gr.update(visible=False),
113
  gr.update(visible=True)
114
  )
115
 
116
 
117
- # Prefill subject/year from filename
118
- def auto_detect(file):
119
- if not file:
120
- return None, None
121
- fname = os.path.basename(file).lower()
122
- year = None
123
- subject = None
124
- for y in range(2018, 2025):
125
- if str(y) in fname:
126
- year = str(y)
127
- break
128
- subjects = ["bm", "english", "math", "history", "science", "moralstudies"]
129
- for s in subjects:
130
- if s in fname:
131
- subject = s
132
- break
133
- return year, subject
134
-
135
-
136
- def prefill_subject_year(file):
137
- """Return auto-detected subject/year for UI prefill"""
138
- if not file:
139
- return "BM", "2018"
140
- year, subject = auto_detect(file)
141
-
142
- valid_subjects = ["BM", "English", "Math", "History", "Science", "MoralStudies"]
143
- if subject:
144
- subject = subject.upper()
145
- if subject in ["B.M", "BAHASA", "BAHASAMELAYU"]:
146
- subject = "BM"
147
- if subject not in valid_subjects:
148
- subject = "BM"
149
- else:
150
- subject = "BM"
151
-
152
- return subject, year if year else "2018"
153
-
154
-
155
- # UI
156
  with gr.Blocks() as demo:
157
- gr.Markdown("# 📘 SPM Exam Simulation Platform")
158
-
159
- with gr.Tab("📤 Upload Papers"):
160
- pdf_file = gr.File(label="Upload SPM PDF", type="filepath")
161
- subject_dropdown = gr.Dropdown(
162
- label="Subject", choices=["BM", "English", "Math", "History", "Science", "MoralStudies"], value="BM"
163
- )
164
- year_dropdown = gr.Dropdown(
165
- label="Year", choices=[str(y) for y in range(2018, 2025)], value="2018"
166
- )
167
- upload_btn = gr.Button("Process PDF")
168
- upload_status = gr.Textbox(label="Upload Status")
169
-
170
- upload_btn.click(
171
- process_pdf, inputs=[pdf_file, subject_dropdown, year_dropdown], outputs=upload_status
172
- )
173
- pdf_file.change(prefill_subject_year, inputs=pdf_file, outputs=[subject_dropdown, year_dropdown])
174
-
175
- with gr.Tab("📝 Exam Simulation"):
176
- level = gr.Dropdown(label="Level", choices=["Form2", "Form3", "Higher1", "Higher2", "Higher3"], value="Form3")
177
- subject = gr.Dropdown(
178
- label="Subject", choices=["BM", "English", "Math", "History", "Science", "MoralStudies"], value="Math"
179
- )
180
- num_questions = gr.Slider(5, 50, value=10, step=1, label="Number of Questions")
181
- include_predicted = gr.Checkbox(label="Include AI-predicted questions", value=True)
 
 
 
 
182
  start_btn = gr.Button("Start Exam")
 
183
 
184
- exam_output = gr.State()
185
- exam_interface = gr.JSON(label="Exam Questions")
186
-
187
- start_btn.click(
188
- start_exam, inputs=[level, subject, num_questions, include_predicted], outputs=[exam_interface, upload_status]
189
- )
190
 
191
- with gr.Tab("📊 Results"):
192
- answers_input = gr.JSON(label="Submit Your Answers (JSON: {id: choice})")
193
- submit_btn = gr.Button("Submit Exam")
 
 
 
 
194
 
195
- score_output = gr.Textbox(label="Score")
196
- analysis_output = gr.JSON(label="Weakness Analysis")
197
- coach_output = gr.JSON(label="Personalized Coaching")
198
- predictor_output = gr.JSON(label="Predicted Trends")
199
-
200
- back_btn = gr.Button("← Back to Exam", visible=False)
201
- retry_btn = gr.Button("Retry", visible=False)
202
-
203
- submit_btn.click(
204
- submit_exam,
205
- inputs=[answers_input, exam_interface, level, subject],
206
- outputs=[score_output, analysis_output, coach_output, predictor_output, back_btn, retry_btn],
207
- )
208
 
209
  demo.launch()
210
 
@@ -218,3 +289,4 @@ demo.launch()
218
 
219
 
220
 
 
 
1
import json
import os
import random
import re
import subprocess
import sys

import gradio as gr

from agents import AnalyzerAgent, CoachAgent, PredictiveAgent
from ocr_agent import OcrAgent
 
10
 
11
# ---- Paths & on-disk layout ----
DATA_DIR = "data"
QUESTIONS_FILE = "questions.json"

# Guarantee the per-paper JSON output directory exists before any OCR runs.
os.makedirs(DATA_DIR, exist_ok=True)

# ---- Agent singletons, shared across all requests ----
analyzer = AnalyzerAgent()
coach_agent = CoachAgent()
predictor = PredictiveAgent()
ocr_agent = OcrAgent()
23
 
24
+
25
def load_question_bank(path=None):
    """Load the merged question bank from JSON.

    Args:
        path: JSON file to read; defaults to the module-level QUESTIONS_FILE.

    Returns:
        The parsed question list, or [] when the file is missing, empty,
        or not valid JSON.
    """
    if path is None:
        path = QUESTIONS_FILE
    if not os.path.exists(path):
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read().strip()
        # An empty/whitespace-only file is treated the same as a missing one.
        return json.loads(content) if content else []
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError; OSError covers read failures.
        return []
35
+
 
 
 
 
36
 
37
# In-memory question bank, loaded once at import; merge_questions() refreshes it.
QUESTION_BANK = load_question_bank()
38
 
39
 
40
+ # ---------------- Merge helper ----------------
41
def merge_questions():
    """Run merge_questions.py to rebuild questions.json and reload the bank.

    Returns:
        (ok, message): ok is True when the merge subprocess succeeded.
    """
    try:
        # Use the current interpreter: a bare "python" may resolve to a
        # different installation (e.g. outside the active virtualenv).
        subprocess.run([sys.executable, "merge_questions.py"], check=True)
        global QUESTION_BANK
        QUESTION_BANK = load_question_bank()
        return True, "Merge successful"
    except subprocess.CalledProcessError as e:
        return False, f"Merge failed: {e}"
50
 
51
 
52
+ # ---------------- OCR / Upload ----------------
53
def auto_detect_from_filename(path):
    """Guess (year, subject_token) from an uploaded file's name.

    Recognises names such as ``spm_2018_bm.pdf`` or ``spm-2019-math.pdf``
    (case-insensitive, underscore or dash separators, separators optional).
    Returns (None, None) when nothing can be detected.
    """
    if not path:
        return None, None
    base_name = os.path.basename(path)
    match = re.search(r"spm[_\-]?(\d{4})[_\-]?([A-Za-z]+)", base_name, re.IGNORECASE)
    if match is None:
        return None, None
    detected_year, raw_subject = match.groups()
    return detected_year, raw_subject.lower()
65
+
66
+
67
# Canonical subject labels, in the order shown in the UI dropdowns.
SUBJECT_DISPLAY_ORDER = [
    "BM",
    "English",
    "Math",
    "History",
    "Science",
    "MoralStudies",
    "Accounting",
    "Economics",
    "Business",
]
69
+
70
+
71
def normalize_display_subject(token):
    """Map a raw subject token to its display label ("bm" -> "BM", etc.).

    A missing token maps to the default subject "BM"; an unrecognised
    token falls back to ``token.capitalize()``.
    """
    if not token:
        return "BM"

    aliases = {
        "bm": "BM",
        "bahasa": "BM",
        "bahasamelayu": "BM",
        "english": "English",
        "eng": "English",
        "math": "Math",
        "mathematics": "Math",
        "history": "History",
        "sejarah": "History",
        "science": "Science",
        "moral": "MoralStudies",
        "moralstudies": "MoralStudies",
        "accounting": "Accounting",
        "economics": "Economics",
        "business": "Business",
    }
    lookup_key = token.strip().lower()
    # NOTE: the fallback capitalizes the *raw* token (not the stripped one),
    # matching the historical behavior of this helper.
    return aliases.get(lookup_key, token.capitalize())
94
+
95
+
96
def subject_token_from_display(display_subj):
    """Convert a display subject label (e.g. "BM") to the lowercase token used in filenames."""
    return display_subj.strip().lower() if display_subj else "bm"
102
+
103
+
104
def process_pdf_and_merge(file_path, display_subject, year):
    """OCR an uploaded SPM PDF into data/ JSON files, then rebuild questions.json.

    The OCR agent writes data/spm_{year}_{subject}.json plus its answer-scheme
    file; afterwards the merge step refreshes the in-memory question bank.

    Returns:
        A human-readable status string for the UI.
    """
    if not file_path:
        return "No file provided."

    token = subject_token_from_display(display_subject)
    try:
        # Delegate extraction and file writing to the OCR agent.
        out_qfile, out_scheme = ocr_agent.extract_questions_to_files(
            pdf_path=file_path,
            year=str(year),
            subject_token=token,
            out_dir=DATA_DIR,
        )
    except Exception as e:
        return f"❌ OCR failed: {e}"

    ok, msg = merge_questions()
    if ok:
        return f"✅ OCR saved {out_qfile} and {out_scheme}. Merge result: {msg}"
    return f"⚠️ OCR saved {out_qfile} and {out_scheme}. Merge result: {msg}"
127
 
 
 
 
 
128
 
129
+ # ---------------- Exam logic ----------------
130
def generate_exam(subject_display, num_questions, include_predicted, question_bank=None):
    """Build a randomized exam for one subject.

    Args:
        subject_display: display subject label (e.g. "Math"); the bank stores
            subjects under the key "Form5_<DisplaySubject>".
        num_questions: maximum number of questions to select.
        include_predicted: when True, mixes in AI-predicted questions.
        question_bank: optional explicit bank; defaults to the module-level
            QUESTION_BANK (parameter added for testability, backward compatible).

    Returns:
        (exam_data, status_message, exam_data) — exam_data is returned twice so
        the caller can both display it and store it in gr.State.
        Items: {id, text, choices, topics, source, [correct_answer]}.
    """
    bank = QUESTION_BANK if question_bank is None else question_bank

    subj_key = f"Form5_{subject_display}"
    pool = [q for q in bank if q.get("subject") == subj_key]

    predicted_questions = []
    if include_predicted:
        # Feed the current bank to the predictor so trend information is used.
        predicted_questions = predictor.generate_predictions(
            level="Form5",
            subject=subject_display,
            n=8,
            question_bank=bank,
        )

    combined = pool + predicted_questions
    if not combined:
        return [], f"No questions available for {subject_display}. Upload papers (2018–2024) first.", []

    random.shuffle(combined)
    # int() guards against a float slider value; slicing clamps automatically.
    selected = combined[:int(num_questions)]

    exam_data = []
    for q in selected:
        item = {
            "id": q.get("id"),
            "text": q.get("text"),
            "choices": q.get("choices", []),
            "topics": q.get("topics", []),
            "source": q.get("source", "pastpaper"),
        }
        # BUG FIX: predicted questions exist only in memory (never saved to
        # questions.json), so their answer key must travel on the exam item
        # itself — otherwise grading can never score them.
        if item["source"] == "predicted" and "correct_answer" in q:
            item["correct_answer"] = q["correct_answer"]
        exam_data.append(item)

    return exam_data, f"Prepared {len(exam_data)} questions (includes {len(predicted_questions)} predicted)", exam_data
168
+
169
+
170
def submit_exam_answers(answers_json, exam_data, subject_display):
    """Grade a submitted exam and produce analysis/coaching/prediction views.

    Args:
        answers_json: dict mapping question id (string) -> chosen answer text;
            may be None when the JSON box was left empty.
        exam_data: the list produced by generate_exam (held in gr.State).
        subject_display: display subject label, e.g. "Math".

    Only questions with a known answer key are graded: past-paper questions
    are looked up by id in QUESTION_BANK; predicted questions are graded from
    the 'correct_answer' carried on the exam item itself (when present).

    Returns:
        (score_text, analysis, coach, predictions_summary,
         gr.update(visible=False), gr.update(visible=True))
    """
    if not exam_data:
        return "No exam data found.", {}, {}, {}, gr.update(visible=False), gr.update(visible=True)

    # BUG FIX: tolerate an unset answers box (None) instead of crashing on .get.
    answers_json = answers_json or {}

    correct = 0
    graded = 0
    per_question = {}

    for q in exam_data:
        qid = q.get("id")
        key = str(qid)
        user_ans = answers_json.get(key)

        if q.get("source") == "predicted":
            # Predicted questions live only in memory; their key (if any)
            # travels on the exam item itself.
            correct_ans = q.get("correct_answer")
        else:
            # Past paper: canonical answer comes from the merged bank.
            orig = next((item for item in QUESTION_BANK if item.get("id") == qid), None)
            correct_ans = orig.get("correct_answer") if orig else None

        per_question[key] = {"user": user_ans, "correct": correct_ans, "topics": q.get("topics", [])}

        if correct_ans is not None:
            graded += 1
            # String-normalized comparison so stray whitespace never misgrades.
            if user_ans is not None and str(user_ans).strip() == str(correct_ans).strip():
                correct += 1

    score = round(100 * correct / graded, 2) if graded > 0 else "N/A (no answer keys available)"

    analysis = analyzer.analyze(per_question)
    coach = coach_agent.coach(analysis, "Form5", subject_display)
    pred_summary = predictor.summary(level="Form5", subject=subject_display, question_bank=QUESTION_BANK)

    return (
        f"Your Score: {score}%",
        analysis,
        coach,
        pred_summary,
        gr.update(visible=False),
        gr.update(visible=True)
    )
227
 
228
 
229
+ # ----------------- UI -----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
# ----------------- UI -----------------
with gr.Blocks() as demo:
    gr.Markdown("## SPM Exam Simulator — Form 5 (Past papers 2018–2024) with AI Predictions & OCR")

    with gr.Tab("Upload Papers (OCR → JSON → Merge)"):
        pdf_file = gr.File(label="Upload SPM PDF (filename like spm_2018_bm.pdf helps auto-detect)",
                           type="filepath")
        subject_dropdown = gr.Dropdown(choices=SUBJECT_DISPLAY_ORDER, value="BM", label="Subject (override)")
        year_dropdown = gr.Dropdown(choices=[str(y) for y in range(2018, 2025)], value="2018", label="Year")
        process_btn = gr.Button("Process PDF → JSON + Merge")
        ocr_status = gr.Textbox(label="Status", interactive=False)

        def prefill(file_path):
            """Auto-fill subject/year dropdowns from the uploaded filename."""
            if not file_path:
                return "BM", "2018"
            year, subj_token = auto_detect_from_filename(file_path)
            subj_display = normalize_display_subject(subj_token) if subj_token else "BM"
            return subj_display, year if year else "2018"

        pdf_file.change(fn=prefill, inputs=[pdf_file], outputs=[subject_dropdown, year_dropdown])
        process_btn.click(fn=process_pdf_and_merge,
                          inputs=[pdf_file, subject_dropdown, year_dropdown],
                          outputs=[ocr_status])

    with gr.Tab("Exam Simulator"):
        subject_sel = gr.Dropdown(choices=SUBJECT_DISPLAY_ORDER, value="Math", label="Subject")
        num_q = gr.Slider(minimum=5, maximum=50, step=5, value=10, label="Number of Questions")
        include_pred = gr.Checkbox(value=True, label="Include AI-predicted questions (in-memory only)")
        start_btn = gr.Button("Start Exam")
        exam_state = gr.State()  # stores exam_data (list) between events
        # BUG FIX: the status Textbox must be a real component created before
        # wiring — it was previously instantiated inline inside outputs=[...].
        exam_status = gr.Textbox(label="Status", interactive=False)
        exam_display = gr.JSON(label="Exam Questions (read-only)")

        start_btn.click(fn=generate_exam,
                        inputs=[subject_sel, num_q, include_pred],
                        outputs=[exam_display, exam_status, exam_state])

    with gr.Tab("Submit & Results"):
        answers_input = gr.JSON(label="Your Answers (JSON dictionary: {\"<id>\": \"<choice text>\"})")
        submit_btn = gr.Button("Submit Answers")
        score_out = gr.Textbox(label="Score")
        analysis_out = gr.JSON(label="Weakness Analysis")
        coach_out = gr.JSON(label="Study Coach")
        pred_out = gr.JSON(label="Predictions Summary")
        # Targets for the two gr.update(...) values returned by the handler.
        back_btn = gr.Button("← Back to Exam", visible=False)
        retry_btn = gr.Button("Retry", visible=False)

        # BUG FIX: pass the shared exam_state (a fresh gr.State() here would
        # always be empty), and give the two gr.update return values real
        # component targets instead of the non-component gr.Update class.
        submit_btn.click(fn=submit_exam_answers,
                         inputs=[answers_input, exam_state, subject_sel],
                         outputs=[score_out, analysis_out, coach_out, pred_out, back_btn, retry_btn])

demo.launch()
281
 
 
289
 
290
 
291
 
292
+