Spaces:

Benny-Tang
/

exam-simulator

Runtime error

App Files Files Community

Benny-Tang committed on Sep 13, 2025

Commit

dd7996c

verified ·

1 Parent(s): 30ace27

Update merge_questions.py

Browse files

Files changed (1) hide show

merge_questions.py +23 -59

merge_questions.py CHANGED Viewed

@@ -2,26 +2,11 @@ import json
 import glob
 import re
 import os
-import pdfplumber
 DATA_DIR = "data"
 OUTPUT_FILE = "questions.json"
-def extract_answers_from_pdf(pdf_path):
-    """Extract answers like '1. B' from a PDF answer scheme"""
-    answers = {}
-    with pdfplumber.open(pdf_path) as pdf:
-        for page in pdf.pages:
-            text = page.extract_text()
-            if not text:
-                continue
-            for match in re.findall(r"(\d+)\.\s*([A-D])", text):
-                qnum, ans = match
-                answers[int(qnum)] = ans
-    return answers
 def load_answers_from_json(json_path):
-    """Load answers from a JSON answer scheme: { '1': 'A', '2': 'C' }"""
     with open(json_path, "r", encoding="utf-8") as f:
         return {int(k): v for k, v in json.load(f).items()}
@@ -30,74 +15,53 @@ def merge_question_files(data_dir=DATA_DIR, output_file=OUTPUT_FILE):
     next_id = 1000
     missing_answers = []
-    # Find all question JSON files
-    question_files = glob.glob(os.path.join(data_dir, "spm_*.json"))
     for qfile in question_files:
-        year = re.search(r"spm_(\d+)\.json", qfile)
-        if not year:
             continue
-        year = year.group(1)
-        # Look for corresponding answer scheme
-        answers = {}
-        scheme_pdf = os.path.join(data_dir, f"spm_{year}_scheme.pdf")
-        scheme_json = os.path.join(data_dir, f"spm_{year}_scheme.json")
-        if os.path.exists(scheme_pdf):
-            answers = extract_answers_from_pdf(scheme_pdf)
-            print(f"✅ Loaded {len(answers)} answers from {scheme_pdf}")
-        elif os.path.exists(scheme_json):
-            answers = load_answers_from_json(scheme_json)
-            print(f"✅ Loaded {len(answers)} answers from {scheme_json}")
-        else:
-            print(f"⚠️ No answer scheme found for {year}, leaving blank.")
-        # Process questions
         with open(qfile, "r", encoding="utf-8") as f:
             data = json.load(f)
         for idx, q in enumerate(data, start=1):
             q["id"] = next_id
             next_id += 1
-            # Normalize subject (drop year suffix if any)
-            if "_" in q["subject"]:
-                parts = q["subject"].split("_")
-                if len(parts) > 2 and parts[-1].isdigit():
-                    q["subject"] = "_".join(parts[:-1])
-            # Assign correct answer if available
             if answers.get(idx):
-                letter = answers[idx]
                 try:
-                    q["correct_answer"] = q["choices"][ord(letter) - ord("A")]
                 except Exception:
                     q["correct_answer"] = ""
                     missing_answers.append((year, idx, q["text"]))
             else:
-                q["correct_answer"] = q.get("correct_answer", "")
-                if not q["correct_answer"]:
                     missing_answers.append((year, idx, q["text"][:50]))
             all_questions.append(q)
-    # Save merged file
     with open(output_file, "w", encoding="utf-8") as f:
         json.dump(all_questions, f, indent=2, ensure_ascii=False)
-    print(f"\n🎉 Merged {len(all_questions)} questions into {output_file}")
-    # Report missing answers
     if missing_answers:
-        print("\n⚠️ Missing correct answers for these questions:")
-        for year, idx, snippet in missing_answers[:20]:  # show first 20 only
-            print(f"  Year {year}, Q{idx}: {snippet}...")
-        if len(missing_answers) > 20:
-            print(f"  ... and {len(missing_answers)-20} more")
-    else:
-        print("✅ All questions have correct answers!")
 if __name__ == "__main__":
     merge_question_files()

 import glob
 import re
 import os
 DATA_DIR = "data"
 OUTPUT_FILE = "questions.json"
 def load_answers_from_json(json_path):
     """Read a JSON answer scheme and return it keyed by integer question number.

     The file must contain a single JSON object. Every key is converted with
     ``int``; every value is kept exactly as it was decoded.
     """
     with open(json_path, "r", encoding="utf-8") as scheme_file:
         raw_scheme = json.load(scheme_file)
     answers_by_number = {}
     for number, answer in raw_scheme.items():
         answers_by_number[int(number)] = answer
     return answers_by_number
     next_id = 1000
     missing_answers = []
+    question_files = glob.glob(os.path.join(data_dir, "spm_*_*.json"))
     for qfile in question_files:
+        if qfile.endswith("_scheme.json"):
+            continue
+        m = re.match(r".*spm_(\d+)_(\w+)\.json", qfile)
+        if not m:
             continue
+        year, subject = m.groups()
+        subj = subject  # keep case as in filename (BM, English, etc.)
+        subj_key = f"Form5_{subj}"
+        scheme_file = os.path.join(data_dir, f"spm_{year}_{subject}_scheme.json")
+        answers = load_answers_from_json(scheme_file) if os.path.exists(scheme_file) else {}
         with open(qfile, "r", encoding="utf-8") as f:
             data = json.load(f)
         for idx, q in enumerate(data, start=1):
             q["id"] = next_id
             next_id += 1
+            q["subject"] = subj_key
+            q["question_type"] = q.get("question_type", "mcq")
+            q["difficulty"] = q.get("difficulty", 3)
+            q["source"] = "pastpaper"
             if answers.get(idx):
                 try:
+                    q["correct_answer"] = q["choices"][ord(answers[idx]) - ord("A")]
                 except Exception:
                     q["correct_answer"] = ""
                     missing_answers.append((year, idx, q["text"]))
             else:
+                if not q.get("correct_answer"):
                     missing_answers.append((year, idx, q["text"][:50]))
             all_questions.append(q)
     with open(output_file, "w", encoding="utf-8") as f:
         json.dump(all_questions, f, indent=2, ensure_ascii=False)
+    print(f"✅ Merged {len(all_questions)} questions into {output_file}")
     if missing_answers:
+        print(f"⚠️ {len(missing_answers)} questions missing answers.")
 # Run the merge with its default arguments only when executed as a script.
 if __name__ == "__main__":
     merge_question_files()