Spaces:

Benny-Tang
/

exam-simulator

Runtime error

App Files Files Community

Benny-Tang committed on Sep 14, 2025

Commit

09e298e

verified ·

1 Parent(s): 8f96aba

Update merge_questions.py

Browse files

Files changed (1) hide show

merge_questions.py +115 -30

merge_questions.py CHANGED Viewed

@@ -1,48 +1,133 @@
 import os
 import json
 DATA_DIR = "data"
-OUTPUT_PATH = "questions.json"
-def load_json_file(path):
     try:
         with open(path, "r", encoding="utf-8") as f:
             return json.load(f)
-    except (json.JSONDecodeError, FileNotFoundError):
-        return []
-def merge_json_files():
     merged = []
-    if not os.path.exists(DATA_DIR):
-        os.makedirs(DATA_DIR, exist_ok=True)
-    for fname in os.listdir(DATA_DIR):
-        if fname.endswith(".json"):
-            path = os.path.join(DATA_DIR, fname)
-            data = load_json_file(path)
-            if isinstance(data, list):
-                merged.extend(data)
-            else:
-                print(f"⚠️ Skipped invalid file: {fname}")
-    # Deduplicate by question ID
-    seen = set()
-    unique = []
-    for q in merged:
-        if q.get("id") not in seen:
-            seen.add(q.get("id"))
-            unique.append(q)
-    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
-        json.dump(unique, f, indent=2, ensure_ascii=False)
-    print(f"✅ Merged {len(unique)} questions into {OUTPUT_PATH}")
 if __name__ == "__main__":
-    merge_json_files()

 import os
 import json
+import re
+from pathlib import Path
 # Default directory that merge_all scans for per-paper question files.
 DATA_DIR = "data"
 # Default path of the merged JSON file written by merge_all.
+OUTPUT_FILE = "questions.json"
+# Allowed tokens in data filenames (lowercase)
 # Maps each lowercase filename token to the display name that merge_all
 # appends to "Form5_" when building a question's "subject" value.
+VALID_SUBJECT_TOKENS = {
+    "bm": "BM",
+    "english": "English",
+    "math": "Math",
+    "history": "History",
+    "science": "Science",
+    "moralstudies": "MoralStudies",
+    "accounting": "Accounting",
+    "economics": "Economics",
+    "business": "Business",
+}
 # Accepted paper years as strings: 2018 through 2024 inclusive
 # (the range end, 2025, is exclusive).
+YEARS = [str(y) for y in range(2018, 2025)]
def safe_load_json(path):
    """Load and return the JSON document stored at *path*.

    This is a best-effort loader: data-related failures never raise.

    Args:
        path: Filesystem path of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value, or ``None`` when the file is missing,
        unreadable, or does not contain valid JSON.

    Note:
        A missing file is treated as a normal condition (optional files such
        as marking schemes are often absent) and is not reported. Any other
        I/O or decoding failure is printed so that a corrupt file is not
        silently mistaken for an absent one.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # Absent files are expected; stay quiet.
        return None
    except (OSError, ValueError, RecursionError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError;
        # RecursionError guards against pathologically nested documents.
        print(f"Could not load {path}: {exc}")
        return None
def _resolve_correct_answer(scheme, idx, choices):
    """Return the correct answer for question *idx* from a marking scheme.

    Args:
        scheme: Parsed scheme document. Only a ``dict`` is consulted; any
            other value yields ``None``.
        idx: 1-based position of the question inside its source file.
        choices: The question's list of choice texts (may be empty).

    Returns:
        The answer text, or ``None`` when the scheme has no usable entry.

    Two entry formats are accepted, keyed by the 1-based index:
      1) a single letter ("A", "B", ...) that selects from *choices*;
      2) ``{"correct_answer": "<choice text>"}``.
    Any other non-blank string is taken as the answer text itself.
    """
    if not isinstance(scheme, dict):
        return None
    val = scheme.get(str(idx)) or scheme.get(idx)
    if isinstance(val, dict):
        return val.get("correct_answer")
    if not isinstance(val, str):
        return None
    letter = val.strip().upper()
    if len(letter) == 1 and choices:
        pos = ord(letter) - ord("A")
        if isinstance(choices, list) and 0 <= pos < len(choices):
            return choices[pos]
        # Letter out of range for this question -> no answer.
        return None
    # Direct answer text; a whitespace-only entry counts as "no answer".
    return val if val.strip() else None


def merge_all(data_dir=DATA_DIR, output_file=OUTPUT_FILE):
    """Merge per-paper question files into one normalized JSON file.

    Scans *data_dir* for files named ``spm_{year}_{subject}.json`` (matched
    case-insensitively, with ``_`` or ``-`` as separator), normalizes every
    question, attaches the correct answer from the optional sibling
    ``<stem>_scheme.json`` file, and writes the combined list to
    *output_file*.

    Args:
        data_dir: Directory holding the question and scheme files. It is
            created if it does not exist.
        output_file: Path of the merged JSON file; overwritten on each run.

    Returns:
        None. The result is written to *output_file* and a summary line is
        printed.

    Notes:
        * Files with a year outside ``YEARS``, a subject token missing from
          ``VALID_SUBJECT_TOKENS``, or non-list content are skipped with a
          message.
        * Question ids start at 1000 and are assigned in sorted file order,
          then in file position order.
        * List entries that are not JSON objects are skipped with a message.
          They still consume their 1-based position so the remaining
          questions stay aligned with the scheme.
    """
    os.makedirs(data_dir, exist_ok=True)
    # Compiled once; the same pattern is applied to every candidate file.
    name_pattern = re.compile(
        r"spm[_\-](\d{4})[_\-]([a-zA-Z]+)\.json$", re.IGNORECASE
    )
    merged = []
    next_id = 1000

    # List the directory and filter by name here instead of globbing:
    # Path.glob("spm_*_*.json") is case-sensitive on POSIX and never matches
    # the "-" separator that the filename pattern allows.
    candidates = sorted(
        entry
        for entry in Path(data_dir).iterdir()
        if entry.is_file()
        and entry.name.lower().startswith("spm")
        and entry.name.lower().endswith(".json")
    )

    for file in candidates:
        fname = file.name
        # skip scheme files (they are loaded alongside their question file)
        if fname.lower().endswith("_scheme.json"):
            continue
        m = name_pattern.match(fname)
        if not m:
            print(f"Skipping non-matching file: {fname}")
            continue
        year_token = m.group(1)
        subj_token = m.group(2).lower()
        if year_token not in YEARS:
            print(f"Skipping year outside 2018-2024: {fname}")
            continue
        if subj_token not in VALID_SUBJECT_TOKENS:
            print(f"Unknown subject token '{subj_token}' in {fname}; skipping.")
            continue
        subj_key = f"Form5_{VALID_SUBJECT_TOKENS[subj_token]}"

        # load questions list from file
        qlist = safe_load_json(str(file))
        if not isinstance(qlist, list):
            print(f"Skipping {fname}: not a JSON list.")
            continue

        # load scheme if exists
        scheme_path = file.with_name(f"{file.stem}_scheme.json")
        scheme = safe_load_json(str(scheme_path)) or {}

        for idx, q in enumerate(qlist, start=1):
            if not isinstance(q, dict):
                # A malformed entry must not abort the whole merge.
                print(f"Skipping entry {idx} in {fname}: not a JSON object.")
                continue
            # "text" may be JSON null or a non-string; normalize to str.
            raw_text = q.get("text")
            text = "" if raw_text is None else str(raw_text).strip()
            choices = q.get("choices", [])
            merged.append(
                {
                    "id": next_id,
                    "subject": subj_key,
                    "question_type": q.get("question_type", "mcq"),
                    "text": text,
                    "choices": choices,
                    "topics": q.get("topics", []),
                    "difficulty": q.get("difficulty", 3),
                    "source": q.get("source", "pastpaper"),
                    "year": int(year_token),
                    "correct_answer": _resolve_correct_answer(
                        scheme, idx, choices
                    ),
                }
            )
            next_id += 1

    # write merged file (overwrite)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)
    print(f"✅ Merged {len(merged)} questions into {output_file}")
 # When executed as a script, run the merge with the default data directory
 # and output path.
 if __name__ == "__main__":
+    merge_all()