Spaces:
Runtime error
Runtime error
Update merge_questions.py
Browse files
- merge_questions.py +23 -59
merge_questions.py
CHANGED
|
@@ -2,26 +2,11 @@ import json
|
|
| 2 |
import glob
|
| 3 |
import re
|
| 4 |
import os
|
| 5 |
-
import pdfplumber
|
| 6 |
|
| 7 |
DATA_DIR = "data"
|
| 8 |
OUTPUT_FILE = "questions.json"
|
| 9 |
|
| 10 |
-
def extract_answers_from_pdf(pdf_path):
|
| 11 |
-
"""Extract answers like '1. B' from a PDF answer scheme"""
|
| 12 |
-
answers = {}
|
| 13 |
-
with pdfplumber.open(pdf_path) as pdf:
|
| 14 |
-
for page in pdf.pages:
|
| 15 |
-
text = page.extract_text()
|
| 16 |
-
if not text:
|
| 17 |
-
continue
|
| 18 |
-
for match in re.findall(r"(\d+)\.\s*([A-D])", text):
|
| 19 |
-
qnum, ans = match
|
| 20 |
-
answers[int(qnum)] = ans
|
| 21 |
-
return answers
|
| 22 |
-
|
| 23 |
def load_answers_from_json(json_path):
|
| 24 |
-
"""Load answers from a JSON answer scheme: { '1': 'A', '2': 'C' }"""
|
| 25 |
with open(json_path, "r", encoding="utf-8") as f:
|
| 26 |
return {int(k): v for k, v in json.load(f).items()}
|
| 27 |
|
|
@@ -30,74 +15,53 @@ def merge_question_files(data_dir=DATA_DIR, output_file=OUTPUT_FILE):
|
|
| 30 |
next_id = 1000
|
| 31 |
missing_answers = []
|
| 32 |
|
| 33 |
-
|
| 34 |
-
question_files = glob.glob(os.path.join(data_dir, "spm_*.json"))
|
| 35 |
|
| 36 |
for qfile in question_files:
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
| 39 |
continue
|
| 40 |
-
year =
|
| 41 |
-
|
| 42 |
-
#
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
answers = extract_answers_from_pdf(scheme_pdf)
|
| 49 |
-
print(f"✅ Loaded {len(answers)} answers from {scheme_pdf}")
|
| 50 |
-
elif os.path.exists(scheme_json):
|
| 51 |
-
answers = load_answers_from_json(scheme_json)
|
| 52 |
-
print(f"✅ Loaded {len(answers)} answers from {scheme_json}")
|
| 53 |
-
else:
|
| 54 |
-
print(f"⚠️ No answer scheme found for {year}, leaving blank.")
|
| 55 |
-
|
| 56 |
-
# Process questions
|
| 57 |
with open(qfile, "r", encoding="utf-8") as f:
|
| 58 |
data = json.load(f)
|
| 59 |
|
| 60 |
for idx, q in enumerate(data, start=1):
|
| 61 |
q["id"] = next_id
|
| 62 |
next_id += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
-
# Normalize subject (drop year suffix if any)
|
| 65 |
-
if "_" in q["subject"]:
|
| 66 |
-
parts = q["subject"].split("_")
|
| 67 |
-
if len(parts) > 2 and parts[-1].isdigit():
|
| 68 |
-
q["subject"] = "_".join(parts[:-1])
|
| 69 |
-
|
| 70 |
-
# Assign correct answer if available
|
| 71 |
if answers.get(idx):
|
| 72 |
-
letter = answers[idx]
|
| 73 |
try:
|
| 74 |
-
q["correct_answer"] = q["choices"][ord(
|
| 75 |
except Exception:
|
| 76 |
q["correct_answer"] = ""
|
| 77 |
missing_answers.append((year, idx, q["text"]))
|
| 78 |
else:
|
| 79 |
-
|
| 80 |
-
if not q["correct_answer"]:
|
| 81 |
missing_answers.append((year, idx, q["text"][:50]))
|
| 82 |
|
| 83 |
all_questions.append(q)
|
| 84 |
|
| 85 |
-
# Save merged file
|
| 86 |
with open(output_file, "w", encoding="utf-8") as f:
|
| 87 |
json.dump(all_questions, f, indent=2, ensure_ascii=False)
|
| 88 |
|
| 89 |
-
print(f"
|
| 90 |
-
|
| 91 |
-
# Report missing answers
|
| 92 |
if missing_answers:
|
| 93 |
-
print("
|
| 94 |
-
for year, idx, snippet in missing_answers[:20]: # show first 20 only
|
| 95 |
-
print(f" Year {year}, Q{idx}: {snippet}...")
|
| 96 |
-
if len(missing_answers) > 20:
|
| 97 |
-
print(f" ... and {len(missing_answers)-20} more")
|
| 98 |
-
else:
|
| 99 |
-
print("✅ All questions have correct answers!")
|
| 100 |
-
|
| 101 |
|
| 102 |
if __name__ == "__main__":
|
| 103 |
merge_question_files()
|
|
|
|
|
|
| 2 |
import glob
|
| 3 |
import re
|
| 4 |
import os
|
|
|
|
| 5 |
|
| 6 |
DATA_DIR = "data"
|
| 7 |
OUTPUT_FILE = "questions.json"
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
def load_answers_from_json(json_path):
    """Load an answer scheme from a JSON file.

    The file must contain a single JSON object whose keys are question
    numbers written as strings and whose values are the answers, e.g.
    ``{"1": "A", "2": "C"}``.

    Args:
        json_path: Path to the JSON answer-scheme file (read as UTF-8).

    Returns:
        A dict mapping each question number (``int``) to its answer.
        String answers are stripped of surrounding whitespace and
        upper-cased; non-string values are returned unchanged.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON or a key is not an
            integer.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    # Normalise letters here so that entries like " c " or "d" are not
    # silently treated as missing by code that indexes choices with
    # ord(answer) - ord("A").
    return {
        int(number): answer.strip().upper() if isinstance(answer, str) else answer
        for number, answer in raw.items()
    }
|
| 12 |
|
|
|
|
| 15 |
next_id = 1000
|
| 16 |
missing_answers = []
|
| 17 |
|
| 18 |
+
question_files = glob.glob(os.path.join(data_dir, "spm_*_*.json"))
|
|
|
|
| 19 |
|
| 20 |
for qfile in question_files:
|
| 21 |
+
if qfile.endswith("_scheme.json"):
|
| 22 |
+
continue
|
| 23 |
+
|
| 24 |
+
m = re.match(r".*spm_(\d+)_(\w+)\.json", qfile)
|
| 25 |
+
if not m:
|
| 26 |
continue
|
| 27 |
+
year, subject = m.groups()
|
| 28 |
+
|
| 29 |
+
subj = subject # keep case as in filename (BM, English, etc.)
|
| 30 |
+
subj_key = f"Form5_{subj}"
|
| 31 |
+
|
| 32 |
+
scheme_file = os.path.join(data_dir, f"spm_{year}_{subject}_scheme.json")
|
| 33 |
+
answers = load_answers_from_json(scheme_file) if os.path.exists(scheme_file) else {}
|
| 34 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
with open(qfile, "r", encoding="utf-8") as f:
|
| 36 |
data = json.load(f)
|
| 37 |
|
| 38 |
for idx, q in enumerate(data, start=1):
|
| 39 |
q["id"] = next_id
|
| 40 |
next_id += 1
|
| 41 |
+
q["subject"] = subj_key
|
| 42 |
+
q["question_type"] = q.get("question_type", "mcq")
|
| 43 |
+
q["difficulty"] = q.get("difficulty", 3)
|
| 44 |
+
q["source"] = "pastpaper"
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
if answers.get(idx):
|
|
|
|
| 47 |
try:
|
| 48 |
+
q["correct_answer"] = q["choices"][ord(answers[idx]) - ord("A")]
|
| 49 |
except Exception:
|
| 50 |
q["correct_answer"] = ""
|
| 51 |
missing_answers.append((year, idx, q["text"]))
|
| 52 |
else:
|
| 53 |
+
if not q.get("correct_answer"):
|
|
|
|
| 54 |
missing_answers.append((year, idx, q["text"][:50]))
|
| 55 |
|
| 56 |
all_questions.append(q)
|
| 57 |
|
|
|
|
| 58 |
with open(output_file, "w", encoding="utf-8") as f:
|
| 59 |
json.dump(all_questions, f, indent=2, ensure_ascii=False)
|
| 60 |
|
| 61 |
+
print(f"✅ Merged {len(all_questions)} questions into {output_file}")
|
|
|
|
|
|
|
| 62 |
if missing_answers:
|
| 63 |
+
print(f"⚠️ {len(missing_answers)} questions missing answers.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
if __name__ == "__main__":
|
| 66 |
merge_question_files()
|
| 67 |
+
|