Benny-Tang committed on
Commit
dd7996c
·
verified ·
1 Parent(s): 30ace27

Update merge_questions.py

Browse files
Files changed (1) hide show
  1. merge_questions.py +23 -59
merge_questions.py CHANGED
@@ -2,26 +2,11 @@ import json
2
  import glob
3
  import re
4
  import os
5
- import pdfplumber
6
 
7
  DATA_DIR = "data"
8
  OUTPUT_FILE = "questions.json"
9
 
10
- def extract_answers_from_pdf(pdf_path):
11
- """Extract answers like '1. B' from a PDF answer scheme"""
12
- answers = {}
13
- with pdfplumber.open(pdf_path) as pdf:
14
- for page in pdf.pages:
15
- text = page.extract_text()
16
- if not text:
17
- continue
18
- for match in re.findall(r"(\d+)\.\s*([A-D])", text):
19
- qnum, ans = match
20
- answers[int(qnum)] = ans
21
- return answers
22
-
23
  def load_answers_from_json(json_path):
24
- """Load answers from a JSON answer scheme: { '1': 'A', '2': 'C' }"""
25
  with open(json_path, "r", encoding="utf-8") as f:
26
  return {int(k): v for k, v in json.load(f).items()}
27
 
@@ -30,74 +15,53 @@ def merge_question_files(data_dir=DATA_DIR, output_file=OUTPUT_FILE):
30
  next_id = 1000
31
  missing_answers = []
32
 
33
- # Find all question JSON files
34
- question_files = glob.glob(os.path.join(data_dir, "spm_*.json"))
35
 
36
  for qfile in question_files:
37
- year = re.search(r"spm_(\d+)\.json", qfile)
38
- if not year:
 
 
 
39
  continue
40
- year = year.group(1)
41
-
42
- # Look for corresponding answer scheme
43
- answers = {}
44
- scheme_pdf = os.path.join(data_dir, f"spm_{year}_scheme.pdf")
45
- scheme_json = os.path.join(data_dir, f"spm_{year}_scheme.json")
46
-
47
- if os.path.exists(scheme_pdf):
48
- answers = extract_answers_from_pdf(scheme_pdf)
49
- print(f"✅ Loaded {len(answers)} answers from {scheme_pdf}")
50
- elif os.path.exists(scheme_json):
51
- answers = load_answers_from_json(scheme_json)
52
- print(f"✅ Loaded {len(answers)} answers from {scheme_json}")
53
- else:
54
- print(f"⚠️ No answer scheme found for {year}, leaving blank.")
55
-
56
- # Process questions
57
  with open(qfile, "r", encoding="utf-8") as f:
58
  data = json.load(f)
59
 
60
  for idx, q in enumerate(data, start=1):
61
  q["id"] = next_id
62
  next_id += 1
 
 
 
 
63
 
64
- # Normalize subject (drop year suffix if any)
65
- if "_" in q["subject"]:
66
- parts = q["subject"].split("_")
67
- if len(parts) > 2 and parts[-1].isdigit():
68
- q["subject"] = "_".join(parts[:-1])
69
-
70
- # Assign correct answer if available
71
  if answers.get(idx):
72
- letter = answers[idx]
73
  try:
74
- q["correct_answer"] = q["choices"][ord(letter) - ord("A")]
75
  except Exception:
76
  q["correct_answer"] = ""
77
  missing_answers.append((year, idx, q["text"]))
78
  else:
79
- q["correct_answer"] = q.get("correct_answer", "")
80
- if not q["correct_answer"]:
81
  missing_answers.append((year, idx, q["text"][:50]))
82
 
83
  all_questions.append(q)
84
 
85
- # Save merged file
86
  with open(output_file, "w", encoding="utf-8") as f:
87
  json.dump(all_questions, f, indent=2, ensure_ascii=False)
88
 
89
- print(f"\n🎉 Merged {len(all_questions)} questions into {output_file}")
90
-
91
- # Report missing answers
92
  if missing_answers:
93
- print("\n⚠️ Missing correct answers for these questions:")
94
- for year, idx, snippet in missing_answers[:20]: # show first 20 only
95
- print(f" Year {year}, Q{idx}: {snippet}...")
96
- if len(missing_answers) > 20:
97
- print(f" ... and {len(missing_answers)-20} more")
98
- else:
99
- print("✅ All questions have correct answers!")
100
-
101
 
102
  if __name__ == "__main__":
103
  merge_question_files()
 
 
2
  import glob
3
  import re
4
  import os
 
5
 
6
  DATA_DIR = "data"
7
  OUTPUT_FILE = "questions.json"
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def load_answers_from_json(json_path):
 
10
  with open(json_path, "r", encoding="utf-8") as f:
11
  return {int(k): v for k, v in json.load(f).items()}
12
 
 
15
  next_id = 1000
16
  missing_answers = []
17
 
18
+ question_files = glob.glob(os.path.join(data_dir, "spm_*_*.json"))
 
19
 
20
  for qfile in question_files:
21
+ if qfile.endswith("_scheme.json"):
22
+ continue
23
+
24
+ m = re.match(r".*spm_(\d+)_(\w+)\.json", qfile)
25
+ if not m:
26
  continue
27
+ year, subject = m.groups()
28
+
29
+ subj = subject # keep case as in filename (BM, English, etc.)
30
+ subj_key = f"Form5_{subj}"
31
+
32
+ scheme_file = os.path.join(data_dir, f"spm_{year}_{subject}_scheme.json")
33
+ answers = load_answers_from_json(scheme_file) if os.path.exists(scheme_file) else {}
34
+
 
 
 
 
 
 
 
 
 
35
  with open(qfile, "r", encoding="utf-8") as f:
36
  data = json.load(f)
37
 
38
  for idx, q in enumerate(data, start=1):
39
  q["id"] = next_id
40
  next_id += 1
41
+ q["subject"] = subj_key
42
+ q["question_type"] = q.get("question_type", "mcq")
43
+ q["difficulty"] = q.get("difficulty", 3)
44
+ q["source"] = "pastpaper"
45
 
 
 
 
 
 
 
 
46
  if answers.get(idx):
 
47
  try:
48
+ q["correct_answer"] = q["choices"][ord(answers[idx]) - ord("A")]
49
  except Exception:
50
  q["correct_answer"] = ""
51
  missing_answers.append((year, idx, q["text"]))
52
  else:
53
+ if not q.get("correct_answer"):
 
54
  missing_answers.append((year, idx, q["text"][:50]))
55
 
56
  all_questions.append(q)
57
 
 
58
  with open(output_file, "w", encoding="utf-8") as f:
59
  json.dump(all_questions, f, indent=2, ensure_ascii=False)
60
 
61
+ print(f" Merged {len(all_questions)} questions into {output_file}")
 
 
62
  if missing_answers:
63
+ print(f"⚠️ {len(missing_answers)} questions missing answers.")
 
 
 
 
 
 
 
64
 
65
  if __name__ == "__main__":
66
  merge_question_files()
67
+