Benny-Tang committed on
Commit
09e298e
·
verified ·
1 Parent(s): 8f96aba

Update merge_questions.py

Browse files
Files changed (1) hide show
  1. merge_questions.py +115 -30
merge_questions.py CHANGED
@@ -1,48 +1,133 @@
1
  import os
2
  import json
 
 
3
 
4
# Directory that is scanned for per-paper question JSON files.
DATA_DIR = "data"

# Path of the merged question file written by the merge step.
OUTPUT_PATH = "questions.json"
6
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
def load_json_file(path):
    """Load and return the JSON document stored at *path*.

    Args:
        path: Filesystem path of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value, or an empty list when the file cannot be
        read or parsed (the failure is reported on stdout).
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as exc:
        # OSError covers missing/unreadable paths; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. Report the problem so a
        # broken file is not mistaken for an empty one, then keep going.
        print(f"⚠️ Could not load {path}: {exc}")
        return []
14
 
15
 
16
def merge_json_files():
    """Merge every ``*.json`` list in ``DATA_DIR`` into ``OUTPUT_PATH``.

    Files are processed in sorted name order so that the surviving copy of a
    duplicated question (and the output order) is the same on every run.
    Questions are de-duplicated by their ``"id"`` value; questions without an
    id are all kept, because a missing id cannot identify a duplicate.
    Entries that are not JSON objects are skipped with a warning.

    NOTE(review): ids are assumed to be hashable JSON scalars (numbers or
    strings) -- confirm against the data files.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    merged = []
    for fname in sorted(os.listdir(DATA_DIR)):
        if not fname.endswith(".json"):
            continue
        data = load_json_file(os.path.join(DATA_DIR, fname))
        if not isinstance(data, list):
            print(f"⚠️ Skipped invalid file: {fname}")
            continue
        merged.extend(data)

    # Deduplicate by question ID, keeping the first occurrence.
    seen = set()
    unique = []
    for q in merged:
        if not isinstance(q, dict):
            print(f"⚠️ Skipped non-object entry: {q!r}")
            continue
        qid = q.get("id")
        if qid is not None:
            if qid in seen:
                continue
            seen.add(qid)
        unique.append(q)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(unique, f, indent=2, ensure_ascii=False)

    print(f"✅ Merged {len(unique)} questions into {OUTPUT_PATH}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
 
44
# Script entry point: run the merge when the file is executed directly.
if __name__ == "__main__":
    merge_json_files()
 
46
 
47
 
48
 
 
1
  import os
2
  import json
3
+ import re
4
+ from pathlib import Path
5
 
6
# Directory that is scanned for per-paper question JSON files.
DATA_DIR = "data"

# Default destination of the merged question file.
OUTPUT_FILE = "questions.json"

# Lowercase subject token found in a data filename -> display name.
VALID_SUBJECT_TOKENS = dict(
    bm="BM",
    english="English",
    math="Math",
    history="History",
    science="Science",
    moralstudies="MoralStudies",
    accounting="Accounting",
    economics="Economics",
    business="Business",
)

# Accepted paper years, kept as strings so they compare directly with the
# year token captured from a filename.
YEARS = list(map(str, range(2018, 2025)))
23
+
24
+
25
def safe_load_json(path):
    """Load the JSON document at *path* without raising on bad input.

    Args:
        path: Filesystem path of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value, or ``None`` when the file is missing,
        unreadable, or not valid JSON.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # A missing file is an ordinary outcome when probing for an optional
        # file, so it is not reported.
        return None
    except (OSError, ValueError, RecursionError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # RecursionError is what the decoder raises on absurdly deep nesting.
        # Report these so a broken file is not mistaken for a missing one.
        print(f"⚠️ Could not load {path}: {exc}")
        return None
31
 
32
 
33
# Paper filenames look like spm_<year>_<subject>.json; "-" is also accepted as
# a separator and matching ignores case. Compiled once, outside the file loop.
_SPM_FILENAME_RE = re.compile(
    r"spm[_\-](\d{4})[_\-]([a-zA-Z]+)\.json$", re.IGNORECASE
)


def _find_paper_files(data_dir):
    """Return the sorted entries of *data_dir* named like ``spm*.json`` (any case)."""
    return sorted(
        entry
        for entry in Path(data_dir).iterdir()
        if entry.name.lower().startswith("spm")
        and entry.name.lower().endswith(".json")
    )


def _resolve_correct_answer(scheme, idx, choices):
    """Return the answer text *scheme* records for question *idx*, or ``None``.

    Two scheme value formats are understood:
      1) a single letter ("A", "B", ...) that indexes into *choices*;
      2) ``{"correct_answer": "<choice text>"}``.
    Any other string is treated as the answer text itself.
    """
    if not isinstance(scheme, dict):
        return None

    val = scheme.get(str(idx)) or scheme.get(idx)
    if isinstance(val, dict):
        return val.get("correct_answer")
    if not isinstance(val, str):
        return None

    # Only a real list can be indexed by letter position.
    options = choices if isinstance(choices, list) else []
    letter = val.strip().upper()
    if len(letter) == 1 and options:
        pos = ord(letter) - ord("A")
        # A letter beyond the available choices yields no answer.
        return options[pos] if 0 <= pos < len(options) else None

    return val if val else None


def merge_all(data_dir=DATA_DIR, output_file=OUTPUT_FILE):
    """Merge all past-paper question files in *data_dir* into *output_file*.

    Every file named ``spm_<year>_<subject>.json`` (case-insensitive, ``-``
    allowed as separator) whose year is in ``YEARS`` and whose subject is in
    ``VALID_SUBJECT_TOKENS`` is read as a list of question objects. Each
    question is normalized, tagged with ``subject`` (``Form5_<Display>``) and
    ``year``, and given a sequential ``id`` starting at 1000 in sorted-file
    order. If a sibling ``<stem>_scheme.json`` exists, it supplies
    ``correct_answer`` keyed by the question's 1-based position in its file.

    Args:
        data_dir: Directory to scan; created if it does not exist.
        output_file: Path of the merged JSON file (overwritten).
    """
    os.makedirs(data_dir, exist_ok=True)
    merged = []
    next_id = 1000

    for file in _find_paper_files(data_dir):
        fname = file.name
        # Scheme files are consumed alongside their question file below.
        if fname.lower().endswith("_scheme.json"):
            continue

        m = _SPM_FILENAME_RE.match(fname)
        if not m:
            print(f"Skipping non-matching file: {fname}")
            continue
        year_token = m.group(1)
        subj_token = m.group(2).lower()

        if year_token not in YEARS:
            print(f"Skipping year outside 2018-2024: {fname}")
            continue

        if subj_token not in VALID_SUBJECT_TOKENS:
            print(f"Unknown subject token '{subj_token}' in {fname}; skipping.")
            continue

        subj_key = f"Form5_{VALID_SUBJECT_TOKENS[subj_token]}"

        qlist = safe_load_json(str(file))
        if not isinstance(qlist, list):
            print(f"Skipping {fname}: not a JSON list.")
            continue

        scheme_path = file.with_name(f"{file.stem}_scheme.json")
        scheme = safe_load_json(str(scheme_path)) or {}

        for idx, q in enumerate(qlist, start=1):
            # A malformed entry must not abort the whole merge. idx still
            # advances so later questions keep their scheme positions.
            if not isinstance(q, dict):
                print(f"Skipping entry {idx} in {fname}: not a JSON object.")
                continue

            raw_text = q.get("text", "")
            text = "" if raw_text is None else str(raw_text).strip()
            choices = q.get("choices", [])

            merged.append(
                {
                    "id": next_id,
                    "subject": subj_key,
                    "question_type": q.get("question_type", "mcq"),
                    "text": text,
                    "choices": choices,
                    "topics": q.get("topics", []),
                    "difficulty": q.get("difficulty", 3),
                    "source": q.get("source", "pastpaper"),
                    "year": int(year_token),
                    "correct_answer": _resolve_correct_answer(scheme, idx, choices),
                }
            )
            next_id += 1

    # Overwrite any previous merged file.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

    print(f"✅ Merged {len(merged)} questions into {output_file}")
126
 
127
 
128
# Script entry point: merge using the default data directory and output file.
if __name__ == "__main__":
    merge_all()
130
+
131
 
132
 
133