Benny-Tang committed on
Commit
a593d2d
·
verified ·
1 Parent(s): b3687e7

Update merge_questions.py

Browse files
Files changed (1) hide show
  1. merge_questions.py +17 -101
merge_questions.py CHANGED
@@ -1,112 +1,28 @@
1
- # merge_questions.py
2
  import os
3
  import json
4
- import re
5
- from pathlib import Path
6
 
7
  DATA_DIR = "data"
8
  OUTPUT_FILE = "questions.json"
9
 
10
- VALID_SUBJECT_TOKENS = {
11
- "bm": "BM",
12
- "english": "English",
13
- "math": "Math",
14
- "history": "History",
15
- "science": "Science",
16
- "moralstudies": "MoralStudies",
17
- "accounting": "Accounting",
18
- "economics": "Economics",
19
- "business": "Business",
20
- }
21
-
22
- YEARS = [str(y) for y in range(2018, 2025)]
23
-
24
-
25
- def safe_load(path):
26
- try:
27
- with open(path, "r", encoding="utf-8") as f:
28
- return json.load(f)
29
- except Exception:
30
- return None
31
-
32
-
33
- def merge_all(data_dir=DATA_DIR, output_file=OUTPUT_FILE):
34
- os.makedirs(data_dir, exist_ok=True)
35
- merged = []
36
- next_id = 1000
37
-
38
- p = Path(data_dir)
39
- files = sorted(p.glob("spm_*_*.json"))
40
-
41
- for file in files:
42
- fname = file.name
43
- if fname.lower().endswith("_scheme.json"):
44
- continue
45
- m = re.match(r"spm[_\-](\d{4})[_\-]([a-zA-Z]+)\.json$", fname, re.IGNORECASE)
46
- if not m:
47
- print(f"Skipping non-matching file: {fname}")
48
- continue
49
- year = m.group(1)
50
- subj_token = m.group(2).lower()
51
- if year not in YEARS:
52
- print(f"Skipping year outside 2018-2024: {fname}")
53
- continue
54
- if subj_token not in VALID_SUBJECT_TOKENS:
55
- print(f"Unknown subject token '{subj_token}' in {fname}; skipping.")
56
- continue
57
- subj_display = VALID_SUBJECT_TOKENS[subj_token]
58
- subj_key = f"Form5_{subj_display}"
59
-
60
- qlist = safe_load(str(file))
61
- if not isinstance(qlist, list):
62
- print(f"Skipping {fname}: not a JSON list.")
63
- continue
64
-
65
- scheme_path = file.with_name(f"{file.stem}_scheme.json")
66
- scheme = safe_load(str(scheme_path)) or {}
67
-
68
- for idx, q in enumerate(qlist, start=1):
69
- merged_q = {
70
- "id": next_id,
71
- "subject": subj_key,
72
- "question_type": q.get("question_type", "mcq"),
73
- "text": q.get("text", "").strip(),
74
- "choices": q.get("choices", []),
75
- "topics": q.get("topics", []),
76
- "difficulty": q.get("difficulty", 3),
77
- "source": q.get("source", "pastpaper"),
78
- "year": int(year)
79
- }
80
-
81
- correct_answer = None
82
- val = scheme.get(str(idx)) if isinstance(scheme, dict) else None
83
- if isinstance(val, str):
84
- v = val.strip()
85
- if len(v) == 1 and merged_q["choices"]:
86
- pos = ord(v.upper()) - ord("A")
87
- if 0 <= pos < len(merged_q["choices"]):
88
- correct_answer = merged_q["choices"][pos]
89
- else:
90
- correct_answer = None
91
- else:
92
- correct_answer = v if v else None
93
- elif isinstance(val, dict):
94
- correct_answer = val.get("correct_answer")
95
- else:
96
- correct_answer = None
97
-
98
- merged_q["correct_answer"] = correct_answer
99
- merged.append(merged_q)
100
- next_id += 1
101
-
102
- with open(output_file, "w", encoding="utf-8") as f:
103
- json.dump(merged, f, indent=2, ensure_ascii=False)
104
-
105
- print(f"✅ Merged {len(merged)} questions into {output_file}")
106
-
107
 
108
  if __name__ == "__main__":
109
- merge_all()
 
110
 
111
 
112
 
 
 
1
  import os
2
  import json
 
 
3
 
4
  DATA_DIR = "data"
5
  OUTPUT_FILE = "questions.json"
6
 
7
def merge_json_files(data_dir=None, output_file=None):
    """Merge every JSON list found in a directory into a single JSON file.

    Each ``*.json`` file directly inside ``data_dir`` is parsed; when its
    top-level value is a list, its items are appended to the merged result.
    Files are processed in sorted filename order so the output is
    reproducible from run to run.

    Args:
        data_dir: Directory to scan. Defaults to the module-level
            ``DATA_DIR`` when omitted.
        output_file: Path of the merged JSON file to write. Defaults to the
            module-level ``OUTPUT_FILE`` when omitted.

    Raises:
        OSError: If ``data_dir`` cannot be listed (for example, it does not
            exist) or ``output_file`` cannot be written. Failures on
            individual input files are reported and skipped instead.
    """
    # Resolve defaults at call time so the module constants remain the
    # single source of truth for the no-argument call.
    if data_dir is None:
        data_dir = DATA_DIR
    if output_file is None:
        output_file = OUTPUT_FILE

    output_abs = os.path.abspath(output_file)
    all_questions = []

    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for fname in sorted(os.listdir(data_dir)):
        if not fname.endswith(".json"):
            continue
        fpath = os.path.join(data_dir, fname)
        # Never feed a previous merge result back into itself when the
        # output file lives inside the data directory.
        if os.path.abspath(fpath) == output_abs:
            continue
        try:
            with open(fpath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"⚠️ Error reading {fname}: {e}")
            continue
        if isinstance(data, list):
            all_questions.extend(data)
        else:
            print(
                f"⚠️ Skipping {fname}: expected a JSON list, "
                f"got {type(data).__name__}"
            )

    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(all_questions, out, ensure_ascii=False, indent=2)
    print(f"✅ Merged {len(all_questions)} questions into {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  if __name__ == "__main__":
24
+ merge_json_files()
25
+
26
 
27
 
28