Benny-Tang committed on
Commit
cdc6db0
·
verified ·
1 Parent(s): 467a83a

Update merge_questions.py

Browse files
Files changed (1) hide show
  1. merge_questions.py +98 -14
merge_questions.py CHANGED
@@ -1,30 +1,114 @@
 
1
  import os
2
  import json
 
3
 
4
- DATA_DIR = "data"
5
- OUTPUT_FILE = "questions.json"
 
6
 
7
def merge_json_files():
    """Merge every "paper2" JSON list in ``DATA_DIR`` into ``OUTPUT_FILE``.

    A file is considered when its name ends with ``.json`` and contains
    ``paper2`` (case-insensitive). Files are visited in sorted filename
    order: ``os.listdir`` returns entries in arbitrary order, which would
    otherwise make the order of the merged output differ between runs.

    Empty files, unreadable files and files holding invalid JSON are
    reported and skipped. Files whose top-level JSON value is not a list
    are reported and skipped as well. The merged list is written as
    pretty-printed UTF-8 JSON.
    """
    all_questions = []
    for fname in sorted(os.listdir(DATA_DIR)):
        if not (fname.endswith(".json") and "paper2" in fname.lower()):
            continue
        fpath = os.path.join(DATA_DIR, fname)
        try:
            if os.path.getsize(fpath) == 0:
                print(f"⚠️ Skipping empty file: {fname}")
                continue
            with open(fpath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            # Deliberate best effort: one bad file must not abort the merge.
            print(f"⚠️ Error reading {fname}: {e}")
            continue
        if isinstance(data, list):
            all_questions.extend(data)
        else:
            print(f"⚠️ Skipping {fname}: expected a JSON list, got {type(data).__name__}")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        json.dump(all_questions, out, ensure_ascii=False, indent=2)
    print(f"✅ Merged {len(all_questions)} Paper 2 questions into {OUTPUT_FILE}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  if __name__ == "__main__":
27
- merge_json_files()
 
28
 
29
 
30
 
 
1
+ # merge_questions.py
2
  import os
3
  import json
4
+ import sqlite3
5
 
6
+ PROCESSED_DIR = "data/processed"
7
+ QUESTIONS_FILE = "questions.json"
8
+ DB_FILE = "exam.db"
9
 
10
def load_processed_files():
    """Load and concatenate every JSON list stored in ``PROCESSED_DIR``.

    Files are visited in sorted filename order. ``os.listdir`` returns
    entries in arbitrary order, so without sorting the position of each
    item in the result (and anything derived from that position) could
    change between runs or platforms.

    Only names ending in ``.json`` are read. Empty files, unreadable files
    and files with invalid JSON are reported and skipped; so are files
    whose top-level JSON value is not a list.

    Returns:
        list: the items of all accepted files, in filename order. Empty
        when ``PROCESSED_DIR`` does not exist or is not a directory.
    """
    items = []
    if not os.path.isdir(PROCESSED_DIR):
        return items

    for fname in sorted(os.listdir(PROCESSED_DIR)):
        if not fname.endswith(".json"):
            continue
        fpath = os.path.join(PROCESSED_DIR, fname)
        try:
            if os.path.getsize(fpath) == 0:
                print(f"Skipping empty file {fname}")
                continue
            with open(fpath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error reading {fname}: {e}")
            continue
        if isinstance(data, list):
            items.extend(data)
        else:
            print(f"Skipping {fname}: expected a JSON list, got {type(data).__name__}")
    return items
28
+
29
def _coerce_int(value, default):
    """Return ``int(value)``, or *default* when value is falsy or not parseable."""
    if not value:
        return default
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default


def normalize_question(q, next_id):
    """Return a copy of question *q* with a fixed set of fields.

    The result always has the keys ``id``, ``text``, ``choices``,
    ``answer``, ``subject``, ``paper``, ``year``, ``image`` and ``source``.

    Args:
        q: mapping describing one question (must provide ``.get``).
        next_id: id to use when *q* has no usable ``id``.

    Returns:
        dict: the normalized question.
            * ``id``: ``q["id"]`` as int, else *next_id*.
            * ``text``: ``q["text"]``, else ``q["question"]``, else ``""``.
            * ``choices``: a list. A string value is split on newlines,
              blank lines are dropped and each line is passed through
              ``re_sub_strip_choice``; a list is copied; anything else
              becomes ``[]``.
            * ``answer``: ``q["answer"]`` unchanged (``""`` when absent).
            * ``subject``: ``q["subject"]``, else ``q["source_subject"]``,
              else ``"Unknown"``.
            * ``paper`` / ``year``: ints, defaulting to ``2`` / ``0``.
            * ``image``: ``q["image"]`` or ``None``.
            * ``source``: ``q["source"]`` or ``""``.

    Values of ``id``, ``paper`` and ``year`` that cannot be converted to
    int (for example ``"Paper 2"``) fall back to their defaults instead of
    raising, so one malformed field does not discard the whole question.
    """
    nq = {}
    nq["id"] = _coerce_int(q.get("id"), int(next_id))
    nq["text"] = q.get("text") or q.get("question") or ""

    c = q.get("choices", [])
    if isinstance(c, str):
        # One choice per non-blank line, with any leading "A." style label removed.
        nq["choices"] = [
            re_sub_strip_choice(ln)
            for ln in (raw.strip() for raw in c.splitlines())
            if ln
        ]
    elif isinstance(c, list):
        # Copy so later edits to the result do not mutate the input item.
        nq["choices"] = list(c)
    else:
        nq["choices"] = []

    nq["answer"] = q.get("answer", "")
    nq["subject"] = q.get("subject") or q.get("source_subject") or "Unknown"
    nq["paper"] = _coerce_int(q.get("paper", 2), 2)
    nq["year"] = _coerce_int(q.get("year", 0), 0)
    nq["image"] = q.get("image")
    nq["source"] = q.get("source") or ""
    return nq
58
+
59
def re_sub_strip_choice(s):
    """Remove a leading choice label (``A``-``D``) from one choice line.

    A label is a single capital letter A-D at the start of the string,
    optionally followed by one of ``.``, ``)`` or ``-`` and any whitespace,
    e.g. ``"A. Paris"``, ``"B) 42"``, ``"C - x"`` or ``"D text"``.

    The letter is only treated as a label when it is not immediately
    followed by an ASCII letter or digit. Without that check the first
    character of ordinary words was stripped (``"Apple"`` -> ``"pple"``).

    Args:
        s: the raw choice text.

    Returns:
        str: the text without its label, stripped of surrounding whitespace.
    """
    import re
    return re.sub(r'^[A-D](?![A-Za-z0-9])[\.\)\-]?\s*', '', s).strip()
62
+
63
def merge():
    """Normalize all processed questions, then write them to JSON and SQLite.

    Steps:
        1. Load raw items with ``load_processed_files``.
        2. Normalize each item with ``normalize_question``; items that raise
           are reported and skipped.
        3. Overwrite ``QUESTIONS_FILE`` with the normalized list as
           pretty-printed UTF-8 JSON.
        4. Create the ``questions`` table in ``DB_FILE`` if needed and add
           each question with ``INSERT OR IGNORE``, so a row whose ``id``
           already exists is left untouched. ``choices`` is stored as a
           JSON string.

    The database connection is closed even when a statement fails.
    """
    merged = []
    # Fallback ids start at 100001 and advance for every item, including
    # items that are later skipped.
    for next_id, it in enumerate(load_processed_files(), start=100001):
        try:
            merged.append(normalize_question(it, next_id))
        except Exception as e:
            # Deliberate best effort: one bad item must not abort the merge.
            print(f"Skipping item due to normalize error: {e}")

    # write questions.json
    with open(QUESTIONS_FILE, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"✅ Merged {len(merged)} questions into {QUESTIONS_FILE}")

    # update sqlite DB
    conn = sqlite3.connect(DB_FILE)
    try:
        cur = conn.cursor()
        cur.execute("""
        CREATE TABLE IF NOT EXISTS questions (
            id INTEGER PRIMARY KEY,
            text TEXT,
            choices TEXT,
            answer TEXT,
            subject TEXT,
            paper INTEGER,
            year INTEGER,
            image TEXT,
            source TEXT
        )
        """)
        conn.commit()
        # Counts statements that ran without error; rows ignored because
        # their id already exists are included in this count.
        inserted = 0
        for q in merged:
            try:
                cur.execute("INSERT OR IGNORE INTO questions (id,text,choices,answer,subject,paper,year,image,source) VALUES (?,?,?,?,?,?,?,?,?)",
                            (q["id"], q["text"], json.dumps(q["choices"], ensure_ascii=False), q["answer"], q["subject"], q["paper"], q["year"], q["image"], q["source"]))
                inserted += 1
            except Exception as e:
                # Keep going: a value that cannot be bound affects only its row.
                print("DB insert error:", e)
        conn.commit()
    finally:
        conn.close()
    print(f"✅ Inserted or ignored {inserted} rows into {DB_FILE}")
108
 
109
  if __name__ == "__main__":
110
+ merge()
111
+
112
 
113
 
114