Spaces:
Runtime error
Runtime error
Update merge_questions.py
Browse files- merge_questions.py +98 -14
merge_questions.py
CHANGED
|
@@ -1,30 +1,114 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
|
|
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
|
|
|
| 6 |
|
| 7 |
-
def
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
| 12 |
try:
|
| 13 |
if os.path.getsize(fpath) == 0:
|
| 14 |
-
print(f"
|
| 15 |
continue
|
| 16 |
with open(fpath, "r", encoding="utf-8") as f:
|
| 17 |
data = json.load(f)
|
| 18 |
if isinstance(data, list):
|
| 19 |
-
|
| 20 |
except Exception as e:
|
| 21 |
-
print(f"
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
if __name__ == "__main__":
|
| 27 |
-
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
|
|
|
|
| 1 |
+
# merge_questions.py
|
| 2 |
import os
|
| 3 |
import json
|
| 4 |
+
import sqlite3
|
| 5 |
|
| 6 |
+
# Directory that load_processed_files() lists for "*.json" input files.
# Relative path, so it is resolved against the current working directory.
PROCESSED_DIR = "data/processed"
|
| 7 |
+
# Output file that merge() overwrites with the merged question list (JSON).
QUESTIONS_FILE = "questions.json"
|
| 8 |
+
# SQLite database file that merge() opens and fills with the merged questions.
DB_FILE = "exam.db"
|
| 9 |
|
| 10 |
+
def load_processed_files(processed_dir=None):
    """Collect question records from every ``*.json`` file in a directory.

    Each file is expected to contain a JSON list; its elements are appended
    to the result. Files are visited in sorted file-name order so the
    returned order is reproducible across runs and platforms.

    Empty files, unreadable files, files with invalid JSON and files whose
    top-level value is not a list are reported on stdout and skipped; one
    bad file never aborts the whole scan.

    Args:
        processed_dir: Directory to scan. When omitted, the module-level
            ``PROCESSED_DIR`` is used.

    Returns:
        A list holding the elements of every list-valued JSON file, or an
        empty list when the directory does not exist.
    """
    if processed_dir is None:
        processed_dir = PROCESSED_DIR

    items = []
    # isdir() rather than exists(): a plain file at this path cannot be listed.
    if not os.path.isdir(processed_dir):
        return items

    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for fname in sorted(os.listdir(processed_dir)):
        if not fname.endswith(".json"):
            continue
        fpath = os.path.join(processed_dir, fname)
        try:
            if os.path.getsize(fpath) == 0:
                print(f"Skipping empty file {fname}")
                continue
            with open(fpath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error reading {fname}: {e}")
            continue

        if isinstance(data, list):
            items.extend(data)
        else:
            print(
                f"Skipping {fname}: expected a JSON list, "
                f"got {type(data).__name__}"
            )
    return items
|
| 28 |
+
|
| 29 |
+
def normalize_question(q, next_id):
    """Build a question dict with a fixed set of fields from a raw record.

    The result always has the keys ``id``, ``text``, ``choices``, ``answer``,
    ``subject``, ``paper``, ``year``, ``image`` and ``source``.

    Args:
        q: Raw question record; must provide a dict-style ``get`` method.
        next_id: Id to use when ``q`` has no usable ``id``.

    Returns:
        A new dict:
          * ``id``: ``q["id"]`` as an int, else ``next_id``.
          * ``text``: ``q["text"]``, else ``q["question"]``, else ``""``.
          * ``choices``: a list. A string value is split into non-blank
            lines and each line is passed through ``re_sub_strip_choice``;
            a list value is copied; anything else becomes ``[]``.
          * ``answer``: passed through unchanged (``""`` when absent).
          * ``subject``: ``q["subject"]``, else ``q["source_subject"]``,
            else ``"Unknown"``.
          * ``paper`` / ``year``: ints, defaulting to 2 and 0.
          * ``image``: passed through unchanged (``None`` when absent).
          * ``source``: ``q["source"]`` or ``""``.

    Raises:
        AttributeError: If ``q`` has no ``get`` method (e.g. it is not a dict).
    """

    def _to_int(value, default):
        # Missing, empty and zero values fall back to the default. A value
        # that cannot be parsed (e.g. "Paper 1", "2019/2020") does too, so
        # one malformed numeric field does not discard the whole question.
        try:
            return int(value or default)
        except (TypeError, ValueError, OverflowError):
            return default

    raw_choices = q.get("choices", [])
    if isinstance(raw_choices, str):
        stripped = (line.strip() for line in raw_choices.splitlines())
        choices = [re_sub_strip_choice(line) for line in stripped if line]
    elif isinstance(raw_choices, list):
        # Copy so later edits to the result never touch the input record.
        choices = list(raw_choices)
    else:
        choices = []

    return {
        "id": _to_int(q.get("id"), next_id),
        "text": q.get("text") or q.get("question") or "",
        "choices": choices,
        "answer": q.get("answer", ""),
        "subject": q.get("subject") or q.get("source_subject") or "Unknown",
        "paper": _to_int(q.get("paper"), 2),
        "year": _to_int(q.get("year"), 0),
        "image": q.get("image"),
        "source": q.get("source") or "",
    }
|
| 58 |
+
|
| 59 |
+
def re_sub_strip_choice(s):
    """Remove a leading choice label (A-D) from one answer-choice line.

    A label is an uppercase letter A-D at the start of the string followed
    by one of ``.``, ``)`` or ``-`` (plus optional whitespace), by
    whitespace alone, or by the end of the string. Requiring one of these
    keeps ordinary words that merely start with A-D intact: "Apple" stays
    "Apple" instead of becoming "pple".

    Args:
        s: A single choice line, e.g. ``"B) London"``.

    Returns:
        The line without its label, with surrounding whitespace stripped.
    """
    import re

    return re.sub(r'^[A-D](?:[.)\-]\s*|\s+|$)', '', s).strip()
|
| 62 |
+
|
| 63 |
+
def merge():
    """Merge all processed questions into the JSON file and the SQLite DB.

    Steps:
      1. Load raw records with ``load_processed_files()`` and normalize each
         one with ``normalize_question()``. A record that fails to normalize
         is reported and skipped.
      2. Overwrite ``QUESTIONS_FILE`` with the normalized list as JSON.
      3. Create the ``questions`` table in ``DB_FILE`` if needed and run
         ``INSERT OR IGNORE`` for every question, so a row whose id already
         exists is left untouched.

    The fallback id counter starts at 100000 and is advanced for every
    record, including records that carry their own id.

    Returns:
        None. Progress is reported on stdout.
    """
    merged = []
    next_id = 100000
    for it in load_processed_files():
        next_id += 1
        try:
            merged.append(normalize_question(it, next_id))
        except Exception as e:
            # Deliberately best-effort: one bad record must not stop the run.
            print(f"Skipping item due to normalize error: {e}")

    # write questions.json
    with open(QUESTIONS_FILE, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"✅ Merged {len(merged)} questions into {QUESTIONS_FILE}")

    # update sqlite DB
    conn = sqlite3.connect(DB_FILE)
    try:
        cur = conn.cursor()
        cur.execute("""
            CREATE TABLE IF NOT EXISTS questions (
                id INTEGER PRIMARY KEY,
                text TEXT,
                choices TEXT,
                answer TEXT,
                subject TEXT,
                paper INTEGER,
                year INTEGER,
                image TEXT,
                source TEXT
            )
        """)
        conn.commit()

        # Counts statements that ran without error; because of OR IGNORE
        # this includes rows skipped for an already-present id.
        inserted = 0
        for q in merged:
            try:
                cur.execute(
                    "INSERT OR IGNORE INTO questions (id,text,choices,answer,subject,paper,year,image,source) VALUES (?,?,?,?,?,?,?,?,?)",
                    (
                        q["id"],
                        q["text"],
                        # The choice list is stored as a JSON string.
                        json.dumps(q["choices"], ensure_ascii=False),
                        q["answer"],
                        q["subject"],
                        q["paper"],
                        q["year"],
                        q["image"],
                        q["source"],
                    ),
                )
                inserted += 1
            except Exception as e:
                # Best-effort per row (e.g. a value SQLite cannot bind).
                print("DB insert error:", e)
        conn.commit()
    finally:
        # Always release the connection, even if table creation or a
        # commit raised.
        conn.close()
    print(f"✅ Inserted or ignored {inserted} rows into {DB_FILE}")
|
| 108 |
|
| 109 |
# Script entry point: run the merge only when this file is executed directly.
if __name__ == "__main__":
|
| 110 |
+
merge()
|
| 111 |
+
|
| 112 |
|
| 113 |
|
| 114 |
|