cosmicmicra commited on
Commit
5dcf37a
Β·
verified Β·
1 Parent(s): e34f815

Add question database module (QuestionDatabase class + indexing)

Browse files
Files changed (1) hide show
  1. question_database.py +328 -0
question_database.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MathLingua — Question Database Module
3
+
4
+ Loads the question_database.json file and provides indexing, selection,
5
+ and readability validation utilities.
6
+
7
+ 130 hand-crafted math word problems across 15 sub-levels (1.1–3.5).
8
+ Difficulty is LINGUISTIC (readability), not mathematical.
9
+ Each question includes 4 scaffold levels (L1–L4) and readability metrics.
10
+
11
+ Distribution: Levels 1.1–3.1 have 10 questions each; Levels 3.2–3.5 have 5 each.
12
+ Total: 11×10 + 4×5 = 130 questions.
13
+
14
+ Reference: MathLingua Technical Specification §4
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import os
21
+ import random
22
+ from dataclasses import dataclass, asdict
23
+ from typing import Optional
24
+
25
+ try:
26
+ import textstat
27
+ HAS_TEXTSTAT = True
28
+ except ImportError:
29
+ HAS_TEXTSTAT = False
30
+
31
+
32
+ # ────────────────────────────────────────────────────────
33
+ # Constants
34
+ # ────────────────────────────────────────────────────────
35
+
36
# Ordered list of the 15 sub-levels, from easiest ("1.1") to hardest ("3.5").
# The order matters: validation walks this list to check that average
# readability grade does not decrease from one level to the next.
LEVELS = [
    "1.1", "1.2", "1.3", "1.4", "1.5",
    "2.1", "2.2", "2.3", "2.4", "2.5",
    "3.1", "3.2", "3.3", "3.4", "3.5",
]

# Elo value associated with each sub-level: 820 for "1.1", increasing by 50
# for every subsequent sub-level up to 1520 for "3.5".
LEVEL_TO_ELO = {
    "1.1": 820, "1.2": 870, "1.3": 920, "1.4": 970, "1.5": 1020,
    "2.1": 1070, "2.2": 1120, "2.3": 1170, "2.4": 1220, "2.5": 1270,
    "3.1": 1320, "3.2": 1370, "3.3": 1420, "3.4": 1470, "3.5": 1520,
}

# Default database location: question_database.json next to this module.
DB_PATH = os.path.join(os.path.dirname(__file__), "question_database.json")
49
+
50
+
51
+ # ────────────────────────────────────────────────────────
52
+ # Question dataclass
53
+ # ────────────────────────────────────────────────────────
54
+
55
@dataclass
class Question:
    """A single math word problem together with its scaffolds and metrics.

    Instances are built from the entries of the question database JSON file;
    each field corresponds to the JSON key of the same name.
    """

    id: str                        # unique question identifier, e.g. "2.1.03"
    level: str                     # sub-level label, e.g. "2.1"
    topic: str                     # topic used for filtering, e.g. "fractions"
    subtopic: str
    grade: int
    problem_text: str              # the word problem shown to the learner
    answer: str                    # answer in display form
    answer_numeric: float          # answer as a number
    solution_steps: list[str]
    scaffolds: dict[str, str]      # scaffold texts keyed by scaffold level
    readability: dict[str, float]  # e.g. flesch_kincaid, word_count, difficult_words
    elo_rating: int
    metadata: dict[str, str]       # e.g. source, created_at
70
+
71
+
72
+ # ────────────────────────────────────────────────────────
73
+ # QuestionDatabase
74
+ # ────────────────────────────────────────────────────────
75
+
76
class QuestionDatabase:
    """
    Manages the question pool with indexing by ID, level, and topic.

    Usage:
        db = QuestionDatabase()  # auto-loads from question_database.json
        db = QuestionDatabase(path="custom.json")

        questions = db.get_by_level("2.1")
        q = db.select_question("2.1", topic="fractions", exclude_ids={"2.1.03"})
        stats = db.level_stats("2.1")
    """

    def __init__(self, path: Optional[str] = None):
        """
        Create the database and load all questions immediately.

        Args:
            path: JSON file to load; defaults to DB_PATH when omitted or empty.

        Raises:
            FileNotFoundError: if the JSON file does not exist.
        """
        self.path = path or DB_PATH
        self.questions: list[Question] = []
        self._by_id: dict[str, Question] = {}
        # Every known level gets an (initially empty) bucket so lookups and
        # validation never have to special-case a level without questions.
        self._by_level: dict[str, list[Question]] = {level: [] for level in LEVELS}
        self._by_topic: dict[str, list[Question]] = {}

        self._load()

    def _load(self):
        """
        Load questions from the JSON file and populate all indexes.

        Entries whose level is not listed in LEVELS are still indexed (under
        their own level key) rather than aborting the load with a KeyError;
        validate_all() reports them as issues.

        Raises:
            FileNotFoundError: if self.path does not exist.
            KeyError: if an entry lacks one of the required fields.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(
                f"Question database not found at: {self.path}\n"
                f"Place question_database.json next to this module, or provide a valid path."
            )

        with open(self.path, "r", encoding="utf-8") as f:
            raw = json.load(f)

        for entry in raw:
            q = Question(
                id=entry["id"],
                level=entry["level"],
                topic=entry["topic"],
                subtopic=entry["subtopic"],
                grade=entry["grade"],
                problem_text=entry["problem_text"],
                answer=entry["answer"],
                answer_numeric=entry["answer_numeric"],
                solution_steps=entry["solution_steps"],
                scaffolds=entry["scaffolds"],
                readability=entry.get("readability", {}),
                elo_rating=entry["elo_rating"],
                metadata=entry.get("metadata", {"source": "curated", "created_at": "2026-04-27"}),
            )
            self.questions.append(q)
            self._by_id[q.id] = q
            self._by_level.setdefault(q.level, []).append(q)
            self._by_topic.setdefault(q.topic, []).append(q)

    def __len__(self) -> int:
        """Return the total number of loaded questions."""
        return len(self.questions)

    def get_by_id(self, question_id: str) -> Optional[Question]:
        """Get a question by its ID, or None if the ID is unknown."""
        return self._by_id.get(question_id)

    def get_by_level(self, level: str) -> list[Question]:
        """Get all questions at a given level (empty list if none).

        A new list is returned, so callers may modify it without corrupting
        the internal index.
        """
        return list(self._by_level.get(level, []))

    def get_by_topic(self, topic: str) -> list[Question]:
        """Get all questions for a given topic (empty list if none).

        A new list is returned, so callers may modify it without corrupting
        the internal index.
        """
        return list(self._by_topic.get(topic, []))

    def select_question(
        self,
        level: str,
        topic: Optional[str] = None,
        exclude_ids: Optional[set[str]] = None,
    ) -> Optional[Question]:
        """
        Select a random question at the given level, optionally filtered by topic.

        Args:
            level: Target sub-level (e.g., "2.1")
            topic: Optional topic filter (e.g., "fractions")
            exclude_ids: Set of question IDs to exclude (recently served)

        Returns: Question or None if no match found
        """
        candidates = self._by_level.get(level, [])

        if topic:
            candidates = [q for q in candidates if q.topic == topic]

        if exclude_ids:
            candidates = [q for q in candidates if q.id not in exclude_ids]

        if not candidates:
            return None

        return random.choice(candidates)

    @staticmethod
    def _weighted_sample(candidates: list, weights: list, k: int) -> list:
        """Draw up to k distinct items, favouring items with larger weights.

        Items are drawn one at a time and removed from the pool, so no item is
        returned twice. Negative weights are treated as zero. Once only
        zero-weight items remain they are drawn uniformly, so the requested
        size is still met whenever enough candidates exist.
        """
        pool = list(candidates)
        pool_weights = [max(float(w), 0.0) for w in weights]
        chosen = []

        while pool and len(chosen) < k:
            if sum(pool_weights) > 0:
                idx = random.choices(range(len(pool)), weights=pool_weights, k=1)[0]
            else:
                # random.choices rejects an all-zero weight vector.
                idx = random.randrange(len(pool))
            chosen.append(pool.pop(idx))
            pool_weights.pop(idx)

        return chosen

    def select_batch(
        self,
        level_distribution: dict[str, int],
        exclude_ids: Optional[set[str]] = None,
        topic_weights: Optional[dict[str, float]] = None,
    ) -> list[Question]:
        """
        Select a batch of questions according to level distribution.

        Each question appears at most once in the batch, with or without
        topic weights. If a level has fewer eligible questions than requested,
        all eligible questions for that level are returned.

        Args:
            level_distribution: {level: count} e.g., {"2.1": 5, "2.2": 8, "2.3": 5, "2.4": 2}
            exclude_ids: Questions to exclude
            topic_weights: Optional topic preference weights (favor weaker topics);
                topics missing from the mapping get weight 1.0

        Returns: List of selected questions, in shuffled order
        """
        exclude = set(exclude_ids) if exclude_ids else set()
        batch = []

        for level, count in level_distribution.items():
            candidates = [q for q in self._by_level.get(level, []) if q.id not in exclude]
            # Clamp so that a negative or oversized count cannot raise.
            k = max(0, min(count, len(candidates)))
            if k == 0:
                continue

            if topic_weights:
                weights = [topic_weights.get(q.topic, 1.0) for q in candidates]
                selected = self._weighted_sample(candidates, weights, k)
            else:
                selected = random.sample(candidates, k)
            batch.extend(selected)

        random.shuffle(batch)
        return batch

    def level_stats(self, level: str) -> dict:
        """
        Get statistics for a level's questions.

        Returns: {"count": 0} when the level has no questions; otherwise a dict
            with count, topics (sorted), avg_fk, avg_words,
            avg_difficult_words and elo_range (min, max). Averages only
            consider questions that carry readability data and are 0 when
            none do.
        """
        questions = self._by_level.get(level, [])
        if not questions:
            return {"count": 0}

        measured = [q.readability for q in questions if q.readability]
        fk_scores = [r.get("flesch_kincaid", 0) for r in measured]
        word_counts = [r.get("word_count", 0) for r in measured]
        diff_words = [r.get("difficult_words", 0) for r in measured]
        elo_ratings = [q.elo_rating for q in questions]

        return {
            "count": len(questions),
            # Sorted so that repeated calls produce the same output.
            "topics": sorted({q.topic for q in questions}),
            "avg_fk": round(sum(fk_scores) / max(len(fk_scores), 1), 2),
            "avg_words": round(sum(word_counts) / max(len(word_counts), 1), 1),
            "avg_difficult_words": round(sum(diff_words) / max(len(diff_words), 1), 1),
            "elo_range": (min(elo_ratings), max(elo_ratings)),
        }

    def compute_readability(self, text: str) -> dict[str, float]:
        """
        Compute readability metrics for a problem text using textstat.

        Returns: dict with flesch_kincaid, word_count, difficult_words and
            avg_syllables_per_word, or {"error": "textstat not installed"}
            when the optional textstat package is unavailable.
        """
        if not HAS_TEXTSTAT:
            return {"error": "textstat not installed"}

        word_count = textstat.lexicon_count(text, removepunct=True)
        return {
            "flesch_kincaid": round(textstat.flesch_kincaid_grade(text), 2),
            "word_count": word_count,
            "difficult_words": textstat.difficult_words(text),
            "avg_syllables_per_word": round(
                textstat.syllable_count(text) / max(word_count, 1), 3
            ),
        }

    def validate_all(self) -> dict:
        """
        Validate the full database: check counts, readability ordering, etc.

        Checks performed:
            - every level in LEVELS holds the expected number of questions
              (10 each, except 5 each for 3.2-3.5);
            - no question uses a level outside LEVELS;
            - average Flesch-Kincaid grade does not decrease from one
              populated level to the next (empty levels are skipped so they
              neither trigger a false alarm nor reset the baseline).

        Returns: dict with total_questions, expected_total, level_counts,
            level_stats, monotonic_fk, issues and valid (True when no issues).
        """
        results = {
            "total_questions": len(self.questions),
            "expected_total": 130,
            "level_counts": {},
            "level_stats": {},
            "monotonic_fk": True,
            "issues": [],
        }

        expected_counts = {level: 10 for level in LEVELS}
        for level in ("3.2", "3.3", "3.4", "3.5"):
            expected_counts[level] = 5

        for level in sorted(set(self._by_level) - set(LEVELS), key=str):
            results["issues"].append(
                f"Level {level}: unknown level with {len(self._by_level[level])} question(s)"
            )

        prev_fk = 0.0
        for level in LEVELS:
            count = len(self._by_level.get(level, []))
            results["level_counts"][level] = count
            stats = self.level_stats(level)
            results["level_stats"][level] = stats

            if count != expected_counts[level]:
                results["issues"].append(
                    f"Level {level}: expected {expected_counts[level]} questions, got {count}"
                )

            if count == 0:
                # No readability data to compare; keep the previous baseline.
                continue

            if stats.get("avg_fk", 0) < prev_fk:
                results["monotonic_fk"] = False
                results["issues"].append(
                    f"Level {level}: FK grade {stats.get('avg_fk')} is less than previous {prev_fk}"
                )
            prev_fk = stats.get("avg_fk", 0)

        results["valid"] = len(results["issues"]) == 0
        return results

    def to_dict_list(self) -> list[dict]:
        """Export all questions as a list of dicts (for JSON serialization)."""
        return [asdict(q) for q in self.questions]

    def summary(self) -> str:
        """Return a summary table of the database as a multi-line string."""
        lines = [
            "MathLingua Question Database Summary",
            "=" * 60,
            f"Total questions: {len(self.questions)}",
            f"Topics: {sorted(self._by_topic.keys())}",
            "",
            f"{'Level':<8}{'Count':<8}{'Avg FK':<10}{'Avg Words':<12}{'Topics':<30}",
            "-" * 60,
        ]
        for level in LEVELS:
            stats = self.level_stats(level)
            lines.append(
                f"{level:<8}{stats['count']:<8}{stats.get('avg_fk', 'N/A'):<10}"
                f"{stats.get('avg_words', 'N/A'):<12}{', '.join(stats.get('topics', [])):<30}"
            )
        return "\n".join(lines)
310
+
311
+
312
+ # ────────────────────────────────────────────────────────
313
+ # Main
314
+ # ────────────────────────────────────────────────────────
315
+
316
# Script entry point: load the default database, print its summary table,
# then run validation and list any issues found.
if __name__ == "__main__":
    try:
        db = QuestionDatabase()
    except FileNotFoundError as e:
        print(f"Database file not found: {e}")
        print("The question_database.json file should be in the same directory.")
    else:
        print(db.summary())
        print("\n")
        validation = db.validate_all()
        print(f"Validation: {'PASS ✓' if validation['valid'] else 'FAIL ✗'}")
        if validation["issues"]:
            for issue in validation["issues"]:
                print(f"  ⚠ {issue}")