cosmicmicra committed on
Commit
87a78e6
·
verified ·
1 Parent(s): 4ae86c5

Add feature engineering module (LDS & MCS computation)

Browse files
Files changed (1) hide show
  1. feature_engineering.py +319 -0
feature_engineering.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MathLingua — Feature Engineering Module
3
+
4
+ Computes Language Dependency Score (LDS) and Math Confidence Score (MCS)
5
+ from student interaction data. These two engineered features disentangle
6
+ linguistic struggle from mathematical difficulty, enabling the adaptive
7
+ engine to make targeted decisions.
8
+
9
+ Reference: MathLingua Technical Specification §5
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import math
15
+ from dataclasses import dataclass, field
16
+ from typing import Optional
17
+
18
+
19
+ # ────────────────────────────────────────────────────────
20
+ # Data containers
21
+ # ────────────────────────────────────────────────────────
22
+
23
@dataclass
class InteractionSignals:
    """Raw signals captured from a single student-question interaction.

    Every field has a default, so an instance can be built from whichever
    signals are available. Field order is part of the positional
    constructor interface and must not be changed.
    """

    # Deepest hint level reached: 0 = no hints, 1 = L1, ..., 4 = L4.
    max_hint_level: int = 0
    # Seconds elapsed before the first hint was requested.
    time_before_first_hint: float = 0.0
    # Seconds from question display to answer submission.
    total_time: float = 0.0
    # Seconds spent at each scaffold level.
    time_at_L1: float = 0.0
    time_at_L2: float = 0.0
    time_at_L3: float = 0.0
    time_at_L4: float = 0.0
    # Number of answer attempts.
    num_attempts: int = 1
    # Whether the submitted answer was correct.
    is_correct: bool = False
    # Difficulty sub-level, used as the key into the median-time table.
    question_level: str = "1.1"
37
+
38
+
39
@dataclass
class EngineeredFeatures:
    """Output of the feature engineering pipeline for one interaction.

    Holds the individual sub-features, the two composite scores built from
    them, and the diagnostic quadrant label derived from those scores.
    """

    # Sub-features for LDS
    hint_depth_normalized: float = 0.0    # D_hint ∈ [0, 1]
    scaffold_time_ratio: float = 0.0      # R_scaffold ∈ [0, 1]
    escalation_speed: float = 0.0         # E_speed ∈ [0, 1]
    reveal_flag: float = 0.0              # F_reveal ∈ {0, 1}

    # Sub-features for MCS
    correctness: float = 0.0              # C_correct ∈ {0, 1}
    speed_factor: float = 0.0             # S_speed ∈ [0, 1]
    attempt_efficiency: float = 0.0       # A_efficiency ∈ [0, 1]

    # Composite scores
    lds: float = 0.0                      # Language Dependency Score [0, 1]
    mcs: float = 0.0                      # Math Confidence Score [0, 1]

    # Diagnostic quadrant:
    # thriving | language_gap | math_struggle | dual_challenge
    quadrant: str = ""
60
+
61
+
62
+ # ────────────────────────────────────────────────────────
63
+ # Default median times per level (seconds)
64
+ # Calibrated from spec: lower levels β†’ shorter, higher β†’ longer
65
+ # ────────────────────────────────────────────────────────
66
+
67
# Median solve time in seconds, keyed by difficulty sub-level.
# Values rise by 5 s per sub-level, from 30 s at "1.1" to 100 s at "3.5".
DEFAULT_MEDIAN_TIMES: dict[str, float] = {
    "1.1": 30.0, "1.2": 35.0, "1.3": 40.0, "1.4": 45.0, "1.5": 50.0,
    "2.1": 55.0, "2.2": 60.0, "2.3": 65.0, "2.4": 70.0, "2.5": 75.0,
    "3.1": 80.0, "3.2": 85.0, "3.3": 90.0, "3.4": 95.0, "3.5": 100.0,
}
72
+
73
+
74
+ # ────────────────────────────────────────────────────────
75
+ # Feature Engineer
76
+ # ────────────────────────────────────────────────────────
77
+
78
class FeatureEngineer:
    """
    Computes LDS and MCS from raw interaction signals.

        LDS = clamp(0.35·D_hint + 0.25·R_scaffold + 0.20·E_speed + 0.20·F_reveal, 0, 1)
        MCS = clamp(0.30·C_correct + 0.25·S_speed + 0.20·A_efficiency + 0.25·(1-LDS), 0, 1)

    The 2×2 diagnostic quadrant is derived from thresholds:
        LDS < 0.4  &  MCS ≥ 0.6  →  Thriving
        LDS ≥ 0.4  &  MCS ≥ 0.6  →  Language Gap
        LDS < 0.4  &  MCS < 0.6  →  Math Struggle
        LDS ≥ 0.4  &  MCS < 0.6  →  Dual Challenge
    """

    # LDS weights
    W1: float = 0.35  # hint depth
    W2: float = 0.25  # scaffold time ratio
    W3: float = 0.20  # escalation speed
    W4: float = 0.20  # reveal flag

    # MCS weights
    W5: float = 0.30  # correctness
    W6: float = 0.25  # speed factor
    W7: float = 0.20  # attempt efficiency
    W8: float = 0.25  # language independence (1 - LDS)

    # Diagnostic thresholds
    LDS_THRESHOLD: float = 0.4
    MCS_THRESHOLD: float = 0.6

    # Deepest scaffold level; reaching it counts as a full reveal.
    MAX_HINT_LEVEL: int = 4

    # Median (seconds) assumed for a question level missing from the table.
    FALLBACK_MEDIAN_TIME: float = 60.0

    def __init__(self, median_times: Optional[dict[str, float]] = None):
        """Create an engineer using *median_times* (level -> seconds).

        A missing or empty table falls back to ``DEFAULT_MEDIAN_TIMES``.
        The default is copied so that later changes to ``self.median_times``
        cannot alter the module-level table; a caller-supplied table is
        stored by reference.
        """
        self.median_times = median_times or dict(DEFAULT_MEDIAN_TIMES)

    @staticmethod
    def _clamp(value: float, lo: float = 0.0, hi: float = 1.0) -> float:
        """Return *value* limited to the closed interval [lo, hi]."""
        return max(lo, min(hi, value))

    def _median_for(self, signals: InteractionSignals) -> float:
        """Return the median time for the question's level, or the fallback."""
        return self.median_times.get(
            signals.question_level, self.FALLBACK_MEDIAN_TIME
        )

    # ── Sub-feature computations ──

    def _hint_depth_normalized(self, signals: InteractionSignals) -> float:
        """D_hint = h_i / 4, clamped to [0, 1].

        The clamp keeps an out-of-range hint level (negative, or above the
        deepest scaffold) from pushing D_hint outside its documented range.
        """
        return self._clamp(signals.max_hint_level / float(self.MAX_HINT_LEVEL))

    def _scaffold_time_ratio(self, signals: InteractionSignals) -> float:
        """R_scaffold = scaffold_time / total_time, clamped to [0, 1].

        Returns 0.0 when total_time is not positive.
        """
        if signals.total_time <= 0:
            return 0.0
        scaffold_time = (
            signals.time_at_L1 + signals.time_at_L2
            + signals.time_at_L3 + signals.time_at_L4
        )
        return self._clamp(scaffold_time / signals.total_time)

    def _escalation_speed(self, signals: InteractionSignals) -> float:
        """E_speed = 1 - (t_pre / median_time) if hints were used, else 0.

        The result is clamped to [0, 1]. A non-positive median yields 1.0.
        """
        if signals.max_hint_level == 0:
            return 0.0
        median = self._median_for(signals)
        if median <= 0:
            return 1.0
        return self._clamp(1.0 - signals.time_before_first_hint / median)

    def _reveal_flag(self, signals: InteractionSignals) -> float:
        """F_reveal = 1.0 if L4 (or any deeper level) was reached, else 0.0."""
        return 1.0 if signals.max_hint_level >= self.MAX_HINT_LEVEL else 0.0

    def _correctness(self, signals: InteractionSignals) -> float:
        """C_correct ∈ {0, 1}"""
        return 1.0 if signals.is_correct else 0.0

    def _speed_factor(self, signals: InteractionSignals) -> float:
        """S_speed = clamp(median_time / total_time, 0, 1).

        Returns 0.0 when total_time is not positive.
        """
        if signals.total_time <= 0:
            return 0.0
        return self._clamp(self._median_for(signals) / signals.total_time)

    def _attempt_efficiency(self, signals: InteractionSignals) -> float:
        """A_efficiency = 1 / attempts; 0.0 when attempts is not positive."""
        if signals.num_attempts <= 0:
            return 0.0
        return 1.0 / signals.num_attempts

    # ── Composite scores ──

    def _compute_lds(self, d_hint: float, r_scaffold: float,
                     e_speed: float, f_reveal: float) -> float:
        """Return the weighted LDS sum of the four sub-features, clamped to [0, 1]."""
        raw = (self.W1 * d_hint + self.W2 * r_scaffold +
               self.W3 * e_speed + self.W4 * f_reveal)
        return self._clamp(raw)

    def _compute_mcs(self, c_correct: float, s_speed: float,
                     a_efficiency: float, lds: float) -> float:
        """Return the weighted MCS sum, clamped to [0, 1].

        The last term rewards language independence, i.e. (1 - LDS).
        """
        raw = (self.W5 * c_correct + self.W6 * s_speed +
               self.W7 * a_efficiency + self.W8 * (1.0 - lds))
        return self._clamp(raw)

    def _classify_quadrant(self, lds: float, mcs: float) -> str:
        """Map an (LDS, MCS) pair onto one of the four diagnostic labels."""
        high_lds = lds >= self.LDS_THRESHOLD
        high_mcs = mcs >= self.MCS_THRESHOLD
        if high_mcs:
            return "language_gap" if high_lds else "thriving"
        return "dual_challenge" if high_lds else "math_struggle"

    # ── Main entry point ──

    def compute(self, signals: InteractionSignals) -> EngineeredFeatures:
        """Compute all engineered features from raw interaction signals.

        The quadrant is classified from the unrounded scores; the numeric
        fields of the returned object are rounded to four decimals.
        """
        d_hint = self._hint_depth_normalized(signals)
        r_scaffold = self._scaffold_time_ratio(signals)
        e_speed = self._escalation_speed(signals)
        f_reveal = self._reveal_flag(signals)

        c_correct = self._correctness(signals)
        s_speed = self._speed_factor(signals)
        a_efficiency = self._attempt_efficiency(signals)

        lds = self._compute_lds(d_hint, r_scaffold, e_speed, f_reveal)
        mcs = self._compute_mcs(c_correct, s_speed, a_efficiency, lds)
        quadrant = self._classify_quadrant(lds, mcs)

        return EngineeredFeatures(
            hint_depth_normalized=round(d_hint, 4),
            scaffold_time_ratio=round(r_scaffold, 4),
            escalation_speed=round(e_speed, 4),
            reveal_flag=f_reveal,
            correctness=c_correct,
            speed_factor=round(s_speed, 4),
            attempt_efficiency=round(a_efficiency, 4),
            lds=round(lds, 4),
            mcs=round(mcs, 4),
            quadrant=quadrant,
        )

    def compute_weighted_outcome(self, is_correct: bool,
                                 max_hint_level: int) -> float:
        """
        Hint-weighted outcome for Elo/BKT updates.
            1.00 = correct, no hints
            0.75 = correct, L1 only
            0.50 = correct, L2
            0.25 = correct, L3
            0.00 = incorrect, or L4 used

        Any hint level outside 0-3 also yields 0.0.
        """
        if not is_correct or max_hint_level >= self.MAX_HINT_LEVEL:
            return 0.0
        outcome_map = {0: 1.0, 1: 0.75, 2: 0.50, 3: 0.25}
        return outcome_map.get(max_hint_level, 0.0)
231
+
232
+
233
+ # ────────────────────────────────────────────────────────
234
+ # Self-test / examples
235
+ # ────────────────────────────────────────────────────────
236
+
237
def _run_examples() -> None:
    """Print four worked examples of the feature pipeline to stdout.

    Each example builds an ``InteractionSignals`` instance, runs it through
    ``FeatureEngineer.compute`` with the default median-time table, and
    prints the resulting LDS, MCS and quadrant next to a reference range.
    Nothing is asserted; the output is meant for manual inspection.
    """
    fe = FeatureEngineer()
    rule = "=" * 70

    print(rule)
    print("MathLingua Feature Engineering — Worked Examples")
    print(rule)

    # Example 1: Strong student, no hints, fast solve
    signals1 = InteractionSignals(
        max_hint_level=0,
        time_before_first_hint=0.0,
        total_time=25.0,
        is_correct=True,
        num_attempts=1,
        question_level="2.1",
    )
    f1 = fe.compute(signals1)
    print("\nExample 1 — Strong student, no hints, fast solve")
    print(f"  LDS = {f1.lds:.3f}  (expected ~0.0)")
    print(f"  MCS = {f1.mcs:.3f}  (expected ~1.0)")
    print(f"  Quadrant: {f1.quadrant}")
    print(f"  Weighted outcome: {fe.compute_weighted_outcome(True, 0)}")

    # Example 2: Language-dependent, used L3, correct
    # NOTE(review): worked by hand, the class weights give LDS ≈ 0.600 and
    # MCS ≈ 0.681 here, so the printed "~0.3-0.4" MCS range does not match
    # the formula — confirm against the spec which one is intended.
    signals2 = InteractionSignals(
        max_hint_level=3,
        time_before_first_hint=5.0,
        total_time=90.0,
        time_at_L1=10.0,
        time_at_L2=15.0,
        time_at_L3=30.0,
        is_correct=True,
        num_attempts=2,
        question_level="2.3",
    )
    f2 = fe.compute(signals2)
    print("\nExample 2 — Language-dependent, used L3, correct on 2nd try")
    print(f"  LDS = {f2.lds:.3f}  (expected ~0.5-0.6)")
    print(f"  MCS = {f2.mcs:.3f}  (expected ~0.3-0.4)")
    print(f"  Quadrant: {f2.quadrant}")
    print(f"  Weighted outcome: {fe.compute_weighted_outcome(True, 3)}")

    # Example 3: Perfect student — fast, correct, no hints
    signals3 = InteractionSignals(
        max_hint_level=0,
        total_time=15.0,
        is_correct=True,
        num_attempts=1,
        question_level="1.1",
    )
    f3 = fe.compute(signals3)
    print("\nExample 3 — Perfect interaction (very easy level)")
    print(f"  LDS = {f3.lds:.3f}  (expected 0.0)")
    print(f"  MCS = {f3.mcs:.3f}  (expected 1.0)")
    print(f"  Quadrant: {f3.quadrant}")

    # Example 4: Struggling — used L4, incorrect
    # NOTE(review): worked by hand, the class weights give LDS ≈ 0.920 and
    # MCS ≈ 0.253 here, both outside the printed ranges — confirm against
    # the spec which one is intended.
    signals4 = InteractionSignals(
        max_hint_level=4,
        time_before_first_hint=3.0,
        total_time=120.0,
        time_at_L1=10.0,
        time_at_L2=15.0,
        time_at_L3=20.0,
        time_at_L4=40.0,
        is_correct=False,
        num_attempts=3,
        question_level="3.1",
    )
    f4 = fe.compute(signals4)
    print("\nExample 4 — Struggling student, used all scaffolds, incorrect")
    print(f"  LDS = {f4.lds:.3f}  (expected ~0.7-0.9)")
    print(f"  MCS = {f4.mcs:.3f}  (expected ~0.05-0.15)")
    print(f"  Quadrant: {f4.quadrant}")
    print(f"  Weighted outcome: {fe.compute_weighted_outcome(False, 4)}")

    print("\n" + rule)
    print("All examples computed successfully ✓")
    print(rule)
316
+
317
+
318
# Run the worked examples only when this file is executed as a script.
if __name__ == "__main__":
    _run_examples()