namish10 committed on
Commit
18a270a
·
verified ·
1 Parent(s): e386f0b

Upload data_collector.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data_collector.py +382 -0
data_collector.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Real Learning Data Collection Module for ContextFlow

Collects real behavioral signals from actual learning sessions for model improvement.
Addresses: Synthetic Data Bias limitation
"""
7
+
8
import json
import os
import time
import uuid
import zlib
from collections import defaultdict
from dataclasses import dataclass, asdict, field
from datetime import datetime
from typing import Dict, List, Optional, Any

import numpy as np
16
+
17
+
18
@dataclass
class LearningSession:
    """A real learning session with actual student data.

    Attributes are filled in incrementally while the session is running;
    ``end_time`` stays ``None`` until the session is ended.
    """
    session_id: str
    user_id: str
    topic: str
    start_time: datetime
    end_time: Optional[datetime] = None
    # Serialized behavioral events (plain dicts), in recording order.
    events: List[Dict] = field(default_factory=list)
    confusion_scores: List[float] = field(default_factory=list)
    actual_doubts: List[str] = field(default_factory=list)
    # Gesture name -> number of times it was observed.
    gesture_signals: Dict[str, int] = field(default_factory=dict)
    completion_status: str = "in_progress"

    def to_dict(self) -> Dict:
        """Return a JSON-friendly snapshot of the session.

        Timestamps are rendered as ISO-8601 strings. ``duration_minutes`` is
        0 while the session has no ``end_time``. The containers in the result
        are shallow copies, so mutating the snapshot does not alter the
        session itself.
        """
        if self.end_time:
            duration_minutes = (self.end_time - self.start_time).total_seconds() / 60
        else:
            duration_minutes = 0

        return {
            'session_id': self.session_id,
            'user_id': self.user_id,
            'topic': self.topic,
            'start_time': self.start_time.isoformat(),
            'end_time': self.end_time.isoformat() if self.end_time else None,
            'events': list(self.events),
            'confusion_scores': list(self.confusion_scores),
            'actual_doubts': list(self.actual_doubts),
            'gesture_signals': dict(self.gesture_signals),
            'completion_status': self.completion_status,
            'duration_minutes': duration_minutes,
        }
46
+
47
+
48
@dataclass
class BehavioralEvent:
    """A single behavioral event from a real session.

    ``timestamp`` is a float time value supplied by the recorder and ``data``
    is a free-form payload whose keys depend on ``event_type``.
    """
    timestamp: float
    event_type: str
    data: Dict[str, Any]
    session_id: str
    user_id: str

    # Event types (documented examples; the class does not enforce this list):
    # - mouse_move, mouse_click, scroll, keypress
    # - gesture_detected, confusion_reported
    # - help_requested, content_completed, question_answered
    # - time_on_task, pause_resume
62
+
63
+
64
class RealDataCollector:
    """
    Collects real learning data from user sessions.

    Usage:
        collector = RealDataCollector(user_id='student123')
        collector.start_session('machine learning')
        collector.record_event('mouse_hesitation', {'duration_ms': 2000})
        collector.report_doubt('how_gradient_descent_works')
        collector.end_session()
        collector.save_session()
    """

    # Event types with this prefix are additionally tallied per gesture name.
    _GESTURE_PREFIX = 'gesture_'

    def __init__(self, user_id: str):
        self.user_id = user_id
        self.current_session: Optional["LearningSession"] = None
        self.sessions: List["LearningSession"] = []
        self.data_dir = 'collected_data'

    def start_session(self, topic: str) -> str:
        """Start a new learning session and return its generated id."""
        session_id = str(uuid.uuid4())
        self.current_session = LearningSession(
            session_id=session_id,
            user_id=self.user_id,
            topic=topic,
            start_time=datetime.now()
        )
        return session_id

    def record_event(self, event_type: str, data: Dict[str, Any]):
        """Record a behavioral event; a no-op when no session is active."""
        if not self.current_session:
            return

        event = BehavioralEvent(
            timestamp=time.time(),
            event_type=event_type,
            data=data,
            session_id=self.current_session.session_id,
            user_id=self.user_id
        )
        self.current_session.events.append(asdict(event))

        # Update gesture signals. Only the leading prefix is stripped, so a
        # gesture name that itself contains 'gesture_' is kept intact.
        if event_type.startswith(self._GESTURE_PREFIX):
            gesture_name = event_type[len(self._GESTURE_PREFIX):]
            signals = self.current_session.gesture_signals
            signals[gesture_name] = signals.get(gesture_name, 0) + 1

    def record_confusion(self, score: float):
        """Record a confusion score observation; a no-op without a session."""
        if not self.current_session:
            return
        self.current_session.confusion_scores.append(score)

    def report_doubt(self, doubt_type: str):
        """Record an actual doubt the student had; a no-op without a session."""
        if not self.current_session:
            return
        self.current_session.actual_doubts.append(doubt_type)

    def end_session(self, status: str = "completed"):
        """End the current session and move it into ``self.sessions``."""
        if not self.current_session:
            return

        self.current_session.end_time = datetime.now()
        self.current_session.completion_status = status
        self.sessions.append(self.current_session)
        self.current_session = None

    def save_session(self) -> Optional[str]:
        """Save session data to a JSON file and return the file path.

        The active session is written if there is one; otherwise the most
        recently ended session is written, so calling this right after
        ``end_session()`` persists the finished session. Returns ``None``
        when there is nothing to save.
        """
        session = self.current_session
        if session is None and self.sessions:
            session = self.sessions[-1]
        if session is None:
            return None

        os.makedirs(self.data_dir, exist_ok=True)
        filename = f"{self.data_dir}/{session.session_id}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(session.to_dict(), f, indent=2)

        return filename

    def get_training_data(self) -> List[Dict]:
        """Get collected data formatted for RL training.

        Only sessions whose status is "completed" contribute. One sample is
        produced per event for which a doubt is available.
        """
        training_samples = []

        for session in self.sessions:
            if session.completion_status != "completed":
                continue

            # Create state-action pairs from session
            for i, event in enumerate(session.events):
                state = self._extract_state_from_session(session, i)
                actual_doubt = self._get_doubt_at_time(session, event['timestamp'])

                if actual_doubt:
                    training_samples.append({
                        'state': state,
                        'actual_doubt': actual_doubt,
                        'session_id': session.session_id,
                        'topic': session.topic
                    })

        return training_samples

    def _extract_state_from_session(self, session: "LearningSession", event_idx: int) -> np.ndarray:
        """Extract 64-dim state vector from session events.

        Layout: topic embedding (32) | progress (1) | confusion (16) |
        gesture signals (14) | normalized time spent (1).
        """
        # Topic embedding (32 dims) - simplified. A CRC of the topic text is
        # used as the seed because the built-in hash() of a str is salted per
        # process, and a local generator keeps the global NumPy RNG untouched.
        topic_seed = zlib.crc32(session.topic.encode('utf-8')) % 1000
        topic_emb = np.random.RandomState(topic_seed).randn(32) * 0.1

        # Progress (1 dim)
        progress = min(event_idx / max(len(session.events), 1), 1.0)

        # Confusion signals (16 dims): [mean, std, latest] repeated 5x + 1 pad.
        # NOTE: uses the session-wide scores, not only those up to event_idx.
        recent_confusion = session.confusion_scores[-10:] or [0]
        confusion_features = [
            np.mean(recent_confusion),
            np.std(recent_confusion) if len(recent_confusion) > 1 else 0,
            recent_confusion[-1],
        ] * 5 + [0]

        # Gesture signals (14 dims): each gesture name is bucketed by a stable
        # hash; on a bucket collision the larger value is kept.
        gesture_features = np.zeros(14)
        for gesture, count in session.gesture_signals.items():
            bucket = zlib.crc32(gesture.encode('utf-8')) % 14
            gesture_features[bucket] = max(gesture_features[bucket], min(count / 20, 1.0))

        # Time spent (1 dim), in seconds, capped at 30 minutes after scaling.
        if session.end_time:
            time_spent = (session.end_time - session.start_time).total_seconds()
        else:
            time_spent = time.time() - session.start_time.timestamp()

        return np.concatenate([
            topic_emb,
            [progress],
            confusion_features[:16],
            gesture_features,
            [min(time_spent / 1800, 1.0)]
        ])

    def _get_doubt_at_time(self, session: "LearningSession", timestamp: float) -> Optional[str]:
        """Get doubt reported around this timestamp.

        Simplified: doubts are stored without timestamps, so ``timestamp`` is
        currently ignored and the first reported doubt (if any) is returned.
        """
        if session.actual_doubts:
            return session.actual_doubts[0]
        return None
227
+
228
+
229
class DataAugmentor:
    """
    Augment collected data to improve model generalization.
    Addresses: Synthetic Data Bias, Real-world Generalization
    """

    @staticmethod
    def add_noise(state: np.ndarray, noise_level: float = 0.1) -> np.ndarray:
        """Add Gaussian noise to state for augmentation"""
        noise = np.random.randn(*state.shape) * noise_level
        return state + noise

    @staticmethod
    def scale_features(state: np.ndarray, scale_range: tuple = (0.8, 1.2)) -> np.ndarray:
        """Randomly scale features for augmentation"""
        scale = np.random.uniform(scale_range[0], scale_range[1], state.shape)
        return state * scale

    @staticmethod
    def shuffle_confusion_order(state: np.ndarray, n_shuffle: int = 3) -> np.ndarray:
        """Shuffle some confusion features.

        Picks up to ``n_shuffle`` positions inside the confusion segment
        (indices 33..48 of the state vector) and permutes the values among
        them. The input is not modified; values outside the segment are
        left unchanged. States too short to contain the segment are returned
        as an unmodified copy.
        """
        augmented = state.copy()
        confusion_start, confusion_end = 33, 49

        # Clamp to the actual vector length so short states do not raise.
        segment_len = len(augmented[confusion_start:confusion_end])
        n_pick = min(n_shuffle, segment_len)
        if n_pick < 2:
            return augmented

        # Positions are drawn relative to the segment, then shifted to
        # absolute indices before writing back.
        picked = np.random.choice(segment_len, size=n_pick, replace=False) + confusion_start
        augmented[picked] = augmented[np.random.permutation(picked)]

        return augmented

    @staticmethod
    def augment_batch(states: np.ndarray, labels: np.ndarray,
                      augment_ratio: float = 0.5) -> tuple:
        """Augment a batch of training data.

        Returns ``(states, labels)`` consisting of the original samples
        followed by ``int(len(states) * augment_ratio)`` augmented copies.
        When that count is zero the inputs are returned as copies. Source
        samples are drawn without replacement unless more augmented samples
        than originals are requested.
        """
        states = np.asarray(states)
        labels = np.asarray(labels)

        n_augment = int(len(states) * augment_ratio)
        if n_augment <= 0:
            return states.copy(), labels.copy()

        indices = np.random.choice(len(states), n_augment,
                                   replace=n_augment > len(states))

        augmented_states = []
        augmented_labels = []

        for idx in indices:
            state = states[idx]

            # Randomly apply augmentation
            if np.random.random() < 0.5:
                state = DataAugmentor.add_noise(state)
            if np.random.random() < 0.3:
                state = DataAugmentor.scale_features(state)
            if np.random.random() < 0.2:
                state = DataAugmentor.shuffle_confusion_order(state)

            augmented_states.append(state)
            augmented_labels.append(labels[idx])

        return np.vstack([states, np.array(augmented_states)]), \
            np.concatenate([labels, np.array(augmented_labels)])
289
+
290
+
291
class DataValidator:
    """
    Validate collected data quality.
    Addresses: Validation Gap
    """

    @staticmethod
    def validate_session(session: "LearningSession") -> Dict[str, Any]:
        """Validate a session has sufficient data.

        Returns a dict with ``valid`` (True when no issue was found),
        ``session_id``, the list of ``issues`` and basic ``metrics``.
        """
        issues = []

        if len(session.events) < 10:
            issues.append("Insufficient events (need at least 10)")

        if not session.actual_doubts:
            issues.append("No actual doubts recorded")

        if session.completion_status != "completed":
            issues.append("Session not completed")

        if len(session.confusion_scores) < 3:
            issues.append("Insufficient confusion observations")

        if session.end_time:
            duration_minutes = (session.end_time - session.start_time).total_seconds() / 60
        else:
            duration_minutes = 0

        return {
            'valid': len(issues) == 0,
            'session_id': session.session_id,
            'issues': issues,
            'metrics': {
                'n_events': len(session.events),
                'n_doubts': len(session.actual_doubts),
                'n_confusion_scores': len(session.confusion_scores),
                'duration_minutes': duration_minutes
            }
        }

    @staticmethod
    def get_benchmark_metrics(sessions: List["LearningSession"]) -> Dict:
        """Generate benchmark metrics from collected data.

        Only sessions that pass ``validate_session`` are aggregated. When
        none do, ``{'error': ...}`` is returned instead of metrics.
        """
        valid_sessions = [s for s in sessions
                          if DataValidator.validate_session(s)['valid']]

        if not valid_sessions:
            return {'error': 'No valid sessions for benchmarking'}

        doubt_counts = defaultdict(int)
        total_doubts = 0
        for s in valid_sessions:
            for doubt in s.actual_doubts:
                doubt_counts[doubt] += 1
                total_doubts += 1

        # Sessions usually hold different numbers of scores, so average over
        # the flattened observations rather than over a ragged list of lists.
        all_scores = [score for s in valid_sessions for score in s.confusion_scores]

        return {
            'n_valid_sessions': len(valid_sessions),
            'total_doubts': total_doubts,
            'unique_doubts': len(doubt_counts),
            'top_doubts': sorted(doubt_counts.items(), key=lambda x: -x[1])[:10],
            'avg_confusion': float(np.mean(all_scores)) if all_scores else 0.0,
            # Fraction of the supplied sessions that passed validation.
            'completion_rate': len(valid_sessions) / len(sessions) if sessions else 0
        }
353
+
354
+
355
# CLI for data collection
def main():
    """Command-line entry point: start a session and optionally simulate one.

    With ``--simulate`` a short synthetic session is recorded, ended and
    written to disk; the outcome of the save is reported instead of assumed.
    """
    import argparse

    parser = argparse.ArgumentParser(description='ContextFlow Real Data Collector')
    parser.add_argument('--user_id', required=True, help='User ID for this session')
    parser.add_argument('--topic', required=True, help='Learning topic')
    parser.add_argument('--simulate', action='store_true', help='Simulate data collection')

    args = parser.parse_args()

    collector = RealDataCollector(args.user_id)
    collector.start_session(args.topic)

    if args.simulate:
        print("Simulating session...")
        for i in range(50):
            collector.record_event('mouse_move', {'x': i, 'y': i*2})
            if i % 10 == 0:
                collector.record_confusion(np.random.random())
            if i == 25:
                collector.report_doubt('how_backpropagation_works')

        collector.end_session('completed')
        saved_path = collector.save_session()
        # save_session() returns None when nothing was written, so only
        # claim success when a path actually came back.
        if saved_path:
            print(f"Session saved with {len(collector.sessions[-1].events)} events")
        else:
            print("Session ended but was not saved (no session available to save)")

    print("Data collector ready. Use collector.record_event() to log events.")


if __name__ == "__main__":
    main()