namish10 committed on
Commit
82e4a98
·
verified ·
1 Parent(s): 72558bb

Upload multimodal_detection.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. multimodal_detection.py +526 -0
multimodal_detection.py ADDED
@@ -0,0 +1,526 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multi-Modal Confusion Detection Module for ContextFlow
3
+
4
+ Combines audio, biometric, and behavioral signals for comprehensive confusion detection.
5
+ Addresses: Multi-modal Confusion Detection requirement
6
+ """
7
+
8
+ import numpy as np
9
+ from typing import Dict, List, Optional, Tuple, Any
10
+ from dataclasses import dataclass, field
11
+ from collections import deque
12
+ import threading
13
+ import time
14
+
15
+
16
@dataclass
class AudioFeatures:
    """Speech-derived measurements used as audio confusion signals."""

    # Speaking speed, in words per minute.
    speech_rate: float = 0.0
    # Number of pauses per minute.
    pause_frequency: float = 0.0
    # Mean pause length, in milliseconds.
    pause_duration: float = 0.0
    # Standard deviation of the pitch.
    pitch_variation: float = 0.0
    # Mean volume on a 0-1 scale.
    volume_level: float = 0.0
    # Number of filler sounds such as "uh" or "um".
    hesitations: int = 0
    # Number of rising-intonation events.
    question_markers: int = 0

    def to_vector(self) -> np.ndarray:
        """Return the seven fields, scaled, as a 1-D array in declaration order.

        Every field except ``volume_level`` is divided by a fixed constant
        intended to bring typical values to roughly the 0-1 range;
        ``volume_level`` is passed through unchanged.
        """
        scaled = (
            self.speech_rate / 200,
            self.pause_frequency / 10,
            self.pause_duration / 5000,
            self.pitch_variation / 50,
            self.volume_level,
            self.hesitations / 20,
            self.question_markers / 10,
        )
        return np.array(scaled)
38
+
39
+
40
@dataclass
class BiometricFeatures:
    """Physiological measurements used as biometric confusion signals."""

    # Heart rate, in beats per minute.
    heart_rate: float = 0.0
    # Heart-rate variability metric.
    heart_rate_variability: float = 0.0
    # Galvanic skin response, in microsiemens.
    skin_conductance: float = 0.0
    # Skin temperature, in degrees Celsius.
    skin_temperature: float = 0.0
    # Eye blinks per minute.
    eye_blink_rate: float = 0.0
    # Mean eye-open duration, in milliseconds.
    eye_open_duration: float = 0.0

    def to_vector(self) -> np.ndarray:
        """Return the six fields, shifted and scaled, as a 1-D array.

        Heart rate, skin temperature and blink rate are first centered on a
        baseline (60 BPM, 36 C and 15 blinks/min respectively) and then
        divided by a fixed constant; the remaining fields are only divided.
        """
        centered_hr = (self.heart_rate - 60) / 60
        centered_temp = (self.skin_temperature - 36) / 2
        centered_blink = (self.eye_blink_rate - 15) / 15
        scaled = (
            centered_hr,
            self.heart_rate_variability / 50,
            self.skin_conductance / 20,
            centered_temp,
            centered_blink,
            self.eye_open_duration / 500,
        )
        return np.array(scaled)
60
+
61
+
62
@dataclass
class BehavioralFeatures:
    """Interaction measurements used as behavioral confusion signals."""

    mouse_hesitation: float = 0.0
    scroll_reversals: float = 0.0
    time_on_page: float = 0.0
    click_frequency: float = 0.0
    back_button: float = 0.0
    tab_switches: float = 0.0
    copy_attempts: float = 0.0
    search_usage: float = 0.0

    def to_vector(self) -> np.ndarray:
        """Return the eight fields, each divided by a fixed constant, as a 1-D array.

        The order of the components matches the order in which the fields are
        declared.
        """
        scaled = (
            self.mouse_hesitation / 5,
            self.scroll_reversals / 10,
            self.time_on_page / 300,
            self.click_frequency / 20,
            self.back_button / 5,
            self.tab_switches / 10,
            self.copy_attempts / 5,
            self.search_usage / 5,
        )
        return np.array(scaled)
86
+
87
+
88
class MultiModalFusion:
    """
    Fuses multiple signal modalities into a unified confusion assessment.

    Supported modalities:
    - Audio: speech rate, hesitations, pauses
    - Biometric: heart rate, heart-rate variability, skin conductance
    - Behavioral: mouse hesitation, scroll reversals, back-button usage

    Each modality is scored by fixed threshold rules applied to the mean of
    its most recent samples; the per-modality scores are then combined as a
    weighted sum.  Every read or write of the history buffers happens while
    holding ``self.lock``.
    """

    def __init__(
        self,
        audio_weight: float = 0.2,
        biometric_weight: float = 0.3,
        behavioral_weight: float = 0.5
    ):
        """
        Args:
            audio_weight: Multiplier applied to the audio score when fusing.
            biometric_weight: Multiplier applied to the biometric score.
            behavioral_weight: Multiplier applied to the behavioral score.

        The weights are used exactly as given; they are neither validated
        nor normalized.
        """
        self.audio_weight = audio_weight
        self.biometric_weight = biometric_weight
        self.behavioral_weight = behavioral_weight

        # Modality-specific thresholds.  They are stored on the instance but
        # the scoring rules in this class do not read them.
        self.audio_threshold = 0.6
        self.biometric_threshold = 0.7
        self.behavioral_threshold = 0.5

        # Bounded history buffers; the oldest entry is dropped when full.
        self.audio_history = deque(maxlen=30)        # 30 most recent audio samples
        self.biometric_history = deque(maxlen=60)    # 60 most recent biometric samples
        self.behavioral_history = deque(maxlen=100)  # 100 most recent behavioral events

        # Guards the three history buffers.  threading.Lock is not reentrant,
        # so code holding it must not call the public get_*/update_* methods.
        self.lock = threading.Lock()

    # The feature classes are referenced as string annotations so this class
    # does not depend on their definition order at import time.
    def update_audio(self, features: "AudioFeatures"):
        """Append one audio sample to the audio history."""
        with self.lock:
            self.audio_history.append(features)

    def update_biometric(self, features: "BiometricFeatures"):
        """Append one biometric sample to the biometric history."""
        with self.lock:
            self.biometric_history.append(features)

    def update_behavioral(self, features: "BehavioralFeatures"):
        """Append one behavioral sample to the behavioral history."""
        with self.lock:
            self.behavioral_history.append(features)

    @staticmethod
    def _evaluate_rules(rules) -> Tuple[float, str]:
        """
        Reduce ``(triggered, weight, label)`` rules to ``(score, reason)``.

        The score is the sum of the weights of the triggered rules, capped at
        1.0.  The reason is the comma-joined labels of the triggered rules in
        rule order, or ``"normal"`` when none triggered.
        """
        score = 0.0
        labels = []
        for triggered, weight, label in rules:
            if triggered:
                score += weight
                labels.append(label)
        reason = ",".join(labels) if labels else "normal"
        return min(score, 1.0), reason

    @staticmethod
    def _audio_rules(hesitations: float, speech_rate: float, pause_freq: float):
        """Threshold rules for audio, shared by scoring and reason text."""
        return (
            (hesitations > 5, 0.4, "frequent_hesitations"),
            (speech_rate < 120, 0.3, "slow_speech"),
            (pause_freq > 3, 0.3, "frequent_pauses"),
        )

    @staticmethod
    def _biometric_rules(hr: float, hrv: float, gsr: float):
        """Threshold rules for biometrics, shared by scoring and reason text."""
        return (
            (hr > 85, 0.3, "elevated_heart_rate"),
            (hrv < 30, 0.3, "low_hrv"),
            (gsr > 10, 0.4, "high_arousal"),
        )

    @staticmethod
    def _behavioral_rules(mouse_h: float, scrolls: float, back_btn: float):
        """Threshold rules for behavior, shared by scoring and reason text.

        The weights sum to 0.8, so the behavioral score never reaches 1.0.
        """
        return (
            (mouse_h > 3, 0.3, "mouse_hesitation"),
            (scrolls > 5, 0.3, "scroll_reversals"),
            (back_btn > 3, 0.2, "back_button_usage"),
        )

    def _score_audio(self, samples: list) -> Tuple[float, str]:
        """Score an audio snapshot using the mean of its last 10 samples."""
        if not samples:
            return 0.0, "no_audio"
        recent = samples[-10:]
        speech_rate = np.mean([f.speech_rate for f in recent])
        hesitations = np.mean([f.hesitations for f in recent])
        pause_freq = np.mean([f.pause_frequency for f in recent])
        return self._evaluate_rules(
            self._audio_rules(hesitations, speech_rate, pause_freq)
        )

    def _score_biometric(self, samples: list) -> Tuple[float, str]:
        """Score a biometric snapshot using the mean of its last 20 samples."""
        if not samples:
            return 0.0, "no_biometric"
        recent = samples[-20:]
        hr = np.mean([f.heart_rate for f in recent])
        hrv = np.mean([f.heart_rate_variability for f in recent])
        gsr = np.mean([f.skin_conductance for f in recent])
        return self._evaluate_rules(self._biometric_rules(hr, hrv, gsr))

    def _score_behavioral(self, samples: list) -> Tuple[float, str]:
        """Score a behavioral snapshot using the mean of its last 20 samples."""
        if not samples:
            return 0.0, "no_behavioral"
        recent = samples[-20:]
        mouse_h = np.mean([f.mouse_hesitation for f in recent])
        scrolls = np.mean([f.scroll_reversals for f in recent])
        back_btn = np.mean([f.back_button for f in recent])
        return self._evaluate_rules(
            self._behavioral_rules(mouse_h, scrolls, back_btn)
        )

    def get_audio_confusion(self) -> Tuple[float, str]:
        """
        Get the confusion score from audio signals.

        Returns:
            ``(score, reason)``.  ``(0.0, "no_audio")`` when no audio sample
            has been recorded; otherwise a score in [0, 1] that grows with
            slow speech (< 120), many hesitations (> 5) and frequent pauses
            (> 3), plus the comma-joined names of the triggered indicators.
        """
        with self.lock:
            samples = list(self.audio_history)
        return self._score_audio(samples)

    def _get_audio_reason(self, hesitations: float, speech_rate: float, pause_freq: float) -> str:
        """Return the reason string for the given averaged audio values."""
        return self._evaluate_rules(
            self._audio_rules(hesitations, speech_rate, pause_freq)
        )[1]

    def get_biometric_confusion(self) -> Tuple[float, str]:
        """
        Get the confusion score from biometric signals.

        Returns:
            ``(score, reason)``.  ``(0.0, "no_biometric")`` when no biometric
            sample has been recorded; otherwise a score in [0, 1] that grows
            with elevated heart rate (> 85), low HRV (< 30) and elevated skin
            conductance (> 10), plus the names of the triggered indicators.
        """
        with self.lock:
            samples = list(self.biometric_history)
        return self._score_biometric(samples)

    def _get_biometric_reason(self, hr: float, hrv: float, gsr: float) -> str:
        """Return the reason string for the given averaged biometric values."""
        return self._evaluate_rules(self._biometric_rules(hr, hrv, gsr))[1]

    def get_behavioral_confusion(self) -> Tuple[float, str]:
        """
        Get the confusion score from behavioral signals.

        Returns:
            ``(score, reason)``.  ``(0.0, "no_behavioral")`` when no
            behavioral sample has been recorded; otherwise a score that grows
            with mouse hesitation (> 3), scroll reversals (> 5) and
            back-button usage (> 3), plus the names of the triggered
            indicators.
        """
        with self.lock:
            samples = list(self.behavioral_history)
        return self._score_behavioral(samples)

    def _get_behavioral_reason(self, mouse_h: float, scrolls: float, back_btn: float) -> str:
        """Return the reason string for the given averaged behavioral values."""
        return self._evaluate_rules(
            self._behavioral_rules(mouse_h, scrolls, back_btn)
        )[1]

    def get_fused_confusion(self) -> Dict[str, Any]:
        """
        Get the fused multi-modal confusion assessment.

        All three histories are copied under a single lock acquisition so the
        scores, the confidence and ``available_modalities`` all describe the
        same buffer state even while other threads keep appending.

        Returns:
            Dict with the fused ``confusion_score`` (weighted sum of the
            modality scores), ``confidence`` (fraction of the three
            modalities that have at least one sample), ``primary_indicator``
            (modality with the largest weighted contribution; ``'audio'``
            when all contributions tie), the per-modality scores and
            reasons, a ``suggested_action`` and ``available_modalities``.
        """
        with self.lock:
            audio_samples = list(self.audio_history)
            biometric_samples = list(self.biometric_history)
            behavioral_samples = list(self.behavioral_history)

        audio_score, audio_reason = self._score_audio(audio_samples)
        biometric_score, biometric_reason = self._score_biometric(biometric_samples)
        behavioral_score, behavioral_reason = self._score_behavioral(behavioral_samples)

        # Weighted contribution of each modality; a modality without data
        # contributes 0 (the weights are not renormalized).
        contributions = {
            'audio': audio_score * self.audio_weight,
            'biometric': biometric_score * self.biometric_weight,
            'behavioral': behavioral_score * self.behavioral_weight
        }
        fused_score = (
            contributions['audio'] +
            contributions['biometric'] +
            contributions['behavioral']
        )

        available = {
            'audio': bool(audio_samples),
            'biometric': bool(biometric_samples),
            'behavioral': bool(behavioral_samples)
        }
        # Confidence based on signal availability
        confidence = min(sum(available.values()) / 3.0, 1.0)

        # max() returns the first key on ties, i.e. 'audio' when nothing fired.
        primary_indicator = max(contributions, key=contributions.get)

        return {
            'confusion_score': fused_score,
            'confidence': confidence,
            'primary_indicator': primary_indicator,
            'audio_score': audio_score,
            'biometric_score': biometric_score,
            'behavioral_score': behavioral_score,
            'audio_reason': audio_reason,
            'biometric_reason': biometric_reason,
            'behavioral_reason': behavioral_reason,
            'suggested_action': self._get_suggested_action(fused_score, primary_indicator),
            'available_modalities': available
        }

    def _get_suggested_action(self, score: float, primary: str) -> str:
        """
        Map a fused score to a suggested intervention.

        ``primary`` is accepted for interface compatibility but does not
        influence the result.
        """
        if score < 0.3:
            return "continue_learning"
        elif score < 0.5:
            return "offer_hint"
        elif score < 0.7:
            return "trigger_ai_explanation"
        else:
            return "pause_and_assess"

    def reset(self):
        """Clear all three history buffers."""
        with self.lock:
            self.audio_history.clear()
            self.biometric_history.clear()
            self.behavioral_history.clear()
324
+
325
+
326
class AudioAnalyzer:
    """
    Real-time audio analysis for confusion detection.

    Requires: microphone input (simulated for now).  Samples are amplitude
    values pushed in by the caller; only pause statistics and mean volume are
    derived from them.
    """

    def __init__(self):
        # Holds at most the 1000 most recent samples.
        self.sample_buffer = deque(maxlen=1000)
        self.is_recording = False
        # Samples per second; used to convert sample counts to time.
        self.sample_rate = 16000

    def start_recording(self):
        """Start audio capture and discard any previously buffered samples."""
        self.is_recording = True
        self.sample_buffer.clear()

    def stop_recording(self):
        """Stop audio capture; buffered samples are kept."""
        self.is_recording = False

    def add_audio_sample(self, amplitude: float):
        """Buffer one amplitude sample; ignored unless recording is active."""
        if self.is_recording:
            self.sample_buffer.append({
                'amplitude': amplitude,
                'timestamp': time.time()
            })

    def analyze(self) -> AudioFeatures:
        """
        Analyze the audio buffer and extract features.

        Returns:
            An ``AudioFeatures`` with ``pause_frequency`` (pauses per
            minute), ``pause_duration`` (mean pause length in ms) and
            ``volume_level`` (mean amplitude) filled in; all other fields
            keep their defaults.  With fewer than 100 buffered samples a
            default ``AudioFeatures`` is returned.
        """
        if len(self.sample_buffer) < 100:
            return AudioFeatures()

        # Explicit array so the elementwise comparison below does not rely
        # on list-vs-NumPy-scalar operator coercion.
        amplitudes = np.asarray([s['amplitude'] for s in self.sample_buffer])

        features = AudioFeatures()

        # A sample counts as "pause" when it is below 30% of the mean amplitude.
        threshold = np.mean(amplitudes) * 0.3
        is_pause = amplitudes < threshold

        # Lengths (in samples) of each run of consecutive pause samples.
        pause_durations = []
        current_pause = 0
        for p in is_pause:
            if p:
                current_pause += 1
            elif current_pause > 0:
                pause_durations.append(current_pause)
                current_pause = 0
        # A pause still open at the end of the buffer is a pause too;
        # without this flush it would be silently dropped.
        if current_pause > 0:
            pause_durations.append(current_pause)

        # The buffer is treated as len/sample_rate seconds of audio.
        buffer_seconds = len(amplitudes) / self.sample_rate
        features.pause_frequency = len(pause_durations) / buffer_seconds * 60
        features.pause_duration = (
            np.mean(pause_durations) * 1000 / self.sample_rate
            if pause_durations else 0.0
        )

        # Volume level
        features.volume_level = np.mean(amplitudes)

        return features
386
+
387
+
388
class BiometricProcessor:
    """
    Processes biometric data for confusion detection.

    Supports: heart rate monitors, GSR sensors, eye trackers.  Readings are
    buffered as they arrive and averaged per channel on ``analyze``.
    """

    # Maps each reading key stored by add_reading to the BiometricFeatures
    # attribute that receives its average.
    _CHANNELS = (
        ('heart_rate', 'heart_rate'),
        ('hrv', 'heart_rate_variability'),
        ('gsr', 'skin_conductance'),
        ('skin_temp', 'skin_temperature'),
        ('blink_rate', 'eye_blink_rate'),
        ('eye_open', 'eye_open_duration'),
    )

    def __init__(self):
        # Holds at most the 60 most recent readings.
        self.data_buffer = deque(maxlen=60)

    def add_reading(
        self,
        heart_rate: Optional[float] = None,
        hrv: Optional[float] = None,
        gsr: Optional[float] = None,
        skin_temp: Optional[float] = None,
        blink_rate: Optional[float] = None,
        eye_open: Optional[float] = None
    ):
        """
        Buffer one biometric reading, timestamped with the current time.

        Any channel may be omitted; ``None`` marks it as missing for this
        reading.
        """
        self.data_buffer.append({
            'heart_rate': heart_rate,
            'hrv': hrv,
            'gsr': gsr,
            'skin_temp': skin_temp,
            'blink_rate': blink_rate,
            'eye_open': eye_open,
            'timestamp': time.time()
        })

    def analyze(self) -> BiometricFeatures:
        """
        Average every buffered channel into a ``BiometricFeatures``.

        Returns:
            A ``BiometricFeatures`` in which each field is the mean of the
            non-missing buffered values for that channel; a channel with no
            values keeps its default.  With fewer than 5 buffered readings a
            default ``BiometricFeatures`` is returned.
        """
        if len(self.data_buffer) < 5:
            return BiometricFeatures()

        features = BiometricFeatures()

        for key, attr in self._CHANNELS:
            # Only None means "missing"; a reading of 0.0 is a real value
            # and must take part in the average.
            values = [d[key] for d in self.data_buffer if d[key] is not None]
            if values:
                setattr(features, attr, np.mean(values))

        return features
437
+
438
+
439
+ # API integration
440
class MultiModalAPI:
    """
    Entry point for multi-modal confusion detection.

    Wraps a ``MultiModalFusion`` together with an ``AudioAnalyzer`` and a
    ``BiometricProcessor``: raw inputs are converted to feature objects and
    forwarded to the fusion.  (The original description calls this a REST
    API; no HTTP layer is defined in this class.)
    """

    def __init__(self, fusion: MultiModalFusion):
        """
        Args:
            fusion: The fusion instance that receives all extracted features.
        """
        self.fusion = fusion
        self.audio_analyzer = AudioAnalyzer()
        self.biometric_processor = BiometricProcessor()
        # The analyzer drops every sample until recording is started, so
        # start it here; otherwise process_audio would never buffer anything.
        self.audio_analyzer.start_recording()

    def process_audio(self, amplitude: float):
        """
        Buffer one audio amplitude sample and push fresh audio features.

        Returns:
            The features produced by the audio analyzer for the current
            buffer (these are also appended to the fusion's audio history).
        """
        self.audio_analyzer.add_audio_sample(amplitude)
        features = self.audio_analyzer.analyze()
        self.fusion.update_audio(features)
        return features

    def process_biometric(
        self,
        heart_rate: Optional[float] = None,
        hrv: Optional[float] = None,
        gsr: Optional[float] = None,
        skin_temp: Optional[float] = None,
        blink_rate: Optional[float] = None,
        eye_open: Optional[float] = None
    ):
        """
        Buffer one biometric reading and push fresh biometric features.

        Every channel is optional; omitted channels are recorded as missing.

        Returns:
            The features produced by the biometric processor for the current
            buffer (these are also appended to the fusion's biometric
            history).
        """
        self.biometric_processor.add_reading(
            heart_rate=heart_rate,
            hrv=hrv,
            gsr=gsr,
            skin_temp=skin_temp,
            blink_rate=blink_rate,
            eye_open=eye_open
        )
        features = self.biometric_processor.analyze()
        self.fusion.update_biometric(features)
        return features

    def process_behavioral(
        self,
        mouse_hesitation: float = 0,
        scroll_reversals: float = 0,
        time_on_page: float = 0,
        click_frequency: float = 0,
        back_button: float = 0,
        tab_switches: float = 0,
        copy_attempts: float = 0,
        search_usage: float = 0
    ):
        """
        Build a behavioral sample from the given values and push it.

        All ``BehavioralFeatures`` fields can be supplied, including
        ``back_button``, which the fusion's behavioral scoring reads.

        Returns:
            The ``BehavioralFeatures`` instance that was appended to the
            fusion's behavioral history.
        """
        features = BehavioralFeatures(
            mouse_hesitation=mouse_hesitation,
            scroll_reversals=scroll_reversals,
            time_on_page=time_on_page,
            click_frequency=click_frequency,
            back_button=back_button,
            tab_switches=tab_switches,
            copy_attempts=copy_attempts,
            search_usage=search_usage
        )
        self.fusion.update_behavioral(features)
        return features

    def get_confusion_assessment(self) -> Dict:
        """Return the fusion's current fused confusion assessment."""
        return self.fusion.get_fused_confusion()
489
+
490
+
491
+ # Demo
492
if __name__ == "__main__":
    # Small end-to-end demonstration: feed synthetic readings through the
    # API, then print the fused assessment.
    fusion = MultiModalFusion()
    api = MultiModalAPI(fusion)

    print("Multi-Modal Confusion Detection Demo")
    print("=" * 40)

    for step in range(20):
        # Audio amplitude drops from 0.3 to 0.1 after the tenth step.
        amplitude = 0.3 if step < 10 else 0.1
        api.process_audio(amplitude=amplitude)

        # Heart rate and skin conductance drift upwards while HRV drifts down.
        heart_rate = 75 + step * 0.5
        hrv = 40 - step * 0.3
        gsr = 8 + step * 0.2
        api.process_biometric(heart_rate=heart_rate, hrv=hrv, gsr=gsr)

        # Mouse hesitation, scroll reversals and time on page all increase.
        mouse_hesitation = 2 + step * 0.2
        scroll_reversals = 3 + step * 0.3
        time_on_page = 60 + step * 3
        api.process_behavioral(
            mouse_hesitation=mouse_hesitation,
            scroll_reversals=scroll_reversals,
            time_on_page=time_on_page,
        )

    # Fused assessment over everything fed in above.
    result = api.get_confusion_assessment()

    print(f"Confusion Score: {result['confusion_score']:.2f}")
    print(f"Confidence: {result['confidence']:.2f}")
    print(f"Primary Indicator: {result['primary_indicator']}")
    print(f"Biometric Score: {result['biometric_score']:.2f}")
    print(f"Behavioral Score: {result['behavioral_score']:.2f}")
    print(f"Suggested Action: {result['suggested_action']}")