""" Feature Extraction Module for ContextFlow RL Model This module extracts the 64-dimensional state vector used by the RL model for doubt prediction. State Vector Structure (64 features): ├── Topic Embedding (32 dims) - TF-IDF of learning topic ├── Progress (1 dim) - Session progress 0.0-1.0 ├── Confusion Signals (16 dims) - Behavioral indicators ├── Gesture Signals (14 dims) - Hand gesture frequencies └── Time Spent (1 dim) - Normalized session time """ import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from typing import Dict, List, Optional class FeatureExtractor: """Extract 64-dimensional state vector for RL model""" def __init__(self): self.state_dim = 64 # TF-IDF vectorizer for topics (32 dimensions) self.topic_vectorizer = TfidfVectorizer(max_features=32) self._fit_topic_vectorizer() # Signal names for interpretability self.confusion_signal_names = [ 'mouse_hesitation', 'scroll_reversals', 'time_on_page', 'eye_tracking_x', 'eye_tracking_y', 'page_scrolling', 'click_frequency', 'back_button', 'tab_switches', 'copy_attempts', 'zoom_level', 'scroll_speed', 'reading_pauses', 'search_usage', 'bookmark_usage', 'print_usage' ] self.gesture_signal_names = [ 'pinch', 'swipe_up', 'swipe_down', 'swipe_left', 'swipe_right', 'two_finger_swipe', 'point', 'wave', 'thumbs_up', 'thumbs_down', 'fist', 'open_palm', 'rotation', 'zoom_gesture' ] def _fit_topic_vectorizer(self): """Fit TF-IDF on common learning topics""" topics = [ 'machine learning', 'deep learning', 'neural networks', 'python programming', 'data science', 'statistics', 'linear algebra', 'calculus', 'probability', 'natural language processing', 'computer vision', 'reinforcement learning', 'supervised learning', 'unsupervised learning', 'classification', 'regression', 'clustering', 'backpropagation', 'gradient descent', 'optimization', 'transformers', 'attention mechanism', 'bert', 'gpt', 'cnn', 'rnn', 'lstm', 'gru', 'overfitting', 'underfitting', 'regularization', 'cross validation', 'hyperparameters', 'training' ] self.topic_vectorizer.fit(topics) def extract_topic_embedding(self, topic: str) -> np.ndarray: """Extract 32-dimensional topic embedding""" topic_vec = self.topic_vectorizer.transform([topic.lower()]).toarray()[0] # Ensure 32 dimensions (pad if needed) if len(topic_vec) < 32: topic_vec = np.pad(topic_vec, (0, 32 - len(topic_vec))) return topic_vec[:32] def extract_confusion_signals(self, signals: Dict) -> np.ndarray: """ Extract 16-dimensional confusion signal vector Args: signals: Dict with keys like 'mouse_hesitation', 'scroll_reversals', etc. Returns: Normalized confusion signals (0.0-1.0) """ result = np.zeros(16) for i, name in enumerate(self.confusion_signal_names): if name in signals: value = float(signals[name]) # Normalize based on signal type if name == 'mouse_hesitation': result[i] = min(value / 5.0, 1.0) # 0-5 scale elif name == 'scroll_reversals': result[i] = min(value / 10.0, 1.0) # 0-10 scale elif name == 'time_on_page': result[i] = min(value / 300.0, 1.0) # 0-5min scale elif 'eye_tracking' in name: result[i] = min(abs(value), 1.0) # -1 to 1 scale else: result[i] = min(value, 1.0) return result def extract_gesture_signals(self, gestures: Dict) -> np.ndarray: """ Extract 14-dimensional gesture signal vector Args: gestures: Dict with gesture counts or frequencies Returns: Normalized gesture signals (0.0-1.0) """ result = np.zeros(14) for i, name in enumerate(self.gesture_signal_names): if name in gestures: value = float(gestures[name]) result[i] = min(value / 20.0, 1.0) # Normalize to 0-20 range return result def extract_state( self, topic: str, progress: float, confusion_signals: Dict, gesture_signals: Dict, time_spent: float ) -> np.ndarray: """ Extract complete 64-dimensional state vector Args: topic: Learning topic string progress: Session progress (0.0-1.0) confusion_signals: Dict of behavioral signals gesture_signals: Dict of gesture counts time_spent: Time spent in seconds Returns: 64-dimensional state vector """ # Topic embedding: 32 dims topic_emb = self.extract_topic_embedding(topic) # Progress: 1 dim progress_arr = np.array([np.clip(progress, 0.0, 1.0)]) # Confusion signals: 16 dims confusion_arr = self.extract_confusion_signals(confusion_signals) # Gesture signals: 14 dims gesture_arr = self.extract_gesture_signals(gesture_signals) # Time spent: 1 dim (normalized to 0-30min) time_arr = np.array([min(time_spent / 1800.0, 1.0)]) # Concatenate all features state = np.concatenate([ topic_emb, # 32 dims progress_arr, # 1 dim confusion_arr, # 16 dims gesture_arr, # 14 dims time_arr # 1 dim ]) assert len(state) == 64, f"State vector should be 64 dims, got {len(state)}" return state def get_feature_names(self) -> List[str]: """Get interpretable feature names""" names = [] # Topic features for i in range(32): names.append(f"topic_{i}") names.append("progress") # Confusion signals names.extend(self.confusion_signal_names) # Gesture signals names.extend(self.gesture_signal_names) names.append("time_spent") return names def create_sample_state() -> np.ndarray: """Create a sample state vector for testing""" extractor = FeatureExtractor() return extractor.extract_state( topic="machine learning", progress=0.5, confusion_signals={ 'mouse_hesitation': 2.5, 'scroll_reversals': 4, 'time_on_page': 120, 'click_frequency': 8, 'back_button': 2 }, gesture_signals={ 'pinch': 5, 'swipe_right': 3, 'point': 2 }, time_spent=300 ) if __name__ == "__main__": # Test feature extraction extractor = FeatureExtractor() state = create_sample_state() print(f"State vector shape: {state.shape}") print(f"Sum of features: {state.sum():.4f}") print(f"Features > 0: {(state > 0).sum()}")