| """ |
| Feature Extraction Module for ContextFlow RL Model |
| |
| This module extracts the 64-dimensional state vector used by the RL model |
| for doubt prediction. |
| |
| State Vector Structure (64 features): |
├── Topic Embedding (32 dims) - TF-IDF of learning topic
├── Progress (1 dim) - Session progress 0.0-1.0
├── Confusion Signals (16 dims) - Behavioral indicators
├── Gesture Signals (14 dims) - Hand gesture frequencies
└── Time Spent (1 dim) - Normalized session time
| """ |
|
|
| import numpy as np |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from typing import Dict, List, Optional |
|
|
|
|
class FeatureExtractor:
    """Extract the 64-dimensional state vector for the RL doubt-prediction model.

    Layout (in order): topic embedding (32) + progress (1) +
    confusion signals (16) + gesture signals (14) + time spent (1) = 64,
    with every feature normalized into [0.0, 1.0].
    """

    def __init__(self):
        self.state_dim = 64

        # TF-IDF over a fixed corpus of learning topics; max_features caps
        # the vocabulary so the embedding slice never exceeds 32 dims.
        self.topic_vectorizer = TfidfVectorizer(max_features=32)
        self._fit_topic_vectorizer()

        # Order matters: index i of the confusion sub-vector corresponds to
        # confusion_signal_names[i] (same for gestures below).
        self.confusion_signal_names = [
            'mouse_hesitation', 'scroll_reversals', 'time_on_page',
            'eye_tracking_x', 'eye_tracking_y', 'page_scrolling',
            'click_frequency', 'back_button', 'tab_switches',
            'copy_attempts', 'zoom_level', 'scroll_speed',
            'reading_pauses', 'search_usage', 'bookmark_usage', 'print_usage'
        ]

        self.gesture_signal_names = [
            'pinch', 'swipe_up', 'swipe_down', 'swipe_left', 'swipe_right',
            'two_finger_swipe', 'point', 'wave', 'thumbs_up', 'thumbs_down',
            'fist', 'open_palm', 'rotation', 'zoom_gesture'
        ]

        # Per-signal divisor that maps raw readings onto roughly [0, 1];
        # signals without an entry are assumed to already lie in [0, 1].
        self._confusion_scales = {
            'mouse_hesitation': 5.0,
            'scroll_reversals': 10.0,
            'time_on_page': 300.0,
        }

    def _fit_topic_vectorizer(self):
        """Fit TF-IDF on a fixed corpus of common learning topics."""
        topics = [
            'machine learning', 'deep learning', 'neural networks',
            'python programming', 'data science', 'statistics',
            'linear algebra', 'calculus', 'probability',
            'natural language processing', 'computer vision',
            'reinforcement learning', 'supervised learning', 'unsupervised learning',
            'classification', 'regression', 'clustering',
            'backpropagation', 'gradient descent', 'optimization',
            'transformers', 'attention mechanism', 'bert', 'gpt',
            'cnn', 'rnn', 'lstm', 'gru',
            'overfitting', 'underfitting', 'regularization',
            'cross validation', 'hyperparameters', 'training'
        ]
        self.topic_vectorizer.fit(topics)

    def extract_topic_embedding(self, topic: str) -> np.ndarray:
        """Return a 32-dimensional TF-IDF embedding of *topic*.

        The topic is lowercased before vectorizing; the result is
        zero-padded (or truncated) to exactly 32 entries.
        """
        topic_vec = self.topic_vectorizer.transform([topic.lower()]).toarray()[0]

        # The fitted vocabulary can be smaller than max_features, so the
        # transform output may be shorter than 32 — pad with zeros.
        if len(topic_vec) < 32:
            topic_vec = np.pad(topic_vec, (0, 32 - len(topic_vec)))

        return topic_vec[:32]

    def extract_confusion_signals(self, signals: Dict) -> np.ndarray:
        """
        Extract 16-dimensional confusion signal vector.

        Args:
            signals: Dict with keys like 'mouse_hesitation', 'scroll_reversals', etc.
                Missing keys produce a 0.0 feature.

        Returns:
            Confusion signals normalized and clamped into [0.0, 1.0].
        """
        result = np.zeros(16)

        for i, name in enumerate(self.confusion_signal_names):
            if name not in signals:
                continue
            value = float(signals[name])

            if 'eye_tracking' in name:
                # Eye-tracking coordinates can be negative; magnitude only.
                value = abs(value)
            else:
                value = value / self._confusion_scales.get(name, 1.0)

            # Clamp into [0, 1] so negative or outlier raw readings cannot
            # leak out-of-range features into the state vector (the old
            # one-sided min() let negatives through).
            result[i] = float(np.clip(value, 0.0, 1.0))

        return result

    def extract_gesture_signals(self, gestures: Dict) -> np.ndarray:
        """
        Extract 14-dimensional gesture signal vector.

        Args:
            gestures: Dict with gesture counts or frequencies.
                Missing keys produce a 0.0 feature.

        Returns:
            Gesture signals normalized and clamped into [0.0, 1.0];
            a count of 20 or more saturates the feature.
        """
        result = np.zeros(14)

        for i, name in enumerate(self.gesture_signal_names):
            if name in gestures:
                value = float(gestures[name])
                # Clamp (not just min) so negative counts cannot go below 0.
                result[i] = float(np.clip(value / 20.0, 0.0, 1.0))

        return result

    def extract_state(
        self,
        topic: str,
        progress: float,
        confusion_signals: Dict,
        gesture_signals: Dict,
        time_spent: float
    ) -> np.ndarray:
        """
        Extract complete 64-dimensional state vector.

        Args:
            topic: Learning topic string
            progress: Session progress (0.0-1.0, clipped)
            confusion_signals: Dict of behavioral signals
            gesture_signals: Dict of gesture counts
            time_spent: Time spent in seconds (1800 s = 30 min saturates)

        Returns:
            64-dimensional state vector with all features in [0.0, 1.0]
        """
        topic_emb = self.extract_topic_embedding(topic)

        progress_arr = np.array([np.clip(progress, 0.0, 1.0)])

        confusion_arr = self.extract_confusion_signals(confusion_signals)

        gesture_arr = self.extract_gesture_signals(gesture_signals)

        # Clip (not min) so a negative time_spent cannot produce a
        # negative feature, matching the progress handling above.
        time_arr = np.array([np.clip(time_spent / 1800.0, 0.0, 1.0)])

        state = np.concatenate([
            topic_emb,
            progress_arr,
            confusion_arr,
            gesture_arr,
            time_arr
        ])

        # Internal invariant (not input validation): sub-vector lengths are
        # fixed above, so this can only fire on a programming error.
        assert len(state) == 64, f"State vector should be 64 dims, got {len(state)}"

        return state

    def get_feature_names(self) -> List[str]:
        """Return the 64 human-readable feature names, in state-vector order."""
        names = [f"topic_{i}" for i in range(32)]
        names.append("progress")
        names.extend(self.confusion_signal_names)
        names.extend(self.gesture_signal_names)
        names.append("time_spent")
        return names
|
|
|
|
def create_sample_state() -> np.ndarray:
    """Build one example state vector, useful for smoke-testing the pipeline."""
    confusion = {
        'mouse_hesitation': 2.5,
        'scroll_reversals': 4,
        'time_on_page': 120,
        'click_frequency': 8,
        'back_button': 2,
    }
    gestures = {
        'pinch': 5,
        'swipe_right': 3,
        'point': 2,
    }

    return FeatureExtractor().extract_state(
        topic="machine learning",
        progress=0.5,
        confusion_signals=confusion,
        gesture_signals=gestures,
        time_spent=300,
    )
|
|
|
|
if __name__ == "__main__":
    # Smoke test: build one sample state vector and report basic stats.
    # (The previously-constructed throwaway FeatureExtractor was removed —
    # create_sample_state() builds its own, so it only fit TF-IDF twice.)
    state = create_sample_state()

    print(f"State vector shape: {state.shape}")
    print(f"Sum of features: {state.sum():.4f}")
    print(f"Features > 0: {(state > 0).sum()}")
|
|