namish10
/

contextflow-rl

+"""
+Online Learning Module for ContextFlow
+Implements continuous model improvement from real user interactions.
+Addresses: Online Learning requirement
+"""
+import numpy as np
+import pickle
+from typing import Dict, List, Optional, Any, Tuple
+from dataclasses import dataclass, field
+from collections import deque
+import threading
+import time
+import json
+@dataclass
+class InteractionSample:
+    """A single interaction sample for online learning"""
+    state: np.ndarray
+    action: int
+    reward: float
+    next_state: np.ndarray
+    done: bool
+    timestamp: float
+    user_id: str
+    confidence: float = 0.0
+    def to_dict(self) -> Dict:
+        return {
+            'state': self.state.tolist(),
+            'action': self.action,
+            'reward': self.reward,
+            'next_state': self.next_state.tolist(),
+            'done': self.done,
+            'timestamp': self.timestamp,
+            'user_id': self.user_id,
+            'confidence': self.confidence
+        }
+@dataclass
+class OnlineQNetwork:
+    """Q-Network for online learning"""
+    weights: Dict[str, np.ndarray]
+    biases: Dict[str, np.ndarray]
+    version: int = 1
+    def forward(self, state: np.ndarray) -> np.ndarray:
+        """Forward pass through network"""
+        # Layer 1
+        h1 = np.maximum(np.dot(state, self.weights['l1']) + self.biases['b1'], 0)
+        # Layer 2
+        h2 = np.maximum(np.dot(h1, self.weights['l2']) + self.biases['b2'], 0)
+        # Output
+        q_values = np.dot(h2, self.weights['l3']) + self.biases['b3']
+        return q_values
+    def clone_from(self, source: 'OnlineQNetwork'):
+        """Clone weights from another network"""
+        self.weights = {k: v.copy() for k, v in source.weights.items()}
+        self.biases = {k: v.copy() for k, v in source.biases.items()}
+        self.version = source.version + 1
+class OnlineLearningEngine:
+    """
+    Online learning engine for continuous model improvement.
+    Features:
+    - Incremental updates from user feedback
+    - Experience replay buffer
+    - Target network for stability
+    - Periodic checkpointing
+    """
+    def __init__(
+        self,
+        state_dim: int = 64,
+        action_dim: int = 10,
+        hidden_dim: int = 128,
+        learning_rate: float = 0.001,
+        gamma: float = 0.95,
+        batch_size: int = 32,
+        buffer_size: int = 10000,
+        target_update_freq: int = 100
+    ):
+        self.state_dim = state_dim
+        self.action_dim = action_dim
+        self.learning_rate = learning_rate
+        self.gamma = gamma
+        self.batch_size = batch_size
+        self.target_update_freq = target_update_freq
+        # Initialize networks
+        self.q_network = self._init_network()
+        self.target_network = self._init_network()
+        self._sync_target()
+        # Experience replay buffer
+        self.replay_buffer = deque(maxlen=buffer_size)
+        # Training stats
+        self.total_updates = 0
+        self.update_count = 0
+        # Lock for thread safety
+        self.lock = threading.Lock()
+        # Callbacks for events
+        self.on_checkpoint = None
+        self.on_update = None
+    def _init_network(self) -> OnlineQNetwork:
+        """Initialize network weights"""
+        np.random.seed(42)
+        return OnlineQNetwork(
+            weights={
+                'l1': np.random.randn(self.state_dim, self.hidden_dim) * 0.1,
+                'l2': np.random.randn(self.hidden_dim, self.hidden_dim) * 0.1,
+                'l3': np.random.randn(self.hidden_dim, self.action_dim) * 0.1
+            },
+            biases={
+                'b1': np.zeros(self.hidden_dim),
+                'b2': np.zeros(self.hidden_dim),
+                'b3': np.zeros(self.action_dim)
+            },
+            version=1
+        )
+    def _sync_target(self):
+        """Copy Q-network to target network"""
+        self.target_network.clone_from(self.q_network)
+    def add_interaction(
+        self,
+        state: np.ndarray,
+        action: int,
+        reward: float,
+        next_state: np.ndarray,
+        done: bool,
+        user_id: str = 'anonymous',
+        confidence: float = 0.0
+    ):
+        """Add a new interaction to the replay buffer"""
+        sample = InteractionSample(
+            state=state,
+            action=action,
+            reward=reward,
+            next_state=next_state,
+            done=done,
+            timestamp=time.time(),
+            user_id=user_id,
+            confidence=confidence
+        )
+        with self.lock:
+            self.replay_buffer.append(sample)
+        # Trigger online update
+        if len(self.replay_buffer) >= self.batch_size:
+            self.update()
+    def update(self) -> Optional[Dict]:
+        """Perform a single online update"""
+        with self.lock:
+            if len(self.replay_buffer) < self.batch_size:
+                return None
+            # Sample batch
+            indices = np.random.choice(len(self.replay_buffer), self.batch_size, replace=False)
+            batch = [self.replay_buffer[i] for i in indices]
+            # Extract batch arrays
+            states = np.array([s.state for s in batch])
+            actions = np.array([s.action for s in batch])
+            rewards = np.array([s.reward for s in batch])
+            next_states = np.array([s.next_state for s in batch])
+            dones = np.array([s.done for s in batch])
+            # Compute targets
+            current_q = self.q_network.forward(states)
+            next_q = self.target_network.forward(next_states)
+            targets = current_q.copy()
+            max_next_q = np.max(next_q, axis=1)
+            for i in range(self.batch_size):
+                if dones[i]:
+                    targets[i, actions[i]] = rewards[i]
+                else:
+                    targets[i, actions[i]] = rewards[i] + self.gamma * max_next_q[i]
+            # Compute gradients and update (simplified SGD)
+            # In production, would use PyTorch autograd
+            errors = targets - current_q
+            # Gradient descent on layer 3
+            h2 = np.maximum(np.dot(states, self.q_network.weights['l1']) + self.q_network.biases['b1'], 0)
+            h3 = np.maximum(np.dot(h2, self.q_network.weights['l2']) + self.q_network.biases['b2'], 0)
+            for i in range(self.batch_size):
+                grad_l3 = np.outer(h3[i], errors[i])
+                grad_b3 = errors[i]
+                self.q_network.weights['l3'] += self.learning_rate * grad_l3
+                self.q_network.biases['b3'] += self.learning_rate * grad_b3
+            # Update target network periodically
+            self.update_count += 1
+            if self.update_count % self.target_update_freq == 0:
+                self._sync_target()
+            self.total_updates += 1
+            loss = np.mean(errors ** 2)
+            result = {
+                'loss': float(loss),
+                'updates': self.total_updates,
+                'buffer_size': len(self.replay_buffer)
+            }
+            if self.on_update:
+                self.on_update(result)
+            return result
+    def predict(self, state: np.ndarray) -> Tuple[int, float]:
+        """Predict best action for a state"""
+        q_values = self.q_network.forward(state)
+        action = int(np.argmax(q_values))
+        confidence = float(np.max(q_values))
+        return action, confidence
+    def get_q_values(self, state: np.ndarray) -> np.ndarray:
+        """Get Q-values for all actions"""
+        return self.q_network.forward(state)
+    def save_checkpoint(self, path: str):
+        """Save model checkpoint"""
+        checkpoint = {
+            'q_network': {
+                'weights': {k: v.tolist() for k, v in self.q_network.weights.items()},
+                'biases': {k: v.tolist() for k, v in self.q_network.biases.items()},
+                'version': self.q_network.version
+            },
+            'total_updates': self.total_updates,
+            'buffer_size': len(self.replay_buffer)
+        }
+        with open(path, 'w') as f:
+            json.dump(checkpoint, f)
+        if self.on_checkpoint:
+            self.on_checkpoint(path)
+        return path
+    def load_checkpoint(self, path: str):
+        """Load model checkpoint"""
+        with open(path, 'r') as f:
+            checkpoint = json.load(f)
+        self.q_network.weights = {k: np.array(v) for k, v in checkpoint['q_network']['weights'].items()}
+        self.q_network.biases = {k: np.array(v) for k, v in checkpoint['q_network']['biases'].items()}
+        self.q_network.version = checkpoint['q_network']['version']
+        self.total_updates = checkpoint['total_updates']
+        self._sync_target()
+        return checkpoint
+    def get_stats(self) -> Dict:
+        """Get learning statistics"""
+        with self.lock:
+            return {
+                'total_updates': self.total_updates,
+                'buffer_size': len(self.replay_buffer),
+                'buffer_capacity': self.replay_buffer.maxlen,
+                'network_version': self.q_network.version
+            }
+class AdaptiveLearningScheduler:
+    """
+    Adaptive learning rate scheduler based on performance.
+    Reduces learning rate when performance plateaus.
+    Increases when making good progress.
+    """
+    def __init__(
+        self,
+        initial_lr: float = 0.001,
+        min_lr: float = 0.00001,
+        patience: int = 10,
+        factor: float = 0.5
+    ):
+        self.current_lr = initial_lr
+        self.min_lr = min_lr
+        self.patience = patience
+        self.factor = factor
+        self.best_loss = float('inf')
+        self.wait_count = 0
+        self.history = []
+    def step(self, loss: float) -> float:
+        """Update learning rate based on loss"""
+        self.history.append(loss)
+        if len(self.history) < 2:
+            return self.current_lr
+        if loss < self.best_loss:
+            self.best_loss = loss
+            self.wait_count = 0
+        else:
+            self.wait_count += 1
+        if self.wait_count >= self.patience and self.current_lr > self.min_lr:
+            self.current_lr *= self.factor
+            self.wait_count = 0
+        return self.current_lr
+# API Integration
+class OnlineLearningAPI:
+    """REST API wrapper for online learning"""
+    def __init__(self, engine: OnlineLearningEngine):
+        self.engine = engine
+    def record_feedback(
+        self,
+        user_id: str,
+        state: List[float],
+        action: int,
+        quality: int,  # 1-5 quality rating
+        comment: Optional[str] = None
+    ) -> Dict:
+        """
+        Record user feedback and trigger online update.
+        Quality mapping:
+        - 1: Very unhelpful (-1.0)
+        - 2: Unhelpful (-0.5)
+        - 3: Neutral (0.0)
+        - 4: Helpful (0.5)
+        - 5: Very helpful (1.0)
+        """
+        reward_map = {1: -1.0, 2: -0.5, 3: 0.0, 4: 0.5, 5: 1.0}
+        reward = reward_map.get(quality, 0.0)
+        state_arr = np.array(state)
+        # Simulate next state (in real impl, would come from actual interaction)
+        next_state = state_arr + np.random.randn(len(state_arr)) * 0.1
+        self.engine.add_interaction(
+            state=state_arr,
+            action=action,
+            reward=reward,
+            next_state=next_state,
+            done=False,
+            user_id=user_id,
+            confidence=reward
+        )
+        return {
+            'status': 'recorded',
+            'reward': reward,
+            'total_updates': self.engine.total_updates
+        }
+    def get_prediction(self, state: List[float]) -> Dict:
+        """Get prediction for a state"""
+        state_arr = np.array(state)
+        action, confidence = self.engine.predict(state_arr)
+        q_values = self.engine.get_q_values(state_arr)
+        return {
+            'action': action,
+            'confidence': confidence,
+            'q_values': q_values.tolist()
+        }
+    def get_stats(self) -> Dict:
+        """Get learning stats"""
+        return self.engine.get_stats()
+# Example usage
+if __name__ == "__main__":
+    engine = OnlineLearningEngine()
+    api = OnlineLearningAPI(engine)
+    print("Online Learning Engine initialized")
+    print(f"State dim: {engine.state_dim}, Action dim: {engine.action_dim}")
+    # Simulate some feedback
+    for i in range(100):
+        state = np.random.randn(64)
+        action = np.random.randint(0, 10)
+        quality = np.random.randint(1, 6)
+        result = api.record_feedback(
+            user_id='test_user',
+            state=state.tolist(),
+            action=action,
+            quality=quality
+        )
+    print(f"\\nAfter 100 interactions:")
+    print(f"  Updates: {result['total_updates']}")
+    print(f"  Stats: {api.get_stats()}")