| """ |
| ContextFlow RL Training Script |
| |
| Trains the doubt prediction model using reinforcement learning |
| and uploads to Hugging Face. |
| |
| Based on OpenClaw-RL principles: |
| - Binary RL (GRPO) for next-state feedback |
| - Personal agent optimization from user interactions |
| - Q-Learning for doubt prediction |
| |
| Usage: |
| python train_rl.py --mode train --epochs 10 |
| python train_rl.py --mode upload --hf_token YOUR_TOKEN |
| """ |
|
|
| import os |
| import json |
| import pickle |
| import numpy as np |
| from dataclasses import dataclass, asdict |
| from typing import List, Dict, Tuple, Optional |
| from datetime import datetime |
| import argparse |
| from pathlib import Path |
|
|
| try: |
| import torch |
| import torch.nn as nn |
| import torch.optim as optim |
| from torch.utils.data import Dataset, DataLoader |
| HAS_TORCH = True |
| except ImportError: |
| HAS_TORCH = False |
| print("PyTorch not installed. Using numpy-only mode.") |
|
|
| try: |
| from huggingface_hub import HfApi, create_repo, upload_folder |
| HAS_HF = True |
| except ImportError: |
| HAS_HF = False |
| print("huggingface_hub not installed. Run: pip install huggingface_hub") |
|
|
|
|
@dataclass
class LearningState:
    """Snapshot of a learner at one moment of a session.

    Instances are turned into a fixed-size feature vector by
    ``DoubtPredictionRL.encode_state``.
    """

    # Topic vector; the encoder keeps (or zero-pads to) the first 32 entries.
    topic_embedding: np.ndarray
    # Course progress; presumably in [0, 1] -- the synthetic generator draws
    # it from that range. TODO confirm for real data.
    progress: float
    # Confusion indicators; the encoder keeps/pads 8 entries and the reward
    # shaping compares their mean before and after an action.
    confusion_signals: np.ndarray
    # Gesture indicators; the encoder keeps/pads 8 entries.
    gesture_signals: np.ndarray
    # Time on task; the encoder divides it by 3600, so presumably seconds.
    time_spent: float
    # Identifier of the session this snapshot belongs to.
    session_id: str
|
|
|
|
@dataclass
class Interaction:
    """One (state, action, reward, next_state, done) transition for training."""

    # Learner state before the action was taken.
    state: LearningState
    # Name of the action; reward shaping looks for the substrings
    # "got_it", "confused" and "pause" in its lower-cased form.
    action: str
    # Raw reward; DoubtPredictionRL.store_interaction overwrites it with the
    # shaped reward before the transition enters the replay buffer.
    reward: float
    # Learner state observed after the action.
    next_state: LearningState
    # True when the transition ends an episode (no bootstrapping past it).
    done: bool
    # Creation time as a string; the generator uses datetime.isoformat().
    timestamp: str
|
|
|
|
@dataclass
class ModelCheckpoint:
    """Serializable training snapshot that is pickled to disk and uploaded."""

    # Q-network parameters keyed by parameter name (numpy arrays as produced
    # by QNetwork.to_numpy(); empty in numpy-only mode).
    q_network_weights: Dict
    # Number of training steps applied when the snapshot was taken.
    policy_version: int
    # Training bookkeeping (sample count, recent history, epsilon, ...).
    training_stats: Dict
    # Creation time as an ISO-8601 string.
    timestamp: str
    # Hyper-parameters the agent was constructed with.
    config: Dict
|
|
|
|
class QNetwork(nn.Module if HAS_TORCH else object):
    """Q-network for doubt prediction.

    With PyTorch available this is a three-layer MLP
    (state_dim -> hidden_dim -> hidden_dim -> action_dim) with ReLU after the
    first two layers. Without PyTorch it degrades to a stub whose forward
    pass returns zeros, so the rest of the script can still run.

    Args:
        state_dim: Size of the encoded state vector.
        action_dim: Number of Q-value outputs.
        hidden_dim: Width of the two hidden layers.
    """

    def __init__(self, state_dim: int, action_dim: int, hidden_dim: int = 128):
        if HAS_TORCH:
            # nn.Module must be initialised before sub-modules are assigned.
            super().__init__()

        # Keep the dimensions on the instance: the numpy-only forward pass
        # needs action_dim, which was previously never stored.
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        if not HAS_TORCH:
            self.weights = {}
            return

        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return Q-values of shape (batch, action_dim) for a batch of states."""
        if not HAS_TORCH:
            # Stub mode: no parameters, every action is valued at zero.
            return np.zeros((x.shape[0], self.action_dim))
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

    def to_numpy(self):
        """Export the parameters as a ``{name: numpy array}`` dict."""
        if not HAS_TORCH:
            return {}
        return {
            name: tensor.detach().cpu().numpy()
            for name, tensor in self.state_dict().items()
        }

    def from_numpy(self, state_dict):
        """Load parameters from a dict produced by :meth:`to_numpy`.

        Does nothing in numpy-only mode or when ``state_dict`` is empty.
        """
        if not HAS_TORCH or not state_dict:
            return
        self.load_state_dict(
            {name: torch.tensor(values) for name, values in state_dict.items()}
        )
|
|
|
|
class ExperienceReplay:
    """Fixed-capacity FIFO buffer of interactions for off-policy training.

    Args:
        capacity: Maximum number of stored items; once exceeded, the oldest
            item is discarded on each push.
    """

    def __init__(self, capacity: int = 10000):
        self.buffer = []
        self.capacity = capacity

    def push(self, interaction: "Interaction"):
        """Append ``interaction``, evicting the oldest entry when over capacity."""
        self.buffer.append(interaction)
        if len(self.buffer) > self.capacity:
            self.buffer.pop(0)

    def sample(self, batch_size: int) -> List["Interaction"]:
        """Return up to ``batch_size`` distinct stored items, chosen uniformly.

        Indices are sampled instead of the items themselves: handing the
        buffer to ``np.random.choice`` converts it to an ndarray on every
        call and breaks for sequence-like items. Sampling is without
        replacement, which is what capping the size at ``len(buffer)`` implies.

        Returns:
            A list of ``min(batch_size, len(self))`` items; empty when the
            buffer is empty or ``batch_size`` is not positive.
        """
        count = min(batch_size, len(self.buffer))
        if count <= 0:
            return []
        picks = np.random.choice(len(self.buffer), size=count, replace=False)
        return [self.buffer[int(i)] for i in picks]

    def __len__(self):
        return len(self.buffer)
|
|
|
|
class DoubtPredictionRL:
    """
    RL-based doubt prediction agent.

    Features:
    - Q-Learning for doubt probability prediction
    - Experience replay for stable training
    - Shaped, clipped reward signals
    - A separate target network for bootstrapped targets

    Without PyTorch the agent still runs, but predictions are random and
    training steps only bump the policy version.

    Args:
        state_dim: Length of the encoded state vector fed to the network.
        action_dim: Number of Q-network outputs (distinct trainable actions).
        learning_rate: Adam learning rate.
        gamma: Discount factor for the bootstrapped target.
        epsilon: Initial exploration rate for epsilon-greedy selection.
        epsilon_decay: Multiplicative decay applied after each training step.
        epsilon_min: Lower bound for epsilon.
        hidden_dim: Width of the Q-network hidden layers.
        device: Torch device string the networks are moved to.
    """

    def __init__(
        self,
        state_dim: int = 64,
        action_dim: int = 10,
        learning_rate: float = 0.001,
        gamma: float = 0.95,
        epsilon: float = 1.0,
        epsilon_decay: float = 0.995,
        epsilon_min: float = 0.01,
        hidden_dim: int = 128,
        device: str = "cpu"
    ):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.device = device

        # Stable action-name -> Q-output index mapping, filled in first-seen
        # order by train_step and persisted in checkpoints.
        self.action_to_index = {}

        self.q_network = QNetwork(state_dim, action_dim, hidden_dim)
        self.target_network = QNetwork(state_dim, action_dim, hidden_dim)

        if HAS_TORCH:
            self.q_network = self.q_network.to(device)
            self.target_network = self.target_network.to(device)
            # Only torch modules have state dicts; doing this unconditionally
            # crashed the numpy-only mode.
            self.target_network.load_state_dict(self.q_network.state_dict())
            self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
            self.criterion = nn.MSELoss()

        self.replay_buffer = ExperienceReplay()
        self.policy_version = 0
        self.training_history = []

    @staticmethod
    def _fit_length(values, length: int) -> np.ndarray:
        """Truncate or right-zero-pad a 1-D sequence to exactly ``length`` items."""
        arr = np.asarray(values)
        if len(arr) >= length:
            return arr[:length]
        return np.pad(arr, (0, length - len(arr)))

    def _action_index(self, action: str) -> int:
        """Return the Q-output index for ``action``, registering it if new.

        Raises:
            ValueError: If every one of the ``action_dim`` outputs is already
                assigned to a different action.
        """
        key = str(action)
        index = self.action_to_index.get(key)
        if index is None:
            if len(self.action_to_index) >= self.action_dim:
                raise ValueError(
                    f"Cannot register action {key!r}: all {self.action_dim} "
                    "Q-network outputs are already assigned"
                )
            index = len(self.action_to_index)
            self.action_to_index[key] = index
        return index

    def encode_state(self, state: "LearningState") -> np.ndarray:
        """Encode a learning state as a float32 vector of length ``state_dim``.

        Layout before fitting to ``state_dim``: 32 topic values, progress,
        8 confusion values, 8 gesture values, ``time_spent / 3600`` and 7
        small random values (57 entries). The result is zero-padded or
        truncated to ``state_dim``.
        """
        features = np.concatenate([
            self._fit_length(state.topic_embedding, 32),
            [state.progress],
            self._fit_length(state.confusion_signals, 8),
            self._fit_length(state.gesture_signals, 8),
            [state.time_spent / 3600],
            # NOTE(review): fresh noise on every call makes the encoding of
            # the same state non-deterministic; kept as-is -- confirm intent.
            np.random.randn(7) * 0.01,
        ])

        if len(features) < self.state_dim:
            features = np.pad(features, (0, self.state_dim - len(features)))
        elif len(features) > self.state_dim:
            features = features[:self.state_dim]

        return features.astype(np.float32)

    def predict_doubt_probability(self, state: "LearningState") -> np.ndarray:
        """Return a softmax over the Q-values of ``state`` (length ``action_dim``).

        In numpy-only mode the Q-values are small random numbers.
        """
        state_vec = self.encode_state(state)

        if HAS_TORCH:
            state_tensor = torch.FloatTensor(state_vec).unsqueeze(0).to(self.device)
            with torch.no_grad():
                q_values = self.q_network(state_tensor).cpu().numpy()[0]
        else:
            q_values = np.random.randn(self.action_dim) * 0.1

        return self.softmax(q_values)

    def select_action(self, state: "LearningState", training: bool = True) -> int:
        """Pick an action index with an epsilon-greedy policy.

        Exploration only happens when ``training`` is true; otherwise the
        index with the highest predicted probability is returned.
        """
        if training and np.random.random() < self.epsilon:
            return int(np.random.randint(self.action_dim))

        probs = self.predict_doubt_probability(state)
        return int(np.argmax(probs))

    def compute_reward(self, interaction: "Interaction") -> float:
        """Shape the raw reward of ``interaction`` and clip it to [-2, 2].

        Starting from ``interaction.reward``:
        - +1.0 if the action name contains "got_it",
          else -0.5 if it contains "confused",
          else +0.2 if it contains "pause" (case-insensitive);
        - minus twice the change in mean confusion from state to next_state,
          so reduced confusion is rewarded and increased confusion penalised.
        """
        shaped = interaction.reward
        action_name = interaction.action.lower()

        if "got_it" in action_name:
            shaped += 1.0
        elif "confused" in action_name:
            shaped -= 0.5
        elif "pause" in action_name:
            shaped += 0.2

        confusion_delta = (
            interaction.next_state.confusion_signals.mean()
            - interaction.state.confusion_signals.mean()
        )
        shaped -= confusion_delta * 2.0

        # Plain float rather than numpy scalar, matching the annotation.
        return float(np.clip(shaped, -2.0, 2.0))

    def store_interaction(self, interaction: "Interaction"):
        """Replace the interaction's reward with the shaped one and buffer it.

        The interaction is mutated in place, so storing the same object twice
        would apply the shaping twice.
        """
        interaction.reward = self.compute_reward(interaction)
        self.replay_buffer.push(interaction)

    def train_step(self, batch_size: int = 32) -> Dict:
        """Run one Q-learning update on a sampled batch.

        Returns:
            A stats dict. ``{"loss": 0.0, "samples": 0}`` when the buffer
            holds fewer than ``batch_size`` items; in numpy-only mode a dict
            with ``"mode": "numpy"``; otherwise loss, sample count, epsilon
            and policy version.

        Raises:
            ValueError: If the batch introduces more distinct actions than
                the network has outputs.
        """
        if len(self.replay_buffer) < batch_size:
            return {"loss": 0.0, "samples": 0}

        batch = self.replay_buffer.sample(batch_size)

        if not HAS_TORCH:
            self.policy_version += 1
            return {"loss": 0.0, "samples": len(batch), "mode": "numpy"}

        states = np.array([self.encode_state(item.state) for item in batch])
        next_states = np.array([self.encode_state(item.next_state) for item in batch])
        # Use the persistent mapping: rebuilding it from a set per batch gave
        # the same action a different output index from one batch to the next.
        actions = np.array([self._action_index(item.action) for item in batch])
        rewards = np.array([item.reward for item in batch])
        done_flags = [1.0 if item.done else 0.0 for item in batch]

        states_tensor = torch.FloatTensor(states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)

        # squeeze(1) keeps the batch axis even for a batch of one, so the
        # prediction always has the same shape as the target.
        current_q = (
            self.q_network(states_tensor)
            .gather(1, actions_tensor.unsqueeze(1))
            .squeeze(1)
        )

        with torch.no_grad():
            next_states_tensor = torch.FloatTensor(next_states).to(self.device)
            next_q = self.target_network(next_states_tensor).max(1)[0]
            dones = torch.FloatTensor(done_flags).to(self.device)
            # Terminal transitions do not bootstrap from the next state.
            target_q = rewards_tensor + self.gamma * next_q * (1 - dones)

        loss = self.criterion(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), 1.0)
        self.optimizer.step()

        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        self.policy_version += 1

        loss_value = loss.item()
        self.training_history.append({
            "loss": loss_value,
            "epsilon": self.epsilon,
            "policy_version": self.policy_version
        })

        return {
            "loss": loss_value,
            "samples": len(batch),
            "epsilon": self.epsilon,
            "policy_version": self.policy_version
        }

    def update_target_network(self):
        """Copy the online network into the target network (call periodically)."""
        if HAS_TORCH:
            self.target_network.load_state_dict(self.q_network.state_dict())

    def save_checkpoint(self, path: str, config: Dict):
        """Pickle a ModelCheckpoint of the current agent to ``path``.

        Stores the network weights, policy version, the last 100 history
        entries, epsilon, the action-index mapping and ``config``.

        Returns:
            ``path``, unchanged.
        """
        checkpoint = ModelCheckpoint(
            q_network_weights=self.q_network.to_numpy(),
            policy_version=self.policy_version,
            training_stats={
                "total_samples": len(self.replay_buffer),
                "training_history": self.training_history[-100:],
                "epsilon": self.epsilon,
                "action_to_index": dict(self.action_to_index)
            },
            timestamp=datetime.now().isoformat(),
            config=config
        )

        with open(path, 'wb') as f:
            pickle.dump(checkpoint, f)

        print(f"Checkpoint saved to {path}")
        return path

    def load_checkpoint(self, path: str):
        """Restore weights and training state from a pickled checkpoint.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load
        checkpoint files from a trusted source.

        Returns:
            The loaded ModelCheckpoint.
        """
        with open(path, 'rb') as f:
            checkpoint = pickle.load(f)

        self.q_network.from_numpy(checkpoint.q_network_weights)
        # Guarded sync: a no-op in numpy-only mode instead of a crash.
        self.update_target_network()
        self.policy_version = checkpoint.policy_version

        stats = checkpoint.training_stats
        self.training_history = stats.get("training_history", [])
        self.epsilon = stats.get("epsilon", 0.1)
        # Older checkpoints carry no mapping; start from an empty one.
        self.action_to_index = dict(stats.get("action_to_index", {}))

        print(f"Checkpoint loaded from {path}")
        return checkpoint

    @staticmethod
    def softmax(x: np.ndarray) -> np.ndarray:
        """Numerically stable softmax (the maximum is subtracted first)."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum()
|
|
|
|
class SyntheticDataGenerator:
    """Generate random interactions for smoke-testing the training loop.

    All randomness comes from the global ``np.random`` state, so seeding it
    makes the generated data reproducible.
    """

    def __init__(self):
        # Topic names are kept for reference; generation currently uses
        # random topic embeddings and does not read this list.
        self.topics = [
            "machine_learning", "deep_learning", "neural_networks",
            "python", "javascript", "react", "data_science",
            "statistics", "linear_algebra", "calculus"
        ]

    def generate_interaction(self) -> Interaction:
        """Return one random transition.

        The next state perturbs the topic embedding slightly, advances
        progress by 0.01 (capped at 1), shifts confusion by up to +/-0.2 and
        adds 60 to the time spent. ``done`` is set when progress is at least
        0.95.
        """
        topic = np.random.randn(32)
        progress = np.random.uniform(0, 1)
        confusion = np.random.uniform(0, 1)
        gesture = np.random.randn(8)
        time_spent = np.random.uniform(0, 3600)

        state = LearningState(
            topic_embedding=topic,
            progress=progress,
            confusion_signals=np.array([confusion, confusion + 0.1, confusion - 0.1]),
            gesture_signals=gesture,
            time_spent=time_spent,
            session_id=f"sess_{np.random.randint(1000)}"
        )

        actions = ["predict_doubt", "suggest_break", "show_example", "ask_question", "explain_concept"]
        # np.random.choice returns numpy.str_; store a plain str as annotated.
        action = str(np.random.choice(actions))

        # None of the actions above contains "got_it" or "confused", so the
        # former action-specific reward overrides could never run and were
        # removed; the reward is always uniform in [-1, 1).
        reward = np.random.uniform(-1, 1)

        next_confusion = confusion + np.random.uniform(-0.2, 0.2)
        next_state = LearningState(
            topic_embedding=topic + np.random.randn(32) * 0.01,
            progress=min(1, progress + 0.01),
            confusion_signals=np.array([next_confusion]),
            gesture_signals=gesture,
            time_spent=time_spent + 60,
            session_id=state.session_id
        )

        return Interaction(
            state=state,
            action=action,
            reward=reward,
            next_state=next_state,
            done=progress >= 0.95,
            timestamp=datetime.now().isoformat()
        )
|
|
|
|
def generate_training_data(agent: DoubtPredictionRL, num_samples: int = 1000):
    """Fill the agent's replay buffer with synthetic interactions.

    Each generated interaction goes through ``agent.store_interaction``;
    progress is printed after every 100 samples.

    Returns:
        The agent's replay buffer.
    """
    print(f"Generating {num_samples} training samples...")
    source = SyntheticDataGenerator()

    for count in range(1, num_samples + 1):
        agent.store_interaction(source.generate_interaction())
        if count % 100 == 0:
            print(f" Generated {count}/{num_samples} samples")

    print(f"Total samples in buffer: {len(agent.replay_buffer)}")
    return agent.replay_buffer
|
|
|
|
def train_model(
    agent: "DoubtPredictionRL",
    epochs: int = 10,
    batch_size: int = 32,
    update_frequency: int = 10
) -> List[Dict]:
    """Train the RL agent and return per-epoch statistics.

    Every epoch runs ``max(10, len(replay_buffer) // batch_size)`` training
    steps and syncs the target network every ``update_frequency`` steps.

    Args:
        agent: Object providing ``replay_buffer``, ``train_step``,
            ``update_target_network``, ``epsilon`` and ``policy_version``.
        epochs: Number of epochs; zero or less trains nothing.
        batch_size: Batch size passed to ``agent.train_step``.
        update_frequency: Steps between target-network updates.

    Returns:
        One dict per epoch with keys ``epoch``, ``avg_loss``, ``samples``,
        ``epsilon`` and ``policy_version``.

    Raises:
        ValueError: If ``batch_size`` or ``update_frequency`` is not positive
            (previously this surfaced as a bare ZeroDivisionError).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if update_frequency <= 0:
        raise ValueError(f"update_frequency must be positive, got {update_frequency}")

    print(f"\nTraining for {epochs} epochs...")
    print(f"Batch size: {batch_size}, Update frequency: {update_frequency}")

    training_stats = []

    for epoch in range(1, epochs + 1):
        epoch_losses = []
        epoch_samples = 0

        # At least 10 steps per epoch, even with a small buffer.
        steps_per_epoch = max(10, len(agent.replay_buffer) // batch_size)

        for step in range(1, steps_per_epoch + 1):
            stats = agent.train_step(batch_size)
            epoch_losses.append(stats["loss"])
            epoch_samples += stats["samples"]

            if step % update_frequency == 0:
                agent.update_target_network()

        # Plain float so the stats serialize cleanly.
        avg_loss = float(np.mean(epoch_losses)) if epoch_losses else 0.0
        training_stats.append({
            "epoch": epoch,
            "avg_loss": avg_loss,
            "samples": epoch_samples,
            "epsilon": agent.epsilon,
            "policy_version": agent.policy_version
        })

        print(f"Epoch {epoch}/{epochs} - Loss: {avg_loss:.4f} - Samples: {epoch_samples} - Epsilon: {agent.epsilon:.4f}")

    return training_stats
|
|
|
|
def _build_model_card(checkpoint_file: str) -> str:
    """Return the README.md text for the uploaded model repository."""
    return f"""---
language: en
license: apache-2.0
tags:
- reinforcement-learning
- education
- doubt-prediction
- contextflow
---

# ContextFlow RL Doubt Predictor

## Model Description

This is a reinforcement learning model trained for the ContextFlow project - an AI Learning Intelligence Engine that predicts when learners will get confused BEFORE it happens.

## Model Architecture

- Q-Network with 3 fully connected layers (2 hidden layers, 128 units each)
- State dimension: 64
- Action dimension: 10 (different doubt prediction actions)
- Trained using GRPO (Group Relative Policy Optimization)

## Training

Based on OpenClaw-RL principles:
- Binary RL for next-state feedback
- Experience replay with 10,000 capacity
- Epsilon-greedy exploration
- Personalization from user interactions

## Usage

```python
import pickle

with open("{checkpoint_file}", "rb") as f:
    checkpoint = pickle.load(f)

# Load checkpoint.q_network_weights into your Q-network
# Model config: checkpoint.config
# Policy version: checkpoint.policy_version
```

## Citation

```bibtex
@software{{contextflow_rl,
  title={{ContextFlow RL Doubt Predictor}},
  author={{ContextFlow Team}},
  year={{2026}},
  url={{https://github.com/contextflow/research-app}}
}}
```

## License

Apache 2.0
"""


def upload_to_huggingface(
    checkpoint_path: str,
    repo_name: str,
    hf_token: str,
    model_name: str = "contextflow-rl-doubt-predictor"
):
    """Upload a checkpoint and a generated README to the Hugging Face Hub.

    Only the checkpoint file and the README are uploaded. They are staged in
    a temporary directory, so unrelated files next to the checkpoint are
    never published and no README.md is written into the checkpoint's folder.

    Args:
        checkpoint_path: Path of the pickled checkpoint to publish.
        repo_name: Target repository id, e.g. ``"user/repo"``. It is created
            as a public model repo if it does not exist.
        hf_token: Hugging Face access token with write permission.
        model_name: Display name; only printed.

    Returns:
        The repository URL on success. ``None`` if ``huggingface_hub`` is
        missing, the checkpoint does not exist, or repo creation or the
        upload fails (the error is printed).
    """
    # Local imports: these modules are needed only by this function.
    import shutil
    import tempfile

    if not HAS_HF:
        print("huggingface_hub not installed. Cannot upload.")
        return None

    model_path = Path(checkpoint_path)
    if not model_path.is_file():
        print(f"Checkpoint not found: {model_path}")
        return None

    print(f"\nUploading to Hugging Face...")
    print(f"Repository: {repo_name}")
    print(f"Model name: {model_name}")

    api = HfApi()

    # Best-effort boundary: report any hub/network failure instead of raising.
    try:
        create_repo(
            repo_id=repo_name,
            token=hf_token,
            private=False,
            exist_ok=True
        )
        print(f"Repository created/accessed: {repo_name}")
    except Exception as e:
        print(f"Error creating repo: {e}")
        return None

    try:
        with tempfile.TemporaryDirectory() as staging:
            staging_dir = Path(staging)
            shutil.copy2(model_path, staging_dir / model_path.name)
            readme_path = staging_dir / "README.md"
            readme_path.write_text(
                _build_model_card(model_path.name), encoding="utf-8"
            )
            api.upload_folder(
                folder_path=str(staging_dir),
                repo_id=repo_name,
                repo_type="model",
                token=hf_token
            )
    except Exception as e:
        print(f"Error uploading: {e}")
        return None

    print(f"\n✅ Successfully uploaded to: https://huggingface.co/{repo_name}")
    return f"https://huggingface.co/{repo_name}"
|
|
|
|
def main():
    """Command-line entry point: train, upload, or do both (``full``).

    ``train`` generates synthetic data, trains the agent and saves a
    checkpoint. ``upload`` validates an existing checkpoint by loading it
    and publishes it. ``full`` trains and then uploads. Uploading is skipped
    with a hint when no ``--hf_token`` is given.
    """
    parser = argparse.ArgumentParser(description="ContextFlow RL Training")
    parser.add_argument("--mode", choices=["train", "upload", "full"], default="full")
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--samples", type=int, default=1000)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--checkpoint_path", default="checkpoint.pkl")
    parser.add_argument("--repo_name", default="your-username/contextflow-rl")
    parser.add_argument("--hf_token", default=None)

    args = parser.parse_args()
    checkpoint_path = args.checkpoint_path

    print("=" * 60)
    print("ContextFlow RL Training")
    print("=" * 60)

    # Single source of truth for the agent hyper-parameters; the values not
    # relevant to loading equal the constructor defaults.
    config = {
        "state_dim": 64,
        "action_dim": 10,
        "learning_rate": 0.001,
        "gamma": 0.95,
        "epsilon": 1.0,
        "epsilon_decay": 0.995,
        "epsilon_min": 0.01,
        "hidden_dim": 128
    }

    if args.mode in ("train", "full"):
        print("\nInitializing RL Agent...")
        agent = DoubtPredictionRL(**config)

        print("\nGenerating training data...")
        generate_training_data(agent, args.samples)

        print("\nTraining model...")
        train_model(
            agent,
            epochs=args.epochs,
            batch_size=args.batch_size
        )

        print("\nSaving checkpoint...")
        agent.save_checkpoint(checkpoint_path, config)

        print("\nTraining complete!")
        print(f"Checkpoint: {checkpoint_path}")
        print(f"Policy version: {agent.policy_version}")
        print(f"Training samples: {len(agent.replay_buffer)}")

    if args.mode in ("upload", "full"):
        if not args.hf_token:
            print("\n⚠️ HF_TOKEN not provided. Run with --hf_token YOUR_TOKEN to upload.")
            print("You can also download the checkpoint from:", args.checkpoint_path)
            return

        if args.mode == "upload":
            # Fail with a message instead of a traceback when there is
            # nothing to upload yet.
            if not Path(checkpoint_path).is_file():
                print(f"\nCheckpoint not found: {checkpoint_path}")
                print("Run with --mode train first to create it.")
                return

            # Loading validates that the file is a usable checkpoint before
            # it is published.
            print("\nLoading checkpoint from:", checkpoint_path)
            agent = DoubtPredictionRL(**config)
            agent.load_checkpoint(checkpoint_path)

        repo_url = upload_to_huggingface(
            checkpoint_path=checkpoint_path,
            repo_name=args.repo_name,
            hf_token=args.hf_token
        )

        if repo_url:
            print(f"\n🎉 Model uploaded successfully!")
            print(f"View at: {repo_url}")


if __name__ == "__main__":
    main()
|
|