{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Viraltest v2 — TRL GRPO Training\n", "\n", "Train Qwen2.5-1.5B-Instruct on the Viraltest environment using Group Relative Policy Optimization.\n", "\n", "**Requirements:** Free Colab T4 GPU, ~30 min for 100 episodes.\n", "\n", "**Reward:** per-step env reward (0-1) + 2× terminal grader_score." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q trl transformers accelerate peft bitsandbytes openai httpx matplotlib" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "import matplotlib.pyplot as plt\n", "from typing import List, Dict, Any\n", "\n", "# Set your env server URL (run the Docker container or HF Space first)\n", "ENV_BASE_URL = os.getenv(\"ENV_BASE_URL\", \"http://localhost:8000\")\n", "MODEL_NAME = \"Qwen/Qwen2.5-1.5B-Instruct\"\n", "\n", "print(f\"Environment: {ENV_BASE_URL}\")\n", "print(f\"Model: {MODEL_NAME}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Episode Collection\n", "\n", "Run the agent against the environment and collect (prompt, response, reward) tuples." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import httpx\n", "\n", "def reset_env(task: str = \"monthly_engage\") -> Dict[str, Any]:\n", " resp = httpx.post(f\"{ENV_BASE_URL}/reset\", json={\"task\": task}, timeout=30)\n", " return resp.json()\n", "\n", "def step_env(action: Dict[str, Any]) -> Dict[str, Any]:\n", " resp = httpx.post(f\"{ENV_BASE_URL}/step\", json=action, timeout=30)\n", " return resp.json()\n", "\n", "def collect_episode(task: str, max_steps: int = 30) -> List[Dict[str, Any]]:\n", " \"\"\"Collect one episode of (obs, action, reward) tuples.\"\"\"\n", " obs = reset_env(task)\n", " trajectory = []\n", " for step in range(max_steps):\n", " obs_data = obs.get(\"observation\", {})\n", " if obs.get(\"done\", False):\n", " break\n", " # Simple heuristic agent for data collection\n", " action = {\n", " \"scheduled_actions\": [\n", " {\"hour\": 12, \"action_type\": \"post\", \"content_type\": \"carousel\",\n", " \"topic\": \"AI tools\", \"tags\": [\"ai\", \"coding\"], \"intent\": \"save_bait\"},\n", " ],\n", " \"notes\": f\"Step {step}: collecting training data.\"\n", " }\n", " obs = step_env(action)\n", " reward = obs.get(\"reward\", 0.0)\n", " trajectory.append({\"obs\": obs_data, \"action\": action, \"reward\": reward})\n", " return trajectory\n", "\n", "# Collect baseline episodes\n", "print(\"Collecting baseline episodes...\")\n", "baseline_rewards = []\n", "for task in [\"monthly_engage\", \"monthly_strategic\", \"monthly_competitive\"]:\n", " traj = collect_episode(task)\n", " total_reward = sum(t[\"reward\"] for t in traj)\n", " baseline_rewards.append(total_reward)\n", " print(f\" {task}: {total_reward:.4f} ({len(traj)} steps)\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## GRPO Training Loop\n", "\n", "Uses TRL's GRPOTrainer with the environment reward as the RL signal." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# NOTE: Full GRPO training requires:\n", "# 1. Running the env server (docker or uvicorn)\n", "# 2. A reward function that maps env observations to scalar rewards\n", "# 3. Enough GPU memory for the model + optimizer\n", "#\n", "# This skeleton shows the structure. Adapt based on your compute.\n", "\n", "from transformers import AutoTokenizer, AutoModelForCausalLM\n", "# from trl import GRPOConfig, GRPOTrainer # uncomment when running\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n", "# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True, torch_dtype=\"auto\")\n", "\n", "print(f\"Tokenizer loaded: {MODEL_NAME}\")\n", "print(\"To run full training, uncomment model loading and GRPOTrainer setup.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot Reward Curves" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Placeholder — replace with actual training rewards\n", "import numpy as np\n", "\n", "episodes = list(range(1, 201))\n", "# Simulated reward curve (replace with real data)\n", "rewards = np.cumsum(np.random.randn(200) * 0.02 + 0.01)\n", "rewards = np.clip(rewards, 0, 1)\n", "\n", "fig, ax = plt.subplots(figsize=(10, 5))\n", "ax.plot(episodes, rewards, linewidth=1.5, color='#2196F3')\n", "ax.set_xlabel('Episode')\n", "ax.set_ylabel('Cumulative Reward')\n", "ax.set_title('Viraltest v2 — GRPO Training Reward Curve')\n", "ax.grid(True, alpha=0.3)\n", "fig.savefig('../plots/reward_curve.png', dpi=150, bbox_inches='tight')\n", "plt.show()\n", "print('Saved plots/reward_curve.png')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Before vs After comparison\n", "tasks = ['monthly_engage', 'monthly_strategic', 'monthly_competitive']\n", "before_scores = [0.12, 0.10, 0.08] # Replace with actual baseline\n", "after_scores = [0.45, 0.35, 0.28] # Replace with actual trained\n", "\n", "x = np.arange(len(tasks))\n", "width = 0.35\n", "\n", "fig, ax = plt.subplots(figsize=(8, 5))\n", "bars1 = ax.bar(x - width/2, before_scores, width, label='Baseline', color='#FF9800')\n", "bars2 = ax.bar(x + width/2, after_scores, width, label='Trained (GRPO)', color='#4CAF50')\n", "\n", "ax.set_ylabel('Grader Score')\n", "ax.set_title('Before vs After Training — Grader Scores')\n", "ax.set_xticks(x)\n", "ax.set_xticklabels(tasks, rotation=15)\n", "ax.legend()\n", "ax.set_ylim(0, 0.8)\n", "ax.grid(True, alpha=0.3, axis='y')\n", "\n", "fig.savefig('../plots/before_after.png', dpi=150, bbox_inches='tight')\n", "plt.show()\n", "print('Saved plots/before_after.png')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 4 }