Spaces:
Sleeping
Sleeping
Lomesh2000 commited on
Commit Β·
57eab70
1
Parent(s): 385203e
multi agent environment learning
Browse files- Colab_Training.ipynb +113 -0
- README.md +144 -17
- pyproject.toml +34 -0
- requirements.txt +1 -0
- salespath_env/__pycache__/__init__.cpython-312.pyc +0 -0
- salespath_env/__pycache__/client.cpython-312.pyc +0 -0
- salespath_env/__pycache__/models.cpython-312.pyc +0 -0
- salespath_env/client.py +138 -0
- salespath_env/server/__pycache__/__init__.cpython-312.pyc +0 -0
- salespath_env/server/__pycache__/app.cpython-312.pyc +0 -0
- salespath_env/server/__pycache__/prospect_simulator.cpython-312.pyc +0 -0
- salespath_env/server/__pycache__/reward.cpython-312.pyc +0 -0
- salespath_env/server/__pycache__/rules.cpython-312.pyc +0 -0
- salespath_env/server/__pycache__/salespath_environment.cpython-312.pyc +0 -0
- salespath_env/server/__pycache__/task_bank.cpython-312.pyc +0 -0
- salespath_env/server/app.py +116 -18
- salespath_env/server/prospect_simulator.py +175 -161
- salespath_env/server/rules.py +44 -12
- salespath_env/server/salespath_environment.py +14 -0
- training/__pycache__/plot_rewards.cpython-312.pyc +0 -0
- training/__pycache__/train_grpo.cpython-312.pyc +0 -0
- training/__pycache__/train_sft.cpython-312.pyc +0 -0
- training/__pycache__/train_test.cpython-312.pyc +0 -0
- training/plot_rewards.py +103 -0
- training/sft_demos.jsonl +14 -0
- training/train_grpo.py +388 -0
- training/train_sft.py +172 -0
- training/train_test.py +212 -0
Colab_Training.ipynb
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "markdown",
|
| 19 |
+
"source": [
|
| 20 |
+
"# SalesPath: OpenEnv RL Training via GRPO\n",
|
| 21 |
+
"\n",
|
| 22 |
+
"This notebook contains the complete training pipeline for the SalesPath environment. It performs:\n",
|
| 23 |
+
"1. **SFT Warm-start**: Fine-tunes a base model on expert sales demonstrations.\n",
|
| 24 |
+
"2. **GRPO RL**: Uses live rollouts against your hosted environment to optimize the agent.\n",
|
| 25 |
+
"\n",
|
| 26 |
+
"> **CRITICAL:** Before running this, ensure you are using a **T4 GPU** (`Runtime` -> `Change runtime type` -> `Hardware accelerator` -> `T4 GPU`).\n",
|
| 27 |
+
"> \n",
|
| 28 |
+
"> You must also have pushed your environment code to a **Hugging Face Space** so this notebook can interact with it."
|
| 29 |
+
],
|
| 30 |
+
"metadata": {}
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"cell_type": "code",
|
| 34 |
+
"execution_count": null,
|
| 35 |
+
"metadata": {},
|
| 36 |
+
"outputs": [],
|
| 37 |
+
"source": [
|
| 38 |
+
"# 1. Install required dependencies\n",
|
| 39 |
+
"!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n",
|
| 40 |
+
"!pip install --no-deps trl peft accelerate bitsandbytes datasets matplotlib openenv-core"
|
| 41 |
+
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"cell_type": "code",
|
| 45 |
+
"execution_count": null,
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"outputs": [],
|
| 48 |
+
"source": [
|
| 49 |
+
"# 2. Clone your environment repository from Hugging Face Spaces\n",
|
| 50 |
+
"# β οΈ REPLACE WITH YOUR ACTUAL HF SPACE URL\n",
|
| 51 |
+
"HF_SPACE_URL = \"https://huggingface.co/spaces/YOUR_USERNAME/salespath-env\"\n",
|
| 52 |
+
"\n",
|
| 53 |
+
"import os\n",
|
| 54 |
+
"repo_name = HF_SPACE_URL.split(\"/\")[-1]\n",
|
| 55 |
+
"\n",
|
| 56 |
+
"!git clone {HF_SPACE_URL}\n",
|
| 57 |
+
"os.chdir(repo_name)\n",
|
| 58 |
+
"print(f\"\\nWorking directory changed to: {os.getcwd()}\")"
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"cell_type": "code",
|
| 63 |
+
"execution_count": null,
|
| 64 |
+
"metadata": {},
|
| 65 |
+
"outputs": [],
|
| 66 |
+
"source": [
|
| 67 |
+
"# 3. Run SFT Warm-start (~10-15 minutes)\n",
|
| 68 |
+
"# This trains the model to understand the basic output format and sales flow.\n",
|
| 69 |
+
"!python training/train_sft.py"
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"cell_type": "code",
|
| 74 |
+
"execution_count": null,
|
| 75 |
+
"metadata": {},
|
| 76 |
+
"outputs": [],
|
| 77 |
+
"source": [
|
| 78 |
+
"# 4. Run GRPO Reinforcement Learning (~45-60 minutes)\n",
|
| 79 |
+
"import os\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"# Derive the direct API URL for the Hugging Face space\n",
|
| 82 |
+
"username = HF_SPACE_URL.split(\"/\")[-2]\n",
|
| 83 |
+
"space_name = HF_SPACE_URL.split(\"/\")[-1]\n",
|
| 84 |
+
"direct_url = f\"https://{username}-{space_name}.hf.space\"\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"os.environ[\"SALESPATH_ENV_URL\"] = direct_url\n",
|
| 87 |
+
"os.environ[\"SFT_CHECKPOINT\"] = \"./sft_checkpoint\"\n",
|
| 88 |
+
"\n",
|
| 89 |
+
"print(f\"Targeting Environment API: {direct_url}\")\n",
|
| 90 |
+
"\n",
|
| 91 |
+
"# Run the GRPO training script\n",
|
| 92 |
+
"!python training/train_grpo.py"
|
| 93 |
+
]
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"cell_type": "code",
|
| 97 |
+
"execution_count": null,
|
| 98 |
+
"metadata": {},
|
| 99 |
+
"outputs": [],
|
| 100 |
+
"source": [
|
| 101 |
+
"# 5. Plot the Training Rewards\n",
|
| 102 |
+
"!python training/plot_rewards.py --log ./reward_log.jsonl --out ./plots\n",
|
| 103 |
+
"\n",
|
| 104 |
+
"from IPython.display import Image, display\n",
|
| 105 |
+
"print(\"\\n=== Reward Curve ===\")\n",
|
| 106 |
+
"display(Image(\"./plots/reward_curve.png\"))\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"print(\"\\n=== Reward by Difficulty ===\")\n",
|
| 109 |
+
"display(Image(\"./plots/reward_by_difficulty.png\"))"
|
| 110 |
+
]
|
| 111 |
+
}
|
| 112 |
+
]
|
| 113 |
+
}
|
README.md
CHANGED
|
@@ -7,22 +7,21 @@ sdk: docker
|
|
| 7 |
app_port: 7860
|
| 8 |
pinned: false
|
| 9 |
license: mit
|
| 10 |
-
short_description: RL gym environment for sales
|
| 11 |
---
|
| 12 |
|
| 13 |
-
# SalesPath Environment
|
| 14 |
|
| 15 |
-
A [OpenEnv](https://github.com/openenv)-compatible
|
|
|
|
| 16 |
|
| 17 |
-
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|--------|----------|-------------|
|
| 21 |
-
| `POST` | `/reset` | Reset the environment, returns initial observation |
|
| 22 |
-
| `POST` | `/step` | Take an action, returns next observation + reward |
|
| 23 |
-
| `GET` | `/health` | Health check |
|
| 24 |
|
| 25 |
-
## Quick Start
|
| 26 |
|
| 27 |
### Reset
|
| 28 |
```bash
|
|
@@ -35,13 +34,141 @@ curl -X POST https://imsachin010-salespath-env.hf.space/reset \
|
|
| 35 |
```bash
|
| 36 |
curl -X POST https://imsachin010-salespath-env.hf.space/step \
|
| 37 |
-H "Content-Type: application/json" \
|
| 38 |
-
-d '{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
```
|
| 40 |
|
| 41 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
-
|
| 44 |
-
-
|
| 45 |
-
-
|
| 46 |
-
- `HANDLE_OBJECTION` β Handle prospect objections
|
| 47 |
-
- `CLOSE` β Attempt to close the deal
|
|
|
|
| 7 |
app_port: 7860
|
| 8 |
pinned: false
|
| 9 |
license: mit
|
| 10 |
+
short_description: RL gym environment for training B2B sales agents via GRPO
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# SalesPath β RL Environment for B2B Sales Agents
|
| 14 |
|
| 15 |
+
A [OpenEnv](https://github.com/openenv)-compatible reinforcement learning gym
|
| 16 |
+
that trains an LLM to navigate the full B2B sales process through GRPO.
|
| 17 |
|
| 18 |
+
The agent must learn to qualify leads, handle objections, offer demos,
|
| 19 |
+
negotiate, and close β all while respecting business rules enforced by a
|
| 20 |
+
deterministic rule-based ProspectSimulator (no LLM on the environment side).
|
| 21 |
|
| 22 |
+
---
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
## Quick Start (hosted on HF Spaces)
|
| 25 |
|
| 26 |
### Reset
|
| 27 |
```bash
|
|
|
|
| 34 |
```bash
|
| 35 |
curl -X POST https://imsachin010-salespath-env.hf.space/step \
|
| 36 |
-H "Content-Type: application/json" \
|
| 37 |
+
-d '{
|
| 38 |
+
"action": {
|
| 39 |
+
"action_type": "PROSPECT",
|
| 40 |
+
"content": "Hello! I understand you have inventory tracking challenges. Tell me more."
|
| 41 |
+
}
|
| 42 |
+
}'
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
### Health check
|
| 46 |
+
```bash
|
| 47 |
+
curl https://imsachin010-salespath-env.hf.space/health
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
## Action Space
|
| 53 |
+
|
| 54 |
+
| Action | When to use |
|
| 55 |
+
|---|---|
|
| 56 |
+
| `PROSPECT` | Opening turn only β initial outreach |
|
| 57 |
+
| `QUALIFY` | Uncover budget, decision maker, pain points |
|
| 58 |
+
| `PRESENT` | Pitch the solution (requires QUALIFY first) |
|
| 59 |
+
| `HANDLE_OBJECTION` | Respond to pricing / timing objections |
|
| 60 |
+
| `OFFER_DEMO` | Schedule a live product demo |
|
| 61 |
+
| `NEGOTIATE` | Discuss pricing/terms (requires OFFER_DEMO + known budget) |
|
| 62 |
+
| `CLOSE` | Attempt to sign the deal |
|
| 63 |
+
| `FOLLOW_UP` | Re-engage after prospect silence |
|
| 64 |
+
| `DISQUALIFY` | End the conversation (correct only if budget < threshold AND no decision maker) |
|
| 65 |
+
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
## Business Rules Enforced
|
| 69 |
+
|
| 70 |
+
| Rule | Description |
|
| 71 |
+
|---|---|
|
| 72 |
+
| R01 | Must QUALIFY before PRESENT |
|
| 73 |
+
| R02 | Must OFFER_DEMO before NEGOTIATE |
|
| 74 |
+
| R03 | Cannot NEGOTIATE while budget is unknown |
|
| 75 |
+
| R04 | Discount in NEGOTIATE only after 2 objections handled |
|
| 76 |
+
| R05 | Cannot repeat the same action on consecutive turns |
|
| 77 |
+
| R06 | First action must be PROSPECT |
|
| 78 |
+
| R07 | FOLLOW_UP only valid after prospect silence (no response for 1+ turns) |
|
| 79 |
+
| R08 | DISQUALIFY valid only when budget < threshold AND no decision maker |
|
| 80 |
+
| R09 | Must OFFER_DEMO before CLOSE (difficulty 2+) |
|
| 81 |
+
|
| 82 |
+
---
|
| 83 |
+
|
| 84 |
+
## Reward Function
|
| 85 |
+
|
| 86 |
+
Composite weighted reward computed every step:
|
| 87 |
+
|
| 88 |
+
| Component | Weight | Description |
|
| 89 |
+
|---|---|---|
|
| 90 |
+
| `r_outcome` | 0.40 | +1.0 on successful close, +0.5 on valid DISQUALIFY, -0.5 on bad close |
|
| 91 |
+
| `r_compliance` | 0.30 | -0.2 per rule violation this turn |
|
| 92 |
+
| `r_ordering` | 0.15 | Fraction of workflow steps completed in correct order |
|
| 93 |
+
| `r_efficiency` | 0.10 | Penalty for turns beyond the optimal episode length |
|
| 94 |
+
| `r_format` | 0.05 | +1.0 for valid action type, -0.1 for invalid |
|
| 95 |
+
|
| 96 |
+
---
|
| 97 |
+
|
| 98 |
+
## Difficulty Levels
|
| 99 |
+
|
| 100 |
+
| Level | Description | Correct terminal action |
|
| 101 |
+
|---|---|---|
|
| 102 |
+
| 1 | Budget known, decision maker present, easy close | CLOSE |
|
| 103 |
+
| 2 | Budget hidden, 1 objection, demo required | CLOSE |
|
| 104 |
+
| 3 | Budget hidden, 2 objections, stalling prospect | CLOSE |
|
| 105 |
+
| 4 | Misleading signals, low budget, no decision maker | DISQUALIFY |
|
| 106 |
+
|
| 107 |
+
---
|
| 108 |
+
|
| 109 |
+
## Training Pipeline
|
| 110 |
+
|
| 111 |
+
```
|
| 112 |
+
sft_demos.jsonl (14 expert demos)
|
| 113 |
+
β
|
| 114 |
+
train_sft.py β SFT warm-start (SFTTrainer, TRL)
|
| 115 |
+
β
|
| 116 |
+
sft_checkpoint/
|
| 117 |
+
β
|
| 118 |
+
train_grpo.py β GRPO RL fine-tuning (GRPOTrainer, TRL + Unsloth 4-bit)
|
| 119 |
+
β
|
| 120 |
+
grpo_checkpoint/ + reward_log.jsonl
|
| 121 |
+
β
|
| 122 |
+
plot_rewards.py β reward curves
|
| 123 |
```
|
| 124 |
|
| 125 |
+
### Commands
|
| 126 |
+
|
| 127 |
+
```bash
|
| 128 |
+
# 1. Smoke test (no GPU, ~30 seconds)
|
| 129 |
+
python training/train_test.py
|
| 130 |
+
|
| 131 |
+
# 2. SFT warm-start (~15 min on T4)
|
| 132 |
+
python training/train_sft.py
|
| 133 |
+
|
| 134 |
+
# 3. Full GRPO training (~60 min on T4)
|
| 135 |
+
uvicorn salespath_env.server.app:app --port 7860 &
|
| 136 |
+
python training/train_grpo.py
|
| 137 |
+
|
| 138 |
+
# 4. Plot reward curves
|
| 139 |
+
python training/plot_rewards.py
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
---
|
| 143 |
+
|
| 144 |
+
## File Structure
|
| 145 |
+
|
| 146 |
+
```
|
| 147 |
+
salespath-env/
|
| 148 |
+
βββ salespath_env/
|
| 149 |
+
β βββ client.py β HTTP client for training scripts
|
| 150 |
+
β βββ models.py β SalesPathAction / Observation / State
|
| 151 |
+
β βββ server/
|
| 152 |
+
β βββ app.py β FastAPI app (OpenEnv)
|
| 153 |
+
β βββ salespath_environment.py
|
| 154 |
+
β βββ prospect_simulator.py β Rule-based, no LLM
|
| 155 |
+
β βββ rules.py β 9 business rules (R01βR09)
|
| 156 |
+
β βββ reward.py β 5-component reward function
|
| 157 |
+
β βββ task_bank.py β Prospect profiles (4 difficulty levels)
|
| 158 |
+
βββ training/
|
| 159 |
+
β βββ sft_demos.jsonl β Expert demonstration data
|
| 160 |
+
β βββ train_test.py β Smoke test (no GPU)
|
| 161 |
+
β βββ train_sft.py β SFT warm-start
|
| 162 |
+
β βββ train_grpo.py β GRPO RL training
|
| 163 |
+
β βββ plot_rewards.py β Reward curve visualisation
|
| 164 |
+
βββ Dockerfile
|
| 165 |
+
βββ requirements.txt
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
---
|
| 169 |
+
|
| 170 |
+
## Links
|
| 171 |
|
| 172 |
+
- π Blog post: _[add HuggingFace blog link here]_
|
| 173 |
+
- π₯ Demo video: _[add YouTube link here]_
|
| 174 |
+
- π€ HF Space: https://huggingface.co/spaces/imsachin010/salespath-env
|
|
|
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "salespath-env"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "OpenEnv RL environment for training B2B sales agents via GRPO"
|
| 5 |
+
requires-python = ">=3.10"
|
| 6 |
+
license = { text = "MIT" }
|
| 7 |
+
|
| 8 |
+
dependencies = [
|
| 9 |
+
"openenv-core>=0.2.3",
|
| 10 |
+
"fastapi>=0.110.0",
|
| 11 |
+
"uvicorn[standard]>=0.29.0",
|
| 12 |
+
"pydantic>=2.0",
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
[project.optional-dependencies]
|
| 16 |
+
training = [
|
| 17 |
+
"trl>=0.8.6",
|
| 18 |
+
"transformers>=4.40.0",
|
| 19 |
+
"datasets>=2.18.0",
|
| 20 |
+
"peft>=0.10.0",
|
| 21 |
+
"bitsandbytes>=0.43.0",
|
| 22 |
+
"accelerate>=0.28.0",
|
| 23 |
+
"torch>=2.2.0",
|
| 24 |
+
"matplotlib>=3.8.0",
|
| 25 |
+
"unsloth",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
[build-system]
|
| 29 |
+
requires = ["setuptools>=68", "wheel"]
|
| 30 |
+
build-backend = "setuptools.backends.legacy:build"
|
| 31 |
+
|
| 32 |
+
[tool.setuptools.packages.find]
|
| 33 |
+
where = ["."]
|
| 34 |
+
include = ["salespath_env*"]
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
fastapi>=0.110.0
|
| 2 |
uvicorn[standard]>=0.29.0
|
| 3 |
pydantic>=2.0
|
|
|
|
| 1 |
+
# Environment server (used by Dockerfile)
|
| 2 |
fastapi>=0.110.0
|
| 3 |
uvicorn[standard]>=0.29.0
|
| 4 |
pydantic>=2.0
|
salespath_env/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (178 Bytes). View file
|
|
|
salespath_env/__pycache__/client.cpython-312.pyc
ADDED
|
Binary file (5.72 kB). View file
|
|
|
salespath_env/__pycache__/models.cpython-312.pyc
ADDED
|
Binary file (3.15 kB). View file
|
|
|
salespath_env/client.py
CHANGED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# salespath_env/client.py
|
| 2 |
+
"""
|
| 3 |
+
HTTP client for the SalesPath environment.
|
| 4 |
+
Used by training scripts to talk to the hosted FastAPI server.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import requests
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SalesPathClient:
|
| 13 |
+
"""
|
| 14 |
+
Thin wrapper around the /reset and /step HTTP endpoints.
|
| 15 |
+
|
| 16 |
+
Example
|
| 17 |
+
-------
|
| 18 |
+
>>> client = SalesPathClient("http://localhost:7860")
|
| 19 |
+
>>> obs = client.reset(difficulty=1)
|
| 20 |
+
>>> obs = client.step("PROSPECT", "Hi, tell me about your pain points.")
|
| 21 |
+
>>> print(obs["reward"])
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self, base_url: str = "http://localhost:7860"):
|
| 25 |
+
self.base_url = base_url.rstrip("/")
|
| 26 |
+
self._session = requests.Session()
|
| 27 |
+
|
| 28 |
+
# ------------------------------------------------------------------
|
| 29 |
+
# Core API
|
| 30 |
+
# ------------------------------------------------------------------
|
| 31 |
+
|
| 32 |
+
def reset(self, difficulty: int = 1) -> dict:
|
| 33 |
+
"""
|
| 34 |
+
Reset the environment for a new episode.
|
| 35 |
+
|
| 36 |
+
OpenEnv /reset returns the raw observation dict.
|
| 37 |
+
Returns a flat dict with all observation fields.
|
| 38 |
+
"""
|
| 39 |
+
resp = self._session.post(
|
| 40 |
+
f"{self.base_url}/reset",
|
| 41 |
+
json={"difficulty": difficulty},
|
| 42 |
+
timeout=30,
|
| 43 |
+
)
|
| 44 |
+
resp.raise_for_status()
|
| 45 |
+
data = resp.json()
|
| 46 |
+
# /reset may return raw observation or wrapped {observation:{...}}
|
| 47 |
+
if "observation" in data:
|
| 48 |
+
flat = dict(data["observation"])
|
| 49 |
+
flat.setdefault("reward", data.get("reward", 0.0))
|
| 50 |
+
flat.setdefault("done", data.get("done", False))
|
| 51 |
+
return flat
|
| 52 |
+
return data
|
| 53 |
+
|
| 54 |
+
def step(
|
| 55 |
+
self,
|
| 56 |
+
action_type: str,
|
| 57 |
+
content: str = "",
|
| 58 |
+
target: str = "",
|
| 59 |
+
) -> dict:
|
| 60 |
+
"""
|
| 61 |
+
Take one action in the environment.
|
| 62 |
+
|
| 63 |
+
OpenEnv /step returns {observation:{...}, reward:float, done:bool}.
|
| 64 |
+
This method flattens it so callers get a single dict with all
|
| 65 |
+
observation fields plus reward and done at the top level.
|
| 66 |
+
|
| 67 |
+
Returns
|
| 68 |
+
-------
|
| 69 |
+
dict with keys:
|
| 70 |
+
prospect_response, workflow_stage, constraints_violated,
|
| 71 |
+
steps_completed, turn_number, reward, reward_components,
|
| 72 |
+
done, info
|
| 73 |
+
"""
|
| 74 |
+
resp = self._session.post(
|
| 75 |
+
f"{self.base_url}/step",
|
| 76 |
+
json={
|
| 77 |
+
"action": {
|
| 78 |
+
"action_type": action_type,
|
| 79 |
+
"content": content,
|
| 80 |
+
"target": target,
|
| 81 |
+
}
|
| 82 |
+
},
|
| 83 |
+
timeout=30,
|
| 84 |
+
)
|
| 85 |
+
resp.raise_for_status()
|
| 86 |
+
data = resp.json()
|
| 87 |
+
# Flatten: {observation:{...}, reward, done} β one flat dict
|
| 88 |
+
if "observation" in data:
|
| 89 |
+
flat = dict(data["observation"])
|
| 90 |
+
flat["reward"] = data.get("reward", flat.get("reward", 0.0))
|
| 91 |
+
flat["done"] = data.get("done", flat.get("done", False))
|
| 92 |
+
return flat
|
| 93 |
+
return data
|
| 94 |
+
|
| 95 |
+
def health(self) -> dict:
|
| 96 |
+
resp = self._session.get(f"{self.base_url}/health", timeout=10)
|
| 97 |
+
resp.raise_for_status()
|
| 98 |
+
return resp.json()
|
| 99 |
+
|
| 100 |
+
# ------------------------------------------------------------------
|
| 101 |
+
# Convenience: run a full hard-coded demo episode
|
| 102 |
+
# ------------------------------------------------------------------
|
| 103 |
+
|
| 104 |
+
def run_demo_episode(self, difficulty: int = 1, verbose: bool = True) -> float:
|
| 105 |
+
"""
|
| 106 |
+
Run one scripted episode and return total cumulative reward.
|
| 107 |
+
Useful for smoke-testing the server end-to-end.
|
| 108 |
+
"""
|
| 109 |
+
obs = self.reset(difficulty)
|
| 110 |
+
if verbose:
|
| 111 |
+
print(f"\n=== Episode start (difficulty={difficulty}) ===")
|
| 112 |
+
print(f"Prospect: {obs.get('prospect_response', '')}\n")
|
| 113 |
+
|
| 114 |
+
# Scripted optimal sequence for difficulty 1
|
| 115 |
+
script = [
|
| 116 |
+
("PROSPECT", "Hello! I'd love to learn about your current challenges."),
|
| 117 |
+
("QUALIFY", "Can you tell me about your budget and decision process?"),
|
| 118 |
+
("PRESENT", "Here's how our platform solves your inventory problem."),
|
| 119 |
+
("CLOSE", "Based on everything, shall we move forward?"),
|
| 120 |
+
]
|
| 121 |
+
|
| 122 |
+
total_reward = 0.0
|
| 123 |
+
for action_type, content in script:
|
| 124 |
+
obs = self.step(action_type, content)
|
| 125 |
+
total_reward += obs.get("reward", 0.0)
|
| 126 |
+
if verbose:
|
| 127 |
+
print(f"[Turn {obs['turn_number']}] Agent: {action_type}")
|
| 128 |
+
print(f" Prospect: {obs['prospect_response']}")
|
| 129 |
+
print(f" Reward: {obs['reward']:.3f} | Done: {obs['done']}")
|
| 130 |
+
if obs.get("constraints_violated"):
|
| 131 |
+
print(f" β Violations: {obs['constraints_violated']}")
|
| 132 |
+
print()
|
| 133 |
+
if obs["done"]:
|
| 134 |
+
break
|
| 135 |
+
|
| 136 |
+
if verbose:
|
| 137 |
+
print(f"=== Episode done. Cumulative reward: {total_reward:.3f} ===\n")
|
| 138 |
+
return total_reward
|
salespath_env/server/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (173 Bytes). View file
|
|
|
salespath_env/server/__pycache__/app.cpython-312.pyc
ADDED
|
Binary file (4.34 kB). View file
|
|
|
salespath_env/server/__pycache__/prospect_simulator.cpython-312.pyc
ADDED
|
Binary file (5.03 kB). View file
|
|
|
salespath_env/server/__pycache__/reward.cpython-312.pyc
ADDED
|
Binary file (3.01 kB). View file
|
|
|
salespath_env/server/__pycache__/rules.cpython-312.pyc
ADDED
|
Binary file (7.36 kB). View file
|
|
|
salespath_env/server/__pycache__/salespath_environment.cpython-312.pyc
ADDED
|
Binary file (6.62 kB). View file
|
|
|
salespath_env/server/__pycache__/task_bank.cpython-312.pyc
ADDED
|
Binary file (2.66 kB). View file
|
|
|
salespath_env/server/app.py
CHANGED
|
@@ -1,18 +1,116 @@
|
|
| 1 |
-
# salespath_env/server/app.py
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# salespath_env/server/app.py
|
| 2 |
+
"""
|
| 3 |
+
Custom stateful FastAPI server for SalesPath.
|
| 4 |
+
|
| 5 |
+
Why not create_fastapi_app?
|
| 6 |
+
OpenEnv's built-in HTTP /reset and /step endpoints are STATELESS β
|
| 7 |
+
they create a new Environment instance per request and destroy it.
|
| 8 |
+
State is preserved only over WebSocket sessions.
|
| 9 |
+
|
| 10 |
+
For our training loop (HTTP polling), we need a persistent environment
|
| 11 |
+
that survives across /reset + multiple /step calls. This file provides
|
| 12 |
+
that by keeping a single global SalesPathEnvironment instance.
|
| 13 |
+
|
| 14 |
+
The response envelope matches OpenEnv exactly:
|
| 15 |
+
{ "observation": {...}, "reward": float, "done": bool }
|
| 16 |
+
so all existing clients work without changes.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from typing import Any, Dict, Optional
|
| 20 |
+
|
| 21 |
+
from fastapi import FastAPI
|
| 22 |
+
from pydantic import BaseModel
|
| 23 |
+
|
| 24 |
+
from ..models import SalesPathAction
|
| 25 |
+
from .salespath_environment import SalesPathEnvironment
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ---------------------------------------------------------------------------
|
| 29 |
+
# Single persistent environment instance
|
| 30 |
+
# ---------------------------------------------------------------------------
|
| 31 |
+
|
| 32 |
+
_env: SalesPathEnvironment = SalesPathEnvironment()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ---------------------------------------------------------------------------
|
| 36 |
+
# Request models
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
|
| 39 |
+
class ResetRequest(BaseModel):
|
| 40 |
+
difficulty: int = 1
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class ActionPayload(BaseModel):
|
| 44 |
+
action_type: str
|
| 45 |
+
content: str = ""
|
| 46 |
+
target: str = ""
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class StepRequest(BaseModel):
|
| 50 |
+
action: ActionPayload
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ---------------------------------------------------------------------------
|
| 54 |
+
# FastAPI app
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
|
| 57 |
+
app = FastAPI(
|
| 58 |
+
title="SalesPath Environment",
|
| 59 |
+
description="OpenEnv-compatible RL environment for B2B sales agent training.",
|
| 60 |
+
version="0.1.0",
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@app.post("/reset")
|
| 65 |
+
def reset(req: ResetRequest = ResetRequest()):
|
| 66 |
+
"""
|
| 67 |
+
Start a new episode.
|
| 68 |
+
Resets the environment and returns the initial observation.
|
| 69 |
+
"""
|
| 70 |
+
obs = _env.reset(difficulty=req.difficulty)
|
| 71 |
+
return {
|
| 72 |
+
"observation": obs.model_dump(),
|
| 73 |
+
"reward": obs.reward,
|
| 74 |
+
"done": obs.done,
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@app.post("/step")
|
| 79 |
+
def step(req: StepRequest):
|
| 80 |
+
"""
|
| 81 |
+
Take one action in the current episode.
|
| 82 |
+
Returns the next observation, reward, and done flag.
|
| 83 |
+
"""
|
| 84 |
+
action = SalesPathAction(
|
| 85 |
+
action_type=req.action.action_type,
|
| 86 |
+
content=req.action.content,
|
| 87 |
+
target=req.action.target,
|
| 88 |
+
)
|
| 89 |
+
obs = _env.step(action)
|
| 90 |
+
return {
|
| 91 |
+
"observation": obs.model_dump(),
|
| 92 |
+
"reward": obs.reward,
|
| 93 |
+
"done": obs.done,
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
@app.get("/health")
|
| 98 |
+
def health():
|
| 99 |
+
return {"status": "healthy"}
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
@app.get("/state")
|
| 103 |
+
def state():
|
| 104 |
+
"""Expose internal state (for debugging). Hidden state excluded."""
|
| 105 |
+
s = _env.state
|
| 106 |
+
return {
|
| 107 |
+
"episode_id": s.episode_id,
|
| 108 |
+
"turn_number": s.turn_number,
|
| 109 |
+
"workflow_stage": s.workflow_stage,
|
| 110 |
+
"steps_completed": s.steps_completed,
|
| 111 |
+
"constraints_violated": s.constraints_violated,
|
| 112 |
+
"objections_handled": s.objections_handled,
|
| 113 |
+
"difficulty": s.difficulty,
|
| 114 |
+
"done": s.done,
|
| 115 |
+
"prospect_profile": s.prospect_profile,
|
| 116 |
+
}
|
salespath_env/server/prospect_simulator.py
CHANGED
|
@@ -1,162 +1,176 @@
|
|
| 1 |
-
# salespath_env/server/prospect_simulator.py
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
"
|
| 11 |
-
|
| 12 |
-
"objection:
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
"
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
"""
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def
|
| 66 |
-
self,
|
| 67 |
-
action: SalesPathAction,
|
| 68 |
-
state: SalesPathState,
|
| 69 |
-
) -> str:
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
#
|
| 91 |
-
#
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
if
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
return "open:neutral_signal"
|
|
|
|
| 1 |
+
# salespath_env/server/prospect_simulator.py
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
|
| 5 |
+
from ..models import SalesPathAction, SalesPathState
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
RESPONSE_TEXT = {
|
| 9 |
+
"open:positive_signal": "That sounds interesting. Tell me more about how this works.",
|
| 10 |
+
"open:neutral_signal": "I see. We're evaluating a few options at the moment.",
|
| 11 |
+
|
| 12 |
+
"objection:price": "The pricing seems higher than what we budgeted for.",
|
| 13 |
+
"objection:timing": "The timing isn't ideal β we're in the middle of a quarter close.",
|
| 14 |
+
"objection:premature_pitch": (
|
| 15 |
+
"I'm not sure we're ready to discuss solutions yet. "
|
| 16 |
+
"What do you know about our current situation?"
|
| 17 |
+
),
|
| 18 |
+
|
| 19 |
+
"deflect:budget_not_discussed": (
|
| 20 |
+
"We haven't really talked about what we're looking for yet."
|
| 21 |
+
),
|
| 22 |
+
"deflect:stall": (
|
| 23 |
+
"Let me get back to you on this. A lot is happening on our end."
|
| 24 |
+
),
|
| 25 |
+
|
| 26 |
+
"accept:demo_scheduled": (
|
| 27 |
+
"Yes, let's set up a demo. What time works next week?"
|
| 28 |
+
),
|
| 29 |
+
"accept:close_success": (
|
| 30 |
+
"Alright, I think we can move forward with this. "
|
| 31 |
+
"Send over the paperwork."
|
| 32 |
+
),
|
| 33 |
+
|
| 34 |
+
"reject:close_failed": (
|
| 35 |
+
"I don't think we're ready to commit at this point."
|
| 36 |
+
),
|
| 37 |
+
|
| 38 |
+
"silence": "",
|
| 39 |
+
|
| 40 |
+
"exit:disqualified": (
|
| 41 |
+
"I think we're done here. This isn't the right fit."
|
| 42 |
+
),
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
# Prefix injected into QUALIFY response to reveal budget signal
|
| 46 |
+
# without mutating prospect_profile (immutable prospect state).
|
| 47 |
+
BUDGET_REVEAL_TEXT = {
|
| 48 |
+
"high": "We do have solid budget allocated for this initiative. ",
|
| 49 |
+
"medium": "We have some budget set aside, though flexibility is limited. ",
|
| 50 |
+
"low": "Our budget is quite constrained right now. ",
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class ProspectSimulator:
    """
    Deterministic rule-based prospect. No LLM, no transformers.

    Immutability guarantee:
        state.prospect_profile is never written here. The revealed budget
        is surfaced through the response *text* alone; every state write
        belongs to the environment (salespath_environment.py).
    """

    # Actions whose token is independent of state. Consulted only after
    # the violation and stall gates in _get_token have passed.
    _SIMPLE_TOKENS = {
        "PROSPECT": "open:positive_signal",
        "QUALIFY": "open:neutral_signal",
        "OFFER_DEMO": "accept:demo_scheduled",
        "NEGOTIATE": "open:neutral_signal",
        "FOLLOW_UP": "open:neutral_signal",
        "DISQUALIFY": "exit:disqualified",
    }

    # Maps the most recent rule violation to its override response token.
    _VIOLATION_TOKENS = {
        "R01": "objection:premature_pitch",
        "R03": "deflect:budget_not_discussed",
    }

    def respond(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> tuple[str, str]:
        """Return ``(response_token, response_text)`` for the agent's action."""
        token = self._get_token(action, state)
        return token, self._build_text(token, action, state)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_text(
        self,
        token: str,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        """Render the prospect's text for *token*, with optional budget reveal."""
        text = RESPONSE_TEXT[token]

        # A QUALIFY probe while the budget is still unknown reveals it via
        # a text prefix. We read hidden_state, never write prospect_profile.
        if (
            action.action_type == "QUALIFY"
            and state.prospect_profile.get("budget_signal", "unknown") == "unknown"
        ):
            revealed = state.hidden_state.get("revealed_budget", "medium")
            text = BUDGET_REVEAL_TEXT.get(revealed, "") + text

        return text

    def _get_token(
        self,
        action: SalesPathAction,
        state: SalesPathState,
    ) -> str:
        """Pick the response token for *action* given the current state."""
        atype = action.action_type
        hidden = state.hidden_state
        handled_before = state.objections_handled

        # 1. Rule-violation responses take absolute priority.
        if state.constraints_violated:
            override = self._VIOLATION_TOKENS.get(state.constraints_violated[-1])
            if override is not None:
                return override

        # 2. Stall injection for difficulty 3+ on later turns.
        #    Kept ahead of the action branches so it can actually fire.
        if state.difficulty >= 3 and state.turn_number >= 5:
            stall_prob = hidden.get("stall_probability", 0.0)
            if stall_prob > 0.0 and random.random() < stall_prob:
                return "deflect:stall"

        # 3. Deterministic, state-dependent action branches.
        if atype == "PRESENT":
            if state.difficulty >= 2 and handled_before == 0:
                return "objection:price"
            return "open:positive_signal"

        if atype == "HANDLE_OBJECTION":
            state.objections_handled += 1  # the simulator's only state write
            if state.objections_handled >= hidden.get("num_objections", 1):
                return "open:positive_signal"
            if handled_before == 0:
                return "objection:timing"
            return "open:positive_signal"

        if atype == "CLOSE":
            closeable = (
                hidden.get("true_budget", 0.7) >= hidden.get("close_threshold", 0.5)
                and state.prospect_profile.get("decision_maker", True)
            )
            return "accept:close_success" if closeable else "reject:close_failed"

        # 4. State-independent actions, with a neutral fallback for anything else.
        return self._SIMPLE_TOKENS.get(atype, "open:neutral_signal")
|
salespath_env/server/rules.py
CHANGED
|
@@ -78,10 +78,15 @@ def _no_repeat_action(
|
|
| 78 |
"""
|
| 79 |
R05:
|
| 80 |
Same action twice in a row is invalid.
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
return False
|
| 86 |
|
| 87 |
|
|
@@ -104,13 +109,35 @@ def _followup_timing(
|
|
| 104 |
) -> bool:
|
| 105 |
"""
|
| 106 |
R07:
|
| 107 |
-
FOLLOW_UP only valid after silence.
|
| 108 |
-
|
|
|
|
| 109 |
"""
|
| 110 |
if action.action_type == "FOLLOW_UP":
|
| 111 |
-
if state.conversation_history:
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
return False
|
| 115 |
|
| 116 |
|
|
@@ -120,15 +147,20 @@ def _disqualify_logic(
|
|
| 120 |
) -> bool:
|
| 121 |
"""
|
| 122 |
R08:
|
| 123 |
-
DISQUALIFY
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
| 125 |
"""
|
| 126 |
if action.action_type == "DISQUALIFY":
|
| 127 |
true_budget = state.hidden_state.get("true_budget", 0.5)
|
| 128 |
close_threshold = state.hidden_state.get("close_threshold", 0.5)
|
| 129 |
decision_maker = state.prospect_profile.get("decision_maker", True)
|
| 130 |
|
| 131 |
-
|
|
|
|
|
|
|
| 132 |
|
| 133 |
return False
|
| 134 |
|
|
|
|
| 78 |
"""
|
| 79 |
R05:
|
| 80 |
Same action twice in a row is invalid.
|
| 81 |
+
FIX: conversation_history alternates agent/prospect entries.
|
| 82 |
+
Must filter to agent-only turns before comparing.
|
| 83 |
+
"""
|
| 84 |
+
agent_turns = [
|
| 85 |
+
e for e in state.conversation_history
|
| 86 |
+
if e.get("speaker") == "agent"
|
| 87 |
+
]
|
| 88 |
+
if agent_turns:
|
| 89 |
+
return agent_turns[-1].get("action_type", "") == action.action_type
|
| 90 |
return False
|
| 91 |
|
| 92 |
|
|
|
|
| 109 |
) -> bool:
|
| 110 |
"""
|
| 111 |
R07:
|
| 112 |
+
FOLLOW_UP only valid after prospect silence (no response for 1+ agent turns).
|
| 113 |
+
Violation if the prospect HAS replied since the last agent action.
|
| 114 |
+
FIX: Previous logic was inverted β it was blocking valid FOLLOW_UP.
|
| 115 |
"""
|
| 116 |
if action.action_type == "FOLLOW_UP":
|
| 117 |
+
if not state.conversation_history:
|
| 118 |
+
return True # Nothing happened yet β FOLLOW_UP makes no sense
|
| 119 |
+
|
| 120 |
+
agent_turns = [
|
| 121 |
+
e for e in state.conversation_history
|
| 122 |
+
if e.get("speaker") == "agent"
|
| 123 |
+
]
|
| 124 |
+
prospect_turns = [
|
| 125 |
+
e for e in state.conversation_history
|
| 126 |
+
if e.get("speaker") == "prospect"
|
| 127 |
+
]
|
| 128 |
+
|
| 129 |
+
if not agent_turns:
|
| 130 |
+
return True
|
| 131 |
+
|
| 132 |
+
last_agent_turn_num = agent_turns[-1]["turn"]
|
| 133 |
+
last_prospect_turn_num = max(
|
| 134 |
+
(e["turn"] for e in prospect_turns),
|
| 135 |
+
default=0,
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
# Violation if prospect already responded AFTER the last agent turn
|
| 139 |
+
return last_prospect_turn_num >= last_agent_turn_num
|
| 140 |
+
|
| 141 |
return False
|
| 142 |
|
| 143 |
|
|
|
|
| 147 |
) -> bool:
|
| 148 |
"""
|
| 149 |
R08:
|
| 150 |
+
DISQUALIFY is correct ONLY when:
|
| 151 |
+
- true_budget < close_threshold AND
|
| 152 |
+
- decision_maker is False
|
| 153 |
+
Violation if prospect is actually closeable OR has a decision maker.
|
| 154 |
+
FIX: Both conditions must hold for a valid disqualification.
|
| 155 |
"""
|
| 156 |
if action.action_type == "DISQUALIFY":
|
| 157 |
true_budget = state.hidden_state.get("true_budget", 0.5)
|
| 158 |
close_threshold = state.hidden_state.get("close_threshold", 0.5)
|
| 159 |
decision_maker = state.prospect_profile.get("decision_maker", True)
|
| 160 |
|
| 161 |
+
# Valid disqualify requires: low budget AND no decision maker
|
| 162 |
+
valid_disqualify = (true_budget < close_threshold) and (not decision_maker)
|
| 163 |
+
return not valid_disqualify # Violation if NOT a valid disqualify case
|
| 164 |
|
| 165 |
return False
|
| 166 |
|
salespath_env/server/salespath_environment.py
CHANGED
|
@@ -220,6 +220,20 @@ class SalesPathEnvironment(Environment):
|
|
| 220 |
)
|
| 221 |
)
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
state.conversation_history.append(
|
| 224 |
{
|
| 225 |
"turn": state.turn_number,
|
|
|
|
| 220 |
)
|
| 221 |
)
|
| 222 |
|
| 223 |
+
# -----------------------------------
|
| 224 |
+
# Budget reveal (env owns state write)
|
| 225 |
+
# Simulator surfaced the info via text;
|
| 226 |
+
# now we update prospect_profile so rules
|
| 227 |
+
# (e.g. R03) can see the revealed value.
|
| 228 |
+
# -----------------------------------
|
| 229 |
+
if (
|
| 230 |
+
action.action_type == "QUALIFY"
|
| 231 |
+
and state.prospect_profile.get("budget_signal") == "unknown"
|
| 232 |
+
):
|
| 233 |
+
state.prospect_profile["budget_signal"] = (
|
| 234 |
+
state.hidden_state.get("revealed_budget", "medium")
|
| 235 |
+
)
|
| 236 |
+
|
| 237 |
state.conversation_history.append(
|
| 238 |
{
|
| 239 |
"turn": state.turn_number,
|
training/__pycache__/plot_rewards.cpython-312.pyc
ADDED
|
Binary file (5.92 kB). View file
|
|
|
training/__pycache__/train_grpo.cpython-312.pyc
ADDED
|
Binary file (13.6 kB). View file
|
|
|
training/__pycache__/train_sft.cpython-312.pyc
ADDED
|
Binary file (5.39 kB). View file
|
|
|
training/__pycache__/train_test.cpython-312.pyc
ADDED
|
Binary file (8.78 kB). View file
|
|
|
training/plot_rewards.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
plot_rewards.py β Visualise GRPO training progress
|
| 3 |
+
====================================================
|
| 4 |
+
Reads reward_log.jsonl written by train_grpo.py and
|
| 5 |
+
produces two plots:
|
| 6 |
+
|
| 7 |
+
1. Mean reward per step (with min/max band)
|
| 8 |
+
2. Reward by difficulty level
|
| 9 |
+
|
| 10 |
+
Run:
|
| 11 |
+
python training/plot_rewards.py
|
| 12 |
+
python training/plot_rewards.py --log ./reward_log.jsonl --out ./plots/
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import argparse
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
from collections import defaultdict
|
| 19 |
+
|
| 20 |
+
def load_log(path: str) -> list[dict]:
    """Parse a JSONL reward log into a list of record dicts.

    Blank lines are skipped; any malformed line raises
    ``json.JSONDecodeError``.
    """
    records: list[dict] = []
    # FIX: explicit encoding — the log is written as UTF-8, and the
    # platform default (e.g. cp1252 on Windows) could otherwise fail
    # or garble non-ASCII text fields.
    with open(path, encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if stripped:
                records.append(json.loads(stripped))
    return records
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def plot(log_path: str, out_dir: str):
    """Render GRPO training plots from a reward_log.jsonl file.

    Writes two PNGs into *out_dir*:
      reward_curve.png          - mean reward per step with min/max band
      reward_by_difficulty.png  - box plot of mean reward per difficulty

    Returns None. Prints a message and returns early when matplotlib is
    not installed or the log contains no records.
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        # FIX: replaced mojibake emoji in the message with plain text.
        print("matplotlib not installed. pip install matplotlib")
        return

    os.makedirs(out_dir, exist_ok=True)
    records = load_log(log_path)

    if not records:
        print(f"No records found in {log_path}")
        return

    steps = [r["step"] for r in records]
    means = [r["mean_reward"] for r in records]
    maxes = [r["max_reward"] for r in records]
    mins = [r["min_reward"] for r in records]
    difficulties = [r["difficulty"] for r in records]

    # --- Plot 1: mean reward with min/max band ---
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(steps, means, label="Mean reward", color="#4C6EF5", linewidth=2)
    ax.fill_between(steps, mins, maxes, alpha=0.2, color="#4C6EF5", label="Min/Max band")
    ax.axhline(0, color="gray", linestyle="--", linewidth=0.8)

    # Vertical markers wherever the curriculum difficulty changes.
    prev_d = None
    for s, d in zip(steps, difficulties):
        if d != prev_d:
            ax.axvline(s, color="orange", linestyle=":", linewidth=1.2, alpha=0.7)
            ax.text(s + 0.5, ax.get_ylim()[0] * 0.9, f"D{d}", fontsize=8, color="orange")
            prev_d = d

    ax.set_xlabel("Training Step")
    ax.set_ylabel("Episode Reward")
    ax.set_title("SalesPath GRPO - Mean Reward per Step")
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    path1 = os.path.join(out_dir, "reward_curve.png")
    plt.savefig(path1, dpi=150)
    plt.close(fig)  # FIX: release the figure; repeated calls were leaking memory
    print(f"Saved: {path1}")

    # --- Plot 2: per-difficulty box ---
    by_diff = defaultdict(list)
    for r in records:
        by_diff[r["difficulty"]].append(r["mean_reward"])

    fig2, ax2 = plt.subplots(figsize=(7, 5))
    labels = sorted(by_diff.keys())
    data = [by_diff[d] for d in labels]
    # NOTE(review): `labels=` was renamed `tick_labels=` in matplotlib 3.9
    # and emits a deprecation warning there; kept as `labels=` for
    # compatibility with older matplotlib — confirm the pinned version.
    ax2.boxplot(data, labels=[f"Difficulty {d}" for d in labels], patch_artist=True)
    ax2.set_ylabel("Mean Episode Reward")
    ax2.set_title("Reward Distribution by Difficulty Level")
    ax2.grid(True, alpha=0.3)
    plt.tight_layout()
    path2 = os.path.join(out_dir, "reward_by_difficulty.png")
    plt.savefig(path2, dpi=150)
    plt.close(fig2)  # FIX: release the second figure as well
    print(f"Saved: {path2}")

    # Quick textual summary of the run.
    best = max(means)
    print(f"\nTraining summary ({len(records)} steps)")
    print(f"  Start mean reward : {means[0]:+.3f}")
    print(f"  End mean reward   : {means[-1]:+.3f}")
    print(f"  Best mean reward  : {best:+.3f} (step {steps[means.index(best)]})")
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
if __name__ == "__main__":
    # CLI entry point: --log is the reward log path, --out the plot directory.
    cli = argparse.ArgumentParser()
    cli.add_argument("--log", default="./reward_log.jsonl")
    cli.add_argument("--out", default="./plots")
    ns = cli.parse_args()
    plot(ns.log, ns.out)
|
training/sft_demos.jsonl
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Meridian Retail, a medium retail company. Pain points: manual inventory tracking, slow reporting. Begin the sales conversation. [Stage: START | Turn: 0]"}, {"role": "assistant", "content": "ACTION: PROSPECT\nCONTENT: Hi! I understand Meridian Retail is dealing with manual inventory tracking and slow reporting. I'd love to learn more about how these challenges are impacting your day-to-day operations."}]}
|
| 2 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Meridian Retail, a medium retail company. Pain points: manual inventory tracking, slow reporting. [Stage: PROSPECT | Turn: 1 | Prospect said: That sounds interesting. Tell me more about how this works.]"}, {"role": "assistant", "content": "ACTION: QUALIFY\nCONTENT: Great! Before I dive in, could you share more about your budget range and who the key decision makers are for this type of initiative?"}]}
|
| 3 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Meridian Retail, a medium retail company. Pain points: manual inventory tracking, slow reporting. [Stage: QUALIFY | Turn: 2 | Prospect said: I see. We're evaluating a few options at the moment. Budget signal: high. Steps completed: PROSPECT, QUALIFY]"}, {"role": "assistant", "content": "ACTION: PRESENT\nCONTENT: Perfect. Our platform automates inventory tracking in real-time and generates reports in seconds β exactly the pain points you mentioned. It integrates with your existing systems with zero downtime."}]}
|
| 4 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Meridian Retail, a medium retail company. [Stage: PRESENT | Turn: 3 | Prospect said: That sounds interesting. Tell me more. | Steps completed: PROSPECT, QUALIFY, PRESENT]"}, {"role": "assistant", "content": "ACTION: CLOSE\nCONTENT: Based on everything we've discussed, I believe our solution is an excellent fit for Meridian Retail. Shall we move forward and get the paperwork started?"}]}
|
| 5 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Apex Logistics, an enterprise logistics company. Pain points: route optimization, driver coordination, fuel tracking. Budget: unknown. Begin the sales conversation. [Stage: START | Turn: 0]"}, {"role": "assistant", "content": "ACTION: PROSPECT\nCONTENT: Hello! I see Apex Logistics is tackling route optimization and driver coordination challenges. These are critical in logistics. I'd love to explore how we can help β could you walk me through your current process?"}]}
|
| 6 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Apex Logistics. [Stage: PROSPECT | Turn: 1 | Prospect said: That sounds interesting. | Budget: unknown]"}, {"role": "assistant", "content": "ACTION: QUALIFY\nCONTENT: To make sure I propose the right solution, could you share your approximate budget for this initiative and confirm who the final decision maker would be?"}]}
|
| 7 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Apex Logistics. [Stage: QUALIFY | Turn: 2 | Prospect said: We have some budget set aside. Budget signal: medium | Steps: PROSPECT, QUALIFY]"}, {"role": "assistant", "content": "ACTION: PRESENT\nCONTENT: Our platform optimizes routes using real-time traffic and GPS data, cutting fuel costs by up to 20% and improving driver coordination through a unified dashboard. Companies your size see ROI within 6 months."}]}
|
| 8 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Apex Logistics. [Stage: PRESENT | Turn: 3 | Prospect said: The pricing seems higher than what we budgeted. | Objections: 0 | Steps: PROSPECT, QUALIFY, PRESENT]"}, {"role": "assistant", "content": "ACTION: HANDLE_OBJECTION\nCONTENT: That's a fair concern. The upfront cost is offset by the fuel savings and reduced overtime β most clients break even within 4 months. We also offer flexible payment plans. Would that make it more accessible?"}]}
|
| 9 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Apex Logistics. [Stage: HANDLE_OBJECTION | Turn: 4 | Prospect said: That sounds reasonable. | Objections handled: 1 | Steps: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION]"}, {"role": "assistant", "content": "ACTION: OFFER_DEMO\nCONTENT: I'd love to show you the platform in action with your actual data. Can we schedule a 30-minute live demo next week so your team can see the route optimization in real time?"}]}
|
| 10 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Apex Logistics. [Stage: OFFER_DEMO | Turn: 5 | Prospect said: Yes, let's set up a demo. | Steps: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO]"}, {"role": "assistant", "content": "ACTION: CLOSE\nCONTENT: Excellent! Given the demo and the ROI we've discussed, I think we're aligned. Are you ready to move forward with the agreement?"}]}
|
| 11 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Cipher Tech, a small tech company. Budget signal: high (but misleading β actual budget is low). No decision maker present. [Stage: START | Turn: 0]"}, {"role": "assistant", "content": "ACTION: PROSPECT\nCONTENT: Hi! I understand you're facing security and compliance challenges. Could you tell me more about your current setup and what you're trying to achieve?"}]}
|
| 12 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Cipher Tech. [Stage: QUALIFY | Turn: 2 | Prospect said: Our budget is quite constrained. No decision maker available. | Budget: low | Decision maker: false | Steps: PROSPECT, QUALIFY]"}, {"role": "assistant", "content": "ACTION: DISQUALIFY\nCONTENT: Thank you for your transparency. Based on your current budget constraints and the absence of a decision maker, I don't think we're the right fit at this time. I'll leave the door open for when circumstances change."}]}
|
| 13 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Nova Financial. [Stage: OFFER_DEMO | Turn: 7 | Prospect said: Let me get back to you. A lot is happening on our end. | Objections: 2 | Budget: medium | Steps: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, HANDLE_OBJECTION, OFFER_DEMO]"}, {"role": "assistant", "content": "ACTION: NEGOTIATE\nCONTENT: I understand you're busy. To make this decision easier, I can offer a 3-month pilot at a reduced rate so your team can validate the ROI before full commitment."}]}
|
| 14 |
+
{"messages": [{"role": "system", "content": "You are a professional B2B sales agent. Your goal is to close deals by following the correct sales process. Always respond with exactly one action in the format:\nACTION: <ACTION_TYPE>\nCONTENT: <your message to the prospect>\n\nValid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"}, {"role": "user", "content": "You are engaging Atlas Health. [Stage: NEGOTIATE | Turn: 9 | Prospect said: I see. We're evaluating options. | Steps: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, HANDLE_OBJECTION, OFFER_DEMO, NEGOTIATE]"}, {"role": "assistant", "content": "ACTION: CLOSE\nCONTENT: We've covered your pain points, seen the demo, and aligned on terms. I believe this is the right solution for Atlas Health. Can we finalize the agreement today?"}]}
|
training/train_grpo.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
train_grpo.py β Full GRPO RL Training
|
| 3 |
+
=======================================
|
| 4 |
+
Stage 2: loads the SFT checkpoint and fine-tunes with GRPO
|
| 5 |
+
using live rollouts against the SalesPath environment.
|
| 6 |
+
|
| 7 |
+
Architecture
|
| 8 |
+
------------
|
| 9 |
+
SFT checkpoint β Unsloth 4-bit QLoRA β GRPOTrainer (TRL)
|
| 10 |
+
β
|
| 11 |
+
SalesPath env (HTTP)
|
| 12 |
+
reward = composite score
|
| 13 |
+
|
| 14 |
+
Recommended hardware : A100 / T4 GPU (Google Colab)
|
| 15 |
+
Expected runtime : ~45-90 min for 200 steps on T4
|
| 16 |
+
|
| 17 |
+
Run:
|
| 18 |
+
# 1. Start the env server in another terminal:
|
| 19 |
+
# uvicorn salespath_env.server.app:app --port 7860
|
| 20 |
+
#
|
| 21 |
+
# 2. Then run this script:
|
| 22 |
+
python training/train_grpo.py
|
| 23 |
+
|
| 24 |
+
Outputs:
|
| 25 |
+
./grpo_checkpoint/ β final RL-trained model
|
| 26 |
+
reward_log.jsonl β per-step reward components for plotting
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
from __future__ import annotations
|
| 30 |
+
|
| 31 |
+
import json
|
| 32 |
+
import os
|
| 33 |
+
import re
|
| 34 |
+
import sys
|
| 35 |
+
import time
|
| 36 |
+
from typing import Any
|
| 37 |
+
|
| 38 |
+
import torch
|
| 39 |
+
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# Config
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
ENV_URL = os.environ.get("SALESPATH_ENV_URL", "http://localhost:7860")
|
| 44 |
+
SFT_CHECKPOINT = os.environ.get("SFT_CHECKPOINT", "./sft_checkpoint")
|
| 45 |
+
OUTPUT_DIR = "./grpo_checkpoint"
|
| 46 |
+
REWARD_LOG_PATH = "./reward_log.jsonl"
|
| 47 |
+
|
| 48 |
+
MODEL_NAME = SFT_CHECKPOINT # start from SFT weights
|
| 49 |
+
MAX_SEQ_LEN = 1024
|
| 50 |
+
LORA_R = 16
|
| 51 |
+
LORA_ALPHA = 16
|
| 52 |
+
|
| 53 |
+
# GRPO hyper-parameters
|
| 54 |
+
NUM_TRAIN_STEPS = 200 # increase to 500+ for best results
|
| 55 |
+
ROLLOUTS_PER_STEP = 8 # episodes collected before each gradient update
|
| 56 |
+
DIFFICULTY_SCHEDULE = { # step β difficulty to use for rollouts
|
| 57 |
+
0: 1,
|
| 58 |
+
50: 2,
|
| 59 |
+
100: 3,
|
| 60 |
+
150: 4,
|
| 61 |
+
}
|
| 62 |
+
LR = 5e-6
|
| 63 |
+
KL_COEFF = 0.05 # keep close to SFT policy
|
| 64 |
+
GRAD_ACCUM = 4
|
| 65 |
+
BATCH_SIZE = 2
|
| 66 |
+
|
| 67 |
+
REPORT_TO = "none" # swap to "wandb" for live reward curves
|
| 68 |
+
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
# 1. Load model (Unsloth 4-bit QLoRA)
|
| 71 |
+
# ---------------------------------------------------------------------------
|
| 72 |
+
|
| 73 |
+
try:
|
| 74 |
+
from unsloth import FastLanguageModel
|
| 75 |
+
USE_UNSLOTH = True
|
| 76 |
+
except ImportError:
|
| 77 |
+
USE_UNSLOTH = False
|
| 78 |
+
print("β οΈ Unsloth not found β falling back to HuggingFace transformers.")
|
| 79 |
+
|
| 80 |
+
if USE_UNSLOTH:
|
| 81 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 82 |
+
model_name=MODEL_NAME,
|
| 83 |
+
max_seq_length=MAX_SEQ_LEN,
|
| 84 |
+
dtype=None,
|
| 85 |
+
load_in_4bit=True,
|
| 86 |
+
)
|
| 87 |
+
model = FastLanguageModel.get_peft_model(
|
| 88 |
+
model,
|
| 89 |
+
r=LORA_R,
|
| 90 |
+
lora_alpha=LORA_ALPHA,
|
| 91 |
+
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
|
| 92 |
+
"gate_proj", "up_proj", "down_proj"],
|
| 93 |
+
lora_dropout=0.0,
|
| 94 |
+
bias="none",
|
| 95 |
+
use_gradient_checkpointing="unsloth",
|
| 96 |
+
random_state=42,
|
| 97 |
+
)
|
| 98 |
+
else:
|
| 99 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
| 100 |
+
from peft import get_peft_model, LoraConfig, TaskType
|
| 101 |
+
|
| 102 |
+
bnb = BitsAndBytesConfig(
|
| 103 |
+
load_in_4bit=True,
|
| 104 |
+
bnb_4bit_use_double_quant=True,
|
| 105 |
+
bnb_4bit_quant_type="nf4",
|
| 106 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 107 |
+
)
|
| 108 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 109 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 110 |
+
MODEL_NAME, quantization_config=bnb, device_map="auto"
|
| 111 |
+
)
|
| 112 |
+
model = get_peft_model(model, LoraConfig(
|
| 113 |
+
r=LORA_R, lora_alpha=LORA_ALPHA,
|
| 114 |
+
target_modules=["q_proj", "v_proj"],
|
| 115 |
+
task_type=TaskType.CAUSAL_LM,
|
| 116 |
+
))
|
| 117 |
+
|
| 118 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 119 |
+
tokenizer.padding_side = "right"
|
| 120 |
+
print(f"β
Model loaded from: {MODEL_NAME}")
|
| 121 |
+
|
| 122 |
+
# ---------------------------------------------------------------------------
|
| 123 |
+
# 2. Environment client
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
|
| 126 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 127 |
+
from salespath_env.client import SalesPathClient
|
| 128 |
+
|
| 129 |
+
client = SalesPathClient(ENV_URL)
|
| 130 |
+
print(f"β
Connected to env at {ENV_URL} β {client.health()}")
|
| 131 |
+
|
| 132 |
+
# ---------------------------------------------------------------------------
|
| 133 |
+
# 3. Prompt / action helpers
|
| 134 |
+
# ---------------------------------------------------------------------------
|
| 135 |
+
|
| 136 |
+
SYSTEM_PROMPT = (
|
| 137 |
+
"You are a professional B2B sales agent. "
|
| 138 |
+
"Follow the correct sales process to close deals.\n"
|
| 139 |
+
"Always respond with exactly ONE action in this format:\n"
|
| 140 |
+
"ACTION: <ACTION_TYPE>\n"
|
| 141 |
+
"CONTENT: <your message>\n\n"
|
| 142 |
+
"Valid actions: PROSPECT, QUALIFY, PRESENT, HANDLE_OBJECTION, "
|
| 143 |
+
"OFFER_DEMO, NEGOTIATE, CLOSE, FOLLOW_UP, DISQUALIFY"
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
# Parses the agent's "ACTION: <TYPE> / CONTENT: <msg>" output format.
# Uses \s* (not a mandatory "\n") between the two fields so that a model
# that emits both fields on ONE line ("ACTION: CLOSE CONTENT: ...") still
# parses instead of silently falling back to the QUALIFY default.
ACTION_RE = re.compile(
    r"ACTION:\s*([A-Z_]+)\s*CONTENT:\s*(.+)",
    re.DOTALL,
)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def obs_to_user_message(obs: dict, stage: str, turn: int) -> str:
    """
    Render an environment observation as the next user-turn message.

    Includes the prospect's reply, optional progress / violation
    summaries, and a trailing "[Stage: ... | Turn: ...]" marker.
    """
    lines = [obs.get("prospect_response", "")]
    completed = obs.get("steps_completed")
    if completed:
        lines.append(f"Steps completed: {', '.join(completed)}")
    violated = obs.get("constraints_violated")
    if violated:
        lines.append(f"❌ Violations: {', '.join(violated)}")
    lines.append(f"[Stage: {stage} | Turn: {turn}]")
    return "\n".join(lines)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def parse_action(text: str) -> tuple[str, str]:
    """Extract (action_type, content) from the model's raw output."""
    stripped = text.strip()
    match = ACTION_RE.search(stripped)
    if match is None:
        # Model ignored the required format: treat the whole text as a
        # QUALIFY message rather than crashing the rollout.
        return "QUALIFY", stripped
    action, content = match.groups()
    return action.strip(), content.strip()
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def generate_action(messages: list[dict]) -> str:
    """
    Generate one assistant turn for *messages*.

    Runs a single sampled forward pass (temperature 0.7, top-p 0.9) and
    returns only the newly generated text, prompt excluded.
    """
    prompt_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        generated = model.generate(
            prompt_ids,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Slice off the prompt tokens; decode just the generated suffix.
    completion_ids = generated[0, prompt_ids.shape[-1]:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# ---------------------------------------------------------------------------
|
| 195 |
+
# 4. Rollout collector
|
| 196 |
+
# ---------------------------------------------------------------------------
|
| 197 |
+
|
| 198 |
+
def run_episode(difficulty: int) -> list[dict]:
    """
    Play one full episode against the SalesPath environment.

    Returns one sample dict per agent turn with keys:
    messages (prompt context), completion, reward, reward_components.
    """
    obs = client.reset(difficulty=difficulty)
    history = [{"role": "system", "content": SYSTEM_PROMPT}]
    collected: list[dict] = []

    for _turn in range(20):  # hard cap matches env MAX_TURNS
        history.append({
            "role": "user",
            "content": obs_to_user_message(
                obs,
                obs.get("workflow_stage", "START"),
                obs.get("turn_number", 0),
            ),
        })

        # Sample an action from the current policy and parse it.
        completion = generate_action(list(history))
        action_type, content = parse_action(completion)

        # Apply it to the environment.
        obs = client.step(action_type, content)

        collected.append({
            "messages": list(history),
            "completion": completion,
            "reward": obs["reward"],
            "reward_components": obs.get("reward_components", {}),
        })
        history.append({"role": "assistant", "content": completion})

        if obs["done"]:
            break

    return collected
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def collect_rollouts(
    n: int,
    difficulty: int,
) -> tuple[list[str], list[str], list[float]]:
    """
    Collect *n* episode rollouts at the given difficulty.

    Returns flat (prompts, completions, rewards) lists suitable for
    GRPOTrainer — one entry per agent turn across all episodes.
    """
    prompts: list[str] = []
    completions: list[str] = []
    rewards: list[float] = []

    for episode_idx in range(n):
        episode = run_episode(difficulty)
        for sample in episode:
            rendered_prompt = tokenizer.apply_chat_template(
                sample["messages"],
                tokenize=False,
                add_generation_prompt=True,
            )
            prompts.append(rendered_prompt)
            completions.append(sample["completion"])
            rewards.append(sample["reward"])

        episode_total = sum(sample["reward"] for sample in episode)
        print(f" ep {episode_idx+1}/{n} steps={len(episode)} ep_reward={episode_total:+.3f}")

    return prompts, completions, rewards
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
# ---------------------------------------------------------------------------
|
| 266 |
+
# 5. Reward log helper
|
| 267 |
+
# ---------------------------------------------------------------------------
|
| 268 |
+
|
| 269 |
+
reward_log: list[dict] = []
|
| 270 |
+
|
| 271 |
+
def log_rewards(step: int, rewards: list[float], difficulty: int) -> None:
    """
    Record aggregate reward statistics for one training step.

    Appends an entry to the in-memory `reward_log` and to the
    REWARD_LOG_PATH JSONL file, then prints a one-line summary.

    Args:
        step:       1-based training step number.
        rewards:    per-sample rewards collected this step.
        difficulty: environment difficulty used for the rollouts.
    """
    if not rewards:
        # Guard: an empty rollout batch would otherwise crash on
        # min()/max() and divide by zero. Log nothing, keep training.
        print(f" πŸ“ˆ step={step:4d} diff={difficulty} (no samples collected)")
        return

    entry = {
        "step": step,
        "difficulty": difficulty,
        "mean_reward": sum(rewards) / len(rewards),
        "max_reward": max(rewards),
        "min_reward": min(rewards),
        "n_samples": len(rewards),
    }
    reward_log.append(entry)
    # Append-mode JSONL so a resumed run extends rather than overwrites.
    with open(REWARD_LOG_PATH, "a") as f:
        f.write(json.dumps(entry) + "\n")
    print(
        f" πŸ“ˆ step={step:4d} diff={difficulty} "
        f"mean={entry['mean_reward']:+.3f} "
        f"max={entry['max_reward']:+.3f}"
    )
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
# ---------------------------------------------------------------------------
|
| 291 |
+
# 6. GRPOTrainer setup
|
| 292 |
+
# ---------------------------------------------------------------------------
|
| 293 |
+
|
| 294 |
+
from datasets import Dataset
|
| 295 |
+
from trl import GRPOTrainer, GRPOConfig
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def make_reward_fn(precomputed: dict[str, float]):
    """
    Build the reward callable GRPOTrainer expects.

    Rollouts are executed ahead of time; the returned function simply
    looks each (prompt + completion) concatenation up in *precomputed*,
    defaulting to 0.0 for pairs it has never seen.
    """
    def reward_fn(prompts: list[str], completions: list[str], **kwargs) -> list[float]:
        scores: list[float] = []
        for prompt, completion in zip(prompts, completions):
            scores.append(precomputed.get(prompt + completion, 0.0))
        return scores

    return reward_fn
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
grpo_config = GRPOConfig(
|
| 312 |
+
output_dir=OUTPUT_DIR,
|
| 313 |
+
num_train_epochs=1, # we control steps manually
|
| 314 |
+
per_device_train_batch_size=BATCH_SIZE,
|
| 315 |
+
gradient_accumulation_steps=GRAD_ACCUM,
|
| 316 |
+
learning_rate=LR,
|
| 317 |
+
kl_coeff=KL_COEFF,
|
| 318 |
+
logging_steps=1,
|
| 319 |
+
save_steps=50,
|
| 320 |
+
fp16=not USE_UNSLOTH,
|
| 321 |
+
report_to=REPORT_TO,
|
| 322 |
+
max_completion_length=128,
|
| 323 |
+
remove_unused_columns=False,
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
# ---------------------------------------------------------------------------
|
| 328 |
+
# 7. Training loop
|
| 329 |
+
# ---------------------------------------------------------------------------
|
| 330 |
+
|
| 331 |
+
print(f"\nπ Starting GRPO training for {NUM_TRAIN_STEPS} steps")
|
| 332 |
+
print(f" Rollouts per step : {ROLLOUTS_PER_STEP}")
|
| 333 |
+
print(f" KL coefficient : {KL_COEFF}")
|
| 334 |
+
print(f" Difficulty schedule: {DIFFICULTY_SCHEDULE}\n")
|
| 335 |
+
|
| 336 |
+
for step in range(NUM_TRAIN_STEPS):
|
| 337 |
+
# Determine difficulty for this step
|
| 338 |
+
difficulty = 1
|
| 339 |
+
for threshold, d in sorted(DIFFICULTY_SCHEDULE.items()):
|
| 340 |
+
if step >= threshold:
|
| 341 |
+
difficulty = d
|
| 342 |
+
|
| 343 |
+
print(f"\n[Step {step+1}/{NUM_TRAIN_STEPS}] difficulty={difficulty}")
|
| 344 |
+
|
| 345 |
+
# -- Collect rollouts --
|
| 346 |
+
prompts, completions, rewards = collect_rollouts(
|
| 347 |
+
ROLLOUTS_PER_STEP, difficulty
|
| 348 |
+
)
|
| 349 |
+
log_rewards(step + 1, rewards, difficulty)
|
| 350 |
+
|
| 351 |
+
# -- Build dataset for this step --
|
| 352 |
+
reward_lookup = {
|
| 353 |
+
p + c: r
|
| 354 |
+
for p, c, r in zip(prompts, completions, rewards)
|
| 355 |
+
}
|
| 356 |
+
step_dataset = Dataset.from_dict({
|
| 357 |
+
"prompt": prompts,
|
| 358 |
+
"completion": completions,
|
| 359 |
+
})
|
| 360 |
+
|
| 361 |
+
# -- GRPOTrainer one-step update --
|
| 362 |
+
trainer = GRPOTrainer(
|
| 363 |
+
model=model,
|
| 364 |
+
reward_funcs=make_reward_fn(reward_lookup),
|
| 365 |
+
args=grpo_config,
|
| 366 |
+
train_dataset=step_dataset,
|
| 367 |
+
processing_class=tokenizer,
|
| 368 |
+
)
|
| 369 |
+
trainer.train()
|
| 370 |
+
|
| 371 |
+
# Save checkpoint every 50 steps
|
| 372 |
+
if (step + 1) % 50 == 0:
|
| 373 |
+
ckpt = os.path.join(OUTPUT_DIR, f"step_{step+1}")
|
| 374 |
+
model.save_pretrained(ckpt)
|
| 375 |
+
tokenizer.save_pretrained(ckpt)
|
| 376 |
+
print(f" πΎ Checkpoint saved: {ckpt}")
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
# ---------------------------------------------------------------------------
|
| 380 |
+
# 8. Final save
|
| 381 |
+
# ---------------------------------------------------------------------------
|
| 382 |
+
|
| 383 |
+
model.save_pretrained(OUTPUT_DIR)
|
| 384 |
+
tokenizer.save_pretrained(OUTPUT_DIR)
|
| 385 |
+
print(f"\nβ
GRPO training complete.")
|
| 386 |
+
print(f" Model β {OUTPUT_DIR}")
|
| 387 |
+
print(f" Rewards β {REWARD_LOG_PATH}")
|
| 388 |
+
print("\nPlot rewards with: python training/plot_rewards.py")
|
training/train_sft.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
train_sft.py β SFT Warm-Start Stage
|
| 3 |
+
=====================================
|
| 4 |
+
Fine-tunes a base LLM on expert sales demonstrations BEFORE GRPO.
|
| 5 |
+
SFT teaches the model the correct action FORMAT and rough ordering,
|
| 6 |
+
giving GRPO a much better starting policy.
|
| 7 |
+
|
| 8 |
+
Recommended hardware : T4 GPU (Google Colab free tier)
|
| 9 |
+
Expected runtime : ~10-15 minutes for 14 demos Γ 3 epochs
|
| 10 |
+
|
| 11 |
+
Run:
|
| 12 |
+
python training/train_sft.py
|
| 13 |
+
|
| 14 |
+
Outputs:
|
| 15 |
+
./sft_checkpoint/ β load this as base in train_grpo.py
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import json
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
|
| 22 |
+
# ---------------------------------------------------------------------------
|
| 23 |
+
# Config β tweak these
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
MODEL_NAME = "unsloth/Qwen2.5-1.5B-Instruct" # swap for 0.5B on tiny GPU
|
| 26 |
+
OUTPUT_DIR = "./sft_checkpoint"
|
| 27 |
+
DATA_PATH = os.path.join(os.path.dirname(__file__), "sft_demos.jsonl")
|
| 28 |
+
MAX_SEQ_LEN = 1024
|
| 29 |
+
NUM_EPOCHS = 3
|
| 30 |
+
BATCH_SIZE = 2
|
| 31 |
+
GRAD_ACCUM = 4
|
| 32 |
+
LR = 2e-4
|
| 33 |
+
LORA_R = 16
|
| 34 |
+
LORA_ALPHA = 16
|
| 35 |
+
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
# 1. Load model with Unsloth 4-bit QLoRA
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
try:
|
| 40 |
+
from unsloth import FastLanguageModel
|
| 41 |
+
USE_UNSLOTH = True
|
| 42 |
+
except ImportError:
|
| 43 |
+
USE_UNSLOTH = False
|
| 44 |
+
print("β οΈ Unsloth not installed β falling back to plain HuggingFace.")
|
| 45 |
+
print(" Install with: pip install unsloth")
|
| 46 |
+
|
| 47 |
+
if USE_UNSLOTH:
|
| 48 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 49 |
+
model_name=MODEL_NAME,
|
| 50 |
+
max_seq_length=MAX_SEQ_LEN,
|
| 51 |
+
dtype=None, # auto-detect: bf16 on Ampere+, fp16 otherwise
|
| 52 |
+
load_in_4bit=True,
|
| 53 |
+
)
|
| 54 |
+
model = FastLanguageModel.get_peft_model(
|
| 55 |
+
model,
|
| 56 |
+
r=LORA_R,
|
| 57 |
+
lora_alpha=LORA_ALPHA,
|
| 58 |
+
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
|
| 59 |
+
"gate_proj", "up_proj", "down_proj"],
|
| 60 |
+
lora_dropout=0.05,
|
| 61 |
+
bias="none",
|
| 62 |
+
use_gradient_checkpointing="unsloth",
|
| 63 |
+
random_state=42,
|
| 64 |
+
)
|
| 65 |
+
else:
|
| 66 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
| 67 |
+
from peft import get_peft_model, LoraConfig, TaskType
|
| 68 |
+
import torch
|
| 69 |
+
|
| 70 |
+
bnb_config = BitsAndBytesConfig(
|
| 71 |
+
load_in_4bit=True,
|
| 72 |
+
bnb_4bit_use_double_quant=True,
|
| 73 |
+
bnb_4bit_quant_type="nf4",
|
| 74 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 75 |
+
)
|
| 76 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 77 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 78 |
+
MODEL_NAME,
|
| 79 |
+
quantization_config=bnb_config,
|
| 80 |
+
device_map="auto",
|
| 81 |
+
)
|
| 82 |
+
lora_config = LoraConfig(
|
| 83 |
+
r=LORA_R, lora_alpha=LORA_ALPHA,
|
| 84 |
+
target_modules=["q_proj", "v_proj"],
|
| 85 |
+
task_type=TaskType.CAUSAL_LM,
|
| 86 |
+
)
|
| 87 |
+
model = get_peft_model(model, lora_config)
|
| 88 |
+
|
| 89 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 90 |
+
tokenizer.padding_side = "right"
|
| 91 |
+
|
| 92 |
+
print(f"β
Model loaded: {MODEL_NAME} (4-bit QLoRA, r={LORA_R})")
|
| 93 |
+
|
| 94 |
+
# ---------------------------------------------------------------------------
|
| 95 |
+
# 2. Load & format SFT dataset
|
| 96 |
+
# ---------------------------------------------------------------------------
|
| 97 |
+
|
| 98 |
+
def load_sft_data(path: str) -> list[dict]:
    """
    Load SFT demonstration records from a JSONL file.

    Blank lines are skipped; every other line must be a JSON object.

    Args:
        path: path to the .jsonl demonstrations file.

    Returns:
        One dict per demonstration, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    # JSONL is UTF-8 by convention; be explicit so a non-UTF-8 locale
    # default (e.g. cp1252 on Windows) cannot corrupt the demo text.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def format_chat(record: dict) -> str:
    """
    Render one demonstration record's messages into a single training
    string using the model's chat template (no generation prompt).
    """
    rendered = tokenizer.apply_chat_template(
        record["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return rendered
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
raw_data = load_sft_data(DATA_PATH)
|
| 120 |
+
print(f"β
Loaded {len(raw_data)} SFT demonstrations from {DATA_PATH}")
|
| 121 |
+
|
| 122 |
+
from datasets import Dataset
|
| 123 |
+
|
| 124 |
+
formatted = [{"text": format_chat(r)} for r in raw_data]
|
| 125 |
+
dataset = Dataset.from_list(formatted)
|
| 126 |
+
print(f" Sample:\n{formatted[0]['text'][:300]}\n...")
|
| 127 |
+
|
| 128 |
+
# ---------------------------------------------------------------------------
|
| 129 |
+
# 3. SFT Trainer
|
| 130 |
+
# ---------------------------------------------------------------------------
|
| 131 |
+
|
| 132 |
+
from trl import SFTTrainer, SFTConfig
|
| 133 |
+
|
| 134 |
+
sft_config = SFTConfig(
|
| 135 |
+
output_dir=OUTPUT_DIR,
|
| 136 |
+
num_train_epochs=NUM_EPOCHS,
|
| 137 |
+
per_device_train_batch_size=BATCH_SIZE,
|
| 138 |
+
gradient_accumulation_steps=GRAD_ACCUM,
|
| 139 |
+
learning_rate=LR,
|
| 140 |
+
warmup_ratio=0.1,
|
| 141 |
+
lr_scheduler_type="cosine",
|
| 142 |
+
logging_steps=1,
|
| 143 |
+
save_strategy="epoch",
|
| 144 |
+
fp16=not USE_UNSLOTH, # Unsloth handles this internally
|
| 145 |
+
bf16=False,
|
| 146 |
+
max_seq_length=MAX_SEQ_LEN,
|
| 147 |
+
dataset_text_field="text",
|
| 148 |
+
report_to="none", # swap to "wandb" if you have W&B set up
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
trainer = SFTTrainer(
|
| 152 |
+
model=model,
|
| 153 |
+
tokenizer=tokenizer,
|
| 154 |
+
train_dataset=dataset,
|
| 155 |
+
args=sft_config,
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
print("π Starting SFT training...")
|
| 159 |
+
trainer_stats = trainer.train()
|
| 160 |
+
|
| 161 |
+
print(f"\nβ
SFT done.")
|
| 162 |
+
print(f" Loss : {trainer_stats.training_loss:.4f}")
|
| 163 |
+
print(f" Saved : {OUTPUT_DIR}")
|
| 164 |
+
|
| 165 |
+
# ---------------------------------------------------------------------------
|
| 166 |
+
# 4. Save final checkpoint
|
| 167 |
+
# ---------------------------------------------------------------------------
|
| 168 |
+
|
| 169 |
+
model.save_pretrained(OUTPUT_DIR)
|
| 170 |
+
tokenizer.save_pretrained(OUTPUT_DIR)
|
| 171 |
+
print(f"β
Checkpoint saved to {OUTPUT_DIR}")
|
| 172 |
+
print("\nNext step β run: python training/train_grpo.py")
|
training/train_test.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
train_test.py β Quick smoke test (no GPU, no LLM needed).
|
| 3 |
+
|
| 4 |
+
Tests the FULL pipeline end-to-end in ~30 seconds:
|
| 5 |
+
1. Starts the env server in a subprocess
|
| 6 |
+
2. Runs 4 scripted episodes (one per difficulty)
|
| 7 |
+
3. Prints reward traces and rule-violation checks
|
| 8 |
+
4. Verifies the three fixed bugs (R05, R07, R08) behave correctly
|
| 9 |
+
|
| 10 |
+
Run:
|
| 11 |
+
python training/train_test.py
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import subprocess
|
| 15 |
+
import sys
|
| 16 |
+
import time
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
# Add project root to path so we can import client
|
| 20 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# ---------------------------------------------------------------------------
|
| 24 |
+
# 1. Start server in background
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
|
| 27 |
+
def start_server(port: int = 7860):
    """
    Launch the SalesPath env server (uvicorn) in a subprocess.

    Instead of a fixed sleep, polls the TCP port until it accepts
    connections (up to ~15 s), so slow machines don't race the tests.

    Args:
        port: TCP port to bind uvicorn to.

    Returns:
        The subprocess.Popen handle; caller is responsible for
        terminate()-ing it.
    """
    import socket

    proc = subprocess.Popen(
        [
            sys.executable, "-m", "uvicorn",
            "salespath_env.server.app:app",
            "--host", "0.0.0.0",
            "--port", str(port),
            "--log-level", "error",
        ],
        cwd=os.path.join(os.path.dirname(__file__), ".."),
    )
    print(f"⏳ Starting server on port {port}...")
    deadline = time.time() + 15
    while time.time() < deadline:
        try:
            # Port accepting connections => uvicorn is up.
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                break
        except OSError:
            time.sleep(0.25)
    return proc
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
# 2. Import client
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
|
| 47 |
+
from salespath_env.client import SalesPathClient
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# ---------------------------------------------------------------------------
|
| 51 |
+
# 3. Test episodes
|
| 52 |
+
# ---------------------------------------------------------------------------
|
| 53 |
+
|
| 54 |
+
EPISODES = {
|
| 55 |
+
1: [
|
| 56 |
+
# Happy path β difficulty 1
|
| 57 |
+
("PROSPECT", "Hello, tell me about your challenges."),
|
| 58 |
+
("QUALIFY", "What's your budget and who decides?"),
|
| 59 |
+
("PRESENT", "Here's how we solve your inventory problem."),
|
| 60 |
+
("CLOSE", "Shall we move forward?"),
|
| 61 |
+
],
|
| 62 |
+
2: [
|
| 63 |
+
# Objection + demo β difficulty 2
|
| 64 |
+
("PROSPECT", "Hi, I'd like to learn more about your operations."),
|
| 65 |
+
("QUALIFY", "Can you share your budget range?"),
|
| 66 |
+
("PRESENT", "Here is our solution."),
|
| 67 |
+
("HANDLE_OBJECTION", "Totally understand the pricing concern β here's why the ROI works."),
|
| 68 |
+
("OFFER_DEMO", "Let me show you a live demo next week."),
|
| 69 |
+
("CLOSE", "Ready to move forward?"),
|
| 70 |
+
],
|
| 71 |
+
3: [
|
| 72 |
+
# Full hard path β difficulty 3
|
| 73 |
+
("PROSPECT", "Hello, I've researched your compliance challenges."),
|
| 74 |
+
("QUALIFY", "Who is the decision maker and what is the budget?"),
|
| 75 |
+
("PRESENT", "Here's how we address audit trails and data silos."),
|
| 76 |
+
("HANDLE_OBJECTION", "I understand the timing is tough β many clients felt the same."),
|
| 77 |
+
("HANDLE_OBJECTION", "On the price point β we can structure payments quarterly."),
|
| 78 |
+
("OFFER_DEMO", "Let me show you a live demo."),
|
| 79 |
+
("NEGOTIATE", "Here is a pilot option at a reduced rate."),
|
| 80 |
+
("CLOSE", "Shall we proceed?"),
|
| 81 |
+
],
|
| 82 |
+
4: [
|
| 83 |
+
# Trap case β correct action is DISQUALIFY
|
| 84 |
+
("PROSPECT", "Hi, tell me about your security needs."),
|
| 85 |
+
("QUALIFY", "What is your budget and who decides?"),
|
| 86 |
+
("DISQUALIFY", "Given the budget constraints and no decision maker, this isn't the right time."),
|
| 87 |
+
],
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def run_episode(client: SalesPathClient, difficulty: int, script: list) -> dict:
    """
    Replay a scripted (action, content) sequence against the env and
    collect per-turn rewards and rule violations.
    """
    obs = client.reset(difficulty=difficulty)
    banner = "=" * 60
    print(f"\n{banner}")
    print(f" Difficulty {difficulty} | Prospect: {obs.get('prospect_response', '')[:80]}")
    print(f"{banner}")

    summary = {"rewards": [], "violations": [], "turns": 0, "done": False}

    for action_type, content in script:
        obs = client.step(action_type, content)
        turn_violations = obs.get("constraints_violated", [])
        summary["rewards"].append(obs["reward"])
        summary["violations"].extend(turn_violations)
        summary["turns"] = obs["turn_number"]
        summary["done"] = obs["done"]

        if turn_violations:
            status = "⚠️"
        else:
            status = "βœ…"
        print(
            f" {status} Turn {obs['turn_number']:2d} "
            f"{action_type:<20} "
            f"reward={obs['reward']:+.3f} "
            f"violations={turn_violations}"
        )
        if obs["done"]:
            break

    total = sum(summary["rewards"])
    print(f"\n Cumulative reward: {total:+.3f} | Violations: {summary['violations']}")
    return summary
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# ---------------------------------------------------------------------------
|
| 122 |
+
# 4. Bug-regression checks
|
| 123 |
+
# ---------------------------------------------------------------------------
|
| 124 |
+
|
| 125 |
+
def test_r05_no_repeat(client: SalesPathClient):
    """R05: repeating the same action on consecutive turns must be flagged."""
    print("\n--- BUG CHECK: R05 no-repeat ---")
    client.reset(difficulty=1)
    client.step("PROSPECT", "Hello.")
    # Second PROSPECT in a row β€” the env should report an R05 violation.
    obs = client.step("PROSPECT", "Hello again.")
    violations = obs.get("constraints_violated", [])
    passed = "R05" in violations
    print(f" R05 fired on consecutive PROSPECT: {'βœ… PASS' if passed else '❌ FAIL'} {violations}")
    return passed
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def test_r07_followup(client: SalesPathClient):
    """R07: FOLLOW_UP right after a prospect reply must be a violation."""
    print("\n--- BUG CHECK: R07 followup timing ---")
    client.reset(difficulty=1)
    client.step("PROSPECT", "Hello.")
    # The prospect already answered β€” following up now should violate R07.
    obs = client.step("FOLLOW_UP", "Just checking in.")
    violations = obs.get("constraints_violated", [])
    passed = "R07" in violations
    print(f" R07 fired when prospect already responded: {'βœ… PASS' if passed else '❌ FAIL'} {violations}")
    return passed
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def test_r08_disqualify(client: SalesPathClient):
    """R08: disqualifying a clearly closeable prospect must be a violation."""
    print("\n--- BUG CHECK: R08 disqualify logic ---")
    # Difficulty 1 = high budget with a decision maker present, so the
    # prospect is closeable and DISQUALIFY is the wrong move.
    client.reset(difficulty=1)
    client.step("PROSPECT", "Hello.")
    client.step("QUALIFY", "What's your budget?")
    obs = client.step("DISQUALIFY", "I don't think you're a fit.")
    violations = obs.get("constraints_violated", [])
    passed = "R08" in violations
    print(f" R08 fired on valid prospect: {'βœ… PASS' if passed else '❌ FAIL'} {violations}")
    return passed
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# ---------------------------------------------------------------------------
|
| 163 |
+
# 5. Main
|
| 164 |
+
# ---------------------------------------------------------------------------
|
| 165 |
+
|
| 166 |
+
def main():
    """
    Smoke-test the full pipeline: boot the env server, replay scripted
    episodes at every difficulty, then run the bug-regression suite.
    """
    PORT = 7860
    server = start_server(PORT)

    try:
        client = SalesPathClient(f"http://localhost:{PORT}")

        # Health check — bail out early if the server never came up.
        try:
            h = client.health()
            print(f"βœ… Server healthy: {h}")
        except Exception as e:
            print(f"❌ Server not responding: {e}")
            return

        # Run all difficulty episodes
        all_rewards = {}
        for diff, script in EPISODES.items():
            result = run_episode(client, diff, script)
            all_rewards[diff] = sum(result["rewards"])

        # Bug regression suite
        print("\n" + "=" * 60)
        print(" BUG REGRESSION SUITE")
        print("=" * 60)
        r05_ok = test_r05_no_repeat(client)
        r07_ok = test_r07_followup(client)
        r08_ok = test_r08_disqualify(client)

        # Summary
        print("\n" + "=" * 60)
        print(" SUMMARY")
        print("=" * 60)
        for diff, total in all_rewards.items():
            print(f" Difficulty {diff}: cumulative reward = {total:+.3f}")

        bugs_passed = sum([r05_ok, r07_ok, r08_ok])
        print(f"\n Bug fixes passing: {bugs_passed}/3")
        print(f"\n{'βœ… ALL SYSTEMS GO' if bugs_passed == 3 else '⚠️ SOME CHECKS FAILED'}")

    finally:
        server.terminate()
        # Reap the child so the test never leaves a zombie process or a
        # still-occupied port; escalate to kill() if terminate() hangs.
        try:
            server.wait(timeout=10)
        except Exception:
            server.kill()
        print("\nServer stopped.")
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
if __name__ == "__main__":
|
| 212 |
+
main()
|