Spaces:

sh4shv4t
/

Parlay

Paused

App Files Files Community

sh4shv4t committed 30 days ago

Commit

15976d0

1 Parent(s): f3d2cd4

feat: flash-lite for data-gen and flash for UI; remove training page; card tests; --quiet data gen; data/ inspect path; random baseline; GRPO env wrapper; reward fixes (buyer ZOPA, ToM signals); drift + Brier metrics; Bayesian ToM module

Browse files

Files changed (14) hide show

agent/gemini_client.py +21 -12
agent/runner.py +15 -3
agent/tom_tracker.py +41 -1
agent/tom_tracker_bayesian.py +296 -0
dashboard/api.py +18 -4
dashboard/index.html +0 -1
main.py +0 -9
parlay_env/grader.py +24 -0
tests/test_tactical_cards.py +213 -0
training/generate_data.py +96 -75
training/grpo_env_wrapper.py +188 -0
training/grpo_train.py +4 -1
training/random_baseline.py +126 -0
training/reward_fn.py +49 -19

agent/gemini_client.py CHANGED Viewed

@@ -5,9 +5,11 @@ All errors return SYNTHETIC_RESPONSE.
 When GOOGLE_API_KEY is absent, MOCK_RESPONSES are returned so the full game
 loop works without any API key.
-Primary model for API calls (dashboard, MCP, data generation, ToM):
-- GEMINI_MODEL — gemini-2.5-flash-lite for bulk/self-play and live callers
-  that pass this id via MODEL_ID_DEMO / MODEL_ID_DATA aliases.
 """
 import asyncio
 import json
@@ -54,15 +56,21 @@ SCENARIO_ROLE_CONTEXT: dict[str, dict[str, str]] = {
     },
 }
-GEMINI_MODEL = "gemini-2.5-flash-lite"
-# Aliases for imports (dashboard, MCP, training all use flash-lite)
-MODEL_ID_DEMO = GEMINI_MODEL
-MODEL_ID_DATA = GEMINI_MODEL
-MODEL_ID = GEMINI_MODEL
 _client = None
 _mock_warned: bool = False
 _gemini_model_logged: bool = False
 # ── Mock responses (keyless dev / CI) ────────────────────────────────────────
 # Offer amounts are realistic for the default SaaS enterprise scenario
@@ -336,10 +344,11 @@ async def call_gemini(
             _turn_count += 1
             _live_calls += 1
-            print(
-                f"[Gemini LIVE] model={mid} chars={len(response.text or '')} turn={_turn_count}",
-                file=sys.stderr,
-            )
             text = (response.text or "").strip()
             text = text.replace("```json", "").replace("```", "").strip()

 When GOOGLE_API_KEY is absent, MOCK_RESPONSES are returned so the full game
 loop works without any API key.
+Model routing:
+- MODEL_ID_DATA (gemini-2.5-flash-lite) — data generation, self-play, ToM inference.
+  Low-latency, high-throughput; used by runner.py and generate_data.py.
+- MODEL_ID_DEMO (gemini-2.5-flash) — web UI, dashboard API, MCP tools.
+  Higher quality responses for live user interaction.
 """
 import asyncio
 import json
     },
 }
+GEMINI_MODEL  = "gemini-2.5-flash-lite"   # kept for backward compat; equals MODEL_ID_DATA
+MODEL_ID_DATA = "gemini-2.5-flash-lite"   # data generation, self-play, ToM inference
+MODEL_ID_DEMO = "gemini-2.5-flash"        # web UI, dashboard API, MCP tools
+MODEL_ID      = MODEL_ID_DATA             # stable alias (runner.py omits model= → flash-lite)
 _client = None
 _mock_warned: bool = False
 _gemini_model_logged: bool = False
+_quiet: bool = False   # suppresses per-call [Gemini LIVE] prints when True
+def set_quiet(flag: bool) -> None:
+    """Suppress [Gemini LIVE] per-call stderr prints (e.g. during test runs)."""
+    global _quiet
+    _quiet = flag
 # ── Mock responses (keyless dev / CI) ────────────────────────────────────────
 # Offer amounts are realistic for the default SaaS enterprise scenario
             _turn_count += 1
             _live_calls += 1
+            if not _quiet:
+                print(
+                    f"[Gemini LIVE] model={mid} chars={len(response.text or '')} turn={_turn_count}",
+                    file=sys.stderr,
+                )
             text = (response.text or "").strip()
             text = text.replace("```json", "").replace("```", "").strip()

agent/runner.py CHANGED Viewed

@@ -131,8 +131,12 @@ async def run_episode(
         for event in scenario.drift_events:
             if event.trigger_turn == turn or (forced_drift_turn == turn):
                 drift_turn = turn
-                tom.drift_event(event.effect_on_urgency, event.effect_on_has_alternative)
-                logger.info(f"Drift event at turn {turn}: {event.event}")
                 break
         if inject_noise and turn < 3 and rng.random() < 0.3:
@@ -211,8 +215,16 @@ async def run_episode(
         if drift_turn is not None and not drift_adapted and turn <= drift_turn + 2:
             adaptation_signals = ["understand", "noted", "given that", "considering"]
-            if any(s in action.utterance.lower() for s in adaptation_signals):
                 drift_adapted = True
         new_offers = list(state.offer_history)
         if action.offer_amount:

         for event in scenario.drift_events:
             if event.trigger_turn == turn or (forced_drift_turn == turn):
                 drift_turn = turn
+                tom.drift_event(
+                    event.effect_on_urgency,
+                    event.effect_on_has_alternative,
+                    event_description=event.event,
+                )
+                logger.info(f"Drift event at turn {turn}: {event.event!r}")
                 break
         if inject_noise and turn < 3 and rng.random() < 0.3:
         if drift_turn is not None and not drift_adapted and turn <= drift_turn + 2:
             adaptation_signals = ["understand", "noted", "given that", "considering"]
+            matched = next(
+                (s for s in adaptation_signals if s in action.utterance.lower()), None
+            )
+            if matched:
                 drift_adapted = True
+                logger.info(
+                    f"drift_adapted=True at turn={turn} "
+                    f"matched_phrase={matched!r} "
+                    f"utterance_snippet={action.utterance[:80]!r}"
+                )
         new_offers = list(state.offer_history)
         if action.offer_amount:

agent/tom_tracker.py CHANGED Viewed

@@ -129,6 +129,7 @@ class ToMTracker:
         self,
         effect_on_urgency: float,
         effect_on_has_alternative: bool,
     ) -> BeliefState:
         """
         Apply a drift event to beliefs.
@@ -136,6 +137,8 @@ class ToMTracker:
         Args:
             effect_on_urgency:         Signed delta to urgency estimate.
             effect_on_has_alternative: Override for has_alternative belief.
         Returns:
             Updated BeliefState post-drift.
@@ -150,12 +153,49 @@ class ToMTracker:
             confidence=max(0.0, last.confidence - 0.15),  # drift reduces confidence
         )
         self.history.append(updated)
         logger.info(
-            f"ToM drift applied: urgency={new_urgency:.2f}, "
             f"alt={effect_on_has_alternative}"
         )
         return updated
     def accuracy_against(self, hidden: HiddenState) -> float:
         """
         Compute current belief accuracy against true hidden state.

         self,
         effect_on_urgency: float,
         effect_on_has_alternative: bool,
+        event_description: str = "",
     ) -> BeliefState:
         """
         Apply a drift event to beliefs.
         Args:
             effect_on_urgency:         Signed delta to urgency estimate.
             effect_on_has_alternative: Override for has_alternative belief.
+            event_description:         Human-readable scenario event string
+                                       (e.g. "Competitor drops price 15%").
         Returns:
             Updated BeliefState post-drift.
             confidence=max(0.0, last.confidence - 0.15),  # drift reduces confidence
         )
         self.history.append(updated)
+        desc_part = f" | event={event_description!r}" if event_description else ""
         logger.info(
+            f"ToM drift applied{desc_part}: "
+            f"urgency_delta={effect_on_urgency:+.2f} → {new_urgency:.2f}, "
             f"alt={effect_on_has_alternative}"
         )
         return updated
+    def brier_scores(self, hidden: HiddenState) -> dict[str, float]:
+        """
+        Compute per-field Brier scores over the full belief history.
+        Brier score = (1/N) Σ (predicted - actual)²
+        Lower is better; 0 = perfect.
+        Fields scored:
+          - urgency:      est_urgency (continuous 0–1) vs hidden.urgency_score
+          - has_alt:      est_has_alternative (bool cast to 0.0/1.0) vs hidden.has_alternative
+        Args:
+            hidden: The true hidden state revealed at episode end.
+        Returns:
+            Dict with keys "urgency" and "has_alt", each a float in [0, 1].
+        """
+        if not self.history:
+            return {"urgency": 1.0, "has_alt": 1.0}
+        actual_urgency = hidden.urgency_score
+        actual_alt = float(hidden.has_alternative)
+        urgency_sq_err = sum(
+            (b.est_urgency - actual_urgency) ** 2 for b in self.history
+        )
+        alt_sq_err = sum(
+            (float(b.est_has_alternative) - actual_alt) ** 2 for b in self.history
+        )
+        n = len(self.history)
+        return {
+            "urgency": round(urgency_sq_err / n, 6),
+            "has_alt": round(alt_sq_err / n, 6),
+        }
     def accuracy_against(self, hidden: HiddenState) -> float:
         """
         Compute current belief accuracy against true hidden state.

agent/tom_tracker_bayesian.py ADDED Viewed

	@@ -0,0 +1,296 @@

+"""
+Bayesian Theory-of-Mind belief tracker for Parlay.
+Drop-in replacement for ToMTracker that uses Kalman-filter-style Gaussian
+belief updates instead of hand-tuned arithmetic nudges.
+Key insight
+-----------
+The opponent has hidden variables (budget_ceiling, walk_away_price, urgency,
+has_alternative). Each observed offer is a noisy signal about these.
+We model each continuous variable as a Gaussian (mean, variance) and update
+using the standard Bayesian update for Gaussian conjugate priors:
+    posterior_mean = (prior_mean / prior_var + obs / obs_var) /
+                     (1 / prior_var + 1 / obs_var)
+    posterior_var  = 1 / (1 / prior_var + 1 / obs_var)
+`confidence` is 1 minus the mean uncertainty over all four beliefs, clamped to [0, 1]:
+    confidence = 1 - (budget_std/|budget_mean| + walk_std/|walk_mean| + urgency_std + sqrt(alt_prob * (1 - alt_prob))) / 4
+Usage (as feature-flag alternative to ToMTracker):
+    from agent.tom_tracker_bayesian import BayesianToMTracker as ToMTracker
+    # Then use exactly the same API as ToMTracker — all method signatures match.
+"""
+import logging
+import math
+import sys
+from typing import Optional
+from parlay_env.models import BeliefState, HiddenState, PersonaType, TacticalMove
+logger = logging.getLogger(__name__)
+class BayesianToMTracker:
+    """
+    Gaussian-posterior belief tracker for the opponent's hidden state.
+    Extends the original ToMTracker API with proper Bayesian updating.
+    The same public methods (update, drift_event, accuracy_against,
+    brier_scores, log_belief_snapshot) are preserved for drop-in use.
+    Internal state:
+        _budget_mean, _budget_var    — Gaussian over opponent's budget ceiling.
+        _walk_mean,   _walk_var      — Gaussian over opponent's walk-away price.
+        _urgency_mean, _urgency_var  — Gaussian over urgency [0, 1].
+        _alt_prob                    — Bernoulli probability of has_alternative.
+    """
+    # Observation noise variances (tuned for B2B negotiation scale).
+    # Budget/walk-away: observed offer is a noisy signal; high variance because
+    # opponents rarely reveal their true limits.
+    _OBS_BUDGET_VAR_FRAC = 0.10   # 10% of current mean estimate as std
+    _OBS_URGENCY_VAR = 0.05       # small update per offer-ratio signal
+    def __init__(
+        self,
+        initial_belief: BeliefState,
+        persona: PersonaType,
+    ) -> None:
+        """
+        Args:
+            initial_belief: Starting BeliefState (imprecise prior).
+            persona:        Opponent persona (known to the player).
+        """
+        self.persona = persona
+        self._bluffs_detected: int = 0
+        # Initialise Gaussian priors from the initial belief
+        self._budget_mean = float(initial_belief.est_budget)
+        self._walk_mean   = float(initial_belief.est_walk_away)
+        self._urgency_mean = float(initial_belief.est_urgency)
+        self._alt_prob    = 0.3   # prior: 30% chance opponent has an alternative
+        # Initial variances — large uncertainty at the start
+        self._budget_var  = (self._budget_mean * 0.30) ** 2   # ±30% std
+        self._walk_var    = (self._walk_mean   * 0.30) ** 2
+        self._urgency_var = 0.08   # std ≈ 0.28 over [0, 1]
+        self.history: list[BeliefState] = [self._snapshot()]
+        logger.debug(
+            "BayesianToMTracker init: budget_mean=%.0f walk_mean=%.0f urgency_mean=%.2f",
+            self._budget_mean, self._walk_mean, self._urgency_mean,
+        )
+    # ── Internal helpers ──────────────────────────────────────────────────────
+    def _snapshot(self) -> BeliefState:
+        """Convert current Gaussian state to a BeliefState snapshot."""
+        confidence = self._compute_confidence()
+        return BeliefState(
+            est_budget=round(self._budget_mean, 2),
+            est_walk_away=round(self._walk_mean, 2),
+            est_urgency=round(max(0.0, min(1.0, self._urgency_mean)), 4),
+            est_has_alternative=self._alt_prob >= 0.5,
+            confidence=round(confidence, 4),
+        )
+    def _compute_confidence(self) -> float:
+        """
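+        Confidence = 1 - mean uncertainty across the four beliefs, clamped to [0, 1].
+        Budget and walk-away use relative std, urgency uses absolute std, and
+        has_alternative uses the Bernoulli std; lower variance → higher confidence.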
+        """
+        budget_rel_std = math.sqrt(self._budget_var) / max(abs(self._budget_mean), 1.0)
+        walk_rel_std   = math.sqrt(self._walk_var)   / max(abs(self._walk_mean), 1.0)
+        urgency_std    = math.sqrt(self._urgency_var)
+        alt_std        = math.sqrt(self._alt_prob * (1.0 - self._alt_prob))
+        mean_uncertainty = (budget_rel_std + walk_rel_std + urgency_std + alt_std) / 4.0
+        return max(0.0, min(1.0, 1.0 - mean_uncertainty))
+    @staticmethod
+    def _gaussian_update(
+        prior_mean: float,
+        prior_var: float,
+        obs: float,
+        obs_var: float,
+    ) -> tuple[float, float]:
+        """
+        Closed-form Bayesian update for Gaussian conjugate prior.
+        posterior_mean = (prior_mean / prior_var + obs / obs_var) /
+                         (1 / prior_var + 1 / obs_var)
+        posterior_var  = 1 / (1 / prior_var + 1 / obs_var)
+        """
+        prec_prior = 1.0 / max(prior_var, 1e-10)
+        prec_obs   = 1.0 / max(obs_var, 1e-10)
+        posterior_prec = prec_prior + prec_obs
+        posterior_mean = (prec_prior * prior_mean + prec_obs * obs) / posterior_prec
+        posterior_var  = 1.0 / posterior_prec
+        return posterior_mean, posterior_var
+    # ── Public API (matches ToMTracker) ──────────────────────────────────────
+    @property
+    def current_belief(self) -> BeliefState:
+        return self.history[-1]
+    @property
+    def bluffs_detected(self) -> int:
+        return self._bluffs_detected
+    def log_belief_snapshot(self, turn: int) -> None:
+        b = self.current_belief
+        print(
+            f"[BayesToM turn={turn}] "
+            f"budget={b.est_budget:.0f}±{math.sqrt(self._budget_var):.0f}  "
+            f"urgency={b.est_urgency:.3f}±{math.sqrt(self._urgency_var):.3f}  "
+            f"alt_prob={self._alt_prob:.2f}  conf={b.confidence:.2f}",
+            file=sys.stderr,
+        )
+    def update(
+        self,
+        observed_offer: Optional[float],
+        observed_move: Optional[TacticalMove],
+        utterance: str,
+        turn: int,
+    ) -> BeliefState:
+        """
+        Bayesian update of all beliefs from one observed opponent action.
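+        Budget update: if we see an offer O, the true budget is likely > O.
+            We treat O as a lower-bound signal: observation = O * 1.05
+            with observation std equal to 10% of the current budget mean.
+        Walk-away update: only on BATNA_REVEAL with an offer; observation = O * 0.95
+            with observation std equal to 15% of the current walk-away mean.
+        Urgency update: offer-ratio (offer / updated budget mean) below 0.85 →
+            urgency signal 0.7; above 0.95 → 0.3; otherwise 0.5. All use the
+            same fixed observation variance (_OBS_URGENCY_VAR).
+        has_alternative: heuristic probability nudge on keyword match — moves 35%
+            of the way toward 1 (capped at 0.95) on a match, otherwise decays by
+            2% (floored at 0.05).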
+        """
+        # ── Budget Bayesian update ──────────────────────────────────────────
+        if observed_offer is not None and observed_offer > 0:
+            budget_obs = observed_offer * 1.05
+            obs_budget_var = (self._budget_mean * self._OBS_BUDGET_VAR_FRAC) ** 2
+            self._budget_mean, self._budget_var = self._gaussian_update(
+                self._budget_mean, self._budget_var,
+                budget_obs, obs_budget_var,
+            )
+            logger.debug(
+                "Bayesian budget update: obs=%.0f → mean=%.0f std=%.0f",
+                budget_obs, self._budget_mean, math.sqrt(self._budget_var),
+            )
+        # ── Walk-away update: BATNA_REVEAL is a noisy signal ───────────────
+        if observed_move == TacticalMove.BATNA_REVEAL:
+            if observed_offer is not None:
+                walk_obs = observed_offer * 0.95
+                obs_walk_var = (self._walk_mean * 0.15) ** 2
+                self._walk_mean, self._walk_var = self._gaussian_update(
+                    self._walk_mean, self._walk_var,
+                    walk_obs, obs_walk_var,
+                )
+            logger.debug("Bayesian walk-away update via BATNA_REVEAL")
+        # ── Urgency Bayesian update via offer-ratio signal ─────────────────
+        if observed_offer is not None and self._budget_mean > 0:
+            offer_ratio = observed_offer / self._budget_mean
+            if offer_ratio < 0.85:
+                urgency_obs = 0.70   # low offer → opponent likely more urgent
+            elif offer_ratio > 0.95:
+                urgency_obs = 0.30   # high offer → opponent comfortable
+            else:
+                urgency_obs = 0.50   # neutral
+            self._urgency_mean, self._urgency_var = self._gaussian_update(
+                self._urgency_mean, self._urgency_var,
+                urgency_obs, self._OBS_URGENCY_VAR,
+            )
+            self._urgency_mean = max(0.0, min(1.0, self._urgency_mean))
+        # ── has_alternative Bernoulli update (heuristic keyword nudge) ─────
+        alt_signals = ["competitor", "alternative", "other offer", "another bid"]
+        if any(sig in utterance.lower() for sig in alt_signals):
+            self._alt_prob = min(0.95, self._alt_prob + (1.0 - self._alt_prob) * 0.35)
+            logger.debug("Alternative signal detected → alt_prob=%.2f", self._alt_prob)
+        else:
+            self._alt_prob = max(0.05, self._alt_prob * 0.98)   # small decay
+        # ── Bluff detection (shark persona + BATNA_REVEAL + "competitor") ──
+        if (
+            self.persona == PersonaType.SHARK
+            and observed_move == TacticalMove.BATNA_REVEAL
+            and "competitor" in utterance.lower()
+        ):
+            self._bluffs_detected += 1
+            logger.info("BayesToM: bluff detected (total: %d)", self._bluffs_detected)
+        updated = self._snapshot()
+        self.history.append(updated)
+        logger.debug(
+            "BayesToM update turn=%d: budget=%.0f walk=%.0f urgency=%.2f alt_prob=%.2f conf=%.2f",
+            turn, self._budget_mean, self._walk_mean, self._urgency_mean,
+            self._alt_prob, updated.confidence,
+        )
+        return updated
+    def drift_event(
+        self,
+        effect_on_urgency: float,
+        effect_on_has_alternative: bool,
+        event_description: str = "",
+    ) -> BeliefState:
+        """
+        Apply a market/scenario drift event.
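+        Shifts the urgency mean by the given delta (clamped to [0, 1]) and nudges
+        alt_prob up (+0.25, capped at 0.9) or down (-0.1, floored at 0.1).
+        Also inflates all variances (drift = increased uncertainty); the urgency
+        variance is capped at 0.1.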
+        """
+        self._urgency_mean = float(max(0.0, min(1.0, self._urgency_mean + effect_on_urgency)))
+        self._urgency_var  = min(0.1, self._urgency_var * 1.5)   # inflate uncertainty
+        # Drift shifts alt belief
+        if effect_on_has_alternative:
+            self._alt_prob = min(0.9, self._alt_prob + 0.25)
+        else:
+            self._alt_prob = max(0.1, self._alt_prob - 0.1)
+        # Inflate budget/walk variances — drift reduces confidence
+        self._budget_var *= 1.3
+        self._walk_var   *= 1.3
+        updated = self._snapshot()
+        self.history.append(updated)
+        desc_part = f" | event={event_description!r}" if event_description else ""
+        logger.info(
+            "BayesToM drift applied%s: urgency_delta=%+.2f → %.2f, alt_prob=%.2f, conf=%.2f",
+            desc_part, effect_on_urgency, self._urgency_mean, self._alt_prob, updated.confidence,
+        )
+        return updated
+    def accuracy_against(self, hidden: HiddenState) -> float:
+        """
+        Compute current belief accuracy against true hidden state.
+        Same formula as ToMTracker for comparability.
+        """
+        b = self.current_belief
+        budget_range = max(hidden.budget_ceiling * 0.5, 1.0)
+        walk_range   = max(hidden.walk_away_price * 0.5, 1.0)
+        budget_err   = abs(b.est_budget - hidden.budget_ceiling) / budget_range
+        walk_err     = abs(b.est_walk_away - hidden.walk_away_price) / walk_range
+        urgency_err  = abs(b.est_urgency - hidden.urgency_score)
+        alt_err      = 0.0 if b.est_has_alternative == hidden.has_alternative else 1.0
+        mean_err = (budget_err + walk_err + urgency_err + alt_err) / 4.0
+        return max(0.0, 1.0 - mean_err)
+    def brier_scores(self, hidden: HiddenState) -> dict[str, float]:
+        """Brier scores for urgency and has_alternative over full belief history."""
+        if not self.history:
+            return {"urgency": 1.0, "has_alt": 1.0}
+        actual_urgency = hidden.urgency_score
+        actual_alt = float(hidden.has_alternative)
+        n = len(self.history)
+        brier_urgency = sum((b.est_urgency - actual_urgency) ** 2 for b in self.history) / n
+        brier_alt     = sum((float(b.est_has_alternative) - actual_alt) ** 2 for b in self.history) / n
+        return {
+            "urgency": round(brier_urgency, 6),
+            "has_alt": round(brier_alt, 6),
+        }

dashboard/api.py CHANGED Viewed

@@ -253,8 +253,16 @@ def _apply_drift(session: dict[str, Any]) -> Optional[str]:
         if event.trigger_turn == state.step_count:
             session["drift_turn"] = state.step_count
             state.drift_events_fired += 1
-            session["tom_tracker"].drift_event(event.effect_on_urgency, event.effect_on_has_alternative)
             state.belief_history = list(session["tom_tracker"].history)
             return event.event
     return None
@@ -443,10 +451,16 @@ async def make_move(req: MoveRequest) -> dict:
     )
     if session["drift_turn"] is not None and not session["drift_adapted"]:
-        if turn <= session["drift_turn"] + 2 and any(
-            signal in req.message.lower() for signal in ["understand", "noted", "given", "considering", "account"]
-        ):
             session["drift_adapted"] = True
     new_history = list(state.offer_history)
     if req.amount is not None:

         if event.trigger_turn == state.step_count:
             session["drift_turn"] = state.step_count
             state.drift_events_fired += 1
+            session["tom_tracker"].drift_event(
+                event.effect_on_urgency,
+                event.effect_on_has_alternative,
+                event_description=event.event,
+            )
             state.belief_history = list(session["tom_tracker"].history)
+            logger.info(
+                "Drift event fired: scenario=%s turn=%d event=%r urgency_delta=%+.2f",
+                state.scenario_id, state.step_count, event.event, event.effect_on_urgency,
+            )
             return event.event
     return None
     )
     if session["drift_turn"] is not None and not session["drift_adapted"]:
+        adaptation_signals = ["understand", "noted", "given", "considering", "account"]
+        matched_signal = next(
+            (s for s in adaptation_signals if s in req.message.lower()), None
+        )
+        if turn <= session["drift_turn"] + 2 and matched_signal:
             session["drift_adapted"] = True
+            logger.info(
+                "drift_adapted=True session=%s turn=%d matched_phrase=%r snippet=%r",
+                req.session_id, turn + 1, matched_signal, req.message[:80],
+            )
     new_history = list(state.offer_history)
     if req.amount is not None:

dashboard/index.html CHANGED Viewed

@@ -85,7 +85,6 @@
   <nav class="header-nav" aria-label="Site navigation">
     <a href="/index.html" class="active">Game</a>
-    <a href="/train.html">Training</a>
   </nav>
   <div class="header-actions">

   <nav class="header-nav" aria-label="Site navigation">
     <a href="/index.html" class="active">Game</a>
   </nav>
   <div class="header-actions">

main.py CHANGED Viewed

@@ -80,15 +80,6 @@ async def serve_index() -> FileResponse:
     )
-@app.get("/train", include_in_schema=False)
-async def serve_train() -> FileResponse:
-    """Serve the training dashboard."""
-    return FileResponse(
-        "dashboard/train.html",
-        headers={"Cache-Control": "no-cache, must-revalidate"},
-    )
 @app.get("/spectate", include_in_schema=False)
 async def serve_spectate() -> FileResponse:
     """Serve the spectator dashboard."""

     )
 @app.get("/spectate", include_in_schema=False)
 async def serve_spectate() -> FileResponse:
     """Serve the spectator dashboard."""

parlay_env/grader.py CHANGED Viewed

@@ -42,6 +42,27 @@ class EpisodeGrade:
     bluffs_caught: int
     termination_reason: Optional[str]
     drift_adapted: bool
 def _tom_accuracy(belief: BeliefState, hidden: HiddenState) -> float:
@@ -208,6 +229,7 @@ def grade_episode(
     tom_scores = [_tom_accuracy(belief, session.hidden_state) for belief in session.belief_history]
     tom_accuracy_avg = sum(tom_scores) / len(tom_scores) if tom_scores else 0.0
     terminal = compute_terminal_reward(session, final_price, t_close, t_max, drift_adapted)
     return EpisodeGrade(
@@ -217,4 +239,6 @@ def grade_episode(
         bluffs_caught=session.bluffs_caught if bluffs_caught is None else bluffs_caught,
         termination_reason=session.termination_reason,
         drift_adapted=drift_adapted,
     )

     bluffs_caught: int
     termination_reason: Optional[str]
     drift_adapted: bool
+    tom_brier_urgency: float = 0.0   # Brier score for urgency beliefs (lower = better)
+    tom_brier_alt: float = 0.0       # Brier score for has_alternative beliefs
+def _brier_scores(beliefs: list[BeliefState], hidden: HiddenState) -> tuple[float, float]:
+    """
+    Compute Brier scores for urgency and has_alternative over all belief snapshots.
+    Brier score = (1/N) Σ (predicted - actual)²; lower is better, 0 = perfect.
+    Returns:
+        (brier_urgency, brier_alt) both in [0, 1].
+    """
+    if not beliefs:
+        return 1.0, 1.0
+    actual_urgency = hidden.urgency_score
+    actual_alt = float(hidden.has_alternative)
+    n = len(beliefs)
+    brier_urgency = sum((b.est_urgency - actual_urgency) ** 2 for b in beliefs) / n
+    brier_alt = sum((float(b.est_has_alternative) - actual_alt) ** 2 for b in beliefs) / n
+    return round(brier_urgency, 6), round(brier_alt, 6)
 def _tom_accuracy(belief: BeliefState, hidden: HiddenState) -> float:
     tom_scores = [_tom_accuracy(belief, session.hidden_state) for belief in session.belief_history]
     tom_accuracy_avg = sum(tom_scores) / len(tom_scores) if tom_scores else 0.0
+    brier_urgency, brier_alt = _brier_scores(session.belief_history, session.hidden_state)
     terminal = compute_terminal_reward(session, final_price, t_close, t_max, drift_adapted)
     return EpisodeGrade(
         bluffs_caught=session.bluffs_caught if bluffs_caught is None else bluffs_caught,
         termination_reason=session.termination_reason,
         drift_adapted=drift_adapted,
+        tom_brier_urgency=brier_urgency,
+        tom_brier_alt=brier_alt,
     )

tests/test_tactical_cards.py ADDED Viewed

	@@ -0,0 +1,213 @@

+"""
+Tactical card tests — verifies card retrieval, serialisation, and API play flow.
+Runs in mock mode (no API key required).
+Usage:
+    pytest tests/test_tactical_cards.py -v
+"""
+import os
+import pytest
+import pytest_asyncio
+from httpx import AsyncClient, ASGITransport
+os.environ.pop("GOOGLE_API_KEY", None)
+from game.tactical_cards import TACTICAL_CARDS, TacticalCard, get_card, draw_hand
+from parlay_env.models import TacticalMove
+from dashboard.api import _serialise_cards
+# ── Unit: card definitions ────────────────────────────────────────────────────
+class TestCardDefinitions:
+    def test_all_three_cards_defined(self):
+        """All three tactical cards are present in the registry."""
+        assert "anchor_high" in TACTICAL_CARDS
+        assert "batna_reveal" in TACTICAL_CARDS
+        assert "silence" in TACTICAL_CARDS
+    def test_card_fields_populated(self):
+        """Each card has all required fields with sensible values."""
+        for card_id, card in TACTICAL_CARDS.items():
+            assert isinstance(card, TacticalCard)
+            assert card.id == card_id
+            assert card.name, f"Card {card_id} has empty name"
+            assert card.description, f"Card {card_id} has empty description"
+            assert card.cp_cost >= 0, f"Card {card_id} has negative CP cost"
+    def test_cp_costs_match_expected(self):
+        """CP costs match the game design spec."""
+        assert TACTICAL_CARDS["anchor_high"].cp_cost == 0
+        assert TACTICAL_CARDS["batna_reveal"].cp_cost == 20
+        assert TACTICAL_CARDS["silence"].cp_cost == 5
+    def test_get_card_by_tactical_move_enum(self):
+        """get_card() accepts TacticalMove enum values."""
+        card = get_card(TacticalMove.ANCHOR_HIGH)
+        assert card.id == "anchor_high"
+        card = get_card(TacticalMove.BATNA_REVEAL)
+        assert card.id == "batna_reveal"
+        card = get_card(TacticalMove.SILENCE)
+        assert card.id == "silence"
+    def test_get_card_by_string_id(self):
+        """get_card() accepts plain string ids."""
+        card = get_card("anchor_high")
+        assert card.id == "anchor_high"
+    def test_get_card_unknown_raises(self):
+        """get_card() raises KeyError for unknown card ids."""
+        with pytest.raises(KeyError):
+            get_card("does_not_exist")
+    def test_draw_hand_returns_subset(self):
+        """draw_hand() returns exactly n valid TacticalMove values when n ≤ number of cards."""
+        hand = draw_hand(n=2, rng_seed=0)
+        assert len(hand) == 2
+        for move in hand:
+            assert isinstance(move, TacticalMove)
+    def test_draw_hand_no_duplicates(self):
+        """draw_hand() never repeats a card."""
+        hand = draw_hand(n=3, rng_seed=7)
+        assert len(hand) == len(set(hand))
+    def test_draw_hand_capped_at_total_cards(self):
+        """draw_hand() with n > total cards returns all cards once."""
+        hand = draw_hand(n=999, rng_seed=0)
+        assert len(hand) == len(TACTICAL_CARDS)
+# ── Unit: serialisation ───────────────────────────────────────────────────────
+class TestSerialiseCards:
+    """Shape checks on the card payload produced by _serialise_cards()."""
+    def test_serialise_returns_list(self):
+        """The serialiser hands back a list."""
+        assert isinstance(_serialise_cards(), list)
+    def test_serialise_length_matches_registry(self):
+        """There is one serialised entry per card in TACTICAL_CARDS."""
+        serialised = _serialise_cards()
+        expected_count = len(TACTICAL_CARDS)
+        assert len(serialised) == expected_count
+    def test_serialise_required_keys(self):
+        """Every serialised card carries the full key set the frontend reads."""
+        required_keys = {
+            "id",
+            "move",
+            "name",
+            "cp_cost",
+            "description",
+            "theory",
+            "game_theory_ref",
+        }
+        serialised = _serialise_cards()
+        for item in serialised:
+            missing = required_keys - item.keys()
+            assert not missing, f"Card {item.get('id')} missing keys: {missing}"
+    def test_serialise_id_equals_move(self):
+        """The 'id' and 'move' fields hold the same value (frontend uses both)."""
+        for entry in _serialise_cards():
+            assert entry["move"] == entry["id"]
+    def test_serialise_cp_cost_is_int(self):
+        """The 'cp_cost' field is always an int."""
+        for entry in _serialise_cards():
+            cost = entry["cp_cost"]
+            assert isinstance(cost, int)
+# ── Integration: play a card through the API ─────────────────────────────────
+from main import app
+@pytest_asyncio.fixture
+async def client():
+    """Yield an AsyncClient that talks to `app` through ASGITransport at http://test."""
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http_client:
+        yield http_client
+@pytest.mark.asyncio
+async def test_start_game_hand_contains_cards(client):
+    """Starting a game returns a list-valued hand with one entry per registered card."""
+    start_payload = {
+        "scenario_id": "saas_enterprise",
+        "persona": "shark",
+        "player_name": "Tester",
+    }
+    response = await client.post("/api/game/start", json=start_payload)
+    assert response.status_code == 200, response.text
+    body = response.json()
+    assert "hand" in body
+    hand = body["hand"]
+    assert isinstance(hand, list)
+    assert len(hand) == len(TACTICAL_CARDS)
+    hand_ids = {card["id"] for card in hand}
+    # Spot-check three known card ids rather than the whole registry.
+    for expected_id in ("anchor_high", "batna_reveal", "silence"):
+        assert expected_id in hand_ids
+@pytest.mark.asyncio
+async def test_play_card_anchor_high_zero_cost(client):
+    """A move played with anchor_high returns 200 and leaves CP at most 5 below its starting value."""
+    start_payload = {
+        "scenario_id": "saas_enterprise",
+        "persona": "diplomat",
+        "player_name": "Tester",
+    }
+    start_resp = await client.post("/api/game/start", json=start_payload)
+    assert start_resp.status_code == 200
+    started = start_resp.json()
+    session_id = started["session_id"]
+    cp_before = started["observation"]["credibility_points"]
+    move_payload = {
+        "session_id": session_id,
+        "amount": 140000,
+        "message": "Anchoring high.",
+        "tactical_move": "anchor_high",
+    }
+    move_resp = await client.post("/api/game/move", json=move_payload)
+    assert move_resp.status_code == 200, move_resp.text
+    observation = move_resp.json().get("observation", {})
+    cp_after = observation.get("credibility_points", 0)
+    # A 5 CP tolerance is allowed; a missing CP value counts as 0.
+    assert cp_after >= cp_before - 5
+@pytest.mark.asyncio
+async def test_play_card_insufficient_cp_returns_400(client):
+    """Play silence 18 times, then batna_reveal, and accept either 200 or 400."""
+    start_payload = {
+        "scenario_id": "saas_enterprise",
+        "persona": "veteran",
+        "player_name": "Tester",
+    }
+    start_resp = await client.post("/api/game/start", json=start_payload)
+    assert start_resp.status_code == 200
+    session_id = start_resp.json()["session_id"]
+    # Try to drain CP with repeated silence moves; the responses are not checked.
+    silence_move = {
+        "session_id": session_id,
+        "amount": 150000,
+        "message": "...",
+        "tactical_move": "silence",
+    }
+    for _ in range(18):
+        await client.post("/api/game/move", json=silence_move)
+    batna_move = {
+        "session_id": session_id,
+        "amount": 155000,
+        "message": "Let me reveal my BATNA.",
+        "tactical_move": "batna_reveal",
+    }
+    final_resp = await client.post("/api/game/move", json=batna_move)
+    # NOTE(review): accepting both 200 and 400 means this only rejects other
+    # status codes; it does not prove the insufficient-CP path returns 400.
+    # Tighten once the CP regeneration rules are pinned down.
+    assert final_resp.status_code in (200, 400)
+@pytest.mark.asyncio
+async def test_play_invalid_card_returns_400(client):
+    """A move whose tactical_move is not a known card is rejected with 400."""
+    start_payload = {
+        "scenario_id": "hiring_package",
+        "persona": "shark",
+        "player_name": "Tester",
+    }
+    start_resp = await client.post("/api/game/start", json=start_payload)
+    assert start_resp.status_code == 200
+    move_payload = {
+        "session_id": start_resp.json()["session_id"],
+        "amount": 200000,
+        "message": "Playing a mystery card.",
+        "tactical_move": "not_a_real_card",
+    }
+    move_resp = await client.post("/api/game/move", json=move_payload)
+    assert move_resp.status_code == 400

training/generate_data.py CHANGED Viewed

@@ -15,7 +15,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 from dotenv import load_dotenv
-from agent.gemini_client import get_and_reset_counts
 from agent.runner import EpisodeResult, run_episode
 from game.scenarios import SCENARIOS
 from parlay_env.models import PersonaType
@@ -223,7 +223,7 @@ def _print_inspect_report(
 async def run_inspect_mode(args) -> None:
-    out_path = Path("training/data/inspect_run.jsonl")
     out_path.parent.mkdir(parents=True, exist_ok=True)
     coverage: dict[tuple[str, str], int] = defaultdict(int)
@@ -320,6 +320,7 @@ async def run_diversity_pass(args, output_path: Path) -> None:
     min_per_combo = max(2, args.episodes // len(REQUIRED_COMBINATIONS))
     total_live_calls: int = 0
     total_fallback_calls: int = 0
     with open(output_path, "w", encoding="utf-8") as out_f:
         while len(kept_records) < args.episodes:
@@ -348,16 +349,17 @@ async def run_diversity_pass(args, output_path: Path) -> None:
                     _live_d, _fall_d = get_and_reset_counts()
                     total_live_calls += _live_d
                     total_fallback_calls += _fall_d
-                    print(
-                        f"[EP --/{args.episodes:02d}] "
-                        f"{persona}×{scenario_id:<27s} | "
-                        f"reward={record.get('reward', 0.0):+.2f} | "
-                        f"eff={record.get('deal_efficiency', 0.0):.3f} | "
-                        f"kept=NO  | "
-                        f"total_kept={len(kept_records)}/{generated} | "
-                        f"gemini_live={_live_d} fallback={_fall_d}",
-                        file=sys.stderr,
-                    )
                     continue
                 out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
@@ -366,35 +368,36 @@ async def run_diversity_pass(args, output_path: Path) -> None:
                 total_live_calls += _live
                 total_fallback_calls += _fall
                 _ep_num = len(kept_records)
-                _reward = record.get("reward", 0.0)
-                _eff = record.get("deal_efficiency", 0.0)
-                _combo = f"{record['persona']}×{record['scenario_id']}"
-                print(
-                    f"[EP {_ep_num:02d}/{args.episodes:02d}] "
-                    f"{_combo:<35s} | "
-                    f"reward={_reward:+.2f} | "
-                    f"eff={_eff:.3f} | "
-                    f"kept=YES | "
-                    f"total_kept={_ep_num}/{generated} | "
-                    f"gemini_live={_live} fallback={_fall}",
-                    file=sys.stderr,
-                )
-                if _ep_num in (20, 40, 60):
-                    _all_rewards = [r.get("reward", 0.0) for r in kept_records]
-                    _all_eff = [r.get("deal_efficiency", 0.0) for r in kept_records]
-                    _combos_covered = len({(r["persona"], r["scenario_id"]) for r in kept_records})
-                    print(f"\n{'━' * 40}", file=sys.stderr)
-                    print(f"[CHECKPOINT {_ep_num}/{args.episodes}]", file=sys.stderr)
                     print(
-                        f"  Kept so far     : {_ep_num}/{generated}  ({100 * _ep_num / max(generated, 1):.1f}%)",
                         file=sys.stderr,
                     )
-                    print(f"  Mean reward     : {statistics.mean(_all_rewards):.2f}", file=sys.stderr)
-                    print(f"  Mean efficiency : {statistics.mean(_all_eff):.3f}", file=sys.stderr)
-                    print(f"  Combos covered  : {_combos_covered}/9", file=sys.stderr)
-                    print(f"  Live calls total: {total_live_calls}", file=sys.stderr)
-                    print(f"  Fallback total  : {total_fallback_calls}", file=sys.stderr)
-                    print(f"{'━' * 40}\n", file=sys.stderr)
                 coverage[(persona, scenario_id)] += 1
                 kept_reason_counts[reason] += 1
                 progress_made = True
@@ -423,35 +426,36 @@ async def run_diversity_pass(args, output_path: Path) -> None:
                     total_live_calls += _live
                     total_fallback_calls += _fall
                     _ep_num = len(kept_records)
-                    _reward = record.get("reward", 0.0)
-                    _eff = record.get("deal_efficiency", 0.0)
-                    _combo = f"{record['persona']}×{record['scenario_id']}"
-                    print(
-                        f"[EP {_ep_num:02d}/{args.episodes:02d}] "
-                        f"{_combo:<35s} | "
-                        f"reward={_reward:+.2f} | "
-                        f"eff={_eff:.3f} | "
-                        f"kept=YES | "
-                        f"total_kept={_ep_num}/{generated} | "
-                        f"gemini_live={_live} fallback={_fall}",
-                        file=sys.stderr,
-                    )
-                    if _ep_num in (20, 40, 60):
-                        _all_rewards = [r.get("reward", 0.0) for r in kept_records]
-                        _all_eff = [r.get("deal_efficiency", 0.0) for r in kept_records]
-                        _combos_covered = len({(r["persona"], r["scenario_id"]) for r in kept_records})
-                        print(f"\n{'━' * 40}", file=sys.stderr)
-                        print(f"[CHECKPOINT {_ep_num}/{args.episodes}]", file=sys.stderr)
                         print(
-                            f"  Kept so far     : {_ep_num}/{generated}  ({100 * _ep_num / max(generated, 1):.1f}%)",
                             file=sys.stderr,
                         )
-                        print(f"  Mean reward     : {statistics.mean(_all_rewards):.2f}", file=sys.stderr)
-                        print(f"  Mean efficiency : {statistics.mean(_all_eff):.3f}", file=sys.stderr)
-                        print(f"  Combos covered  : {_combos_covered}/9", file=sys.stderr)
-                        print(f"  Live calls total: {total_live_calls}", file=sys.stderr)
-                        print(f"  Fallback total  : {total_fallback_calls}", file=sys.stderr)
-                        print(f"{'━' * 40}\n", file=sys.stderr)
                     coverage[(persona, scenario_id)] += 1
                     kept_reason_counts[reason] += 1
                 else:
@@ -459,16 +463,17 @@ async def run_diversity_pass(args, output_path: Path) -> None:
                     _live_d, _fall_d = get_and_reset_counts()
                     total_live_calls += _live_d
                     total_fallback_calls += _fall_d
-                    print(
-                        f"[EP --/{args.episodes:02d}] "
-                        f"{persona}×{scenario_id:<27s} | "
-                        f"reward={record.get('reward', 0.0):+.2f} | "
-                        f"eff={record.get('deal_efficiency', 0.0):.3f} | "
-                        f"kept=NO  | "
-                        f"total_kept={len(kept_records)}/{generated} | "
-                        f"gemini_live={_live_d} fallback={_fall_d}",
-                        file=sys.stderr,
-                    )
     discard_pct = (discarded / max(generated, 1)) * 100.0
     print(
@@ -514,7 +519,19 @@ def main() -> None:
     parser.add_argument(
         "--inspect",
         action="store_true",
-        help="Run a fixed 60-episode quality diagnostic; writes training/data/inspect_run.jsonl",
     )
     args = parser.parse_args()
@@ -524,6 +541,10 @@ def main() -> None:
     logging.getLogger("google_genai").setLevel(logging.WARNING)
     logging.getLogger("google_genai.models").setLevel(logging.WARNING)
     if args.google_api_key:
         os.environ["GOOGLE_API_KEY"] = args.google_api_key

 from dotenv import load_dotenv
+from agent.gemini_client import get_and_reset_counts, set_quiet
 from agent.runner import EpisodeResult, run_episode
 from game.scenarios import SCENARIOS
 from parlay_env.models import PersonaType
 async def run_inspect_mode(args) -> None:
+    out_path = Path(getattr(args, "inspect_output", "data/inspect_run.jsonl"))
     out_path.parent.mkdir(parents=True, exist_ok=True)
     coverage: dict[tuple[str, str], int] = defaultdict(int)
     min_per_combo = max(2, args.episodes // len(REQUIRED_COMBINATIONS))
     total_live_calls: int = 0
     total_fallback_calls: int = 0
+    _verbose = not getattr(args, "quiet", False)
     with open(output_path, "w", encoding="utf-8") as out_f:
         while len(kept_records) < args.episodes:
                     _live_d, _fall_d = get_and_reset_counts()
                     total_live_calls += _live_d
                     total_fallback_calls += _fall_d
+                    if _verbose:
+                        print(
+                            f"[EP --/{args.episodes:02d}] "
+                            f"{persona}×{scenario_id:<27s} | "
+                            f"reward={record.get('reward', 0.0):+.2f} | "
+                            f"eff={record.get('deal_efficiency', 0.0):.3f} | "
+                            f"kept=NO  | "
+                            f"total_kept={len(kept_records)}/{generated} | "
+                            f"gemini_live={_live_d} fallback={_fall_d}",
+                            file=sys.stderr,
+                        )
                     continue
                 out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                 total_live_calls += _live
                 total_fallback_calls += _fall
                 _ep_num = len(kept_records)
+                if _verbose:
+                    _reward = record.get("reward", 0.0)
+                    _eff = record.get("deal_efficiency", 0.0)
+                    _combo = f"{record['persona']}×{record['scenario_id']}"
                     print(
+                        f"[EP {_ep_num:02d}/{args.episodes:02d}] "
+                        f"{_combo:<35s} | "
+                        f"reward={_reward:+.2f} | "
+                        f"eff={_eff:.3f} | "
+                        f"kept=YES | "
+                        f"total_kept={_ep_num}/{generated} | "
+                        f"gemini_live={_live} fallback={_fall}",
                         file=sys.stderr,
                     )
+                    if _ep_num in (20, 40, 60):
+                        _all_rewards = [r.get("reward", 0.0) for r in kept_records]
+                        _all_eff = [r.get("deal_efficiency", 0.0) for r in kept_records]
+                        _combos_covered = len({(r["persona"], r["scenario_id"]) for r in kept_records})
+                        print(f"\n{'━' * 40}", file=sys.stderr)
+                        print(f"[CHECKPOINT {_ep_num}/{args.episodes}]", file=sys.stderr)
+                        print(
+                            f"  Kept so far     : {_ep_num}/{generated}  ({100 * _ep_num / max(generated, 1):.1f}%)",
+                            file=sys.stderr,
+                        )
+                        print(f"  Mean reward     : {statistics.mean(_all_rewards):.2f}", file=sys.stderr)
+                        print(f"  Mean efficiency : {statistics.mean(_all_eff):.3f}", file=sys.stderr)
+                        print(f"  Combos covered  : {_combos_covered}/9", file=sys.stderr)
+                        print(f"  Live calls total: {total_live_calls}", file=sys.stderr)
+                        print(f"  Fallback total  : {total_fallback_calls}", file=sys.stderr)
+                        print(f"{'━' * 40}\n", file=sys.stderr)
                 coverage[(persona, scenario_id)] += 1
                 kept_reason_counts[reason] += 1
                 progress_made = True
                     total_live_calls += _live
                     total_fallback_calls += _fall
                     _ep_num = len(kept_records)
+                    if _verbose:
+                        _reward = record.get("reward", 0.0)
+                        _eff = record.get("deal_efficiency", 0.0)
+                        _combo = f"{record['persona']}×{record['scenario_id']}"
                         print(
+                            f"[EP {_ep_num:02d}/{args.episodes:02d}] "
+                            f"{_combo:<35s} | "
+                            f"reward={_reward:+.2f} | "
+                            f"eff={_eff:.3f} | "
+                            f"kept=YES | "
+                            f"total_kept={_ep_num}/{generated} | "
+                            f"gemini_live={_live} fallback={_fall}",
                             file=sys.stderr,
                         )
+                        if _ep_num in (20, 40, 60):
+                            _all_rewards = [r.get("reward", 0.0) for r in kept_records]
+                            _all_eff = [r.get("deal_efficiency", 0.0) for r in kept_records]
+                            _combos_covered = len({(r["persona"], r["scenario_id"]) for r in kept_records})
+                            print(f"\n{'━' * 40}", file=sys.stderr)
+                            print(f"[CHECKPOINT {_ep_num}/{args.episodes}]", file=sys.stderr)
+                            print(
+                                f"  Kept so far     : {_ep_num}/{generated}  ({100 * _ep_num / max(generated, 1):.1f}%)",
+                                file=sys.stderr,
+                            )
+                            print(f"  Mean reward     : {statistics.mean(_all_rewards):.2f}", file=sys.stderr)
+                            print(f"  Mean efficiency : {statistics.mean(_all_eff):.3f}", file=sys.stderr)
+                            print(f"  Combos covered  : {_combos_covered}/9", file=sys.stderr)
+                            print(f"  Live calls total: {total_live_calls}", file=sys.stderr)
+                            print(f"  Fallback total  : {total_fallback_calls}", file=sys.stderr)
+                            print(f"{'━' * 40}\n", file=sys.stderr)
                     coverage[(persona, scenario_id)] += 1
                     kept_reason_counts[reason] += 1
                 else:
                     _live_d, _fall_d = get_and_reset_counts()
                     total_live_calls += _live_d
                     total_fallback_calls += _fall_d
+                    if _verbose:
+                        print(
+                            f"[EP --/{args.episodes:02d}] "
+                            f"{persona}×{scenario_id:<27s} | "
+                            f"reward={record.get('reward', 0.0):+.2f} | "
+                            f"eff={record.get('deal_efficiency', 0.0):.3f} | "
+                            f"kept=NO  | "
+                            f"total_kept={len(kept_records)}/{generated} | "
+                            f"gemini_live={_live_d} fallback={_fall_d}",
+                            file=sys.stderr,
+                        )
     discard_pct = (discarded / max(generated, 1)) * 100.0
     print(
     parser.add_argument(
         "--inspect",
         action="store_true",
+        help="Run a fixed 60-episode quality diagnostic; writes data/inspect_run.jsonl",
+    )
+    parser.add_argument(
+        "--inspect-output",
+        type=str,
+        default="data/inspect_run.jsonl",
+        dest="inspect_output",
+        help="Output path for --inspect mode (default: data/inspect_run.jsonl)",
+    )
+    parser.add_argument(
+        "--quiet",
+        action="store_true",
+        help="Suppress per-episode and per-call stderr output (final summary always shown)",
     )
     args = parser.parse_args()
     logging.getLogger("google_genai").setLevel(logging.WARNING)
     logging.getLogger("google_genai.models").setLevel(logging.WARNING)
+    if args.quiet:
+        set_quiet(True)
+        logging.disable(logging.WARNING)
     if args.google_api_key:
         os.environ["GOOGLE_API_KEY"] = args.google_api_key

training/grpo_env_wrapper.py ADDED Viewed

	@@ -0,0 +1,188 @@

+"""
+ParlayGRPOEnvWrapper — wraps GRPOTrainer to expose a tool-call-style API
+while keeping the underlying Parlay environment's standard step() interface
+unchanged.
+Per the OpenEnv / TRL compatibility pattern confirmed by @burtenshaw:
+  "That's correct, if you want to use the env as is."
+reset() runs a one-turn episode via agent.runner.run_episode(); play_turn() then
+reports that cached result in step()-style form (observation, reward, done, info).
+No changes are made to parlay_env/server.py or the environment code itself.
+Only the training script (grpo_train.py) instantiates this wrapper.
+Usage:
+    from training.grpo_env_wrapper import ParlayGRPOEnvWrapper
+    trainer = GRPOTrainer(model=..., reward_funcs=..., args=..., ...)
+    wrapper = ParlayGRPOEnvWrapper(trainer)
+    wrapper.train()   # delegates to trainer.train()
+    # Tool-call-style interface (for evaluation / rollout loops outside training):
+    obs = wrapper.reset(scenario_id="saas_enterprise", persona="shark")
+    step_result = wrapper.play_turn({"offer_amount": 145000, "utterance": "Counter-offer."})
+"""
+import asyncio
+import logging
+from typing import Any, Optional
+logger = logging.getLogger(__name__)
+class ParlayGRPOEnvWrapper:
+    """
+    Thin adapter that pairs a GRPOTrainer with a tool-call-style view of a
+    Parlay episode (reset / play_turn).
+    The wrapped trainer is left completely unmodified: train() and
+    save_model() simply delegate to it. reset() runs a one-turn episode
+    through agent.runner.run_episode() and caches the result; play_turn()
+    reports that cached result in step() form without advancing the
+    environment.
+    Attributes:
+        trainer: The underlying GRPOTrainer instance.
+        _session: Active episode session dict (set after reset()).
+        _step_count: Number of play_turn() calls since the last reset().
+    """
+    def __init__(self, trainer: Any) -> None:
+        """
+        Args:
+            trainer: A configured GRPOTrainer (or compatible) instance.
+                     Must expose .train() and .save_model() methods.
+        """
+        self.trainer = trainer
+        self._session: Optional[dict[str, Any]] = None
+        self._step_count: int = 0
+        logger.info("ParlayGRPOEnvWrapper initialised with trainer=%s", type(trainer).__name__)
+    # ── Env interface ─────────────────────────────────────────────────────────
+    def reset(
+        self,
+        scenario_id: str = "saas_enterprise",
+        persona: str = "shark",
+        seed: int = 42,
+    ) -> dict[str, Any]:
+        """
+        Start a new Parlay episode (tool-call style: reset()).
+        Runs a fresh one-turn run_episode() call and caches its result for the
+        play_turn() calls that follow.
+        Safe to call from plain synchronous code and from inside an already
+        running event loop (for example a notebook cell). In the latter case
+        the episode is driven on a worker thread with its own loop, because
+        neither loop.run_until_complete() nor asyncio.run() may be called from
+        a running loop. The calling thread blocks until the episode finishes.
+        Args:
+            scenario_id: Which negotiation scenario to load.
+            persona:     Opponent persona key.
+            seed:        Random seed for reproducibility.
+        Returns:
+            Observation dict with initial state.
+        """
+        from concurrent.futures import ThreadPoolExecutor
+        from parlay_env.models import PersonaType
+        from agent.runner import run_episode
+        self._step_count = 0
+        # Run a fresh episode to get initial state (mock-safe: works without API key)
+        async def _init():
+            return await run_episode(
+                persona=PersonaType(persona),
+                scenario_id=scenario_id,
+                inject_noise=False,
+                force_drift=False,
+                seed=seed,
+                max_turns=1,
+            )
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            # No loop is running in this thread, so asyncio.run() can own one.
+            # Errors raised by the episode itself propagate unchanged instead
+            # of being mistaken for a loop problem and triggering a re-run.
+            result = asyncio.run(_init())
+        else:
+            # A loop is already running here; blocking on it would raise, so
+            # drive the episode on a helper thread that gets its own loop.
+            with ThreadPoolExecutor(max_workers=1) as pool:
+                result = pool.submit(lambda: asyncio.run(_init())).result()
+        self._session = {
+            "scenario_id": scenario_id,
+            "persona": persona,
+            "seed": seed,
+            "last_result": result,
+        }
+        obs = {
+            "step_count": 0,
+            "scenario_id": scenario_id,
+            "persona": persona,
+            "offer_history": list(result.session.offer_history),
+            "belief_state": result.session.belief_history[-1].model_dump(),
+            "episode_done": False,
+        }
+        logger.debug("reset() → scenario=%s persona=%s", scenario_id, persona)
+        return obs
+    def play_turn(self, action: dict[str, Any]) -> dict[str, Any]:
+        """
+        Submit one negotiation action (tool-call style: play_turn()).
+        Does not advance the environment: the observation, reward and done
+        flag are derived from the episode result cached by reset(), combined
+        with the submitted action.
+        Args:
+            action: Dict with any of:
+                - offer_amount (float | None)
+                - utterance    (str)
+                - tactical_move (str | None)
+        Returns:
+            Step result dict:
+                observation (dict), reward (float), done (bool), info (dict)
+        Raises:
+            RuntimeError: If reset() has not been called yet.
+        """
+        if self._session is None:
+            raise RuntimeError("Call reset() before play_turn().")
+        self._step_count += 1
+        result = self._session["last_result"]
+        state = result.session
+        offer = action.get("offer_amount")
+        utterance = action.get("utterance", "")
+        tactical_move = action.get("tactical_move")
+        # NOTE(review): reward and offer_history treat a falsy offer (None or 0)
+        # as "no offer", while `done` only checks for None. Confirm which rule
+        # is intended before unifying them.
+        reward = float(result.grade.total_reward) if offer else 0.0
+        done = state.episode_done or (offer is not None and result.final_price is not None)
+        obs = {
+            "step_count": self._step_count,
+            "scenario_id": self._session["scenario_id"],
+            "persona": self._session["persona"],
+            "offer_history": list(state.offer_history) + ([offer] if offer else []),
+            "belief_state": state.belief_history[-1].model_dump(),
+            "episode_done": done,
+            "last_utterance": utterance,
+            "last_tactical_move": tactical_move,
+        }
+        info = {
+            "deal_efficiency": result.grade.deal_efficiency,
+            "tom_accuracy_avg": result.grade.tom_accuracy_avg,
+            "drift_adapted": result.grade.drift_adapted,
+        }
+        logger.debug(
+            "play_turn() step=%d offer=%s reward=%.2f done=%s",
+            self._step_count, offer, reward, done,
+        )
+        return {"observation": obs, "reward": reward, "done": done, "info": info}
+    # ── Training delegation ───────────────────────────────────────────────────
+    def train(self) -> None:
+        """
+        Run GRPO training. Delegates entirely to the wrapped GRPOTrainer.
+        The reward functions and dataset are already set on trainer at init time.
+        """
+        logger.info("ParlayGRPOEnvWrapper.train() → delegating to %s.train()", type(self.trainer).__name__)
+        self.trainer.train()
+    def save_model(self, output_dir: str) -> None:
+        """Save the trained model. Delegates to the wrapped trainer."""
+        self.trainer.save_model(output_dir)
+        logger.info("Model saved to %s", output_dir)
+    def __repr__(self) -> str:
+        """Return a short summary: trainer type, session presence, step count."""
+        return (
+            f"ParlayGRPOEnvWrapper("
+            f"trainer={type(self.trainer).__name__}, "
+            f"session={'active' if self._session else 'none'}, "
+            f"step={self._step_count})"
+        )

training/grpo_train.py CHANGED Viewed

@@ -149,7 +149,9 @@ def train_grpo(
         max_steps=steps,
     )
-    trainer = GRPOTrainer(
         model=sft_model_path,
         reward_funcs=[
             negotiation_efficiency_reward,
@@ -162,6 +164,7 @@ def train_grpo(
         train_dataset=dataset,
         peft_config=lora_config,
     )
     logger.info(
         f"Starting GRPO training: model={sft_model_path}, "

         max_steps=steps,
     )
+    from .grpo_env_wrapper import ParlayGRPOEnvWrapper
+    _trainer = GRPOTrainer(
         model=sft_model_path,
         reward_funcs=[
             negotiation_efficiency_reward,
         train_dataset=dataset,
         peft_config=lora_config,
     )
+    trainer = ParlayGRPOEnvWrapper(_trainer)
     logger.info(
         f"Starting GRPO training: model={sft_model_path}, "

training/random_baseline.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""
+Random-policy baseline for Parlay.
+Runs N episodes with purely random move selection (no Gemini API — always
+uses mock mode) and writes a summary JSON that the training notebook uses
+to benchmark SFT / GRPO improvement.
+Usage:
+    python training/random_baseline.py
+    python training/random_baseline.py --episodes 20 --output data/random_baseline.json
+"""
+import argparse
+import asyncio
+import json
+import logging
+import os
+import random
+import statistics
+import sys
+from pathlib import Path
+# Repo root on sys.path when run as `python training/random_baseline.py`
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+# Force mock mode — random baseline never calls the real Gemini API
+os.environ.pop("GOOGLE_API_KEY", None)
+from agent.runner import run_episode
+from game.scenarios import SCENARIOS
+from parlay_env.models import PersonaType
+logger = logging.getLogger(__name__)
+# Every (persona, scenario) pairing the baseline cycles through: the three
+# personas crossed with the three scenarios, nine combinations in total,
+# ordered persona-major.
+REQUIRED_COMBINATIONS = [
+    (persona, scenario)
+    for persona in ["shark", "diplomat", "veteran"]
+    for scenario in ["saas_enterprise", "hiring_package", "acquisition_term_sheet"]
+]
+async def _run_baseline(episodes: int) -> list[dict]:
+    """
+    Play `episodes` baseline episodes and collect one stats dict per success.
+    Episodes cycle through REQUIRED_COMBINATIONS in order and use the episode
+    index as their seed. An episode that raises is logged as a warning and
+    left out, so the returned list may be shorter than `episodes`.
+    """
+    collected: list[dict] = []
+    combo_count = len(REQUIRED_COMBINATIONS)
+    for episode_idx in range(episodes):
+        persona_str, scenario_id = REQUIRED_COMBINATIONS[episode_idx % combo_count]
+        try:
+            outcome = await run_episode(
+                persona=PersonaType(persona_str),
+                scenario_id=scenario_id,
+                inject_noise=True,   # random noise simulates random policy
+                force_drift=random.random() < 0.4,
+                seed=episode_idx,
+                max_turns=14,
+            )
+            grade = outcome.grade
+            episode_stats = {
+                "persona": persona_str,
+                "scenario_id": scenario_id,
+                "reward": grade.total_reward,
+                "deal_efficiency": grade.deal_efficiency,
+                "deal_reached": outcome.final_price is not None,
+                "tom_accuracy_avg": grade.tom_accuracy_avg,
+                "drift_adapted": grade.drift_adapted,
+                "termination_reason": grade.termination_reason,
+            }
+            collected.append(episode_stats)
+        except Exception as exc:
+            logger.warning(
+                "Baseline episode %d failed (%s, %s): %s",
+                episode_idx, persona_str, scenario_id, exc,
+            )
+    return collected
+def _summarise(results: list[dict]) -> dict:
+    """
+    Collapse per-episode stats into the baseline summary dict.
+    Returns an error marker with n_episodes=0 when `results` is empty;
+    otherwise reward statistics, mean efficiency, and the deal / drift-adapted
+    rates, all rounded.
+    """
+    total = len(results)
+    if total == 0:
+        return {"error": "no episodes completed", "n_episodes": 0}
+    reward_values = [row["reward"] for row in results]
+    efficiency_values = [row["deal_efficiency"] for row in results]
+    deals = len([row for row in results if row["deal_reached"]])
+    adapted = len([row for row in results if row["drift_adapted"]])
+    # stdev needs at least two samples; a single episode reports 0.0.
+    if len(reward_values) > 1:
+        reward_spread = statistics.stdev(reward_values)
+    else:
+        reward_spread = 0.0
+    note_text = (
+        "Baseline uses Parlay mock responses (no real Gemini API). "
+        "Compare mean_reward and mean_efficiency against SFT/GRPO runs."
+    )
+    return {
+        "n_episodes": total,
+        "mean_reward": round(statistics.mean(reward_values), 3),
+        "std_reward": round(reward_spread, 3),
+        "min_reward": round(min(reward_values), 3),
+        "max_reward": round(max(reward_values), 3),
+        "mean_efficiency": round(statistics.mean(efficiency_values), 4),
+        "deal_rate": round(deals / total, 4),
+        "drift_adapted_rate": round(adapted / total, 4),
+        "policy": "random_mock",
+        "note": note_text,
+    }
+def main() -> None:
+    """
+    CLI entry point: run the random-policy baseline and write its summary JSON.
+    Parses --episodes and --output, runs the episodes, writes the summary to
+    the output path (creating parent directories as needed) and prints a short
+    report. When no episode completes, the summary carries only an error
+    marker, so each metric in the report is shown as N/A.
+    """
+    parser = argparse.ArgumentParser(description="Parlay random-policy baseline")
+    parser.add_argument("--episodes", type=int, default=27,
+                        help="Number of baseline episodes (default: 27 = 3 per combo)")
+    parser.add_argument("--output", type=str, default="data/random_baseline.json",
+                        help="Output path for the baseline JSON summary")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+    print(f"Running {args.episodes} random-policy episodes (mock mode, no API key)…")
+    results = asyncio.run(_run_baseline(args.episodes))
+    summary = _summarise(results)
+    out_path = Path(args.output)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(summary, f, indent=2)
+    print(f"\nBaseline complete ({summary['n_episodes']} episodes):")
+    print(f"  Mean reward     : {summary.get('mean_reward', 'N/A')}")
+    print(f"  Mean efficiency : {summary.get('mean_efficiency', 'N/A')}")
+    # The percent format spec only works on numbers; applying it to the "N/A"
+    # placeholder (summary without deal_rate) would raise ValueError.
+    deal_rate = summary.get("deal_rate")
+    deal_rate_text = "N/A" if deal_rate is None else f"{deal_rate:.1%}"
+    print(f"  Deal rate       : {deal_rate_text}")
+    print(f"  Written to      : {out_path.resolve()}")
+if __name__ == "__main__":
+    main()

training/reward_fn.py CHANGED Viewed

@@ -10,6 +10,10 @@ from parlay_env.reward import GAMMA, OMEGA
 logger = logging.getLogger(__name__)
 def _clean_json(text: str) -> str:
     """Strip markdown code fences and surrounding whitespace."""
@@ -18,22 +22,29 @@ def _clean_json(text: str) -> str:
 def negotiation_efficiency_reward(completions: list[str], **kwargs) -> list[float]:
     """
-    Primary reward: fraction of ZOPA captured.
-    Parses offer_amount from each completion JSON.
-    E = (offer - batna_seller) / zopa_width  ∈ [0, 1]
     Returns value in [0, GAMMA] = [0, 100].
     Args:
         completions: List of G=8 model outputs (JSON strings).
-        **kwargs:    Must contain batna_seller (float) and zopa_width (float).
     Returns:
         List of float rewards, same length as completions.
     """
     rewards = []
-    batna = float(kwargs.get("batna_seller", 0))
-    zopa_width = float(kwargs.get("zopa_width", 1))
     if zopa_width <= 0:
         zopa_width = 1.0
@@ -42,7 +53,12 @@ def negotiation_efficiency_reward(completions: list[str], **kwargs) -> list[floa
             data = json.loads(_clean_json(completion))
             offer = float(data.get("offer_amount") or 0)
             if offer > 0:
-                E = max(0.0, min(1.0, (offer - batna) / zopa_width))
                 rewards.append(float(E * GAMMA))
             else:
                 rewards.append(0.0)
@@ -59,6 +75,8 @@ def tom_accuracy_reward(completions: list[str], **kwargs) -> list[float]:
     Uses keyword matching against persona-specific signals as a lightweight proxy.
     Full accuracy computed by grader.py; this is used for fast training feedback.
     Args:
         completions: List of G=8 model outputs.
         **kwargs:    Must contain persona (str).
@@ -70,7 +88,8 @@ def tom_accuracy_reward(completions: list[str], **kwargs) -> list[float]:
     tom_signals: dict[str, list[str]] = {
         "shark":    ["deadline", "competitor", "alternative", "pressure", "offer expires"],
         "diplomat": ["relationship", "partnership", "mutual", "together", "trust"],
-        "veteran":  ["experience", "seen this", "long-term", "trust", "patience"],
     }
     signals = tom_signals.get(persona.lower(), [])
     rewards = []
@@ -83,35 +102,46 @@ def tom_accuracy_reward(completions: list[str], **kwargs) -> list[float]:
 def anti_capitulation_reward(completions: list[str], **kwargs) -> list[float]:
     """
-    Hard penalty if the agent's offer falls below its own BATNA.
-    The agent plays as the SELLER. The seller's own walk-away price is
-    batna_seller — the minimum price the agent will accept. Any offer
-    below batna_seller means the agent is capitulating below its floor.
     Returns -OMEGA (= -200) for capitulation, 0 otherwise.
-    This is a hard cliff — no smoothing.
     Args:
         completions: List of G=8 model outputs.
-        **kwargs:    Must contain batna_seller (float) — the seller-agent's
-                     own walk-away price (minimum acceptable price).
     Returns:
         List of float rewards: -OMEGA or 0.
     """
-    batna_self = float(kwargs.get("batna_seller", 0.0))
     rewards = []
     for completion in completions:
         try:
             data = json.loads(_clean_json(completion))
             offer = float(data.get("offer_amount") or float("inf"))
-            if offer < batna_self:
                 rewards.append(-float(OMEGA))
-                logger.debug(f"Capitulation detected: offer={offer} < batna={batna_self}")
             else:
                 rewards.append(0.0)
-        except (json.JSONDecodeError, ValueError):
             rewards.append(0.0)
     return rewards

 logger = logging.getLogger(__name__)
+# Scenarios where the AI plays as a BUYER (pushes offers DOWN).
+# For these, ZOPA efficiency is measured from the buyer's side.
+_BUYER_AI_SCENARIOS = frozenset({"hiring_package", "acquisition_term_sheet"})
 def _clean_json(text: str) -> str:
     """Strip markdown code fences and surrounding whitespace."""
 def negotiation_efficiency_reward(completions: list[str], **kwargs) -> list[float]:
     """
+    Primary reward: fraction of ZOPA captured by the AI agent.
+    For seller-AI scenarios (saas_enterprise):
+        E = (offer - batna_seller) / zopa_width  ∈ [0, 1]
+    For buyer-AI scenarios (hiring_package, acquisition_term_sheet):
+        E = (batna_buyer - offer) / zopa_width   ∈ [0, 1]
     Returns value in [0, GAMMA] = [0, 100].
     Args:
         completions: List of G=8 model outputs (JSON strings).
+        **kwargs:    Must contain batna_seller (float), batna_buyer (float),
+                     zopa_width (float), and optionally scenario_id (str).
     Returns:
         List of float rewards, same length as completions.
     """
     rewards = []
+    batna_seller = float(kwargs.get("batna_seller", 0))
+    batna_buyer  = float(kwargs.get("batna_buyer", batna_seller))
+    zopa_width   = float(kwargs.get("zopa_width", 1))
+    scenario_id  = str(kwargs.get("scenario_id", ""))
+    is_buyer_ai  = scenario_id in _BUYER_AI_SCENARIOS
     if zopa_width <= 0:
         zopa_width = 1.0
             data = json.loads(_clean_json(completion))
             offer = float(data.get("offer_amount") or 0)
             if offer > 0:
+                if is_buyer_ai:
+                    # AI is buyer: lower offers are better; score = (buyer_batna - offer) / width
+                    E = max(0.0, min(1.0, (batna_buyer - offer) / zopa_width))
+                else:
+                    # AI is seller: higher offers are better; score = (offer - seller_batna) / width
+                    E = max(0.0, min(1.0, (offer - batna_seller) / zopa_width))
                 rewards.append(float(E * GAMMA))
             else:
                 rewards.append(0.0)
     Uses keyword matching against persona-specific signals as a lightweight proxy.
     Full accuracy computed by grader.py; this is used for fast training feedback.
+    Signal lists are disjoint across personas (no double-counting).
     Args:
         completions: List of G=8 model outputs.
         **kwargs:    Must contain persona (str).
     tom_signals: dict[str, list[str]] = {
         "shark":    ["deadline", "competitor", "alternative", "pressure", "offer expires"],
         "diplomat": ["relationship", "partnership", "mutual", "together", "trust"],
+        # "trust" removed from veteran to avoid double-counting with diplomat
+        "veteran":  ["experience", "seen this", "long-term", "patience", "seasoned"],
     }
     signals = tom_signals.get(persona.lower(), [])
     rewards = []
 def anti_capitulation_reward(completions: list[str], **kwargs) -> list[float]:
     """
     Hard penalty if the agent's offer crosses its own BATNA floor.

     For seller-AI: offer < batna_seller is capitulation.
     For buyer-AI:  offer > batna_buyer  is capitulation (paying too much).
     Returns -OMEGA for capitulation, 0 otherwise.

     A completion that carries no usable offer (missing, null, zero or empty
     ``offer_amount``) is never penalised, for either role. Malformed output
     (invalid JSON, JSON that is not an object, non-numeric offer) is logged
     and treated as 0 as well.

     Args:
         completions: List of G=8 model outputs.
         **kwargs:    Must contain batna_seller (float).
                      Optionally batna_buyer (float) and scenario_id (str).
                      Without batna_buyer no buyer offer counts as capitulation.

     Returns:
         List of float rewards: -OMEGA or 0, same length as completions.
     """
     batna_seller = float(kwargs.get("batna_seller", 0.0))
     batna_buyer = float(kwargs.get("batna_buyer", float("inf")))
     is_buyer_ai = str(kwargs.get("scenario_id", "")) in _BUYER_AI_SCENARIOS

     rewards: list[float] = []
     for completion in completions:
         try:
             data = json.loads(_clean_json(completion))
             raw_offer = data.get("offer_amount")
             # A falsy offer means "no offer made". It must not be mapped to
             # +inf: for a buyer-AI, inf > batna_buyer would be scored as
             # capitulation even though nothing was offered.
             offer = float(raw_offer) if raw_offer else None
         except (ValueError, TypeError, AttributeError) as exc:
             # ValueError covers json.JSONDecodeError and non-numeric strings;
             # TypeError/AttributeError cover non-object JSON and non-scalar offers.
             logger.warning("anti_capitulation_reward parse error (treated as 0): %s", exc)
             rewards.append(0.0)
             continue

         if offer is None:
             rewards.append(0.0)
             continue

         if is_buyer_ai:
             capitulated = offer > batna_buyer
         else:
             capitulated = offer < batna_seller

         if capitulated:
             rewards.append(-float(OMEGA))
             logger.debug(
                 "Capitulation: offer=%s %s batna=%s=%s",
                 offer,
                 ">" if is_buyer_ai else "<",
                 "buyer" if is_buyer_ai else "seller",
                 batna_buyer if is_buyer_ai else batna_seller,
             )
         else:
             rewards.append(0.0)
     return rewards