<!-- Hugging Face Space: pratinavseth/cricket-captain-llm · status: Running · file size: 33,563 bytes -->

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>CricketCaptain-LLM — OpenEnv Hackathon 2026</title>
<style>
  /* Global reset: border-box sizing and no default margin/padding on any element. */
  * { box-sizing: border-box; margin: 0; padding: 0; }
  /* Dark page theme: default font stack, page background and base text color. */
  body { font-family: 'Segoe UI', system-ui, sans-serif; background: #0f1117; color: #e8eaf6; }

  /* Wrapper around all slides; spans the full page width. */
  .deck { width: 100%; }
  /* Base slide: hidden by default. The flex properties below only take effect
     once .slide.active switches display to flex, giving a vertically centered
     column that is at least one viewport tall. */
  .slide {
    display: none;
    min-height: 100vh;
    padding: 52px 80px;
    flex-direction: column;
    justify-content: center;
    position: relative; /* positioning context for the absolutely placed .slide-number */
    overflow: hidden;
  }
  /* Must come after .slide so it wins the cascade; the page script moves the
     "active" class from slide to slide. */
  .slide.active { display: flex; }

  /* Per-slide background gradients (one class per slide, all at 135deg).
     .s10 deliberately or incidentally repeats the .s1 gradient values. */
  .s1  { background: linear-gradient(135deg, #0d1b2a 0%, #0f2744 100%); }
  .s2  { background: linear-gradient(135deg, #0a1e0a 0%, #0e2d12 100%); }
  .s3  { background: linear-gradient(135deg, #0d1e3a 0%, #112952 100%); }
  .s4  { background: linear-gradient(135deg, #1a0930 0%, #2c1060 100%); }
  .s5  { background: linear-gradient(135deg, #1e0a06 0%, #3d1008 100%); }
  .s6  { background: linear-gradient(135deg, #001d2e 0%, #003050 100%); }
  .s7  { background: linear-gradient(135deg, #0e1e0e 0%, #1a3a1a 100%); }
  .s8  { background: linear-gradient(135deg, #1a1400 0%, #332800 100%); }
  .s9  { background: linear-gradient(135deg, #001a1a 0%, #003030 100%); }
  .s10 { background: linear-gradient(135deg, #0d1b2a 0%, #0f2744 100%); }

  /* "NN / 10" counter pinned to the top-right corner of its slide
     (positioned relative to .slide). */
  .slide-number {
    position: absolute; top: 22px; right: 36px;
    font-size: 12px; color: rgba(255,255,255,0.30); letter-spacing: 2px;
    font-family: monospace;
  }

  /* Type scale: h1 = deck title, h2 = slide title, h3 = column/section heading. */
  h1 { font-size: 2.9rem; font-weight: 700; line-height: 1.15; margin-bottom: 14px; }
  h2 { font-size: 1.85rem; font-weight: 600; margin-bottom: 20px; color: #90caf9; }
  h3 { font-size: 1.1rem; font-weight: 600; margin-bottom: 8px; color: #80deea; }
  /* Body copy; max-width caps the line length on wide screens. */
  p  { font-size: 1.05rem; line-height: 1.65; color: #cfd8dc; max-width: 860px; }
  /* Title-slide lead paragraph and italic quote line. */
  .subtitle { font-size: 1.25rem; color: #90caf9; margin-bottom: 28px; font-weight: 400; max-width: 700px; }
  .tagline  { font-size: 1.4rem; color: #a5d6a7; font-style: italic; margin-top: 18px; }

  /* Equal-width grid helpers. No markup in this file currently uses .three-col,
     and the one .four-col on slide 2 overrides its columns inline. */
  .two-col   { display: grid; grid-template-columns: 1fr 1fr; gap: 40px; margin-top: 8px; }
  .three-col { display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 24px; margin-top: 8px; }
  .four-col  { display: grid; grid-template-columns: 1fr 1fr 1fr 1fr; gap: 18px; margin-top: 8px; }

  /* Translucent stat card: a large .value figure with a small .label caption. */
  .card {
    background: rgba(255,255,255,0.06);
    border: 1px solid rgba(255,255,255,0.12);
    border-radius: 12px;
    padding: 20px;
  }
  .card .value { font-size: 2rem; font-weight: 700; color: #80cbc4; }
  /* Larger figure variant (used by the cards on the results slide). */
  .card .value.large { font-size: 2.6rem; }
  .card .label { font-size: 0.85rem; color: #90a4ae; margin-top: 4px; line-height: 1.4; }

  /* Lists: native bullets are removed and replaced by a "▸" marker drawn with
     ::before, absolutely positioned inside each item's 20px left padding. */
  ul { list-style: none; margin-top: 6px; }
  ul li { padding: 5px 0; padding-left: 20px; position: relative; color: #cfd8dc; font-size: 1.0rem; line-height: 1.5; }
  ul li::before { content: "▸"; position: absolute; left: 0; color: #4db6ac; }

  /* Small colored label chips; combine .tag with one .tag-<color> modifier. */
  .tag {
    display: inline-block; padding: 2px 9px; border-radius: 4px;
    font-size: 0.76rem; font-weight: 600; letter-spacing: 0.4px; margin: 2px 3px;
  }
  /* Color modifiers. The markup in this file uses blue, green, orange and
     purple; red, teal and yellow are defined but currently unreferenced. */
  .tag-green  { background: #1b5e20; color: #a5d6a7; }
  .tag-orange { background: #bf360c; color: #ffe0b2; }
  .tag-blue   { background: #0d47a1; color: #bbdefb; }
  .tag-purple { background: #4a148c; color: #e1bee7; }
  .tag-red    { background: #b71c1c; color: #ffcdd2; }
  .tag-teal   { background: #004d40; color: #b2dfdb; }
  .tag-yellow { background: #f57f17; color: #fff9c4; }

  /* Data tables: full width, hairline row separators (none under the last row),
     and a faint highlight on the hovered row. */
  table { width: 100%; border-collapse: collapse; margin-top: 14px; font-size: 0.92rem; }
  th { background: rgba(255,255,255,0.09); padding: 9px 13px; text-align: left; color: #b0bec5; font-weight: 600; }
  td { padding: 8px 13px; border-bottom: 1px solid rgba(255,255,255,0.06); color: #cfd8dc; }
  tr:last-child td { border-bottom: none; }
  tr:hover td { background: rgba(255,255,255,0.03); }

  /* Inline code chip. */
  code {
    background: rgba(255,255,255,0.1); border-radius: 4px;
    padding: 1px 6px; font-family: 'Cascadia Code', 'Fira Code', monospace;
    font-size: 0.85em; color: #80cbc4;
  }
  /* Code/JSON listing panel; scrolls horizontally instead of wrapping. */
  pre {
    background: rgba(0,0,0,0.45); border-radius: 8px; padding: 14px 18px;
    font-family: 'Cascadia Code', 'Fira Code', monospace; font-size: 0.80rem;
    color: #a5d6a7; line-height: 1.55; overflow-x: auto; margin-top: 10px;
    border: 1px solid rgba(255,255,255,0.07);
  }
  /* Hand-applied highlight spans inside <pre>:
     .dim = comment text, .hi = highlighted literal, .kw = identifier/keyword. */
  pre .dim { color: #546e7a; }
  pre .hi  { color: #ffcc80; }
  pre .kw  { color: #80cbc4; }

  /* Deck progress indicator along the bottom edge of the viewport. No width is
     declared here: the page script sets it inline, and the transition animates
     each change. z-index keeps it above the .nav buttons. */
  .progress-bar {
    position: fixed; bottom: 0; left: 0; height: 3px;
    background: linear-gradient(90deg, #4db6ac, #7c4dff, #ef5350);
    transition: width 0.35s ease;
    z-index: 200;
  }
  /* Prev/Next button cluster fixed to the bottom-right of the viewport. */
  .nav {
    position: fixed; bottom: 22px; right: 36px;
    display: flex; gap: 10px; z-index: 100;
  }
  .nav button {
    background: rgba(255,255,255,0.10); border: 1px solid rgba(255,255,255,0.18);
    color: #fff; padding: 9px 20px; border-radius: 6px; cursor: pointer;
    font-size: 0.88rem; transition: background 0.2s;
  }
  .nav button:hover { background: rgba(255,255,255,0.20); }
  /* Dimmed look for a button carrying the disabled attribute. */
  .nav button:disabled { opacity: 0.25; cursor: default; }

  /* Keyboard hint, horizontally centered at the bottom of the viewport
     (left: 50% plus a -50% translate). */
  .slide-hint {
    position: fixed; bottom: 26px; left: 50%; transform: translateX(-50%);
    font-size: 11px; color: rgba(255,255,255,0.22); letter-spacing: 1px;
  }

  /* Callout boxes with a colored left rule and tinted background:
     .hl = teal, .wn = orange (also sets a smaller font size), .gr = green. */
  .hl { background: rgba(77,182,172,0.13); border-left: 3px solid #4db6ac; padding: 14px 18px; border-radius: 0 8px 8px 0; margin-top: 14px; }
  .wn { background: rgba(255,152,0,0.12); border-left: 3px solid #ff9800; padding: 14px 18px; border-radius: 0 8px 8px 0; margin-top: 14px; font-size: 0.93rem; }
  .gr { background: rgba(100,221,23,0.09); border-left: 3px solid #69f0ae; padding: 14px 18px; border-radius: 0 8px 8px 0; margin-top: 14px; }

  /* Wrapping flex row that holds .badge or .tag chips. */
  .pill-row { display: flex; flex-wrap: wrap; gap: 7px; margin-top: 10px; }

  /* Rounded pill, e.g. the tech-stack entries on the title slide. */
  .badge {
    display: inline-flex; align-items: center; gap: 6px;
    background: rgba(255,255,255,0.07); border: 1px solid rgba(255,255,255,0.13);
    padding: 5px 12px; border-radius: 20px; font-size: 0.82rem; color: #b0bec5;
  }

  /* Horizontal score bar (track + fill). No markup in this file currently uses
     these classes, and no fill color or width is defined here. */
  .score-bar { margin: 5px 0; }
  .score-bar .bar-wrap { background: rgba(255,255,255,0.08); border-radius: 4px; height: 8px; margin-top: 3px; }
  .score-bar .bar-fill { height: 8px; border-radius: 4px; }

  /* Two-column grid of small key/description tiles:
     .sk = the tile's key line, .sv = its description. */
  .signal-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; margin-top: 10px; }
  .signal-item { background: rgba(255,255,255,0.04); border: 1px solid rgba(255,255,255,0.08); border-radius: 8px; padding: 10px 14px; font-size: 0.88rem; }
  .signal-item .sk { color: #80deea; font-weight: 600; margin-bottom: 3px; }
  .signal-item .sv { color: #90a4ae; font-size: 0.82rem; }
</style>
</head>
<body>

<!-- ══════════════════════════════════════════════════════════
     SLIDE 1 — Title & Hook
═══════════════════════════════════════════════════════════ -->
<div class="deck">
<section class="slide s1 active" id="s1">
  <div class="slide-number">01 / 10</div>
  <p style="font-size:2.6rem; margin-bottom:6px;">🏏</p>
  <h1>CricketCaptain-LLM</h1>
  <p class="subtitle">A multi-agent RL training environment for <strong>strategic coherence</strong> — teaching LLMs to mean what they say across 300 consecutive decisions.</p>
  <p class="tagline">"I'll consolidate and preserve wickets"  →  then actually plays defensively for 4 overs.</p>
  <div style="margin-top:28px;">
    <p style="color:#90a4ae; font-size:0.95rem; margin-bottom:10px;">Targets: Theme #1 Multi-Agent Interaction + Theme #2 Long-Horizon Planning</p>
    <div class="pill-row">
      <span class="badge">🌐 OpenEnv ≥ 0.2.2</span>
      <span class="badge">⚡ TRL MT-GRPO</span>
      <span class="badge">🤖 google/gemma-4-26B-A4B-it</span>
      <span class="badge">🔁 HF Router</span>
      <span class="badge">📊 Cricsheet Markov Engine</span>
      <span class="badge">🎯 WDCT Benchmark</span>
    </div>
  </div>
</section>

<!-- ══════════════════════════════════════════════════════════
     SLIDE 2 — The Problem: What LLMs Can't Do
═══════════════════════════════════════════════════════════ -->
<section class="slide s2" id="s2">
  <div class="slide-number">02 / 10</div>
  <h2>The Gap We're Closing</h2>
  <div class="two-col">
    <div>
      <h3>WDCT Benchmark — Words &amp; Deeds Consistency</h3>
      <p style="font-size:0.95rem; margin-bottom:12px;">arxiv:2503.07003 — the only public benchmark directly measuring whether LLMs execute what they declare.</p>
      <div class="four-col" style="grid-template-columns: 1fr 1fr; gap: 12px; margin-top:10px;">
        <div class="card"><div class="value">0.49</div><div class="label">Smaller models (7B class)</div></div>
        <div class="card"><div class="value">0.76</div><div class="label">GPT-4 (best published)</div></div>
      </div>
      <div class="hl" style="margin-top:14px;">
        <strong>No RL training environment has targeted this benchmark directly.</strong><br>
        We built one.
      </div>
    </div>
    <div>
      <h3>Why It Matters Beyond Cricket</h3>
      <ul>
        <li>Planning agents that can't commit to strategy fail silently</li>
        <li>Reasoning traces are only useful if they predict the next action</li>
        <li>Chain-of-thought gains are undermined by declaration-execution drift</li>
        <li>Every agentic system suffers from this; no training environment targets it</li>
      </ul>
      <div class="wn" style="margin-top:14px;">
        A model that <em>says</em> "preserve wickets" but <em>plays</em> aggressive shots hasn't learned strategy — it's learned to <em>sound</em> strategic.
      </div>
    </div>
  </div>
</section>

<!-- ══════════════════════════════════════════════════════════
     SLIDE 3 — Why Cricket: The Perfect Domain
═══════════════════════════════════════════════════════════ -->
<section class="slide s3" id="s3">
  <div class="slide-number">03 / 10</div>
  <h2>Why Cricket Is the Right Stress Test</h2>
  <div class="two-col">
    <div>
      <table style="margin-top:0;">
        <tr><th>Property</th><th>Capability Forced</th></tr>
        <tr><td>300 consecutive decisions (50×6)</td><td>Long-horizon coherence</td></tr>
        <tr><td>10 wickets as irreversible budget</td><td>Consequence-aware risk planning</td></tr>
        <tr><td>Powerplay → Middle → Death phases</td><td>Strategic revision at regime shifts</td></tr>
        <tr><td>DLS par = ground-truth optimal score</td><td>Objective performance signal</td></tr>
        <tr><td>"Declare strategy, then play shots"</td><td>Declaration-execution directly testable</td></tr>
        <tr><td>LLM opponent (HF Router)</td><td>Theory-of-mind / opponent modeling</td></tr>
        <tr><td>Full match: bat &amp; bowl both innings</td><td>End-to-end role adaptation</td></tr>
      </table>
    </div>
    <div>
      <h3>Why Not Chess / Math / Coding?</h3>
      <ul>
        <li>Chess: no natural language declarations; coherence untestable</li>
        <li>Math: single-step; no 300-turn consistency requirement</li>
        <li>Coding: rare phase transitions; no risk budget exhaustion</li>
        <li>Cricket: declarations are mandatory tool calls, shots are mandatory tool calls — alignment is <em>structurally enforced</em></li>
      </ul>
      <div class="gr">
        Cricket is not the goal. It's the <em>measurement apparatus</em> for a capability that transfers to every agentic domain.
      </div>
    </div>
  </div>
</section>

<!-- ══════════════════════════════════════════════════════════
     SLIDE 4 — Environment Architecture & State Machine
═══════════════════════════════════════════════════════════ -->
<section class="slide s4" id="s4">
  <div class="slide-number">04 / 10</div>
  <h2>Environment Architecture</h2>
  <div class="two-col">
    <div>
      <h3>OpenEnv Server — State Machine</h3>
      <div style="font-family:monospace; font-size:0.82rem; line-height:2.1; margin-top:8px; color:#cfd8dc;">
        <span style="color:#ef9a9a;">TOSS</span>
        &nbsp;→&nbsp;
        <span style="color:#80cbc4;">BATTING</span>
        &nbsp;→&nbsp;
        <span style="color:#ffcc80;">BOWLING</span>
        &nbsp;→&nbsp;
        <span style="color:#a5d6a7;">RESULT</span>
      </div>
      <div class="signal-grid" style="margin-top:14px;">
        <div class="signal-item"><div class="sk">Markov Engine</div><div class="sv">5-dim key: over × wickets × score_band × phase × bowler_type → (runs, wicket_fell)</div></div>
        <div class="signal-item"><div class="sk">Cricsheet Data</div><div class="sv">Ball-by-ball transition probs from real ODI/T20 matches; synthetic fallback</div></div>
        <div class="signal-item"><div class="sk">Format Mapper</div><div class="sv">T5 / T20 / ODI rules auto-selected by closest max_overs; phase-aware shot weights, batter &amp; bowler roles from <code>format_rules.json</code></div></div>
        <div class="signal-item"><div class="sk">Player Rosters</div><div class="sv">10 T20I team profiles; fuzzy name lookup (exact → surname → word-overlap); real aggression/style fed into select_batter / choose_bowler</div></div>
        <div class="signal-item"><div class="sk">Tool Budget</div><div class="sv">3 overhead calls/over (analyze, reflect, plan_delivery, set_strategy, set_bowling_strategy); −0.04 fine per excess call; plan_shot budget-free</div></div>
        <div class="signal-item"><div class="sk">LLM Opponent</div><div class="sv">google/gemma-4-26B-A4B-it via HF Router (default); graceful heuristic fallback; llm_cached mode for reproducible eval</div></div>
        <div class="signal-item"><div class="sk">DLS Par</div><div class="sv">Duckworth-Lewis par score as objective target; used in r_result</div></div>
        <div class="signal-item"><div class="sk">Concurrent Sessions</div><div class="sv">SUPPORTS_CONCURRENT_SESSIONS = True; max 4 parallel envs</div></div>
      </div>
    </div>
    <div>
      <h3>12 Tools — 4 Categories</h3>
      <div style="margin-top:8px;">
        <p style="font-size:0.8rem; color:#90a4ae; margin-bottom:6px;">PLANNING</p>
        <div class="pill-row" style="margin-top:0;">
          <span class="tag tag-blue">call_toss</span>
          <span class="tag tag-blue">set_match_plan</span>
          <span class="tag tag-blue">update_match_plan</span>
        </div>
        <p style="font-size:0.8rem; color:#90a4ae; margin-top:10px; margin-bottom:6px;">BATTING EXECUTION</p>
        <div class="pill-row" style="margin-top:0;">
          <span class="tag tag-green">set_strategy</span>
          <span class="tag tag-green">plan_shot</span>
          <span class="tag tag-green">play_delivery</span>
        </div>
        <p style="font-size:0.8rem; color:#90a4ae; margin-top:10px; margin-bottom:6px;">BOWLING EXECUTION</p>
        <div class="pill-row" style="margin-top:0;">
          <span class="tag tag-orange">choose_bowler</span>
          <span class="tag tag-orange">set_bowling_strategy</span>
          <span class="tag tag-orange">bowl_delivery</span>
        </div>
        <p style="font-size:0.8rem; color:#90a4ae; margin-top:10px; margin-bottom:6px;">ANALYSIS</p>
        <div class="pill-row" style="margin-top:0;">
          <span class="tag tag-purple">analyze_situation</span>
          <span class="tag tag-purple">reflect_after_ball</span>
        </div>
      </div>
      <div class="hl" style="margin-top:14px; font-size:0.88rem;">
        Tools are <strong>phase-gated</strong> — batting tools unavailable during bowling, etc. Invalid phase = 0 reward turn.
      </div>
    </div>
  </div>
</section>

<!-- ══════════════════════════════════════════════════════════
     SLIDE 5 — Observation Space & Agent Signals
═══════════════════════════════════════════════════════════ -->
<section class="slide s5" id="s5">
  <div class="slide-number">05 / 10</div>
  <h2>What the Agent Sees — Observation Space</h2>
  <div class="two-col">
    <div>
      <h3>Per-Turn Observation (structured JSON + rendered text)</h3>
      <pre><span class="dim">// game_context</span>
{ "over": 14, "ball": 3, "score": 112, "wickets": 2,
  "run_rate": 7.8, "req_rate": 8.4, "phase": "MIDDLE",
  "bowler_type": "spin", "field_setting": "Attacking" }

<span class="dim">// declared_strategy (agent's own prior declaration)</span>
{ "phase_intent": "consolidate", "aggression": 0.35,
  "rationale": "Preserve wickets, build platform" }

<span class="dim">// tool_budget (per-over overhead counter)</span>
{ "overhead_calls_this_over": 1, "budget": 3,
  "remaining": 2, "fines_accumulated": 0.0 }

<span class="dim">// last_outcome</span>
{ "runs": 1, "wicket": false, "extras": 0 }

<span class="dim">// available_tools + tool_history (last 5)</span></pre>
    </div>
    <div>
      <h3>State Fields Used as Reward Signals</h3>
      <div class="signal-grid">
        <div class="signal-item"><div class="sk">coherence_scores[ ]</div><div class="sv">Per-delivery aggression_match × rationale_specificity × phase_fit</div></div>
        <div class="signal-item"><div class="sk">adaptation_scores[ ]</div><div class="sv">Strategy updated after wicket / phase shift; 0 if stuck</div></div>
        <div class="signal-item"><div class="sk">opponent_awareness_scores[ ]</div><div class="sv">Response to opponent's stated field/line changes</div></div>
        <div class="signal-item"><div class="sk">regret_scores[ ]</div><div class="sv">Counterfactual: did agent outperform or underperform heuristic baseline?</div></div>
        <div class="signal-item"><div class="sk">plan_commitment_scores[ ]</div><div class="sv">Keyword overlap: match_plan rationale → delivery rationale</div></div>
        <div class="signal-item"><div class="sk">plan_staleness_penalties[ ]</div><div class="sv">Penalty if plan not refreshed for 2+ overs when context shifted</div></div>
      </div>
      <div class="hl" style="font-size:0.85rem; margin-top:10px;">
        <code>prompt_text</code> is a rendered summary of all of the above — fed directly to the LLM. The declared strategy is extracted back out of this rendered text for stateless GRPO.
      </div>
    </div>
  </div>
</section>

<!-- ══════════════════════════════════════════════════════════
     SLIDE 6 — Reward Architecture (the heart of it)
═══════════════════════════════════════════════════════════ -->
<section class="slide s6" id="s6">
  <div class="slide-number">06 / 10</div>
  <h2>4-Rubric Composite Reward — Hard to Game</h2>
  <table>
    <tr>
      <th>Rubric</th><th>Weight</th><th>Frequency</th><th>Measures</th><th>Key Sub-signals</th>
    </tr>
    <tr>
      <td><code>r_cricket</code></td>
      <td><strong>45%</strong></td>
      <td>Per ball</td>
      <td>Dream11 proxy: runs, wickets, milestones</td>
      <td>dot%, boundary%, 50s/100s, maiden overs, economy</td>
    </tr>
    <tr>
      <td><code>r_behavior</code></td>
      <td><strong>25%</strong></td>
      <td>Every turn</td>
      <td>Declaration-execution alignment</td>
      <td>coherence (50%) + adaptation (20%) + opponent_awareness (20%) + regret (10%)</td>
    </tr>
    <tr>
      <td><code>r_result</code></td>
      <td><strong>20%</strong></td>
      <td>Innings/episode end</td>
      <td>Win/loss vs DLS par, target margin</td>
      <td>score/par, wickets_remaining, lead/deficit, +0.25 progress bonus</td>
    </tr>
    <tr>
      <td><code>r_validity</code></td>
      <td><strong>10%</strong></td>
      <td>Every turn</td>
      <td>Parseable XML/JSON tool call</td>
      <td>Format gate; 0 = parse fail, 1 = valid</td>
    </tr>
  </table>
  <p style="margin-top:8px;font-size:0.9em;color:#888">Rebalanced from 55/25/15/5 → 45/25/20/10 to match the SWE-RL recipe (60% intermediate / 40% terminal). Reasoning: partial-trajectory training rarely fires <code>r_result</code>; weighting it 55% wastes gradient on a near-zero signal.</p>
  <div class="two-col" style="margin-top:18px;">
    <div>
      <h3>Coherence Score Formula (per delivery)</h3>
      <pre><span class="hi">coherence</span> = (
  <span class="kw">aggression_match</span>   <span class="dim"># 1 - |declared_aggression - shot_aggression_proxy|</span>
  × <span class="kw">rationale_specificity</span> <span class="dim"># min(words / 15, 1.0)</span>
  × <span class="kw">phase_appropriate</span>  <span class="dim"># 1.0 if shot fits phase norms, 0.6 otherwise</span>
)</pre>
    </div>
    <div>
      <h3>Single-Stage Training + Format Curriculum</h3>
      <ul>
        <li><strong>Warmup (2–3 over curriculum):</strong> per-scenario <code>max_overs</code> sampled from [2,2,2,2,2,2,3,3,3] so episodes complete in budget and <code>r_result</code> can fire</li>
        <li><strong>Main run (5-over end-to-end):</strong> resumes warmup adapter, trains on target eval distribution</li>
        <li>Qwen3-4B-Instruct-2507 emits <code>&lt;tool_call&gt;...&lt;/tool_call&gt;</code> natively — no Stage 1 SFT needed</li>
        <li>GRPO group size = 4; full episode advantages (TRL <code>environment_factory</code>)</li>
      </ul>
    </div>
  </div>
</section>

<!-- ══════════════════════════════════════════════════════════
     SLIDE 7 — Training Pipeline: Stateless GRPO Trick
═══════════════════════════════════════════════════════════ -->
<section class="slide s7" id="s7">
  <div class="slide-number">07 / 10</div>
  <h2>Training Pipeline — Stateless GRPO</h2>
  <div class="two-col">
    <div>
      <h3>The Core Technical Insight</h3>
      <p style="font-size:0.93rem; margin-bottom:10px;">TRL's GRPOTrainer requires a <strong>stateless</strong> reward function: <code>reward_fn(prompts, completions) → list[float]</code>. No env.step() inside.</p>
      <div class="hl" style="font-size:0.88rem;">
        The strategy the agent declared is embedded in the rendered <code>prompt_text</code> as "Batting Strategy: …". We parse it back with regex — no shared env state needed.
      </div>
      <pre style="margin-top:12px;"><span class="kw">_STRATEGY_RE</span> = re.compile(
  r"<span class="hi">Batting Strategy:\s*(.+)$</span>", re.MULTILINE
)
<span class="kw">_PHASE_RE</span>    = re.compile(
  r"<span class="hi">Phase:\s+(POWERPLAY|MIDDLE|DEATH)</span>", re.I
)

<span class="dim"># r_behavior scored from (prompt, completion) alone</span>
<span class="dim"># r_result injected at episode end → all turns</span></pre>
      <h3 style="margin-top:14px;">Stack</h3>
      <div class="pill-row">
        <span class="badge">gemma-4-26B-A4B-it</span>
        <span class="badge">Unsloth 4-bit LoRA</span>
        <span class="badge">TRL GRPOTrainer</span>
        <span class="badge">MT-GRPO per-turn advantage</span>
      </div>
    </div>
    <div>
      <h3>End-to-End Commands</h3>
      <pre><span class="dim"># 1. Start server</span>
uvicorn server.app:app --port 8766

<span class="dim"># 2. Sanity test (3-over match, heuristic)</span>
python train.py train-smoke \
  --config configs/default.yaml \
  --matches 1 --max-overs 3

<span class="dim"># 3. Live LLM match (HF Router)</span>
python inference.py \
  --config configs/default.yaml \
  --max-overs 3 --opponent-mode llm_live

<span class="dim"># 4. Warmup → Main chained run (auto-resumes adapter)</span>
bash scripts/run_warmup_then_main.sh

<span class="dim"># 5. Eval: untrained vs trained head-to-head</span>
python compare_eval.py --model Qwen/Qwen3-4B-Instruct-2507 \
  --label baseline --episodes 20 --max-overs 5 \
  --output eval_results/baseline.json
python compare_eval.py --model Qwen/Qwen3-4B-Instruct-2507 \
  --adapter ./checkpoints/stage2_final \
  --label trained --episodes 20 --max-overs 5 \
  --output eval_results/trained.json
python compare_eval.py --compare \
  eval_results/baseline.json eval_results/trained.json</pre>
      <div class="wn" style="font-size:0.84rem;">
        All model / API / env settings live in <code>configs/default.yaml</code>. Zero hardcoding.
      </div>
    </div>
  </div>
</section>

<!-- ══════════════════════════════════════════════════════════
     SLIDE 8 — Current Results & Baseline Numbers
═══════════════════════════════════════════════════════════ -->
<section class="slide s8" id="s8">
  <div class="slide-number">08 / 10</div>
  <h2>What We Measured — Baseline Results</h2>
  <div class="four-col">
    <div class="card">
      <div class="value large">0%</div>
      <div class="label">Parse error rate across all 14 runs — heuristic, cricsheet, llm_live opponents</div>
    </div>
    <div class="card">
      <div class="value large">1.0</div>
      <div class="label">r_validity across all 9 train-smoke matches (3 opponent modes × 3 matches)</div>
    </div>
    <div class="card">
      <div class="value large">0.62</div>
      <div class="label">Peak mean coherence (train-smoke, cricsheet opponent, 5-over)</div>
    </div>
    <div class="card">
      <div class="value large">3</div>
      <div class="label">Opponent modes verified end-to-end: heuristic · cricsheet · llm_live</div>
    </div>
  </div>
  <div class="two-col" style="margin-top:20px;">
    <div>
      <h3>What training should produce (target)</h3>
      <ul>
        <li>r_validity: 0.70 → 0.98+ after warmup (25 steps)</li>
        <li>Coherence: ~0.52 (random) → 0.75+ after main run</li>
        <li>analyze_situation calls cluster at over 6, 16, 36 transitions</li>
        <li>Strategy declarations become more specific (&gt;15 word rationales)</li>
        <li>Shot choices match declared aggression level &gt;80% of deliveries</li>
      </ul>
    </div>
    <div>
      <h3>Reward signals verified working ✅</h3>
      <ul>
        <li>plan_commitment_scores populated per delivery</li>
        <li>plan_staleness_penalties active at over-end</li>
        <li>coherence_scores differentiate matching vs mismatching strategies</li>
        <li>adaptation_scores fire on wicket loss &amp; phase transitions</li>
        <li>opponent_awareness_scores respond to field change</li>
      </ul>
      <div class="wn" style="font-size:0.84rem; margin-top:10px;">
        All signals verified. Full reward curves pending GRPO training run. Colab notebook ready.
      </div>
    </div>
  </div>
</section>

<!-- ══════════════════════════════════════════════════════════
     SLIDE 9 — Innovation Summary & Judging Criteria Mapping
═══════════════════════════════════════════════════════════ -->
<section class="slide s9" id="s9">
  <div class="slide-number">09 / 10</div>
  <h2>Why This Scores on Every Judging Dimension</h2>
  <table>
    <tr>
      <th>Criterion</th><th>Weight</th><th>Our Angle</th><th>Evidence</th>
    </tr>
    <tr>
      <td><strong>Environment Innovation</strong></td>
      <td>40%</td>
      <td>First RL env purpose-built for WDCT / declaration-execution alignment</td>
      <td>12-tool multi-phase, multi-agent, 300-decision episodes with Cricsheet Markov engine</td>
    </tr>
    <tr>
      <td><strong>Storytelling</strong></td>
      <td>30%</td>
      <td>Clear capability gap → environment design → reward signal → training → measurable WDCT improvement</td>
      <td>README, this slide deck, Colab notebook, Gradio live demo</td>
    </tr>
    <tr>
      <td><strong>Improvement in Rewards</strong></td>
      <td>20%</td>
      <td>Warmup → main-run curriculum produces an observable r_validity spike, then a coherence rise</td>
      <td>Colab plots, before/after tool call samples, coherence heatmap</td>
    </tr>
    <tr>
      <td><strong>Reward &amp; Pipeline</strong></td>
      <td>10%</td>
      <td>4-rubric composite, hard to game (r_result at episode end enforces real match outcomes)</td>
      <td>server/reward_calculator.py, server/coherence_grader.py, stateless GRPO reward fn</td>
    </tr>
  </table>
  <div class="two-col" style="margin-top:16px;">
    <div class="hl">
      <strong>Unique technical contributions:</strong><br>
      Stateless GRPO via prompt-text parsing · Plan staleness penalty · Per-delivery commitment scoring · Phase-gated tool availability · LLM opponent via HF Router · Format-aware rules (T5/T20/ODI) · Tool budget + fine system · Real player roster lookup
    </div>
    <div class="gr">
      <strong>Minimum requirements met:</strong><br>
      ✅ OpenEnv latest &nbsp; ✅ TRL/Unsloth Colab &nbsp; ✅ HF Space (ready) &nbsp; ✅ README with results<br>
      ✅ 3 opponent modes verified (heuristic · cricsheet · llm_live) &nbsp; ⚠️ Blog / video post-training
    </div>
  </div>
</section>

<!-- ══════════════════════════════════════════════════════════
     SLIDE 10 — Roadmap & The Money Shot
═══════════════════════════════════════════════════════════ -->
<section class="slide s10" id="s10">
  <div class="slide-number">10 / 10</div>
  <h2>Roadmap to Submission</h2>
  <div class="two-col">
    <div>
      <h3>🔴 Critical Path (on-site, Day 1–2)</h3>
      <ul>
        <li>Run Colab notebook (notebooks/colab_train_minimal.ipynb) → warmup → main chained training</li>
        <li>Export: reward_curves.png, coherence_heatmap.png, tool_timeline.png</li>
        <li>Deploy to HuggingFace Spaces → live interactive Gradio demo URL</li>
        <li>Add HF Space URL + plot images to README</li>
        <li>Write 500-word mini-blog on HF (problem → env → results)</li>
        <li>Run Cricsheet data curation (<code>scripts/curate_transitions.py</code>) for real ball probs</li>
      </ul>
    </div>
    <div>
      <h3>💰 The Money Shot for Judges</h3>
      <div class="hl">
        A heatmap: <strong>episode × delivery coherence score</strong>, showing the gradient rising from ~0.35 (random) toward 0.75+ as training progresses. This directly visualizes the declared coherence improvement.
      </div>
      <h3 style="margin-top:16px;">🟡 Stretch (improves score)</h3>
      <ul>
        <li>WDCT before/after comparison on canonical states (Over 35, 180/3)</li>
        <li>Opponent cache for reproducible eval without API calls</li>
        <li>&lt;2 min screen demo video (Gradio UI + reward curve walkthrough)</li>
      </ul>
      <div style="margin-top:16px; font-size:0.82rem; color:#546e7a;">
        github.com/[team]/cricket-captain-llm &nbsp;·&nbsp; huggingface.co/spaces/[team]/cricket-captain
      </div>
    </div>
  </div>
</section>
</div><!-- .deck -->

<div class="progress-bar" id="prog"></div>
<div class="nav">
  <button type="button" id="prev" disabled>← Prev</button>
  <button type="button" id="next">Next →</button>
</div>
<div class="slide-hint">← → arrow keys to navigate</div>

<script>
  /*
   * Slide-deck controller.
   *
   * Exactly one .slide element carries the "active" class at a time (the CSS
   * hides every other slide). This script moves that class in response to the
   * Prev/Next buttons and the arrow keys, and keeps the buttons' disabled
   * state and the #prog progress-bar width in sync with the current slide.
   */
  const slides  = document.querySelectorAll('.slide');
  const prog    = document.getElementById('prog');
  const prevBtn = document.getElementById('prev');
  const nextBtn = document.getElementById('next');

  // Index of the slide currently shown. Starts at 0 because the markup puts
  // the initial "active" class on the first slide.
  let cur = 0;

  /** Reflect `cur` in the Prev/Next disabled state and the progress-bar width. */
  function syncControls() {
    prevBtn.disabled = (cur === 0);
    nextBtn.disabled = (cur === slides.length - 1);
    prog.style.width = ((cur + 1) / slides.length * 100) + '%';
  }

  /**
   * Move `dir` slides forward (positive) or backward (negative), clamped to
   * the first/last slide.
   *
   * @param {number} dir  Signed number of slides to move.
   * @returns {boolean}   true if a different slide is now shown, false if the
   *                      deck was already at the boundary (or has no slides).
   */
  function go(dir) {
    const target = Math.max(0, Math.min(slides.length - 1, cur + dir));

    // Boundary no-op: leave the DOM and the scroll position alone so that a
    // reader who has scrolled down a tall first/last slide is not snapped back
    // to the top. This also covers an empty deck, where slides[cur] would be
    // undefined.
    if (target === cur) return false;

    slides[cur].classList.remove('active');
    cur = target;
    slides[cur].classList.add('active');
    syncControls();
    window.scrollTo(0, 0);  // each newly shown slide starts at its top
    return true;
  }

  prevBtn.addEventListener('click', () => go(-1));
  nextBtn.addEventListener('click', () => go(1));

  document.addEventListener('keydown', e => {
    // Leave browser/OS chords (e.g. Alt+← = history back) untouched.
    if (e.altKey || e.ctrlKey || e.metaKey) return;

    let moved = false;
    if (e.key === 'ArrowRight' || e.key === 'ArrowDown') moved = go(1);
    else if (e.key === 'ArrowLeft' || e.key === 'ArrowUp') moved = go(-1);

    // The key was consumed as navigation: suppress its default scroll so the
    // new slide is not nudged away from the top. When nothing moved (deck
    // boundary) the default scroll is kept.
    if (moved) e.preventDefault();
  });

  // Initial state for slide 1.
  syncControls();
</script>
</body>
</html>