<!--
cricket-captain-llm / docs /slides.html
pratinavseth's picture
sync: pull latest from main (model_server.py, captain LLM toggle in ui.py, 0.6B configs, SUBMISSION + RUNTIME_DURABILITY docs)
e70c305 verified
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>CricketCaptain-LLM — OpenEnv Hackathon 2026</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: 'Segoe UI', system-ui, sans-serif; background: #0f1117; color: #e8eaf6; }
.deck { width: 100%; }
/* Every slide is hidden by default. The deck script at the bottom of the page
   moves the `active` class from the current slide to the target slide, and
   `.slide.active` below switches that one slide to a full-height flex column. */
.slide {
display: none;
min-height: 100vh;
padding: 52px 80px;
flex-direction: column;
justify-content: center;
position: relative; /* positioning context for the absolutely placed .slide-number */
overflow: hidden;
}
/* The single visible slide: same box as .slide, but rendered as a flex container. */
.slide.active { display: flex; }
.s1 { background: linear-gradient(135deg, #0d1b2a 0%, #0f2744 100%); }
.s2 { background: linear-gradient(135deg, #0a1e0a 0%, #0e2d12 100%); }
.s3 { background: linear-gradient(135deg, #0d1e3a 0%, #112952 100%); }
.s4 { background: linear-gradient(135deg, #1a0930 0%, #2c1060 100%); }
.s5 { background: linear-gradient(135deg, #1e0a06 0%, #3d1008 100%); }
.s6 { background: linear-gradient(135deg, #001d2e 0%, #003050 100%); }
.s7 { background: linear-gradient(135deg, #0e1e0e 0%, #1a3a1a 100%); }
.s8 { background: linear-gradient(135deg, #1a1400 0%, #332800 100%); }
.s9 { background: linear-gradient(135deg, #001a1a 0%, #003030 100%); }
.s10 { background: linear-gradient(135deg, #0d1b2a 0%, #0f2744 100%); }
.slide-number {
position: absolute; top: 22px; right: 36px;
font-size: 12px; color: rgba(255,255,255,0.30); letter-spacing: 2px;
font-family: monospace;
}
h1 { font-size: 2.9rem; font-weight: 700; line-height: 1.15; margin-bottom: 14px; }
h2 { font-size: 1.85rem; font-weight: 600; margin-bottom: 20px; color: #90caf9; }
h3 { font-size: 1.1rem; font-weight: 600; margin-bottom: 8px; color: #80deea; }
p { font-size: 1.05rem; line-height: 1.65; color: #cfd8dc; max-width: 860px; }
.subtitle { font-size: 1.25rem; color: #90caf9; margin-bottom: 28px; font-weight: 400; max-width: 700px; }
.tagline { font-size: 1.4rem; color: #a5d6a7; font-style: italic; margin-top: 18px; }
.two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 40px; margin-top: 8px; }
.three-col { display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 24px; margin-top: 8px; }
.four-col { display: grid; grid-template-columns: 1fr 1fr 1fr 1fr; gap: 18px; margin-top: 8px; }
.card {
background: rgba(255,255,255,0.06);
border: 1px solid rgba(255,255,255,0.12);
border-radius: 12px;
padding: 20px;
}
.card .value { font-size: 2rem; font-weight: 700; color: #80cbc4; }
.card .value.large { font-size: 2.6rem; }
.card .label { font-size: 0.85rem; color: #90a4ae; margin-top: 4px; line-height: 1.4; }
ul { list-style: none; margin-top: 6px; }
ul li { padding: 5px 0; padding-left: 20px; position: relative; color: #cfd8dc; font-size: 1.0rem; line-height: 1.5; }
ul li::before { content: "▸"; position: absolute; left: 0; color: #4db6ac; }
.tag {
display: inline-block; padding: 2px 9px; border-radius: 4px;
font-size: 0.76rem; font-weight: 600; letter-spacing: 0.4px; margin: 2px 3px;
}
.tag-green { background: #1b5e20; color: #a5d6a7; }
.tag-orange { background: #bf360c; color: #ffe0b2; }
.tag-blue { background: #0d47a1; color: #bbdefb; }
.tag-purple { background: #4a148c; color: #e1bee7; }
.tag-red { background: #b71c1c; color: #ffcdd2; }
.tag-teal { background: #004d40; color: #b2dfdb; }
.tag-yellow { background: #f57f17; color: #fff9c4; }
table { width: 100%; border-collapse: collapse; margin-top: 14px; font-size: 0.92rem; }
th { background: rgba(255,255,255,0.09); padding: 9px 13px; text-align: left; color: #b0bec5; font-weight: 600; }
td { padding: 8px 13px; border-bottom: 1px solid rgba(255,255,255,0.06); color: #cfd8dc; }
tr:last-child td { border-bottom: none; }
tr:hover td { background: rgba(255,255,255,0.03); }
code {
background: rgba(255,255,255,0.1); border-radius: 4px;
padding: 1px 6px; font-family: 'Cascadia Code', 'Fira Code', monospace;
font-size: 0.85em; color: #80cbc4;
}
pre {
background: rgba(0,0,0,0.45); border-radius: 8px; padding: 14px 18px;
font-family: 'Cascadia Code', 'Fira Code', monospace; font-size: 0.80rem;
color: #a5d6a7; line-height: 1.55; overflow-x: auto; margin-top: 10px;
border: 1px solid rgba(255,255,255,0.07);
}
pre .dim { color: #546e7a; }
pre .hi { color: #ffcc80; }
pre .kw { color: #80cbc4; }
/* Thin progress strip pinned to the bottom-left of the viewport. No width is
   declared here: the deck script sets `style.width` to the percentage of
   slides reached, and the transition below animates each change. */
.progress-bar {
position: fixed; bottom: 0; left: 0; height: 3px;
background: linear-gradient(90deg, #4db6ac, #7c4dff, #ef5350);
transition: width 0.35s ease;
z-index: 200; /* above .nav (z-index: 100) */
}
.nav {
position: fixed; bottom: 22px; right: 36px;
display: flex; gap: 10px; z-index: 100;
}
.nav button {
background: rgba(255,255,255,0.10); border: 1px solid rgba(255,255,255,0.18);
color: #fff; padding: 9px 20px; border-radius: 6px; cursor: pointer;
font-size: 0.88rem; transition: background 0.2s;
}
.nav button:hover { background: rgba(255,255,255,0.20); }
.nav button:disabled { opacity: 0.25; cursor: default; }
.slide-hint {
position: fixed; bottom: 26px; left: 50%; transform: translateX(-50%);
font-size: 11px; color: rgba(255,255,255,0.22); letter-spacing: 1px;
}
.hl { background: rgba(77,182,172,0.13); border-left: 3px solid #4db6ac; padding: 14px 18px; border-radius: 0 8px 8px 0; margin-top: 14px; }
.wn { background: rgba(255,152,0,0.12); border-left: 3px solid #ff9800; padding: 14px 18px; border-radius: 0 8px 8px 0; margin-top: 14px; font-size: 0.93rem; }
.gr { background: rgba(100,221,23,0.09); border-left: 3px solid #69f0ae; padding: 14px 18px; border-radius: 0 8px 8px 0; margin-top: 14px; }
.pill-row { display: flex; flex-wrap: wrap; gap: 7px; margin-top: 10px; }
.badge {
display: inline-flex; align-items: center; gap: 6px;
background: rgba(255,255,255,0.07); border: 1px solid rgba(255,255,255,0.13);
padding: 5px 12px; border-radius: 20px; font-size: 0.82rem; color: #b0bec5;
}
.score-bar { margin: 5px 0; }
.score-bar .bar-wrap { background: rgba(255,255,255,0.08); border-radius: 4px; height: 8px; margin-top: 3px; }
.score-bar .bar-fill { height: 8px; border-radius: 4px; }
.signal-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; margin-top: 10px; }
.signal-item { background: rgba(255,255,255,0.04); border: 1px solid rgba(255,255,255,0.08); border-radius: 8px; padding: 10px 14px; font-size: 0.88rem; }
.signal-item .sk { color: #80deea; font-weight: 600; margin-bottom: 3px; }
.signal-item .sv { color: #90a4ae; font-size: 0.82rem; }
</style>
</head>
<body>
<!-- ══════════════════════════════════════════════════════════
SLIDE 1 — Title & Hook
═══════════════════════════════════════════════════════════ -->
<div class="deck">
<section class="slide s1 active" id="s1">
<div class="slide-number">01 / 10</div>
<p style="font-size:2.6rem; margin-bottom:6px;">🏏</p>
<h1>CricketCaptain-LLM</h1>
<p class="subtitle">A multi-agent RL training environment for <strong>strategic coherence</strong> — teaching LLMs to mean what they say across 300 consecutive decisions.</p>
<p class="tagline">"I'll consolidate and preserve wickets" → then actually plays defensively for 4 overs.</p>
<div style="margin-top:28px;">
<p style="color:#90a4ae; font-size:0.95rem; margin-bottom:10px;">Targets: Theme #1 Multi-Agent Interaction + Theme #2 Long-Horizon Planning</p>
<div class="pill-row">
<span class="badge">🌐 OpenEnv ≥ 0.2.2</span>
<span class="badge">⚡ TRL MT-GRPO</span>
<span class="badge">🤖 google/gemma-4-26B-A4B-it</span>
<span class="badge">🔁 HF Router</span>
<span class="badge">📊 Cricsheet Markov Engine</span>
<span class="badge">🎯 WDCT Benchmark</span>
</div>
</div>
</section>
<!-- ══════════════════════════════════════════════════════════
SLIDE 2 — The Problem: What LLMs Can't Do
═══════════════════════════════════════════════════════════ -->
<section class="slide s2" id="s2">
<div class="slide-number">02 / 10</div>
<h2>The Gap We're Closing</h2>
<div class="two-col">
<div>
<h3>WDCT Benchmark — Words &amp; Deeds Consistency</h3>
<p style="font-size:0.95rem; margin-bottom:12px;">arxiv:2503.07003 — the only public benchmark directly measuring whether LLMs execute what they declare.</p>
<div class="four-col" style="grid-template-columns: 1fr 1fr; gap: 12px; margin-top:10px;">
<div class="card"><div class="value">0.49</div><div class="label">Smaller models (7B class)</div></div>
<div class="card"><div class="value">0.76</div><div class="label">GPT-4 (best published)</div></div>
</div>
<div class="hl" style="margin-top:14px;">
<strong>No RL training environment has targeted this benchmark directly.</strong><br>
We built one.
</div>
</div>
<div>
<h3>Why It Matters Beyond Cricket</h3>
<ul>
<li>Planning agents that can't commit to strategy fail silently</li>
<li>Reasoning traces are only useful if they predict the next action</li>
<li>Chain-of-thought gains are undermined by declaration-execution drift</li>
<li>Every agentic system suffers from this; no training environment targets it</li>
</ul>
<div class="wn" style="margin-top:14px;">
A model that <em>says</em> "preserve wickets" but <em>plays</em> aggressive shots hasn't learned strategy — it's learned to <em>sound</em> strategic.
</div>
</div>
</div>
</section>
<!-- ══════════════════════════════════════════════════════════
SLIDE 3 — Why Cricket: The Perfect Domain
═══════════════════════════════════════════════════════════ -->
<section class="slide s3" id="s3">
<div class="slide-number">03 / 10</div>
<h2>Why Cricket Is the Right Stress Test</h2>
<div class="two-col">
<div>
<table style="margin-top:0;">
<tr><th>Property</th><th>Capability Forced</th></tr>
<tr><td>300 consecutive decisions (50×6)</td><td>Long-horizon coherence</td></tr>
<tr><td>10 wickets as irreversible budget</td><td>Consequence-aware risk planning</td></tr>
<tr><td>Powerplay → Middle → Death phases</td><td>Strategic revision at regime shifts</td></tr>
<tr><td>DLS par = ground-truth optimal score</td><td>Objective performance signal</td></tr>
<tr><td>"Declare strategy, then play shots"</td><td>Declaration-execution directly testable</td></tr>
<tr><td>LLM opponent (HF Router)</td><td>Theory-of-mind / opponent modeling</td></tr>
<tr><td>Full match: bat &amp; bowl both innings</td><td>End-to-end role adaptation</td></tr>
</table>
</div>
<div>
<h3>Why Not Chess / Math / Coding?</h3>
<ul>
<li>Chess: no natural language declarations; coherence untestable</li>
<li>Math: single-step; no 300-turn consistency requirement</li>
<li>Coding: rare phase transitions; no risk budget exhaustion</li>
<li>Cricket: declarations are mandatory tool calls, shots are mandatory tool calls — alignment is <em>structurally enforced</em></li>
</ul>
<div class="gr">
Cricket is not the goal. It's the <em>measurement apparatus</em> for a capability that transfers to every agentic domain.
</div>
</div>
</div>
</section>
<!-- ══════════════════════════════════════════════════════════
SLIDE 4 — Environment Architecture & State Machine
═══════════════════════════════════════════════════════════ -->
<section class="slide s4" id="s4">
<div class="slide-number">04 / 10</div>
<h2>Environment Architecture</h2>
<div class="two-col">
<div>
<h3>OpenEnv Server — State Machine</h3>
<div style="font-family:monospace; font-size:0.82rem; line-height:2.1; margin-top:8px; color:#cfd8dc;">
<span style="color:#ef9a9a;">TOSS</span>
&nbsp;&nbsp;
<span style="color:#80cbc4;">BATTING</span>
&nbsp;&nbsp;
<span style="color:#ffcc80;">BOWLING</span>
&nbsp;&nbsp;
<span style="color:#a5d6a7;">RESULT</span>
</div>
<div class="signal-grid" style="margin-top:14px;">
<div class="signal-item"><div class="sk">Markov Engine</div><div class="sv">5-dim key: over × wickets × score_band × phase × bowler_type → (runs, wicket_fell)</div></div>
<div class="signal-item"><div class="sk">Cricsheet Data</div><div class="sv">Ball-by-ball transition probs from real ODI/T20 matches; synthetic fallback</div></div>
<div class="signal-item"><div class="sk">Format Mapper</div><div class="sv">T5 / T20 / ODI rules auto-selected by closest max_overs; phase-aware shot weights, batter &amp; bowler roles from <code>format_rules.json</code></div></div>
<div class="signal-item"><div class="sk">Player Rosters</div><div class="sv">10 T20I team profiles; fuzzy name lookup (exact → surname → word-overlap); real aggression/style fed into select_batter / choose_bowler</div></div>
<div class="signal-item"><div class="sk">Tool Budget</div><div class="sv">3 overhead calls/over (analyze, reflect, plan_delivery, set_strategy, set_bowling_strategy); −0.04 fine per excess call; plan_shot budget-free</div></div>
<div class="signal-item"><div class="sk">LLM Opponent</div><div class="sv">google/gemma-4-26B-A4B-it via HF Router (default); graceful heuristic fallback; llm_cached mode for reproducible eval</div></div>
<div class="signal-item"><div class="sk">DLS Par</div><div class="sv">Duckworth-Lewis par score as objective target; used in r_result</div></div>
<div class="signal-item"><div class="sk">Concurrent Sessions</div><div class="sv">SUPPORTS_CONCURRENT_SESSIONS = True; max 4 parallel envs</div></div>
</div>
</div>
<div>
<h3>12 Tools — 4 Categories</h3>
<div style="margin-top:8px;">
<p style="font-size:0.8rem; color:#90a4ae; margin-bottom:6px;">PLANNING</p>
<div class="pill-row" style="margin-top:0;">
<span class="tag tag-blue">call_toss</span>
<span class="tag tag-blue">set_match_plan</span>
<span class="tag tag-blue">update_match_plan</span>
</div>
<p style="font-size:0.8rem; color:#90a4ae; margin-top:10px; margin-bottom:6px;">BATTING EXECUTION</p>
<div class="pill-row" style="margin-top:0;">
<span class="tag tag-green">set_strategy</span>
<span class="tag tag-green">plan_shot</span>
<span class="tag tag-green">play_delivery</span>
</div>
<p style="font-size:0.8rem; color:#90a4ae; margin-top:10px; margin-bottom:6px;">BOWLING EXECUTION</p>
<div class="pill-row" style="margin-top:0;">
<span class="tag tag-orange">choose_bowler</span>
<span class="tag tag-orange">set_bowling_strategy</span>
<span class="tag tag-orange">bowl_delivery</span>
</div>
<p style="font-size:0.8rem; color:#90a4ae; margin-top:10px; margin-bottom:6px;">ANALYSIS</p>
<div class="pill-row" style="margin-top:0;">
<span class="tag tag-purple">analyze_situation</span>
<span class="tag tag-purple">reflect_after_ball</span>
</div>
</div>
<div class="hl" style="margin-top:14px; font-size:0.88rem;">
Tools are <strong>phase-gated</strong> — batting tools unavailable during bowling, etc. Invalid phase = 0 reward turn.
</div>
</div>
</div>
</section>
<!-- ══════════════════════════════════════════════════════════
SLIDE 5 — Observation Space & Agent Signals
═══════════════════════════════════════════════════════════ -->
<section class="slide s5" id="s5">
<div class="slide-number">05 / 10</div>
<h2>What the Agent Sees — Observation Space</h2>
<div class="two-col">
<div>
<h3>Per-Turn Observation (structured JSON + rendered text)</h3>
<pre><span class="dim">// game_context</span>
{ "over": 14, "ball": 3, "score": 112, "wickets": 2,
"run_rate": 7.8, "req_rate": 8.4, "phase": "MIDDLE",
"bowler_type": "spin", "field_setting": "Attacking" }
<span class="dim">// declared_strategy (agent's own prior declaration)</span>
{ "phase_intent": "consolidate", "aggression": 0.35,
"rationale": "Preserve wickets, build platform" }
<span class="dim">// tool_budget (per-over overhead counter)</span>
{ "overhead_calls_this_over": 1, "budget": 3,
"remaining": 2, "fines_accumulated": 0.0 }
<span class="dim">// last_outcome</span>
{ "runs": 1, "wicket": false, "extras": 0 }
<span class="dim">// available_tools + tool_history (last 5)</span></pre>
</div>
<div>
<h3>State Fields Used as Reward Signals</h3>
<div class="signal-grid">
<div class="signal-item"><div class="sk">coherence_scores[ ]</div><div class="sv">Per-delivery aggression_match × rationale_specificity × phase_fit</div></div>
<div class="signal-item"><div class="sk">adaptation_scores[ ]</div><div class="sv">Strategy updated after wicket / phase shift; 0 if stuck</div></div>
<div class="signal-item"><div class="sk">opponent_awareness_scores[ ]</div><div class="sv">Response to opponent's stated field/line changes</div></div>
<div class="signal-item"><div class="sk">regret_scores[ ]</div><div class="sv">Counterfactual: did agent outperform or underperform heuristic baseline?</div></div>
<div class="signal-item"><div class="sk">plan_commitment_scores[ ]</div><div class="sv">Keyword overlap: match_plan rationale → delivery rationale</div></div>
<div class="signal-item"><div class="sk">plan_staleness_penalties[ ]</div><div class="sv">Penalty if plan not refreshed for 2+ overs when context shifted</div></div>
</div>
<div class="hl" style="font-size:0.85rem; margin-top:10px;">
<code>prompt_text</code> is a rendered summary of all above — fed directly to the LLM. Strategy extracted from rendered text for stateless GRPO.
</div>
</div>
</div>
</section>
<!-- ══════════════════════════════════════════════════════════
SLIDE 6 — Reward Architecture (the heart of it)
═══════════════════════════════════════════════════════════ -->
<section class="slide s6" id="s6">
<div class="slide-number">06 / 10</div>
<h2>4-Rubric Composite Reward — Hard to Game</h2>
<table>
<tr>
<th>Rubric</th><th>Weight</th><th>Frequency</th><th>Measures</th><th>Key Sub-signals</th>
</tr>
<tr>
<td><code>r_cricket</code></td>
<td><strong>45%</strong></td>
<td>Per ball</td>
<td>Dream11 proxy: runs, wickets, milestones</td>
<td>dot%, boundary%, 50s/100s, maiden overs, economy</td>
</tr>
<tr>
<td><code>r_behavior</code></td>
<td><strong>25%</strong></td>
<td>Every turn</td>
<td>Declaration-execution alignment</td>
<td>coherence (50%) + adaptation (20%) + opponent_awareness (20%) + regret (10%)</td>
</tr>
<tr>
<td><code>r_result</code></td>
<td><strong>20%</strong></td>
<td>Innings/episode end</td>
<td>Win/loss vs DLS par, target margin</td>
<td>score/par, wickets_remaining, lead/deficit, +0.25 progress bonus</td>
</tr>
<tr>
<td><code>r_validity</code></td>
<td><strong>10%</strong></td>
<td>Every turn</td>
<td>Parseable XML/JSON tool call</td>
<td>Format gate; 0 = parse fail, 1 = valid</td>
</tr>
</table>
<p style="margin-top:8px;font-size:0.9em;color:#888">Rebalanced from 55/25/15/5 → 45/25/20/10 to match the SWE-RL recipe (60% intermediate / 40% terminal). Reasoning: partial-trajectory training rarely fires <code>r_result</code>; weighting it 55% wastes gradient on a near-zero signal.</p>
<div class="two-col" style="margin-top:18px;">
<div>
<h3>Coherence Score Formula (per delivery)</h3>
<pre><span class="hi">coherence</span> = (
<span class="kw">aggression_match</span> <span class="dim"># 1 - |declared_aggression - shot_aggression_proxy|</span>
× <span class="kw">rationale_specificity</span> <span class="dim"># min(words / 15, 1.0)</span>
× <span class="kw">phase_appropriate</span> <span class="dim"># 1.0 if shot fits phase norms, 0.6 otherwise</span>
)</pre>
</div>
<div>
<h3>Single-Stage Training + Format Curriculum</h3>
<ul>
<li><strong>Warmup (2–3 over curriculum):</strong> per-scenario <code>max_overs</code> sampled from [2,2,2,2,2,2,3,3,3] so episodes complete in budget and <code>r_result</code> can fire</li>
<li><strong>Main run (5-over end-to-end):</strong> resumes warmup adapter, trains on target eval distribution</li>
<li>Qwen3-4B-Instruct-2507 emits <code>&lt;tool_call&gt;...&lt;/tool_call&gt;</code> natively — no Stage 1 SFT needed</li>
<li>GRPO group size = 4; full episode advantages (TRL <code>environment_factory</code>)</li>
</ul>
</div>
</div>
</section>
<!-- ══════════════════════════════════════════════════════════
SLIDE 7 — Training Pipeline: Stateless GRPO Trick
═══════════════════════════════════════════════════════════ -->
<section class="slide s7" id="s7">
<div class="slide-number">07 / 10</div>
<h2>Training Pipeline — Stateless GRPO</h2>
<div class="two-col">
<div>
<h3>The Core Technical Insight</h3>
<p style="font-size:0.93rem; margin-bottom:10px;">TRL's GRPOTrainer requires a <strong>stateless</strong> reward function: <code>reward_fn(prompts, completions) → list[float]</code>. No env.step() inside.</p>
<div class="hl" style="font-size:0.88rem;">
The strategy the agent declared is embedded in the rendered <code>prompt_text</code> as "Batting Strategy: …". We parse it back with regex — no shared env state needed.
</div>
<pre style="margin-top:12px;"><span class="kw">_STRATEGY_RE</span> = re.compile(
r"<span class="hi">Batting Strategy:\s*(.+)$</span>", re.MULTILINE
)
<span class="kw">_PHASE_RE</span> = re.compile(
r"<span class="hi">Phase:\s+(POWERPLAY|MIDDLE|DEATH)</span>", re.I
)
<span class="dim"># r_behavior scored from (prompt, completion) alone</span>
<span class="dim"># r_result injected at episode end → all turns</span></pre>
<h3 style="margin-top:14px;">Stack</h3>
<div class="pill-row">
<span class="badge">gemma-4-26B-A4B-it</span>
<span class="badge">Unsloth 4-bit LoRA</span>
<span class="badge">TRL GRPOTrainer</span>
<span class="badge">MT-GRPO per-turn advantage</span>
</div>
</div>
<div>
<h3>End-to-End Commands</h3>
<pre><span class="dim"># 1. Start server</span>
uvicorn server.app:app --port 8766
<span class="dim"># 2. Sanity test (3-over match, heuristic)</span>
python train.py train-smoke \
--config configs/default.yaml \
--matches 1 --max-overs 3
<span class="dim"># 3. Live LLM match (HF Router)</span>
python inference.py \
--config configs/default.yaml \
--max-overs 3 --opponent-mode llm_live
<span class="dim"># 4. Warmup → Main chained run (auto-resumes adapter)</span>
bash scripts/run_warmup_then_main.sh
<span class="dim"># 5. Eval: untrained vs trained head-to-head</span>
python compare_eval.py --model Qwen/Qwen3-4B-Instruct-2507 \
--label baseline --episodes 20 --max-overs 5 \
--output eval_results/baseline.json
python compare_eval.py --model Qwen/Qwen3-4B-Instruct-2507 \
--adapter ./checkpoints/stage2_final \
--label trained --episodes 20 --max-overs 5 \
--output eval_results/trained.json
python compare_eval.py --compare \
eval_results/baseline.json eval_results/trained.json</pre>
<div class="wn" style="font-size:0.84rem;">
All model / API / env settings live in <code>configs/default.yaml</code>. Zero hardcoding.
</div>
</div>
</div>
</section>
<!-- ══════════════════════════════════════════════════════════
SLIDE 8 — Current Results & Baseline Numbers
═══════════════════════════════════════════════════════════ -->
<section class="slide s8" id="s8">
<div class="slide-number">08 / 10</div>
<h2>What We Measured — Baseline Results</h2>
<div class="four-col">
<div class="card">
<div class="value large">0%</div>
<div class="label">Parse error rate across all 14 runs — heuristic, cricsheet, llm_live opponents</div>
</div>
<div class="card">
<div class="value large">1.0</div>
<div class="label">r_validity across all 9 train-smoke matches (3 opponent modes × 3 matches)</div>
</div>
<div class="card">
<div class="value large">0.62</div>
<div class="label">Peak mean coherence (train-smoke, cricsheet opponent, 5-over)</div>
</div>
<div class="card">
<div class="value large">3</div>
<div class="label">Opponent modes verified end-to-end: heuristic · cricsheet · llm_live</div>
</div>
</div>
<div class="two-col" style="margin-top:20px;">
<div>
<h3>What training should produce (target)</h3>
<ul>
<li>r_validity: 0.70 → 0.98+ after warmup (25 steps)</li>
<li>Coherence: ~0.52 (random) → 0.75+ after main run</li>
<li>analyze_situation calls cluster at over 6, 16, 36 transitions</li>
<li>Strategy declarations become more specific (&gt;15 word rationales)</li>
<li>Shot choices match declared aggression level &gt;80% of deliveries</li>
</ul>
</div>
<div>
<h3>Reward signals verified working ✅</h3>
<ul>
<li>plan_commitment_scores populated per delivery</li>
<li>plan_staleness_penalties active at over-end</li>
<li>coherence_scores differentiate matching vs mismatching strategies</li>
<li>adaptation_scores fire on wicket loss &amp; phase transitions</li>
<li>opponent_awareness_scores respond to field change</li>
</ul>
<div class="wn" style="font-size:0.84rem; margin-top:10px;">
All signals verified. Full reward curves pending GRPO training run. Colab notebook ready.
</div>
</div>
</div>
</section>
<!-- ══════════════════════════════════════════════════════════
SLIDE 9 — Innovation Summary & Judging Criteria Mapping
═══════════════════════════════════════════════════════════ -->
<section class="slide s9" id="s9">
<div class="slide-number">09 / 10</div>
<h2>Why This Scores on Every Judging Dimension</h2>
<table>
<tr>
<th>Criterion</th><th>Weight</th><th>Our Angle</th><th>Evidence</th>
</tr>
<tr>
<td><strong>Environment Innovation</strong></td>
<td>40%</td>
<td>First RL env purpose-built for WDCT / declaration-execution alignment</td>
<td>12-tool multi-phase, multi-agent, 300-decision episodes with Cricsheet Markov engine</td>
</tr>
<tr>
<td><strong>Storytelling</strong></td>
<td>30%</td>
<td>Clear capability gap → environment design → reward signal → training → measurable WDCT improvement</td>
<td>README, this slide deck, Colab notebook, Gradio live demo</td>
</tr>
<tr>
<td><strong>Improvement in Rewards</strong></td>
<td>20%</td>
<td>Warmup → main curriculum produces an observable r_validity spike, then a coherence rise</td>
<td>Colab plots, before/after tool call samples, coherence heatmap</td>
</tr>
<tr>
<td><strong>Reward &amp; Pipeline</strong></td>
<td>10%</td>
<td>4-rubric composite, hard to game (r_result at episode end enforces real match outcomes)</td>
<td>server/reward_calculator.py, server/coherence_grader.py, stateless GRPO reward fn</td>
</tr>
</table>
<div class="two-col" style="margin-top:16px;">
<div class="hl">
<strong>Unique technical contributions:</strong><br>
Stateless GRPO via prompt-text parsing · Plan staleness penalty · Per-delivery commitment scoring · Phase-gated tool availability · LLM opponent via HF Router · Format-aware rules (T5/T20/ODI) · Tool budget + fine system · Real player roster lookup
</div>
<div class="gr">
<strong>Minimum requirements met:</strong><br>
✅ OpenEnv latest &nbsp; ✅ TRL/Unsloth Colab &nbsp; ✅ HF Space (ready) &nbsp; ✅ README with results<br>
✅ 3 opponent modes verified (heuristic · cricsheet · llm_live) &nbsp; ⚠️ Blog / video post-training
</div>
</div>
</section>
<!-- ══════════════════════════════════════════════════════════
SLIDE 10 — Roadmap & The Money Shot
═══════════════════════════════════════════════════════════ -->
<section class="slide s10" id="s10">
<div class="slide-number">10 / 10</div>
<h2>Roadmap to Submission</h2>
<div class="two-col">
<div>
<h3>🔴 Critical Path (on-site, Day 1–2)</h3>
<ul>
<li>Run Colab notebook (notebooks/colab_train_minimal.ipynb) → warmup → main chained training</li>
<li>Export: reward_curves.png, coherence_heatmap.png, tool_timeline.png</li>
<li>Deploy to HuggingFace Spaces → live interactive Gradio demo URL</li>
<li>Add HF Space URL + plot images to README</li>
<li>Write 500-word mini-blog on HF (problem → env → results)</li>
<li>Run Cricsheet data curation (<code>scripts/curate_transitions.py</code>) for real ball probs</li>
</ul>
</div>
<div>
<h3>💰 The Money Shot for Judges</h3>
<div class="hl">
A heatmap: <strong>episode × delivery coherence score</strong>, showing the gradient rising from ~0.35 (random) toward 0.75+ as training progresses. This directly visualizes the declared coherence improvement.
</div>
<h3 style="margin-top:16px;">🟡 Stretch (improves score)</h3>
<ul>
<li>WDCT before/after comparison on canonical states (Over 35, 180/3)</li>
<li>Opponent cache for reproducible eval without API calls</li>
<li>&lt;2 min screen demo video (Gradio UI + reward curve walkthrough)</li>
</ul>
<div style="margin-top:16px; font-size:0.82rem; color:#546e7a;">
github.com/[team]/cricket-captain-llm &nbsp;·&nbsp; huggingface.co/spaces/[team]/cricket-captain
</div>
</div>
</div>
</section>
</div><!-- .deck -->
<div class="progress-bar" id="prog"></div>
<!-- Deck controls: handlers are attached in the script below (no inline on* attributes). -->
<nav class="nav" aria-label="Slide navigation">
<button type="button" id="prev" disabled>← Prev</button>
<button type="button" id="next">Next →</button>
</nav>
<div class="slide-hint">← → arrow keys to navigate</div>
<script>
// Slide-deck controller.
// Exactly one .slide carries the `active` class at a time (CSS shows only that
// one). `cur` is the index of the active slide; the markup starts with the
// first slide active, so `cur` starts at 0.
const slides = document.querySelectorAll('.slide');
const prog = document.getElementById('prog');
const prevBtn = document.getElementById('prev');
const nextBtn = document.getElementById('next');
let cur = 0;

/**
 * Sync the Prev/Next disabled state and the progress-bar width with `cur`.
 * Single source of truth for both the initial paint and every slide change,
 * so a one-slide (or empty) deck also gets correctly disabled buttons.
 */
function render() {
  const last = slides.length - 1;
  prevBtn.disabled = (cur <= 0);
  nextBtn.disabled = (cur >= last);
  const pct = slides.length ? (cur + 1) / slides.length * 100 : 0;
  prog.style.width = pct + '%';
}

/**
 * Move the deck by `dir` slides (negative = backwards), clamped to the
 * first/last slide. Kept as a global so existing callers keep working.
 *
 * @param {number} dir Signed number of slides to move, e.g. 1 or -1.
 */
function go(dir) {
  if (slides.length === 0) return; // nothing to show; avoids slides[cur] being undefined
  const target = Math.max(0, Math.min(slides.length - 1, cur + dir));
  // Already at the boundary: leave the page alone. Previously this still
  // scrolled to the top, which made a tall first/last slide impossible to
  // scroll with the arrow keys.
  if (target === cur) return;
  slides[cur].classList.remove('active');
  slides[target].classList.add('active');
  cur = target;
  render();
  window.scrollTo(0, 0); // each slide starts from its top
}

prevBtn.addEventListener('click', () => go(-1));
nextBtn.addEventListener('click', () => go(1));

document.addEventListener('keydown', e => {
  // Leave browser/OS shortcuts alone (Alt+Arrow = history, Shift+Arrow = selection, ...).
  if (e.altKey || e.ctrlKey || e.metaKey || e.shiftKey) return;

  let dir = 0;
  if (e.key === 'ArrowRight' || e.key === 'ArrowDown') dir = 1;
  else if (e.key === 'ArrowLeft' || e.key === 'ArrowUp') dir = -1;
  if (dir === 0) return;

  const before = cur;
  go(dir);
  // Only swallow the key when the slide actually changed; otherwise the
  // default action (scrolling within the current slide) must still happen.
  if (cur !== before) e.preventDefault();
});

render();
</script>
</body>
</html>