sync: pull latest from main (model_server.py, captain LLM toggle in ui.py, 0.6B configs, SUBMISSION + RUNTIME_DURABILITY docs)
e70c305 verified | <!DOCTYPE html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>CricketCaptain-LLM — OpenEnv Hackathon 2026</title> | |
| <style> | |
| * { box-sizing: border-box; margin: 0; padding: 0; } | |
| body { font-family: 'Segoe UI', system-ui, sans-serif; background: #0f1117; color: #e8eaf6; } | |
| .deck { width: 100%; } | |
| .slide { | |
| display: none; | |
| min-height: 100vh; | |
| padding: 52px 80px; | |
| flex-direction: column; | |
| justify-content: center; | |
| position: relative; | |
| overflow: hidden; | |
| } | |
| .slide.active { display: flex; } | |
| .s1 { background: linear-gradient(135deg, #0d1b2a 0%, #0f2744 100%); } | |
| .s2 { background: linear-gradient(135deg, #0a1e0a 0%, #0e2d12 100%); } | |
| .s3 { background: linear-gradient(135deg, #0d1e3a 0%, #112952 100%); } | |
| .s4 { background: linear-gradient(135deg, #1a0930 0%, #2c1060 100%); } | |
| .s5 { background: linear-gradient(135deg, #1e0a06 0%, #3d1008 100%); } | |
| .s6 { background: linear-gradient(135deg, #001d2e 0%, #003050 100%); } | |
| .s7 { background: linear-gradient(135deg, #0e1e0e 0%, #1a3a1a 100%); } | |
| .s8 { background: linear-gradient(135deg, #1a1400 0%, #332800 100%); } | |
| .s9 { background: linear-gradient(135deg, #001a1a 0%, #003030 100%); } | |
| .s10 { background: linear-gradient(135deg, #0d1b2a 0%, #0f2744 100%); } | |
| .slide-number { | |
| position: absolute; top: 22px; right: 36px; | |
| font-size: 12px; color: rgba(255,255,255,0.30); letter-spacing: 2px; | |
| font-family: monospace; | |
| } | |
| h1 { font-size: 2.9rem; font-weight: 700; line-height: 1.15; margin-bottom: 14px; } | |
| h2 { font-size: 1.85rem; font-weight: 600; margin-bottom: 20px; color: #90caf9; } | |
| h3 { font-size: 1.1rem; font-weight: 600; margin-bottom: 8px; color: #80deea; } | |
| p { font-size: 1.05rem; line-height: 1.65; color: #cfd8dc; max-width: 860px; } | |
| .subtitle { font-size: 1.25rem; color: #90caf9; margin-bottom: 28px; font-weight: 400; max-width: 700px; } | |
| .tagline { font-size: 1.4rem; color: #a5d6a7; font-style: italic; margin-top: 18px; } | |
| .two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 40px; margin-top: 8px; } | |
| .three-col { display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 24px; margin-top: 8px; } | |
| .four-col { display: grid; grid-template-columns: 1fr 1fr 1fr 1fr; gap: 18px; margin-top: 8px; } | |
| .card { | |
| background: rgba(255,255,255,0.06); | |
| border: 1px solid rgba(255,255,255,0.12); | |
| border-radius: 12px; | |
| padding: 20px; | |
| } | |
| .card .value { font-size: 2rem; font-weight: 700; color: #80cbc4; } | |
| .card .value.large { font-size: 2.6rem; } | |
| .card .label { font-size: 0.85rem; color: #90a4ae; margin-top: 4px; line-height: 1.4; } | |
| ul { list-style: none; margin-top: 6px; } | |
| ul li { padding: 5px 0; padding-left: 20px; position: relative; color: #cfd8dc; font-size: 1.0rem; line-height: 1.5; } | |
| ul li::before { content: "▸"; position: absolute; left: 0; color: #4db6ac; } | |
| .tag { | |
| display: inline-block; padding: 2px 9px; border-radius: 4px; | |
| font-size: 0.76rem; font-weight: 600; letter-spacing: 0.4px; margin: 2px 3px; | |
| } | |
| .tag-green { background: #1b5e20; color: #a5d6a7; } | |
| .tag-orange { background: #bf360c; color: #ffe0b2; } | |
| .tag-blue { background: #0d47a1; color: #bbdefb; } | |
| .tag-purple { background: #4a148c; color: #e1bee7; } | |
| .tag-red { background: #b71c1c; color: #ffcdd2; } | |
| .tag-teal { background: #004d40; color: #b2dfdb; } | |
| .tag-yellow { background: #f57f17; color: #fff9c4; } | |
| table { width: 100%; border-collapse: collapse; margin-top: 14px; font-size: 0.92rem; } | |
| th { background: rgba(255,255,255,0.09); padding: 9px 13px; text-align: left; color: #b0bec5; font-weight: 600; } | |
| td { padding: 8px 13px; border-bottom: 1px solid rgba(255,255,255,0.06); color: #cfd8dc; } | |
| tr:last-child td { border-bottom: none; } | |
| tr:hover td { background: rgba(255,255,255,0.03); } | |
| code { | |
| background: rgba(255,255,255,0.1); border-radius: 4px; | |
| padding: 1px 6px; font-family: 'Cascadia Code', 'Fira Code', monospace; | |
| font-size: 0.85em; color: #80cbc4; | |
| } | |
| pre { | |
| background: rgba(0,0,0,0.45); border-radius: 8px; padding: 14px 18px; | |
| font-family: 'Cascadia Code', 'Fira Code', monospace; font-size: 0.80rem; | |
| color: #a5d6a7; line-height: 1.55; overflow-x: auto; margin-top: 10px; | |
| border: 1px solid rgba(255,255,255,0.07); | |
| } | |
| pre .dim { color: #546e7a; } | |
| pre .hi { color: #ffcc80; } | |
| pre .kw { color: #80cbc4; } | |
| .progress-bar { | |
| position: fixed; bottom: 0; left: 0; height: 3px; | |
| background: linear-gradient(90deg, #4db6ac, #7c4dff, #ef5350); | |
| transition: width 0.35s ease; | |
| z-index: 200; | |
| } | |
| .nav { | |
| position: fixed; bottom: 22px; right: 36px; | |
| display: flex; gap: 10px; z-index: 100; | |
| } | |
| .nav button { | |
| background: rgba(255,255,255,0.10); border: 1px solid rgba(255,255,255,0.18); | |
| color: #fff; padding: 9px 20px; border-radius: 6px; cursor: pointer; | |
| font-size: 0.88rem; transition: background 0.2s; | |
| } | |
| .nav button:hover { background: rgba(255,255,255,0.20); } | |
| .nav button:disabled { opacity: 0.25; cursor: default; } | |
| .slide-hint { | |
| position: fixed; bottom: 26px; left: 50%; transform: translateX(-50%); | |
| font-size: 11px; color: rgba(255,255,255,0.22); letter-spacing: 1px; | |
| } | |
| .hl { background: rgba(77,182,172,0.13); border-left: 3px solid #4db6ac; padding: 14px 18px; border-radius: 0 8px 8px 0; margin-top: 14px; } | |
| .wn { background: rgba(255,152,0,0.12); border-left: 3px solid #ff9800; padding: 14px 18px; border-radius: 0 8px 8px 0; margin-top: 14px; font-size: 0.93rem; } | |
| .gr { background: rgba(100,221,23,0.09); border-left: 3px solid #69f0ae; padding: 14px 18px; border-radius: 0 8px 8px 0; margin-top: 14px; } | |
| .pill-row { display: flex; flex-wrap: wrap; gap: 7px; margin-top: 10px; } | |
| .badge { | |
| display: inline-flex; align-items: center; gap: 6px; | |
| background: rgba(255,255,255,0.07); border: 1px solid rgba(255,255,255,0.13); | |
| padding: 5px 12px; border-radius: 20px; font-size: 0.82rem; color: #b0bec5; | |
| } | |
| .score-bar { margin: 5px 0; } | |
| .score-bar .bar-wrap { background: rgba(255,255,255,0.08); border-radius: 4px; height: 8px; margin-top: 3px; } | |
| .score-bar .bar-fill { height: 8px; border-radius: 4px; } | |
| .signal-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; margin-top: 10px; } | |
| .signal-item { background: rgba(255,255,255,0.04); border: 1px solid rgba(255,255,255,0.08); border-radius: 8px; padding: 10px 14px; font-size: 0.88rem; } | |
| .signal-item .sk { color: #80deea; font-weight: 600; margin-bottom: 3px; } | |
| .signal-item .sv { color: #90a4ae; font-size: 0.82rem; } | |
| </style> | |
| </head> | |
| <body> | |
| <!-- ══════════════════════════════════════════════════════════ | |
| SLIDE 1 — Title & Hook | |
| ═══════════════════════════════════════════════════════════ --> | |
| <div class="deck"> | |
| <section class="slide s1 active" id="s1"> | |
| <div class="slide-number">01 / 10</div> | |
| <p style="font-size:2.6rem; margin-bottom:6px;">🏏</p> | |
| <h1>CricketCaptain-LLM</h1> | |
| <p class="subtitle">A multi-agent RL training environment for <strong>strategic coherence</strong> — teaching LLMs to mean what they say across 300 consecutive decisions.</p> | |
| <p class="tagline">"I'll consolidate and preserve wickets" → then actually plays defensively for 4 overs.</p> | |
| <div style="margin-top:28px;"> | |
| <p style="color:#90a4ae; font-size:0.95rem; margin-bottom:10px;">Targets: Theme #1 Multi-Agent Interaction + Theme #2 Long-Horizon Planning</p> | |
| <div class="pill-row"> | |
| <span class="badge">🌐 OpenEnv ≥ 0.2.2</span> | |
| <span class="badge">⚡ TRL MT-GRPO</span> | |
| <span class="badge">🤖 google/gemma-4-26B-A4B-it</span> | |
| <span class="badge">🔁 HF Router</span> | |
| <span class="badge">📊 Cricsheet Markov Engine</span> | |
| <span class="badge">🎯 WDCT Benchmark</span> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ══════════════════════════════════════════════════════════ | |
| SLIDE 2 — The Problem: What LLMs Can't Do | |
| ═══════════════════════════════════════════════════════════ --> | |
| <section class="slide s2" id="s2"> | |
| <div class="slide-number">02 / 10</div> | |
| <h2>The Gap We're Closing</h2> | |
| <div class="two-col"> | |
| <div> | |
| <h3>WDCT Benchmark — Words & Deeds Consistency</h3> | |
| <p style="font-size:0.95rem; margin-bottom:12px;">arxiv:2503.07003 — the only public benchmark directly measuring whether LLMs execute what they declare.</p> | |
| <div class="four-col" style="grid-template-columns: 1fr 1fr; gap: 12px; margin-top:10px;"> | |
| <div class="card"><div class="value">0.49</div><div class="label">Smaller models (7B class)</div></div> | |
| <div class="card"><div class="value">0.76</div><div class="label">GPT-4 (best published)</div></div> | |
| </div> | |
| <div class="hl" style="margin-top:14px;"> | |
| <strong>No RL training environment has targeted this benchmark directly.</strong><br> | |
| We built one. | |
| </div> | |
| </div> | |
| <div> | |
| <h3>Why It Matters Beyond Cricket</h3> | |
| <ul> | |
| <li>Planning agents that can't commit to strategy fail silently</li> | |
| <li>Reasoning traces are only useful if they predict the next action</li> | |
| <li>Chain-of-thought gains are undermined by declaration-execution drift</li> | |
| <li>Every agentic system suffers from this; no training environment targets it</li> | |
| </ul> | |
| <div class="wn" style="margin-top:14px;"> | |
| A model that <em>says</em> "preserve wickets" but <em>plays</em> aggressive shots hasn't learned strategy — it's learned to <em>sound</em> strategic. | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ══════════════════════════════════════════════════════════ | |
| SLIDE 3 — Why Cricket: The Perfect Domain | |
| ═══════════════════════════════════════════════════════════ --> | |
| <section class="slide s3" id="s3"> | |
| <div class="slide-number">03 / 10</div> | |
| <h2>Why Cricket Is the Right Stress Test</h2> | |
| <div class="two-col"> | |
| <div> | |
| <table style="margin-top:0;"> | |
| <tr><th>Property</th><th>Capability Forced</th></tr> | |
| <tr><td>300 consecutive decisions (50×6)</td><td>Long-horizon coherence</td></tr> | |
| <tr><td>10 wickets as irreversible budget</td><td>Consequence-aware risk planning</td></tr> | |
| <tr><td>Powerplay → Middle → Death phases</td><td>Strategic revision at regime shifts</td></tr> | |
| <tr><td>DLS par = ground-truth optimal score</td><td>Objective performance signal</td></tr> | |
| <tr><td>"Declare strategy, then play shots"</td><td>Declaration-execution directly testable</td></tr> | |
| <tr><td>LLM opponent (HF Router)</td><td>Theory-of-mind / opponent modeling</td></tr> | |
| <tr><td>Full match: bat & bowl both innings</td><td>End-to-end role adaptation</td></tr> | |
| </table> | |
| </div> | |
| <div> | |
| <h3>Why Not Chess / Math / Coding?</h3> | |
| <ul> | |
| <li>Chess: no natural language declarations; coherence untestable</li> | |
| <li>Math: single-step; no 300-turn consistency requirement</li> | |
| <li>Coding: rare phase transitions; no risk budget exhaustion</li> | |
| <li>Cricket: declarations are mandatory tool calls, shots are mandatory tool calls — alignment is <em>structurally enforced</em></li> | |
| </ul> | |
| <div class="gr"> | |
| Cricket is not the goal. It's the <em>measurement apparatus</em> for a capability that transfers to every agentic domain. | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ══════════════════════════════════════════════════════════ | |
| SLIDE 4 — Environment Architecture & State Machine | |
| ═══════════════════════════════════════════════════════════ --> | |
| <section class="slide s4" id="s4"> | |
| <div class="slide-number">04 / 10</div> | |
| <h2>Environment Architecture</h2> | |
| <div class="two-col"> | |
| <div> | |
| <h3>OpenEnv Server — State Machine</h3> | |
| <div style="font-family:monospace; font-size:0.82rem; line-height:2.1; margin-top:8px; color:#cfd8dc;"> | |
| <span style="color:#ef9a9a;">TOSS</span> | |
| → | |
| <span style="color:#80cbc4;">BATTING</span> | |
| → | |
| <span style="color:#ffcc80;">BOWLING</span> | |
| → | |
| <span style="color:#a5d6a7;">RESULT</span> | |
| </div> | |
| <div class="signal-grid" style="margin-top:14px;"> | |
| <div class="signal-item"><div class="sk">Markov Engine</div><div class="sv">5-dim key: over × wickets × score_band × phase × bowler_type → (runs, wicket_fell)</div></div> | |
| <div class="signal-item"><div class="sk">Cricsheet Data</div><div class="sv">Ball-by-ball transition probs from real ODI/T20 matches; synthetic fallback</div></div> | |
| <div class="signal-item"><div class="sk">Format Mapper</div><div class="sv">T5 / T20 / ODI rules auto-selected by closest max_overs; phase-aware shot weights, batter & bowler roles from <code>format_rules.json</code></div></div> | |
| <div class="signal-item"><div class="sk">Player Rosters</div><div class="sv">10 T20I team profiles; fuzzy name lookup (exact → surname → word-overlap); real aggression/style fed into select_batter / choose_bowler</div></div> | |
| <div class="signal-item"><div class="sk">Tool Budget</div><div class="sv">3 overhead calls/over (analyze, reflect, plan_delivery, set_strategy, set_bowling_strategy); −0.04 fine per excess call; plan_shot budget-free</div></div> | |
| <div class="signal-item"><div class="sk">LLM Opponent</div><div class="sv">google/gemma-4-26B-A4B-it via HF Router (default); graceful heuristic fallback; llm_cached mode for reproducible eval</div></div> | |
| <div class="signal-item"><div class="sk">DLS Par</div><div class="sv">Duckworth-Lewis par score as objective target; used in r_result</div></div> | |
| <div class="signal-item"><div class="sk">Concurrent Sessions</div><div class="sv">SUPPORTS_CONCURRENT_SESSIONS = True; max 4 parallel envs</div></div> | |
| </div> | |
| </div> | |
| <div> | |
| <h3>12 Tools — 4 Categories</h3> | |
| <div style="margin-top:8px;"> | |
| <p style="font-size:0.8rem; color:#90a4ae; margin-bottom:6px;">PLANNING</p> | |
| <div class="pill-row" style="margin-top:0;"> | |
| <span class="tag tag-blue">call_toss</span> | |
| <span class="tag tag-blue">set_match_plan</span> | |
| <span class="tag tag-blue">update_match_plan</span> | |
| </div> | |
| <p style="font-size:0.8rem; color:#90a4ae; margin-top:10px; margin-bottom:6px;">BATTING EXECUTION</p> | |
| <div class="pill-row" style="margin-top:0;"> | |
| <span class="tag tag-green">set_strategy</span> | |
| <span class="tag tag-green">plan_shot</span> | |
| <span class="tag tag-green">play_delivery</span> | |
| </div> | |
| <p style="font-size:0.8rem; color:#90a4ae; margin-top:10px; margin-bottom:6px;">BOWLING EXECUTION</p> | |
| <div class="pill-row" style="margin-top:0;"> | |
| <span class="tag tag-orange">choose_bowler</span> | |
| <span class="tag tag-orange">set_bowling_strategy</span> | |
| <span class="tag tag-orange">bowl_delivery</span> | |
| </div> | |
| <p style="font-size:0.8rem; color:#90a4ae; margin-top:10px; margin-bottom:6px;">ANALYSIS</p> | |
| <div class="pill-row" style="margin-top:0;"> | |
| <span class="tag tag-purple">analyze_situation</span> | |
| <span class="tag tag-purple">reflect_after_ball</span> | |
| </div> | |
| </div> | |
| <div class="hl" style="margin-top:14px; font-size:0.88rem;"> | |
| Tools are <strong>phase-gated</strong> — batting tools unavailable during bowling, etc. Invalid phase = 0 reward turn. | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ══════════════════════════════════════════════════════════ | |
| SLIDE 5 — Observation Space & Agent Signals | |
| ═══════════════════════════════════════════════════════════ --> | |
| <section class="slide s5" id="s5"> | |
| <div class="slide-number">05 / 10</div> | |
| <h2>What the Agent Sees — Observation Space</h2> | |
| <div class="two-col"> | |
| <div> | |
| <h3>Per-Turn Observation (structured JSON + rendered text)</h3> | |
| <pre><span class="dim">// game_context</span> | |
| { "over": 14, "ball": 3, "score": 112, "wickets": 2, | |
| "run_rate": 7.8, "req_rate": 8.4, "phase": "MIDDLE", | |
| "bowler_type": "spin", "field_setting": "Attacking" } | |
| <span class="dim">// declared_strategy (agent's own prior declaration)</span> | |
| { "phase_intent": "consolidate", "aggression": 0.35, | |
| "rationale": "Preserve wickets, build platform" } | |
| <span class="dim">// tool_budget (per-over overhead counter)</span> | |
| { "overhead_calls_this_over": 1, "budget": 3, | |
| "remaining": 2, "fines_accumulated": 0.0 } | |
| <span class="dim">// last_outcome</span> | |
| { "runs": 1, "wicket": false, "extras": 0 } | |
| <span class="dim">// available_tools + tool_history (last 5)</span></pre> | |
| </div> | |
| <div> | |
| <h3>State Fields Used as Reward Signals</h3> | |
| <div class="signal-grid"> | |
| <div class="signal-item"><div class="sk">coherence_scores[ ]</div><div class="sv">Per-delivery aggression_match × rationale_specificity × phase_fit</div></div> | |
| <div class="signal-item"><div class="sk">adaptation_scores[ ]</div><div class="sv">Strategy updated after wicket / phase shift; 0 if stuck</div></div> | |
| <div class="signal-item"><div class="sk">opponent_awareness_scores[ ]</div><div class="sv">Response to opponent's stated field/line changes</div></div> | |
| <div class="signal-item"><div class="sk">regret_scores[ ]</div><div class="sv">Counterfactual: did agent outperform or underperform heuristic baseline?</div></div> | |
| <div class="signal-item"><div class="sk">plan_commitment_scores[ ]</div><div class="sv">Keyword overlap: match_plan rationale → delivery rationale</div></div> | |
| <div class="signal-item"><div class="sk">plan_staleness_penalties[ ]</div><div class="sv">Penalty if plan not refreshed for 2+ overs when context shifted</div></div> | |
| </div> | |
| <div class="hl" style="font-size:0.85rem; margin-top:10px;"> | |
| <code>prompt_text</code> is a rendered summary of all above — fed directly to the LLM. Strategy extracted from rendered text for stateless GRPO. | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ══════════════════════════════════════════════════════════ | |
| SLIDE 6 — Reward Architecture (the heart of it) | |
| ═══════════════════════════════════════════════════════════ --> | |
| <section class="slide s6" id="s6"> | |
| <div class="slide-number">06 / 10</div> | |
| <h2>4-Rubric Composite Reward — Hard to Game</h2> | |
| <table> | |
| <tr> | |
| <th>Rubric</th><th>Weight</th><th>Frequency</th><th>Measures</th><th>Key Sub-signals</th> | |
| </tr> | |
| <tr> | |
| <td><code>r_cricket</code></td> | |
| <td><strong>45%</strong></td> | |
| <td>Per ball</td> | |
| <td>Dream11 proxy: runs, wickets, milestones</td> | |
| <td>dot%, boundary%, 50s/100s, maiden overs, economy</td> | |
| </tr> | |
| <tr> | |
| <td><code>r_behavior</code></td> | |
| <td><strong>25%</strong></td> | |
| <td>Every turn</td> | |
| <td>Declaration-execution alignment</td> | |
| <td>coherence (50%) + adaptation (20%) + opponent_awareness (20%) + regret (10%)</td> | |
| </tr> | |
| <tr> | |
| <td><code>r_result</code></td> | |
| <td><strong>20%</strong></td> | |
| <td>Innings/episode end</td> | |
| <td>Win/loss vs DLS par, target margin</td> | |
| <td>score/par, wickets_remaining, lead/deficit, +0.25 progress bonus</td> | |
| </tr> | |
| <tr> | |
| <td><code>r_validity</code></td> | |
| <td><strong>10%</strong></td> | |
| <td>Every turn</td> | |
| <td>Parseable XML/JSON tool call</td> | |
| <td>Format gate; 0 = parse fail, 1 = valid</td> | |
| </tr> | |
| </table> | |
| <p style="margin-top:8px;font-size:0.9em;color:#888">Rebalanced from 55/25/15/5 → 45/25/20/10 to match the SWE-RL recipe (60% intermediate / 40% terminal). Reasoning: partial-trajectory training rarely fires <code>r_result</code>; weighting it 55% wastes gradient on a near-zero signal.</p> | |
| <div class="two-col" style="margin-top:18px;"> | |
| <div> | |
| <h3>Coherence Score Formula (per delivery)</h3> | |
| <pre><span class="hi">coherence</span> = ( | |
| <span class="kw">aggression_match</span> <span class="dim"># 1 - |declared_aggression - shot_aggression_proxy|</span> | |
| × <span class="kw">rationale_specificity</span> <span class="dim"># min(words / 15, 1.0)</span> | |
| × <span class="kw">phase_appropriate</span> <span class="dim"># 1.0 if shot fits phase norms, 0.6 otherwise</span> | |
| )</pre> | |
| </div> | |
| <div> | |
| <h3>Single-Stage Training + Format Curriculum</h3> | |
| <ul> | |
| <li><strong>Warmup (2–3 over curriculum):</strong> per-scenario <code>max_overs</code> sampled from [2,2,2,2,2,2,3,3,3] so episodes complete in budget and <code>r_result</code> can fire</li> | |
| <li><strong>Main run (5-over end-to-end):</strong> resumes warmup adapter, trains on target eval distribution</li> | |
| <li>Qwen3-4B-Instruct-2507 emits <code>&lt;tool_call&gt;...&lt;/tool_call&gt;</code> natively — no Stage 1 SFT needed</li> | |
| <li>GRPO group size = 4; full episode advantages (TRL <code>environment_factory</code>)</li> | |
| </ul> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ══════════════════════════════════════════════════════════ | |
| SLIDE 7 — Training Pipeline: Stateless GRPO Trick | |
| ═══════════════════════════════════════════════════════════ --> | |
| <section class="slide s7" id="s7"> | |
| <div class="slide-number">07 / 10</div> | |
| <h2>Training Pipeline — Stateless GRPO</h2> | |
| <div class="two-col"> | |
| <div> | |
| <h3>The Core Technical Insight</h3> | |
| <p style="font-size:0.93rem; margin-bottom:10px;">TRL's GRPOTrainer requires a <strong>stateless</strong> reward function: <code>reward_fn(prompts, completions) → list[float]</code>. No env.step() inside.</p> | |
| <div class="hl" style="font-size:0.88rem;"> | |
| The strategy the agent declared is embedded in the rendered <code>prompt_text</code> as "Batting Strategy: …". We parse it back with regex — no shared env state needed. | |
| </div> | |
| <pre style="margin-top:12px;"><span class="kw">_STRATEGY_RE</span> = re.compile( | |
| r"<span class="hi">Batting Strategy:\s*(.+)$</span>", re.MULTILINE | |
| ) | |
| <span class="kw">_PHASE_RE</span> = re.compile( | |
| r"<span class="hi">Phase:\s+(POWERPLAY|MIDDLE|DEATH)</span>", re.I | |
| ) | |
| <span class="dim"># r_behavior scored from (prompt, completion) alone</span> | |
| <span class="dim"># r_result injected at episode end → all turns</span></pre> | |
| <h3 style="margin-top:14px;">Stack</h3> | |
| <div class="pill-row"> | |
| <span class="badge">Qwen3-4B-Instruct-2507 (policy)</span> | |
| <span class="badge">gemma-4-26B-A4B-it (opponent)</span> | |
| <span class="badge">Unsloth 4-bit LoRA</span> | |
| <span class="badge">TRL GRPOTrainer</span> | |
| <span class="badge">MT-GRPO per-turn advantage</span> | |
| </div> | |
| </div> | |
| <div> | |
| <h3>End-to-End Commands</h3> | |
| <pre><span class="dim"># 1. Start server</span> | |
| uvicorn server.app:app --port 8766 | |
| <span class="dim"># 2. Sanity test (3-over match, heuristic)</span> | |
| python train.py train-smoke \ | |
| --config configs/default.yaml \ | |
| --matches 1 --max-overs 3 | |
| <span class="dim"># 3. Live LLM match (HF Router)</span> | |
| python inference.py \ | |
| --config configs/default.yaml \ | |
| --max-overs 3 --opponent-mode llm_live | |
| <span class="dim"># 4. Warmup → Main chained run (auto-resumes adapter)</span> | |
| bash scripts/run_warmup_then_main.sh | |
| <span class="dim"># 5. Eval: untrained vs trained head-to-head</span> | |
| python compare_eval.py --model Qwen/Qwen3-4B-Instruct-2507 \ | |
| --label baseline --episodes 20 --max-overs 5 \ | |
| --output eval_results/baseline.json | |
| python compare_eval.py --model Qwen/Qwen3-4B-Instruct-2507 \ | |
| --adapter ./checkpoints/stage2_final \ | |
| --label trained --episodes 20 --max-overs 5 \ | |
| --output eval_results/trained.json | |
| python compare_eval.py --compare \ | |
| eval_results/baseline.json eval_results/trained.json</pre> | |
| <div class="wn" style="font-size:0.84rem;"> | |
| All model / API / env settings live in <code>configs/default.yaml</code>. Zero hardcoding. | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ══════════════════════════════════════════════════════════ | |
| SLIDE 8 — Current Results & Baseline Numbers | |
| ═══════════════════════════════════════════════════════════ --> | |
| <section class="slide s8" id="s8"> | |
| <div class="slide-number">08 / 10</div> | |
| <h2>What We Measured — Baseline Results</h2> | |
| <div class="four-col"> | |
| <div class="card"> | |
| <div class="value large">0%</div> | |
| <div class="label">Parse error rate across all 14 runs — heuristic, cricsheet, llm_live opponents</div> | |
| </div> | |
| <div class="card"> | |
| <div class="value large">1.0</div> | |
| <div class="label">r_validity across all 9 train-smoke matches (3 opponent modes × 3 matches)</div> | |
| </div> | |
| <div class="card"> | |
| <div class="value large">0.62</div> | |
| <div class="label">Peak mean coherence (train-smoke, cricsheet opponent, 5-over)</div> | |
| </div> | |
| <div class="card"> | |
| <div class="value large">3</div> | |
| <div class="label">Opponent modes verified end-to-end: heuristic · cricsheet · llm_live</div> | |
| </div> | |
| </div> | |
| <div class="two-col" style="margin-top:20px;"> | |
| <div> | |
| <h3>What training should produce (target)</h3> | |
| <ul> | |
| <li>r_validity: 0.70 → 0.98+ after warmup (25 steps)</li> | |
| <li>Coherence: ~0.52 (random) → 0.75+ after main run</li> | |
| <li>analyze_situation calls cluster at over 6, 16, 36 transitions</li> | |
| <li>Strategy declarations become more specific (>15 word rationales)</li> | |
| <li>Shot choices match declared aggression level >80% of deliveries</li> | |
| </ul> | |
| </div> | |
| <div> | |
| <h3>Reward signals verified working ✅</h3> | |
| <ul> | |
| <li>plan_commitment_scores populated per delivery</li> | |
| <li>plan_staleness_penalties active at over-end</li> | |
| <li>coherence_scores differentiate matching vs mismatching strategies</li> | |
| <li>adaptation_scores fire on wicket loss & phase transitions</li> | |
| <li>opponent_awareness_scores respond to field change</li> | |
| </ul> | |
| <div class="wn" style="font-size:0.84rem; margin-top:10px;"> | |
| All signals verified. Full reward curves pending GRPO training run. Colab notebook ready. | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ══════════════════════════════════════════════════════════ | |
| SLIDE 9 — Innovation Summary & Judging Criteria Mapping | |
| ═══════════════════════════════════════════════════════════ --> | |
| <section class="slide s9" id="s9"> | |
| <div class="slide-number">09 / 10</div> | |
| <h2>Why This Scores on Every Judging Dimension</h2> | |
| <table> | |
| <tr> | |
| <th>Criterion</th><th>Weight</th><th>Our Angle</th><th>Evidence</th> | |
| </tr> | |
| <tr> | |
| <td><strong>Environment Innovation</strong></td> | |
| <td>40%</td> | |
| <td>First RL env purpose-built for WDCT / declaration-execution alignment</td> | |
| <td>12-tool multi-phase, multi-agent, 300-decision episodes with Cricsheet Markov engine</td> | |
| </tr> | |
| <tr> | |
| <td><strong>Storytelling</strong></td> | |
| <td>30%</td> | |
| <td>Clear capability gap → environment design → reward signal → training → measurable WDCT improvement</td> | |
| <td>README, this slide deck, Colab notebook, Gradio live demo</td> | |
| </tr> | |
| <tr> | |
| <td><strong>Improvement in Rewards</strong></td> | |
| <td>20%</td> | |
| <td>Warmup → main curriculum produces an observable r_validity spike, then a coherence rise</td> | |
| <td>Colab plots, before/after tool call samples, coherence heatmap</td> | |
| </tr> | |
| <tr> | |
| <td><strong>Reward & Pipeline</strong></td> | |
| <td>10%</td> | |
| <td>4-rubric composite, hard to game (r_result at episode end enforces real match outcomes)</td> | |
| <td>server/reward_calculator.py, server/coherence_grader.py, stateless GRPO reward fn</td> | |
| </tr> | |
| </table> | |
| <div class="two-col" style="margin-top:16px;"> | |
| <div class="hl"> | |
| <strong>Unique technical contributions:</strong><br> | |
| Stateless GRPO via prompt-text parsing · Plan staleness penalty · Per-delivery commitment scoring · Phase-gated tool availability · LLM opponent via HF Router · Format-aware rules (T5/T20/ODI) · Tool budget + fine system · Real player roster lookup | |
| </div> | |
| <div class="gr"> | |
| <strong>Minimum requirements met:</strong><br> | |
| ✅ OpenEnv latest ✅ TRL/Unsloth Colab ✅ HF Space (ready) ✅ README with results<br> | |
| ✅ 3 opponent modes verified (heuristic · cricsheet · llm_live) ⚠️ Blog / video post-training | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ══════════════════════════════════════════════════════════ | |
| SLIDE 10 — Roadmap & The Money Shot | |
| ═══════════════════════════════════════════════════════════ --> | |
| <section class="slide s10" id="s10"> | |
| <div class="slide-number">10 / 10</div> | |
| <h2>Roadmap to Submission</h2> | |
| <div class="two-col"> | |
| <div> | |
| <h3>🔴 Critical Path (on-site, Day 1–2)</h3> | |
| <ul> | |
| <li>Run Colab notebook (notebooks/colab_train_minimal.ipynb) → warmup → main chained training</li> | |
| <li>Export: reward_curves.png, coherence_heatmap.png, tool_timeline.png</li> | |
| <li>Deploy to HuggingFace Spaces → live interactive Gradio demo URL</li> | |
| <li>Add HF Space URL + plot images to README</li> | |
| <li>Write 500-word mini-blog on HF (problem → env → results)</li> | |
| <li>Run Cricsheet data curation (<code>scripts/curate_transitions.py</code>) for real ball probs</li> | |
| </ul> | |
| </div> | |
| <div> | |
| <h3>💰 The Money Shot for Judges</h3> | |
| <div class="hl"> | |
| A heatmap: <strong>episode × delivery coherence score</strong>, showing the gradient rising from ~0.35 (random) toward 0.75+ as training progresses. This directly visualizes the declared coherence improvement. | |
| </div> | |
| <h3 style="margin-top:16px;">🟡 Stretch (improves score)</h3> | |
| <ul> | |
| <li>WDCT before/after comparison on canonical states (Over 35, 180/3)</li> | |
| <li>Opponent cache for reproducible eval without API calls</li> | |
| <li>&lt;2 min screen demo video (Gradio UI + reward curve walkthrough)</li> | |
| </ul> | |
| <div style="margin-top:16px; font-size:0.82rem; color:#546e7a;"> | |
| github.com/[team]/cricket-captain-llm · huggingface.co/spaces/[team]/cricket-captain | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| </div><!-- .deck --> | |
| <div class="progress-bar" id="prog"></div> | |
| <div class="nav"> | |
| <button id="prev" onclick="go(-1)" disabled>← Prev</button> | |
| <button id="next" onclick="go(1)">Next →</button> | |
| </div> | |
| <div class="slide-hint">← → arrow keys to navigate</div> | |
| <script> | |
| const slides = document.querySelectorAll('.slide'); | |
| let cur = 0; | |
| const prog = document.getElementById('prog'); | |
| function go(dir) { | |
| slides[cur].classList.remove('active'); | |
| cur = Math.max(0, Math.min(slides.length - 1, cur + dir)); | |
| slides[cur].classList.add('active'); | |
| document.getElementById('prev').disabled = (cur === 0); | |
| document.getElementById('next').disabled = (cur === slides.length - 1); | |
| prog.style.width = ((cur + 1) / slides.length * 100) + '%'; | |
| window.scrollTo(0, 0); | |
| } | |
| document.addEventListener('keydown', e => { | |
| if (e.key === 'ArrowRight' || e.key === 'ArrowDown') go(1); | |
| if (e.key === 'ArrowLeft' || e.key === 'ArrowUp') go(-1); | |
| }); | |
| prog.style.width = (1 / slides.length * 100) + '%'; | |
| </script> | |
| </body> | |
| </html> | |