# cricket-captain-llm/scripts/build_player_profiles.py
# Last commit (pratinavseth, d45f009): feat: align benchmark data and training roadmap
"""Build batter and bowler profiles from curated Cricsheet ball outcomes.
Input:
data/processed/ball_outcomes_t20_v1.pkl
Output:
data/processed/player_profiles_t20_v1.json
"""
import argparse
import json
import pickle
from collections import defaultdict
from pathlib import Path
# Directory two levels above this file, i.e. the parent of the directory that
# holds this script (presumably the repository root -- confirm against layout).
_ROOT = Path(__file__).parent.parent
# Default location for both the pickled input and the JSON output of main().
_PROCESSED = _ROOT / "data" / "processed"
# Optional Dream11 points table; _load_dream11() returns {} when it is absent.
_DREAM11_PATH = _ROOT / "data" / "dream11_t20_points.json"
def _load_dream11() -> dict:
    """Load the Dream11 points table stored at ``_DREAM11_PATH``.

    Returns:
        The parsed JSON document, or an empty dict when the file does not
        exist so that callers can fall back to their own default values.
    """
    if not _DREAM11_PATH.exists():
        return {}
    # JSON text is UTF-8 by specification. Without an explicit encoding,
    # open() uses the locale's preferred encoding, which differs by platform
    # and can mis-decode or reject non-ASCII content.
    with _DREAM11_PATH.open(encoding="utf-8") as f:
        return json.load(f)
def _classify_batter(runs: int, balls: int, boundaries: int, death_runs: int) -> str:
strike_rate = 100 * runs / max(balls, 1)
boundary_rate = boundaries / max(balls, 1)
death_share = death_runs / max(runs, 1)
if strike_rate >= 145 or boundary_rate >= 0.22:
return "hitter"
if death_share >= 0.35 and strike_rate >= 130:
return "finisher"
if strike_rate < 110 and balls >= 30:
return "anchor"
return "balanced"
def _classify_bowler(balls: int, runs: int, wickets: int, death_balls: int, dots: int, bowler_type: str) -> str:
economy = 6 * runs / max(balls, 1)
strike_rate = balls / max(wickets, 1)
dot_rate = dots / max(balls, 1)
death_share = death_balls / max(balls, 1)
if death_share >= 0.30:
return "death_specialist"
if wickets >= 10 and strike_rate <= 18:
return "wicket_taker"
if economy <= 7.0 or dot_rate >= 0.42:
return "economy"
return bowler_type or "pace"
def build_profiles(records: list[dict], *, min_balls: int = 12) -> dict:
    """Aggregate per-ball records into batter and bowler profiles.

    Args:
        records: Ball-outcome dicts. Keys read here, with the fallback used
            when a key is absent: ``legal_delivery`` (True), ``phase``
            (``"middle"``), ``batter`` / ``bowler`` (side skipped when
            falsy), ``runs_batter`` (0), ``runs_total`` (``runs_batter``),
            ``wicket`` (falsy), ``dismissal_kind`` and ``bowler_type``
            (``"pace"``).
        min_balls: Minimum number of counted balls a player needs, as batter
            or as bowler respectively, to receive a profile.

    Returns:
        A dict with ``schema_version``, ``batters`` and ``bowlers``; the
        latter two map player name to that player's profile dict.
    """
    batters = defaultdict(lambda: {
        "runs": 0, "balls": 0, "boundaries": 0, "dismissals": 0,
        "phase_runs": defaultdict(int), "phase_balls": defaultdict(int),
    })
    bowlers = defaultdict(lambda: {
        "runs": 0, "balls": 0, "dots": 0, "wickets": 0, "type_counts": defaultdict(int),
        "phase_runs": defaultdict(int), "phase_balls": defaultdict(int), "phase_wickets": defaultdict(int),
    })

    for r in records:
        # NOTE(review): non-legal deliveries are dropped entirely, so any runs
        # recorded on them reach neither the batter nor the bowler totals --
        # confirm this is the intended accounting.
        if not r.get("legal_delivery", True):
            continue
        phase = r.get("phase", "middle")
        batter = r.get("batter")
        bowler = r.get("bowler")
        runs_batter = int(r.get("runs_batter", 0))
        runs_total = int(r.get("runs_total", runs_batter))
        wicket_fell = bool(r.get("wicket"))

        if batter:
            b = batters[batter]
            b["runs"] += runs_batter
            b["balls"] += 1
            b["boundaries"] += int(runs_batter >= 4)
            # NOTE(review): every wicket on the ball is charged to ``batter``;
            # verify the records never describe a different player being out.
            b["dismissals"] += int(wicket_fell)
            b["phase_runs"][phase] += runs_batter
            b["phase_balls"][phase] += 1

        if bowler:
            # Run outs are not credited to the bowler. The same rule feeds both
            # the overall and the per-phase tallies so the two always agree.
            bowler_wicket = wicket_fell and r.get("dismissal_kind") != "run out"
            w = bowlers[bowler]
            w["runs"] += runs_total
            w["balls"] += 1
            w["dots"] += int(runs_total == 0)
            w["wickets"] += int(bowler_wicket)
            # Treat an explicit None/empty type like a missing key so that no
            # None bucket can become the bowler's reported type.
            w["type_counts"][r.get("bowler_type") or "pace"] += 1
            w["phase_runs"][phase] += runs_total
            w["phase_balls"][phase] += 1
            w["phase_wickets"][phase] += int(bowler_wicket)

    # Point values come from the optional Dream11 table; the literals are the
    # fallbacks used when the table or an entry is missing.
    bowling_items = _load_dream11().get("categories", {}).get("bowling", {}).get("items", {})
    dot_points = bowling_items.get("dot_ball", {}).get("points", 1)
    wicket_points = bowling_items.get("wicket_excluding_run_out", {}).get("points", 30)

    phases = ("powerplay", "middle", "death")

    batter_profiles = {}
    for name, s in batters.items():
        balls = s["balls"]
        if balls < min_balls:
            continue
        death_runs = s["phase_runs"].get("death", 0)
        style = _classify_batter(s["runs"], balls, s["boundaries"], death_runs)
        batter_profiles[name] = {
            "role": "batter",
            "style": style,
            "runs": s["runs"],
            "balls": balls,
            "strike_rate": round(100 * s["runs"] / max(balls, 1), 2),
            "boundary_rate": round(s["boundaries"] / max(balls, 1), 4),
            "dismissal_rate": round(s["dismissals"] / max(balls, 1), 4),
            # Strike rate per phase, only for phases the batter actually faced.
            "phase_strengths": {
                phase: round(100 * s["phase_runs"][phase] / max(s["phase_balls"][phase], 1), 2)
                for phase in phases
                if s["phase_balls"].get(phase)
            },
        }

    bowler_profiles = {}
    for name, s in bowlers.items():
        balls = s["balls"]
        if balls < min_balls:
            continue
        # Most frequently recorded type; the first one seen wins a tie.
        bowler_type = max(s["type_counts"], key=s["type_counts"].get) if s["type_counts"] else "pace"
        style = _classify_bowler(
            balls, s["runs"], s["wickets"], s["phase_balls"].get("death", 0), s["dots"], bowler_type
        )
        bowler_profiles[name] = {
            "role": "bowler",
            "type": bowler_type,
            "style": style,
            "balls": balls,
            "runs_conceded": s["runs"],
            "wickets": s["wickets"],
            "economy": round(6 * s["runs"] / max(balls, 1), 2),
            "dot_rate": round(s["dots"] / max(balls, 1), 4),
            "strike_rate": round(balls / max(s["wickets"], 1), 2),
            "dream11_pressure_points": s["dots"] * dot_points + s["wickets"] * wicket_points,
            # Economy and wickets per phase, only for phases actually bowled.
            "phase_strengths": {
                phase: {
                    "economy": round(6 * s["phase_runs"][phase] / max(s["phase_balls"][phase], 1), 2),
                    "wickets": s["phase_wickets"][phase],
                }
                for phase in phases
                if s["phase_balls"].get(phase)
            },
        }

    return {
        "schema_version": "player_profiles_t20_v1",
        "batters": batter_profiles,
        "bowlers": bowler_profiles,
    }
def main():
    """Command-line entry point.

    Reads the pickled ball-outcome records named by ``--input``, builds the
    player profiles, writes them as indented JSON to ``--output`` (creating
    parent directories as needed) and prints a one-line summary.
    """
    default_input = _PROCESSED / "ball_outcomes_t20_v1.pkl"
    default_output = _PROCESSED / "player_profiles_t20_v1.json"

    cli = argparse.ArgumentParser()
    cli.add_argument("--input", default=str(default_input))
    cli.add_argument("--output", default=str(default_output))
    args = cli.parse_args()

    # NOTE: unpickling can execute arbitrary code contained in the file, so
    # --input must only ever point at a trusted, locally produced pickle.
    with open(args.input, "rb") as src:
        records = pickle.load(src)

    profiles = build_profiles(records)

    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    with output.open("w") as dst:
        json.dump(profiles, dst, indent=2)

    n_batters = len(profiles["batters"])
    n_bowlers = len(profiles["bowlers"])
    print(f"Wrote {output}: {n_batters} batters, {n_bowlers} bowlers")


if __name__ == "__main__":
    main()