"""Build batter and bowler profiles from curated Cricsheet ball outcomes.

Input:  data/processed/ball_outcomes_t20_v1.pkl
Output: data/processed/player_profiles_t20_v1.json
"""

import argparse
import json
import pickle
from collections import defaultdict
from pathlib import Path

_ROOT = Path(__file__).parent.parent
_PROCESSED = _ROOT / "data" / "processed"
_DREAM11_PATH = _ROOT / "data" / "dream11_t20_points.json"

# Phases reported under "phase_strengths", in output order.
_PHASES = ("powerplay", "middle", "death")
# Players with fewer counted balls than this are left out of the profiles.
_MIN_BALLS = 12


def _load_dream11() -> dict:
    """Return the parsed Dream11 points file, or ``{}`` if it does not exist."""
    if not _DREAM11_PATH.exists():
        return {}
    with _DREAM11_PATH.open(encoding="utf-8") as f:
        return json.load(f)


def _classify_batter(runs: int, balls: int, boundaries: int, death_runs: int) -> str:
    """Label a batter as "hitter", "finisher", "anchor" or "balanced".

    Rules are checked in that order and the first match wins. Denominators
    are clamped to 1 so empty samples do not divide by zero.
    """
    strike_rate = 100 * runs / max(balls, 1)
    boundary_rate = boundaries / max(balls, 1)
    death_share = death_runs / max(runs, 1)
    if strike_rate >= 145 or boundary_rate >= 0.22:
        return "hitter"
    if death_share >= 0.35 and strike_rate >= 130:
        return "finisher"
    if strike_rate < 110 and balls >= 30:
        return "anchor"
    return "balanced"


def _classify_bowler(
    balls: int,
    runs: int,
    wickets: int,
    death_balls: int,
    dots: int,
    bowler_type: str,
) -> str:
    """Label a bowler's style.

    Returns "death_specialist", "wicket_taker" or "economy" for the first
    rule that matches; otherwise falls back to ``bowler_type`` (or "pace"
    when that is empty).
    """
    economy = 6 * runs / max(balls, 1)
    strike_rate = balls / max(wickets, 1)
    dot_rate = dots / max(balls, 1)
    death_share = death_balls / max(balls, 1)
    if death_share >= 0.30:
        return "death_specialist"
    if wickets >= 10 and strike_rate <= 18:
        return "wicket_taker"
    if economy <= 7.0 or dot_rate >= 0.42:
        return "economy"
    return bowler_type or "pace"


def _new_batter_stats() -> dict:
    """Return a zeroed per-batter accumulator."""
    return {
        "runs": 0,
        "balls": 0,
        "boundaries": 0,
        "dismissals": 0,
        "phase_runs": defaultdict(int),
        "phase_balls": defaultdict(int),
    }


def _new_bowler_stats() -> dict:
    """Return a zeroed per-bowler accumulator."""
    return {
        "runs": 0,
        "balls": 0,
        "dots": 0,
        "wickets": 0,
        "type_counts": defaultdict(int),
        "phase_runs": defaultdict(int),
        "phase_balls": defaultdict(int),
        "phase_wickets": defaultdict(int),
    }


def _aggregate(records: list[dict]) -> tuple[dict, dict]:
    """Fold ball records into per-batter and per-bowler running totals."""
    batters = defaultdict(_new_batter_stats)
    bowlers = defaultdict(_new_bowler_stats)

    for r in records:
        # NOTE(review): records flagged as illegal deliveries are skipped
        # entirely, so any runs on them are not counted for either player.
        # Confirm that is intended for bowler economy.
        if not r.get("legal_delivery", True):
            continue

        phase = r.get("phase", "middle")
        batter = r.get("batter")
        bowler = r.get("bowler")
        runs_batter = int(r.get("runs_batter", 0))
        runs_total = int(r.get("runs_total", runs_batter))
        wicket = bool(r.get("wicket"))

        if batter:
            b = batters[batter]
            b["runs"] += runs_batter
            b["balls"] += 1
            if runs_batter >= 4:
                b["boundaries"] += 1
            # NOTE(review): the dismissal is always charged to the striker;
            # verify against the record schema for non-striker run outs.
            if wicket:
                b["dismissals"] += 1
            b["phase_runs"][phase] += runs_batter
            b["phase_balls"][phase] += 1

        if bowler:
            w = bowlers[bowler]
            # Run outs are not credited to the bowler. The same flag feeds
            # both the overall and per-phase tallies so they always agree.
            bowler_wicket = wicket and r.get("dismissal_kind") != "run out"
            w["runs"] += runs_total
            w["balls"] += 1
            if runs_total == 0:
                w["dots"] += 1
            if bowler_wicket:
                w["wickets"] += 1
                w["phase_wickets"][phase] += 1
            # A present-but-empty bowler_type is treated like a missing one,
            # matching the "pace" fallback in _classify_bowler.
            w["type_counts"][r.get("bowler_type") or "pace"] += 1
            w["phase_runs"][phase] += runs_total
            w["phase_balls"][phase] += 1

    return batters, bowlers


def _dream11_points(dream11: dict) -> tuple:
    """Return ``(dot_ball_points, wicket_points)``, defaulting to ``(1, 30)``."""
    items = dream11.get("categories", {}).get("bowling", {}).get("items", {})
    dot_points = items.get("dot_ball", {}).get("points", 1)
    wicket_points = items.get("wicket_excluding_run_out", {}).get("points", 30)
    return dot_points, wicket_points


def _batter_profile(s: dict) -> dict:
    """Turn one batter's running totals into the output profile."""
    balls = s["balls"]
    death_runs = s["phase_runs"].get("death", 0)
    style = _classify_batter(s["runs"], balls, s["boundaries"], death_runs)
    return {
        "role": "batter",
        "style": style,
        "runs": s["runs"],
        "balls": balls,
        "strike_rate": round(100 * s["runs"] / max(balls, 1), 2),
        "boundary_rate": round(s["boundaries"] / max(balls, 1), 4),
        "dismissal_rate": round(s["dismissals"] / max(balls, 1), 4),
        # Per-phase strike rate, only for phases the batter actually faced.
        "phase_strengths": {
            phase: round(
                100 * s["phase_runs"][phase] / max(s["phase_balls"][phase], 1), 2
            )
            for phase in _PHASES
            if s["phase_balls"].get(phase)
        },
    }


def _bowler_profile(s: dict, dot_points, wicket_points) -> dict:
    """Turn one bowler's running totals into the output profile."""
    balls = s["balls"]
    type_counts = s["type_counts"]
    # Most frequently recorded type; ties go to the one seen first.
    bowler_type = max(type_counts, key=type_counts.get) if type_counts else "pace"
    style = _classify_bowler(
        balls,
        s["runs"],
        s["wickets"],
        s["phase_balls"].get("death", 0),
        s["dots"],
        bowler_type,
    )
    return {
        "role": "bowler",
        "type": bowler_type,
        "style": style,
        "balls": balls,
        "runs_conceded": s["runs"],
        "wickets": s["wickets"],
        "economy": round(6 * s["runs"] / max(balls, 1), 2),
        "dot_rate": round(s["dots"] / max(balls, 1), 4),
        "strike_rate": round(balls / max(s["wickets"], 1), 2),
        "dream11_pressure_points": s["dots"] * dot_points
        + s["wickets"] * wicket_points,
        # Per-phase economy and wickets, only for phases actually bowled.
        "phase_strengths": {
            phase: {
                "economy": round(
                    6 * s["phase_runs"][phase] / max(s["phase_balls"][phase], 1), 2
                ),
                "wickets": s["phase_wickets"][phase],
            }
            for phase in _PHASES
            if s["phase_balls"].get(phase)
        },
    }


def build_profiles(records: list[dict]) -> dict:
    """Aggregate ball records into batter and bowler profiles.

    Each record is a dict. The keys read are ``legal_delivery``, ``phase``,
    ``batter``, ``bowler``, ``runs_batter``, ``runs_total``, ``wicket``,
    ``dismissal_kind`` and ``bowler_type``; all are optional. Players with
    fewer than 12 counted balls are omitted. Dream11 point values are read
    from the points file when it exists, otherwise defaults are used.

    Returns a dict with ``schema_version``, ``batters`` and ``bowlers``.
    """
    batters, bowlers = _aggregate(records)
    dot_points, wicket_points = _dream11_points(_load_dream11())

    batter_profiles = {
        name: _batter_profile(s)
        for name, s in batters.items()
        if s["balls"] >= _MIN_BALLS
    }
    bowler_profiles = {
        name: _bowler_profile(s, dot_points, wicket_points)
        for name, s in bowlers.items()
        if s["balls"] >= _MIN_BALLS
    }

    return {
        "schema_version": "player_profiles_t20_v1",
        "batters": batter_profiles,
        "bowlers": bowler_profiles,
    }


def main():
    """CLI entry point: load pickled records, build profiles, write JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default=str(_PROCESSED / "ball_outcomes_t20_v1.pkl"))
    parser.add_argument("--output", default=str(_PROCESSED / "player_profiles_t20_v1.json"))
    args = parser.parse_args()

    # SECURITY: unpickling can execute arbitrary code. Only point --input at
    # files produced by a trusted pipeline.
    with open(args.input, "rb") as f:
        records = pickle.load(f)

    profiles = build_profiles(records)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    with output.open("w", encoding="utf-8") as f:
        json.dump(profiles, f, indent=2)
    print(
        f"Wrote {output}: {len(profiles['batters'])} batters, "
        f"{len(profiles['bowlers'])} bowlers"
    )


if __name__ == "__main__":
    main()