""" Fetch T20I player profiles from ESPNcricinfo Statsguru. Derives player attributes from career batting + bowling stats: aggression — normalised from strike rate style — aggressive / balanced / anchor (from SR + boundary %) role — opener / middle / finisher / bowler / allrounder bowl_style — economy / attacking / stock (from economy + bowling SR) bowler_type — pace / spin (heuristic; override manually if wrong) max_overs — 4 (T20 standard) Usage: python scripts/fetch_player_profiles.py --team india python scripts/fetch_player_profiles.py --team australia --team england python scripts/fetch_player_profiles.py --list-teams Output: data/player_profiles/{team}.json """ import argparse import json import os import ssl import time import urllib.request from html.parser import HTMLParser # macOS ships without root certs linked to Python — skip verification for Statsguru _SSL_CTX = ssl.create_default_context() _SSL_CTX.check_hostname = False _SSL_CTX.verify_mode = ssl.CERT_NONE # --------------------------------------------------------------------------- # Team name → ESPNcricinfo team ID # --------------------------------------------------------------------------- TEAM_IDS = { "england": 1, "australia": 2, "south_africa": 3, "west_indies": 4, "new_zealand": 5, "india": 6, "pakistan": 7, "sri_lanka": 8, "zimbabwe": 9, "bangladesh": 25, "afghanistan": 40, "ireland": 29, "scotland": 30, "netherlands": 15, } # Known spinners — used to classify bowler_type when stats alone are ambiguous _KNOWN_SPINNERS = { "jadeja", "ashwin", "chahal", "kuldeep", "axar", "bishnoi", "muralitharan", "mendis", "herath", "hasaranga", "tahir", "shamsi", "afridi", "imad", "shadab", "nawaz", "rashid", "nabi", "mujeeb", "santner", "sodhi", "moeen", "dawson", "parkinson", "zampa", "naraine", "chase", "mehidy", "shakib", "taijul", } _OUT_DIR = os.path.join(os.path.dirname(__file__), "..", "data", "player_profiles") # --------------------------------------------------------------------------- # Statsguru HTML table parser # --------------------------------------------------------------------------- class _StatsguruParser(HTMLParser): """ Parses ESPNcricinfo Statsguru HTML. Headers come from elements. Data rows are or . Player names are inside tags within the first . """ def __init__(self): super().__init__() self.headers: list[str] = [] self.rows: list[list[str]] = [] self._in_header = False self._in_data_row = False self._in_cell = False self._cell_buf = "" self._current_row: list[str] = [] def handle_starttag(self, tag, attrs): attrs_dict = dict(attrs) cls = attrs_dict.get("class", "") if tag == "th": self._in_header = True self._cell_buf = "" if tag == "tr" and cls in ("data1", "data2"): self._in_data_row = True self._current_row = [] if self._in_data_row and tag == "td": self._in_cell = True self._cell_buf = "" def handle_endtag(self, tag): if tag == "th" and self._in_header: self._in_header = False h = self._cell_buf.strip() if h: self.headers.append(h) if tag == "td" and self._in_cell: self._in_cell = False self._current_row.append(self._cell_buf.strip()) if tag == "tr" and self._in_data_row: self._in_data_row = False if self._current_row: self.rows.append(self._current_row) def handle_data(self, data): if self._in_header or self._in_cell: self._cell_buf += data def _fetch_table(url: str) -> tuple[list[str], list[list[str]]]: """Return (headers, data_rows) from a Statsguru page.""" req = urllib.request.Request(url, headers={ "User-Agent": "Mozilla/5.0 (compatible; cricket-research-bot/1.0)", "Accept": "text/html", }) with urllib.request.urlopen(req, context=_SSL_CTX, timeout=15) as resp: html = resp.read().decode("utf-8", errors="replace") parser = _StatsguruParser() parser.feed(html) # Filter out short/empty rows and the trailing investigate-icon column headers = parser.headers[:-1] if parser.headers and parser.headers[-1] == "" else parser.headers data_rows = [ row[:len(headers)] for row in parser.rows if len(row) >= max(len(headers) - 2, 3) and row[0].strip() not in ("", "-") ] return headers, data_rows # --------------------------------------------------------------------------- # Statsguru URLs # --------------------------------------------------------------------------- _BASE = "https://stats.espncricinfo.com/ci/engine/stats/index.html" def _batting_url(team_id: int) -> str: return ( f"{_BASE}?class=3;template=results;type=batting" f";team={team_id};orderby=runs;size=200" ) def _bowling_url(team_id: int) -> str: return ( f"{_BASE}?class=3;template=results;type=bowling" f";team={team_id};orderby=wickets;size=200" ) # --------------------------------------------------------------------------- # Profile derivation # --------------------------------------------------------------------------- def _safe_float(val: str, default: float = 0.0) -> float: try: return float(val) except (ValueError, TypeError): return default def _safe_int(val: str, default: int = 0) -> int: try: return int(val.replace("*", "")) except (ValueError, TypeError): return default def _derive_batting(row: dict) -> dict: sr = _safe_float(row.get("SR", ""), 100.0) bf = _safe_int(row.get("BF", ""), 1) or 1 fours = _safe_int(row.get("4s", ""), 0) sixes = _safe_int(row.get("6s", ""), 0) avg = _safe_float(row.get("Ave", ""), 0.0) runs = _safe_int(row.get("Runs", ""), 0) inns = _safe_int(row.get("Inns", ""), 1) or 1 boundary_pct = (fours + sixes) / bf # Aggression: SR 90→0.10, SR 190→0.95 aggression = round(min(0.95, max(0.10, (sr - 90) / 105)), 2) if sr > 155 or boundary_pct > 0.22: style = "aggressive" elif sr < 120 and avg > 22: style = "anchor" else: style = "balanced" return { "aggression": aggression, "style": style, "strike_rate": round(sr, 1), "average": round(avg, 1), "boundary_pct": round(boundary_pct, 3), "runs": runs, "innings": inns, } def _derive_bowling(row: dict) -> dict: econ = _safe_float(row.get("Econ", ""), 8.0) bowl_sr = _safe_float(row.get("SR", ""), 20.0) wkts = _safe_int(row.get("Wkts", ""), 0) overs = _safe_float(row.get("Overs", ""), 0.0) if econ < 6.8: bowl_style = "economy" elif bowl_sr < 14.0: bowl_style = "attacking" else: bowl_style = "stock" return { "economy": round(econ, 2), "bowling_sr": round(bowl_sr, 1), "wickets": wkts, "overs_bowled": round(overs, 1), "bowl_style": bowl_style, "max_overs": 4, } def _is_spinner(name: str) -> bool: name_lower = name.lower() return any(s in name_lower for s in _KNOWN_SPINNERS) def _infer_role(bat_stats: dict | None, bowl_stats: dict | None) -> str: has_bowl = bowl_stats is not None and bowl_stats.get("wickets", 0) >= 5 # Tail-enders: appear in batting stats but can't really bat is_tailender = ( bat_stats is not None and bat_stats.get("strike_rate", 100) < 90 and bat_stats.get("average", 10) < 12 ) if is_tailender: return "bowler" if has_bowl else "unknown" has_bat = bat_stats is not None and bat_stats.get("innings", 0) >= 5 if has_bat and has_bowl: return "allrounder" if has_bowl and not has_bat: return "bowler" if has_bat: style = bat_stats.get("style", "balanced") sr = bat_stats.get("strike_rate", 130) avg = bat_stats.get("average", 0) if style == "aggressive" and sr > 145: return "finisher" if style == "anchor" or avg > 32: return "middle" return "middle" # default — can't infer opener from career stats alone return "unknown" # --------------------------------------------------------------------------- # Main fetch routine # --------------------------------------------------------------------------- def fetch_team_profiles(team_name: str) -> list[dict]: team_id = TEAM_IDS.get(team_name.lower().replace(" ", "_")) if team_id is None: raise ValueError(f"Unknown team '{team_name}'. Use --list-teams to see options.") print(f" Fetching batting stats for {team_name} (id={team_id}) …") bat_headers, bat_rows = _fetch_table(_batting_url(team_id)) time.sleep(1.5) print(f" Fetching bowling stats for {team_name} …") bowl_headers, bowl_rows = _fetch_table(_bowling_url(team_id)) # Build name → stats dicts def _rows_to_dict(headers, rows): result = {} for row in rows: if not row or len(row) < 3: continue name = row[0].strip() if not name or name in ("Player", "-"): continue result[name] = dict(zip(headers, row)) return result bat_map = _rows_to_dict(bat_headers, bat_rows) bowl_map = _rows_to_dict(bowl_headers, bowl_rows) all_names = sorted(set(bat_map) | set(bowl_map)) profiles = [] for name in all_names: bat_raw = bat_map.get(name) bowl_raw = bowl_map.get(name) bat_stats = _derive_batting(bat_raw) if bat_raw else None bowl_stats = _derive_bowling(bowl_raw) if bowl_raw else None role = _infer_role(bat_stats, bowl_stats) profile: dict = {"name": name, "role": role} if bat_stats: profile.update({ "aggression": bat_stats["aggression"], "style": bat_stats["style"], "strike_rate": bat_stats["strike_rate"], "average": bat_stats["average"], "boundary_pct": bat_stats["boundary_pct"], }) if bowl_stats and bowl_stats["wickets"] >= 5: profile.update({ "bowler_type": "spin" if _is_spinner(name) else "pace", "bowl_style": bowl_stats["bowl_style"], "economy": bowl_stats["economy"], "bowling_sr": bowl_stats["bowling_sr"], "max_overs": bowl_stats["max_overs"], }) profiles.append(profile) return profiles # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def main(): parser = argparse.ArgumentParser(description="Fetch T20I player profiles from ESPNcricinfo") parser.add_argument("--team", action="append", dest="teams", metavar="TEAM", help="Team name (repeatable). e.g. --team india --team australia") parser.add_argument("--list-teams", action="store_true", help="Print available team names and exit") parser.add_argument("--out-dir", default=_OUT_DIR, help="Output directory (default: data/player_profiles/)") args = parser.parse_args() if args.list_teams: print("Available teams:") for t in sorted(TEAM_IDS): print(f" {t}") return if not args.teams: parser.error("Provide at least one --team name, or use --list-teams") os.makedirs(args.out_dir, exist_ok=True) for team in args.teams: team_key = team.lower().replace(" ", "_") print(f"\n[{team_key}]") try: profiles = fetch_team_profiles(team_key) out_path = os.path.join(args.out_dir, f"{team_key}.json") with open(out_path, "w") as f: json.dump({"team": team_key, "format": "T20I", "players": profiles}, f, indent=2) print(f" → {len(profiles)} players written to {out_path}") except Exception as e: print(f" ERROR: {e}") if __name__ == "__main__": main()