| """ |
| Fetch T20I player profiles from ESPNcricinfo Statsguru. |
| |
| Derives player attributes from career batting + bowling stats: |
| aggression — normalised from strike rate |
| style — aggressive / balanced / anchor (from SR + boundary %) |
| role — opener / middle / finisher / bowler / allrounder |
| bowl_style — economy / attacking / stock (from economy + bowling SR) |
| bowler_type — pace / spin (heuristic; override manually if wrong) |
| max_overs — 4 (T20 standard) |
| |
| Usage: |
| python scripts/fetch_player_profiles.py --team india |
| python scripts/fetch_player_profiles.py --team australia --team england |
| python scripts/fetch_player_profiles.py --list-teams |
| |
| Output: data/player_profiles/{team}.json |
| """ |
|
|
| import argparse |
| import json |
| import os |
| import ssl |
| import time |
| import urllib.request |
| from html.parser import HTMLParser |
|
|
| |
# TLS context handed to urllib.request.urlopen() by _fetch_table.
# NOTE(review): hostname checking and certificate verification are both
# disabled below, so the HTTPS server is not authenticated. Confirm this is
# intentional (e.g. a workaround for a missing local CA bundle) before reuse.
_SSL_CTX = ssl.create_default_context()
# check_hostname has to be cleared first; the ssl module rejects CERT_NONE
# while hostname checking is still enabled.
_SSL_CTX.check_hostname = False
_SSL_CTX.verify_mode = ssl.CERT_NONE
|
|
| |
| |
| |
|
|
# Team key (lower-case, words joined by underscores) -> numeric id that is
# substituted into the ``team=`` field of the Statsguru query URLs.
TEAM_IDS = dict(
    england=1,
    australia=2,
    south_africa=3,
    west_indies=4,
    new_zealand=5,
    india=6,
    pakistan=7,
    sri_lanka=8,
    zimbabwe=9,
    bangladesh=25,
    afghanistan=40,
    ireland=29,
    scotland=30,
    netherlands=15,
)
|
|
| |
| _KNOWN_SPINNERS = { |
| "jadeja", "ashwin", "chahal", "kuldeep", "axar", "bishnoi", |
| "muralitharan", "mendis", "herath", "hasaranga", |
| "tahir", "shamsi", |
| "afridi", "imad", "shadab", "nawaz", |
| "rashid", "nabi", "mujeeb", |
| "santner", "sodhi", |
| "moeen", "dawson", "parkinson", |
| "zampa", |
| "naraine", "chase", |
| "mehidy", "shakib", "taijul", |
| } |
|
|
# Default output directory: ``data/player_profiles`` under the parent of the
# directory containing this script. main() uses it as the ``--out-dir`` default.
_OUT_DIR = os.path.join(os.path.dirname(__file__), "..", "data", "player_profiles")
|
|
| |
| |
| |
|
|
| class _StatsguruParser(HTMLParser): |
| """ |
| Parses ESPNcricinfo Statsguru HTML. |
| |
| Headers come from <th> elements. |
| Data rows are <tr class="data1"> or <tr class="data2">. |
| Player names are inside <a> tags within the first <td>. |
| """ |
|
|
| def __init__(self): |
| super().__init__() |
| self.headers: list[str] = [] |
| self.rows: list[list[str]] = [] |
| self._in_header = False |
| self._in_data_row = False |
| self._in_cell = False |
| self._cell_buf = "" |
| self._current_row: list[str] = [] |
|
|
| def handle_starttag(self, tag, attrs): |
| attrs_dict = dict(attrs) |
| cls = attrs_dict.get("class", "") |
|
|
| if tag == "th": |
| self._in_header = True |
| self._cell_buf = "" |
| if tag == "tr" and cls in ("data1", "data2"): |
| self._in_data_row = True |
| self._current_row = [] |
| if self._in_data_row and tag == "td": |
| self._in_cell = True |
| self._cell_buf = "" |
|
|
| def handle_endtag(self, tag): |
| if tag == "th" and self._in_header: |
| self._in_header = False |
| h = self._cell_buf.strip() |
| if h: |
| self.headers.append(h) |
| if tag == "td" and self._in_cell: |
| self._in_cell = False |
| self._current_row.append(self._cell_buf.strip()) |
| if tag == "tr" and self._in_data_row: |
| self._in_data_row = False |
| if self._current_row: |
| self.rows.append(self._current_row) |
|
|
| def handle_data(self, data): |
| if self._in_header or self._in_cell: |
| self._cell_buf += data |
|
|
|
|
def _fetch_table(url: str) -> tuple[list[str], list[list[str]]]:
    """Download a Statsguru page and return ``(headers, data_rows)``.

    The page is requested with a browser-like User-Agent and a 15 second
    timeout, decoded as UTF-8 (undecodable bytes are replaced) and run
    through ``_StatsguruParser``.

    A row is kept only if it has at least ``max(len(headers) - 2, 3)`` cells
    and its first cell is neither empty nor ``"-"``; each kept row is
    truncated to ``len(headers)`` cells.

    Errors raised by ``urllib.request.urlopen`` propagate to the caller.
    """
    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (compatible; cricket-research-bot/1.0)",
            "Accept": "text/html",
        },
    )
    with urllib.request.urlopen(request, context=_SSL_CTX, timeout=15) as response:
        payload = response.read()
    page = payload.decode("utf-8", errors="replace")

    table = _StatsguruParser()
    table.feed(page)

    # Drop a trailing blank header, if the parser produced one.
    headers = table.headers
    if headers and headers[-1] == "":
        headers = headers[:-1]

    width = len(headers)
    min_cells = max(width - 2, 3)

    data_rows = []
    for cells in table.rows:
        if len(cells) < min_cells:
            continue
        if cells[0].strip() in ("", "-"):
            continue
        data_rows.append(cells[:width])

    return headers, data_rows
|
|
|
|
| |
| |
| |
|
|
# Statsguru query endpoint; _batting_url() and _bowling_url() append the
# ``?field=value;field=value`` query string to it.
_BASE = "https://stats.espncricinfo.com/ci/engine/stats/index.html"
|
|
|
|
def _batting_url(team_id: int) -> str:
    """Return the Statsguru batting-results URL for *team_id*.

    The query asks for up to 200 rows ordered by runs; fields are separated
    by ``;``.
    """
    fields = (
        "class=3",
        "template=results",
        "type=batting",
        f"team={team_id}",
        "orderby=runs",
        "size=200",
    )
    return f"{_BASE}?" + ";".join(fields)
|
|
|
|
def _bowling_url(team_id: int) -> str:
    """Return the Statsguru bowling-results URL for *team_id*.

    The query asks for up to 200 rows ordered by wickets; fields are
    separated by ``;``.
    """
    fields = (
        "class=3",
        "template=results",
        "type=bowling",
        f"team={team_id}",
        "orderby=wickets",
        "size=200",
    )
    return f"{_BASE}?" + ";".join(fields)
|
|
|
|
| |
| |
| |
|
|
| def _safe_float(val: str, default: float = 0.0) -> float: |
| try: |
| return float(val) |
| except (ValueError, TypeError): |
| return default |
|
|
|
|
| def _safe_int(val: str, default: int = 0) -> int: |
| try: |
| return int(val.replace("*", "")) |
| except (ValueError, TypeError): |
| return default |
|
|
|
|
def _derive_batting(row: dict) -> dict:
    """Turn one batting row (header -> cell text) into batting attributes.

    Reads the ``SR``, ``BF``, ``4s``, ``6s``, ``Ave``, ``Runs`` and ``Inns``
    cells.  Missing or unparsable cells fall back to: strike rate 100.0,
    balls faced 1, innings 1, everything else 0.

    Returns a dict with keys ``aggression``, ``style``, ``strike_rate``,
    ``average``, ``boundary_pct``, ``runs`` and ``innings``.
    """
    def cell(key: str) -> str:
        return row.get(key, "")

    strike_rate = _safe_float(cell("SR"), 100.0)
    average = _safe_float(cell("Ave"), 0.0)
    # "or 1" keeps the divisor non-zero when the cell parses to 0.
    balls_faced = _safe_int(cell("BF"), 1) or 1
    innings = _safe_int(cell("Inns"), 1) or 1
    runs = _safe_int(cell("Runs"), 0)
    boundaries = _safe_int(cell("4s"), 0) + _safe_int(cell("6s"), 0)

    # Share of balls faced that were hit for four or six.
    boundary_pct = boundaries / balls_faced

    # Linear map of strike rate (90 -> 0.0, 195 -> 1.0), clamped to
    # the range [0.10, 0.95].
    scaled = (strike_rate - 90) / 105
    aggression = round(min(0.95, max(0.10, scaled)), 2)

    style = "balanced"
    if strike_rate > 155 or boundary_pct > 0.22:
        style = "aggressive"
    elif strike_rate < 120 and average > 22:
        style = "anchor"

    return {
        "aggression": aggression,
        "style": style,
        "strike_rate": round(strike_rate, 1),
        "average": round(average, 1),
        "boundary_pct": round(boundary_pct, 3),
        "runs": runs,
        "innings": innings,
    }
|
|
|
|
def _derive_bowling(row: dict) -> dict:
    """Turn one bowling row (header -> cell text) into bowling attributes.

    Reads the ``Econ``, ``SR``, ``Wkts`` and ``Overs`` cells.  Missing or
    unparsable cells fall back to economy 8.0, strike rate 20.0, wickets 0
    and overs 0.0.

    ``bowl_style`` is ``"economy"`` when economy is below 6.8, otherwise
    ``"attacking"`` when the bowling strike rate is below 14.0, otherwise
    ``"stock"``.  ``max_overs`` is always 4.
    """
    economy = _safe_float(row.get("Econ", ""), 8.0)
    strike_rate = _safe_float(row.get("SR", ""), 20.0)
    wickets = _safe_int(row.get("Wkts", ""), 0)
    overs_bowled = _safe_float(row.get("Overs", ""), 0.0)

    # The economy test takes precedence over the strike-rate test.
    bowl_style = "stock"
    if economy < 6.8:
        bowl_style = "economy"
    elif strike_rate < 14.0:
        bowl_style = "attacking"

    return {
        "economy": round(economy, 2),
        "bowling_sr": round(strike_rate, 1),
        "wickets": wickets,
        "overs_bowled": round(overs_bowled, 1),
        "bowl_style": bowl_style,
        "max_overs": 4,
    }
|
|
|
|
def _is_spinner(name: str) -> bool:
    """Return True if any ``_KNOWN_SPINNERS`` fragment occurs in *name*.

    The comparison is a case-insensitive substring test on the whole name.
    """
    lowered = name.lower()
    for fragment in _KNOWN_SPINNERS:
        if fragment in lowered:
            return True
    return False
|
|
|
|
| def _infer_role(bat_stats: dict | None, bowl_stats: dict | None) -> str: |
| has_bowl = bowl_stats is not None and bowl_stats.get("wickets", 0) >= 5 |
|
|
| |
| is_tailender = ( |
| bat_stats is not None |
| and bat_stats.get("strike_rate", 100) < 90 |
| and bat_stats.get("average", 10) < 12 |
| ) |
| if is_tailender: |
| return "bowler" if has_bowl else "unknown" |
|
|
| has_bat = bat_stats is not None and bat_stats.get("innings", 0) >= 5 |
|
|
| if has_bat and has_bowl: |
| return "allrounder" |
| if has_bowl and not has_bat: |
| return "bowler" |
| if has_bat: |
| style = bat_stats.get("style", "balanced") |
| sr = bat_stats.get("strike_rate", 130) |
| avg = bat_stats.get("average", 0) |
| if style == "aggressive" and sr > 145: |
| return "finisher" |
| if style == "anchor" or avg > 32: |
| return "middle" |
| return "middle" |
| return "unknown" |
|
|
|
|
| |
| |
| |
|
|
def fetch_team_profiles(team_name: str) -> list[dict]:
    """Fetch and merge batting and bowling stats into per-player profiles.

    Args:
        team_name: a ``TEAM_IDS`` key; case and spaces are normalised
            (lower-cased, spaces replaced by underscores) before lookup.

    Returns:
        One dict per player, sorted by name.  Each has ``name`` and ``role``;
        batting attributes are added when a batting row exists, and bowling
        attributes only when the player has at least 5 wickets.

    Raises:
        ValueError: if the team is not in ``TEAM_IDS``.
    """
    team_id = TEAM_IDS.get(team_name.lower().replace(" ", "_"))
    if team_id is None:
        raise ValueError(f"Unknown team '{team_name}'. Use --list-teams to see options.")

    print(f" Fetching batting stats for {team_name} (id={team_id}) …")
    bat_headers, bat_rows = _fetch_table(_batting_url(team_id))
    # Pause between the two requests.
    time.sleep(1.5)

    print(f" Fetching bowling stats for {team_name} …")
    bowl_headers, bowl_rows = _fetch_table(_bowling_url(team_id))

    def _index_by_player(headers, rows):
        # Map player name (first cell) -> {header: cell}; a later row with
        # the same name replaces an earlier one.
        return {
            cells[0].strip(): dict(zip(headers, cells))
            for cells in rows
            if cells
            and len(cells) >= 3
            and cells[0].strip() not in ("", "Player", "-")
        }

    batting = _index_by_player(bat_headers, bat_rows)
    bowling = _index_by_player(bowl_headers, bowl_rows)

    batting_keys = ("aggression", "style", "strike_rate", "average", "boundary_pct")
    bowling_keys = ("bowl_style", "economy", "bowling_sr", "max_overs")

    profiles = []
    for name in sorted(batting.keys() | bowling.keys()):
        bat_raw = batting.get(name)
        bowl_raw = bowling.get(name)

        bat_stats = _derive_batting(bat_raw) if bat_raw else None
        bowl_stats = _derive_bowling(bowl_raw) if bowl_raw else None

        profile: dict = {"name": name, "role": _infer_role(bat_stats, bowl_stats)}

        if bat_stats:
            for key in batting_keys:
                profile[key] = bat_stats[key]

        # Bowling attributes are only published above the 5-wicket threshold.
        if bowl_stats and bowl_stats["wickets"] >= 5:
            profile["bowler_type"] = "spin" if _is_spinner(name) else "pace"
            for key in bowling_keys:
                profile[key] = bowl_stats[key]

        profiles.append(profile)

    return profiles
|
|
|
|
| |
| |
| |
|
|
def main():
    """Command-line entry point.

    ``--list-teams`` prints the known team keys and returns.  Otherwise each
    ``--team`` is fetched and written to ``<out-dir>/<team>.json``.  A failure
    for one team is printed as an ERROR line and the remaining teams are
    still processed.
    """
    cli = argparse.ArgumentParser(description="Fetch T20I player profiles from ESPNcricinfo")
    cli.add_argument(
        "--team",
        action="append",
        dest="teams",
        metavar="TEAM",
        help="Team name (repeatable). e.g. --team india --team australia",
    )
    cli.add_argument(
        "--list-teams",
        action="store_true",
        help="Print available team names and exit",
    )
    cli.add_argument(
        "--out-dir",
        default=_OUT_DIR,
        help="Output directory (default: data/player_profiles/)",
    )
    args = cli.parse_args()

    if args.list_teams:
        print("Available teams:")
        for team_name in sorted(TEAM_IDS):
            print(f" {team_name}")
        return

    if not args.teams:
        # parser.error() prints the message and exits.
        cli.error("Provide at least one --team name, or use --list-teams")

    os.makedirs(args.out_dir, exist_ok=True)

    for requested in args.teams:
        team_key = requested.lower().replace(" ", "_")
        print(f"\n[{team_key}]")
        try:
            profiles = fetch_team_profiles(team_key)
            out_path = os.path.join(args.out_dir, f"{team_key}.json")
            document = {"team": team_key, "format": "T20I", "players": profiles}
            with open(out_path, "w") as f:
                json.dump(document, f, indent=2)
            print(f" → {len(profiles)} players written to {out_path}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next team.
            print(f" ERROR: {e}")
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|