"""
Fetch T20I player profiles from ESPNcricinfo Statsguru.
Derives player attributes from career batting + bowling stats:
    aggression  — normalised from strike rate (clamped to 0.10–0.95)
    style       — aggressive / balanced / anchor (from SR, boundary % and average)
    role        — middle / finisher / bowler / allrounder / unknown
                  (opener is never assigned: it cannot be inferred from career stats alone)
    bowl_style  — economy / attacking / stock (from economy + bowling SR)
    bowler_type — pace / spin (name-based heuristic; override manually if wrong)
    max_overs   — 4 (T20 standard)
Usage:
python scripts/fetch_player_profiles.py --team india
python scripts/fetch_player_profiles.py --team australia --team england
python scripts/fetch_player_profiles.py --list-teams
Output: data/player_profiles/{team}.json
"""
import argparse
import json
import os
import ssl
import time
import urllib.request
from html.parser import HTMLParser
# macOS ships without root certs linked to Python — skip verification for Statsguru
# NOTE(review): this turns off both hostname checking and certificate validation,
# so responses fetched with this context are not authenticated.
# Keep the order of the two assignments: check_hostname has to be cleared before
# verify_mode may be set to CERT_NONE (the ssl module rejects the reverse order).
_SSL_CTX = ssl.create_default_context()
_SSL_CTX.check_hostname = False
_SSL_CTX.verify_mode = ssl.CERT_NONE
# ---------------------------------------------------------------------------
# Team name → ESPNcricinfo team ID
# ---------------------------------------------------------------------------
# Keys are lower-case team names with underscores instead of spaces (the form
# produced by ``team.lower().replace(" ", "_")`` elsewhere in this file);
# values are the numeric ids placed in the ``team=`` query parameter.
TEAM_IDS = {
    "england": 1,
    "australia": 2,
    "south_africa": 3,
    "west_indies": 4,
    "new_zealand": 5,
    "india": 6,
    "pakistan": 7,
    "sri_lanka": 8,
    "zimbabwe": 9,
    "bangladesh": 25,
    "afghanistan": 40,
    "ireland": 29,
    "scotland": 30,
    "netherlands": 15,
}
# Known spinners — used to classify bowler_type when stats alone are ambiguous
_KNOWN_SPINNERS = {
"jadeja", "ashwin", "chahal", "kuldeep", "axar", "bishnoi",
"muralitharan", "mendis", "herath", "hasaranga",
"tahir", "shamsi",
"afridi", "imad", "shadab", "nawaz",
"rashid", "nabi", "mujeeb",
"santner", "sodhi",
"moeen", "dawson", "parkinson",
"zampa",
"naraine", "chase",
"mehidy", "shakib", "taijul",
}
# Default output directory: <directory containing this file>/../data/player_profiles.
# The ".." component is kept literally in the path (no normalisation is applied here).
_OUT_DIR = os.path.join(os.path.dirname(__file__), "..", "data", "player_profiles")
# ---------------------------------------------------------------------------
# Statsguru HTML table parser
# ---------------------------------------------------------------------------
class _StatsguruParser(HTMLParser):
"""
Parses ESPNcricinfo Statsguru HTML.
Headers come from
.
Player names are inside tags within the first | .
"""
def __init__(self):
super().__init__()
self.headers: list[str] = []
self.rows: list[list[str]] = []
self._in_header = False
self._in_data_row = False
self._in_cell = False
self._cell_buf = ""
self._current_row: list[str] = []
def handle_starttag(self, tag, attrs):
attrs_dict = dict(attrs)
cls = attrs_dict.get("class", "")
if tag == "th":
self._in_header = True
self._cell_buf = ""
if tag == "tr" and cls in ("data1", "data2"):
self._in_data_row = True
self._current_row = []
if self._in_data_row and tag == "td":
self._in_cell = True
self._cell_buf = ""
def handle_endtag(self, tag):
if tag == "th" and self._in_header:
self._in_header = False
h = self._cell_buf.strip()
if h:
self.headers.append(h)
if tag == "td" and self._in_cell:
self._in_cell = False
self._current_row.append(self._cell_buf.strip())
if tag == "tr" and self._in_data_row:
self._in_data_row = False
if self._current_row:
self.rows.append(self._current_row)
def handle_data(self, data):
if self._in_header or self._in_cell:
self._cell_buf += data
def _fetch_table(url: str) -> tuple[list[str], list[list[str]]]:
    """
    Download a Statsguru page and return ``(headers, data_rows)``.

    The page is requested with a browser-like User-Agent, a 15 second timeout
    and the module's ``_SSL_CTX``; the body is decoded as UTF-8 with
    undecodable bytes replaced. Errors raised by ``urlopen`` are not caught.

    A row is kept only if it has at least ``max(len(headers) - 2, 3)`` cells
    and its first cell is neither empty nor ``"-"``. Kept rows are truncated
    to the number of headers, which drops the trailing investigate-icon column.
    """
    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (compatible; cricket-research-bot/1.0)",
            "Accept": "text/html",
        },
    )
    with urllib.request.urlopen(request, context=_SSL_CTX, timeout=15) as resp:
        page = resp.read().decode("utf-8", errors="replace")

    table = _StatsguruParser()
    table.feed(page)

    # Drop a trailing blank header if there is one. The parser in this file
    # does not record blank header text, so this is only a safeguard.
    headers = table.headers
    if headers and headers[-1] == "":
        headers = headers[:-1]

    width = len(headers)
    min_cells = max(width - 2, 3)

    data_rows = []
    for cells in table.rows:
        if len(cells) < min_cells:
            continue  # too short to be a stats row
        if cells[0].strip() in ("", "-"):
            continue  # no player name in the first column
        data_rows.append(cells[:width])
    return headers, data_rows
# ---------------------------------------------------------------------------
# Statsguru URLs
# ---------------------------------------------------------------------------
# Statsguru query endpoint; the URL builders below append a "?" followed by
# ";"-separated key=value parameters to it.
_BASE = "https://stats.espncricinfo.com/ci/engine/stats/index.html"
def _batting_url(team_id: int) -> str:
    """
    Build the Statsguru batting-results URL for ``team_id``.

    The query asks for ``type=batting`` ordered by runs with a page size of
    200. ``class=3`` is presumably Statsguru's T20I class, given the module's
    purpose — confirm against the site if it matters.
    """
    params = [
        "class=3",
        "template=results",
        "type=batting",
        f"team={team_id}",
        "orderby=runs",
        "size=200",
    ]
    return _BASE + "?" + ";".join(params)
def _bowling_url(team_id: int) -> str:
    """
    Build the Statsguru bowling-results URL for ``team_id``.

    The query asks for ``type=bowling`` ordered by wickets with a page size of
    200. ``class=3`` is presumably Statsguru's T20I class, given the module's
    purpose — confirm against the site if it matters.
    """
    params = [
        "class=3",
        "template=results",
        "type=bowling",
        f"team={team_id}",
        "orderby=wickets",
        "size=200",
    ]
    return _BASE + "?" + ";".join(params)
# ---------------------------------------------------------------------------
# Profile derivation
# ---------------------------------------------------------------------------
def _safe_float(val: str, default: float = 0.0) -> float:
try:
return float(val)
except (ValueError, TypeError):
return default
def _safe_int(val: str, default: int = 0) -> int:
try:
return int(val.replace("*", ""))
except (ValueError, TypeError):
return default
def _derive_batting(row: dict) -> dict:
    """
    Derive batting attributes from one batting-table row.

    ``row`` maps column headers to cell text; the keys read are ``SR``,
    ``Ave``, ``BF``, ``Inns``, ``Runs``, ``4s`` and ``6s``. Unparseable or
    missing values fall back to SR 100.0, average 0.0, 1 ball faced,
    1 innings and 0 runs/fours/sixes; balls faced and innings are also forced
    to at least 1 when they parse as 0.

    Returns a dict with ``aggression``, ``style``, ``strike_rate``,
    ``average``, ``boundary_pct``, ``runs`` and ``innings``.
    """
    strike_rate = _safe_float(row.get("SR", ""), 100.0)
    average = _safe_float(row.get("Ave", ""), 0.0)
    balls_faced = _safe_int(row.get("BF", ""), 1) or 1  # never divide by zero
    innings = _safe_int(row.get("Inns", ""), 1) or 1
    total_runs = _safe_int(row.get("Runs", ""), 0)
    fours = _safe_int(row.get("4s", ""), 0)
    sixes = _safe_int(row.get("6s", ""), 0)

    # Fraction of balls faced that were hit for four or six.
    boundary_pct = (fours + sixes) / balls_faced

    # Aggression is (SR - 90) / 105 clamped to [0.10, 0.95]: anything up to
    # roughly SR 100.5 gives 0.10 and anything from roughly SR 189.75 gives 0.95.
    scaled = (strike_rate - 90) / 105
    aggression = round(min(0.95, max(0.10, scaled)), 2)

    hits_hard = strike_rate > 155 or boundary_pct > 0.22
    accumulates = strike_rate < 120 and average > 22
    style = "aggressive" if hits_hard else ("anchor" if accumulates else "balanced")

    return {
        "aggression": aggression,
        "style": style,
        "strike_rate": round(strike_rate, 1),
        "average": round(average, 1),
        "boundary_pct": round(boundary_pct, 3),
        "runs": total_runs,
        "innings": innings,
    }
def _derive_bowling(row: dict) -> dict:
    """
    Derive bowling attributes from one bowling-table row.

    ``row`` maps column headers to cell text; the keys read are ``Econ``,
    ``SR``, ``Wkts`` and ``Overs``. Unparseable or missing values fall back to
    economy 8.0, strike rate 20.0, 0 wickets and 0.0 overs.

    ``bowl_style`` is ``"economy"`` when economy is below 6.8, otherwise
    ``"attacking"`` when the bowling strike rate is below 14.0, otherwise
    ``"stock"``. ``max_overs`` is always 4.
    """
    economy = _safe_float(row.get("Econ", ""), 8.0)
    strike_rate = _safe_float(row.get("SR", ""), 20.0)

    # Economy takes precedence over strike rate when both thresholds are met.
    if economy < 6.8:
        label = "economy"
    elif strike_rate < 14.0:
        label = "attacking"
    else:
        label = "stock"

    return {
        "economy": round(economy, 2),
        "bowling_sr": round(strike_rate, 1),
        "wickets": _safe_int(row.get("Wkts", ""), 0),
        "overs_bowled": round(_safe_float(row.get("Overs", ""), 0.0), 1),
        "bowl_style": label,
        "max_overs": 4,
    }
def _is_spinner(name: str) -> bool:
    """
    Return True when ``name`` contains any entry of ``_KNOWN_SPINNERS``.

    Matching is a case-insensitive substring test on the whole name, so a
    fragment matches anywhere in it, not only on word boundaries.
    """
    lowered = name.lower()
    for fragment in _KNOWN_SPINNERS:
        if fragment in lowered:
            return True
    return False
def _infer_role(bat_stats: dict | None, bowl_stats: dict | None) -> str:
has_bowl = bowl_stats is not None and bowl_stats.get("wickets", 0) >= 5
# Tail-enders: appear in batting stats but can't really bat
is_tailender = (
bat_stats is not None
and bat_stats.get("strike_rate", 100) < 90
and bat_stats.get("average", 10) < 12
)
if is_tailender:
return "bowler" if has_bowl else "unknown"
has_bat = bat_stats is not None and bat_stats.get("innings", 0) >= 5
if has_bat and has_bowl:
return "allrounder"
if has_bowl and not has_bat:
return "bowler"
if has_bat:
style = bat_stats.get("style", "balanced")
sr = bat_stats.get("strike_rate", 130)
avg = bat_stats.get("average", 0)
if style == "aggressive" and sr > 145:
return "finisher"
if style == "anchor" or avg > 32:
return "middle"
return "middle" # default — can't infer opener from career stats alone
return "unknown"
# ---------------------------------------------------------------------------
# Main fetch routine
# ---------------------------------------------------------------------------
def fetch_team_profiles(team_name: str) -> list[dict]:
    """
    Fetch batting and bowling tables for one team and build player profiles.

    ``team_name`` is lower-cased and spaces are turned into underscores before
    it is looked up in ``TEAM_IDS``.

    Returns one dict per player name found in either table, sorted by name.
    Every profile has ``name`` and ``role``; batting fields are added when the
    player has a batting row, and bowling fields only when the player has at
    least 5 wickets.

    Raises:
        ValueError: if the team is not in ``TEAM_IDS``.
        Any error raised while downloading the pages propagates unchanged.
    """
    team_id = TEAM_IDS.get(team_name.lower().replace(" ", "_"))
    if team_id is None:
        raise ValueError(f"Unknown team '{team_name}'. Use --list-teams to see options.")

    def _index_by_name(headers, rows):
        # Map player name -> {header: cell}; a later row with the same name
        # replaces an earlier one.
        return {
            player: dict(zip(headers, cells))
            for cells in rows
            if len(cells) >= 3
            and (player := cells[0].strip())
            and player not in ("Player", "-")
        }

    print(f" Fetching batting stats for {team_name} (id={team_id}) …")
    bat_headers, bat_rows = _fetch_table(_batting_url(team_id))
    time.sleep(1.5)  # pause between the two requests
    print(f" Fetching bowling stats for {team_name} …")
    bowl_headers, bowl_rows = _fetch_table(_bowling_url(team_id))

    bat_map = _index_by_name(bat_headers, bat_rows)
    bowl_map = _index_by_name(bowl_headers, bowl_rows)

    batting_fields = ("aggression", "style", "strike_rate", "average", "boundary_pct")
    bowling_fields = ("bowl_style", "economy", "bowling_sr", "max_overs")

    profiles = []
    for name in sorted(bat_map.keys() | bowl_map.keys()):
        bat_raw = bat_map.get(name)
        bowl_raw = bowl_map.get(name)
        bat_stats = _derive_batting(bat_raw) if bat_raw else None
        bowl_stats = _derive_bowling(bowl_raw) if bowl_raw else None

        profile: dict = {"name": name, "role": _infer_role(bat_stats, bowl_stats)}
        if bat_stats:
            for field in batting_fields:
                profile[field] = bat_stats[field]
        # Bowling details are only reported for players with 5+ wickets.
        if bowl_stats and bowl_stats["wickets"] >= 5:
            profile["bowler_type"] = "spin" if _is_spinner(name) else "pace"
            for field in bowling_fields:
                profile[field] = bowl_stats[field]
        profiles.append(profile)
    return profiles
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """
    Command-line entry point.

    ``--list-teams`` prints the known team names and returns. Otherwise each
    ``--team`` is fetched in turn and written to ``<out-dir>/<team>.json``.
    A failure for one team is reported on stderr and the remaining teams are
    still processed; if any team failed, the process exits with status 1 so
    callers and shell scripts can detect the failure.
    """
    import sys  # local import: only needed for stderr and the exit status

    parser = argparse.ArgumentParser(description="Fetch T20I player profiles from ESPNcricinfo")
    parser.add_argument("--team", action="append", dest="teams", metavar="TEAM",
                        help="Team name (repeatable). e.g. --team india --team australia")
    parser.add_argument("--list-teams", action="store_true", help="Print available team names and exit")
    parser.add_argument("--out-dir", default=_OUT_DIR, help="Output directory (default: data/player_profiles/)")
    args = parser.parse_args()

    if args.list_teams:
        print("Available teams:")
        for t in sorted(TEAM_IDS):
            print(f" {t}")
        return

    if not args.teams:
        parser.error("Provide at least one --team name, or use --list-teams")

    os.makedirs(args.out_dir, exist_ok=True)

    failed = []
    for team in args.teams:
        team_key = team.lower().replace(" ", "_")
        print(f"\n[{team_key}]")
        try:
            profiles = fetch_team_profiles(team_key)
            out_path = os.path.join(args.out_dir, f"{team_key}.json")
            # Explicit encoding so the output does not depend on the platform default.
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump({"team": team_key, "format": "T20I", "players": profiles}, f, indent=2)
        except Exception as e:
            # Top-level boundary: report and carry on with the other teams.
            print(f" ERROR: {e}", file=sys.stderr)
            failed.append(team_key)
        else:
            print(f" → {len(profiles)} players written to {out_path}")

    if failed:
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
# (A stray "|" that followed this guard was a syntax error and has been removed.)
if __name__ == "__main__":
    main()