| """Script for downloading test data.""" |
|
|
| import argparse |
| import logging |
| import os |
| import shutil |
| import ssl |
| import warnings |
| from pathlib import Path |
| from urllib.parse import urlparse |
| from urllib.request import urlopen, urlretrieve |
| from zipfile import ZipFile, is_zipfile |
|
|
| import pandas as pd |
|
|
| try: |
| from tqdm import tqdm |
| except ImportError: |
| tqdm = None |
|
|
| import socceraction.atomic.spadl as atomicspadl |
| import socceraction.spadl as spadl |
| import socceraction.spadl.statsbomb as statsbomb |
| import socceraction.spadl.wyscout as wyscout |
| from socceraction.data.statsbomb import StatsBombLoader |
| from socceraction.data.wyscout import PublicWyscoutLoader |
|
|
# Silence noisy pandas PerformanceWarnings raised while writing mixed-dtype
# frames to the HDF5 stores below, and the StatsBomb "open data access only"
# notice emitted when no API credentials are configured.
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
warnings.filterwarnings(
    action="ignore", message="credentials were not supplied. open data access only"
)

# NOTE(review): this disables TLS certificate verification for *every* HTTPS
# request made through urllib in this process. Presumably a workaround for
# certificate issues on the download hosts -- confirm it is still required,
# as it is insecure.
ssl._create_default_https_context = ssl._create_unverified_context

# Directory containing this script; all datasets are stored relative to it.
_data_dir = os.path.dirname(__file__)
|
|
|
|
def download_statsbomb_data() -> None:
    """Download and extract the StatsBomb open data repository.

    The master branch of the repository is downloaded as a zip archive into a
    temporary folder, extracted, and its ``data`` directory is moved to
    ``<_data_dir>/statsbomb/raw``. Any previously downloaded data in that
    folder is replaced.
    """
    logging.info("Downloading StatsBomb data")
    dataset_url = "https://github.com/statsbomb/open-data/archive/master.zip"

    tmp_datafolder = os.path.join(_data_dir, "statsbomb", "tmp")
    raw_datafolder = os.path.join(_data_dir, "statsbomb", "raw")
    for datafolder in [tmp_datafolder, raw_datafolder]:
        # exist_ok makes a prior existence check redundant.
        os.makedirs(datafolder, exist_ok=True)
    statsbombzip = os.path.join(tmp_datafolder, "statsbomb-open-data.zip")

    # Stream the archive to disk in chunks instead of buffering the whole
    # (very large) zip file in memory with .read().
    with urlopen(dataset_url) as dl_file, open(statsbombzip, "wb") as out_file:
        shutil.copyfileobj(dl_file, out_file)

    with ZipFile(statsbombzip, "r") as zipObj:
        zipObj.extractall(tmp_datafolder)

    # Replace any previously downloaded data with the fresh extraction.
    shutil.rmtree(raw_datafolder)
    Path(f"{tmp_datafolder}/open-data-master/data").rename(raw_datafolder)
    shutil.rmtree(tmp_datafolder)
    logging.info("Done! Data was saved to %s", raw_datafolder)
|
|
|
|
def convert_statsbomb_data() -> None:
    """Convert StatsBomb data to SPADL.

    Loads the games of the selected competitions from the remote StatsBomb
    open-data repository, converts each game's events to SPADL actions, and
    stores games, teams, players and actions in one
    ``spadl-<league>-<season>.h5`` HDF5 file per competition under
    ``<_data_dir>/statsbomb``.
    """
    logging.info("Converting StatsBomb data")
    # StatsBomb season/competition identifiers -> labels used in the output
    # file name and written into the stored 'games' table.
    seasons = {
        3: "2018",
    }
    leagues = {
        "FIFA World Cup": "WorldCup",
    }
    spadl_datafolder = os.path.join(_data_dir, "statsbomb")

    free_open_data_remote = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"

    SBL = StatsBombLoader(root=free_open_data_remote, getter="remote")

    # Select the competitions and seasons to convert.
    df_competitions = SBL.competitions()
    selected_competitions = df_competitions.competition_name.isin(leagues.keys())
    selected_seasons = df_competitions.season_id.isin(seasons.keys())
    df_selected_competitions = df_competitions.loc[selected_competitions & selected_seasons]

    for competition in df_selected_competitions.itertuples():
        # All games of the current competition + season.
        games = SBL.games(competition.competition_id, competition.season_id)

        # Show a progress bar when tqdm is available.
        if tqdm is not None:
            games_verbose = tqdm(list(games.itertuples()), desc="Loading match data")
        else:
            games_verbose = games.itertuples()
        teams, players = [], []

        competition_id = leagues[competition.competition_name]
        season_id = seasons[competition.season_id]
        spadl_h5 = os.path.join(spadl_datafolder, f"spadl-{competition_id}-{season_id}.h5")
        with pd.HDFStore(spadl_h5) as spadlstore:
            # Static SPADL lookup tables.
            spadlstore.put("actiontypes", spadl.actiontypes_df(), format="table")
            spadlstore.put("results", spadl.results_df(), format="table")
            spadlstore.put("bodyparts", spadl.bodyparts_df(), format="table")

            for game in games_verbose:
                # Load the raw data for this game.
                teams.append(SBL.teams(game.game_id))
                players.append(SBL.players(game.game_id))
                events = SBL.events(game.game_id)

                # Convert the events to SPADL actions, stored per game.
                spadlstore.put(
                    f"actions/game_{game.game_id}",
                    statsbomb.convert_to_actions(events, game.home_team_id),
                    format="table",
                )

            # Replace the numeric ids with the human-readable labels before
            # storing the games table; teams and players are deduplicated
            # across games.
            games.season_id = season_id
            games.competition_id = competition_id
            spadlstore.put("games", games)
            spadlstore.put(
                "teams",
                pd.concat(teams).drop_duplicates("team_id").reset_index(drop=True),
            )
            spadlstore.put(
                "players",
                pd.concat(players).drop_duplicates("player_id").reset_index(drop=True),
            )
    logging.info("Done! Data was saved to %s", spadl_datafolder)
|
|
|
|
def download_wyscout_data() -> None:
    """Download and extract the Wyscout public dataset.

    Each figshare file is downloaded to ``<_data_dir>/wyscout_public/raw``
    under the file name of its final (redirected) URL; zip archives are
    extracted in place.
    """
    logging.info("Downloading Wyscout data")
    # Files of the public Wyscout soccer-logs dataset hosted on figshare.
    dataset_urls = {
        "competitions": "https://ndownloader.figshare.com/files/15073685",
        "teams": "https://ndownloader.figshare.com/files/15073697",
        "players": "https://ndownloader.figshare.com/files/15073721",
        "games": "https://ndownloader.figshare.com/files/14464622",
        "events": "https://ndownloader.figshare.com/files/14464685",
    }

    raw_datafolder = os.path.join(_data_dir, "wyscout_public", "raw")
    # exist_ok makes a prior existence check redundant.
    os.makedirs(raw_datafolder, exist_ok=True)

    # Download and extract the data files.
    for url in dataset_urls.values():
        # A single request per file: figshare redirects to the actual file,
        # whose name is taken from the final URL. (The previous version opened
        # the URL once just to resolve the redirect and then fetched it a
        # second time with the legacy urlretrieve interface.)
        with urlopen(url) as response:
            path = Path(urlparse(response.geturl()).path)
            file_name = os.path.join(raw_datafolder, path.name)
            with open(file_name, "wb") as out_file:
                shutil.copyfileobj(response, out_file)
        if is_zipfile(file_name):
            with ZipFile(file_name) as zip_file:
                zip_file.extractall(raw_datafolder)
    logging.info("Done! Data was saved to %s", raw_datafolder)
|
|
|
|
def convert_wyscout_data() -> None:
    """Convert Wyscout data to SPADL.

    Reads the raw public Wyscout data from ``<_data_dir>/wyscout_public/raw``
    for the selected competitions, converts each game's events to SPADL
    actions, and stores games, teams, players and actions in one
    ``spadl-<league>-<season>.h5`` HDF5 file per competition under
    ``<_data_dir>/wyscout_public``.
    """
    logging.info("Converting Wyscout data")
    # Wyscout season/competition identifiers -> labels used in the output
    # file name and written into the stored 'games' table.
    seasons = {
        10078: "2018",
    }
    leagues = {
        28: "WorldCup",
    }

    raw_datafolder = os.path.join(_data_dir, "wyscout_public", "raw")
    spadl_datafolder = os.path.join(_data_dir, "wyscout_public")

    WYL = PublicWyscoutLoader(root=raw_datafolder)

    # Select the competitions to convert.
    df_competitions = WYL.competitions()
    selected_competitions = df_competitions.competition_id.isin(leagues.keys())
    df_selected_competitions = df_competitions.loc[selected_competitions]

    for competition in df_selected_competitions.itertuples():
        # All games of the current competition + season.
        games = WYL.games(competition.competition_id, competition.season_id)

        # Show a progress bar when tqdm is available.
        if tqdm is not None:
            games_verbose = tqdm(list(games.itertuples()), desc="Loading match data")
        else:
            games_verbose = games.itertuples()
        teams, players = [], []

        competition_id = leagues[competition.competition_id]
        season_id = seasons[competition.season_id]
        spadl_h5 = os.path.join(spadl_datafolder, f"spadl-{competition_id}-{season_id}.h5")
        with pd.HDFStore(spadl_h5) as spadlstore:
            # Static SPADL lookup tables.
            spadlstore.put("actiontypes", spadl.actiontypes_df(), format="table")
            spadlstore.put("results", spadl.results_df(), format="table")
            spadlstore.put("bodyparts", spadl.bodyparts_df(), format="table")

            for game in games_verbose:
                # Load the raw data for this game.
                teams.append(WYL.teams(game.game_id))
                players.append(WYL.players(game.game_id))
                events = WYL.events(game.game_id)

                # Convert the events to SPADL actions, stored per game in
                # 'table' format for consistency with convert_statsbomb_data
                # (previously the default 'fixed' format was used here).
                spadlstore.put(
                    f"actions/game_{game.game_id}",
                    wyscout.convert_to_actions(events, game.home_team_id),
                    format="table",
                )

            # Replace the numeric ids with the human-readable labels before
            # storing the games table; teams and players are deduplicated
            # across games.
            games.season_id = season_id
            games.competition_id = competition_id
            spadlstore.put("games", games)
            spadlstore.put(
                "teams",
                pd.concat(teams).drop_duplicates("team_id").reset_index(drop=True),
            )
            spadlstore.put(
                "players",
                pd.concat(players).drop_duplicates("player_id").reset_index(drop=True),
            )
    logging.info("Done! Data was saved to %s", spadl_datafolder)
|
|
|
|
def _head_per_period(df_actions: pd.DataFrame, n: int = 200) -> pd.DataFrame:
    """Return the first *n* actions of the first and second half of a game."""
    return pd.concat(
        [
            df_actions[df_actions.period_id == 1].head(n=n),
            df_actions[df_actions.period_id == 2].head(n=n),
        ]
    )


def create_spadl(game_id: int, home_team_id: int) -> None:
    """Create SPADL actions from StatsBomb data for a given game.

    Downloads the events of the given game from the remote StatsBomb open-data
    repository, converts them to SPADL and Atomic-SPADL, and writes a sample
    of the first 200 actions of each half to ``<_data_dir>/spadl/spadl.json``
    and ``<_data_dir>/spadl/atomic_spadl.json``.

    Parameters
    ----------
    game_id : int
        The StatsBomb identifier of the game to convert.
    home_team_id : int
        The StatsBomb identifier of that game's home team.
    """
    logging.info("Creating SPADL data")
    spadl_datafolder = os.path.join(_data_dir, "spadl")
    # exist_ok makes a prior existence check redundant.
    os.makedirs(spadl_datafolder, exist_ok=True)

    # Load the game's events from the remote StatsBomb open-data repository.
    free_open_data_remote = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"
    SBL = StatsBombLoader(root=free_open_data_remote, getter="remote")
    events = SBL.events(game_id)

    # Convert to SPADL and store a sample of each half.
    spadl_json = os.path.join(spadl_datafolder, "spadl.json")
    df_actions = statsbomb.convert_to_actions(events, home_team_id)
    _head_per_period(df_actions).to_json(spadl_json, orient="records")

    # Convert to Atomic-SPADL and store a sample of each half.
    atomic_spadl_json = os.path.join(spadl_datafolder, "atomic_spadl.json")
    df_atomic_actions = atomicspadl.convert_to_atomic(df_actions)
    _head_per_period(df_atomic_actions).to_json(atomic_spadl_json, orient="records")
    logging.info("Done! SPADL data was saved to %s and %s", spadl_json, atomic_spadl_json)
|
|
|
|
if __name__ == "__main__":
    # Show the informational progress messages emitted by the steps below.
    logging.basicConfig(level=logging.INFO)

    # Command-line interface: each preprocessing step can be selected
    # individually; running the script without options executes the default
    # test setup (both raw-data downloads, no conversions).
    my_parser = argparse.ArgumentParser(
        prog="download",
        usage="%(prog)s [options]",
        formatter_class=argparse.RawTextHelpFormatter,
        description="""Download and prepare the data needed for running the tests.

        Use the options specified below to select specific preprocessing
        steps. When this script is run without any options, all preprocessing
        steps required to run the default test setup will be executed.
        """,
    )

    # One boolean flag per preprocessing step.
    my_parser.add_argument(
        "--download-statsbomb",
        action="store_true",
        help="Download the public StatsBomb data.",
    )
    my_parser.add_argument(
        "--convert-statsbomb",
        action="store_true",
        help="Convert the public StatsBomb data to SPADL.",
    )
    my_parser.add_argument(
        "--download-wyscout",
        action="store_true",
        help="Download the public Wyscout data.",
    )
    my_parser.add_argument(
        "--convert-wyscout",
        action="store_true",
        help="Convert the public Wyscout data to SPADL.",
    )
    my_parser.add_argument(
        "--spadl",
        action="store_true",
        help="Create a JSON file with example SPADL and Atomic-SPADL data.",
    )

    # True when the script was invoked without any option; used to trigger
    # the default steps below.
    args = my_parser.parse_args()
    no_options = not any(
        [
            args.download_statsbomb,
            args.convert_statsbomb,
            args.download_wyscout,
            args.convert_wyscout,
            args.spadl,
        ]
    )

    # The raw-data downloads run by default; conversions and the SPADL
    # example only run when explicitly requested.
    if args.download_statsbomb or no_options:
        download_statsbomb_data()
    if args.convert_statsbomb:
        convert_statsbomb_data()
    if args.download_wyscout or no_options:
        download_wyscout_data()
    if args.convert_wyscout:
        convert_wyscout_data()
    if args.spadl:
        # Hard-coded example game: StatsBomb game 8657 with home team 777.
        create_spadl(8657, 777)
|