| """Implements the VAEP framework. |
| |
| Attributes |
| ---------- |
| xfns_default : list(callable) |
| The default VAEP features. |
| |
| """ |
|
|
| import math |
| from typing import Any, Optional |
|
|
| import numpy as np |
| import pandas as pd |
| from sklearn.exceptions import NotFittedError |
| from sklearn.metrics import brier_score_loss, roc_auc_score |
|
|
| import socceraction.spadl as spadlcfg |
|
|
| from . import features as fs |
| from . import formula as vaep |
| from . import labels as lab |
|
|
# The three gradient-boosting backends are optional dependencies: each name is
# bound to its module when installed and to None otherwise. The ``_fit_*``
# helpers of :class:`VAEP` check for None and raise an informative ImportError.
try:
    import xgboost
except ImportError:
    xgboost = None
try:
    import catboost
except ImportError:
    catboost = None
try:
    import lightgbm
except ImportError:
    lightgbm = None
|
|
|
|
# Default set of feature transformers applied to each game state.
# The list order determines the column order of the feature dataframe
# produced by ``VAEP.compute_features`` (features are concatenated in order).
xfns_default = [
    fs.actiontype_onehot,
    fs.result_onehot,
    fs.actiontype_result_onehot,
    fs.bodypart_onehot,
    fs.time,
    fs.startlocation,
    fs.endlocation,
    fs.startpolar,
    fs.endpolar,
    fs.movement,
    fs.team,
    fs.time_delta,
    fs.space_delta,
    fs.goalscore,
]
|
|
|
|
class VAEP:
    """
    An implementation of the VAEP framework.

    VAEP (Valuing Actions by Estimating Probabilities) [1]_ defines the
    problem of valuing a soccer player's contributions within a match as
    a binary classification problem and rates actions by estimating its effect
    on the short-term probabilities that a team will both score and concede.

    Parameters
    ----------
    xfns : list
        List of feature transformers (see :mod:`socceraction.vaep.features`)
        used to describe the game states. Uses :attr:`~socceraction.vaep.base.xfns_default`
        if None.
    nb_prev_actions : int, default=3  # noqa: DAR103
        Number of previous actions used to describe the game state.

    References
    ----------
    .. [1] Tom Decroos, Lotte Bransen, Jan Van Haaren, and Jesse Davis.
        "Actions speak louder than goals: Valuing player actions in soccer." In
        Proceedings of the 25th ACM SIGKDD International Conference on Knowledge
        Discovery & Data Mining, pp. 1851-1861. 2019.
    """

    # Stored as class attributes so subclasses can substitute alternative
    # SPADL configurations, feature/label generators or value formulas.
    _spadlcfg = spadlcfg
    _fs = fs
    _lab = lab
    _vaep = vaep

    def __init__(
        self,
        xfns: Optional[list[fs.FeatureTransfomer]] = None,
        nb_prev_actions: int = 3,
    ) -> None:
        # Maps each label column name ('scores' / 'concedes') to its fitted
        # classifier; empty until ``fit`` is called.
        self.__models: dict[str, Any] = {}
        self.xfns = xfns_default if xfns is None else xfns
        self.yfns = [self._lab.scores, self._lab.concedes]
        self.nb_prev_actions = nb_prev_actions

    def compute_features(self, game: pd.Series, game_actions: fs.Actions) -> pd.DataFrame:
        """
        Transform actions to the feature-based representation of game states.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.

        Returns
        -------
        features : pd.DataFrame
            Returns the feature-based representation of each game state in the game.
        """
        game_actions_with_names = self._spadlcfg.add_names(game_actions)
        gamestates = self._fs.gamestates(game_actions_with_names, self.nb_prev_actions)
        # Normalize coordinates so both teams play in the same direction.
        gamestates = self._fs.play_left_to_right(gamestates, game.home_team_id)
        # Feature columns appear in the order of self.xfns.
        return pd.concat([fn(gamestates) for fn in self.xfns], axis=1)

    def compute_labels(
        self,
        game: pd.Series,
        game_actions: fs.Actions,
    ) -> pd.DataFrame:
        """
        Compute the labels for each game state in the given game.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : fs.Actions
            The actions performed during `game` in the SPADL representation.

        Returns
        -------
        labels : pd.DataFrame
            Returns the labels of each game state in the game.
        """
        game_actions_with_names = self._spadlcfg.add_names(game_actions)
        return pd.concat([fn(game_actions_with_names) for fn in self.yfns], axis=1)

    def fit(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        learner: str = "xgboost",
        val_size: float = 0.25,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "VAEP":
        """
        Fit the model according to the given training data.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.
        y : pd.DataFrame
            Scoring and conceding labels for each game state.
        learner : string, default='xgboost'  # noqa: DAR103
            Gradient boosting implementation which should be used to learn the
            model. The supported learners are 'xgboost', 'catboost' and 'lightgbm'.
        val_size : float, default=0.25  # noqa: DAR103
            Percentage of the dataset that will be used as the validation set
            for early stopping. When zero, no validation data will be used.
        tree_params : dict
            Parameters passed to the constructor of the learner.
        fit_params : dict
            Parameters passed to the fit method of the learner.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe, or
            if an unsupported learner is requested.

        Returns
        -------
        self
            Fitted VAEP model.
        """
        nb_states = len(X)
        idx = np.random.permutation(nb_states)
        # Hold out the last `val_size` fraction of the shuffled indices for
        # early stopping. BUG FIX: the original `idx[(nb_train + 1):]` slice
        # silently dropped the sample at position `nb_train` from both sets.
        nb_train = math.floor(nb_states * (1 - val_size))
        train_idx = idx[:nb_train]
        val_idx = idx[nb_train:]

        # Validate that all expected feature columns are present.
        cols = self._fs.feature_column_names(self.xfns, self.nb_prev_actions)
        if not set(cols).issubset(set(X.columns)):
            missing_cols = " and ".join(set(cols).difference(X.columns))
            raise ValueError(f"{missing_cols} are not available in the features dataframe")

        X_train, y_train = X.iloc[train_idx][cols], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx][cols], y.iloc[val_idx]

        # Train one binary classifier per label column ('scores', 'concedes').
        for col in list(y.columns):
            eval_set = [(X_val, y_val[col])] if val_size > 0 else None
            if learner == "xgboost":
                self.__models[col] = self._fit_xgboost(
                    X_train, y_train[col], eval_set, tree_params, fit_params
                )
            elif learner == "catboost":
                self.__models[col] = self._fit_catboost(
                    X_train, y_train[col], eval_set, tree_params, fit_params
                )
            elif learner == "lightgbm":
                self.__models[col] = self._fit_lightgbm(
                    X_train, y_train[col], eval_set, tree_params, fit_params
                )
            else:
                raise ValueError(f"A {learner} learner is not supported")
        return self

    def _fit_xgboost(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "xgboost.XGBClassifier":
        """Fit an XGBoost classifier, using `eval_set` for early stopping."""
        if xgboost is None:
            raise ImportError("xgboost is not installed.")
        if tree_params is None:
            tree_params = {
                "n_estimators": 100,
                "max_depth": 3,
                "eval_metric": "auc",
                "early_stopping_rounds": 10,
                "enable_categorical": True,
            }
        if fit_params is None:
            fit_params = {"verbose": True}
        if eval_set is not None:
            val_params = {"eval_set": eval_set}
            fit_params = {**fit_params, **val_params}
        model = xgboost.XGBClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _fit_catboost(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "catboost.CatBoostClassifier":
        """Fit a CatBoost classifier, using `eval_set` for early stopping."""
        if catboost is None:
            raise ImportError("catboost is not installed.")
        if tree_params is None:
            tree_params = {
                "eval_metric": "BrierScore",
                "loss_function": "Logloss",
                "iterations": 100,
            }
        if fit_params is None:
            # BUG FIX: DataFrame.iteritems() was removed in pandas 2.0;
            # DataFrame.items() is the supported, equivalent spelling.
            is_cat_feature = [c.dtype.name == "category" for (_, c) in X.items()]
            fit_params = {
                "cat_features": np.nonzero(is_cat_feature)[0].tolist(),
                "verbose": True,
            }
        if eval_set is not None:
            val_params = {"early_stopping_rounds": 10, "eval_set": eval_set}
            fit_params = {**fit_params, **val_params}
        model = catboost.CatBoostClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _fit_lightgbm(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "lightgbm.LGBMClassifier":
        """Fit a LightGBM classifier, using `eval_set` for early stopping."""
        if lightgbm is None:
            raise ImportError("lightgbm is not installed.")
        if tree_params is None:
            tree_params = {"n_estimators": 100, "max_depth": 3}
        if fit_params is None:
            fit_params = {"eval_metric": "auc", "verbose": True}
        if eval_set is not None:
            val_params = {"early_stopping_rounds": 10, "eval_set": eval_set}
            fit_params = {**fit_params, **val_params}
        model = lightgbm.LGBMClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _estimate_probabilities(self, X: pd.DataFrame) -> pd.DataFrame:
        """Predict the positive-class probability for each fitted model.

        Returns a dataframe with one column per label ('scores', 'concedes').
        """
        cols = self._fs.feature_column_names(self.xfns, self.nb_prev_actions)
        if not set(cols).issubset(set(X.columns)):
            missing_cols = " and ".join(set(cols).difference(X.columns))
            raise ValueError(f"{missing_cols} are not available in the features dataframe")

        Y_hat = pd.DataFrame()
        for col in self.__models:
            # predict_proba returns (n, 2); keep the positive-class column.
            Y_hat[col] = [p[1] for p in self.__models[col].predict_proba(X[cols])]
        return Y_hat

    def rate(
        self,
        game: pd.Series,
        game_actions: fs.Actions,
        game_states: Optional[fs.Features] = None,
    ) -> pd.DataFrame:
        """
        Compute the VAEP rating for the given game states.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : fs.Actions
            The actions performed during `game` in the SPADL representation.
        game_states : pd.DataFrame, default=None
            DataFrame with the game state representation of each action. If
            `None`, these will be computed on-the-fly.

        Raises
        ------
        NotFittedError
            If the model is not fitted yet.

        Returns
        -------
        ratings : pd.DataFrame
            Returns the VAEP rating for each given action, as well as the
            offensive and defensive value of each action.
        """
        if not self.__models:
            raise NotFittedError()

        game_actions_with_names = self._spadlcfg.add_names(game_actions)
        if game_states is None:
            game_states = self.compute_features(game, game_actions)

        y_hat = self._estimate_probabilities(game_states)
        p_scores, p_concedes = y_hat.scores, y_hat.concedes
        vaep_values = self._vaep.value(game_actions_with_names, p_scores, p_concedes)
        return vaep_values

    def score(self, X: pd.DataFrame, y: pd.DataFrame) -> dict[str, dict[str, float]]:
        """Evaluate the fit of the model on the given test data and labels.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.
        y : pd.DataFrame
            Scoring and conceding labels for each game state.

        Raises
        ------
        NotFittedError
            If the model is not fitted yet.

        Returns
        -------
        score : dict
            The Brier and AUROC scores for both binary classification problems.
        """
        if not self.__models:
            raise NotFittedError()

        y_hat = self._estimate_probabilities(X)

        scores: dict[str, dict[str, float]] = {}
        for col in self.__models:
            scores[col] = {}
            scores[col]["brier"] = brier_score_loss(y[col], y_hat[col])
            scores[col]["auroc"] = roc_auc_score(y[col], y_hat[col])

        return scores
|
|