| import pandas as pd |
| from sklearn.tree import DecisionTreeClassifier |
| from sklearn.model_selection import cross_val_score |
| import numpy as np |
|
|
| |
|
|
| |
| |
| |
class ChiMerge():
    """
    Supervised discretization of a single column using the ChiMerge method.

    ChiMerge starts with one interval per distinct value of `col` and
    repeatedly merges the pair of adjacent intervals with the smallest
    chi-square statistic until the stopping condition (number of bins and
    minimum chi-square) is satisfied.

    Parameters
    ----------
    col: str
        the column to be performed
    bins: list of number, optional
        pre-computed bin edges; normally left as None and learned by `fit`
    confidenceVal: number
        default=3.841, correspond to p=0.05 dof=1
    num_of_bins: int
        number of bins after discretize
    """

    def __init__(self, col=None, bins=None, confidenceVal=3.841, num_of_bins=10):
        self.col = col
        self._dim = None  # column count seen at fit time; doubles as "is fitted" flag
        self.confidenceVal = confidenceVal
        self.bins = bins
        self.num_of_bins = num_of_bins

    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and y.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.
            NOTE(review): `chimerge` uses `y` as a groupby key
            (`X.groupby([col])[y]`), so in practice `y` must be the NAME of
            the target column inside X, not an array — confirm with callers.
        Returns
        -------
        self : encoder
            Returns self.
        """

        self._dim = X.shape[1]

        # Learn the bin edges; the frame returned by chimerge is discarded,
        # fit only stores the edges for later transform() calls.
        _, bins = self.chimerge(
            X_in=X,
            y=y,
            confidenceVal=self.confidenceVal,
            col=self.col,
            num_of_bins=self.num_of_bins
        )
        self.bins = bins
        return self

    def transform(self, X):
        """Perform the transformation to new data.
        Will use the learned bin edges to discretize the column, adding a
        new column named `<col>_chimerge`.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        Returns
        -------
        X : new dataframe with discretized new column.
        """

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # input must have the same number of columns as at fit time
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        X, _ = self.chimerge(
            X_in=X,
            col=self.col,
            bins=self.bins
        )

        return X

    def chimerge(self, X_in, y=None, confidenceVal=None, num_of_bins=None, col=None, bins=None):
        """
        discretize a variable using ChiMerge

        Two modes:
        - `bins` given (transform path): cut X[col] into those edges.
        - `bins` is None (fit path): learn the edges from X and the target
          column named by `y`, printing a per-interval report.

        Returns
        -------
        (X, bins) : copy of the input frame (augmented with the new column
            when cutting succeeded) and the list of bin edges. In fit mode
            `bins` may still be None if an exception was swallowed below.
        """

        # work on a deep copy so the caller's frame is never mutated
        X = X_in.copy(deep=True)

        if bins is not None:
            # transform mode: apply previously learned edges
            try:
                X[col+'_chimerge'] = pd.cut(X[col],bins=bins,include_lowest=True)
            except Exception as e:
                # NOTE(review): the error is only printed; X is returned
                # without the new column in that case
                print(e)

        else:
            try:
                # Per distinct value of `col`: total count and positive count.
                # Assumes the target column is 0/1 so that sum() counts positives.
                total_num = X.groupby([col])[y].count()
                total_num = pd.DataFrame({'total_num': total_num})
                positive_class = X.groupby([col])[y].sum()
                positive_class = pd.DataFrame({'positive_class': positive_class})
                regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True,how='inner')
                regroup.reset_index(inplace=True)
                regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']
                regroup = regroup.drop('total_num', axis=1)
                # np_regroup columns: 0 = value of `col` (upper bound of the
                # interval), 1 = positive count, 2 = negative count
                np_regroup = np.array(regroup)

                # Pre-merge: fuse adjacent rows that are both pure in the same
                # class (their chi-square would be 0/0 otherwise).
                i = 0
                while (i <= np_regroup.shape[0] - 2):
                    if ((np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0) or ( np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0)):
                        np_regroup[i, 1] = np_regroup[i, 1] + np_regroup[i + 1, 1]
                        np_regroup[i, 2] = np_regroup[i, 2] + np_regroup[i + 1, 2]
                        np_regroup[i, 0] = np_regroup[i + 1, 0]
                        np_regroup = np.delete(np_regroup, i + 1, 0)
                        # step back so the merged row is re-checked against its
                        # new right neighbour
                        i = i - 1
                    i = i + 1

                # chi_table[i] = chi-square statistic of the 2x2 contingency
                # table formed by adjacent intervals i and i+1
                chi_table = np.array([])
                for i in np.arange(np_regroup.shape[0] - 1):
                    chi = (np_regroup[i, 1] * np_regroup[i + 1, 2] - np_regroup[i, 2] * np_regroup[i + 1, 1]) ** 2 \
                        * (np_regroup[i, 1] + np_regroup[i, 2] + np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) / \
                        ((np_regroup[i, 1] + np_regroup[i, 2]) * (np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) * (
                            np_regroup[i, 1] + np_regroup[i + 1, 1]) * (np_regroup[i, 2] + np_regroup[i + 1, 2]))
                    chi_table = np.append(chi_table, chi)

                # Main merge loop: keep merging the pair with the smallest
                # chi-square until BOTH the bin-count condition and the
                # significance condition hold.
                while (1):
                    if (len(chi_table) <= (num_of_bins - 1) and min(chi_table) >= confidenceVal):
                        break
                    # index of the (first) minimal chi-square; merge row
                    # chi_min_index+1 into row chi_min_index
                    chi_min_index = np.argwhere(chi_table == min(chi_table))[0]
                    np_regroup[chi_min_index, 1] = np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]
                    np_regroup[chi_min_index, 2] = np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]
                    np_regroup[chi_min_index, 0] = np_regroup[chi_min_index + 1, 0]
                    np_regroup = np.delete(np_regroup, chi_min_index + 1, 0)

                    if (chi_min_index == np_regroup.shape[0] - 1):
                        # merged pair sat at the right edge: only the chi value
                        # on its left needs recomputation
                        chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
                            * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
                            ((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
                        chi_table = np.delete(chi_table, chi_min_index, axis=0)

                    else:
                        # recompute chi against the left neighbour ...
                        chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
                            * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
                            ((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
                        # ... and against the new right neighbour
                        chi_table[chi_min_index] = (np_regroup[chi_min_index, 1] * np_regroup[chi_min_index + 1, 2] - np_regroup[chi_min_index, 2] * np_regroup[chi_min_index + 1, 1]) ** 2 \
                            * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) / \
                            ((np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]) * (np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]))
                        chi_table = np.delete(chi_table, chi_min_index + 1, axis=0)

                # Build the printed per-interval report and the edge list.
                result_data = pd.DataFrame()
                result_data['variable'] = [col] * np_regroup.shape[0]
                bins = []
                tmp = []
                for i in np.arange(np_regroup.shape[0]):
                    if i == 0:
                        # NOTE: this rebinds `y` (the target column name
                        # parameter) to the interval label string — safe only
                        # because `y` is not used as a column key after this
                        y = '-inf' + ',' + str(np_regroup[i, 0])

                    elif i == np_regroup.shape[0] - 1:
                        y = str(np_regroup[i - 1, 0]) + '+'

                    else:
                        y = str(np_regroup[i - 1, 0]) + ',' + str(np_regroup[i, 0])

                    # At i == 0 this appends np_regroup[-1, 0], i.e. the LAST
                    # interval's upper value — after sorting it becomes the
                    # top edge, so the wrap-around is intentional.
                    bins.append(np_regroup[i - 1, 0])
                    tmp.append(y)

                # bottom edge: just below the observed minimum so pd.cut
                # keeps the minimum value inside the first bin
                bins.append(X[col].min()-0.1)

                result_data['interval'] = tmp
                result_data['flag_0'] = np_regroup[:, 2]
                result_data['flag_1'] = np_regroup[:, 1]
                bins.sort(reverse=False)
                print('Interval for variable %s' % col)
                print(result_data)

            except Exception as e:
                # NOTE(review): any failure above is swallowed and printed;
                # `bins` is then returned as None and a later transform()
                # will fail inside pd.cut
                print(e)

        return X, bins
| |
| |
| |
| |
| |
class DiscretizeByDecisionTree():
    """
    Discretisation with Decision Trees consists of using a decision tree
    to identify the optimal splitting points that would determine the bins
    or contiguous intervals:

    1.train a decision tree of limited depth (2, 3 or 4) using the variable
    we want to discretise to predict the target.
    2.the original variable values are then replaced by the
    probability returned by the tree.

    Parameters
    ----------
    col: str
        column to discretise
    max_depth: int or list of int
        max depth of the tree. Can be an int or a list of int we want the tree model to search
        for the optimal depth.
    tree_model: fitted DecisionTreeClassifier, optional
        a pre-fitted tree; normally left as None and set by `fit`.
    """

    def __init__(self, col=None, max_depth=None, tree_model=None):
        self.col = col
        self._dim = None  # column count seen at fit time; doubles as "is fitted" flag
        self.max_depth = max_depth
        self.tree_model = tree_model

    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and y.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.
        Returns
        -------
        self : encoder
            Returns self.
        """

        self._dim = X.shape[1]

        # Train (or depth-search) the tree on the single column; the frame
        # returned by discretize is discarded here, fit only keeps the model.
        _, tree = self.discretize(
            X_in=X,
            y=y,
            max_depth=self.max_depth,
            col=self.col,
            tree_model=self.tree_model
        )
        self.tree_model = tree
        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.
        Will use the tree model and the column list to discretize the
        column, adding a new column named `<col>_tree_discret`.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        Returns
        -------
        X : new dataframe with discretized new column.
        """

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # input must have the same number of columns as at fit time
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        X, _ = self.discretize(
            X_in=X,
            col=self.col,
            tree_model=self.tree_model
        )

        return X

    def discretize(self, X_in, y=None, max_depth=None, tree_model=None, col=None):
        """
        discretize a variable using DecisionTreeClassifier

        Two modes:
        - `tree_model` given (transform path): replace the column values by
          the tree's predicted probability of class 1.
        - `tree_model` is None (fit path): train a tree of depth `max_depth`,
          or search a list of depths by 3-fold cross-validated ROC-AUC.

        Returns
        -------
        (X, tree_model) : copy of the input frame (augmented in transform
            mode) and the fitted tree.

        Raises
        ------
        ValueError
            if `max_depth` is neither an int nor a non-empty list/tuple.
        """

        # work on a deep copy so the caller's frame is never mutated
        X = X_in.copy(deep=True)

        if tree_model is not None:
            # transform mode: P(class=1) from the already-fitted tree
            X[col+'_tree_discret'] = tree_model.predict_proba(X[col].to_frame())[:,1]

        else:
            if isinstance(max_depth, int):
                # single fixed depth: just fit
                tree_model = DecisionTreeClassifier(max_depth=max_depth)
                tree_model.fit(X[col].to_frame(), y)

            # BUG FIX: was `len(max_depth) > 1`, which wrongly rejected a
            # one-element depth list and raised TypeError for None.
            elif isinstance(max_depth, (list, tuple)) and len(max_depth) >= 1:
                # search the candidate depths by cross-validated ROC-AUC
                score_ls = []
                score_std_ls = []
                for tree_depth in max_depth:
                    tree_model = DecisionTreeClassifier(max_depth=tree_depth)
                    scores = cross_val_score(tree_model, X[col].to_frame(), y, cv=3, scoring='roc_auc')
                    score_ls.append(np.mean(scores))
                    score_std_ls.append(np.std(scores))
                temp = pd.concat([pd.Series(max_depth), pd.Series(score_ls), pd.Series(score_std_ls)], axis=1)
                temp.columns = ['depth', 'roc_auc_mean', 'roc_auc_std']
                print('result ROC-AUC for each depth')
                print(temp)
                max_roc = temp.roc_auc_mean.max()
                # BUG FIX: `.values` is an ndarray (possibly several tied
                # depths) and DecisionTreeClassifier requires a scalar
                # max_depth — take the first best depth as a plain int.
                optimal_depth = int(temp.loc[temp.roc_auc_mean == max_roc, 'depth'].values[0])
                print('optimal_depth:', optimal_depth)
                tree_model = DecisionTreeClassifier(max_depth=optimal_depth)
                tree_model.fit(X[col].to_frame(), y)

            else:
                raise ValueError('max_depth of a tree must be an integer or a list')

        return X, tree_model
|
|
|
|
|
|