recommender-system
/

feature-engineering-guide

Feature Engineering

Model card Files Files and versions

feature-engineering-guide / feature_selection /feature_shuffle.py

aitek230telu's picture

Upload 52 files

0ab7b0c verified over 1 year ago

history blame contribute delete

1.7 kB

	import pandas as pd
	#import numpy as np


	from sklearn.ensemble import RandomForestClassifier #, RandomForestRegressor
	from sklearn.metrics import roc_auc_score #, mean_squared_error

	# 2018.11.28 Created by Eamon.Zhang


	def feature_shuffle_rf(X_train, y_train, max_depth=None, class_weight=None,
	                       top_n=15, n_estimators=50, random_state=0):
	    """Rank features by the drop in training ROC-AUC when each is shuffled.

	    A ``RandomForestClassifier`` is fitted once on ``X_train``/``y_train``.
	    Then, one feature at a time, that column's values are randomly permuted,
	    the fitted model re-scores the modified data, and the difference between
	    the baseline ROC-AUC and the shuffled ROC-AUC is stored for the feature.

	    Parameters
	    ----------
	    X_train : pandas.DataFrame
	        Training features; each column is evaluated in turn.
	    y_train : array-like
	        Training labels. Scores are taken from column 1 of
	        ``predict_proba``, so a binary target is presumably expected
	        (NOTE(review): confirm against callers).
	    max_depth, class_weight, n_estimators
	        Passed straight through to ``RandomForestClassifier``.
	    top_n : int, default 15
	        Not used by this function; kept so existing callers that pass it
	        keep working.
	    random_state : int or None, default 0
	        Seeds the forest and every per-feature shuffle. Because the same
	        value is passed to each ``sample`` call, a fixed seed reproduces
	        the run.

	    Returns
	    -------
	    auc_drop : pandas.DataFrame
	        Columns ``'feature'`` and ``'auc_drop'``, sorted by ``'auc_drop'``
	        in descending order.
	    selected_features : pandas.Series
	        The ``'feature'`` values whose ``'auc_drop'`` is strictly positive.
	    """
	    model = RandomForestClassifier(
	        n_estimators=n_estimators,
	        max_depth=max_depth,
	        random_state=random_state,
	        class_weight=class_weight,
	        n_jobs=-1,
	    )
	    model.fit(X_train, y_train)
	    train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])

	    # Resetting the index is loop-invariant, so do it once. A clean
	    # 0..n-1 index is needed so the shuffled column (whose index is also
	    # reset) is assigned back positionally rather than re-aligned by label.
	    X_base = X_train.reset_index(drop=True)

	    feature_dict = {}
	    for feature in X_base.columns:
	        # Work on a fresh copy so only the current feature is perturbed.
	        X_shuffled = X_base.copy()
	        X_shuffled[feature] = (
	            X_base[feature]
	            .sample(frac=1, random_state=random_state)
	            .reset_index(drop=True)
	        )

	        # Row order of the labels is untouched, so y_train can be scored
	        # directly against the predictions (compared by position).
	        shuff_auc = roc_auc_score(
	            y_train, model.predict_proba(X_shuffled)[:, 1]
	        )

	        # Positive value => the model relied on this feature.
	        feature_dict[feature] = train_auc - shuff_auc

	    # Build the table explicitly instead of via Series.reset_index() so
	    # tuple column labels do not expand into several index columns.
	    auc_drop = pd.DataFrame({
	        'feature': list(feature_dict.keys()),
	        'auc_drop': list(feature_dict.values()),
	    })
	    auc_drop = auc_drop.sort_values(by=['auc_drop'], ascending=False)
	    selected_features = auc_drop.loc[auc_drop['auc_drop'] > 0, 'feature']

	    return auc_drop, selected_features