| |
| import yaml |
| from joblib import dump, load |
| import pandas as pd |
| from sklearn.model_selection import train_test_split, cross_val_score |
| from sklearn.metrics import accuracy_score, confusion_matrix, classification_report |
| |
| from sklearn.naive_bayes import MultinomialNB |
| |
| from sklearn.tree import DecisionTreeClassifier |
| |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier |
| import seaborn as sn |
| import matplotlib.pyplot as plt |
| from concrete.ml.sklearn import DecisionTreeClassifier as ConcreteDecisionTreeClassifier |
| from concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier |
|
|
|
|
class DiseasePrediction:
    """Train, evaluate and persist classifiers that predict a disease
    ('prognosis') from 132 binary symptom indicator features.

    Configuration (dataset paths, model hyper-parameters, verbosity and the
    model save path) is read from ./config.yaml at construction time.
    """

    def __init__(self, model_name=None):
        """Load the config, the train/test datasets and save a feature
        correlation heatmap.

        :param model_name: key selecting the classifier in select_model()
            ('mnb', 'decision_tree' or 'gradient_boost').
        :raises RuntimeError: if the config file cannot be read or parsed.
        """
        try:
            with open('./config.yaml', 'r') as f:
                self.config = yaml.safe_load(f)
        except Exception as e:
            # Fail fast: the original only printed here, then crashed below
            # with an opaque AttributeError on the missing self.config.
            print("Error reading Config file...")
            raise RuntimeError("Could not read ./config.yaml") from e

        # Verbosity flag used by the loaders and train_model().
        self.verbose = self.config['verbose']
        # Keep features/labels plus the raw frames (needed for the heatmap).
        self.train_features, self.train_labels, self.train_df = self._load_train_dataset()
        self.test_features, self.test_labels, self.test_df = self._load_test_dataset()
        self._feature_correlation(data_frame=self.train_df, show_fig=False)
        self.model_name = model_name
        self.model_save_path = self.config['model_save_path']

    def _load_train_dataset(self):
        """Read the training CSV and split it into features and labels.

        The last TWO columns are dropped from the features: the 'prognosis'
        label plus a trailing unnamed/empty column present in the training
        CSV (the test CSV has no such extra column — see _load_test_dataset).

        :return: (features DataFrame, labels Series, full DataFrame)
        :raises ValueError: if the expected 132 symptom columns are missing
            or features and labels disagree in length.
        """
        df_train = pd.read_csv(self.config['dataset']['training_data_path'])
        train_features = df_train[df_train.columns[:-2]]
        train_labels = df_train['prognosis']

        # Explicit validation instead of assert: asserts vanish under -O.
        if train_features.shape[1] != 132:
            raise ValueError(
                "Expected 132 training feature columns, got {0}".format(train_features.shape[1]))
        if len(train_labels) != train_features.shape[0]:
            raise ValueError("Training features/labels length mismatch")

        if self.verbose:
            print("Length of Training Data: ", df_train.shape)
            print("Training Features: ", train_features.shape)
            print("Training Labels: ", train_labels.shape)
        return train_features, train_labels, df_train

    def _load_test_dataset(self):
        """Read the test CSV and split it into features and labels.

        Only the last column ('prognosis') is dropped from the features.

        :return: (features DataFrame, labels Series, full DataFrame)
        :raises ValueError: if the expected 132 symptom columns are missing
            or features and labels disagree in length.
        """
        df_test = pd.read_csv(self.config['dataset']['test_data_path'])
        test_features = df_test[df_test.columns[:-1]]
        test_labels = df_test['prognosis']

        # Explicit validation instead of assert: asserts vanish under -O.
        if test_features.shape[1] != 132:
            raise ValueError(
                "Expected 132 test feature columns, got {0}".format(test_features.shape[1]))
        if len(test_labels) != test_features.shape[0]:
            raise ValueError("Test features/labels length mismatch")

        if self.verbose:
            print("Length of Test Data: ", df_test.shape)
            print("Test Features: ", test_features.shape)
            print("Test Labels: ", test_labels.shape)
        return test_features, test_labels, df_test

    def _feature_correlation(self, data_frame=None, show_fig=False):
        """Save (and optionally display) a heatmap of pairwise feature
        correlations to 'feature_correlation.png'.

        :param data_frame: DataFrame whose correlations are plotted.
        :param show_fig: if True, also display the figure interactively.
        """
        corr = data_frame.corr()
        sn.heatmap(corr, square=True, annot=False, cmap="YlGnBu")
        plt.title("Feature Correlation")
        plt.tight_layout()
        # Save BEFORE show(): plt.show() clears the current figure, so the
        # original order wrote a blank image whenever show_fig was True.
        plt.savefig('feature_correlation.png')
        if show_fig:
            plt.show()

    def _train_val_split(self):
        """Split the training data into train and validation subsets.

        Split size and seed come from the config for reproducibility.

        :return: (X_train, y_train, X_val, y_val)
        """
        X_train, X_val, y_train, y_val = train_test_split(
            self.train_features, self.train_labels,
            test_size=self.config['dataset']['validation_size'],
            random_state=self.config['random_state'])

        if self.verbose:
            print("Number of Training Features: {0}\tNumber of Training Labels: {1}".format(len(X_train), len(y_train)))
            print("Number of Validation Features: {0}\tNumber of Validation Labels: {1}".format(len(X_val), len(y_val)))
        return X_train, y_train, X_val, y_val

    def select_model(self):
        """Instantiate the classifier selected by self.model_name.

        :return: an unfitted classifier, also stored on self.clf.
        :raises ValueError: for an unknown model name (the original code
            fell through and raised an opaque AttributeError instead).
        """
        if self.model_name == 'mnb':
            self.clf = MultinomialNB()
        elif self.model_name == 'decision_tree':
            # Concrete-ML (FHE-capable) drop-in for sklearn's tree.
            self.clf = ConcreteDecisionTreeClassifier(
                criterion=self.config['model']['decision_tree']['criterion'])
        elif self.model_name == 'gradient_boost':
            self.clf = ConcreteXGBClassifier(
                n_estimators=self.config['model']['gradient_boost']['n_estimators'],
                criterion=self.config['model']['gradient_boost']['criterion'])
        else:
            raise ValueError("Unknown model name: {0!r}".format(self.model_name))
        return self.clf

    def train_model(self):
        """Fit the selected classifier, report validation metrics, and
        persist the fitted model to <model_save_path><model_name>.joblib.
        """
        X_train, y_train, X_val, y_val = self._train_val_split()
        classifier = self.select_model()
        classifier = classifier.fit(X_train, y_train)

        # Validation-set metrics.
        confidence = classifier.score(X_val, y_val)
        y_pred = classifier.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        conf_mat = confusion_matrix(y_val, y_pred)
        clf_report = classification_report(y_val, y_pred)
        # NOTE(review): cross-validating on the validation split only is
        # unusual — typically done on the full training data; kept as-is.
        score = cross_val_score(classifier, X_val, y_val, cv=3)

        if self.verbose:
            print('\nTraining Accuracy: ', confidence)
            print('\nValidation Prediction: ', y_pred)
            print('\nValidation Accuracy: ', accuracy)
            print('\nValidation Confusion Matrix: \n', conf_mat)
            print('\nCross Validation Score: \n', score)
            print('\nClassification Report: \n', clf_report)

        dump(classifier, str(self.model_save_path + self.model_name + ".joblib"))

    def make_prediction(self, saved_model_name=None, test_data=None):
        """Load a previously saved model and run predictions.

        :param saved_model_name: basename of the .joblib file to load.
        :param test_data: optional feature matrix; if None, the held-out
            test set loaded at construction is used.
        :return: predictions for test_data, OR (accuracy, report) on the
            internal test set when test_data is None.
        :raises RuntimeError: if the saved model cannot be loaded.
        """
        try:
            clf = load(str(self.model_save_path + saved_model_name + ".joblib"))
        except Exception as e:
            # Fail fast: the original only printed, then crashed with an
            # UnboundLocalError on the missing clf below.
            print("Model not found...")
            raise RuntimeError(
                "Could not load model {0!r}".format(saved_model_name)) from e

        if test_data is not None:
            return clf.predict(test_data)

        result = clf.predict(self.test_features)
        accuracy = accuracy_score(self.test_labels, result)
        clf_report = classification_report(self.test_labels, result)
        return accuracy, clf_report
|
|
|
|
if __name__ == "__main__":
    # Train the chosen model, then evaluate it on the held-out test set.
    current_model_name = 'decision_tree'
    dp = DiseasePrediction(model_name=current_model_name)
    dp.train_model()
    # Renamed from 'classification_report': the original shadowed the
    # sklearn.metrics.classification_report function imported above.
    test_accuracy, test_clf_report = dp.make_prediction(saved_model_name=current_model_name)
    print("Model Test Accuracy: ", test_accuracy)
    print("Test Data Classification Report: \n", test_clf_report)