| from sklearn.compose import ColumnTransformer |
| from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder |
| from sklearn.model_selection import train_test_split |
| from sklearn.linear_model import LinearRegression, LogisticRegression |
| from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier |
| from xgboost import XGBRegressor, XGBClassifier |
| from sklearn.svm import SVR, SVC |
| from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier |
| from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier |
| from sklearn.linear_model import ElasticNet, BayesianRidge |
| from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, GradientBoostingClassifier, AdaBoostClassifier |
| from sklearn.naive_bayes import GaussianNB |
| from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis |
| from sklearn.linear_model import Ridge, Lasso |
| from sklearn.impute import SimpleImputer |
| from sklearn.preprocessing import StandardScaler, OneHotEncoder |
| from sklearn.compose import ColumnTransformer |
| from sklearn.pipeline import Pipeline as SkPipeline |
|
|
| import streamlit as st |
|
|
|
|
|
|
|
|
def get_model(task_type, model_name, hyperparams):
    """Instantiate the estimator chosen by the user.

    Args:
        task_type: Either "regression" or "classification".
        model_name: Display name of the estimator to construct.
        hyperparams: Keyword arguments forwarded to the estimator constructor.

    Returns:
        An unfitted scikit-learn / XGBoost estimator instance.

    Raises:
        ValueError: If the task type / model name pair is not in the registry.
    """
    regression_models = {
        "Linear Regression": LinearRegression,
        "Random Forest Regressor": RandomForestRegressor,
        "XGBoost Regressor": XGBRegressor,
        "Support Vector Regressor": SVR,
        "Decision Tree Regressor": DecisionTreeRegressor,
        "K-Nearest Neighbors Regressor": KNeighborsRegressor,
        "ElasticNet": ElasticNet,
        "Gradient Boosting Regressor": GradientBoostingRegressor,
        "AdaBoost Regressor": AdaBoostRegressor,
        "Bayesian Ridge": BayesianRidge,
        "Ridge Regression": Ridge,
        "Lasso Regression": Lasso,
    }
    classification_models = {
        "Logistic Regression": LogisticRegression,
        "Random Forest": RandomForestClassifier,
        "XGBoost": XGBClassifier,
        "Support Vector Classifier": SVC,
        "Decision Tree Classifier": DecisionTreeClassifier,
        "K-Nearest Neighbors Classifier": KNeighborsClassifier,
        "Gradient Boosting Classifier": GradientBoostingClassifier,
        "AdaBoost Classifier": AdaBoostClassifier,
        "Gaussian Naive Bayes": GaussianNB,
        "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis,
        "Linear Discriminant Analysis": LinearDiscriminantAnalysis,
    }
    registry = {
        "regression": regression_models,
        "classification": classification_models,
    }

    # Guard clause: reject unknown task/model combinations up front.
    model_cls = registry.get(task_type, {}).get(model_name)
    if model_cls is None:
        raise ValueError(f"Invalid model selection: {model_name} for {task_type}")

    return model_cls(**hyperparams)
| |
| |
def train_model(df, target_column, task_type, selected_model_name, hyperparams):
    """Preprocess data, train the selected model with hyperparameters, and return it.

    Args:
        df: DataFrame containing the features plus the target column.
        target_column: Name of the column in ``df`` to predict.
        task_type: Either "regression" or "classification".
        selected_model_name: Display name of the estimator (see ``get_model``).
        hyperparams: Keyword arguments forwarded to the estimator constructor.

    Returns:
        Classification: ``(fitted_pipeline, label_encoder)`` where
        ``label_encoder`` is ``None`` when the target was already numeric.
        Regression: the fitted pipeline only.
        (Asymmetric contract preserved for existing callers.)

    Side effects:
        Stores the held-out test split, task type, and label encoder in
        ``st.session_state`` and clears any stale test-result entries.
    """
    with st.spinner(" Training model... Please wait!"):
        model = get_model(task_type, selected_model_name, hyperparams)

        X = df.drop(columns=[target_column])
        y = df[target_column]

        # Encode a non-numeric classification target. Handle both object and
        # pandas categorical dtypes (previously only object was covered, so a
        # pre-categorized target would have reached the model unencoded).
        # LabelEncoder is already imported at the top of the file.
        label_encoder = None
        if task_type == "classification" and (
            y.dtype == "object" or y.dtype.name == "category"
        ):
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Select columns by dtype *kind* rather than hard-coded dtype names:
        # the previous ["int64", "float64"] / ["object", "category"] lists let
        # int32/float32 and bool columns fall through to ColumnTransformer's
        # default remainder="drop" and be silently discarded.
        num_cols = X.select_dtypes(include="number").columns
        cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns

        # Numeric features: median imputation (robust to outliers) + scaling.
        num_pipeline = SkPipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ])

        # Categorical features: mode imputation + one-hot encoding; categories
        # unseen at fit time are ignored at predict time instead of raising.
        cat_pipeline = SkPipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ])

        preprocessor = ColumnTransformer([
            ("num", num_pipeline, num_cols),
            ("cat", cat_pipeline, cat_cols)
        ])

        pipeline = SkPipeline([
            ("preprocessor", preprocessor),
            ("model", model)
        ])

        pipeline.fit(X_train, y_train)

        # Persist the evaluation split so other pages / reruns can score the model.
        st.session_state.X_test = X_test
        st.session_state.y_test = y_test
        st.session_state.task_type = task_type
        st.session_state.label_encoder = label_encoder

        # Invalidate any previously computed test results for the old model.
        if "test_results_calculated" in st.session_state:
            st.session_state.test_results_calculated = False
        for key in ['test_metrics', 'test_y_pred', 'test_y_test', 'test_cm', 'sampling_message']:
            if key in st.session_state:
                del st.session_state[key]

        if task_type == "classification":
            return pipeline, label_encoder
        else:
            return pipeline