| import gradio as gr |
| import pandas as pd |
| from huggingface_hub.hf_api import create_repo, upload_folder, upload_file, HfApi |
| from huggingface_hub.repository import Repository |
| import subprocess |
| import os |
| import tempfile |
| from uuid import uuid4 |
| import pickle |
| import sweetviz as sv |
| import dabl |
| import re |
|
|
|
|
def analyze_datasets(dataset, token, column=None, pairwise="off"):
    """Build a Sweetviz report for a CSV file and publish it as a static Space.

    Args:
        dataset: Uploaded file object; only its ``name`` attribute (the path
            of the CSV on disk) is read.
        token: Hugging Face Hub token used to look up the user name and to
            create and populate the Space.
        column: Optional target column to compare the other features
            against. ``None`` or an empty string means "no target".
        pairwise: Value forwarded to Sweetviz's ``pairwise_analysis``
            argument. A falsy value falls back to ``"off"``.

    Returns:
        A message containing the value returned by ``create_repo``.
    """
    df = pd.read_csv(dataset.name)
    username = HfApi().whoami(token=token)["name"]

    # The UI text field presumably submits "" (not None) when left blank, and
    # an unselected radio presumably submits None -- TODO confirm. Treat both
    # as "not provided" instead of forwarding them to Sweetviz.
    analyze_kwargs = {"pairwise_analysis": pairwise or "off"}
    if column:
        analyze_kwargs["target_feat"] = column
    analyze_report = sv.analyze(df, **analyze_kwargs)

    # Remove the ".csv" *suffix*. str.strip(".csv") would instead strip any
    # of the characters '.', 'c', 's', 'v' from both ends ("cars.csv" -> "ar").
    dataset_name = os.path.basename(dataset.name)
    if dataset_name.lower().endswith(".csv"):
        dataset_name = dataset_name[: -len(".csv")]
    repo_id = f"{username}/{dataset_name}-report"

    # Work in a private temporary directory so that concurrent requests do
    # not overwrite each other's index.html / README.md in the shared CWD.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        analyze_report.show_html(report_path, open_browser=False)

        repo_url = create_repo(
            repo_id,
            repo_type="space",
            token=token,
            space_sdk="static",
            private=False,
        )
        upload_file(
            path_or_fileobj=report_path,
            path_in_repo="index.html",
            repo_id=repo_id,
            repo_type="space",
            token=token,
        )

        # The emoji is written as an escape (U+2728, sparkles): the literal in
        # the source had been mis-decoded into "β¨", which is not an emoji.
        readme = (
            f"---\ntitle: {dataset_name}\nemoji: \u2728\ncolorFrom: green\n"
            "colorTo: red\nsdk: static\npinned: false\ntags:\n"
            "- dataset-report\n---"
        )
        # Explicit UTF-8: the front matter contains a non-ASCII character.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)
        upload_file(
            path_or_fileobj=readme_path,
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="space",
            token=token,
        )

    return f"Your dataset report will be ready at {repo_url}"
|
|
|
|
| from sklearn.utils import estimator_html_repr |
|
|
|
|
def extract_estimator_config(model):
    """Render an estimator's hyperparameters as a two-column Markdown table.

    Args:
        model: Any object exposing ``get_params(deep=True)`` that returns a
            mapping of hyperparameter name to value.

    Returns:
        The table as a string: a header row, an alignment row, then one
        ``| name | value |`` row per hyperparameter, each newline-terminated.
    """
    header = "| Hyperparameters | Value |\n| :-- | :-- |\n"
    params = model.get_params(deep=True)
    rows = [f"| {name} | {setting} |\n" for name, setting in params.items()]
    return header + "".join(rows)
|
|
def detect_training(df, column):
    """Pick a dabl baseline estimator based on the detected type of ``column``.

    Args:
        df: DataFrame containing the target column.
        column: Name of the target column.

    Returns:
        A ``(trainer, task)`` tuple: ``(dabl.SimpleRegressor(), "regression")``
        when the column is detected as ``continuous`` or ``dirty_float``, or
        ``(dabl.SimpleClassifier(), "classification")`` when it is detected
        as ``categorical``, ``low_card_int`` or ``free_string``.

    Raises:
        KeyError: If ``column`` is not among the columns dabl reported on.
        ValueError: If the column's detected type matches none of the
            handled kinds, so no baseline can be chosen.
    """
    # Run type detection once; the result is indexed by column name and has
    # one boolean column per detected kind.
    types = dabl.detect_types(df)
    if column not in types.index:
        raise KeyError(f"Target column {column!r} was not found in the dataset")
    detected = types.loc[column]

    if detected["continuous"] or detected["dirty_float"]:
        return dabl.SimpleRegressor(), "regression"
    if (
        detected["categorical"]
        or detected["low_card_int"]
        or detected["free_string"]
    ):
        return dabl.SimpleClassifier(), "classification"

    # Previously this path fell through to `return trainer, task` with both
    # names unbound, surfacing as an UnboundLocalError.
    raise ValueError(
        f"Cannot choose a baseline for column {column!r}: its detected type "
        "is neither continuous nor categorical"
    )
|
|
def edit_types(df):
    """Clean ``df`` with dabl, overriding two of its detected column kinds.

    Columns detected as ``low_card_int`` are hinted as ``"categorical"`` and
    columns detected as ``dirty_float`` are hinted as ``"continuous"`` before
    the frame is passed to ``dabl.clean``.

    Args:
        df: Raw DataFrame to clean.

    Returns:
        The result of ``dabl.clean(df, type_hints=...)``.
    """
    detected = dabl.detect_types(df)
    type_hints = {}
    # Order matters: a column flagged by both ends up "continuous", because
    # the dirty_float pass runs last.
    for flag, hint in (("low_card_int", "categorical"), ("dirty_float", "continuous")):
        flagged_columns = detected[detected[flag] == True].index.tolist()
        type_hints.update(dict.fromkeys(flagged_columns, hint))
    return dabl.clean(df, type_hints=type_hints)
|
|
def train_baseline(dataset, token, column):
    """Train a dabl baseline on a CSV file and push it to the Hub as a model.

    The uploaded folder contains ``logs.txt`` (stdout captured while fitting),
    ``README.md`` (the model card) and ``clf.pkl`` (the pickled fitted
    estimator).

    Args:
        dataset: Uploaded file object; only its ``name`` attribute (the path
            of the CSV on disk) is read.
        token: Hugging Face Hub token used to look up the user name and to
            create and populate the model repository.
        column: Name of the target column.

    Returns:
        A message containing the value returned by ``create_repo``.

    Raises:
        KeyError: If ``column`` is not a column of the cleaned dataset.
    """
    from contextlib import redirect_stdout

    df = pd.read_csv(dataset.name)

    # Remove the ".csv" *suffix*. str.strip(".csv") would instead strip any
    # of the characters '.', 'c', 's', 'v' from both ends ("cars.csv" -> "ar").
    dataset_name = os.path.basename(dataset.name)
    if dataset_name.lower().endswith(".csv"):
        dataset_name = dataset_name[: -len(".csv")]

    df_clean = edit_types(df)
    if column not in df_clean.columns:
        raise KeyError(f"Target column {column!r} was not found in the dataset")
    fc, task = detect_training(df_clean, column)
    X = df_clean.drop(column, axis=1)
    y = df_clean[column]

    # Resolve the user before training so an invalid token fails immediately
    # rather than after the (potentially long) fit.
    username = HfApi().whoami(token=token)["name"]
    repo_id = f"{username}/{dataset_name}-{column}-{task}"

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Capture whatever the fit prints to stdout into logs.txt.
        with open(os.path.join(tmpdirname, "logs.txt"), "w") as f:
            with redirect_stdout(f):
                print('Logging training')
                fc.fit(X, y)

        repo_url = create_repo(repo_id=repo_id, token=token)

        if task == "regression":
            task_metadata = "tabular-regression"
        else:
            task_metadata = "tabular-classification"

        # Model card: YAML front matter, metrics, estimator diagram, notes.
        sections = [
            f"---\nlicense: apache-2.0\nlibrary_name: sklearn\ntags:\n- {task_metadata}\n- baseline-trainer\n---\n\n",
            f"## Baseline Model trained on {dataset_name} to apply {task} on {column}\n\n",
            "**Metrics of the best model:**\n\n",
        ]
        # Each line of the metrics text becomes its own paragraph.
        sections.extend(
            f"{elem}\n\n" for elem in str(fc.current_best_).split("\n")
        )
        sections.append("\n\n**See model plot below:**\n\n")
        # Collapse newline + indentation runs in the estimator HTML.
        sections.append(re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_))))
        sections.append(
            "\n\n**Disclaimer:** This model is trained with dabl library as a baseline, for better results, use [AutoTrain](https://huggingface.co/autotrain).\n\n"
        )
        sections.append(
            "**Logs of training** including the models tried in the process can be found in logs.txt"
        )
        readme = "".join(sections)

        # Explicit UTF-8 so non-ASCII text in the card does not depend on the
        # platform's default encoding.
        with open(os.path.join(tmpdirname, "README.md"), "w", encoding="utf-8") as f:
            f.write(readme)
        with open(os.path.join(tmpdirname, "clf.pkl"), mode="bw") as f:
            pickle.dump(fc, file=f)

        upload_folder(
            repo_id=repo_id,
            folder_path=tmpdirname,
            repo_type="model",
            token=token,
            path_in_repo="./",
        )

    return f"Your model will be ready at {repo_url}"
|
|
|
|
|
|
# Gradio UI: one tab per entry point (train_baseline and analyze_datasets).
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost. Confirm in particular that each `outcome` box and
# `.click(...)` call sits in the Row (beside the Column), not in the Column.
# NOTE(review): the emoji inside the Markdown strings look mis-decoded --
# verify the file's encoding.
# NOTE(review): `gr.StatusTracker` and `gr.outputs.Textbox` look like legacy
# Gradio APIs -- confirm against the pinned gradio version.
with gr.Blocks() as demo:
    main_title = gr.Markdown("""# Baseline Trainer πͺπβ¨""")
    main_desc = gr.Markdown("""This app trains a baseline model for a given dataset and pushes it to your Hugging Face Hub Profile with a model card. For better results, use [AutoTrain](https://huggingface.co/autotrain).""")

    with gr.Tabs():
        # Tab 1: train a baseline model on an uploaded CSV.
        with gr.TabItem("Baseline Trainer") as baseline_trainer:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Train a supervised baseline model πͺ""")
                    description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
                    dataset = gr.File(label = "CSV Dataset")
                    column = gr.Text(label = "Enter target variable:")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token. You can find your token [here](https://huggingface.co/settings/tokens)")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    inference_run = gr.Button("Train")
                    inference_progress = gr.StatusTracker(cover_container=True)

                outcome = gr.outputs.Textbox(label = "Progress")
                # Input order matches train_baseline(dataset, token, column).
                inference_run.click(
                    train_baseline,
                    inputs=[dataset, token, column],
                    outputs=outcome,
                    status_tracker=inference_progress,
                )
        # Tab 2: build and publish a dataset report. The names `title`,
        # `dataset`, `column`, `token`, ... are rebound here to this tab's
        # own components.
        with gr.TabItem("Analyze") as analyze:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Analyze Dataset πͺ""")
                    description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
                    dataset = gr.File(label = "CSV Dataset")
                    column = gr.Text(label = "Compare dataset against a target variable (Optional)")
                    pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token. You can find your token [here](https://huggingface.co/settings/tokens)")
                    inference_run = gr.Button("Infer")
                    inference_progress = gr.StatusTracker(cover_container=True)
                outcome = gr.outputs.Textbox()
                # Input order matches
                # analyze_datasets(dataset, token, column, pairwise).
                inference_run.click(
                    analyze_datasets,
                    inputs=[dataset, token, column, pairwise],
                    outputs=outcome,
                    status_tracker=inference_progress,
                )


# Start the app at import time (no `__main__` guard in this file).
demo.launch(debug=True)