"""Build a three-step pipeline and run it over ``./data/Lisette.csv``.

The script assembles the steps ``dedup``, ``featurize`` and
``find_label_issues`` with ``make_step``, loads the CSV into a pandas
DataFrame, hands both to ``run_pipeline`` and prints whatever it returns.

NOTE(review): the exact contract of ``make_step`` / ``run_pipeline`` lives in
the project-local ``pipeline`` module and is not visible from this file.
"""

from deduplication import find_near_duplicates
from featurizer import custom_featurizer
from issues import find_issues
from pipeline import make_step, run_pipeline
import pandas as pd
from tqdm.auto import tqdm


# A single progress bar is handed to every step. Managing it with ``with``
# guarantees it is closed even if building the steps, reading the CSV, or
# running the pipeline raises; previously the bar was only closed on the
# success path.
# NOTE(review): total=100 presumably means the steps report percentages
# against one shared bar -- confirm against the step implementations.
with tqdm(total=100, leave=True) as bar:
    steps = [
        make_step(find_near_duplicates, name="dedup")(progress=bar),
        make_step(custom_featurizer, name="featurize")(
            label=None,
            nan_strategy="impute",
            on_pipeline_error="drop",
            progress=bar,
        ),
        make_step(find_issues, name="find_label_issues")(
            label="HARDSHIP_INDEX",
            progress=bar,
        ),
    ]

    df = pd.read_csv("./data/Lisette.csv")
    results = run_pipeline(steps, df=df)

# Printed only after the bar has been closed, matching the original ordering
# (close, then print).
print(results)
|
|
|
|
|
|