| """ |
| literal2idiomatic ver: d-1-2 |
| """ |
| import os |
| from idiomify.paths import ROOT_DIR |
| from idiomify.fetchers import fetch_pie, fetch_config |
| from idiomify.preprocess import upsample, cleanse, stratified_split, annotate |
| import wandb |
|
|
|
|
def main():
    """Build the literal2idiomatic train/test splits and log them to W&B.

    Steps, in order:

    1. Fetch the PIE data and the ``literal2idiomatic`` section of the config.
    2. Pipe the data through ``cleanse`` -> ``upsample`` -> ``annotate`` ->
       ``stratified_split``, which yields a ``(train, test)`` pair.
    3. Keep only the ``Idiom``, ``Sense``, ``Literal_Sent`` and
       ``Idiomatic_Sent`` columns of each split.
    4. Write each split as a tab-separated file under ``ROOT_DIR`` and add
       both files to a single ``literal2idiomatic`` dataset artifact.
    5. Log the artifact under the aliases ``latest`` and ``config['ver']``.

    The TSV files are only staging copies for the upload. They are removed
    before the W&B run context exits, even if writing or logging raises.
    """
    # NOTE(review): presumably fetch_pie() returns a pandas DataFrame (it is
    # used via .pipe, column selection and .to_csv) -- confirm in fetchers.
    columns = ["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]

    pie_df = fetch_pie()
    config = fetch_config()['literal2idiomatic']
    train_df, test_df = (
        pie_df.pipe(cleanse)
        .pipe(upsample, seed=config['seed'])
        .pipe(annotate, boi_token=config['boi_token'], eoi_token=config['eoi_token'])
        .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])
    )
    dfs = (train_df[columns], test_df[columns])
    # Staging locations for the two splits; paired positionally with `dfs`.
    paths = (ROOT_DIR / "train.tsv", ROOT_DIR / "test.tsv")

    with wandb.init(entity="eubinecto", project="idiomify") as run:
        artifact = wandb.Artifact(
            name="literal2idiomatic",
            type="dataset",
            description=config['description'],
            metadata=config,
        )
        try:
            for tsv_path, df in zip(paths, dfs):
                df.to_csv(tsv_path, sep="\t")
                artifact.add_file(tsv_path)
            # Upload the training and test sets together as one artifact.
            run.log_artifact(artifact, aliases=["latest", config['ver']])
        finally:
            # Always clean up the staging files so a failed run does not
            # leave stray TSVs in the project root.
            for tsv_path in paths:
                try:
                    os.remove(tsv_path)
                except FileNotFoundError:
                    # The failure happened before this split was written.
                    pass
|
|
|
|
# Script entry point: build and upload the dataset only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()
|
|