import argparse

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from training.utils import compute_metrics_sentiment
|
|
# Command-line interface: data locations, CSV column names, and the
# hyperparameters forwarded to TrainingArguments further down.
parser = argparse.ArgumentParser(
    description="Fine-tune a sequence-classification model on labelled CSV data."
)
parser.add_argument(
    "--model_name",
    default="distilbert-base-uncased",
    help="Checkpoint name or path handed to from_pretrained().",
)
parser.add_argument("--train_csv", required=True, help="CSV file with training rows.")
parser.add_argument("--eval_csv", required=True, help="CSV file with evaluation rows.")
parser.add_argument(
    "--text_col", default="text", help="CSV column holding the input text."
)
parser.add_argument(
    "--label_col", default="label", help="CSV column holding the class label."
)
parser.add_argument(
    "--output_dir",
    default="./outputs/sentiment",
    help="Directory for checkpoints and the final model and tokenizer.",
)
parser.add_argument("--epochs", type=int, default=3, help="Number of training epochs.")
parser.add_argument(
    "--batch_size",
    type=int,
    default=16,
    help="Per-device batch size used for both training and evaluation.",
)
parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate.")
args = parser.parse_args()
|
|
# Load both splits; each CSV must contain the columns named by
# args.text_col and args.label_col.
train_df = pd.read_csv(args.train_csv)
eval_df = pd.read_csv(args.eval_csv)

# Derive the label vocabulary from the training split only. Sorting makes the
# label -> id assignment deterministic for a given set of training labels.
label_names = sorted(train_df[args.label_col].unique().tolist())
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = {idx: name for name, idx in label2id.items()}

# Fail early with a readable message instead of a bare KeyError raised later
# from inside Dataset.map() when an evaluation label has no id.
unseen_labels = set(eval_df[args.label_col].unique().tolist()) - set(label_names)
if unseen_labels:
    raise ValueError(
        f"Labels in {args.eval_csv} that never occur in {args.train_csv}: "
        f"{sorted(unseen_labels, key=str)}"
    )
|
|
def encode(df):
    """Tokenize one batch of rows and attach integer class ids.

    Relies on the module-level names ``tokenizer``, ``args`` and ``label2id``,
    which are resolved when the function is called.

    Args:
        df: Column-name -> values mapping for one batch. ``Dataset.map`` with
            ``batched=True`` supplies a dict-like object whose columns are
            plain Python lists; a pandas DataFrame is accepted as well.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        integer id per row.

    Raises:
        KeyError: If a label value is missing from ``label2id``.
    """
    # list() works for both plain lists and pandas Series; calling .tolist()
    # here would fail on the list columns that batched map() passes in.
    texts = list(df[args.text_col])
    raw_labels = list(df[args.label_col])

    tok = tokenizer(texts, truncation=True, padding=True)
    tok["labels"] = [label2id[label] for label in raw_labels]
    return tok
|
|
# encode() looks `tokenizer` up as a module-level name, so it has to exist
# before the map() calls below run.
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

raw_train_ds = Dataset.from_pandas(train_df)
raw_eval_ds = Dataset.from_pandas(eval_df)

# Drop every source column so only the tokenizer outputs and "labels" remain.
# The Dataset's own column_names (a list of str, the type map() documents) is
# used instead of the pandas Index; it also covers any index column that
# from_pandas() may have added.
train_ds = raw_train_ds.map(
    encode, batched=True, remove_columns=raw_train_ds.column_names
)
eval_ds = raw_eval_ds.map(
    encode, batched=True, remove_columns=raw_eval_ds.column_names
)
|
|
# Classification head sized to the number of distinct training labels; the
# id <-> label maps are passed along so they travel with the model.
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)
|
|
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` -- confirm against the version this project pins.
training_args = TrainingArguments(
    output_dir=args.output_dir,
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy; the default save strategy is step-based, so it is
    # set explicitly here.
    save_strategy="epoch",
    learning_rate=args.lr,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    num_train_epochs=args.epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Presumably compute_metrics_sentiment returns an "accuracy" entry --
    # verify against training.utils.
    metric_for_best_model="accuracy",
)
|
|
# Wire model, data, tokenizer and metric function into the training loop.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_sentiment,
)

# Train, then write the resulting model and the tokenizer to the same
# directory so both can be loaded from args.output_dir.
trainer.train()
trainer.save_model(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
|
|