import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


# Single source of truth for the labelled CSV so the two loaders below cannot
# drift apart. The rest of the script reads its "text" and "label" columns.
DATA_FILE = "data/vibes.csv"

# pandas view of the file.
# NOTE(review): `df` is not referenced again in the visible script; confirm
# nothing else relies on it before removing this second read of the file.
df = pd.read_csv(DATA_FILE)

# Hugging Face view of the same file; the rows are accessed under the "train"
# split further down.
dataset = load_dataset("csv", data_files=DATA_FILE)


# Class names in id order: a name's position in this list is its integer id.
labels = ["negative", "neutral", "positive"]

# Lookup tables in both directions (name -> id, id -> name); both are handed to
# the model config so predictions can be reported by name.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for name, idx in label2id.items()}


def encode_labels(example):
    """Replace the class name stored in ``example["label"]`` with its integer id.

    Args:
        example: One dataset row. Only its ``"label"`` entry is read and
            overwritten; every other entry is left untouched.

    Returns:
        The same ``example`` object, updated in place.

    Raises:
        KeyError: If the row has no ``"label"`` entry, or its value is not a
            key of ``label2id``.
    """
    example["label"] = label2id[example["label"]]
    return example


# Convert every row's string label to its integer id (one row per call).
dataset = dataset.map(encode_labels)


# Tokenizer belonging to the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch of rows.

    Sequences are truncated to the tokenizer's maximum length but deliberately
    left unpadded: this script passes the tokenizer to ``Trainer``, whose
    default collator then pads each training/eval batch only to the longest
    sequence in that batch. Padding here instead would pad every row of a map
    batch to that batch's longest text and store the padding in the dataset.

    Args:
        batch: A batch of rows as passed by ``Dataset.map(..., batched=True)``;
            ``batch["text"]`` holds the input strings.

    Returns:
        The tokenizer's output for the batch; ``map`` adds its entries to the
        dataset as new columns.
    """
    return tokenizer(batch["text"], truncation=True)


# Tokenize in batches, which lets the tokenizer process many texts per call.
dataset = dataset.map(tokenize, batched=True)

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# reproducible, so evaluation numbers from different runs are comparable.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)


# Pretrained encoder with a classification head sized for our label set.
# num_labels is derived from `labels` so it cannot disagree with the
# id2label/label2id mappings if a class is added or removed.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)


# transformers 4.41 renamed `evaluation_strategy` to `eval_strategy`, and recent
# releases no longer accept the old keyword. Use whichever name the installed
# version declares so the script runs on both sides of the rename.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; log every 10 optimizer steps.
training_args = TrainingArguments(
    output_dir="./model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    **{_eval_strategy_key: "epoch"},
)


# Wire the model, hyper-parameters and both splits together. Passing the
# tokenizer lets Trainer pad each batch with its default padding collator.
# NOTE(review): newer transformers releases prefer `processing_class=` over
# `tokenizer=`; confirm against the pinned version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
)


# Fine-tune, then write the final model and its tokenizer to the same directory
# so it can be reloaded from "./model" with `from_pretrained`.
trainer.train()
trainer.save_model("./model")
tokenizer.save_pretrained("./model")