from datasets import Dataset, DatasetDict
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments
)
import pandas as pd


# Read the CSV with pandas so the string risk labels can be mapped to
# integer class ids before the Hugging Face dataset is built.
df = pd.read_csv("data.csv")

label_map = {"Low Risk": 0, "Medium Risk": 1, "High Risk": 2}
df["label"] = df["label"].map(label_map)

# Build the training split directly from the mapped DataFrame with
# Dataset.from_pandas; re-reading the raw CSV here would silently discard
# the integer labels.
dataset = DatasetDict({"train": Dataset.from_pandas(df)})


tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    # Pad each mapped batch to its longest sequence and truncate anything
    # beyond the model's maximum input length.
    return tokenizer(batch["text"], padding=True, truncation=True)


dataset = dataset.map(tokenize, batched=True)


model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3
)


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # evaluation disabled (no eval split); newer transformers releases spell this eval_strategy
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    # Re-pads each training batch to a common length; examples tokenized in
    # different map() batches can otherwise fail to collate into one tensor.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)


trainer.train()
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")
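

# Optional sanity check -- a minimal sketch, not part of the training run:
# reload the saved artifacts and classify one example. The sample text is a
# placeholder assumption, and the PyTorch backend is assumed; the id-to-label
# mapping simply inverts label_map defined above.
import torch

loaded_tokenizer = DistilBertTokenizerFast.from_pretrained("./model")
loaded_model = DistilBertForSequenceClassification.from_pretrained("./model")
loaded_model.eval()

id_to_label = {v: k for k, v in label_map.items()}
sample = "Example text to classify"  # hypothetical input for illustration
inputs = loaded_tokenizer(sample, return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = loaded_model(**inputs).logits
print(id_to_label[logits.argmax(dim=-1).item()])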