| from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer |
| from datasets import load_dataset |
|
|
# Base multilingual MiniLM encoder that we fine-tune for spam detection.
model_name = "microsoft/Multilingual-MiniLM-L12-H384"

# Spam e-mail corpus from the HF hub; labels appear to be the strings
# "SPAM"/"NOSPAM" (see encode_labels below) — verify against the dataset card.
dataset = load_dataset("Goodmotion/spam-mail")

# Tokenizer matching the encoder checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
| |
def encode_labels(data):
    """Convert one example's string label ("SPAM"/"NOSPAM") to its class id (1/0).

    Mutates and returns the example dict, as expected by `datasets.map`.
    Raises KeyError for any label outside the two known values.
    """
    data["label"] = {"SPAM": 1, "NOSPAM": 0}[data["label"]]
    return data
|
|
def tokenize_data(data):
    """Tokenize a batch of examples for the encoder.

    Pads/truncates every sequence to exactly 128 tokens so all batches have
    a uniform shape. Uses the module-level `tokenizer`.
    """
    return tokenizer(data["text"], padding="max_length", truncation=True, max_length=128)
|
|
| |
# Integer-encode the labels FIRST (the model needs class ids, not the raw
# "SPAM"/"NOSPAM" strings — without this the Trainer cannot collate labels),
# then tokenize the text in batches.
tokenized_dataset = dataset.map(encode_labels).map(tokenize_data, batched=True)
|
|
| |
# Load the pretrained encoder with a fresh 2-class classification head.
# Reuse `model_name` instead of repeating the checkpoint literal, so the
# model and tokenizer can never drift apart.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
# Explicitly (re-)initialize the new head: normal(0, 0.02) weights, zero bias.
model.classifier.weight.data.normal_(mean=0.0, std=0.02)
model.classifier.bias.data.zero_()
|
|
|
|
# Fine-tuning hyper-parameters.
training_args = TrainingArguments(
    output_dir="./results",          # where checkpoints are written
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",            # TensorBoard log directory
)
|
|
# NOTE(review): no eval_dataset is supplied, so training runs without any
# validation metrics — consider holding out a split if one exists.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

# Run the fine-tuning loop.
trainer.train()
|
|
| |
# Persist the fine-tuned model together with its tokenizer so the pair can
# be reloaded with from_pretrained("./spam-classifier").
save_dir = "./spam-classifier"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
|
|