| |
|
|
| import gradio as gr |
| from transformers import pipeline |
| from transformers import AutoTokenizer |
| from datasets import load_dataset |
| from transformers import DataCollatorWithPadding |
|
|
# Load the "sst2" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "sst2")

# The same checkpoint name is used for the tokenizer here and for the
# model further down, so the vocabulary matches the pretrained weights.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    """Tokenize the ``"sentence"`` field of ``example``.

    Args:
        example: A mapping with a ``"sentence"`` entry. It is applied with
            ``batched=True`` in this script, so the entry is presumably a
            list of strings rather than a single string.

    Returns:
        The result of calling the module-level ``tokenizer`` on the
        sentence(s) with ``truncation=True``.
    """
    sentences = example["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded
|
|
# Tokenize every split in batches and drop the 'idx' and 'sentence'
# columns, which are not needed once the text has been encoded.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence"],
)

# Tokenization above only truncates, so padding is left to the collator
# when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
|
| from transformers import TrainingArguments |
| from transformers import AutoModelForSequenceClassification |
| from datasets import load_metric |
| from transformers import Trainer |
| import numpy as np |
|
|
# Training configuration: outputs go to "test-trainer" and the
# evaluation strategy is set to "epoch".
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
)

# Sequence-classification model built from the same checkpoint as the
# tokenizer, configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
|
|
def compute_metrics(eval_preds):
    """Score a set of evaluation predictions with the GLUE "sst2" metric.

    Args:
        eval_preds: A two-element ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns when the arg-max class of each
        logits row is compared against ``labels``.
    """
    glue_metric = load_metric("glue", "sst2")
    scores, references = eval_preds
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return glue_metric.compute(
        predictions=predicted_classes,
        references=references,
    )
|
|
# Wire the model, arguments, tokenized splits, collator, tokenizer and
# metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()
| |
| |
| |
| |
| |
| |