{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports are grouped here so a missing dependency fails before the\n",
    "# (long) training run instead of after it.\n",
    "import numpy as np\n",
    "from datasets import load_dataset, load_metric\n",
    "from transformers import (\n",
    "    AutoModelForSequenceClassification,\n",
    "    AutoTokenizer,\n",
    "    DataCollatorWithPadding,\n",
    "    Trainer,\n",
    "    TrainingArguments,\n",
    ")\n",
    "\n",
    "# Load the MRPC task of GLUE and the tokenizer that matches the checkpoint.\n",
    "checkpoint = 'bert-base-uncased'\n",
    "raw_dataset = load_dataset('glue', 'mrpc')\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "\n",
    "def tokenize_function(example):\n",
    "    '''Tokenize `sentence1` and `sentence2` of each example as a pair.\n",
    "\n",
    "    Used with `map(..., batched=True)` below, so `example` is a batch of rows.\n",
    "    Truncation is enabled here; padding is left to the data collator.\n",
    "    '''\n",
    "    return tokenizer(example['sentence1'], example['sentence2'],\n",
    "                     truncation=True)\n",
    "\n",
    "\n",
    "tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "\n",
    "# Specify the training hyperparameters:\n",
    "training_args = TrainingArguments(\n",
    "    \"test-trainer\",\n",
    "    per_device_train_batch_size=16,\n",
    "    per_device_eval_batch_size=16,\n",
    "    num_train_epochs=5,\n",
    "    learning_rate=2e-5,\n",
    "    weight_decay=0.01,\n",
    ")\n",
    "\n",
    "# Create the Trainer instance:\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_dataset['train'],\n",
    "    eval_dataset=tokenized_dataset['validation'],\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    ")\n",
    "trainer.train()\n",
    "\n",
    "# The results will however be anticlimactic, because you only get a training\n",
    "# loss, which doesn't tell you how well the model is actually doing.\n",
    "# To fix this, you need to specify an evaluation metric.\n",
    "\n",
    "predictions = trainer.predict(tokenized_dataset['validation'])\n",
    "print(predictions)\n",
    "print(predictions.predictions.shape, predictions.label_ids.shape)\n",
    "\n",
    "# predict() returns a named tuple with 3 elements: predictions, label_ids, metrics.\n",
    "# The predictions are the logits of the model for all the sentences of the\n",
    "# dataset, so a numpy array of shape (408 x 2).\n",
    "\n",
    "# To match them with our labels we need the index of the largest logit for\n",
    "# each prediction; np.argmax over the last axis gives exactly that.\n",
    "# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in\n",
    "# favour of the separate `evaluate` package - confirm against the pinned version.\n",
    "metric = load_metric('glue', 'mrpc')\n",
    "preds = np.argmax(predictions.predictions, axis=-1)\n",
    "\n",
    "# Print the scores explicitly: a bare expression in the middle of a cell is\n",
    "# silently discarded, so the metric would otherwise never be shown.\n",
    "print(metric.compute(predictions=preds, references=predictions.label_ids))\n",
    "\n",
    "# We can see that our model did learn something!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# To monitor the metrics during training, we need to define a\n",
    "# compute_metrics function (same logic as above) and pass it to the Trainer.\n",
    "metric = load_metric('glue', 'mrpc')\n",
    "\n",
    "\n",
    "def compute_metrics(eval_preds):\n",
    "    '''Score one evaluation pass with the GLUE/MRPC metric.\n",
    "\n",
    "    Args:\n",
    "        eval_preds: two-item sequence, unpacked as `(logits, labels)`.\n",
    "\n",
    "    Returns:\n",
    "        The result of `metric.compute` for the argmax of the logits\n",
    "        (taken over the last axis) against `labels`.\n",
    "    '''\n",
    "    logits, labels = eval_preds\n",
    "    predictions = np.argmax(logits, axis=-1)\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "\n",
    "# Evaluate at the end of every epoch so the metrics are reported during training.\n",
    "training_args = TrainingArguments(\n",
    "    \"test-trainer\",\n",
    "    evaluation_strategy='epoch',\n",
    ")\n",
    "\n",
    "# Reload the pretrained checkpoint so this run does not continue from the\n",
    "# weights already fine-tuned in the previous cell.\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_dataset['train'],\n",
    "    eval_dataset=tokenized_dataset['validation'],\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    "    compute_metrics=compute_metrics,\n",
    ")\n",
    "\n",
    "trainer.train()\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}