{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "raw_dataset = load_dataset('glue', 'mrpc')\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "def tokenize_function(example):\n",
    "    return tokenizer(example['sentence1'], example['sentence2'],\n",
    "                     truncation=True)\n",
    "\n",
    "tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "\n",
    "from transformers import AutoModelForSequenceClassification\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "\n",
    "# Specify training arguments hyperparameters:\n",
    "from transformers import TrainingArguments\n",
    "training_args = TrainingArguments(\"test-trainer\",\n",
    "                                  per_device_train_batch_size=16,\n",
    "                                  per_device_eval_batch_size=16,\n",
    "                                  num_train_epochs=5,\n",
    "                                  learning_rate=2e-5,\n",
    "                                  weight_decay=0.01)\n",
    "\n",
    "# Create the Trainer instance:\n",
    "from transformers import Trainer\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_dataset['train'],\n",
    "    eval_dataset=tokenized_dataset['validation'],\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer\n",
    ")\n",
    "trainer.train()\n",
    "\n",
    "'''The results will however be anticlimatic because \n",
    "you will only get a training loss that doesn't tell you how well the\n",
    "model is actually doing..\n",
    "To fix this, you need to specify the evaluation metric'''\n",
    "\n",
    "predictions = trainer.predict(tokenized_dataset['validation'])\n",
    "print(predictions)\n",
    "print(predictions.predictions.shape, predictions.label_ids.shape)\n",
    "\n",
    "# it returns a named tuple with 3 elements: predictions, label_ids, metrics\n",
    "# the predictions are the logits of the model with all the sentences of the dataset\n",
    "# so a numpy array of shape(488 x 2)\n",
    "\n",
    "# to match them with our labels we need to take the maximum logits for each prediction\n",
    "# to know which is the maximum, use the argmax function\n",
    "import numpy as np\n",
    "from datasets import load_metric\n",
    "\n",
    "metric = load_metric('glue', 'mrpc')\n",
    "preds = np.argmax(predictions.predictions, axis=-1)\n",
    "metric.compute(predictions=preds, references=predictions.label_ids)\n",
    "\n",
    "'''We can see that our model did learn something!'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''To monitor the metrics during training, we need to define a\n",
    "compute metric function as we did above\n",
    "and pass it to the Trainer\n",
    "'''\n",
    "metric = load_metric('glue','mrpc')\n",
    "def compute_metrics(eval_preds):\n",
    "    logits, labels = eval_preds\n",
    "    predictions = np.argmax(logits, axis=-1)\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "training_args = TrainingArguments(\"test-trainer\",\n",
    "                                  evaluation_strategy='epoch')\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_dataset['train'],\n",
    "    eval_dataset=tokenized_dataset['validation'],\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    "    compute_metrics=compute_metrics\n",
    ")\n",
    "\n",
    "trainer.train()\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}