{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 1045, 2572, 5191, 1012, 1012, 1045, 4553, 2061, 4030, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
    "tokenized_inputs=tokenizer.tokenize('I am worried.. I learn so slow')\n",
    "ids = tokenizer.convert_tokens_to_ids(tokenized_inputs)\n",
    "input_ids = tokenizer.prepare_for_model(ids)\n",
    "input_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_ids': tensor([[ 101, 1045, 2572, 5191, 1045, 4553, 2061, 4030,  102,    0],\n",
      "        [ 101, 2023, 2003, 2033, 2028, 1997, 1996, 5409, 2493,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
      "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],\n",
      "        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n",
      "torch.Size([2, 10, 768])\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[-0.1298, -0.3846],\n",
      "        [ 0.1664, -0.1490]], grad_fn=<AddmmBackward0>)\n",
      "tensor([[0.5633, 0.4367],\n",
      "        [0.5782, 0.4218]], grad_fn=<SoftmaxBackward0>)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{0: 'LABEL_0', 1: 'LABEL_1'}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
    "inputs = tokenizer([\"I am worried I learn so slow\", \n",
    "                   \"This is me one of the worst students\"],\n",
    "                   padding=True, truncation=True, return_tensors=\"pt\")\n",
    "print(inputs)\n",
    "\n",
    "from transformers import AutoModel\n",
    "\n",
    "model = AutoModel.from_pretrained(\"bert-base-uncased\")\n",
    "outputs = model(**inputs)\n",
    "print(outputs.last_hidden_state.shape)\n",
    "\n",
    "from transformers import AutoModelForSequenceClassification\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n",
    "outputs = model(**inputs)\n",
    "print(outputs.logits)\n",
    "\n",
    "import torch\n",
    "\n",
    "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
    "print(predictions)\n",
    "\n",
    "model.config.id2label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BertConfig {\n",
      "  \"architectures\": [\n",
      "    \"BertForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"gradient_checkpointing\": false,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"model_type\": \"bert\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.34.1\",\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 30522\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoConfig\n",
    "from transformers import BertConfig\n",
    "from transformers import BertModel\n",
    "\n",
    "bert_config = BertConfig.from_pretrained(\"bert-base-uncased\")\n",
    "bert_model = BertModel(bert_config)\n",
    "\n",
    "auto_config = AutoConfig.from_pretrained(\"bert-base-uncased\")\n",
    "bert_model_auto_config = BertModel(auto_config)\n",
    "\n",
    "print(bert_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_bert_config = BertConfig.from_pretrained('bert-base-uncased', num_hidden_layers = 9, vocab_size=1000)\n",
    "new_bert_model = BertModel(new_bert_config)\n",
    "\n",
    "save_new_bert_model = new_bert_model.save_pretrained('new_bert_model')\n",
    "\n",
    "load_new_bert_model = BertModel.from_pretrained('new_bert_model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['sentence1', 'sentence2', 'label', 'idx'],\n",
      "    num_rows: 3668\n",
      "}) {'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)} {'sentence1': 'Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0} {'sentence1': ['Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .', \"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .\", 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .', 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .', 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .'], 'sentence2': ['Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .', \"Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .\", \"On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .\", 'Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .', 'PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .'], 'label': [1, 0, 1, 0, 1], 'idx': [0, 1, 2, 3, 4]}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "008ad47191464d26a6edd36e8d00cc06",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/1725 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 3668\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 408\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 1725\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "raw_dataset = load_dataset(\"glue\", \"mrpc\")\n",
    "print(raw_dataset[\"train\"], raw_dataset[\"train\"].features, raw_dataset[\"train\"][0], raw_dataset[\"train\"][:5])\n",
    "\n",
    "from transformers import AutoTokenizer\n",
    "checkpoint = \"bert-base-uncased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "def tokenize_function(example):\n",
    "    return tokenizer(example[\"sentence1\"], example[\"sentence2\"],\n",
    "                     padding = True, truncation=True,\n",
    "                     max_length=128) \n",
    "\n",
    "tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)\n",
    "tokenized_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 3668\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 408\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 1725\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_dataset = tokenized_dataset.remove_columns(['idx','sentence1','sentence2'])\n",
    "tokenized_dataset = tokenized_dataset.rename_column('label','labels')\n",
    "tokenized_dataset = tokenized_dataset.with_format('torch')\n",
    "\n",
    "tokenized_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "    num_rows: 3668\n",
       "})"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_dataset['train']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "14633ab0e5204abf9593e1b5a8b57c2d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/3668 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8e10fd28b7ad4805b3f3b55d0b192027",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/408 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9aea16a997e64169acfde01a91195a79",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/1725 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([16, 128])\n",
      "torch.Size([16, 128])\n",
      "torch.Size([16, 128])\n",
      "torch.Size([16, 128])\n",
      "torch.Size([16, 128])\n",
      "torch.Size([16, 128])\n",
      "torch.Size([16, 128])\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
    "checkpoint = 'bert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "def tokenize_function(example):\n",
    "    return tokenizer(example['sentence1'], example['sentence2'],\n",
    "                     padding=\"max_length\",truncation=True,\n",
    "                     max_length=128)\n",
    "    \n",
    "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) \n",
    "tokenized_datasets = tokenized_datasets.remove_columns(['idx','sentence1','sentence2'])\n",
    "tokenized_datasets = tokenized_datasets.rename_column('label','labels')\n",
    "tokenized_datasets = tokenized_datasets.with_format('torch')\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "train_dataloader = DataLoader(tokenized_datasets['train'], \n",
    "                              batch_size=16, shuffle=True)\n",
    "\n",
    "for step, batch in enumerate(train_dataloader):\n",
    "    print(batch['input_ids'].shape)\n",
    "    if step>5:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "raw_datasets = load_dataset('glue', 'mrpc')\n",
    "from transformers import AutoTokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
    "\n",
    "def tokenize_function(example):\n",
    "    return tokenizer(example['sentence1'], example['sentence2'],\n",
    "                     truncation=True)\n",
    "\n",
    "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
    "tokenized_datasets = tokenized_datasets.remove_columns(['idx','sentence1','sentence2'])\n",
    "tokenized_datasets = tokenized_datasets.rename_column('label','labels')\n",
    "tokenized_datasets = tokenized_datasets.with_format('torch')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 3668\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 408\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 1725\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([16, 73])\n",
      "torch.Size([16, 75])\n",
      "torch.Size([16, 85])\n",
      "torch.Size([16, 81])\n",
      "torch.Size([16, 77])\n",
      "torch.Size([16, 83])\n",
      "torch.Size([16, 79])\n"
     ]
    }
   ],
   "source": [
    "from torch.utils.data import DataLoader\n",
    "from transformers import DataCollatorWithPadding\n",
    "\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "train_dataloader = DataLoader(tokenized_datasets['train'],\n",
    "                              batch_size=16, shuffle=True,\n",
    "                              collate_fn=data_collator)\n",
    "\n",
    "for step, batch in enumerate(train_dataloader):\n",
    "    print(batch['input_ids'].shape)\n",
    "    if step>5:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "raw_dataset = load_dataset('glue', 'mrpc')\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "def tokenize_function(example):\n",
    "    return tokenizer(example['sentence1'], example['sentence2'],\n",
    "                     truncation=True)\n",
    "\n",
    "tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "\n",
    "from transformers import AutoModelForSequenceClassification\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "\n",
    "# Specify training arguments hyperparameters:\n",
    "from transformers import TrainingArguments\n",
    "training_args = TrainingArguments(\"test-trainer\",\n",
    "                                  per_device_train_batch_size=16,\n",
    "                                  per_device_eval_batch_size=16,\n",
    "                                  num_train_epochs=5,\n",
    "                                  learning_rate=2e-5,\n",
    "                                  weight_decay=0.01)\n",
    "\n",
    "# Create the Trainer instance:\n",
    "from transformers import Trainer\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_dataset['train'],\n",
    "    eval_dataset=tokenized_dataset['validation'],\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer\n",
    ")\n",
    "trainer.train()\n",
    "\n",
    "'''The results will however be anticlimatic because \n",
    "you will only get a training loss that doesn't tell you how well the\n",
    "model is actually doing..\n",
    "To fix this, you need to specify the evaluation metric'''\n",
    "\n",
    "predictions = trainer.predict(tokenized_dataset['validation'])\n",
    "print(predictions)\n",
    "print(predictions.predictions.shape, predictions.label_ids.shape)\n",
    "\n",
    "# it returns a named tuple with 3 elements: predictions, label_ids, metrics\n",
    "# the predictions are the logits of the model with all the sentences of the dataset\n",
    "# so a numpy array of shape(488 x 2)\n",
    "\n",
    "# to match them with our labels we need to take the maximum logits for each prediction\n",
    "# to know which is the maximum, use the argmax function\n",
    "import numpy as np\n",
    "from datasets import load_metric\n",
    "\n",
    "metric = load_metric('glue', 'mrpc')\n",
    "preds = np.argmax(predictions.predictions, axis=-1)\n",
    "metric.compute(predictions=preds, references=predictions.label_ids)\n",
    "\n",
    "'''We can see that our model did learn something!'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''To monitor the metrics during training, we need to define a\n",
    "compute metric function as we did above\n",
    "and pass it to the Trainer\n",
    "'''\n",
    "metric = load_metric('glue','mrpc')\n",
    "def compute_metrics(eval_preds):\n",
    "    logits, labels = eval_preds\n",
    "    predictions = np.argmax(logits, axis=-1)\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "training_args = TrainingArguments(\"test-trainer\",\n",
    "                                  evaluation_strategy='epoch')\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_dataset['train'],\n",
    "    eval_dataset=tokenized_dataset['validation'],\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    "    compute_metrics=compute_metrics\n",
    ")\n",
    "\n",
    "trainer.train()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}