{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is how to batch inputs together."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "# Pad to the longest sequence, truncate to the model maximum, and return\n",
    "# PyTorch tensors so both sentences stack into a single batch.\n",
    "sequences = [\n",
    "    'I will not give up this time',\n",
    "    'I will try my best and see what happens',\n",
    "]\n",
    "batch = tokenizer(sequences, padding=True, truncation=True, return_tensors='pt')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The tokenizer accepts sentence pairs as well!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 2026, 2171, 2003, 14093, 999, 102, 1045, 2572, 1037, 3076, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "# Two positional strings are encoded as one sentence pair; in the output,\n",
    "# token_type_ids mark the first segment with 0 and the second with 1.\n",
    "tokenizer('My name is Abdullah!', 'I am a student.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we have several pairs of sentences:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([[ 101, 2026, 2171, 2003, 14093, 999, 102, 1045, 2572, 1037,\n",
       " 3076, 102, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       " 0],\n",
       " [ 101, 1045, 2572, 6517, 1045, 2514, 2066, 1045, 2001, 6620,\n",
       " 3993, 1012, 102, 2045, 2003, 2061, 2172, 2000, 4553, 1012,\n",
       " 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
       " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       " 0],\n",
       " [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "# The two lists are zipped element-wise into sentence pairs; the batch is\n",
    "# then padded to the longest pair and returned as PyTorch tensors.\n",
    "tokenizer(\n",
    "    ['My name is Abdullah!', 'I am sad I feel like I was prideful.'],\n",
    "    ['I am a student', 'There is so much to learn.'],\n",
    "    padding=True,\n",
    "    truncation=True,\n",
    "    return_tensors='pt',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, the tokenized batch can be fed directly to a sequence-classification model. Note that the classification head here is randomly initialized, so the predictions are not meaningful until the model is fine-tuned."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[ 0.2318, -0.3874],\n",
      " [-0.3586, -0.5364]], grad_fn=<AddmmBackward0>)\n",
      "tensor([[0.6500, 0.3500],\n",
      " [0.5443, 0.4557]], grad_fn=<SoftmaxBackward0>)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{0: 'LABEL_0', 1: 'LABEL_1'}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "batch = tokenizer(\n",
    "    ['My name is Abdullah!', 'I work at Hackules.inc'],\n",
    "    ['I am a student', 'This movie is great'],\n",
    "    padding=True,\n",
    "    truncation=True,\n",
    "    return_tensors='pt',\n",
    ")\n",
    "\n",
    "# The classification head is freshly initialized (see the stderr warning),\n",
    "# so these logits are essentially random.\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "outputs = model(**batch)\n",
    "print(outputs.logits)\n",
    "\n",
    "# Softmax over the class dimension converts logits to probabilities.\n",
    "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
    "print(predictions)\n",
    "\n",
    "# Index-to-label mapping; the last expression is rich-displayed by Jupyter.\n",
    "model.config.id2label"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}