{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is how to batch inputs together."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "# Pad to the longest sequence, truncate to the model maximum, and return\n",
    "# PyTorch tensors so both sentences stack into a single batch.\n",
    "sequences = [\n",
    "    'I will not give up this time',\n",
    "    'I will try my best and see what happens',\n",
    "]\n",
    "batch = tokenizer(sequences, padding=True, truncation=True, return_tensors='pt')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The tokenizer accepts sentence pairs as well!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 2026, 2171, 2003, 14093, 999, 102, 1045, 2572, 1037, 3076, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "# Two positional strings are encoded as one sentence pair; in the output,\n",
    "# token_type_ids mark the first segment with 0 and the second with 1.\n",
    "tokenizer('My name is Abdullah!', 'I am a student.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we have several pairs of sentences:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([[ 101, 2026, 2171, 2003, 14093, 999, 102, 1045, 2572, 1037,\n",
       " 3076, 102, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       " 0],\n",
       " [ 101, 1045, 2572, 6517, 1045, 2514, 2066, 1045, 2001, 6620,\n",
       " 3993, 1012, 102, 2045, 2003, 2061, 2172, 2000, 4553, 1012,\n",
       " 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
       " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       " 0],\n",
       " [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "# The two lists are zipped element-wise into sentence pairs; the batch is\n",
    "# then padded to the longest pair and returned as PyTorch tensors.\n",
    "tokenizer(\n",
    "    ['My name is Abdullah!', 'I am sad I feel like I was prideful.'],\n",
    "    ['I am a student', 'There is so much to learn.'],\n",
    "    padding=True,\n",
    "    truncation=True,\n",
    "    return_tensors='pt',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, the tokenized batch can be fed directly to a sequence-classification model. Note that the classification head here is randomly initialized, so the predictions are not meaningful until the model is fine-tuned."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[ 0.2318, -0.3874],\n",
      " [-0.3586, -0.5364]], grad_fn=<AddmmBackward0>)\n",
      "tensor([[0.6500, 0.3500],\n",
      " [0.5443, 0.4557]], grad_fn=<SoftmaxBackward0>)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{0: 'LABEL_0', 1: 'LABEL_1'}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "batch = tokenizer(\n",
    "    ['My name is Abdullah!', 'I work at Hackules.inc'],\n",
    "    ['I am a student', 'This movie is great'],\n",
    "    padding=True,\n",
    "    truncation=True,\n",
    "    return_tensors='pt',\n",
    ")\n",
    "\n",
    "# The classification head is freshly initialized (see the stderr warning),\n",
    "# so these logits are essentially random.\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "outputs = model(**batch)\n",
    "print(outputs.logits)\n",
    "\n",
    "# Softmax over the class dimension converts logits to probabilities.\n",
    "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
    "print(predictions)\n",
    "\n",
    "# Index-to-label mapping; the last expression is rich-displayed by Jupyter.\n",
    "model.config.id2label"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}