{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['i', 'am', 'going', 'to', 'token', '##ize', 'this', 'sentence', '.']\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "tokens = tokenizer.tokenize('''I am going to tokenize this sentence.''')\n", "print(tokens)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['▁i', '▁am', '▁going', '▁to', '▁to', 'ken', 'ize', '▁this', '▁sentence', '▁using', '▁albert', '-', 'base', '-', 'v', '1', '▁model', \"'\", 's', '▁to', 'ken', 'izer', '.']\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('albert-base-v1')\n", "tokens = tokenizer.tokenize('''I am going to tokenize this sentence\n", " using albert-base-v1 model's tokenizer.''')\n", "print(tokens)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Note that above, the subword based tokenizer \\nis used in both tokenizers. \\nHowever, they mark word boundaries differently: the albert\\nbase tokenizer marks the start of a word with the prefix ▁,\\nwhile the bert base tokenizer marks the continuation of a word with ##.'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''Note that above, the subword based tokenizer \n", "is used in both tokenizers. 
\n", "However, they mark word boundaries differently: the albert\n", "base tokenizer marks the start of a word with the prefix ▁,\n", "while the bert base tokenizer marks the continuation of a word with ##.'''\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[11082, 3046, 2000, 19204, 4697]\n" ] } ], "source": [ "'''\n", "The second task of the tokenization pipeline\n", "is to map those tokens to their respective IDs.\n", "This is done by the convert_tokens_to_ids method.\n", "'''\n", "\n", "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "tokens = tokenizer.tokenize('Lets try to tokenize')\n", "input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", "print(input_ids)\n", "\n", "'''This is why we need to download a file while instantiating\n", "the tokenizer from pretrained method.\n", "We have to make sure we use the same mapping as to when the model\n", "was pretrained\n", "To do this we use the convert tokens to ids method.'''" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[101, 11082, 3046, 2000, 19204, 4697, 102]\n" ] } ], "source": [ "'''\n", "You may have noticed that the tokens for CLS and SEP \n", "are missing! Those are the special tokens that are added \n", "by the prepare for model method. 
\n", "The prepare for model method knows which special tokens\n", "to add and where to add them based on the model type.\n", "'''\n", "final_inputs = tokenizer.prepare_for_model(input_ids)\n", "print(final_inputs['input_ids'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[CLS] lets try to tokenize [SEP]\n" ] } ], "source": [ "'''\n", "You can look at the special tokens modularly\n", "by decoding the input ids as how the tokenizer \n", "has changed your text by using the decode method.\n", "'''\n", "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "inputs = tokenizer('Lets try to tokenize')\n", "print(tokenizer.decode(inputs['input_ids']))\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<s>Lets try to tokenize.</s>\n" ] } ], "source": [ "'''Above the bert tokenizer uses the [CLS] and [SEP] tokens\n", "But the roberta tokenizer uses the <s> and </s> tokens'''\n", "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('roberta-base')\n", "inputs = tokenizer('Lets try to tokenize.')\n", "print(tokenizer.decode(inputs['input_ids']))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are actually two ways to decode:\n", "- The one shown above\n", "- The one shown below" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a BertTokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': [101, 10463, 2023, 2000, 19204, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}\n", "convert this to tokens.\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "tokens = tokenizer.tokenize('Convert this to tokens.')\n", "input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", "inputs = tokenizer.prepare_for_model(input_ids)\n", "print(inputs)\n", "\n", "# decode:\n", "decode = tokenizer.decode(input_ids)\n", "print(decode)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': [101, 10463, 2023, 2000, 19204, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}\n", "[CLS] convert this to tokens. 
[SEP]\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer =AutoTokenizer.from_pretrained('bert-base-uncased')\n", "inputs = tokenizer('Convert this to tokens.')\n", "print(inputs)\n", "decode = tokenizer.decode(inputs['input_ids'])\n", "print(decode)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': [101, 11082, 3046, 2000, 19204, 4697, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}\n" ] } ], "source": [ "'''\n", "Now that you know the intermediate stuff about how \n", "a tokenizer works, you can forget all that stuff\n", "and only remember that you have to call it on the input \n", "text.\n", "'''\n", "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "inputs = tokenizer('Lets try to tokenize')\n", "print(inputs)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'To learn what attention mask is\\ncheck out the --batch input together\\n\\nTo learn what the token type ids are\\ncheck out --process pairs of sentences \\n'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''To learn what attention mask is\n", "check out the --batch input together\n", "\n", "To learn what the token type ids are\n", "check out --process pairs of sentences \n", "'''" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The primary and easy way to batch inputs together is as follows:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': tensor([[ 101, 1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607,\n", " 2026, 2878, 2166, 1012, 102],\n", " [ 101, 1045, 5223, 2023, 2061, 2172, 102, 0, 0, 0,\n", " 0, 0, 0, 0, 0],\n", " [ 101, 1045, 2572, 
2025, 9657, 1012, 102, 0, 0, 0,\n", " 0, 0, 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", " [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = 'bert-base-uncased'\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "\n", "sentences = ['I have been waiting for a hugging face course my whole life.',\n", " 'I hate this so much',\n", " 'I am not confident.']\n", "tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')\n", "print(tokens)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "However, if you wish to batch inputs together from beneath the tokenizer pipeline:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 2023, 2061, 2172], [1045, 2572, 2025, 9657, 1012]]\n", "[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012]\n", "[1045, 5223, 2023, 2061, 2172]\n", "[1045, 2572, 2025, 9657, 1012]\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = 'bert-base-uncased'\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "\n", "sentences = ['I have been waiting for a hugging face course my whole life.',\n", " 'I hate this so much',\n", " 'I am not confident.']\n", "\n", "tokens = [tokenizer.tokenize(sentence) for sentence in sentences]\n", "ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]\n", "\n", "print(ids)\n", "print(ids[0])\n", "print(ids[1])\n", "print(ids[2])" ] }, { "cell_type": "code", 
"execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 2023, 2061, 2172], [1045, 2572, 2025, 9657, 1012]]\n", "ValueError: expected sequence of length 13 at dim 1 (got 5)\n" ] } ], "source": [ "'''Trying to create a tensor from the three lists \n", "in torch or tensorflow will result in an error. This\n", "is because a tensor must be rectangular, i.e. every row\n", "must have the same length. Padding fixes this, as we will see later on.'''\n", "\n", "import torch\n", "\n", "print(ids)\n", "try:\n", "    input_ids = torch.tensor(ids)\n", "except ValueError as err:\n", "    # The failure is the point of this cell: the rows of `ids` have\n", "    # different lengths. Catch it so Restart & Run All still reaches the end.\n", "    print(f'{type(err).__name__}: {err}')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n" ] } ], "source": [ "'''The padding id provided to the model via training is 0.\n", "One should not try to change it. 
you can pad your outputs like so:'''\n", "\n", "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "pad = tokenizer.pad_token_id\n", "print(pad)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ "import torch\n", "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "\n", "sentences = ['I have been waiting for a hugging face course my whole life.',\n", " 'I hate this so much',\n", " 'I am not confident.']\n", "\n", "tokens = [tokenizer.tokenize(sentence) for sentence in sentences]\n", "ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]\n", "\n", "print(ids)\n", "print(ids[0])\n", "print(ids[1])\n", "print(ids[2])\n", "\n", "# pad_token_id is a plain int attribute, not a method, so it cannot be called.\n", "# Append it to every row until each row is as long as the longest one.\n", "max_len = max(len(row) for row in ids)\n", "pad_ids = [row + [tokenizer.pad_token_id] * (max_len - len(row)) for row in ids]\n", "print(pad_ids)\n", "\n", "ids1 = torch.tensor(pad_ids[0])\n", "ids2 = torch.tensor(pad_ids[1])\n", "ids3 = torch.tensor(pad_ids[2])\n", "# torch.tensor() accepts a single data argument; the rows now share one\n", "# length, so stack them into one rectangular batch instead.\n", "all_ids = torch.stack([ids1, ids2, ids3])\n", "print(all_ids)\n", "\n", "# NOTE: calling the tokenizer with padding=True also returns an attention_mask\n", "# that is 0 on the padded positions; manual padding like this does not build one." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['this', 'is', 'me', 'practicing']\n", "[2023, 2003, 2033, 12560]\n", "['this', 'is', 'me', 'practicing']\n", "this is me practicing\n" ] } ], "source": [ "'''More methods!!!'''\n", "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "tokens = tokenizer.tokenize('This is me practicing')\n", "print(tokens)\n", "input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", "print(input_ids)\n", "tokens_2 = tokenizer.convert_ids_to_tokens(input_ids)\n", "print(tokens_2)\n", "strings = tokenizer.convert_tokens_to_string(tokens)\n", "print(strings)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" 
}, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 2 }