{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Transformers walkthrough: pipelines, tokenizers, models, datasets\n",
    "\n",
    "NOTE(review): this notebook was recovered from a corrupted export in which\n",
    "all text between `<` and `>` characters was stripped. Four cell sources\n",
    "(original execution counts 23, 32, 35, 36) were destroyed and are kept\n",
    "below as placeholders — restore them from version control. Corrupted\n",
    "outputs were cleared; re-run top-to-bottom to regenerate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "# Pin the model explicitly. The previous version passed the checkpoint as a\n",
    "# second positional argument to the classifier *call*, where it was silently\n",
    "# ignored (the run logged \"Ignoring args : ('bert-base-uncased',)\") and the\n",
    "# pipeline fell back to an unpinned default model — which the library itself\n",
    "# warns against using in production.\n",
    "checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'\n",
    "\n",
    "classifier = pipeline('sentiment-analysis', model=checkpoint)\n",
    "classifier('I am disappointed in myself')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports for this cell up front (they were previously scattered\n",
    "# between the statements, which hides dependencies on a fresh re-run).\n",
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "raw_inputs = ['This is a sentence', 'This is another sentence.']\n",
    "\n",
    "# Manual, step-by-step tokenization: text -> tokens -> ids -> model-ready dict.\n",
    "input_tokens = tokenizer.tokenize(raw_inputs)\n",
    "input_ids = tokenizer.convert_tokens_to_ids(input_tokens)\n",
    "inputs = tokenizer.prepare_for_model(input_ids)\n",
    "\n",
    "print(input_tokens, input_ids, inputs)\n",
    "print(inputs)\n",
    "print(tokenizer.decode(inputs['input_ids']))\n",
    "\n",
    "# The recommended one-step call: handles batching, padding, truncation and\n",
    "# tensor conversion in one go.\n",
    "direct_inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')\n",
    "print(direct_inputs)\n",
    "\n",
    "# Bare encoder: last hidden state is (batch, seq_len, hidden_size).\n",
    "model = AutoModel.from_pretrained(checkpoint)\n",
    "outputs = model(**direct_inputs)\n",
    "print(outputs.last_hidden_state.shape)\n",
    "\n",
    "# Task head: logits are (batch, num_labels). The classification head is\n",
    "# newly initialised for this checkpoint, so the probabilities are random\n",
    "# until the model is fine-tuned (the run warned about exactly this).\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "outputs = model(**direct_inputs)\n",
    "print(outputs.logits.shape)\n",
    "\n",
    "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
    "print(predictions)\n",
    "\n",
    "model.config.id2label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Three equivalent ways to obtain a model, plus a save/load round-trip.\n",
    "from transformers import AutoModel, AutoConfig, BertConfig, BertModel\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "\n",
    "# 1) Pretrained weights via the auto class.\n",
    "model = AutoModel.from_pretrained(checkpoint)\n",
    "\n",
    "# 2) Randomly initialised model from an auto-resolved config.\n",
    "config = AutoConfig.from_pretrained(checkpoint)\n",
    "model = BertModel(config)\n",
    "\n",
    "# 3) Same, with the architecture-specific config class.\n",
    "config = BertConfig.from_pretrained(checkpoint)\n",
    "model = BertModel(config)\n",
    "\n",
    "# Round-trip: save to disk, reload from the local directory.\n",
    "model.save_pretrained('my-bert-model')\n",
    "model = BertModel.from_pretrained('my-bert-model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# MRPC (paraphrase detection) from the GLUE benchmark:\n",
    "# train 3668 / validation 408 / test 1725 sentence pairs.\n",
    "raw_datasets = load_dataset('glue', 'mrpc')\n",
    "raw_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_datasets['train']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_datasets['train'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `label` is a ClassLabel: 0 = not_equivalent, 1 = equivalent.\n",
    "raw_datasets['train'].features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): source lost in the corrupted export (original exec count 23).\n",
    "# Judging by the surviving Map progress bar and the trailing\n",
    "# '...5:\\n    break', this cell presumably mapped a tokenize function over\n",
    "# the dataset and inspected the first few examples — TODO restore from VCS."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dynamic padding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): source lost in the corrupted export (original exec count 32).\n",
    "# The surviving Map progress bar (0/3668) and trailing 'break' suggest a\n",
    "# tokenized-dataset map plus a short inspection loop — TODO restore from VCS."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): source lost in the corrupted export (original exec count 35).\n",
    "# TODO restore from version control."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): source lost in the corrupted export (original exec count 36,\n",
    "# mapped over the 408-row validation split). TODO restore from version control."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}