{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# BERT with Hugging Face Transformers\n",
    "\n",
    "Tokenization, model configuration, and fine-tuning on GLUE/MRPC with the `Trainer` API.\n",
    "Outputs have been cleared; the notebook is meant to be run top to bottom (Restart Kernel -> Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
    "\n",
    "# Manual pipeline: split into WordPiece tokens, map tokens to vocab ids,\n",
    "# then add the special tokens ([CLS]/[SEP]) the model expects.\n",
    "tokenized_inputs = tokenizer.tokenize('I am worried.. I learn so slow')\n",
    "ids = tokenizer.convert_tokens_to_ids(tokenized_inputs)\n",
    "input_ids = tokenizer.prepare_for_model(ids)\n",
    "input_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
    "# The one-call API batches, pads and truncates in a single step.\n",
    "inputs = tokenizer([\"I am worried I learn so slow\",\n",
    "                    \"This is me one of the worst students\"],\n",
    "                   padding=True, truncation=True, return_tensors=\"pt\")\n",
    "print(inputs)\n",
    "\n",
    "from transformers import AutoModel\n",
    "\n",
    "# Bare encoder: returns hidden states only, no task head.\n",
    "model = AutoModel.from_pretrained(\"bert-base-uncased\")\n",
    "outputs = model(**inputs)\n",
    "print(outputs.last_hidden_state.shape)  # (batch, seq_len, hidden_size=768)\n",
    "\n",
    "from transformers import AutoModelForSequenceClassification\n",
    "\n",
    "# Adds a randomly initialized 2-class head (hence the 'newly initialized' warning).\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n",
    "outputs = model(**inputs)\n",
    "print(outputs.logits)\n",
    "\n",
    "import torch\n",
    "\n",
    "# Softmax over the last axis turns logits into class probabilities.\n",
    "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
    "print(predictions)\n",
    "\n",
    "model.config.id2label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoConfig\n",
    "from transformers import BertConfig\n",
    "from transformers import BertModel\n",
    "\n",
    "# BertConfig.from_pretrained and AutoConfig.from_pretrained resolve to the\n",
    "# same configuration; BertModel(config) builds *randomly initialized*\n",
    "# weights (use BertModel.from_pretrained to load checkpoint weights).\n",
    "bert_config = BertConfig.from_pretrained(\"bert-base-uncased\")\n",
    "bert_model = BertModel(bert_config)\n",
    "\n",
    "auto_config = AutoConfig.from_pretrained(\"bert-base-uncased\")\n",
    "bert_model_auto_config = BertModel(auto_config)\n",
    "\n",
    "print(bert_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Override hyperparameters at load time to define a smaller architecture.\n",
    "new_bert_config = BertConfig.from_pretrained('bert-base-uncased',\n",
    "                                             num_hidden_layers=9, vocab_size=1000)\n",
    "new_bert_model = BertModel(new_bert_config)\n",
    "\n",
    "# save_pretrained writes config + weights to the directory and returns None,\n",
    "# so there is nothing useful to assign.\n",
    "new_bert_model.save_pretrained('new_bert_model')\n",
    "\n",
    "load_new_bert_model = BertModel.from_pretrained('new_bert_model')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing GLUE/MRPC\n",
    "\n",
    "*(Several exploratory cells in the original copy of this notebook were corrupted\n",
    "and could not be recovered; the data-inspection cell below is reconstructed from\n",
    "its printed output.)*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "raw_datasets = load_dataset('glue', 'mrpc')\n",
    "# Inspect the training split: schema, one example, and a small slice.\n",
    "print(raw_datasets['train'],\n",
    "      raw_datasets['train'].features,\n",
    "      raw_datasets['train'][0],\n",
    "      raw_datasets['train'][:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "raw_datasets = load_dataset('glue', 'mrpc')\n",
    "from transformers import AutoTokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
    "\n",
    "def tokenize_function(example):\n",
    "    # Sentence pairs: the tokenizer inserts [SEP] and sets token_type_ids.\n",
    "    return tokenizer(example['sentence1'], example['sentence2'],\n",
    "                     truncation=True)\n",
    "\n",
    "# batched=True lets the fast tokenizer process many examples per call.\n",
    "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
    "# Keep only columns the model's forward() accepts, and rename the target\n",
    "# column to 'labels' as the data collator / Trainer expect.\n",
    "tokenized_datasets = tokenized_datasets.remove_columns(['idx', 'sentence1', 'sentence2'])\n",
    "tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')\n",
    "tokenized_datasets = tokenized_datasets.with_format('torch')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.utils.data import DataLoader\n",
    "from transformers import DataCollatorWithPadding\n",
    "\n",
    "# Dynamic padding: each batch is padded to its own longest sequence rather\n",
    "# than a global maximum -- note the varying sequence lengths printed below.\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "train_dataloader = DataLoader(tokenized_datasets['train'],\n",
    "                              batch_size=16, shuffle=True,\n",
    "                              collate_fn=data_collator)\n",
    "\n",
    "for step, batch in enumerate(train_dataloader):\n",
    "    print(batch['input_ids'].shape)\n",
    "    if step > 5:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
    "\n",
    "checkpoint = 'bert-base-uncased'\n",
    "raw_dataset = load_dataset('glue', 'mrpc')\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "def tokenize_function(example):\n",
    "    return tokenizer(example['sentence1'], example['sentence2'],\n",
    "                     truncation=True)\n",
    "\n",
    "tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "\n",
    "from transformers import AutoModelForSequenceClassification\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "\n",
    "# Specify training arguments hyperparameters:\n",
    "from transformers import TrainingArguments\n",
    "training_args = TrainingArguments(\"test-trainer\",\n",
    "                                  per_device_train_batch_size=16,\n",
    "                                  per_device_eval_batch_size=16,\n",
    "                                  num_train_epochs=5,\n",
    "                                  learning_rate=2e-5,\n",
    "                                  weight_decay=0.01)\n",
    "\n",
    "# Create the Trainer instance:\n",
    "from transformers import Trainer\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_dataset['train'],\n",
    "    eval_dataset=tokenized_dataset['validation'],\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer\n",
    ")\n",
    "trainer.train()\n",
    "\n",
    "# The result is anticlimactic: training reports only the loss, which does\n",
    "# not say how well the model actually performs. Evaluate explicitly:\n",
    "predictions = trainer.predict(tokenized_dataset['validation'])\n",
    "print(predictions)\n",
    "print(predictions.predictions.shape, predictions.label_ids.shape)\n",
    "\n",
    "# predict() returns a named tuple with 3 elements: predictions, label_ids,\n",
    "# metrics. `predictions` holds the logits for every example in the split --\n",
    "# a numpy array of shape (408, 2) for the MRPC validation set.\n",
    "\n",
    "# To compare against the labels, take the argmax over the logits.\n",
    "import numpy as np\n",
    "# NOTE: datasets.load_metric is deprecated (removed in datasets >= 3.0);\n",
    "# newer code should use `import evaluate; metric = evaluate.load('glue', 'mrpc')`.\n",
    "from datasets import load_metric\n",
    "\n",
    "metric = load_metric('glue', 'mrpc')\n",
    "preds = np.argmax(predictions.predictions, axis=-1)\n",
    "metric.compute(predictions=preds, references=predictions.label_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# To monitor metrics *during* training, define a compute_metrics function\n",
    "# and pass it to the Trainer; with evaluation_strategy='epoch' it is run\n",
    "# on the eval set at the end of every epoch.\n",
    "metric = load_metric('glue', 'mrpc')\n",
    "\n",
    "def compute_metrics(eval_preds):\n",
    "    logits, labels = eval_preds\n",
    "    predictions = np.argmax(logits, axis=-1)\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "training_args = TrainingArguments(\"test-trainer\",\n",
    "                                  evaluation_strategy='epoch')\n",
    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_dataset['train'],\n",
    "    eval_dataset=tokenized_dataset['validation'],\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    "    compute_metrics=compute_metrics\n",
    ")\n",
    "\n",
    "trainer.train()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}