from torch.utils.data import DataLoader, Dataset
import torch
from transformers import BertTokenizerFast, BertModel
from transformers import BertConfig, BertPreTrainedModel
import numpy as np
from typing import Dict, List, Union, Tuple
from utils import ner_labels_to_ids, intent_labels_to_ids, structure_data


class tokenized_dataset(Dataset):
    """
    A PyTorch Dataset that tokenizes and encodes text data for a BERT-based model.

    Args:
        dataset (dict): A dictionary containing 'text', 'intent', and 'entities' keys.
        tokenizer (BertTokenizerFast): A tokenizer for processing text input.
        max_len (int, optional): Maximum length of tokenized sequences (default: 128).

    Attributes:
        len (int): Number of samples in the dataset.

    Methods:
        __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
            Retrieve and preprocess a single sample from the dataset.

        __len__(self) -> int:
            Get the total number of samples in the dataset.

    Returns:
        Dict[str, torch.Tensor]: A dictionary containing the tokenized and encoded text
        together with the NER and intent labels for a single sample.
    """
    def __init__(self, dataset: Dict[str, List[str]], tokenizer: BertTokenizerFast, max_len: int = 128):
        self.len = len(dataset['text'])
        # Mappings from label strings to integer ids
        self.ner_labels_to_ids = ner_labels_to_ids()
        self.intent_labels_to_ids = intent_labels_to_ids()
        self.text = dataset['text']
        self.intent = dataset['intent']
        self.ner = dataset['entities']
        # The tokenizer is passed in as an already-instantiated BertTokenizerFast
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        # Pull out the raw sample: the sentence, its intent label, and its
        # word-level NER labels.
        sentence = self.text[index].strip()
        intent_label = self.intent[index].strip()
        ner_labels = self.ner[index]

        # Tokenize the sentence; the offset mapping is needed below to align
        # the word-level NER labels with BERT's sub-word tokens.
        encoding = self.tokenizer(
            sentence,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )
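
        # Illustration (hypothetical, not taken from the real data): for the
        # sentence "rebook my flight" the tokenizer might produce
        #   tokens:  [CLS]  re     ##book  my     flight   [SEP]  [PAD] ...
        #   offsets: (0,0)  (0,2)  (2,6)   (7,9)  (10,16)  (0,0)  (0,0) ...
        # where (0, 0) marks special/padding tokens and every other pair is a
        # sub-token's character span inside the sentence.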

        # Convert the word-level NER labels to integer ids.
        tokenized_ner_labels = [self.ner_labels_to_ids[label] for label in ner_labels]

        # Initialise every token position with -100 so that special tokens,
        # padding, and word-continuation sub-tokens are ignored by the loss.
        encoded_ner_labels = np.ones(len(encoding['offset_mapping']), dtype=int) * -100

        # Assign each word's label to its first sub-token only. A sub-token whose
        # start offset equals the previous sub-token's end offset continues the
        # same word and therefore keeps the -100 label.
        i = 0
        prev = -1
        for idx, mapping in enumerate(encoding['offset_mapping']):
            if mapping[0] == mapping[1] == 0:
                # Special or padding token
                continue
            if mapping[0] != prev:
                # First sub-token of a new word
                encoded_ner_labels[idx] = tokenized_ner_labels[i]
                prev = mapping[1]
                i += 1
            else:
                prev = mapping[1]
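
        # Continuing the hypothetical "rebook my flight" example with word-level
        # labels ['O', 'O', 'B-object'] (label names invented for illustration),
        # only the first sub-token of each word keeps a real label id:
        #   [CLS]  re  ##book  my  flight    [SEP]  [PAD] ...
        #   -100   O   -100    O   B-object  -100   -100  ...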

        # Encode the sentence-level intent label.
        tokenized_intent_label = self.intent_labels_to_ids[intent_label]

        # Package the encoding (input_ids, attention_mask, token_type_ids,
        # offset_mapping) as tensors and attach the two label tensors.
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['ner_labels'] = torch.as_tensor(encoded_ner_labels)
        item['intent_labels'] = torch.as_tensor(tokenized_intent_label)

        return item

    def __len__(self) -> int:
        return self.len
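

# A minimal usage sketch under stated assumptions: `structure_data()` is the
# helper imported from `utils` above, and it is assumed here to take no
# arguments and return a dict with 'text', 'intent', and 'entities' keys; its
# real signature lives in utils.py. The model name and batch size are
# illustrative.
if __name__ == "__main__":
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    data = structure_data()  # assumed call signature; see utils.py
    dataset = tokenized_dataset(data, tokenizer, max_len=128)
    loader = DataLoader(dataset, batch_size=16, shuffle=True)

    # Default collation stacks the per-sample tensors into batches, e.g.
    # input_ids -> (16, 128), ner_labels -> (16, 128), intent_labels -> (16,).
    batch = next(iter(loader))
    print(batch["input_ids"].shape)
    print(batch["ner_labels"].shape)
    print(batch["intent_labels"].shape)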
|
|