| import pandas as pd |
| import torch |
| |
| |
| from transformers import GPT2Tokenizer |
|
|
| |
| |
| |
| |
|
|
# GPT-2 tokenizer used for both inputs and targets; add_bos_token=True makes
# the encoder prepend the BOS token to every encoded sequence.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', add_bos_token=True)
# GPT-2 ships without a pad token; reuse EOS as padding so padded batches
# can be built in collate_fn below.
tokenizer.pad_token = tokenizer.eos_token
|
|
| |
|
|
| |
| |
|
|
| |
| |
|
|
# Load the evaluation split and normalize to a clean 0..n-1 RangeIndex.
test_df = pd.read_csv('./test_data.csv').reset_index(drop=True)
| |
|
|
class TextDataset(torch.utils.data.Dataset):
    """Minimal map-style dataset over aligned (input text, target text) pairs.

    Tokenization is deferred to the DataLoader's collate function; this class
    only stores and indexes the raw sequences.
    """

    def __init__(self, X, y):
        # X and y are parallel sequences; sample k is (X[k], y[k]).
        self.X = X
        self.y = y

    def __len__(self):
        # Dataset size is driven by the inputs; y is assumed equally long.
        return len(self.X)

    def __getitem__(self, idx):
        # Return the aligned raw (input, target) pair for sample idx.
        return self.X[idx], self.y[idx]
| |
def collate_fn(batch):
    """Collate (input_text, target_text) pairs into padded token tensors.

    Inputs are padded to the longest *encoded* input in the batch and targets
    to the longest encoded target, then row-stacked.

    Args:
        batch: iterable of (input_text, target_text) string pairs.

    Returns:
        Tuple (input_ids, input_mask, target_ids, target_mask) of 2-D
        LongTensors shaped (len(batch), max_input_len | max_target_len).
    """
    X = [pair[0] for pair in batch]
    y = [pair[1] for pair in batch]

    # BUG FIX: lengths must be measured on the encoded ids, not on
    # tokenizer.tokenize(). The tokenizer was built with add_bos_token=True,
    # so tokenizer(text) prepends a BOS token that tokenize() does not count;
    # using the tokenize() count as max_length with truncation=True silently
    # dropped the final token of the longest sequence(s).
    max_len_x = max(len(tokenizer(text)['input_ids']) for text in X)
    max_len_y = max(len(tokenizer(text)['input_ids']) for text in y)

    inputs = [tokenizer(text, max_length=max_len_x, padding='max_length',
                        truncation=True, return_tensors='pt',
                        return_attention_mask=True) for text in X]
    targets = [tokenizer(text, max_length=max_len_y, padding='max_length',
                         truncation=True, return_tensors='pt',
                         return_attention_mask=True) for text in y]

    input_ids = [enc['input_ids'] for enc in inputs]
    input_mask = [enc['attention_mask'] for enc in inputs]
    target_ids = [enc['input_ids'] for enc in targets]
    target_mask = [enc['attention_mask'] for enc in targets]

    # Each encoding is (1, max_len); vstack yields (batch, max_len).
    return (torch.vstack(input_ids), torch.vstack(input_mask),
            torch.vstack(target_ids), torch.vstack(target_mask))
|
|
# Wrap the evaluation dataframe in a Dataset and a single-pass DataLoader.
# batch_size=5000 is intended to cover the whole split in one batch;
# shuffle=False keeps row order aligned with test_df.
val_ds = TextDataset(test_df['X'].values, test_df['y'].values)
valloader = torch.utils.data.DataLoader(
    val_ds,
    batch_size=5000,
    shuffle=False,
    collate_fn=collate_fn,
)
|
|
| |
|
|
def get_sample(i, device='cpu'):
    """Fetch validation sample i as device-resident tensors.

    NOTE(review): this re-pulls (and re-collates) the first batch of
    `valloader` on every call; with batch_size=5000 that first batch is
    presumably the whole split — confirm if the split can exceed 5000 rows.

    Args:
        i: row index within the first validation batch.
        device: torch device string for the returned tensors.

    Returns:
        Tuple (input_ids, input_mask, target_ids, target_mask). The first two
        carry a leading batch dimension of 1 and the mask is cast to float32;
        the target tensors are returned without an added batch dimension.
    """
    ids, mask, tgt_ids, tgt_mask = next(iter(valloader))
    x = ids[i].unsqueeze(dim=0).to(device)
    m = mask[i].unsqueeze(dim=0).type(torch.float32).to(device)
    t = tgt_ids[i].to(device)
    tm = tgt_mask[i].to(device)
    return x, m, t, tm
|
|
|
|
| |
| |
| |
| |
| |
|
|