| |
| import numpy as np |
| import pandas as pd |
| import csv |
| import torch.nn as nn |
| from torch.optim.lr_scheduler import ReduceLROnPlateau |
| from torch.utils.data import TensorDataset, DataLoader |
| from transformers import BertTokenizer,BertConfig,AdamW |
| from sklearn.metrics import accuracy_score |
| from sklearn.metrics import classification_report |
| from tqdm import tqdm |
| import torch |
| import transformers |
| from torch.utils.data import Dataset, DataLoader |
|
|
| |
|
|
class MyDataSet(Dataset):
    """Thin Dataset wrapper around an in-memory table or sequence.

    In this script it wraps a pandas DataFrame solely so that
    ``torch.utils.data.random_split`` can produce index subsets.
    """

    def __init__(self, loaded_data):
        # Any object supporting len(); typically a pandas DataFrame here.
        self.data = loaded_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # BUG FIX: for a DataFrame, `self.data[idx]` is a COLUMN lookup and
        # raises KeyError for integer positions; use positional .iloc instead.
        # Plain sequences (list/tuple/ndarray) keep direct indexing.
        if hasattr(self.data, "iloc"):
            return self.data.iloc[idx]
        return self.data[idx]
| |
# --- Load the raw CSV and carve out train/dev/test index splits ---

# Kaggle-specific input location — adjust when running elsewhere.
Data_path = "/kaggle/input/inference/train.csv"
Totle_data = pd.read_csv(Data_path)
# Keep a 10% random sample of the rows (unseeded, so the subset differs
# between runs — NOTE(review): confirm nondeterminism is intended).
Totle_data = Totle_data.sample(frac=0.1)
# Drop rows where the column literally named "2" is NaN
# (presumably the label or text column — verify against the CSV header).
Totle_data = Totle_data.dropna(axis=0,subset = ["2"])
custom_dataset = MyDataSet(Totle_data)

# 60% train, 10% validation, remaining ~30% test.
train_size = int(len(custom_dataset) * 0.6)
validate_size = int(len(custom_dataset) * 0.1)
test_size = len(custom_dataset) - validate_size - train_size
# Only the .indices of these Subset objects are consumed further down;
# __getitem__ is never called on the subsets themselves.
train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(custom_dataset, [train_size, validate_size, test_size])
| |
| |
# --- Materialize the splits as CSV files for load_data() to re-read ---

# Output paths for the per-split CSVs (relative to the working directory).
train_data_path = "Bert_Try.csv"
dev_data_path = "Bert_Dev.csv"
test_data_path = "Bert_Test.csv"

# Convert each random_split Subset back into a DataFrame.
# .indices are POSITIONS into the (sampled) Totle_data frame, so .iloc is
# the correct accessor here.
train_dataset = Totle_data.iloc[train_dataset.indices]
validate_dataset = Totle_data.iloc[validate_dataset.indices]
test_dataset = Totle_data.iloc[test_dataset.indices]

# Persist the splits, keeping the header row so downstream readers skip it.
train_dataset.to_csv(train_data_path, index=False, header=True)
validate_dataset.to_csv(dev_data_path, index=False, header=True)
test_dataset.to_csv(test_data_path, index=False, header=True)

# Sanity-check preview of the training split.
# BUG FIX: the original wrote `data.head` — a bound-method reference that is
# a silent no-op in a script; call it and print the result.
data = pd.read_csv(train_data_path)
print(data.head())
|
|
| |
|
|
class BertClassificationModel(nn.Module):
    """BERT encoder with a linear head for 3-way sentence classification.

    Wraps pretrained ``bert-base-chinese`` and maps its pooled [CLS]
    representation (768-dim) to 3 class logits.
    """

    def __init__(self):
        super(BertClassificationModel, self).__init__()
        model_name = "bert-base-chinese"
        self.bert = transformers.BertModel.from_pretrained(model_name)
        # Fine-tune the full encoder, not just the classification head.
        for param in self.bert.parameters():
            param.requires_grad = True
        # 768 = BERT-base hidden size; 3 output classes.
        self.dense = nn.Linear(768, 3)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return raw (unnormalized) class logits of shape (batch, 3)."""
        encoded = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        # Index 1 is the pooler output (tanh-projected [CLS] hidden state);
        # tuple indexing works across transformers versions.
        pooled = encoded[1]
        return self.dense(pooled)
|
|
| |
|
|
def encoder(max_len, vocab_path, text_list):
    """Tokenize a batch of texts into BERT input tensors.

    Parameters
    ----------
    max_len : int
        Maximum sequence length; longer texts are truncated.
    vocab_path : str
        NOTE(review): currently UNUSED — the tokenizer is always loaded
        from the "bert-base-chinese" hub name. Kept for interface
        compatibility; wire it up or remove it at the call sites.
    text_list : list[str]
        Texts to encode (padded to the longest item in the batch).

    Returns
    -------
    (input_ids, token_type_ids, attention_mask) : torch.Tensor triple
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    # FIX: the original rebound `tokenizer` to the encoding result,
    # shadowing the tokenizer object; use a separate name.
    encoded = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']
|
|
| |
# Label-name -> class-index mapping shared by all splits.
labels2dict = {"neutral":0,"entailment":1,"contradiction":2}


def load_data(path):
    """Read a CSV of (label, text) rows and return a tokenized TensorDataset.

    Expects column 0 to hold a label name from ``labels2dict`` and column 1
    the sentence text; the first (header) row is skipped.

    Parameters
    ----------
    path : str
        CSV file produced by the split-export step above.

    Returns
    -------
    TensorDataset of (input_ids, token_type_ids, attention_mask, labels).
    """
    text_list = []
    labels = []
    # FIX: the original opened the file without ever closing it; the context
    # manager guarantees the handle is released. newline='' follows the csv
    # module's recommendation for reader input.
    with open(path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row
        for row in reader:
            labels.append(labels2dict[row[0]])
            text_list.append(row[1])

    input_ids, token_type_ids, attention_mask = encoder(
        max_len=150,
        vocab_path="/root/Bert/bert-base-chinese/vocab.txt",
        text_list=text_list,
    )
    labels = torch.tensor(labels)
    return TensorDataset(input_ids, token_type_ids, attention_mask, labels)
|
|
| |
| |
# --- Build DataLoaders for the three splits ---
batch_size = 16

# NOTE(review): these re-declare the same paths defined earlier in the file;
# keep the two sets in sync (or drop one copy).
train_data_path="Bert_Try.csv"
dev_data_path="Bert_Dev.csv"
test_data_path="Bert_Test.csv"

# Tokenize each split CSV into a TensorDataset (see load_data).
train_data = load_data(train_data_path)
dev_data = load_data(dev_data_path)
test_data = load_data(test_data_path)

# Shuffle train/dev batches; keep test order fixed for reproducible eval.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dataset=dev_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)
|
|
| |
def dev(model, dev_loader):
    """Evaluate `model` on `dev_loader`; return accuracy in [0, 1].

    Leaves the model in eval mode; the caller must restore train mode
    if it continues training.

    Parameters
    ----------
    model : nn.Module
        Forward takes (input_ids, token_type_ids, attention_mask) and
        returns per-class logits.
    dev_loader : DataLoader
        Yields (input_ids, token_type_ids, attention_mask, labels) batches.
    """
    # FIX: derive the device from the model itself instead of reading the
    # module-level `device` global — placement is the caller's decision
    # (train() already moved the model).
    device = next(model.parameters()).device
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, token_type_ids, attention_mask, labels in tqdm(
                dev_loader, desc='Dev Iteration:'):
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            logits = model(input_ids, token_type_ids, attention_mask)
            _, predict = torch.max(logits, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
    # FIX: guard against an empty loader (original divided by zero).
    return correct / total if total else 0.0
|
|
| |
|
|
# Prefer the first GPU when available; train() moves model/tensors here.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def train(model,train_loader,dev_loader) :
    """Fine-tune `model` on `train_loader`, periodically evaluating on
    `dev_loader` and checkpointing the best model to ./bert_model.pkl.

    Runs 10 epochs with transformers' AdamW (lr 1e-5, BERT-style
    decay groups) and a ReduceLROnPlateau scheduler keyed on best dev
    accuracy. Nothing is returned; the checkpoint is the output.
    """
    model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss()
    param_optimizer = list(model.named_parameters())
    # Standard BERT fine-tuning recipe: no weight decay for biases/LayerNorm.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # correct_bias=False mimics the original TF BERT Adam; this is the
    # transformers AdamW (the torch.optim one has no such kwarg).
    optimizer_params = {'lr': 1e-5, 'eps': 1e-6, 'correct_bias': False}
    optimizer = AdamW(optimizer_grouped_parameters, **optimizer_params)
    # mode='max': step() is fed an accuracy; halve LR after 5 stagnant steps.
    scheduler = ReduceLROnPlateau(optimizer,mode='max',factor=0.5,min_lr=1e-7, patience=5,verbose= True, threshold=0.0001, eps=1e-08)
    t_total = len(train_loader)  # NOTE(review): computed but never used

    total_epochs = 10
    bestAcc = 0
    # NOTE(review): correct/total are never reset, so the printed train
    # accuracy is a running average over ALL epochs, not the current one.
    correct = 0
    total = 0
    print('Training and verification begin!')
    for epoch in range(total_epochs):
        for step, (input_ids,token_type_ids,attention_mask,labels) in enumerate(train_loader):

            optimizer.zero_grad()
            input_ids,token_type_ids,attention_mask,labels=input_ids.to(device),token_type_ids.to(device),attention_mask.to(device),labels.to(device)
            out_put = model(input_ids,token_type_ids,attention_mask)
            loss = criterion(out_put, labels)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
            loss.backward()
            optimizer.step()
            # Log running train accuracy every 10 steps.
            if (step + 1) % 10 == 0:
                train_acc = correct / total
                print("Train Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,loss:{:.6f}".format(epoch + 1, total_epochs, step + 1, len(train_loader),train_acc*100,loss.item()))
            # Every 200 steps: evaluate on dev and checkpoint on improvement.
            if (step + 1) % 200 == 0:
                train_acc = correct / total
                # NOTE(review): dev() puts the model in eval mode and train
                # mode is never restored afterwards — dropout stays disabled
                # for the rest of training; confirm this is intended.
                acc = dev(model, dev_loader)
                if bestAcc < acc:
                    bestAcc = acc
                    # Saves the WHOLE model object (pickle), not a state_dict.
                    path = 'bert_model.pkl'
                    torch.save(model, path)
                print("DEV Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,bestAcc{:.6f}%,dev_acc{:.6f} %,loss:{:.6f}".format(epoch + 1, total_epochs, step + 1, len(train_loader),train_acc*100,bestAcc*100,acc*100,loss.item()))
        # One scheduler step per epoch, keyed on the best dev accuracy so far.
        scheduler.step(bestAcc)
|
|
| |
|
|
# NOTE(review): this path is assigned but never used in this file — train()
# saves to './bert_model.pkl'; presumably this was meant for a later
# torch.load of a pre-trained checkpoint. Verify before removing.
path = '/kaggle/input/inference/bert_model.pkl'

# Build a fresh classifier (downloads/initializes pretrained BERT weights)
# and fine-tune it on the loaders built above.
model = BertClassificationModel()
train(model,train_loader,dev_loader)
|
|