import torch
import numpy as np
import pandas as pd
import random
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score

# Select GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def set_seed(seed_value=30):
    """Set seeds across libraries for reproducibility."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Deterministic cuDNN kernels trade speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(30)

# Load the raw policy dataset.
data_path = 'final_dataset.csv'
data = pd.read_csv(data_path)

# Load a pretrained t5-small for the initial summarization pass.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.to(device)
model.eval()

def generate_summaries(texts, model, tokenizer, device, max_length=150):
    """Generate a beam-search summary for each text, one document at a time."""
    summaries = []
    for text in texts:
        # T5 expects a task prefix; inputs beyond 512 tokens are truncated.
        encoded_text = tokenizer.encode(
            "summarize: " + text, return_tensors='pt', max_length=512, truncation=True
        ).to(device)
        with torch.no_grad():  # inference only, so skip gradient tracking
            summary_ids = model.generate(
                encoded_text, max_length=max_length, num_beams=4, early_stopping=True
            )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
    return summaries
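
# The loop above runs one forward pass per document. A batched variant (a
# sketch for comparison; `generate_summaries_batched` is a name introduced
# here, not part of the original pipeline) pads several texts together and
# calls generate once per batch, which is usually much faster on a GPU.
def generate_summaries_batched(texts, model, tokenizer, device, max_length=150, batch_size=8):
    summaries = []
    texts = list(texts)
    for start in range(0, len(texts), batch_size):
        chunk = ["summarize: " + t for t in texts[start:start + batch_size]]
        enc = tokenizer(chunk, return_tensors='pt', max_length=512,
                        truncation=True, padding=True).to(device)
        with torch.no_grad():
            out = model.generate(input_ids=enc['input_ids'],
                                 attention_mask=enc['attention_mask'],
                                 max_length=max_length, num_beams=4, early_stopping=True)
        summaries.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    return summaries
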

# Summarize the corpus in chunks of 10 documents to keep memory use modest.
chunk_size = 10
num_chunks = len(data) // chunk_size + (1 if len(data) % chunk_size != 0 else 0)

all_summaries = []
for i in range(num_chunks):
    batch = data['Content'][i * chunk_size:(i + 1) * chunk_size]
    batch_summaries = generate_summaries(batch, model, tokenizer, device)
    all_summaries.extend(batch_summaries)

# Attach the generated summaries and persist them for the fine-tuning stage.
data['Summary'] = all_summaries

output_path = '/content/summarized_data.csv'
data.to_csv(output_path, index=False)
print(f"Data with summaries saved to {output_path}")

class PolicyDataset(Dataset):
    """Pairs each policy text with its generated summary for seq2seq fine-tuning."""

    def __init__(self, data, tokenizer, max_input_length=512, max_target_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        policy_text = self.data.iloc[idx]['Content']
        summary_text = self.data.iloc[idx]['Summary']

        input_encoding = self.tokenizer.encode_plus(
            policy_text,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target_encoding = self.tokenizer.encode_plus(
            summary_text,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Replace padding token ids in the labels with -100 so that the loss
        # and the evaluation filter below both ignore padded positions.
        labels = target_encoding['input_ids'].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': labels,
            'labels_mask': target_encoding['attention_mask'].squeeze()
        }
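
# A minimal illustration (hedged, standalone) of why the labels use -100:
# nn.CrossEntropyLoss defaults to ignore_index=-100, so masked positions
# contribute nothing to the loss. The tensors here are made up for the demo.
_demo_logits = torch.randn(3, 5)            # 3 token positions, 5-way "vocab"
_demo_labels = torch.tensor([2, -100, 4])   # middle position is padding
print(nn.CrossEntropyLoss()(_demo_logits, _demo_labels))  # averaged over 2 positions
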

# Reload the summarized data and a fresh t5-small for fine-tuning.
data = pd.read_csv(output_path)
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

# Hold out 10% of the rows for evaluation.
train_data, eval_data = train_test_split(data, test_size=0.1, random_state=42)
train_dataset = PolicyDataset(train_data, tokenizer)
eval_dataset = PolicyDataset(eval_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)
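
# Quick shape check (an optional sanity pass, not required for training):
# each batch should carry [batch, 512] inputs and [batch, 128] labels.
sample_batch = next(iter(eval_loader))
print({k: tuple(v.shape) for k, v in sample_batch.items()})
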
def train(model, train_loader, optimizer, criterion, device):
    """Run one epoch of teacher-forced training and return the mean batch loss."""
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes T5 build the shifted decoder inputs internally.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits

        # Equivalent to outputs.loss: cross-entropy over the vocabulary,
        # skipping the -100 padding positions.
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

def evaluate(model, eval_loader, criterion, device):
    """Return mean eval loss and a token-level macro F1 over non-padding positions."""
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
            total_loss += loss.item()

            # Greedy per-position predictions; drop the -100 padding labels.
            predictions = torch.argmax(logits, dim=-1).flatten().cpu().numpy()
            labels_flat = labels.flatten().cpu().numpy()
            valid_indices = labels_flat != -100
            all_predictions.extend(predictions[valid_indices])
            all_labels.extend(labels_flat[valid_indices])

    # Token-level macro F1 is only a coarse proxy for summary quality.
    f1 = f1_score(all_labels, all_predictions, average='macro')
    return total_loss / len(eval_loader), f1

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
# Default ignore_index=-100 matches the label masking done in PolicyDataset.
criterion = nn.CrossEntropyLoss()

# Baseline fine-tuning run.
for epoch in range(5):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    eval_loss, eval_f1 = evaluate(model, eval_loader, criterion, device)
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Eval Loss = {eval_loss:.4f}, Eval F1 = {eval_f1:.4f}")

def run_training(lr, batch_size, number_of_epochs=5):
    """Fine-tune a fresh t5-small with the given learning rate and batch size."""
    model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(number_of_epochs):
        train_loss = train(model, train_loader, optimizer, criterion, device)
        eval_loss, eval_f1 = evaluate(model, eval_loader, criterion, device)
        print(f"LR: {lr}, Batch size: {batch_size}, Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}, Eval F1: {eval_f1:.4f}")

# Grid search over learning rate and batch size; each combination trains
# a fresh model for five epochs.
learning_rates = [1e-5, 3e-5, 5e-5]
batch_sizes = [16, 32, 64]

for lr in learning_rates:
    for batch_size in batch_sizes:
        run_training(lr, batch_size, number_of_epochs=5)