| import pandas as pd |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import accuracy_score, classification_report |
| import torch |
| from torch.utils.data import Dataset, DataLoader |
| from transformers import BertTokenizer, BertForSequenceClassification, AdamW |
| from transformers import get_scheduler |
| |
| from datasets import load_dataset |
|
|
# Dataset location and model checkpoint. Both are empty strings here --
# presumably placeholders to be filled in before running; confirm with the caller.
data_path = ""
model_path = ""
data_files = {"train": "train_data.csv", "validation": "val_data.csv", "test": "test_data.csv"}

# Load every split with a single call instead of one call per split: without
# `split=`, load_dataset returns a mapping keyed by the names in `data_files`.
dataset_splits = load_dataset(data_path, data_files=data_files)
dataset_train = dataset_splits["train"]
dataset_val = dataset_splits["validation"]
dataset_test = dataset_splits["test"]

# Only the training loader shuffles; evaluation keeps the dataset order.
train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=16)
|
|
class CustomModel:
    """BERT sequence classifier bundled with its tokenizer, optimizer and train/eval loops."""

    def __init__(self, model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4, max_len=128,
                 *, text_column="title", label_column="labels"):
        """
        Load the tokenizer and model, create the optimizer and pick the device.

        The learning-rate scheduler is not created here because it needs the
        number of training steps; it is built by `setup_scheduler`, which
        `train` calls automatically when no scheduler has been configured.

        Args:
            model_name (str): Name or path of the pretrained BERT model.
            num_labels (int): Number of labels for the classification task.
            lr (float): Learning rate for the optimizer.
            epochs (int): Number of epochs for training.
            max_len (int): Maximum token length for sequences.
            text_column (str): Batch key holding the raw text inputs.
            label_column (str): Batch key holding the integer labels.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.epochs = epochs
        self.max_len = max_len
        self.text_column = text_column
        self.label_column = label_column

        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        self.optimizer = AdamW(self.model.parameters(), lr=lr)

        # Created lazily: see setup_scheduler() / train().
        self.scheduler = None

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def setup_scheduler(self, train_loader):
        """
        Setup a linear learning rate scheduler sized to the training data.

        The total number of steps is `len(train_loader) * self.epochs`, with
        no warmup steps.

        Args:
            train_loader (DataLoader): Training data loader.
        """
        num_training_steps = len(train_loader) * self.epochs
        self.scheduler = get_scheduler(
            "linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

    def tokenize_batch(self, texts):
        """
        Tokenize a batch of text inputs.

        Sequences are padded to the longest one in the batch and truncated to
        `self.max_len` tokens.

        Args:
            texts (list[str]): List of text strings to tokenize.

        Returns:
            dict: Tokenizer output as PyTorch tensors.
        """
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

    def _prepare_batch(self, batch):
        """
        Tokenize the text column of a batch and move everything to the device.

        Args:
            batch (dict): Batch with `self.text_column` and `self.label_column` keys.

        Returns:
            Tuple: Dict of model inputs on `self.device`, and the labels on `self.device`.
        """
        texts = batch[self.text_column]
        labels = batch[self.label_column].to(self.device)
        encoded = self.tokenize_batch(texts)
        inputs = {key: val.to(self.device) for key, val in encoded.items()}
        return inputs, labels

    def train(self, train_loader):
        """
        Train the model with raw text inputs and labels.

        If no scheduler has been configured, one is created with
        `setup_scheduler(train_loader)` so that training does not fail on the
        first scheduler step. The mean loss of each epoch is printed; for an
        empty loader it is reported as `nan` instead of dividing by zero.

        Args:
            train_loader (DataLoader): Training data loader containing text and labels.
        """
        num_batches = len(train_loader)
        # Skip scheduler creation for an empty loader: there is nothing to step.
        if self.scheduler is None and num_batches > 0:
            self.setup_scheduler(train_loader)

        self.model.train()
        for epoch in range(self.epochs):
            epoch_loss = 0.0
            for batch in train_loader:
                inputs, labels = self._prepare_batch(batch)
                # Passing labels makes the model return the loss alongside the logits.
                inputs['labels'] = labels

                outputs = self.model(**inputs)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()
                epoch_loss += loss.item()
            mean_loss = epoch_loss / num_batches if num_batches else float("nan")
            print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {mean_loss:.4f}")

    def evaluate(self, test_loader):
        """
        Evaluate the model with raw text inputs and labels.

        Runs the model in eval mode without gradient tracking and takes the
        argmax over the logits as the predicted class.

        Args:
            test_loader (DataLoader): Test data loader containing text and labels.

        Returns:
            Tuple: True labels and predicted labels, both as Python lists.
        """
        self.model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for batch in test_loader:
                inputs, labels = self._prepare_batch(batch)

                outputs = self.model(**inputs)
                predictions = torch.argmax(outputs.logits, dim=-1)
                y_true.extend(labels.tolist())
                y_pred.extend(predictions.tolist())
        return y_true, y_pred

    def save_model(self, save_path):
        """
        Save the model and tokenizer locally in Hugging Face format.

        Args:
            save_path (str): Path to save the model.
        """
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

    def push_model(self, repo_name):
        """
        Push the model and tokenizer to the Hugging Face Hub.

        Args:
            repo_name (str): Repository name on Hugging Face Hub.
        """
        self.model.push_to_hub(repo_name)
        self.tokenizer.push_to_hub(repo_name)
|
|
# Build the classifier from the configured checkpoint and score it on the
# held-out test loader (no training is run here).
custom_model = CustomModel(model_name=model_path, num_labels=2, lr=5e-5, epochs=4)
y_true, y_pred = custom_model.evaluate(test_loader)

# Overall accuracy first, then the per-class breakdown.
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.4f}")
report = classification_report(y_true, y_pred)
print("Classification Report:\n", report)
|
|
|
|