import logging
import os

import torch
from torch.cuda.amp import GradScaler, autocast
from transformers import BertTokenizer, AdamW

from src.data.data_loader import load_toxic_data, create_data_loaders
from src.models.toxic_classifier import ToxicClassifier
from src.models.trainer import ModelTrainer
|
|
|
|
|
# Module-level logger used by everything below. The root logger is configured
# at INFO level as an import-time side effect of this module.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
|
|
|
def train_model(
    data_path: str,
    model_save_path: str,
    num_epochs: int = 5,
    batch_size: int = 64,
    learning_rate: float = 2e-5,
    max_grad_norm: float = 1.0
):
    """Train a ``ToxicClassifier`` and checkpoint the best-validating epoch.

    The function loads the dataset via ``load_toxic_data`` /
    ``create_data_loaders``, trains for ``num_epochs`` epochs with
    ``ModelTrainer``, and writes ``best_model.pt`` into ``model_save_path``
    every time the validation loss improves on the best value seen so far.

    Args:
        data_path: Path handed to ``load_toxic_data``.
        model_save_path: Directory that receives ``best_model.pt``. It is
            created (including parents) if it does not exist yet.
        num_epochs: Number of train/evaluate rounds to run.
        batch_size: Batch size forwarded to ``create_data_loaders``.
        learning_rate: Learning rate for the AdamW optimizer.
        max_grad_norm: Accepted for interface compatibility. NOTE(review):
            this value is not forwarded anywhere in this function, so it has
            no effect here; confirm whether ``ModelTrainer`` should take it.

    Raises:
        OSError: If ``model_save_path`` cannot be created.
    """
    # Create the checkpoint directory before any expensive work so that a bad
    # or missing path fails immediately instead of after a full epoch, when
    # torch.save would otherwise be the first thing to touch it.
    os.makedirs(model_save_path, exist_ok=True)
    checkpoint_path = os.path.join(model_save_path, 'best_model.pt')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    use_cuda = device.type == 'cuda'
    if use_cuda:
        torch.backends.cudnn.benchmark = True
    logger.info(f"Using device: {device}")

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    logger.info("Loading dataset...")
    texts, labels = load_toxic_data(data_path)
    train_loader, val_loader = create_data_loaders(
        texts,
        labels,
        tokenizer,
        batch_size=batch_size
    )

    logger.info("Initializing model...")
    model = ToxicClassifier().to(device)

    # NOTE(review): transformers' AdamW is deprecated upstream in favour of
    # torch.optim.AdamW; it is kept here so optimizer behaviour is unchanged.
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    # Gradient scaling only applies on CUDA. An enabled scaler on a CPU-only
    # host merely warns and disables itself, so build it disabled up front.
    scaler = GradScaler(enabled=use_cuda)

    # NOTE(review): BCELoss expects probabilities, so this presumably relies on
    # ToxicClassifier ending in a sigmoid -- confirm. PyTorch also documents
    # BCELoss as unsafe under autocast; if ModelTrainer runs the forward pass
    # in autocast, BCEWithLogitsLoss on raw logits is the recommended form.
    trainer = ModelTrainer(
        model,
        optimizer,
        criterion=torch.nn.BCELoss(),
        device=device,
        scaler=scaler
    )

    logger.info("Starting training...")
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        train_metrics = trainer.train_epoch(train_loader)
        logger.info(f"Epoch {epoch+1}/{num_epochs}")
        logger.info(f"Training Loss: {train_metrics['loss']:.4f}")

        val_metrics = trainer.evaluate(val_loader)
        val_loss = val_metrics['loss']
        logger.info(f"Validation Loss: {val_loss:.4f}")

        # Keep only the checkpoint with the lowest validation loss so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                'epoch': epoch,  # zero-based epoch index
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': best_val_loss,
            }, checkpoint_path)
            logger.info("Saved best model checkpoint")

    logger.info("Training completed!")
|
|
|
if __name__ == "__main__":
    # Script entry point: train with default locations, both resolved
    # relative to the current working directory.
    DATA_PATH = os.path.join("data", "raw", "train.csv")
    MODEL_SAVE_PATH = os.path.join("models", "saved")

    # Make sure the checkpoint directory exists before training starts.
    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

    train_model(data_path=DATA_PATH, model_save_path=MODEL_SAVE_PATH)