import os
import sys
import logging

import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset

_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(name)s | %(levelname)s | %(message)s")
logger = logging.getLogger("roberta_model")


def train_roberta(cfg, splits_dir, save_dir):
    os.makedirs(save_dir, exist_ok=True)

    # 1. Load Data
    train_df = pd.read_csv(os.path.join(splits_dir, "df_train.csv"))
    val_df = pd.read_csv(os.path.join(splits_dir, "df_val.csv"))
    train_df["clean_text"] = train_df["clean_text"].fillna("")
    val_df["clean_text"] = val_df["clean_text"].fillna("")

    maxlen = cfg.get("preprocessing", {}).get("bert_max_len", 512)
    batch_size = cfg.get("training", {}).get("bert_batch_size", 16)
    epochs = cfg.get("training", {}).get("bert_epochs", 3)
    lr = float(cfg.get("training", {}).get("roberta_learning_rate", 1e-5))

    logger.info("Loading RoBERTa tokenizer...")
    model_name = "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # 2. Tokenization Helper (padding is deferred to the data collator)
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding=False, truncation=True, max_length=maxlen)

    # 3. Create OOF Proxy Split (80/20), stratified by label
    idx_train, idx_meta_val = train_test_split(
        np.arange(len(train_df)),
        test_size=0.20,
        stratify=train_df["binary_label"],
        random_state=42
    )
    subset_train_df = train_df.iloc[idx_train].copy()

    # 4. Convert to HuggingFace Datasets
    hf_sub_train = Dataset.from_pandas(pd.DataFrame({
        "text": subset_train_df["clean_text"],
        "labels": subset_train_df["binary_label"]
    }), preserve_index=False)
    hf_full_train = Dataset.from_pandas(pd.DataFrame({
        "text": train_df["clean_text"],
        "labels": train_df["binary_label"]
    }), preserve_index=False)
    hf_val = Dataset.from_pandas(pd.DataFrame({
        "text": val_df["clean_text"],
        "labels": val_df["binary_label"]
    }), preserve_index=False)

    logger.info("Tokenizing datasets...")
    hf_sub_train = hf_sub_train.map(tokenize_function, batched=True)
    hf_full_train = hf_full_train.map(tokenize_function, batched=True)
    hf_val = hf_val.map(tokenize_function, batched=True)

    # Dynamic padding per batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # 5. Initialize Model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # 6. Trainer Setup
    training_args = TrainingArguments(
        output_dir=os.path.join(save_dir, "checkpoints"),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=2,
        dataloader_num_workers=2,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=torch.cuda.is_available(),
        disable_tqdm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_sub_train,
        eval_dataset=hf_val,
        processing_class=tokenizer,
        data_collator=data_collator,
    )

    # 7. Train
    logger.info("Starting RoBERTa internal proxy training...")
    trainer.train()

    # 8. Save Model
    logger.info("Saving final fine-tuned model...")
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)

    # 9. Extract proxy OOF predictions over the entire training set.
    # Only the 20% held out via idx_meta_val is truly out-of-fold; the
    # remaining 80% was seen during training (hence "proxy").
    logger.info("Generating proxy OOF predictions on the full train set...")
    oof_preds = trainer.predict(hf_full_train)
    # probabilities for class 1 (True)
    oof_probas = torch.softmax(torch.tensor(oof_preds.predictions), dim=-1)[:, 1].numpy()
    np.save(os.path.join(save_dir, "roberta_oof.npy"), oof_probas)
    logger.info("Saved roberta_oof.npy")

    # Validation evaluation
    val_preds_out = trainer.predict(hf_val)
    val_probas = torch.softmax(torch.tensor(val_preds_out.predictions), dim=-1)[:, 1].numpy()

    from src.models.logistic_model import plot_and_save_cm
    plot_and_save_cm(
        val_df["binary_label"],
        (val_probas > 0.5).astype(int),
        os.path.join(save_dir, "cm.png"),
        title="RoBERTa Confusion Matrix"
    )

    logger.info("RoBERTa Training completed!")
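
# ====================================================================
# OPTIONAL: Accuracy/F1 metrics during evaluation
# --------------------------------------------------------------------
# The Trainer above selects checkpoints by eval_loss only. A minimal
# sketch (not wired into train_roberta) of a compute_metrics callback
# that could be passed via Trainer(compute_metrics=...) to also log
# accuracy and F1 each epoch, using sklearn metrics.
#
"""
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
    }
"""
# ====================================================================
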
# ====================================================================
# OPTIONAL: Full K-Fold OOF (GPU-intensive)
# --------------------------------------------------------------------
# A robust 5-fold implementation for generating true out-of-fold
# predictions when the compute budget is unconstrained; it scales
# naturally to GPU cluster job arrays.
#
"""
from sklearn.model_selection import StratifiedKFold

def strict_kfold_roberta(train_df, tokenize_function, data_collator, lr, batch_size, epochs, save_dir):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_probas = np.zeros(len(train_df), dtype=np.float32)

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df["binary_label"])):
        logger.info(f"Training Fold {fold+1}/5")
        df_train = train_df.iloc[train_idx].copy()
        df_val = train_df.iloc[val_idx].copy()

        ds_train = Dataset.from_pandas(pd.DataFrame({
            "text": df_train["clean_text"],
            "labels": df_train["binary_label"]
        }), preserve_index=False).map(tokenize_function, batched=True)
        ds_val = Dataset.from_pandas(pd.DataFrame({
            "text": df_val["clean_text"],
            "labels": df_val["binary_label"]
        }), preserve_index=False).map(tokenize_function, batched=True)

        model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
        training_args = TrainingArguments(
            output_dir=os.path.join(save_dir, f"fold_{fold}"),
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            num_train_epochs=epochs,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=ds_train,
            eval_dataset=ds_val,
            data_collator=data_collator,
        )
        trainer.train()

        fold_preds = trainer.predict(ds_val)
        oof_probas[val_idx] = torch.softmax(torch.tensor(fold_preds.predictions), dim=-1)[:, 1].numpy()

    np.save(os.path.join(save_dir, "roberta_oof.npy"), oof_probas)
"""
# ====================================================================


if __name__ == "__main__":
    import yaml

    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    with open(cfg_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    s_dir = os.path.join(_PROJECT_ROOT, config["paths"]["splits_dir"])
    m_dir = os.path.join(_PROJECT_ROOT, config["paths"]["models_dir"], "roberta_model")
    train_roberta(config, s_dir, m_dir)
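
# ====================================================================
# OPTIONAL: Reloading the saved model for inference
# --------------------------------------------------------------------
# A minimal usage sketch, assuming model_dir points at the artifacts
# written by train_roberta() (trainer.save_model + save_pretrained).
# The helper name, example text, and 0.5 threshold are illustrative,
# not part of the pipeline.
#
"""
def predict_proba(texts, model_dir):
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    enc = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        logits = model(**enc).logits
    # probability of class 1, matching the OOF convention above
    return torch.softmax(logits, dim=-1)[:, 1].numpy()

# probas = predict_proba(["example document"], "models/roberta_model")
# preds = (probas > 0.5).astype(int)
"""
# ====================================================================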