# TruthLens/src/models/roberta_model.py
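"""RoBERTa model for TruthLens: fine-tunes roberta-base as a binary text
classifier with the HuggingFace Trainer and exports proxy out-of-fold
probabilities (roberta_oof.npy) for downstream stacking."""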
import os
import sys
import json
import logging
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(name)s | %(levelname)s | %(message)s")
logger = logging.getLogger("roberta_model")
def train_roberta(cfg, splits_dir, save_dir):
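    """Fine-tune roberta-base as a binary classifier and save artifacts.

    Writes to save_dir: the fine-tuned model and tokenizer, proxy OOF
    probabilities over the full train split (roberta_oof.npy), and a
    validation confusion matrix (cm.png).
    """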
    os.makedirs(save_dir, exist_ok=True)

    # 1. Load Data
    train_df = pd.read_csv(os.path.join(splits_dir, "df_train.csv"))
    val_df = pd.read_csv(os.path.join(splits_dir, "df_val.csv"))
    train_df["clean_text"] = train_df["clean_text"].fillna("")
    val_df["clean_text"] = val_df["clean_text"].fillna("")

    maxlen = cfg.get("preprocessing", {}).get("bert_max_len", 512)
    batch_size = cfg.get("training", {}).get("bert_batch_size", 16)
    epochs = cfg.get("training", {}).get("bert_epochs", 3)
    lr = float(cfg.get("training", {}).get("roberta_learning_rate", 1e-5))

    logger.info("Loading RoBERTa tokenizer...")
    model_name = "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # 2. Tokenization Helper
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding=False, truncation=True, max_length=maxlen)
    # 3. Create OOF Proxy Split (80/20) safely
    idx_train, idx_meta_val = train_test_split(
        range(len(train_df)), test_size=0.20,
        stratify=train_df["binary_label"], random_state=42
    )
    subset_train_df = train_df.iloc[idx_train].copy()
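    # NOTE: This is a proxy, not a strict OOF scheme. The model trains on 80%
    # of the rows and then scores the full train set below, so only the
    # held-out 20% (idx_meta_val) is genuinely out-of-fold. For leak-free OOF
    # probabilities, use the K-fold variant at the bottom of this file.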
    # 4. Convert to HuggingFace Datasets
    hf_sub_train = Dataset.from_pandas(pd.DataFrame({
        "text": subset_train_df["clean_text"], "labels": subset_train_df["binary_label"]
    }), preserve_index=False)
    hf_full_train = Dataset.from_pandas(pd.DataFrame({
        "text": train_df["clean_text"], "labels": train_df["binary_label"]
    }), preserve_index=False)
    hf_val = Dataset.from_pandas(pd.DataFrame({
        "text": val_df["clean_text"], "labels": val_df["binary_label"]
    }), preserve_index=False)

    logger.info("Tokenizing datasets...")
    hf_sub_train = hf_sub_train.map(tokenize_function, batched=True)
    hf_full_train = hf_full_train.map(tokenize_function, batched=True)
    hf_val = hf_val.map(tokenize_function, batched=True)
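    # DataCollatorWithPadding pads each batch to its longest sequence at
    # training time (dynamic padding), which is why tokenize_function above
    # uses padding=False.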
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # 5. Initialize Model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # 6. Trainer Setup
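    # NOTE: `eval_strategy` and Trainer's `processing_class` assume a recent
    # transformers release (roughly >= 4.46); older versions use
    # `evaluation_strategy` and `tokenizer=` instead.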
    training_args = TrainingArguments(
        output_dir=os.path.join(save_dir, "checkpoints"),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=2,
        dataloader_num_workers=2,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=torch.cuda.is_available(),
        disable_tqdm=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_sub_train,
        eval_dataset=hf_val,
        processing_class=tokenizer,
        data_collator=data_collator,
    )
    # 7. Train
    logger.info("Starting RoBERTa internal proxy training...")
    trainer.train()

    # 8. Save Model
    logger.info("Saving final fine-tuned model...")
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
    # 9. Extract proxy OOF probabilities over the entire training set
    logger.info("Generating proxy OOF predictions over the full training set...")
    oof_preds = trainer.predict(hf_full_train)
    # probabilities for class 1 (True)
    oof_probas = torch.softmax(torch.tensor(oof_preds.predictions), dim=-1)[:, 1].numpy()
    np.save(os.path.join(save_dir, "roberta_oof.npy"), oof_probas)
    logger.info("Saved roberta_oof.npy")
    # Validation evaluation
    val_preds_out = trainer.predict(hf_val)
    val_probas = torch.softmax(torch.tensor(val_preds_out.predictions), dim=-1)[:, 1].numpy()
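    # Optional sanity check: scalar validation metrics at the same 0.5
    # threshold used for the confusion matrix below.
    from sklearn.metrics import accuracy_score, f1_score
    val_hard = (val_probas > 0.5).astype(int)
    logger.info("Validation accuracy=%.4f | F1=%.4f",
                accuracy_score(val_df["binary_label"], val_hard),
                f1_score(val_df["binary_label"], val_hard))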
    from src.models.logistic_model import plot_and_save_cm
    plot_and_save_cm(
        val_df["binary_label"],
        (val_probas > 0.5).astype(int),
        os.path.join(save_dir, "cm.png"),
        title="RoBERTa Confusion Matrix"
    )
    logger.info("RoBERTa Training completed!")
# ====================================================================
# OPTIONAL: Full K-Fold OOF (GPU-intensive)
# --------------------------------------------------------------------
# A robust 5-fold implementation that yields true out-of-fold predictions
# for every training row. Use it when the GPU/time budget allows; it costs
# roughly five single-model training runs.
#
"""
from sklearn.model_selection import StratifiedKFold
def strict_kfold_roberta(train_df, tokenize_function, data_collator, lr, batch_size, epochs, save_dir):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_probas = np.zeros(len(train_df), dtype=np.float32)
    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df["binary_label"])):
        logger.info(f"Training Fold {fold+1}/5")
        df_train = train_df.iloc[train_idx].copy()
        df_val = train_df.iloc[val_idx].copy()
        ds_train = Dataset.from_pandas(pd.DataFrame({
            "text": df_train["clean_text"], "labels": df_train["binary_label"]
        }), preserve_index=False).map(tokenize_function, batched=True)
        ds_val = Dataset.from_pandas(pd.DataFrame({
            "text": df_val["clean_text"], "labels": df_val["binary_label"]
        }), preserve_index=False).map(tokenize_function, batched=True)
        model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
        training_args = TrainingArguments(
            output_dir=os.path.join(save_dir, f"fold_{fold}"),
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            num_train_epochs=epochs,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=ds_train,
            eval_dataset=ds_val,
            data_collator=data_collator,
        )
        trainer.train()
        fold_preds = trainer.predict(ds_val)
        oof_probas[val_idx] = torch.softmax(torch.tensor(fold_preds.predictions), dim=-1)[:, 1].numpy()
    np.save(os.path.join(save_dir, "roberta_oof.npy"), oof_probas)
"""
# ====================================================================
if __name__ == "__main__":
    import yaml

    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    with open(cfg_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    s_dir = os.path.join(_PROJECT_ROOT, config["paths"]["splits_dir"])
    m_dir = os.path.join(_PROJECT_ROOT, config["paths"]["models_dir"], "roberta_model")
    train_roberta(config, s_dir, m_dir)