# TruthLens: src/models/meta_classifier.py
import os
import sys
import json
import logging
import joblib
import torch
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, roc_auc_score
from torch.utils.data import TensorDataset, DataLoader
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
from src.models.lstm_model import BiLSTMClassifier, pad_sequences
from src.stage2_preprocessing import KerasStyleTokenizer
# The LSTM tokenizer below was pickled from a script where KerasStyleTokenizer
# lived in __main__; registering the class there lets pickle resolve it on load.
setattr(sys.modules['__main__'], 'KerasStyleTokenizer', KerasStyleTokenizer)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(name)s | %(levelname)s | %(message)s")
logger = logging.getLogger("meta_classifier")
def build_meta_features(df, lr_proba, lstm_proba, distil_proba, roberta_proba, is_train=True, preprocessor=None):
"""
Construct the meta-feature matrix.
If is_train is True, preprocessor is fit on the categorical columns.
"""
df_meta = pd.DataFrame({
"lr_proba": lr_proba,
"lstm_proba": lstm_proba,
"distilbert_proba": distil_proba,
"roberta_proba": roberta_proba,
"word_count": df["word_count"],
"has_date": df["has_date"].astype(int),
"freshness_score": df["freshness_score"]
})
# Categoricals to encode
cats = df[["text_length_bucket", "source_domain"]].fillna("unknown")
if is_train:
preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
cat_features = preprocessor.fit_transform(cats)
else:
cat_features = preprocessor.transform(cats)
X_meta = np.hstack((df_meta.values, cat_features))
return X_meta, preprocessor
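# A minimal usage sketch (hypothetical probability arrays; the real calls
# appear in train_meta_classifier below):
#   X_tr, enc = build_meta_features(train_df, lr_p, lstm_p, d_p, r_p, is_train=True)
#   X_va, _ = build_meta_features(val_df, lr_p, lstm_p, d_p, r_p,
#                                 is_train=False, preprocessor=enc)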
def train_meta_classifier(cfg, splits_dir, models_dir):
save_dir = os.path.join(models_dir, "meta_classifier")
os.makedirs(save_dir, exist_ok=True)
logger.info("Loading dataset splits...")
train_df = pd.read_csv(os.path.join(splits_dir, "df_train.csv"))
val_df = pd.read_csv(os.path.join(splits_dir, "df_val.csv"))
y_train = train_df["binary_label"].values
y_val = val_df["binary_label"].values
# ── 1. Load OOF predictions for Train Set ──
logger.info("Gathering base model OOF predictions...")
try:
lr_oof = np.load(os.path.join(models_dir, "logistic_model", "lr_oof.npy"))
lstm_oof = np.load(os.path.join(models_dir, "lstm_model", "lstm_oof.npy"))
distil_oof = np.load(os.path.join(models_dir, "distilbert_model", "distilbert_oof.npy"))
roberta_oof = np.load(os.path.join(models_dir, "roberta_model", "roberta_oof.npy"))
except FileNotFoundError as e:
logger.error(f"Missing OOF file: {e}. Please ensure all base models have trained completely.")
return
    # Scale RoBERTa's OOF probabilities by a fixed 0.92 factor, pulling them
    # toward 0 and dampening the weight of its positive votes in the meta-features.
    roberta_oof = roberta_oof * 0.92
X_meta_train, meta_preprocessor = build_meta_features(
train_df, lr_oof, lstm_oof, distil_oof, roberta_oof, is_train=True
)
# ── 2. Dynamically Generate Val predictions ──
    # The meta-classifier needs validation meta-features for early stopping,
    # so we generate the base-model predictions for the val set here.
logger.info("Generating base model predictions for Validation set...")
# Logistic
lr_pipeline = joblib.load(os.path.join(models_dir, "logistic_model", "logistic_model.pkl"))
lr_val = lr_pipeline.predict_proba(val_df)[:, 1]
# LSTM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import pickle
with open(os.path.join(models_dir, "tokenizer.pkl"), "rb") as f:
tok = pickle.load(f)
glove_path = os.path.join(_PROJECT_ROOT, cfg["paths"]["glove_path"])
from src.models.lstm_model import load_glove_embeddings
emb_matrix, vocab_size = load_glove_embeddings(glove_path, tok.word_index)
maxlen = cfg.get("preprocessing", {}).get("lstm_max_len", 512)
X_val_seq = tok.texts_to_sequences(val_df["clean_text"].fillna(""))
X_val_pad = pad_sequences(X_val_seq, maxlen=maxlen, padding='post')
lstm_model = BiLSTMClassifier(vocab_size, emb_matrix).to(device)
lstm_model.load_state_dict(torch.load(os.path.join(models_dir, "lstm_model", "model.pt"), map_location=device))
lstm_model.eval()
val_loader = DataLoader(TensorDataset(torch.from_numpy(X_val_pad).long()), batch_size=64, shuffle=False)
lstm_val_preds = []
with torch.no_grad():
for x_b in val_loader:
logits = lstm_model(x_b[0].to(device))
lstm_val_preds.extend(torch.sigmoid(logits).cpu().numpy())
    # Flatten in case the model returns shape (batch, 1) rather than (batch,)
    lstm_val = np.asarray(lstm_val_preds).ravel()
# DistilBERT
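    # Note: texts are scored one at a time for simplicity; batching the
    # tokenizer calls would be noticeably faster on GPU.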
d_tok = AutoTokenizer.from_pretrained(os.path.join(models_dir, "distilbert_model"))
d_mod = AutoModelForSequenceClassification.from_pretrained(os.path.join(models_dir, "distilbert_model")).to(device)
d_mod.eval()
distil_val = []
with torch.no_grad():
for text in val_df["clean_text"].fillna(""):
inputs = d_tok(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
out = d_mod(**inputs)
distil_val.append(torch.softmax(out.logits, dim=-1)[0, 1].item())
distil_val = np.array(distil_val)
# RoBERTa
r_tok = AutoTokenizer.from_pretrained(os.path.join(models_dir, "roberta_model"))
r_mod = AutoModelForSequenceClassification.from_pretrained(os.path.join(models_dir, "roberta_model")).to(device)
r_mod.eval()
roberta_val = []
with torch.no_grad():
for text in val_df["clean_text"].fillna(""):
inputs = r_tok(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
out = r_mod(**inputs)
roberta_val.append(torch.softmax(out.logits, dim=-1)[0, 1].item())
    # Apply the same 0.92 dampening used on the RoBERTa OOF predictions so the
    # train and validation meta-features stay consistent.
    roberta_val = np.array(roberta_val) * 0.92
X_meta_val, _ = build_meta_features(
val_df, lr_val, lstm_val, distil_val, roberta_val, is_train=False, preprocessor=meta_preprocessor
)
# ── 3. Train Meta-Classifier (XGBoost) ──
logger.info("Training XGBoost meta-classifier...")
xgb = XGBClassifier(
n_estimators=500,
learning_rate=0.05,
max_depth=5,
eval_metric='logloss',
early_stopping_rounds=20,
random_state=42
)
xgb.fit(
X_meta_train, y_train,
eval_set=[(X_meta_val, y_val)],
verbose=False
)
logger.info(f"XGBoost best iteration: {xgb.best_iteration}")
# ── 4. Calibrate Probabilities ──
logger.info("Calibrating final probabilities via CalibratedClassifierCV on Val set...")
    # cv='prefit' tells sklearn that xgb is already trained, so fit() learns
    # only the sigmoid calibration mapping, here on the validation set.
calibrated_meta = CalibratedClassifierCV(estimator=xgb, method='sigmoid', cv='prefit')
calibrated_meta.fit(X_meta_val, y_val)
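    # Sigmoid (Platt) calibration fits p = 1 / (1 + exp(A * s + B)) to the raw
    # scores s, learning A and B from the held-out validation labels.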
# Final Val Score Check
final_val_probas = calibrated_meta.predict_proba(X_meta_val)[:, 1]
    # For short texts, dampen confidence toward 0.5 (more uncertain)
    # rather than making a confident wrong prediction.
    short_mask = (val_df["word_count"] < 50).values
    final_val_probas[short_mask] = 0.5 + (final_val_probas[short_mask] - 0.5) * 0.6
    final_val_preds = (final_val_probas >= 0.55).astype(int)
logger.info("Final Meta-Classifier Classification Report:\n" + classification_report(y_val, final_val_preds))
roc_auc = roc_auc_score(y_val, final_val_probas)
logger.info(f"ROC-AUC: {roc_auc:.4f}")
from src.models.logistic_model import plot_and_save_cm
plot_and_save_cm(
y_val,
final_val_preds,
os.path.join(save_dir, "cm.png"),
title="XGBoost Meta-Classifier Confusion Matrix"
)
bucket_acc = {}
for b in ["short", "medium", "long"]:
b_mask = (val_df["text_length_bucket"] == b).values
if b_mask.sum() > 0:
acc = (final_val_preds[b_mask] == y_val[b_mask]).mean()
bucket_acc[b] = acc
metrics = {
"roc_auc": float(roc_auc),
"bucket_accuracy": {k: float(v) for k, v in bucket_acc.items()}
}
with open(os.path.join(save_dir, "metrics.json"), "w") as f:
json.dump(metrics, f, indent=2)
# Save Model Bundle (Pre-processor + Calibrated XGBoost)
bundle = {
"preprocessor": meta_preprocessor,
"model": calibrated_meta
}
joblib.dump(bundle, os.path.join(save_dir, "meta_classifier.pkl"))
logger.info("Saved Meta-Classifier bundle.")
if __name__ == "__main__":
import yaml
cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
with open(cfg_path, "r", encoding="utf-8") as file:
config = yaml.safe_load(file)
train_meta_classifier(
config,
os.path.join(_PROJECT_ROOT, config["paths"]["splits_dir"]),
os.path.join(_PROJECT_ROOT, config["paths"]["models_dir"])
)