import json
import logging
import os
import pickle
import sys

import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from xgboost import XGBClassifier

_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

from src.models.lstm_model import BiLSTMClassifier, load_glove_embeddings, pad_sequences
from src.stage2_preprocessing import KerasStyleTokenizer

# The tokenizer was pickled from a __main__ context, so unpickling below needs
# the class to be resolvable under that module name.
setattr(sys.modules["__main__"], "KerasStyleTokenizer", KerasStyleTokenizer)

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(name)s | %(levelname)s | %(message)s")
logger = logging.getLogger("meta_classifier")


def build_meta_features(df, lr_proba, lstm_proba, distil_proba, roberta_proba,
                        is_train=True, preprocessor=None):
    """
    Construct the meta-feature matrix: the four base-model probabilities,
    three numeric features, and one-hot encoded categoricals.

    If is_train is True, the preprocessor is fit on the categorical columns;
    otherwise the fitted preprocessor passed in is reused.
    """
    df_meta = pd.DataFrame({
        "lr_proba": lr_proba,
        "lstm_proba": lstm_proba,
        "distilbert_proba": distil_proba,
        "roberta_proba": roberta_proba,
        "word_count": df["word_count"],
        "has_date": df["has_date"].astype(int),
        "freshness_score": df["freshness_score"],
    })

    # Categoricals to encode
    cats = df[["text_length_bucket", "source_domain"]].fillna("unknown")
    if is_train:
        preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        cat_features = preprocessor.fit_transform(cats)
    else:
        cat_features = preprocessor.transform(cats)

    X_meta = np.hstack((df_meta.values, cat_features))
    return X_meta, preprocessor


def train_meta_classifier(cfg, splits_dir, models_dir):
    save_dir = os.path.join(models_dir, "meta_classifier")
    os.makedirs(save_dir, exist_ok=True)

    logger.info("Loading dataset splits...")
    train_df = pd.read_csv(os.path.join(splits_dir, "df_train.csv"))
    val_df = pd.read_csv(os.path.join(splits_dir, "df_val.csv"))
    y_train = train_df["binary_label"].values
    y_val = val_df["binary_label"].values

    # ── 1. Load OOF predictions for train set ──
    logger.info("Gathering base model OOF predictions...")
    try:
        lr_oof = np.load(os.path.join(models_dir, "logistic_model", "lr_oof.npy"))
        lstm_oof = np.load(os.path.join(models_dir, "lstm_model", "lstm_oof.npy"))
        distil_oof = np.load(os.path.join(models_dir, "distilbert_model", "distilbert_oof.npy"))
        roberta_oof = np.load(os.path.join(models_dir, "roberta_model", "roberta_oof.npy"))
    except FileNotFoundError as e:
        logger.error(f"Missing OOF file: {e}. Please ensure all base models have trained completely.")
        return

    # Dampen RoBERTa's probabilities toward 0; the same factor is applied to
    # its validation predictions below so train/val features stay consistent.
    roberta_oof = roberta_oof * 0.92

    X_meta_train, meta_preprocessor = build_meta_features(
        train_df, lr_oof, lstm_oof, distil_oof, roberta_oof, is_train=True
    )

    # ── 2. Dynamically generate validation predictions ──
    # The val set is needed for early stopping, so we predict it here.
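    # OOF files exist only for the training split: each base model's
    # out-of-fold predictions keep its own training signal from leaking into
    # the meta-features. The val split has no OOF equivalent, so each trained
    # base model is simply run forward on it.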
logger.info("Generating base model predictions for Validation set...") # Logistic lr_pipeline = joblib.load(os.path.join(models_dir, "logistic_model", "logistic_model.pkl")) lr_val = lr_pipeline.predict_proba(val_df)[:, 1] # LSTM device = torch.device("cuda" if torch.cuda.is_available() else "cpu") import pickle with open(os.path.join(models_dir, "tokenizer.pkl"), "rb") as f: tok = pickle.load(f) glove_path = os.path.join(_PROJECT_ROOT, cfg["paths"]["glove_path"]) from src.models.lstm_model import load_glove_embeddings emb_matrix, vocab_size = load_glove_embeddings(glove_path, tok.word_index) maxlen = cfg.get("preprocessing", {}).get("lstm_max_len", 512) X_val_seq = tok.texts_to_sequences(val_df["clean_text"].fillna("")) X_val_pad = pad_sequences(X_val_seq, maxlen=maxlen, padding='post') lstm_model = BiLSTMClassifier(vocab_size, emb_matrix).to(device) lstm_model.load_state_dict(torch.load(os.path.join(models_dir, "lstm_model", "model.pt"), map_location=device)) lstm_model.eval() val_loader = DataLoader(TensorDataset(torch.from_numpy(X_val_pad).long()), batch_size=64, shuffle=False) lstm_val_preds = [] with torch.no_grad(): for x_b in val_loader: logits = lstm_model(x_b[0].to(device)) lstm_val_preds.extend(torch.sigmoid(logits).cpu().numpy()) lstm_val = np.array(lstm_val_preds) # DistilBERT d_tok = AutoTokenizer.from_pretrained(os.path.join(models_dir, "distilbert_model")) d_mod = AutoModelForSequenceClassification.from_pretrained(os.path.join(models_dir, "distilbert_model")).to(device) d_mod.eval() distil_val = [] with torch.no_grad(): for text in val_df["clean_text"].fillna(""): inputs = d_tok(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device) out = d_mod(**inputs) distil_val.append(torch.softmax(out.logits, dim=-1)[0, 1].item()) distil_val = np.array(distil_val) # RoBERTa r_tok = AutoTokenizer.from_pretrained(os.path.join(models_dir, "roberta_model")) r_mod = AutoModelForSequenceClassification.from_pretrained(os.path.join(models_dir, "roberta_model")).to(device) r_mod.eval() roberta_val = [] with torch.no_grad(): for text in val_df["clean_text"].fillna(""): inputs = r_tok(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device) out = r_mod(**inputs) roberta_val.append(torch.softmax(out.logits, dim=-1)[0, 1].item()) roberta_val = np.array(roberta_val) * 0.92 X_meta_val, _ = build_meta_features( val_df, lr_val, lstm_val, distil_val, roberta_val, is_train=False, preprocessor=meta_preprocessor ) # ── 3. Train Meta-Classifier (XGBoost) ── logger.info("Training XGBoost meta-classifier...") xgb = XGBClassifier( n_estimators=500, learning_rate=0.05, max_depth=5, eval_metric='logloss', early_stopping_rounds=20, random_state=42 ) xgb.fit( X_meta_train, y_train, eval_set=[(X_meta_val, y_val)], verbose=False ) logger.info(f"XGBoost best iteration: {xgb.best_iteration}") # ── 4. 
    # ── 4. Calibrate probabilities ──
    logger.info("Calibrating final probabilities via CalibratedClassifierCV on val set...")
    # cv='prefit' leaves the already-fitted xgb untouched and fits only the
    # sigmoid calibration mapping, here on X_meta_val.
    calibrated_meta = CalibratedClassifierCV(estimator=xgb, method="sigmoid", cv="prefit")
    calibrated_meta.fit(X_meta_val, y_val)

    # Final val score check
    final_val_probas = calibrated_meta.predict_proba(X_meta_val)[:, 1]

    # For short texts, dampen confidence toward 0.5 (more uncertain)
    # rather than making a confident wrong prediction.
    short_mask = (val_df["word_count"] < 50).values
    final_val_probas[short_mask] = 0.5 + (final_val_probas[short_mask] - 0.5) * 0.6

    final_val_preds = (final_val_probas >= 0.55).astype(int)
    logger.info("Final meta-classifier classification report:\n" + classification_report(y_val, final_val_preds))
    roc_auc = roc_auc_score(y_val, final_val_probas)
    logger.info(f"ROC-AUC: {roc_auc:.4f}")

    from src.models.logistic_model import plot_and_save_cm
    plot_and_save_cm(
        y_val, final_val_preds,
        os.path.join(save_dir, "cm.png"),
        title="XGBoost Meta-Classifier Confusion Matrix",
    )

    # Per-bucket accuracy on the validation set
    bucket_acc = {}
    for b in ["short", "medium", "long"]:
        b_mask = (val_df["text_length_bucket"] == b).values
        if b_mask.sum() > 0:
            acc = (final_val_preds[b_mask] == y_val[b_mask]).mean()
            bucket_acc[b] = acc

    metrics = {
        "roc_auc": float(roc_auc),
        "bucket_accuracy": {k: float(v) for k, v in bucket_acc.items()},
    }
    with open(os.path.join(save_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    # Save model bundle (preprocessor + calibrated XGBoost)
    bundle = {"preprocessor": meta_preprocessor, "model": calibrated_meta}
    joblib.dump(bundle, os.path.join(save_dir, "meta_classifier.pkl"))
    logger.info("Saved meta-classifier bundle.")


if __name__ == "__main__":
    import yaml

    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    with open(cfg_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    train_meta_classifier(
        config,
        os.path.join(_PROJECT_ROOT, config["paths"]["splits_dir"]),
        os.path.join(_PROJECT_ROOT, config["paths"]["models_dir"]),
    )
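

# ---------------------------------------------------------------------------
# Hypothetical inference helper (a minimal sketch, not part of the training
# pipeline above). It assumes the caller has already run the four base models
# on new data and holds their positive-class probabilities as 1-D arrays
# aligned with the rows of `df`; the name and signature are illustrative only.
# ---------------------------------------------------------------------------
def predict_meta_proba(df, lr_proba, lstm_proba, distil_proba, roberta_proba, bundle_path):
    """Return calibrated meta-classifier probabilities for the rows of df."""
    bundle = joblib.load(bundle_path)
    # Mirror the training-time RoBERTa dampening so the features match.
    X_meta, _ = build_meta_features(
        df, lr_proba, lstm_proba, distil_proba, np.asarray(roberta_proba) * 0.92,
        is_train=False, preprocessor=bundle["preprocessor"]
    )
    return bundle["model"].predict_proba(X_meta)[:, 1]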