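"""Train, calibrate, and save the XGBoost meta-classifier that stacks the
logistic-regression, BiLSTM, DistilBERT, and RoBERTa base-model probabilities
with engineered text features."""
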
import json
import logging
import os
import pickle
import sys

import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset
from xgboost import XGBClassifier
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

from src.models.lstm_model import BiLSTMClassifier, load_glove_embeddings, pad_sequences
from src.models.logistic_model import plot_and_save_cm
from src.stage2_preprocessing import KerasStyleTokenizer

# Pickled tokenizers may reference KerasStyleTokenizer under __main__;
# expose the class there so pickle can resolve it on load.
setattr(sys.modules["__main__"], "KerasStyleTokenizer", KerasStyleTokenizer)

from transformers import AutoTokenizer, AutoModelForSequenceClassification

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(name)s | %(levelname)s | %(message)s")
logger = logging.getLogger("meta_classifier")

def build_meta_features(df, lr_proba, lstm_proba, distil_proba, roberta_proba, is_train=True, preprocessor=None):
    """
    Construct the meta-feature matrix: the four base-model probabilities plus
    word_count, has_date, freshness_score, and one-hot-encoded categoricals.

    If is_train is True, a OneHotEncoder is fit on the categorical columns and
    returned alongside the features; otherwise the supplied preprocessor is used.
    """
    df_meta = pd.DataFrame({
        # ravel() guards against accidentally (N, 1)-shaped probability arrays
        "lr_proba": np.asarray(lr_proba).ravel(),
        "lstm_proba": np.asarray(lstm_proba).ravel(),
        "distilbert_proba": np.asarray(distil_proba).ravel(),
        "roberta_proba": np.asarray(roberta_proba).ravel(),
        "word_count": df["word_count"].values,
        "has_date": df["has_date"].astype(int).values,
        "freshness_score": df["freshness_score"].values,
    })
    # Categoricals to encode
    cats = df[["text_length_bucket", "source_domain"]].fillna("unknown")
    if is_train:
        preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        cat_features = preprocessor.fit_transform(cats)
    else:
        cat_features = preprocessor.transform(cats)
    X_meta = np.hstack((df_meta.values, cat_features))
    return X_meta, preprocessor

def train_meta_classifier(cfg, splits_dir, models_dir):
    save_dir = os.path.join(models_dir, "meta_classifier")
    os.makedirs(save_dir, exist_ok=True)

    logger.info("Loading dataset splits...")
    train_df = pd.read_csv(os.path.join(splits_dir, "df_train.csv"))
    val_df = pd.read_csv(os.path.join(splits_dir, "df_val.csv"))
    y_train = train_df["binary_label"].values
    y_val = val_df["binary_label"].values
    # ── 1. Load out-of-fold (OOF) predictions for the train set ──
    logger.info("Gathering base model OOF predictions...")
    try:
        lr_oof = np.load(os.path.join(models_dir, "logistic_model", "lr_oof.npy"))
        lstm_oof = np.load(os.path.join(models_dir, "lstm_model", "lstm_oof.npy"))
        distil_oof = np.load(os.path.join(models_dir, "distilbert_model", "distilbert_oof.npy"))
        roberta_oof = np.load(os.path.join(models_dir, "roberta_model", "roberta_oof.npy"))
    except FileNotFoundError as e:
        logger.error(f"Missing OOF file: {e}. Please ensure all base models have been trained completely.")
        return
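    # Down-weight RoBERTa's probabilities by a fixed factor; the same damping is
    # applied to its validation-set predictions below so train/val features match.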
    roberta_oof = roberta_oof * 0.92

    X_meta_train, meta_preprocessor = build_meta_features(
        train_df, lr_oof, lstm_oof, distil_oof, roberta_oof, is_train=True
    )
    # ── 2. Generate base-model predictions for the validation set ──
    # The validation set is needed for early stopping and calibration, so the
    # base-model probabilities are computed on the fly here.
    logger.info("Generating base model predictions for validation set...")

    # Logistic regression
    lr_pipeline = joblib.load(os.path.join(models_dir, "logistic_model", "logistic_model.pkl"))
    lr_val = lr_pipeline.predict_proba(val_df)[:, 1]
    # LSTM
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with open(os.path.join(models_dir, "tokenizer.pkl"), "rb") as f:
        tok = pickle.load(f)
    glove_path = os.path.join(_PROJECT_ROOT, cfg["paths"]["glove_path"])
    emb_matrix, vocab_size = load_glove_embeddings(glove_path, tok.word_index)
    maxlen = cfg.get("preprocessing", {}).get("lstm_max_len", 512)
    X_val_seq = tok.texts_to_sequences(val_df["clean_text"].fillna(""))
    X_val_pad = pad_sequences(X_val_seq, maxlen=maxlen, padding="post")
    lstm_model = BiLSTMClassifier(vocab_size, emb_matrix).to(device)
    lstm_model.load_state_dict(torch.load(os.path.join(models_dir, "lstm_model", "model.pt"), map_location=device))
    lstm_model.eval()
    val_loader = DataLoader(TensorDataset(torch.from_numpy(X_val_pad).long()), batch_size=64, shuffle=False)
    lstm_val_preds = []
    with torch.no_grad():
        for (x_b,) in val_loader:
            logits = lstm_model(x_b.to(device))
            lstm_val_preds.extend(torch.sigmoid(logits).cpu().numpy())
    lstm_val = np.array(lstm_val_preds)
    # DistilBERT
    d_tok = AutoTokenizer.from_pretrained(os.path.join(models_dir, "distilbert_model"))
    d_mod = AutoModelForSequenceClassification.from_pretrained(os.path.join(models_dir, "distilbert_model")).to(device)
    d_mod.eval()
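    # Per-example inference keeps peak memory low; batching the tokenizer calls
    # would be faster if throughput matters.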
    distil_val = []
    with torch.no_grad():
        for text in val_df["clean_text"].fillna(""):
            inputs = d_tok(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
            out = d_mod(**inputs)
            distil_val.append(torch.softmax(out.logits, dim=-1)[0, 1].item())
    distil_val = np.array(distil_val)
    # RoBERTa
    r_tok = AutoTokenizer.from_pretrained(os.path.join(models_dir, "roberta_model"))
    r_mod = AutoModelForSequenceClassification.from_pretrained(os.path.join(models_dir, "roberta_model")).to(device)
    r_mod.eval()
    roberta_val = []
    with torch.no_grad():
        for text in val_df["clean_text"].fillna(""):
            inputs = r_tok(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
            out = r_mod(**inputs)
            roberta_val.append(torch.softmax(out.logits, dim=-1)[0, 1].item())
    roberta_val = np.array(roberta_val) * 0.92  # same damping as the OOF features
    X_meta_val, _ = build_meta_features(
        val_df, lr_val, lstm_val, distil_val, roberta_val, is_train=False, preprocessor=meta_preprocessor
    )
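    # The training meta-features come from out-of-fold predictions, so the
    # meta-learner never sees probabilities from a base model that was fit on
    # the same rows it is predicting.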
    # ── 3. Train the XGBoost meta-classifier ──
    logger.info("Training XGBoost meta-classifier...")
    xgb = XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=5,
        eval_metric="logloss",
        early_stopping_rounds=20,
        random_state=42,
    )
    xgb.fit(
        X_meta_train, y_train,
        eval_set=[(X_meta_val, y_val)],
        verbose=False,
    )
    logger.info(f"XGBoost best iteration: {xgb.best_iteration}")
    # ── 4. Calibrate probabilities ──
    logger.info("Calibrating final probabilities via CalibratedClassifierCV on the validation set...")
    # cv='prefit' means the XGBoost model is already fitted; only the sigmoid
    # calibration map is learned, using the validation set.
    calibrated_meta = CalibratedClassifierCV(estimator=xgb, method="sigmoid", cv="prefit")
    calibrated_meta.fit(X_meta_val, y_val)
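    # Caveat: the report below is computed on the same split used for early
    # stopping and calibration, so these numbers may be optimistic.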
    # Final validation score check
    final_val_probas = calibrated_meta.predict_proba(X_meta_val)[:, 1]

    # For short texts, dampen confidence toward 0.5 (more uncertain) rather
    # than risk a confident wrong prediction.
    short_mask = (val_df["word_count"] < 50).values
    final_val_probas[short_mask] = 0.5 + (final_val_probas[short_mask] - 0.5) * 0.6

    final_val_preds = (final_val_probas >= 0.55).astype(int)
    logger.info("Final Meta-Classifier Classification Report:\n" + classification_report(y_val, final_val_preds))
    roc_auc = roc_auc_score(y_val, final_val_probas)
    logger.info(f"ROC-AUC: {roc_auc:.4f}")
    plot_and_save_cm(
        y_val,
        final_val_preds,
        os.path.join(save_dir, "cm.png"),
        title="XGBoost Meta-Classifier Confusion Matrix",
    )
    # Accuracy per text-length bucket
    bucket_acc = {}
    for b in ["short", "medium", "long"]:
        b_mask = (val_df["text_length_bucket"] == b).values
        if b_mask.sum() > 0:
            bucket_acc[b] = (final_val_preds[b_mask] == y_val[b_mask]).mean()

    metrics = {
        "roc_auc": float(roc_auc),
        "bucket_accuracy": {k: float(v) for k, v in bucket_acc.items()},
    }
    with open(os.path.join(save_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)
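    # NOTE: the short-text damping and the 0.55 decision threshold above are only
    # applied during this evaluation; they are not captured in the saved bundle,
    # so inference code must replicate them.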
    # Save the model bundle (preprocessor + calibrated XGBoost)
    bundle = {
        "preprocessor": meta_preprocessor,
        "model": calibrated_meta,
    }
    joblib.dump(bundle, os.path.join(save_dir, "meta_classifier.pkl"))
    logger.info("Saved Meta-Classifier bundle.")

if __name__ == "__main__":
    import yaml

    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    with open(cfg_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)
    train_meta_classifier(
        config,
        os.path.join(_PROJECT_ROOT, config["paths"]["splits_dir"]),
        os.path.join(_PROJECT_ROOT, config["paths"]["models_dir"]),
    )