"""Train, tune and evaluate a TF-IDF + one-hot logistic-regression classifier.

Reads ``df_train.csv`` / ``df_val.csv`` from a splits directory, produces
out-of-fold probabilities on the training split, grid-searches the
regularisation strength ``C``, evaluates on the validation split and writes
the resulting artifacts (OOF array, confusion matrix, metrics, pipeline) to a
model directory.
"""
import os
import sys
import json
import logging
import time

import numpy as np
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from matplotlib import pyplot as plt

# Two levels above the directory that contains this file.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(name)s | %(levelname)s | %(message)s")
logger = logging.getLogger("logistic_model")

# Bucket labels for which per-bucket validation accuracy is reported.
_LENGTH_BUCKETS = ("short", "medium", "long")


def load_data(splits_dir):
    """Load the train and validation splits as pandas DataFrames.

    Args:
        splits_dir: Directory containing ``df_train.csv`` and ``df_val.csv``.

    Returns:
        A ``(train_df, val_df)`` tuple. In both frames the ``clean_text``
        column has missing values replaced by ``""`` and is coerced to ``str``.

    Raises:
        FileNotFoundError: If either CSV file is missing.
        KeyError: If a frame has no ``clean_text`` column.
    """
    train_df = pd.read_csv(os.path.join(splits_dir, "df_train.csv"))
    val_df = pd.read_csv(os.path.join(splits_dir, "df_val.csv"))

    for frame in (train_df, val_df):
        # The text vectorizer needs real strings: blank out NaN and coerce any
        # value that the CSV parser inferred as a non-string type.
        frame["clean_text"] = frame["clean_text"].fillna("").astype(str)

    return train_df, val_df


def plot_and_save_cm(y_true, y_pred, path, title="Logistic Regression Confusion Matrix"):
    """Render the confusion matrix of ``y_pred`` vs ``y_true`` and save it to ``path``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        path: Output image path handed to ``Figure.savefig``.
        title: Title drawn above the matrix.
    """
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(5, 5))
    try:
        ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.3)
        # Annotate each cell with its count (row = true label, column = predicted).
        for (row, col), count in np.ndenumerate(cm):
            ax.text(x=col, y=row, s=str(count), va="center", ha="center", size="xx-large")
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(path)
    finally:
        # Always release the figure, even when saving fails, so repeated calls
        # do not accumulate open figures.
        plt.close(fig)


def _build_pipeline(max_features):
    """Build the TF-IDF + one-hot preprocessing and logistic-regression pipeline."""
    preprocessor = ColumnTransformer(
        transformers=[
            ("tfidf", TfidfVectorizer(max_features=max_features, ngram_range=(1, 2)), "clean_text"),
            ("cat", OneHotEncoder(handle_unknown="ignore"), ["text_length_bucket"]),
        ],
        remainder="drop",
    )
    log_reg = LogisticRegression(class_weight="balanced", random_state=42, max_iter=1000)
    return Pipeline(steps=[("preprocessor", preprocessor), ("classifier", log_reg)])


def _bucket_accuracy(val_df, y_true, y_pred, buckets=_LENGTH_BUCKETS):
    """Return ``{bucket: accuracy}`` for each listed bucket present in ``val_df``."""
    correct = np.asarray(y_pred) == np.asarray(y_true)
    accuracy = {}
    for bucket in buckets:
        # Convert to a plain positional boolean mask before indexing NumPy data.
        mask = (val_df["text_length_bucket"] == bucket).to_numpy()
        if mask.any():
            accuracy[bucket] = float(correct[mask].mean())
    return accuracy


def train_logistic_model(cfg, splits_dir, save_dir):
    """Train, tune, evaluate and persist the logistic-regression pipeline.

    Artifacts written to ``save_dir``: ``lr_oof.npy`` (positive-class
    out-of-fold probabilities on the training split), ``cm.png``,
    ``metrics.json`` and ``logistic_model.pkl``.

    Args:
        cfg: Configuration mapping; ``cfg["preprocessing"]["max_tfidf_features"]``
            is used when present (default 50000).
        splits_dir: Directory containing ``df_train.csv`` and ``df_val.csv``.
        save_dir: Output directory; created if it does not exist.
    """
    logger.info("Initializing Logistic Regression Training...")
    os.makedirs(save_dir, exist_ok=True)

    train_df, val_df = load_data(splits_dir)
    y_train = train_df["binary_label"].values
    y_val = val_df["binary_label"].values

    # `or {}` also covers a section that is present in the config but empty (None).
    preprocessing_cfg = cfg.get("preprocessing") or {}
    max_features = preprocessing_cfg.get("max_tfidf_features", 50000)
    pipeline = _build_pipeline(max_features)

    # K-Fold OOF predictions.
    # NOTE(review): these come from the untuned pipeline (default C), while the
    # saved model below uses the grid-searched C — confirm this is intended.
    logger.info("Generating 5-Fold OOF predictions on Train set...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # method='predict_proba' yields one probability column per class; column 1 is kept.
    oof_probas = cross_val_predict(pipeline, train_df, y_train, cv=cv, method="predict_proba", n_jobs=-1)
    np.save(os.path.join(save_dir, "lr_oof.npy"), oof_probas[:, 1])
    logger.info("Saved OOF predictions (lr_oof.npy)")

    # Hyperparameter tuning on the full training split.
    logger.info("Hyperparameter tuning C over 5-folds...")
    param_grid = {"classifier__C": [0.1, 1.0, 10.0]}
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring="f1_macro", n_jobs=-1)
    grid_search.fit(train_df, y_train)
    best_pipeline = grid_search.best_estimator_
    logger.info("Best parameter C: %s", grid_search.best_params_["classifier__C"])

    # Validation evaluation at a fixed 0.5 decision threshold.
    val_probas = best_pipeline.predict_proba(val_df)[:, 1]
    val_preds = (val_probas >= 0.5).astype(int)
    logger.info("Validation Classification Report:\n%s", classification_report(y_val, val_preds))
    roc_auc = roc_auc_score(y_val, val_probas)
    logger.info("ROC-AUC: %.4f", roc_auc)

    # Evaluation artifacts.
    plot_and_save_cm(y_val, val_preds, os.path.join(save_dir, "cm.png"))
    metrics = {
        "roc_auc": float(roc_auc),
        "bucket_accuracy": _bucket_accuracy(val_df, y_val, val_preds),
    }
    with open(os.path.join(save_dir, "metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    # Persist the fitted pipeline.
    joblib.dump(best_pipeline, os.path.join(save_dir, "logistic_model.pkl"))
    logger.info("Saved Logistic Regression Pipeline to format `logistic_model.pkl`.")


def main():
    """Load ``config/config.yaml`` from the project root and run training."""
    import yaml

    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    with open(cfg_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    s_dir = os.path.join(_PROJECT_ROOT, config["paths"]["splits_dir"])
    m_dir = os.path.join(_PROJECT_ROOT, config["paths"]["models_dir"], "logistic_model")

    t0 = time.time()
    train_logistic_model(config, s_dir, m_dir)
    print(f"Total time: {time.time() - t0:.2f}s")


if __name__ == "__main__":
    main()