| """ |
| Relationship Longevity Predictor — Ensemble Model |
| =================================================== |
| Based on: Fisman et al. Speed Dating Experiment (Columbia, 2002-2004) |
| Architecture: XGBoost + LightGBM + CatBoost ensemble with engineered dyadic features |
| Reference: "Why do tree-based models still outperform deep learning on tabular data?" |
| (Grinsztajn et al., NeurIPS 2022, arxiv:2207.08815) |
| |
| Task: Given two individuals' personal/professional profiles, predict: |
| 1. is_match (binary) — mutual compatibility |
| 2. compatibility_score (0-1 continuous) — strength of predicted relationship |
| """ |
|
|
| import os |
| import json |
| import warnings |
| import numpy as np |
| import pandas as pd |
| import matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| from datasets import load_dataset |
| from sklearn.model_selection import StratifiedKFold, cross_val_predict |
| from sklearn.preprocessing import LabelEncoder, StandardScaler |
| from sklearn.metrics import ( |
| roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, |
| classification_report, confusion_matrix, average_precision_score |
| ) |
| from sklearn.calibration import CalibratedClassifierCV |
| from xgboost import XGBClassifier |
| from lightgbm import LGBMClassifier |
| from catboost import CatBoostClassifier |
| from sklearn.ensemble import VotingClassifier, StackingClassifier |
| from sklearn.linear_model import LogisticRegression |
| import joblib |
| import shap |
|
|
# Suppress every Python warning for the rest of the run and seed NumPy's global
# RNG so NumPy-driven randomness is repeatable.
warnings.filterwarnings('ignore')
np.random.seed(42)

# Root directory for every artefact this script writes (models, CSVs, figures).
OUTPUT_DIR = "/app/model_output"
for _artifact_dir in (OUTPUT_DIR, f"{OUTPUT_DIR}/figures"):
    os.makedirs(_artifact_dir, exist_ok=True)
|
|
| |
| |
| |
print("=" * 60)
print("STEP 1: Loading and Auditing Data")
print("=" * 60)

# Load the "dating" configuration of the dataset and continue with a DataFrame.
ds = load_dataset("mstz/speeddating", "dating", split="train")
df = ds.to_pandas()

print(f"\nDataset shape: {df.shape}")
print(f"Columns: {len(df.columns)}")

# Target balance: relative frequencies first, then the 0-to-1 count ratio.
print(f"\nTarget distribution (is_match):")
print(df['is_match'].value_counts(normalize=True))
match_counts = df['is_match'].value_counts()
print(f"\nClass imbalance ratio: {match_counts[0] / match_counts[1]:.2f}:1")

# Missing-value audit: columns that have gaps, followed by the grand total.
print(f"\nMissing values per column:")
missing = df.isnull().sum()
print(missing[missing > 0])
print(f"\nTotal missing values: {missing.sum()}")

# Summary statistics for the two age columns.
age_stats = ['mean', 'std', 'min', 'max']
print(f"\nAge statistics:")
for who, age_column in (("Dater", 'dater_age'), ("Dated", 'dated_age')):
    print(f" {who} age: {df[age_column].describe()[age_stats].to_dict()}")
|
|
| |
| |
| |
print("\n" + "=" * 60)
print("STEP 2: Feature Engineering (Dyadic Pairwise Features)")
print("=" * 60)

# Attribute names used to assemble the column-group lists below. The five core
# traits have self-reported columns; the rated/importance groups additionally
# include "shared_interests".
_core_traits = ('attractiveness', 'sincerity', 'intelligence', 'humor', 'ambition')
_rated_attributes = _core_traits + ('shared_interests',)

# NOTE(review): the meaning of each group is inferred from the column names only
# (self ratings, ratings given, ratings received, stated importances) — confirm
# against the dataset card.
trait_cols_dater = [f'self_reported_{name}_of_dater' for name in _core_traits]
perception_cols = [f'reported_{name}_of_dated_from_dater' for name in _rated_attributes]
scored_by_partner_cols = [f'{name}_score_of_dater_from_dated' for name in _rated_attributes]
importance_dater_cols = [f'{name}_importance_for_dater' for name in _rated_attributes]
importance_dated_cols = [f'{name}_importance_for_dated' for name in _rated_attributes]

# Every column whose name starts with the "dater_interest_in_" prefix.
interest_cols = [column for column in df.columns if column.startswith('dater_interest_in_')]
|
|
| |
print("Creating dyadic interaction features...")

traits = ['attractiveness', 'sincerity', 'intelligence', 'humor', 'ambition']

# Pass 1 — combine the rating the dater gave with the rating the dater received:
# difference, mean and product. Traits lacking either column are skipped.
for trait in traits:
    given_col = f'reported_{trait}_of_dated_from_dater'
    received_col = f'{trait}_score_of_dater_from_dated'
    if not (given_col in df.columns and received_col in df.columns):
        continue
    given, received = df[given_col], df[received_col]
    df[f'{trait}_perception_gap'] = given - received
    df[f'{trait}_mutual_score'] = (given + received) / 2
    df[f'{trait}_perception_product'] = given * received

# Pass 2 — stated importance multiplied by the received score, divided by 100.
for trait in traits:
    weight_col = f'{trait}_importance_for_dater'
    received_col = f'{trait}_score_of_dater_from_dated'
    if not (weight_col in df.columns and received_col in df.columns):
        continue
    df[f'{trait}_value_fulfillment_dater'] = df[weight_col] * df[received_col] / 100

# Pass 3 — self rating minus the score received from the partner.
for trait in traits:
    own_col = f'self_reported_{trait}_of_dater'
    received_col = f'{trait}_score_of_dater_from_dated'
    if not (own_col in df.columns and received_col in df.columns):
        continue
    df[f'{trait}_self_awareness_gap'] = df[own_col] - df[received_col]


def _sum_over_traits(suffix):
    """Add up the ``<trait>_<suffix>`` columns, treating missing values as 0."""
    return sum(df[f'{t}_{suffix}'].fillna(0) for t in traits)


# Aggregates across traits. Three are averages; total_value_fulfillment is a
# plain sum (no division), exactly as the per-trait products are combined.
df['total_perception_gap'] = _sum_over_traits('perception_gap') / len(traits)
df['total_mutual_score'] = _sum_over_traits('mutual_score') / len(traits)
df['total_value_fulfillment'] = _sum_over_traits('value_fulfillment_dater')
df['total_self_awareness_gap'] = _sum_over_traits('self_awareness_gap') / len(traits)
|
|
| |
# Product of the dater's expected satisfaction and how much they liked this partner.
df['expectation_meets_reality'] = df['expected_satisfaction_of_dater'] * df['dater_liked_dated']

# Difference between two rescaled columns: expected likes divided by 20 and the
# "probability dated wants to date" column divided by 10.
# NOTE(review): the divisors presumably bring both onto a 0-1 scale — confirm
# the raw ranges of these columns.
df['confidence_calibration'] = (
    df['expected_number_of_likes_of_dater_from_20_people'] / 20 -
    df['probability_dated_wants_to_date'] / 10
)

# Age features. The "abs" feature is made explicitly non-negative so it is
# correct whether or not `age_difference` is stored as a signed value; for an
# already non-negative column this is a no-op.
df['age_gap_abs'] = df['age_difference'].abs()
df['age_gap_squared'] = df['age_difference'] ** 2
# 1 when the dater is strictly older; comparisons involving a missing age give 0.
df['dater_is_older'] = (df['dater_age'] > df['dated_age']).astype(int)
df['combined_age'] = df['dater_age'] + df['dated_age']
|
|
| |
# Row-wise summary statistics over the dater's interest columns.
if interest_cols:
    interests = df[interest_cols]
    df['interest_diversity'] = interests.std(axis=1)
    df['interest_intensity'] = interests.mean(axis=1)
    df['max_interest'] = interests.max(axis=1)
    df['min_interest'] = interests.min(axis=1)
    df['interest_range'] = df['max_interest'] - df['min_interest']

# Spread and peak of the dater's importance values.
if importance_dater_cols:
    dater_importances = df[importance_dater_cols]
    df['importance_concentration_dater'] = dater_importances.std(axis=1)
    df['max_importance_dater'] = dater_importances.max(axis=1)

# Spread of the partner's importance values.
if importance_dated_cols:
    df['importance_concentration_dated'] = df[importance_dated_cols].std(axis=1)

# Absolute difference between the two people's importance for each attribute
# (paired by list position), plus the sum of those differences.
_alignment_cols = []
for i, (d1, d2) in enumerate(zip(importance_dater_cols, importance_dated_cols)):
    alignment_col = f'importance_alignment_{i}'
    df[alignment_col] = (df[d1] - df[d2]).abs()
    _alignment_cols.append(alignment_col)
df['total_importance_alignment'] = sum(df[col] for col in _alignment_cols)
|
|
| |
# Integer-encode race for both people with ONE shared encoder. The encoder is
# fitted on the union of both columns: LabelEncoder.transform raises ValueError
# on labels it has not seen, so fitting on `dater_race` alone would fail if a
# label occurred only in `dated_race`. When both columns hold the same label
# set the resulting encoding is identical to fitting on `dater_race` alone.
le_race = LabelEncoder()
dater_race_filled = df['dater_race'].fillna('Unknown')
dated_race_filled = df['dated_race'].fillna('Unknown')
le_race.fit(pd.concat([dater_race_filled, dated_race_filled], ignore_index=True))
df['dater_race_encoded'] = le_race.transform(dater_race_filled)
df['dated_race_encoded'] = le_race.transform(dated_race_filled)

# 1 when both raw race values are equal; rows with a missing value compare
# unequal and therefore get 0.
df['race_match'] = (df['dater_race'] == df['dated_race']).astype(int)

# 1 when both people made the same decision. This is computed from the two
# decision columns, which are kept out of the model via `exclude_cols`; this
# column is listed there as well.
df['decision_agreement'] = (df['dater_wants_to_date'] == df['dated_wants_to_date']).astype(int)

print(f"Total features after engineering: {len(df.columns)}")
|
|
| |
| |
| |
print("\n" + "=" * 60)
print("STEP 3: Preparing Final Feature Set")
print("=" * 60)

# Columns that must not reach the model: the target, both individual decisions
# and the agreement flag built from them, the raw race columns, and source
# columns that are replaced by integer copies just below.
exclude_cols = [
    'is_match', 'dater_wants_to_date', 'dated_wants_to_date',
    'dater_race', 'dated_race', 'already_met_before', 'is_dater_male',
    'decision_agreement'
]

# Integer copies of three flag columns so they pass the numeric dtype filter.
_int_copies = (
    ('is_dater_male', 'is_dater_male_int'),
    ('are_same_race', 'are_same_race_int'),
    ('already_met_before', 'already_met_int'),
)
for source_col, int_col in _int_copies:
    df[int_col] = df[source_col].astype(int)

exclude_cols.append('are_same_race')

# Keep every non-excluded column whose dtype is one of these four numeric types.
_numeric_dtypes = ['float64', 'int64', 'int32', 'float32']
feature_cols = []
for column in df.columns:
    if column in exclude_cols:
        continue
    if df[column].dtype in _numeric_dtypes:
        feature_cols.append(column)

print(f"Feature columns ({len(feature_cols)}):")
for position, column in enumerate(feature_cols, start=1):
    print(f" {position}. {column}")

X = df[feature_cols].copy()
y = df['is_match'].values

# Median imputation, computed over the whole table.
# NOTE(review): the medians are taken before the CV split, so validation rows
# contribute to the imputed values.
print(f"\nMissing values in features: {X.isnull().sum().sum()}")
X = X.fillna(X.median())

print(f"\nFinal X shape: {X.shape}")
print(f"Target distribution: {np.bincount(y)}")
print(f"Positive rate: {y.mean():.4f}")
|
|
| |
| |
| |
print("\n" + "=" * 60)
print("STEP 4: Training Ensemble (XGBoost + LightGBM + CatBoost)")
print("=" * 60)

# Ratio of negative to positive labels, passed to XGBoost and LightGBM as
# `scale_pos_weight` to up-weight the positive class.
scale_pos_weight = (y == 0).sum() / (y == 1).sum()
print(f"Scale positive weight: {scale_pos_weight:.2f}")

# Cross-validation model: XGBoost.
xgb_model = XGBClassifier(
    n_estimators=1500,
    max_depth=7,
    learning_rate=0.03,
    colsample_bytree=0.8,
    subsample=0.8,
    min_child_weight=3,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    eval_metric='auc',
    tree_method='hist',
    random_state=42,
    n_jobs=-1
)

# Cross-validation model: LightGBM.
lgb_model = LGBMClassifier(
    n_estimators=1500,
    max_depth=7,
    learning_rate=0.03,
    colsample_bytree=0.8,
    subsample=0.8,
    # LightGBM only applies `subsample` when bagging is enabled through a
    # positive `subsample_freq` (default 0 = disabled). Without this line the
    # 0.8 row fraction above is silently ignored; 1 re-samples every iteration.
    subsample_freq=1,
    min_child_samples=10,
    reg_alpha=0.1,
    reg_lambda=1.0,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Cross-validation model: CatBoost, with its built-in balanced class weighting.
cat_model = CatBoostClassifier(
    iterations=1500,
    depth=7,
    learning_rate=0.03,
    l2_leaf_reg=3.0,
    auto_class_weights='Balanced',
    random_seed=42,
    verbose=0
)
|
|
| |
# Stratified, shuffled folds with a fixed seed.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Out-of-fold probability buffers: each sample is written exactly once, by the
# fold in which it is held out.
_n_samples = len(y)
oof_xgb = np.zeros(_n_samples)
oof_lgb = np.zeros(_n_samples)
oof_cat = np.zeros(_n_samples)
oof_ensemble = np.zeros(_n_samples)

# Running sums of per-fold importances; divided by the fold count afterwards.
feature_importance_xgb = np.zeros(len(feature_cols))
feature_importance_lgb = np.zeros(len(feature_cols))

print(f"\nRunning {n_splits}-fold cross-validation...")
for fold_number, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    print(f"\n--- Fold {fold_number}/{n_splits} ---")
    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_val, y_val = X.iloc[val_idx], y[val_idx]

    # XGBoost: refit on this fold, score the held-out rows.
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    xgb_val = xgb_model.predict_proba(X_val)[:, 1]
    oof_xgb[val_idx] = xgb_val
    feature_importance_xgb += xgb_model.feature_importances_
    print(f" XGBoost AUC: {roc_auc_score(y_val, xgb_val):.4f}")

    # LightGBM.
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    lgb_val = lgb_model.predict_proba(X_val)[:, 1]
    oof_lgb[val_idx] = lgb_val
    feature_importance_lgb += lgb_model.feature_importances_
    print(f" LightGBM AUC: {roc_auc_score(y_val, lgb_val):.4f}")

    # CatBoost (no importances are accumulated for it).
    cat_model.fit(X_train, y_train, eval_set=(X_val, y_val))
    cat_val = cat_model.predict_proba(X_val)[:, 1]
    oof_cat[val_idx] = cat_val
    print(f" CatBoost AUC: {roc_auc_score(y_val, cat_val):.4f}")

    # Fixed-weight blend of the three probability vectors.
    blended_val = 0.4 * xgb_val + 0.35 * lgb_val + 0.25 * cat_val
    oof_ensemble[val_idx] = blended_val
    print(f" Ensemble AUC: {roc_auc_score(y_val, blended_val):.4f}")

# Turn the accumulated importance sums into per-fold means.
feature_importance_xgb /= n_splits
feature_importance_lgb /= n_splits
|
|
| |
| |
| |
print("\n" + "=" * 60)
print("STEP 5: Overall Performance Evaluation")
print("=" * 60)

from sklearn.metrics import brier_score_loss, precision_recall_curve

# Out-of-fold probabilities per model, keyed by display name.
models_results = {
    'XGBoost': oof_xgb,
    'LightGBM': oof_lgb,
    'CatBoost': oof_cat,
    'Ensemble': oof_ensemble
}


def _summarise_predictions(model_name, scores):
    """Return one results-table row for *scores* evaluated against ``y``.

    Ranking metrics use the raw probabilities. Accuracy, F1, precision and
    recall use hard labels cut at the threshold that maximises F1 along the
    precision-recall curve of the same predictions.
    """
    precision_curve, recall_curve, thresholds = precision_recall_curve(y, scores)
    # The small constant avoids 0/0 where precision and recall are both zero.
    f1_curve = 2 * (precision_curve * recall_curve) / (precision_curve + recall_curve + 1e-10)
    threshold = thresholds[np.argmax(f1_curve)]
    hard_labels = (scores >= threshold).astype(int)
    return {
        'Model': model_name,
        'AUC-ROC': roc_auc_score(y, scores),
        'AUC-PR': average_precision_score(y, scores),
        'Brier Score': brier_score_loss(y, scores),
        'Accuracy': accuracy_score(y, hard_labels),
        'F1': f1_score(y, hard_labels),
        'Precision': precision_score(y, hard_labels),
        'Recall': recall_score(y, hard_labels),
        'Optimal Threshold': threshold
    }


# (printed label, results-table key) in console order.
_console_fields = (
    ('AUC-ROC', 'AUC-ROC'),
    ('AUC-PR', 'AUC-PR'),
    ('Brier', 'Brier Score'),
    ('Accuracy', 'Accuracy'),
    ('F1', 'F1'),
    ('Precision', 'Precision'),
    ('Recall', 'Recall'),
    ('Threshold', 'Optimal Threshold'),
)

results_table = []
for name, preds in models_results.items():
    summary = _summarise_predictions(name, preds)
    results_table.append(summary)

    print(f"\n{name}:")
    for label, key in _console_fields:
        print(f" {label}: {summary[key]:.4f}")

results_df = pd.DataFrame(results_table)
results_df.to_csv(f"{OUTPUT_DIR}/evaluation_results.csv", index=False)

# The "best" model is the row with the highest AUC-ROC; its predictions and
# threshold are reused by later steps.
_best_idx = results_df['AUC-ROC'].idxmax()
best_model_name = results_df.loc[_best_idx, 'Model']
best_preds = models_results[best_model_name]
best_threshold = results_df.loc[_best_idx, 'Optimal Threshold']
y_pred_best = (best_preds >= best_threshold).astype(int)

print(f"\n\nBest Model: {best_model_name}")
print("\nDetailed Classification Report:")
print(classification_report(y, y_pred_best, target_names=['No Match', 'Match']))
|
|
| |
| |
| |
print("\n" + "=" * 60)
print("STEP 6: Training Final Models on Full Data")
print("=" * 60)

# Final models: same hyper-parameter families as the cross-validation models
# but with 2000 boosting rounds, fitted on all rows.
final_xgb = XGBClassifier(
    n_estimators=2000, max_depth=7, learning_rate=0.03,
    colsample_bytree=0.8, subsample=0.8, min_child_weight=3,
    gamma=0.1, reg_alpha=0.1, reg_lambda=1.0,
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False, eval_metric='auc',
    tree_method='hist', random_state=42, n_jobs=-1
)

final_lgb = LGBMClassifier(
    n_estimators=2000, max_depth=7, learning_rate=0.03,
    colsample_bytree=0.8, subsample=0.8,
    # LightGBM ignores `subsample` unless bagging is enabled with a positive
    # `subsample_freq` (default 0); 1 re-samples rows at every iteration.
    subsample_freq=1,
    min_child_samples=10,
    reg_alpha=0.1, reg_lambda=1.0,
    scale_pos_weight=scale_pos_weight,
    random_state=42, n_jobs=-1, verbose=-1
)

final_cat = CatBoostClassifier(
    iterations=2000, depth=7, learning_rate=0.03,
    l2_leaf_reg=3.0, auto_class_weights='Balanced',
    random_seed=42, verbose=0
)

print("Training final XGBoost...")
final_xgb.fit(X, y)
print("Training final LightGBM...")
final_lgb.fit(X, y)
print("Training final CatBoost...")
final_cat.fit(X, y)

# Persist the fitted models plus what inference needs to rebuild inputs:
# the ordered feature list and the race encoder.
joblib.dump(final_xgb, f"{OUTPUT_DIR}/xgboost_model.joblib")
joblib.dump(final_lgb, f"{OUTPUT_DIR}/lightgbm_model.joblib")
final_cat.save_model(f"{OUTPUT_DIR}/catboost_model.cbm")
joblib.dump(feature_cols, f"{OUTPUT_DIR}/feature_columns.joblib")
joblib.dump(le_race, f"{OUTPUT_DIR}/race_encoder.joblib")

# The config describes the weighted ensemble, so its threshold and metrics are
# read from the 'Ensemble' row of the results table. Using the best-by-AUC row
# instead would pair the ensemble weights with another model's numbers whenever
# a single model happens to score the highest AUC.
ensemble_row = results_df.loc[results_df['Model'] == 'Ensemble'].iloc[0]
ensemble_config = {
    'weights': {'xgboost': 0.4, 'lightgbm': 0.35, 'catboost': 0.25},
    'optimal_threshold': float(ensemble_row['Optimal Threshold']),
    'feature_columns': feature_cols,
    'model_files': {
        'xgboost': 'xgboost_model.joblib',
        'lightgbm': 'lightgbm_model.joblib',
        'catboost': 'catboost_model.cbm'
    },
    'metrics': {
        'auc_roc': float(ensemble_row['AUC-ROC']),
        'auc_pr': float(ensemble_row['AUC-PR']),
        'f1': float(ensemble_row['F1']),
        'accuracy': float(ensemble_row['Accuracy']),
    }
}
with open(f"{OUTPUT_DIR}/ensemble_config.json", "w") as f:
    json.dump(ensemble_config, f, indent=2)

print("Models saved!")
|
|
| |
| |
| |
print("\n" + "=" * 60)
print("STEP 7: Feature Importance & SHAP Analysis")
print("=" * 60)

# Fold-averaged importances from the cross-validation models, one row per feature.
fi_df = pd.DataFrame({
    'feature': feature_cols,
    'xgb_importance': feature_importance_xgb,
    'lgb_importance': feature_importance_lgb
}).sort_values('xgb_importance', ascending=False)

# The two importance columns are on different scales, so features are ordered
# by the mean of their per-model ranks (rank 1 = most important).
fi_df['combined_rank'] = (
    fi_df['xgb_importance'].rank(ascending=False) +
    fi_df['lgb_importance'].rank(ascending=False)
) / 2
fi_df = fi_df.sort_values('combined_rank')

print("\nTop 20 Most Important Features:")
for _, row in fi_df.head(20).iterrows():
    print(f" {row['feature']}: XGB={row['xgb_importance']:.4f}, LGB={row['lgb_importance']:.0f}")

fi_df.to_csv(f"{OUTPUT_DIR}/feature_importance.csv", index=False)

# Side-by-side bar charts of the top features. The count is clamped to the
# number of available features so the bar positions always match the data
# (a fixed 25 would raise a shape error with fewer features).
top_n = min(25, len(fi_df))
top_fi = fi_df.head(top_n)

fig, axes = plt.subplots(1, 2, figsize=(16, 10))
_panels = (
    (axes[0], 'xgb_importance', 'steelblue', 'XGBoost Feature Importance'),
    (axes[1], 'lgb_importance', 'coral', 'LightGBM Feature Importance'),
)
for axis, importance_col, colour, title in _panels:
    axis.barh(range(top_n), top_fi[importance_col].values, color=colour)
    axis.set_yticks(range(top_n))
    axis.set_yticklabels(top_fi['feature'].values, fontsize=8)
    axis.set_title(title, fontsize=12)
    # Highest-ranked feature at the top of the chart.
    axis.invert_yaxis()

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/figures/feature_importance.png", dpi=150, bbox_inches='tight')
plt.close()
|
|
| |
# SHAP values of the final XGBoost model over the full feature matrix.
print("\nRunning SHAP analysis (XGBoost)...")
explainer = shap.TreeExplainer(final_xgb)
shap_values = explainer.shap_values(X)

# Summary plot, limited to 25 features.
fig, ax = plt.subplots(figsize=(12, 10))
shap.summary_plot(shap_values, X, feature_names=feature_cols, show=False, max_display=25)
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/figures/shap_summary.png", dpi=150, bbox_inches='tight')
plt.close()

# Feature value vs. SHAP value scatter for the six best-ranked features,
# filling a 2x3 grid in row-major order.
top_features_for_shap = fi_df.head(6)['feature'].values
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
for panel, feat in zip(axes.flat, top_features_for_shap):
    column_position = feature_cols.index(feat)
    panel.scatter(X[feat], shap_values[:, column_position], alpha=0.3, s=5, c='steelblue')
    panel.set_xlabel(feat, fontsize=8)
    panel.set_ylabel('SHAP value')
    panel.axhline(y=0, color='grey', linestyle='--', alpha=0.5)
plt.suptitle('SHAP Dependence Plots — Top 6 Features', fontsize=14)
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/figures/shap_dependence.png", dpi=150, bbox_inches='tight')
plt.close()
|
|
| |
| |
| |
print("\n" + "=" * 60)
print("STEP 8: Visualization")
print("=" * 60)

from sklearn.metrics import roc_curve

# One ROC curve per model from its out-of-fold probabilities, plus the
# diagonal of a random classifier for reference.
fig, ax = plt.subplots(figsize=(8, 8))
for name, preds in models_results.items():
    false_pos_rate, true_pos_rate, _ = roc_curve(y, preds)
    model_auc = roc_auc_score(y, preds)
    ax.plot(false_pos_rate, true_pos_rate, label=f'{name} (AUC={model_auc:.4f})', linewidth=2)

ax.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random')
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves — Relationship Prediction Models', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/figures/roc_curves.png", dpi=150, bbox_inches='tight')
plt.close(fig)
|
|
| |
# Confusion matrix of the best model's thresholded out-of-fold predictions.
_class_labels = ['No Match', 'Match']
cm = confusion_matrix(y, y_pred_best)
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=_class_labels,
    yticklabels=_class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
ax.set_title(f'{best_model_name} — Confusion Matrix', fontsize=14)
fig.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/figures/confusion_matrix.png", dpi=150, bbox_inches='tight')
plt.close(fig)
|
|
| |
# Density histograms of the best model's predicted probabilities, one per true
# class, with the chosen decision threshold drawn as a vertical line.
fig, ax = plt.subplots(figsize=(10, 6))
_class_styles = ((0, 'No Match', 'salmon'), (1, 'Match', 'steelblue'))
for class_value, class_label, colour in _class_styles:
    ax.hist(
        best_preds[y == class_value],
        bins=50,
        alpha=0.6,
        label=class_label,
        color=colour,
        density=True,
    )
ax.axvline(x=best_threshold, color='black', linestyle='--', linewidth=2, label=f'Threshold ({best_threshold:.3f})')
ax.set_xlabel('Predicted Probability', fontsize=12)
ax.set_ylabel('Density', fontsize=12)
ax.set_title('Distribution of Predicted Compatibility Scores', fontsize=14)
ax.legend(fontsize=11)
fig.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/figures/probability_distribution.png", dpi=150, bbox_inches='tight')
plt.close(fig)
|
|
| |
| |
| |
print("\n" + "=" * 60)
print("STEP 9: Creating Prediction Interface")
print("=" * 60)

# Source text of a standalone inference module; it is written verbatim to
# predictor.py below and is never executed by this script.
# NOTE(review): in the generated class, `_build_features` raises
# NotImplementedError, so `predict` cannot return a result until the feature
# mapping is implemented. The label cut-offs (0.7 / 0.4) are fixed in the text
# and do not read `optimal_threshold` from ensemble_config.json.
prediction_code = '''
import joblib
import json
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

class RelationshipPredictor:
    """
    Relationship Longevity Predictor

    Predicts compatibility between two individuals based on their
    personal profiles, values, and interests.

    Returns:
    - compatibility_score (0-1): Predicted probability of successful match
    - prediction: "High Compatibility" / "Moderate Compatibility" / "Low Compatibility"
    - key_factors: Top factors driving the prediction
    """

    def __init__(self, model_dir="./"):
        self.xgb = joblib.load(f"{model_dir}/xgboost_model.joblib")
        self.lgb = joblib.load(f"{model_dir}/lightgbm_model.joblib")
        self.cat = CatBoostClassifier()
        self.cat.load_model(f"{model_dir}/catboost_model.cbm")
        self.feature_cols = joblib.load(f"{model_dir}/feature_columns.joblib")

        with open(f"{model_dir}/ensemble_config.json") as f:
            self.config = json.load(f)

    def predict(self, person_a: dict, person_b: dict) -> dict:
        """
        Predict relationship compatibility between two people.

        Args:
        person_a: Dict with keys like age, race, interests, personality scores
        person_b: Dict with same structure

        Returns:
        Dict with compatibility_score, prediction label, and key factors
        """
        # Build feature vector from the two profiles
        features = self._build_features(person_a, person_b)

        # Ensemble prediction
        xgb_prob = self.xgb.predict_proba(features)[:, 1][0]
        lgb_prob = self.lgb.predict_proba(features)[:, 1][0]
        cat_prob = self.cat.predict_proba(features)[:, 1][0]

        w = self.config['weights']
        score = w['xgboost'] * xgb_prob + w['lightgbm'] * lgb_prob + w['catboost'] * cat_prob

        if score >= 0.7:
            label = "High Compatibility"
        elif score >= 0.4:
            label = "Moderate Compatibility"
        else:
            label = "Low Compatibility"

        return {
            'compatibility_score': round(float(score), 4),
            'prediction': label,
            'individual_models': {
                'xgboost': round(float(xgb_prob), 4),
                'lightgbm': round(float(lgb_prob), 4),
                'catboost': round(float(cat_prob), 4),
            }
        }

    def _build_features(self, a, b):
        """Build engineered feature vector from two person profiles."""
        # This would map raw profile inputs to the trained feature space
        # Implementation depends on the input format
        raise NotImplementedError(
            "Implement feature mapping based on your input format. "
            "See feature_columns.joblib for required features."
        )

# Usage example:
# predictor = RelationshipPredictor("./model_output")
# result = predictor.predict(person_a_profile, person_b_profile)
'''

# Write the inference module next to the saved models.
with open(f"{OUTPUT_DIR}/predictor.py", "w") as f:
    f.write(prediction_code)
|
|
| |
| |
| |
print("\n" + "=" * 60)
print("FINAL SUMMARY")
print("=" * 60)

# Derive the reported counts from the data instead of hard-coding them. A
# feature counts as "original" when its name is a column of the loaded dataset;
# everything else in `feature_cols` was created by this script.
_loaded_columns = set(ds.column_names)
_n_engineered = sum(1 for column in feature_cols if column not in _loaded_columns)
_n_original = len(feature_cols) - _n_engineered

# Row of the best model (highest AUC-ROC), looked up once.
_best_idx = results_df['AUC-ROC'].idxmax()

# NOTE: "551 individuals" is a fixed figure in the text; it is not computed here.
print(f"""
Relationship Longevity Prediction Model — Training Complete
============================================================

Dataset: Fisman Speed Dating Experiment (mstz/speeddating)
- {len(df):,} speed-dating encounters between 551 individuals
- {_n_original} original features + {_n_engineered} engineered features = {len(feature_cols)} total
- Match rate: {y.mean():.1%} (highly imbalanced)

Models Trained:
1. XGBoost (n_estimators=2000, depth=7)
2. LightGBM (n_estimators=2000, depth=7)
3. CatBoost (iterations=2000, depth=7)
4. Weighted Ensemble (0.4 XGB + 0.35 LGB + 0.25 CAT)

Cross-Validated Performance ({n_splits}-fold):
""")

for _, row in results_df.iterrows():
    print(f" {row['Model']:12s} AUC={row['AUC-ROC']:.4f} F1={row['F1']:.4f} Acc={row['Accuracy']:.4f}")

print(f"""
Best Model: {best_model_name}
AUC-ROC: {results_df.loc[_best_idx, 'AUC-ROC']:.4f}
AUC-PR: {results_df.loc[_best_idx, 'AUC-PR']:.4f}
F1: {results_df.loc[_best_idx, 'F1']:.4f}

Output Files:
- {OUTPUT_DIR}/xgboost_model.joblib
- {OUTPUT_DIR}/lightgbm_model.joblib
- {OUTPUT_DIR}/catboost_model.cbm
- {OUTPUT_DIR}/ensemble_config.json
- {OUTPUT_DIR}/feature_columns.joblib
- {OUTPUT_DIR}/evaluation_results.csv
- {OUTPUT_DIR}/feature_importance.csv
- {OUTPUT_DIR}/figures/*.png (ROC, SHAP, confusion matrix, etc.)
- {OUTPUT_DIR}/predictor.py (inference class)
""")

print("DONE!")
|
|