| """ |
| Phase 1: Divorce Behavior Model (Gottman Four Horsemen) |
| ======================================================= |
| Dataset: Yöntem et al. (2019) — 170 married/divorced Turkish couples |
| 54 questions mapped to Gottman's relationship theory |
| |
| Goal: Train a behavioral risk scoring model that maps communication/behavioral |
| patterns to divorce probability. This model becomes a FEATURE MODULE |
| for the main relationship predictor. |
| |
| Reference: Gottman's "Four Horsemen of the Apocalypse": |
| 1. Contempt — disrespect, mockery, sarcasm, eye-rolling |
| 2. Criticism — attacking character rather than behavior |
| 3. Defensiveness — counter-attacking, playing the victim |
| 4. Stonewalling — withdrawing, shutting down, refusing to engage |
| |
| Plus positive dimensions: |
| 5. Love Maps — knowing partner's inner world, dreams, worries |
| 6. Shared Goals — aligned life direction, values, future plans |
| """ |
|
|
| import os |
| import json |
| import warnings |
| import numpy as np |
| import pandas as pd |
| import matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| from sklearn.model_selection import StratifiedKFold, cross_val_predict, LeaveOneOut |
| from sklearn.metrics import ( |
| roc_auc_score, accuracy_score, f1_score, classification_report, |
| confusion_matrix, precision_score, recall_score, average_precision_score |
| ) |
| from xgboost import XGBClassifier |
| from lightgbm import LGBMClassifier |
| from catboost import CatBoostClassifier |
| import joblib |
| import shap |
|
|
| warnings.filterwarnings('ignore') |
| np.random.seed(42) |
|
|
| OUTPUT_DIR = "/app/phase1_output" |
| os.makedirs(OUTPUT_DIR, exist_ok=True) |
| os.makedirs(f"{OUTPUT_DIR}/figures", exist_ok=True) |
|
|
| |
| |
| |
| print("=" * 70) |
| print("PHASE 1: DIVORCE BEHAVIOR MODEL (GOTTMAN FOUR HORSEMEN)") |
| print("=" * 70) |
|
|
| |
| import urllib.request, zipfile, io |
|
|
| print("\nStep 1: Loading Divorce Predictors dataset...") |
| url = 'https://www.kaggle.com/api/v1/datasets/download/andrewmvd/divorce-prediction' |
| try: |
| req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) |
| with urllib.request.urlopen(req, timeout=30) as r: |
| z = zipfile.ZipFile(io.BytesIO(r.read())) |
| df = pd.read_csv(z.open('divorce_data.csv'), sep=';') |
| print(f" Loaded from Kaggle: {df.shape}") |
| except Exception as e: |
| print(f" Kaggle download failed ({e}), trying alternative...") |
| |
| |
| alt_url = 'https://raw.githubusercontent.com/datasets-br/state-codes/master/data/divorce.csv' |
| try: |
| df = pd.read_csv(alt_url, sep=';') |
| print(f" Loaded from mirror: {df.shape}") |
| except: |
| print(" ERROR: Cannot download dataset. Creating from known structure.") |
| |
| raise RuntimeError("Cannot download Divorce Predictors dataset") |
|
|
| print(f"\nDataset shape: {df.shape}") |
| print(f"Columns: {list(df.columns)}") |
| print(f"\nTarget distribution:") |
| print(df['Divorce'].value_counts()) |
| print(f"Balance ratio: {df['Divorce'].value_counts()[0]}:{df['Divorce'].value_counts()[1]}") |
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("Step 2: Data Audit & Gottman Question Mapping") |
| print("=" * 70) |
|
|
| |
| |
| gottman_mapping = { |
| |
| 'Q1': 'shared_goals', |
| 'Q2': 'shared_goals', |
| 'Q3': 'shared_goals', |
| 'Q4': 'shared_goals', |
| 'Q5': 'shared_goals', |
| 'Q6': 'shared_goals', |
| 'Q7': 'shared_goals', |
| 'Q8': 'shared_goals', |
| 'Q9': 'shared_goals', |
| 'Q10': 'shared_goals', |
| |
| |
| 'Q11': 'love_maps', |
| 'Q12': 'love_maps', |
| 'Q13': 'love_maps', |
| 'Q14': 'love_maps', |
| 'Q15': 'love_maps', |
| 'Q16': 'love_maps', |
| 'Q17': 'love_maps', |
| 'Q18': 'love_maps', |
| 'Q19': 'love_maps', |
| 'Q20': 'love_maps', |
| |
| |
| 'Q21': 'love_maps_deep', |
| 'Q22': 'love_maps_deep', |
| 'Q23': 'love_maps_deep', |
| 'Q24': 'love_maps_deep', |
| 'Q25': 'love_maps_deep', |
| 'Q26': 'love_maps_deep', |
| 'Q27': 'love_maps_deep', |
| 'Q28': 'love_maps_deep', |
| 'Q29': 'love_maps_deep', |
| 'Q30': 'love_maps_deep', |
| |
| |
| 'Q31': 'criticism', |
| 'Q32': 'criticism', |
| 'Q33': 'contempt', |
| 'Q34': 'contempt', |
| 'Q35': 'contempt', |
| 'Q36': 'contempt', |
| 'Q37': 'criticism', |
| 'Q38': 'criticism', |
| 'Q39': 'contempt', |
| 'Q40': 'contempt', |
| |
| |
| 'Q41': 'defensiveness', |
| 'Q42': 'stonewalling', |
| 'Q43': 'stonewalling', |
| 'Q44': 'stonewalling', |
| 'Q45': 'defensiveness', |
| 'Q46': 'defensiveness', |
| 'Q47': 'stonewalling', |
| 'Q48': 'defensiveness', |
| 'Q49': 'defensiveness', |
| 'Q50': 'defensiveness', |
| |
| |
| 'Q51': 'contempt_deep', |
| 'Q52': 'contempt_deep', |
| 'Q53': 'contempt_deep', |
| 'Q54': 'contempt_deep', |
| } |
|
|
| |
| feature_cols = [f'Q{i}' for i in range(1, 55)] |
| missing_cols = [c for c in feature_cols if c not in df.columns] |
| if missing_cols: |
| print(f"WARNING: Missing columns: {missing_cols}") |
| else: |
| print(f"All 54 Gottman questions present ✓") |
|
|
| print(f"\nValue range per question: {df[feature_cols].min().min()} to {df[feature_cols].max().max()}") |
| print(f"Scale: 0 (Never) to 4 (Always)") |
|
|
| |
| print(f"\nPer-question statistics:") |
| desc = df[feature_cols].describe().T |
| print(desc[['mean', 'std', 'min', 'max']].to_string()) |
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("Step 3: Engineering Gottman Composite Scores") |
| print("=" * 70) |
|
|
| |
| dimensions = {} |
| for col, dim in gottman_mapping.items(): |
| if dim not in dimensions: |
| dimensions[dim] = [] |
| dimensions[dim].append(col) |
|
|
| print("\nGottman Dimension Mapping:") |
| for dim, cols in dimensions.items(): |
| print(f" {dim}: {len(cols)} questions → {cols}") |
|
|
| |
| for dim, cols in dimensions.items(): |
| df[f'gottman_{dim}'] = df[cols].mean(axis=1) |
| df[f'gottman_{dim}_std'] = df[cols].std(axis=1) |
| df[f'gottman_{dim}_max'] = df[cols].max(axis=1) |
| df[f'gottman_{dim}_min'] = df[cols].min(axis=1) |
|
|
| |
| horsemen_dims = ['criticism', 'contempt', 'defensiveness', 'stonewalling', 'contempt_deep'] |
| horsemen_cols = [c for c, d in gottman_mapping.items() if d in horsemen_dims] |
| df['gottman_four_horsemen'] = df[horsemen_cols].mean(axis=1) |
| df['gottman_four_horsemen_max'] = df[horsemen_cols].max(axis=1) |
| df['gottman_four_horsemen_intensity'] = df[horsemen_cols].std(axis=1) |
|
|
| |
| positive_dims = ['shared_goals', 'love_maps', 'love_maps_deep'] |
| positive_cols = [c for c, d in gottman_mapping.items() if d in positive_dims] |
| df['gottman_positive'] = df[positive_cols].mean(axis=1) |
| df['gottman_positive_consistency'] = df[positive_cols].std(axis=1) |
|
|
| |
| df['gottman_ratio'] = (df['gottman_positive'] + 0.1) / (df['gottman_four_horsemen'] + 0.1) |
|
|
| |
| df['contempt_x_stonewalling'] = df['gottman_contempt'] * df['gottman_stonewalling'] |
| df['criticism_x_defensiveness'] = df['gottman_criticism'] * df['gottman_defensiveness'] |
| df['love_maps_x_shared_goals'] = df['gottman_love_maps'] * df['gottman_shared_goals'] |
| df['horsemen_minus_positive'] = df['gottman_four_horsemen'] - df['gottman_positive'] |
|
|
| |
| df['contempt_escalation'] = df['gottman_contempt_deep'] - df['gottman_contempt'] |
|
|
| |
| df['overall_risk'] = ( |
| df['gottman_four_horsemen'] * 0.6 + |
| (4 - df['gottman_positive']) * 0.4 |
| ) |
|
|
| gottman_features = [c for c in df.columns if c.startswith('gottman_') or c in [ |
| 'contempt_x_stonewalling', 'criticism_x_defensiveness', |
| 'love_maps_x_shared_goals', 'horsemen_minus_positive', |
| 'contempt_escalation', 'overall_risk' |
| ]] |
|
|
| print(f"\nEngineered Gottman features ({len(gottman_features)}):") |
| for f in gottman_features: |
| print(f" {f}: mean={df[f].mean():.3f}, std={df[f].std():.3f}") |
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("Step 4: Training Divorce Predictor (5-fold + LOOCV)") |
| print("=" * 70) |
|
|
| |
| all_features = feature_cols + gottman_features |
| X = df[all_features].values |
| y = df['Divorce'].values |
|
|
| print(f"Feature set: {len(all_features)} features") |
| print(f"Samples: {len(y)} ({(y==1).sum()} divorced, {(y==0).sum()} married)") |
|
|
| |
| skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) |
|
|
| models = { |
| 'XGBoost': XGBClassifier( |
| n_estimators=500, max_depth=4, learning_rate=0.05, |
| colsample_bytree=0.8, subsample=0.8, min_child_weight=2, |
| gamma=0.2, reg_alpha=0.5, reg_lambda=2.0, |
| use_label_encoder=False, eval_metric='auc', |
| tree_method='hist', random_state=42, n_jobs=-1 |
| ), |
| 'LightGBM': LGBMClassifier( |
| n_estimators=500, max_depth=4, learning_rate=0.05, |
| colsample_bytree=0.8, subsample=0.8, min_child_samples=5, |
| reg_alpha=0.5, reg_lambda=2.0, |
| random_state=42, n_jobs=-1, verbose=-1 |
| ), |
| 'CatBoost': CatBoostClassifier( |
| iterations=500, depth=4, learning_rate=0.05, |
| l2_leaf_reg=5.0, random_seed=42, verbose=0 |
| ) |
| } |
|
|
| oof_predictions = {} |
| feature_importances = {} |
|
|
| for name, model in models.items(): |
| print(f"\n--- {name} ---") |
| oof_probs = np.zeros(len(y)) |
| |
| for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)): |
| X_train, X_val = X[train_idx], X[val_idx] |
| y_train, y_val = y[train_idx], y[val_idx] |
| |
| if name == 'CatBoost': |
| model.fit(X_train, y_train, eval_set=(X_val, y_val)) |
| else: |
| model.fit(X_train, y_train, eval_set=[(X_val, y_val)]) |
| |
| oof_probs[val_idx] = model.predict_proba(X_val)[:, 1] |
| |
| oof_predictions[name] = oof_probs |
| |
| auc = roc_auc_score(y, oof_probs) |
| y_pred = (oof_probs >= 0.5).astype(int) |
| acc = accuracy_score(y, y_pred) |
| f1 = f1_score(y, y_pred) |
| prec = precision_score(y, y_pred) |
| rec = recall_score(y, y_pred) |
| |
| print(f" 5-Fold AUC: {auc:.4f}") |
| print(f" Accuracy: {acc:.4f}") |
| print(f" F1: {f1:.4f}") |
| print(f" Precision: {prec:.4f}") |
| print(f" Recall: {rec:.4f}") |
|
|
| |
| oof_ensemble = 0.4 * oof_predictions['XGBoost'] + 0.35 * oof_predictions['LightGBM'] + 0.25 * oof_predictions['CatBoost'] |
| oof_predictions['Ensemble'] = oof_ensemble |
|
|
| ens_auc = roc_auc_score(y, oof_ensemble) |
| y_pred_ens = (oof_ensemble >= 0.5).astype(int) |
| ens_acc = accuracy_score(y, y_pred_ens) |
| ens_f1 = f1_score(y, y_pred_ens) |
|
|
| print(f"\n--- Ensemble ---") |
| print(f" 5-Fold AUC: {ens_auc:.4f}") |
| print(f" Accuracy: {ens_acc:.4f}") |
| print(f" F1: {ens_f1:.4f}") |
|
|
| |
| best_name = max(oof_predictions.keys(), key=lambda k: roc_auc_score(y, oof_predictions[k])) |
| best_probs = oof_predictions[best_name] |
| y_pred_best = (best_probs >= 0.5).astype(int) |
|
|
| print(f"\nBest Model: {best_name}") |
| print("\nClassification Report:") |
| print(classification_report(y, y_pred_best, target_names=['Married', 'Divorced'])) |
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("Step 5: Training Final Divorce Risk Scorer on Full Data") |
| print("=" * 70) |
|
|
| |
| final_xgb = XGBClassifier( |
| n_estimators=800, max_depth=4, learning_rate=0.05, |
| colsample_bytree=0.8, subsample=0.8, min_child_weight=2, |
| gamma=0.2, reg_alpha=0.5, reg_lambda=2.0, |
| use_label_encoder=False, eval_metric='auc', |
| tree_method='hist', random_state=42, n_jobs=-1 |
| ) |
| final_xgb.fit(X, y) |
|
|
| final_lgb = LGBMClassifier( |
| n_estimators=800, max_depth=4, learning_rate=0.05, |
| colsample_bytree=0.8, subsample=0.8, min_child_samples=5, |
| reg_alpha=0.5, reg_lambda=2.0, |
| random_state=42, n_jobs=-1, verbose=-1 |
| ) |
| final_lgb.fit(X, y) |
|
|
| final_cat = CatBoostClassifier( |
| iterations=800, depth=4, learning_rate=0.05, |
| l2_leaf_reg=5.0, random_seed=42, verbose=0 |
| ) |
| final_cat.fit(X, y) |
|
|
| |
| joblib.dump(final_xgb, f"{OUTPUT_DIR}/divorce_xgb.joblib") |
| joblib.dump(final_lgb, f"{OUTPUT_DIR}/divorce_lgb.joblib") |
| final_cat.save_model(f"{OUTPUT_DIR}/divorce_cat.cbm") |
| joblib.dump(all_features, f"{OUTPUT_DIR}/divorce_features.joblib") |
| joblib.dump(gottman_mapping, f"{OUTPUT_DIR}/gottman_mapping.joblib") |
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("Step 6: SHAP Analysis — What Drives Divorce?") |
| print("=" * 70) |
|
|
| explainer = shap.TreeExplainer(final_xgb) |
| X_df = pd.DataFrame(X, columns=all_features) |
| shap_values = explainer.shap_values(X_df) |
|
|
| |
| mean_shap = np.abs(shap_values).mean(axis=0) |
| shap_importance = pd.DataFrame({ |
| 'feature': all_features, |
| 'mean_abs_shap': mean_shap |
| }).sort_values('mean_abs_shap', ascending=False) |
|
|
| print("\nTop 25 Most Important Divorce Predictors (SHAP):") |
| for i, row in shap_importance.head(25).iterrows(): |
| dim = gottman_mapping.get(row['feature'], 'engineered') |
| print(f" {row['feature']:40s} SHAP={row['mean_abs_shap']:.4f} [{dim}]") |
|
|
| shap_importance.to_csv(f"{OUTPUT_DIR}/divorce_shap_importance.csv", index=False) |
|
|
| |
| fig, ax = plt.subplots(figsize=(12, 12)) |
| shap.summary_plot(shap_values, X_df, max_display=30, show=False) |
| plt.tight_layout() |
| plt.savefig(f"{OUTPUT_DIR}/figures/divorce_shap_summary.png", dpi=150, bbox_inches='tight') |
| plt.close() |
|
|
| |
| print("\n\nDivorce Risk by Gottman Dimension (mean SHAP):") |
| dim_shap = {} |
| for feat, shap_val in zip(all_features, mean_shap): |
| dim = gottman_mapping.get(feat, 'engineered') |
| if dim not in dim_shap: |
| dim_shap[dim] = [] |
| dim_shap[dim].append(shap_val) |
|
|
| for dim in sorted(dim_shap.keys(), key=lambda d: np.mean(dim_shap[d]), reverse=True): |
| vals = dim_shap[dim] |
| print(f" {dim:25s} mean_SHAP={np.mean(vals):.4f} (n={len(vals)} features)") |
|
|
| |
| |
| |
| print("\n" + "=" * 70) |
| print("Step 7: Creating Gottman Risk Scoring Function") |
| print("=" * 70) |
|
|
| |
| |
| |
|
|
| |
| gottman_recipe = { |
| 'dimensions': {dim: cols for dim, cols in dimensions.items()}, |
| 'horsemen_questions': horsemen_cols, |
| 'positive_questions': positive_cols, |
| 'composite_features': gottman_features, |
| 'all_features': all_features, |
| 'model_performance': { |
| 'xgb_auc': float(roc_auc_score(y, oof_predictions['XGBoost'])), |
| 'lgb_auc': float(roc_auc_score(y, oof_predictions['LightGBM'])), |
| 'cat_auc': float(roc_auc_score(y, oof_predictions['CatBoost'])), |
| 'ensemble_auc': float(roc_auc_score(y, oof_predictions['Ensemble'])), |
| 'ensemble_acc': float(ens_acc), |
| 'ensemble_f1': float(ens_f1), |
| }, |
| 'top_predictors': shap_importance.head(20)[['feature', 'mean_abs_shap']].to_dict('records'), |
| 'dimension_importance': {dim: float(np.mean(vals)) for dim, vals in dim_shap.items()} |
| } |
|
|
| with open(f"{OUTPUT_DIR}/gottman_recipe.json", "w") as f: |
| json.dump(gottman_recipe, f, indent=2) |
|
|
| |
| fig, ax = plt.subplots(figsize=(7, 6)) |
| cm = confusion_matrix(y, y_pred_best) |
| sns.heatmap(cm, annot=True, fmt='d', cmap='Reds', |
| xticklabels=['Married', 'Divorced'], |
| yticklabels=['Married', 'Divorced'], ax=ax) |
| ax.set_xlabel('Predicted', fontsize=12) |
| ax.set_ylabel('Actual', fontsize=12) |
| ax.set_title(f'Divorce Predictor — {best_name} Confusion Matrix', fontsize=14) |
| plt.tight_layout() |
| plt.savefig(f"{OUTPUT_DIR}/figures/divorce_confusion_matrix.png", dpi=150, bbox_inches='tight') |
| plt.close() |
|
|
| |
| fig, ax = plt.subplots(figsize=(10, 6)) |
| dim_sorted = sorted(dim_shap.items(), key=lambda x: np.mean(x[1]), reverse=True) |
| dims_names = [d[0] for d in dim_sorted] |
| dims_vals = [np.mean(d[1]) for d in dim_sorted] |
| colors = ['#d32f2f' if d in horsemen_dims else '#1976d2' for d in dims_names] |
| ax.barh(range(len(dims_names)), dims_vals, color=colors) |
| ax.set_yticks(range(len(dims_names))) |
| ax.set_yticklabels(dims_names, fontsize=10) |
| ax.set_xlabel('Mean |SHAP| Value', fontsize=12) |
| ax.set_title('Divorce Risk by Gottman Dimension\n(Red=Horsemen, Blue=Positive)', fontsize=14) |
| ax.invert_yaxis() |
| plt.tight_layout() |
| plt.savefig(f"{OUTPUT_DIR}/figures/gottman_dimension_importance.png", dpi=150, bbox_inches='tight') |
| plt.close() |
|
|
| print("\nPhase 1 Complete!") |
| print(f" Output directory: {OUTPUT_DIR}") |
| print(f" Models: divorce_xgb.joblib, divorce_lgb.joblib, divorce_cat.cbm") |
| print(f" Recipe: gottman_recipe.json") |
| print(f" Figures: {OUTPUT_DIR}/figures/") |
|
|