Upload phase2_marriage_duration.py with huggingface_hub

Browse files

Files changed (1) hide show

phase2_marriage_duration.py +623 -0

phase2_marriage_duration.py ADDED Viewed

	@@ -0,0 +1,623 @@

+"""
+Phase 2: Marriage Duration Longitudinal Model
+==============================================
+Dataset: vedastro-org/15000-Famous-People-Marriage-Divorce-Info
+         15,807 famous people → 18,148 marriage records
+Goal: Extract marriage duration statistics, build survival model,
+      and create longevity prior features (base rates) that can
+      augment the main relationship predictor.
+"""
+import os
+import json
+import warnings
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import seaborn as sns
+from datasets import load_dataset
+from datetime import datetime
+import re
+import joblib
+warnings.filterwarnings('ignore')
+np.random.seed(42)
+OUTPUT_DIR = "/app/phase2_output"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+os.makedirs(f"{OUTPUT_DIR}/figures", exist_ok=True)
+# ============================================================
+# 1. LOAD AND PARSE VEDASTRO DATASET
+# ============================================================
+print("=" * 70)
+print("PHASE 2: MARRIAGE DURATION LONGITUDINAL MODEL")
+print("=" * 70)
+print("\nStep 1: Loading Vedastro marriage records...")
+ds = load_dataset("vedastro-org/15000-Famous-People-Marriage-Divorce-Info", split="train")
+print(f"  Raw records: {len(ds)}")
+# Parse the JSON info field
+records = []
+parse_errors = 0
+for row in ds:
+    try:
+        info = json.loads(row['Info'])
+        person_key = row['PartitionKey']
+        # Extract birth year from key (format: Name+Year)
+        birth_year_match = re.search(r'(\d{4})$', person_key)
+        birth_year = int(birth_year_match.group(1)) if birth_year_match else None
+        marriages = info.get('marriages', [])
+        if not marriages:
+            continue
+        for m_idx, m in enumerate(marriages):
+            record = {
+                'person': person_key,
+                'birth_year': birth_year,
+                'marriage_idx': m_idx,
+                'marriage_type': m.get('type', ''),
+                'spouse': m.get('spouse', ''),
+                'marriage_date_raw': m.get('marriageDate', ''),
+                'divorce_date_raw': m.get('divorceDate', ''),
+                'outcome': m.get('outcome', ''),
+                'data_credibility': m.get('dataCredibility', ''),
+            }
+            records.append(record)
+    except (json.JSONDecodeError, KeyError, TypeError) as e:
+        parse_errors += 1
+df = pd.DataFrame(records)
+print(f"  Parsed marriage records: {len(df)}")
+print(f"  Parse errors: {parse_errors}")
+# ============================================================
+# 2. DATA AUDIT
+# ============================================================
+print("\n" + "=" * 70)
+print("Step 2: Data Audit")
+print("=" * 70)
+print(f"\nOutcome distribution:")
+print(df['outcome'].value_counts())
+print(f"\nMarriage type distribution:")
+print(df['marriage_type'].value_counts())
+print(f"\nData credibility:")
+print(df['data_credibility'].value_counts())
+print(f"\nDivorce date values (sample):")
+print(df['divorce_date_raw'].value_counts().head(20))
+# ============================================================
+# 3. PARSE DATES & COMPUTE DURATION
+# ============================================================
+print("\n" + "=" * 70)
+print("Step 3: Parsing Dates & Computing Marriage Duration")
+print("=" * 70)
+def parse_date(date_str):
+    """Parse various date formats to year (float)."""
+    if not date_str or pd.isna(date_str):
+        return None
+    date_str = str(date_str).strip()
+    # Skip non-date values
+    skip_vals = ['Not Applicable', 'N/A', 'None', '', 'Unknown', 'not applicable',
+                 'Present', 'present', 'Current', 'current', 'Still Married',
+                 'still married', 'Ongoing']
+    if date_str in skip_vals:
+        return None
+    # Try various formats
+    # "1990" → year only
+    if re.match(r'^\d{4}$', date_str):
+        return int(date_str)
+    # "31/08/1921" or "08/31/1921" → DD/MM/YYYY or MM/DD/YYYY
+    match = re.match(r'^(\d{1,2})/(\d{1,2})/(\d{4})$', date_str)
+    if match:
+        return int(match.group(3))
+    # "August 1990" or "Aug 1990"
+    match = re.match(r'^[A-Za-z]+\s+(\d{4})$', date_str)
+    if match:
+        return int(match.group(1))
+    # "1990-01-01" ISO format
+    match = re.match(r'^(\d{4})-\d{2}-\d{2}', date_str)
+    if match:
+        return int(match.group(1))
+    # Just try to find a 4-digit year anywhere
+    match = re.search(r'(\d{4})', date_str)
+    if match:
+        year = int(match.group(1))
+        if 1800 <= year <= 2030:
+            return year
+    return None
+df['marriage_year'] = df['marriage_date_raw'].apply(parse_date)
+df['divorce_year'] = df['divorce_date_raw'].apply(parse_date)
+# Determine if divorced
+df['is_divorced'] = False
+# Explicit divorce date
+df.loc[df['divorce_year'].notna(), 'is_divorced'] = True
+# Outcome-based
+divorce_outcomes = ['Dissolution', 'dissolution', 'Tragedy']
+df.loc[df['outcome'].isin(divorce_outcomes), 'is_divorced'] = True
+# Compute duration
+# For divorced: divorce_year - marriage_year
+# For not divorced: use 2024 as censoring date (or death year if available)
+CENSOR_YEAR = 2024
+df['duration_years'] = np.nan
+# Divorced with both dates
+mask_divorced = df['is_divorced'] & df['marriage_year'].notna() & df['divorce_year'].notna()
+df.loc[mask_divorced, 'duration_years'] = df.loc[mask_divorced, 'divorce_year'] - df.loc[mask_divorced, 'marriage_year']
+# Not divorced (censored)
+mask_censored = ~df['is_divorced'] & df['marriage_year'].notna()
+df.loc[mask_censored, 'duration_years'] = CENSOR_YEAR - df.loc[mask_censored, 'marriage_year']
+# Remove impossible durations
+df.loc[df['duration_years'] < 0, 'duration_years'] = np.nan
+df.loc[df['duration_years'] > 100, 'duration_years'] = np.nan
+print(f"\nDate parsing results:")
+print(f"  Records with marriage year: {df['marriage_year'].notna().sum()}")
+print(f"  Records with divorce year: {df['divorce_year'].notna().sum()}")
+print(f"  Is divorced (total): {df['is_divorced'].sum()}")
+print(f"  Records with valid duration: {df['duration_years'].notna().sum()}")
+# Filter to high-credibility records with valid data
+df_valid = df[df['duration_years'].notna() & (df['duration_years'] >= 0)].copy()
+print(f"\nValid records for analysis: {len(df_valid)}")
+print(f"\nDuration statistics (years):")
+print(df_valid['duration_years'].describe())
+print(f"\nDuration by outcome:")
+for outcome in df_valid['outcome'].unique():
+    subset = df_valid[df_valid['outcome'] == outcome]
+    if len(subset) > 10:
+        print(f"  {outcome}: n={len(subset)}, mean={subset['duration_years'].mean():.1f}, "
+              f"median={subset['duration_years'].median():.1f}, std={subset['duration_years'].std():.1f}")
+print(f"\nDuration by marriage type:")
+for mtype in df_valid['marriage_type'].unique():
+    subset = df_valid[df_valid['marriage_type'] == mtype]
+    if len(subset) > 10:
+        print(f"  {mtype}: n={len(subset)}, mean={subset['duration_years'].mean():.1f}, "
+              f"median={subset['duration_years'].median():.1f}, divorced={subset['is_divorced'].mean():.1%}")
+# ============================================================
+# 4. FEATURE ENGINEERING FOR SURVIVAL MODEL
+# ============================================================
+print("\n" + "=" * 70)
+print("Step 4: Feature Engineering for Survival Model")
+print("=" * 70)
+# Marriage era
+df_valid['marriage_era'] = pd.cut(
+    df_valid['marriage_year'],
+    bins=[0, 1900, 1950, 1970, 1990, 2000, 2010, 2030],
+    labels=['pre_1900', '1900_1950', '1950_1970', '1970_1990', '1990_2000', '2000_2010', '2010_plus']
+)
+# Marriage type encoding
+df_valid['is_love_marriage'] = (df_valid['marriage_type'] == 'Love').astype(int)
+df_valid['is_arranged_marriage'] = (df_valid['marriage_type'] == 'Arranged').astype(int)
+# Age at marriage (if birth year available)
+df_valid['age_at_marriage'] = np.nan
+mask = df_valid['birth_year'].notna() & df_valid['marriage_year'].notna()
+df_valid.loc[mask, 'age_at_marriage'] = df_valid.loc[mask, 'marriage_year'] - df_valid.loc[mask, 'birth_year']
+# Filter out unreasonable ages
+df_valid.loc[(df_valid['age_at_marriage'] < 14) | (df_valid['age_at_marriage'] > 80), 'age_at_marriage'] = np.nan
+# Marriage number (first, second, etc.)
+df_valid['marriage_number'] = df_valid['marriage_idx'] + 1
+df_valid['is_first_marriage'] = (df_valid['marriage_idx'] == 0).astype(int)
+# Outcome encoding
+outcome_map = {
+    'Happiness': 0, 'happiness': 0,
+    'Dissolution': 1, 'dissolution': 1,
+    'Struggle': 2, 'struggle': 2,
+    'Tragedy': 3, 'tragedy': 3,
+}
+df_valid['outcome_code'] = df_valid['outcome'].map(outcome_map).fillna(-1).astype(int)
+print(f"\nAge at marriage statistics:")
+print(df_valid['age_at_marriage'].describe())
+print(f"\nMarriage era distribution:")
+print(df_valid['marriage_era'].value_counts())
+print(f"\nFirst marriage vs subsequent:")
+print(f"  First:      n={df_valid['is_first_marriage'].sum()}, "
+      f"divorce_rate={df_valid[df_valid['is_first_marriage']==1]['is_divorced'].mean():.1%}")
+print(f"  Subsequent: n={(~df_valid['is_first_marriage'].astype(bool)).sum()}, "
+      f"divorce_rate={df_valid[df_valid['is_first_marriage']==0]['is_divorced'].mean():.1%}")
+# ============================================================
+# 5. SURVIVAL ANALYSIS — KAPLAN-MEIER + COX PH
+# ============================================================
+print("\n" + "=" * 70)
+print("Step 5: Survival Analysis")
+print("=" * 70)
+# Install lifelines for survival analysis
+import subprocess
+subprocess.run(['pip', 'install', 'lifelines', '-q'], capture_output=True)
+from lifelines import KaplanMeierFitter, CoxPHFitter
+from lifelines.statistics import logrank_test
+# Prepare survival data
+surv_df = df_valid[['duration_years', 'is_divorced', 'is_love_marriage', 'is_arranged_marriage',
+                     'age_at_marriage', 'marriage_number', 'is_first_marriage', 'marriage_year']].dropna(subset=['duration_years'])
+# Convert is_divorced to int for lifelines
+surv_df['event'] = surv_df['is_divorced'].astype(int)
+print(f"\nSurvival dataset: {len(surv_df)} records")
+print(f"  Events (divorces): {surv_df['event'].sum()}")
+print(f"  Censored (ongoing): {(surv_df['event'] == 0).sum()}")
+# --- Kaplan-Meier ---
+kmf = KaplanMeierFitter()
+# Overall survival
+kmf.fit(surv_df['duration_years'], event_observed=surv_df['event'])
+print(f"\nOverall Marriage Survival Estimates:")
+for t in [5, 10, 15, 20, 25, 30, 40, 50]:
+    if t <= kmf.survival_function_.index.max():
+        surv = kmf.predict(t)
+        print(f"  {t:3d} years: {surv:.1%} still married")
+median_survival = kmf.median_survival_time_
+print(f"  Median survival: {median_survival:.1f} years")
+# Plot overall KM curve
+fig, ax = plt.subplots(figsize=(10, 7))
+kmf.plot_survival_function(ax=ax, label='All Marriages', ci_show=True)
+ax.set_xlabel('Years Since Marriage', fontsize=12)
+ax.set_ylabel('Probability Still Married', fontsize=12)
+ax.set_title('Marriage Survival Curve (Kaplan-Meier)\n15,000+ Famous People', fontsize=14)
+ax.set_xlim(0, 60)
+ax.grid(True, alpha=0.3)
+plt.tight_layout()
+plt.savefig(f"{OUTPUT_DIR}/figures/km_overall.png", dpi=150, bbox_inches='tight')
+plt.close()
+# --- KM by Marriage Type ---
+fig, ax = plt.subplots(figsize=(10, 7))
+for mtype, label, color in [('Love', 'Love Marriage', '#e74c3c'),
+                              ('Arranged', 'Arranged Marriage', '#3498db')]:
+    mask = df_valid['marriage_type'] == mtype
+    subset = df_valid[mask & df_valid['duration_years'].notna()].copy()
+    if len(subset) > 50:
+        kmf_sub = KaplanMeierFitter()
+        kmf_sub.fit(subset['duration_years'], event_observed=subset['is_divorced'].astype(int))
+        kmf_sub.plot_survival_function(ax=ax, label=f'{label} (n={len(subset)})', ci_show=True, color=color)
+ax.set_xlabel('Years Since Marriage', fontsize=12)
+ax.set_ylabel('Probability Still Married', fontsize=12)
+ax.set_title('Marriage Survival by Type: Love vs Arranged', fontsize=14)
+ax.set_xlim(0, 60)
+ax.legend(fontsize=11)
+ax.grid(True, alpha=0.3)
+plt.tight_layout()
+plt.savefig(f"{OUTPUT_DIR}/figures/km_by_type.png", dpi=150, bbox_inches='tight')
+plt.close()
+# Log-rank test: Love vs Arranged
+love_mask = df_valid['marriage_type'] == 'Love'
+arranged_mask = df_valid['marriage_type'] == 'Arranged'
+love_data = df_valid[love_mask & df_valid['duration_years'].notna()]
+arranged_data = df_valid[arranged_mask & df_valid['duration_years'].notna()]
+if len(love_data) > 50 and len(arranged_data) > 50:
+    lr_result = logrank_test(
+        love_data['duration_years'], arranged_data['duration_years'],
+        event_observed_A=love_data['is_divorced'].astype(int),
+        event_observed_B=arranged_data['is_divorced'].astype(int)
+    )
+    print(f"\nLog-rank test (Love vs Arranged):")
+    print(f"  Test statistic: {lr_result.test_statistic:.4f}")
+    print(f"  p-value: {lr_result.p_value:.6f}")
+    print(f"  Significant: {'YES' if lr_result.p_value < 0.05 else 'NO'}")
+# --- KM by Marriage Era ---
+fig, ax = plt.subplots(figsize=(10, 7))
+colors_era = plt.cm.viridis(np.linspace(0.2, 0.9, len(df_valid['marriage_era'].dropna().unique())))
+for era, color in zip(sorted(df_valid['marriage_era'].dropna().unique()), colors_era):
+    subset = df_valid[(df_valid['marriage_era'] == era) & df_valid['duration_years'].notna()]
+    if len(subset) > 30:
+        kmf_era = KaplanMeierFitter()
+        kmf_era.fit(subset['duration_years'], event_observed=subset['is_divorced'].astype(int))
+        kmf_era.plot_survival_function(ax=ax, label=f'{era} (n={len(subset)})', ci_show=False, color=color)
+ax.set_xlabel('Years Since Marriage', fontsize=12)
+ax.set_ylabel('Probability Still Married', fontsize=12)
+ax.set_title('Marriage Survival by Era', fontsize=14)
+ax.set_xlim(0, 60)
+ax.legend(fontsize=9)
+ax.grid(True, alpha=0.3)
+plt.tight_layout()
+plt.savefig(f"{OUTPUT_DIR}/figures/km_by_era.png", dpi=150, bbox_inches='tight')
+plt.close()
+# --- KM by First vs Subsequent Marriage ---
+fig, ax = plt.subplots(figsize=(10, 7))
+for is_first, label, color in [(1, 'First Marriage', '#2ecc71'), (0, 'Subsequent Marriage', '#e67e22')]:
+    subset = df_valid[(df_valid['is_first_marriage'] == is_first) & df_valid['duration_years'].notna()]
+    if len(subset) > 50:
+        kmf_m = KaplanMeierFitter()
+        kmf_m.fit(subset['duration_years'], event_observed=subset['is_divorced'].astype(int))
+        kmf_m.plot_survival_function(ax=ax, label=f'{label} (n={len(subset)})', ci_show=True, color=color)
+ax.set_xlabel('Years Since Marriage', fontsize=12)
+ax.set_ylabel('Probability Still Married', fontsize=12)
+ax.set_title('Marriage Survival: First vs Subsequent Marriage', fontsize=14)
+ax.set_xlim(0, 60)
+ax.legend(fontsize=11)
+ax.grid(True, alpha=0.3)
+plt.tight_layout()
+plt.savefig(f"{OUTPUT_DIR}/figures/km_by_marriage_number.png", dpi=150, bbox_inches='tight')
+plt.close()
+# --- Cox Proportional Hazards ---
+print("\n" + "-" * 50)
+print("Cox Proportional Hazards Model")
+print("-" * 50)
+cox_df = surv_df.dropna(subset=['age_at_marriage']).copy()
+cox_df = cox_df[['duration_years', 'event', 'is_love_marriage', 'age_at_marriage',
+                  'marriage_number', 'is_first_marriage']].dropna()
+if len(cox_df) > 100:
+    cph = CoxPHFitter()
+    cph.fit(cox_df, duration_col='duration_years', event_col='event')
+    print("\nCox PH Model Summary:")
+    cph.print_summary()
+    # Extract hazard ratios
+    print("\nHazard Ratios (exp(coef)):")
+    for var in cph.summary.index:
+        hr = cph.summary.loc[var, 'exp(coef)']
+        p = cph.summary.loc[var, 'p']
+        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ''
+        print(f"  {var:25s}  HR={hr:.4f}  p={p:.4f} {sig}")
+    # Save Cox model
+    cox_summary = cph.summary.to_dict()
+    # Plot hazard ratios
+    fig, ax = plt.subplots(figsize=(8, 5))
+    cph.plot(ax=ax)
+    ax.set_title('Cox PH — Hazard Ratios for Divorce', fontsize=14)
+    plt.tight_layout()
+    plt.savefig(f"{OUTPUT_DIR}/figures/cox_hazard_ratios.png", dpi=150, bbox_inches='tight')
+    plt.close()
+else:
+    print(f"  Insufficient data for Cox PH: {len(cox_df)} records")
+    cox_summary = {}
+# ============================================================
+# 6. DURATION DISTRIBUTION ANALYSIS
+# ============================================================
+print("\n" + "=" * 70)
+print("Step 6: Duration Distribution Analysis")
+print("=" * 70)
+# Duration distribution for divorced couples only
+divorced_durations = df_valid[df_valid['is_divorced']]['duration_years'].dropna()
+print(f"\nDivorced couples duration statistics:")
+print(f"  Count: {len(divorced_durations)}")
+print(f"  Mean: {divorced_durations.mean():.1f} years")
+print(f"  Median: {divorced_durations.median():.1f} years")
+print(f"  Std: {divorced_durations.std():.1f} years")
+print(f"  Mode: {divorced_durations.mode().values[0]:.0f} years")
+# Most dangerous periods
+print(f"\nDivorce Timing (when do marriages end?):")
+for period, label in [(range(0, 3), '0-2 years (honeymoon crisis)'),
+                       (range(3, 8), '3-7 years (seven year itch)'),
+                       (range(8, 15), '8-14 years (mid-life)'),
+                       (range(15, 25), '15-24 years (empty nest)'),
+                       (range(25, 100), '25+ years (late divorce)')]:
+    count = divorced_durations[divorced_durations.isin(period)].count()
+    pct = count / len(divorced_durations) * 100
+    print(f"  {label}: {count} ({pct:.1f}%)")
+# Plot divorce duration histogram
+fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+axes[0].hist(divorced_durations[divorced_durations <= 60], bins=60, color='#e74c3c', alpha=0.7, edgecolor='white')
+axes[0].set_xlabel('Marriage Duration (years)', fontsize=12)
+axes[0].set_ylabel('Count', fontsize=12)
+axes[0].set_title('Distribution of Divorce Timing', fontsize=14)
+axes[0].axvline(x=divorced_durations.median(), color='black', linestyle='--',
+                label=f'Median: {divorced_durations.median():.0f} years')
+axes[0].axvline(x=7, color='#f39c12', linestyle='--', alpha=0.7, label='7-year itch')
+axes[0].legend()
+# Cumulative divorce risk
+axes[1].hist(divorced_durations[divorced_durations <= 60], bins=60, color='#e74c3c',
+             alpha=0.7, cumulative=True, density=True, edgecolor='white')
+axes[1].set_xlabel('Marriage Duration (years)', fontsize=12)
+axes[1].set_ylabel('Cumulative Proportion of Divorces', fontsize=12)
+axes[1].set_title('Cumulative Divorce Risk', fontsize=14)
+axes[1].axhline(y=0.5, color='grey', linestyle='--', alpha=0.5, label='50% of divorces')
+axes[1].legend()
+plt.tight_layout()
+plt.savefig(f"{OUTPUT_DIR}/figures/divorce_timing.png", dpi=150, bbox_inches='tight')
+plt.close()
+# ============================================================
+# 7. BUILD LONGEVITY PRIOR TABLE
+# ============================================================
+print("\n" + "=" * 70)
+print("Step 7: Building Longevity Prior Table")
+print("=" * 70)
+# Create base rate priors that can be used as features
+priors = {}
+# Overall prior
+priors['overall'] = {
+    'divorce_rate': float(df_valid['is_divorced'].mean()),
+    'mean_duration_if_divorced': float(divorced_durations.mean()),
+    'median_duration_if_divorced': float(divorced_durations.median()),
+    'n': int(len(df_valid))
+}
+# By marriage type
+for mtype in ['Love', 'Arranged']:
+    subset = df_valid[df_valid['marriage_type'] == mtype]
+    if len(subset) > 20:
+        divorced_sub = subset[subset['is_divorced']]['duration_years'].dropna()
+        priors[f'type_{mtype.lower()}'] = {
+            'divorce_rate': float(subset['is_divorced'].mean()),
+            'mean_duration_if_divorced': float(divorced_sub.mean()) if len(divorced_sub) > 0 else None,
+            'median_duration_if_divorced': float(divorced_sub.median()) if len(divorced_sub) > 0 else None,
+            'n': int(len(subset))
+        }
+# By era
+for era in df_valid['marriage_era'].dropna().unique():
+    subset = df_valid[df_valid['marriage_era'] == era]
+    if len(subset) > 20:
+        divorced_sub = subset[subset['is_divorced']]['duration_years'].dropna()
+        priors[f'era_{era}'] = {
+            'divorce_rate': float(subset['is_divorced'].mean()),
+            'mean_duration_if_divorced': float(divorced_sub.mean()) if len(divorced_sub) > 0 else None,
+            'n': int(len(subset))
+        }
+# By marriage number
+for num in [1, 2, 3]:
+    subset = df_valid[df_valid['marriage_number'] == num]
+    if len(subset) > 20:
+        divorced_sub = subset[subset['is_divorced']]['duration_years'].dropna()
+        ordinal = {1: 'first', 2: 'second', 3: 'third'}[num]
+        priors[f'marriage_{ordinal}'] = {
+            'divorce_rate': float(subset['is_divorced'].mean()),
+            'mean_duration_if_divorced': float(divorced_sub.mean()) if len(divorced_sub) > 0 else None,
+            'n': int(len(subset))
+        }
+# By age at marriage (buckets)
+for low, high, label in [(14, 22, 'young'), (22, 30, 'prime'), (30, 40, 'mature'), (40, 80, 'late')]:
+    subset = df_valid[(df_valid['age_at_marriage'] >= low) & (df_valid['age_at_marriage'] < high)]
+    if len(subset) > 20:
+        divorced_sub = subset[subset['is_divorced']]['duration_years'].dropna()
+        priors[f'age_at_marriage_{label}'] = {
+            'divorce_rate': float(subset['is_divorced'].mean()),
+            'mean_duration_if_divorced': float(divorced_sub.mean()) if len(divorced_sub) > 0 else None,
+            'age_range': f'{low}-{high}',
+            'n': int(len(subset))
+        }
+print("\nLongevity Prior Table:")
+for key, val in priors.items():
+    print(f"  {key:35s}  divorce_rate={val['divorce_rate']:.1%}  n={val['n']}")
+# Save priors
+with open(f"{OUTPUT_DIR}/longevity_priors.json", "w") as f:
+    json.dump(priors, f, indent=2)
+# ============================================================
+# 8. CREATE SURVIVAL RISK SCORING FUNCTION
+# ============================================================
+print("\n" + "=" * 70)
+print("Step 8: Creating Survival Risk Scoring Function")
+print("=" * 70)
+# Save the survival model components
+survival_recipe = {
+    'priors': priors,
+    'km_overall_median_survival': float(median_survival) if not pd.isna(median_survival) else None,
+    'cox_summary': {k: {kk: float(vv) if isinstance(vv, (np.floating, float)) else vv
+                        for kk, vv in v.items()}
+                   for k, v in cox_summary.items()} if cox_summary else {},
+    'divorce_timing': {
+        'honeymoon_crisis_0_2yr': float(divorced_durations[divorced_durations < 3].count() / len(divorced_durations)),
+        'seven_year_itch_3_7yr': float(divorced_durations[(divorced_durations >= 3) & (divorced_durations < 8)].count() / len(divorced_durations)),
+        'midlife_8_14yr': float(divorced_durations[(divorced_durations >= 8) & (divorced_durations < 15)].count() / len(divorced_durations)),
+        'empty_nest_15_24yr': float(divorced_durations[(divorced_durations >= 15) & (divorced_durations < 25)].count() / len(divorced_durations)),
+        'late_divorce_25yr_plus': float(divorced_durations[divorced_durations >= 25].count() / len(divorced_durations)),
+    },
+    'key_findings': {
+        'love_vs_arranged_divorce_rate': {
+            'love': float(love_data['is_divorced'].mean()) if len(love_data) > 0 else None,
+            'arranged': float(arranged_data['is_divorced'].mean()) if len(arranged_data) > 0 else None,
+        },
+        'first_vs_subsequent_divorce_rate': {
+            'first': float(df_valid[df_valid['is_first_marriage']==1]['is_divorced'].mean()),
+            'subsequent': float(df_valid[df_valid['is_first_marriage']==0]['is_divorced'].mean()),
+        },
+    },
+    'dataset_stats': {
+        'total_records': int(len(df_valid)),
+        'total_divorces': int(df_valid['is_divorced'].sum()),
+        'total_people': int(df_valid['person'].nunique()),
+    }
+}
+with open(f"{OUTPUT_DIR}/survival_recipe.json", "w") as f:
+    json.dump(survival_recipe, f, indent=2, default=str)
+# Save processed dataframe for integration
+df_valid.to_csv(f"{OUTPUT_DIR}/processed_marriages.csv", index=False)
+print("\nPhase 2 Complete!")
+print(f"  Output directory: {OUTPUT_DIR}")
+print(f"  Longevity priors: longevity_priors.json")
+print(f"  Survival recipe: survival_recipe.json")
+print(f"  Processed data: processed_marriages.csv")
+print(f"  Figures: {OUTPUT_DIR}/figures/")
+# ============================================================
+# FINAL SUMMARY
+# ============================================================
+print("\n" + "=" * 70)
+print("PHASE 2 FINAL SUMMARY")
+print("=" * 70)
+print(f"""
+Dataset: vedastro-org/15000-Famous-People-Marriage-Divorce-Info
+  - {len(df_valid)} valid marriage records
+  - {df_valid['person'].nunique()} unique people
+  - {df_valid['is_divorced'].sum()} divorces ({df_valid['is_divorced'].mean():.1%} divorce rate)
+Key Findings:
+  1. Median marriage survival: {median_survival:.0f}+ years
+  2. Most divorces happen within the first {divorced_durations.median():.0f} years
+  3. Love marriages: {love_data['is_divorced'].mean():.1%} divorce rate
+  4. Arranged marriages: {arranged_data['is_divorced'].mean():.1%} divorce rate
+  5. First marriages: {df_valid[df_valid['is_first_marriage']==1]['is_divorced'].mean():.1%} divorce rate
+  6. Subsequent marriages: {df_valid[df_valid['is_first_marriage']==0]['is_divorced'].mean():.1%} divorce rate
+Outputs:
+  - Kaplan-Meier survival curves (overall, by type, by era, by marriage #)
+  - Cox PH model (hazard ratios for each factor)
+  - Longevity prior table (base rates by type/era/age/marriage#)
+  - Full survival recipe for integration
+""")