Spaces:

Esvanth
/

EcoCartAI

Sleeping

File size: 10,166 Bytes

0ed43fe

"""
EcoCart Customer Segmentation — Bias Detection & Mitigation
Task 2 — Demonstrates urban-rural bias in K-Means segmentation and
          mitigates it by oversampling, feature adjustment and post-processing.

NCI MSCAI | Fundamentals of AI TABA 2026

Run:  python3 task2_segmentation.py
Out:  output/bias_before_after.png, output/disparate_impact.png
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Single module-level generator (seed 42) shared by every draw below.
RNG = np.random.default_rng(42)


# ── 1. Generate biased customer data ────────────────────────
# The urban group is larger and is drawn with higher frequency and spend,
# mimicking a platform that launched in cities first.

def generate_biased_data(n_urban=300, n_rural=100):
    """Build a synthetic customer table with a built-in urban/rural skew.

    Args:
        n_urban: Number of urban rows to draw.
        n_rural: Number of rural rows to draw.

    Returns:
        DataFrame with columns ``freq`` (1 decimal), ``spend`` and
        ``recency`` (both rounded to whole numbers) and ``region``
        ("urban" rows first, then "rural"), with a fresh 0..n-1 index.
    """
    def draw_group(label, size, freq_mu, freq_sd, spend_mu, spend_sd, recency_scale):
        # Draws happen in the order freq -> spend -> recency; the shared
        # generator makes that order part of the output.
        freq_vals = RNG.normal(freq_mu, freq_sd, size).clip(0.5)
        spend_vals = RNG.normal(spend_mu, spend_sd, size).clip(10)
        recency_vals = RNG.exponential(recency_scale, size).clip(1, 90)
        return pd.DataFrame({
            "freq": freq_vals,
            "spend": spend_vals,
            "recency": recency_vals,
            "region": label,
        })

    # Urban: higher frequency and spend on average.
    urban_part = draw_group("urban", n_urban, 6.0, 2.0, 120, 40, 10)
    # Rural: lower frequency and spend (platform is newer there).
    rural_part = draw_group("rural", n_rural, 3.0, 1.5, 65, 30, 15)

    customers = pd.concat([urban_part, rural_part], ignore_index=True)
    for column, digits in (("freq", 1), ("spend", 0), ("recency", 0)):
        customers[column] = customers[column].round(digits)
    return customers


# ── 2. Segment with K-Means ────────────────────────────────
def segment(df, features=("freq", "spend", "recency")):
    """Cluster customers into three K-Means groups and label them by spend.

    Args:
        df: DataFrame holding every column named in ``features`` plus a
            ``spend`` column (used to rank the clusters).
        features: Names of the columns that are standardised and fed to
            K-Means. Any iterable of column names is accepted.

    Returns:
        A copy of ``df`` with two extra columns: ``cluster`` (the K-Means
        label) and ``segment`` ("High Value", "Medium" or "Low Value",
        assigned in descending order of each cluster's mean ``spend``).
    """
    # An immutable tuple is the default (no shared mutable default); list()
    # makes pandas treat it as a multi-column selection, not a single key.
    X = StandardScaler().fit_transform(df[list(features)])
    km = KMeans(n_clusters=3, random_state=42, n_init=10)
    df = df.copy()
    df["cluster"] = km.fit_predict(X)

    # Label clusters by mean spend (High/Medium/Low)
    means = df.groupby("cluster")["spend"].mean().sort_values(ascending=False)
    label_map = {means.index[0]: "High Value",
                 means.index[1]: "Medium",
                 means.index[2]: "Low Value"}
    df["segment"] = df["cluster"].map(label_map)
    return df


# ── 3. Bias metrics ────────────────────────────────────────
def compute_fairness(df):
    """Summarise how often each region lands in the "High Value" segment.

    Args:
        df: DataFrame with a ``region`` column ("urban"/"rural") and a
            ``segment`` column.

    Returns:
        Dict with ``urban_high_pct`` and ``rural_high_pct`` (percentages,
        one decimal), ``disparate_impact`` (rural rate / urban rate, three
        decimals; 0 when the urban rate is not positive) and ``fair``
        (whether the unrounded ratio is at least 0.8).
    """
    is_high = df["segment"] == "High Value"
    u_high = is_high[df["region"] == "urban"].mean()
    r_high = is_high[df["region"] == "rural"].mean()

    # Guard the ratio: with no urban "High Value" rows it is reported as 0.
    if u_high > 0:
        di = r_high / u_high
    else:
        di = 0

    report = {}
    report["urban_high_pct"] = round(u_high * 100, 1)
    report["rural_high_pct"] = round(r_high * 100, 1)
    report["disparate_impact"] = round(di, 3)
    report["fair"] = di >= 0.8
    return report


# ── 4. Mitigation: reweigh + balanced re-sample ────────────
def mitigate(df, *, rural_spend_uplift=12, rural_freq_factor=1.5, target_di=0.85):
    """Re-segment the customers with three bias mitigations applied.

    Fix 1: Balance the dataset by oversampling rural customers (with
           replacement) until they match the urban count.
    Fix 2: Cluster on adjusted features: ``adj_spend`` adds
           ``rural_spend_uplift`` to rural spend (the original rationale:
           rural delivery costs ~€12 more, so raw spend understates
           purchase intent) and ``adj_freq`` multiplies rural frequency by
           ``rural_freq_factor`` (rural customers batch orders).
    Fix 3: Post-processing — promote the rural customers with the highest
           ``adj_spend`` that are not yet "High Value" until the rural
           high-value rate reaches ``target_di`` times the urban rate.

    Args:
        df: DataFrame with ``freq``, ``spend``, ``recency`` and ``region``
            columns. Existing ``cluster``/``segment`` columns are overwritten
            in the result; ``df`` itself is not modified.
        rural_spend_uplift: Amount added to rural ``spend``.
        rural_freq_factor: Multiplier applied to rural ``freq``.
        target_di: Disparate-impact ratio the post-processing aims for.

    Returns:
        The balanced DataFrame (urban rows followed by the resampled rural
        rows, re-indexed) with ``adj_spend``, ``adj_freq``, ``cluster`` and
        ``segment`` columns.

    Raises:
        ValueError: If ``df`` has no urban or no rural rows.
    """
    rural = df[df["region"] == "rural"]
    urban = df[df["region"] == "urban"]
    if rural.empty or urban.empty:
        raise ValueError("mitigate() needs at least one urban and one rural customer")

    # Oversample rural to match urban count; ignore_index gives every
    # (possibly duplicated) row its own label for the .loc update below.
    rural_up = rural.sample(n=len(urban), replace=True, random_state=42)
    balanced = pd.concat([urban, rural_up], ignore_index=True)

    rural_mask = balanced["region"] == "rural"
    urban_mask = balanced["region"] == "urban"

    # Vectorised adjustments: only rural rows are changed.
    balanced["adj_spend"] = balanced["spend"].mask(
        rural_mask, balanced["spend"] + rural_spend_uplift)
    balanced["adj_freq"] = balanced["freq"].mask(
        rural_mask, balanced["freq"] * rural_freq_factor)

    # Re-segment on adjusted features
    X = StandardScaler().fit_transform(balanced[["adj_freq", "adj_spend", "recency"]])
    km = KMeans(n_clusters=3, random_state=42, n_init=10)
    balanced["cluster"] = km.fit_predict(X)
    means = balanced.groupby("cluster")["adj_spend"].mean().sort_values(ascending=False)
    label_map = {means.index[0]: "High Value",
                 means.index[1]: "Medium",
                 means.index[2]: "Low Value"}
    balanced["segment"] = balanced["cluster"].map(label_map)

    # Post-processing: work out how many more rural customers must be
    # "High Value" for the rural rate to reach target_di * urban rate.
    urban_high_rate = (balanced.loc[urban_mask, "segment"] == "High Value").mean()
    n_rural = int(rural_mask.sum())
    # Round UP so the target is actually reached (truncating could leave the
    # ratio just below it); the small epsilon absorbs float noise such as
    # 51.00000000000001 so an exact target does not promote one extra row.
    target_rural_high = int(np.ceil(urban_high_rate * target_di * n_rural - 1e-9))
    current_rural_high = int((balanced.loc[rural_mask, "segment"] == "High Value").sum())
    need = target_rural_high - current_rural_high

    if need > 0:
        # Candidates are all rural rows not yet "High Value"; the ones with
        # the highest adjusted spend are promoted, whatever their segment.
        candidates = balanced[rural_mask & (balanced["segment"] != "High Value")]
        if len(candidates) > 0:
            promote = candidates.nlargest(min(need, len(candidates)), "adj_spend").index
            balanced.loc[promote, "segment"] = "High Value"

    return balanced


# ── 5. Plots ────────────────────────────────────────────────
# Scatter colour for each segment label.
SEG_COLORS = {"High Value": "#10b981", "Medium": "#f59e0b", "Low Value": "#ef4444"}


def plot_before_after(before_df, after_df, before_fair, after_fair):
    """Save a two-panel freq-vs-spend scatter of the segments before/after.

    Points are coloured by segment (``SEG_COLORS``) and shaped by region
    (circle = urban, triangle = rural). Each panel title shows the
    disparate-impact value and whether it is flagged fair.

    Args:
        before_df: Segmented DataFrame (``freq``, ``spend``, ``region``,
            ``segment`` columns) prior to mitigation.
        after_df: Segmented DataFrame after mitigation.
        before_fair: Fairness dict for ``before_df`` (needs the
            ``disparate_impact`` and ``fair`` keys).
        after_fair: Fairness dict for ``after_df``.

    Writes ``output/bias_before_after.png``; the ``output`` directory is
    created if it does not exist.
    """
    import os  # local import: only needed to make sure the target dir exists

    fig, axes = plt.subplots(1, 2, figsize=(14, 5.5))
    fig.patch.set_facecolor("#0d1117")

    for ax, df, fair, title in [
        (axes[0], before_df, before_fair, "BEFORE mitigation (biased)"),
        (axes[1], after_df,  after_fair,  "AFTER mitigation (reweighed + adjusted)"),
    ]:
        ax.set_facecolor("#0d1117")
        # Only the left panel carries legend labels (the legend is drawn once).
        with_labels = ax is axes[0]
        for seg in ["High Value", "Medium", "Low Value"]:
            mask = df.segment == seg
            for region, marker in [("urban", "o"), ("rural", "^")]:
                rmask = mask & (df.region == region)
                ax.scatter(df.loc[rmask, "freq"], df.loc[rmask, "spend"],
                           c=SEG_COLORS[seg], marker=marker, s=25, alpha=0.6,
                           label=f"{seg} ({region})" if with_labels else None)
        di = fair["disparate_impact"]
        ax.set_title(f"{title}\nDI = {di:.3f} {'⚠ BIASED' if not fair['fair'] else '✓ FAIR'}",
                     color="white", fontsize=11)
        ax.set_xlabel("Purchase frequency / month", color="white")
        ax.set_ylabel("Avg spend (€)", color="white")
        ax.tick_params(colors="white")
        ax.grid(True, alpha=0.1, color="white")

    axes[0].legend(fontsize=7, facecolor="#0d1117", edgecolor="#334155",
                   labelcolor="white", loc="upper right", ncol=2)
    plt.tight_layout()
    # savefig does not create missing directories; without this a fresh
    # checkout fails with FileNotFoundError.
    os.makedirs("output", exist_ok=True)
    plt.savefig("output/bias_before_after.png", dpi=150,
                bbox_inches="tight", facecolor="#0d1117")
    plt.close()


def plot_di(before_fair, after_fair):
    """Save a grouped bar chart comparing fairness metrics before/after.

    Three bar pairs are drawn: urban high-value %, rural high-value % and
    disparate impact (scaled by 100 so it shares the percentage axis), with
    a dashed line at the 80% disparate-impact threshold.

    Args:
        before_fair: Fairness dict prior to mitigation (needs the
            ``urban_high_pct``, ``rural_high_pct`` and ``disparate_impact``
            keys).
        after_fair: Fairness dict after mitigation.

    Writes ``output/disparate_impact.png``; the ``output`` directory is
    created if it does not exist.
    """
    import os  # local import: only needed to make sure the target dir exists

    fig, ax = plt.subplots(figsize=(8, 4))
    fig.patch.set_facecolor("#0d1117")
    ax.set_facecolor("#0d1117")

    cats = ["Urban → High", "Rural → High", "Disparate Impact"]
    before_vals = [before_fair["urban_high_pct"], before_fair["rural_high_pct"],
                   before_fair["disparate_impact"] * 100]
    after_vals  = [after_fair["urban_high_pct"],  after_fair["rural_high_pct"],
                   after_fair["disparate_impact"] * 100]

    x = range(len(cats))
    w = 0.35  # bar width; each pair is centred on its tick
    ax.bar([i - w/2 for i in x], before_vals, w, label="Before", color="#ef4444", alpha=0.85)
    ax.bar([i + w/2 for i in x], after_vals,  w, label="After",  color="#10b981", alpha=0.85)
    ax.axhline(80, color="#fbbf24", linewidth=1.5, linestyle="--", label="DI threshold (80%)")
    ax.set_xticks(x)
    ax.set_xticklabels(cats, color="white")
    ax.set_ylabel("Percentage", color="white")
    ax.set_title("Fairness metrics before vs after mitigation", color="white", fontsize=12)
    ax.tick_params(colors="white")
    ax.legend(fontsize=9, facecolor="#0d1117", edgecolor="#334155", labelcolor="white")
    ax.grid(True, axis="y", alpha=0.15, color="white")
    plt.tight_layout()
    # savefig does not create missing directories; without this a fresh
    # checkout fails with FileNotFoundError.
    os.makedirs("output", exist_ok=True)
    plt.savefig("output/disparate_impact.png", dpi=150,
                bbox_inches="tight", facecolor="#0d1117")
    plt.close()


# ── 6. Main ─────────────────────────────────────────────────
def _print_fairness(heading, metrics):
    """Print one fairness report (a compute_fairness dict) under ``heading``."""
    print(f"\n{heading}:")
    print(f"  Urban -> High Value: {metrics['urban_high_pct']}%")
    print(f"  Rural -> High Value: {metrics['rural_high_pct']}%")
    print(f"  Disparate Impact:   {metrics['disparate_impact']}")
    print(f"  Fair (DI >= 0.8)?   {metrics['fair']}")


def main():
    """Run the demo: generate data, segment, report bias, mitigate, plot."""
    print("="*70)
    print("EcoCart Customer Segmentation — Bias Detection & Mitigation")
    print("="*70)

    # Generate and segment (biased)
    df = generate_biased_data()
    df = segment(df)
    before = compute_fairness(df)
    _print_fairness("BEFORE mitigation", before)

    print("\n  Segment counts:")
    ct = df.groupby(["region", "segment"]).size().unstack(fill_value=0)
    print(ct.to_string(index=True))

    # Mitigate
    fixed = mitigate(df)
    after = compute_fairness(fixed)
    _print_fairness("AFTER mitigation", after)

    # Plots
    plot_before_after(df, fixed, before, after)
    plot_di(before, after)
    # Report the paths the plot functions actually write to.
    print("\nWrote: output/bias_before_after.png, output/disparate_impact.png")


if __name__ == "__main__":
    main()