EcoCartAI / task2_segmentation.py
Esvanth's picture
Initial commit
0ed43fe
raw
history blame
10.2 kB
"""
EcoCart Customer Segmentation — Bias Detection & Mitigation
Task 2 β€” Demonstrates urban-rural bias in K-Means segmentation and
applies reweighing to fix it.
NCI MSCAI | Fundamentals of AI TABA 2026
Run: python3 task2_segmentation.py
Out: output/bias_before_after.png, output/disparate_impact.png
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Module-level NumPy random generator seeded with 42. generate_biased_data()
# draws from this one shared stream, so a second call in the same process
# continues the stream rather than repeating the first call's values.
RNG = np.random.default_rng(42)
# ── 1. Generate biased customer data ────────────────────────
# Urban customers have more data, higher frequency, higher spend — mimicking
# a real scenario where the platform launched in cities first.
def generate_biased_data(n_urban=300, n_rural=100, rng=None):
    """Build a synthetic customer table with a deliberate urban/rural skew.

    Urban rows are drawn with a higher mean purchase frequency and spend and
    a shorter mean recency than rural rows, and by default there are three
    times as many of them.

    Args:
        n_urban: Number of urban rows to generate.
        n_rural: Number of rural rows to generate.
        rng: Optional ``numpy.random.Generator`` to draw from. When omitted,
            the module-level ``RNG`` is used, exactly as before this
            parameter existed. Pass a freshly seeded generator to get a
            dataset that is reproducible independently of earlier calls.

    Returns:
        A DataFrame with columns ``freq`` (rounded to 1 decimal, >= 0.5),
        ``spend`` (rounded to whole units, >= 10), ``recency`` (rounded to
        whole units, within 1..90) and ``region`` ("urban" or "rural").
        Urban rows come first; the index is a fresh 0..n-1 range.
    """
    # The global is only looked up when no generator was supplied.
    rng = RNG if rng is None else rng

    # Draw order (freq, spend, recency; urban before rural) is kept fixed so
    # the default output for a given seed does not change.
    # Urban: higher frequency and spend on average
    urban = pd.DataFrame({
        "freq": rng.normal(6.0, 2.0, n_urban).clip(0.5),
        "spend": rng.normal(120, 40, n_urban).clip(10),
        "recency": rng.exponential(10, n_urban).clip(1, 90),
        "region": "urban",
    })
    # Rural: lower frequency and spend (platform is newer there)
    rural = pd.DataFrame({
        "freq": rng.normal(3.0, 1.5, n_rural).clip(0.5),
        "spend": rng.normal(65, 30, n_rural).clip(10),
        "recency": rng.exponential(15, n_rural).clip(1, 90),
        "region": "rural",
    })

    df = pd.concat([urban, rural], ignore_index=True)
    # Per-column rounding: one decimal for frequency, whole units otherwise.
    return df.round({"freq": 1, "spend": 0, "recency": 0})
# ── 2. Segment with K-Means ────────────────────────────────
def segment(df, features=None):
    """Cluster customers into three spend-ranked segments with K-Means.

    The selected feature columns are standardised, clustered into three
    groups (``random_state=42``, ``n_init=10``), and each cluster is named
    by its mean ``spend``: highest is "High Value", then "Medium", then
    "Low Value".

    Args:
        df: Customer table. Must contain the feature columns and ``spend``.
        features: Iterable of column names to cluster on. Defaults to
            ``["freq", "spend", "recency"]``.

    Returns:
        A copy of ``df`` with two added columns: ``cluster`` (the K-Means
        label) and ``segment`` (the spend-ranked name). The input frame is
        not modified.
    """
    # None sentinel instead of a mutable list default; list() also lets
    # callers pass a tuple, which pandas would otherwise treat as one key.
    if features is None:
        features = ["freq", "spend", "recency"]
    else:
        features = list(features)

    X = StandardScaler().fit_transform(df[features])
    km = KMeans(n_clusters=3, random_state=42, n_init=10)

    df = df.copy()
    df["cluster"] = km.fit_predict(X)

    # Label clusters by mean spend (High/Medium/Low)
    ranked = (
        df.groupby("cluster")["spend"].mean().sort_values(ascending=False).index
    )
    label_map = dict(zip(ranked, ("High Value", "Medium", "Low Value")))
    df["segment"] = df["cluster"].map(label_map)
    return df
# ── 3. Bias metrics ────────────────────────────────────────
def compute_fairness(df):
    """Summarise how often each region lands in the "High Value" segment.

    Args:
        df: Table with a ``region`` column ("urban" / "rural") and a
            ``segment`` column.

    Returns:
        A dict with:
        ``urban_high_pct`` / ``rural_high_pct``: share of each region
        labelled "High Value", as a percentage rounded to 1 decimal;
        ``disparate_impact``: rural rate divided by urban rate, rounded to
        3 decimals, or 0 when the urban rate is not positive;
        ``fair``: whether the unrounded ratio is at least 0.8.
    """
    is_high = df["segment"] == "High Value"
    urban_rate = is_high[df["region"] == "urban"].mean()
    rural_rate = is_high[df["region"] == "rural"].mean()

    # Guard the division: a non-positive urban rate yields a ratio of 0.
    if urban_rate > 0:
        ratio = rural_rate / urban_rate
    else:
        ratio = 0

    summary = {}
    summary["urban_high_pct"] = round(urban_rate * 100, 1)
    summary["rural_high_pct"] = round(rural_rate * 100, 1)
    summary["disparate_impact"] = round(ratio, 3)
    summary["fair"] = ratio >= 0.8
    return summary
# ── 4. Mitigation: reweigh + balanced re-sample ────────────
def mitigate(df):
    """Re-segment customers after three bias-mitigation steps.

    Fix 1: Balance the dataset by oversampling rural customers (with
           replacement, ``random_state=42``) up to the urban row count.
    Fix 2: Cluster on region-adjusted features instead of the raw ones:
           ``adj_spend`` adds 12 to rural spend (author's rationale: rural
           delivery costs about €12 more, so raw spend understates purchase
           intent) and ``adj_freq`` multiplies rural frequency by 1.5
           (author's rationale: rural customers batch orders).
    Fix 3: Post-processing — relabel the rural customers with the highest
           ``adj_spend`` that are not yet "High Value" until the rural
           "High Value" count reaches ``int(0.85 * urban_rate * n_rural)``.

    Args:
        df: Customer table with ``freq``, ``spend``, ``recency`` and
            ``region`` columns. Rows whose region is neither "urban" nor
            "rural" are dropped. The input frame is not modified.

    Returns:
        The balanced frame (urban rows followed by the resampled rural rows,
        fresh 0..n-1 index) with ``adj_spend``, ``adj_freq``, ``cluster``
        and ``segment`` columns set. Note this is not row-aligned with the
        input: rural customers may appear several times.

    Raises:
        ValueError: If ``df`` has no urban rows or no rural rows.
    """
    urban = df[df["region"] == "urban"]
    rural = df[df["region"] == "rural"]
    if urban.empty or rural.empty:
        # Without this, the failure surfaces as an unrelated-looking
        # ValueError from sampling or from the scaler.
        raise ValueError(
            "mitigate() needs at least one urban and one rural customer"
        )

    # Oversample rural to match urban count
    rural_up = rural.sample(n=len(urban), replace=True, random_state=42)
    balanced = pd.concat([urban, rural_up], ignore_index=True)

    rural_mask = balanced["region"] == "rural"
    urban_mask = balanced["region"] == "urban"

    # Vectorised adjustments: only rural rows are replaced by the adjusted
    # value, urban rows keep the raw one.
    balanced["adj_spend"] = balanced["spend"].mask(
        rural_mask, balanced["spend"] + 12
    )
    balanced["adj_freq"] = balanced["freq"].mask(
        rural_mask, balanced["freq"] * 1.5
    )

    # Re-segment on adjusted features
    X = StandardScaler().fit_transform(
        balanced[["adj_freq", "adj_spend", "recency"]]
    )
    km = KMeans(n_clusters=3, random_state=42, n_init=10)
    balanced["cluster"] = km.fit_predict(X)

    # Name clusters by mean adjusted spend, highest first.
    ranked = (
        balanced.groupby("cluster")["adj_spend"]
        .mean()
        .sort_values(ascending=False)
        .index
    )
    label_map = dict(zip(ranked, ("High Value", "Medium", "Low Value")))
    balanced["segment"] = balanced["cluster"].map(label_map)

    # Post-processing: aim for a rural "High Value" rate of 0.85x the urban
    # rate (above the 0.8 threshold). int() truncates, so the achieved
    # ratio can land marginally below 0.85.
    is_high = balanced["segment"] == "High Value"
    urban_high_rate = is_high[urban_mask].mean()
    target_rural_high = int(urban_high_rate * 0.85 * rural_mask.sum())
    need = target_rural_high - int(is_high[rural_mask].sum())

    if need > 0:
        # Candidates are every rural row not already "High Value", ranked
        # purely by adj_spend — "Medium" is not given priority over
        # "Low Value".
        candidates = balanced[rural_mask & ~is_high]
        if not candidates.empty:
            promote = candidates.nlargest(
                min(need, len(candidates)), "adj_spend"
            ).index
            balanced.loc[promote, "segment"] = "High Value"

    return balanced
# ── 5. Plots ────────────────────────────────────────────────
# Hex colour for each segment label; plot_before_after() looks points up here.
SEG_COLORS = {"High Value": "#10b981", "Medium": "#f59e0b", "Low Value": "#ef4444"}
def plot_before_after(before_df, after_df, before_fair, after_fair):
    """Save a side-by-side scatter of segments before and after mitigation.

    Each panel plots ``freq`` against ``spend``, coloured by segment
    (``SEG_COLORS``) with circles for urban and triangles for rural rows.
    The panel title carries the disparate-impact value and a FAIR/BIASED
    verdict.

    Args:
        before_df: Segmented table for the left panel (needs ``freq``,
            ``spend``, ``region``, ``segment``).
        after_df: Segmented table for the right panel (same columns).
        before_fair: Fairness dict for the left panel; ``disparate_impact``
            and ``fair`` are read.
        after_fair: Fairness dict for the right panel.

    Side effects:
        Creates the ``output`` directory if needed and writes
        ``output/bias_before_after.png``.
    """
    import os  # local import keeps this block self-contained

    fig, axes = plt.subplots(1, 2, figsize=(14, 5.5))
    fig.patch.set_facecolor("#0d1117")

    panels = [
        (axes[0], before_df, before_fair, "BEFORE mitigation (biased)"),
        (axes[1], after_df, after_fair, "AFTER mitigation (reweighed + adjusted)"),
    ]
    for ax, df, fair, title in panels:
        ax.set_facecolor("#0d1117")
        for seg in ["High Value", "Medium", "Low Value"]:
            mask = df.segment == seg
            for region, marker in [("urban", "o"), ("rural", "^")]:
                rmask = mask & (df.region == region)
                # Only the left panel gets labels; it owns the shared legend.
                ax.scatter(df.loc[rmask, "freq"], df.loc[rmask, "spend"],
                           c=SEG_COLORS[seg], marker=marker, s=25, alpha=0.6,
                           label=f"{seg} ({region})" if ax is axes[0] else None)

        di = fair["disparate_impact"]
        verdict = "✓ FAIR" if fair["fair"] else "⚠ BIASED"
        ax.set_title(f"{title}\nDI = {di:.3f} {verdict}",
                     color="white", fontsize=11)
        ax.set_xlabel("Purchase frequency / month", color="white")
        ax.set_ylabel("Avg spend (€)", color="white")
        ax.tick_params(colors="white")
        ax.grid(True, alpha=0.1, color="white")

    axes[0].legend(fontsize=7, facecolor="#0d1117", edgecolor="#334155",
                   labelcolor="white", loc="upper right", ncol=2)
    plt.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("output", exist_ok=True)
    plt.savefig("output/bias_before_after.png", dpi=150,
                bbox_inches="tight", facecolor="#0d1117")
    plt.close(fig)
def plot_di(before_fair, after_fair):
    """Save a grouped bar chart of the fairness metrics before vs after.

    Three groups are drawn: the urban "High Value" percentage, the rural
    "High Value" percentage, and the disparate impact scaled by 100 so it
    shares the percentage axis. A dashed line marks the 80% threshold.

    Args:
        before_fair: Fairness dict before mitigation; ``urban_high_pct``,
            ``rural_high_pct`` and ``disparate_impact`` are read.
        after_fair: Fairness dict after mitigation (same keys).

    Side effects:
        Creates the ``output`` directory if needed and writes
        ``output/disparate_impact.png``.
    """
    import os  # local import keeps this block self-contained

    fig, ax = plt.subplots(figsize=(8, 4))
    fig.patch.set_facecolor("#0d1117")
    ax.set_facecolor("#0d1117")

    cats = ["Urban → High", "Rural → High", "Disparate Impact"]
    before_vals = [before_fair["urban_high_pct"], before_fair["rural_high_pct"],
                   before_fair["disparate_impact"] * 100]
    after_vals = [after_fair["urban_high_pct"], after_fair["rural_high_pct"],
                  after_fair["disparate_impact"] * 100]

    x = list(range(len(cats)))
    w = 0.35
    # Before/after bars sit half a bar-width either side of each tick.
    ax.bar([i - w / 2 for i in x], before_vals, w, label="Before", color="#ef4444", alpha=0.85)
    ax.bar([i + w / 2 for i in x], after_vals, w, label="After", color="#10b981", alpha=0.85)
    ax.axhline(80, color="#fbbf24", linewidth=1.5, linestyle="--", label="DI threshold (80%)")

    ax.set_xticks(x)
    ax.set_xticklabels(cats, color="white")
    ax.set_ylabel("Percentage", color="white")
    ax.set_title("Fairness metrics before vs after mitigation", color="white", fontsize=12)
    ax.tick_params(colors="white")
    ax.legend(fontsize=9, facecolor="#0d1117", edgecolor="#334155", labelcolor="white")
    ax.grid(True, axis="y", alpha=0.15, color="white")
    plt.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("output", exist_ok=True)
    plt.savefig("output/disparate_impact.png", dpi=150,
                bbox_inches="tight", facecolor="#0d1117")
    plt.close(fig)
# ── 6. Main ─────────────────────────────────────────────────
def main():
    """Run the full demo: generate data, segment, measure, mitigate, plot.

    Prints the fairness metrics and per-region segment counts for the biased
    segmentation, then the metrics after mitigation, and finally writes both
    charts via plot_before_after() and plot_di().
    """
    print("=" * 70)
    print("EcoCart Customer Segmentation — Bias Detection & Mitigation")
    print("=" * 70)

    # Generate and segment (biased)
    df = generate_biased_data()
    df = segment(df)
    before = compute_fairness(df)
    print("\nBEFORE mitigation:")
    print(f" Urban -> High Value: {before['urban_high_pct']}%")
    print(f" Rural -> High Value: {before['rural_high_pct']}%")
    print(f" Disparate Impact: {before['disparate_impact']}")
    print(f" Fair (DI >= 0.8)? {before['fair']}")
    print("\n Segment counts:")
    ct = df.groupby(["region", "segment"]).size().unstack(fill_value=0)
    print(ct.to_string(index=True))

    # Mitigate
    fixed = mitigate(df)
    after = compute_fairness(fixed)
    print("\nAFTER mitigation:")
    print(f" Urban -> High Value: {after['urban_high_pct']}%")
    print(f" Rural -> High Value: {after['rural_high_pct']}%")
    print(f" Disparate Impact: {after['disparate_impact']}")
    print(f" Fair (DI >= 0.8)? {after['fair']}")

    # Plots
    plot_before_after(df, fixed, before, after)
    plot_di(before, after)
    # Both plot functions save under output/, so report the real paths.
    print("\nWrote: output/bias_before_after.png, output/disparate_impact.png")


if __name__ == "__main__":
    main()