Spaces:

Esvanth
/

EcoCartAI

Sleeping

App Files Files Community

EcoCartAI / task2_segmentation.py

Esvanth

Initial commit

0ed43fe 9 days ago

raw

history blame

10.2 kB

	"""
	EcoCart Customer Segmentation — Bias Detection & Mitigation
	Task 2 — Demonstrates urban-rural bias in K-Means segmentation and
	applies reweighing to fix it.

	NCI MSCAI | Fundamentals of AI TABA 2026

	Run: python3 task2_segmentation.py
	Out: output/bias_before_after.png, output/disparate_impact.png
	"""

	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn.cluster import KMeans
	from sklearn.preprocessing import StandardScaler

	# Shared NumPy random generator with a fixed seed (42). generate_biased_data()
	# draws from it, so a fresh interpreter run yields the same synthetic data.
	RNG = np.random.default_rng(42)


	# ── 1. Generate biased customer data ────────────────────────
	# Urban customers have more data, higher frequency, higher spend — mimicking
	# a real scenario where the platform launched in cities first.

	def generate_biased_data(n_urban=300, n_rural=100):
	    """Build a synthetic customer table with an urban-heavy imbalance.

	    Urban rows are drawn with higher mean purchase frequency and spend than
	    rural rows, and by default there are three times as many of them.

	    Args:
	        n_urban: Number of urban customers to draw.
	        n_rural: Number of rural customers to draw.

	    Returns:
	        DataFrame with columns "freq" (rounded to 1 decimal), "spend" and
	        "recency" (rounded to whole numbers) and "region" ("urban" or
	        "rural"); urban rows come first.
	    """

	    def _draw_region(label, count, freq_params, spend_params, recency_scale):
	        # Draw order (freq, spend, recency) is kept fixed because all draws
	        # consume the same module-level generator.
	        freq = RNG.normal(*freq_params, count).clip(0.5)
	        spend = RNG.normal(*spend_params, count).clip(10)
	        recency = RNG.exponential(recency_scale, count).clip(1, 90)
	        return pd.DataFrame({
	            "freq": freq,
	            "spend": spend,
	            "recency": recency,
	            "region": label,
	        })

	    # Urban first, then rural, so the random stream is consumed in a fixed order.
	    urban_rows = _draw_region("urban", n_urban, (6.0, 2.0), (120, 40), 10)
	    rural_rows = _draw_region("rural", n_rural, (3.0, 1.5), (65, 30), 15)

	    customers = pd.concat([urban_rows, rural_rows], ignore_index=True)
	    for column, digits in (("freq", 1), ("spend", 0), ("recency", 0)):
	        customers[column] = customers[column].round(digits)
	    return customers


	# ── 2. Segment with K-Means ────────────────────────────────
	def segment(df, features=("freq", "spend", "recency")):
	    """Cluster customers into three K-Means groups and label them by spend.

	    The selected feature columns are standardised, clustered with K-Means
	    (3 clusters, fixed random_state) and each cluster is then named
	    "High Value", "Medium" or "Low Value" according to its mean "spend",
	    highest first.

	    Args:
	        df: Customer DataFrame holding the feature columns and "spend".
	        features: Names of the columns used for clustering. The default is
	            an immutable tuple so it cannot be shared and mutated across
	            calls; any iterable of column names is accepted.

	    Returns:
	        A copy of ``df`` with added "cluster" (integer id) and "segment"
	        (label) columns. The input frame is not modified.
	    """
	    # list(...) is required: pandas treats a tuple as a single column key.
	    scaled = StandardScaler().fit_transform(df[list(features)])
	    model = KMeans(n_clusters=3, random_state=42, n_init=10)

	    labelled = df.copy()
	    labelled["cluster"] = model.fit_predict(scaled)

	    # Rank cluster ids by mean spend, highest first, and pair them with labels.
	    ranked_ids = (
	        labelled.groupby("cluster")["spend"]
	        .mean()
	        .sort_values(ascending=False)
	        .index
	    )
	    label_map = dict(zip(ranked_ids, ("High Value", "Medium", "Low Value")))
	    labelled["segment"] = labelled["cluster"].map(label_map)
	    return labelled


	# ── 3. Bias metrics ────────────────────────────────────────
	def compute_fairness(df):
	    """Compare how often urban and rural customers land in "High Value".

	    Disparate impact (DI) is the rural "High Value" rate divided by the
	    urban "High Value" rate; the result is flagged fair when DI >= 0.8.

	    Args:
	        df: DataFrame with a "region" column ("urban"/"rural") and a
	            "segment" column.

	    Returns:
	        dict with plain Python values:
	            "urban_high_pct": urban High Value share in percent (1 decimal),
	            "rural_high_pct": rural High Value share in percent (1 decimal),
	            "disparate_impact": rural rate / urban rate (3 decimals), or 0.0
	                when the urban rate is zero,
	            "fair": True when the unrounded DI is at least 0.8.
	    """
	    is_high = df["segment"] == "High Value"
	    urban_flags = is_high[df["region"] == "urban"]
	    rural_flags = is_high[df["region"] == "rural"]

	    # mean() of an empty selection is NaN; treat a missing group as a 0% rate
	    # so NaN never leaks into the reported metrics.
	    u_high = urban_flags.mean() if len(urban_flags) else 0.0
	    r_high = rural_flags.mean() if len(rural_flags) else 0.0

	    di = r_high / u_high if u_high > 0 else 0.0

	    # Cast to built-in float/bool so callers get ordinary Python scalars
	    # instead of NumPy ones.
	    return {
	        "urban_high_pct": float(round(u_high * 100, 1)),
	        "rural_high_pct": float(round(r_high * 100, 1)),
	        "disparate_impact": float(round(di, 3)),
	        "fair": bool(di >= 0.8),
	    }


	# ── 4. Mitigation: reweigh + balanced re-sample ────────────
	def mitigate(df, *, rural_spend_uplift=12, rural_freq_factor=1.5, target_di=0.85):
	    """Re-segment customers with three bias-mitigation steps applied.

	    Step 1: balance the dataset by oversampling rural customers (with
	        replacement) up to the urban count.
	    Step 2: cluster on adjusted features: rural spend is raised by
	        ``rural_spend_uplift`` and rural frequency is multiplied by
	        ``rural_freq_factor``. The original author's rationale is that
	        rural customers pay more for delivery and batch their orders, so
	        the raw values understate their purchase intent.
	    Step 3: post-processing. If the rural "High Value" count is still below
	        ``target_di`` times the urban rate, promote the rural customers with
	        the highest adjusted spend until the target count is reached.

	    Args:
	        df: Customer DataFrame with "freq", "spend", "recency" and "region".
	            It is not modified.
	        rural_spend_uplift: Amount added to rural spend (default 12).
	        rural_freq_factor: Multiplier applied to rural frequency (default 1.5).
	        target_di: Disparate-impact level the post-processing aims for
	            (default 0.85, i.e. above the 0.8 fairness threshold).

	    Returns:
	        New balanced DataFrame (urban rows followed by resampled rural rows)
	        with "adj_spend", "adj_freq", "cluster" and "segment" columns.
	    """
	    # Step 1: oversample rural rows to match the urban count.
	    rural = df[df["region"] == "rural"]
	    urban = df[df["region"] == "urban"]
	    rural_up = rural.sample(n=len(urban), replace=True, random_state=42)
	    balanced = pd.concat([urban, rural_up], ignore_index=True)

	    # Step 2: vectorised feature adjustment. where() keeps the original value
	    # for non-rural rows and substitutes the adjusted one for rural rows.
	    rural_mask = balanced["region"] == "rural"
	    urban_mask = balanced["region"] == "urban"
	    balanced["adj_spend"] = balanced["spend"].where(
	        ~rural_mask, balanced["spend"] + rural_spend_uplift
	    )
	    balanced["adj_freq"] = balanced["freq"].where(
	        ~rural_mask, balanced["freq"] * rural_freq_factor
	    )

	    # Re-segment on the adjusted features.
	    scaled = StandardScaler().fit_transform(
	        balanced[["adj_freq", "adj_spend", "recency"]]
	    )
	    model = KMeans(n_clusters=3, random_state=42, n_init=10)
	    balanced["cluster"] = model.fit_predict(scaled)
	    ranked_ids = (
	        balanced.groupby("cluster")["adj_spend"]
	        .mean()
	        .sort_values(ascending=False)
	        .index
	    )
	    label_map = dict(zip(ranked_ids, ("High Value", "Medium", "Low Value")))
	    balanced["segment"] = balanced["cluster"].map(label_map)

	    # Step 3: work out how many rural customers must be "High Value" for the
	    # rural rate to reach target_di times the urban rate. int() truncates, so
	    # the achieved DI can sit marginally below target_di.
	    is_high = balanced["segment"] == "High Value"
	    urban_high_rate = is_high[urban_mask].mean()
	    target_rate = urban_high_rate * target_di
	    n_rural = rural_mask.sum()
	    target_rural_high = int(target_rate * n_rural)
	    current_rural_high = is_high[rural_mask].sum()
	    need = target_rural_high - current_rural_high

	    if need > 0:
	        # Candidates are all rural rows not already "High Value"; they are
	        # ranked purely by adjusted spend, regardless of current segment.
	        candidates = balanced[rural_mask & ~is_high]
	        if len(candidates) > 0:
	            promote = candidates.nlargest(
	                min(need, len(candidates)), "adj_spend"
	            ).index
	            balanced.loc[promote, "segment"] = "High Value"

	    return balanced


	# ── 5. Plots ────────────────────────────────────────────────
	# Hex colour for each segment label; plot_before_after() uses it for scatter points.
	SEG_COLORS = {"High Value": "#10b981", "Medium": "#f59e0b", "Low Value": "#ef4444"}

	def plot_before_after(before_df, after_df, before_fair, after_fair):
	    """Save side-by-side scatter plots of segments before and after mitigation.

	    Each panel plots "freq" against "spend", coloured by segment, with
	    circles for urban and triangles for rural customers. The panel title
	    shows the disparate impact and whether it passes the fairness check.

	    Args:
	        before_df: Segmented DataFrame before mitigation ("freq", "spend",
	            "region", "segment" columns are read).
	        after_df: Segmented DataFrame after mitigation (same columns).
	        before_fair: Fairness dict for ``before_df`` ("disparate_impact" and
	            "fair" keys are read).
	        after_fair: Fairness dict for ``after_df``.

	    Side effects:
	        Creates the "output" directory if needed and writes
	        "output/bias_before_after.png".
	    """
	    import os  # local import: the file's import header does not provide os

	    fig, axes = plt.subplots(1, 2, figsize=(14, 5.5))
	    fig.patch.set_facecolor("#0d1117")

	    # (axis, data, fairness metrics, title, whether this panel feeds the legend)
	    panels = [
	        (axes[0], before_df, before_fair, "BEFORE mitigation (biased)", True),
	        (axes[1], after_df, after_fair,
	         "AFTER mitigation (reweighed + adjusted)", False),
	    ]
	    for ax, df, fair, title, with_labels in panels:
	        ax.set_facecolor("#0d1117")
	        for seg in ["High Value", "Medium", "Low Value"]:
	            seg_mask = df.segment == seg
	            for region, marker in [("urban", "o"), ("rural", "^")]:
	                rows = seg_mask & (df.region == region)
	                ax.scatter(
	                    df.loc[rows, "freq"], df.loc[rows, "spend"],
	                    c=SEG_COLORS[seg], marker=marker, s=25, alpha=0.6,
	                    # Only the first panel is labelled; the legend is built from it.
	                    label=f"{seg} ({region})" if with_labels else None,
	                )
	        di = fair["disparate_impact"]
	        verdict = "⚠ BIASED" if not fair["fair"] else "✓ FAIR"
	        ax.set_title(f"{title}\nDI = {di:.3f} {verdict}",
	                     color="white", fontsize=11)
	        ax.set_xlabel("Purchase frequency / month", color="white")
	        ax.set_ylabel("Avg spend (€)", color="white")
	        ax.tick_params(colors="white")
	        ax.grid(True, alpha=0.1, color="white")

	    axes[0].legend(fontsize=7, facecolor="#0d1117", edgecolor="#334155",
	                   labelcolor="white", loc="upper right", ncol=2)
	    plt.tight_layout()

	    # savefig does not create missing directories, so make sure it exists.
	    os.makedirs("output", exist_ok=True)
	    plt.savefig("output/bias_before_after.png", dpi=150,
	                bbox_inches="tight", facecolor="#0d1117")
	    plt.close()


	def plot_di(before_fair, after_fair):
	    """Save a grouped bar chart of fairness metrics before vs after mitigation.

	    Three bar groups are drawn: urban High Value share, rural High Value
	    share and disparate impact (scaled to percent), with a dashed line at
	    the 80% disparate-impact threshold.

	    Args:
	        before_fair: Fairness dict before mitigation ("urban_high_pct",
	            "rural_high_pct" and "disparate_impact" keys are read).
	        after_fair: Fairness dict after mitigation (same keys).

	    Side effects:
	        Creates the "output" directory if needed and writes
	        "output/disparate_impact.png".
	    """
	    import os  # local import: the file's import header does not provide os

	    fig, ax = plt.subplots(figsize=(8, 4))
	    fig.patch.set_facecolor("#0d1117")
	    ax.set_facecolor("#0d1117")

	    cats = ["Urban → High", "Rural → High", "Disparate Impact"]

	    def _bar_values(fair):
	        # Disparate impact is a ratio; scale it to share the percentage axis.
	        return [fair["urban_high_pct"], fair["rural_high_pct"],
	                fair["disparate_impact"] * 100]

	    before_vals = _bar_values(before_fair)
	    after_vals = _bar_values(after_fair)

	    x = range(len(cats))
	    w = 0.35
	    ax.bar([i - w/2 for i in x], before_vals, w,
	           label="Before", color="#ef4444", alpha=0.85)
	    ax.bar([i + w/2 for i in x], after_vals, w,
	           label="After", color="#10b981", alpha=0.85)
	    ax.axhline(80, color="#fbbf24", linewidth=1.5, linestyle="--",
	               label="DI threshold (80%)")
	    ax.set_xticks(x)
	    ax.set_xticklabels(cats, color="white")
	    ax.set_ylabel("Percentage", color="white")
	    ax.set_title("Fairness metrics before vs after mitigation",
	                 color="white", fontsize=12)
	    ax.tick_params(colors="white")
	    ax.legend(fontsize=9, facecolor="#0d1117", edgecolor="#334155",
	              labelcolor="white")
	    ax.grid(True, axis="y", alpha=0.15, color="white")
	    plt.tight_layout()

	    # savefig does not create missing directories, so make sure it exists.
	    os.makedirs("output", exist_ok=True)
	    plt.savefig("output/disparate_impact.png", dpi=150,
	                bbox_inches="tight", facecolor="#0d1117")
	    plt.close()


	# ── 6. Main ─────────────────────────────────────────────────
	def _print_fairness(heading, metrics):
	    """Print one fairness report block under the given heading."""
	    print(f"\n{heading}")
	    print(f" Urban -> High Value: {metrics['urban_high_pct']}%")
	    print(f" Rural -> High Value: {metrics['rural_high_pct']}%")
	    print(f" Disparate Impact: {metrics['disparate_impact']}")
	    print(f" Fair (DI >= 0.8)? {metrics['fair']}")


	def main():
	    """Run the full demo: generate data, segment, measure bias, mitigate, plot.

	    Prints the fairness metrics before and after mitigation plus the
	    region-by-segment counts of the biased segmentation, then writes both
	    plots via plot_before_after() and plot_di().
	    """
	    print("="*70)
	    print("EcoCart Customer Segmentation — Bias Detection & Mitigation")
	    print("="*70)

	    # Generate and segment (biased)
	    df = generate_biased_data()
	    df = segment(df)
	    before = compute_fairness(df)
	    _print_fairness("BEFORE mitigation:", before)

	    print("\n Segment counts:")
	    ct = df.groupby(["region", "segment"]).size().unstack(fill_value=0)
	    print(ct.to_string(index=True))

	    # Mitigate
	    fixed = mitigate(df)
	    after = compute_fairness(fixed)
	    _print_fairness("AFTER mitigation:", after)

	    # Plots
	    plot_before_after(df, fixed, before, after)
	    plot_di(before, after)
	    # Report the paths the plot functions actually write to.
	    print("\nWrote: output/bias_before_after.png, output/disparate_impact.png")


	# Run the demo only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()