Spaces:

Esvanth
/

EcoCartAI

Sleeping

App Files Files Community

EcoCartAI / task5_forecasting.py

Esvanth

Load task data from data/*.csv instead of regenerating in-code

3c2c304 5 days ago

raw

history blame contribute delete

5.74 kB

	"""
	EcoCart Demand Forecasting Prototype
	Task 5 — Linear Regression vs Random Forest on synthetic daily sales.

	Run: python3 task5_forecasting.py
	Out: output/forecast.png, output/residuals.png, output/feature_importance.png
	"""

	import os
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn.linear_model import LinearRegression
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
	# Shared random generator with a fixed seed (42); used by generate_sales().
	RNG = np.random.default_rng(42)
	# Absolute path to data/sales_history.csv, resolved relative to this file
	# rather than the current working directory.
	CSV_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "sales_history.csv")


	def load_sales():
	    """Read the sales history CSV at ``CSV_PATH`` into a DataFrame.

	    The ``date`` column is parsed into datetimes while loading.

	    Returns:
	        The DataFrame produced by ``pandas.read_csv``.
	    """
	    sales_history = pd.read_csv(CSV_PATH, parse_dates=["date"])
	    return sales_history


	# ── 1. Synthetic sales data ────────────────────────────────
	def generate_sales(days=730, rng=None):
	    """Generate a synthetic daily sales series starting on 2023-01-01.

	    Sales are the sum of a linear trend, a 7-day and a 365-day sine wave,
	    Gaussian noise, and a promotional uplift on roughly 6% of the days,
	    clipped so that they never go below zero.

	    Args:
	        days: Number of consecutive days to generate.
	        rng: Optional ``numpy.random.Generator``. When omitted, the
	            module-level ``RNG`` is used, so the result depends on how much
	            of that shared stream has already been consumed. Pass a freshly
	            seeded generator for output that is reproducible regardless of
	            call order.

	    Returns:
	        DataFrame with columns ``date``, ``sales``, ``dow``, ``month``,
	        ``day_of_year`` and ``is_promo`` (1 on promotion days, else 0).
	    """
	    rng = RNG if rng is None else rng
	    n_promo = int(days * 0.06)

	    t = np.arange(days)
	    dates = pd.date_range("2023-01-01", periods=days, freq="D")
	    base = 100 + 0.05 * t
	    weekly = 25 * np.sin(2 * np.pi * t / 7)
	    yearly = 40 * np.sin(2 * np.pi * t / 365)
	    noise = rng.normal(0, 8, days)

	    # Draw order matters for reproducibility with the default generator:
	    # the uplift amounts are sampled first, then the days that receive them.
	    promo = np.zeros(days)
	    uplift = rng.uniform(30, 70, n_promo)
	    promo_days = rng.choice(days, n_promo, replace=False)
	    promo[promo_days] = uplift

	    sales = np.clip(base + weekly + yearly + noise + promo, 0, None)
	    return pd.DataFrame({
	        "date": dates, "sales": sales,
	        "dow": dates.dayofweek, "month": dates.month,
	        "day_of_year": dates.dayofyear,
	        "is_promo": (promo > 0).astype(int),
	    })

	# ── 2. Features ────────────────────────────────────────────
	def add_features(df):
	    """Return a copy of ``df`` extended with lagged and rolling sales features.

	    Adds ``lag_1``, ``lag_7`` and ``lag_14`` (sales shifted by that many
	    rows) plus ``roll_7`` and ``roll_30`` (mean of the previous 7 / 30 rows,
	    excluding the current row). Rows containing any missing value, which
	    includes the warm-up rows at the start, are dropped and the index is
	    renumbered from zero. The input frame is not modified.
	    """
	    sales = df["sales"]
	    # Shift by one row first so rolling windows only see earlier rows.
	    previous = sales.shift(1)

	    engineered = {f"lag_{step}": sales.shift(step) for step in (1, 7, 14)}
	    engineered["roll_7"] = previous.rolling(7).mean()
	    engineered["roll_30"] = previous.rolling(30).mean()

	    enriched = df.assign(**engineered)
	    return enriched.dropna().reset_index(drop=True)


	# Model input columns, in the order they are passed to the estimators.
	FEATURES = [
	    # calendar / promotion flags
	    "dow",
	    "month",
	    "day_of_year",
	    "is_promo",
	    # lagged sales
	    "lag_1",
	    "lag_7",
	    "lag_14",
	    # trailing means
	    "roll_7",
	    "roll_30",
	]


	# ── 3. Train & evaluate ───────────────────────────────────
	def evaluate(name, y_true, y_pred):
	    """Print and return regression error metrics for one model.

	    Args:
	        name: Label printed at the start of the metrics line.
	        y_true: Observed target values (any 1-D array-like: list, ndarray,
	            pandas Series).
	        y_pred: Predicted values, same length as ``y_true``.

	    Returns:
	        dict with keys ``"mae"``, ``"rmse"``, ``"r2"`` and ``"mape"``
	        (MAPE expressed in percent).
	    """
	    # Work on plain float arrays: lists then support arithmetic, and pandas
	    # index alignment cannot reorder or NaN-fill the element-wise difference.
	    y_true = np.asarray(y_true, dtype=float)
	    y_pred = np.asarray(y_pred, dtype=float)

	    mae = mean_absolute_error(y_true, y_pred)
	    rmse = mean_squared_error(y_true, y_pred) ** 0.5
	    r2 = r2_score(y_true, y_pred)

	    # Where the actual value is 0, divide by 1 instead so the term stays
	    # finite (it then contributes the absolute error for that row).
	    safe_denominator = np.where(y_true == 0, 1, y_true)
	    mape = np.mean(np.abs((y_true - y_pred) / safe_denominator)) * 100

	    print(f" {name:<22s} MAE={mae:6.2f} RMSE={rmse:6.2f} R²={r2:.3f} MAPE={mape:.2f}%")
	    return {"mae": mae, "rmse": rmse, "r2": r2, "mape": mape}


	def main():
	    """Train both models on the sales history, print metrics and save charts.

	    Steps: load the data with ``load_sales``, add lag/rolling features,
	    split by row order (first 80% train, last 20% test), fit a linear
	    regression and a random forest, print test-set metrics, then write
	    three PNG charts into ``output/`` under the current working directory.
	    The directory is created if it does not exist.
	    """
	    print("="*70)
	    print("EcoCart Demand Forecasting — LR vs Random Forest")
	    print("="*70)

	    df = load_sales()
	    df = add_features(df)
	    split = int(len(df) * 0.8)
	    train, test = df.iloc[:split], df.iloc[split:]
	    X_tr, y_tr = train[FEATURES], train["sales"]
	    X_te, y_te = test[FEATURES], test["sales"]
	    print(f"Train: {len(train)} days Test: {len(test)} days")

	    lr = LinearRegression().fit(X_tr, y_tr)
	    rf = RandomForestRegressor(n_estimators=200, max_depth=12,
	                               min_samples_leaf=3, random_state=42,
	                               n_jobs=-1).fit(X_tr, y_tr)
	    lr_pred = lr.predict(X_te)
	    rf_pred = rf.predict(X_te)

	    print("\nTest-set metrics:")
	    lr_m = evaluate("Linear Regression", y_te.values, lr_pred)
	    rf_m = evaluate("Random Forest", y_te.values, rf_pred)

	    # savefig does not create missing directories, so make sure the target
	    # folder exists before any chart is written.
	    out_dir = "output"
	    os.makedirs(out_dir, exist_ok=True)
	    forecast_path = os.path.join(out_dir, "forecast.png")
	    residuals_path = os.path.join(out_dir, "residuals.png")
	    importance_path = os.path.join(out_dir, "feature_importance.png")

	    # ── Plots ──
	    # Dark theme applied globally to every figure created below.
	    plt.rcParams.update({"axes.facecolor":"#0d1117","figure.facecolor":"#0d1117",
	                         "text.color":"white","axes.labelcolor":"white",
	                         "xtick.color":"white","ytick.color":"white"})

	    # Forecast: actual series overlaid with both models' predictions.
	    fig, ax = plt.subplots(figsize=(13, 5))
	    ax.plot(test.date, y_te, color="#e2e8f0", lw=1.3, label="Actual")
	    ax.plot(test.date, lr_pred, color="#3b82f6", lw=1, alpha=0.8, label="Linear Regression")
	    ax.plot(test.date, rf_pred, color="#10b981", lw=1, alpha=0.8, label="Random Forest")
	    ax.set_title("Test-set: actual vs predicted daily demand", fontsize=12)
	    ax.set_xlabel("Date"); ax.set_ylabel("Units sold")
	    ax.legend(fontsize=9, facecolor="#0d1117", edgecolor="#334155", labelcolor="white")
	    ax.grid(True, alpha=0.1)
	    plt.tight_layout()
	    plt.savefig(forecast_path, dpi=150, bbox_inches="tight")
	    plt.close()

	    # Residuals: one panel per model, residual (actual - predicted) vs predicted.
	    fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))
	    for ax, pred, name, color, m in [
	        (axes[0], lr_pred, "Linear Regression", "#3b82f6", lr_m),
	        (axes[1], rf_pred, "Random Forest", "#10b981", rf_m),
	    ]:
	        ax.scatter(pred, y_te.values - pred, s=12, c=color, alpha=0.6)
	        ax.axhline(0, color="white", lw=0.8)
	        ax.set_title(f"{name} residuals (RMSE={m['rmse']:.2f})", fontsize=11)
	        ax.set_xlabel("Predicted"); ax.set_ylabel("Residual")
	        ax.grid(True, alpha=0.1)
	    plt.tight_layout()
	    plt.savefig(residuals_path, dpi=150, bbox_inches="tight")
	    plt.close()

	    # Feature importance: sorted ascending so the largest bar is drawn on top.
	    imp = pd.Series(rf.feature_importances_, index=FEATURES).sort_values()
	    fig, ax = plt.subplots(figsize=(8, 4.5))
	    ax.barh(imp.index, imp.values, color="#10b981")
	    ax.set_title("Random Forest — feature importance", fontsize=12)
	    ax.set_xlabel("Importance")
	    ax.grid(True, axis="x", alpha=0.1)
	    plt.tight_layout()
	    plt.savefig(importance_path, dpi=150, bbox_inches="tight")
	    plt.close()

	    # Last three entries of the ascending series, reversed: top 3, best first.
	    print(f"\nTop features: {', '.join(imp.index[-3:][::-1])}")
	    # Report the real locations of the files, not just their base names.
	    print(f"Wrote: {', '.join([forecast_path, residuals_path, importance_path])}")

	# Run the forecasting pipeline only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()