EcoCartAI / task5_forecasting.py
Esvanth's picture
Load task data from data/*.csv instead of regenerating in-code
3c2c304
"""
EcoCart Demand Forecasting Prototype
Task 5 — Linear Regression vs Random Forest on synthetic daily sales.
Run: python3 task5_forecasting.py
Out: output/forecast.png, output/residuals.png, output/feature_importance.png
"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Shared, fixed-seed generator used by the synthetic-data helper.
RNG = np.random.default_rng(seed=42)

# Sales history CSV, resolved relative to this file rather than the working directory.
CSV_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "data",
    "sales_history.csv",
)
def load_sales():
    """Read the daily sales history (documented as 730 days) from CSV_PATH.

    Returns:
        DataFrame as parsed by pandas, with the ``date`` column converted to
        datetimes.
    """
    history = pd.read_csv(CSV_PATH, parse_dates=["date"])
    return history
# ── 1. Synthetic sales data ────────────────────────────────
def generate_sales(days=730, rng=None):
    """Build a synthetic daily sales history with trend, seasonality and promos.

    The series is ``base + weekly + yearly + noise + promo`` clipped at zero,
    where roughly 6% of the days receive a promotional uplift.

    Args:
        days: Number of consecutive days to simulate, starting 2023-01-01.
        rng: Optional ``numpy.random.Generator``. When omitted, the module-level
            ``RNG`` is used, so existing callers behave as before. Pass a
            freshly seeded generator to get output that does not depend on how
            many values were already drawn from the shared one.

    Returns:
        DataFrame with one row per day and the columns ``date``, ``sales``,
        ``dow``, ``month``, ``day_of_year`` and ``is_promo``.
    """
    if rng is None:
        rng = RNG

    t = np.arange(days)
    dates = pd.date_range("2023-01-01", periods=days, freq="D")
    n_promo = int(days * 0.06)

    base = 100 + 0.05 * t                      # slow linear growth
    weekly = 25 * np.sin(2 * np.pi * t / 7)    # 7-day cycle
    yearly = 40 * np.sin(2 * np.pi * t / 365)  # 365-day cycle
    noise = rng.normal(0, 8, days)

    # Uplift sizes are drawn before the promo days; keeping this order
    # preserves the values produced for a given generator state.
    uplift = rng.uniform(30, 70, n_promo)
    promo_days = rng.choice(days, n_promo, replace=False)
    promo = np.zeros(days)
    promo[promo_days] = uplift

    sales = np.clip(base + weekly + yearly + noise + promo, 0, None)
    return pd.DataFrame({
        "date": dates,
        "sales": sales,
        "dow": dates.dayofweek,
        "month": dates.month,
        "day_of_year": dates.dayofyear,
        "is_promo": (promo > 0).astype(int),
    })
# ── 2. Features ────────────────────────────────────────────
def add_features(df):
    """Return a copy of *df* extended with lagged and rolling sales features.

    Adds ``lag_1``, ``lag_7`` and ``lag_14`` (sales shifted by that many rows)
    plus ``roll_7`` and ``roll_30`` (means over the preceding 7 / 30 rows,
    excluding the current row). Rows containing any missing value are then
    dropped and the index is renumbered from zero. The input is not modified.
    """
    sales = df["sales"]
    prior = sales.shift(1)  # exclude the current row from the rolling windows

    lagged = {f"lag_{steps}": sales.shift(steps) for steps in (1, 7, 14)}
    rolled = {f"roll_{window}": prior.rolling(window).mean() for window in (7, 30)}

    enriched = df.assign(**lagged, **rolled)
    return enriched.dropna().reset_index(drop=True)
# Model input columns, in the order they are passed to the regressors.
FEATURES = [
    # calendar / promo flags
    "dow",
    "month",
    "day_of_year",
    "is_promo",
    # lagged sales
    "lag_1",
    "lag_7",
    "lag_14",
    # rolling means of past sales
    "roll_7",
    "roll_30",
]
# ── 3. Train & evaluate ───────────────────────────────────
def evaluate(name, y_true, y_pred):
    """Print and return MAE, RMSE, R² and MAPE for one model's predictions.

    Args:
        name: Label printed at the start of the metrics line.
        y_true: Observed target values (any array-like).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        Dict with the keys ``mae``, ``rmse``, ``r2`` and ``mape`` (MAPE is
        expressed in percent).
    """
    # Plain float arrays: lists are accepted, and the element-wise arithmetic
    # below is positional rather than subject to pandas index alignment.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    r2 = r2_score(y_true, y_pred)

    # A zero actual would divide by zero; its error is scaled by 1 instead.
    denom = np.where(y_true == 0, 1, y_true)
    mape = np.mean(np.abs((y_true - y_pred) / denom)) * 100

    print(f" {name:<22s} MAE={mae:6.2f} RMSE={rmse:6.2f} R²={r2:.3f} MAPE={mape:.2f}%")
    return {"mae": mae, "rmse": rmse, "r2": r2, "mape": mape}
def _save_current_figure(out_dir, filename):
    """Lay out, save and close the current figure; return the written path."""
    path = os.path.join(out_dir, filename)
    plt.tight_layout()
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()
    return path


def _plot_forecast(dates, y_true, lr_pred, rf_pred, out_dir):
    """Plot actual vs predicted demand over the test period."""
    _, ax = plt.subplots(figsize=(13, 5))
    ax.plot(dates, y_true, color="#e2e8f0", lw=1.3, label="Actual")
    ax.plot(dates, lr_pred, color="#3b82f6", lw=1, alpha=0.8, label="Linear Regression")
    ax.plot(dates, rf_pred, color="#10b981", lw=1, alpha=0.8, label="Random Forest")
    ax.set_title("Test-set: actual vs predicted daily demand", fontsize=12)
    ax.set_xlabel("Date")
    ax.set_ylabel("Units sold")
    ax.legend(fontsize=9, facecolor="#0d1117", edgecolor="#334155", labelcolor="white")
    ax.grid(True, alpha=0.1)
    return _save_current_figure(out_dir, "forecast.png")


def _plot_residuals(y_true, panels, out_dir):
    """Scatter residuals against predictions, one panel per model.

    ``panels`` is a sequence of ``(predictions, model_name, color, metrics)``.
    """
    _, axes = plt.subplots(1, 2, figsize=(13, 4.5))
    for ax, (pred, name, color, metrics) in zip(axes, panels):
        ax.scatter(pred, y_true - pred, s=12, c=color, alpha=0.6)
        ax.axhline(0, color="white", lw=0.8)
        ax.set_title(f"{name} residuals (RMSE={metrics['rmse']:.2f})", fontsize=11)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Residual")
        ax.grid(True, alpha=0.1)
    return _save_current_figure(out_dir, "residuals.png")


def _plot_feature_importance(importance, out_dir):
    """Draw a horizontal bar chart of an ascending-sorted importance Series."""
    _, ax = plt.subplots(figsize=(8, 4.5))
    ax.barh(importance.index, importance.values, color="#10b981")
    ax.set_title("Random Forest — feature importance", fontsize=12)
    ax.set_xlabel("Importance")
    ax.grid(True, axis="x", alpha=0.1)
    return _save_current_figure(out_dir, "feature_importance.png")


def main():
    """Train both models on the CSV sales history, print metrics, save plots.

    The feature table is split chronologically (first 80% of rows for
    training, the rest for testing). Three PNG files are written to an
    ``output`` directory under the current working directory, which is
    created if it does not exist yet.

    Raises:
        ValueError: If feature engineering leaves too few rows for a
            non-empty training set.
    """
    print("=" * 70)
    print("EcoCart Demand Forecasting — LR vs Random Forest")
    print("=" * 70)

    df = add_features(load_sales())
    split = int(len(df) * 0.8)
    if split == 0:
        raise ValueError(
            f"Need at least 2 rows after feature engineering, got {len(df)}"
        )
    train, test = df.iloc[:split], df.iloc[split:]
    X_tr, y_tr = train[FEATURES], train["sales"]
    X_te, y_te = test[FEATURES], test["sales"]
    print(f"Train: {len(train)} days Test: {len(test)} days")

    lr = LinearRegression().fit(X_tr, y_tr)
    rf = RandomForestRegressor(
        n_estimators=200,
        max_depth=12,
        min_samples_leaf=3,
        random_state=42,
        n_jobs=-1,
    ).fit(X_tr, y_tr)
    lr_pred = lr.predict(X_te)
    rf_pred = rf.predict(X_te)

    print("\nTest-set metrics:")
    lr_m = evaluate("Linear Regression", y_te.values, lr_pred)
    rf_m = evaluate("Random Forest", y_te.values, rf_pred)

    # ── Plots ──
    # savefig does not create missing directories, so make sure it exists.
    out_dir = "output"
    os.makedirs(out_dir, exist_ok=True)

    plt.rcParams.update({
        "axes.facecolor": "#0d1117",
        "figure.facecolor": "#0d1117",
        "text.color": "white",
        "axes.labelcolor": "white",
        "xtick.color": "white",
        "ytick.color": "white",
    })

    imp = pd.Series(rf.feature_importances_, index=FEATURES).sort_values()
    written = [
        _plot_forecast(test["date"], y_te, lr_pred, rf_pred, out_dir),
        _plot_residuals(
            y_te.values,
            [
                (lr_pred, "Linear Regression", "#3b82f6", lr_m),
                (rf_pred, "Random Forest", "#10b981", rf_m),
            ],
            out_dir,
        ),
        _plot_feature_importance(imp, out_dir),
    ]

    # imp is sorted ascending, so the last three entries are the strongest.
    print(f"\nTop features: {', '.join(imp.index[-3:][::-1])}")
    print("Wrote: " + ", ".join(written))


if __name__ == "__main__":
    main()