"""
EcoCart Demand Forecasting Prototype
Task 5 — Linear Regression vs Random Forest on synthetic daily sales.
Run: python3 task5_forecasting.py
Out: output/forecast.png, output/residuals.png, output/feature_importance.png
"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Shared random generator for the synthetic-data helper; the fixed seed makes
# a fresh run of the script draw the same sequence every time.
RNG = np.random.default_rng(seed=42)

# The CSV lives in a "data" folder next to this file, so it is located from
# the script's own directory rather than the current working directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
CSV_PATH = os.path.join(_SCRIPT_DIR, "data", "sales_history.csv")
def load_sales():
    """Read the daily sales history CSV at ``CSV_PATH`` into a DataFrame.

    The ``date`` column is parsed into datetimes while reading.
    """
    history = pd.read_csv(CSV_PATH, parse_dates=["date"])
    return history
# ── 1. Synthetic sales data ────────────────────────────────
def generate_sales(days=730, rng=None):
    """Build a synthetic daily sales history.

    The series is a linear trend plus weekly and yearly sinusoids, Gaussian
    noise and occasional promotional uplifts, clipped so sales never go
    below zero.

    Args:
        days: Number of consecutive days to generate, starting 2023-01-01.
        rng: Optional ``numpy.random.Generator``. When omitted the
            module-level ``RNG`` is used, so existing callers are unaffected;
            pass a freshly seeded generator to get output that does not
            depend on how many draws were made earlier in the process.

    Returns:
        DataFrame with columns ``date``, ``sales``, ``dow``, ``month``,
        ``day_of_year`` and ``is_promo`` (1 on promo days, 0 otherwise).
    """
    rng = RNG if rng is None else rng

    t = np.arange(days)
    dates = pd.date_range("2023-01-01", periods=days, freq="D")

    base = 100 + 0.05 * t
    weekly = 25 * np.sin(2 * np.pi * t / 7)
    yearly = 40 * np.sin(2 * np.pi * t / 365)
    noise = rng.normal(0, 8, days)

    # Roughly 6% of days carry a promotion.
    n_promo = int(days * 0.06)
    # Uplift sizes are drawn before the promo days are chosen; keep this
    # order so that a given seed keeps producing the same data.
    uplift = rng.uniform(30, 70, n_promo)
    promo_days = rng.choice(days, n_promo, replace=False)
    promo = np.zeros(days)
    promo[promo_days] = uplift

    sales = np.clip(base + weekly + yearly + noise + promo, 0, None)
    return pd.DataFrame({
        "date": dates,
        "sales": sales,
        "dow": dates.dayofweek,
        "month": dates.month,
        "day_of_year": dates.dayofyear,
        "is_promo": (promo > 0).astype(int),
    })
# ── 2. Features ────────────────────────────────────────────
def add_features(df):
    """Return a copy of ``df`` extended with lag and rolling-mean columns.

    Adds ``lag_1``, ``lag_7`` and ``lag_14`` (sales shifted by that many
    rows) plus ``roll_7`` and ``roll_30`` (mean of the preceding 7 / 30 rows,
    excluding the current one). Rows containing any missing value are
    dropped and the index is renumbered from zero. The input frame is not
    modified.
    """
    sales = df["sales"]
    # Shift by one row first so a rolling window never includes the value
    # it is meant to help predict.
    previous = sales.shift(1)

    engineered = {f"lag_{k}": sales.shift(k) for k in (1, 7, 14)}
    engineered["roll_7"] = previous.rolling(7).mean()
    engineered["roll_30"] = previous.rolling(30).mean()

    enriched = df.assign(**engineered)
    complete_rows = enriched.dropna()
    return complete_rows.reset_index(drop=True)
# Model input columns, in the order they are fed to the estimators:
# calendar fields and the promo flag first, then the lag / rolling-mean
# columns created by add_features().
_CALENDAR_FEATURES = ["dow", "month", "day_of_year", "is_promo"]
_HISTORY_FEATURES = ["lag_1", "lag_7", "lag_14", "roll_7", "roll_30"]
FEATURES = _CALENDAR_FEATURES + _HISTORY_FEATURES
# ── 3. Train & evaluate ───────────────────────────────────
def evaluate(name, y_true, y_pred):
    """Print and return error metrics for one model's predictions.

    Args:
        name: Label printed at the start of the metrics line.
        y_true: Observed target values (any 1-D array-like).
        y_pred: Predicted values, positionally aligned with ``y_true``.

    Returns:
        dict with keys ``mae``, ``rmse``, ``r2`` and ``mape`` (MAPE is
        expressed in percent).
    """
    # Work on plain float arrays so the arithmetic below is always
    # positional, even if pandas Series with differing indexes are passed.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    r2 = r2_score(y_true, y_pred)

    # A zero actual would divide by zero; use 1 as the denominator there, so
    # those rows contribute their absolute error instead of inf/nan.
    denom = np.where(y_true == 0, 1, y_true)
    mape = np.mean(np.abs((y_true - y_pred) / denom)) * 100

    print(f" {name:<22s} MAE={mae:6.2f} RMSE={rmse:6.2f} R²={r2:.3f} MAPE={mape:.2f}%")
    return {"mae": mae, "rmse": rmse, "r2": r2, "mape": mape}
def _plot_forecast(dates, actual, lr_pred, rf_pred, path):
    """Save a line chart of actual vs predicted demand to ``path``."""
    fig, ax = plt.subplots(figsize=(13, 5))
    ax.plot(dates, actual, color="#e2e8f0", lw=1.3, label="Actual")
    ax.plot(dates, lr_pred, color="#3b82f6", lw=1, alpha=0.8, label="Linear Regression")
    ax.plot(dates, rf_pred, color="#10b981", lw=1, alpha=0.8, label="Random Forest")
    ax.set_title("Test-set: actual vs predicted daily demand", fontsize=12)
    ax.set_xlabel("Date")
    ax.set_ylabel("Units sold")
    ax.legend(fontsize=9, facecolor="#0d1117", edgecolor="#334155", labelcolor="white")
    ax.grid(True, alpha=0.1)
    fig.tight_layout()
    fig.savefig(path, dpi=150, bbox_inches="tight")
    plt.close(fig)


def _plot_residuals(actual, panels, path):
    """Save side-by-side residual scatter plots, one per (name, pred, color, rmse)."""
    fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))
    for ax, (name, pred, color, rmse) in zip(axes, panels):
        ax.scatter(pred, actual - pred, s=12, c=color, alpha=0.6)
        ax.axhline(0, color="white", lw=0.8)
        ax.set_title(f"{name} residuals (RMSE={rmse:.2f})", fontsize=11)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Residual")
        ax.grid(True, alpha=0.1)
    fig.tight_layout()
    fig.savefig(path, dpi=150, bbox_inches="tight")
    plt.close(fig)


def _plot_feature_importance(importance, path):
    """Save a horizontal bar chart of an ascending-sorted importance Series."""
    fig, ax = plt.subplots(figsize=(8, 4.5))
    ax.barh(importance.index, importance.values, color="#10b981")
    ax.set_title("Random Forest — feature importance", fontsize=12)
    ax.set_xlabel("Importance")
    ax.grid(True, axis="x", alpha=0.1)
    fig.tight_layout()
    fig.savefig(path, dpi=150, bbox_inches="tight")
    plt.close(fig)


def main():
    """Train both models on the sales history, report metrics and save plots.

    Loads the CSV, adds lag/rolling features, holds out the last 20% of rows
    as the test set, fits a linear regression and a random forest, prints
    their test-set metrics and writes three PNG charts into ``output/``
    (created under the current working directory if missing).

    Raises:
        ValueError: If too few rows remain after feature engineering to form
            both a training and a test set.
    """
    print("=" * 70)
    print("EcoCart Demand Forecasting — LR vs Random Forest")
    print("=" * 70)

    df = add_features(load_sales())

    # Positional 80/20 split: the first rows train, the last rows test.
    # This assumes the CSV rows are already in date order.
    split = int(len(df) * 0.8)
    if split == 0:
        raise ValueError(
            f"Not enough rows to split into train and test sets "
            f"({len(df)} left after feature engineering)"
        )
    train, test = df.iloc[:split], df.iloc[split:]
    X_tr, y_tr = train[FEATURES], train["sales"]
    X_te, y_te = test[FEATURES], test["sales"]
    print(f"Train: {len(train)} days Test: {len(test)} days")

    lr = LinearRegression().fit(X_tr, y_tr)
    rf = RandomForestRegressor(n_estimators=200, max_depth=12,
                               min_samples_leaf=3, random_state=42,
                               n_jobs=-1).fit(X_tr, y_tr)
    lr_pred = lr.predict(X_te)
    rf_pred = rf.predict(X_te)

    print("\nTest-set metrics:")
    lr_m = evaluate("Linear Regression", y_te.values, lr_pred)
    rf_m = evaluate("Random Forest", y_te.values, rf_pred)

    # ── Plots ──
    # savefig does not create missing directories, so make sure it exists.
    out_dir = "output"
    os.makedirs(out_dir, exist_ok=True)
    forecast_path = os.path.join(out_dir, "forecast.png")
    residuals_path = os.path.join(out_dir, "residuals.png")
    importance_path = os.path.join(out_dir, "feature_importance.png")

    imp = pd.Series(rf.feature_importances_, index=FEATURES).sort_values()

    dark_theme = {
        "axes.facecolor": "#0d1117", "figure.facecolor": "#0d1117",
        "text.color": "white", "axes.labelcolor": "white",
        "xtick.color": "white", "ytick.color": "white",
    }
    # Scope the theme to these figures instead of changing global rcParams.
    with plt.rc_context(dark_theme):
        _plot_forecast(test["date"], y_te, lr_pred, rf_pred, forecast_path)
        _plot_residuals(
            y_te.values,
            [
                ("Linear Regression", lr_pred, "#3b82f6", lr_m["rmse"]),
                ("Random Forest", rf_pred, "#10b981", rf_m["rmse"]),
            ],
            residuals_path,
        )
        _plot_feature_importance(imp, importance_path)

    # imp is sorted ascending, so the last three entries are the top three.
    print(f"\nTop features: {', '.join(imp.index[-3:][::-1])}")
    print(f"Wrote: {', '.join([forecast_path, residuals_path, importance_path])}")
# Run the pipeline only when this file is executed as a script, so that
# importing it does not trigger training or plotting.
if __name__ == "__main__":
    main()
|