| """ |
| EcoCart Demand Forecasting Prototype |
| Task 5 — Linear Regression vs Random Forest on synthetic daily sales. |
| |
| Run: python3 task5_forecasting.py |
| Out: output/forecast.png, output/residuals.png, output/feature_importance.png |
| """ |
|
|
| import os |
| import numpy as np |
| import pandas as pd |
| import matplotlib.pyplot as plt |
| from sklearn.linear_model import LinearRegression |
| from sklearn.ensemble import RandomForestRegressor |
| from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score |
# Module-wide random generator with a fixed seed (42) so that synthetic data
# produced by generate_sales() is reproducible from run to run.
RNG = np.random.default_rng(42)

# Absolute path of the historical sales CSV, resolved relative to this file
# (<script dir>/data/sales_history.csv) rather than the current working directory.
CSV_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "data", "sales_history.csv"
)
|
|
|
|
def load_sales(path=None):
    """Load the daily sales history as a DataFrame ordered by date.

    Args:
        path: Optional path (or file-like object accepted by
            ``pandas.read_csv``) of the CSV to read. When omitted, the
            module-level ``CSV_PATH`` (data/sales_history.csv) is used.

    Returns:
        DataFrame with the CSV's columns, the ``date`` column parsed as
        datetimes, rows sorted chronologically and a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        ValueError: If the CSV has no ``date`` column.
    """
    # Resolve the default lazily so the function can be called with an
    # explicit source without touching the module constant.
    source = CSV_PATH if path is None else path
    frame = pd.read_csv(source, parse_dates=["date"])
    # Lag/rolling features and the chronological train/test split downstream
    # rely on row order, so enforce date order instead of trusting the file.
    return frame.sort_values("date", kind="stable").reset_index(drop=True)
|
|
|
|
| |
def generate_sales(days=730, rng=None):
    """Generate a synthetic daily sales series starting on 2023-01-01.

    Sales are the sum of a linear trend, a 7-day and a 365-day sinusoid,
    Gaussian noise and occasional promotion uplifts, clipped at zero.

    Args:
        days: Number of consecutive days to generate.
        rng: Optional ``numpy.random.Generator`` to draw from. When omitted,
            the module-level ``RNG`` is used (and advanced).

    Returns:
        DataFrame with columns ``date``, ``sales``, ``dow`` (0 = Monday),
        ``month``, ``day_of_year`` and ``is_promo`` (1 on promotion days).
    """
    rng = RNG if rng is None else rng
    # 6% of the days (rounded down) receive a promotion uplift.
    n_promo = int(days * 0.06)

    t = np.arange(days)
    dates = pd.date_range("2023-01-01", periods=days, freq="D")

    base = 100 + 0.05 * t
    weekly = 25 * np.sin(2 * np.pi * t / 7)
    yearly = 40 * np.sin(2 * np.pi * t / 365)
    noise = rng.normal(0, 8, days)

    promo = np.zeros(days)
    # Draw the uplift sizes before the promotion days: this is the order in
    # which the original one-line assignment consumed the generator (the
    # right-hand side is evaluated first), so seeded output stays identical.
    uplift = rng.uniform(30, 70, n_promo)
    promo_days = rng.choice(days, n_promo, replace=False)
    promo[promo_days] = uplift

    # Negative demand is not meaningful, so floor the series at zero.
    sales = np.clip(base + weekly + yearly + noise + promo, 0, None)
    return pd.DataFrame({
        "date": dates,
        "sales": sales,
        "dow": dates.dayofweek,
        "month": dates.month,
        "day_of_year": dates.dayofyear,
        "is_promo": (promo > 0).astype(int),
    })
|
|
| |
def add_features(df, lags=(1, 7, 14), windows=(7, 30)):
    """Return a copy of *df* with lagged and rolling-mean sales features.

    For every ``lag`` a column ``lag_<lag>`` holds the sales value from that
    many rows earlier; for every ``window`` a column ``roll_<window>`` holds
    the mean of the preceding ``window`` rows. Rolling means are computed on
    the series shifted by one row, so a row's own sales value never leaks
    into its features.

    Args:
        df: DataFrame with a ``sales`` column, ordered chronologically.
        lags: Row offsets for the lag columns.
        windows: Window lengths for the rolling-mean columns.

    Returns:
        A new DataFrame (the input is not modified) with the feature columns
        appended. Rows containing any missing value -- including the warm-up
        rows at the start that lack enough history -- are dropped and the
        index is reset to 0..n-1.
    """
    out = df.copy()
    sales = out["sales"]

    for lag in lags:
        out[f"lag_{lag}"] = sales.shift(lag)

    # Shift once so every rolling window ends at the previous row.
    previous = sales.shift(1)
    for window in windows:
        out[f"roll_{window}"] = previous.rolling(window).mean()

    return out.dropna().reset_index(drop=True)
|
|
|
|
# Columns fed to both models. The calendar/promotion columns must already be
# present in the sales data; the lag_* and roll_* columns are created by
# add_features().
FEATURES = [
    "dow", "month", "day_of_year", "is_promo",
    "lag_1", "lag_7", "lag_14", "roll_7", "roll_30",
]
|
|
|
|
| |
def evaluate(name, y_true, y_pred):
    """Print and return regression error metrics for one model.

    Args:
        name: Label shown at the start of the printed line.
        y_true: Observed target values (any 1-D array-like).
        y_pred: Predicted values, same length as *y_true*.

    Returns:
        Dict with keys ``mae``, ``rmse``, ``r2`` and ``mape`` (MAPE is
        expressed in percent).
    """
    # Coerce to arrays so plain lists work with the element-wise MAPE maths.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    r2 = r2_score(y_true, y_pred)
    # Avoid division by zero: where the actual value is 0 the absolute error
    # is divided by 1 instead, i.e. it is counted as-is.
    denom = np.where(y_true == 0, 1, y_true)
    mape = np.mean(np.abs((y_true - y_pred) / denom)) * 100

    print(f" {name:<22s} MAE={mae:6.2f} RMSE={rmse:6.2f} R²={r2:.3f} MAPE={mape:.2f}%")
    return {"mae": mae, "rmse": rmse, "r2": r2, "mape": mape}
|
|
|
|
def _save_figure(fig, path):
    """Tighten the layout, write *fig* to *path* as a PNG and release it."""
    fig.tight_layout()
    fig.savefig(path, dpi=150, bbox_inches="tight")
    plt.close(fig)


def _plot_forecast(dates, actual, lr_pred, rf_pred, path):
    """Save a line chart of actual demand against both models' predictions."""
    fig, ax = plt.subplots(figsize=(13, 5))
    ax.plot(dates, actual, color="#e2e8f0", lw=1.3, label="Actual")
    ax.plot(dates, lr_pred, color="#3b82f6", lw=1, alpha=0.8, label="Linear Regression")
    ax.plot(dates, rf_pred, color="#10b981", lw=1, alpha=0.8, label="Random Forest")
    ax.set_title("Test-set: actual vs predicted daily demand", fontsize=12)
    ax.set_xlabel("Date")
    ax.set_ylabel("Units sold")
    ax.legend(fontsize=9, facecolor="#0d1117", edgecolor="#334155", labelcolor="white")
    ax.grid(True, alpha=0.1)
    _save_figure(fig, path)


def _plot_residuals(actual, panels, path):
    """Save side-by-side residual-vs-prediction scatter plots.

    *panels* is a sequence of ``(predictions, model name, colour, rmse)``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))
    for ax, (pred, name, color, rmse) in zip(axes, panels):
        ax.scatter(pred, actual - pred, s=12, c=color, alpha=0.6)
        ax.axhline(0, color="white", lw=0.8)
        ax.set_title(f"{name} residuals (RMSE={rmse:.2f})", fontsize=11)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Residual")
        ax.grid(True, alpha=0.1)
    _save_figure(fig, path)


def _plot_feature_importance(importance, path):
    """Save a horizontal bar chart of feature importances (a sorted Series)."""
    fig, ax = plt.subplots(figsize=(8, 4.5))
    ax.barh(importance.index, importance.values, color="#10b981")
    ax.set_title("Random Forest — feature importance", fontsize=12)
    ax.set_xlabel("Importance")
    ax.grid(True, axis="x", alpha=0.1)
    _save_figure(fig, path)


def main(out_dir="output"):
    """Train both models on the sales history, report metrics and save charts.

    The feature-engineered data is split chronologically (first 80% of rows
    for training, the rest for testing). A linear regression and a random
    forest are fitted, their test-set metrics are printed, and three PNG
    charts (forecast, residuals, feature importance) are written to
    *out_dir*.

    Args:
        out_dir: Directory for the PNG files; created if it does not exist.

    Raises:
        ValueError: If too few rows remain after feature engineering to form
            a training set.
    """
    print("=" * 70)
    print("EcoCart Demand Forecasting — LR vs Random Forest")
    print("=" * 70)

    df = add_features(load_sales())

    # Chronological split: no shuffling, so the test set is strictly later
    # than the training set.
    split = int(len(df) * 0.8)
    if split == 0:
        raise ValueError(
            "Not enough rows after feature engineering to build a train/test split"
        )
    train, test = df.iloc[:split], df.iloc[split:]
    X_tr, y_tr = train[FEATURES], train["sales"]
    X_te, y_te = test[FEATURES], test["sales"]
    print(f"Train: {len(train)} days Test: {len(test)} days")

    lr = LinearRegression().fit(X_tr, y_tr)
    rf = RandomForestRegressor(
        n_estimators=200,
        max_depth=12,
        min_samples_leaf=3,
        random_state=42,
        n_jobs=-1,
    ).fit(X_tr, y_tr)
    lr_pred = lr.predict(X_te)
    rf_pred = rf.predict(X_te)

    print("\nTest-set metrics:")
    actual = y_te.values
    lr_m = evaluate("Linear Regression", actual, lr_pred)
    rf_m = evaluate("Random Forest", actual, rf_pred)

    # Ascending sort: the most important features end up last, which puts
    # them at the top of the horizontal bar chart.
    imp = pd.Series(rf.feature_importances_, index=FEATURES).sort_values()

    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)
    forecast_path = os.path.join(out_dir, "forecast.png")
    residuals_path = os.path.join(out_dir, "residuals.png")
    importance_path = os.path.join(out_dir, "feature_importance.png")

    dark_theme = {
        "axes.facecolor": "#0d1117", "figure.facecolor": "#0d1117",
        "text.color": "white", "axes.labelcolor": "white",
        "xtick.color": "white", "ytick.color": "white",
    }
    # Scope the theme to these figures instead of changing global settings.
    with plt.rc_context(dark_theme):
        _plot_forecast(test["date"], y_te, lr_pred, rf_pred, forecast_path)
        _plot_residuals(
            actual,
            [
                (lr_pred, "Linear Regression", "#3b82f6", lr_m["rmse"]),
                (rf_pred, "Random Forest", "#10b981", rf_m["rmse"]),
            ],
            residuals_path,
        )
        _plot_feature_importance(imp, importance_path)

    print(f"\nTop features: {', '.join(imp.index[-3:][::-1])}")
    print(f"Wrote: {forecast_path}, {residuals_path}, {importance_path}")
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|