Spaces:

Esvanth
/

EcoCartAI

Sleeping

File size: 5,742 Bytes

"""
EcoCart Demand Forecasting Prototype
Task 5 — Linear Regression vs Random Forest on synthetic daily sales.

Run:  python3 task5_forecasting.py
Out:  output/forecast.png, output/residuals.png, output/feature_importance.png
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Shared, seeded generator so synthetic data is reproducible from a fresh start.
RNG = np.random.default_rng(seed=42)

# The CSV is resolved relative to this file, not the current working directory.
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
CSV_PATH = os.path.join(_MODULE_DIR, "data", "sales_history.csv")


def load_sales(path=None):
    """Load the daily sales history into a DataFrame.

    Args:
        path: CSV file path or file-like object to read. When omitted, the
            module-level ``CSV_PATH`` (``data/sales_history.csv`` next to
            this script) is used, which is the original behaviour.

    Returns:
        pandas.DataFrame with the ``date`` column parsed as datetimes; all
        other columns are typed by ``pandas.read_csv``.
    """
    # CSV_PATH is only looked up when no explicit source is supplied.
    source = CSV_PATH if path is None else path
    return pd.read_csv(source, parse_dates=["date"])


# ── 1. Synthetic sales data ────────────────────────────────
def generate_sales(days=730):
    """Build a synthetic daily sales table starting on 2023-01-01.

    Sales are the sum of a linear trend (100 + 0.05 per day), a 7-day
    sinusoid (amplitude 25), a 365-day sinusoid (amplitude 40), Gaussian
    noise (sd 8) and promotion uplifts of 30-70 units on a random 6% of
    days, floored at zero.

    Random draws come from the module-level ``RNG``, so successive calls in
    one process return different data.

    Args:
        days: Number of consecutive days to generate.

    Returns:
        pandas.DataFrame with columns ``date``, ``sales``, ``dow``,
        ``month``, ``day_of_year`` and ``is_promo`` (1 on promotion days).
    """
    day_index = np.arange(days)
    calendar = pd.date_range("2023-01-01", periods=days, freq="D")

    trend = 100 + 0.05 * day_index
    weekly_cycle = 25 * np.sin(2 * np.pi * day_index / 7)
    annual_cycle = 40 * np.sin(2 * np.pi * day_index / 365)
    jitter = RNG.normal(0, 8, days)

    promo_count = int(days * 0.06)
    # Uplift sizes are drawn BEFORE the promo days are chosen: the original
    # subscript assignment evaluated its right-hand side first, and keeping
    # that order keeps the RNG stream (and so the data) identical.
    uplift_sizes = RNG.uniform(30, 70, promo_count)
    promo_days = RNG.choice(days, promo_count, replace=False)
    uplift = np.zeros(days)
    uplift[promo_days] = uplift_sizes

    # Noise can push a day below zero; negative sales are floored at 0.
    demand = np.clip(trend + weekly_cycle + annual_cycle + jitter + uplift, 0, None)

    columns = {
        "date": calendar,
        "sales": demand,
        "dow": calendar.dayofweek,
        "month": calendar.month,
        "day_of_year": calendar.dayofyear,
        "is_promo": (uplift > 0).astype(int),
    }
    return pd.DataFrame(columns)

# ── 2. Features ────────────────────────────────────────────
def add_features(df):
    """Return a copy of ``df`` extended with lag and rolling-mean features.

    Adds ``lag_1``, ``lag_7`` and ``lag_14`` (sales shifted by that many
    rows) plus ``roll_7`` and ``roll_30`` (means of the previous 7 / 30
    rows). Rows containing any missing value are then dropped and the index
    is renumbered from zero. The input frame is not modified.

    Args:
        df: DataFrame with a ``sales`` column, one row per day.

    Returns:
        New DataFrame with the five feature columns appended.
    """
    sales = df["sales"]
    lag_columns = {f"lag_{step}": sales.shift(step) for step in (1, 7, 14)}

    # Rolling windows are taken over yesterday's series so the current day's
    # sales never contribute to their own features.
    prior_sales = sales.shift(1)
    rolling_columns = {
        f"roll_{window}": prior_sales.rolling(window).mean()
        for window in (7, 30)
    }

    enriched = df.assign(**lag_columns, **rolling_columns)
    return enriched.dropna().reset_index(drop=True)


# Model input columns, in the order they are selected for both regressors.
FEATURES = [
    # calendar and promotion indicators
    "dow",
    "month",
    "day_of_year",
    "is_promo",
    # lagged sales
    "lag_1",
    "lag_7",
    "lag_14",
    # trailing means
    "roll_7",
    "roll_30",
]


# ── 3. Train & evaluate ───────────────────────────────────
def evaluate(name, y_true, y_pred):
    """Print one row of error metrics for a model and return them.

    Args:
        name: Label printed at the start of the row.
        y_true: Observed values (``main`` passes a NumPy array).
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        Dict with keys ``mae``, ``rmse``, ``r2`` and ``mape``; MAPE is
        expressed in percent.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    r2 = r2_score(y_true, y_pred)

    # Zero actuals are divided by 1 instead, so the percentage error stays finite.
    denominators = np.where(y_true == 0, 1, y_true)
    relative_errors = np.abs((y_true - y_pred) / denominators)
    mape = np.mean(relative_errors) * 100

    print(f"  {name:<22s}  MAE={mae:6.2f}  RMSE={rmse:6.2f}  R²={r2:.3f}  MAPE={mape:.2f}%")
    return dict(mae=mae, rmse=rmse, r2=r2, mape=mape)


def main():
    """Train both forecasters, print test metrics and save three charts.

    Steps:
      1. Load the sales CSV with ``load_sales`` and add lag/rolling features.
      2. Split by row position: first 80% for training, last 20% for testing.
      3. Fit a linear regression and a random forest on ``FEATURES``.
      4. Print MAE / RMSE / R² / MAPE for each model on the test rows.
      5. Write ``forecast.png``, ``residuals.png`` and
         ``feature_importance.png`` into ``output/`` (relative to the current
         working directory), creating that directory when it is missing.

    Side effects: prints to stdout, updates global matplotlib ``rcParams``
    and writes PNG files.
    """
    print("="*70)
    print("EcoCart Demand Forecasting — LR vs Random Forest")
    print("="*70)

    df = load_sales()
    df = add_features(df)
    # Positional split with no shuffling: this is a chronological hold-out
    # only if the CSV rows are in date order.
    split = int(len(df) * 0.8)
    train, test = df.iloc[:split], df.iloc[split:]
    X_tr, y_tr = train[FEATURES], train["sales"]
    X_te, y_te = test[FEATURES],  test["sales"]
    print(f"Train: {len(train)} days  Test: {len(test)} days")

    lr = LinearRegression().fit(X_tr, y_tr)
    rf = RandomForestRegressor(n_estimators=200, max_depth=12,
                               min_samples_leaf=3, random_state=42,
                               n_jobs=-1).fit(X_tr, y_tr)
    lr_pred = lr.predict(X_te)
    rf_pred = rf.predict(X_te)

    print("\nTest-set metrics:")
    lr_m = evaluate("Linear Regression", y_te.values, lr_pred)
    rf_m = evaluate("Random Forest",     y_te.values, rf_pred)

    # ── Plots ──
    # savefig does not create missing directories, so make sure the target
    # folder exists before any chart is written.
    out_dir = "output"
    os.makedirs(out_dir, exist_ok=True)
    forecast_path = os.path.join(out_dir, "forecast.png")
    residuals_path = os.path.join(out_dir, "residuals.png")
    importance_path = os.path.join(out_dir, "feature_importance.png")

    # Dark theme applied to every figure below.
    plt.rcParams.update({"axes.facecolor":"#0d1117","figure.facecolor":"#0d1117",
                         "text.color":"white","axes.labelcolor":"white",
                         "xtick.color":"white","ytick.color":"white"})

    # Forecast: actual vs both models over the test period.
    fig, ax = plt.subplots(figsize=(13, 5))
    ax.plot(test["date"], y_te, color="#e2e8f0", lw=1.3, label="Actual")
    ax.plot(test["date"], lr_pred, color="#3b82f6", lw=1, alpha=0.8, label="Linear Regression")
    ax.plot(test["date"], rf_pred, color="#10b981", lw=1, alpha=0.8, label="Random Forest")
    ax.set_title("Test-set: actual vs predicted daily demand", fontsize=12)
    ax.set_xlabel("Date"); ax.set_ylabel("Units sold")
    ax.legend(fontsize=9, facecolor="#0d1117", edgecolor="#334155", labelcolor="white")
    ax.grid(True, alpha=0.1)
    plt.tight_layout()
    plt.savefig(forecast_path, dpi=150, bbox_inches="tight")
    plt.close(fig)

    # Residuals: one scatter panel per model (actual minus predicted).
    fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))
    for ax, pred, name, color, m in [
        (axes[0], lr_pred, "Linear Regression", "#3b82f6", lr_m),
        (axes[1], rf_pred, "Random Forest",     "#10b981", rf_m),
    ]:
        ax.scatter(pred, y_te.values - pred, s=12, c=color, alpha=0.6)
        ax.axhline(0, color="white", lw=0.8)
        ax.set_title(f"{name} residuals (RMSE={m['rmse']:.2f})", fontsize=11)
        ax.set_xlabel("Predicted"); ax.set_ylabel("Residual")
        ax.grid(True, alpha=0.1)
    plt.tight_layout()
    plt.savefig(residuals_path, dpi=150, bbox_inches="tight")
    plt.close(fig)

    # Feature importance: sorted ascending so the largest bar is drawn on top.
    imp = pd.Series(rf.feature_importances_, index=FEATURES).sort_values()
    fig, ax = plt.subplots(figsize=(8, 4.5))
    ax.barh(imp.index, imp.values, color="#10b981")
    ax.set_title("Random Forest — feature importance", fontsize=12)
    ax.set_xlabel("Importance")
    ax.grid(True, axis="x", alpha=0.1)
    plt.tight_layout()
    plt.savefig(importance_path, dpi=150, bbox_inches="tight")
    plt.close(fig)

    # imp is ascending, so the last three entries reversed are the top three.
    print(f"\nTop features: {', '.join(imp.index[-3:][::-1])}")
    print(f"Wrote: {forecast_path}, {residuals_path}, {importance_path}")

# Run the full pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()