Spaces:

bitsofchris
/

time-series-ai-weather-forecast

Running

bitsofchris Claude Opus 4.7 (1M context) committed 12 days ago

Commit

d6d17d4

1 Parent(s): 2a2a12d

Scoreboard: per-metric MAE table across 1 h / 3 h / 12 h lookaheads

Latest-pre-target scoring was just measuring 5-15-min-ahead predictions
(since autorefresh runs every 15 min, the 'most recent before target'
forecast is always near-term). That's a trivial test of forecast skill.

Replace the single MAE line per metric with a table of MAE at three
fixed lookaheads — 1 h, 3 h, 12 h — where each lookahead picks the
forecast whose forecast_made_at is closest to (target − N h). Now the
scoreboard tells the actual story of model degradation with horizon.

- forecast_log.scoreboard_at_lag(metric, lag_hours, window_hours) —
  ROW_NUMBER over the absolute distance of forecast_made_at to
  (target − lag), then JOIN with actuals.
- forecast_log.residuals(..., lag_hours=3.0) — residual chart is now
  pinned to the middle row of the table so the picture matches a
  headline number.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show

app.py +43 -25
src/forecast_log.py +54 -14
src/weather_ui.py +1 -1

app.py CHANGED Viewed

@@ -281,8 +281,9 @@ def refresh():
         comparison_md = ""
     scoreboard = render_scoreboard(log_conn)
-    # Residual chart — same picks the scoreboard MAE uses, over the last 48h.
-    resid_df = forecast_log.residuals(log_conn, metric="temp_f", window_hours=48)
     resid_fig = residual_figure(resid_df) if not resid_df.empty else None
     persist.push_db_async()
@@ -290,31 +291,48 @@ def refresh():
 # --- scoreboard ----------------------------------------------------------
 def render_scoreboard(conn) -> str:
-    lines = ["### 📊 Forecast scoreboard (rolling 48h MAE — lower is better)"]
     any_data = False
-    for metric, label, unit in [
-        ("temp_f", "Temperature", "°F"),
-        ("humidity", "Humidity", "%"),
-        ("pressure_inhg", "Pressure", "inHg"),
-    ]:
-        summ = forecast_log.scoreboard_summary(conn, metric=metric, window_hours=48)
-        if summ.empty:
-            continue
-        any_data = True
-        by = {row["source"]: row for _, row in summ.iterrows()}
-        toto = by.get("toto")
-        nws_row = by.get("nws")
-        parts = [f"**{label}**"]
-        if toto is not None:
-            parts.append(f"Toto **{toto['mae']:.2f} {unit}** _(n={int(toto['n'])})_")
-        if nws_row is not None:
-            parts.append(f"NWS **{nws_row['mae']:.2f} {unit}** _(n={int(nws_row['n'])})_")
-        if toto is not None and nws_row is not None:
-            diff = toto["mae"] - nws_row["mae"]
-            winner = "🤖 Toto" if diff < 0 else "🌎 NWS"
-            parts.append(f"→ **{winner}** wins by {abs(diff):.2f} {unit}")
-        lines.append(" · ".join(parts))
     if not any_data:
         lines.append(
             "_No scored forecasts yet. The scoreboard fills in once forecasts have target hours that have already passed and matching Ecowitt actuals — typically within an hour or two of running._"

         comparison_md = ""
     scoreboard = render_scoreboard(log_conn)
+    # Residual chart — pinned to 3 h-ahead (the middle row of the scoreboard
+    # table) so the picture matches one of the headline numbers.
+    resid_df = forecast_log.residuals(log_conn, metric="temp_f", window_hours=48, lag_hours=3.0)
     resid_fig = residual_figure(resid_df) if not resid_df.empty else None
     persist.push_db_async()
 # --- scoreboard ----------------------------------------------------------
+SCOREBOARD_HORIZONS_H = [1, 3, 12]
+SCOREBOARD_METRICS = [
+    ("temp_f", "Temperature", "°F"),
+    ("humidity", "Humidity", "%"),
+    ("pressure_inhg", "Pressure", "inHg"),
+]
 def render_scoreboard(conn) -> str:
+    """Per-metric, per-lookahead MAE table.
+    Rows = forecast lookahead (1h / 3h / 12h). Cols = Toto MAE, NWS MAE,
+    delta. Pressure has no NWS forecast so its column is dashed."""
+    lines = ["### 📊 Forecast scoreboard (rolling 48 h MAE — lower is better)"]
     any_data = False
+    for metric, label, unit in SCOREBOARD_METRICS:
+        rows: list[str] = []
+        for lag_h in SCOREBOARD_HORIZONS_H:
+            df = forecast_log.scoreboard_at_lag(
+                conn, metric=metric, lag_hours=lag_h, window_hours=48,
+            )
+            if df.empty:
+                continue
+            any_data = True
+            by = {r["source"]: r for _, r in df.iterrows()}
+            toto = by.get("toto")
+            nws_row = by.get("nws")
+            t_cell = f"**{toto['mae']:.2f} {unit}** _(n={int(toto['n'])})_" if toto is not None else "—"
+            n_cell = f"**{nws_row['mae']:.2f} {unit}** _(n={int(nws_row['n'])})_" if nws_row is not None else "—"
+            if toto is not None and nws_row is not None:
+                diff = toto["mae"] - nws_row["mae"]
+                winner = "🤖 Toto" if diff < 0 else "🌎 NWS"
+                d_cell = f"**{winner}** by {abs(diff):.2f} {unit}"
+            elif toto is not None:
+                d_cell = "—"
+            else:
+                d_cell = "—"
+            rows.append(f"| **{lag_h} h-ahead** | {t_cell} | {n_cell} | {d_cell} |")
+        if rows:
+            lines.append(f"**{label}**")
+            lines.append("| Lookahead | 🤖 Toto MAE | 🌎 NWS MAE | Δ |\n|---|---|---|---|")
+            lines.extend(rows)
     if not any_data:
         lines.append(
             "_No scored forecasts yet. The scoreboard fills in once forecasts have target hours that have already passed and matching Ecowitt actuals — typically within an hour or two of running._"

src/forecast_log.py CHANGED Viewed

@@ -281,32 +281,30 @@ def residuals(
     conn: sqlite3.Connection,
     metric: str,
     window_hours: int = 48,
 ) -> pd.DataFrame:
     """For each hourly target_ts in the last `window_hours`, return each
-    model's prediction and the Ecowitt actual side-by-side, plus signed
-    residuals (prediction − actual).
-    Uses the SAME 'latest forecast issued before the target hour' rule the
-    scoreboard MAE uses, so the time-series residuals add up to the
-    aggregate number on the scoreboard.
     """
     import time as _time  # noqa: PLC0415
     now = int(_time.time())
     cutoff = now - window_hours * 3600
     sql = """
-    WITH latest AS (
-        SELECT source, target_ts, MAX(forecast_made_at) AS forecast_made_at
         FROM forecast_snapshots
         WHERE metric = ?
           AND forecast_made_at <= target_ts
           AND target_ts BETWEEN ? AND ?
-        GROUP BY source, target_ts
     ),
     picked AS (
-        SELECT f.source, f.target_ts, f.p50
-        FROM forecast_snapshots f
-        JOIN latest l USING (source, target_ts, forecast_made_at)
-        WHERE f.metric = ?
     )
     SELECT a.target_ts,
            MAX(CASE WHEN p.source='toto' THEN p.p50 END) AS toto_p50,
@@ -319,7 +317,7 @@ def residuals(
     GROUP BY a.target_ts
     ORDER BY a.target_ts
     """
-    params = [metric, cutoff, now, metric, metric, cutoff, now]
     df = pd.read_sql_query(sql, conn, params=params)
     if df.empty:
         return df
@@ -343,3 +341,45 @@ def scoreboard_summary(
         .agg(n="count", mae="mean")
         .reset_index()
     )

     conn: sqlite3.Connection,
     metric: str,
     window_hours: int = 48,
+    lag_hours: float = 3.0,
 ) -> pd.DataFrame:
     """For each hourly target_ts in the last `window_hours`, return each
+    model's prediction (picked at a fixed lookahead) and the Ecowitt
+    actual side-by-side, plus signed residuals (prediction − actual).
     """
     import time as _time  # noqa: PLC0415
     now = int(_time.time())
     cutoff = now - window_hours * 3600
+    lag_seconds = int(lag_hours * 3600)
     sql = """
+    WITH ranked AS (
+        SELECT source, target_ts, p50,
+               ROW_NUMBER() OVER (
+                   PARTITION BY source, target_ts
+                   ORDER BY ABS(forecast_made_at - (target_ts - ?))
+               ) AS rk
         FROM forecast_snapshots
         WHERE metric = ?
           AND forecast_made_at <= target_ts
           AND target_ts BETWEEN ? AND ?
     ),
     picked AS (
+        SELECT source, target_ts, p50 FROM ranked WHERE rk = 1
     )
     SELECT a.target_ts,
            MAX(CASE WHEN p.source='toto' THEN p.p50 END) AS toto_p50,
     GROUP BY a.target_ts
     ORDER BY a.target_ts
     """
+    params = [lag_seconds, metric, cutoff, now, metric, cutoff, now]
     df = pd.read_sql_query(sql, conn, params=params)
     if df.empty:
         return df
         .agg(n="count", mae="mean")
         .reset_index()
     )
+def scoreboard_at_lag(
+    conn: sqlite3.Connection,
+    metric: str,
+    lag_hours: float,
+    window_hours: int = 48,
+) -> pd.DataFrame:
+    """Per-source MAE at a specific forecast lookahead.
+    For each past target hour in the window, pick each source's forecast
+    whose `forecast_made_at` is closest to `target_ts - lag_hours`. With
+    autorefresh ticking every 15 min that picker selects a forecast within
+    ~7-8 min of the requested lag, so the MAE genuinely reflects the
+    'how good was the N-hours-ahead prediction?' question.
+    """
+    import time as _time  # noqa: PLC0415
+    lag_seconds = int(lag_hours * 3600)
+    now = int(_time.time())
+    cutoff = now - window_hours * 3600
+    sql = """
+    WITH ranked AS (
+        SELECT source, target_ts, forecast_made_at, p50,
+               ROW_NUMBER() OVER (
+                   PARTITION BY source, target_ts
+                   ORDER BY ABS(forecast_made_at - (target_ts - ?))
+               ) AS rk
+        FROM forecast_snapshots
+        WHERE metric = ?
+          AND forecast_made_at <= target_ts
+          AND target_ts BETWEEN ? AND ?
+    )
+    SELECT r.source,
+           COUNT(*) AS n,
+           AVG(ABS(r.p50 - a.value)) AS mae
+    FROM ranked r
+    JOIN actuals a USING (target_ts)
+    WHERE r.rk = 1 AND a.metric = ?
+    GROUP BY r.source
+    """
+    df = pd.read_sql_query(sql, conn, params=[lag_seconds, metric, cutoff, now, metric])
+    return df

src/weather_ui.py CHANGED Viewed

@@ -183,7 +183,7 @@ def emoji_strip_markdown(nws_df: pd.DataFrame, tz: str, n: int = 12) -> str:
 def residual_figure(
     df: pd.DataFrame,
-    title: str = "Forecast residual — prediction minus Ecowitt actual, last 48h (°F)",
 ) -> go.Figure:
     """Plot signed residuals over time for Toto and NWS. Zero is perfect."""
     fig = go.Figure()

 def residual_figure(
     df: pd.DataFrame,
+    title: str = "Forecast residual — 3 h-ahead prediction minus Ecowitt actual, last 48 h (°F)",
 ) -> go.Figure:
     """Plot signed residuals over time for Toto and NWS. Zero is perfect."""
     fig = go.Figure()