Scoreboard: per-metric MAE table across 1 h / 3 h / 12 h lookaheads
Browse files
Latest-pre-target scoring was just measuring 5-15-min-ahead predictions
(since autorefresh runs every 15 min, the 'most recent before target'
forecast is always near-term). That's a trivial test of forecast skill.
Replace the single MAE line per metric with a table of MAE at three
fixed lookaheads — 1 h, 3 h, 12 h — where each lookahead picks the
forecast whose forecast_made_at is closest to (target − N h). Now the
scoreboard tells the actual story of model degradation with horizon.
- forecast_log.scoreboard_at_lag(metric, lag_hours, window_hours) —
ROW_NUMBER over absolute distance to (target − lag), JOIN with
actuals.
- forecast_log.residuals(..., lag_hours=3.0) — residual chart is now
pinned to the middle row of the table so the picture matches a
headline number.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- app.py +43 -25
- src/forecast_log.py +54 -14
- src/weather_ui.py +1 -1
|
@@ -281,8 +281,9 @@ def refresh():
|
|
| 281 |
comparison_md = ""
|
| 282 |
scoreboard = render_scoreboard(log_conn)
|
| 283 |
|
| 284 |
-
# Residual chart —
|
| 285 |
-
|
|
|
|
| 286 |
resid_fig = residual_figure(resid_df) if not resid_df.empty else None
|
| 287 |
|
| 288 |
persist.push_db_async()
|
|
@@ -290,31 +291,48 @@ def refresh():
|
|
| 290 |
|
| 291 |
|
| 292 |
# --- scoreboard ----------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
def render_scoreboard(conn) -> str:
|
| 294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
any_data = False
|
| 296 |
-
for metric, label, unit in
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
if not any_data:
|
| 319 |
lines.append(
|
| 320 |
"_No scored forecasts yet. The scoreboard fills in once forecasts have target hours that have already passed and matching Ecowitt actuals — typically within an hour or two of running._"
|
|
|
|
| 281 |
comparison_md = ""
|
| 282 |
scoreboard = render_scoreboard(log_conn)
|
| 283 |
|
| 284 |
+
# Residual chart — pinned to 3 h-ahead (the middle row of the scoreboard
|
| 285 |
+
# table) so the picture matches one of the headline numbers.
|
| 286 |
+
resid_df = forecast_log.residuals(log_conn, metric="temp_f", window_hours=48, lag_hours=3.0)
|
| 287 |
resid_fig = residual_figure(resid_df) if not resid_df.empty else None
|
| 288 |
|
| 289 |
persist.push_db_async()
|
|
|
|
| 291 |
|
| 292 |
|
| 293 |
# --- scoreboard ----------------------------------------------------------
|
| 294 |
+
SCOREBOARD_HORIZONS_H = [1, 3, 12]
|
| 295 |
+
SCOREBOARD_METRICS = [
|
| 296 |
+
("temp_f", "Temperature", "°F"),
|
| 297 |
+
("humidity", "Humidity", "%"),
|
| 298 |
+
("pressure_inhg", "Pressure", "inHg"),
|
| 299 |
+
]
|
| 300 |
+
|
| 301 |
+
|
| 302 |
def render_scoreboard(conn) -> str:
|
| 303 |
+
"""Per-metric, per-lookahead MAE table.
|
| 304 |
+
|
| 305 |
+
Rows = forecast lookahead (1h / 3h / 12h). Cols = Toto MAE, NWS MAE,
|
| 306 |
+
delta. Pressure has no NWS forecast so its column is dashed."""
|
| 307 |
+
lines = ["### 📊 Forecast scoreboard (rolling 48 h MAE — lower is better)"]
|
| 308 |
any_data = False
|
| 309 |
+
for metric, label, unit in SCOREBOARD_METRICS:
|
| 310 |
+
rows: list[str] = []
|
| 311 |
+
for lag_h in SCOREBOARD_HORIZONS_H:
|
| 312 |
+
df = forecast_log.scoreboard_at_lag(
|
| 313 |
+
conn, metric=metric, lag_hours=lag_h, window_hours=48,
|
| 314 |
+
)
|
| 315 |
+
if df.empty:
|
| 316 |
+
continue
|
| 317 |
+
any_data = True
|
| 318 |
+
by = {r["source"]: r for _, r in df.iterrows()}
|
| 319 |
+
toto = by.get("toto")
|
| 320 |
+
nws_row = by.get("nws")
|
| 321 |
+
t_cell = f"**{toto['mae']:.2f} {unit}** _(n={int(toto['n'])})_" if toto is not None else "—"
|
| 322 |
+
n_cell = f"**{nws_row['mae']:.2f} {unit}** _(n={int(nws_row['n'])})_" if nws_row is not None else "—"
|
| 323 |
+
if toto is not None and nws_row is not None:
|
| 324 |
+
diff = toto["mae"] - nws_row["mae"]
|
| 325 |
+
winner = "🤖 Toto" if diff < 0 else "🌎 NWS"
|
| 326 |
+
d_cell = f"**{winner}** by {abs(diff):.2f} {unit}"
|
| 327 |
+
elif toto is not None:
|
| 328 |
+
d_cell = "—"
|
| 329 |
+
else:
|
| 330 |
+
d_cell = "—"
|
| 331 |
+
rows.append(f"| **{lag_h} h-ahead** | {t_cell} | {n_cell} | {d_cell} |")
|
| 332 |
+
if rows:
|
| 333 |
+
lines.append(f"**{label}**")
|
| 334 |
+
lines.append("| Lookahead | 🤖 Toto MAE | 🌎 NWS MAE | Δ |\n|---|---|---|---|")
|
| 335 |
+
lines.extend(rows)
|
| 336 |
if not any_data:
|
| 337 |
lines.append(
|
| 338 |
"_No scored forecasts yet. The scoreboard fills in once forecasts have target hours that have already passed and matching Ecowitt actuals — typically within an hour or two of running._"
|
|
@@ -281,32 +281,30 @@ def residuals(
|
|
| 281 |
conn: sqlite3.Connection,
|
| 282 |
metric: str,
|
| 283 |
window_hours: int = 48,
|
|
|
|
| 284 |
) -> pd.DataFrame:
|
| 285 |
"""For each hourly target_ts in the last `window_hours`, return each
|
| 286 |
-
model's prediction
|
| 287 |
-
residuals (prediction − actual).
|
| 288 |
-
|
| 289 |
-
Uses the SAME 'latest forecast issued before the target hour' rule the
|
| 290 |
-
scoreboard MAE uses, so the time-series residuals add up to the
|
| 291 |
-
aggregate number on the scoreboard.
|
| 292 |
"""
|
| 293 |
import time as _time # noqa: PLC0415
|
| 294 |
now = int(_time.time())
|
| 295 |
cutoff = now - window_hours * 3600
|
|
|
|
| 296 |
sql = """
|
| 297 |
-
WITH
|
| 298 |
-
SELECT source, target_ts,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
FROM forecast_snapshots
|
| 300 |
WHERE metric = ?
|
| 301 |
AND forecast_made_at <= target_ts
|
| 302 |
AND target_ts BETWEEN ? AND ?
|
| 303 |
-
GROUP BY source, target_ts
|
| 304 |
),
|
| 305 |
picked AS (
|
| 306 |
-
SELECT
|
| 307 |
-
FROM forecast_snapshots f
|
| 308 |
-
JOIN latest l USING (source, target_ts, forecast_made_at)
|
| 309 |
-
WHERE f.metric = ?
|
| 310 |
)
|
| 311 |
SELECT a.target_ts,
|
| 312 |
MAX(CASE WHEN p.source='toto' THEN p.p50 END) AS toto_p50,
|
|
@@ -319,7 +317,7 @@ def residuals(
|
|
| 319 |
GROUP BY a.target_ts
|
| 320 |
ORDER BY a.target_ts
|
| 321 |
"""
|
| 322 |
-
params = [metric, cutoff, now, metric,
|
| 323 |
df = pd.read_sql_query(sql, conn, params=params)
|
| 324 |
if df.empty:
|
| 325 |
return df
|
|
@@ -343,3 +341,45 @@ def scoreboard_summary(
|
|
| 343 |
.agg(n="count", mae="mean")
|
| 344 |
.reset_index()
|
| 345 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
conn: sqlite3.Connection,
|
| 282 |
metric: str,
|
| 283 |
window_hours: int = 48,
|
| 284 |
+
lag_hours: float = 3.0,
|
| 285 |
) -> pd.DataFrame:
|
| 286 |
"""For each hourly target_ts in the last `window_hours`, return each
|
| 287 |
+
model's prediction (picked at a fixed lookahead) and the Ecowitt
|
| 288 |
+
actual side-by-side, plus signed residuals (prediction − actual).
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
"""
|
| 290 |
import time as _time # noqa: PLC0415
|
| 291 |
now = int(_time.time())
|
| 292 |
cutoff = now - window_hours * 3600
|
| 293 |
+
lag_seconds = int(lag_hours * 3600)
|
| 294 |
sql = """
|
| 295 |
+
WITH ranked AS (
|
| 296 |
+
SELECT source, target_ts, p50,
|
| 297 |
+
ROW_NUMBER() OVER (
|
| 298 |
+
PARTITION BY source, target_ts
|
| 299 |
+
ORDER BY ABS(forecast_made_at - (target_ts - ?))
|
| 300 |
+
) AS rk
|
| 301 |
FROM forecast_snapshots
|
| 302 |
WHERE metric = ?
|
| 303 |
AND forecast_made_at <= target_ts
|
| 304 |
AND target_ts BETWEEN ? AND ?
|
|
|
|
| 305 |
),
|
| 306 |
picked AS (
|
| 307 |
+
SELECT source, target_ts, p50 FROM ranked WHERE rk = 1
|
|
|
|
|
|
|
|
|
|
| 308 |
)
|
| 309 |
SELECT a.target_ts,
|
| 310 |
MAX(CASE WHEN p.source='toto' THEN p.p50 END) AS toto_p50,
|
|
|
|
| 317 |
GROUP BY a.target_ts
|
| 318 |
ORDER BY a.target_ts
|
| 319 |
"""
|
| 320 |
+
params = [lag_seconds, metric, cutoff, now, metric, cutoff, now]
|
| 321 |
df = pd.read_sql_query(sql, conn, params=params)
|
| 322 |
if df.empty:
|
| 323 |
return df
|
|
|
|
| 341 |
.agg(n="count", mae="mean")
|
| 342 |
.reset_index()
|
| 343 |
)
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
def scoreboard_at_lag(
    conn: sqlite3.Connection,
    metric: str,
    lag_hours: float,
    window_hours: int = 48,
    max_offset_hours: "float | None" = None,
) -> pd.DataFrame:
    """Return per-source MAE at a specific forecast lookahead.

    For each target hour in the last ``window_hours``, pick, per source, the
    row of ``forecast_snapshots`` whose ``forecast_made_at`` is closest to
    ``target_ts - lag_hours`` (never later than ``target_ts``), join it with
    the matching row of ``actuals`` and average the absolute error
    ``|p50 - value|``.

    Args:
        conn: Open SQLite connection holding the ``forecast_snapshots``
            (source, metric, target_ts, forecast_made_at, p50) and
            ``actuals`` (target_ts, metric, value) tables. Timestamps are
            compared against ``int(time.time())``, i.e. epoch seconds.
        metric: Metric name used to filter both tables.
        lag_hours: Requested lookahead in hours.
        window_hours: How far back from "now" target hours are scored.
        max_offset_hours: Optional cap on how far the picked forecast may be
            from the requested lag. ``None`` (the default) keeps the
            unbounded "closest available" behaviour. NOTE: without a cap, a
            log that is younger than ``lag_hours`` still yields a row, scored
            from forecasts issued much closer to the target than the label
            suggests.

    Returns:
        DataFrame with columns ``source``, ``n`` (number of scored target
        hours) and ``mae``; one row per source that has at least one scored
        pair, empty if there are none.
    """
    import time as _time  # noqa: PLC0415

    lag_seconds = int(lag_hours * 3600)
    max_offset_seconds = (
        None if max_offset_hours is None else int(max_offset_hours * 3600)
    )
    now = int(_time.time())
    cutoff = now - window_hours * 3600
    sql = """
        WITH ranked AS (
            SELECT source, target_ts, forecast_made_at, p50,
                   ABS(forecast_made_at - (target_ts - ?)) AS offset_s,
                   ROW_NUMBER() OVER (
                       PARTITION BY source, target_ts
                       -- Tie-break on the earlier snapshot so the choice is
                       -- deterministic and never flatters the lookahead.
                       ORDER BY ABS(forecast_made_at - (target_ts - ?)),
                                forecast_made_at
                   ) AS rk
            FROM forecast_snapshots
            WHERE metric = ?
              -- Rank only usable predictions: a NULL p50 would be counted
              -- by COUNT(*) but silently skipped by AVG.
              AND p50 IS NOT NULL
              AND forecast_made_at <= target_ts
              AND target_ts BETWEEN ? AND ?
        )
        SELECT r.source,
               COUNT(*) AS n,
               AVG(ABS(r.p50 - a.value)) AS mae
        FROM ranked r
        JOIN actuals a USING (target_ts)
        WHERE r.rk = 1
          AND a.metric = ?
          AND a.value IS NOT NULL
          AND (? IS NULL OR r.offset_s <= ?)
        GROUP BY r.source
        ORDER BY r.source
    """
    params = [
        lag_seconds,
        lag_seconds,
        metric,
        cutoff,
        now,
        metric,
        max_offset_seconds,
        max_offset_seconds,
    ]
    return pd.read_sql_query(sql, conn, params=params)
|
|
@@ -183,7 +183,7 @@ def emoji_strip_markdown(nws_df: pd.DataFrame, tz: str, n: int = 12) -> str:
|
|
| 183 |
|
| 184 |
def residual_figure(
|
| 185 |
df: pd.DataFrame,
|
| 186 |
-
title: str = "Forecast residual — prediction minus Ecowitt actual, last
|
| 187 |
) -> go.Figure:
|
| 188 |
"""Plot signed residuals over time for Toto and NWS. Zero is perfect."""
|
| 189 |
fig = go.Figure()
|
|
|
|
| 183 |
|
| 184 |
def residual_figure(
|
| 185 |
df: pd.DataFrame,
|
| 186 |
+
title: str = "Forecast residual — 3 h-ahead prediction minus Ecowitt actual, last 48 h (°F)",
|
| 187 |
) -> go.Figure:
|
| 188 |
"""Plot signed residuals over time for Toto and NWS. Zero is perfect."""
|
| 189 |
fig = go.Figure()
|