bitsofchris Claude Opus 4.7 (1M context) committed on
Commit
d6d17d4
·
1 Parent(s): 2a2a12d

Scoreboard: per-metric MAE table across 1 h / 3 h / 12 h lookaheads

Browse files

Latest-pre-target scoring was just measuring 5-15-min-ahead predictions
(since autorefresh runs every 15 min, the 'most recent before target'
forecast is always near-term). That's a trivial test of forecast skill.

Replace the single MAE line per metric with a table of MAE at three
fixed lookaheads — 1 h, 3 h, 12 h — where each lookahead picks the
forecast whose forecast_made_at is closest to (target − N h). Now the
scoreboard tells the actual story of model degradation with horizon.

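- forecast_log.scoreboard_at_lag(metric, lag_hours, window_hours) —
for each source and target hour, ROW_NUMBER ranks forecasts by the
absolute distance of forecast_made_at to (target − lag); the top-ranked
forecast is JOINed with actuals to compute the per-source MAE.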
- forecast_log.residuals(..., lag_hours=3.0) — residual chart is now
pinned to the middle row of the table so the picture matches a
headline number.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show
  1. app.py +43 -25
  2. src/forecast_log.py +54 -14
  3. src/weather_ui.py +1 -1
app.py CHANGED
@@ -281,8 +281,9 @@ def refresh():
281
  comparison_md = ""
282
  scoreboard = render_scoreboard(log_conn)
283
 
284
- # Residual chart — same picks the scoreboard MAE uses, over the last 48h.
285
- resid_df = forecast_log.residuals(log_conn, metric="temp_f", window_hours=48)
 
286
  resid_fig = residual_figure(resid_df) if not resid_df.empty else None
287
 
288
  persist.push_db_async()
@@ -290,31 +291,48 @@ def refresh():
290
 
291
 
292
  # --- scoreboard ----------------------------------------------------------
 
 
 
 
 
 
 
 
293
  def render_scoreboard(conn) -> str:
294
- lines = ["### 📊 Forecast scoreboard (rolling 48h MAE — lower is better)"]
 
 
 
 
295
  any_data = False
296
- for metric, label, unit in [
297
- ("temp_f", "Temperature", "°F"),
298
- ("humidity", "Humidity", "%"),
299
- ("pressure_inhg", "Pressure", "inHg"),
300
- ]:
301
- summ = forecast_log.scoreboard_summary(conn, metric=metric, window_hours=48)
302
- if summ.empty:
303
- continue
304
- any_data = True
305
- by = {row["source"]: row for _, row in summ.iterrows()}
306
- toto = by.get("toto")
307
- nws_row = by.get("nws")
308
- parts = [f"**{label}**"]
309
- if toto is not None:
310
- parts.append(f"Toto **{toto['mae']:.2f} {unit}** _(n={int(toto['n'])})_")
311
- if nws_row is not None:
312
- parts.append(f"NWS **{nws_row['mae']:.2f} {unit}** _(n={int(nws_row['n'])})_")
313
- if toto is not None and nws_row is not None:
314
- diff = toto["mae"] - nws_row["mae"]
315
- winner = "🤖 Toto" if diff < 0 else "🌎 NWS"
316
- parts.append(f"→ **{winner}** wins by {abs(diff):.2f} {unit}")
317
- lines.append(" · ".join(parts))
 
 
 
 
 
318
  if not any_data:
319
  lines.append(
320
  "_No scored forecasts yet. The scoreboard fills in once forecasts have target hours that have already passed and matching Ecowitt actuals — typically within an hour or two of running._"
 
281
  comparison_md = ""
282
  scoreboard = render_scoreboard(log_conn)
283
 
284
+ # Residual chart — pinned to 3 h-ahead (the middle row of the scoreboard
285
+ # table) so the picture matches one of the headline numbers.
286
+ resid_df = forecast_log.residuals(log_conn, metric="temp_f", window_hours=48, lag_hours=3.0)
287
  resid_fig = residual_figure(resid_df) if not resid_df.empty else None
288
 
289
  persist.push_db_async()
 
291
 
292
 
293
  # --- scoreboard ----------------------------------------------------------
294
+ SCOREBOARD_HORIZONS_H = [1, 3, 12]
295
+ SCOREBOARD_METRICS = [
296
+ ("temp_f", "Temperature", "°F"),
297
+ ("humidity", "Humidity", "%"),
298
+ ("pressure_inhg", "Pressure", "inHg"),
299
+ ]
300
+
301
+
302
  def render_scoreboard(conn) -> str:
303
+ """Per-metric, per-lookahead MAE table.
304
+
305
+ Rows = forecast lookahead (1h / 3h / 12h). Cols = Toto MAE, NWS MAE,
306
+ delta. Pressure has no NWS forecast so its column is dashed."""
307
+ lines = ["### 📊 Forecast scoreboard (rolling 48 h MAE — lower is better)"]
308
  any_data = False
309
+ for metric, label, unit in SCOREBOARD_METRICS:
310
+ rows: list[str] = []
311
+ for lag_h in SCOREBOARD_HORIZONS_H:
312
+ df = forecast_log.scoreboard_at_lag(
313
+ conn, metric=metric, lag_hours=lag_h, window_hours=48,
314
+ )
315
+ if df.empty:
316
+ continue
317
+ any_data = True
318
+ by = {r["source"]: r for _, r in df.iterrows()}
319
+ toto = by.get("toto")
320
+ nws_row = by.get("nws")
321
+ t_cell = f"**{toto['mae']:.2f} {unit}** _(n={int(toto['n'])})_" if toto is not None else "—"
322
+ n_cell = f"**{nws_row['mae']:.2f} {unit}** _(n={int(nws_row['n'])})_" if nws_row is not None else "—"
323
+ if toto is not None and nws_row is not None:
324
+ diff = toto["mae"] - nws_row["mae"]
325
+ winner = "🤖 Toto" if diff < 0 else "🌎 NWS"
326
+ d_cell = f"**{winner}** by {abs(diff):.2f} {unit}"
327
+ elif toto is not None:
328
+ d_cell = ""
329
+ else:
330
+ d_cell = "—"
331
+ rows.append(f"| **{lag_h} h-ahead** | {t_cell} | {n_cell} | {d_cell} |")
332
+ if rows:
333
+ lines.append(f"**{label}**")
334
+ lines.append("| Lookahead | 🤖 Toto MAE | 🌎 NWS MAE | Δ |\n|---|---|---|---|")
335
+ lines.extend(rows)
336
  if not any_data:
337
  lines.append(
338
  "_No scored forecasts yet. The scoreboard fills in once forecasts have target hours that have already passed and matching Ecowitt actuals — typically within an hour or two of running._"
src/forecast_log.py CHANGED
@@ -281,32 +281,30 @@ def residuals(
281
  conn: sqlite3.Connection,
282
  metric: str,
283
  window_hours: int = 48,
 
284
  ) -> pd.DataFrame:
285
  """For each hourly target_ts in the last `window_hours`, return each
286
- model's prediction and the Ecowitt actual side-by-side, plus signed
287
- residuals (prediction − actual).
288
-
289
- Uses the SAME 'latest forecast issued before the target hour' rule the
290
- scoreboard MAE uses, so the time-series residuals add up to the
291
- aggregate number on the scoreboard.
292
  """
293
  import time as _time # noqa: PLC0415
294
  now = int(_time.time())
295
  cutoff = now - window_hours * 3600
 
296
  sql = """
297
- WITH latest AS (
298
- SELECT source, target_ts, MAX(forecast_made_at) AS forecast_made_at
 
 
 
 
299
  FROM forecast_snapshots
300
  WHERE metric = ?
301
  AND forecast_made_at <= target_ts
302
  AND target_ts BETWEEN ? AND ?
303
- GROUP BY source, target_ts
304
  ),
305
  picked AS (
306
- SELECT f.source, f.target_ts, f.p50
307
- FROM forecast_snapshots f
308
- JOIN latest l USING (source, target_ts, forecast_made_at)
309
- WHERE f.metric = ?
310
  )
311
  SELECT a.target_ts,
312
  MAX(CASE WHEN p.source='toto' THEN p.p50 END) AS toto_p50,
@@ -319,7 +317,7 @@ def residuals(
319
  GROUP BY a.target_ts
320
  ORDER BY a.target_ts
321
  """
322
- params = [metric, cutoff, now, metric, metric, cutoff, now]
323
  df = pd.read_sql_query(sql, conn, params=params)
324
  if df.empty:
325
  return df
@@ -343,3 +341,45 @@ def scoreboard_summary(
343
  .agg(n="count", mae="mean")
344
  .reset_index()
345
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  conn: sqlite3.Connection,
282
  metric: str,
283
  window_hours: int = 48,
284
+ lag_hours: float = 3.0,
285
  ) -> pd.DataFrame:
286
  """For each hourly target_ts in the last `window_hours`, return each
287
+ model's prediction (picked at a fixed lookahead) and the Ecowitt
288
+ actual side-by-side, plus signed residuals (prediction − actual).
 
 
 
 
289
  """
290
  import time as _time # noqa: PLC0415
291
  now = int(_time.time())
292
  cutoff = now - window_hours * 3600
293
+ lag_seconds = int(lag_hours * 3600)
294
  sql = """
295
+ WITH ranked AS (
296
+ SELECT source, target_ts, p50,
297
+ ROW_NUMBER() OVER (
298
+ PARTITION BY source, target_ts
299
+ ORDER BY ABS(forecast_made_at - (target_ts - ?))
300
+ ) AS rk
301
  FROM forecast_snapshots
302
  WHERE metric = ?
303
  AND forecast_made_at <= target_ts
304
  AND target_ts BETWEEN ? AND ?
 
305
  ),
306
  picked AS (
307
+ SELECT source, target_ts, p50 FROM ranked WHERE rk = 1
 
 
 
308
  )
309
  SELECT a.target_ts,
310
  MAX(CASE WHEN p.source='toto' THEN p.p50 END) AS toto_p50,
 
317
  GROUP BY a.target_ts
318
  ORDER BY a.target_ts
319
  """
320
+ params = [lag_seconds, metric, cutoff, now, metric, cutoff, now]
321
  df = pd.read_sql_query(sql, conn, params=params)
322
  if df.empty:
323
  return df
 
341
  .agg(n="count", mae="mean")
342
  .reset_index()
343
  )
344
+
345
+
346
+ def scoreboard_at_lag(
347
+ conn: sqlite3.Connection,
348
+ metric: str,
349
+ lag_hours: float,
350
+ window_hours: int = 48,
351
+ ) -> pd.DataFrame:
352
+ """Per-source MAE at a specific forecast lookahead.
353
+
354
+ For each past target hour in the window, pick each source's forecast
355
+ whose `forecast_made_at` is closest to `target_ts - lag_hours`. With
356
+ autorefresh ticking every 15 min that picker selects a forecast within
357
+ ~7-8 min of the requested lag, so the MAE genuinely reflects the
358
+ 'how good was the N-hours-ahead prediction?' question.
359
+ """
360
+ import time as _time # noqa: PLC0415
361
+ lag_seconds = int(lag_hours * 3600)
362
+ now = int(_time.time())
363
+ cutoff = now - window_hours * 3600
364
+ sql = """
365
+ WITH ranked AS (
366
+ SELECT source, target_ts, forecast_made_at, p50,
367
+ ROW_NUMBER() OVER (
368
+ PARTITION BY source, target_ts
369
+ ORDER BY ABS(forecast_made_at - (target_ts - ?))
370
+ ) AS rk
371
+ FROM forecast_snapshots
372
+ WHERE metric = ?
373
+ AND forecast_made_at <= target_ts
374
+ AND target_ts BETWEEN ? AND ?
375
+ )
376
+ SELECT r.source,
377
+ COUNT(*) AS n,
378
+ AVG(ABS(r.p50 - a.value)) AS mae
379
+ FROM ranked r
380
+ JOIN actuals a USING (target_ts)
381
+ WHERE r.rk = 1 AND a.metric = ?
382
+ GROUP BY r.source
383
+ """
384
+ df = pd.read_sql_query(sql, conn, params=[lag_seconds, metric, cutoff, now, metric])
385
+ return df
src/weather_ui.py CHANGED
@@ -183,7 +183,7 @@ def emoji_strip_markdown(nws_df: pd.DataFrame, tz: str, n: int = 12) -> str:
183
 
184
  def residual_figure(
185
  df: pd.DataFrame,
186
- title: str = "Forecast residual — prediction minus Ecowitt actual, last 48h (°F)",
187
  ) -> go.Figure:
188
  """Plot signed residuals over time for Toto and NWS. Zero is perfect."""
189
  fig = go.Figure()
 
183
 
184
  def residual_figure(
185
  df: pd.DataFrame,
186
+ title: str = "Forecast residual — 3 h-ahead prediction minus Ecowitt actual, last 48 h (°F)",
187
  ) -> go.Figure:
188
  """Plot signed residuals over time for Toto and NWS. Zero is perfect."""
189
  fig = go.Figure()