Spaces:

bitsofchris
/

time-series-ai-weather-forecast

Running

bitsofchris Claude Opus 4.7 (1M context) committed 9 days ago

Commit

f132d17

1 Parent(s): 7cc37a1

Read history from the SQLite archive, weekly view at 5-min cadence

The autorefresh thread was already syncing 5-min Ecowitt history into
data/ecowitt.db every tick. Now the live display + Toto inference also
read from that archive instead of hitting the Ecowitt API on every page
load, so we (a) accumulate true 5-min granularity over time, beyond
Ecowitt's own 24-30h 5-min retention window, and (b) stop wasting API
quota on data we already have.

- src/storage.py: new read_history_dataframe(conn, since, until,
cycle_type, resample) — pivots readings into a UTC-indexed DataFrame
with one column per metric in HISTORY_FIELDS.
- app.py: fetch_history now queries the archive first (5-min cadence)
and falls back to a one-shot API pull only if the archive is empty.
- app.py: VIEW_WEEK switches to cycle_type=5min / resample=5min /
history=7d / horizon=72h. 7d × 288 = 2016 context points (63 patches),
72h × 12 = 864 horizon steps per metric.
- app.py: autorefresh reorders to sync-then-refresh so the live page
always sees the latest data the API has produced. Initial 5-min sync
on startup before demo.launch() so the first visitor isn't stuck on
a stale dataset snapshot.
- src/forecast_log.py: record_toto + record_actuals filter to hourly-
aligned target_ts by default (only_hourly=True), so forecasts.db
stays small even though inference happens at 5-min cadence — the
scoreboard scores at hourly granularity anyway.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show

app.py +55 -10
src/forecast_log.py +32 -7
src/storage.py +49 -0

app.py CHANGED Viewed

@@ -30,11 +30,15 @@ CACHE_TTL_SECONDS = AUTO_REFRESH_SECONDS - 60  # so autorefresh always refetches
 DISPLAY_TZ = os.environ.get("DISPLAY_TZ", "America/New_York")
 PLACE_NAME = os.environ.get("PLACE_NAME", "Yaphank, NY")
-# Single canonical view — weekly, hourly cadence.
 VIEW_WEEK = {
-    "label": "Past 7 days · 72 h forecast (hourly cadence)",
-    "cycle_type": "30min",
-    "resample": "1h",
     "history_days": 7,
     "horizon_hours": 72,
 }
@@ -67,8 +71,30 @@ def cached(ttl: int):
 # --- data fetchers --------------------------------------------------------
-@cached(CACHE_TTL_SECONDS)
 def fetch_history(cycle_type: str, resample: str, hours: float) -> pd.DataFrame:
     cfg = ecowitt.EcowittConfig.from_env()
     end = datetime.now(timezone.utc).replace(tzinfo=None)
     start = end - timedelta(hours=hours)
@@ -282,7 +308,7 @@ def render_scoreboard(conn) -> str:
 # --- auto-refresh background thread --------------------------------------
-ECOWITT_ARCHIVE_DB = "data/ecowitt.db"
 def _sync_archive_all_cycles() -> None:
@@ -307,12 +333,30 @@ def _sync_archive_all_cycles() -> None:
         conn.close()
 def _autorefresh_loop():
     while True:
         try:
-            refresh()                  # live forecast + forecasts.db log
-            _sync_archive_all_cycles() # 5min/30min/4hour raw archive
-            persist.push_all_async()   # back up both DBs to HF Dataset
         except Exception:  # noqa: BLE001
             print("[autorefresh] error during refresh:")
             traceback.print_exc()
@@ -415,6 +459,7 @@ with gr.Blocks(title="Toto Weather Forecast", theme=gr.themes.Soft()) as demo:
 if __name__ == "__main__":
-    persist.pull_all()  # bootstrap forecast log + archive from the HF Dataset
     _start_autorefresh()
     demo.launch()

 DISPLAY_TZ = os.environ.get("DISPLAY_TZ", "America/New_York")
 PLACE_NAME = os.environ.get("PLACE_NAME", "Yaphank, NY")
+# Single canonical view. History is read from the local SQLite archive
+# (data/ecowitt.db) instead of hitting the Ecowitt API on every page load.
+# The archive is kept current by the autorefresh thread + the per-Space-tick
+# sync, so over time we accumulate true 5-min granularity beyond Ecowitt's
+# own 24-30h 5-min retention window.
 VIEW_WEEK = {
+    "label": "Past 7 days · 72 h forecast (5-min cadence)",
+    "cycle_type": "5min",
+    "resample": "5min",
     "history_days": 7,
     "horizon_hours": 72,
 }
 # --- data fetchers --------------------------------------------------------
+ECOWITT_ARCHIVE_DB_PATH = "data/ecowitt.db"
 def fetch_history(cycle_type: str, resample: str, hours: float) -> pd.DataFrame:
+    """Read history from the local SQLite archive. If the archive is empty
+    (cold start before the first sync), fall back to a one-shot API pull so
+    the page still renders something."""
+    now_unix = int(time.time())
+    since_unix = now_unix - int(hours * 3600)
+    conn = storage.connect(ECOWITT_ARCHIVE_DB_PATH)
+    try:
+        df = storage.read_history_dataframe(
+            conn, since_unix=since_unix, until_unix=now_unix,
+            cycle_type=cycle_type, resample=resample,
+        )
+    finally:
+        conn.close()
+    if not df.empty:
+        return df
+    # Cold-start fallback: pull a small slice directly from the API so the
+    # page isn't blank on the very first visit before sync has run.
     cfg = ecowitt.EcowittConfig.from_env()
     end = datetime.now(timezone.utc).replace(tzinfo=None)
     start = end - timedelta(hours=hours)
 # --- auto-refresh background thread --------------------------------------
+ECOWITT_ARCHIVE_DB = ECOWITT_ARCHIVE_DB_PATH  # alias
 def _sync_archive_all_cycles() -> None:
         conn.close()
+def _sync_5min_only() -> None:
+    """Quick sync of just the 5-min cycle (the one the display reads from).
+    Called on startup so the first visitor sees fresh data.
+
+    Best-effort: returns without doing anything if
+    ``ecowitt.EcowittConfig.from_env()`` raises ``RuntimeError``, and any
+    exception raised while looking up or syncing the cycle is printed with
+    its traceback instead of being propagated. The archive connection is
+    always closed before returning."""
+    try:
+        cfg = ecowitt.EcowittConfig.from_env()
+    # NOTE(review): presumably raised when the Ecowitt credentials are not
+    # set in the environment — confirm against EcowittConfig.from_env.
+    except RuntimeError:
+        return
+    conn = storage.connect(ECOWITT_ARCHIVE_DB)
+    try:
+        # next() without a default raises StopIteration when no cycle is
+        # named "5min"; StopIteration is an Exception subclass, so that case
+        # is reported by the handler below rather than escaping.
+        cycle = next(c for c in sync.CYCLES if c.name == "5min")
+        sync.sync_cycle(cfg, conn, cycle, verbose=False)
+    except Exception:  # noqa: BLE001
+        print("[startup] 5-min sync error:")
+        traceback.print_exc()
+    finally:
+        conn.close()
 def _autorefresh_loop():
     while True:
         try:
+            _sync_archive_all_cycles()  # bring the local archive up to date first
+            refresh()                    # read fresh data from DB, write forecast log
+            persist.push_all_async()     # ship both DBs to the HF Dataset
         except Exception:  # noqa: BLE001
             print("[autorefresh] error during refresh:")
             traceback.print_exc()
 if __name__ == "__main__":
+    persist.pull_all()       # bootstrap forecast log + archive from the HF Dataset
+    _sync_5min_only()        # ensure the archive has fresh 5-min data before first paint
     _start_autorefresh()
     demo.launch()

src/forecast_log.py CHANGED Viewed

@@ -61,13 +61,24 @@ def _ts(t) -> int:
     return int(pd.Timestamp(t).tz_convert("UTC").timestamp())
-def record_actuals(conn: sqlite3.Connection, history: pd.DataFrame) -> int:
-    """Upsert actuals from a history DataFrame (UTC-indexed; column = metric)."""
     rows = []
     for metric in history.columns:
         s = history[metric].dropna()
         for ts, val in s.items():
-            rows.append((_ts(ts), metric, float(val)))
     if not rows:
         return 0
     conn.executemany(
@@ -83,12 +94,26 @@ def record_toto(
     metric: str,
     fcst: TotoForecast,
     forecast_made_at: int | None = None,
 ) -> int:
     made = forecast_made_at if forecast_made_at is not None else int(time.time())
-    rows = [
-        (made, _ts(t), "toto", metric, float(p10), float(p50), float(p90))
-        for t, p10, p50, p90 in zip(fcst.median.index, fcst.p10.values, fcst.median.values, fcst.p90.values)
-    ]
     conn.executemany(
         "INSERT OR REPLACE INTO forecast_snapshots "
         "(forecast_made_at, target_ts, source, metric, p10, p50, p90) "

     return int(pd.Timestamp(t).tz_convert("UTC").timestamp())
+def record_actuals(
+    conn: sqlite3.Connection,
+    history: pd.DataFrame,
+    only_hourly: bool = True,
+) -> int:
+    """Upsert actuals from a history DataFrame (UTC-indexed; column = metric).
+    By default only hourly-aligned target_ts are stored so the scoreboard
+    table stays small even when the source history is at 5-min cadence.
+    """
     rows = []
     for metric in history.columns:
         s = history[metric].dropna()
         for ts, val in s.items():
+            tsu = _ts(ts)
+            if only_hourly and tsu % 3600 != 0:
+                continue
+            rows.append((tsu, metric, float(val)))
     if not rows:
         return 0
     conn.executemany(
     metric: str,
     fcst: TotoForecast,
     forecast_made_at: int | None = None,
+    only_hourly: bool = True,
 ) -> int:
+    """Persist a Toto forecast.
+    `only_hourly`: when True (default), only the hourly-aligned target_ts
+    rows are written. Forecast inference may run at 5-min cadence, but the
+    scoreboard score is the same regardless of cadence and the log grows
+    linearly per refresh — hourly keeps it manageable.
+    """
     made = forecast_made_at if forecast_made_at is not None else int(time.time())
+    rows = []
+    for t, p10, p50, p90 in zip(
+        fcst.median.index, fcst.p10.values, fcst.median.values, fcst.p90.values
+    ):
+        tsu = _ts(t)
+        if only_hourly and tsu % 3600 != 0:
+            continue
+        rows.append((made, tsu, "toto", metric, float(p10), float(p50), float(p90)))
+    if not rows:
+        return 0
     conn.executemany(
         "INSERT OR REPLACE INTO forecast_snapshots "
         "(forecast_made_at, target_ts, source, metric, p10, p50, p90) "

src/storage.py CHANGED Viewed

@@ -107,6 +107,55 @@ def max_ts(conn: sqlite3.Connection, cycle_type: str) -> int | None:
     return row[0] if row and row[0] is not None else None
 def stats(conn: sqlite3.Connection) -> list[tuple]:
     return conn.execute(
         "SELECT cycle_type, COUNT(*), MIN(ts_unix), MAX(ts_unix),"

     return row[0] if row and row[0] is not None else None
+def read_history_dataframe(
+    conn: sqlite3.Connection,
+    since_unix: int,
+    until_unix: int | None = None,
+    cycle_type: str = "5min",
+    fields: dict[str, tuple[str, str]] | None = None,
+    resample: str | None = None,
+):
+    """Read a multi-metric history slice from the local archive.
+    Returns a UTC-indexed pandas DataFrame whose columns are the keys of
+    `fields` (default: ecowitt.HISTORY_FIELDS) — temp_f, humidity,
+    pressure_inhg, rain_in_hr. Each column is pulled from the readings
+    table at the requested `cycle_type`; optionally resampled to a uniform
+    cadence with `.mean()`.
+
+    Args:
+        conn: open connection to the archive database.
+        since_unix: lower bound of the window, in Unix seconds (inclusive).
+        until_unix: upper bound of the window, in Unix seconds (inclusive);
+            defaults to the current time when None.
+        cycle_type: value matched against the `cycle_type` column.
+        fields: mapping of output column name -> (channel, metric) used to
+            select rows from `readings`.
+        resample: rule passed to `DataFrame.resample`; falsy skips resampling.
+
+    Returns:
+        A DataFrame with one column per entry of `fields` that had at least
+        one row in the window (entries with no rows are left out). If no
+        entry had any rows, a bare empty `pd.DataFrame()` is returned.
+    """
+    import time as _time
+    import pandas as pd  # local import keeps storage importable without pandas
+    if fields is None:
+        from . import ecowitt
+        fields = ecowitt.HISTORY_FIELDS
+    if until_unix is None:
+        until_unix = int(_time.time())
+    series_dict: dict[str, pd.Series] = {}
+    for col, (channel, metric) in fields.items():
+        # BETWEEN is inclusive on both ends, so rows stamped exactly at
+        # since_unix or until_unix are included.
+        rows = conn.execute(
+            "SELECT ts_unix, value FROM readings"
+            " WHERE cycle_type=? AND channel=? AND metric=?"
+            "   AND ts_unix BETWEEN ? AND ?"
+            " ORDER BY ts_unix",
+            (cycle_type, channel, metric, since_unix, until_unix),
+        ).fetchall()
+        if not rows:
+            # No data for this metric in the window: omit the column entirely.
+            continue
+        idx = pd.to_datetime([r[0] for r in rows], unit="s", utc=True)
+        # errors="coerce" turns values that cannot be parsed as numbers into NaN.
+        vals = pd.to_numeric([r[1] for r in rows], errors="coerce")
+        series_dict[col] = pd.Series(vals, index=idx, name=col).sort_index()
+    if not series_dict:
+        return pd.DataFrame()
+    # Column-wise concat aligns the series on the union of their timestamps;
+    # a metric missing at a given timestamp shows up as NaN in that row.
+    df = pd.concat(series_dict.values(), axis=1)
+    df.columns = list(series_dict.keys())
+    if resample:
+        df = df.resample(resample).mean()
+    return df
 def stats(conn: sqlite3.Connection) -> list[tuple]:
     return conn.execute(
         "SELECT cycle_type, COUNT(*), MIN(ts_unix), MAX(ts_unix),"