# Scraped web-viewer header removed; original commit: 7e7a097
# "Switch to 15-min autorefresh, drop manual button, sync all cycle archives"
"""Persist the forecast SQLite DB across HF Space rebuilds.
HF Spaces' free tier has ephemeral storage — every `git push` rebuilds the
container and wipes any local files. We back the forecast log with a
private HF Dataset:
- On startup: pull the latest forecasts.db from the dataset (if any).
- After every refresh: push the current forecasts.db back.
Environment:
HF_TOKEN must have write access to the dataset
LOG_DATASET_REPO override the default dataset repo id
"""
from __future__ import annotations
import os
import shutil
import threading
import time
import traceback
# Private HF Dataset used as the backing store for the SQLite files.
DEFAULT_REPO = "bitsofchris/toto-weather-forecast-log"
# File name inside the dataset repo for the forecast log.
PATH_IN_REPO = "forecasts.db"
# Local path of the SQLite forecast log.
DEFAULT_LOCAL = "data/forecasts.db"
# Serializes uploads so overlapping refreshes don't issue redundant pushes.
_push_lock = threading.Lock()
# Wall-clock time (time.time()) of the last successful push; 0.0 = never pushed.
_last_push_at = 0.0
PUSH_MIN_INTERVAL = 60.0 # seconds — coalesce rapid pushes
def _repo_id() -> str:
    """Return the dataset repo id, honoring the LOG_DATASET_REPO override."""
    override = os.environ.get("LOG_DATASET_REPO")
    if override is None:
        return DEFAULT_REPO
    return override
def _token() -> str | None:
return os.environ.get("HF_TOKEN")
def pull_db(local_path: str = DEFAULT_LOCAL) -> bool:
    """Download the latest DB from the dataset, overwriting any local copy.

    Args:
        local_path: Destination path for the downloaded SQLite file.

    Returns:
        True on success; False when HF_TOKEN is unset, the remote file is
        missing, or a network error occurs (all non-fatal — the app starts
        with an empty DB in that case).
    """
    tok = _token()
    if not tok:
        print("[persist] HF_TOKEN not set — skipping pull")
        return False
    try:
        # Imported lazily so the module loads even without huggingface_hub.
        from huggingface_hub import hf_hub_download  # noqa: PLC0415

        downloaded = hf_hub_download(
            repo_id=_repo_id(),
            repo_type="dataset",
            filename=PATH_IN_REPO,
            token=tok,
        )
        os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
        shutil.copyfile(downloaded, local_path)
        print(f"[persist] pulled DB from {_repo_id()} ({os.path.getsize(local_path)} bytes)")
        return True
    except Exception:  # noqa: BLE001 — best-effort: a missing remote DB is expected on first run
        # Fix: was an f-string with no placeholders (F541).
        print("[persist] pull skipped (no remote DB or network error):")
        traceback.print_exc()
        return False
def push_db(local_path: str = DEFAULT_LOCAL) -> bool:
    """Upload the local DB to the dataset.

    Coalesced (at most one push per PUSH_MIN_INTERVAL seconds) and
    lock-protected so overlapping refreshes don't issue redundant uploads.

    Returns:
        True when the upload happened; False when skipped or failed.
    """
    global _last_push_at
    tok = _token()
    if not tok or not os.path.exists(local_path):
        return False
    # Coalesce: skip entirely if a push just happened.
    elapsed = time.time() - _last_push_at
    if elapsed < PUSH_MIN_INTERVAL:
        return False
    # Non-blocking acquire — a concurrent push makes ours redundant.
    acquired = _push_lock.acquire(blocking=False)
    if not acquired:
        return False
    try:
        from huggingface_hub import HfApi  # noqa: PLC0415

        HfApi(token=tok).upload_file(
            path_or_fileobj=local_path,
            path_in_repo=PATH_IN_REPO,
            repo_id=_repo_id(),
            repo_type="dataset",
            commit_message="forecast log update",
        )
        _last_push_at = time.time()
        size = os.path.getsize(local_path)
        print(f"[persist] pushed DB to {_repo_id()} ({size} bytes)")
        return True
    except Exception:  # noqa: BLE001
        print("[persist] push failed:")
        traceback.print_exc()
        return False
    finally:
        _push_lock.release()
def push_db_async(local_path: str = DEFAULT_LOCAL) -> None:
    """Fire-and-forget push so refresh() returns to the user immediately."""
    worker = threading.Thread(
        target=push_db,
        args=(local_path,),
        daemon=True,
        name="persist-push",
    )
    worker.start()
# --- multi-file push (forecast log + Ecowitt archive in one commit) ------
# Local path of the Ecowitt sensor archive DB.
ARCHIVE_LOCAL = "data/ecowitt.db"
# Its file name inside the dataset repo.
ARCHIVE_PATH_IN_REPO = "ecowitt.db"
# Separate lock/timestamp so multi-file pushes coalesce independently of push_db.
_multi_lock = threading.Lock()
_multi_last = 0.0
def push_all(
    forecast_local: str = DEFAULT_LOCAL,
    archive_local: str = ARCHIVE_LOCAL,
) -> bool:
    """Upload both DBs in a single dataset commit.

    Coalesced (at most once per PUSH_MIN_INTERVAL) and lock-protected like
    push_db, but with its own lock/timestamp. Missing local files are
    skipped from the commit.

    Args:
        forecast_local: local path of the forecast log DB.
        archive_local: local path of the Ecowitt archive DB.

    Returns:
        True when the commit was created; False when skipped (no token,
        coalesced, lock busy, nothing to push) or on error.
    """
    global _multi_last
    tok = _token()
    if not tok:
        return False
    if time.time() - _multi_last < PUSH_MIN_INTERVAL:
        return False
    if not _multi_lock.acquire(blocking=False):
        return False
    try:
        from huggingface_hub import CommitOperationAdd, HfApi  # noqa: PLC0415

        ops = [
            CommitOperationAdd(path_in_repo=in_repo, path_or_fileobj=local)
            for local, in_repo in (
                (forecast_local, PATH_IN_REPO),
                (archive_local, ARCHIVE_PATH_IN_REPO),
            )
            if os.path.exists(local)
        ]
        if not ops:
            return False
        api = HfApi(token=tok)
        api.create_commit(
            repo_id=_repo_id(),
            repo_type="dataset",
            operations=ops,
            commit_message="forecast log + archive update",
        )
        _multi_last = time.time()
        # Report each size from the op's own local path rather than re-deriving
        # it from path_in_repo (the old inline conditional was fragile and could
        # stat a file that wasn't part of the commit).
        sizes = ", ".join(
            f"{op.path_in_repo}={os.path.getsize(op.path_or_fileobj)}B" for op in ops
        )
        print(f"[persist] pushed multi to {_repo_id()} ({sizes})")
        return True
    except Exception:  # noqa: BLE001
        print("[persist] push_all failed:")
        traceback.print_exc()
        return False
    finally:
        _multi_lock.release()
def push_all_async(
    forecast_local: str = DEFAULT_LOCAL,
    archive_local: str = ARCHIVE_LOCAL,
) -> None:
    """Fire-and-forget multi-file push; the caller is never blocked."""
    worker = threading.Thread(
        target=push_all,
        args=(forecast_local, archive_local),
        daemon=True,
        name="persist-push-all",
    )
    worker.start()
def pull_all(
    forecast_local: str = DEFAULT_LOCAL,
    archive_local: str = ARCHIVE_LOCAL,
) -> None:
    """Pull both DBs from the dataset on startup.

    Each missing file is silently skipped (a 404 on first run is expected).
    """
    pull_db(forecast_local)
    # Also fetch the Ecowitt archive, best-effort.
    tok = _token()
    if not tok:
        return
    try:
        from huggingface_hub import hf_hub_download  # noqa: PLC0415

        fetched = hf_hub_download(
            repo_id=_repo_id(),
            repo_type="dataset",
            filename=ARCHIVE_PATH_IN_REPO,
            token=tok,
        )
        target_dir = os.path.dirname(archive_local) or "."
        os.makedirs(target_dir, exist_ok=True)
        shutil.copyfile(fetched, archive_local)
        size = os.path.getsize(archive_local)
        print(f"[persist] pulled archive ({size} bytes)")
    except Exception:  # noqa: BLE001
        # Expected on first run: the archive doesn't exist remotely yet.
        pass