riprap-nyc / app /flood_layers /prithvi_live.py
seriffic's picture
fix(prithvi): drop .view() on lists; enrich warmup traceback
bf7ea70
"""Prithvi-EO 2.0 (NYC Pluvial v2 fine-tune) live water segmentation.
A per-query specialist: pulls the most recent low-cloud Sentinel-2 L2A
scene over the address from Microsoft Planetary Computer, runs the
NYC-specialized fine-tune, and reports % water within 500 m.
Distinct from `app/flood_layers/prithvi_water.py`, which serves the
offline-precomputed 2021 Ida polygons. This one is *fresh observation*
each query — same doc_id (`prithvi_live`), but the underlying model
has been swapped from the Sen1Floods11 base to
`msradam/Prithvi-EO-2.0-NYC-Pluvial` (Apache-2.0, fine-tuned on AMD
Instinct MI300X via AMD Developer Cloud — test flood IoU 0.5979,
6× over the base). The base model is still loadable by setting
RIPRAP_PRITHVI_LIVE_REPO to the IBM repo as a fallback.
Network calls (STAC search + COG band reads) and a 300M-param model
forward pass make this the slowest specialist after the LLM. Gated by
RIPRAP_PRITHVI_LIVE_ENABLE so deployments without the deps installed
silently skip it. Scenes with cloud cover at or above 30% are refused
to honor the Sen1Floods11 training distribution.
License: Apache-2.0. See experiments/shared/licenses.md.
"""
from __future__ import annotations
import concurrent.futures
import logging
import os
import threading
import time
from typing import Any
log = logging.getLogger("riprap.prithvi_live")
# Master on/off switch; accepts "1" / "true" / "yes" (case-insensitive).
ENABLE = os.environ.get("RIPRAP_PRITHVI_LIVE_ENABLE", "1").lower() in ("1", "true", "yes")
# How far back (days) the STAC search looks for a usable S2 scene.
SEARCH_DAYS = int(os.environ.get("RIPRAP_PRITHVI_LIVE_SEARCH_DAYS", "120"))
# Scenes at or above this eo:cloud_cover percentage are rejected.
MAX_CLOUD_PCT = float(os.environ.get("RIPRAP_PRITHVI_LIVE_MAX_CLOUD", "30"))
# Device for the local terratorch inference path ("cpu" or "cuda").
DEVICE = os.environ.get("RIPRAP_PRITHVI_LIVE_DEVICE", "cpu")
# Default to the NYC Pluvial v2 fine-tune; override to the IBM-NASA base
# (`ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11`) when the v2
# artifact is unreachable or for A/B comparisons.
REPO = os.environ.get(
    "RIPRAP_PRITHVI_LIVE_REPO",
    "msradam/Prithvi-EO-2.0-NYC-Pluvial",
)
# IBM-NASA base repo: source of the shared `inference.py` helper and of
# the fallback config/checkpoint used by _ensure_model().
BASE_REPO = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
# Sen1Floods11 expects 6 bands in this exact order.
BANDS = ["B02", "B03", "B04", "B8A", "B11", "B12"]
IMG_SIZE = 512  # Sen1Floods11 training crop (pixels); model input size
CHIP_PX = 1024  # downloaded chip edge length in pixels
CHIP_M = CHIP_PX * 10  # chip edge in metres (bands read at 10 m/px)
HALF_M = CHIP_M / 2  # half-edge; used to build the clip bounding box
CENTER_RADIUS_M = 500  # radius of the "% water near address" circle
PIXEL_M = 10  # metres per pixel at the reference band resolution
# Lazily-populated singletons: the loaded model and the IBM run_model
# helper, both set by _ensure_model().
_MODEL = None
_RUN_MODEL = None
_INIT_LOCK = threading.Lock()  # serializes lazy load if multiple threads
# hit fetch() before _MODEL is populated
def _has_required_deps() -> tuple[bool, str | None]:
    """Probe the dependency stack in two tiers.

    Tier 1 — chip fetching (planetary_computer / pystac_client /
    rioxarray / xarray / einops) is unconditional: this specialist
    always pulls a Sentinel-2 chip from Microsoft Planetary Computer
    no matter where inference runs.
    Tier 2 — terratorch is needed only when inference must run
    locally; with a remote backend configured (app/inference.py on
    the AMD MI300X) it can be skipped entirely.

    Returns (ok, missing) where `missing` names the absent modules
    when ok is False. Splitting the gate this way lets the HF Space
    deployment fetch chips and use remote inference even though
    terratorch's ~250 MB transitive dep cone doesn't fit its build
    sandbox.
    """
    # Tier 1: every chip-fetch dependency must import.
    tier1 = ("planetary_computer", "pystac_client",
             "rioxarray", "xarray", "einops")
    absent = [mod for mod in tier1 if not _has_module(mod)]
    if absent:
        return False, ", ".join(absent)
    # Tier 2: terratorch is optional whenever remote inference works.
    try:
        from app import inference as _inf
        if _inf.remote_enabled():
            return True, None
    except Exception:
        pass
    if _has_module("terratorch"):
        return True, None
    return False, "terratorch (local inference)"
def _has_module(name: str) -> bool:
"""True if `name` imports cleanly. ImportError → not installed.
Other exceptions (e.g. torchvision::nms RuntimeError on the HF
Space) → treat as unavailable too; we don't want a clean-skip
intent to crash the FSM at deps-probe time."""
try:
__import__(name)
return True
except ImportError:
return False
except Exception as e:
log.warning("prithvi_live: %s import raised %s; treating as "
"unavailable", name, type(e).__name__)
return False
# Probe once at import time; fetch() consults these so deployments
# missing the dep cone skip the specialist with a clean "skipped" row.
_DEPS_OK, _DEPS_MISSING = _has_required_deps()
def warm():
    """Optionally pre-load the model at app boot.

    The FSM action lazy-loads anyway; calling warm() here just moves
    the first-query cost to startup. Failures are logged and swallowed
    so a bad warmup only degrades the specialist to a no-op.
    """
    if ENABLE:
        try:
            _ensure_model()
        except Exception:
            log.exception("prithvi_live: warm() failed; specialist will no-op")
def _ensure_model():
    """Load Prithvi-EO 2.0 once into RAM.
    The v2 NYC Pluvial fine-tune (`msradam/Prithvi-EO-2.0-NYC-Pluvial`)
    is **architecturally distinct** from the IBM-NASA Sen1Floods11
    base: v2 ships a `UNetDecoder` + 2-class head, the base ships a
    UperNet with PSP / FPN. The model has to be built from each
    repo's own config.yaml — there's no key-mapping shim that bridges
    them.
    Strategy:
    1. If the active REPO != BASE_REPO, try to build from the v2
       yaml + v2 ckpt. The v2 yaml's data: paths point at the
       training droplet's filesystem (`/root/terramind_nyc/...`)
       which doesn't exist locally; that's fine — the
       GenericNonGeoSegmentationDataModule constructor only
       records the paths, splits aren't read until `setup()`.
    2. On any v2 failure (yaml not present, datamodule constructor
       strict, weights mismatch), fall back to the base yaml + base
       ckpt. The base path is the proven pre-C5 behaviour.
    The shared `inference.run_model` helper is only published by the
    IBM-NASA base repo; we always pull it from there.

    Returns (model, run_model) and caches both in module globals;
    thread-safe via double-checked locking on _INIT_LOCK.
    """
    global _MODEL, _RUN_MODEL
    # Fast path: already loaded, no lock needed.
    if _MODEL is not None:
        return _MODEL, _RUN_MODEL
    with _INIT_LOCK:
        if _MODEL is not None:  # double-check inside the lock
            return _MODEL, _RUN_MODEL
        import importlib.util
        from huggingface_hub import hf_hub_download
        from terratorch.cli_tools import LightningInferenceModel
        log.info("prithvi_live: loading model from %s", REPO)
        # Inference helper only lives in the IBM-NASA base repo.
        inference_py = hf_hub_download(BASE_REPO, "inference.py")
        m = None
        # ---- v2 path: yaml + ckpt from the published repo ----------
        if REPO != BASE_REPO:
            try:
                # The v2 repo publishes `prithvi_nyc_phase14.yaml` and
                # `prithvi_nyc_pluvial_v2.ckpt`. Be tolerant of small
                # naming drift (best_val_loss.ckpt etc.) by probing.
                v2_yaml = None
                for name in ("prithvi_nyc_phase14.yaml",
                             "config.yaml", "phase14.yaml",
                             "prithvi_nyc_v2.yaml"):
                    try:
                        v2_yaml = hf_hub_download(REPO, name)
                        break
                    except Exception:
                        continue
                v2_ckpt = None
                for name in ("prithvi_nyc_pluvial_v2.ckpt",
                             "best_val_loss.ckpt", "model.ckpt",
                             "last.ckpt"):
                    try:
                        v2_ckpt = hf_hub_download(REPO, name)
                        break
                    except Exception:
                        continue
                if v2_yaml and v2_ckpt:
                    log.info("prithvi_live: building v2 model from "
                             "yaml=%s ckpt=%s", v2_yaml, v2_ckpt)
                    m = LightningInferenceModel.from_config(v2_yaml, v2_ckpt)
                    # prithvi_nyc_phase14.yaml uses GenericNonGeoSegmentationDataModule
                    # which omits test_transform (→ None) and uses terratorch Normalize
                    # for aug (only handles 4D/5D). IBM inference.py:run_model() calls
                    # both on a 3D dict. Patch both to match the IBM base contract:
                    # ToTensorV2 for test_transform; a dict-aware normalizer for aug.
                    if getattr(getattr(m, 'datamodule', None),
                               'test_transform', None) is None:
                        import albumentations as A
                        import torch as _torch
                        from albumentations.pytorch import ToTensorV2
                        m.datamodule.test_transform = A.Compose([ToTensorV2()])
                        _old = m.datamodule.aug
                        # IBM's inference.py:188 calls
                        # `datamodule.aug({'image': tensor})['image']`.
                        # kornia's AugmentationSequential doesn't accept
                        # dict input cleanly and tripped the
                        # `'list' object has no attribute 'view'`
                        # error on the L4 deploy. Use a hand-rolled
                        # dict-aware normalizer instead — same math,
                        # fewer moving parts, no kornia version skew.
                        class _DictNormalize:
                            # Standard per-channel normalize: stats are
                            # reshaped to (C, 1, 1) so they broadcast over
                            # an (C, H, W) image tensor.
                            def __init__(self, mean, std):
                                self.mean = _torch.as_tensor(mean).view(-1, 1, 1).float()
                                self.std = _torch.as_tensor(std).view(-1, 1, 1).float()
                            def __call__(self, sample):
                                # Dict contract (IBM inference.py): normalize
                                # sample["image"], pass other keys through.
                                if isinstance(sample, dict):
                                    img = sample["image"]
                                    mean = self.mean.to(img.device)
                                    std = self.std.to(img.device)
                                    return {**sample, "image": (img - mean) / std}
                                # Bare-tensor fallback.
                                mean = self.mean.to(sample.device)
                                std = self.std.to(sample.device)
                                return (sample - mean) / std
                        # `_old.means` / `_old.stds` come from the
                        # yaml as Python lists — calling `.view()` on
                        # them is what tripped the original
                        # `'list' object has no attribute 'view'`.
                        # _DictNormalize handles the conversion via
                        # torch.as_tensor internally; just pass the
                        # raw values whatever their type.
                        m.datamodule.aug = _DictNormalize(_old.means, _old.stds)
                        log.info("prithvi_live: patched v2 datamodule transforms "
                                 "for IBM inference.py compat (dict-aware Normalize)")
                else:
                    log.warning("prithvi_live: v2 yaml/ckpt not "
                                "discoverable in %s; falling back to base",
                                REPO)
            except Exception as e:
                log.warning("prithvi_live: v2 build failed (%s); "
                            "falling back to base", e)
                m = None
        # ---- base path: proven IBM-NASA Sen1Floods11 fine-tune -----
        if m is None:
            base_config = hf_hub_download(BASE_REPO, "config.yaml")
            base_ckpt = hf_hub_download(
                BASE_REPO, "Prithvi-EO-V2-300M-TL-Sen1Floods11.pt")
            m = LightningInferenceModel.from_config(base_config, base_ckpt)
        m.model.eval()
        # Best-effort CUDA move; stays on CPU on any failure.
        if DEVICE == "cuda":
            try:
                import torch
                if torch.cuda.is_available():
                    m.model.cuda()
            except Exception:
                log.exception("prithvi_live: cuda move failed")
        # Import IBM's downloaded inference.py as a throwaway module so
        # we can grab its run_model() without packaging it.
        spec = importlib.util.spec_from_file_location("_prithvi_inference",
                                                      inference_py)
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
        _MODEL = m
        _RUN_MODEL = mod.run_model
        return _MODEL, _RUN_MODEL
def _search_recent_scene(lat: float, lon: float):
    """Return the best low-cloud Sentinel-2 L2A STAC item near
    (lat, lon) from the last SEARCH_DAYS days, or None.

    "Best" = lowest eo:cloud_cover, ties broken by most-recent
    acquisition (see the sort key below). The search box is
    ±0.02° around the point.
    """
    import datetime as dt
    import planetary_computer as pc
    from pystac_client import Client
    # Fix: datetime.utcnow() is naive and deprecated (Python 3.12+);
    # use an aware UTC now before taking the date.
    end = dt.datetime.now(dt.timezone.utc).date()
    start = end - dt.timedelta(days=SEARCH_DAYS)
    client = Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=pc.sign_inplace,
    )
    delta = 0.02  # degrees; small box around the address
    search = client.search(
        collections=["sentinel-2-l2a"],
        bbox=[lon - delta, lat - delta, lon + delta, lat + delta],
        datetime=f"{start}/{end}",
        query={"eo:cloud_cover": {"lt": MAX_CLOUD_PCT}},
        max_items=20,
    )
    # Sort: clearest scene first; among equal cloud cover, newest first.
    items = sorted(
        search.items(),
        key=lambda it: (it.properties.get("eo:cloud_cover", 100),
                        -(it.datetime.timestamp() if it.datetime else 0)),
    )
    return items[0] if items else None
def _build_chip(item, lat: float, lon: float):
    """Returns (img, ref_da, epsg) — img is the (6, H, W) center-cropped
    float32 array; ref_da is the rioxarray DataArray of the reference
    band BEFORE the center crop (kept so we can compute the affine
    transform for polygonization in EPSG:4326)."""
    import numpy as np
    import rioxarray  # noqa: F401
    import xarray as xr
    from pyproj import Transformer
    # Resolve the scene's projected CRS. Some STAC items publish
    # `proj:code` ("EPSG:326xx") instead of the older `proj:epsg` int.
    if "proj:epsg" in item.properties:
        epsg = int(item.properties["proj:epsg"])
    else:
        code = item.properties.get("proj:code", "")
        if code.startswith("EPSG:"):
            epsg = int(code.split(":", 1)[1])
        else:
            raise RuntimeError("STAC item missing proj:epsg / proj:code")
    # Project the query point into scene CRS; build a CHIP_M-wide box.
    fwd = Transformer.from_crs("EPSG:4326", f"EPSG:{epsg}", always_xy=True)
    cx, cy = fwd.transform(lon, lat)
    xmin, xmax = cx - HALF_M, cx + HALF_M
    ymin, ymax = cy - HALF_M, cy + HALF_M
    # Reference band = first of BANDS; all other bands are matched to
    # its grid. Clip to the box, then trim to at most CHIP_PX per axis.
    ref = rioxarray.open_rasterio(item.assets[BANDS[0]].href, masked=False).squeeze(drop=True)
    ref = ref.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax)
    ref = ref.isel(y=slice(0, CHIP_PX), x=slice(0, CHIP_PX))
    arrs = [ref.astype("float32")]
    for b in BANDS[1:]:
        da = rioxarray.open_rasterio(item.assets[b].href, masked=False).squeeze(drop=True)
        da = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax)
        # Lower-resolution bands (e.g. the 20 m SWIR bands) come back
        # with a different shape; resample onto the reference grid.
        if da.shape != ref.shape:
            da = da.rio.reproject_match(ref)
        arrs.append(da.astype("float32"))
    stacked = xr.concat(arrs, dim="band", join="override").assign_coords(band=BANDS)
    img = stacked.values  # (6, H, W)
    # Center crop to IMG_SIZE x IMG_SIZE.
    _, h, w = img.shape
    sy, sx = (h - IMG_SIZE) // 2, (w - IMG_SIZE) // 2
    img = img[:, sy:sy + IMG_SIZE, sx:sx + IMG_SIZE]
    # Heuristic reflectance scaling: raw S2 DNs are large integers, so
    # a mean > 1 implies unscaled data — divide by 10000. Already-scaled
    # chips pass through untouched.
    if img.mean() > 1:
        img = img / 10000.0
    # nan_to_num: COG nodata fringes become 0 rather than poisoning
    # the model input.
    return np.nan_to_num(img.astype("float32")), ref, epsg
def _polygonize_mask(pred, ref_da, epsg: int) -> dict | None:
    """Vectorize the binary water mask into an EPSG:4326 GeoJSON
    FeatureCollection so the frontend can paint it on the MapLibre
    map. Returns None on failure (best-effort — never raises into the
    caller path).

    `pred` is the (IMG_SIZE, IMG_SIZE) class raster (water == 1);
    `ref_da` is the pre-crop reference band used to recover georeferencing.
    """
    try:
        import json
        import geopandas as gpd
        from rasterio.features import shapes
        from rasterio.transform import from_origin
        from shapely.geometry import shape
        # Reconstruct the affine transform of the center-cropped pred.
        # ref_da has 1024 px at 10 m; we cropped to the central 512.
        xs = ref_da.x.values
        ys = ref_da.y.values
        if len(xs) < IMG_SIZE or len(ys) < IMG_SIZE:
            return None
        # rioxarray gives pixel-centered coords; offset by half a pixel
        # to the upper-left to build a from_origin transform.
        sy = (len(ys) - IMG_SIZE) // 2
        sx = (len(xs) - IMG_SIZE) // 2
        # ys are descending (top-to-bottom); take the top of the crop.
        top_y = float(ys[sy]) + (PIXEL_M / 2.0)
        left_x = float(xs[sx]) - (PIXEL_M / 2.0)
        transform = from_origin(left_x, top_y, PIXEL_M, PIXEL_M)
        # Polygonize only the water class (1).
        mask = (pred == 1).astype("uint8")
        polys = []
        # `mask=` restricts shapes() to water pixels; the value check is
        # belt-and-braces.
        for geom, value in shapes(mask, mask=mask.astype(bool),
                                  transform=transform):
            if value != 1:
                continue
            polys.append(shape(geom))
        if not polys:
            return {"type": "FeatureCollection", "features": []}
        # Reproject from the scene CRS to WGS84 for the map frontend.
        gdf = gpd.GeoDataFrame({"geometry": polys},
                               crs=f"EPSG:{epsg}").to_crs("EPSG:4326")
        # Simplify slightly to keep the SSE payload small (10 m raster
        # over 5 km square = up to ~10 k tiny squares; simplification
        # collapses adjacent water pixels into smooth polygons).
        gdf["geometry"] = gdf.geometry.simplify(0.00005, preserve_topology=True)
        return json.loads(gdf.to_json())
    except Exception:
        log.exception("prithvi_live: polygonize failed")
        return None
def _fetch_inner(lat: float, lon: float, timeout_s: float) -> dict[str, Any]:
    """Core fetch logic — run inside a bounded thread via fetch().

    Pipeline: STAC scene search → 6-band chip build → inference
    (remote MI300X service when configured, local terratorch
    otherwise) → mask polygonization → result dict. Soft budget
    checks between the network-heavy stages return a `skipped` row
    instead of overrunning `timeout_s`. Never raises: the outermost
    handler converts anything unexpected into an ok=False `err` row.
    """
    t0 = time.time()
    try:
        item = _search_recent_scene(lat, lon)
        if item is None:
            return {"ok": False, "skipped": f"no <{MAX_CLOUD_PCT}% cloud "
                                            f"S2 in last {SEARCH_DAYS}d"}
        cc = float(item.properties.get("eo:cloud_cover", -1))
        if time.time() - t0 > timeout_s:
            return {"ok": False, "skipped": "stac search exceeded budget"}
        img, ref_da, epsg = _build_chip(item, lat, lon)
        if time.time() - t0 > timeout_s:
            return {"ok": False, "skipped": "chip build exceeded budget"}
        # v0.4.5 — try the MI300X inference service first if configured.
        # On RemoteUnreachable (service down / not configured / 5xx) fall
        # through to the local terratorch path. When remote is configured
        # but returns non-ok we surface that signal directly: the local
        # path on this machine has been brittle (v2 datamodule
        # `test_transform=None` race), so a configured remote is more
        # reliable than the fallback.
        #
        # Fix: the import is guarded separately. Previously
        # `from app import inference as _inf` sat inside the same try
        # whose except clause matched on `_inf.RemoteUnreachable`; when
        # the import itself failed, evaluating that clause raised
        # NameError (unbound `_inf`), turning "remote shim not
        # installed" into an err row instead of the intended fall-through
        # to the local path.
        _inf = None
        try:
            from app import inference as _inf
        except Exception:
            log.info("prithvi_live: app.inference unavailable; local only")
        remote_attempted = False
        if _inf is not None:
            try:
                if _inf.remote_enabled():
                    remote_attempted = True
                    remote = _inf.prithvi_pluvial(
                        img, scene_id=item.id,
                        scene_datetime=str(item.datetime),
                        cloud_cover=cc,
                        timeout=timeout_s,
                    )
                    if remote.get("ok"):
                        # Vectorize the remote prediction raster so the map
                        # actually renders the live water polygons. The
                        # droplet returns `pred_b64` (uint8 binary mask);
                        # we polygonize against the chip's WGS84 bounds
                        # which we know locally from `ref_da`.
                        polys = None
                        pred_b64 = remote.get("pred_b64")
                        pred_shape = remote.get("pred_shape")
                        if pred_b64 and pred_shape:
                            try:
                                xs = ref_da.x.values
                                ys = ref_da.y.values
                                from pyproj import Transformer
                                t_inv = Transformer.from_crs(
                                    f"EPSG:{epsg}", "EPSG:4326",
                                    always_xy=True)
                                minx, maxx = float(xs.min()), float(xs.max())
                                miny, maxy = float(ys.min()), float(ys.max())
                                minlon, minlat = t_inv.transform(minx, miny)
                                maxlon, maxlat = t_inv.transform(maxx, maxy)
                                from app.context._polygonize import (
                                    polygonize_binary_mask,
                                )
                                polys = polygonize_binary_mask(
                                    pred_b64, pred_shape,
                                    (minlon, minlat, maxlon, maxlat),
                                    label="water", fill_color="#1F77B4",
                                    simplify_tolerance=2e-5,
                                )
                            except Exception:
                                log.exception("prithvi_live: remote polygonize failed")
                                polys = None
                        return {
                            "ok": True,
                            "item_id": item.id,
                            "item_datetime": str(item.datetime),
                            "cloud_cover": cc,
                            "pct_water_full": remote.get("pct_water_full"),
                            "pct_water_within_500m": remote.get("pct_water_within_500m"),
                            "polygons_geojson": polys,
                            "compute": f"remote · {remote.get('device', 'gpu')}",
                            "elapsed_s": round(time.time() - t0, 2),
                        }
                    # Remote answered but reported failure — surface it.
                    err = (remote.get("err")
                           or remote.get("error")
                           or remote.get("skipped")
                           or "unknown")
                    return {"ok": False,
                            "skipped": f"remote prithvi-pluvial non-ok: {err}",
                            "elapsed_s": round(time.time() - t0, 2)}
            except _inf.RemoteUnreachable as e:
                log.info("prithvi_live: remote unreachable (%s)", e)
                if remote_attempted:
                    # Don't fall to local — torchvision::nms is broken on the
                    # CPU-tier UI Spaces and crashes the FSM specialist with
                    # a confusing RuntimeError. Return a clean skipped row so
                    # the trace says "remote unreachable" instead.
                    return {"ok": False,
                            "skipped": f"remote prithvi-pluvial unreachable: {e}",
                            "elapsed_s": round(time.time() - t0, 2)}
            except Exception as e:
                log.exception("prithvi_live: remote call failed")
                if remote_attempted:
                    return {"ok": False,
                            "skipped": f"remote prithvi-pluvial error: "
                                       f"{type(e).__name__}: {e}",
                            "elapsed_s": round(time.time() - t0, 2)}
        # Local fallback — the path that's been live since v0.4.4.
        # Reached only when remote_attempted is False (i.e. remote
        # backend not configured at all).
        model, run_model = _ensure_model()
        x = img[None, :, None, :, :]  # (1, 6, 1, H, W)
        pred_t = run_model(x, None, None, model.model, model.datamodule, IMG_SIZE)
        import numpy as np
        pred = pred_t[0].cpu().numpy().astype("uint8")
        pct_full = float(100.0 * pred.mean())
        # % water inside a CENTER_RADIUS_M circle around the chip center.
        yy, xx = np.indices(pred.shape)
        cy, cx = pred.shape[0] // 2, pred.shape[1] // 2
        radius_px = CENTER_RADIUS_M / PIXEL_M
        circle = (yy - cy) ** 2 + (xx - cx) ** 2 <= radius_px ** 2
        pct_500 = float(100.0 * pred[circle].mean()) if circle.sum() else 0.0
        polygons_geojson = _polygonize_mask(pred, ref_da, epsg)
        return {
            "ok": True,
            "item_id": item.id,
            "item_datetime": str(item.datetime),
            "cloud_cover": cc,
            "pct_water_full": pct_full,
            "pct_water_within_500m": pct_500,
            "polygons_geojson": polygons_geojson,
            "compute": "local",
            "elapsed_s": round(time.time() - t0, 2),
        }
    except Exception as e:
        log.exception("prithvi_live: fetch failed")
        return {"ok": False, "err": f"{type(e).__name__}: {e}",
                "elapsed_s": round(time.time() - t0, 2)}
def fetch(lat: float, lon: float, timeout_s: float = 60.0) -> dict[str, Any]:
    """Run the specialist. Wraps _fetch_inner in a bounded thread so that
    STAC searches and COG band reads (which lack per-request HTTP timeouts)
    cannot hang the FSM indefinitely.
    Returns a dict with at minimum:
        { "ok": bool, "skipped": str | None, "item_id": str | None,
          "cloud_cover": float | None, "pct_water_within_500m": float | None }
    Designed to never raise; failures show up as ok=False with an `err`.
    """
    if not ENABLE:
        return {"ok": False, "skipped": "RIPRAP_PRITHVI_LIVE_ENABLE=0"}
    if not _DEPS_OK:
        return {"ok": False,
                "skipped": f"deps unavailable on this deployment: "
                           f"{_DEPS_MISSING}"}
    hard_timeout = timeout_s + 15.0
    # Propagate the caller's emissions tracker into the worker thread.
    # Guarded: this function promises never to raise, and app.emissions
    # may be absent on minimal deployments.
    initializer = None
    try:
        from app import emissions as _emissions
        _parent_tracker = _emissions.current()
        initializer = lambda t=_parent_tracker: _emissions.install(t)
    except Exception:
        log.exception("prithvi_live: emissions propagation unavailable")
    pool = concurrent.futures.ThreadPoolExecutor(
        max_workers=1, initializer=initializer)
    try:
        future = pool.submit(_fetch_inner, lat, lon, timeout_s)
        try:
            return future.result(timeout=hard_timeout)
        except concurrent.futures.TimeoutError:
            log.warning("prithvi_live: hard timeout after %.0fs (STAC/COG hung)",
                        hard_timeout)
            return {"ok": False,
                    "skipped": f"prithvi_live timed out after {hard_timeout:.0f}s"}
    finally:
        # Fix: the old `with` block called shutdown(wait=True) on exit,
        # which joined the hung worker and blocked the caller past the
        # hard timeout anyway. wait=False returns immediately; the stuck
        # thread is abandoned rather than waited on.
        pool.shutdown(wait=False)