Offline pre-compute scripts
The fixtures Riprap ships baked in all start their lives in scripts/:
fetch_nyc_dem — pull USGS 3DEP tiles, mosaic to NYC bbox
compute_hydrology_indices — TWI + HAND from the DEM via whitebox
fetch_ida_hwms — STN API → ida_2021_hwms_ny.geojson
run_prithvi_flood — Prithvi-EO 2.0 segmentation runner
run_prithvi_ida — Ida-specific pre/post diff variant
build_{nycha,schools,mta_entrances}_register — full FSM pass over each
register's rows, JSON cached to data/registers/
audit — full evidence audit per query for the printable report
dry_run — end-to-end CLI smoke without uvicorn
These run offline, on a fat workstation; only their outputs land in
the runtime image.
- scripts/audit.py +159 -0
- scripts/build_mta_entrances_register.py +24 -0
- scripts/build_nycha_register.py +21 -0
- scripts/build_schools_register.py +23 -0
- scripts/compute_hydrology_indices.py +92 -0
- scripts/dry_run.py +127 -0
- scripts/fetch_ida_hwms.py +54 -0
- scripts/fetch_nyc_dem.py +50 -0
- scripts/run_prithvi_flood.py +172 -0
- scripts/run_prithvi_ida.py +214 -0
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hallucination audit harness.
|
| 2 |
+
|
| 3 |
+
Runs the FSM against a curated address sweep, logs every paragraph,
|
| 4 |
+
counts dropped sentences, flags any sentence with an event name not in
|
| 5 |
+
its source documents.
|
| 6 |
+
|
| 7 |
+
Run after the schools register has finished building (otherwise it
|
| 8 |
+
contends with the batch for Ollama).
|
| 9 |
+
|
| 10 |
+
python scripts/audit.py
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
import re
|
| 16 |
+
import sys
|
| 17 |
+
import time
|
| 18 |
+
import warnings
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
warnings.filterwarnings("ignore")
|
| 22 |
+
|
| 23 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 24 |
+
sys.path.insert(0, str(ROOT))
|
| 25 |
+
|
| 26 |
+
from app.fsm import run # noqa: E402
|
| 27 |
+
|
| 28 |
+
OUT = ROOT / "outputs" / "audit_log.jsonl"
|
| 29 |
+
OUT.parent.mkdir(exist_ok=True, parents=True)
|
| 30 |
+
|
| 31 |
+
# A curated cross-borough sweep covering the full range of conditions
|
| 32 |
+
ADDRESSES = [
|
| 33 |
+
# Far Rockaway / Sandy zone (everything fires)
|
| 34 |
+
"180 Beach 35 St, Queens",
|
| 35 |
+
"Beach 105 Street and Rockaway Boulevard, Queens",
|
| 36 |
+
|
| 37 |
+
# Hollis / Jamaica (Ida basement deaths)
|
| 38 |
+
"153-09 90 Avenue, Jamaica, Queens",
|
| 39 |
+
"Hollis Avenue and 200th Street, Queens",
|
| 40 |
+
|
| 41 |
+
# Brooklyn coastal β Coney Island / NYCHA
|
| 42 |
+
"2950 W 25 Street, Brooklyn",
|
| 43 |
+
"Surf Avenue and West 25 Street, Brooklyn",
|
| 44 |
+
"Sheepshead Bay Road, Brooklyn",
|
| 45 |
+
|
| 46 |
+
# Carroll Gardens / Gowanus (chronic flooding)
|
| 47 |
+
"Smith and 9 Street, Brooklyn",
|
| 48 |
+
"Carroll Street and 3 Avenue, Brooklyn",
|
| 49 |
+
|
| 50 |
+
# Lower Manhattan / Sandy zone
|
| 51 |
+
"280 Broome Street, Manhattan",
|
| 52 |
+
"South Street Seaport, Manhattan",
|
| 53 |
+
"Battery Park, Manhattan",
|
| 54 |
+
|
| 55 |
+
# Midtown / dry control
|
| 56 |
+
"350 5 Avenue, Manhattan", # Empire State
|
| 57 |
+
"1 Times Square, Manhattan",
|
| 58 |
+
"Lincoln Center, Manhattan",
|
| 59 |
+
|
| 60 |
+
# Bronx
|
| 61 |
+
"Pelham Bay Park, Bronx",
|
| 62 |
+
"Hunts Point, Bronx",
|
| 63 |
+
"Yankee Stadium, Bronx",
|
| 64 |
+
|
| 65 |
+
# Staten Island
|
| 66 |
+
"Tottenville, Staten Island",
|
| 67 |
+
"Great Kills, Staten Island",
|
| 68 |
+
"St. George Ferry Terminal, Staten Island",
|
| 69 |
+
|
| 70 |
+
# Queens dry / inland
|
| 71 |
+
"Forest Hills, Queens",
|
| 72 |
+
"JFK Airport, Queens",
|
| 73 |
+
"Astoria Park, Queens",
|
| 74 |
+
|
| 75 |
+
# Edge cases
|
| 76 |
+
"Brooklyn Bridge Park, Brooklyn",
|
| 77 |
+
"Roosevelt Island, Manhattan",
|
| 78 |
+
]
|
| 79 |
+
|
| 80 |
+
EVENT_NAMES = ["sandy", "ida", "ophelia", "henri", "irene", "isaias",
               "harvey", "katrina", "florence"]


def find_event_leaks(paragraph: str, doc_corpus: str) -> list[str]:
    """Return event names mentioned in *paragraph* but absent from *doc_corpus*.

    A "leak" is a hallucination signal: the model named a storm that none of
    the source documents sent to it actually mention.

    Uses whole-word matching — plain substring tests flagged false positives
    (e.g. "ida" inside "Florida", "holiday"; "irene" inside "sirene").
    """
    p = paragraph.lower()
    corpus = doc_corpus.lower()
    leaks = []
    for ev in EVENT_NAMES:
        pattern = rf"\b{re.escape(ev)}\b"
        if re.search(pattern, p) and not re.search(pattern, corpus):
            leaks.append(ev)
    return leaks
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def main() -> int:
    """Sweep every query in ADDRESSES through the FSM, append one JSON
    record per query to OUT (JSONL), and print a per-query line plus a
    final summary to stderr.

    Always returns 0; the summary counters are informational only.
    """
    # Start each audit run from a clean log file.
    if OUT.exists():
        OUT.unlink()
    print(f"running audit on {len(ADDRESSES)} addresses; logging to {OUT}",
          file=sys.stderr)

    summary = {
        "total": 0, "ok": 0, "dropped_total": 0,
        "with_drops": 0, "event_leaks": 0,
    }
    t0 = time.time()
    for q in ADDRESSES:
        try:
            r = run(q)
        except Exception as e:
            # A failed query is reported and skipped; it does not count
            # toward any summary bucket (including "total").
            print(f" ! {q[:50]:<50} ERR: {type(e).__name__}: {e}", file=sys.stderr)
            continue
        para = r.get("paragraph") or ""
        audit = r.get("audit") or {}
        dropped = audit.get("dropped", []) or []

        # rebuild a haystack from documents we sent to Granite
        from app.reconcile import build_documents
        # NOTE: build_documents needs the same snap shape the FSM stored
        snap = {k: r.get(k) for k in ("geocode","sandy","dep","floodnet",
                                      "nyc311","microtopo","ida_hwm","rag")}
        doc_msgs = build_documents(snap)
        haystack = "\n".join(m.get("content", "") for m in doc_msgs)

        # Event names present in the paragraph but absent from the source
        # documents are counted as hallucination leaks.
        leaks = find_event_leaks(para, haystack)

        rec = {
            "query": q,
            "address": (r.get("geocode") or {}).get("address"),
            "borough": (r.get("geocode") or {}).get("borough"),
            "paragraph": para,
            "raw": audit.get("raw"),
            "dropped": dropped,
            "event_leaks": leaks,
            "sandy": r.get("sandy"),
            "n_floodnet_events_3y": (r.get("floodnet") or {}).get("n_flood_events_3y", 0),
            "n_311": (r.get("nyc311") or {}).get("n", 0),
            "microtopo_pct_200m": (r.get("microtopo") or {}).get("rel_elev_pct_200m"),
        }
        # Append-mode JSONL so an interrupted run still leaves usable output.
        # default=str stringifies any non-JSON-native values in the snapshot.
        with OUT.open("a") as f:
            f.write(json.dumps(rec, default=str) + "\n")

        summary["total"] += 1
        summary["dropped_total"] += len(dropped)
        if dropped: summary["with_drops"] += 1
        if leaks: summary["event_leaks"] += 1
        if not leaks and not dropped: summary["ok"] += 1

        # NOTE(review): both branches of this marker expression use the same
        # (mojibake) character and the final "Β·" arm is unreachable, since
        # the ternary condition already splits on dropped/leaks — the original
        # glyphs (likely ✓ / ⚠ / ·) appear lost to an encoding mangle.
        marker = "β" if (not leaks and not dropped) else ("β " if dropped or leaks else "Β·")
        print(f" {marker} {q[:50]:<50} dropped={len(dropped)} leaks={leaks or '-'}",
              file=sys.stderr)

    elapsed = time.time() - t0
    print(f"\n=== SUMMARY (in {elapsed:.0f}s) ===", file=sys.stderr)
    for k, v in summary.items():
        print(f" {k:18s} {v}", file=sys.stderr)
    print(f"\nfull log: {OUT}", file=sys.stderr)
    return 0
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
if __name__ == "__main__":
|
| 159 |
+
sys.exit(main())
|
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pre-compute the MTA Subway Entrances flood-exposure register.
|
| 2 |
+
|
| 3 |
+
Run: python scripts/build_mta_entrances_register.py
|
| 4 |
+
|
| 5 |
+
Resume-safe: re-running picks up after a network blip.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import sys
|
| 10 |
+
import warnings
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
warnings.filterwarnings("ignore")
|
| 14 |
+
|
| 15 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 16 |
+
sys.path.insert(0, str(ROOT))
|
| 17 |
+
|
| 18 |
+
from app.assets import mta_entrances # noqa: E402
|
| 19 |
+
from app.register_builder import build_register # noqa: E402
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
if __name__ == "__main__":
|
| 23 |
+
build_register("mta_entrances", mta_entrances.load,
|
| 24 |
+
meta_keys=("name", "address", "borough", "entrance_type"))
|
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pre-compute the NYCHA developments flood-exposure register.
|
| 2 |
+
Run: python scripts/build_nycha_register.py
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
import warnings
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
warnings.filterwarnings("ignore")
|
| 11 |
+
|
| 12 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 13 |
+
sys.path.insert(0, str(ROOT))
|
| 14 |
+
|
| 15 |
+
from app.assets import nycha # noqa: E402
|
| 16 |
+
from app.register_builder import build_register # noqa: E402
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
if __name__ == "__main__":
|
| 20 |
+
build_register("nycha", nycha.load,
|
| 21 |
+
meta_keys=("name", "address", "borough", "tds_num"))
|
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pre-compute the NYC public schools flood-exposure register.
|
| 2 |
+
Run: python scripts/build_schools_register.py
|
| 3 |
+
|
| 4 |
+
Resume-safe: re-running picks up after a network blip.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
import warnings
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
warnings.filterwarnings("ignore")
|
| 13 |
+
|
| 14 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 15 |
+
sys.path.insert(0, str(ROOT))
|
| 16 |
+
|
| 17 |
+
from app.assets import schools # noqa: E402
|
| 18 |
+
from app.register_builder import build_register # noqa: E402
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if __name__ == "__main__":
|
| 22 |
+
build_register("schools", schools.load,
|
| 23 |
+
meta_keys=("name", "address", "borough", "bbl", "bin"))
|
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pre-compute TWI (Topographic Wetness Index) and HAND (Height Above
|
| 2 |
+
Nearest Drainage) for the cached NYC DEM.
|
| 3 |
+
|
| 4 |
+
These are standard hydrology indices used by InfoWorks ICM, HEC-RAS,
|
| 5 |
+
and the Forest Service / USGS. They give the microtopo specialist new
|
| 6 |
+
per-address signal beyond elevation percentile + relief:
|
| 7 |
+
|
| 8 |
+
- **TWI** = ln(specific_catchment_area / tan(slope)). HIGH values mean
|
| 9 |
+
a cell is saturation-prone (large upslope drainage area + low slope =
|
| 10 |
+
water accumulates here).
|
| 11 |
+
- **HAND** = vertical distance from each cell to the nearest channel.
|
| 12 |
+
LOW values (sub-meter) mean the address sits at or near drainage
|
| 13 |
+
level β flood-vulnerable. HIGH values mean it's perched on dry ground.
|
| 14 |
+
|
| 15 |
+
Output: data/twi.tif and data/hand.tif, aligned with data/nyc_dem_30m.tif.
|
| 16 |
+
|
| 17 |
+
Run: python scripts/compute_hydrology_indices.py
|
| 18 |
+
"""
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import sys
|
| 22 |
+
import warnings
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
warnings.filterwarnings("ignore")
|
| 26 |
+
|
| 27 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 28 |
+
DEM_PATH = ROOT / "data" / "nyc_dem_30m.tif"
|
| 29 |
+
TWI_OUT = ROOT / "data" / "twi.tif"
|
| 30 |
+
HAND_OUT = ROOT / "data" / "hand.tif"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def main() -> int:
|
| 34 |
+
if not DEM_PATH.exists():
|
| 35 |
+
print(f"missing {DEM_PATH}; run scripts/fetch_nyc_dem.py first",
|
| 36 |
+
file=sys.stderr)
|
| 37 |
+
return 1
|
| 38 |
+
if TWI_OUT.exists() and HAND_OUT.exists():
|
| 39 |
+
print(f"already exist: {TWI_OUT.name}, {HAND_OUT.name}", file=sys.stderr)
|
| 40 |
+
return 0
|
| 41 |
+
|
| 42 |
+
import whitebox_workflows as wbw
|
| 43 |
+
wbe = wbw.WbEnvironment()
|
| 44 |
+
wbe.verbose = True
|
| 45 |
+
wbe.working_directory = str(ROOT / "data")
|
| 46 |
+
|
| 47 |
+
print("loading DEM...", file=sys.stderr)
|
| 48 |
+
dem = wbe.read_raster(str(DEM_PATH))
|
| 49 |
+
|
| 50 |
+
# 1. Hydrologic conditioning β fill depressions so flow routes terminate
|
| 51 |
+
# at the boundary, not inside spurious sinks. Wang & Liu fill is fast.
|
| 52 |
+
print("filling depressions (Wang & Liu)...", file=sys.stderr)
|
| 53 |
+
dem_filled = wbe.fill_depressions_wang_and_liu(dem)
|
| 54 |
+
|
| 55 |
+
# 2. D-infinity flow accumulation -> specific catchment area for TWI
|
| 56 |
+
print("D-infinity flow accumulation...", file=sys.stderr)
|
| 57 |
+
sca = wbe.dinf_flow_accum(dem_filled, out_type="specific contributing area",
|
| 58 |
+
log_transform=False)
|
| 59 |
+
|
| 60 |
+
# 3. Slope (degrees) for TWI
|
| 61 |
+
print("slope...", file=sys.stderr)
|
| 62 |
+
slope = wbe.slope(dem_filled, units="degrees")
|
| 63 |
+
|
| 64 |
+
# 4. TWI = ln(SCA / tan(slope))
|
| 65 |
+
print("TWI...", file=sys.stderr)
|
| 66 |
+
twi = wbe.wetness_index(sca, slope)
|
| 67 |
+
wbe.write_raster(twi, str(TWI_OUT.name), compress=True)
|
| 68 |
+
|
| 69 |
+
# 5. Streams: D8 flow accumulation + threshold to a stream raster
|
| 70 |
+
print("D8 flow accumulation for stream extraction...", file=sys.stderr)
|
| 71 |
+
d8_accum = wbe.d8_flow_accum(dem_filled, out_type="cells",
|
| 72 |
+
log_transform=False)
|
| 73 |
+
|
| 74 |
+
# Threshold the flow accumulation to identify channels β pick a value that
|
| 75 |
+
# gives a reasonable drainage network density. For 30m DEM over NYC,
|
| 76 |
+
# >1500 cells (~1.35 kmΒ²) is a reasonable channel-initiation threshold.
|
| 77 |
+
print("extracting streams...", file=sys.stderr)
|
| 78 |
+
streams = wbe.extract_streams(d8_accum, threshold=1500.0)
|
| 79 |
+
|
| 80 |
+
# 6. HAND = vertical distance to nearest stream (along flow paths)
|
| 81 |
+
print("HAND (elevation_above_stream)...", file=sys.stderr)
|
| 82 |
+
hand = wbe.elevation_above_stream(dem_filled, streams)
|
| 83 |
+
wbe.write_raster(hand, str(HAND_OUT.name), compress=True)
|
| 84 |
+
|
| 85 |
+
print(f"\nwrote:\n {TWI_OUT} ({TWI_OUT.stat().st_size // 1024} KB)\n"
|
| 86 |
+
f" {HAND_OUT} ({HAND_OUT.stat().st_size // 1024} KB)",
|
| 87 |
+
file=sys.stderr)
|
| 88 |
+
return 0
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
if __name__ == "__main__":
|
| 92 |
+
sys.exit(main())
|
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quick end-to-end sanity check.
|
| 2 |
+
|
| 3 |
+
Exercises every public route once and prints a summary. Catches:
|
| 4 |
+
- 404/500s on routes
|
| 5 |
+
- missing static assets
|
| 6 |
+
- broken /api/stream or /api/compare SSE
|
| 7 |
+
- missing register data
|
| 8 |
+
- hallucination drops > N
|
| 9 |
+
|
| 10 |
+
Run while the server is up:
|
| 11 |
+
python scripts/dry_run.py
|
| 12 |
+
"""
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import sys
|
| 17 |
+
import time
|
| 18 |
+
|
| 19 |
+
import httpx
|
| 20 |
+
|
| 21 |
+
BASE = "http://127.0.0.1:8765"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def check(label: str, fn):
    """Run one probe and print a single result line.

    *fn* is a zero-argument callable returning ``(ok, detail)``. The line
    shows a pass/fail marker, the label, elapsed time, and the detail.
    Any exception from the probe is reported inline and counts as failure.
    Returns the probe's ok value (False on exception).
    """
    started = time.time()
    try:
        ok, detail = fn()
    except Exception as e:
        # Probe blew up — report it the same way as a failed check.
        elapsed = time.time() - started
        print(f" β {label:42s} ({elapsed:5.2f}s) EXCEPTION: {type(e).__name__}: {e}")
        return False
    elapsed = time.time() - started
    marker = "β" if ok else "β"
    print(f" {marker} {label:42s} ({elapsed:5.2f}s) {detail}")
    return ok
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def get_status(path: str) -> tuple[bool, str]:
    """GET *path* on the local server; ok iff the response is HTTP 200."""
    r = httpx.get(BASE + path, timeout=10)
    return r.status_code == 200, f"HTTP {r.status_code} ({len(r.content)} bytes)"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def stream_one(query: str) -> tuple[bool, str]:
    """Consume the /api/stream SSE feed for one query.

    Ok iff a "final" event arrives; the detail string reports step count,
    dropped-sentence count from the hallucination audit, and local energy.
    """
    with httpx.stream("GET", BASE + f"/api/stream?q={query}", timeout=120) as r:
        if r.status_code != 200:
            return False, f"HTTP {r.status_code}"
        steps = 0; final = None
        for line in r.iter_lines():
            # SSE data lines carry a "data: " prefix; payload is JSON.
            if line.startswith("data: "):
                d = json.loads(line[6:])
                if d.get("kind") == "step": steps += 1
                elif d.get("kind") == "final": final = d
        if not final:
            return False, f"no final event (steps={steps})"
        dropped = len(((final.get("audit") or {}).get("dropped") or []))
        en = final.get("energy") or {}
        return True, (f"steps={steps}, dropped={dropped}, "
                      f"energy={en.get('local_mwh','?')} mWh local")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def compare_one(a: str, b: str) -> tuple[bool, str]:
    """Consume the /api/compare SSE feed for a pair of queries.

    Ok iff BOTH sides ("a" and "b") emit a "final" event before the
    stream closes.
    """
    with httpx.stream("GET", BASE + f"/api/compare?a={a}&b={b}", timeout=120) as r:
        if r.status_code != 200:
            return False, f"HTTP {r.status_code}"
        finals = {}
        steps = 0
        for line in r.iter_lines():
            # SSE data lines carry a "data: " prefix; payload is JSON with a
            # "side" field identifying which query the event belongs to.
            if line.startswith("data: "):
                d = json.loads(line[6:])
                if d.get("kind") == "step": steps += 1
                elif d.get("kind") == "final": finals[d.get("side")] = d
        if "a" not in finals or "b" not in finals:
            return False, f"missing final (got {list(finals)})"
        return True, f"both sides done; steps={steps}"
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def register_check(asset_class: str) -> tuple[bool, str]:
    """Fetch /api/register/<asset_class> and summarize its row/tier counts.

    503 is treated as "register not built" (the pre-compute script has not
    run); any other non-200 is a plain HTTP failure.
    """
    r = httpx.get(BASE + f"/api/register/{asset_class}", timeout=10)
    if r.status_code == 503:
        return False, "register not built"
    if r.status_code != 200:
        return False, f"HTTP {r.status_code}"
    data = r.json()
    rows = data.get("rows", [])
    # Count rows per tier; rows without a "tier" key fall into bucket 0.
    tiers = {1: 0, 2: 0, 3: 0}
    for r_ in rows:
        tiers[r_.get("tier", 0)] = tiers.get(r_.get("tier", 0), 0) + 1
    return True, f"{len(rows)} rows Β· tier1={tiers.get(1,0)} t2={tiers.get(2,0)} t3={tiers.get(3,0)}"
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def main() -> int:
    """Exercise every public route once; return 0 iff all checks passed.

    Previously this function ignored every check() result and implicitly
    returned None, so ``sys.exit(main())`` always exited 0 and the smoke
    test could never fail CI. Results are now collected and the exit code
    is 1 when any check fails.
    """
    print(f"=== Riprap dry-run vs {BASE} ===\n")

    results: list[bool] = []
    record = results.append

    print("[Pages]")
    record(check("/", lambda: get_status("/")))
    record(check("/compare", lambda: get_status("/compare")))
    record(check("/register/schools", lambda: get_status("/register/schools")))
    record(check("/register/nycha", lambda: get_status("/register/nycha")))
    record(check("/static/style.css", lambda: get_status("/static/style.css")))
    record(check("/static/app.js", lambda: get_status("/static/app.js")))
    record(check("/static/compare.js", lambda: get_status("/static/compare.js")))
    record(check("/static/register.js", lambda: get_status("/static/register.js")))
    fontf = "/static/vendor/nyco/fonts/IBM-Plex-Sans/IBMPlexSans-Regular.woff2"
    record(check(fontf, lambda: get_status(fontf)))

    print("\n[API: layer endpoints]")
    record(check("/api/layers/sandy",
                 lambda: get_status("/api/layers/sandy?lat=40.59&lon=-73.77&r=1500")))
    record(check("/api/layers/dep_extreme_2080",
                 lambda: get_status("/api/layers/dep_extreme_2080?lat=40.59&lon=-73.77&r=1500")))
    record(check("/api/floodnet_near",
                 lambda: get_status("/api/floodnet_near?lat=40.59&lon=-73.77&r=1000")))

    print("\n[API: register endpoints]")
    record(check("/api/register/schools", lambda: register_check("schools")))
    record(check("/api/register/nycha", lambda: register_check("nycha")))

    print("\n[Streams]")
    record(check("stream Β· 180 Beach 35 St",
                 lambda: stream_one("180 Beach 35 St, Queens")))
    record(check("stream Β· Empire State (cleaner case)",
                 lambda: stream_one("350 5 Avenue, Manhattan")))
    record(check("compare Β· Hollis vs Empire State",
                 lambda: compare_one("153-09 90 Avenue Jamaica Queens",
                                     "350 5 Avenue Manhattan")))

    # Non-zero exit on any failure so CI / shell callers can detect breakage.
    return 0 if all(results) else 1
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
if __name__ == "__main__":
|
| 127 |
+
sys.exit(main())
|
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""One-shot fetch of NYC Hurricane Ida 2021 high-water marks from USGS STN.
|
| 2 |
+
|
| 3 |
+
Output: data/ida_2021_hwms_ny.geojson β point GeoJSON with elev_ft + site
|
| 4 |
+
metadata. Used by the Riprap agent's `step_ida_hwm` action as the
|
| 5 |
+
empirical post-event flood signal (the same role Prithvi-EO plays for
|
| 6 |
+
SAR-derived extents in the parent project).
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import sys
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
import httpx
|
| 15 |
+
|
| 16 |
+
OUT = Path(__file__).resolve().parent.parent / "data" / "ida_2021_hwms_ny.geojson"
|
| 17 |
+
URL = "https://stn.wim.usgs.gov/STNServices/HWMs/FilteredHWMs.json"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def main() -> int:
|
| 21 |
+
print("fetching USGS STN Ida 2021 NY HWMs...", file=sys.stderr)
|
| 22 |
+
r = httpx.get(URL, params={"Event": 312, "States": "NY"}, timeout=60)
|
| 23 |
+
r.raise_for_status()
|
| 24 |
+
data = r.json()
|
| 25 |
+
|
| 26 |
+
features = []
|
| 27 |
+
for d in data:
|
| 28 |
+
lat = d.get("latitude"); lon = d.get("longitude")
|
| 29 |
+
if lat is None or lon is None:
|
| 30 |
+
continue
|
| 31 |
+
features.append({
|
| 32 |
+
"type": "Feature",
|
| 33 |
+
"geometry": {"type": "Point", "coordinates": [lon, lat]},
|
| 34 |
+
"properties": {
|
| 35 |
+
"hwm_id": d.get("hwm_id"),
|
| 36 |
+
"site_no": d.get("site_no"),
|
| 37 |
+
"elev_ft": d.get("elev_ft"),
|
| 38 |
+
"height_above_gnd": d.get("height_above_gnd"),
|
| 39 |
+
"hwm_type": d.get("hwmTypeName"),
|
| 40 |
+
"hwm_quality": d.get("hwmQualityName"),
|
| 41 |
+
"county": d.get("countyName"),
|
| 42 |
+
"site_description": d.get("siteDescription"),
|
| 43 |
+
"waterbody": d.get("waterbody"),
|
| 44 |
+
},
|
| 45 |
+
})
|
| 46 |
+
OUT.parent.mkdir(exist_ok=True, parents=True)
|
| 47 |
+
OUT.write_text(json.dumps({"type": "FeatureCollection", "features": features}))
|
| 48 |
+
print(f"wrote {len(features)} HWMs -> {OUT} ({OUT.stat().st_size // 1024} KB)",
|
| 49 |
+
file=sys.stderr)
|
| 50 |
+
return 0
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
if __name__ == "__main__":
|
| 54 |
+
sys.exit(main())
|
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""One-shot fetch of an NYC-wide DEM for the microtopo specialist.
|
| 2 |
+
|
| 3 |
+
Run this once before launching the agent or web UI:
|
| 4 |
+
|
| 5 |
+
python scripts/fetch_nyc_dem.py
|
| 6 |
+
|
| 7 |
+
Output: data/nyc_dem_30m.tif (~few MB at 30 m, citywide).
|
| 8 |
+
We use 30 m resolution for the precomputed tile because at higher
|
| 9 |
+
resolution the file gets large and microtopo metrics (200/750 m
|
| 10 |
+
windows) don't need 10 m granularity.
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import sys
|
| 15 |
+
import warnings
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
warnings.filterwarnings("ignore")
|
| 19 |
+
|
| 20 |
+
import py3dep # noqa: E402
|
| 21 |
+
|
| 22 |
+
DATA = Path(__file__).resolve().parent.parent / "data"
|
| 23 |
+
OUT = DATA / "nyc_dem_30m.tif"
|
| 24 |
+
|
| 25 |
+
# NYC bbox (lon_min, lat_min, lon_max, lat_max) plus a bit of padding
|
| 26 |
+
NYC_BBOX = (-74.30, 40.45, -73.65, 40.95)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def main() -> int:
|
| 30 |
+
if OUT.exists():
|
| 31 |
+
print(f"already exists: {OUT}", file=sys.stderr)
|
| 32 |
+
return 0
|
| 33 |
+
DATA.mkdir(exist_ok=True, parents=True)
|
| 34 |
+
print(f"fetching NYC DEM @ 30 m for bbox {NYC_BBOX}", file=sys.stderr)
|
| 35 |
+
dem = py3dep.get_dem(NYC_BBOX, resolution=30)
|
| 36 |
+
print(f" shape: {dem.shape}", file=sys.stderr)
|
| 37 |
+
# Reproject to WGS84 if needed
|
| 38 |
+
try:
|
| 39 |
+
if dem.rio.crs and dem.rio.crs.to_epsg() != 4326:
|
| 40 |
+
dem = dem.rio.reproject("EPSG:4326")
|
| 41 |
+
print(" reprojected to EPSG:4326", file=sys.stderr)
|
| 42 |
+
except Exception:
|
| 43 |
+
pass
|
| 44 |
+
dem.rio.to_raster(str(OUT), compress="DEFLATE", dtype="float32")
|
| 45 |
+
print(f"wrote {OUT} ({OUT.stat().st_size // 1024} KB)", file=sys.stderr)
|
| 46 |
+
return 0
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
if __name__ == "__main__":
|
| 50 |
+
sys.exit(main())
|
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Run Prithvi-EO-2.0-300M-TL-Sen1Floods11 once on a low-cloud HLS scene
|
| 2 |
+
over NYC. Save the resulting water mask as a vectorized GeoJSON for use
|
| 3 |
+
as a Riprap flood-layer specialist.
|
| 4 |
+
|
| 5 |
+
This script defers to IBM's official inference.py (downloaded from the
|
| 6 |
+
model repo) rather than reimplementing the inference loop β that file
|
| 7 |
+
knows about the temporal/location-coord embeddings, the per-window
|
| 8 |
+
albumentations stack, and the upernet decoder output shape, all of
|
| 9 |
+
which are easy to get wrong.
|
| 10 |
+
|
| 11 |
+
python scripts/run_prithvi_flood.py
|
| 12 |
+
"""
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import importlib.util
|
| 16 |
+
import json
|
| 17 |
+
import sys
|
| 18 |
+
import warnings
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
warnings.filterwarnings("ignore")
|
| 22 |
+
|
| 23 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 24 |
+
OUT_DIR = ROOT / "data"
|
| 25 |
+
OUT_DIR.mkdir(exist_ok=True, parents=True)
|
| 26 |
+
|
| 27 |
+
# NYC needs two MGRS tiles to cover everything:
|
| 28 |
+
# T18TWL covers Manhattan, Bronx, western Brooklyn, Newark Bay
|
| 29 |
+
# T18TXK covers eastern Brooklyn, Queens, Far Rockaway, Jamaica Bay, Long Island Sound
|
| 30 |
+
SCENES = [
|
| 31 |
+
("HLS.S30.T18TWL.2024247T153941.v2.0", "2024-09-04"), # 1% cloud, central NYC
|
| 32 |
+
("HLS.S30.T18TXK.2024252T153819.v2.0", "2024-09-08"), # 0% cloud, eastern NYC
|
| 33 |
+
]
|
| 34 |
+
SCENE_ID, SCENE_DATE = SCENES[0] # back-compat for legacy users
|
| 35 |
+
MODEL_REPO = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
|
| 36 |
+
PRITHVI_BAND_NAMES = ["B02", "B03", "B04", "B8A", "B11", "B12"]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _stage_stack(out_path: Path, scene_id: str = SCENE_ID) -> bool:
    """Download the six Prithvi input bands for *scene_id* from Planetary
    Computer and write them as one 6-band float32 reflectance GeoTIFF at
    *out_path*. Returns True when the stack exists (fresh or cached),
    False when the scene could not be retrieved.
    """
    # Idempotent: a previously staged stack is reused as-is.
    if out_path.exists():
        return True
    # Heavy geo deps imported lazily so the cached path stays import-free.
    import pystac_client, planetary_computer, rasterio, numpy as np
    print(f"fetching scene {scene_id}...", file=sys.stderr)
    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        # Signs asset hrefs so rasterio can read them directly over HTTP.
        modifier=planetary_computer.sign_inplace,
    )
    item = catalog.get_collection("hls2-s30").get_item(scene_id)
    if item is None:
        print(" scene not retrievable", file=sys.stderr)
        return False
    arrays = []; profile = None
    for band in PRITHVI_BAND_NAMES:
        with rasterio.open(item.assets[band].href) as ds:
            arrays.append(ds.read(1))
            if profile is None:
                # First band's profile is the geo-template for the stack.
                profile = ds.profile.copy()
    stack = np.stack(arrays, axis=0).astype("float32")
    # Replace nodata -9999 before scaling. inference.py only treats nodata
    # correctly when explicit mean/std are configured — for this
    # Sen1Floods11 fine-tune mean/std are None, so we do the substitution
    # upstream and write a clean float32 raster in 0..1 reflectance units
    # (constant_scale=0.0001 in config => DN/10000).
    # NOTE(review): the original comment mentioned the NO_DATA_FLOAT
    # sentinel 0.0001, but the code substitutes 0.0 (which survives the
    # /10000 scaling as 0.0 and matches the nodata=0.0 tag below) —
    # confirm which value inference.py actually expects.
    stack[stack <= -9000] = 0.0
    stack = stack / 10000.0
    stack = np.clip(stack, 0.0, 1.0).astype("float32")
    profile.update(count=6, dtype="float32",
                   compress="DEFLATE", tiled=True,
                   blockxsize=256, blockysize=256, nodata=0.0)
    with rasterio.open(out_path, "w", **profile) as ds:
        # rasterio bands are 1-indexed.
        for i in range(6):
            ds.write(stack[i], i + 1)
    print(f" wrote {out_path} ({out_path.stat().st_size // (1024*1024)} MB) "
          f"(reflectance units, nodataβ0)", file=sys.stderr)
    return True
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _process_one(scene_id: str, scene_date: str) -> list[dict]:
    """Stage one MGRS tile, run Prithvi on it, and polygonise the water
    class.  The returned GeoJSON Features are reprojected to EPSG:4326 so
    tiles from different UTM zones can be merged downstream."""
    stack_path = OUT_DIR / f"hls_stack_{scene_date}.tif"
    if not _stage_stack(stack_path, scene_id=scene_id):
        return []

    # Pull the model's own inference runner, config, and weights from the Hub.
    from huggingface_hub import hf_hub_download
    inf_py = hf_hub_download(MODEL_REPO, "inference.py")
    cfg = hf_hub_download(MODEL_REPO, "config.yaml")
    ckpt = hf_hub_download(MODEL_REPO, "Prithvi-EO-V2-300M-TL-Sen1Floods11.pt")

    # Import the downloaded inference.py as a throwaway module.
    spec = importlib.util.spec_from_file_location("prithvi_inf", inf_py)
    runner = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(runner)

    run_dir = OUT_DIR / "prithvi_runs"
    run_dir.mkdir(exist_ok=True)

    pred_path = run_dir / f"pred_{stack_path.stem}.tiff"
    if pred_path.exists():
        print(f" reusing existing pred: {pred_path}", file=sys.stderr)
    else:
        print(f"running Prithvi on {scene_id}...", file=sys.stderr)
        runner.main(data_file=str(stack_path), config=cfg, checkpoint=ckpt,
                    output_dir=str(run_dir), rgb_outputs=False, input_indices=None)

    if not pred_path.exists():
        # The runner may pick a slightly different file name; glob for it.
        candidates = list(run_dir.glob(f"pred_{stack_path.stem}*"))
        pred_path = candidates[0] if candidates else None
    if pred_path is None or not pred_path.exists():
        print(f" no prediction tiff for {scene_id}", file=sys.stderr)
        return []

    import rasterio
    from rasterio.features import shapes
    from shapely.geometry import shape, mapping
    import geopandas as gpd

    with rasterio.open(pred_path) as ds:
        pred = ds.read(1)
        transform = ds.transform
        src_crs = ds.crs

    water_mask = pred == 255
    n_water = int(water_mask.sum())
    print(f" {scene_id}: {n_water} water px "
          f"({100*n_water/pred.size:.2f}%)", file=sys.stderr)

    feats = [
        {"type": "Feature",
         "geometry": mapping(poly),
         "properties": {"class": "water",
                        "scene_id": scene_id,
                        "scene_date": scene_date}}
        for geom, val in shapes(water_mask.astype("uint8"),
                                mask=water_mask, transform=transform)
        if val == 1
        for poly in (shape(geom),)
        if poly.area > 0
    ]
    if not feats:
        return []

    # Reproject to EPSG:4326 for cross-tile merging
    merged = gpd.GeoDataFrame.from_features(feats, crs=src_crs).to_crs("EPSG:4326")
    return json.loads(merged.to_json())["features"]
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def main() -> int:
    """Run every configured scene through Prithvi and merge the resulting
    water polygons into data/prithvi_flood_nyc.geojson.  Skips all work
    when the output already exists."""
    out_geojson = OUT_DIR / "prithvi_flood_nyc.geojson"
    if out_geojson.exists():
        print(f"already exists: {out_geojson}", file=sys.stderr)
        return 0

    all_features: list[dict] = []
    scene_ids: list[str] = []
    scene_dates: list[str] = []
    for sid, sdate in SCENES:
        scene_feats = _process_one(sid, sdate)
        all_features.extend(scene_feats)
        if scene_feats:
            # Record only the scenes that actually contributed polygons.
            scene_ids.append(sid)
            scene_dates.append(sdate)

    out = {"type": "FeatureCollection", "features": all_features,
           "scene_ids": scene_ids, "scene_dates": scene_dates,
           "model": MODEL_REPO, "crs": "EPSG:4326"}
    out_geojson.write_text(json.dumps(out))
    print(f"\nwrote {len(all_features)} water polygons across "
          f"{len(scene_ids)} scenes -> {out_geojson} "
          f"({out_geojson.stat().st_size // 1024} KB)", file=sys.stderr)
    return 0
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
if __name__ == "__main__":
    sys.exit(main())  # propagate main()'s int status to the shell
|
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Run Prithvi-EO 2.0 (Sen1Floods11) on a real Hurricane Ida pre/post pair.
|
| 2 |
+
|
| 3 |
+
Pre-event: HLS.S30.T18TWK.2021237T153809.v2.0 (2021-08-25, 3% cloud)
|
| 4 |
+
Post-event: HLS.S30.T18TWK.2021245T154911.v2.0 (2021-09-02, 1% cloud,
|
| 5 |
+
~12h after peak rainfall)
|
| 6 |
+
|
| 7 |
+
This is the genuinely-defensible Prithvi run for the demo: a real flood
|
| 8 |
+
event, two clean scenes within the model's optical comfort zone, with a
|
| 9 |
+
diff that isolates *new* surface water attributable to Ida from the
|
| 10 |
+
permanent rivers/harbor that are present in both scenes.
|
| 11 |
+
|
| 12 |
+
Honest framing baked into the metadata:
|
| 13 |
+
- The model still misses subway and basement flooding (sub-surface; the
|
| 14 |
+
dominant Ida damage mode in NYC). Optical satellite cannot see those.
|
| 15 |
+
- By 16:02 UTC Sep 2 (~12 h post-peak), pluvial street water had largely
|
| 16 |
+
drained. The diff signal is mostly: Jamaica Bay marsh ponding,
|
| 17 |
+
riverside spillover, low-lying park inundation.
|
| 18 |
+
- This is what an Apache-2.0 foundation model can defensibly contribute
|
| 19 |
+
to a flood-event assessment, and we say so in the report.
|
| 20 |
+
|
| 21 |
+
python scripts/run_prithvi_ida.py
|
| 22 |
+
"""
|
| 23 |
+
from __future__ import annotations
|
| 24 |
+
|
| 25 |
+
import importlib.util
|
| 26 |
+
import json
|
| 27 |
+
import sys
|
| 28 |
+
import warnings
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
|
| 31 |
+
# Suppress all Python warnings so the script's stderr log stays readable.
warnings.filterwarnings("ignore")

ROOT = Path(__file__).resolve().parent.parent  # repo root (scripts/..)
OUT_DIR = ROOT / "data"
OUT_DIR.mkdir(exist_ok=True, parents=True)

# HLS Sentinel-2 scene pair over MGRS tile 18TWK bracketing Hurricane Ida
# (pre: 2021-08-25, post: 2021-09-02 — dates per the module docstring).
PRE_SCENE = "HLS.S30.T18TWK.2021237T153809.v2.0"
POST_SCENE = "HLS.S30.T18TWK.2021245T154911.v2.0"
PRE_DATE = "2021-08-25"
POST_DATE = "2021-09-02"
EVENT = "Hurricane Ida"

# Hugging Face repo of the Sen1Floods11 fine-tune used for inference.
MODEL_REPO = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
# HLS asset names staged into the input stack, in the order the model expects.
PRITHVI_BAND_NAMES = ["B02", "B03", "B04", "B8A", "B11", "B12"]
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _stage_stack(out_path: Path, scene_id: str) -> bool:
    """Download HLS scene *scene_id* from Planetary Computer and write a
    float32 band stack (PRITHVI_BAND_NAMES order, 0..1 reflectance) to
    *out_path*.  An existing file is reused.  Returns False when the scene
    cannot be retrieved.
    """
    if out_path.exists():
        print(f" reusing {out_path.name}", file=sys.stderr)
        return True
    # Lazy imports: the heavy geo stack is only needed when actually fetching.
    import pystac_client, planetary_computer, rasterio, numpy as np
    print(f"fetching {scene_id}...", file=sys.stderr)
    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=planetary_computer.sign_inplace,
    )
    item = catalog.get_collection("hls2-s30").get_item(scene_id)
    if item is None:
        print(f" {scene_id} not retrievable", file=sys.stderr)
        return False
    arrays = []
    profile = None
    for band in PRITHVI_BAND_NAMES:
        with rasterio.open(item.assets[band].href) as ds:
            arrays.append(ds.read(1))
            if profile is None:
                profile = ds.profile.copy()
    stack = np.stack(arrays, axis=0).astype("float32")
    # -9999 nodata -> 0, then DN -> 0..1 reflectance (config constant_scale=0.0001).
    stack[stack <= -9000] = 0.0
    stack = np.clip(stack / 10000.0, 0.0, 1.0).astype("float32")
    # Derive the band count from the stack instead of hard-coding 6, so the
    # band list can change in one place (PRITHVI_BAND_NAMES).
    n_bands = stack.shape[0]
    profile.update(count=n_bands, dtype="float32",
                   compress="DEFLATE", tiled=True,
                   blockxsize=256, blockysize=256, nodata=0.0)
    with rasterio.open(out_path, "w", **profile) as ds:
        for i in range(n_bands):
            ds.write(stack[i], i + 1)
    print(f" wrote {out_path.name} ({out_path.stat().st_size // (1024*1024)} MB)",
          file=sys.stderr)
    return True
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _run_prithvi(stack_path: Path, out_dir: Path) -> Path | None:
    """Run Sen1Floods11 inference on *stack_path* unless a prediction
    already exists; return the prediction GeoTIFF path, or None when no
    output can be located."""
    expected = out_dir / f"pred_{stack_path.stem}.tiff"
    if expected.exists():
        print(f" reusing existing pred: {expected.name}", file=sys.stderr)
        return expected

    # Pull the runner script, config, and checkpoint from the model repo.
    from huggingface_hub import hf_hub_download
    runner_py = hf_hub_download(MODEL_REPO, "inference.py")
    config_path = hf_hub_download(MODEL_REPO, "config.yaml")
    weights = hf_hub_download(MODEL_REPO, "Prithvi-EO-V2-300M-TL-Sen1Floods11.pt")

    # Load the downloaded inference.py as an ad-hoc module and call its main().
    spec = importlib.util.spec_from_file_location("prithvi_inf", runner_py)
    runner = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(runner)

    print(f" running Prithvi on {stack_path.name}...", file=sys.stderr)
    runner.main(data_file=str(stack_path), config=config_path, checkpoint=weights,
                output_dir=str(out_dir), rgb_outputs=False, input_indices=None)

    if expected.exists():
        return expected
    # The runner may choose a slightly different suffix; fall back to a glob.
    return next(iter(out_dir.glob(f"pred_{stack_path.stem}*")), None)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _vectorize(mask, transform, crs, class_name: str) -> list[dict]:
    """Polygonise a boolean raster *mask* into EPSG:4326 GeoJSON Features
    carrying {"class": class_name}.  Returns [] for an all-False mask."""
    from rasterio.features import shapes
    from shapely.geometry import shape, mapping
    import geopandas as gpd
    feats = []
    for geom, val in shapes(mask.astype("uint8"), mask=mask, transform=transform):
        if val == 1:
            poly = shape(geom)
            if poly.area > 0:
                feats.append({"type": "Feature",
                              "geometry": mapping(poly),
                              "properties": {"class": class_name}})
    g = (gpd.GeoDataFrame.from_features(feats, crs=crs).to_crs("EPSG:4326")
         if feats else gpd.GeoDataFrame(geometry=[], crs="EPSG:4326"))
    return json.loads(g.to_json())["features"]


def main() -> int:
    """Stage the Ida pre/post scene pair, run Prithvi on both, diff the
    water masks, and write data/prithvi_ida_2021.geojson.

    Exit status: 0 success (or output already present), 1 scene staging
    failed, 2 inference failed.
    """
    out_geojson = OUT_DIR / "prithvi_ida_2021.geojson"
    if out_geojson.exists():
        print(f"already exists: {out_geojson}", file=sys.stderr)
        return 0

    pre_stack = OUT_DIR / f"hls_stack_pre_ida_{PRE_DATE}.tif"
    post_stack = OUT_DIR / f"hls_stack_post_ida_{POST_DATE}.tif"
    if not (_stage_stack(pre_stack, PRE_SCENE) and
            _stage_stack(post_stack, POST_SCENE)):
        return 1

    out_dir = OUT_DIR / "prithvi_runs"
    out_dir.mkdir(exist_ok=True)
    pre_pred = _run_prithvi(pre_stack, out_dir)
    post_pred = _run_prithvi(post_stack, out_dir)
    if pre_pred is None or post_pred is None:
        print("inference failed", file=sys.stderr)
        return 2

    # ---- diff: NEW water in post that wasn't in pre = Ida-attributable ----
    import rasterio
    with rasterio.open(pre_pred) as ds:
        pre = ds.read(1)
    with rasterio.open(post_pred) as ds:
        post = ds.read(1)
        transform = ds.transform
        crs = ds.crs

    # The model emits 0 / 255.  New-water = post(255) AND pre(!=255)
    new_water = (post == 255) & (pre != 255)
    post_water = post == 255  # all post-event water, kept for context/legend
    n_new = int(new_water.sum())
    n_pre = int((pre == 255).sum())
    n_post = int(post_water.sum())
    print(f" pre water px: {n_pre:>8d} ({100*n_pre/pre.size:.2f}%)", file=sys.stderr)
    print(f" post water px: {n_post:>8d} ({100*n_post/post.size:.2f}%)", file=sys.stderr)
    print(f" NEW water px: {n_new:>8d} ({100*n_new/post.size:.2f}%)", file=sys.stderr)

    # One shared helper replaces the two previously-duplicated vectorise loops.
    new_features = _vectorize(new_water, transform, crs, "new_water_post_ida")
    post_features = _vectorize(post_water, transform, crs, "post_event_water")

    out = {
        "type": "FeatureCollection",
        "features": new_features,
        "_post_event_water_features": post_features,  # carried for reference
        "event": EVENT,
        "pre_scene_id": PRE_SCENE, "pre_scene_date": PRE_DATE,
        "post_scene_id": POST_SCENE, "post_scene_date": POST_DATE,
        "model": MODEL_REPO,
        "crs": "EPSG:4326",
        "interpretation": (
            "Polygons in `features` are pixels classified as water in the "
            "post-event scene but NOT in the pre-event scene — i.e., "
            "candidate Hurricane Ida-attributable inundation. The Sep 2 "
            "Sentinel-2 pass was ~12 h after peak rainfall; pluvial street "
            "and basement flooding (the dominant Ida damage mode in NYC) "
            "had largely drained by then, so this signal mostly captures "
            "marsh ponding, riverside spillover, and low-lying park water. "
            "Subway and basement flooding are not surface-visible to "
            "optical satellites."
        ),
    }
    out_geojson.write_text(json.dumps(out))
    print(f"\nwrote {len(new_features)} new-water polygons + "
          f"{len(post_features)} post-event water polygons "
          f"-> {out_geojson} ({out_geojson.stat().st_size // 1024} KB)",
          file=sys.stderr)
    return 0
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
if __name__ == "__main__":
    sys.exit(main())  # propagate main()'s int status to the shell
|