# riprap-nyc — app/inference.py
# seriffic: feat: terramind_synthesis now routes through droplet remote inference (eea4d6e)
"""Remote-vs-local ML inference router.
Mirrors the call-surface shape of `app/llm.py` but for the non-LLM
heavy models (Prithvi, TerraMind, TTM, Granite Embedding, GLiNER).
The droplet runs a `riprap-models` FastAPI service alongside vLLM that
exposes an OpenAI-style endpoint per model class. When configured the
router POSTs the relevant payload there and returns the parsed response;
on connection error / 5xx / timeout it surfaces a typed exception that
caller modules catch and fall back to a local in-process model load.
Backend selection (env):
RIPRAP_ML_BACKEND = "remote" | "local" | "auto" (default: auto)
- remote: use only the droplet, raise if it errors
- local : never call the droplet, always use the
in-process model
- auto : try remote first, fall back to local if
remote is unreachable / errors out;
same semantics as app/llm.py
RIPRAP_ML_BASE_URL = http://129.212.181.238:8002 (no trailing slash)
RIPRAP_ML_API_KEY = <bearer token>
The router is *transport*-only — it does not own model bytes, weights,
or framework imports. Each specialist that wants remote inference calls
into the helpers below and provides its own local fallback. That keeps
the dependency graph clean: the local code path keeps working when the
RIPRAP_ML_* env is unset (e.g. on first-light dev or in unit tests).
"""
from __future__ import annotations
import base64
import logging
import os
from collections.abc import Iterable
from typing import Any
import httpx
log = logging.getLogger("riprap.inference")

# Backend configuration, read once at import time (see module docstring
# for the full semantics of each variable).
_BACKEND = os.environ.get("RIPRAP_ML_BACKEND", "auto").lower()  # "remote" | "local" | "auto"
_BASE_URL = os.environ.get("RIPRAP_ML_BASE_URL", "").rstrip("/")  # "" means unconfigured
_API_KEY = os.environ.get("RIPRAP_ML_API_KEY", "")  # empty -> no Authorization header
_DEFAULT_TIMEOUT = float(os.environ.get("RIPRAP_ML_TIMEOUT_S", "60"))  # seconds per request
class RemoteUnreachable(RuntimeError):
    """The remote inference service could not be used.

    Raised when the backend is unconfigured, the connection fails or
    times out, or the service answers 5xx. Callers treat this as the
    signal to fall back to an in-process model load. 4xx responses are
    deliberately NOT wrapped in this type, so a bad request (a caller
    bug) surfaces instead of silently triggering the local fallback.
    """
def remote_enabled() -> bool:
    """Report whether remote inference calls may be attempted.

    False under explicit `local` mode, and false when no base URL has
    been configured (the default when the RIPRAP_ML_* env is unset).
    """
    return _BACKEND != "local" and bool(_BASE_URL)
def _client(timeout: float | None = None) -> httpx.Client:
    """Build an httpx.Client for the droplet service.

    Adds the bearer token only when one is configured; the User-Agent
    identifies the app build in the service-side logs. `timeout=None`
    falls back to the module-level default.
    """
    request_headers: dict[str, str] = {"User-Agent": "riprap-app/0.4.5"}
    if _API_KEY:
        request_headers["Authorization"] = f"Bearer {_API_KEY}"
    effective_timeout = _DEFAULT_TIMEOUT if timeout is None else timeout
    return httpx.Client(
        base_url=_BASE_URL,
        headers=request_headers,
        timeout=effective_timeout,
    )
def _post(path: str, payload: dict[str, Any], timeout: float | None = None) -> dict:
    """POST `payload` as JSON to the remote service's `path`.

    Returns the parsed JSON body. Raises RemoteUnreachable when the
    backend is unconfigured, on any transport-level failure, or on a
    5xx response; 4xx responses raise httpx.HTTPStatusError so caller
    bugs surface instead of being masked by the local fallback.
    """
    if not remote_enabled():
        raise RemoteUnreachable("remote ML backend not configured "
                                "(RIPRAP_ML_BASE_URL empty or BACKEND=local)")
    try:
        with _client(timeout) as c:
            r = c.post(path, json=payload)
    except httpx.TransportError as e:
        # TransportError is the documented base for connect/read/write
        # errors, every timeout flavour, and protocol violations. The
        # previous hand-picked tuple missed ProxyError, CloseError and
        # UnsupportedProtocol, which are equally "service unreachable"
        # from the caller's point of view.
        raise RemoteUnreachable(f"{type(e).__name__}: {e}") from e
    if r.status_code >= 500:
        raise RemoteUnreachable(f"HTTP {r.status_code} from {path}: {r.text[:200]}")
    r.raise_for_status()  # 4xx -> httpx.HTTPStatusError (caller bug, don't mask)
    return r.json()
def _serialize_array(arr) -> str:
"""numpy/torch tensor β†’ base64-encoded float32 raw bytes for transport.
Each remote handler decodes to (shape, dtype=float32) and reconstructs.
Reasonable round-trip for chips up to a few MB; large rasters should
use compressed numpy-savez instead β€” TODO when a model needs > 8 MB."""
import numpy as np
np_arr = arr if isinstance(arr, np.ndarray) else _to_numpy(arr)
np_arr = np_arr.astype("float32", copy=False)
return base64.b64encode(np_arr.tobytes()).decode("ascii")
def _to_numpy(t):
"""Best-effort tensor β†’ numpy. Accepts torch.Tensor or numpy already."""
try:
import torch
if isinstance(t, torch.Tensor):
return t.detach().cpu().numpy()
except ImportError:
pass
import numpy as np
return np.asarray(t)
def _deserialize_array(b64: str, shape: list[int]):
"""Inverse of _serialize_array β€” bytes β†’ numpy float32 with given shape."""
import numpy as np
raw = base64.b64decode(b64)
return np.frombuffer(raw, dtype="float32").reshape(shape)
# ---- Public router entry points -------------------------------------------
def healthcheck(timeout: float = 3.0) -> bool:
    """Cheap liveness probe for the droplet service.

    True only when GET /healthz answers 200 within `timeout` seconds.
    /api/backend uses this so the UI can show whether the remote ML
    backend is currently live.
    """
    if not remote_enabled():
        return False
    try:
        with _client(timeout) as conn:
            return conn.get("/healthz").status_code == 200
    except Exception:
        # Any failure (DNS, refused, TLS, timeout) just means "not live".
        return False
def backend_info() -> dict[str, Any]:
    """Snapshot for /api/backend describing what the UI should advertise."""
    enabled = remote_enabled()
    return {
        "backend": _BACKEND,
        "base_url": _BASE_URL or None,
        "remote_enabled": enabled,
        # Only probe the network when remote mode is actually on.
        "reachable": healthcheck() if enabled else False,
    }
def prithvi_pluvial(s2_chip, *, scene_id: str | None = None,
                    scene_datetime: str | None = None,
                    cloud_cover: float | None = None,
                    timeout: float | None = None) -> dict[str, Any]:
    """Run a Prithvi-NYC-Pluvial v2 forward pass on the droplet.

    `s2_chip` is a 6-band Sentinel-2 chip (numpy or torch, shape
    [6, H, W]). Returns the service dict:
    { ok, pct_water_within_500m, pct_water_full, scene_id, ... }.
    Raises RemoteUnreachable when the service is down.
    """
    chip = _to_numpy(s2_chip)
    payload = {
        "s2": _serialize_array(chip),
        "shape": list(chip.shape),
        "scene_id": scene_id,
        "scene_datetime": scene_datetime,
        "cloud_cover": cloud_cover,
    }
    return _post("/v1/prithvi-pluvial", payload, timeout=timeout)
def terramind(adapter: str, s2l2a=None, s1rtc=None, dem=None, *,
              timeout: float | None = None) -> dict[str, Any]:
    """Run TerraMind-NYC-Adapters (LULC / Buildings) or the v1 base
    generative path (synthesis) remotely.

    `adapter` is one of: lulc, buildings, synthesis. Each modality
    (s2l2a, s1rtc, dem) is a numpy array, torch tensor, or None —
    synthesis needs only DEM, while the LoRA adapters need S2L2A at
    minimum.
    """
    payload: dict[str, Any] = {"adapter": adapter}
    # Serialize each supplied modality under its wire key plus a
    # companion "<key>_shape" entry; absent modalities are omitted.
    for wire_key, modality in (("s2", s2l2a), ("s1", s1rtc), ("dem", dem)):
        if modality is None:
            continue
        mod_np = _to_numpy(modality)
        payload[wire_key] = _serialize_array(mod_np)
        payload[f"{wire_key}_shape"] = list(mod_np.shape)
    return _post("/v1/terramind", payload, timeout=timeout)
def ttm_forecast(model: str, history: Iterable[float], *,
                 context_length: int, prediction_length: int,
                 cadence: str = "h",
                 timeout: float | None = None) -> dict[str, Any]:
    """Run a Granite TTM r2 forecast on the droplet.

    `model` selects the checkpoint service-side: zero_shot_battery,
    fine_tune_battery, weekly_311, or floodnet_recurrence. `history`
    is a 1-D iterable of floats (the time series) and `cadence` labels
    its step (h / d / w / 6m). The service answers
    { ok, forecast: [...], peak_index, peak_value }.
    """
    payload = {
        "model": model,
        "history": [float(v) for v in history],
        "context_length": context_length,
        "prediction_length": prediction_length,
        "cadence": cadence,
    }
    return _post("/v1/ttm-forecast", payload, timeout=timeout)
def granite_embed(texts: list[str], *,
                  timeout: float | None = None) -> dict[str, Any]:
    """Batch-encode `texts` with Granite Embedding 278M remotely.

    Answer shape: { ok, vectors: [[float, ...], ...] }; vector dim is
    fixed at 768 (granite-embedding-278m-multilingual).
    """
    payload = {"texts": [*texts]}
    return _post("/v1/granite-embed", payload, timeout=timeout)
def gliner_extract(text: str, labels: list[str], *,
                   timeout: float | None = None) -> dict[str, Any]:
    """Run GLiNER typed-entity extraction remotely.

    Answer shape: { ok, entities: [{label, text, start, end, score}, ...] }.
    """
    payload = {"text": text, "labels": [*labels]}
    return _post("/v1/gliner-extract", payload, timeout=timeout)