# Source: riprap-nyc / app/geocode.py
# Uploaded by msradam with huggingface_hub (commit 820f968, verified).
"""Address geocoding — NYC primary + national fallback.
NYC primary: NYC DCP Geosearch (geosearch.planninglabs.nyc), no auth,
NYC-only. It will fuzzy-match upstate addresses to NYC streets — e.g.
'257 Washington Ave, Albany NY' silently maps to Clinton Hill, Brooklyn.
We detect this via a non-NYC region or non-NYC ZIP and fall back to
OpenStreetMap Nominatim (no key, free, rate-limited per usage policy).
Includes a borough-hint post-filter so Queens hyphenated-style addresses
(e.g. '153-09 90 Ave, Jamaica, Queens') preferentially resolve to the
borough the user named.
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
import httpx
# Module logger; all geocoder warnings/info go through the "riprap.geocode" channel.
log = logging.getLogger("riprap.geocode")
# NYC DCP Geosearch search endpoint (the NYC-only primary geocoder).
URL = "https://geosearch.planninglabs.nyc/v2/search"
# OpenStreetMap Nominatim search endpoint (the national fallback geocoder).
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
# User-Agent string sent with Nominatim requests to identify this application.
NOMINATIM_UA = "Riprap-NYC/0.5 (civic-flood-tool; +https://huggingface.co/spaces/msradam/riprap-nyc)"
# NYC-bbox guard: lat 40.49–40.92, lon -74.27 to -73.69.
# Tuple order is (min_lat, min_lon, max_lat, max_lon).
NYC_BBOX = (40.49, -74.27, 40.92, -73.69)
# Matches a standalone five-digit token in the range 12000–14999; going by the
# name, this is the ZIP range the module treats as upstate (non-NYC) New York.
_UPSTATE_ZIP_RE = re.compile(r"\b1[2-4]\d{3}\b")
_BOROUGHS = ("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island")

# Lower-cased place name -> canonical borough. The borough names themselves
# are included so a single scan handles both boroughs and neighborhoods.
_BOROUGH_HINTS = {
    **{b.lower(): b for b in _BOROUGHS},
    "harlem": "Manhattan", "soho": "Manhattan",
    "bushwick": "Brooklyn", "red hook": "Brooklyn",
    "fordham": "Bronx",
    "jamaica": "Queens", "rockaway": "Queens", "rockaways": "Queens",
    "astoria": "Queens", "flushing": "Queens",
}

# Whole-word alternation over every hint. Word boundaries keep look-alike
# place names ("Queensbury", "Bronxville") from being read as NYC boroughs.
_BOROUGH_HINT_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(k) for k in sorted(_BOROUGH_HINTS, key=len, reverse=True))
    + r")\b"
)


def _detect_borough(text: str) -> str | None:
    """Guess which NYC borough a free-text address names, if any.

    The text is scanned case-insensitively for whole-word occurrences of a
    borough name or one of a few well-known neighborhood names.

    When several names occur, the one appearing last wins: addresses put the
    locality after the street, so '100 Manhattan Ave, Brooklyn' resolves to
    Brooklyn rather than to the borough the street happens to be named after.

    Args:
        text: Free-text address or place description.

    Returns:
        The canonical borough name (one of ``_BOROUGHS``), or None when the
        text mentions no known borough or neighborhood.
    """
    if not text:
        return None
    # Match against the lower-cased text so every match is a key of the table.
    found = _BOROUGH_HINT_RE.findall(text.lower())
    if not found:
        return None
    return _BOROUGH_HINTS[found[-1]]
@dataclass
class GeocodeHit:
    """One geocoding result, produced by either NYC Geosearch or Nominatim."""

    # Human-readable label for the match (provider label, else the query text).
    address: str
    # Borough/district name as reported or mapped from the provider; may be None.
    borough: str | None
    # Latitude of the match, in decimal degrees.
    lat: float
    # Longitude of the match, in decimal degrees.
    lon: float
    # Read from Geosearch's addendum.pad.bbl (presumably the NYC
    # Borough-Block-Lot id); always None for Nominatim results.
    bbl: str | None
    # Read from Geosearch's addendum.pad.bin (presumably the NYC Building
    # Identification Number); always None for Nominatim results.
    bin: str | None
    # Provider payload: the Geosearch feature properties, or the Nominatim row
    # merged with a "source": "nominatim" marker.
    raw: dict
def _hit_from_feature(feature: object, fallback_label: str) -> GeocodeHit | None:
    """Build a GeocodeHit from one Geosearch GeoJSON feature.

    Returns None when the feature is malformed or has no usable coordinates,
    so that one bad feature cannot poison the whole result list.
    """
    if not isinstance(feature, dict):
        return None
    props = feature.get("properties")
    if not isinstance(props, dict):
        props = {}
    geometry = feature.get("geometry")
    coords = geometry.get("coordinates") if isinstance(geometry, dict) else None
    try:
        # GeoJSON positions are ordered (lon, lat).
        lon, lat = float(coords[0]), float(coords[1])
    except (IndexError, KeyError, TypeError, ValueError):
        return None
    # "addendum" / "pad" may be absent or explicitly null in the payload.
    addendum = props.get("addendum")
    pad = addendum.get("pad") if isinstance(addendum, dict) else None
    if not isinstance(pad, dict):
        pad = {}
    return GeocodeHit(
        address=props.get("label") or props.get("name") or fallback_label,
        borough=props.get("borough"),
        lat=lat,
        lon=lon,
        bbl=pad.get("bbl"),
        bin=pad.get("bin"),
        raw=props,
    )


def geocode(text: str, limit: int = 5) -> list[GeocodeHit]:
    """Query NYC Geosearch (the primary geocoder) for *text*.

    Args:
        text: Free-text address to look up.
        limit: Maximum number of candidates to request (sent as ``size``).

    Returns:
        Hits in the order the service returned them. Features without usable
        coordinates are skipped. Returns an empty list on any network, HTTP
        or decoding failure (the failure is logged, never raised).
    """
    # Best-effort boundary: any transport/HTTP/JSON failure means "no hits".
    try:
        r = httpx.get(URL, params={"text": text, "size": limit}, timeout=5)
        r.raise_for_status()
        payload = r.json()
    except Exception as e:
        log.warning("Geosearch failed: %r", e)
        return []
    feats = payload.get("features") if isinstance(payload, dict) else None
    if not isinstance(feats, list):
        return []
    # Parsing happens outside the try so a single odd feature is skipped
    # instead of discarding every other candidate.
    return [
        hit
        for f in feats
        if (hit := _hit_from_feature(f, text)) is not None
    ]
# Nominatim address values (lower-cased) -> canonical NYC borough name.
_NOMINATIM_BOROUGHS = {
    "manhattan": "Manhattan", "new york county": "Manhattan",
    "brooklyn": "Brooklyn", "kings county": "Brooklyn",
    "queens": "Queens", "queens county": "Queens",
    "bronx": "Bronx", "the bronx": "Bronx", "bronx county": "Bronx",
    "staten island": "Staten Island", "richmond county": "Staten Island",
}

# Address fields that may carry a borough or county, most specific first.
_NOMINATIM_BOROUGH_KEYS = ("borough", "suburb", "city_district", "county")


def _nominatim_borough(addr: dict) -> str | None:
    """Derive a borough label from a Nominatim ``address`` mapping.

    Returns the canonical NYC borough when any candidate field names a
    borough or its county and the result lies in New York State. Otherwise
    returns the first non-empty candidate field unchanged, or None.
    """
    values = []
    for key in _NOMINATIM_BOROUGH_KEYS:
        value = addr.get(key)
        if isinstance(value, str) and value.strip():
            values.append(value)
    if not values:
        return None
    # County names such as "Kings County" or "Richmond County" also exist in
    # other states, so only canonicalize results located in New York State.
    iso = addr.get("ISO3166-2-lvl4")
    state = addr.get("state")
    if isinstance(iso, str) and iso:
        in_new_york = iso == "US-NY"
    elif isinstance(state, str) and state:
        in_new_york = state.strip().lower() == "new york"
    else:
        in_new_york = True  # no state information: stay lenient
    if in_new_york:
        # Exact match, not substring: "Richmond Hill" is a Queens
        # neighborhood and "Kingsbridge" a Bronx one.
        for value in values:
            boro = _NOMINATIM_BOROUGHS.get(value.strip().lower())
            if boro:
                return boro
    return values[0]


def geocode_nominatim(text: str) -> GeocodeHit | None:
    """Look up *text* with OpenStreetMap Nominatim (national fallback).

    The search is restricted to the US and asks for a single best match.

    Args:
        text: Free-text address to look up.

    Returns:
        A GeocodeHit for the top result, with ``bbl``/``bin`` set to None, or
        None when the request fails, nothing matches, or the top result has
        no usable coordinates. Failures are logged, never raised.
    """
    try:
        r = httpx.get(NOMINATIM_URL, params={
            "q": text, "format": "jsonv2", "addressdetails": "1",
            "limit": 1, "countrycodes": "us",
        }, headers={"User-Agent": NOMINATIM_UA}, timeout=10)
        r.raise_for_status()
        rows = r.json()
    except Exception as e:
        log.warning("Nominatim fetch failed: %r", e)
        return None
    # A successful search yields a JSON list; anything else is "no result".
    if not isinstance(rows, list) or not rows:
        return None
    row = rows[0]
    if not isinstance(row, dict):
        return None
    try:
        lat = float(row["lat"])
        lon = float(row["lon"])
    except (KeyError, TypeError, ValueError) as e:
        log.warning("Nominatim returned unusable coordinates: %r", e)
        return None
    addr = row.get("address")
    if not isinstance(addr, dict):
        addr = {}
    return GeocodeHit(
        address=row.get("display_name") or text,
        borough=_nominatim_borough(addr),
        lat=lat,
        lon=lon,
        bbl=None,  # Nominatim doesn't have BBLs
        bin=None,
        raw={"source": "nominatim", **row},
    )
def _in_nyc_bbox(hit: GeocodeHit) -> bool:
    """Return True when the hit has coordinates inside ``NYC_BBOX``."""
    if hit.lat is None or hit.lon is None:
        return False
    min_lat, min_lon, max_lat, max_lon = NYC_BBOX
    return min_lat <= hit.lat <= max_lat and min_lon <= hit.lon <= max_lon


def _mentions_upstate_zip(text: str) -> bool:
    """Return True when *text* contains an upstate-NY ZIP code.

    A match at the very start of the text is ignored: that position is the
    house number (e.g. '13420 Rockaway Blvd'), not a ZIP.
    """
    stripped = text.lstrip()
    return any(m.start() > 0 for m in _UPSTATE_ZIP_RE.finditer(stripped))


def geocode_one(text: str) -> GeocodeHit | None:
    """Resolve *text* to a single best hit, NYC Geosearch first.

    Strategy:
      1. If the text carries an upstate ZIP, skip Geosearch entirely, since
         it would fuzzy-match the address onto an NYC street.
      2. Otherwise query Geosearch. If the text names a borough or a known
         neighborhood, prefer the first hit in that borough.
      3. Otherwise take the top Geosearch hit.
      4. Fall back to Nominatim when no Geosearch hit qualifies.

    A Geosearch hit is only accepted if its coordinates lie inside
    ``NYC_BBOX`` (both latitude and longitude).

    Args:
        text: Free-text address to look up.

    Returns:
        The chosen GeocodeHit, or None when neither geocoder yields a result.
    """
    if _mentions_upstate_zip(text):
        log.info("Upstate ZIP detected in %r; skipping Geosearch", text)
        return geocode_nominatim(text)

    # 1. Try Geosearch
    hits = geocode(text)
    hint = _detect_borough(text)
    if hint:
        wanted = hint.lower()
        for hit in hits:
            if hit.borough and hit.borough.lower() == wanted and _in_nyc_bbox(hit):
                return hit
    if hits and _in_nyc_bbox(hits[0]):
        return hits[0]

    # 2. Fall back to Nominatim
    log.info("Falling back to Nominatim for %r", text)
    return geocode_nominatim(text)