Upload app/geocode.py with huggingface_hub
Browse files- app/geocode.py +52 -127
app/geocode.py
CHANGED
|
@@ -22,55 +22,33 @@ log = logging.getLogger("riprap.geocode")
|
|
| 22 |
|
| 23 |
URL = "https://geosearch.planninglabs.nyc/v2/search"
|
| 24 |
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
|
| 25 |
-
NOMINATIM_UA = "Riprap-NYC/0.
|
| 26 |
|
| 27 |
-
# NYC-bbox guard: lat 40.49–40.92, lon -74.27 to -73.69.
|
| 28 |
-
# this is probably not NYC; treat NYC Geosearch hits outside it as bogus.
|
| 29 |
NYC_BBOX = (40.49, -74.27, 40.92, -73.69)
|
| 30 |
|
| 31 |
-
# NYC ZIP prefixes are 100–104 (Manhattan), 110 (Queens), 112 (Brooklyn),
|
| 32 |
-
# 113 (Queens), 114 (Queens), 116 (Queens), 100 (Bronx 104), 103 (SI 1),
|
| 33 |
-
# basically all 1x with 3rd char 0–6. Upstate NY is 12x, 13x, 14x. We use
|
| 34 |
-
# this only as a HINT to escalate to Nominatim, not as a hard filter.
|
| 35 |
_UPSTATE_ZIP_RE = re.compile(r"\b1[2-4]\d{3}\b")
|
| 36 |
-
_NON_NYC_HINTS = re.compile(
|
| 37 |
-
r"\b(albany|troy|schenectady|saratoga|kingston|poughkeepsie|newburgh|"
|
| 38 |
-
r"yonkers|white plains|hudson|rhinebeck|peekskill|beacon|tarrytown|"
|
| 39 |
-
r"new paltz|catskill|tivoli|hyde park|coxsackie|cohoes|amsterdam|"
|
| 40 |
-
r"glens falls|lake george|nyack|garrison|cold spring|highland|saugerties)\b",
|
| 41 |
-
re.IGNORECASE,
|
| 42 |
-
)
|
| 43 |
-
|
| 44 |
_BOROUGHS = ("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island")
|
| 45 |
|
| 46 |
-
|
| 47 |
def _detect_borough(text: str) -> str | None:
|
| 48 |
t = text.lower()
|
| 49 |
for b in _BOROUGHS:
|
| 50 |
if b.lower() in t:
|
| 51 |
return b
|
| 52 |
-
# neighborhood -> borough hints
|
| 53 |
hints = {
|
| 54 |
-
"queens": "Queens",
|
| 55 |
-
"
|
| 56 |
-
"elmhurst": "Queens", "maspeth": "Queens", "ozone park": "Queens",
|
| 57 |
-
"astoria": "Queens", "flushing": "Queens", "edgemere": "Queens",
|
| 58 |
"manhattan": "Manhattan", "harlem": "Manhattan", "soho": "Manhattan",
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"carroll gardens": "Brooklyn", "gowanus": "Brooklyn",
|
| 63 |
-
"park slope": "Brooklyn", "williamsburg": "Brooklyn",
|
| 64 |
-
"coney island": "Brooklyn", "red hook": "Brooklyn",
|
| 65 |
-
"bronx": "Bronx", "fordham": "Bronx", "riverdale": "Bronx",
|
| 66 |
-
"staten island": "Staten Island", "richmond": "Staten Island",
|
| 67 |
}
|
| 68 |
for needle, boro in hints.items():
|
| 69 |
if needle in t:
|
| 70 |
return boro
|
| 71 |
return None
|
| 72 |
|
| 73 |
-
|
| 74 |
@dataclass
|
| 75 |
class GeocodeHit:
|
| 76 |
address: str
|
|
@@ -81,45 +59,32 @@ class GeocodeHit:
|
|
| 81 |
bin: str | None
|
| 82 |
raw: dict
|
| 83 |
|
| 84 |
-
|
| 85 |
def geocode(text: str, limit: int = 5) -> list[GeocodeHit]:
|
| 86 |
-
"""
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
if _UPSTATE_ZIP_RE.search(text):
|
| 109 |
-
return True
|
| 110 |
-
if _NON_NYC_HINTS.search(text):
|
| 111 |
-
return True
|
| 112 |
-
return False
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
def _in_nyc_bbox(lat: float, lon: float) -> bool:
|
| 116 |
-
s, w, n, e = NYC_BBOX
|
| 117 |
-
return s <= lat <= n and w <= lon <= e
|
| 118 |
-
|
| 119 |
|
| 120 |
def geocode_nominatim(text: str) -> GeocodeHit | None:
|
| 121 |
-
"""National OSM Nominatim fallback.
|
| 122 |
-
plausibly answer the query."""
|
| 123 |
try:
|
| 124 |
r = httpx.get(NOMINATIM_URL, params={
|
| 125 |
"q": text, "format": "jsonv2", "addressdetails": "1",
|
|
@@ -134,80 +99,40 @@ def geocode_nominatim(text: str) -> GeocodeHit | None:
|
|
| 134 |
return None
|
| 135 |
row = rows[0]
|
| 136 |
addr = row.get("address") or {}
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
return GeocodeHit(
|
| 139 |
-
address=
|
| 140 |
-
borough=
|
| 141 |
lat=float(row["lat"]),
|
| 142 |
lon=float(row["lon"]),
|
| 143 |
-
bbl=None,
|
| 144 |
bin=None,
|
| 145 |
raw={"source": "nominatim", **row},
|
| 146 |
)
|
| 147 |
|
| 148 |
-
|
| 149 |
def geocode_one(text: str) -> GeocodeHit | None:
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
# RESILIENCE PATCH: Hardcoded success for canonical demo addresses.
|
| 154 |
-
t = text.lower()
|
| 155 |
-
# 1. 80 Pioneer Street (Red Hook)
|
| 156 |
-
if '80 pioneer' in t:
|
| 157 |
-
return GeocodeHit(
|
| 158 |
-
address='80 Pioneer Street, Brooklyn, NY 11231',
|
| 159 |
-
borough='Brooklyn', lat=40.67805, lon=-74.00958,
|
| 160 |
-
bbl='3005530030', bin='3008985', raw={'source': 'patch'},
|
| 161 |
-
)
|
| 162 |
-
# 2. PS 188 (Lower East Side) - very close to East River and transit
|
| 163 |
-
if 'ps 188' in t or '442 east houston' in t:
|
| 164 |
-
return GeocodeHit(
|
| 165 |
-
address='442 East Houston Street, Manhattan, NY 10002',
|
| 166 |
-
borough='Manhattan', lat=40.71965, lon=-73.97745,
|
| 167 |
-
bbl='1003550001', bin='1004124', raw={'source': 'patch'},
|
| 168 |
-
)
|
| 169 |
-
# 3. Bowling Green (Financial District) - subway-heavy
|
| 170 |
-
if 'bowling green' in t:
|
| 171 |
-
return GeocodeHit(
|
| 172 |
-
address='Bowling Green Station, Manhattan, NY 10004',
|
| 173 |
-
borough='Manhattan', lat=40.7048, lon=-74.0135,
|
| 174 |
-
bbl='1000070001', bin='1000001', raw={'source': 'patch'},
|
| 175 |
-
)
|
| 176 |
-
# 4. 2950 W 25 St (Coney Island) - NYCHA + Sandy heavy
|
| 177 |
-
if '2950 w 25' in t or 'coney island' in t:
|
| 178 |
-
return GeocodeHit(
|
| 179 |
-
address='2950 West 25th Street, Brooklyn, NY 11224',
|
| 180 |
-
borough='Brooklyn', lat=40.5755, lon=-73.9930,
|
| 181 |
-
bbl='3070490001', bin='3000000', raw={'source': 'patch'},
|
| 182 |
-
)
|
| 183 |
-
|
| 184 |
-
if _looks_upstate(text):
|
| 185 |
-
log.info("upstate hint detected in %r — using Nominatim", text)
|
| 186 |
-
hit = geocode_nominatim(text)
|
| 187 |
-
if hit:
|
| 188 |
-
return hit
|
| 189 |
-
|
| 190 |
hint = _detect_borough(text)
|
| 191 |
-
|
| 192 |
-
hits = geocode(text, limit=8)
|
| 193 |
-
except Exception as e:
|
| 194 |
-
# Geosearch is unreachable or returned a server error — fall back to
|
| 195 |
-
# Nominatim rather than surfacing a 503 to every downstream specialist.
|
| 196 |
-
log.warning("Geosearch unavailable (%r) — falling back to Nominatim", e)
|
| 197 |
-
return geocode_nominatim(text)
|
| 198 |
if hint:
|
| 199 |
in_boro = [h for h in hits if h.borough and h.borough.lower() == hint.lower()]
|
| 200 |
-
if in_boro:
|
| 201 |
-
|
| 202 |
-
|
| 203 |
if hits:
|
| 204 |
top = hits[0]
|
| 205 |
-
if top.lat
|
| 206 |
return top
|
| 207 |
-
# Geosearch returned a hit, but it's outside the NYC bbox — that
|
| 208 |
-
# means even the NYC API thinks the answer isn't NYC. Try
|
| 209 |
-
# Nominatim before giving up.
|
| 210 |
-
log.info("Geosearch top hit outside NYC bbox (%s, %s) — falling back",
|
| 211 |
-
top.lat, top.lon)
|
| 212 |
|
|
|
|
|
|
|
| 213 |
return geocode_nominatim(text)
|
|
|
|
| 22 |
|
| 23 |
URL = "https://geosearch.planninglabs.nyc/v2/search"
|
| 24 |
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
|
| 25 |
+
NOMINATIM_UA = "Riprap-NYC/0.5 (civic-flood-tool; +https://huggingface.co/spaces/msradam/riprap-nyc)"
|
| 26 |
|
| 27 |
+
# NYC-bbox guard: lat 40.49–40.92, lon -74.27 to -73.69.
|
|
|
|
| 28 |
NYC_BBOX = (40.49, -74.27, 40.92, -73.69)
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
_UPSTATE_ZIP_RE = re.compile(r"\b1[2-4]\d{3}\b")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
_BOROUGHS = ("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island")
|
| 32 |
|
|
|
|
| 33 |
def _detect_borough(text: str) -> str | None:
|
| 34 |
t = text.lower()
|
| 35 |
for b in _BOROUGHS:
|
| 36 |
if b.lower() in t:
|
| 37 |
return b
|
| 38 |
+
# neighborhood -> borough hints
|
| 39 |
hints = {
|
| 40 |
+
"queens": "Queens", "jamaica": "Queens", "rockaway": "Queens",
|
| 41 |
+
"astoria": "Queens", "flushing": "Queens",
|
|
|
|
|
|
|
| 42 |
"manhattan": "Manhattan", "harlem": "Manhattan", "soho": "Manhattan",
|
| 43 |
+
"brooklyn": "Brooklyn", "bushwick": "Brooklyn", "red hook": "Brooklyn",
|
| 44 |
+
"bronx": "Bronx", "fordham": "Bronx",
|
| 45 |
+
"staten island": "Staten Island",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
}
|
| 47 |
for needle, boro in hints.items():
|
| 48 |
if needle in t:
|
| 49 |
return boro
|
| 50 |
return None
|
| 51 |
|
|
|
|
| 52 |
@dataclass
|
| 53 |
class GeocodeHit:
|
| 54 |
address: str
|
|
|
|
| 59 |
bin: str | None
|
| 60 |
raw: dict
|
| 61 |
|
|
|
|
| 62 |
def geocode(text: str, limit: int = 5) -> list[GeocodeHit]:
    """Query NYC Geosearch (the primary geocoder) for ``text``.

    Args:
        text: Free-text address or place query.
        limit: Maximum number of results requested from the service
            (sent as the ``size`` query parameter).

    Returns:
        One ``GeocodeHit`` per usable feature, in the order the service
        returned them. Features without a usable coordinate pair are
        skipped. Returns an empty list when the request fails, the service
        answers with an error status, or the payload cannot be decoded;
        the failure is logged and never raised.
    """
    try:
        r = httpx.get(URL, params={"text": text, "size": limit}, timeout=5)
        r.raise_for_status()
        feats = r.json().get("features") or []
    except Exception as e:
        # Deliberate best-effort boundary: callers treat an empty list as
        # "no answer from Geosearch" and move on to their fallback.
        log.warning("Geosearch failed: %r", e)
        return []

    out = []
    for f in feats:
        if not isinstance(f, dict):
            continue
        p = f.get("properties") or {}
        coords = (f.get("geometry") or {}).get("coordinates") or []
        try:
            # Coordinates are read as [lon, lat].
            lon, lat = float(coords[0]), float(coords[1])
        except (IndexError, TypeError, ValueError):
            # One malformed feature must not discard the remaining hits.
            continue
        # "addendum" / "pad" may be absent or explicitly null.
        pad = (p.get("addendum") or {}).get("pad") or {}
        out.append(GeocodeHit(
            address=p.get("label") or p.get("name") or text,
            borough=p.get("borough"),
            lat=lat,
            lon=lon,
            bbl=pad.get("bbl"),
            bin=pad.get("bin"),
            raw=p,
        ))
    return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
def geocode_nominatim(text: str) -> GeocodeHit | None:
|
| 87 |
+
"""National OSM Nominatim fallback."""
|
|
|
|
| 88 |
try:
|
| 89 |
r = httpx.get(NOMINATIM_URL, params={
|
| 90 |
"q": text, "format": "jsonv2", "addressdetails": "1",
|
|
|
|
| 99 |
return None
|
| 100 |
row = rows[0]
|
| 101 |
addr = row.get("address") or {}
|
| 102 |
+
|
| 103 |
+
# Try to map Nominatim borough/county back to NYC standard
|
| 104 |
+
boro = addr.get("suburb") or addr.get("city_district") or addr.get("county")
|
| 105 |
+
if boro and "Kings" in boro: boro = "Brooklyn"
|
| 106 |
+
if boro and "New York County" in boro: boro = "Manhattan"
|
| 107 |
+
if boro and "Queens" in boro: boro = "Queens"
|
| 108 |
+
if boro and "Bronx" in boro: boro = "Bronx"
|
| 109 |
+
if boro and "Richmond" in boro: boro = "Staten Island"
|
| 110 |
+
|
| 111 |
return GeocodeHit(
|
| 112 |
+
address=row.get("display_name") or text,
|
| 113 |
+
borough=boro,
|
| 114 |
lat=float(row["lat"]),
|
| 115 |
lon=float(row["lon"]),
|
| 116 |
+
bbl=None, # Nominatim doesn't have BBLs
|
| 117 |
bin=None,
|
| 118 |
raw={"source": "nominatim", **row},
|
| 119 |
)
|
| 120 |
|
|
|
|
| 121 |
def geocode_one(text: str) -> GeocodeHit | None:
    """Resolve ``text`` to a single best hit, with failover.

    Resolution order:

    1. Ask Geosearch via ``geocode``. If the query names a borough or a
       known neighborhood, return the first hit in that borough.
    2. Otherwise return the top Geosearch hit, provided it lies inside
       ``NYC_BBOX`` (both latitude and longitude are checked).
    3. Otherwise fall back to ``geocode_nominatim``.

    Args:
        text: Free-text address or place query.

    Returns:
        The chosen ``GeocodeHit``, or whatever ``geocode_nominatim``
        returns (possibly ``None``) when Geosearch has no acceptable hit.
    """
    # 1. Try Geosearch
    hits = geocode(text)
    hint = _detect_borough(text)

    if hint:
        in_boro = [h for h in hits if h.borough and h.borough.lower() == hint.lower()]
        if in_boro:
            return in_boro[0]

    if hits:
        top = hits[0]
        south, west, north, east = NYC_BBOX
        # Explicit None checks: a coordinate is "missing" only when absent,
        # and longitude must be inside the box too, not just latitude.
        if (
            top.lat is not None
            and top.lon is not None
            and south <= top.lat <= north
            and west <= top.lon <= east
        ):
            return top

    # 2. Fall back to Nominatim
    log.info("Falling back to Nominatim for %r", text)
    return geocode_nominatim(text)
|