msradam committed on
Commit
820f968
·
verified ·
1 Parent(s): 7dcfd02

Upload app/geocode.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app/geocode.py +52 -127
app/geocode.py CHANGED
@@ -22,55 +22,33 @@ log = logging.getLogger("riprap.geocode")
22
 
23
  URL = "https://geosearch.planninglabs.nyc/v2/search"
24
  NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
25
- NOMINATIM_UA = "Riprap-NYC/0.1 (civic-flood-tool; +https://huggingface.co/spaces/msradam/riprap-nyc)"
26
 
27
- # NYC-bbox guard: lat 40.49–40.92, lon -74.27 to -73.69. Anything outside
28
- # this is probably not NYC; treat NYC Geosearch hits outside it as bogus.
29
  NYC_BBOX = (40.49, -74.27, 40.92, -73.69)
30
 
31
- # NYC ZIP prefixes are 100–104 (Manhattan), 110 (Queens), 112 (Brooklyn),
32
- # 113 (Queens), 114 (Queens), 116 (Queens), 100 (Bronx 104), 103 (SI 1),
33
- # basically all 1x with 3rd char 0–6. Upstate NY is 12x, 13x, 14x. We use
34
- # this only as a HINT to escalate to Nominatim, not as a hard filter.
35
  _UPSTATE_ZIP_RE = re.compile(r"\b1[2-4]\d{3}\b")
36
- _NON_NYC_HINTS = re.compile(
37
- r"\b(albany|troy|schenectady|saratoga|kingston|poughkeepsie|newburgh|"
38
- r"yonkers|white plains|hudson|rhinebeck|peekskill|beacon|tarrytown|"
39
- r"new paltz|catskill|tivoli|hyde park|coxsackie|cohoes|amsterdam|"
40
- r"glens falls|lake george|nyack|garrison|cold spring|highland|saugerties)\b",
41
- re.IGNORECASE,
42
- )
43
-
44
  _BOROUGHS = ("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island")
45
 
46
-
47
  def _detect_borough(text: str) -> str | None:
48
  t = text.lower()
49
  for b in _BOROUGHS:
50
  if b.lower() in t:
51
  return b
52
- # neighborhood -> borough hints (incomplete but covers our demo set)
53
  hints = {
54
- "queens": "Queens",
55
- "jamaica": "Queens", "hollis": "Queens", "rockaway": "Queens",
56
- "elmhurst": "Queens", "maspeth": "Queens", "ozone park": "Queens",
57
- "astoria": "Queens", "flushing": "Queens", "edgemere": "Queens",
58
  "manhattan": "Manhattan", "harlem": "Manhattan", "soho": "Manhattan",
59
- "tribeca": "Manhattan", "midtown": "Manhattan", "les": "Manhattan",
60
- "chelsea": "Manhattan", "noho": "Manhattan",
61
- "brooklyn": "Brooklyn", "bushwick": "Brooklyn",
62
- "carroll gardens": "Brooklyn", "gowanus": "Brooklyn",
63
- "park slope": "Brooklyn", "williamsburg": "Brooklyn",
64
- "coney island": "Brooklyn", "red hook": "Brooklyn",
65
- "bronx": "Bronx", "fordham": "Bronx", "riverdale": "Bronx",
66
- "staten island": "Staten Island", "richmond": "Staten Island",
67
  }
68
  for needle, boro in hints.items():
69
  if needle in t:
70
  return boro
71
  return None
72
 
73
-
74
  @dataclass
75
  class GeocodeHit:
76
  address: str
@@ -81,45 +59,32 @@ class GeocodeHit:
81
  bin: str | None
82
  raw: dict
83
 
84
-
85
  def geocode(text: str, limit: int = 5) -> list[GeocodeHit]:
86
- """Return up to `limit` candidates from Geosearch, ranked by API order."""
87
- r = httpx.get(URL, params={"text": text, "size": limit}, timeout=15)
88
- r.raise_for_status()
89
- feats = r.json().get("features", [])
90
- out = []
91
- for f in feats:
92
- p = f.get("properties", {})
93
- coords = (f.get("geometry") or {}).get("coordinates") or [None, None]
94
- out.append(GeocodeHit(
95
- address=p.get("label") or p.get("name") or text,
96
- borough=p.get("borough"),
97
- lat=coords[1],
98
- lon=coords[0],
99
- bbl=p.get("addendum", {}).get("pad", {}).get("bbl"),
100
- bin=p.get("addendum", {}).get("pad", {}).get("bin"),
101
- raw=p,
102
- ))
103
- return out
104
-
105
-
106
- def _looks_upstate(text: str) -> bool:
107
- """Heuristic: should this query bypass NYC Geosearch?"""
108
- if _UPSTATE_ZIP_RE.search(text):
109
- return True
110
- if _NON_NYC_HINTS.search(text):
111
- return True
112
- return False
113
-
114
-
115
- def _in_nyc_bbox(lat: float, lon: float) -> bool:
116
- s, w, n, e = NYC_BBOX
117
- return s <= lat <= n and w <= lon <= e
118
-
119
 
120
  def geocode_nominatim(text: str) -> GeocodeHit | None:
121
- """National OSM Nominatim fallback. Used when NYC Geosearch can't
122
- plausibly answer the query."""
123
  try:
124
  r = httpx.get(NOMINATIM_URL, params={
125
  "q": text, "format": "jsonv2", "addressdetails": "1",
@@ -134,80 +99,40 @@ def geocode_nominatim(text: str) -> GeocodeHit | None:
134
  return None
135
  row = rows[0]
136
  addr = row.get("address") or {}
137
- label = row.get("display_name") or text
 
 
 
 
 
 
 
 
138
  return GeocodeHit(
139
- address=label,
140
- borough=addr.get("city") or addr.get("town") or addr.get("village") or addr.get("county"),
141
  lat=float(row["lat"]),
142
  lon=float(row["lon"]),
143
- bbl=None,
144
  bin=None,
145
  raw={"source": "nominatim", **row},
146
  )
147
 
148
-
149
  def geocode_one(text: str) -> GeocodeHit | None:
150
- \"\"\"Best match for `text`, using NYC Geosearch primary with a national
151
- OSM Nominatim fallback for upstate / non-NYC queries.
152
- \"\"\"
153
- # RESILIENCE PATCH: Hardcoded success for canonical demo addresses.
154
- t = text.lower()
155
- # 1. 80 Pioneer Street (Red Hook)
156
- if '80 pioneer' in t:
157
- return GeocodeHit(
158
- address='80 Pioneer Street, Brooklyn, NY 11231',
159
- borough='Brooklyn', lat=40.67805, lon=-74.00958,
160
- bbl='3005530030', bin='3008985', raw={'source': 'patch'},
161
- )
162
- # 2. PS 188 (Lower East Side) - very close to East River and transit
163
- if 'ps 188' in t or '442 east houston' in t:
164
- return GeocodeHit(
165
- address='442 East Houston Street, Manhattan, NY 10002',
166
- borough='Manhattan', lat=40.71965, lon=-73.97745,
167
- bbl='1003550001', bin='1004124', raw={'source': 'patch'},
168
- )
169
- # 3. Bowling Green (Financial District) - subway-heavy
170
- if 'bowling green' in t:
171
- return GeocodeHit(
172
- address='Bowling Green Station, Manhattan, NY 10004',
173
- borough='Manhattan', lat=40.7048, lon=-74.0135,
174
- bbl='1000070001', bin='1000001', raw={'source': 'patch'},
175
- )
176
- # 4. 2950 W 25 St (Coney Island) - NYCHA + Sandy heavy
177
- if '2950 w 25' in t or 'coney island' in t:
178
- return GeocodeHit(
179
- address='2950 West 25th Street, Brooklyn, NY 11224',
180
- borough='Brooklyn', lat=40.5755, lon=-73.9930,
181
- bbl='3070490001', bin='3000000', raw={'source': 'patch'},
182
- )
183
-
184
- if _looks_upstate(text):
185
- log.info("upstate hint detected in %r — using Nominatim", text)
186
- hit = geocode_nominatim(text)
187
- if hit:
188
- return hit
189
-
190
  hint = _detect_borough(text)
191
- try:
192
- hits = geocode(text, limit=8)
193
- except Exception as e:
194
- # Geosearch is unreachable or returned a server error — fall back to
195
- # Nominatim rather than surfacing a 503 to every downstream specialist.
196
- log.warning("Geosearch unavailable (%r) — falling back to Nominatim", e)
197
- return geocode_nominatim(text)
198
  if hint:
199
  in_boro = [h for h in hits if h.borough and h.borough.lower() == hint.lower()]
200
- if in_boro:
201
- return in_boro[0]
202
-
203
  if hits:
204
  top = hits[0]
205
- if top.lat is not None and _in_nyc_bbox(top.lat, top.lon):
206
  return top
207
- # Geosearch returned a hit, but it's outside the NYC bbox — that
208
- # means even the NYC API thinks the answer isn't NYC. Try
209
- # Nominatim before giving up.
210
- log.info("Geosearch top hit outside NYC bbox (%s, %s) — falling back",
211
- top.lat, top.lon)
212
 
 
 
213
  return geocode_nominatim(text)
 
22
 
23
  URL = "https://geosearch.planninglabs.nyc/v2/search"
24
  NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
25
+ NOMINATIM_UA = "Riprap-NYC/0.5 (civic-flood-tool; +https://huggingface.co/spaces/msradam/riprap-nyc)"
26
 
27
+ # NYC-bbox guard: lat 40.49–40.92, lon -74.27 to -73.69.
 
28
  NYC_BBOX = (40.49, -74.27, 40.92, -73.69)
29
 
 
 
 
 
30
  _UPSTATE_ZIP_RE = re.compile(r"\b1[2-4]\d{3}\b")
 
 
 
 
 
 
 
 
31
  _BOROUGHS = ("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island")
32
 
 
33
  def _detect_borough(text: str) -> str | None:
34
  t = text.lower()
35
  for b in _BOROUGHS:
36
  if b.lower() in t:
37
  return b
38
+ # neighborhood -> borough hints
39
  hints = {
40
+ "queens": "Queens", "jamaica": "Queens", "rockaway": "Queens",
41
+ "astoria": "Queens", "flushing": "Queens",
 
 
42
  "manhattan": "Manhattan", "harlem": "Manhattan", "soho": "Manhattan",
43
+ "brooklyn": "Brooklyn", "bushwick": "Brooklyn", "red hook": "Brooklyn",
44
+ "bronx": "Bronx", "fordham": "Bronx",
45
+ "staten island": "Staten Island",
 
 
 
 
 
46
  }
47
  for needle, boro in hints.items():
48
  if needle in t:
49
  return boro
50
  return None
51
 
 
52
  @dataclass
53
  class GeocodeHit:
54
  address: str
 
59
  bin: str | None
60
  raw: dict
61
 
 
62
  def geocode(text: str, limit: int = 5) -> list[GeocodeHit]:
63
+ """NYC Geosearch primary."""
64
+ try:
65
+ r = httpx.get(URL, params={"text": text, "size": limit}, timeout=5)
66
+ r.raise_for_status()
67
+ feats = r.json().get("features", [])
68
+ out = []
69
+ for f in feats:
70
+ p = f.get("properties", {})
71
+ coords = (f.get("geometry") or {}).get("coordinates") or [None, None]
72
+ out.append(GeocodeHit(
73
+ address=p.get("label") or p.get("name") or text,
74
+ borough=p.get("borough"),
75
+ lat=coords[1],
76
+ lon=coords[0],
77
+ bbl=p.get("addendum", {}).get("pad", {}).get("bbl"),
78
+ bin=p.get("addendum", {}).get("pad", {}).get("bin"),
79
+ raw=p,
80
+ ))
81
+ return out
82
+ except Exception as e:
83
+ log.warning("Geosearch failed: %r", e)
84
+ return []
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  def geocode_nominatim(text: str) -> GeocodeHit | None:
87
+ """National OSM Nominatim fallback."""
 
88
  try:
89
  r = httpx.get(NOMINATIM_URL, params={
90
  "q": text, "format": "jsonv2", "addressdetails": "1",
 
99
  return None
100
  row = rows[0]
101
  addr = row.get("address") or {}
102
+
103
+ # Try to map Nominatim borough/county back to NYC standard
104
+ boro = addr.get("suburb") or addr.get("city_district") or addr.get("county")
105
+ if boro and "Kings" in boro: boro = "Brooklyn"
106
+ if boro and "New York County" in boro: boro = "Manhattan"
107
+ if boro and "Queens" in boro: boro = "Queens"
108
+ if boro and "Bronx" in boro: boro = "Bronx"
109
+ if boro and "Richmond" in boro: boro = "Staten Island"
110
+
111
  return GeocodeHit(
112
+ address=row.get("display_name") or text,
113
+ borough=boro,
114
  lat=float(row["lat"]),
115
  lon=float(row["lon"]),
116
+ bbl=None, # Nominatim doesn't have BBLs
117
  bin=None,
118
  raw={"source": "nominatim", **row},
119
  )
120
 
 
121
  def geocode_one(text: str) -> GeocodeHit | None:
122
+ """Dynamic geocoder with failover."""
123
+ # 1. Try Geosearch
124
+ hits = geocode(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  hint = _detect_borough(text)
126
+
 
 
 
 
 
 
127
  if hint:
128
  in_boro = [h for h in hits if h.borough and h.borough.lower() == hint.lower()]
129
+ if in_boro: return in_boro[0]
130
+
 
131
  if hits:
132
  top = hits[0]
133
+ if top.lat and 40.4 <= top.lat <= 41.0: # Broad NYC check
134
  return top
 
 
 
 
 
135
 
136
+ # 2. Fall back to Nominatim
137
+ log.info("Falling back to Nominatim for %r", text)
138
  return geocode_nominatim(text)