File size: 8,738 Bytes
6a82282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5de71b8
6a82282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
"""NYC Neighborhood Tabulation Area (NTA 2020) resolver.

NTAs are NYC Department of City Planning's official neighborhood unit:
~262 polygons covering all 5 boroughs, including some park / airport
slivers. They are the canonical "neighborhood" unit for NYC civic data.

This module provides:
  - load() β†’ GeoDataFrame with all NTAs (cached)
  - resolve(name) β†’ list of matching NTAs by fuzzy name match, or by borough
  - by_code(code) β†’ exact lookup
  - polygon_for(code) β†’ shapely Polygon in EPSG:4326
"""
from __future__ import annotations

import re
from functools import lru_cache
from pathlib import Path
from typing import Any

import geopandas as gpd
from shapely.geometry import Polygon

DATA_PATH = Path(__file__).resolve().parents[2] / "data" / "nyc_ntas_2020.geojson"

# Common alias map: user-typed strings β†’ canonical NTA names. We don't need to
# be exhaustive here; the fuzzy matcher catches most cases. This handles the
# few hard ones where the official NTA name differs from local usage.
# Keys MUST be lowercase: resolve() looks up `query.lower()` against this map.
# Values must match the GeoJSON `ntaname` column byte-for-byte, or the alias
# silently falls through to fuzzy matching.
ALIASES = {
    "the rockaways":    "Rockaway Beach-Arverne-Edgemere",
    "rockaway":         "Rockaway Beach-Arverne-Edgemere",
    "brighton":         "Brighton Beach",
    "lower east side":  "Lower East Side",
    "les":              "Lower East Side",
    "soho":             "SoHo-Little Italy-Hudson Square",
    "tribeca":          "Tribeca-Civic Center",
    "fidi":             "Financial District-Battery Park City",
    "downtown brooklyn":"Downtown Brooklyn-DUMBO-Boerum Hill",
    "dumbo":            "Downtown Brooklyn-DUMBO-Boerum Hill",
    "park slope":       "Park Slope",
    "carroll gardens":  "Carroll Gardens-Cobble Hill-Gowanus-Red Hook",
    "red hook":         "Carroll Gardens-Cobble Hill-Gowanus-Red Hook",
    "gowanus":          "Carroll Gardens-Cobble Hill-Gowanus-Red Hook",
    "hollis":           "Queens Village-Hollis-Bellerose",
    "long island city": "Hunters Point-Sunnyside-West Maspeth",
    "lic":              "Hunters Point-Sunnyside-West Maspeth",
    "astoria":          "Astoria (Central)",
    "flushing":         "Flushing-Willets Point",
    "harlem":           "Central Harlem (North)",
    "east harlem":      "East Harlem (North)",
    "washington heights":"Washington Heights (North)",
    "midtown":          "Midtown South-Flatiron-Union Square",
    "upper east side":  "Upper East Side-Carnegie Hill",
    "ues":              "Upper East Side-Carnegie Hill",
    "upper west side":  "Upper West Side-Lincoln Square",
    "uws":              "Upper West Side-Lincoln Square",
    "coney island":     "Coney Island-Sea Gate",
}

# Borough names and common abbreviations β†’ the canonical `boroname` values
# used in the GeoJSON. Keys must be lowercase (borough_match lowercases input).
BOROUGH_NORMALIZE = {
    "manhattan": "Manhattan", "mn": "Manhattan",
    "brooklyn":  "Brooklyn",  "bk": "Brooklyn",  "kings": "Brooklyn",
    "queens":    "Queens",    "qn": "Queens",
    "bronx":     "Bronx",     "the bronx": "Bronx", "bx": "Bronx",
    "staten island": "Staten Island", "si": "Staten Island", "richmond": "Staten Island",
}


def _normalize(s: str) -> str:
    return re.sub(r"[^a-z]+", "", (s or "").lower())


@lru_cache(maxsize=1)
def load() -> gpd.GeoDataFrame:
    """Load the NTA 2020 GeoJSON in EPSG:4326. Cached for the process lifetime.

    A file with no CRS metadata is assumed to already be WGS84 and is tagged
    with set_crs(); calling to_crs() on a CRS-naive GeoDataFrame raises a
    ValueError, which the original `crs is None or ...` guard would hit.
    """
    g = gpd.read_file(DATA_PATH)
    if g.crs is None:
        # Declare, don't reproject: there is nothing to transform *from*.
        g = g.set_crs("EPSG:4326")
    elif g.crs.to_string() != "EPSG:4326":
        g = g.to_crs("EPSG:4326")
    return g


def by_code(code: str) -> dict | None:
    """Exact lookup of a single NTA by its 2020 code; None when unknown."""
    frame = load()
    matches = frame[frame["nta2020"] == code]
    return None if matches.empty else _row_to_dict(matches.iloc[0])


def _row_to_dict(row) -> dict:
    return {
        "nta_code":  row["nta2020"],
        "nta_name":  row["ntaname"],
        "borough":   row["boroname"],
        "cdta":      row.get("cdtaname"),
        "geometry":  row["geometry"],
    }


def borough_match(query: str) -> str | None:
    """Map a borough name or common abbreviation to its canonical form.

    Returns None when *query* is not a recognized borough spelling.
    """
    key = query.strip().lower()
    if key in BOROUGH_NORMALIZE:
        return BOROUGH_NORMALIZE[key]
    return None


def resolve(query: str) -> list[dict[str, Any]]:
    """Resolve a free-text query to one or more NTA records.

    Matching is attempted in strict priority order; the first strategy
    that yields hits wins:
      1. Borough name/abbreviation β†’ every NTA in that borough.
      2. ALIASES lookup β†’ exact NTA name.
      3. Case-insensitive exact name equality (so 'Kew Gardens' beats
         'Kew Gardens Hills' when both exist).
      4. Substring match on the normalized name, ordered by how close
         the candidate's normalized length is to the query's β€” keeps
         'Kew Gardens' from resolving to 'Kew Gardens Hills'.
      5. Substring match on the CDTA name, as a last resort.
    Returns [] when nothing matches or the query is blank.
    """
    frame = load()
    text = (query or "").strip()
    if not text:
        return []

    canon_boro = borough_match(text)
    if canon_boro:
        subset = frame[frame["boroname"] == canon_boro]
        return [_row_to_dict(row) for _, row in subset.iterrows()]

    canonical = ALIASES.get(text.lower())
    if canonical:
        subset = frame[frame["ntaname"] == canonical]
        if not subset.empty:
            return [_row_to_dict(row) for _, row in subset.iterrows()]

    # Exact case-insensitive equality takes precedence over substring hits.
    lowered_names = frame["ntaname"].fillna("").str.lower()
    exact_hits = frame[lowered_names == text.lower()]
    if not exact_hits.empty:
        return [_row_to_dict(row) for _, row in exact_hits.iterrows()]

    needle = _normalize(text)
    if not needle:
        return []
    normalized_names = frame["ntaname"].fillna("").map(_normalize)
    partial = frame[normalized_names.str.contains(needle, na=False)].copy()
    if not partial.empty:
        # Rank by |len difference| so the closest-length name comes first.
        partial["_diff"] = partial["ntaname"].fillna("").map(
            lambda nm: abs(len(_normalize(nm)) - len(needle))
        )
        partial = partial.sort_values("_diff")
        return [_row_to_dict(row) for _, row in partial.iterrows()]

    normalized_cdta = frame["cdtaname"].fillna("").map(_normalize)
    partial = frame[normalized_cdta.str.contains(needle, na=False)]
    if not partial.empty:
        return [_row_to_dict(row) for _, row in partial.iterrows()]

    return []


def polygon_for(code: str) -> Polygon | None:
    """Return the EPSG:4326 geometry for NTA *code*, or None if unknown."""
    record = by_code(code)
    if record is None:
        return None
    return record["geometry"]


def resolve_from_text(text: str) -> list[dict[str, Any]]:  # TODO(cleanup): cc-grade-D (25)
    """Scan free-text (e.g. a full natural-language query) for any known NTA
    name, alias, or borough. Returns the first match. This is the fallback
    when the planner failed to extract a clean target.

    Strategy: walk ALIASES first (cheap), then iterate NTA names and look
    for the longest match contained in the text. We prefer the longest
    match so 'Carroll Gardens' wins over 'Gardens'.
    """
    t = (text or "").lower()
    if not t:
        return []
    # Boroughs first (whole-word-ish β€” avoid false hits inside "queensland" etc.)
    for boro_key, canon in BOROUGH_NORMALIZE.items():
        if f" {boro_key} " in f" {t} " or t.startswith(boro_key + " ") or t.endswith(" " + boro_key):
            hits = resolve(canon)
            if hits:
                return hits
    # Alias keys, longest first
    for key in sorted(ALIASES.keys(), key=len, reverse=True):
        if key in t:
            hits = resolve(key)
            if hits:
                return hits
    # NTA names. Order: longest first so multi-word names match before
    # shorter substrings, AND preferring the WORD-BOUNDARY match so
    # "Kew Gardens" in the query doesn't collide with "Kew Gardens Hills"
    # (the latter is longer; without word-boundary checking it'd match
    # nothing, but with substring-in-text it'd match if the query ever
    # contained the longer phrase). Caller picks the closest-length match.
    g = load()
    names = sorted(set(g["ntaname"].dropna().str.lower().tolist()), key=len, reverse=True)
    matches = []
    for name in names:
        if not name or len(name) < 4:
            continue
        # Word-boundary-ish check: name must appear bounded by start/end or
        # whitespace/punct (so "kew gardens hills" matches but "kew gardens"
        # alone doesn't trigger "kew gardens hills" because of the trailing
        # space requirement).
        padded_t = f" {t} "
        if f" {name} " in padded_t or f" {name}." in padded_t or f" {name}," in padded_t or f" {name}?" in padded_t:
            matches.append(name)
    if matches:
        # Prefer the longest word-boundary match β€” most specific.
        best = sorted(matches, key=len, reverse=True)[0]
        hits = resolve(best)
        if hits:
            return hits
    # Fallback: any substring (no boundary). Less precise, but catches
    # casual queries like "show me red hook" where "red hook" is a
    # neighborhood-name fragment within a longer NTA name.
    for name in names:
        if not name or len(name) < 4:
            continue
        if name in t:
            hits = resolve(name)
            if hits:
                return hits
    return []