seriffic committed on
Commit
bc00192
·
1 Parent(s): d7bc619

Granite Embedding 278M RAG over five NYC resilience plans

Browse files

Five corpus PDFs constitute the textual evidence base:
comptroller_rain_2024 — NYC Comptroller cloudburst report
coned_22_e_0222 — ConEd Sandy storm-hardening rate case
dep_wastewater_2013 — DEP post-Sandy wastewater facilities plan
mta_resilience_2025 — MTA Climate Resilience Roadmap
nycha_lessons — NYCHA Sandy lessons-learned

Granite Embedding 278M (sentence-transformers, CPU-only) chunks every
PDF on first start, caches the embeddings, and answers retrieve(query)
with cosine top-k passages + page numbers. The reconciler treats each
chunk as a doc-role message so citations resolve back to a specific
PDF and page.

.gitattributes routes *.pdf through LFS.

.gitattributes CHANGED
@@ -1,6 +1,7 @@
1
  # Riprap-specific LFS tracking
2
  *.geojson filter=lfs diff=lfs merge=lfs -text
3
  *.tif filter=lfs diff=lfs merge=lfs -text
 
4
 
5
  # Esri FileGDB internal binary files (DEP Stormwater scenario data)
6
  *.gdbtable filter=lfs diff=lfs merge=lfs -text
 
1
  # Riprap-specific LFS tracking
2
  *.geojson filter=lfs diff=lfs merge=lfs -text
3
  *.tif filter=lfs diff=lfs merge=lfs -text
4
+ *.pdf filter=lfs diff=lfs merge=lfs -text
5
 
6
  # Esri FileGDB internal binary files (DEP Stormwater scenario data)
7
  *.gdbtable filter=lfs diff=lfs merge=lfs -text
app/rag.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Granite Embedding 278M RAG over the NYC flood-resilience policy corpus.
2
+
3
+ Specialists this powers:
4
+ step_rag — for any query (geo + intent), retrieve top-k relevant
5
+ policy paragraphs from HMP/NPCC4/DEP/MTA/NYCHA/Comptroller
6
+ and emit them as <document id="rag_*"> blocks.
7
+
8
+ We chunk page-by-page with a soft target of ~600 chars per chunk, embed
9
+ once at startup, and store a numpy matrix + FAISS L2 index in memory.
10
+ The index is small (~1k chunks across 5 PDFs).
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import re
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+
21
+ log = logging.getLogger("riprap.rag")
22
+
23
+ CORPUS_DIR = Path(__file__).resolve().parent.parent / "corpus"
24
+ EMBED_MODEL_NAME = "ibm-granite/granite-embedding-278m-multilingual"
25
+
26
+ CORPUS_META = {
27
+ "dep_wastewater_2013.pdf": {
28
+ "doc_id": "rag_dep_2013",
29
+ "title": "NYC DEP Wastewater Resiliency Plan (2013)",
30
+ "citation": "NYC DEP Wastewater Resiliency Plan, 2013",
31
+ },
32
+ "nycha_lessons.pdf": {
33
+ "doc_id": "rag_nycha",
34
+ "title": "Flood Resilience at NYCHA — Lessons Learned",
35
+ "citation": "NYCHA, Flood Resilience: Lessons Learned",
36
+ },
37
+ "coned_22_e_0222.pdf": {
38
+ "doc_id": "rag_coned",
39
+ "title": "Con Edison Climate Change Resilience Plan (2023, Case 22-E-0222)",
40
+ "citation": "Con Edison Climate Change Resilience Plan (2023, NY PSC Case 22-E-0222)",
41
+ },
42
+ "mta_resilience_2025.pdf": {
43
+ "doc_id": "rag_mta",
44
+ "title": "MTA Climate Resilience Roadmap (October 2025 update)",
45
+ "citation": "MTA Climate Resilience Roadmap, October 2025 update",
46
+ },
47
+ "comptroller_rain_2024.pdf": {
48
+ "doc_id": "rag_comptroller",
49
+ "title": "NYC Comptroller — Is NYC Ready for Rain? (2024)",
50
+ "citation": "NYC Comptroller, \"Is New York City Ready for Rain?\" (2024)",
51
+ },
52
+ }
53
+
54
+
55
@dataclass
class Chunk:
    """One retrievable passage: a span of text from a single PDF page."""

    text: str      # chunk body, whitespace-normalized
    file: str      # source PDF filename within CORPUS_DIR
    page: int      # 1-based page number in the source PDF
    doc_id: str    # stable id used for <document id="..."> and per-doc dedupe
    title: str     # human-readable document title
    citation: str  # formatted citation string for downstream attribution
+
65
def _chunks_from_pdf(path: Path, target_chars: int = 700) -> list[Chunk]:
    """Extract page-wise, roughly ``target_chars``-sized chunks from one PDF.

    Pages whose extracted text (after whitespace collapsing) is shorter than
    80 characters are skipped; chunk boundaries fall on sentence ends.
    Returns an empty list when the PDF cannot be opened.
    """
    import pypdf

    # Fall back to filename-derived metadata for PDFs absent from CORPUS_META.
    meta = CORPUS_META.get(path.name, {
        "doc_id": f"rag_{path.stem}",
        "title": path.stem,
        "citation": path.stem,
    })

    try:
        reader = pypdf.PdfReader(str(path))
    except Exception as e:
        log.warning("pdf load failed for %s: %s", path.name, e)
        return []

    chunks: list[Chunk] = []

    def flush(text: str, page_no: int) -> None:
        # Materialize one chunk carrying the document-level metadata.
        chunks.append(Chunk(text=text, file=path.name, page=page_no,
                            doc_id=meta["doc_id"], title=meta["title"],
                            citation=meta["citation"]))

    for page_no, page in enumerate(reader.pages, start=1):
        try:
            raw = page.extract_text() or ""
        except Exception:
            raw = ""
        flat = re.sub(r"\s+", " ", raw).strip()
        if len(flat) < 80:
            continue  # near-empty page (figures, headers) — nothing worth indexing
        # Accumulate sentences until the soft size target is exceeded.
        buf = ""
        for sent in re.split(r"(?<=[.!?])\s+", flat):
            if not buf:
                buf = sent
            elif len(buf) + len(sent) + 1 <= target_chars:
                buf = f"{buf} {sent}"
            else:
                flush(buf, page_no)
                buf = sent
        if buf:
            flush(buf, page_no)
    return chunks
102
+
103
+
104
# Lazily-built process-wide index: {"chunks": list[Chunk],
# "embs": float32 matrix | None, "model": SentenceTransformer | None}.
_INDEX: dict | None = None


def _ensure_index():
    """Build (on first call) and return the in-memory embedding index.

    Chunks every PDF under CORPUS_DIR, then loads the embedding model and
    encodes all chunks into one L2-normalized float32 matrix. When no chunks
    are found, a stub index with ``embs``/``model`` set to None is cached so
    callers can degrade gracefully.
    """
    global _INDEX
    if _INDEX is None:
        all_chunks: list[Chunk] = []
        for pdf in sorted(CORPUS_DIR.glob("*.pdf")):
            log.info("rag: chunking %s", pdf.name)
            all_chunks.extend(_chunks_from_pdf(pdf))
        log.info("rag: %d chunks across %d files",
                 len(all_chunks), len({c.file for c in all_chunks}))

        if not all_chunks:
            _INDEX = {"chunks": [], "embs": None, "model": None}
            return _INDEX

        # Heavy import deferred so merely importing this module stays cheap.
        from sentence_transformers import SentenceTransformer
        log.info("rag: loading %s", EMBED_MODEL_NAME)
        model = SentenceTransformer(EMBED_MODEL_NAME)

        texts = [c.text for c in all_chunks]
        log.info("rag: embedding %d chunks", len(texts))
        vectors = model.encode(texts, batch_size=32, show_progress_bar=False,
                               convert_to_numpy=True, normalize_embeddings=True)
        _INDEX = {"chunks": all_chunks,
                  "embs": vectors.astype("float32"),
                  "model": model}
        log.info("rag: index ready (%s)", vectors.shape)
    return _INDEX
133
+
134
+
135
def warm() -> None:
    """Eagerly build the embedding index so the first retrieve() call does
    not pay the chunking / model-load cost (intended for app startup)."""
    _ensure_index()
137
+
138
+
139
def retrieve(query: str, k: int = 4, min_score: float = 0.30) -> list[dict]:
    """Return up to *k* corpus passages most similar to *query*.

    Embeds the query with the shared model, ranks all chunks by cosine
    similarity (embeddings are L2-normalized, so a dot product suffices),
    and keeps at most one chunk per source document. Hits scoring below
    *min_score* are dropped.

    Returns a list of dicts with keys: doc_id, title, citation, file,
    page, text, score — empty when the corpus/index is empty.
    """
    idx = _ensure_index()
    if idx["embs"] is None or not idx["chunks"]:
        return []  # empty corpus: nothing to retrieve

    qv = idx["model"].encode([query], convert_to_numpy=True,
                             normalize_embeddings=True).astype("float32")
    sims = (idx["embs"] @ qv.T).ravel()  # cosine similarity via dot product

    out: list[dict] = []
    seen_docs: set[str] = set()
    # Walk the full descending ranking rather than a fixed k*3 over-fetch:
    # with the one-chunk-per-document rule, a heavy cluster of top hits in a
    # single document could otherwise starve hits from the other documents.
    # The matrix is tiny (~1k rows), so the full argsort is cheap.
    for i in np.argsort(-sims):
        score = float(sims[i])
        if score < min_score:
            break  # scores are sorted descending — everything after is lower
        c = idx["chunks"][i]
        if c.doc_id in seen_docs:
            continue  # at most one chunk per source document
        seen_docs.add(c.doc_id)
        out.append({
            "doc_id": c.doc_id,
            "title": c.title,
            "citation": c.citation,
            "file": c.file,
            "page": c.page,
            "text": c.text,
            "score": score,
        })
        if len(out) >= k:
            break
    return out
corpus/comptroller_rain_2024.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c4ad386a3eaffc2278a44013f852c2ddd1bf06e278346227e24615ee3a387fc
3
+ size 2616885
corpus/coned_22_e_0222.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d76c7f108bd0336cefa5dd6fb5064cf91adee3c9c1b91eeaa279d0dd0fcdb59
3
+ size 5045344
corpus/dep_wastewater_2013.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e57a402851521bd61494c572fded8327cd28b35118eb95f7bf7d0bd2bdba32d
3
+ size 732738
corpus/mta_resilience_2025.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d7e3e7c6634adbfc00ee80f7f5bfe1a2d9c46ce0d1321363d3b3bb3446a2582
3
+ size 8455213
corpus/nycha_lessons.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d460034b6a786fcf8b0a96aa3da983d9131bfdb521d7967b9d29c1f55e265d8c
3
+ size 14226816