Granite Embedding 278M RAG over five NYC resilience plans
Five corpus PDFs constitute the textual evidence base:
comptroller_rain_2024 — NYC Comptroller cloudburst report
coned_22_e_0222 — ConEd Sandy storm-hardening rate case
dep_wastewater_2013 — DEP post-Sandy wastewater facilities plan
mta_resilience_2025 — MTA Climate Resilience Roadmap
nycha_lessons — NYCHA Sandy lessons-learned
Granite Embedding 278M (sentence-transformers, CPU-only) chunks every
PDF on first start, caches the embeddings, and answers retrieve(query)
with cosine top-k passages + page numbers. The reconciler treats each
chunk as a doc-role message so citations resolve back to a specific
PDF and page.
.gitattributes routes *.pdf through LFS.
- .gitattributes +1 -0
- app/rag.py +168 -0
- corpus/comptroller_rain_2024.pdf +3 -0
- corpus/coned_22_e_0222.pdf +3 -0
- corpus/dep_wastewater_2013.pdf +3 -0
- corpus/mta_resilience_2025.pdf +3 -0
- corpus/nycha_lessons.pdf +3 -0
.gitattributes
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
# Riprap-specific LFS tracking
|
| 2 |
*.geojson filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.tif filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 4 |
|
| 5 |
# Esri FileGDB internal binary files (DEP Stormwater scenario data)
|
| 6 |
*.gdbtable filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
# Riprap-specific LFS tracking
|
| 2 |
*.geojson filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.tif filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 5 |
|
| 6 |
# Esri FileGDB internal binary files (DEP Stormwater scenario data)
|
| 7 |
*.gdbtable filter=lfs diff=lfs merge=lfs -text
|
app/rag.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Granite Embedding 278M RAG over the NYC flood-resilience policy corpus.
|
| 2 |
+
|
| 3 |
+
Specialists this powers:
|
| 4 |
+
step_rag — for any query (geo + intent), retrieve top-k relevant
|
| 5 |
+
policy paragraphs from HMP/NPCC4/DEP/MTA/NYCHA/Comptroller
|
| 6 |
+
and emit them as <document id="rag_*"> blocks.
|
| 7 |
+
|
| 8 |
+
We chunk page-by-page with a soft target of ~600 chars per chunk, embed
|
| 9 |
+
once at startup, and store a numpy matrix + FAISS L2 index in memory.
|
| 10 |
+
The index is small (~1k chunks across 5 PDFs).
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import logging
|
| 15 |
+
import re
|
| 16 |
+
from dataclasses import dataclass
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
|
| 21 |
+
log = logging.getLogger("riprap.rag")
|
| 22 |
+
|
| 23 |
+
CORPUS_DIR = Path(__file__).resolve().parent.parent / "corpus"
|
| 24 |
+
EMBED_MODEL_NAME = "ibm-granite/granite-embedding-278m-multilingual"
|
| 25 |
+
|
| 26 |
+
CORPUS_META = {
|
| 27 |
+
"dep_wastewater_2013.pdf": {
|
| 28 |
+
"doc_id": "rag_dep_2013",
|
| 29 |
+
"title": "NYC DEP Wastewater Resiliency Plan (2013)",
|
| 30 |
+
"citation": "NYC DEP Wastewater Resiliency Plan, 2013",
|
| 31 |
+
},
|
| 32 |
+
"nycha_lessons.pdf": {
|
| 33 |
+
"doc_id": "rag_nycha",
|
| 34 |
+
"title": "Flood Resilience at NYCHA — Lessons Learned",
|
| 35 |
+
"citation": "NYCHA, Flood Resilience: Lessons Learned",
|
| 36 |
+
},
|
| 37 |
+
"coned_22_e_0222.pdf": {
|
| 38 |
+
"doc_id": "rag_coned",
|
| 39 |
+
"title": "Con Edison Climate Change Resilience Plan (2023, Case 22-E-0222)",
|
| 40 |
+
"citation": "Con Edison Climate Change Resilience Plan (2023, NY PSC Case 22-E-0222)",
|
| 41 |
+
},
|
| 42 |
+
"mta_resilience_2025.pdf": {
|
| 43 |
+
"doc_id": "rag_mta",
|
| 44 |
+
"title": "MTA Climate Resilience Roadmap (October 2025 update)",
|
| 45 |
+
"citation": "MTA Climate Resilience Roadmap, October 2025 update",
|
| 46 |
+
},
|
| 47 |
+
"comptroller_rain_2024.pdf": {
|
| 48 |
+
"doc_id": "rag_comptroller",
|
| 49 |
+
"title": "NYC Comptroller — Is NYC Ready for Rain? (2024)",
|
| 50 |
+
"citation": "NYC Comptroller, \"Is New York City Ready for Rain?\" (2024)",
|
| 51 |
+
},
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@dataclass
class Chunk:
    """One retrievable passage: a text span tied to its source PDF page."""

    text: str      # normalized passage text (~target_chars long)
    file: str      # source PDF filename within corpus/
    page: int      # 1-based page number in the source PDF
    doc_id: str    # stable citation id, e.g. "rag_nycha"
    title: str     # human-readable document title
    citation: str  # canonical citation string emitted with the passage
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _chunks_from_pdf(path: Path, target_chars: int = 700) -> list[Chunk]:
    """Extract page text from *path* and split it into ~target_chars chunks.

    Splits at sentence boundaries; a single sentence longer than
    target_chars becomes its own chunk. Pages whose extracted text is
    shorter than 80 chars (covers, dividers, figure-only pages) are
    skipped. Returns [] if the PDF cannot be opened.
    """
    import pypdf  # lazy: only needed when (re)building the index

    meta = CORPUS_META.get(path.name, {
        "doc_id": f"rag_{path.stem}",
        "title": path.stem,
        "citation": path.stem,
    })

    def _make_chunk(text: str, page_no: int) -> Chunk:
        # Single place that stamps per-file metadata onto a chunk.
        return Chunk(text=text, file=path.name, page=page_no,
                     doc_id=meta["doc_id"], title=meta["title"],
                     citation=meta["citation"])

    out: list[Chunk] = []
    try:
        reader = pypdf.PdfReader(str(path))
    except Exception as e:
        log.warning("pdf load failed for %s: %s", path.name, e)
        return out
    for i, page in enumerate(reader.pages):
        try:
            txt = page.extract_text() or ""
        except Exception:
            # Some pages fail text extraction; treat them as empty.
            txt = ""
        txt = re.sub(r"\s+", " ", txt).strip()
        if len(txt) < 80:
            continue
        # split into ~target_chars chunks at sentence boundaries
        sentences = re.split(r"(?<=[.!?])\s+", txt)
        buf = ""
        for s in sentences:
            if len(buf) + len(s) + 1 <= target_chars or not buf:
                buf = (buf + " " + s).strip() if buf else s
            else:
                out.append(_make_chunk(buf, i + 1))
                buf = s
        if buf:
            out.append(_make_chunk(buf, i + 1))
    return out
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# Process-wide lazy singleton: {"chunks": list[Chunk],
# "embs": float32 ndarray | None, "model": SentenceTransformer | None}.
_INDEX: dict | None = None


def _ensure_index() -> dict:
    """Build (once) and return the in-memory embedding index.

    Chunks every *.pdf in CORPUS_DIR, embeds all chunks with the Granite
    model (L2-normalized, float32), and caches everything in _INDEX so
    subsequent calls are free. With an empty corpus the index degrades
    to {"chunks": [], "embs": None, "model": None}.
    """
    global _INDEX
    if _INDEX is not None:
        return _INDEX

    chunks: list[Chunk] = []
    for f in sorted(CORPUS_DIR.glob("*.pdf")):
        log.info("rag: chunking %s", f.name)
        chunks.extend(_chunks_from_pdf(f))
    log.info("rag: %d chunks across %d files",
             len(chunks), len({c.file for c in chunks}))
    if not chunks:
        _INDEX = {"chunks": [], "embs": None, "model": None}
        return _INDEX

    # Heavy import deferred so merely importing this module stays cheap.
    from sentence_transformers import SentenceTransformer
    log.info("rag: loading %s", EMBED_MODEL_NAME)
    model = SentenceTransformer(EMBED_MODEL_NAME)

    texts = [c.text for c in chunks]
    log.info("rag: embedding %d chunks", len(texts))
    embs = model.encode(texts, batch_size=32, show_progress_bar=False,
                        convert_to_numpy=True, normalize_embeddings=True)
    _INDEX = {"chunks": chunks, "embs": embs.astype("float32"), "model": model}
    log.info("rag: index ready (%s)", embs.shape)
    return _INDEX
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def warm():
    """Eagerly build the embedding index (call at app startup)."""
    _ensure_index()
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def retrieve(query: str, k: int = 4, min_score: float = 0.30) -> list[dict]:
    """Return up to *k* passages most similar to *query*.

    Each hit is a dict with doc_id/title/citation/file/page/text/score.
    At most one chunk per source document is returned so a single PDF
    cannot crowd out the others; hits below *min_score* cosine
    similarity are dropped. Returns [] when the corpus is empty.
    """
    idx = _ensure_index()
    if idx["embs"] is None or not idx["chunks"]:
        return []
    qv = idx["model"].encode([query], convert_to_numpy=True,
                             normalize_embeddings=True).astype("float32")
    # cosine similarity (vectors are L2-normalized)
    sims = (idx["embs"] @ qv.T).ravel()
    top = np.argsort(-sims)[:k * 3]  # over-fetch then de-dupe per doc
    out: list[dict] = []
    seen_docs: set[str] = set()
    for i in top:
        if sims[i] < min_score:
            # Candidates are sorted by descending score, so everything
            # after the first sub-threshold hit is sub-threshold too.
            break
        c = idx["chunks"][i]
        if c.doc_id in seen_docs:  # at most 1 chunk per doc
            continue
        seen_docs.add(c.doc_id)
        out.append({
            "doc_id": c.doc_id,
            "title": c.title,
            "citation": c.citation,
            "file": c.file,
            "page": c.page,
            "text": c.text,
            "score": float(sims[i]),
        })
        if len(out) >= k:
            break
    return out
|
corpus/comptroller_rain_2024.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c4ad386a3eaffc2278a44013f852c2ddd1bf06e278346227e24615ee3a387fc
|
| 3 |
+
size 2616885
|
corpus/coned_22_e_0222.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d76c7f108bd0336cefa5dd6fb5064cf91adee3c9c1b91eeaa279d0dd0fcdb59
|
| 3 |
+
size 5045344
|
corpus/dep_wastewater_2013.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e57a402851521bd61494c572fded8327cd28b35118eb95f7bf7d0bd2bdba32d
|
| 3 |
+
size 732738
|
corpus/mta_resilience_2025.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0d7e3e7c6634adbfc00ee80f7f5bfe1a2d9c46ce0d1321363d3b3bb3446a2582
|
| 3 |
+
size 8455213
|
corpus/nycha_lessons.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d460034b6a786fcf8b0a96aa3da983d9131bfdb521d7967b9d29c1f55e265d8c
|
| 3 |
+
size 14226816
|