Add output-quality features: highlighting, hedging, linking, currency
Retrieval and presentation improvements for better-grounded answers:
- Legal-abbreviation query expansion (new canlex/synonyms.py): PRRA,
H&C, RAD, CBSA and similar shorthand expand to statutory wording
before BM25 and semantic retrieval.
- Diversity cap: no single case or memo may take more than two result
slots, so one document cannot monopolise a topical query.
- Legislation guarantee: when a result set is dominated by case law,
the governing statute is pulled into it.
- Pinpoint highlighting: each long result flags the subsection or
paragraph most on point (e.g. s. 34(1)(c)), scored by the
cross-encoder.
- Low-confidence hedging: when the top semantic match is weak, the
search tool warns that the corpus may not address the question.
- Currency signalling: every legislation result carries a prominent
Currency line; the grounding note demands a dated answer.
- Cross-reference linking: a result also surfaces the regulations made
under its Act (and a regulation its enabling Act) and the CBSA
D-memoranda that cite the provision.
Eval: Hit@3 0.74 -> 0.77, Hit@5 holds at 0.89, no regression.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- canlex/index.py +193 -17
- canlex/server.py +47 -11
- canlex/synonyms.py +62 -0
|
@@ -8,12 +8,14 @@ from collections import Counter, defaultdict
|
|
| 8 |
import snowballstemmer
|
| 9 |
|
| 10 |
from .config import PROCESSED_DIR
|
|
|
|
| 11 |
|
| 12 |
K1 = 1.5
|
| 13 |
B = 0.75
|
| 14 |
RRF_K = 60 # reciprocal-rank-fusion damping constant
|
| 15 |
CANDIDATES = 80 # hits each retriever contributes to the fusion
|
| 16 |
RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
|
|
|
|
| 17 |
|
| 18 |
_TOKEN = re.compile(r"[a-z0-9]+")
|
| 19 |
_SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
|
|
@@ -23,6 +25,16 @@ _XREF = re.compile(
|
|
| 23 |
r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)",
|
| 24 |
re.IGNORECASE)
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
_STEMMER = snowballstemmer.stemmer("english")
|
| 28 |
_STEM_CACHE = {}
|
|
@@ -49,6 +61,37 @@ def _section_refs(query):
|
|
| 49 |
return set(_SECTION_REF.findall(query.lower()))
|
| 50 |
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
class LegislationIndex:
|
| 53 |
def __init__(self):
|
| 54 |
self.chunks = []
|
|
@@ -148,20 +191,104 @@ class LegislationIndex:
|
|
| 148 |
qv = self.embedder.encode_query(query)
|
| 149 |
sims = self.vectors @ qv
|
| 150 |
order = self._np.argsort(sims)[::-1][:CANDIDATES]
|
| 151 |
-
|
|
|
|
|
|
|
| 152 |
|
| 153 |
def _rerank_doc(self, idx):
|
| 154 |
c = self.chunks[idx]
|
| 155 |
return f"{c['citation']} — {c['marginal_note']}\n{c['text']}"
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
def search(self, query, top_k=6, act=None, doc_type=None):
|
| 158 |
"""Hybrid candidate fusion (BM25 + semantic), then cross-encoder rerank."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
fused = defaultdict(float)
|
| 160 |
-
bm25 = self._bm25_scores(
|
| 161 |
for rank, idx in enumerate(sorted(bm25, key=bm25.get, reverse=True)[:CANDIDATES]):
|
| 162 |
fused[idx] += 1.0 / (RRF_K + rank)
|
| 163 |
if self.semantic:
|
| 164 |
-
|
|
|
|
| 165 |
fused[idx] += 1.0 / (RRF_K + rank)
|
| 166 |
|
| 167 |
# Ensure explicitly-referenced sections are retrieved even if recall missed them.
|
|
@@ -208,7 +335,20 @@ class LegislationIndex:
|
|
| 208 |
pinned_set = set(pinned)
|
| 209 |
candidates = pinned + [i for i in candidates if i not in pinned_set]
|
| 210 |
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
def get_section(self, act, section):
|
| 214 |
act = act.lower()
|
|
@@ -218,10 +358,12 @@ class LegislationIndex:
|
|
| 218 |
return None
|
| 219 |
|
| 220 |
def _build_xref(self):
|
| 221 |
-
"""Index legislation by (act, section)
|
| 222 |
-
section
|
|
|
|
| 223 |
self._by_section = {}
|
| 224 |
self._defs_section = {}
|
|
|
|
| 225 |
for c in self.chunks:
|
| 226 |
if c.get("doc_type", "legislation") != "legislation":
|
| 227 |
continue
|
|
@@ -230,20 +372,48 @@ class LegislationIndex:
|
|
| 230 |
c["marginal_note"].strip().lower() in (
|
| 231 |
"definitions", "definition", "interpretation")):
|
| 232 |
self._defs_section[c["act_code"]] = c
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
def related(self, chunk):
|
| 235 |
-
"""
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
"""
|
| 240 |
if chunk.get("doc_type", "legislation") != "legislation":
|
| 241 |
-
return
|
| 242 |
act = chunk["act_code"]
|
| 243 |
-
|
| 244 |
defs = self._defs_section.get(act)
|
| 245 |
if defs and defs["section"] not in seen:
|
| 246 |
-
|
| 247 |
seen.add(defs["section"])
|
| 248 |
for match in _XREF.finditer(chunk["text"]):
|
| 249 |
sec = match.group(1)
|
|
@@ -251,11 +421,17 @@ class LegislationIndex:
|
|
| 251 |
continue
|
| 252 |
target = self._by_section.get((act, sec))
|
| 253 |
if target:
|
| 254 |
-
|
| 255 |
seen.add(sec)
|
| 256 |
-
if len(
|
| 257 |
break
|
| 258 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
|
| 261 |
def main():
|
|
|
|
| 8 |
import snowballstemmer
|
| 9 |
|
| 10 |
from .config import PROCESSED_DIR
|
| 11 |
+
from .synonyms import expand_query
|
| 12 |
|
| 13 |
K1 = 1.5
|
| 14 |
B = 0.75
|
| 15 |
RRF_K = 60 # reciprocal-rank-fusion damping constant
|
| 16 |
CANDIDATES = 80 # hits each retriever contributes to the fusion
|
| 17 |
RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
|
| 18 |
+
SOURCE_CAP = 2 # max chunks one case/memo/agreement/directive may contribute
|
| 19 |
|
| 20 |
_TOKEN = re.compile(r"[a-z0-9]+")
|
| 21 |
_SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
|
|
|
|
| 25 |
r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)",
|
| 26 |
re.IGNORECASE)
|
| 27 |
|
| 28 |
+
# A line opening with "(1)", "(a)" or "(b.1)" -- a citable subdivision
|
| 29 |
+
# (subsection, paragraph or subparagraph) of a provision.
|
| 30 |
+
_MARKER = re.compile(r"(?m)^\(([0-9a-zA-Z]+(?:\.\d+)?)\)")
|
| 31 |
+
|
| 32 |
+
# A D-memorandum's reference to a provision -- "section 32 of the Customs Act",
|
| 33 |
+
# or "section 32 of the Act" (the Act a D-memo administers -- the Customs Act).
|
| 34 |
+
_MEMO_CITE = re.compile(
|
| 35 |
+
r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
|
| 36 |
+
r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)
|
| 37 |
+
|
| 38 |
|
| 39 |
_STEMMER = snowballstemmer.stemmer("english")
|
| 40 |
_STEM_CACHE = {}
|
|
|
|
| 61 |
return set(_SECTION_REF.findall(query.lower()))
|
| 62 |
|
| 63 |
|
| 64 |
+
def _provision_units(text):
    """Split a provision into its citable parts for pinpoint scoring.

    Returns a list of ``(citation_suffix, scoring_text, snippet)`` tuples:

    * ``citation_suffix`` -- the pinpoint to append to the section citation,
      e.g. ``"(1)"``, ``"(1)(a)"`` or ``"(1)(a)(ii)"``.
    * ``scoring_text`` -- the unit's own text with the text of each enclosing
      level (subsection chapeau, then paragraph, then subparagraph) prepended
      for context.
    * ``snippet`` -- the unit's own text only.

    One entry is produced per paragraph, subparagraph and clause, plus one per
    subsection that has no paragraphs. A subsection that does have paragraphs
    is not emitted on its own; its chapeau reaches the scorer through its
    paragraphs. Repealed units are skipped. Returns ``[]`` when the provision
    is too flat to pinpoint (fewer than two markers).

    Nesting is inferred from the marker tokens alone: digits open a
    subsection, lowercase letters a paragraph, lowercase roman numerals a
    subparagraph of the current paragraph, and uppercase tokens a clause of
    the deepest open level. Subclauses ("(I)") are not distinguished from
    clauses.
    """
    marks = list(_MARKER.finditer(text))
    if len(marks) < 2:
        return []
    spans = []
    for i, m in enumerate(marks):
        end = marks[i + 1].start() if i + 1 < len(marks) else len(text)
        spans.append((m.group(1), text[m.start():end].strip()))

    units = []
    sub_label = sub_intro = ""                       # current subsection
    para_label = para_intro = para_base = ""         # current paragraph
    subpara_label = subpara_intro = last_roman = ""  # current subparagraph
    for j, (token, body) in enumerate(spans):
        nxt = spans[j + 1][0] if j + 1 < len(spans) else ""
        repealed = "[Repealed" in body[:40]
        # A repealed unit still occupies its place in the hierarchy (so later
        # markers nest correctly) but contributes no context text.
        intro = "" if repealed else body

        if token[0].isdigit():
            sub_label, sub_intro = f"({token})", intro
            para_label = para_intro = para_base = ""
            subpara_label = subpara_intro = last_roman = ""
            if not repealed and (not nxt or nxt[0].isdigit()):
                units.append((sub_label, body, body))  # no paragraphs follow
            # otherwise the chapeau is emitted via its paragraphs below
            continue

        if token[0].isupper():
            # Clause: hangs off the deepest level currently open.
            parent = subpara_label or para_label or sub_label
            label = f"{parent}({token})"
            context = (sub_intro, para_intro, subpara_intro)
        elif para_label and _is_subparagraph(
                token, para_base, last_roman, nxt.split(".")[0]):
            last_roman = token.split(".")[0]
            subpara_label, subpara_intro = f"{para_label}({token})", intro
            label = subpara_label
            context = (sub_intro, para_intro)
        else:
            para_base = token.split(".")[0]
            para_label, para_intro = f"{sub_label}({token})", intro
            subpara_label = subpara_intro = last_roman = ""
            label = para_label
            context = (sub_intro,)

        if repealed:
            continue
        scoring = " ".join(part for part in (*context, body) if part)
        units.append((label, scoring, body))
    return units


# Lowercase roman numerals in order, used to recognise subparagraph runs.
_ROMAN = ("i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x",
          "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx")


def _is_subparagraph(token, para_base, last_roman, nxt_base):
    """Decide whether a lowercase marker is a subparagraph, not a paragraph.

    ``token`` is the marker under test, ``para_base`` the letter of the
    paragraph currently open, ``last_roman`` the numeral of the previous
    subparagraph in that paragraph ("" if none) and ``nxt_base`` the next
    marker's token without any ".N" suffix ("" at the end). The letters i, v
    and x are ambiguous between a paragraph letter and a roman numeral, so the
    neighbouring markers are used to tell them apart.
    """
    base = token.split(".")[0]
    if base not in _ROMAN:
        return False
    if token != base:
        # An inserted item such as "(i.1)" sits at the level of the item it
        # follows: a subparagraph only if "(i)" was the last subparagraph.
        return base == last_roman
    follows_para = (len(base) == 1 and len(para_base) == 1
                    and ord(base) == ord(para_base) + 1)
    if follows_para and len(nxt_base) == 1 and ord(nxt_base) == ord(base) + 1:
        return False  # (h) (i) (j): plainly the next lettered paragraph
    if last_roman and _ROMAN.index(base) == _ROMAN.index(last_roman) + 1:
        return True   # continues the current run: (iv) then (v)
    if len(base) > 1:
        return True   # "ii", "iv", ... are never paragraph letters
    if base != "i":
        return False  # a lone "v" or "x" outside a run is a paragraph letter
    # A run opens at "(i)"; after paragraph (h) it does so only if "(ii)" follows.
    return nxt_base == "ii" or not follows_para
|
| 93 |
+
|
| 94 |
+
|
| 95 |
class LegislationIndex:
|
| 96 |
def __init__(self):
|
| 97 |
self.chunks = []
|
|
|
|
| 191 |
qv = self.embedder.encode_query(query)
|
| 192 |
sims = self.vectors @ qv
|
| 193 |
order = self._np.argsort(sims)[::-1][:CANDIDATES]
|
| 194 |
+
# The top cosine similarity doubles as a corpus-coverage signal: a query
|
| 195 |
+
# the corpus cannot answer has no passage close to it.
|
| 196 |
+
return [int(i) for i in order], float(sims.max())
|
| 197 |
|
| 198 |
def _rerank_doc(self, idx):
|
| 199 |
c = self.chunks[idx]
|
| 200 |
return f"{c['citation']} — {c['marginal_note']}\n{c['text']}"
|
| 201 |
|
| 202 |
+
def _source_key(self, idx):
|
| 203 |
+
"""The parent document a chunk belongs to, for diversity capping. Returns
|
| 204 |
+
None for legislation -- each section is a distinct provision and is never
|
| 205 |
+
capped; case law is keyed by citation, memoranda by memo number."""
|
| 206 |
+
c = self.chunks[idx]
|
| 207 |
+
doc_type = c.get("doc_type", "legislation")
|
| 208 |
+
if doc_type == "legislation":
|
| 209 |
+
return None
|
| 210 |
+
if doc_type == "memorandum":
|
| 211 |
+
return ("memorandum", c["section"]) # act_code is a shared constant
|
| 212 |
+
return (doc_type, c["act_code"]) # caselaw / agreement / directive
|
| 213 |
+
|
| 214 |
+
def _diversify(self, ordered):
|
| 215 |
+
"""Reorder so no single case, memorandum, agreement or directive can
|
| 216 |
+
monopolise the results: once a source has contributed SOURCE_CAP chunks,
|
| 217 |
+
its remaining chunks are deferred below every other candidate. This stops
|
| 218 |
+
a heavily paragraph-chunked decision from crowding out the statute it
|
| 219 |
+
interprets. Legislation is never capped."""
|
| 220 |
+
kept, deferred, counts = [], [], defaultdict(int)
|
| 221 |
+
for idx in ordered:
|
| 222 |
+
key = self._source_key(idx)
|
| 223 |
+
if key is None:
|
| 224 |
+
kept.append(idx)
|
| 225 |
+
continue
|
| 226 |
+
counts[key] += 1
|
| 227 |
+
(kept if counts[key] <= SOURCE_CAP else deferred).append(idx)
|
| 228 |
+
return kept + deferred
|
| 229 |
+
|
| 230 |
+
def _ensure_legislation(self, ordered, top_k):
|
| 231 |
+
"""Guarantee the governing statute is surfaced: when the natural top_k is
|
| 232 |
+
monopolised by case law or memoranda, pull the best legislation results
|
| 233 |
+
up to just below the top hit, displacing the lowest-ranked secondary
|
| 234 |
+
sources. The single best match is always kept in place."""
|
| 235 |
+
if top_k < 3:
|
| 236 |
+
return ordered
|
| 237 |
+
def is_leg(i):
|
| 238 |
+
return self.chunks[i].get("doc_type", "legislation") == "legislation"
|
| 239 |
+
top, rest = ordered[:top_k], ordered[top_k:]
|
| 240 |
+
need = 2 - sum(1 for i in top if is_leg(i))
|
| 241 |
+
if need <= 0:
|
| 242 |
+
return ordered
|
| 243 |
+
promote = [i for i in rest if is_leg(i)][:need]
|
| 244 |
+
drop = [i for i in reversed(top) if not is_leg(i)][:len(promote)]
|
| 245 |
+
if not drop:
|
| 246 |
+
return ordered
|
| 247 |
+
promote = promote[:len(drop)]
|
| 248 |
+
dropped, promoted = set(drop), set(promote)
|
| 249 |
+
kept = [i for i in top if i not in dropped]
|
| 250 |
+
return kept[:1] + promote + kept[1:] + drop + [
|
| 251 |
+
i for i in rest if i not in promoted]
|
| 252 |
+
|
| 253 |
+
def _highlight(self, query, indices):
|
| 254 |
+
"""For each result chunk, the subsection or paragraph most on point for
|
| 255 |
+
the query: {result_position: (citation_suffix, snippet)}. Uses the
|
| 256 |
+
cross-encoder; returns {} if it is unavailable or nothing is structured.
|
| 257 |
+
Only the first results are scored -- a pinpoint deep in the list is not
|
| 258 |
+
worth the cross-encoder cost."""
|
| 259 |
+
if not self.reranker:
|
| 260 |
+
return {}
|
| 261 |
+
jobs = [] # (result_position, label, scoring_text, snippet)
|
| 262 |
+
for pos, idx in enumerate(indices[:8]):
|
| 263 |
+
c = self.chunks[idx]
|
| 264 |
+
if c.get("doc_type", "legislation") != "legislation":
|
| 265 |
+
continue
|
| 266 |
+
note = c["marginal_note"]
|
| 267 |
+
for label, scoring, snippet in _provision_units(c["text"]):
|
| 268 |
+
jobs.append((pos, label, f"{note}. {scoring}", snippet))
|
| 269 |
+
if not jobs:
|
| 270 |
+
return {}
|
| 271 |
+
best = {} # result_position -> (score, label, snippet)
|
| 272 |
+
for (pos, label, _, snippet), score in zip(
|
| 273 |
+
jobs, self.reranker.score(query, [j[2] for j in jobs])):
|
| 274 |
+
if pos not in best or score > best[pos][0]:
|
| 275 |
+
best[pos] = (score, label, snippet)
|
| 276 |
+
return {pos: (label, " ".join(snippet[:240].split()))
|
| 277 |
+
for pos, (score, label, snippet) in best.items()}
|
| 278 |
+
|
| 279 |
def search(self, query, top_k=6, act=None, doc_type=None):
|
| 280 |
"""Hybrid candidate fusion (BM25 + semantic), then cross-encoder rerank."""
|
| 281 |
+
# Expand legal abbreviations (PRRA, H&C, ...) into statutory wording for
|
| 282 |
+
# the recall stages; the reranker still sees the user's original query.
|
| 283 |
+
expanded = expand_query(query)
|
| 284 |
+
confidence = None
|
| 285 |
fused = defaultdict(float)
|
| 286 |
+
bm25 = self._bm25_scores(expanded)
|
| 287 |
for rank, idx in enumerate(sorted(bm25, key=bm25.get, reverse=True)[:CANDIDATES]):
|
| 288 |
fused[idx] += 1.0 / (RRF_K + rank)
|
| 289 |
if self.semantic:
|
| 290 |
+
sem_order, confidence = self._semantic_ranking(expanded)
|
| 291 |
+
for rank, idx in enumerate(sem_order):
|
| 292 |
fused[idx] += 1.0 / (RRF_K + rank)
|
| 293 |
|
| 294 |
# Ensure explicitly-referenced sections are retrieved even if recall missed them.
|
|
|
|
| 335 |
pinned_set = set(pinned)
|
| 336 |
candidates = pinned + [i for i in candidates if i not in pinned_set]
|
| 337 |
|
| 338 |
+
# Cap one-source monopolies, then guarantee the statute is represented.
|
| 339 |
+
candidates = self._diversify(candidates)
|
| 340 |
+
candidates = self._ensure_legislation(candidates, top_k)
|
| 341 |
+
|
| 342 |
+
top = candidates[:top_k]
|
| 343 |
+
highlights = self._highlight(query, top)
|
| 344 |
+
results = []
|
| 345 |
+
for pos, i in enumerate(top):
|
| 346 |
+
result = {**self.chunks[i], "score": round(scores[i], 4),
|
| 347 |
+
"confidence": confidence}
|
| 348 |
+
if pos in highlights:
|
| 349 |
+
result["highlight"] = highlights[pos]
|
| 350 |
+
results.append(result)
|
| 351 |
+
return results
|
| 352 |
|
| 353 |
def get_section(self, act, section):
|
| 354 |
act = act.lower()
|
|
|
|
| 358 |
return None
|
| 359 |
|
| 360 |
def _build_xref(self):
|
| 361 |
+
"""Index legislation by (act, section); find each Act's definitions
|
| 362 |
+
section; link every regulation to its enabling Act and every
|
| 363 |
+
D-memorandum to the provisions it cites -- all for cross-referencing."""
|
| 364 |
self._by_section = {}
|
| 365 |
self._defs_section = {}
|
| 366 |
+
acts, regs = {}, {} # act_code -> (act_short, act_name)
|
| 367 |
for c in self.chunks:
|
| 368 |
if c.get("doc_type", "legislation") != "legislation":
|
| 369 |
continue
|
|
|
|
| 372 |
c["marginal_note"].strip().lower() in (
|
| 373 |
"definitions", "definition", "interpretation")):
|
| 374 |
self._defs_section[c["act_code"]] = c
|
| 375 |
+
bucket = regs if c["act_code"].startswith(("SOR", "C.R.C")) else acts
|
| 376 |
+
bucket.setdefault(c["act_code"], (c["act_short"], c["act_name"]))
|
| 377 |
+
|
| 378 |
+
# Link a regulation to the Act it is made under by matching their names
|
| 379 |
+
# ("X Regulations" <-> "X Act").
|
| 380 |
+
self._enabling_act = {} # reg code -> (act_short, act_name)
|
| 381 |
+
self._regulations = defaultdict(list) # act code -> [(reg_short, reg_name)]
|
| 382 |
+
def base(name):
|
| 383 |
+
return re.sub(r"\b(?:Act|Regulations)\b", "", name).strip().lower()
|
| 384 |
+
act_by_base = {base(n): (code, s, n) for code, (s, n) in acts.items()}
|
| 385 |
+
for rcode, (rshort, rname) in regs.items():
|
| 386 |
+
hit = act_by_base.get(base(rname))
|
| 387 |
+
if hit:
|
| 388 |
+
self._enabling_act[rcode] = (hit[1], hit[2])
|
| 389 |
+
self._regulations[hit[0]].append((rshort, rname))
|
| 390 |
+
|
| 391 |
+
# Link D-memoranda to the Customs Act / Customs Tariff provisions they
|
| 392 |
+
# cite; an unqualified "the Act" in a D-memo means the Customs Act.
|
| 393 |
+
by_short = {s.lower(): code for code, (s, n) in acts.items()}
|
| 394 |
+
customs, tariff = by_short.get("customs act"), by_short.get("customs tariff")
|
| 395 |
+
self._memos_for_section = defaultdict(set) # (act_code, section) -> memos
|
| 396 |
+
for c in self.chunks:
|
| 397 |
+
if c.get("doc_type") != "memorandum":
|
| 398 |
+
continue
|
| 399 |
+
for num, which in _MEMO_CITE.findall(c["text"]):
|
| 400 |
+
code = tariff if which.lower() == "customs tariff" else customs
|
| 401 |
+
if code:
|
| 402 |
+
self._memos_for_section[(code, num)].add(c["section"])
|
| 403 |
|
| 404 |
def related(self, chunk):
|
| 405 |
+
"""Cross-references for a legislation result, as a dict: 'provisions'
|
| 406 |
+
(intra-Act sections it cites, plus the definitions section),
|
| 407 |
+
'regulations' (made under this Act), 'enabling_act' (for a regulation,
|
| 408 |
+
the Act it is made under) and 'memoranda' (D-memo numbers citing this
|
| 409 |
+
section). Empty dict for case law, memoranda, etc."""
|
| 410 |
if chunk.get("doc_type", "legislation") != "legislation":
|
| 411 |
+
return {}
|
| 412 |
act = chunk["act_code"]
|
| 413 |
+
provisions, seen = [], {chunk["section"]}
|
| 414 |
defs = self._defs_section.get(act)
|
| 415 |
if defs and defs["section"] not in seen:
|
| 416 |
+
provisions.append((defs["section"], defs["marginal_note"]))
|
| 417 |
seen.add(defs["section"])
|
| 418 |
for match in _XREF.finditer(chunk["text"]):
|
| 419 |
sec = match.group(1)
|
|
|
|
| 421 |
continue
|
| 422 |
target = self._by_section.get((act, sec))
|
| 423 |
if target:
|
| 424 |
+
provisions.append((sec, target["marginal_note"]))
|
| 425 |
seen.add(sec)
|
| 426 |
+
if len(provisions) >= 8:
|
| 427 |
break
|
| 428 |
+
return {
|
| 429 |
+
"provisions": provisions,
|
| 430 |
+
"regulations": self._regulations.get(act, []),
|
| 431 |
+
"enabling_act": self._enabling_act.get(act),
|
| 432 |
+
"memoranda": sorted(self._memos_for_section.get(
|
| 433 |
+
(act, chunk["section"]), []))[:6],
|
| 434 |
+
}
|
| 435 |
|
| 436 |
|
| 437 |
def main():
|
|
@@ -31,9 +31,10 @@ _READONLY = {
|
|
| 31 |
GROUNDING_NOTE = (
|
| 32 |
"ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
|
| 33 |
"specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
|
| 34 |
-
"When a result lists
|
| 35 |
-
"-- the definitions section, an exception, a
|
| 36 |
-
"
|
|
|
|
| 37 |
"Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
|
| 38 |
"CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
|
| 39 |
"and a court may disagree with them; collective agreements and the National "
|
|
@@ -43,13 +44,24 @@ GROUNDING_NOTE = (
|
|
| 43 |
"name the deciding court and the date, and do not assume a decision is still "
|
| 44 |
"good law if it may have been overtaken (the canlex_case tool checks a "
|
| 45 |
"decision's later treatment on CanLII -- give it the neutral citation). "
|
| 46 |
-
"
|
| 47 |
-
"
|
|
|
|
| 48 |
"below does not fully resolve the question -- including where it turns on case "
|
| 49 |
"law or facts not present here -- say so explicitly. This is legal information, "
|
| 50 |
"not legal advice."
|
| 51 |
)
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
_INDEX: Optional[LegislationIndex] = None
|
| 54 |
|
| 55 |
|
|
@@ -99,16 +111,36 @@ def _format_section(c: dict, related=None) -> str:
|
|
| 99 |
if c["heading"]:
|
| 100 |
lines.append(f"Subject: {c['heading']}")
|
| 101 |
else:
|
| 102 |
-
meta = [f"current to {c['current_to'] or 'n/a'}"]
|
| 103 |
if c["last_amended"]:
|
| 104 |
meta.append(f"last amended {c['last_amended']}")
|
| 105 |
-
lines.append(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
lines.append("")
|
| 107 |
lines.append(c["text"])
|
| 108 |
lines.append("")
|
| 109 |
if related:
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
if c["history"]:
|
| 113 |
if doc_type == "caselaw":
|
| 114 |
lines.append(f"Also reported: {c['history']}")
|
|
@@ -205,8 +237,12 @@ def canlex_search_legislation(params: SearchInput) -> str:
|
|
| 205 |
return (f"No results matched '{params.query}'{scope}. "
|
| 206 |
f"Try broader or different keywords, or call canlex_list_acts to see "
|
| 207 |
f"what is currently loaded.")
|
| 208 |
-
blocks = [
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
for c in results:
|
| 211 |
blocks.append("")
|
| 212 |
blocks.append("---")
|
|
|
|
| 31 |
GROUNDING_NOTE = (
|
| 32 |
"ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
|
| 33 |
"specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
|
| 34 |
+
"When a result lists related provisions, regulations or D-memoranda, fetch "
|
| 35 |
+
"any that bear on the question -- the definitions section, an exception, a "
|
| 36 |
+
"cross-referenced rule, the regulation that adds detail -- with "
|
| 37 |
+
"canlex_get_section or canlex_search_legislation before answering. "
|
| 38 |
"Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
|
| 39 |
"CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
|
| 40 |
"and a court may disagree with them; collective agreements and the National "
|
|
|
|
| 44 |
"name the deciding court and the date, and do not assume a decision is still "
|
| 45 |
"good law if it may have been overtaken (the canlex_case tool checks a "
|
| 46 |
"decision's later treatment on CanLII -- give it the neutral citation). "
|
| 47 |
+
"Always state the date the source is current to, and that the answer "
|
| 48 |
+
"reflects the law only as of that date -- for a time-sensitive matter, tell "
|
| 49 |
+
"the reader to verify no amendment has come into force since. If the material "
|
| 50 |
"below does not fully resolve the question -- including where it turns on case "
|
| 51 |
"law or facts not present here -- say so explicitly. This is legal information, "
|
| 52 |
"not legal advice."
|
| 53 |
)
|
| 54 |
|
| 55 |
+
# Floor for the best semantic similarity of a search: when a result set's
# confidence is below this, the search output is prefixed with WEAK_MATCH_NOTE.
HEDGE_THRESHOLD = 0.72

# Caution placed ahead of the grounding note on a low-confidence search, telling
# the reader not to stretch a weak match into an answer.
WEAK_MATCH_NOTE = (
    "RETRIEVAL CAUTION: the material below is only a weak match for this "
    "query — CanLex may not contain a provision or decision that directly "
    "answers it. Read it critically; if it does not actually address the "
    "question, say so plainly rather than stretching it to fit, and consider "
    "canlex_list_acts to check what the corpus covers."
)
|
| 64 |
+
|
| 65 |
_INDEX: Optional[LegislationIndex] = None
|
| 66 |
|
| 67 |
|
|
|
|
| 111 |
if c["heading"]:
|
| 112 |
lines.append(f"Subject: {c['heading']}")
|
| 113 |
else:
|
| 114 |
+
meta = [f"in force; text current to {c['current_to'] or 'n/a'}"]
|
| 115 |
if c["last_amended"]:
|
| 116 |
meta.append(f"last amended {c['last_amended']}")
|
| 117 |
+
lines.append(f"**Currency:** {'; '.join(meta)}. Does not reflect any "
|
| 118 |
+
f"amendment that came into force after the 'current to' date.")
|
| 119 |
+
hl = c.get("highlight")
|
| 120 |
+
if hl:
|
| 121 |
+
label, snippet = hl
|
| 122 |
+
lines.append(f"**Most on point for this query:** "
|
| 123 |
+
f"{c['citation']}{label} — {snippet}")
|
| 124 |
lines.append("")
|
| 125 |
lines.append(c["text"])
|
| 126 |
lines.append("")
|
| 127 |
if related:
|
| 128 |
+
provisions = related.get("provisions")
|
| 129 |
+
if provisions:
|
| 130 |
+
refs = "; ".join(f"s. {s} ({n})" if n else f"s. {s}"
|
| 131 |
+
for s, n in provisions)
|
| 132 |
+
lines.append(f"Related provisions in this Act: {refs}")
|
| 133 |
+
regs = related.get("regulations")
|
| 134 |
+
if regs:
|
| 135 |
+
lines.append("Regulations made under this Act: "
|
| 136 |
+
+ "; ".join(f"{n} ({s})" for s, n in regs))
|
| 137 |
+
enabling = related.get("enabling_act")
|
| 138 |
+
if enabling:
|
| 139 |
+
lines.append(f"Made under: {enabling[1]} ({enabling[0]})")
|
| 140 |
+
memos = related.get("memoranda")
|
| 141 |
+
if memos:
|
| 142 |
+
lines.append("CBSA D-memoranda citing this section (guidance, not "
|
| 143 |
+
"binding): " + ", ".join(memos))
|
| 144 |
if c["history"]:
|
| 145 |
if doc_type == "caselaw":
|
| 146 |
lines.append(f"Also reported: {c['history']}")
|
|
|
|
| 237 |
return (f"No results matched '{params.query}'{scope}. "
|
| 238 |
f"Try broader or different keywords, or call canlex_list_acts to see "
|
| 239 |
f"what is currently loaded.")
|
| 240 |
+
blocks = []
|
| 241 |
+
weak = results[0].get("confidence")
|
| 242 |
+
if weak is not None and weak < HEDGE_THRESHOLD:
|
| 243 |
+
blocks += [WEAK_MATCH_NOTE, ""]
|
| 244 |
+
blocks += [GROUNDING_NOTE, "",
|
| 245 |
+
f'{len(results)} relevant section(s) for: "{params.query}"']
|
| 246 |
for c in results:
|
| 247 |
blocks.append("")
|
| 248 |
blocks.append("---")
|
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Query-side expansion of legal abbreviations and informal terms.
|
| 2 |
+
|
| 3 |
+
Statutes use formal wording -- "application for protection", "removal order" --
|
| 4 |
+
but users (and a model drafting a search) reach for everyday shorthand: "PRRA",
|
| 5 |
+
"H&C", "deportation". Before retrieval, expand_query() appends the canonical
|
| 6 |
+
statutory terms for any abbreviation or nickname it recognises, so the BM25 and
|
| 7 |
+
semantic stages can match the provision's actual language. It only ever ADDS
|
| 8 |
+
words -- the user's own phrasing is left untouched -- and the cross-encoder
|
| 9 |
+
reranker still sees the original query, so precision is unaffected.
|
| 10 |
+
|
| 11 |
+
python -m canlex.synonyms "PRRA eligibility and an H&C application"
|
| 12 |
+
"""
|
| 13 |
+
import re
|
| 14 |
+
import sys
|
| 15 |
+
|
| 16 |
+
# (trigger, canonical terms to append). The trigger is a regex fragment matched
|
| 17 |
+
# case-insensitively as a whole word. Keep this list high-precision: an entry
|
| 18 |
+
# earns its place only when the shorthand is unambiguous in Canadian border,
|
| 19 |
+
# immigration, customs, financial-crime or labour law.
|
| 20 |
+
_SYNONYMS = [
    # Immigration and refugee law
    (r"prra", "pre-removal risk assessment application for protection"),
    (r"pre[- ]removal risk assessment", "application for protection"),
    (r"h\s*&\s*c", "humanitarian and compassionate"),
    (r"rad", "refugee appeal division"),
    (r"rpd", "refugee protection division"),
    (r"iad", "immigration appeal division"),
    (r"irb", "immigration and refugee board"),
    (r"trp", "temporary resident permit"),
    (r"deportation", "removal order"),
    (r"misrep", "misrepresentation"),
    (r"ircc", "immigration refugees and citizenship canada"),
    # Border and customs
    (r"cbsa", "canada border services agency"),
    (r"bsos?", "border services officer"),
    (r"amps", "administrative monetary penalty system"),
    # Financial-crime and labour
    (r"fintrac", "financial transactions and reports analysis centre"),
    (r"njc", "national joint council"),
]

# Each trigger compiled as a case-insensitive whole-word pattern, paired with
# the statutory wording it stands for.
_COMPILED = [(re.compile(rf"\b{trigger}\b", re.IGNORECASE), expansion)
             for trigger, expansion in _SYNONYMS]


def expand_query(query):
    """Return `query` with canonical statutory terms appended for every legal
    abbreviation it contains; return it unchanged if nothing needs adding.

    Expansions are appended in table order, separated by single spaces, and
    the user's own wording is never altered. An expansion is skipped when its
    wording already appears (case-insensitively) in the query or in an
    expansion appended earlier. This means "PRRA (pre-removal risk
    assessment)" gains "application for protection" once rather than twice,
    and a query that already spells out "Canada Border Services Agency" is
    not padded with a repeat of it.
    """
    # Lowercased, whitespace-normalised text against which duplicates are
    # detected; it grows as expansions are accepted.
    present = " ".join(query.lower().split())
    additions = []
    for pattern, expansion in _COMPILED:
        if not pattern.search(query) or expansion in present:
            continue
        additions.append(expansion)
        present = f"{present} {expansion}"
    if not additions:
        return query
    return f"{query} {' '.join(additions)}"
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def main():
    """Command-line demo: print a query and its expanded form.

    The query is the command-line arguments joined with spaces; when none are
    given, a built-in sample query is used instead.
    """
    sample = "PRRA eligibility and an H&C application"
    query = " ".join(sys.argv[1:])
    if not query:
        query = sample
    print(f"query: {query}")
    print(f"expanded: {expand_query(query)}")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
if __name__ == "__main__":
|
| 62 |
+
main()
|