Beemer Claude Opus 4.7 committed on
Commit
1e58371
·
1 Parent(s): 2966f10

Add output-quality features: highlighting, hedging, linking, currency

Browse files

Retrieval and presentation improvements for better-grounded answers:

- Legal-abbreviation query expansion (new canlex/synonyms.py): PRRA,
H&C, RAD, CBSA and similar shorthand expand to statutory wording
before BM25 and semantic retrieval.
- Diversity cap: no single case or memo may take more than two result
slots, so one document cannot monopolise a topical query.
- Legislation guarantee: when a result set is dominated by case law,
the governing statute is pulled into it.
- Pinpoint highlighting: each long result flags the subsection or
paragraph most on point (e.g. s. 34(1)(c)), scored by the
cross-encoder.
- Low-confidence hedging: when the top semantic match is weak, the
search tool warns that the corpus may not address the question.
- Currency signalling: every legislation result carries a prominent
Currency line; the grounding note demands a dated answer.
- Cross-reference linking: a result also surfaces the regulations made
under its Act (and a regulation its enabling Act) and the CBSA
D-memoranda that cite the provision.

Eval: Hit@3 0.74 -> 0.77, Hit@5 holds at 0.89, no regression.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (3) hide show
  1. canlex/index.py +193 -17
  2. canlex/server.py +47 -11
  3. canlex/synonyms.py +62 -0
canlex/index.py CHANGED
@@ -8,12 +8,14 @@ from collections import Counter, defaultdict
8
  import snowballstemmer
9
 
10
  from .config import PROCESSED_DIR
 
11
 
12
  K1 = 1.5
13
  B = 0.75
14
  RRF_K = 60 # reciprocal-rank-fusion damping constant
15
  CANDIDATES = 80 # hits each retriever contributes to the fusion
16
  RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
 
17
 
18
  _TOKEN = re.compile(r"[a-z0-9]+")
19
  _SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
@@ -23,6 +25,16 @@ _XREF = re.compile(
23
  r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)",
24
  re.IGNORECASE)
25
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  _STEMMER = snowballstemmer.stemmer("english")
28
  _STEM_CACHE = {}
@@ -49,6 +61,37 @@ def _section_refs(query):
49
  return set(_SECTION_REF.findall(query.lower()))
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  class LegislationIndex:
53
  def __init__(self):
54
  self.chunks = []
@@ -148,20 +191,104 @@ class LegislationIndex:
148
  qv = self.embedder.encode_query(query)
149
  sims = self.vectors @ qv
150
  order = self._np.argsort(sims)[::-1][:CANDIDATES]
151
- return [int(i) for i in order]
 
 
152
 
153
  def _rerank_doc(self, idx):
154
  c = self.chunks[idx]
155
  return f"{c['citation']} — {c['marginal_note']}\n{c['text']}"
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  def search(self, query, top_k=6, act=None, doc_type=None):
158
  """Hybrid candidate fusion (BM25 + semantic), then cross-encoder rerank."""
 
 
 
 
159
  fused = defaultdict(float)
160
- bm25 = self._bm25_scores(query)
161
  for rank, idx in enumerate(sorted(bm25, key=bm25.get, reverse=True)[:CANDIDATES]):
162
  fused[idx] += 1.0 / (RRF_K + rank)
163
  if self.semantic:
164
- for rank, idx in enumerate(self._semantic_ranking(query)):
 
165
  fused[idx] += 1.0 / (RRF_K + rank)
166
 
167
  # Ensure explicitly-referenced sections are retrieved even if recall missed them.
@@ -208,7 +335,20 @@ class LegislationIndex:
208
  pinned_set = set(pinned)
209
  candidates = pinned + [i for i in candidates if i not in pinned_set]
210
 
211
- return [{**self.chunks[i], "score": round(scores[i], 4)} for i in candidates[:top_k]]
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
  def get_section(self, act, section):
214
  act = act.lower()
@@ -218,10 +358,12 @@ class LegislationIndex:
218
  return None
219
 
220
  def _build_xref(self):
221
- """Index legislation by (act, section) and locate each Act's definitions
222
- section, to support cross-reference lookup."""
 
223
  self._by_section = {}
224
  self._defs_section = {}
 
225
  for c in self.chunks:
226
  if c.get("doc_type", "legislation") != "legislation":
227
  continue
@@ -230,20 +372,48 @@ class LegislationIndex:
230
  c["marginal_note"].strip().lower() in (
231
  "definitions", "definition", "interpretation")):
232
  self._defs_section[c["act_code"]] = c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
  def related(self, chunk):
235
- """Return [(section, marginal_note), ...]: provisions of the same Act
236
- that this one cross-references, plus the Act's definitions section.
237
-
238
- Legislation chunks only; returns [] for case law, memoranda, etc.
239
- """
240
  if chunk.get("doc_type", "legislation") != "legislation":
241
- return []
242
  act = chunk["act_code"]
243
- out, seen = [], {chunk["section"]}
244
  defs = self._defs_section.get(act)
245
  if defs and defs["section"] not in seen:
246
- out.append((defs["section"], defs["marginal_note"]))
247
  seen.add(defs["section"])
248
  for match in _XREF.finditer(chunk["text"]):
249
  sec = match.group(1)
@@ -251,11 +421,17 @@ class LegislationIndex:
251
  continue
252
  target = self._by_section.get((act, sec))
253
  if target:
254
- out.append((sec, target["marginal_note"]))
255
  seen.add(sec)
256
- if len(out) >= 8:
257
  break
258
- return out
 
 
 
 
 
 
259
 
260
 
261
  def main():
 
8
  import snowballstemmer
9
 
10
  from .config import PROCESSED_DIR
11
+ from .synonyms import expand_query
12
 
13
  K1 = 1.5
14
  B = 0.75
15
  RRF_K = 60 # reciprocal-rank-fusion damping constant
16
  CANDIDATES = 80 # hits each retriever contributes to the fusion
17
  RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
18
+ SOURCE_CAP = 2 # max chunks one case/memo/agreement/directive may contribute
19
 
20
  _TOKEN = re.compile(r"[a-z0-9]+")
21
  _SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
 
25
  r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)",
26
  re.IGNORECASE)
27
 
28
+ # A line opening with "(1)", "(a)" or "(b.1)" -- a citable subdivision
29
+ # (subsection, paragraph or subparagraph) of a provision.
30
+ _MARKER = re.compile(r"(?m)^\(([0-9a-zA-Z]+(?:\.\d+)?)\)")
31
+
32
+ # A D-memorandum's reference to a provision -- "section 32 of the Customs Act",
33
+ # or "section 32 of the Act" (the Act a D-memo administers -- the Customs Act).
34
+ _MEMO_CITE = re.compile(
35
+ r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
36
+ r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)
37
+
38
 
39
  _STEMMER = snowballstemmer.stemmer("english")
40
  _STEM_CACHE = {}
 
61
  return set(_SECTION_REF.findall(query.lower()))
62
 
63
 
64
def _provision_units(text):
    """Split a provision into its citable parts for pinpoint scoring.

    Returns a list of ``(citation_suffix, scoring_text, snippet)`` tuples:

    * one entry per paragraph (a marker that does not start with a digit),
      whose ``scoring_text`` is the enclosing subsection's chapeau followed
      by the paragraph itself, and whose ``snippet`` is the paragraph alone;
    * one entry per subsection (a marker that starts with a digit) that is
      not followed by a paragraph, with its own text as both scoring text
      and snippet.

    Subdivisions whose first 40 characters contain "[Repealed" are skipped.
    Returns [] when the text has fewer than two markers, i.e. the provision
    is too flat to pinpoint.
    """
    matches = list(_MARKER.finditer(text))
    if len(matches) < 2:
        return []

    # Each subdivision runs from its own marker up to the next marker (or to
    # the end of the text for the last one).
    starts = [m.start() for m in matches]
    stops = starts[1:] + [len(text)]
    pieces = [(m.group(1), text[begin:stop].strip())
              for m, begin, stop in zip(matches, starts, stops)]
    # Trailing "" sentinel so "the token after this one" always exists.
    tokens = [tok for tok, _ in pieces] + [""]

    result = []
    subsection = chapeau = ""
    for pos, (tok, chunk) in enumerate(pieces):
        numeric = tok[0].isdigit()
        if "[Repealed" in chunk[:40]:
            # A repealed subsection still resets the context, so paragraphs
            # that follow are not attributed to the previous subsection.
            if numeric:
                subsection, chapeau = f"({tok})", ""
            continue
        if not numeric:
            # Paragraph: cite it under the current subsection (if any) and
            # score it together with that subsection's opening words.
            result.append((f"{subsection}({tok})",
                           f"{chapeau} {chunk}".strip(), chunk))
            continue
        subsection, chapeau = f"({tok})", chunk
        following = tokens[pos + 1]
        if not following or following[0].isdigit():
            # Subsection with no paragraphs of its own: it is the unit.
            result.append((subsection, chunk, chunk))
        # Otherwise the chapeau is emitted through its paragraphs.
    return result
93
+
94
+
95
  class LegislationIndex:
96
  def __init__(self):
97
  self.chunks = []
 
191
  qv = self.embedder.encode_query(query)
192
  sims = self.vectors @ qv
193
  order = self._np.argsort(sims)[::-1][:CANDIDATES]
194
+ # The top cosine similarity doubles as a corpus-coverage signal: a query
195
+ # the corpus cannot answer has no passage close to it.
196
+ return [int(i) for i in order], float(sims.max())
197
 
198
  def _rerank_doc(self, idx):
199
  c = self.chunks[idx]
200
  return f"{c['citation']} — {c['marginal_note']}\n{c['text']}"
201
 
202
def _source_key(self, idx):
    """Identify the parent document of chunk ``idx`` for diversity capping.

    Returns None for legislation (a chunk with no ``doc_type`` counts as
    legislation), which means "never capped". For every other document type
    the key is a ``(doc_type, identifier)`` tuple: memoranda are identified
    by their ``section`` field, everything else (case law, agreements,
    directives) by ``act_code``.
    """
    chunk = self.chunks[idx]
    kind = chunk.get("doc_type", "legislation")
    if kind == "legislation":
        return None
    # Memoranda share one act_code, so only "section" tells them apart;
    # other secondary sources are distinguished by act_code.
    field = "section" if kind == "memorandum" else "act_code"
    return (kind, chunk[field])
213
+
214
def _diversify(self, ordered):
    """Reorder ``ordered`` so no single non-legislation source dominates.

    Walks the ranking once, counting chunks per source key (as given by
    ``_source_key``). The first SOURCE_CAP chunks of each source keep their
    place; any further chunk of that source is moved behind every other
    candidate. Chunks whose source key is None (legislation) are never
    counted or deferred. Relative order is preserved within both the kept
    and the deferred groups.
    """
    seen = defaultdict(int)
    front, back = [], []
    for idx in ordered:
        source = self._source_key(idx)
        if source is not None:
            seen[source] += 1
            if seen[source] > SOURCE_CAP:
                # This source has already used up its quota.
                back.append(idx)
                continue
        front.append(idx)
    return front + back
229
+
230
def _ensure_legislation(self, ordered, top_k):
    """Guarantee legislation is represented in the first ``top_k`` results.

    When fewer than two of the first ``top_k`` candidates are legislation,
    the best-ranked legislation chunks from further down are moved up to
    sit directly below the top hit, and an equal number of the lowest-ranked
    non-legislation chunks are pushed out of the window to make room. The
    first candidate is never displaced.

    Args:
        ordered: candidate chunk indices, best first.
        top_k: size of the result window. Windows smaller than 3 are left
            untouched.

    Returns:
        The reordered list of indices. ``ordered`` itself is returned
        unchanged when no promotion is needed or possible.
    """
    if top_k < 3:
        return ordered

    def is_leg(i):
        return self.chunks[i].get("doc_type", "legislation") == "legislation"

    top, rest = ordered[:top_k], ordered[top_k:]
    need = 2 - sum(1 for i in top if is_leg(i))
    if need <= 0:
        return ordered

    promote = [i for i in rest if is_leg(i)][:need]
    secondary = [i for i in top if not is_leg(i)]
    count = min(len(promote), len(secondary))
    if not count:
        return ordered
    promote = promote[:count]
    # Take the lowest-ranked secondary sources, but keep them in rank order:
    # they are re-inserted just outside the window and must not be reversed
    # relative to one another.
    drop = secondary[-count:]

    dropped, promoted = set(drop), set(promote)
    kept = [i for i in top if i not in dropped]
    return (kept[:1] + promote + kept[1:] + drop
            + [i for i in rest if i not in promoted])
252
+
253
def _highlight(self, query, indices):
    """Pick, for each legislation result, the subdivision most on point.

    Returns ``{result_position: (citation_suffix, snippet)}`` where the
    snippet is the first 240 characters of the winning subdivision with
    whitespace runs collapsed to single spaces. Returns {} when there is no
    reranker or when none of the examined results yields any units from
    ``_provision_units``.

    Only the first eight entries of ``indices`` are examined, and
    non-legislation chunks are skipped. All units are scored in a single
    ``self.reranker.score`` call; each unit's scoring text is prefixed with
    the chunk's marginal note.
    """
    if not self.reranker:
        return {}

    pending = []  # (result_position, label, scoring_text, snippet)
    for position, idx in enumerate(indices[:8]):
        chunk = self.chunks[idx]
        if chunk.get("doc_type", "legislation") != "legislation":
            continue
        heading = chunk["marginal_note"]
        pending.extend(
            (position, label, f"{heading}. {scoring}", snippet)
            for label, scoring, snippet in _provision_units(chunk["text"]))
    if not pending:
        return {}

    scores = self.reranker.score(query, [job[2] for job in pending])
    winners = {}  # result_position -> (score, label, snippet)
    for (position, label, _, snippet), score in zip(pending, scores):
        current = winners.get(position)
        # Strict ">" keeps the earliest unit when scores tie.
        if current is None or score > current[0]:
            winners[position] = (score, label, snippet)
    return {position: (label, " ".join(snippet[:240].split()))
            for position, (_, label, snippet) in winners.items()}
278
+
279
  def search(self, query, top_k=6, act=None, doc_type=None):
280
  """Hybrid candidate fusion (BM25 + semantic), then cross-encoder rerank."""
281
+ # Expand legal abbreviations (PRRA, H&C, ...) into statutory wording for
282
+ # the recall stages; the reranker still sees the user's original query.
283
+ expanded = expand_query(query)
284
+ confidence = None
285
  fused = defaultdict(float)
286
+ bm25 = self._bm25_scores(expanded)
287
  for rank, idx in enumerate(sorted(bm25, key=bm25.get, reverse=True)[:CANDIDATES]):
288
  fused[idx] += 1.0 / (RRF_K + rank)
289
  if self.semantic:
290
+ sem_order, confidence = self._semantic_ranking(expanded)
291
+ for rank, idx in enumerate(sem_order):
292
  fused[idx] += 1.0 / (RRF_K + rank)
293
 
294
  # Ensure explicitly-referenced sections are retrieved even if recall missed them.
 
335
  pinned_set = set(pinned)
336
  candidates = pinned + [i for i in candidates if i not in pinned_set]
337
 
338
+ # Cap one-source monopolies, then guarantee the statute is represented.
339
+ candidates = self._diversify(candidates)
340
+ candidates = self._ensure_legislation(candidates, top_k)
341
+
342
+ top = candidates[:top_k]
343
+ highlights = self._highlight(query, top)
344
+ results = []
345
+ for pos, i in enumerate(top):
346
+ result = {**self.chunks[i], "score": round(scores[i], 4),
347
+ "confidence": confidence}
348
+ if pos in highlights:
349
+ result["highlight"] = highlights[pos]
350
+ results.append(result)
351
+ return results
352
 
353
  def get_section(self, act, section):
354
  act = act.lower()
 
358
  return None
359
 
360
  def _build_xref(self):
361
+ """Index legislation by (act, section); find each Act's definitions
362
+ section; link every regulation to its enabling Act and every
363
+ D-memorandum to the provisions it cites -- all for cross-referencing."""
364
  self._by_section = {}
365
  self._defs_section = {}
366
+ acts, regs = {}, {} # act_code -> (act_short, act_name)
367
  for c in self.chunks:
368
  if c.get("doc_type", "legislation") != "legislation":
369
  continue
 
372
  c["marginal_note"].strip().lower() in (
373
  "definitions", "definition", "interpretation")):
374
  self._defs_section[c["act_code"]] = c
375
+ bucket = regs if c["act_code"].startswith(("SOR", "C.R.C")) else acts
376
+ bucket.setdefault(c["act_code"], (c["act_short"], c["act_name"]))
377
+
378
+ # Link a regulation to the Act it is made under by matching their names
379
+ # ("X Regulations" <-> "X Act").
380
+ self._enabling_act = {} # reg code -> (act_short, act_name)
381
+ self._regulations = defaultdict(list) # act code -> [(reg_short, reg_name)]
382
+ def base(name):
383
+ return re.sub(r"\b(?:Act|Regulations)\b", "", name).strip().lower()
384
+ act_by_base = {base(n): (code, s, n) for code, (s, n) in acts.items()}
385
+ for rcode, (rshort, rname) in regs.items():
386
+ hit = act_by_base.get(base(rname))
387
+ if hit:
388
+ self._enabling_act[rcode] = (hit[1], hit[2])
389
+ self._regulations[hit[0]].append((rshort, rname))
390
+
391
+ # Link D-memoranda to the Customs Act / Customs Tariff provisions they
392
+ # cite; an unqualified "the Act" in a D-memo means the Customs Act.
393
+ by_short = {s.lower(): code for code, (s, n) in acts.items()}
394
+ customs, tariff = by_short.get("customs act"), by_short.get("customs tariff")
395
+ self._memos_for_section = defaultdict(set) # (act_code, section) -> memos
396
+ for c in self.chunks:
397
+ if c.get("doc_type") != "memorandum":
398
+ continue
399
+ for num, which in _MEMO_CITE.findall(c["text"]):
400
+ code = tariff if which.lower() == "customs tariff" else customs
401
+ if code:
402
+ self._memos_for_section[(code, num)].add(c["section"])
403
 
404
  def related(self, chunk):
405
+ """Cross-references for a legislation result, as a dict: 'provisions'
406
+ (intra-Act sections it cites, plus the definitions section),
407
+ 'regulations' (made under this Act), 'enabling_act' (for a regulation,
408
+ the Act it is made under) and 'memoranda' (D-memo numbers citing this
409
+ section). Empty dict for case law, memoranda, etc."""
410
  if chunk.get("doc_type", "legislation") != "legislation":
411
+ return {}
412
  act = chunk["act_code"]
413
+ provisions, seen = [], {chunk["section"]}
414
  defs = self._defs_section.get(act)
415
  if defs and defs["section"] not in seen:
416
+ provisions.append((defs["section"], defs["marginal_note"]))
417
  seen.add(defs["section"])
418
  for match in _XREF.finditer(chunk["text"]):
419
  sec = match.group(1)
 
421
  continue
422
  target = self._by_section.get((act, sec))
423
  if target:
424
+ provisions.append((sec, target["marginal_note"]))
425
  seen.add(sec)
426
+ if len(provisions) >= 8:
427
  break
428
+ return {
429
+ "provisions": provisions,
430
+ "regulations": self._regulations.get(act, []),
431
+ "enabling_act": self._enabling_act.get(act),
432
+ "memoranda": sorted(self._memos_for_section.get(
433
+ (act, chunk["section"]), []))[:6],
434
+ }
435
 
436
 
437
  def main():
canlex/server.py CHANGED
@@ -31,9 +31,10 @@ _READONLY = {
31
  GROUNDING_NOTE = (
32
  "ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
33
  "specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
34
- "When a result lists Related provisions, fetch any that bear on the question "
35
- "-- the definitions section, an exception, a cross-referenced rule -- with "
36
- "canlex_get_section before answering. "
 
37
  "Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
38
  "CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
39
  "and a court may disagree with them; collective agreements and the National "
@@ -43,13 +44,24 @@ GROUNDING_NOTE = (
43
  "name the deciding court and the date, and do not assume a decision is still "
44
  "good law if it may have been overtaken (the canlex_case tool checks a "
45
  "decision's later treatment on CanLII -- give it the neutral citation). "
46
- "State the "
47
- "'current to', modified, or in-force date when stating the law. If the material "
 
48
  "below does not fully resolve the question -- including where it turns on case "
49
  "law or facts not present here -- say so explicitly. This is legal information, "
50
  "not legal advice."
51
  )
52
 
 
 
 
 
 
 
 
 
 
 
53
  _INDEX: Optional[LegislationIndex] = None
54
 
55
 
@@ -99,16 +111,36 @@ def _format_section(c: dict, related=None) -> str:
99
  if c["heading"]:
100
  lines.append(f"Subject: {c['heading']}")
101
  else:
102
- meta = [f"current to {c['current_to'] or 'n/a'}"]
103
  if c["last_amended"]:
104
  meta.append(f"last amended {c['last_amended']}")
105
- lines.append(f"({'; '.join(meta)})")
 
 
 
 
 
 
106
  lines.append("")
107
  lines.append(c["text"])
108
  lines.append("")
109
  if related:
110
- refs = "; ".join(f"s. {s} ({n})" if n else f"s. {s}" for s, n in related)
111
- lines.append(f"Related provisions in this Act: {refs}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  if c["history"]:
113
  if doc_type == "caselaw":
114
  lines.append(f"Also reported: {c['history']}")
@@ -205,8 +237,12 @@ def canlex_search_legislation(params: SearchInput) -> str:
205
  return (f"No results matched '{params.query}'{scope}. "
206
  f"Try broader or different keywords, or call canlex_list_acts to see "
207
  f"what is currently loaded.")
208
- blocks = [GROUNDING_NOTE, "",
209
- f'{len(results)} relevant section(s) for: "{params.query}"']
 
 
 
 
210
  for c in results:
211
  blocks.append("")
212
  blocks.append("---")
 
31
  GROUNDING_NOTE = (
32
  "ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
33
  "specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
34
+ "When a result lists related provisions, regulations or D-memoranda, fetch "
35
+ "any that bear on the question -- the definitions section, an exception, a "
36
+ "cross-referenced rule, the regulation that adds detail -- with "
37
+ "canlex_get_section or canlex_search_legislation before answering. "
38
  "Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
39
  "CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
40
  "and a court may disagree with them; collective agreements and the National "
 
44
  "name the deciding court and the date, and do not assume a decision is still "
45
  "good law if it may have been overtaken (the canlex_case tool checks a "
46
  "decision's later treatment on CanLII -- give it the neutral citation). "
47
+ "Always state the date the source is current to, and that the answer "
48
+ "reflects the law only as of that date -- for a time-sensitive matter, tell "
49
+ "the reader to verify no amendment has come into force since. If the material "
50
  "below does not fully resolve the question -- including where it turns on case "
51
  "law or facts not present here -- say so explicitly. This is legal information, "
52
  "not legal advice."
53
  )
54
 
55
+ HEDGE_THRESHOLD = 0.72 # max semantic similarity below which results are weak
56
+
57
+ WEAK_MATCH_NOTE = (
58
+ "RETRIEVAL CAUTION: the material below is only a weak match for this query "
59
+ "— CanLex may not contain a provision or decision that directly answers it. "
60
+ "Read it critically; if it does not actually address the question, say so "
61
+ "plainly rather than stretching it to fit, and consider canlex_list_acts to "
62
+ "check what the corpus covers."
63
+ )
64
+
65
  _INDEX: Optional[LegislationIndex] = None
66
 
67
 
 
111
  if c["heading"]:
112
  lines.append(f"Subject: {c['heading']}")
113
  else:
114
+ meta = [f"in force; text current to {c['current_to'] or 'n/a'}"]
115
  if c["last_amended"]:
116
  meta.append(f"last amended {c['last_amended']}")
117
+ lines.append(f"**Currency:** {'; '.join(meta)}. Does not reflect any "
118
+ f"amendment that came into force after the 'current to' date.")
119
+ hl = c.get("highlight")
120
+ if hl:
121
+ label, snippet = hl
122
+ lines.append(f"**Most on point for this query:** "
123
+ f"{c['citation']}{label} — {snippet}")
124
  lines.append("")
125
  lines.append(c["text"])
126
  lines.append("")
127
  if related:
128
+ provisions = related.get("provisions")
129
+ if provisions:
130
+ refs = "; ".join(f"s. {s} ({n})" if n else f"s. {s}"
131
+ for s, n in provisions)
132
+ lines.append(f"Related provisions in this Act: {refs}")
133
+ regs = related.get("regulations")
134
+ if regs:
135
+ lines.append("Regulations made under this Act: "
136
+ + "; ".join(f"{n} ({s})" for s, n in regs))
137
+ enabling = related.get("enabling_act")
138
+ if enabling:
139
+ lines.append(f"Made under: {enabling[1]} ({enabling[0]})")
140
+ memos = related.get("memoranda")
141
+ if memos:
142
+ lines.append("CBSA D-memoranda citing this section (guidance, not "
143
+ "binding): " + ", ".join(memos))
144
  if c["history"]:
145
  if doc_type == "caselaw":
146
  lines.append(f"Also reported: {c['history']}")
 
237
  return (f"No results matched '{params.query}'{scope}. "
238
  f"Try broader or different keywords, or call canlex_list_acts to see "
239
  f"what is currently loaded.")
240
+ blocks = []
241
+ weak = results[0].get("confidence")
242
+ if weak is not None and weak < HEDGE_THRESHOLD:
243
+ blocks += [WEAK_MATCH_NOTE, ""]
244
+ blocks += [GROUNDING_NOTE, "",
245
+ f'{len(results)} relevant section(s) for: "{params.query}"']
246
  for c in results:
247
  blocks.append("")
248
  blocks.append("---")
canlex/synonyms.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Query-side expansion of legal abbreviations and informal terms.
2
+
3
+ Statutes use formal wording -- "application for protection", "removal order" --
4
+ but users (and a model drafting a search) reach for everyday shorthand: "PRRA",
5
+ "H&C", "deportation". Before retrieval, expand_query() appends the canonical
6
+ statutory terms for any abbreviation or nickname it recognises, so the BM25 and
7
+ semantic stages can match the provision's actual language. It only ever ADDS
8
+ words -- the user's own phrasing is left untouched -- and the cross-encoder
9
+ reranker still sees the original query, so precision is unaffected.
10
+
11
+ python -m canlex.synonyms "PRRA eligibility and an H&C application"
12
+ """
13
+ import re
14
+ import sys
15
+
16
+ # (trigger, canonical terms to append). The trigger is a regex fragment matched
17
+ # case-insensitively as a whole word. Keep this list high-precision: an entry
18
+ # earns its place only when the shorthand is unambiguous in Canadian border,
19
+ # immigration, customs, financial-crime or labour law.
20
+ _SYNONYMS = [
21
+ # Immigration and refugee law
22
+ (r"prra", "pre-removal risk assessment application for protection"),
23
+ (r"pre[- ]removal risk assessment", "application for protection"),
24
+ (r"h\s*&\s*c", "humanitarian and compassionate"),
25
+ (r"rad", "refugee appeal division"),
26
+ (r"rpd", "refugee protection division"),
27
+ (r"iad", "immigration appeal division"),
28
+ (r"irb", "immigration and refugee board"),
29
+ (r"trp", "temporary resident permit"),
30
+ (r"deportation", "removal order"),
31
+ (r"misrep", "misrepresentation"),
32
+ (r"ircc", "immigration refugees and citizenship canada"),
33
+ # Border and customs
34
+ (r"cbsa", "canada border services agency"),
35
+ (r"bsos?", "border services officer"),
36
+ (r"amps", "administrative monetary penalty system"),
37
+ # Financial-crime and labour
38
+ (r"fintrac", "financial transactions and reports analysis centre"),
39
+ (r"njc", "national joint council"),
40
+ ]
41
+
42
+ _COMPILED = [(re.compile(rf"\b{trigger}\b", re.IGNORECASE), expansion)
43
+ for trigger, expansion in _SYNONYMS]
44
+
45
+
46
def expand_query(query):
    """Append canonical statutory wording for recognised shorthand.

    Every pattern in ``_COMPILED`` that is found in ``query`` contributes its
    expansion, in table order; the expansions are joined with single spaces
    and appended after the original query text. When no pattern matches,
    ``query`` is returned unchanged.
    """
    matched = []
    for pattern, expansion in _COMPILED:
        if pattern.search(query):
            matched.append(expansion)
    return f"{query} {' '.join(matched)}" if matched else query
53
+
54
+
55
def main():
    """Command-line demo: print a query and its expanded form.

    The query is the command-line arguments joined with spaces; when that is
    empty, a built-in sample query is used instead.
    """
    query = " ".join(sys.argv[1:])
    if not query:
        query = "PRRA eligibility and an H&C application"
    print(f"query: {query}")
    expanded = expand_query(query)
    print(f"expanded: {expanded}")
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()