Beemer Claude Opus 4.7 committed on
Commit
2966f10
·
1 Parent(s): b8c217b

Upgrade retrieval: bge-small embeddings + promote-only reranking

Browse files

Replace the model2vec static embedding with bge-small-en-v1.5, a local
transformer sentence-embedder run as ONNX on CPU (key-free). The reranker
now only promotes candidates -- placing each at the better of its fusion
and rerank position, never lower -- because the cross-encoder scores long
statutory text unreliably and was burying correct results.

Also includes intra-Act cross-reference and definition linking in search
results, a 47-question retrieval eval harness (canlex/eval.py), and
FPSLREB/CIRB ingestion wiring in caselaw.py (decisions not yet fetched).

Eval: Hit@5 0.74 -> 0.89, Hit@10 0.81 -> 0.94, MRR 0.60 -> 0.64.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

canlex/caselaw.py CHANGED
@@ -13,15 +13,16 @@ deliberately not a comprehensive scrape.
13
  import json
14
  import re
15
  import time
 
16
  import urllib.request
17
 
18
  from bs4 import BeautifulSoup
19
 
20
  from .config import PROCESSED_DIR, RAW_DIR
21
 
22
- # Each court's official Lexum database: (display name, item-URL template). All
23
- # three sites behave identically -- same iframe trick, metadata block and
24
- # bracketed paragraph numbers -- so one parser serves them all.
25
  COURTS = {
26
  "scc": ("Supreme Court of Canada",
27
  "https://decisions.scc-csc.ca/scc-csc/scc-csc/en/item/{id}/index.do"),
@@ -29,6 +30,11 @@ COURTS = {
29
  "https://decisions.fca-caf.gc.ca/fca-caf/decisions/en/item/{id}/index.do"),
30
  "fc": ("Federal Court",
31
  "https://decisions.fct-cf.gc.ca/fc-cf/decisions/en/item/{id}/index.do"),
 
 
 
 
 
32
  }
33
  _RAW = RAW_DIR / "caselaw"
34
  OUT = PROCESSED_DIR / "caselaw.json"
@@ -38,7 +44,7 @@ OUT = PROCESSED_DIR / "caselaw.json"
38
  # from the throttle below and from caching every fetched page on disk.
39
  _UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
40
  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
41
- _THROTTLE = 2.0 # seconds between live fetches
42
  _CHUNK_CHARS = 1800 # target characters per chunk
43
 
44
  # Marks the post-reasons apparatus (appended legislation, solicitors list),
@@ -200,6 +206,71 @@ CASES = [
200
  {"court": "fc", "id": 62413, "short": "Da Huang",
201
  "topic": "PCMLTFA currency forfeiture; partial return of seized funds "
202
  "where only part is shown to be of legitimate origin"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  ]
204
 
205
  # In-force jurisprudential guides of the Immigration and Refugee Board's Refugee
@@ -224,14 +295,25 @@ IRB_GUIDES = [
224
 
225
 
226
  def _get(url, cache_name):
227
- """Fetch a page, caching the raw HTML under data/raw/caselaw."""
 
 
 
228
  cache = _RAW / cache_name
229
  if cache.exists():
230
  return cache.read_text(encoding="utf-8")
231
  req = urllib.request.Request(url, headers={"User-Agent": _UA})
232
- time.sleep(_THROTTLE)
233
- with urllib.request.urlopen(req, timeout=60) as resp:
234
- text = resp.read().decode("utf-8", errors="replace")
 
 
 
 
 
 
 
 
235
  _RAW.mkdir(parents=True, exist_ok=True)
236
  cache.write_text(text, encoding="utf-8")
237
  return text
@@ -398,9 +480,10 @@ def _decision_chunks(case, soup):
398
  court_name, item_tmpl = COURTS[case["court"]]
399
  name, fields = _metadata(soup)
400
  name = name or case["short"]
401
- cite = fields.get("neutral citation") or fields.get("report") or ""
 
402
  report = fields.get("report", "")
403
- date = fields.get("date", "")
404
  citation = f"{name}, {cite}" if cite else name
405
  item_url = item_tmpl.format(id=case["id"])
406
  modern, paras = _paragraphs(soup)
 
13
  import json
14
  import re
15
  import time
16
+ import urllib.error
17
  import urllib.request
18
 
19
  from bs4 import BeautifulSoup
20
 
21
  from .config import PROCESSED_DIR, RAW_DIR
22
 
23
+ # Each court or tribunal's Lexum decisions database: (display name, item-URL
24
+ # template). All five run the same Lexum platform -- same iframe trick, metadata
25
+ # block and bracketed paragraph numbers -- so one parser serves them all.
26
  COURTS = {
27
  "scc": ("Supreme Court of Canada",
28
  "https://decisions.scc-csc.ca/scc-csc/scc-csc/en/item/{id}/index.do"),
 
30
  "https://decisions.fca-caf.gc.ca/fca-caf/decisions/en/item/{id}/index.do"),
31
  "fc": ("Federal Court",
32
  "https://decisions.fct-cf.gc.ca/fc-cf/decisions/en/item/{id}/index.do"),
33
+ "fpslreb": ("Federal Public Sector Labour Relations and Employment Board",
34
+ "https://decisions.fpslreb-crtespf.gc.ca/fpslreb-crtespf/d/en/"
35
+ "item/{id}/index.do"),
36
+ "cirb": ("Canada Industrial Relations Board",
37
+ "https://decisia.lexum.com/cirb-ccri/cirb-ccri/en/item/{id}/index.do"),
38
  }
39
  _RAW = RAW_DIR / "caselaw"
40
  OUT = PROCESSED_DIR / "caselaw.json"
 
44
  # from the throttle below and from caching every fetched page on disk.
45
  _UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
46
  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
47
+ _THROTTLE = 6.0 # seconds between live fetches (Lexum rate-limits hard)
48
  _CHUNK_CHARS = 1800 # target characters per chunk
49
 
50
  # Marks the post-reasons apparatus (appended legislation, solicitors list),
 
206
  {"court": "fc", "id": 62413, "short": "Da Huang",
207
  "topic": "PCMLTFA currency forfeiture; partial return of seized funds "
208
  "where only part is shown to be of legitimate origin"},
209
+
210
+ # --- Federal Public Sector Labour Relations and Employment Board ---
211
+ {"court": "fpslreb", "id": 520990, "short": "Menzies",
212
+ "topic": "Progressive discipline of a CBSA border services officer; the "
213
+ "lock-step approach to discipline rejected"},
214
+ {"court": "fpslreb", "id": 521231, "short": "Kline",
215
+ "topic": "Bad-faith termination of a CBSA employee; reinstatement and "
216
+ "damages"},
217
+ {"court": "fpslreb", "id": 521195, "short": "Sousa Dias",
218
+ "topic": "Discipline and termination grievance of a CBSA employee"},
219
+ {"court": "fpslreb", "id": 521082, "short": "Anderson",
220
+ "topic": "CBSA grievance; interpretation of the FB-group collective "
221
+ "agreement"},
222
+ {"court": "fpslreb", "id": 520948, "short": "Burlacu",
223
+ "topic": "CBSA; occupational health and safety and staffing"},
224
+ {"court": "fpslreb", "id": 483604, "short": "Malik",
225
+ "topic": "Discipline and termination grievance of a CBSA employee"},
226
+ {"court": "fpslreb", "id": 500554, "short": "Andruszkiewicz",
227
+ "topic": "Unfair labour practice complaint involving the CBSA"},
228
+ {"court": "fpslreb", "id": 359013, "short": "PSAC v TB (CBSA)",
229
+ "topic": "Policy grievance; collective agreement interpretation at the "
230
+ "CBSA"},
231
+ {"court": "fpslreb", "id": 359065, "short": "Martin-Ivie",
232
+ "topic": "Occupational health and safety; the arming and safety of CBSA "
233
+ "border officers"},
234
+ {"court": "fpslreb", "id": 358886, "short": "Basra (2012)",
235
+ "topic": "Discipline and termination grievance; a later proceeding in "
236
+ "the leading Basra line"},
237
+ {"court": "fpslreb", "id": 358025, "short": "Basra (2007)",
238
+ "topic": "The foundational Basra decision on discipline and the burden "
239
+ "of proof in a grievance"},
240
+ {"court": "fpslreb", "id": 358150, "short": "Quadrini",
241
+ "topic": "Unfair labour practice and freedom of expression in the "
242
+ "federal public service"},
243
+ {"court": "fpslreb", "id": 358180, "short": "Pepper",
244
+ "topic": "Discipline and termination; frequently-cited principles on "
245
+ "just cause"},
246
+ {"court": "fpslreb", "id": 358097, "short": "Richmond",
247
+ "topic": "Classification grievance in the federal public service"},
248
+ {"court": "fpslreb", "id": 358890, "short": "Baldasaro and Thiessen",
249
+ "topic": "Hours of work and overtime under a collective agreement"},
250
+ {"court": "fpslreb", "id": 358203, "short": "PSAC v TB (pay)",
251
+ "topic": "Collective agreement and pay administration policy grievance"},
252
+ {"court": "fpslreb", "id": 360456, "short": "Kinhnicki",
253
+ "topic": "Occupational health and safety; a refusal to work in a customs "
254
+ "context"},
255
+
256
+ # --- Canada Industrial Relations Board ---
257
+ {"court": "cirb", "id": 519772, "short": "Watson",
258
+ "topic": "Duty of fair representation and a mandatory vaccination policy "
259
+ "under the Canada Labour Code"},
260
+ {"court": "cirb", "id": 5478, "short": "McRaeJackson",
261
+ "topic": "The leading test for the duty of fair representation under "
262
+ "s. 37 of the Canada Labour Code"},
263
+ {"court": "cirb", "id": 5491, "short": "Securicor",
264
+ "topic": "Certification and bargaining-unit determination under the "
265
+ "Canada Labour Code"},
266
+ {"court": "cirb", "id": 5593, "short": "Dover Industries",
267
+ "topic": "Successor rights on the sale of a business under the Canada "
268
+ "Labour Code"},
269
+ {"court": "cirb", "id": 301063, "short": "Swissport",
270
+ "topic": "Unfair labour practice complaint under the Canada Labour Code"},
271
+ {"court": "cirb", "id": 5599, "short": "Cooney Transport",
272
+ "topic": "Related-employer (common-employer) declaration under the "
273
+ "Canada Labour Code"},
274
  ]
275
 
276
  # In-force jurisprudential guides of the Immigration and Refugee Board's Refugee
 
295
 
296
 
297
  def _get(url, cache_name):
298
+ """Fetch a page, caching the raw HTML under data/raw/caselaw.
299
+
300
+ Retries once on HTTP 403/429 -- the Lexum hosts rate-limit by IP.
301
+ """
302
  cache = _RAW / cache_name
303
  if cache.exists():
304
  return cache.read_text(encoding="utf-8")
305
  req = urllib.request.Request(url, headers={"User-Agent": _UA})
306
+ text = None
307
+ for attempt in range(2):
308
+ time.sleep(_THROTTLE if attempt == 0 else 25.0)
309
+ try:
310
+ with urllib.request.urlopen(req, timeout=60) as resp:
311
+ text = resp.read().decode("utf-8", errors="replace")
312
+ break
313
+ except urllib.error.HTTPError as exc:
314
+ if exc.code in (403, 429) and attempt == 0:
315
+ continue
316
+ raise
317
  _RAW.mkdir(parents=True, exist_ok=True)
318
  cache.write_text(text, encoding="utf-8")
319
  return text
 
480
  court_name, item_tmpl = COURTS[case["court"]]
481
  name, fields = _metadata(soup)
482
  name = name or case["short"]
483
+ cite = (fields.get("neutral citation") or fields.get("citation")
484
+ or fields.get("report") or "")
485
  report = fields.get("report", "")
486
+ date = fields.get("date") or fields.get("decision rendered") or ""
487
  citation = f"{name}, {cite}" if cite else name
488
  item_url = item_tmpl.format(id=case["id"])
489
  modern, paras = _paragraphs(soup)
canlex/embed.py CHANGED
@@ -1,13 +1,26 @@
1
- """Build semantic embeddings for ingested legislation chunks (local, key-free)."""
 
 
 
 
 
 
2
  import json
3
 
4
  import numpy as np
 
 
 
5
 
6
  from .config import PROCESSED_DIR
7
 
8
- MODEL_NAME = "minishlab/potion-retrieval-32M"
9
  EMB_PATH = PROCESSED_DIR / "embeddings.npz"
10
- _MAX_BODY = 2000 # cap embedded body text so long sections stay topically focused
 
 
 
 
11
 
12
 
13
  def load_chunks():
@@ -22,33 +35,75 @@ def embed_text(chunk):
22
  note = chunk["marginal_note"]
23
  body = chunk["text"][:_MAX_BODY]
24
  # The marginal note (section title) is the strongest topical signal, so it
25
- # is repeated to weight it up in the mean-pooled static embedding.
26
  parts = [chunk["act_short"], note, note, chunk["heading"], body]
27
  return " . ".join(p for p in parts if p)
28
 
29
 
30
  class Embedder:
31
- """Local static-embedding model (model2vec): no API key, runs offline once cached."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- def __init__(self, model_name=MODEL_NAME):
34
- from model2vec import StaticModel
35
- self.model = StaticModel.from_pretrained(model_name)
 
 
 
 
 
36
 
37
- def encode(self, texts):
38
- """Return L2-normalized float32 vectors, one row per input text."""
39
- vecs = np.asarray(self.model.encode(list(texts)), dtype=np.float32)
40
- if vecs.ndim == 1:
41
- vecs = vecs.reshape(1, -1)
42
- norms = np.linalg.norm(vecs, axis=1, keepdims=True)
43
- return vecs / np.maximum(norms, 1e-9)
44
 
45
 
46
  def build():
47
  chunks = load_chunks()
48
  if not chunks:
49
- print(f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
50
  return
51
- print(f"Embedding {len(chunks)} sections with {MODEL_NAME} ...")
52
  vectors = Embedder().encode([embed_text(c) for c in chunks])
53
  ids = np.array([c["id"] for c in chunks])
54
  np.savez(EMB_PATH, ids=ids, vectors=vectors)
 
1
+ """Build semantic embeddings for ingested chunks (local, key-free).
2
+
3
+ Uses BAAI's bge-small-en-v1.5 sentence-embedding model as ONNX, run on CPU via
4
+ onnxruntime -- no API key. A transformer embedding has far stronger retrieval
5
+ recall than a static one: it can connect a natural-language question to a
6
+ provision even when the two share few exact words.
7
+ """
8
  import json
9
 
10
  import numpy as np
11
+ import onnxruntime as ort
12
+ from huggingface_hub import hf_hub_download
13
+ from tokenizers import Tokenizer
14
 
15
  from .config import PROCESSED_DIR
16
 
17
+ EMB_REPO = "Xenova/bge-small-en-v1.5"
18
  EMB_PATH = PROCESSED_DIR / "embeddings.npz"
19
+ _MAX_TOKENS = 512
20
+ _MAX_BODY = 2000 # cap embedded body text so long sections stay topically focused
21
+ # bge-small retrieval: the query is prefixed with this instruction; passages
22
+ # are embedded without it. The asymmetry is how the model was trained.
23
+ _QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
24
 
25
 
26
  def load_chunks():
 
35
  note = chunk["marginal_note"]
36
  body = chunk["text"][:_MAX_BODY]
37
  # The marginal note (section title) is the strongest topical signal, so it
38
+ # is repeated to emphasise it.
39
  parts = [chunk["act_short"], note, note, chunk["heading"], body]
40
  return " . ".join(p for p in parts if p)
41
 
42
 
43
  class Embedder:
44
+ """Local transformer sentence-embedder: bge-small-en-v1.5 as ONNX on CPU.
45
+
46
+ No API key; the model is downloaded once and cached. Produces L2-normalized
47
+ vectors, so a dot product between them is cosine similarity.
48
+ """
49
+
50
+ def __init__(self):
51
+ model_path = None
52
+ for name in ("onnx/model_quantized.onnx", "onnx/model.onnx"):
53
+ try:
54
+ model_path = hf_hub_download(EMB_REPO, name)
55
+ break
56
+ except Exception:
57
+ continue
58
+ if model_path is None:
59
+ raise RuntimeError(f"Could not download an ONNX model from {EMB_REPO}.")
60
+ tok_path = hf_hub_download(EMB_REPO, "tokenizer.json")
61
+ self.session = ort.InferenceSession(model_path,
62
+ providers=["CPUExecutionProvider"])
63
+ self.input_names = {i.name for i in self.session.get_inputs()}
64
+ self.tokenizer = Tokenizer.from_file(tok_path)
65
+ self.tokenizer.enable_truncation(max_length=_MAX_TOKENS)
66
+
67
+ def _run(self, texts):
68
+ """Tokenize, run the encoder, CLS-pool and L2-normalize one batch."""
69
+ encs = self.tokenizer.encode_batch(list(texts))
70
+ width = max(len(e.ids) for e in encs)
71
+ input_ids = np.zeros((len(encs), width), dtype=np.int64)
72
+ attention = np.zeros((len(encs), width), dtype=np.int64)
73
+ type_ids = np.zeros((len(encs), width), dtype=np.int64)
74
+ for row, enc in enumerate(encs):
75
+ n = len(enc.ids)
76
+ input_ids[row, :n] = enc.ids
77
+ attention[row, :n] = enc.attention_mask
78
+ type_ids[row, :n] = enc.type_ids
79
+ feed = {"input_ids": input_ids, "attention_mask": attention}
80
+ if "token_type_ids" in self.input_names:
81
+ feed["token_type_ids"] = type_ids
82
+ hidden = np.asarray(self.session.run(None, feed)[0], dtype=np.float32)
83
+ cls = hidden[:, 0, :] if hidden.ndim == 3 else hidden # BGE: CLS pooling
84
+ norms = np.linalg.norm(cls, axis=1, keepdims=True)
85
+ return cls / np.maximum(norms, 1e-9)
86
 
87
+ def encode(self, texts, batch_size=32):
88
+ """Return L2-normalized embeddings for passages, one row per text."""
89
+ texts = list(texts)
90
+ if not texts:
91
+ return np.zeros((0, 384), dtype=np.float32)
92
+ rows = [self._run(texts[i:i + batch_size])
93
+ for i in range(0, len(texts), batch_size)]
94
+ return np.vstack(rows)
95
 
96
+ def encode_query(self, text):
97
+ """Return the L2-normalized embedding for one search query."""
98
+ return self._run([_QUERY_PREFIX + text])[0]
 
 
 
 
99
 
100
 
101
  def build():
102
  chunks = load_chunks()
103
  if not chunks:
104
+ print(f"No processed data in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
105
  return
106
+ print(f"Embedding {len(chunks)} sections with {EMB_REPO} ...")
107
  vectors = Embedder().encode([embed_text(c) for c in chunks])
108
  ids = np.array([c["id"] for c in chunks])
109
  np.savez(EMB_PATH, ids=ids, vectors=vectors)
canlex/eval.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Measure CanLex retrieval quality against a curated question set.
2
+
3
+ Each item in data/eval/questions.json pairs a realistic legal question with the
4
+ provision(s) or case(s) that answer it. This runs every question through the
5
+ retrieval index and reports Hit@k and MRR. Re-run it after any retrieval change
6
+ -- a new reranker, different embeddings, a chunking tweak -- to see whether
7
+ quality moved, and read the "Misses" list to see exactly what to fix.
8
+
9
+ py -m canlex.eval
10
+ """
11
+ import json
12
+ import sys
13
+
14
+ from .config import ROOT
15
+ from .index import LegislationIndex
16
+
17
+ QUESTIONS = ROOT / "data" / "eval" / "questions.json"
18
+ EVAL_TOP_K = 20 # search depth, so ranks past the usual 6 are still visible
19
+
20
+
21
+ def _matches(result, answers):
22
+ """True if a search result is one of the gold answers (act + section).
23
+
24
+ A gold answer is [act, section]; an empty section matches any chunk of that
25
+ act/case (used for case-law answers, whose chunks carry no section number).
26
+ """
27
+ r_acts = {result.get("act_short", "").lower(),
28
+ result.get("act_code", "").lower()}
29
+ r_sec = result.get("section", "")
30
+ for act, section in answers:
31
+ if act.lower() in r_acts and (section == r_sec or section == ""):
32
+ return True
33
+ return False
34
+
35
+
36
def run():
    """Run every curated question through the index and print Hit@k and MRR.

    Reads the question set from ``QUESTIONS``; if the file is absent a notice
    goes to stderr and nothing else happens. For each question the rank of the
    first result matching a gold answer is recorded (0 when none of the
    ``EVAL_TOP_K`` results match). Questions whose gold answer is absent or
    ranked below fifth are listed after the metrics with the top result that
    was returned instead.
    """
    if not QUESTIONS.exists():
        print(f"No question set at {QUESTIONS}.", file=sys.stderr)
        return

    def label(act, section):
        # Append the section marker only when a section exists. Stripping a
        # trailing " s." with str.rstrip would remove a character *set* and so
        # also eat trailing 's'/'.' from the name itself ("Menzies" -> "Menzie").
        return f"{act} s.{section}" if section else act

    def hit(k):
        # Share of questions whose first gold hit is within the top k.
        return sum(1 for r in ranks if 0 < r <= k) / n

    items = json.loads(QUESTIONS.read_text(encoding="utf-8"))
    index = LegislationIndex()
    ranks = []    # rank of the first gold hit per question (0 = miss)
    misses = []
    for item in items:
        answers = [tuple(a) for a in item["answers"]]
        results = index.search(item["query"], top_k=EVAL_TOP_K)
        rank = next((i for i, result in enumerate(results, start=1)
                     if _matches(result, answers)), 0)
        ranks.append(rank)
        if rank == 0 or rank > 5:
            top = results[0] if results else None
            misses.append((item["query"], answers, rank, top))

    n = len(ranks) or 1  # avoid dividing by zero on an empty question set
    mrr = sum(1.0 / r for r in ranks if r) / n
    print(f"CanLex retrieval evaluation -- {len(ranks)} questions\n")
    print(f"  Hit@1:  {hit(1):.2f}")
    print(f"  Hit@3:  {hit(3):.2f}")
    print(f"  Hit@5:  {hit(5):.2f}")
    print(f"  Hit@10: {hit(10):.2f}")
    print(f"  MRR:    {mrr:.2f}")

    if misses:
        print(f"\n{len(misses)} miss(es) -- gold answer ranked >5 or absent:")
        for query, answers, rank, top in misses:
            gold = ", ".join(label(a, s) for a, s in answers)
            where = f"ranked #{rank}" if rank else f"absent (searched {EVAL_TOP_K})"
            got = (label(top.get("act_short", ""), top.get("section", ""))
                   if top else "nothing")
            print(f"  [{where}] {query}")
            print(f"      gold: {gold}  |  top result: {got}")
    print()
77
+
78
+
79
+ if __name__ == "__main__":
80
+ run()
canlex/index.py CHANGED
@@ -5,6 +5,8 @@ import re
5
  import sys
6
  from collections import Counter, defaultdict
7
 
 
 
8
  from .config import PROCESSED_DIR
9
 
10
  K1 = 1.5
@@ -15,10 +17,31 @@ RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
15
 
16
  _TOKEN = re.compile(r"[a-z0-9]+")
17
  _SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
 
20
  def tokenize(text):
21
- return _TOKEN.findall(text.lower())
 
 
 
22
 
23
 
24
  def _section_refs(query):
@@ -35,6 +58,7 @@ class LegislationIndex:
35
  raise RuntimeError(
36
  f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
37
  self._build_bm25()
 
38
  self._load_semantic()
39
  self._load_reranker()
40
 
@@ -44,10 +68,12 @@ class LegislationIndex:
44
  df = defaultdict(int)
45
  for idx, c in enumerate(self.chunks):
46
  # The marginal note (title) is repeated to weight it above body text;
47
- # act_code and section are indexed so codes/numbers are searchable too.
 
 
48
  blob = " ".join((c["marginal_note"], c["marginal_note"], c["heading"],
49
- c["part"], c["division"], c["act_code"], c["section"],
50
- c["text"]))
51
  counts = Counter(tokenize(blob))
52
  self.doc_len.append(sum(counts.values()))
53
  for term, tf in counts.items():
@@ -119,7 +145,7 @@ class LegislationIndex:
119
  return scores
120
 
121
  def _semantic_ranking(self, query):
122
- qv = self.embedder.encode([query])[0]
123
  sims = self.vectors @ qv
124
  order = self._np.argsort(sims)[::-1][:CANDIDATES]
125
  return [int(i) for i in order]
@@ -158,13 +184,21 @@ class LegislationIndex:
158
  return []
159
  scores = {i: fused[i] for i in candidates}
160
 
161
- # Precision stage: the cross-encoder rescores the top candidate pool.
 
 
 
 
 
162
  if self.reranker:
163
  pool = candidates[:RERANK_POOL]
164
- for idx, ce in zip(pool, self.reranker.score(
165
- query, [self._rerank_doc(i) for i in pool])):
166
- scores[idx] = ce
167
- pool.sort(key=lambda i: scores[i], reverse=True)
 
 
 
168
  candidates = pool + candidates[RERANK_POOL:]
169
 
170
  # Explicit section references are pinned to the very top.
@@ -183,6 +217,46 @@ class LegislationIndex:
183
  return c
184
  return None
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  def main():
188
  if len(sys.argv) < 2:
 
5
  import sys
6
  from collections import Counter, defaultdict
7
 
8
+ import snowballstemmer
9
+
10
  from .config import PROCESSED_DIR
11
 
12
  K1 = 1.5
 
17
 
18
  _TOKEN = re.compile(r"[a-z0-9]+")
19
  _SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
20
+ # A cross-reference to another provision -- "section 34", "subsection 25(1)",
21
+ # "paragraph 36(1)(a)", "s. 34" -- capturing the top-level section number.
22
+ _XREF = re.compile(
23
+ r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)",
24
+ re.IGNORECASE)
25
+
26
+
27
+ _STEMMER = snowballstemmer.stemmer("english")
28
+ _STEM_CACHE = {}
29
+
30
+
31
def _stem(word):
    """Return the Snowball stem of ``word``, computing it at most once.

    Results are kept in the module-level cache because legal text repeats the
    same terms heavily.
    """
    if _STEM_CACHE.get(word) is None:
        _STEM_CACHE[word] = _STEMMER.stemWord(word)
    return _STEM_CACHE[word]
38
 
39
 
40
  def tokenize(text):
41
+ """Lower-case, split on word characters, and Snowball-stem each token, so a
42
+ query matches a provision even when their word forms differ -- 'possession'
43
+ vs 'possess', 'reporting' vs 'report', 'importation' vs 'import'."""
44
+ return [_stem(w) for w in _TOKEN.findall(text.lower())]
45
 
46
 
47
  def _section_refs(query):
 
58
  raise RuntimeError(
59
  f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
60
  self._build_bm25()
61
+ self._build_xref()
62
  self._load_semantic()
63
  self._load_reranker()
64
 
 
68
  df = defaultdict(int)
69
  for idx, c in enumerate(self.chunks):
70
  # The marginal note (title) is repeated to weight it above body text;
71
+ # the Act name, code and section are indexed too, so an Act's own
72
+ # terminology (e.g. "controlled substance") and its codes/numbers
73
+ # are searchable even when a section's text omits them.
74
  blob = " ".join((c["marginal_note"], c["marginal_note"], c["heading"],
75
+ c["part"], c["division"], c["act_name"], c["act_code"],
76
+ c["section"], c["text"]))
77
  counts = Counter(tokenize(blob))
78
  self.doc_len.append(sum(counts.values()))
79
  for term, tf in counts.items():
 
145
  return scores
146
 
147
  def _semantic_ranking(self, query):
148
+ qv = self.embedder.encode_query(query)
149
  sims = self.vectors @ qv
150
  order = self._np.argsort(sims)[::-1][:CANDIDATES]
151
  return [int(i) for i in order]
 
184
  return []
185
  scores = {i: fused[i] for i in candidates}
186
 
187
+ # Precision stage: the cross-encoder rescores the top candidate pool, but
188
+ # may only PROMOTE -- each pooled candidate is placed at the better of its
189
+ # fusion rank and its rerank rank, never below its fusion rank. The
190
+ # reranker reliably surfaces a strong answer the fusion ranked low, yet is
191
+ # unreliable on long statutory text (it can score the right section
192
+ # negative), so its power to demote a candidate is deliberately removed.
193
  if self.reranker:
194
  pool = candidates[:RERANK_POOL]
195
+ ce = dict(zip(pool, self.reranker.score(
196
+ query, [self._rerank_doc(i) for i in pool])))
197
+ fusion_rank = {idx: r for r, idx in enumerate(pool)}
198
+ rerank_rank = {idx: r for r, idx in enumerate(
199
+ sorted(pool, key=ce.get, reverse=True))}
200
+ pool.sort(key=lambda i: (min(fusion_rank[i], rerank_rank[i]),
201
+ fusion_rank[i]))
202
  candidates = pool + candidates[RERANK_POOL:]
203
 
204
  # Explicit section references are pinned to the very top.
 
217
  return c
218
  return None
219
 
220
+ def _build_xref(self):
221
+ """Index legislation by (act, section) and locate each Act's definitions
222
+ section, to support cross-reference lookup."""
223
+ self._by_section = {}
224
+ self._defs_section = {}
225
+ for c in self.chunks:
226
+ if c.get("doc_type", "legislation") != "legislation":
227
+ continue
228
+ self._by_section[(c["act_code"], c["section"])] = c
229
+ if c["act_code"] not in self._defs_section and (
230
+ c["marginal_note"].strip().lower() in (
231
+ "definitions", "definition", "interpretation")):
232
+ self._defs_section[c["act_code"]] = c
233
+
234
def related(self, chunk):
    """List provisions of the same Act that bear on ``chunk``.

    Returns [(section, marginal_note), ...]: the Act's definitions section
    first (when there is one and it is not ``chunk`` itself), followed by each
    section number cross-referenced in the chunk's text that exists in the same
    Act, in order of first mention and without repeats. The list stops growing
    once it holds eight entries.

    Legislation chunks only; returns [] for case law, memoranda, etc.
    """
    if chunk.get("doc_type", "legislation") != "legislation":
        return []

    act = chunk["act_code"]
    links = []
    visited = {chunk["section"]}  # never link a provision to itself

    definitions = self._defs_section.get(act)
    if definitions and definitions["section"] not in visited:
        links.append((definitions["section"], definitions["marginal_note"]))
        visited.add(definitions["section"])

    for ref in _XREF.finditer(chunk["text"]):
        number = ref.group(1)
        if number in visited:
            continue
        hit = self._by_section.get((act, number))
        if not hit:
            continue  # the reference does not resolve to a section of this Act
        links.append((number, hit["marginal_note"]))
        visited.add(number)
        if len(links) >= 8:
            break
    return links
259
+
260
 
261
  def main():
262
  if len(sys.argv) < 2:
canlex/server.py CHANGED
@@ -31,6 +31,9 @@ _READONLY = {
31
  GROUNDING_NOTE = (
32
  "ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
33
  "specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
 
 
 
34
  "Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
35
  "CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
36
  "and a court may disagree with them; collective agreements and the National "
@@ -58,7 +61,7 @@ def _index() -> LegislationIndex:
58
  return _INDEX
59
 
60
 
61
- def _format_section(c: dict) -> str:
62
  """Render one chunk (legislation, D-Memo, or agreement) as cited Markdown."""
63
  doc_type = c.get("doc_type", "legislation")
64
  header = f"### {c['citation']} — {c['marginal_note']}".rstrip(" —")
@@ -83,6 +86,11 @@ def _format_section(c: dict) -> str:
83
  "— IRB members apply its reasoning to similar cases or "
84
  "explain why not; persuasive, and subject to revocation "
85
  "or to review by the Federal Court._")
 
 
 
 
 
86
  else:
87
  lines.append("_Court decision — binding precedent depending on the "
88
  "court and jurisdiction; confirm it has not been "
@@ -98,6 +106,9 @@ def _format_section(c: dict) -> str:
98
  lines.append("")
99
  lines.append(c["text"])
100
  lines.append("")
 
 
 
101
  if c["history"]:
102
  if doc_type == "caselaw":
103
  lines.append(f"Also reported: {c['history']}")
@@ -133,8 +144,8 @@ class SearchInput(BaseModel):
133
  default=None,
134
  description="Optional filter by source type: 'legislation' (Acts and "
135
  "regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
136
- "agreements), 'directive' (NJC directives), or 'caselaw' (Supreme Court, "
137
- "Federal Court of Appeal and Federal Court decisions). Omit to search all.",
138
  )
139
 
140
 
@@ -159,9 +170,10 @@ def canlex_search_legislation(params: SearchInput) -> str:
159
  CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
160
  how it applies customs and border law); Treasury Board collective agreements
161
  (currently the FB / Border Services group); National Joint Council directives
162
- (travel, relocation, isolated posts and more); and leading Supreme Court of
163
- Canada, Federal Court of Appeal and Federal Court decisions on immigration,
164
- customs and Charter law. Use this for ANY question about that material. It ranks results by relevance and returns
 
165
  their full text so the answer can cite the actual wording; an explicit section
166
  reference (e.g. "section 34") is always surfaced. Each result is marked with its
167
  source type.
@@ -199,7 +211,7 @@ def canlex_search_legislation(params: SearchInput) -> str:
199
  blocks.append("")
200
  blocks.append("---")
201
  blocks.append("")
202
- blocks.append(_format_section(c))
203
  return "\n".join(blocks)
204
 
205
 
@@ -233,7 +245,7 @@ def canlex_get_section(params: GetSectionInput) -> str:
233
  return (f"Error: no section '{params.section}' found in '{params.act}'. "
234
  f"Loaded Acts: {', '.join(acts) or 'none'}. Check the section number, "
235
  f"or use canlex_search_legislation to locate the provision by topic.")
236
- return GROUNDING_NOTE + "\n\n" + _format_section(section)
237
 
238
 
239
  @mcp.tool(name="canlex_list_acts",
 
31
  GROUNDING_NOTE = (
32
  "ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
33
  "specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
34
+ "When a result lists Related provisions, fetch any that bear on the question "
35
+ "-- the definitions section, an exception, a cross-referenced rule -- with "
36
+ "canlex_get_section before answering. "
37
  "Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
38
  "CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
39
  "and a court may disagree with them; collective agreements and the National "
 
61
  return _INDEX
62
 
63
 
64
+ def _format_section(c: dict, related=None) -> str:
65
  """Render one chunk (legislation, D-Memo, or agreement) as cited Markdown."""
66
  doc_type = c.get("doc_type", "legislation")
67
  header = f"### {c['citation']} — {c['marginal_note']}".rstrip(" —")
 
86
  "— IRB members apply its reasoning to similar cases or "
87
  "explain why not; persuasive, and subject to revocation "
88
  "or to review by the Federal Court._")
89
+ elif "Board" in c["part"]:
90
+ lines.append("_Labour-board decision — a federal administrative "
91
+ "tribunal's ruling; persuasive within the board's own "
92
+ "jurisprudence, and subject to judicial review by the "
93
+ "Federal Court of Appeal._")
94
  else:
95
  lines.append("_Court decision — binding precedent depending on the "
96
  "court and jurisdiction; confirm it has not been "
 
106
  lines.append("")
107
  lines.append(c["text"])
108
  lines.append("")
109
+ if related:
110
+ refs = "; ".join(f"s. {s} ({n})" if n else f"s. {s}" for s, n in related)
111
+ lines.append(f"Related provisions in this Act: {refs}")
112
  if c["history"]:
113
  if doc_type == "caselaw":
114
  lines.append(f"Also reported: {c['history']}")
 
144
  default=None,
145
  description="Optional filter by source type: 'legislation' (Acts and "
146
  "regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
147
+ "agreements), 'directive' (NJC directives), or 'caselaw' (court and "
148
+ "tribunal decisions). Omit to search all.",
149
  )
150
 
151
 
 
170
  CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
171
  how it applies customs and border law); Treasury Board collective agreements
172
  (currently the FB / Border Services group); National Joint Council directives
173
+ (travel, relocation, isolated posts and more); and leading decisions of the
174
+ courts and federal tribunals: the Supreme Court, Federal Court of Appeal and
175
+ Federal Court, the Immigration and Refugee Board, and the FPSLREB and CIRB
176
+ labour boards. Use this for ANY question about that material. It ranks results by relevance and returns
177
  their full text so the answer can cite the actual wording; an explicit section
178
  reference (e.g. "section 34") is always surfaced. Each result is marked with its
179
  source type.
 
211
  blocks.append("")
212
  blocks.append("---")
213
  blocks.append("")
214
+ blocks.append(_format_section(c, index.related(c)))
215
  return "\n".join(blocks)
216
 
217
 
 
245
  return (f"Error: no section '{params.section}' found in '{params.act}'. "
246
  f"Loaded Acts: {', '.join(acts) or 'none'}. Check the section number, "
247
  f"or use canlex_search_legislation to locate the provision by topic.")
248
+ return GROUNDING_NOTE + "\n\n" + _format_section(section, index.related(section))
249
 
250
 
251
  @mcp.tool(name="canlex_list_acts",
data/eval/questions.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {"query": "How soon must the Immigration Division review the detention of a foreign national?", "answers": [["IRPA", "57"]]},
3
+ {"query": "On what security grounds is a foreign national inadmissible to Canada?", "answers": [["IRPA", "34"]]},
4
+ {"query": "When is a permanent resident inadmissible for serious criminality?", "answers": [["IRPA", "36"]]},
5
+ {"query": "What makes a person inadmissible for organized criminality?", "answers": [["IRPA", "37"]]},
6
+ {"query": "Can someone be found inadmissible to Canada for misrepresentation?", "answers": [["IRPA", "40"]]},
7
+ {"query": "Is a foreign national inadmissible on health grounds?", "answers": [["IRPA", "38"]]},
8
+ {"query": "Inadmissibility for violating human or international rights", "answers": [["IRPA", "35"]]},
9
+ {"query": "Can a person be inadmissible to Canada for financial reasons?", "answers": [["IRPA", "39"]]},
10
+ {"query": "Is someone inadmissible because an accompanying family member is inadmissible?", "answers": [["IRPA", "42"]]},
11
+ {"query": "What humanitarian and compassionate relief can the Minister grant a foreign national?", "answers": [["IRPA", "25"], ["IRPA", "25.1"]]},
12
+ {"query": "When can an officer arrest and detain a foreign national without a warrant?", "answers": [["IRPA", "55"]]},
13
+ {"query": "Who prepares a report that a permanent resident is inadmissible?", "answers": [["IRPA", "44"]]},
14
+ {"query": "Must a person appear for an examination when seeking to enter Canada?", "answers": [["IRPA", "18"]]},
15
+ {"query": "What must a person establish to be allowed to enter Canada?", "answers": [["IRPA", "20"]]},
16
+ {"query": "What is the definition of a Convention refugee?", "answers": [["IRPA", "96"]]},
17
+ {"query": "Who qualifies as a person in need of protection?", "answers": [["IRPA", "97"]]},
18
+ {"query": "When is a refugee claim ineligible to be referred to the Refugee Protection Division?", "answers": [["IRPA", "101"]]},
19
+ {"query": "What is a pre-removal risk assessment and who can apply for one?", "answers": [["IRPA", "112"]]},
20
+ {"query": "When does a removal order become enforceable?", "answers": [["IRPA", "48"]]},
21
+ {"query": "Is it an offence to organize the illegal entry of people into Canada?", "answers": [["IRPA", "117"]]},
22
+ {"query": "Must a person report to a customs officer when arriving in Canada?", "answers": [["Customs Act", "11"]]},
23
+ {"query": "What is the duty to report goods imported into Canada?", "answers": [["Customs Act", "12"]]},
24
+ {"query": "Can a customs officer examine imported goods?", "answers": [["Customs Act", "99"]]},
25
+ {"query": "When can a customs officer search a person at the border?", "answers": [["Customs Act", "98"]]},
26
+ {"query": "What happens when goods are seized for a customs contravention?", "answers": [["Customs Act", "110"]]},
27
+ {"query": "What is ascertained forfeiture under the Customs Act?", "answers": [["Customs Act", "124"]]},
28
+ {"query": "When do imported goods become forfeit after a customs contravention?", "answers": [["Customs Act", "122"]]},
29
+ {"query": "How can a person appeal a customs seizure or penalty decision to the Federal Court?", "answers": [["Customs Act", "135"]]},
30
+ {"query": "Advance information about commercial goods before they arrive in Canada", "answers": [["Customs Act", "12.1"]]},
31
+ {"query": "How is the value for duty of imported goods determined?", "answers": [["Customs Act", "46"], ["Customs Act", "47"], ["Customs Act", "48"]]},
32
+ {"query": "How are imported goods classified under the Customs Tariff?", "answers": [["Customs Tariff", "10"]]},
33
+ {"query": "Must travellers report large amounts of currency when crossing the border?", "answers": [["PCMLTFA", "12"]]},
34
+ {"query": "Can an officer seize currency that was not reported at the border?", "answers": [["PCMLTFA", "18"]]},
35
+ {"query": "How does someone appeal the forfeiture of seized currency to the Federal Court?", "answers": [["PCMLTFA", "30"]]},
36
+ {"query": "Is simple possession of a controlled substance an offence?", "answers": [["CDSA", "4"]]},
37
+ {"query": "What is the offence of trafficking in a controlled substance?", "answers": [["CDSA", "5"]]},
38
+ {"query": "Is it an offence to import or export a controlled substance?", "answers": [["CDSA", "6"]]},
39
+ {"query": "Is possession of cannabis an offence?", "answers": [["Cannabis Act", "8"]]},
40
+ {"query": "Can cannabis be imported into or exported from Canada?", "answers": [["Cannabis Act", "11"]]},
41
+ {"query": "When can a peace officer arrest a person without a warrant?", "answers": [["Criminal Code", "495"]]},
42
+ {"query": "What right does a person have to access their own personal information held by a government institution?", "answers": [["Privacy Act", "12"]]},
43
+ {"query": "When may a government institution disclose someone's personal information?", "answers": [["Privacy Act", "8"]]},
44
+ {"query": "Can an employee refuse to do work that presents a danger?", "answers": [["Canada Labour Code", "128"]]},
45
+ {"query": "What are the standard hours of work for an employee?", "answers": [["Canada Labour Code", "169"]]},
46
+ {"query": "What is the standard of review of an administrative decision on judicial review?", "answers": [["Vavilov", ""]]},
47
+ {"query": "How does the Refugee Appeal Division review a decision of the Refugee Protection Division?", "answers": [["Huruglica", ""]]},
48
+ {"query": "To get back currency seized at the border, what must the claimant show about the money?", "answers": [["Sellathurai", ""]]}
49
+ ]
requirements.txt CHANGED
@@ -2,10 +2,10 @@
2
  # py -m venv .venv
3
  # .venv\Scripts\python.exe -m pip install -r requirements.txt
4
  mcp>=1.2 # MCP server (server.py)
5
- model2vec>=0.6 # local semantic embeddings (embed.py)
6
  numpy>=2.0 # vector math for hybrid retrieval (index.py)
7
- onnxruntime>=1.20 # cross-encoder reranker runtime (rerank.py)
8
  huggingface-hub>=0.20 # one-time model downloads (embed.py, rerank.py)
9
- tokenizers>=0.20 # cross-encoder tokenization (rerank.py)
10
  beautifulsoup4>=4.12 # parse CBSA D-Memoranda HTML (dmemo.py)
11
  pypdf>=4.0 # extract text from PDF-only D-Memoranda (dmemo.py)
 
 
2
  # py -m venv .venv
3
  # .venv\Scripts\python.exe -m pip install -r requirements.txt
4
  mcp>=1.2 # MCP server (server.py)
 
5
  numpy>=2.0 # vector math for hybrid retrieval (index.py)
6
+ onnxruntime>=1.20 # embedding + reranker model runtime (embed.py, rerank.py)
7
  huggingface-hub>=0.20 # one-time model downloads (embed.py, rerank.py)
8
+ tokenizers>=0.20 # tokenization for the embedding and reranker models
9
  beautifulsoup4>=4.12 # parse CBSA D-Memoranda HTML (dmemo.py)
10
  pypdf>=4.0 # extract text from PDF-only D-Memoranda (dmemo.py)
11
+ snowballstemmer>=2.2 # English stemmer for keyword search (index.py)