Spaces:

Beemer0
/

CanLex

Running

Beemer Claude Opus 4.7 committed 6 days ago

Commit

2966f10

1 Parent(s): b8c217b

Upgrade retrieval: bge-small embeddings + promote-only reranking

Replace the model2vec static embedding with bge-small-en-v1.5, a local
transformer sentence-embedder run as ONNX on CPU (key-free). The reranker
now only promotes candidates -- placing each at the better of its fusion
and rerank position, never lower -- because the cross-encoder scores long
statutory text unreliably and was burying correct results.

Also includes intra-Act cross-reference and definition linking in search
results, a 47-question retrieval eval harness (canlex/eval.py), and
FPSLREB/CIRB ingestion wiring in caselaw.py (decisions not yet fetched).

Eval: Hit@5 0.74 -> 0.89, Hit@10 0.81 -> 0.94, MRR 0.60 -> 0.64.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (7) hide show

canlex/caselaw.py +93 -10
canlex/embed.py +72 -17
canlex/eval.py +80 -0
canlex/index.py +84 -10
canlex/server.py +20 -8
data/eval/questions.json +49 -0
requirements.txt +3 -3

canlex/caselaw.py CHANGED Viewed

@@ -13,15 +13,16 @@ deliberately not a comprehensive scrape.
 import json
 import re
 import time
 import urllib.request
 from bs4 import BeautifulSoup
 from .config import PROCESSED_DIR, RAW_DIR
-# Each court's official Lexum database: (display name, item-URL template). All
-# three sites behave identically -- same iframe trick, metadata block and
-# bracketed paragraph numbers -- so one parser serves them all.
 COURTS = {
     "scc": ("Supreme Court of Canada",
             "https://decisions.scc-csc.ca/scc-csc/scc-csc/en/item/{id}/index.do"),
@@ -29,6 +30,11 @@ COURTS = {
             "https://decisions.fca-caf.gc.ca/fca-caf/decisions/en/item/{id}/index.do"),
     "fc": ("Federal Court",
            "https://decisions.fct-cf.gc.ca/fc-cf/decisions/en/item/{id}/index.do"),
 }
 _RAW = RAW_DIR / "caselaw"
 OUT = PROCESSED_DIR / "caselaw.json"
@@ -38,7 +44,7 @@ OUT = PROCESSED_DIR / "caselaw.json"
 # from the throttle below and from caching every fetched page on disk.
 _UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
-_THROTTLE = 2.0       # seconds between live fetches
 _CHUNK_CHARS = 1800   # target characters per chunk
 # Marks the post-reasons apparatus (appended legislation, solicitors list),
@@ -200,6 +206,71 @@ CASES = [
     {"court": "fc", "id": 62413, "short": "Da Huang",
      "topic": "PCMLTFA currency forfeiture; partial return of seized funds "
               "where only part is shown to be of legitimate origin"},
 ]
 # In-force jurisprudential guides of the Immigration and Refugee Board's Refugee
@@ -224,14 +295,25 @@ IRB_GUIDES = [
 def _get(url, cache_name):
-    """Fetch a page, caching the raw HTML under data/raw/caselaw."""
     cache = _RAW / cache_name
     if cache.exists():
         return cache.read_text(encoding="utf-8")
     req = urllib.request.Request(url, headers={"User-Agent": _UA})
-    time.sleep(_THROTTLE)
-    with urllib.request.urlopen(req, timeout=60) as resp:
-        text = resp.read().decode("utf-8", errors="replace")
     _RAW.mkdir(parents=True, exist_ok=True)
     cache.write_text(text, encoding="utf-8")
     return text
@@ -398,9 +480,10 @@ def _decision_chunks(case, soup):
     court_name, item_tmpl = COURTS[case["court"]]
     name, fields = _metadata(soup)
     name = name or case["short"]
-    cite = fields.get("neutral citation") or fields.get("report") or ""
     report = fields.get("report", "")
-    date = fields.get("date", "")
     citation = f"{name}, {cite}" if cite else name
     item_url = item_tmpl.format(id=case["id"])
     modern, paras = _paragraphs(soup)

 import json
 import re
 import time
+import urllib.error
 import urllib.request
 from bs4 import BeautifulSoup
 from .config import PROCESSED_DIR, RAW_DIR
+# Each court or tribunal's Lexum decisions database: (display name, item-URL
+# template). All five run the same Lexum platform -- same iframe trick, metadata
+# block and bracketed paragraph numbers -- so one parser serves them all.
 COURTS = {
     "scc": ("Supreme Court of Canada",
             "https://decisions.scc-csc.ca/scc-csc/scc-csc/en/item/{id}/index.do"),
             "https://decisions.fca-caf.gc.ca/fca-caf/decisions/en/item/{id}/index.do"),
     "fc": ("Federal Court",
            "https://decisions.fct-cf.gc.ca/fc-cf/decisions/en/item/{id}/index.do"),
+    "fpslreb": ("Federal Public Sector Labour Relations and Employment Board",
+                "https://decisions.fpslreb-crtespf.gc.ca/fpslreb-crtespf/d/en/"
+                "item/{id}/index.do"),
+    "cirb": ("Canada Industrial Relations Board",
+             "https://decisia.lexum.com/cirb-ccri/cirb-ccri/en/item/{id}/index.do"),
 }
 _RAW = RAW_DIR / "caselaw"
 OUT = PROCESSED_DIR / "caselaw.json"
 # from the throttle below and from caching every fetched page on disk.
 _UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+_THROTTLE = 6.0       # seconds between live fetches (Lexum rate-limits hard)
 _CHUNK_CHARS = 1800   # target characters per chunk
 # Marks the post-reasons apparatus (appended legislation, solicitors list),
     {"court": "fc", "id": 62413, "short": "Da Huang",
      "topic": "PCMLTFA currency forfeiture; partial return of seized funds "
               "where only part is shown to be of legitimate origin"},
+    # --- Federal Public Sector Labour Relations and Employment Board ---
+    {"court": "fpslreb", "id": 520990, "short": "Menzies",
+     "topic": "Progressive discipline of a CBSA border services officer; the "
+              "lock-step approach to discipline rejected"},
+    {"court": "fpslreb", "id": 521231, "short": "Kline",
+     "topic": "Bad-faith termination of a CBSA employee; reinstatement and "
+              "damages"},
+    {"court": "fpslreb", "id": 521195, "short": "Sousa Dias",
+     "topic": "Discipline and termination grievance of a CBSA employee"},
+    {"court": "fpslreb", "id": 521082, "short": "Anderson",
+     "topic": "CBSA grievance; interpretation of the FB-group collective "
+              "agreement"},
+    {"court": "fpslreb", "id": 520948, "short": "Burlacu",
+     "topic": "CBSA; occupational health and safety and staffing"},
+    {"court": "fpslreb", "id": 483604, "short": "Malik",
+     "topic": "Discipline and termination grievance of a CBSA employee"},
+    {"court": "fpslreb", "id": 500554, "short": "Andruszkiewicz",
+     "topic": "Unfair labour practice complaint involving the CBSA"},
+    {"court": "fpslreb", "id": 359013, "short": "PSAC v TB (CBSA)",
+     "topic": "Policy grievance; collective agreement interpretation at the "
+              "CBSA"},
+    {"court": "fpslreb", "id": 359065, "short": "Martin-Ivie",
+     "topic": "Occupational health and safety; the arming and safety of CBSA "
+              "border officers"},
+    {"court": "fpslreb", "id": 358886, "short": "Basra (2012)",
+     "topic": "Discipline and termination grievance; a later proceeding in "
+              "the leading Basra line"},
+    {"court": "fpslreb", "id": 358025, "short": "Basra (2007)",
+     "topic": "The foundational Basra decision on discipline and the burden "
+              "of proof in a grievance"},
+    {"court": "fpslreb", "id": 358150, "short": "Quadrini",
+     "topic": "Unfair labour practice and freedom of expression in the "
+              "federal public service"},
+    {"court": "fpslreb", "id": 358180, "short": "Pepper",
+     "topic": "Discipline and termination; frequently-cited principles on "
+              "just cause"},
+    {"court": "fpslreb", "id": 358097, "short": "Richmond",
+     "topic": "Classification grievance in the federal public service"},
+    {"court": "fpslreb", "id": 358890, "short": "Baldasaro and Thiessen",
+     "topic": "Hours of work and overtime under a collective agreement"},
+    {"court": "fpslreb", "id": 358203, "short": "PSAC v TB (pay)",
+     "topic": "Collective agreement and pay administration policy grievance"},
+    {"court": "fpslreb", "id": 360456, "short": "Kinhnicki",
+     "topic": "Occupational health and safety; a refusal to work in a customs "
+              "context"},
+    # --- Canada Industrial Relations Board ---
+    {"court": "cirb", "id": 519772, "short": "Watson",
+     "topic": "Duty of fair representation and a mandatory vaccination policy "
+              "under the Canada Labour Code"},
+    {"court": "cirb", "id": 5478, "short": "McRaeJackson",
+     "topic": "The leading test for the duty of fair representation under "
+              "s. 37 of the Canada Labour Code"},
+    {"court": "cirb", "id": 5491, "short": "Securicor",
+     "topic": "Certification and bargaining-unit determination under the "
+              "Canada Labour Code"},
+    {"court": "cirb", "id": 5593, "short": "Dover Industries",
+     "topic": "Successor rights on the sale of a business under the Canada "
+              "Labour Code"},
+    {"court": "cirb", "id": 301063, "short": "Swissport",
+     "topic": "Unfair labour practice complaint under the Canada Labour Code"},
+    {"court": "cirb", "id": 5599, "short": "Cooney Transport",
+     "topic": "Related-employer (common-employer) declaration under the "
+              "Canada Labour Code"},
 ]
 # In-force jurisprudential guides of the Immigration and Refugee Board's Refugee
 def _get(url, cache_name):
+    """Fetch a page, caching the raw HTML under data/raw/caselaw.
+    Retries once on HTTP 403/429 -- the Lexum hosts rate-limit by IP.
+    A cache hit returns the stored text immediately: no request is made and
+    no throttle delay is paid. Any other HTTP error, or a second 403/429,
+    propagates to the caller as urllib.error.HTTPError and nothing is cached.
+    """
     cache = _RAW / cache_name
     if cache.exists():
         return cache.read_text(encoding="utf-8")
     req = urllib.request.Request(url, headers={"User-Agent": _UA})
+    text = None
+    for attempt in range(2):
+        # Wait _THROTTLE seconds before the first request; back off for 25 s
+        # before the single retry.
+        time.sleep(_THROTTLE if attempt == 0 else 25.0)
+        try:
+            with urllib.request.urlopen(req, timeout=60) as resp:
+                # Undecodable bytes are replaced rather than raising.
+                text = resp.read().decode("utf-8", errors="replace")
+            break
+        except urllib.error.HTTPError as exc:
+            # Only a rate-limit style refusal on the first attempt is retried.
+            if exc.code in (403, 429) and attempt == 0:
+                continue
+            raise
+    # Reached only after a successful fetch, so text is a str here.
     _RAW.mkdir(parents=True, exist_ok=True)
     cache.write_text(text, encoding="utf-8")
     return text
     court_name, item_tmpl = COURTS[case["court"]]
     name, fields = _metadata(soup)
     name = name or case["short"]
+    cite = (fields.get("neutral citation") or fields.get("citation")
+            or fields.get("report") or "")
     report = fields.get("report", "")
+    date = fields.get("date") or fields.get("decision rendered") or ""
     citation = f"{name}, {cite}" if cite else name
     item_url = item_tmpl.format(id=case["id"])
     modern, paras = _paragraphs(soup)

canlex/embed.py CHANGED Viewed

@@ -1,13 +1,26 @@
-"""Build semantic embeddings for ingested legislation chunks (local, key-free)."""
 import json
 import numpy as np
 from .config import PROCESSED_DIR
-MODEL_NAME = "minishlab/potion-retrieval-32M"
 EMB_PATH = PROCESSED_DIR / "embeddings.npz"
-_MAX_BODY = 2000  # cap embedded body text so long sections stay topically focused
 def load_chunks():
@@ -22,33 +35,75 @@ def embed_text(chunk):
     note = chunk["marginal_note"]
     body = chunk["text"][:_MAX_BODY]
     # The marginal note (section title) is the strongest topical signal, so it
-    # is repeated to weight it up in the mean-pooled static embedding.
     parts = [chunk["act_short"], note, note, chunk["heading"], body]
     return " . ".join(p for p in parts if p)
 class Embedder:
-    """Local static-embedding model (model2vec): no API key, runs offline once cached."""
-    def __init__(self, model_name=MODEL_NAME):
-        from model2vec import StaticModel
-        self.model = StaticModel.from_pretrained(model_name)
-    def encode(self, texts):
-        """Return L2-normalized float32 vectors, one row per input text."""
-        vecs = np.asarray(self.model.encode(list(texts)), dtype=np.float32)
-        if vecs.ndim == 1:
-            vecs = vecs.reshape(1, -1)
-        norms = np.linalg.norm(vecs, axis=1, keepdims=True)
-        return vecs / np.maximum(norms, 1e-9)
 def build():
     chunks = load_chunks()
     if not chunks:
-        print(f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
         return
-    print(f"Embedding {len(chunks)} sections with {MODEL_NAME} ...")
     vectors = Embedder().encode([embed_text(c) for c in chunks])
     ids = np.array([c["id"] for c in chunks])
     np.savez(EMB_PATH, ids=ids, vectors=vectors)

+"""Build semantic embeddings for ingested chunks (local, key-free).
+Uses BAAI's bge-small-en-v1.5 sentence-embedding model as ONNX, run on CPU via
+onnxruntime -- no API key. A transformer embedding has far stronger retrieval
+recall than a static one: it can connect a natural-language question to a
+provision even when the two share few exact words.
+"""
 import json
 import numpy as np
+import onnxruntime as ort
+from huggingface_hub import hf_hub_download
+from tokenizers import Tokenizer
 from .config import PROCESSED_DIR
+EMB_REPO = "Xenova/bge-small-en-v1.5"
 EMB_PATH = PROCESSED_DIR / "embeddings.npz"
+_MAX_TOKENS = 512
+_MAX_BODY = 2000   # cap embedded body text so long sections stay topically focused
+# bge-small retrieval: the query is prefixed with this instruction; passages
+# are embedded without it. The asymmetry is how the model was trained.
+_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
 def load_chunks():
     note = chunk["marginal_note"]
     body = chunk["text"][:_MAX_BODY]
     # The marginal note (section title) is the strongest topical signal, so it
+    # is repeated to emphasise it.
     parts = [chunk["act_short"], note, note, chunk["heading"], body]
     return " . ".join(p for p in parts if p)
 class Embedder:
+    """Local transformer sentence-embedder: bge-small-en-v1.5 as ONNX on CPU.
+    No API key; the model is downloaded once and cached. Produces L2-normalized
+    vectors, so a dot product between them is cosine similarity.
+    """
+    def __init__(self):
+        """Fetch the ONNX model and tokenizer and open a CPU inference session.
+        Raises RuntimeError if neither ONNX export can be fetched from
+        EMB_REPO; the last download error is attached as its cause.
+        """
+        model_path = None
+        last_error = None
+        # Try model_quantized.onnx first, then fall back to model.onnx.
+        for name in ("onnx/model_quantized.onnx", "onnx/model.onnx"):
+            try:
+                model_path = hf_hub_download(EMB_REPO, name)
+                break
+            except Exception as exc:
+                # Best-effort fallback: remember why this candidate failed so
+                # the final error can report it instead of hiding it.
+                last_error = exc
+        if model_path is None:
+            raise RuntimeError(
+                f"Could not download an ONNX model from {EMB_REPO}."
+            ) from last_error
+        tok_path = hf_hub_download(EMB_REPO, "tokenizer.json")
+        self.session = ort.InferenceSession(model_path,
+                                            providers=["CPUExecutionProvider"])
+        # Remembered so _run only feeds inputs this export declares.
+        self.input_names = {i.name for i in self.session.get_inputs()}
+        self.tokenizer = Tokenizer.from_file(tok_path)
+        self.tokenizer.enable_truncation(max_length=_MAX_TOKENS)
+    def _run(self, texts):
+        """Tokenize, run the encoder, CLS-pool and L2-normalize one batch.
+        Expects a non-empty batch; returns one float32 row per input text.
+        """
+        encs = self.tokenizer.encode_batch(list(texts))
+        # Right-pad every sequence with zeros to the longest in the batch; the
+        # attention mask stays 0 over the padding.
+        width = max(len(e.ids) for e in encs)
+        input_ids = np.zeros((len(encs), width), dtype=np.int64)
+        attention = np.zeros((len(encs), width), dtype=np.int64)
+        type_ids = np.zeros((len(encs), width), dtype=np.int64)
+        for row, enc in enumerate(encs):
+            n = len(enc.ids)
+            input_ids[row, :n] = enc.ids
+            attention[row, :n] = enc.attention_mask
+            type_ids[row, :n] = enc.type_ids
+        feed = {"input_ids": input_ids, "attention_mask": attention}
+        if "token_type_ids" in self.input_names:
+            feed["token_type_ids"] = type_ids
+        hidden = np.asarray(self.session.run(None, feed)[0], dtype=np.float32)
+        # A 3-D output is per-token states, so take the first ([CLS]) token;
+        # a 2-D output is used as-is.
+        cls = hidden[:, 0, :] if hidden.ndim == 3 else hidden   # BGE: CLS pooling
+        norms = np.linalg.norm(cls, axis=1, keepdims=True)
+        # Clamp the norm so an all-zero vector cannot divide by zero.
+        return cls / np.maximum(norms, 1e-9)
+    def encode(self, texts, batch_size=32):
+        """Return L2-normalized embeddings for passages, one row per text.
+        Passages are embedded without the query prefix. An empty input yields
+        an array of shape (0, 384).
+        """
+        texts = list(texts)
+        if not texts:
+            return np.zeros((0, 384), dtype=np.float32)
+        rows = [self._run(texts[i:i + batch_size])
+                for i in range(0, len(texts), batch_size)]
+        return np.vstack(rows)
+    def encode_query(self, text):
+        """Return the L2-normalized embedding for one search query.
+        The query is prefixed with _QUERY_PREFIX before encoding.
+        """
+        return self._run([_QUERY_PREFIX + text])[0]
 def build():
     chunks = load_chunks()
     if not chunks:
+        print(f"No processed data in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
         return
+    print(f"Embedding {len(chunks)} sections with {EMB_REPO} ...")
     vectors = Embedder().encode([embed_text(c) for c in chunks])
     ids = np.array([c["id"] for c in chunks])
     np.savez(EMB_PATH, ids=ids, vectors=vectors)

canlex/eval.py ADDED Viewed

	@@ -0,0 +1,80 @@

+"""Measure CanLex retrieval quality against a curated question set.
+Each item in data/eval/questions.json pairs a realistic legal question with the
+provision(s) or case(s) that answer it. This runs every question through the
+retrieval index and reports Hit@k and MRR. Re-run it after any retrieval change
+-- a new reranker, different embeddings, a chunking tweak -- to see whether
+quality moved, and read the "Misses" list to see exactly what to fix.
+    py -m canlex.eval
+"""
+import json
+import sys
+from .config import ROOT
+from .index import LegislationIndex
+QUESTIONS = ROOT / "data" / "eval" / "questions.json"
+EVAL_TOP_K = 20      # search depth, so ranks past the usual 6 are still visible
+def _matches(result, answers):
+    """Report whether a search result is one of the gold answers.
+    Each gold answer is an (act, section) pair. The act is compared
+    case-insensitively against the result's act_short and act_code; an empty
+    gold section accepts any chunk of that act or case (case-law chunks carry
+    no section number).
+    """
+    found_section = result.get("section", "")
+    labels = {result.get(key, "").lower() for key in ("act_short", "act_code")}
+    return any(gold_act.lower() in labels and gold_sec in ("", found_section)
+               for gold_act, gold_sec in answers)
+def _cite(act, section):
+    """Label an (act, section) pair for the report: 'IRPA s.34', or just the
+    act or case name when there is no section."""
+    # Built conditionally rather than with str.rstrip(" s."): rstrip removes a
+    # set of characters, so it would also eat the final 's' of a name such as
+    # "Menzies" or "Customs".
+    return f"{act} s.{section}" if section else act
+def run():
+    """Run every question through the index and print Hit@k, MRR and misses.
+    Reads QUESTIONS, searches EVAL_TOP_K deep for each query, and records the
+    1-based rank of the first gold hit (0 when none is found). A question is
+    listed as a miss when its gold answer is absent or ranked below 5. Prints
+    a notice to stderr and returns if the question file does not exist.
+    """
+    if not QUESTIONS.exists():
+        print(f"No question set at {QUESTIONS}.", file=sys.stderr)
+        return
+    items = json.loads(QUESTIONS.read_text(encoding="utf-8"))
+    index = LegislationIndex()
+    ranks = []          # rank of the first gold hit per question (0 = miss)
+    misses = []
+    for item in items:
+        answers = [tuple(a) for a in item["answers"]]
+        results = index.search(item["query"], top_k=EVAL_TOP_K)
+        rank = next((i for i, result in enumerate(results, start=1)
+                     if _matches(result, answers)), 0)
+        ranks.append(rank)
+        if rank == 0 or rank > 5:
+            top = results[0] if results else None
+            misses.append((item["query"], answers, rank, top))
+    n = len(ranks) or 1     # avoid dividing by zero on an empty question set
+    def hit(k):
+        return sum(1 for r in ranks if 0 < r <= k) / n
+    mrr = sum(1.0 / r for r in ranks if r) / n
+    print(f"CanLex retrieval evaluation -- {len(ranks)} questions\n")
+    print(f"  Hit@1:   {hit(1):.2f}")
+    print(f"  Hit@3:   {hit(3):.2f}")
+    print(f"  Hit@5:   {hit(5):.2f}")
+    print(f"  Hit@10:  {hit(10):.2f}")
+    print(f"  MRR:     {mrr:.2f}")
+    if misses:
+        print(f"\n{len(misses)} miss(es) -- gold answer ranked >5 or absent:")
+        for query, answers, rank, top in misses:
+            gold = ", ".join(_cite(a, s) for a, s in answers)
+            where = f"ranked #{rank}" if rank else f"absent (searched {EVAL_TOP_K})"
+            got = (_cite(top.get("act_short", ""), top.get("section", ""))
+                   if top else "nothing")
+            print(f"  [{where}] {query}")
+            print(f"      gold: {gold}   |   top result: {got}")
+    print()
+if __name__ == "__main__":
+    run()

canlex/index.py CHANGED Viewed

@@ -5,6 +5,8 @@ import re
 import sys
 from collections import Counter, defaultdict
 from .config import PROCESSED_DIR
 K1 = 1.5
@@ -15,10 +17,31 @@ RERANK_POOL = 50    # top fused candidates the cross-encoder rescores
 _TOKEN = re.compile(r"[a-z0-9]+")
 _SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
 def tokenize(text):
-    return _TOKEN.findall(text.lower())
 def _section_refs(query):
@@ -35,6 +58,7 @@ class LegislationIndex:
             raise RuntimeError(
                 f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
         self._build_bm25()
         self._load_semantic()
         self._load_reranker()
@@ -44,10 +68,12 @@ class LegislationIndex:
         df = defaultdict(int)
         for idx, c in enumerate(self.chunks):
             # The marginal note (title) is repeated to weight it above body text;
-            # act_code and section are indexed so codes/numbers are searchable too.
             blob = " ".join((c["marginal_note"], c["marginal_note"], c["heading"],
-                             c["part"], c["division"], c["act_code"], c["section"],
-                             c["text"]))
             counts = Counter(tokenize(blob))
             self.doc_len.append(sum(counts.values()))
             for term, tf in counts.items():
@@ -119,7 +145,7 @@ class LegislationIndex:
         return scores
     def _semantic_ranking(self, query):
-        qv = self.embedder.encode([query])[0]
         sims = self.vectors @ qv
         order = self._np.argsort(sims)[::-1][:CANDIDATES]
         return [int(i) for i in order]
@@ -158,13 +184,21 @@ class LegislationIndex:
             return []
         scores = {i: fused[i] for i in candidates}
-        # Precision stage: the cross-encoder rescores the top candidate pool.
         if self.reranker:
             pool = candidates[:RERANK_POOL]
-            for idx, ce in zip(pool, self.reranker.score(
-                    query, [self._rerank_doc(i) for i in pool])):
-                scores[idx] = ce
-            pool.sort(key=lambda i: scores[i], reverse=True)
             candidates = pool + candidates[RERANK_POOL:]
         # Explicit section references are pinned to the very top.
@@ -183,6 +217,46 @@ class LegislationIndex:
                 return c
         return None
 def main():
     if len(sys.argv) < 2:

 import sys
 from collections import Counter, defaultdict
+import snowballstemmer
 from .config import PROCESSED_DIR
 K1 = 1.5
 _TOKEN = re.compile(r"[a-z0-9]+")
 _SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
+# A cross-reference to another provision -- "section 34", "subsection 25(1)",
+# "paragraph 36(1)(a)", "s. 34" -- capturing the top-level section number.
+_XREF = re.compile(
+    r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)",
+    re.IGNORECASE)
+_STEMMER = snowballstemmer.stemmer("english")
+_STEM_CACHE = {}
+def _stem(word):
+    """Return the Snowball stem of a word, computing it only on first sight.
+    Results are kept in _STEM_CACHE because legal text repeats terms heavily.
+    """
+    if word not in _STEM_CACHE:
+        _STEM_CACHE[word] = _STEMMER.stemWord(word)
+    return _STEM_CACHE[word]
 def tokenize(text):
+    """Turn text into a list of stemmed tokens.
+    The text is lower-cased, cut into runs matched by _TOKEN, and each run is
+    Snowball-stemmed, so differing word forms still match -- 'possession' vs
+    'possess', 'reporting' vs 'report', 'importation' vs 'import'."""
+    words = _TOKEN.findall(text.lower())
+    return list(map(_stem, words))
 def _section_refs(query):
             raise RuntimeError(
                 f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
         self._build_bm25()
+        self._build_xref()
         self._load_semantic()
         self._load_reranker()
         df = defaultdict(int)
         for idx, c in enumerate(self.chunks):
             # The marginal note (title) is repeated to weight it above body text;
+            # the Act name, code and section are indexed too, so an Act's own
+            # terminology (e.g. "controlled substance") and its codes/numbers
+            # are searchable even when a section's text omits them.
             blob = " ".join((c["marginal_note"], c["marginal_note"], c["heading"],
+                             c["part"], c["division"], c["act_name"], c["act_code"],
+                             c["section"], c["text"]))
             counts = Counter(tokenize(blob))
             self.doc_len.append(sum(counts.values()))
             for term, tf in counts.items():
         return scores
     def _semantic_ranking(self, query):
+        """Return up to CANDIDATES row indices of self.vectors, ordered from
+        most to least similar to the query embedding."""
+        # The query goes through encode_query, not the passage encoder.
+        qv = self.embedder.encode_query(query)
+        # One dot product per stored vector.
+        # NOTE(review): presumably rows of self.vectors line up with
+        # self.chunks -- confirm against _load_semantic.
         sims = self.vectors @ qv
+        # argsort is ascending, so reverse it before taking the top slice.
         order = self._np.argsort(sims)[::-1][:CANDIDATES]
         return [int(i) for i in order]
             return []
         scores = {i: fused[i] for i in candidates}
+        # Precision stage: the cross-encoder rescores the top candidate pool, but
+        # may only PROMOTE -- each pooled candidate is placed at the better of its
+        # fusion rank and its rerank rank, never below its fusion rank. The
+        # reranker reliably surfaces a strong answer the fusion ranked low, yet is
+        # unreliable on long statutory text (it can score the right section
+        # negative), so its power to demote a candidate is deliberately removed.
         if self.reranker:
             pool = candidates[:RERANK_POOL]
+            ce = dict(zip(pool, self.reranker.score(
+                query, [self._rerank_doc(i) for i in pool])))
+            fusion_rank = {idx: r for r, idx in enumerate(pool)}
+            rerank_rank = {idx: r for r, idx in enumerate(
+                sorted(pool, key=ce.get, reverse=True))}
+            pool.sort(key=lambda i: (min(fusion_rank[i], rerank_rank[i]),
+                                     fusion_rank[i]))
             candidates = pool + candidates[RERANK_POOL:]
         # Explicit section references are pinned to the very top.
                 return c
         return None
+    def _build_xref(self):
+        """Build the lookup tables behind cross-reference linking.
+        self._by_section maps (act_code, section) to its legislation chunk;
+        self._defs_section maps act_code to the first chunk of that Act whose
+        marginal note is a definitions or interpretation heading. Chunks that
+        are not legislation are ignored.
+        """
+        by_section = self._by_section = {}
+        defs_section = self._defs_section = {}
+        defs_titles = ("definitions", "definition", "interpretation")
+        for chunk in self.chunks:
+            if chunk.get("doc_type", "legislation") == "legislation":
+                act = chunk["act_code"]
+                by_section[(act, chunk["section"])] = chunk
+                if act in defs_section:
+                    continue    # keep the first definitions section seen
+                if chunk["marginal_note"].strip().lower() in defs_titles:
+                    defs_section[act] = chunk
+    def related(self, chunk):
+        """List provisions of the same Act that bear on this one.
+        Returns [(section, marginal_note), ...]: the Act's definitions section
+        first (unless it is this chunk), then each section cross-referenced in
+        the chunk's text that exists in the same Act, in order of appearance
+        and without repeats. Scanning stops once the list holds 8 entries.
+        Only legislation chunks have links; anything else returns [].
+        """
+        if chunk.get("doc_type", "legislation") != "legislation":
+            return []
+        act = chunk["act_code"]
+        visited = {chunk["section"]}    # never link a provision to itself
+        links = []
+        definitions = self._defs_section.get(act)
+        if definitions and definitions["section"] not in visited:
+            visited.add(definitions["section"])
+            links.append((definitions["section"], definitions["marginal_note"]))
+        for number in (m.group(1) for m in _XREF.finditer(chunk["text"])):
+            if number not in visited:
+                hit = self._by_section.get((act, number))
+                if hit:
+                    visited.add(number)
+                    links.append((number, hit["marginal_note"]))
+                if len(links) >= 8:
+                    break
+        return links
 def main():
     if len(sys.argv) < 2:

canlex/server.py CHANGED Viewed

@@ -31,6 +31,9 @@ _READONLY = {
 GROUNDING_NOTE = (
     "ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
     "specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
     "Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
     "CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
     "and a court may disagree with them; collective agreements and the National "
@@ -58,7 +61,7 @@ def _index() -> LegislationIndex:
     return _INDEX
-def _format_section(c: dict) -> str:
     """Render one chunk (legislation, D-Memo, or agreement) as cited Markdown."""
     doc_type = c.get("doc_type", "legislation")
     header = f"### {c['citation']} — {c['marginal_note']}".rstrip(" —")
@@ -83,6 +86,11 @@ def _format_section(c: dict) -> str:
                          "— IRB members apply its reasoning to similar cases or "
                          "explain why not; persuasive, and subject to revocation "
                          "or to review by the Federal Court._")
         else:
             lines.append("_Court decision — binding precedent depending on the "
                          "court and jurisdiction; confirm it has not been "
@@ -98,6 +106,9 @@ def _format_section(c: dict) -> str:
     lines.append("")
     lines.append(c["text"])
     lines.append("")
     if c["history"]:
         if doc_type == "caselaw":
             lines.append(f"Also reported: {c['history']}")
@@ -133,8 +144,8 @@ class SearchInput(BaseModel):
         default=None,
         description="Optional filter by source type: 'legislation' (Acts and "
         "regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
-        "agreements), 'directive' (NJC directives), or 'caselaw' (Supreme Court, "
-        "Federal Court of Appeal and Federal Court decisions). Omit to search all.",
     )
@@ -159,9 +170,10 @@ def canlex_search_legislation(params: SearchInput) -> str:
     CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
     how it applies customs and border law); Treasury Board collective agreements
     (currently the FB / Border Services group); National Joint Council directives
-    (travel, relocation, isolated posts and more); and leading Supreme Court of
-    Canada, Federal Court of Appeal and Federal Court decisions on immigration,
-    customs and Charter law. Use this for ANY question about that material. It ranks results by relevance and returns
     their full text so the answer can cite the actual wording; an explicit section
     reference (e.g. "section 34") is always surfaced. Each result is marked with its
     source type.
@@ -199,7 +211,7 @@ def canlex_search_legislation(params: SearchInput) -> str:
         blocks.append("")
         blocks.append("---")
         blocks.append("")
-        blocks.append(_format_section(c))
     return "\n".join(blocks)
@@ -233,7 +245,7 @@ def canlex_get_section(params: GetSectionInput) -> str:
         return (f"Error: no section '{params.section}' found in '{params.act}'. "
                 f"Loaded Acts: {', '.join(acts) or 'none'}. Check the section number, "
                 f"or use canlex_search_legislation to locate the provision by topic.")
-    return GROUNDING_NOTE + "\n\n" + _format_section(section)
 @mcp.tool(name="canlex_list_acts",

 GROUNDING_NOTE = (
     "ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
     "specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
+    "When a result lists Related provisions, fetch any that bear on the question "
+    "-- the definitions section, an exception, a cross-referenced rule -- with "
+    "canlex_get_section before answering. "
     "Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
     "CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
     "and a court may disagree with them; collective agreements and the National "
     return _INDEX
+def _format_section(c: dict, related=None) -> str:
     """Render one chunk (legislation, D-Memo, or agreement) as cited Markdown."""
     doc_type = c.get("doc_type", "legislation")
     header = f"### {c['citation']} — {c['marginal_note']}".rstrip(" —")
                          "— IRB members apply its reasoning to similar cases or "
                          "explain why not; persuasive, and subject to revocation "
                          "or to review by the Federal Court._")
+        elif "Board" in c["part"]:
+            lines.append("_Labour-board decision — a federal administrative "
+                         "tribunal's ruling; persuasive within the board's own "
+                         "jurisprudence, and subject to judicial review by the "
+                         "Federal Court of Appeal._")
         else:
             lines.append("_Court decision — binding precedent depending on the "
                          "court and jurisdiction; confirm it has not been "
     lines.append("")
     lines.append(c["text"])
     lines.append("")
+    if related:
+        refs = "; ".join(f"s. {s} ({n})" if n else f"s. {s}" for s, n in related)
+        lines.append(f"Related provisions in this Act: {refs}")
     if c["history"]:
         if doc_type == "caselaw":
             lines.append(f"Also reported: {c['history']}")
         default=None,
         description="Optional filter by source type: 'legislation' (Acts and "
         "regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
+        "agreements), 'directive' (NJC directives), or 'caselaw' (court and "
+        "tribunal decisions). Omit to search all.",
     )
     CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
     how it applies customs and border law); Treasury Board collective agreements
     (currently the FB / Border Services group); National Joint Council directives
+    (travel, relocation, isolated posts and more); and leading decisions of the
+    courts and federal tribunals: the Supreme Court, Federal Court of Appeal and
+    Federal Court, the Immigration and Refugee Board, and the FPSLREB and CIRB
+    labour boards. Use this for ANY question about that material. It ranks results by relevance and returns
     their full text so the answer can cite the actual wording; an explicit section
     reference (e.g. "section 34") is always surfaced. Each result is marked with its
     source type.
         blocks.append("")
         blocks.append("---")
         blocks.append("")
+        blocks.append(_format_section(c, index.related(c)))
     return "\n".join(blocks)
         return (f"Error: no section '{params.section}' found in '{params.act}'. "
                 f"Loaded Acts: {', '.join(acts) or 'none'}. Check the section number, "
                 f"or use canlex_search_legislation to locate the provision by topic.")
+    return GROUNDING_NOTE + "\n\n" + _format_section(section, index.related(section))
 @mcp.tool(name="canlex_list_acts",

data/eval/questions.json ADDED Viewed

	@@ -0,0 +1,49 @@

+[
+  {"query": "How soon must the Immigration Division review the detention of a foreign national?", "answers": [["IRPA", "57"]]},
+  {"query": "On what security grounds is a foreign national inadmissible to Canada?", "answers": [["IRPA", "34"]]},
+  {"query": "When is a permanent resident inadmissible for serious criminality?", "answers": [["IRPA", "36"]]},
+  {"query": "What makes a person inadmissible for organized criminality?", "answers": [["IRPA", "37"]]},
+  {"query": "Can someone be found inadmissible to Canada for misrepresentation?", "answers": [["IRPA", "40"]]},
+  {"query": "Is a foreign national inadmissible on health grounds?", "answers": [["IRPA", "38"]]},
+  {"query": "Inadmissibility for violating human or international rights", "answers": [["IRPA", "35"]]},
+  {"query": "Can a person be inadmissible to Canada for financial reasons?", "answers": [["IRPA", "39"]]},
+  {"query": "Is someone inadmissible because an accompanying family member is inadmissible?", "answers": [["IRPA", "42"]]},
+  {"query": "What humanitarian and compassionate relief can the Minister grant a foreign national?", "answers": [["IRPA", "25"], ["IRPA", "25.1"]]},
+  {"query": "When can an officer arrest and detain a foreign national without a warrant?", "answers": [["IRPA", "55"]]},
+  {"query": "Who prepares a report that a permanent resident is inadmissible?", "answers": [["IRPA", "44"]]},
+  {"query": "Must a person appear for an examination when seeking to enter Canada?", "answers": [["IRPA", "18"]]},
+  {"query": "What must a person establish to be allowed to enter Canada?", "answers": [["IRPA", "20"]]},
+  {"query": "What is the definition of a Convention refugee?", "answers": [["IRPA", "96"]]},
+  {"query": "Who qualifies as a person in need of protection?", "answers": [["IRPA", "97"]]},
+  {"query": "When is a refugee claim ineligible to be referred to the Refugee Protection Division?", "answers": [["IRPA", "101"]]},
+  {"query": "What is a pre-removal risk assessment and who can apply for one?", "answers": [["IRPA", "112"]]},
+  {"query": "When does a removal order become enforceable?", "answers": [["IRPA", "48"]]},
+  {"query": "Is it an offence to organize the illegal entry of people into Canada?", "answers": [["IRPA", "117"]]},
+  {"query": "Must a person report to a customs officer when arriving in Canada?", "answers": [["Customs Act", "11"]]},
+  {"query": "What is the duty to report goods imported into Canada?", "answers": [["Customs Act", "12"]]},
+  {"query": "Can a customs officer examine imported goods?", "answers": [["Customs Act", "99"]]},
+  {"query": "When can a customs officer search a person at the border?", "answers": [["Customs Act", "98"]]},
+  {"query": "What happens when goods are seized for a customs contravention?", "answers": [["Customs Act", "110"]]},
+  {"query": "What is ascertained forfeiture under the Customs Act?", "answers": [["Customs Act", "124"]]},
+  {"query": "When do imported goods become forfeit after a customs contravention?", "answers": [["Customs Act", "122"]]},
+  {"query": "How can a person appeal a customs seizure or penalty decision to the Federal Court?", "answers": [["Customs Act", "135"]]},
+  {"query": "Advance information about commercial goods before they arrive in Canada", "answers": [["Customs Act", "12.1"]]},
+  {"query": "How is the value for duty of imported goods determined?", "answers": [["Customs Act", "46"], ["Customs Act", "47"], ["Customs Act", "48"]]},
+  {"query": "How are imported goods classified under the Customs Tariff?", "answers": [["Customs Tariff", "10"]]},
+  {"query": "Must travellers report large amounts of currency when crossing the border?", "answers": [["PCMLTFA", "12"]]},
+  {"query": "Can an officer seize currency that was not reported at the border?", "answers": [["PCMLTFA", "18"]]},
+  {"query": "How does someone appeal the forfeiture of seized currency to the Federal Court?", "answers": [["PCMLTFA", "30"]]},
+  {"query": "Is simple possession of a controlled substance an offence?", "answers": [["CDSA", "4"]]},
+  {"query": "What is the offence of trafficking in a controlled substance?", "answers": [["CDSA", "5"]]},
+  {"query": "Is it an offence to import or export a controlled substance?", "answers": [["CDSA", "6"]]},
+  {"query": "Is possession of cannabis an offence?", "answers": [["Cannabis Act", "8"]]},
+  {"query": "Can cannabis be imported into or exported from Canada?", "answers": [["Cannabis Act", "11"]]},
+  {"query": "When can a peace officer arrest a person without a warrant?", "answers": [["Criminal Code", "495"]]},
+  {"query": "What right does a person have to access their own personal information held by a government institution?", "answers": [["Privacy Act", "12"]]},
+  {"query": "When may a government institution disclose someone's personal information?", "answers": [["Privacy Act", "8"]]},
+  {"query": "Can an employee refuse to do work that presents a danger?", "answers": [["Canada Labour Code", "128"]]},
+  {"query": "What are the standard hours of work for an employee?", "answers": [["Canada Labour Code", "169"]]},
+  {"query": "What is the standard of review of an administrative decision on judicial review?", "answers": [["Vavilov", ""]]},
+  {"query": "How does the Refugee Appeal Division review a decision of the Refugee Protection Division?", "answers": [["Huruglica", ""]]},
+  {"query": "To get back currency seized at the border, what must the claimant show about the money?", "answers": [["Sellathurai", ""]]}
+]

requirements.txt CHANGED Viewed

@@ -2,10 +2,10 @@
 #   py -m venv .venv
 #   .venv\Scripts\python.exe -m pip install -r requirements.txt
 mcp>=1.2               # MCP server (server.py)
-model2vec>=0.6         # local semantic embeddings (embed.py)
 numpy>=2.0             # vector math for hybrid retrieval (index.py)
-onnxruntime>=1.20      # cross-encoder reranker runtime (rerank.py)
 huggingface-hub>=0.20  # one-time model downloads (embed.py, rerank.py)
-tokenizers>=0.20       # cross-encoder tokenization (rerank.py)
 beautifulsoup4>=4.12   # parse CBSA D-Memoranda HTML (dmemo.py)
 pypdf>=4.0             # extract text from PDF-only D-Memoranda (dmemo.py)

 #   py -m venv .venv
 #   .venv\Scripts\python.exe -m pip install -r requirements.txt
 mcp>=1.2               # MCP server (server.py)
 numpy>=2.0             # vector math for hybrid retrieval (index.py)
+onnxruntime>=1.20      # embedding + reranker model runtime (embed.py, rerank.py)
 huggingface-hub>=0.20  # one-time model downloads (embed.py, rerank.py)
+tokenizers>=0.20       # tokenization for the embedding and reranker models (embed.py, rerank.py)
 beautifulsoup4>=4.12   # parse CBSA D-Memoranda HTML (dmemo.py)
 pypdf>=4.0             # extract text from PDF-only D-Memoranda (dmemo.py)
+snowballstemmer>=2.2   # English stemmer for keyword search (index.py)