Beemer Claude Opus 4.7 committed on
Commit ·
2966f10
1
Parent(s): b8c217b
Upgrade retrieval: bge-small embeddings + promote-only reranking
Browse files
Replace the model2vec static embedding with bge-small-en-v1.5, a local
transformer sentence-embedder run as ONNX on CPU (key-free). The reranker
now only promotes candidates -- placing each at the better of its fusion
and rerank position, never lower -- because the cross-encoder scores long
statutory text unreliably and was burying correct results.
Also includes intra-Act cross-reference and definition linking in search
results, a 47-question retrieval eval harness (canlex/eval.py), and
FPSLREB/CIRB ingestion wiring in caselaw.py (decisions not yet fetched).
Eval: Hit@5 0.74 -> 0.89, Hit@10 0.81 -> 0.94, MRR 0.60 -> 0.64.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- canlex/caselaw.py +93 -10
- canlex/embed.py +72 -17
- canlex/eval.py +80 -0
- canlex/index.py +84 -10
- canlex/server.py +20 -8
- data/eval/questions.json +49 -0
- requirements.txt +3 -3
canlex/caselaw.py
CHANGED
|
@@ -13,15 +13,16 @@ deliberately not a comprehensive scrape.
|
|
| 13 |
import json
|
| 14 |
import re
|
| 15 |
import time
|
|
|
|
| 16 |
import urllib.request
|
| 17 |
|
| 18 |
from bs4 import BeautifulSoup
|
| 19 |
|
| 20 |
from .config import PROCESSED_DIR, RAW_DIR
|
| 21 |
|
| 22 |
-
# Each court's
|
| 23 |
-
#
|
| 24 |
-
# bracketed paragraph numbers -- so one parser serves them all.
|
| 25 |
COURTS = {
|
| 26 |
"scc": ("Supreme Court of Canada",
|
| 27 |
"https://decisions.scc-csc.ca/scc-csc/scc-csc/en/item/{id}/index.do"),
|
|
@@ -29,6 +30,11 @@ COURTS = {
|
|
| 29 |
"https://decisions.fca-caf.gc.ca/fca-caf/decisions/en/item/{id}/index.do"),
|
| 30 |
"fc": ("Federal Court",
|
| 31 |
"https://decisions.fct-cf.gc.ca/fc-cf/decisions/en/item/{id}/index.do"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
}
|
| 33 |
_RAW = RAW_DIR / "caselaw"
|
| 34 |
OUT = PROCESSED_DIR / "caselaw.json"
|
|
@@ -38,7 +44,7 @@ OUT = PROCESSED_DIR / "caselaw.json"
|
|
| 38 |
# from the throttle below and from caching every fetched page on disk.
|
| 39 |
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
| 40 |
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
| 41 |
-
_THROTTLE =
|
| 42 |
_CHUNK_CHARS = 1800 # target characters per chunk
|
| 43 |
|
| 44 |
# Marks the post-reasons apparatus (appended legislation, solicitors list),
|
|
@@ -200,6 +206,71 @@ CASES = [
|
|
| 200 |
{"court": "fc", "id": 62413, "short": "Da Huang",
|
| 201 |
"topic": "PCMLTFA currency forfeiture; partial return of seized funds "
|
| 202 |
"where only part is shown to be of legitimate origin"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
]
|
| 204 |
|
| 205 |
# In-force jurisprudential guides of the Immigration and Refugee Board's Refugee
|
|
@@ -224,14 +295,25 @@ IRB_GUIDES = [
|
|
| 224 |
|
| 225 |
|
| 226 |
def _get(url, cache_name):
|
| 227 |
-
"""Fetch a page, caching the raw HTML under data/raw/caselaw.
|
|
|
|
|
|
|
|
|
|
| 228 |
cache = _RAW / cache_name
|
| 229 |
if cache.exists():
|
| 230 |
return cache.read_text(encoding="utf-8")
|
| 231 |
req = urllib.request.Request(url, headers={"User-Agent": _UA})
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
_RAW.mkdir(parents=True, exist_ok=True)
|
| 236 |
cache.write_text(text, encoding="utf-8")
|
| 237 |
return text
|
|
@@ -398,9 +480,10 @@ def _decision_chunks(case, soup):
|
|
| 398 |
court_name, item_tmpl = COURTS[case["court"]]
|
| 399 |
name, fields = _metadata(soup)
|
| 400 |
name = name or case["short"]
|
| 401 |
-
cite = fields.get("neutral citation") or fields.get("
|
|
|
|
| 402 |
report = fields.get("report", "")
|
| 403 |
-
date = fields.get("date"
|
| 404 |
citation = f"{name}, {cite}" if cite else name
|
| 405 |
item_url = item_tmpl.format(id=case["id"])
|
| 406 |
modern, paras = _paragraphs(soup)
|
|
|
|
| 13 |
import json
|
| 14 |
import re
|
| 15 |
import time
|
| 16 |
+
import urllib.error
|
| 17 |
import urllib.request
|
| 18 |
|
| 19 |
from bs4 import BeautifulSoup
|
| 20 |
|
| 21 |
from .config import PROCESSED_DIR, RAW_DIR
|
| 22 |
|
| 23 |
+
# Each court or tribunal's Lexum decisions database: (display name, item-URL
|
| 24 |
+
# template). All five run the same Lexum platform -- same iframe trick, metadata
|
| 25 |
+
# block and bracketed paragraph numbers -- so one parser serves them all.
|
| 26 |
COURTS = {
|
| 27 |
"scc": ("Supreme Court of Canada",
|
| 28 |
"https://decisions.scc-csc.ca/scc-csc/scc-csc/en/item/{id}/index.do"),
|
|
|
|
| 30 |
"https://decisions.fca-caf.gc.ca/fca-caf/decisions/en/item/{id}/index.do"),
|
| 31 |
"fc": ("Federal Court",
|
| 32 |
"https://decisions.fct-cf.gc.ca/fc-cf/decisions/en/item/{id}/index.do"),
|
| 33 |
+
"fpslreb": ("Federal Public Sector Labour Relations and Employment Board",
|
| 34 |
+
"https://decisions.fpslreb-crtespf.gc.ca/fpslreb-crtespf/d/en/"
|
| 35 |
+
"item/{id}/index.do"),
|
| 36 |
+
"cirb": ("Canada Industrial Relations Board",
|
| 37 |
+
"https://decisia.lexum.com/cirb-ccri/cirb-ccri/en/item/{id}/index.do"),
|
| 38 |
}
|
| 39 |
_RAW = RAW_DIR / "caselaw"
|
| 40 |
OUT = PROCESSED_DIR / "caselaw.json"
|
|
|
|
| 44 |
# from the throttle below and from caching every fetched page on disk.
|
| 45 |
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
| 46 |
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
| 47 |
+
_THROTTLE = 6.0 # seconds between live fetches (Lexum rate-limits hard)
|
| 48 |
_CHUNK_CHARS = 1800 # target characters per chunk
|
| 49 |
|
| 50 |
# Marks the post-reasons apparatus (appended legislation, solicitors list),
|
|
|
|
| 206 |
{"court": "fc", "id": 62413, "short": "Da Huang",
|
| 207 |
"topic": "PCMLTFA currency forfeiture; partial return of seized funds "
|
| 208 |
"where only part is shown to be of legitimate origin"},
|
| 209 |
+
|
| 210 |
+
# --- Federal Public Sector Labour Relations and Employment Board ---
|
| 211 |
+
{"court": "fpslreb", "id": 520990, "short": "Menzies",
|
| 212 |
+
"topic": "Progressive discipline of a CBSA border services officer; the "
|
| 213 |
+
"lock-step approach to discipline rejected"},
|
| 214 |
+
{"court": "fpslreb", "id": 521231, "short": "Kline",
|
| 215 |
+
"topic": "Bad-faith termination of a CBSA employee; reinstatement and "
|
| 216 |
+
"damages"},
|
| 217 |
+
{"court": "fpslreb", "id": 521195, "short": "Sousa Dias",
|
| 218 |
+
"topic": "Discipline and termination grievance of a CBSA employee"},
|
| 219 |
+
{"court": "fpslreb", "id": 521082, "short": "Anderson",
|
| 220 |
+
"topic": "CBSA grievance; interpretation of the FB-group collective "
|
| 221 |
+
"agreement"},
|
| 222 |
+
{"court": "fpslreb", "id": 520948, "short": "Burlacu",
|
| 223 |
+
"topic": "CBSA; occupational health and safety and staffing"},
|
| 224 |
+
{"court": "fpslreb", "id": 483604, "short": "Malik",
|
| 225 |
+
"topic": "Discipline and termination grievance of a CBSA employee"},
|
| 226 |
+
{"court": "fpslreb", "id": 500554, "short": "Andruszkiewicz",
|
| 227 |
+
"topic": "Unfair labour practice complaint involving the CBSA"},
|
| 228 |
+
{"court": "fpslreb", "id": 359013, "short": "PSAC v TB (CBSA)",
|
| 229 |
+
"topic": "Policy grievance; collective agreement interpretation at the "
|
| 230 |
+
"CBSA"},
|
| 231 |
+
{"court": "fpslreb", "id": 359065, "short": "Martin-Ivie",
|
| 232 |
+
"topic": "Occupational health and safety; the arming and safety of CBSA "
|
| 233 |
+
"border officers"},
|
| 234 |
+
{"court": "fpslreb", "id": 358886, "short": "Basra (2012)",
|
| 235 |
+
"topic": "Discipline and termination grievance; a later proceeding in "
|
| 236 |
+
"the leading Basra line"},
|
| 237 |
+
{"court": "fpslreb", "id": 358025, "short": "Basra (2007)",
|
| 238 |
+
"topic": "The foundational Basra decision on discipline and the burden "
|
| 239 |
+
"of proof in a grievance"},
|
| 240 |
+
{"court": "fpslreb", "id": 358150, "short": "Quadrini",
|
| 241 |
+
"topic": "Unfair labour practice and freedom of expression in the "
|
| 242 |
+
"federal public service"},
|
| 243 |
+
{"court": "fpslreb", "id": 358180, "short": "Pepper",
|
| 244 |
+
"topic": "Discipline and termination; frequently-cited principles on "
|
| 245 |
+
"just cause"},
|
| 246 |
+
{"court": "fpslreb", "id": 358097, "short": "Richmond",
|
| 247 |
+
"topic": "Classification grievance in the federal public service"},
|
| 248 |
+
{"court": "fpslreb", "id": 358890, "short": "Baldasaro and Thiessen",
|
| 249 |
+
"topic": "Hours of work and overtime under a collective agreement"},
|
| 250 |
+
{"court": "fpslreb", "id": 358203, "short": "PSAC v TB (pay)",
|
| 251 |
+
"topic": "Collective agreement and pay administration policy grievance"},
|
| 252 |
+
{"court": "fpslreb", "id": 360456, "short": "Kinhnicki",
|
| 253 |
+
"topic": "Occupational health and safety; a refusal to work in a customs "
|
| 254 |
+
"context"},
|
| 255 |
+
|
| 256 |
+
# --- Canada Industrial Relations Board ---
|
| 257 |
+
{"court": "cirb", "id": 519772, "short": "Watson",
|
| 258 |
+
"topic": "Duty of fair representation and a mandatory vaccination policy "
|
| 259 |
+
"under the Canada Labour Code"},
|
| 260 |
+
{"court": "cirb", "id": 5478, "short": "McRaeJackson",
|
| 261 |
+
"topic": "The leading test for the duty of fair representation under "
|
| 262 |
+
"s. 37 of the Canada Labour Code"},
|
| 263 |
+
{"court": "cirb", "id": 5491, "short": "Securicor",
|
| 264 |
+
"topic": "Certification and bargaining-unit determination under the "
|
| 265 |
+
"Canada Labour Code"},
|
| 266 |
+
{"court": "cirb", "id": 5593, "short": "Dover Industries",
|
| 267 |
+
"topic": "Successor rights on the sale of a business under the Canada "
|
| 268 |
+
"Labour Code"},
|
| 269 |
+
{"court": "cirb", "id": 301063, "short": "Swissport",
|
| 270 |
+
"topic": "Unfair labour practice complaint under the Canada Labour Code"},
|
| 271 |
+
{"court": "cirb", "id": 5599, "short": "Cooney Transport",
|
| 272 |
+
"topic": "Related-employer (common-employer) declaration under the "
|
| 273 |
+
"Canada Labour Code"},
|
| 274 |
]
|
| 275 |
|
| 276 |
# In-force jurisprudential guides of the Immigration and Refugee Board's Refugee
|
|
|
|
| 295 |
|
| 296 |
|
| 297 |
def _get(url, cache_name):
    """Return the HTML at ``url``, served from the on-disk cache when possible.

    A page already saved as ``_RAW / cache_name`` is read back and returned
    without touching the network. Otherwise the page is requested with the
    ``_UA`` User-Agent header after sleeping ``_THROTTLE`` seconds, decoded as
    UTF-8 (undecodable bytes replaced), written to the cache and returned.

    One retry is made, after a 25-second pause, when the first attempt fails
    with HTTP 403 or 429 -- the Lexum hosts rate-limit by IP. Any other
    ``HTTPError``, or a failure of the retry itself, propagates to the caller.
    """
    cached = _RAW / cache_name
    if cached.exists():
        return cached.read_text(encoding="utf-8")

    request = urllib.request.Request(url, headers={"User-Agent": _UA})

    def fetch():
        with urllib.request.urlopen(request, timeout=60) as response:
            return response.read().decode("utf-8", errors="replace")

    # First attempt, after the normal politeness delay.
    time.sleep(_THROTTLE)
    try:
        html = fetch()
    except urllib.error.HTTPError as exc:
        if exc.code not in (403, 429):
            raise
        html = None  # rate-limited: fall through to the single retry below

    # Second and final attempt, after a longer back-off; errors here propagate.
    if html is None:
        time.sleep(25.0)
        html = fetch()

    _RAW.mkdir(parents=True, exist_ok=True)
    cached.write_text(html, encoding="utf-8")
    return html
|
|
|
|
| 480 |
court_name, item_tmpl = COURTS[case["court"]]
|
| 481 |
name, fields = _metadata(soup)
|
| 482 |
name = name or case["short"]
|
| 483 |
+
cite = (fields.get("neutral citation") or fields.get("citation")
|
| 484 |
+
or fields.get("report") or "")
|
| 485 |
report = fields.get("report", "")
|
| 486 |
+
date = fields.get("date") or fields.get("decision rendered") or ""
|
| 487 |
citation = f"{name}, {cite}" if cite else name
|
| 488 |
item_url = item_tmpl.format(id=case["id"])
|
| 489 |
modern, paras = _paragraphs(soup)
|
canlex/embed.py
CHANGED
|
@@ -1,13 +1,26 @@
|
|
| 1 |
-
"""Build semantic embeddings for ingested
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import json
|
| 3 |
|
| 4 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
from .config import PROCESSED_DIR
|
| 7 |
|
| 8 |
-
|
| 9 |
EMB_PATH = PROCESSED_DIR / "embeddings.npz"
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def load_chunks():
|
|
@@ -22,33 +35,75 @@ def embed_text(chunk):
|
|
| 22 |
note = chunk["marginal_note"]
|
| 23 |
body = chunk["text"][:_MAX_BODY]
|
| 24 |
# The marginal note (section title) is the strongest topical signal, so it
|
| 25 |
-
# is repeated to
|
| 26 |
parts = [chunk["act_short"], note, note, chunk["heading"], body]
|
| 27 |
return " . ".join(p for p in parts if p)
|
| 28 |
|
| 29 |
|
| 30 |
class Embedder:
|
| 31 |
-
"""Local
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
def
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
def
|
| 38 |
-
"""Return L2-normalized
|
| 39 |
-
|
| 40 |
-
if vecs.ndim == 1:
|
| 41 |
-
vecs = vecs.reshape(1, -1)
|
| 42 |
-
norms = np.linalg.norm(vecs, axis=1, keepdims=True)
|
| 43 |
-
return vecs / np.maximum(norms, 1e-9)
|
| 44 |
|
| 45 |
|
| 46 |
def build():
|
| 47 |
chunks = load_chunks()
|
| 48 |
if not chunks:
|
| 49 |
-
print(f"No processed
|
| 50 |
return
|
| 51 |
-
print(f"Embedding {len(chunks)} sections with {
|
| 52 |
vectors = Embedder().encode([embed_text(c) for c in chunks])
|
| 53 |
ids = np.array([c["id"] for c in chunks])
|
| 54 |
np.savez(EMB_PATH, ids=ids, vectors=vectors)
|
|
|
|
| 1 |
+
"""Build semantic embeddings for ingested chunks (local, key-free).
|
| 2 |
+
|
| 3 |
+
Uses BAAI's bge-small-en-v1.5 sentence-embedding model as ONNX, run on CPU via
|
| 4 |
+
onnxruntime -- no API key. A transformer embedding has far stronger retrieval
|
| 5 |
+
recall than a static one: it can connect a natural-language question to a
|
| 6 |
+
provision even when the two share few exact words.
|
| 7 |
+
"""
|
| 8 |
import json
|
| 9 |
|
| 10 |
import numpy as np
|
| 11 |
+
import onnxruntime as ort
|
| 12 |
+
from huggingface_hub import hf_hub_download
|
| 13 |
+
from tokenizers import Tokenizer
|
| 14 |
|
| 15 |
from .config import PROCESSED_DIR
|
| 16 |
|
| 17 |
+
EMB_REPO = "Xenova/bge-small-en-v1.5"
|
| 18 |
EMB_PATH = PROCESSED_DIR / "embeddings.npz"
|
| 19 |
+
_MAX_TOKENS = 512
|
| 20 |
+
_MAX_BODY = 2000 # cap embedded body text so long sections stay topically focused
|
| 21 |
+
# bge-small retrieval: the query is prefixed with this instruction; passages
|
| 22 |
+
# are embedded without it. The asymmetry is how the model was trained.
|
| 23 |
+
_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
|
| 24 |
|
| 25 |
|
| 26 |
def load_chunks():
|
|
|
|
| 35 |
note = chunk["marginal_note"]
|
| 36 |
body = chunk["text"][:_MAX_BODY]
|
| 37 |
# The marginal note (section title) is the strongest topical signal, so it
|
| 38 |
+
# is repeated to emphasise it.
|
| 39 |
parts = [chunk["act_short"], note, note, chunk["heading"], body]
|
| 40 |
return " . ".join(p for p in parts if p)
|
| 41 |
|
| 42 |
|
| 43 |
class Embedder:
    """Local transformer sentence-embedder: bge-small-en-v1.5 as ONNX on CPU.

    No API key; the model files are fetched with ``hf_hub_download`` from
    ``EMB_REPO``. Every vector returned is L2-normalized, so a dot product
    between two of them is their cosine similarity.
    """

    def __init__(self):
        """Download the ONNX model and tokenizer and open a CPU session.

        The quantized model is tried first, then the full-precision one.

        Raises:
            RuntimeError: if neither model file could be downloaded; the last
                download error is attached as ``__cause__``.
        """
        model_path = None
        last_error = None
        for name in ("onnx/model_quantized.onnx", "onnx/model.onnx"):
            try:
                model_path = hf_hub_download(EMB_REPO, name)
            except Exception as exc:
                # Best-effort fallback to the next variant, but remember why
                # this one failed so the final error is diagnosable.
                last_error = exc
                continue
            break
        if model_path is None:
            raise RuntimeError(
                f"Could not download an ONNX model from {EMB_REPO}."
            ) from last_error
        tok_path = hf_hub_download(EMB_REPO, "tokenizer.json")
        self.session = ort.InferenceSession(model_path,
                                            providers=["CPUExecutionProvider"])
        # Some exports take token_type_ids and some do not; _run checks this.
        self.input_names = {i.name for i in self.session.get_inputs()}
        self.tokenizer = Tokenizer.from_file(tok_path)
        self.tokenizer.enable_truncation(max_length=_MAX_TOKENS)

    def _run(self, texts):
        """Tokenize, run the encoder, CLS-pool and L2-normalize one batch.

        ``texts`` must be non-empty. Returns a float32 array with one row per
        text.
        """
        encs = self.tokenizer.encode_batch(list(texts))
        # Pad every row to the longest sequence in the batch; padded positions
        # keep id 0 and attention 0.
        width = max(len(e.ids) for e in encs)
        input_ids = np.zeros((len(encs), width), dtype=np.int64)
        attention = np.zeros((len(encs), width), dtype=np.int64)
        type_ids = np.zeros((len(encs), width), dtype=np.int64)
        for row, enc in enumerate(encs):
            n = len(enc.ids)
            input_ids[row, :n] = enc.ids
            attention[row, :n] = enc.attention_mask
            type_ids[row, :n] = enc.type_ids
        feed = {"input_ids": input_ids, "attention_mask": attention}
        if "token_type_ids" in self.input_names:
            feed["token_type_ids"] = type_ids
        hidden = np.asarray(self.session.run(None, feed)[0], dtype=np.float32)
        # BGE uses CLS pooling: take the first token's vector. A 2-D output is
        # used as-is.
        cls = hidden[:, 0, :] if hidden.ndim == 3 else hidden
        norms = np.linalg.norm(cls, axis=1, keepdims=True)
        # Guard against division by zero for an all-zero vector.
        return cls / np.maximum(norms, 1e-9)

    def encode(self, texts, batch_size=32):
        """Return L2-normalized embeddings for passages, one row per text.

        Texts are encoded ``batch_size`` at a time and the batches stacked. An
        empty input returns an empty ``(0, 384)`` float32 array without
        running the model.
        """
        texts = list(texts)
        if not texts:
            return np.zeros((0, 384), dtype=np.float32)
        rows = [self._run(texts[i:i + batch_size])
                for i in range(0, len(texts), batch_size)]
        return np.vstack(rows)

    def encode_query(self, text):
        """Return the L2-normalized embedding for one search query.

        The query is prefixed with ``_QUERY_PREFIX`` before encoding; passages
        embedded through ``encode`` are not.
        """
        return self._run([_QUERY_PREFIX + text])[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
|
| 101 |
def build():
|
| 102 |
chunks = load_chunks()
|
| 103 |
if not chunks:
|
| 104 |
+
print(f"No processed data in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
|
| 105 |
return
|
| 106 |
+
print(f"Embedding {len(chunks)} sections with {EMB_REPO} ...")
|
| 107 |
vectors = Embedder().encode([embed_text(c) for c in chunks])
|
| 108 |
ids = np.array([c["id"] for c in chunks])
|
| 109 |
np.savez(EMB_PATH, ids=ids, vectors=vectors)
|
canlex/eval.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Measure CanLex retrieval quality against a curated question set.
|
| 2 |
+
|
| 3 |
+
Each item in data/eval/questions.json pairs a realistic legal question with the
|
| 4 |
+
provision(s) or case(s) that answer it. This runs every question through the
|
| 5 |
+
retrieval index and reports Hit@k and MRR. Re-run it after any retrieval change
|
| 6 |
+
-- a new reranker, different embeddings, a chunking tweak -- to see whether
|
| 7 |
+
quality moved, and read the "Misses" list to see exactly what to fix.
|
| 8 |
+
|
| 9 |
+
py -m canlex.eval
|
| 10 |
+
"""
|
| 11 |
+
import json
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
from .config import ROOT
|
| 15 |
+
from .index import LegislationIndex
|
| 16 |
+
|
| 17 |
+
QUESTIONS = ROOT / "data" / "eval" / "questions.json"
|
| 18 |
+
EVAL_TOP_K = 20 # search depth, so ranks past the usual 6 are still visible
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _matches(result, answers):
|
| 22 |
+
"""True if a search result is one of the gold answers (act + section).
|
| 23 |
+
|
| 24 |
+
A gold answer is [act, section]; an empty section matches any chunk of that
|
| 25 |
+
act/case (used for case-law answers, whose chunks carry no section number).
|
| 26 |
+
"""
|
| 27 |
+
r_acts = {result.get("act_short", "").lower(),
|
| 28 |
+
result.get("act_code", "").lower()}
|
| 29 |
+
r_sec = result.get("section", "")
|
| 30 |
+
for act, section in answers:
|
| 31 |
+
if act.lower() in r_acts and (section == r_sec or section == ""):
|
| 32 |
+
return True
|
| 33 |
+
return False
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _provision_label(act, section):
    """Format an (act, section) pair as ``"<act> s.<section>"``, or as the bare
    act name when the section is empty."""
    # A conditional rather than ``.rstrip(" s.")``: rstrip removes any trailing
    # run of the characters ' ', 's' and '.', so it also eats the final "s" of
    # names such as "Customs" or "Menzies".
    return f"{act} s.{section}" if section else act


def run():
    """Run every question in ``QUESTIONS`` through the index and print a report.

    For each question the index is searched to depth ``EVAL_TOP_K`` and the
    1-based rank of the first result matching a gold answer is recorded
    (0 when none matches). Hit@1/3/5/10 and MRR are printed, followed by a
    list of the questions whose gold answer was absent or ranked below 5,
    each with the gold answer(s) and the top result actually returned.

    Prints a notice to stderr and returns if the question file is missing.
    """
    if not QUESTIONS.exists():
        print(f"No question set at {QUESTIONS}.", file=sys.stderr)
        return
    items = json.loads(QUESTIONS.read_text(encoding="utf-8"))
    index = LegislationIndex()
    ranks = []   # rank of the first gold hit per question (0 = miss)
    misses = []
    for item in items:
        answers = [tuple(a) for a in item["answers"]]
        results = index.search(item["query"], top_k=EVAL_TOP_K)
        rank = next((pos for pos, result in enumerate(results, start=1)
                     if _matches(result, answers)), 0)
        ranks.append(rank)
        if rank == 0 or rank > 5:
            top = results[0] if results else None
            misses.append((item["query"], answers, rank, top))

    n = len(ranks) or 1  # avoid dividing by zero on an empty question set

    def hit(k):
        """Fraction of questions whose gold answer ranked within the top k."""
        return sum(1 for r in ranks if 0 < r <= k) / n

    mrr = sum(1.0 / r for r in ranks if r) / n
    print(f"CanLex retrieval evaluation -- {len(ranks)} questions\n")
    print(f" Hit@1: {hit(1):.2f}")
    print(f" Hit@3: {hit(3):.2f}")
    print(f" Hit@5: {hit(5):.2f}")
    print(f" Hit@10: {hit(10):.2f}")
    print(f" MRR: {mrr:.2f}")

    if misses:
        print(f"\n{len(misses)} miss(es) -- gold answer ranked >5 or absent:")
        for query, answers, rank, top in misses:
            gold = ", ".join(_provision_label(a, s) for a, s in answers)
            where = f"ranked #{rank}" if rank else f"absent (searched {EVAL_TOP_K})"
            got = (_provision_label(top.get("act_short", ""),
                                    top.get("section", ""))
                   if top else "nothing")
            print(f" [{where}] {query}")
            print(f" gold: {gold} | top result: {got}")
    # NOTE(review): the nesting of this trailing blank-line print could not be
    # read off the diff rendering (indentation was lost) -- confirm placement.
    print()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
if __name__ == "__main__":
|
| 80 |
+
run()
|
canlex/index.py
CHANGED
|
@@ -5,6 +5,8 @@ import re
|
|
| 5 |
import sys
|
| 6 |
from collections import Counter, defaultdict
|
| 7 |
|
|
|
|
|
|
|
| 8 |
from .config import PROCESSED_DIR
|
| 9 |
|
| 10 |
K1 = 1.5
|
|
@@ -15,10 +17,31 @@ RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
|
|
| 15 |
|
| 16 |
_TOKEN = re.compile(r"[a-z0-9]+")
|
| 17 |
_SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def tokenize(text):
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def _section_refs(query):
|
|
@@ -35,6 +58,7 @@ class LegislationIndex:
|
|
| 35 |
raise RuntimeError(
|
| 36 |
f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
|
| 37 |
self._build_bm25()
|
|
|
|
| 38 |
self._load_semantic()
|
| 39 |
self._load_reranker()
|
| 40 |
|
|
@@ -44,10 +68,12 @@ class LegislationIndex:
|
|
| 44 |
df = defaultdict(int)
|
| 45 |
for idx, c in enumerate(self.chunks):
|
| 46 |
# The marginal note (title) is repeated to weight it above body text;
|
| 47 |
-
#
|
|
|
|
|
|
|
| 48 |
blob = " ".join((c["marginal_note"], c["marginal_note"], c["heading"],
|
| 49 |
-
c["part"], c["division"], c["
|
| 50 |
-
c["text"]))
|
| 51 |
counts = Counter(tokenize(blob))
|
| 52 |
self.doc_len.append(sum(counts.values()))
|
| 53 |
for term, tf in counts.items():
|
|
@@ -119,7 +145,7 @@ class LegislationIndex:
|
|
| 119 |
return scores
|
| 120 |
|
| 121 |
def _semantic_ranking(self, query):
|
| 122 |
-
qv = self.embedder.
|
| 123 |
sims = self.vectors @ qv
|
| 124 |
order = self._np.argsort(sims)[::-1][:CANDIDATES]
|
| 125 |
return [int(i) for i in order]
|
|
@@ -158,13 +184,21 @@ class LegislationIndex:
|
|
| 158 |
return []
|
| 159 |
scores = {i: fused[i] for i in candidates}
|
| 160 |
|
| 161 |
-
# Precision stage: the cross-encoder rescores the top candidate pool
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
if self.reranker:
|
| 163 |
pool = candidates[:RERANK_POOL]
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
| 168 |
candidates = pool + candidates[RERANK_POOL:]
|
| 169 |
|
| 170 |
# Explicit section references are pinned to the very top.
|
|
@@ -183,6 +217,46 @@ class LegislationIndex:
|
|
| 183 |
return c
|
| 184 |
return None
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
def main():
|
| 188 |
if len(sys.argv) < 2:
|
|
|
|
| 5 |
import sys
|
| 6 |
from collections import Counter, defaultdict
|
| 7 |
|
| 8 |
+
import snowballstemmer
|
| 9 |
+
|
| 10 |
from .config import PROCESSED_DIR
|
| 11 |
|
| 12 |
K1 = 1.5
|
|
|
|
| 17 |
|
| 18 |
_TOKEN = re.compile(r"[a-z0-9]+")
|
| 19 |
_SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
|
| 20 |
+
# A cross-reference to another provision -- "section 34", "subsection 25(1)",
|
| 21 |
+
# "paragraph 36(1)(a)", "s. 34" -- capturing the top-level section number.
|
| 22 |
+
_XREF = re.compile(
|
| 23 |
+
r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)",
|
| 24 |
+
re.IGNORECASE)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
_STEMMER = snowballstemmer.stemmer("english")
|
| 28 |
+
_STEM_CACHE = {}
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _stem(word):
    """Return the Snowball stem of ``word``, computing it at most once.

    Results are kept in the module-level ``_STEM_CACHE`` -- legal text repeats
    terms heavily, so most lookups are cache hits.
    """
    if word not in _STEM_CACHE:
        _STEM_CACHE[word] = _STEMMER.stemWord(word)
    return _STEM_CACHE[word]
|
| 38 |
|
| 39 |
|
| 40 |
def tokenize(text):
    """Split ``text`` into lower-cased, Snowball-stemmed tokens.

    Tokens are the runs matched by ``_TOKEN`` in the lower-cased text. Stemming
    lets a query match a provision even when their word forms differ --
    'possession' vs 'possess', 'reporting' vs 'report', 'importation' vs
    'import'.
    """
    words = _TOKEN.findall(text.lower())
    return list(map(_stem, words))
|
| 45 |
|
| 46 |
|
| 47 |
def _section_refs(query):
|
|
|
|
| 58 |
raise RuntimeError(
|
| 59 |
f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
|
| 60 |
self._build_bm25()
|
| 61 |
+
self._build_xref()
|
| 62 |
self._load_semantic()
|
| 63 |
self._load_reranker()
|
| 64 |
|
|
|
|
| 68 |
df = defaultdict(int)
|
| 69 |
for idx, c in enumerate(self.chunks):
|
| 70 |
# The marginal note (title) is repeated to weight it above body text;
|
| 71 |
+
# the Act name, code and section are indexed too, so an Act's own
|
| 72 |
+
# terminology (e.g. "controlled substance") and its codes/numbers
|
| 73 |
+
# are searchable even when a section's text omits them.
|
| 74 |
blob = " ".join((c["marginal_note"], c["marginal_note"], c["heading"],
|
| 75 |
+
c["part"], c["division"], c["act_name"], c["act_code"],
|
| 76 |
+
c["section"], c["text"]))
|
| 77 |
counts = Counter(tokenize(blob))
|
| 78 |
self.doc_len.append(sum(counts.values()))
|
| 79 |
for term, tf in counts.items():
|
|
|
|
| 145 |
return scores
|
| 146 |
|
| 147 |
def _semantic_ranking(self, query):
    """Return the indices of the ``CANDIDATES`` chunks most similar to ``query``.

    The query is embedded with ``encode_query`` and scored against every row
    of ``self.vectors`` by dot product; indices come back as plain ints,
    highest score first.
    """
    query_vec = self.embedder.encode_query(query)
    similarity = self.vectors @ query_vec
    descending = self._np.argsort(similarity)[::-1]
    return [int(row) for row in descending[:CANDIDATES]]
|
|
|
|
| 184 |
return []
|
| 185 |
scores = {i: fused[i] for i in candidates}
|
| 186 |
|
| 187 |
+
# Precision stage: the cross-encoder rescores the top candidate pool, but
|
| 188 |
+
# may only PROMOTE -- each pooled candidate is placed at the better of its
|
| 189 |
+
# fusion rank and its rerank rank, never below its fusion rank. The
|
| 190 |
+
# reranker reliably surfaces a strong answer the fusion ranked low, yet is
|
| 191 |
+
# unreliable on long statutory text (it can score the right section
|
| 192 |
+
# negative), so its power to demote a candidate is deliberately removed.
|
| 193 |
if self.reranker:
|
| 194 |
pool = candidates[:RERANK_POOL]
|
| 195 |
+
ce = dict(zip(pool, self.reranker.score(
|
| 196 |
+
query, [self._rerank_doc(i) for i in pool])))
|
| 197 |
+
fusion_rank = {idx: r for r, idx in enumerate(pool)}
|
| 198 |
+
rerank_rank = {idx: r for r, idx in enumerate(
|
| 199 |
+
sorted(pool, key=ce.get, reverse=True))}
|
| 200 |
+
pool.sort(key=lambda i: (min(fusion_rank[i], rerank_rank[i]),
|
| 201 |
+
fusion_rank[i]))
|
| 202 |
candidates = pool + candidates[RERANK_POOL:]
|
| 203 |
|
| 204 |
# Explicit section references are pinned to the very top.
|
|
|
|
| 217 |
return c
|
| 218 |
return None
|
| 219 |
|
| 220 |
+
def _build_xref(self):
|
| 221 |
+
"""Index legislation by (act, section) and locate each Act's definitions
|
| 222 |
+
section, to support cross-reference lookup."""
|
| 223 |
+
self._by_section = {}
|
| 224 |
+
self._defs_section = {}
|
| 225 |
+
for c in self.chunks:
|
| 226 |
+
if c.get("doc_type", "legislation") != "legislation":
|
| 227 |
+
continue
|
| 228 |
+
self._by_section[(c["act_code"], c["section"])] = c
|
| 229 |
+
if c["act_code"] not in self._defs_section and (
|
| 230 |
+
c["marginal_note"].strip().lower() in (
|
| 231 |
+
"definitions", "definition", "interpretation")):
|
| 232 |
+
self._defs_section[c["act_code"]] = c
|
| 233 |
+
|
| 234 |
+
def related(self, chunk):
    """List the provisions of the same Act that ``chunk`` points to.

    Returns ``[(section, marginal_note), ...]``: the Act's definitions section
    first (when one was indexed and it is not ``chunk`` itself), then each
    distinct section number matched by ``_XREF`` in the chunk's text that
    exists in the same Act, in order of first mention. The scan stops once the
    list holds 8 entries.

    Legislation chunks only; returns ``[]`` for case law, memoranda, etc.
    """
    if chunk.get("doc_type", "legislation") != "legislation":
        return []

    act_code = chunk["act_code"]
    links = []
    visited = {chunk["section"]}  # never link a provision to itself

    glossary = self._defs_section.get(act_code)
    if glossary and glossary["section"] not in visited:
        visited.add(glossary["section"])
        links.append((glossary["section"], glossary["marginal_note"]))

    for number in (m.group(1) for m in _XREF.finditer(chunk["text"])):
        if number in visited:
            continue
        hit = self._by_section.get((act_code, number))
        if not hit:
            continue  # reference to a section this Act does not have
        visited.add(number)
        links.append((number, hit["marginal_note"]))
        if len(links) >= 8:
            break
    return links
|
| 259 |
+
|
| 260 |
|
| 261 |
def main():
|
| 262 |
if len(sys.argv) < 2:
|
canlex/server.py
CHANGED
|
@@ -31,6 +31,9 @@ _READONLY = {
|
|
| 31 |
GROUNDING_NOTE = (
|
| 32 |
"ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
|
| 33 |
"specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
|
|
|
|
|
|
|
|
|
|
| 34 |
"Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
|
| 35 |
"CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
|
| 36 |
"and a court may disagree with them; collective agreements and the National "
|
|
@@ -58,7 +61,7 @@ def _index() -> LegislationIndex:
|
|
| 58 |
return _INDEX
|
| 59 |
|
| 60 |
|
| 61 |
-
def _format_section(c: dict) -> str:
|
| 62 |
"""Render one chunk (legislation, D-Memo, or agreement) as cited Markdown."""
|
| 63 |
doc_type = c.get("doc_type", "legislation")
|
| 64 |
header = f"### {c['citation']} — {c['marginal_note']}".rstrip(" —")
|
|
@@ -83,6 +86,11 @@ def _format_section(c: dict) -> str:
|
|
| 83 |
"— IRB members apply its reasoning to similar cases or "
|
| 84 |
"explain why not; persuasive, and subject to revocation "
|
| 85 |
"or to review by the Federal Court._")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
else:
|
| 87 |
lines.append("_Court decision — binding precedent depending on the "
|
| 88 |
"court and jurisdiction; confirm it has not been "
|
|
@@ -98,6 +106,9 @@ def _format_section(c: dict) -> str:
|
|
| 98 |
lines.append("")
|
| 99 |
lines.append(c["text"])
|
| 100 |
lines.append("")
|
|
|
|
|
|
|
|
|
|
| 101 |
if c["history"]:
|
| 102 |
if doc_type == "caselaw":
|
| 103 |
lines.append(f"Also reported: {c['history']}")
|
|
@@ -133,8 +144,8 @@ class SearchInput(BaseModel):
|
|
| 133 |
default=None,
|
| 134 |
description="Optional filter by source type: 'legislation' (Acts and "
|
| 135 |
"regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
|
| 136 |
-
"agreements), 'directive' (NJC directives), or 'caselaw' (
|
| 137 |
-
"
|
| 138 |
)
|
| 139 |
|
| 140 |
|
|
@@ -159,9 +170,10 @@ def canlex_search_legislation(params: SearchInput) -> str:
|
|
| 159 |
CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
|
| 160 |
how it applies customs and border law); Treasury Board collective agreements
|
| 161 |
(currently the FB / Border Services group); National Joint Council directives
|
| 162 |
-
(travel, relocation, isolated posts and more); and leading
|
| 163 |
-
|
| 164 |
-
|
|
|
|
| 165 |
their full text so the answer can cite the actual wording; an explicit section
|
| 166 |
reference (e.g. "section 34") is always surfaced. Each result is marked with its
|
| 167 |
source type.
|
|
@@ -199,7 +211,7 @@ def canlex_search_legislation(params: SearchInput) -> str:
|
|
| 199 |
blocks.append("")
|
| 200 |
blocks.append("---")
|
| 201 |
blocks.append("")
|
| 202 |
-
blocks.append(_format_section(c))
|
| 203 |
return "\n".join(blocks)
|
| 204 |
|
| 205 |
|
|
@@ -233,7 +245,7 @@ def canlex_get_section(params: GetSectionInput) -> str:
|
|
| 233 |
return (f"Error: no section '{params.section}' found in '{params.act}'. "
|
| 234 |
f"Loaded Acts: {', '.join(acts) or 'none'}. Check the section number, "
|
| 235 |
f"or use canlex_search_legislation to locate the provision by topic.")
|
| 236 |
-
return GROUNDING_NOTE + "\n\n" + _format_section(section)
|
| 237 |
|
| 238 |
|
| 239 |
@mcp.tool(name="canlex_list_acts",
|
|
|
|
| 31 |
GROUNDING_NOTE = (
|
| 32 |
"ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
|
| 33 |
"specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
|
| 34 |
+
"When a result lists Related provisions, fetch any that bear on the question "
|
| 35 |
+
"-- the definitions section, an exception, a cross-referenced rule -- with "
|
| 36 |
+
"canlex_get_section before answering. "
|
| 37 |
"Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
|
| 38 |
"CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
|
| 39 |
"and a court may disagree with them; collective agreements and the National "
|
|
|
|
| 61 |
return _INDEX
|
| 62 |
|
| 63 |
|
| 64 |
+
def _format_section(c: dict, related=None) -> str:
|
| 65 |
"""Render one chunk (legislation, D-Memo, agreement, directive, or case law) as cited Markdown."""
|
| 66 |
doc_type = c.get("doc_type", "legislation")
|
| 67 |
header = f"### {c['citation']} — {c['marginal_note']}".rstrip(" —")
|
|
|
|
| 86 |
"— IRB members apply its reasoning to similar cases or "
|
| 87 |
"explain why not; persuasive, and subject to revocation "
|
| 88 |
"or to review by the Federal Court._")
|
| 89 |
+
elif "Board" in c["part"]:
|
| 90 |
+
lines.append("_Labour-board decision — a federal administrative "
|
| 91 |
+
"tribunal's ruling; persuasive within the board's own "
|
| 92 |
+
"jurisprudence, and subject to judicial review by the "
|
| 93 |
+
"Federal Court of Appeal._")
|
| 94 |
else:
|
| 95 |
lines.append("_Court decision — binding precedent depending on the "
|
| 96 |
"court and jurisdiction; confirm it has not been "
|
|
|
|
| 106 |
lines.append("")
|
| 107 |
lines.append(c["text"])
|
| 108 |
lines.append("")
|
| 109 |
+
if related:
|
| 110 |
+
refs = "; ".join(f"s. {s} ({n})" if n else f"s. {s}" for s, n in related)
|
| 111 |
+
lines.append(f"Related provisions in this Act: {refs}")
|
| 112 |
if c["history"]:
|
| 113 |
if doc_type == "caselaw":
|
| 114 |
lines.append(f"Also reported: {c['history']}")
|
|
|
|
| 144 |
default=None,
|
| 145 |
description="Optional filter by source type: 'legislation' (Acts and "
|
| 146 |
"regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
|
| 147 |
+
"agreements), 'directive' (NJC directives), or 'caselaw' (court and "
|
| 148 |
+
"tribunal decisions). Omit to search all.",
|
| 149 |
)
|
| 150 |
|
| 151 |
|
|
|
|
| 170 |
CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
|
| 171 |
how it applies customs and border law); Treasury Board collective agreements
|
| 172 |
(currently the FB / Border Services group); National Joint Council directives
|
| 173 |
+
(travel, relocation, isolated posts and more); and leading decisions of the
|
| 174 |
+
courts and federal tribunals: the Supreme Court, Federal Court of Appeal and
|
| 175 |
+
Federal Court, the Immigration and Refugee Board, and the FPSLREB and CIRB
|
| 176 |
+
labour boards. Use this for ANY question about that material. It ranks results by relevance and returns
|
| 177 |
their full text so the answer can cite the actual wording; an explicit section
|
| 178 |
reference (e.g. "section 34") is always surfaced. Each result is marked with its
|
| 179 |
source type.
|
|
|
|
| 211 |
blocks.append("")
|
| 212 |
blocks.append("---")
|
| 213 |
blocks.append("")
|
| 214 |
+
blocks.append(_format_section(c, index.related(c)))
|
| 215 |
return "\n".join(blocks)
|
| 216 |
|
| 217 |
|
|
|
|
| 245 |
return (f"Error: no section '{params.section}' found in '{params.act}'. "
|
| 246 |
f"Loaded Acts: {', '.join(acts) or 'none'}. Check the section number, "
|
| 247 |
f"or use canlex_search_legislation to locate the provision by topic.")
|
| 248 |
+
return GROUNDING_NOTE + "\n\n" + _format_section(section, index.related(section))
|
| 249 |
|
| 250 |
|
| 251 |
@mcp.tool(name="canlex_list_acts",
|
data/eval/questions.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{"query": "How soon must the Immigration Division review the detention of a foreign national?", "answers": [["IRPA", "57"]]},
|
| 3 |
+
{"query": "On what security grounds is a foreign national inadmissible to Canada?", "answers": [["IRPA", "34"]]},
|
| 4 |
+
{"query": "When is a permanent resident inadmissible for serious criminality?", "answers": [["IRPA", "36"]]},
|
| 5 |
+
{"query": "What makes a person inadmissible for organized criminality?", "answers": [["IRPA", "37"]]},
|
| 6 |
+
{"query": "Can someone be found inadmissible to Canada for misrepresentation?", "answers": [["IRPA", "40"]]},
|
| 7 |
+
{"query": "Is a foreign national inadmissible on health grounds?", "answers": [["IRPA", "38"]]},
|
| 8 |
+
{"query": "Inadmissibility for violating human or international rights", "answers": [["IRPA", "35"]]},
|
| 9 |
+
{"query": "Can a person be inadmissible to Canada for financial reasons?", "answers": [["IRPA", "39"]]},
|
| 10 |
+
{"query": "Is someone inadmissible because an accompanying family member is inadmissible?", "answers": [["IRPA", "42"]]},
|
| 11 |
+
{"query": "What humanitarian and compassionate relief can the Minister grant a foreign national?", "answers": [["IRPA", "25"], ["IRPA", "25.1"]]},
|
| 12 |
+
{"query": "When can an officer arrest and detain a foreign national without a warrant?", "answers": [["IRPA", "55"]]},
|
| 13 |
+
{"query": "Who prepares a report that a permanent resident is inadmissible?", "answers": [["IRPA", "44"]]},
|
| 14 |
+
{"query": "Must a person appear for an examination when seeking to enter Canada?", "answers": [["IRPA", "18"]]},
|
| 15 |
+
{"query": "What must a person establish to be allowed to enter Canada?", "answers": [["IRPA", "20"]]},
|
| 16 |
+
{"query": "What is the definition of a Convention refugee?", "answers": [["IRPA", "96"]]},
|
| 17 |
+
{"query": "Who qualifies as a person in need of protection?", "answers": [["IRPA", "97"]]},
|
| 18 |
+
{"query": "When is a refugee claim ineligible to be referred to the Refugee Protection Division?", "answers": [["IRPA", "101"]]},
|
| 19 |
+
{"query": "What is a pre-removal risk assessment and who can apply for one?", "answers": [["IRPA", "112"]]},
|
| 20 |
+
{"query": "When does a removal order become enforceable?", "answers": [["IRPA", "48"]]},
|
| 21 |
+
{"query": "Is it an offence to organize the illegal entry of people into Canada?", "answers": [["IRPA", "117"]]},
|
| 22 |
+
{"query": "Must a person report to a customs officer when arriving in Canada?", "answers": [["Customs Act", "11"]]},
|
| 23 |
+
{"query": "What is the duty to report goods imported into Canada?", "answers": [["Customs Act", "12"]]},
|
| 24 |
+
{"query": "Can a customs officer examine imported goods?", "answers": [["Customs Act", "99"]]},
|
| 25 |
+
{"query": "When can a customs officer search a person at the border?", "answers": [["Customs Act", "98"]]},
|
| 26 |
+
{"query": "What happens when goods are seized for a customs contravention?", "answers": [["Customs Act", "110"]]},
|
| 27 |
+
{"query": "What is ascertained forfeiture under the Customs Act?", "answers": [["Customs Act", "124"]]},
|
| 28 |
+
{"query": "When do imported goods become forfeit after a customs contravention?", "answers": [["Customs Act", "122"]]},
|
| 29 |
+
{"query": "How can a person appeal a customs seizure or penalty decision to the Federal Court?", "answers": [["Customs Act", "135"]]},
|
| 30 |
+
{"query": "Advance information about commercial goods before they arrive in Canada", "answers": [["Customs Act", "12.1"]]},
|
| 31 |
+
{"query": "How is the value for duty of imported goods determined?", "answers": [["Customs Act", "46"], ["Customs Act", "47"], ["Customs Act", "48"]]},
|
| 32 |
+
{"query": "How are imported goods classified under the Customs Tariff?", "answers": [["Customs Tariff", "10"]]},
|
| 33 |
+
{"query": "Must travellers report large amounts of currency when crossing the border?", "answers": [["PCMLTFA", "12"]]},
|
| 34 |
+
{"query": "Can an officer seize currency that was not reported at the border?", "answers": [["PCMLTFA", "18"]]},
|
| 35 |
+
{"query": "How does someone appeal the forfeiture of seized currency to the Federal Court?", "answers": [["PCMLTFA", "30"]]},
|
| 36 |
+
{"query": "Is simple possession of a controlled substance an offence?", "answers": [["CDSA", "4"]]},
|
| 37 |
+
{"query": "What is the offence of trafficking in a controlled substance?", "answers": [["CDSA", "5"]]},
|
| 38 |
+
{"query": "Is it an offence to import or export a controlled substance?", "answers": [["CDSA", "6"]]},
|
| 39 |
+
{"query": "Is possession of cannabis an offence?", "answers": [["Cannabis Act", "8"]]},
|
| 40 |
+
{"query": "Can cannabis be imported into or exported from Canada?", "answers": [["Cannabis Act", "11"]]},
|
| 41 |
+
{"query": "When can a peace officer arrest a person without a warrant?", "answers": [["Criminal Code", "495"]]},
|
| 42 |
+
{"query": "What right does a person have to access their own personal information held by a government institution?", "answers": [["Privacy Act", "12"]]},
|
| 43 |
+
{"query": "When may a government institution disclose someone's personal information?", "answers": [["Privacy Act", "8"]]},
|
| 44 |
+
{"query": "Can an employee refuse to do work that presents a danger?", "answers": [["Canada Labour Code", "128"]]},
|
| 45 |
+
{"query": "What are the standard hours of work for an employee?", "answers": [["Canada Labour Code", "169"]]},
|
| 46 |
+
{"query": "What is the standard of review of an administrative decision on judicial review?", "answers": [["Vavilov", ""]]},
|
| 47 |
+
{"query": "How does the Refugee Appeal Division review a decision of the Refugee Protection Division?", "answers": [["Huruglica", ""]]},
|
| 48 |
+
{"query": "To get back currency seized at the border, what must the claimant show about the money?", "answers": [["Sellathurai", ""]]}
|
| 49 |
+
]
|
requirements.txt
CHANGED
|
@@ -2,10 +2,10 @@
|
|
| 2 |
# py -m venv .venv
|
| 3 |
# .venv\Scripts\python.exe -m pip install -r requirements.txt
|
| 4 |
mcp>=1.2 # MCP server (server.py)
|
| 5 |
-
model2vec>=0.6 # local semantic embeddings (embed.py)
|
| 6 |
numpy>=2.0 # vector math for hybrid retrieval (index.py)
|
| 7 |
-
onnxruntime>=1.20 #
|
| 8 |
huggingface-hub>=0.20 # one-time model downloads (embed.py, rerank.py)
|
| 9 |
-
tokenizers>=0.20 #
|
| 10 |
beautifulsoup4>=4.12 # parse CBSA D-Memoranda HTML (dmemo.py)
|
| 11 |
pypdf>=4.0 # extract text from PDF-only D-Memoranda (dmemo.py)
|
|
|
|
|
|
| 2 |
# py -m venv .venv
|
| 3 |
# .venv\Scripts\python.exe -m pip install -r requirements.txt
|
| 4 |
mcp>=1.2 # MCP server (server.py)
|
|
|
|
| 5 |
numpy>=2.0 # vector math for hybrid retrieval (index.py)
|
| 6 |
+
onnxruntime>=1.20 # embedding + reranker model runtime (embed.py, rerank.py)
|
| 7 |
huggingface-hub>=0.20 # one-time model downloads (embed.py, rerank.py)
|
| 8 |
+
tokenizers>=0.20 # tokenization for the embedding and reranker models
|
| 9 |
beautifulsoup4>=4.12 # parse CBSA D-Memoranda HTML (dmemo.py)
|
| 10 |
pypdf>=4.0 # extract text from PDF-only D-Memoranda (dmemo.py)
|
| 11 |
+
snowballstemmer>=2.2 # English stemmer for keyword search (index.py)
|