Add IRPA delegation instruments, R. v. Carignan, and retrieval polish
New 'delegation' doc_type and canlex/delegation.py, ingesting two
instruments of delegation and designation under IRPA/IRPR: the CBSA
"Delegation of Authority and Designations of Officers" (HTML, 86 items)
and the IRCC IL3 "Instrument of Designation and Delegation" (PDF, 221
items) -- 307 chunks recording which officials may exercise which
ministerial powers. Wired through index.py (delegation items are
uncapped primary instruments) and server.py (rendering, the doc_type
filter, canlex_list_acts).
R. v. Carignan, 2025 SCC 43 -- a leading SCC decision on the
warrantless-arrest power in Criminal Code s. 495 and the Charter s. 9
right against arbitrary arrest (caselaw.py, +71 chunks).
Retrieval-polish batch: the diversity cap no longer collapses
agreements and directives; an idf-weighted title-match boost;
Act-over-regulation and agreement-back-matter fusion penalties;
dmemo.py now fills the 'part' title for every memo and embed.py uses
it; the eval set is broadened to 129 questions; a 25-test stdlib
unittest suite under tests/.
129-question eval: Hit@1 0.74, Hit@3 0.89, Hit@5 0.93, Hit@10 0.97,
MRR 0.82 -- flat against the pre-Carignan/-delegation baseline of
0.74 / 0.90 / 0.94 / 0.96 / 0.83; the 378 new chunks introduce no
meaningful regression.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- canlex/caselaw.py +10 -0
- canlex/delegation.py +356 -0
- canlex/dmemo.py +10 -3
- canlex/embed.py +8 -3
- canlex/index.py +76 -4
- canlex/server.py +32 -10
- data/eval/questions.json +36 -1
- data/processed/caselaw.json +0 -0
- data/processed/delegation.json +0 -0
- data/processed/dmemos.json +0 -0
- tests/__init__.py +1 -0
- tests/test_embed.py +33 -0
- tests/test_index.py +170 -0
- tests/test_synonyms.py +27 -0
|
@@ -152,6 +152,11 @@ CASES = [
|
|
| 152 |
{"court": "scc", "id": 2198, "short": "Martineau",
|
| 153 |
"topic": "Whether an ascertained-forfeiture proceeding under the Customs "
|
| 154 |
"Act is penal and engages the right against self-incrimination"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
# --- Federal Court of Appeal ---
|
| 157 |
{"court": "fca", "id": 143136, "short": "Huruglica",
|
|
@@ -217,6 +222,11 @@ CASES = [
|
|
| 217 |
{"court": "fca", "id": 31447, "short": "Rahaman",
|
| 218 |
"topic": "Refugee claims and the meaning of a 'no credible basis' "
|
| 219 |
"finding under IRPA"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
# --- Federal Court ---
|
| 222 |
{"court": "fc", "id": 64594, "short": "Goburdhun",
|
|
|
|
| 152 |
{"court": "scc", "id": 2198, "short": "Martineau",
|
| 153 |
"topic": "Whether an ascertained-forfeiture proceeding under the Customs "
|
| 154 |
"Act is penal and engages the right against self-incrimination"},
|
| 155 |
+
{"court": "scc", "id": 21317, "short": "Carignan",
|
| 156 |
+
"topic": "Powers of arrest without warrant under Criminal Code s. 495; an "
|
| 157 |
+
"arrest that contravenes the s. 495(2) limits can ground a breach "
|
| 158 |
+
"of the Charter s. 9 right against arbitrary arrest, despite "
|
| 159 |
+
"s. 495(3)"},
|
| 160 |
|
| 161 |
# --- Federal Court of Appeal ---
|
| 162 |
{"court": "fca", "id": 143136, "short": "Huruglica",
|
|
|
|
| 222 |
{"court": "fca", "id": 31447, "short": "Rahaman",
|
| 223 |
"topic": "Refugee claims and the meaning of a 'no credible basis' "
|
| 224 |
"finding under IRPA"},
|
| 225 |
+
{"court": "fca", "id": 521840, "short": "Rodriguez Anzola",
|
| 226 |
+
"topic": "Criminal inadmissibility under IRPA s. 36(1)(b) for a "
|
| 227 |
+
"conviction abroad; whether an immigration decision-maker may "
|
| 228 |
+
"consider that the defence of duress was effectively unavailable "
|
| 229 |
+
"in the foreign jurisdiction"},
|
| 230 |
|
| 231 |
# --- Federal Court ---
|
| 232 |
{"court": "fc", "id": 64594, "short": "Goburdhun",
|
|
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ingest instruments of delegation and designation under IRPA / IRPR.
|
| 2 |
+
|
| 3 |
+
These instruments record which officer positions the Minister has delegated
|
| 4 |
+
powers to, or designated for functions, under the Immigration and Refugee
|
| 5 |
+
Protection Act and its Regulations. They are administrative instruments -- not
|
| 6 |
+
enacted law, and not guidance -- so every chunk is tagged doc_type="delegation".
|
| 7 |
+
|
| 8 |
+
Sources:
|
| 9 |
+
- the CBSA "Delegation of Authority and Designations of Officers ..."
|
| 10 |
+
instrument (HTML, cbsa-asfc.gc.ca);
|
| 11 |
+
- the IRCC "IL3 -- Instrument of Designation and Delegation" (PDF, canada.ca).
|
| 12 |
+
|
| 13 |
+
py -m canlex.delegation
|
| 14 |
+
"""
|
| 15 |
+
import io
|
| 16 |
+
import json
|
| 17 |
+
import re
|
| 18 |
+
import subprocess
|
| 19 |
+
import time
|
| 20 |
+
import urllib.request
|
| 21 |
+
|
| 22 |
+
from bs4 import BeautifulSoup
|
| 23 |
+
from pypdf import PdfReader
|
| 24 |
+
|
| 25 |
+
from .config import PROCESSED_DIR, RAW_DIR
|
| 26 |
+
|
| 27 |
+
RAW = RAW_DIR / "delegation"
|
| 28 |
+
OUT = PROCESSED_DIR / "delegation.json"
|
| 29 |
+
|
| 30 |
+
# cbsa-asfc.gc.ca serves an ordinary client fine with a browser User-Agent;
|
| 31 |
+
# canada.ca (the IRCC PDF) blocks Python's HTTP client at the TLS layer, so that
|
| 32 |
+
# one is fetched via PowerShell's (.NET) HTTP stack, as agreement.py does.
|
| 33 |
+
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
| 34 |
+
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
|
| 35 |
+
|
| 36 |
+
SOURCES = {
|
| 37 |
+
"cbsa": {
|
| 38 |
+
"code": "cbsa",
|
| 39 |
+
"kind": "html-cbsa",
|
| 40 |
+
"act_code": "CBSA-IRPA-DELEG",
|
| 41 |
+
"act_short": "CBSA Delegation",
|
| 42 |
+
"act_name": ("Delegation of Authority and Designations of Officers by "
|
| 43 |
+
"the Minister of Public Safety and Emergency Preparedness "
|
| 44 |
+
"under the Immigration and Refugee Protection Act and the "
|
| 45 |
+
"Immigration and Refugee Protection Regulations"),
|
| 46 |
+
"url": ("https://www.cbsa-asfc.gc.ca/agency-agence/actreg-loireg/"
|
| 47 |
+
"delegation/irpa-lipr-2016-07-eng.html"),
|
| 48 |
+
},
|
| 49 |
+
"ircc": {
|
| 50 |
+
"code": "ircc",
|
| 51 |
+
"kind": "pdf-ircc",
|
| 52 |
+
"act_code": "IRCC-IL3-DELEG",
|
| 53 |
+
"act_short": "IRCC IL3",
|
| 54 |
+
"act_name": ("IL3 — Instrument of Designation and Delegation, "
|
| 55 |
+
"Immigration and Refugee Protection Act and Regulations"),
|
| 56 |
+
"url": ("https://www.canada.ca/content/dam/ircc/migration/ircc/english/"
|
| 57 |
+
"resources/manuals/il/il3-eng.pdf"),
|
| 58 |
+
},
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _norm(text):
    """Return *text* with every run of whitespace squeezed to one space and
    the ends trimmed. Non-breaking spaces (U+00A0), which these sources use
    heavily, count as whitespace; a falsy value such as None gives ''."""
    if not text:
        return ""
    spaced = text.replace("\xa0", " ")
    return re.sub(r"\s+", " ", spaced).strip()
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _normalize_refs(text):
    """Spell out the instruments' provision shorthand so that section numbers
    become searchable tokens of their own: a word-initial 'A' directly before
    a digit becomes 'IRPA ' ('A55(1)' -> 'IRPA 55(1)') and a word-initial 'R'
    becomes 'IRPR ' ('R39' -> 'IRPR 39'). The result is whitespace-normalised
    with _norm()."""
    for letter, name in (("A", "IRPA"), ("R", "IRPR")):
        # The lookahead keeps the digit in place; only the letter is replaced.
        text = re.sub(rf"\b{letter}(?=\d)", f"{name} ", text)
    return _norm(text)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _fetch(url, dest, powershell=False):
    """Return the bytes at *url*, caching them in the file *dest*.

    If *dest* already exists its contents are returned without any network
    access. Otherwise the resource is downloaded into *dest* (parent
    directories are created as needed), followed by a short pause, and the
    saved bytes are returned.

    url        -- address of the page or file to download.
    dest       -- pathlib.Path of the cache file.
    powershell -- when true, download through PowerShell's Invoke-WebRequest
                  instead of urllib (used for canada.ca, which blocks
                  Python's HTTP client).

    Raises whatever the transport raises: urllib errors on the default path;
    subprocess.CalledProcessError / subprocess.TimeoutExpired (or OSError if
    PowerShell cannot be started) on the PowerShell path."""
    if dest.exists():
        return dest.read_bytes()
    dest.parent.mkdir(parents=True, exist_ok=True)
    if powershell:
        def ps_quote(value):
            # Inside a PowerShell single-quoted string the only character
            # needing escape is the quote itself, written doubled. Without
            # this, a path or URL containing ' would break the command.
            return "'" + str(value).replace("'", "''") + "'"

        command = (f"Invoke-WebRequest -Uri {ps_quote(url)} "
                   f"-OutFile {ps_quote(dest)} "
                   f"-UseBasicParsing -UserAgent {ps_quote(_UA)}")
        try:
            subprocess.run(
                ["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
                check=True, capture_output=True, timeout=180)
        except (subprocess.SubprocessError, OSError):
            # A failed or timed-out transfer can leave a truncated file
            # behind; remove it so the next run does not mistake it for a
            # valid cache entry.
            if dest.exists():
                dest.unlink()
            raise
    else:
        req = urllib.request.Request(url, headers={"User-Agent": _UA})
        with urllib.request.urlopen(req, timeout=60) as resp:
            payload = resp.read()
        # Written only after the whole response has been read, so a dropped
        # connection never produces a partial cache file.
        dest.write_bytes(payload)
    time.sleep(0.5)  # be polite to the server
    return dest.read_bytes()
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# --- CBSA instrument (HTML) ---------------------------------------------------
|
| 98 |
+
|
| 99 |
+
def _delegates(cell):
    """Render a 'Delegates / Designated officials' table cell as plain lines.

    Only the cell's direct <p> and <ul> children are read. A <p> whose class
    contains 'h4' names an organisation (trailing colons/spaces stripped), one
    whose class contains 'h5' names a sub-heading under it, and any other
    non-empty <p> is kept verbatim as a note. Each <ul> becomes one line
    'org — sub: title; title; ...' (the label is dropped when there is none).
    An organisation that never receives a list is emitted on a line by itself.
    Returns the lines joined with newlines."""
    out = []
    organisation = subheading = ""
    listed = False  # has the current organisation produced a position line?
    for node in cell.find_all(["p", "ul"], recursive=False):
        if node.name != "p":
            # A <ul>: collect its direct <li> texts, skipping empty ones.
            titles = [_norm(li.get_text())
                      for li in node.find_all("li", recursive=False)]
            joined = "; ".join(filter(None, titles))
            if not joined:
                continue
            if subheading:
                prefix = f"{organisation} — {subheading}"
            else:
                prefix = organisation
            out.append(f"{prefix}: {joined}" if prefix else joined)
            listed = True
            continue

        css = " ".join(node.get("class") or [])
        content = _norm(node.get_text())
        if not content:
            continue
        if "h4" in css:
            # Flush the previous organisation if it had no list of its own.
            if organisation and not listed:
                out.append(organisation)
            organisation = content.rstrip(": ")
            subheading, listed = "", False
        elif "h5" in css:
            subheading = content
        else:
            out.append(content)  # a free-standing note

    if organisation and not listed:  # trailing organisation with no list
        out.append(organisation)
    return "\n".join(out)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def parse_cbsa(html, src):
    """Turn the CBSA delegation instrument page into chunk dicts.

    html -- the page markup handed to BeautifulSoup.
    src  -- the instrument's SOURCES entry (code, act_code, act_short,
            act_name and url are read).

    Returns a list holding one preamble chunk (when the paragraphs before the
    <h2 id="sch"> Schedule heading are non-empty) followed by one chunk per
    four-cell table row. Returns [] when the page has no <main> element."""
    main = BeautifulSoup(html, "html.parser").find("main")
    if main is None:
        return []
    for marker in main.find_all("sup"):  # drop footnote-reference superscripts
        marker.decompose()

    stamp = main.find("time")
    if stamp:
        date = _norm(stamp.get("datetime") or stamp.get_text())
    else:
        date = ""

    def record(suffix, note, part, heading, text, citation):
        # Every chunk shares this shape; the key order is kept fixed because
        # it is what ends up serialised.
        return {
            "id": f"delegation-{src['code']}-{suffix}",
            "doc_type": "delegation",
            "act_code": src["act_code"],
            "act_short": src["act_short"],
            "act_name": src["act_name"],
            "section": "",
            "marginal_note": note,
            "part": part,
            "division": "",
            "heading": heading,
            "text": text,
            "history": "",
            "last_amended": "",
            "current_to": date,
            "citation": citation,
            "source_url": src["url"],
        }

    chunks = []

    # Preamble: the sibling paragraphs ahead of the Schedule heading, which
    # explain the tiers of delegation and how the table columns are read.
    anchor = main.find("h2", id="sch")
    if anchor:
        # reversed() puts the preceding siblings back into reading order.
        intro_paras = (_norm(p.get_text())
                       for p in reversed(anchor.find_previous_siblings("p")))
        intro = "\n".join(filter(None, intro_paras))
        if intro:
            chunks.append(record(
                "preamble",
                "Tiers of delegation and designation",
                "",
                "Instrument of delegation and designation under IRPA",
                intro,
                f"{src['act_short']} — Preamble"))

    # Schedule: each bordered table sits under a topical <h3>; its rows are
    # Item | Act/Regulations reference | Description of power | Delegates.
    for table in main.find_all("table", class_="table-bordered"):
        title = table.find_previous_sibling("h3")
        topic = _norm(title.get_text()) if title else ""
        for row in table.find_all("tr"):
            cols = row.find_all("td", recursive=False)
            if len(cols) != 4:
                continue  # the header row (<th>) or a stray row
            no_cell, ref_cell, power_cell, who_cell = cols
            item_no = _norm(no_cell.get_text()).rstrip(".")
            refs = _normalize_refs(_norm(ref_cell.get_text()))
            power = " ".join(_norm(p.get_text())
                             for p in power_cell.find_all("p"))
            if not power:
                # No <p> children: fall back to the cell's whole text.
                power = _norm(power_cell.get_text())
            delegates = _delegates(who_cell)
            if not item_no or not (power or refs):
                continue

            sections = [power]
            if refs:
                sections.append(f"Provisions (IRPA / IRPR): {refs}.")
            if delegates:
                sections.append("Delegated / designated to:\n" + delegates)
            chunks.append(record(
                item_no,
                refs or f"Item {item_no}",
                topic,
                topic,
                "\n\n".join(sections),
                f"{src['act_short']}, Item {item_no}"))
    return chunks
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
# --- IRCC IL3 instrument (PDF) ------------------------------------------------
|
| 219 |
+
|
| 220 |
+
# A topical part heading -- a line in full upper case (MINISTERIAL INSTRUCTIONS,
|
| 221 |
+
# MEDICAL, MISCELLANEOUS). Organisation acronyms (CI, CBSA, RCMP) are shorter
|
| 222 |
+
# than the 6-character floor and so are not mistaken for headings.
|
| 223 |
+
_IL3_PART = re.compile(r"[A-Z][A-Z &/,()'.\-]{5,}")
|
| 224 |
+
# An item opens "<n>. <A/R provision>" -- the number must be followed by a
|
| 225 |
+
# provision reference, which rejects ordinary numbered prose.
|
| 226 |
+
_IL3_ITEM = re.compile(r"(\d+)\.\s+(?=[AR]\d)")
|
| 227 |
+
# The word that introduces an item's power description, after the provisions.
|
| 228 |
+
_IL3_POWER = re.compile(r"(?:Delegation|Designation)\s*[-–—]\s*")
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def _pdf_pages(pdf_bytes):
    """Return the extracted text of every page of the PDF, in order.

    When a page's first line consists only of digits and/or roman-numeral
    letters (the printed page number pypdf emits first), that line is
    removed. A page with no extractable text yields an empty string."""
    page_number = re.compile(r"\s*[ivxlcdm\d]+\s*", re.I)
    texts = []
    for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
        raw = page.extract_text() or ""
        first, _, remainder = raw.partition("\n")
        texts.append(remainder if page_number.fullmatch(first) else raw)
    return texts
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def parse_ircc(pdf_bytes, src):
    """Turn the IRCC IL3 PDF into chunk dicts: a preamble chunk taken from the
    second page (when it has text) and one chunk per Schedule item.

    The PDF's table arrives as a flat stream of lines, so an item is every
    line from one "<n>. <A/R provision>" opener up to the next. The version
    label ('Spring 2024' style) is searched for on the first page and stored
    as current_to. Returns [] when the PDF yields no pages.

    pdf_bytes -- raw bytes of the PDF.
    src       -- the instrument's SOURCES entry (code, act_code, act_short,
                 act_name and url are read)."""
    pages = _pdf_pages(pdf_bytes)
    if not pages:
        return []
    season = re.search(r"(?:Spring|Summer|Fall|Winter)\s+\d{4}", pages[0])
    version = season.group(0) if season else ""

    def record(suffix, note, part, heading, text, citation):
        # Shared chunk shape; the key order is kept fixed because it is what
        # ends up serialised.
        return {
            "id": f"delegation-{src['code']}-{suffix}",
            "doc_type": "delegation",
            "act_code": src["act_code"],
            "act_short": src["act_short"],
            "act_name": src["act_name"],
            "section": "",
            "marginal_note": note,
            "part": part,
            "division": "",
            "heading": heading,
            "text": text,
            "history": "",
            "last_amended": "",
            "current_to": version,
            "citation": citation,
            "source_url": src["url"],
        }

    chunks = []
    preamble = _norm(pages[1]) if len(pages) > 1 else ""
    if preamble:
        chunks.append(record(
            "preamble",
            "Preamble — designation and delegation under IRPA s. 6",
            "",
            "Instrument of designation and delegation under IRPA",
            preamble,
            f"{src['act_short']} — Preamble"))

    # Scan all lines in page order. An all-caps line updates the current
    # topical part; an item opener starts a new (part, number, lines) record;
    # any other line extends the open record. Lines ahead of the first item
    # (preamble, definitions, table of contents) are ignored.
    found = []
    heading = ""
    for page in pages:
        for raw in page.split("\n"):
            line = _norm(raw)
            if not line:
                continue
            if _IL3_PART.fullmatch(line):
                heading = line.title()
                continue
            opener = _IL3_ITEM.match(line)
            if opener:
                found.append((heading, opener.group(1), [line]))
            elif found:
                found[-1][2].append(line)

    for seq, (item_part, item_no, lines) in enumerate(found, start=1):
        body = re.sub(r"^\d+\.\s*", "", "\n".join(lines)).strip()
        if not body:
            continue
        # Everything before the 'Delegation -' / 'Designation -' marker is the
        # item's list of provisions.
        marker = _IL3_POWER.search(body)
        refs = _normalize_refs(body[:marker.start()]) if marker else ""
        if item_part:
            citation = f"{src['act_short']} — {item_part}, Item {item_no}"
        else:
            citation = f"{src['act_short']}, Item {item_no}"
        chunks.append(record(
            seq,
            refs or f"Item {item_no}",
            item_part or "",
            item_part or "",
            _normalize_refs(body),
            citation))
    return chunks
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def build():
    """Fetch, parse and chunk every delegation instrument into delegation.json.

    Each entry of SOURCES is fetched (and cached) and parsed according to its
    'kind'. A source that raises is reported on stdout and skipped, so one
    unreachable site does not block the other instrument. The combined chunks
    are written to OUT as UTF-8 JSON, and the closing summary counts only the
    instruments that were actually ingested."""
    all_chunks = []
    ingested = 0  # sources that were fetched and parsed without raising
    for src in SOURCES.values():
        print(f"Ingesting {src['act_short']} ...")
        try:
            if src["kind"] == "html-cbsa":
                html = _fetch(src["url"], RAW / f"{src['code']}.html")
                chunks = parse_cbsa(html, src)
            elif src["kind"] == "pdf-ircc":
                pdf = _fetch(src["url"], RAW / f"{src['code']}.pdf",
                             powershell=True)
                chunks = parse_ircc(pdf, src)
            else:
                chunks = []
        except Exception as exc:
            # Deliberate best-effort boundary: report the failure and carry on
            # with the remaining instruments.
            print(f" !! {src['act_short']}: {type(exc).__name__}: {exc}")
            continue
        all_chunks.extend(chunks)
        ingested += 1
        print(f" {len(chunks)} chunks")
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
                   encoding="utf-8")
    # Report the instruments that contributed, not the number configured, so
    # a failed source is not silently counted as ingested.
    print(f"\n{len(all_chunks)} delegation chunks from {ingested} "
          f"instrument(s) -> {OUT}")
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
if __name__ == "__main__":
|
| 356 |
+
build()
|
|
@@ -88,8 +88,15 @@ def parse_memo(html, url):
|
|
| 88 |
number = match.group(1).upper() if match else url
|
| 89 |
h1 = main.find("h1")
|
| 90 |
topic = ""
|
| 91 |
-
if h1
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
dm = main.find("time", attrs={"property": "dateModified"})
|
| 94 |
date = _norm(dm.get("datetime") or dm.get_text()) if dm else ""
|
| 95 |
|
|
@@ -191,7 +198,7 @@ def parse_pdf_memo(html, url):
|
|
| 191 |
"act_name": "CBSA D-Memoranda",
|
| 192 |
"section": number,
|
| 193 |
"marginal_note": label,
|
| 194 |
-
"part":
|
| 195 |
"division": "",
|
| 196 |
"heading": "",
|
| 197 |
"text": part,
|
|
|
|
| 88 |
number = match.group(1).upper() if match else url
|
| 89 |
h1 = main.find("h1")
|
| 90 |
topic = ""
|
| 91 |
+
if h1:
|
| 92 |
+
# Pages vary: most carry the memo title in <h1><small>, others as plain
|
| 93 |
+
# "Memorandum DNN-N-N: Title" h1 text. Use the <small> if present, else
|
| 94 |
+
# the h1 text, and strip any leading memo-number prefix either way.
|
| 95 |
+
small = h1.find("small")
|
| 96 |
+
raw = (small.get_text(" ", strip=True) if small
|
| 97 |
+
else h1.get_text(" ", strip=True))
|
| 98 |
+
topic = re.sub(r"^Memorandum\s+D[\w-]+\s*[:–-]\s*", "",
|
| 99 |
+
_norm(raw), flags=re.I)
|
| 100 |
dm = main.find("time", attrs={"property": "dateModified"})
|
| 101 |
date = _norm(dm.get("datetime") or dm.get_text()) if dm else ""
|
| 102 |
|
|
|
|
| 198 |
"act_name": "CBSA D-Memoranda",
|
| 199 |
"section": number,
|
| 200 |
"marginal_note": label,
|
| 201 |
+
"part": topic,
|
| 202 |
"division": "",
|
| 203 |
"heading": "",
|
| 204 |
"text": part,
|
|
@@ -32,10 +32,15 @@ def load_chunks():
|
|
| 32 |
|
| 33 |
def embed_text(chunk):
|
| 34 |
"""Compact, retrieval-focused representation of one section."""
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
body = chunk["text"][:_MAX_BODY]
|
| 37 |
-
# The marginal note (section title) is the strongest topical signal, so it
|
| 38 |
-
# is repeated to emphasise it.
|
| 39 |
parts = [chunk["act_short"], note, note, chunk["heading"], body]
|
| 40 |
return " . ".join(p for p in parts if p)
|
| 41 |
|
|
|
|
| 32 |
|
| 33 |
def embed_text(chunk):
    """Build the compact string that is embedded for one chunk.

    The pieces -- act short name, title, title again, heading, and the body
    truncated to _MAX_BODY characters -- are joined with ' . ', skipping empty
    ones. The title appears twice to emphasise it as the strongest topical
    signal. It is normally the marginal note; for a D-memorandum, whose
    marginal note is only a generic label ('Guidelines', 'Legislation'), the
    memo's subject in 'part' is preferred when present."""
    is_memo = chunk.get("doc_type") == "memorandum"
    # For a memo with a non-empty 'part', the marginal note is never consulted.
    title = (chunk.get("part") if is_memo else "") or chunk["marginal_note"]
    excerpt = chunk["text"][:_MAX_BODY]
    fields = (chunk["act_short"], title, title, chunk["heading"], excerpt)
    return " . ".join(filter(None, fields))
|
| 46 |
|
|
@@ -16,7 +16,21 @@ RRF_K = 60 # reciprocal-rank-fusion damping constant
|
|
| 16 |
W_SEM = 2.0 # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
|
| 17 |
CANDIDATES = 80 # hits each retriever contributes to the fusion
|
| 18 |
RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
_TOKEN = re.compile(r"[a-z0-9]+")
|
| 22 |
_SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
|
|
@@ -102,6 +116,7 @@ class LegislationIndex:
|
|
| 102 |
raise RuntimeError(
|
| 103 |
f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
|
| 104 |
self._build_bm25()
|
|
|
|
| 105 |
self._build_xref()
|
| 106 |
self._load_semantic()
|
| 107 |
self._load_reranker()
|
|
@@ -127,6 +142,32 @@ class LegislationIndex:
|
|
| 127 |
self.avgdl = sum(self.doc_len) / n
|
| 128 |
self.idf = {t: math.log(1 + (n - d + 0.5) / (d + 0.5)) for t, d in df.items()}
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
def _load_semantic(self):
|
| 131 |
"""Load precomputed embeddings and the query embedder.
|
| 132 |
|
|
@@ -202,15 +243,16 @@ class LegislationIndex:
|
|
| 202 |
|
| 203 |
def _source_key(self, idx):
|
| 204 |
"""The parent document a chunk belongs to, for diversity capping. Returns
|
| 205 |
-
None for
|
|
|
|
| 206 |
capped; case law is keyed by citation, memoranda by memo number."""
|
| 207 |
c = self.chunks[idx]
|
| 208 |
doc_type = c.get("doc_type", "legislation")
|
| 209 |
-
if doc_type
|
| 210 |
return None
|
| 211 |
if doc_type == "memorandum":
|
| 212 |
return ("memorandum", c["section"]) # act_code is a shared constant
|
| 213 |
-
return (doc_type, c["act_code"]) #
|
| 214 |
|
| 215 |
def _diversify(self, ordered):
|
| 216 |
"""Reorder so no single case, memorandum, agreement or directive can
|
|
@@ -299,6 +341,36 @@ class LegislationIndex:
|
|
| 299 |
if c["section"] in refs and idx not in fused:
|
| 300 |
fused[idx] = 0.0
|
| 301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
def allowed(idx):
|
| 303 |
c = self.chunks[idx]
|
| 304 |
if act and act.lower() not in (c["act_short"].lower(), c["act_code"].lower()):
|
|
|
|
| 16 |
W_SEM = 2.0 # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
|
| 17 |
CANDIDATES = 80 # hits each retriever contributes to the fusion
|
| 18 |
RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
|
| 19 |
+
MN_WEIGHT = 0.0024 # title-match boost per unit of idf-weighted overlap between
|
| 20 |
+
# the query and a candidate's marginal note (section title)
|
| 21 |
+
MN_CAP = 0.012 # ceiling on the title-match boost -- it nudges the ranking
|
| 22 |
+
# without overriding a strong base score
|
| 23 |
+
REG_PENALTY = 0.008 # small fusion penalty on regulation sections, so the Act
|
| 24 |
+
# that creates a duty outranks the regulation elaborating it
|
| 25 |
+
BACKMATTER_PENALTY = 0.008 # likewise for a collective agreement's back-matter
|
| 26 |
+
# (memoranda, letters of understanding) vs its numbered articles
|
| 27 |
+
SOURCE_CAP = 2 # max chunks one case or memorandum may contribute
|
| 28 |
+
|
| 29 |
+
# Primary instruments -- enacted law, collective agreements, the NJC directives
|
| 30 |
+
# incorporated into them, and the IRPA delegation instruments. Their sections or
|
| 31 |
+
# items are distinct provisions, so (like legislation) they are never collapsed
|
| 32 |
+
# under the diversity cap.
|
| 33 |
+
PRIMARY_DOC_TYPES = frozenset({"legislation", "agreement", "directive", "delegation"})
|
| 34 |
|
| 35 |
_TOKEN = re.compile(r"[a-z0-9]+")
|
| 36 |
_SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
|
|
|
|
| 116 |
raise RuntimeError(
|
| 117 |
f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
|
| 118 |
self._build_bm25()
|
| 119 |
+
self._build_note_tokens()
|
| 120 |
self._build_xref()
|
| 121 |
self._load_semantic()
|
| 122 |
self._load_reranker()
|
|
|
|
| 142 |
self.avgdl = sum(self.doc_len) / n
|
| 143 |
self.idf = {t: math.log(1 + (n - d + 0.5) / (d + 0.5)) for t, d in df.items()}
|
| 144 |
|
| 145 |
+
def _build_note_tokens(self):
    """Precompute three per-chunk lists, index-aligned with self.chunks.

    _note_tokens   -- the set of tokens in the chunk's topical title. That is
                      the marginal note, except for a D-memorandum, where the
                      memo subject in 'part' is used when present because the
                      memo's marginal note is generic.
    _is_regulation -- True for a legislation chunk (doc_type absent counts as
                      legislation) whose act code starts with 'SOR' or
                      'C.R.C'.
    _is_backmatter -- True for an agreement chunk whose section does not begin
                      with a digit (memoranda and letters rather than numbered
                      articles)."""
    notes = self._note_tokens = []
    regulations = self._is_regulation = []
    backmatter = self._is_backmatter = []
    for chunk in self.chunks:
        kind = chunk.get("doc_type")
        if kind == "memorandum":
            title = chunk.get("part") or chunk["marginal_note"]
        else:
            title = chunk["marginal_note"]
        notes.append(set(tokenize(title)))

        is_law = chunk.get("doc_type", "legislation") == "legislation"
        regulations.append(
            is_law and chunk["act_code"].startswith(("SOR", "C.R.C")))

        numbered = kind == "agreement" and str(chunk["section"])[:1].isdigit()
        backmatter.append(kind == "agreement" and not numbered)
|
| 170 |
+
|
| 171 |
def _load_semantic(self):
|
| 172 |
"""Load precomputed embeddings and the query embedder.
|
| 173 |
|
|
|
|
| 243 |
|
| 244 |
def _source_key(self, idx):
    """Identify the parent document of chunk *idx* for diversity capping.

    Returns None for any doc_type in PRIMARY_DOC_TYPES (legislation,
    agreements, directives and delegation instruments), whose sections or
    items are distinct provisions and are never capped. A memorandum is keyed
    by its memo number (its act_code is a shared constant); anything else --
    case law -- is keyed by doc_type and act_code, i.e. one decision per
    citation. A chunk with no doc_type counts as legislation."""
    chunk = self.chunks[idx]
    kind = chunk.get("doc_type", "legislation")
    if kind in PRIMARY_DOC_TYPES:
        key = None
    elif kind == "memorandum":
        key = ("memorandum", chunk["section"])
    else:
        key = (kind, chunk["act_code"])  # one decision, keyed by citation
    return key
|
| 256 |
|
| 257 |
def _diversify(self, ordered):
|
| 258 |
"""Reorder so no single case, memorandum, agreement or directive can
|
|
|
|
| 341 |
if c["section"] in refs and idx not in fused:
|
| 342 |
fused[idx] = 0.0
|
| 343 |
|
| 344 |
+
# Title-match boost: the marginal note is a section's canonical subject.
|
| 345 |
+
# Reward a candidate by how completely and how specifically the query
|
| 346 |
+
# matches its marginal note. The overlap is idf-weighted (matching a
|
| 347 |
+
# distinctive title like "hours of work" counts far more than a generic
|
| 348 |
+
# one like "Decision"), scaled by coverage, and capped -- so it nudges
|
| 349 |
+
# ranking toward the provision a question names by topic without
|
| 350 |
+
# overriding a strong base score.
|
| 351 |
+
q_tokens = set(tokenize(expanded))
|
| 352 |
+
for idx in list(fused):
|
| 353 |
+
note_tokens = self._note_tokens[idx]
|
| 354 |
+
total = sum(self.idf.get(t, 0.0) for t in note_tokens)
|
| 355 |
+
if total <= 0:
|
| 356 |
+
continue
|
| 357 |
+
matched = sum(self.idf.get(t, 0.0)
|
| 358 |
+
for t in note_tokens if t in q_tokens)
|
| 359 |
+
if matched > 0:
|
| 360 |
+
fused[idx] += min(MN_WEIGHT * matched * matched / total, MN_CAP)
|
| 361 |
+
|
| 362 |
+
# Hierarchy penalties: a topical question should surface the governing
|
| 363 |
+
# provision, not the supplementary material around it. An Act creates a
|
| 364 |
+
# duty while a regulation only elaborates procedure; a collective
|
| 365 |
+
# agreement's numbered articles are its substance while its memoranda
|
| 366 |
+
# and letters of understanding are back-matter. Both take a small
|
| 367 |
+
# fusion penalty so the governing provision wins a close contest.
|
| 368 |
+
for idx in list(fused):
|
| 369 |
+
if self._is_regulation[idx]:
|
| 370 |
+
fused[idx] -= REG_PENALTY
|
| 371 |
+
elif self._is_backmatter[idx]:
|
| 372 |
+
fused[idx] -= BACKMATTER_PENALTY
|
| 373 |
+
|
| 374 |
def allowed(idx):
|
| 375 |
c = self.chunks[idx]
|
| 376 |
if act and act.lower() not in (c["act_short"].lower(), c["act_code"].lower()):
|
|
@@ -110,6 +110,12 @@ def _format_section(c: dict, related=None) -> str:
|
|
| 110 |
lines.append(f"(decided {c['current_to'] or 'n/a'})")
|
| 111 |
if c["heading"]:
|
| 112 |
lines.append(f"Subject: {c['heading']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
else:
|
| 114 |
meta = [f"in force; text current to {c['current_to'] or 'n/a'}"]
|
| 115 |
if c["last_amended"]:
|
|
@@ -176,8 +182,9 @@ class SearchInput(BaseModel):
|
|
| 176 |
default=None,
|
| 177 |
description="Optional filter by source type: 'legislation' (Acts and "
|
| 178 |
"regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
|
| 179 |
-
"agreements), 'directive' (NJC directives),
|
| 180 |
-
"tribunal decisions)
|
|
|
|
| 181 |
)
|
| 182 |
|
| 183 |
|
|
@@ -197,15 +204,16 @@ def canlex_search_legislation(params: SearchInput) -> str:
|
|
| 197 |
"""Search Canadian federal law, CBSA D-Memoranda, agreements, NJC directives,
|
| 198 |
and leading court decisions.
|
| 199 |
|
| 200 |
-
The CanLex corpus has
|
| 201 |
(immigration, customs, criminal, drugs, food/health, labour, privacy and more);
|
| 202 |
CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
|
| 203 |
how it applies customs and border law); Treasury Board collective agreements
|
| 204 |
(currently the FB / Border Services group); National Joint Council directives
|
| 205 |
-
(travel, relocation, isolated posts and more);
|
| 206 |
courts and federal tribunals: the Supreme Court, Federal Court of Appeal and
|
| 207 |
Federal Court, the Immigration and Refugee Board, and the FPSLREB and CIRB
|
| 208 |
-
labour boards
|
|
|
|
| 209 |
their full text so the answer can cite the actual wording; an explicit section
|
| 210 |
reference (e.g. "section 34") is always surfaced. Each result is marked with its
|
| 211 |
source type.
|
|
@@ -216,7 +224,8 @@ def canlex_search_legislation(params: SearchInput) -> str:
|
|
| 216 |
- top_k (int): How many sections to return, 1-20 (default 6).
|
| 217 |
- act (Optional[str]): Restrict to one Act by short name/code, or omit for all.
|
| 218 |
- doc_type (Optional[str]): 'legislation', 'memorandum', 'agreement',
|
| 219 |
-
'directive', or '
|
|
|
|
| 220 |
|
| 221 |
Returns:
|
| 222 |
str: Markdown with answering instructions followed by the matching sections.
|
|
@@ -288,7 +297,8 @@ def canlex_get_section(params: GetSectionInput) -> str:
|
|
| 288 |
annotations={"title": "List Loaded Legislation", **_READONLY})
|
| 289 |
def canlex_list_acts() -> str:
|
| 290 |
"""List what the CanLex corpus contains -- Acts and regulations, CBSA
|
| 291 |
-
D-Memoranda, collective agreements, NJC directives,
|
|
|
|
| 292 |
|
| 293 |
Use this to learn the scope and currency of the corpus before searching, or to
|
| 294 |
report it to the user.
|
|
@@ -301,6 +311,7 @@ def canlex_list_acts() -> str:
|
|
| 301 |
agreements: dict[str, dict] = {}
|
| 302 |
directives: dict[str, dict] = {}
|
| 303 |
cases: dict[str, dict] = {}
|
|
|
|
| 304 |
memo_numbers: set[str] = set()
|
| 305 |
memo_chunks = 0
|
| 306 |
memo_date = ""
|
|
@@ -326,6 +337,12 @@ def canlex_list_acts() -> str:
|
|
| 326 |
"name": c["act_name"], "decided": c["current_to"], "count": 0,
|
| 327 |
})
|
| 328 |
entry["count"] += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
else:
|
| 330 |
entry = acts.setdefault(c["act_code"], {
|
| 331 |
"short": c["act_short"], "name": c["act_name"],
|
|
@@ -357,10 +374,15 @@ def canlex_list_acts() -> str:
|
|
| 357 |
for cite, a in sorted(cases.items(), key=lambda kv: kv[1]["decided"]):
|
| 358 |
lines.append(f"- **{a['name']}**, {cite}: {a['count']} excerpts, "
|
| 359 |
f"decided {a['decided'] or 'n/a'}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
lines += ["", "Search with canlex_search_legislation; filter by doc_type "
|
| 361 |
-
"(legislation / memorandum / agreement / directive / caselaw
|
| 362 |
-
"a known provision with canlex_get_section, or
|
| 363 |
-
"with canlex_case."]
|
| 364 |
return "\n".join(lines)
|
| 365 |
|
| 366 |
|
|
|
|
| 110 |
lines.append(f"(decided {c['current_to'] or 'n/a'})")
|
| 111 |
if c["heading"]:
|
| 112 |
lines.append(f"Subject: {c['heading']}")
|
| 113 |
+
elif doc_type == "delegation":
|
| 114 |
+
lines.append("_Instrument of delegation and designation — it records "
|
| 115 |
+
"which officials the Minister has delegated powers to, or "
|
| 116 |
+
"designated for functions, under IRPA and the IRPR. "
|
| 117 |
+
"Administrative; confirm it is still the current version._")
|
| 118 |
+
lines.append(f"(dated {c['current_to'] or 'n/a'})")
|
| 119 |
else:
|
| 120 |
meta = [f"in force; text current to {c['current_to'] or 'n/a'}"]
|
| 121 |
if c["last_amended"]:
|
|
|
|
| 182 |
default=None,
|
| 183 |
description="Optional filter by source type: 'legislation' (Acts and "
|
| 184 |
"regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
|
| 185 |
+
"agreements), 'directive' (NJC directives), 'caselaw' (court and "
|
| 186 |
+
"tribunal decisions), or 'delegation' (IRPA/IRPR delegation and "
|
| 187 |
+
"designation instruments). Omit to search all.",
|
| 188 |
)
|
| 189 |
|
| 190 |
|
|
|
|
| 204 |
"""Search Canadian federal law, CBSA D-Memoranda, agreements, NJC directives,
|
| 205 |
and leading court decisions.
|
| 206 |
|
| 207 |
+
The CanLex corpus has six kinds of source: 31 federal Acts and regulations
|
| 208 |
(immigration, customs, criminal, drugs, food/health, labour, privacy and more);
|
| 209 |
CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
|
| 210 |
how it applies customs and border law); Treasury Board collective agreements
|
| 211 |
(currently the FB / Border Services group); National Joint Council directives
|
| 212 |
+
(travel, relocation, isolated posts and more); leading decisions of the
|
| 213 |
courts and federal tribunals: the Supreme Court, Federal Court of Appeal and
|
| 214 |
Federal Court, the Immigration and Refugee Board, and the FPSLREB and CIRB
|
| 215 |
+
labour boards; and instruments of delegation and designation under IRPA and
|
| 216 |
+
the IRPR (which officials the Minister has authorized to exercise which powers). Use this for ANY question about that material. It ranks results by relevance and returns
|
| 217 |
their full text so the answer can cite the actual wording; an explicit section
|
| 218 |
reference (e.g. "section 34") is always surfaced. Each result is marked with its
|
| 219 |
source type.
|
|
|
|
| 224 |
- top_k (int): How many sections to return, 1-20 (default 6).
|
| 225 |
- act (Optional[str]): Restrict to one Act by short name/code, or omit for all.
|
| 226 |
- doc_type (Optional[str]): 'legislation', 'memorandum', 'agreement',
|
| 227 |
+
'directive', 'caselaw', or 'delegation' to restrict to one source
|
| 228 |
+
type; omit for all.
|
| 229 |
|
| 230 |
Returns:
|
| 231 |
str: Markdown with answering instructions followed by the matching sections.
|
|
|
|
| 297 |
annotations={"title": "List Loaded Legislation", **_READONLY})
|
| 298 |
def canlex_list_acts() -> str:
|
| 299 |
"""List what the CanLex corpus contains -- Acts and regulations, CBSA
|
| 300 |
+
D-Memoranda, collective agreements, NJC directives, leading cases, and
|
| 301 |
+
delegation instruments.
|
| 302 |
|
| 303 |
Use this to learn the scope and currency of the corpus before searching, or to
|
| 304 |
report it to the user.
|
|
|
|
| 311 |
agreements: dict[str, dict] = {}
|
| 312 |
directives: dict[str, dict] = {}
|
| 313 |
cases: dict[str, dict] = {}
|
| 314 |
+
delegations: dict[str, dict] = {}
|
| 315 |
memo_numbers: set[str] = set()
|
| 316 |
memo_chunks = 0
|
| 317 |
memo_date = ""
|
|
|
|
| 337 |
"name": c["act_name"], "decided": c["current_to"], "count": 0,
|
| 338 |
})
|
| 339 |
entry["count"] += 1
|
| 340 |
+
elif doc_type == "delegation":
|
| 341 |
+
entry = delegations.setdefault(c["act_code"], {
|
| 342 |
+
"short": c["act_short"], "name": c["act_name"],
|
| 343 |
+
"current_to": c["current_to"], "count": 0,
|
| 344 |
+
})
|
| 345 |
+
entry["count"] += 1
|
| 346 |
else:
|
| 347 |
entry = acts.setdefault(c["act_code"], {
|
| 348 |
"short": c["act_short"], "name": c["act_name"],
|
|
|
|
| 374 |
for cite, a in sorted(cases.items(), key=lambda kv: kv[1]["decided"]):
|
| 375 |
lines.append(f"- **{a['name']}**, {cite}: {a['count']} excerpts, "
|
| 376 |
f"decided {a['decided'] or 'n/a'}")
|
| 377 |
+
if delegations:
|
| 378 |
+
lines += ["", "## Delegation instruments"]
|
| 379 |
+
for a in sorted(delegations.values(), key=lambda x: x["short"]):
|
| 380 |
+
lines.append(f"- **{a['short']}** — {a['name']}: {a['count']} items, "
|
| 381 |
+
f"dated {a['current_to'] or 'n/a'}")
|
| 382 |
lines += ["", "Search with canlex_search_legislation; filter by doc_type "
|
| 383 |
+
"(legislation / memorandum / agreement / directive / caselaw / "
|
| 384 |
+
"delegation). Fetch a known provision with canlex_get_section, or "
|
| 385 |
+
"a case's citations with canlex_case."]
|
| 386 |
return "\n".join(lines)
|
| 387 |
|
| 388 |
|
|
@@ -92,5 +92,40 @@
|
|
| 92 |
{"query": "What is the burden and standard of proof when an employer defends a disciplinary grievance?", "answers": [["Basra (2007)", ""]]},
|
| 93 |
{"query": "What factors determine whether an employer had just cause to discipline or terminate an employee?", "answers": [["Pepper", ""]]},
|
| 94 |
{"query": "What is the test for whether a union breached its duty of fair representation?", "answers": [["McRaeJackson", ""]]},
|
| 95 |
-
{"query": "Can a union breach the duty of fair representation in how it handles a vaccination-policy grievance?", "answers": [["Watson", ""]]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
]
|
|
|
|
| 92 |
{"query": "What is the burden and standard of proof when an employer defends a disciplinary grievance?", "answers": [["Basra (2007)", ""]]},
|
| 93 |
{"query": "What factors determine whether an employer had just cause to discipline or terminate an employee?", "answers": [["Pepper", ""]]},
|
| 94 |
{"query": "What is the test for whether a union breached its duty of fair representation?", "answers": [["McRaeJackson", ""]]},
|
| 95 |
+
{"query": "Can a union breach the duty of fair representation in how it handles a vaccination-policy grievance?", "answers": [["Watson", ""]]},
|
| 96 |
+
{"query": "Is it consistent with the Charter of Rights to deport a permanent resident convicted of serious crimes?", "answers": [["Chiarelli", ""]]},
|
| 97 |
+
{"query": "When the Immigration Appeal Division hears an appeal from a removal order, may it weigh the hardship the person would face in the country of removal?", "answers": [["Chieu", ""]]},
|
| 98 |
+
{"query": "Does a non-citizen have a Charter right to remain in Canada?", "answers": [["Medovarski", ""]]},
|
| 99 |
+
{"query": "What must be established for inadmissibility for incitement to genocide or a crime against humanity?", "answers": [["Mugesera", ""]]},
|
| 100 |
+
{"query": "Is the IRPA security-certificate regime, with its special advocate scheme, consistent with the Charter?", "answers": [["Harkat", ""]]},
|
| 101 |
+
{"query": "How does the principle of non-refoulement constrain Canada when it extradites a person who holds refugee protection?", "answers": [["Németh", ""]]},
|
| 102 |
+
{"query": "What procedural fairness is owed to a sponsor before the government enforces a sponsorship-undertaking debt?", "answers": [["Mavi", ""]]},
|
| 103 |
+
{"query": "Can a sentencing court take the immigration consequences of a sentence into account?", "answers": [["Pham", ""]]},
|
| 104 |
+
{"query": "Can a fear of coerced sterilization under a one-child population-control policy support a refugee claim?", "answers": [["Chan", ""]]},
|
| 105 |
+
{"query": "Can a customs officer stop and search a vehicle near the border on reasonable suspicion of a customs offence?", "answers": [["Jacques", ""]]},
|
| 106 |
+
{"query": "Is an ascertained-forfeiture proceeding under the Customs Act penal enough to engage the right against self-incrimination?", "answers": [["Martineau", ""]]},
|
| 107 |
+
{"query": "Does a person's age at the time of involvement matter to inadmissibility for membership in a terrorist organization?", "answers": [["Poshteh", ""]]},
|
| 108 |
+
{"query": "Can a deserter from the United States military succeed in a refugee or humanitarian and compassionate claim in Canada?", "answers": [["Hinzman", ""]]},
|
| 109 |
+
{"query": "Does a Refugee Protection Division guideline that sets the order of questioning improperly fetter a member's discretion?", "answers": [["Thamotharem", ""]]},
|
| 110 |
+
{"query": "Can a person with no immigration status obtain coverage under the Interim Federal Health Program?", "answers": [["Toussaint", ""]]},
|
| 111 |
+
{"query": "What does a finding of no credible basis mean for a refugee claim?", "answers": [["Rahaman", ""]]},
|
| 112 |
+
{"query": "How is the reasonableness of a security certificate assessed on a re-determination after Charkaoui?", "answers": [["Almrei", ""]]},
|
| 113 |
+
{"query": "Can a permanent resident be found inadmissible for acts of violence that endanger safety in Canada even without a conviction?", "answers": [["Mason", ""]]},
|
| 114 |
+
{"query": "How much deference does the Immigration Appeal Division receive when it declines to grant discretionary relief from removal?", "answers": [["Khosa", ""]]},
|
| 115 |
+
{"query": "Is lengthy immigration detention consistent with the Charter protection against arbitrary detention?", "answers": [["Brown", ""]]},
|
| 116 |
+
{"query": "What meal and incidental allowances can a federal employee claim while travelling on government business?", "answers": [["Travel Directive", ""]]},
|
| 117 |
+
{"query": "What is a federal employee entitled to when the employer requires them to wear a uniform?", "answers": [["Uniforms Directive", ""]]},
|
| 118 |
+
{"query": "What protections does an indeterminate employee have when their position is declared surplus?", "answers": [["Work Force Adjustment Directive", ""]]},
|
| 119 |
+
{"query": "Is there an allowance for an employee required to administer first aid to the general public?", "answers": [["First Aid to the General Public - Allowance for Employees", ""]]},
|
| 120 |
+
{"query": "What allowances are available to a federal employee posted at an isolated post?", "answers": [["Isolated Posts and Government Housing Directive", ""]]},
|
| 121 |
+
{"query": "What relocation expenses are reimbursed when a federal employee must move for work?", "answers": [["NJC Relocation Directive", ""]]},
|
| 122 |
+
{"query": "What occupational health and safety obligations does the NJC directive place on the employer?", "answers": [["Occupational Health and Safety Directive", ""]]},
|
| 123 |
+
{"query": "What coverage does the Public Service Health Care Plan provide?", "answers": [["Public Service Health Care Plan Directive", ""]]},
|
| 124 |
+
{"query": "What support do the Foreign Service Directives give an employee posted outside Canada?", "answers": [["Foreign Service Directives", ""]]},
|
| 125 |
+
{"query": "Who has a right to appeal a decision to the Immigration Appeal Division?", "answers": [["IRPA", "63"]]},
|
| 126 |
+
{"query": "What are the objectives of the Immigration and Refugee Protection Act?", "answers": [["IRPA", "3"]]},
|
| 127 |
+
{"query": "Can a non-unionized federally regulated employee complain that they were unjustly dismissed?", "answers": [["Canada Labour Code", "240"]]},
|
| 128 |
+
{"query": "What does CBSA disclose about normal values and export prices under the Special Import Measures Act?", "answers": [["D-Memo", "D14-1-2"]]},
|
| 129 |
+
{"query": "What are CBSA's requirements for importing or exporting cultural property?", "answers": [["D-Memo", "D19-4-1"]]},
|
| 130 |
+
{"query": "When assessing criminal inadmissibility for a conviction abroad, must a decision-maker consider whether the defence of duress was practically available in that country?", "answers": [["Rodriguez Anzola", ""]]}
|
| 131 |
]
|
|
The diff for this file is too large to render.
See raw diff
|
|
|
|
The diff for this file is too large to render.
See raw diff
|
|
|
|
The diff for this file is too large to render.
See raw diff
|
|
|
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""CanLex unit-test suite."""
|
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for the embedding-text builder (canlex/embed.py)."""
|
| 2 |
+
import unittest
|
| 3 |
+
|
| 4 |
+
from canlex.embed import embed_text
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def _chunk(**kw):
    """Build a minimal chunk dict for embed_text tests.

    Starts from blank legislation defaults and lets keyword arguments
    override or extend any field.
    """
    base = {"doc_type": "legislation", "act_short": "X", "marginal_note": "",
            "part": "", "heading": "", "text": ""}
    base.update(kw)
    return base
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class EmbedTextTests(unittest.TestCase):
    """Checks which chunk field embed_text uses as the title."""

    def test_memorandum_title_comes_from_part(self):
        # A memo's marginal note is generic; its subject lives in 'part'.
        out = embed_text(_chunk(doc_type="memorandum", act_short="D-Memo",
                                marginal_note="Guidelines",
                                part="Value for Duty", text="body"))
        self.assertIn("Value for Duty", out)

    def test_legislation_title_comes_from_marginal_note(self):
        out = embed_text(_chunk(marginal_note="Application for protection",
                                part="PART 2", text="body"))
        self.assertIn("Application for protection", out)

    def test_title_is_repeated_for_emphasis(self):
        out = embed_text(_chunk(marginal_note="UNIQUEWORD", text="b"))
        self.assertEqual(out.count("UNIQUEWORD"), 2)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for the retrieval pipeline (canlex/index.py).
|
| 2 |
+
|
| 3 |
+
Fast, offline tests of the pure retrieval logic -- tokenisation, section-
|
| 4 |
+
reference parsing, the diversity cap, the result-set guarantee and the doc-type
|
| 5 |
+
flags. They build a bare LegislationIndex via __new__, so no corpus, embeddings
|
| 6 |
+
or reranker are loaded.
|
| 7 |
+
|
| 8 |
+
python -m unittest discover -s tests
|
| 9 |
+
"""
|
| 10 |
+
import unittest
|
| 11 |
+
|
| 12 |
+
from canlex.index import (
|
| 13 |
+
LegislationIndex, SOURCE_CAP, tokenize, _section_refs, _provision_units,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def chunk(doc_type="legislation", act_code="I-2.5", section="1",
          marginal_note="Title", part="", **extra):
    """A minimal corpus chunk carrying the fields the index logic reads.

    Any additional keyword arguments are merged in last, so they can add
    fields or override the fixed ones (heading, act_short, text).
    """
    c = {"doc_type": doc_type, "act_code": act_code, "section": section,
         "marginal_note": marginal_note, "part": part, "heading": "",
         "act_short": "X", "text": ""}
    c.update(extra)
    return c
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def bare_index(chunks):
    """A LegislationIndex with only .chunks set -- enough for the pure methods.

    The instance is created via __new__, so __init__ is bypassed.
    """
    idx = LegislationIndex.__new__(LegislationIndex)
    idx.chunks = chunks
    return idx
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class TokenizeTests(unittest.TestCase):
    """tokenize: case folding, stemming and splitting."""

    def test_case_insensitive(self):
        self.assertEqual(tokenize("REPORT goods"), tokenize("report Goods"))

    def test_stemming_unifies_word_forms(self):
        # The point of stemming: different forms collapse to one token.
        self.assertEqual(tokenize("reporting"), tokenize("reported"))
        self.assertEqual(tokenize("importation"), tokenize("import"))

    def test_splits_on_non_alphanumeric(self):
        self.assertEqual(tokenize("s.34(1)(a)"), ["s", "34", "1", "a"])

    def test_empty(self):
        self.assertEqual(tokenize(""), [])
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class SectionRefTests(unittest.TestCase):
    """_section_refs: explicit section references found in a query."""

    def test_plain_section(self):
        self.assertEqual(_section_refs("inadmissible under section 34"), {"34"})

    def test_decimal_and_abbreviated(self):
        self.assertEqual(_section_refs("see s. 20.1 and section 5"),
                         {"20.1", "5"})

    def test_no_reference(self):
        self.assertEqual(_section_refs("what is a pre-removal risk assessment"),
                         set())
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class ProvisionUnitsTests(unittest.TestCase):
    """_provision_units: structured text yields units, flat text yields none."""

    def test_structured_provision_yields_units(self):
        text = "(1) The chapeau.\n(a) first paragraph\n(b) second paragraph"
        self.assertTrue(_provision_units(text))

    def test_flat_provision_yields_nothing(self):
        self.assertEqual(_provision_units("A flat provision with no markers."),
                         [])
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class SourceKeyTests(unittest.TestCase):
    """_source_key decides what the diversity cap collapses."""

    def test_primary_instruments_are_never_capped(self):
        idx = bare_index([
            chunk(doc_type="legislation"),
            chunk(doc_type="agreement", act_code="FB"),
            chunk(doc_type="directive", act_code="d1"),
        ])
        # Iterate over every fixture chunk so adding one cannot be missed.
        for i in range(len(idx.chunks)):
            with self.subTest(doc_type=idx.chunks[i]["doc_type"]):
                self.assertIsNone(idx._source_key(i))

    def test_caselaw_and_memoranda_are_keyed(self):
        idx = bare_index([
            chunk(doc_type="memorandum", act_code="D-Memo", section="D1-1-1"),
            chunk(doc_type="caselaw", act_code="2019 SCC 65"),
        ])
        self.assertEqual(idx._source_key(0), ("memorandum", "D1-1-1"))
        self.assertEqual(idx._source_key(1), ("caselaw", "2019 SCC 65"))
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class DiversifyTests(unittest.TestCase):
    """_diversify caps chunks per keyed source and leaves uncapped ones alone."""

    def test_caps_caselaw_per_decision(self):
        n = SOURCE_CAP + 2
        chunks = [chunk(doc_type="caselaw", act_code="2019 SCC 65")
                  for _ in range(n)]
        chunks.append(chunk(doc_type="legislation"))  # index n
        idx = bare_index(chunks)
        out = idx._diversify(list(range(n + 1)))
        kept, deferred = out[:SOURCE_CAP + 1], out[SOURCE_CAP + 1:]
        self.assertIn(n, kept)  # legislation never capped
        self.assertEqual(
            sum(1 for i in kept if idx.chunks[i]["doc_type"] == "caselaw"),
            SOURCE_CAP)
        self.assertEqual(len(deferred), n - SOURCE_CAP)

    def test_does_not_cap_agreements(self):
        n = SOURCE_CAP + 3
        idx = bare_index([chunk(doc_type="agreement", act_code="FB",
                                section=str(i)) for i in range(n)])
        out = idx._diversify(list(range(n)))
        self.assertEqual(out, list(range(n)))  # uncapped: order intact
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class EnsureLegislationTests(unittest.TestCase):
    """_ensure_legislation: the result-set guarantee for legislation hits."""

    def test_pulls_legislation_into_a_caselaw_dominated_top_k(self):
        idx = bare_index([
            chunk(doc_type="caselaw", act_code="A"),
            chunk(doc_type="caselaw", act_code="B"),
            chunk(doc_type="caselaw", act_code="C"),
            chunk(doc_type="legislation"),
            chunk(doc_type="legislation"),
        ])
        out = idx._ensure_legislation([0, 1, 2, 3, 4], top_k=3)
        top = out[:3]
        n_leg = sum(1 for i in top
                    if idx.chunks[i]["doc_type"] == "legislation")
        self.assertGreaterEqual(n_leg, 2)
        self.assertEqual(out[0], 0)  # the #1 hit is preserved

    def test_no_op_when_legislation_already_present(self):
        idx = bare_index([
            chunk(doc_type="legislation"),
            chunk(doc_type="legislation"),
            chunk(doc_type="caselaw", act_code="A"),
        ])
        self.assertEqual(idx._ensure_legislation([0, 1, 2], top_k=3), [0, 1, 2])
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
class DocTypeFlagTests(unittest.TestCase):
    """_build_note_tokens also flags regulations and agreement back-matter."""

    def setUp(self):
        # Fixture order matters: the flag lists below are compared by index.
        self.idx = bare_index([
            chunk(doc_type="legislation", act_code="I-2.5"),
            chunk(doc_type="legislation", act_code="SOR-2002-227"),
            chunk(doc_type="legislation", act_code="C.R.C.,_c._1041"),
            chunk(doc_type="agreement", act_code="FB", section="17"),
            chunk(doc_type="agreement", act_code="FB", section=""),
            chunk(doc_type="memorandum", act_code="D-Memo", section="D1-1-1",
                  marginal_note="Guidelines", part="Importing goods"),
        ])
        self.idx._build_note_tokens()

    def test_regulation_flag(self):
        self.assertEqual(self.idx._is_regulation,
                         [False, True, True, False, False, False])

    def test_agreement_backmatter_flag(self):
        self.assertEqual(self.idx._is_backmatter,
                         [False, False, False, False, True, False])

    def test_memorandum_title_tokens_come_from_part(self):
        # A memo's marginal note is generic; its title is the 'part' field.
        self.assertEqual(self.idx._note_tokens[5],
                         set(tokenize("Importing goods")))
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for legal-abbreviation query expansion (canlex/synonyms.py)."""
|
| 2 |
+
import unittest
|
| 3 |
+
|
| 4 |
+
from canlex.synonyms import expand_query
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ExpandQueryTests(unittest.TestCase):
    """expand_query: abbreviation expansion that keeps the original query."""

    def test_keeps_the_original_query(self):
        self.assertTrue(
            expand_query("PRRA eligibility").startswith("PRRA eligibility"))

    def test_expands_a_known_abbreviation(self):
        # 'PRRA' should pull in the statutory wording the Act actually uses.
        self.assertIn("application for protection",
                      expand_query("PRRA eligibility"))

    def test_case_insensitive(self):
        self.assertIn("humanitarian and compassionate",
                      expand_query("an H&C application"))

    def test_unknown_query_is_unchanged(self):
        q = "what are the standard hours of work"
        self.assertEqual(expand_query(q), q)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|