Refresh the CBSA IRPA delegation: 2023 restatement + 5 amendments + peace-officer auth
Browse files
The 2017 instrument we previously ingested was expressly superseded by Mendicino's
May 8, 2023 restatement and has since been amended five times (Sep 2023, Nov 2023,
twice in Mar 2024, Jul 2025). The corpus now carries all seven instruments
side-by-side, so a question about a delegated power surfaces both the current base
item and any amendment that touches it. Adds a separate IRPA s. 138(1) peace-
officer authorization (Aug 2022) under a new narrative-prose parser.
parse_cbsa now accepts the 2025-07 amendment's new row layout (<th> item + 2-3
<td> cells, no Delegates column when only refs/description change). Effective
dates come from SOURCES rather than the first <time> in <main>: amendment pages
quote the base instrument's signing date in their preamble, so the previous
extraction returned the wrong year for every amendment.
Corpus: 42 new delegation chunks (307 -> 349), 15511 total. Eval 129-Q:
Hit@1 0.74, Hit@3 0.88, Hit@5 0.92, Hit@10 0.96, MRR 0.82 -- a 1-question dip
(Basra 2007 burden of proof now #6 behind Basra 2012 + Canada Labour Code burden
sections, a borderline-gold reshuffle, not delegation-induced).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- canlex/delegation.py +168 -22
- data/processed/delegation.json +0 -0
|
@@ -33,18 +33,89 @@ OUT = PROCESSED_DIR / "delegation.json"
|
|
| 33 |
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
| 34 |
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
SOURCES = {
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
"kind": "html-cbsa",
|
| 40 |
-
"act_code": "CBSA-IRPA-DELEG",
|
| 41 |
-
"act_short": "CBSA
|
| 42 |
-
"act_name":
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
},
|
| 49 |
"ircc": {
|
| 50 |
"code": "ircc",
|
|
@@ -140,8 +211,11 @@ def parse_cbsa(html, src):
|
|
| 140 |
for sup in main.find_all("sup"): # drop footnote-reference superscripts
|
| 141 |
sup.decompose()
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
chunks = []
|
| 147 |
|
|
@@ -172,21 +246,33 @@ def parse_cbsa(html, src):
|
|
| 172 |
"source_url": src["url"],
|
| 173 |
})
|
| 174 |
|
| 175 |
-
# One chunk per Schedule item.
|
| 176 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
for table in main.find_all("table", class_="table-bordered"):
|
| 178 |
h3 = table.find_previous_sibling("h3")
|
| 179 |
section_name = _norm(h3.get_text()) if h3 else ""
|
| 180 |
for tr in table.find_all("tr"):
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
power = " ".join(_norm(p.get_text())
|
| 187 |
-
for p in
|
| 188 |
-
or _norm(
|
| 189 |
-
delegates = _delegates(
|
| 190 |
if not item_no or not (power or refs):
|
| 191 |
continue
|
| 192 |
text = power
|
|
@@ -215,6 +301,63 @@ def parse_cbsa(html, src):
|
|
| 215 |
return chunks
|
| 216 |
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
# --- IRCC IL3 instrument (PDF) ------------------------------------------------
|
| 219 |
|
| 220 |
# A topical part heading -- a line in full upper case (MINISTERIAL INSTRUCTIONS,
|
|
@@ -334,6 +477,9 @@ def build():
|
|
| 334 |
if src["kind"] == "html-cbsa":
|
| 335 |
html = _fetch(src["url"], RAW / f"{src['code']}.html")
|
| 336 |
chunks = parse_cbsa(html, src)
|
|
|
|
|
|
|
|
|
|
| 337 |
elif src["kind"] == "pdf-ircc":
|
| 338 |
pdf = _fetch(src["url"], RAW / f"{src['code']}.pdf",
|
| 339 |
powershell=True)
|
|
|
|
| 33 |
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
| 34 |
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
|
| 35 |
|
| 36 |
+
# The CBSA IRPA delegation has been a moving target: the November 28, 2017
|
| 37 |
+
# instrument was expressly superseded by a full restatement on May 8, 2023
|
| 38 |
+
# (signed by Mendicino), which has itself been amended five times since. CBSA
|
| 39 |
+
# does not publish a consolidated version, so the current effective state is the
|
| 40 |
+
# 2023 restatement read together with its later amendments; we ingest each one
|
| 41 |
+
# as a separate "act" so a user (or the LLM) sees the base item and any
|
| 42 |
+
# amendments that touch it side-by-side in retrieval.
|
| 43 |
+
_CBSA_DELEG_URL = ("https://www.cbsa-asfc.gc.ca/agency-agence/actreg-loireg/"
|
| 44 |
+
"delegation/")
|
| 45 |
+
_CBSA_DELEG_NAME = ("Delegation of Authority and Designations of Officers by "
|
| 46 |
+
"the Minister of Public Safety and Emergency Preparedness "
|
| 47 |
+
"under the Immigration and Refugee Protection Act and the "
|
| 48 |
+
"Immigration and Refugee Protection Regulations")
|
| 49 |
+
|
| 50 |
SOURCES = {
|
| 51 |
+
# The 2023 restatement -- the current base instrument.
|
| 52 |
+
"cbsa-2023-05": {
|
| 53 |
+
"code": "cbsa-2023-05",
|
| 54 |
+
"kind": "html-cbsa",
|
| 55 |
+
"act_code": "CBSA-IRPA-DELEG-2023-05",
|
| 56 |
+
"act_short": "CBSA Deleg 2023-05-08",
|
| 57 |
+
"act_name": _CBSA_DELEG_NAME,
|
| 58 |
+
"url": _CBSA_DELEG_URL + "irpa-lipr-2023-05-08-eng.html",
|
| 59 |
+
"effective": "2023-05-08",
|
| 60 |
+
},
|
| 61 |
+
# Amendments to the 2023 restatement, in chronological order.
|
| 62 |
+
"cbsa-2023-09": {
|
| 63 |
+
"code": "cbsa-2023-09",
|
| 64 |
+
"kind": "html-cbsa",
|
| 65 |
+
"act_code": "CBSA-IRPA-DELEG-AMEND-2023-09-08",
|
| 66 |
+
"act_short": "CBSA Deleg Amend 2023-09-08",
|
| 67 |
+
"act_name": "Amendment to the " + _CBSA_DELEG_NAME,
|
| 68 |
+
"url": _CBSA_DELEG_URL + "irpa-lipr-2023-09-08-eng.html",
|
| 69 |
+
"effective": "2023-09-08",
|
| 70 |
+
},
|
| 71 |
+
"cbsa-2023-11": {
|
| 72 |
+
"code": "cbsa-2023-11",
|
| 73 |
+
"kind": "html-cbsa",
|
| 74 |
+
"act_code": "CBSA-IRPA-DELEG-AMEND-2023-11-17",
|
| 75 |
+
"act_short": "CBSA Deleg Amend 2023-11-17",
|
| 76 |
+
"act_name": "Amendment to the " + _CBSA_DELEG_NAME,
|
| 77 |
+
"url": _CBSA_DELEG_URL + "irpa-lipr-2023-11-17-eng.html",
|
| 78 |
+
"effective": "2023-11-17",
|
| 79 |
+
},
|
| 80 |
+
"cbsa-2024-03-05": {
|
| 81 |
+
"code": "cbsa-2024-03-05",
|
| 82 |
"kind": "html-cbsa",
|
| 83 |
+
"act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-05",
|
| 84 |
+
"act_short": "CBSA Deleg Amend 2024-03-05",
|
| 85 |
+
"act_name": "Amendment to the " + _CBSA_DELEG_NAME,
|
| 86 |
+
"url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-05-eng.html",
|
| 87 |
+
"effective": "2024-03-05",
|
| 88 |
+
},
|
| 89 |
+
"cbsa-2024-03-15": {
|
| 90 |
+
"code": "cbsa-2024-03-15",
|
| 91 |
+
"kind": "html-cbsa",
|
| 92 |
+
"act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-15",
|
| 93 |
+
"act_short": "CBSA Deleg Amend 2024-03-15",
|
| 94 |
+
"act_name": "Amendment to the " + _CBSA_DELEG_NAME,
|
| 95 |
+
"url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-15-eng.html",
|
| 96 |
+
"effective": "2024-03-15",
|
| 97 |
+
},
|
| 98 |
+
"cbsa-2025-07": {
|
| 99 |
+
"code": "cbsa-2025-07",
|
| 100 |
+
"kind": "html-cbsa",
|
| 101 |
+
"act_code": "CBSA-IRPA-DELEG-AMEND-2025-07-10",
|
| 102 |
+
"act_short": "CBSA Deleg Amend 2025-07-10",
|
| 103 |
+
"act_name": "Amendment to the " + _CBSA_DELEG_NAME,
|
| 104 |
+
"url": _CBSA_DELEG_URL + "irpa-lipr-2025-07-10-eng.html",
|
| 105 |
+
"effective": "2025-07-10",
|
| 106 |
+
},
|
| 107 |
+
# Separate authority: a peace-officer designation under IRPA s. 138(1).
|
| 108 |
+
# Narrative prose, not a Schedule table -- needs its own parser.
|
| 109 |
+
"cbsa-peaceofficer": {
|
| 110 |
+
"code": "cbsa-peaceofficer",
|
| 111 |
+
"kind": "html-cbsa-narrative",
|
| 112 |
+
"act_code": "CBSA-IRPA-PEACEOFF-2022-08",
|
| 113 |
+
"act_short": "CBSA Peace Officer Auth 2022-08-18",
|
| 114 |
+
"act_name": ("Authorization to have the Authority and Powers of a "
|
| 115 |
+
"Peace Officer under the Immigration and Refugee "
|
| 116 |
+
"Protection Act (subsection 138(1))"),
|
| 117 |
+
"url": _CBSA_DELEG_URL + "desig/po-ag_2022-08-eng.html",
|
| 118 |
+
"effective": "2022-08-18",
|
| 119 |
},
|
| 120 |
"ircc": {
|
| 121 |
"code": "ircc",
|
|
|
|
| 211 |
for sup in main.find_all("sup"): # drop footnote-reference superscripts
|
| 212 |
sup.decompose()
|
| 213 |
|
| 214 |
+
# The effective date comes from SOURCES, not the first <time> in <main>:
|
| 215 |
+
# amendment pages quote the base instrument's date ("signed on May 8, 2023")
|
| 216 |
+
# in their preamble, so the first <time> on an amendment page is the base
|
| 217 |
+
# instrument's date, not the amendment's own.
|
| 218 |
+
date = src["effective"]
|
| 219 |
|
| 220 |
chunks = []
|
| 221 |
|
|
|
|
| 246 |
"source_url": src["url"],
|
| 247 |
})
|
| 248 |
|
| 249 |
+
# One chunk per Schedule item. Two row shapes are accepted:
|
| 250 |
+
# (a) four <td> cells: Item | Refs | Power | Delegates (the 2023-05
|
| 251 |
+
# restatement and the 2023-09, 2023-11, 2024-03 amendments).
|
| 252 |
+
# (b) one <th> + two or three <td> cells: the <th> carries the item
|
| 253 |
+
# number and the <td>s carry Refs | Power [| Delegates]. The
|
| 254 |
+
# 2025-07-10 amendment uses this layout, and may omit the Delegates
|
| 255 |
+
# column when an amendment changes only references or descriptions.
|
| 256 |
+
# Each topical <h3>, if present, names the schedule section the table belongs to.
|
| 257 |
for table in main.find_all("table", class_="table-bordered"):
|
| 258 |
h3 = table.find_previous_sibling("h3")
|
| 259 |
section_name = _norm(h3.get_text()) if h3 else ""
|
| 260 |
for tr in table.find_all("tr"):
|
| 261 |
+
th_cells = tr.find_all("th", recursive=False)
|
| 262 |
+
td_cells = tr.find_all("td", recursive=False)
|
| 263 |
+
if not th_cells and len(td_cells) == 4:
|
| 264 |
+
item_cell, refs_cell, power_cell, deleg_cell = td_cells
|
| 265 |
+
elif len(th_cells) == 1 and len(td_cells) in (2, 3):
|
| 266 |
+
item_cell, refs_cell, power_cell = th_cells[0], td_cells[0], td_cells[1]
|
| 267 |
+
deleg_cell = td_cells[2] if len(td_cells) == 3 else None
|
| 268 |
+
else:
|
| 269 |
+
continue # header row or a stray row
|
| 270 |
+
item_no = _norm(item_cell.get_text()).rstrip(".")
|
| 271 |
+
refs = _normalize_refs(_norm(refs_cell.get_text()))
|
| 272 |
power = " ".join(_norm(p.get_text())
|
| 273 |
+
for p in power_cell.find_all("p")) \
|
| 274 |
+
or _norm(power_cell.get_text())
|
| 275 |
+
delegates = _delegates(deleg_cell) if deleg_cell is not None else ""
|
| 276 |
if not item_no or not (power or refs):
|
| 277 |
continue
|
| 278 |
text = power
|
|
|
|
| 301 |
return chunks
|
| 302 |
|
| 303 |
|
| 304 |
+
# --- CBSA narrative-prose instrument (e.g. the peace-officer designation) -----
|
| 305 |
+
|
| 306 |
+
def parse_cbsa_narrative(html, src):
    """Parse a narrative-prose CBSA designation instrument into a single chunk.

    Used for the peace-officer authorization under IRPA s. 138(1) -- plain prose
    listing 18 designated officer positions, not a four-column Schedule table,
    so parse_cbsa's table walker would yield nothing. The whole operative text
    is a few hundred words, well within a single chunk.

    Args:
        html: Raw HTML of the instrument page.
        src: The SOURCES entry for the instrument. Reads "code", "act_code",
            "act_short", "act_name", "url" and "effective"; an optional
            "marginal_note" overrides the default peace-officer label.

    Returns:
        A one-element list holding the chunk dict, or an empty list when the
        page has no <main> element or no text could be extracted from it.

    Raises:
        KeyError: If a required key is missing from ``src``.
    """
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    for sup in main.find_all("sup"):  # drop footnote-reference superscripts
        sup.decompose()

    # The effective date is taken from SOURCES rather than scraped from the page.
    date = src["effective"]

    # Only paragraphs and lists inside <main> are collected. Each top-level
    # list is rendered as one line of "; "-joined items.
    parts = []
    for el in main.find_all(["p", "ul", "ol"]):
        if el.find_parent(["ul", "ol"]):
            # Anything inside a list (a <p> in an <li>, or a nested list) is
            # already covered by the get_text() of its top-level <li>.
            continue
        if el.name in ("ul", "ol"):
            # recursive=False: take direct <li> children only. A recursive
            # search would also return the <li>s of nested lists, whose text
            # is already contained in their parent <li>, duplicating it.
            items = (_norm(li.get_text())
                     for li in el.find_all("li", recursive=False))
            text = "; ".join(t for t in items if t)
        else:
            text = _norm(el.get_text())
        if text:
            parts.append(text)

    body = "\n".join(parts)
    if not body:
        return []

    return [{
        "id": f"delegation-{src['code']}",
        "doc_type": "delegation",
        "act_code": src["act_code"],
        "act_short": src["act_short"],
        "act_name": src["act_name"],
        "section": "",
        # Defaults to the peace-officer label so existing SOURCES entries keep
        # their output; another narrative source can supply its own note.
        "marginal_note": src.get(
            "marginal_note", "Peace-officer authorization — IRPA s. 138(1)"),
        "part": "",
        "division": "",
        "heading": src["act_name"],
        "text": body,
        "history": "",
        "last_amended": "",
        "current_to": date,
        "citation": src["act_short"],
        "source_url": src["url"],
    }]
|
| 359 |
+
|
| 360 |
+
|
| 361 |
# --- IRCC IL3 instrument (PDF) ------------------------------------------------
|
| 362 |
|
| 363 |
# A topical part heading -- a line in full upper case (MINISTERIAL INSTRUCTIONS,
|
|
|
|
| 477 |
if src["kind"] == "html-cbsa":
|
| 478 |
html = _fetch(src["url"], RAW / f"{src['code']}.html")
|
| 479 |
chunks = parse_cbsa(html, src)
|
| 480 |
+
elif src["kind"] == "html-cbsa-narrative":
|
| 481 |
+
html = _fetch(src["url"], RAW / f"{src['code']}.html")
|
| 482 |
+
chunks = parse_cbsa_narrative(html, src)
|
| 483 |
elif src["kind"] == "pdf-ircc":
|
| 484 |
pdf = _fetch(src["url"], RAW / f"{src['code']}.pdf",
|
| 485 |
powershell=True)
|
|
The diff for this file is too large to render.
See raw diff
|
|
|