Beemer Claude Opus 4.7 committed on
Commit
ef6e3dc
·
1 Parent(s): 58fc4b4

Refresh the CBSA IRPA delegation: 2023 restatement + 5 amendments + peace-officer auth

Browse files

The 2017 instrument we previously ingested was expressly superseded by Mendicino's
May 8, 2023 restatement and has since been amended five times (Sep 2023, Nov 2023,
twice in Mar 2024, Jul 2025). The corpus now carries all seven instruments
side-by-side, so a question about a delegated power surfaces both the current base
item and any amendment that touches it. Adds a separate IRPA s. 138(1) peace-
officer authorization (Aug 2022) under a new narrative-prose parser.

parse_cbsa now accepts the 2025-07 amendment's new row layout (<th> item + 2-3
<td> cells, no Delegates column when only refs/description change). Effective
dates come from SOURCES rather than the first <time> in <main>: amendment pages
quote the base instrument's signing date in their preamble, so the previous
extraction returned the wrong year for every amendment.

Corpus: 42 new delegation chunks (307 -> 349), 15511 total. Eval 129-Q:
Hit@1 0.74, Hit@3 0.88, Hit@5 0.92, Hit@10 0.96, MRR 0.82 -- a 1-question dip
(Basra 2007 burden of proof now #6 behind Basra 2012 + Canada Labour Code burden
sections, a borderline-gold reshuffle, not delegation-induced).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

canlex/delegation.py CHANGED
@@ -33,18 +33,89 @@ OUT = PROCESSED_DIR / "delegation.json"
33
  _UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
34
  "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  SOURCES = {
37
- "cbsa": {
38
- "code": "cbsa",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  "kind": "html-cbsa",
40
- "act_code": "CBSA-IRPA-DELEG",
41
- "act_short": "CBSA Delegation",
42
- "act_name": ("Delegation of Authority and Designations of Officers by "
43
- "the Minister of Public Safety and Emergency Preparedness "
44
- "under the Immigration and Refugee Protection Act and the "
45
- "Immigration and Refugee Protection Regulations"),
46
- "url": ("https://www.cbsa-asfc.gc.ca/agency-agence/actreg-loireg/"
47
- "delegation/irpa-lipr-2016-07-eng.html"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  },
49
  "ircc": {
50
  "code": "ircc",
@@ -140,8 +211,11 @@ def parse_cbsa(html, src):
140
  for sup in main.find_all("sup"): # drop footnote-reference superscripts
141
  sup.decompose()
142
 
143
- time_el = main.find("time")
144
- date = _norm(time_el.get("datetime") or time_el.get_text()) if time_el else ""
 
 
 
145
 
146
  chunks = []
147
 
@@ -172,21 +246,33 @@ def parse_cbsa(html, src):
172
  "source_url": src["url"],
173
  })
174
 
175
- # One chunk per Schedule item. Each topical <h3> is followed by a four-column
176
- # table: Item | Act/Regulations reference | Description of power | Delegates.
 
 
 
 
 
 
177
  for table in main.find_all("table", class_="table-bordered"):
178
  h3 = table.find_previous_sibling("h3")
179
  section_name = _norm(h3.get_text()) if h3 else ""
180
  for tr in table.find_all("tr"):
181
- cells = tr.find_all("td", recursive=False)
182
- if len(cells) != 4:
183
- continue # the header row (<th>) or a stray row
184
- item_no = _norm(cells[0].get_text()).rstrip(".")
185
- refs = _normalize_refs(_norm(cells[1].get_text()))
 
 
 
 
 
 
186
  power = " ".join(_norm(p.get_text())
187
- for p in cells[2].find_all("p")) \
188
- or _norm(cells[2].get_text())
189
- delegates = _delegates(cells[3])
190
  if not item_no or not (power or refs):
191
  continue
192
  text = power
@@ -215,6 +301,63 @@ def parse_cbsa(html, src):
215
  return chunks
216
 
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  # --- IRCC IL3 instrument (PDF) ------------------------------------------------
219
 
220
  # A topical part heading -- a line in full upper case (MINISTERIAL INSTRUCTIONS,
@@ -334,6 +477,9 @@ def build():
334
  if src["kind"] == "html-cbsa":
335
  html = _fetch(src["url"], RAW / f"{src['code']}.html")
336
  chunks = parse_cbsa(html, src)
 
 
 
337
  elif src["kind"] == "pdf-ircc":
338
  pdf = _fetch(src["url"], RAW / f"{src['code']}.pdf",
339
  powershell=True)
 
33
  _UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
34
  "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
35
 
36
+ # The CBSA IRPA delegation has been a moving target: the November 28, 2017
37
+ # instrument was expressly superseded by a full restatement on May 8, 2023
38
+ # (signed by Mendicino), which has itself been amended five times since. CBSA
39
+ # does not publish a consolidated version, so the current effective state is the
40
+ # 2023 restatement read together with its later amendments; we ingest each one
41
+ # as a separate "act" so a user (or the LLM) sees the base item and any
42
+ # amendments that touch it side-by-side in retrieval.
43
+ _CBSA_DELEG_URL = ("https://www.cbsa-asfc.gc.ca/agency-agence/actreg-loireg/"
44
+ "delegation/")
45
+ _CBSA_DELEG_NAME = ("Delegation of Authority and Designations of Officers by "
46
+ "the Minister of Public Safety and Emergency Preparedness "
47
+ "under the Immigration and Refugee Protection Act and the "
48
+ "Immigration and Refugee Protection Regulations")
49
+
50
  SOURCES = {
51
+ # The 2023 restatement -- the current base instrument.
52
+ "cbsa-2023-05": {
53
+ "code": "cbsa-2023-05",
54
+ "kind": "html-cbsa",
55
+ "act_code": "CBSA-IRPA-DELEG-2023-05",
56
+ "act_short": "CBSA Deleg 2023-05-08",
57
+ "act_name": _CBSA_DELEG_NAME,
58
+ "url": _CBSA_DELEG_URL + "irpa-lipr-2023-05-08-eng.html",
59
+ "effective": "2023-05-08",
60
+ },
61
+ # Amendments to the 2023 restatement, in chronological order.
62
+ "cbsa-2023-09": {
63
+ "code": "cbsa-2023-09",
64
+ "kind": "html-cbsa",
65
+ "act_code": "CBSA-IRPA-DELEG-AMEND-2023-09-08",
66
+ "act_short": "CBSA Deleg Amend 2023-09-08",
67
+ "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
68
+ "url": _CBSA_DELEG_URL + "irpa-lipr-2023-09-08-eng.html",
69
+ "effective": "2023-09-08",
70
+ },
71
+ "cbsa-2023-11": {
72
+ "code": "cbsa-2023-11",
73
+ "kind": "html-cbsa",
74
+ "act_code": "CBSA-IRPA-DELEG-AMEND-2023-11-17",
75
+ "act_short": "CBSA Deleg Amend 2023-11-17",
76
+ "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
77
+ "url": _CBSA_DELEG_URL + "irpa-lipr-2023-11-17-eng.html",
78
+ "effective": "2023-11-17",
79
+ },
80
+ "cbsa-2024-03-05": {
81
+ "code": "cbsa-2024-03-05",
82
  "kind": "html-cbsa",
83
+ "act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-05",
84
+ "act_short": "CBSA Deleg Amend 2024-03-05",
85
+ "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
86
+ "url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-05-eng.html",
87
+ "effective": "2024-03-05",
88
+ },
89
+ "cbsa-2024-03-15": {
90
+ "code": "cbsa-2024-03-15",
91
+ "kind": "html-cbsa",
92
+ "act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-15",
93
+ "act_short": "CBSA Deleg Amend 2024-03-15",
94
+ "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
95
+ "url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-15-eng.html",
96
+ "effective": "2024-03-15",
97
+ },
98
+ "cbsa-2025-07": {
99
+ "code": "cbsa-2025-07",
100
+ "kind": "html-cbsa",
101
+ "act_code": "CBSA-IRPA-DELEG-AMEND-2025-07-10",
102
+ "act_short": "CBSA Deleg Amend 2025-07-10",
103
+ "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
104
+ "url": _CBSA_DELEG_URL + "irpa-lipr-2025-07-10-eng.html",
105
+ "effective": "2025-07-10",
106
+ },
107
+ # Separate authority: a peace-officer designation under IRPA s. 138(1).
108
+ # Narrative prose, not a Schedule table -- needs its own parser.
109
+ "cbsa-peaceofficer": {
110
+ "code": "cbsa-peaceofficer",
111
+ "kind": "html-cbsa-narrative",
112
+ "act_code": "CBSA-IRPA-PEACEOFF-2022-08",
113
+ "act_short": "CBSA Peace Officer Auth 2022-08-18",
114
+ "act_name": ("Authorization to have the Authority and Powers of a "
115
+ "Peace Officer under the Immigration and Refugee "
116
+ "Protection Act (subsection 138(1))"),
117
+ "url": _CBSA_DELEG_URL + "desig/po-ag_2022-08-eng.html",
118
+ "effective": "2022-08-18",
119
  },
120
  "ircc": {
121
  "code": "ircc",
 
211
  for sup in main.find_all("sup"): # drop footnote-reference superscripts
212
  sup.decompose()
213
 
214
+ # The effective date comes from SOURCES, not the first <time> in <main>:
215
+ # amendment pages quote the base instrument's date ("signed on May 8, 2023")
216
+ # in their preamble, so the first <time> on an amendment page is the base
217
+ # instrument's date, not the amendment's own.
218
+ date = src["effective"]
219
 
220
  chunks = []
221
 
 
246
  "source_url": src["url"],
247
  })
248
 
249
+ # One chunk per Schedule item. Two row shapes are accepted:
250
+ # (a) four <td> cells: Item | Refs | Power | Delegates (the 2023-05
251
+ # restatement and the 2023-09, 2023-11, 2024-03 amendments).
252
+ # (b) one <th> + two or three <td> cells: the <th> carries the item
253
+ # number and the <td>s carry Refs | Power [| Delegates]. The
254
+ # 2025-07-10 amendment uses this layout, and may omit the Delegates
255
+ # column when an amendment changes only references or descriptions.
256
+ # Each topical <h3>, if present, names the schedule section the table belongs to.
257
  for table in main.find_all("table", class_="table-bordered"):
258
  h3 = table.find_previous_sibling("h3")
259
  section_name = _norm(h3.get_text()) if h3 else ""
260
  for tr in table.find_all("tr"):
261
+ th_cells = tr.find_all("th", recursive=False)
262
+ td_cells = tr.find_all("td", recursive=False)
263
+ if not th_cells and len(td_cells) == 4:
264
+ item_cell, refs_cell, power_cell, deleg_cell = td_cells
265
+ elif len(th_cells) == 1 and len(td_cells) in (2, 3):
266
+ item_cell, refs_cell, power_cell = th_cells[0], td_cells[0], td_cells[1]
267
+ deleg_cell = td_cells[2] if len(td_cells) == 3 else None
268
+ else:
269
+ continue # header row or a stray row
270
+ item_no = _norm(item_cell.get_text()).rstrip(".")
271
+ refs = _normalize_refs(_norm(refs_cell.get_text()))
272
  power = " ".join(_norm(p.get_text())
273
+ for p in power_cell.find_all("p")) \
274
+ or _norm(power_cell.get_text())
275
+ delegates = _delegates(deleg_cell) if deleg_cell is not None else ""
276
  if not item_no or not (power or refs):
277
  continue
278
  text = power
 
301
  return chunks
302
 
303
 
304
+ # --- CBSA narrative-prose instrument (e.g. the peace-officer designation) -----
305
+
306
+ def parse_cbsa_narrative(html, src):
307
+ """Parse a narrative-prose CBSA designation instrument into a single chunk.
308
+
309
+ Used for the peace-officer authorization under IRPA s. 138(1) -- plain prose
310
+ listing 18 designated officer positions, not a four-column Schedule table,
311
+ so parse_cbsa's table walker would yield nothing. The whole operative text
312
+ is a few hundred words, well within a single chunk."""
313
+ soup = BeautifulSoup(html, "html.parser")
314
+ main = soup.find("main")
315
+ if main is None:
316
+ return []
317
+ for sup in main.find_all("sup"):
318
+ sup.decompose()
319
+
320
+ date = src["effective"]
321
+
322
+ # Skip the breadcrumb/title/footer chrome -- only paragraphs and lists in
323
+ # <main> that carry real content. Lists are rendered as "; "-joined items.
324
+ parts = []
325
+ for el in main.find_all(["p", "ul", "ol"]):
326
+ if el.find_parent(["ul", "ol"]):
327
+ continue # nested lists are picked up by their parent
328
+ if el.name in ("ul", "ol"):
329
+ items = [_norm(li.get_text()) for li in el.find_all("li")]
330
+ joined = "; ".join(t for t in items if t)
331
+ if joined:
332
+ parts.append(joined)
333
+ else:
334
+ text = _norm(el.get_text())
335
+ if text:
336
+ parts.append(text)
337
+ body = "\n".join(parts)
338
+ if not body:
339
+ return []
340
+
341
+ return [{
342
+ "id": f"delegation-{src['code']}",
343
+ "doc_type": "delegation",
344
+ "act_code": src["act_code"],
345
+ "act_short": src["act_short"],
346
+ "act_name": src["act_name"],
347
+ "section": "",
348
+ "marginal_note": "Peace-officer authorization — IRPA s. 138(1)",
349
+ "part": "",
350
+ "division": "",
351
+ "heading": src["act_name"],
352
+ "text": body,
353
+ "history": "",
354
+ "last_amended": "",
355
+ "current_to": date,
356
+ "citation": src["act_short"],
357
+ "source_url": src["url"],
358
+ }]
359
+
360
+
361
  # --- IRCC IL3 instrument (PDF) ------------------------------------------------
362
 
363
  # A topical part heading -- a line in full upper case (MINISTERIAL INSTRUCTIONS,
 
477
  if src["kind"] == "html-cbsa":
478
  html = _fetch(src["url"], RAW / f"{src['code']}.html")
479
  chunks = parse_cbsa(html, src)
480
+ elif src["kind"] == "html-cbsa-narrative":
481
+ html = _fetch(src["url"], RAW / f"{src['code']}.html")
482
+ chunks = parse_cbsa_narrative(html, src)
483
  elif src["kind"] == "pdf-ircc":
484
  pdf = _fetch(src["url"], RAW / f"{src['code']}.pdf",
485
  powershell=True)
data/processed/delegation.json CHANGED
The diff for this file is too large to render. See raw diff