CanLex / canlex /delegation.py
Beemer
Refresh the CBSA IRPA delegation: 2023 restatement + 5 amendments + peace-officer auth
ef6e3dc
"""Ingest instruments of delegation and designation under IRPA / IRPR.
These instruments record which officer positions the Minister has delegated
powers to, or designated for functions, under the Immigration and Refugee
Protection Act and its Regulations. They are administrative instruments -- not
enacted law, and not guidance -- so every chunk is tagged doc_type="delegation".
Sources:
- the CBSA "Delegation of Authority and Designations of Officers ..."
instrument (HTML, cbsa-asfc.gc.ca);
- the IRCC "IL3 -- Instrument of Designation and Delegation" (PDF, canada.ca).
Run with: py -m canlex.delegation
"""
import io
import json
import re
import subprocess
import time
import urllib.request
from bs4 import BeautifulSoup
from pypdf import PdfReader
from .config import PROCESSED_DIR, RAW_DIR
# Directory under which _fetch caches the raw downloads (HTML pages, the PDF).
RAW = RAW_DIR / "delegation"
# JSON file that build() writes the combined list of delegation chunks to.
OUT = PROCESSED_DIR / "delegation.json"
# cbsa-asfc.gc.ca serves an ordinary client fine with a browser User-Agent;
# canada.ca (the IRCC PDF) blocks Python's HTTP client at the TLS layer, so that
# one is fetched via PowerShell's (.NET) HTTP stack, as agreement.py does.
# The same User-Agent string is sent on both fetch paths.
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
# The CBSA IRPA delegation keeps changing: the November 28, 2017 instrument was
# expressly superseded by a full restatement on May 8, 2023 (signed by
# Mendicino), and that restatement has been amended five times since. CBSA
# publishes no consolidated version, so the current effective state is the 2023
# restatement read together with its later amendments. Each one is ingested as
# a separate "act" so that a user (or the LLM) sees the base item and any
# amendments that touch it side-by-side in retrieval.
_CBSA_DELEG_URL = (
    "https://www.cbsa-asfc.gc.ca/agency-agence/actreg-loireg/delegation/"
)
_CBSA_DELEG_NAME = " ".join((
    "Delegation of Authority and Designations of Officers by",
    "the Minister of Public Safety and Emergency Preparedness",
    "under the Immigration and Refugee Protection Act and the",
    "Immigration and Refugee Protection Regulations",
))


def _cbsa_amendment(code, date):
    """Build the SOURCES entry for one amendment to the 2023 restatement.

    *code* is the entry's key and cache-file stem; *date* is the amendment's
    ISO signing date, which also appears in its act code, short name and URL.
    """
    return {
        "code": code,
        "kind": "html-cbsa",
        "act_code": f"CBSA-IRPA-DELEG-AMEND-{date}",
        "act_short": f"CBSA Deleg Amend {date}",
        "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
        "url": f"{_CBSA_DELEG_URL}irpa-lipr-{date}-eng.html",
        "effective": date,
    }


# Every instrument to ingest, keyed by its own "code", in ingestion order.
SOURCES = {
    entry["code"]: entry
    for entry in (
        # The 2023 restatement -- the current base instrument.
        {
            "code": "cbsa-2023-05",
            "kind": "html-cbsa",
            "act_code": "CBSA-IRPA-DELEG-2023-05",
            "act_short": "CBSA Deleg 2023-05-08",
            "act_name": _CBSA_DELEG_NAME,
            "url": _CBSA_DELEG_URL + "irpa-lipr-2023-05-08-eng.html",
            "effective": "2023-05-08",
        },
        # Amendments to the 2023 restatement, in chronological order.
        _cbsa_amendment("cbsa-2023-09", "2023-09-08"),
        _cbsa_amendment("cbsa-2023-11", "2023-11-17"),
        _cbsa_amendment("cbsa-2024-03-05", "2024-03-05"),
        _cbsa_amendment("cbsa-2024-03-15", "2024-03-15"),
        _cbsa_amendment("cbsa-2025-07", "2025-07-10"),
        # Separate authority: a peace-officer designation under IRPA
        # s. 138(1). Narrative prose, not a Schedule table -- it needs its
        # own parser.
        {
            "code": "cbsa-peaceofficer",
            "kind": "html-cbsa-narrative",
            "act_code": "CBSA-IRPA-PEACEOFF-2022-08",
            "act_short": "CBSA Peace Officer Auth 2022-08-18",
            "act_name": ("Authorization to have the Authority and Powers of a "
                         "Peace Officer under the Immigration and Refugee "
                         "Protection Act (subsection 138(1))"),
            "url": _CBSA_DELEG_URL + "desig/po-ag_2022-08-eng.html",
            "effective": "2022-08-18",
        },
        # The IRCC instrument (PDF); its version is read from the PDF itself,
        # so the entry carries no "effective" date.
        {
            "code": "ircc",
            "kind": "pdf-ircc",
            "act_code": "IRCC-IL3-DELEG",
            "act_short": "IRCC IL3",
            "act_name": ("IL3 — Instrument of Designation and Delegation, "
                         "Immigration and Refugee Protection Act and "
                         "Regulations"),
            "url": ("https://www.canada.ca/content/dam/ircc/migration/ircc/"
                    "english/resources/manuals/il/il3-eng.pdf"),
        },
    )
}
def _norm(text):
    """Return *text* with every run of whitespace squeezed to one space.

    Non-breaking spaces, which these sources use heavily, count as
    whitespace. Leading and trailing whitespace is removed, and a falsy
    *text* (None or "") yields the empty string.
    """
    if not text:
        return ""
    words = text.replace("\xa0", " ").split()
    return " ".join(words)
def _normalize_refs(text):
    """Spell out the instruments' provision shorthand, then tidy whitespace.

    Both instruments write 'A' for the Act and 'R' for the Regulations, so a
    standalone 'A' or 'R' immediately followed by a digit is expanded to make
    the section number a searchable token of its own:
    'A55(1)' -> 'IRPA 55(1)', 'R39' -> 'IRPR 39'.
    """
    expansions = {"A": "IRPA ", "R": "IRPR "}
    expanded = re.sub(r"\b([AR])(?=\d)",
                      lambda hit: expansions[hit.group(1)],
                      text)
    return _norm(expanded)
def _fetch(url, dest, powershell=False):
    """Return the bytes at *url*, caching them in the file *dest*.

    If *dest* already exists its contents are returned without any network
    access. Otherwise the resource is downloaded to a temporary sibling file
    and renamed onto *dest* only once the download has finished, so that an
    interrupted or timed-out transfer can never be mistaken for a valid cache
    entry on the next run.

    Args:
        url: Address of the page or file to fetch.
        dest: pathlib.Path of the cache file (its parent is created on demand).
        powershell: Fetch through PowerShell's Invoke-WebRequest instead of
            urllib. canada.ca blocks Python's HTTP client, so its PDF is
            fetched this way.

    Returns:
        The raw bytes of the resource.

    Raises:
        urllib.error.URLError: the urllib download failed.
        subprocess.CalledProcessError, subprocess.TimeoutExpired: the
            PowerShell download failed or exceeded its 180-second limit.
    """
    if dest.exists():
        return dest.read_bytes()
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    try:
        if powershell:
            def quoted(value):
                # Inside a PowerShell single-quoted string the only special
                # character is the quote itself, escaped by doubling it.
                return "'" + str(value).replace("'", "''") + "'"

            command = (f"Invoke-WebRequest -Uri {quoted(url)} "
                       f"-OutFile {quoted(partial)} "
                       f"-UseBasicParsing -UserAgent {quoted(_UA)}")
            subprocess.run(
                ["powershell", "-NoProfile", "-NonInteractive", "-Command",
                 command],
                check=True, capture_output=True, timeout=180)
        else:
            req = urllib.request.Request(url, headers={"User-Agent": _UA})
            with urllib.request.urlopen(req, timeout=60) as resp:
                partial.write_bytes(resp.read())
        partial.replace(dest)  # publish the finished download in one step
    finally:
        if partial.exists():  # only true when the download did not complete
            partial.unlink()
    time.sleep(0.5)  # be polite to the server
    return dest.read_bytes()
# --- CBSA instrument (HTML) ---------------------------------------------------
def _delegates(cell):
    """Flatten a 'Delegates / Designated officials' table cell into lines.

    The cell groups officer positions under an organisation header
    (<p class="h4">: CBSA, RCMP, IRCC) and an optional sub-heading
    (<p class="h5">: a region or a headquarters branch), each followed by a
    <ul> of position titles. Only the cell's direct <p> and <ul> children are
    read.

    Args:
        cell: The BeautifulSoup element for the table cell.

    Returns:
        One line per list, of the form "Org — Sub: title; title" (the label is
        just the organisation when there is no sub-heading), plus one line for
        each free-standing note and for each organisation that has no list of
        its own, joined with newlines.
    """
    lines, org, sub, org_used = [], "", "", False
    for el in cell.find_all(["p", "ul"], recursive=False):
        if el.name == "p":
            classes = " ".join(el.get("class") or [])
            text = _norm(el.get_text())
            if not text:
                continue
            if "h4" in classes:
                if org and not org_used:  # previous org had no list of its own
                    lines.append(org)
                org, sub, org_used = text.rstrip(": "), "", False
            elif "h5" in classes:
                # Strip a trailing colon as for the organisation, so the
                # ": " added after the label never doubles up.
                sub = text.rstrip(": ")
            else:
                lines.append(text)  # a free-standing note
        else:  # <ul> of position titles
            positions = "; ".join(
                title for title in (_norm(li.get_text())
                                    for li in el.find_all("li",
                                                          recursive=False))
                if title)
            if positions:
                # Keep organisation and sub-heading visibly apart; joining
                # them directly would read e.g. "CBSAPacific Region".
                label = f"{org} — {sub}" if org and sub else (org or sub)
                lines.append(f"{label}: {positions}" if label else positions)
                org_used = True
    if org and not org_used:  # a trailing org with no list
        lines.append(org)
    return "\n".join(lines)
def _cbsa_chunk(src, date, chunk_id, marginal_note, heading, text, citation,
                part=""):
    """Assemble one CBSA chunk dict in the key order used for every chunk."""
    return {
        "id": chunk_id,
        "doc_type": "delegation",
        "act_code": src["act_code"],
        "act_short": src["act_short"],
        "act_name": src["act_name"],
        "section": "",
        "marginal_note": marginal_note,
        "part": part,
        "division": "",
        "heading": heading,
        "text": text,
        "history": "",
        "last_amended": "",
        "current_to": date,
        "citation": citation,
        "source_url": src["url"],
    }


def parse_cbsa(html, src):
    """Parse a CBSA delegation instrument into chunks.

    Produces one chunk for the preamble (when the page has an <h2 id="sch">
    Schedule heading with paragraphs before it) and one chunk per Schedule
    item found in the page's "table-bordered" tables.

    Args:
        html: The page markup, as accepted by BeautifulSoup.
        src: The instrument's SOURCES entry; "code", "act_code", "act_short",
            "act_name", "url" and "effective" are read.

    Returns:
        A list of chunk dicts; empty when the page has no <main> element.
    """
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    for sup in main.find_all("sup"):  # drop footnote-reference superscripts
        sup.decompose()
    # The effective date comes from SOURCES, not the first <time> in <main>:
    # amendment pages quote the base instrument's date ("signed on May 8,
    # 2023") in their preamble, so the first <time> on an amendment page is
    # the base instrument's date, not the amendment's own.
    date = src["effective"]
    chunks = []

    # Preamble: the paragraphs between the title and the Schedule, which set
    # out the tiers of delegation and designation and how the columns are
    # read. find_previous_siblings yields nearest-first, hence the reversal.
    schedule = main.find("h2", id="sch")
    if schedule:
        paras = [_norm(p.get_text())
                 for p in reversed(schedule.find_previous_siblings("p"))]
        body = "\n".join(p for p in paras if p)
        if body:
            chunks.append(_cbsa_chunk(
                src, date,
                chunk_id=f"delegation-{src['code']}-preamble",
                marginal_note="Tiers of delegation and designation",
                heading="Instrument of delegation and designation under IRPA",
                text=body,
                citation=f"{src['act_short']} — Preamble",
            ))

    # One chunk per Schedule item. Two row shapes are accepted:
    #   (a) four <td> cells: Item | Refs | Power | Delegates (the 2023-05
    #       restatement and the 2023-09, 2023-11, 2024-03 amendments).
    #   (b) one <th> + two or three <td> cells: the <th> carries the item
    #       number and the <td>s carry Refs | Power [| Delegates]. The
    #       2025-07-10 amendment uses this layout, and may omit the Delegates
    #       column when an amendment changes only references or descriptions.
    # Each topical <h3>, if present, names the schedule section the table
    # belongs to.
    # NOTE(review): chunk ids are built from the item number alone, so two
    # rows with the same number on one page would share an id -- confirm the
    # pages never repeat an item number.
    for table in main.find_all("table", class_="table-bordered"):
        h3 = table.find_previous_sibling("h3")
        section_name = _norm(h3.get_text()) if h3 else ""
        for tr in table.find_all("tr"):
            th_cells = tr.find_all("th", recursive=False)
            td_cells = tr.find_all("td", recursive=False)
            if not th_cells and len(td_cells) == 4:
                item_cell, refs_cell, power_cell, deleg_cell = td_cells
            elif len(th_cells) == 1 and len(td_cells) in (2, 3):
                item_cell = th_cells[0]
                refs_cell, power_cell = td_cells[0], td_cells[1]
                deleg_cell = td_cells[2] if len(td_cells) == 3 else None
            else:
                continue  # header row or a stray row
            item_no = _norm(item_cell.get_text()).rstrip(".")
            refs = _normalize_refs(refs_cell.get_text())
            # Prefer the cell's <p> elements; fall back to its whole text
            # when the description is not wrapped in paragraphs.
            power = (" ".join(_norm(p.get_text())
                              for p in power_cell.find_all("p"))
                     or _norm(power_cell.get_text()))
            delegates = (_delegates(deleg_cell)
                         if deleg_cell is not None else "")
            if not item_no or not (power or refs):
                continue
            # Join only the parts that exist, so an item with references but
            # no description does not start with blank lines.
            sections = [power] if power else []
            if refs:
                sections.append(f"Provisions (IRPA / IRPR): {refs}.")
            if delegates:
                sections.append("Delegated / designated to:\n" + delegates)
            chunks.append(_cbsa_chunk(
                src, date,
                chunk_id=f"delegation-{src['code']}-{item_no}",
                marginal_note=refs or f"Item {item_no}",
                heading=section_name,
                text="\n\n".join(sections),
                citation=f"{src['act_short']}, Item {item_no}",
                part=section_name,
            ))
    return chunks
# --- CBSA narrative-prose instrument (e.g. the peace-officer designation) -----
def parse_cbsa_narrative(html, src):
    """Parse a narrative-prose CBSA designation instrument into a single chunk.

    Used for the peace-officer authorization under IRPA s. 138(1) -- plain
    prose listing 18 designated officer positions, not a four-column Schedule
    table, so parse_cbsa's table walker would yield nothing. The whole
    operative text is a few hundred words, well within a single chunk.

    Args:
        html: The page markup, as accepted by BeautifulSoup.
        src: The instrument's SOURCES entry; "code", "act_code", "act_short",
            "act_name", "url" and "effective" are read.

    Returns:
        A one-element list holding the chunk, or an empty list when the page
        has no <main> element or <main> carries no text.
    """
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    for sup in main.find_all("sup"):  # drop footnote-reference superscripts
        sup.decompose()
    date = src["effective"]
    # Collect every paragraph and list inside <main>, in document order;
    # anything outside <main> is thereby left out. Lists are rendered as
    # "; "-joined items.
    parts = []
    for el in main.find_all(["p", "ul", "ol"]):
        if el.find_parent(["ul", "ol"]):
            continue  # content nested in a list is covered by its parent <li>
        if el.name in ("ul", "ol"):
            # Direct children only: an <li>'s text already includes any list
            # nested inside it, so walking nested <li>s too would emit their
            # text twice.
            items = [_norm(li.get_text())
                     for li in el.find_all("li", recursive=False)]
            joined = "; ".join(t for t in items if t)
            if joined:
                parts.append(joined)
        else:
            text = _norm(el.get_text())
            if text:
                parts.append(text)
    body = "\n".join(parts)
    if not body:
        return []
    return [{
        "id": f"delegation-{src['code']}",
        "doc_type": "delegation",
        "act_code": src["act_code"],
        "act_short": src["act_short"],
        "act_name": src["act_name"],
        "section": "",
        "marginal_note": "Peace-officer authorization — IRPA s. 138(1)",
        "part": "",
        "division": "",
        "heading": src["act_name"],
        "text": body,
        "history": "",
        "last_amended": "",
        "current_to": date,
        "citation": src["act_short"],
        "source_url": src["url"],
    }]
# --- IRCC IL3 instrument (PDF) ------------------------------------------------
# A topical part heading -- a line in full upper case (MINISTERIAL INSTRUCTIONS,
# MEDICAL, MISCELLANEOUS). Organisation acronyms (CI, CBSA, RCMP) are shorter
# than the 6-character floor and so are not mistaken for headings. parse_ircc
# applies this with fullmatch, so the whole line must consist of an upper-case
# letter followed by five or more upper-case letters, spaces or the listed
# punctuation.
# NOTE(review): a line made only of acronyms and separators that reaches six
# characters (e.g. "CBSA, RCMP") would also match -- confirm no such line
# occurs in the extracted PDF text.
_IL3_PART = re.compile(r"[A-Z][A-Z &/,()'.\-]{5,}")
# An item opens "<n>. <A/R provision>" -- the number must be followed by a
# provision reference, which rejects ordinary numbered prose. Group 1 is the
# item number; the provision itself is only looked ahead at, not consumed.
_IL3_ITEM = re.compile(r"(\d+)\.\s+(?=[AR]\d)")
# The word that introduces an item's power description, after the provisions:
# "Delegation" or "Designation" followed by a hyphen, en dash or em dash.
_IL3_POWER = re.compile(r"(?:Delegation|Designation)\s*[-–—]\s*")
def _pdf_pages(pdf_bytes):
    """Return the extracted text of each PDF page, minus its page number.

    pypdf emits the printed page number as a page's first line (roman in the
    front matter, arabic later); when the first line consists solely of such
    a number it is dropped. Pages with no extractable text yield "".
    """
    page_number = re.compile(r"\s*[ivxlcdm\d]+\s*", re.I)
    texts = []
    for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
        text = page.extract_text() or ""
        first_line, _, remainder = text.partition("\n")
        if page_number.fullmatch(first_line):
            text = remainder
        texts.append(text)
    return texts
def _ircc_chunk(src, version, chunk_id, marginal_note, heading, text,
                citation, part=""):
    """Assemble one IL3 chunk dict in the key order used for every chunk."""
    return {
        "id": chunk_id,
        "doc_type": "delegation",
        "act_code": src["act_code"],
        "act_short": src["act_short"],
        "act_name": src["act_name"],
        "section": "",
        "marginal_note": marginal_note,
        "part": part,
        "division": "",
        "heading": heading,
        "text": text,
        "history": "",
        "last_amended": "",
        "current_to": version,
        "citation": citation,
        "source_url": src["url"],
    }


def parse_ircc(pdf_bytes, src):
    """Parse the IRCC IL3 instrument into chunks.

    Produces a preamble chunk (the text of the PDF's second page) plus one
    chunk per Schedule item. The PDF flattens the four-column table into a
    linear text stream, so each item runs from its numbered marker to the
    next; the power description and the delegated positions are kept together
    within the item.

    Args:
        pdf_bytes: The raw bytes of the IL3 PDF.
        src: The instrument's SOURCES entry; "code", "act_code", "act_short",
            "act_name" and "url" are read.

    Returns:
        A list of chunk dicts; empty when the PDF has no pages. Item chunk
        ids are numbered by position in the document, while citations carry
        the item number printed in the instrument.
    """
    pages = _pdf_pages(pdf_bytes)
    if not pages:
        return []
    # The edition ("Spring 2024"-style season and year), taken from the first
    # page when present, stands in for an effective date.
    version_match = re.search(r"(?:Spring|Summer|Fall|Winter)\s+\d{4}",
                              pages[0])
    version = version_match.group(0) if version_match else ""
    chunks = []
    preamble = _norm(pages[1]) if len(pages) > 1 else ""
    if preamble:
        chunks.append(_ircc_chunk(
            src, version,
            chunk_id=f"delegation-{src['code']}-preamble",
            marginal_note=("Preamble — designation and delegation under "
                           "IRPA s. 6"),
            heading="Instrument of designation and delegation under IRPA",
            text=preamble,
            citation=f"{src['act_short']} — Preamble",
        ))

    # Walk every line: an all-caps line is a topical part heading; a line that
    # opens "<n>. <A/R provision>" starts a new item. Lines before the first
    # item (the preamble, definitions and table of contents) are skipped.
    items, part = [], ""
    cur_no = cur_part = None
    cur_lines = []
    for line in "\n".join(pages).split("\n"):
        s = _norm(line)
        if not s:
            continue
        if _IL3_PART.fullmatch(s):
            part = s.title()
            continue
        m = _IL3_ITEM.match(s)
        if m:
            if cur_no is not None:
                items.append((cur_part, cur_no, cur_lines))
            cur_no, cur_part, cur_lines = m.group(1), part, [s]
        elif cur_no is not None:
            cur_lines.append(s)
    if cur_no is not None:  # flush the final item
        items.append((cur_part, cur_no, cur_lines))

    for n, (item_part, item_no, lines) in enumerate(items, start=1):
        body = re.sub(r"^\d+\.\s*", "", "\n".join(lines)).strip()
        if not body:
            continue
        # Everything before "Delegation -" / "Designation -" is the list of
        # provisions the item concerns.
        power = _IL3_POWER.search(body)
        refs = _normalize_refs(body[:power.start()]) if power else ""
        # Keep the instrument name and the part apart in the citation;
        # joining them directly would read e.g. "IRCC IL3Medical, Item 5".
        if item_part:
            citation = f"{src['act_short']} — {item_part}, Item {item_no}"
        else:
            citation = f"{src['act_short']}, Item {item_no}"
        chunks.append(_ircc_chunk(
            src, version,
            chunk_id=f"delegation-{src['code']}-{n}",
            marginal_note=refs or f"Item {item_no}",
            heading=item_part,
            text=_normalize_refs(body),
            citation=citation,
            part=item_part,
        ))
    return chunks
def build():
    """Fetch, parse and chunk every delegation instrument into delegation.json.

    Each SOURCES entry is fetched (or read from the raw cache) and handed to
    the parser matching its "kind". An instrument whose fetch or parse raises
    is reported and skipped so the others are still ingested; the closing
    summary counts only the instruments actually ingested and names any that
    failed. The combined chunk list is written to OUT as UTF-8 JSON.
    """
    all_chunks = []
    ingested, failed = 0, []
    for src in SOURCES.values():
        print(f"Ingesting {src['act_short']} ...")
        try:
            if src["kind"] == "html-cbsa":
                html = _fetch(src["url"], RAW / f"{src['code']}.html")
                chunks = parse_cbsa(html, src)
            elif src["kind"] == "html-cbsa-narrative":
                html = _fetch(src["url"], RAW / f"{src['code']}.html")
                chunks = parse_cbsa_narrative(html, src)
            elif src["kind"] == "pdf-ircc":
                pdf = _fetch(src["url"], RAW / f"{src['code']}.pdf",
                             powershell=True)
                chunks = parse_ircc(pdf, src)
            else:
                chunks = []  # unknown kind: nothing to parse
        except Exception as exc:
            # Best effort by design: one unreachable or malformed instrument
            # must not stop the rest from being ingested.
            print(f" !! {src['act_short']}: {type(exc).__name__}: {exc}")
            failed.append(src["code"])
            continue
        all_chunks.extend(chunks)
        ingested += 1
        print(f" {len(chunks)} chunks")
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
                   encoding="utf-8")
    print(f"\n{len(all_chunks)} delegation chunks from {ingested} "
          f"instrument(s) -> {OUT}")
    if failed:
        print(f" !! {len(failed)} instrument(s) failed: {', '.join(failed)}")
# Script entry point: rebuild delegation.json when the module is executed
# directly (py -m canlex.delegation); importing the module does not run it.
if __name__ == "__main__":
    build()