Spaces:

Beemer0
/

CanLex

Running

CanLex / canlex /delegation.py

Beemer

Refresh the CBSA IRPA delegation: 2023 restatement + 5 amendments + peace-officer auth

ef6e3dc 4 days ago

21.1 kB

	"""Ingest instruments of delegation and designation under IRPA / IRPR.

	These instruments record which officer positions the Minister has delegated
	powers to, or designated for functions, under the Immigration and Refugee
	Protection Act and its Regulations. They are administrative instruments -- not
	enacted law, and not guidance -- so every chunk is tagged doc_type="delegation".

	Sources:
	- the CBSA "Delegation of Authority and Designations of Officers ..."
	instrument (HTML, cbsa-asfc.gc.ca);
	- the IRCC "IL3 -- Instrument of Designation and Delegation" (PDF, canada.ca).

	py -m canlex.delegation
	"""
	import io
	import json
	import re
	import subprocess
	import time
	import urllib.request

	from bs4 import BeautifulSoup
	from pypdf import PdfReader

	from .config import PROCESSED_DIR, RAW_DIR

	# Directory under RAW_DIR where _fetch() caches each instrument's raw download.
	RAW = RAW_DIR / "delegation"
	# JSON file that build() writes the combined list of chunks to.
	OUT = PROCESSED_DIR / "delegation.json"

	# cbsa-asfc.gc.ca serves an ordinary client fine with a browser User-Agent;
	# canada.ca (the IRCC PDF) blocks Python's HTTP client at the TLS layer, so that
	# one is fetched via PowerShell's (.NET) HTTP stack, as agreement.py does.
	# The same User-Agent string is sent on both fetch paths.
	_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
	"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")

	# The CBSA IRPA delegation has been a moving target: the November 28, 2017
	# instrument was expressly superseded by a full restatement on May 8, 2023
	# (signed by Mendicino), which has itself been amended five times since. CBSA
	# does not publish a consolidated version, so the current effective state is the
	# 2023 restatement read together with its later amendments; we ingest each one
	# as a separate "act" so a user (or the LLM) sees the base item and any
	# amendments that touch it side-by-side in retrieval.
	# URL prefix shared by every CBSA page listed in SOURCES.
	_CBSA_DELEG_URL = ("https://www.cbsa-asfc.gc.ca/agency-agence/actreg-loireg/"
	"delegation/")
	# Full title of the CBSA instrument; the amendment entries in SOURCES prefix
	# it with "Amendment to the ".
	_CBSA_DELEG_NAME = ("Delegation of Authority and Designations of Officers by "
	"the Minister of Public Safety and Emergency Preparedness "
	"under the Immigration and Refugee Protection Act and the "
	"Immigration and Refugee Protection Regulations")

	# Registry of the instruments to ingest, keyed by a short source code.
	# Fields of each entry:
	#   code      -- repeats the key; used in chunk ids and the cached file name
	#   kind      -- selects the parser in build(): "html-cbsa" (Schedule table),
	#                "html-cbsa-narrative" (prose) or "pdf-ircc" (IL3 PDF)
	#   act_code / act_short / act_name -- copied onto every chunk of the source
	#   url       -- where the instrument is fetched from
	#   effective -- ISO date used as the chunks' "current_to"; present on the
	#                CBSA entries only (the IRCC parser takes its version label
	#                from the PDF text instead)
	SOURCES = {
	# The 2023 restatement -- the current base instrument.
	"cbsa-2023-05": {
	"code": "cbsa-2023-05",
	"kind": "html-cbsa",
	"act_code": "CBSA-IRPA-DELEG-2023-05",
	"act_short": "CBSA Deleg 2023-05-08",
	"act_name": _CBSA_DELEG_NAME,
	"url": _CBSA_DELEG_URL + "irpa-lipr-2023-05-08-eng.html",
	"effective": "2023-05-08",
	},
	# Amendments to the 2023 restatement, in chronological order.
	"cbsa-2023-09": {
	"code": "cbsa-2023-09",
	"kind": "html-cbsa",
	"act_code": "CBSA-IRPA-DELEG-AMEND-2023-09-08",
	"act_short": "CBSA Deleg Amend 2023-09-08",
	"act_name": "Amendment to the " + _CBSA_DELEG_NAME,
	"url": _CBSA_DELEG_URL + "irpa-lipr-2023-09-08-eng.html",
	"effective": "2023-09-08",
	},
	"cbsa-2023-11": {
	"code": "cbsa-2023-11",
	"kind": "html-cbsa",
	"act_code": "CBSA-IRPA-DELEG-AMEND-2023-11-17",
	"act_short": "CBSA Deleg Amend 2023-11-17",
	"act_name": "Amendment to the " + _CBSA_DELEG_NAME,
	"url": _CBSA_DELEG_URL + "irpa-lipr-2023-11-17-eng.html",
	"effective": "2023-11-17",
	},
	"cbsa-2024-03-05": {
	"code": "cbsa-2024-03-05",
	"kind": "html-cbsa",
	"act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-05",
	"act_short": "CBSA Deleg Amend 2024-03-05",
	"act_name": "Amendment to the " + _CBSA_DELEG_NAME,
	"url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-05-eng.html",
	"effective": "2024-03-05",
	},
	"cbsa-2024-03-15": {
	"code": "cbsa-2024-03-15",
	"kind": "html-cbsa",
	"act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-15",
	"act_short": "CBSA Deleg Amend 2024-03-15",
	"act_name": "Amendment to the " + _CBSA_DELEG_NAME,
	"url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-15-eng.html",
	"effective": "2024-03-15",
	},
	"cbsa-2025-07": {
	"code": "cbsa-2025-07",
	"kind": "html-cbsa",
	"act_code": "CBSA-IRPA-DELEG-AMEND-2025-07-10",
	"act_short": "CBSA Deleg Amend 2025-07-10",
	"act_name": "Amendment to the " + _CBSA_DELEG_NAME,
	"url": _CBSA_DELEG_URL + "irpa-lipr-2025-07-10-eng.html",
	"effective": "2025-07-10",
	},
	# Separate authority: a peace-officer designation under IRPA s. 138(1).
	# Narrative prose, not a Schedule table -- needs its own parser.
	"cbsa-peaceofficer": {
	"code": "cbsa-peaceofficer",
	"kind": "html-cbsa-narrative",
	"act_code": "CBSA-IRPA-PEACEOFF-2022-08",
	"act_short": "CBSA Peace Officer Auth 2022-08-18",
	"act_name": ("Authorization to have the Authority and Powers of a "
	"Peace Officer under the Immigration and Refugee "
	"Protection Act (subsection 138(1))"),
	"url": _CBSA_DELEG_URL + "desig/po-ag_2022-08-eng.html",
	"effective": "2022-08-18",
	},
	# The IRCC instrument: a PDF on canada.ca, fetched via PowerShell in build().
	"ircc": {
	"code": "ircc",
	"kind": "pdf-ircc",
	"act_code": "IRCC-IL3-DELEG",
	"act_short": "IRCC IL3",
	"act_name": ("IL3 — Instrument of Designation and Delegation, "
	"Immigration and Refugee Protection Act and Regulations"),
	"url": ("https://www.canada.ca/content/dam/ircc/migration/ircc/english/"
	"resources/manuals/il/il3-eng.pdf"),
	},
	}


	def _norm(text):
	"""Collapse all whitespace -- including the non-breaking spaces these
	sources use heavily -- to single spaces."""
	return re.sub(r"\s+", " ", (text or "").replace("\xa0", " ")).strip()


	def _normalize_refs(text):
	    """Spell out the instruments' provision shorthand, then tidy whitespace.

	    Both instruments write 'A' for the Act and 'R' for the Regulations, so a
	    word-initial 'A' or 'R' directly followed by a digit is expanded:
	    'A55(1)' -> 'IRPA 55(1)', 'R39' -> 'IRPR 39'. This makes the section
	    numbers searchable as separate tokens. The result is passed through
	    _norm().
	    """
	    expansions = (
	        (r"\bA(?=\d)", "IRPA "),
	        (r"\bR(?=\d)", "IRPR "),
	    )
	    for pattern, replacement in expansions:
	        text = re.sub(pattern, replacement, text)
	    return _norm(text)


	def _fetch(url, dest, powershell=False):
	"""Fetch a page or file, caching the raw bytes under data/raw/delegation.
	canada.ca blocks Python's HTTP client, so its PDF is fetched via PowerShell."""
	if dest.exists():
	return dest.read_bytes()
	dest.parent.mkdir(parents=True, exist_ok=True)
	if powershell:
	command = (f"Invoke-WebRequest -Uri '{url}' -OutFile '{dest}' "
	f"-UseBasicParsing -UserAgent '{_UA}'")
	subprocess.run(
	["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
	check=True, capture_output=True, timeout=180)
	else:
	req = urllib.request.Request(url, headers={"User-Agent": _UA})
	with urllib.request.urlopen(req, timeout=60) as resp:
	dest.write_bytes(resp.read())
	time.sleep(0.5) # be polite to the server
	return dest.read_bytes()


	# --- CBSA instrument (HTML) ---------------------------------------------------

	def _delegates(cell):
	    """Render a 'Delegates / Designated officials' table cell as text lines.

	    The cell's direct children are walked in order. A <p class="h4"> names an
	    organisation (CBSA, RCMP, IRCC), a <p class="h5"> names an optional
	    sub-heading (a region or a headquarters branch), any other non-empty <p>
	    is a free-standing note, and each <ul> lists position titles. A list
	    becomes one line, "<org> — <sub>: title; title; ..." (without the
	    sub-heading, or without any label, when those are absent). An organisation
	    that never receives a list is emitted on a line of its own.

	    Returns the lines joined with newlines.
	    """
	    out = []
	    organisation = ""
	    subheading = ""
	    organisation_listed = False

	    for node in cell.find_all(["p", "ul"], recursive=False):
	        if node.name != "p":
	            # A <ul> of position titles: only its own <li> children count.
	            titles = [_norm(li.get_text())
	                      for li in node.find_all("li", recursive=False)]
	            joined = "; ".join(title for title in titles if title)
	            if not joined:
	                continue
	            prefix = f"{organisation} — {subheading}" if subheading else organisation
	            out.append(f"{prefix}: {joined}" if prefix else joined)
	            organisation_listed = True
	            continue

	        caption = _norm(node.get_text())
	        if not caption:
	            continue
	        css = " ".join(node.get("class") or [])
	        if "h4" in css:
	            # Starting a new organisation: flush the previous one first if it
	            # had no list of its own.
	            if organisation and not organisation_listed:
	                out.append(organisation)
	            organisation = caption.rstrip(": ")
	            subheading = ""
	            organisation_listed = False
	        elif "h5" in css:
	            subheading = caption
	        else:
	            out.append(caption)  # a free-standing note

	    # A trailing organisation with no list still gets mentioned.
	    if organisation and not organisation_listed:
	        out.append(organisation)
	    return "\n".join(out)


	def parse_cbsa(html, src):
	    """Split a table-based CBSA delegation instrument into retrieval chunks.

	    Yields one chunk for the preamble (the paragraphs that precede the
	    Schedule heading, when there are any) followed by one chunk per Schedule
	    item row.

	    Parameters:
	        html: the page markup, in any form BeautifulSoup accepts.
	        src: the instrument's SOURCES entry; its "code", "act_code",
	            "act_short", "act_name", "url" and "effective" fields are read.

	    Returns:
	        A list of chunk dicts, empty when the page has no <main> element.
	    """
	    main = BeautifulSoup(html, "html.parser").find("main")
	    if main is None:
	        return []
	    # Footnote-reference superscripts would otherwise leak into the text.
	    for marker in main.find_all("sup"):
	        marker.decompose()

	    # The date comes from SOURCES rather than the first <time> in <main>:
	    # amendment pages quote the base instrument's date ("signed on May 8,
	    # 2023") in their preamble, so the first <time> on an amendment page is
	    # the base instrument's date, not the amendment's own.
	    effective = src["effective"]

	    def make_chunk(suffix, marginal_note, part, heading, text, citation):
	        # Every chunk of this instrument shares the same record layout.
	        return {
	            "id": f"delegation-{src['code']}-{suffix}",
	            "doc_type": "delegation",
	            "act_code": src["act_code"],
	            "act_short": src["act_short"],
	            "act_name": src["act_name"],
	            "section": "",
	            "marginal_note": marginal_note,
	            "part": part,
	            "division": "",
	            "heading": heading,
	            "text": text,
	            "history": "",
	            "last_amended": "",
	            "current_to": effective,
	            "citation": citation,
	            "source_url": src["url"],
	        }

	    chunks = []

	    # Preamble: the sibling paragraphs ahead of the Schedule heading, which
	    # set out the tiers of delegation and designation and how the columns
	    # are read. find_previous_siblings walks backwards, hence the reversal.
	    schedule = main.find("h2", id="sch")
	    if schedule:
	        preceding = schedule.find_previous_siblings("p")
	        paragraphs = [_norm(p.get_text()) for p in reversed(preceding)]
	        body = "\n".join(p for p in paragraphs if p)
	        if body:
	            chunks.append(make_chunk(
	                "preamble",
	                "Tiers of delegation and designation",
	                "",
	                "Instrument of delegation and designation under IRPA",
	                body,
	                f"{src['act_short']} — Preamble",
	            ))

	    # One chunk per Schedule item. Two row shapes are accepted:
	    #   (a) four <td> cells: Item | Refs | Power | Delegates (the 2023-05
	    #       restatement and the 2023-09, 2023-11, 2024-03 amendments).
	    #   (b) one <th> + two or three <td> cells: the <th> carries the item
	    #       number and the <td>s carry Refs | Power [| Delegates]. The
	    #       2025-07-10 amendment uses this layout, and may omit the Delegates
	    #       column when an amendment changes only references or descriptions.
	    # The nearest preceding sibling <h3>, if any, names the schedule section
	    # the table belongs to.
	    for table in main.find_all("table", class_="table-bordered"):
	        title = table.find_previous_sibling("h3")
	        section_name = _norm(title.get_text()) if title else ""
	        for row in table.find_all("tr"):
	            heads = row.find_all("th", recursive=False)
	            cells = row.find_all("td", recursive=False)
	            if not heads and len(cells) == 4:
	                item_cell, refs_cell, power_cell, deleg_cell = cells
	            elif len(heads) == 1 and 2 <= len(cells) <= 3:
	                item_cell = heads[0]
	                refs_cell, power_cell, *extra = cells
	                deleg_cell = extra[0] if extra else None
	            else:
	                continue  # header row or a stray row

	            item_no = _norm(item_cell.get_text()).rstrip(".")
	            refs = _normalize_refs(_norm(refs_cell.get_text()))
	            # Prefer the cell's <p> elements; fall back to its whole text.
	            power = " ".join(_norm(p.get_text())
	                             for p in power_cell.find_all("p"))
	            if not power:
	                power = _norm(power_cell.get_text())
	            delegates = "" if deleg_cell is None else _delegates(deleg_cell)
	            if not item_no or not (power or refs):
	                continue

	            sections = [power]
	            if refs:
	                sections.append(f"Provisions (IRPA / IRPR): {refs}.")
	            if delegates:
	                sections.append("Delegated / designated to:\n" + delegates)
	            chunks.append(make_chunk(
	                item_no,
	                refs or f"Item {item_no}",
	                section_name,
	                section_name,
	                "\n\n".join(sections),
	                f"{src['act_short']}, Item {item_no}",
	            ))
	    return chunks


	# --- CBSA narrative-prose instrument (e.g. the peace-officer designation) -----

	def parse_cbsa_narrative(html, src):
	    """Parse a narrative-prose CBSA designation instrument into a single chunk.

	    Used for the peace-officer authorization under IRPA s. 138(1) -- plain
	    prose with lists of designated positions, not a Schedule table, so
	    parse_cbsa's table walker would yield nothing. The whole operative text is
	    kept together as one chunk.

	    Parameters:
	        html: the page markup, in any form BeautifulSoup accepts.
	        src: the instrument's SOURCES entry; its "code", "act_code",
	            "act_short", "act_name", "url" and "effective" fields are read.

	    Returns:
	        A one-element list holding the chunk dict, or an empty list when the
	        page has no <main> element or <main> carries no text.
	    """
	    soup = BeautifulSoup(html, "html.parser")
	    main = soup.find("main")
	    if main is None:
	        return []
	    for sup in main.find_all("sup"):  # drop footnote-reference superscripts
	        sup.decompose()

	    date = src["effective"]

	    # Skip the breadcrumb/title/footer chrome -- only paragraphs and lists in
	    # <main> that carry real content. Lists are rendered as "; "-joined items.
	    parts = []
	    for el in main.find_all(["p", "ul", "ol"]):
	        if el.find_parent(["ul", "ol"]):
	            continue  # anything inside a list is picked up by that list
	        if el.name in ("ul", "ol"):
	            # Direct <li> children only: a nested list's items are already
	            # part of their parent <li>'s text, so a recursive search would
	            # emit them a second time.
	            items = [_norm(li.get_text())
	                     for li in el.find_all("li", recursive=False)]
	            joined = "; ".join(t for t in items if t)
	            if joined:
	                parts.append(joined)
	        else:
	            text = _norm(el.get_text())
	            if text:
	                parts.append(text)
	    body = "\n".join(parts)
	    if not body:
	        return []

	    return [{
	        "id": f"delegation-{src['code']}",
	        "doc_type": "delegation",
	        "act_code": src["act_code"],
	        "act_short": src["act_short"],
	        "act_name": src["act_name"],
	        "section": "",
	        "marginal_note": "Peace-officer authorization — IRPA s. 138(1)",
	        "part": "",
	        "division": "",
	        "heading": src["act_name"],
	        "text": body,
	        "history": "",
	        "last_amended": "",
	        "current_to": date,
	        "citation": src["act_short"],
	        "source_url": src["url"],
	    }]


	# --- IRCC IL3 instrument (PDF) ------------------------------------------------

	# A topical part heading -- a line in full upper case (MINISTERIAL INSTRUCTIONS,
	# MEDICAL, MISCELLANEOUS). Organisation acronyms (CI, CBSA, RCMP) are shorter
	# than the 6-character floor and so are not mistaken for headings.
	_IL3_PART = re.compile(r"[A-Z][A-Z &/,()'.\-]{5,}")
	# An item opens "<n>. <A/R provision>" -- the number must be followed by a
	# provision reference, which rejects ordinary numbered prose.
	_IL3_ITEM = re.compile(r"(\d+)\.\s+(?=[AR]\d)")
	# The word that introduces an item's power description, after the provisions.
	_IL3_POWER = re.compile(r"(?:Delegation\|Designation)\s[-–—]\s")


	def _pdf_pages(pdf_bytes):
	    """Return the text of each PDF page, minus its printed page number.

	    pypdf emits the printed page number as the page's first line (roman in the
	    front matter, arabic later). A first line made up solely of roman-numeral
	    letters or digits, with or without surrounding whitespace, is dropped.

	    Parameters:
	        pdf_bytes: the raw bytes of the PDF file.

	    Returns:
	        A list with one string per page, in page order.
	    """
	    # Surrounding whitespace is optional: after splitting on "\n" a bare
	    # "iii" or "12" has none, and requiring it would keep every page number.
	    page_number = re.compile(r"\s*[ivxlcdm\d]+\s*", re.I)
	    reader = PdfReader(io.BytesIO(pdf_bytes))
	    pages = []
	    for page in reader.pages:
	        # extract_text() may give nothing for an empty page; treat as "".
	        lines = (page.extract_text() or "").split("\n")
	        if lines and page_number.fullmatch(lines[0] or ""):
	            lines = lines[1:]
	        pages.append("\n".join(lines))
	    return pages


	def parse_ircc(pdf_bytes, src):
	    """Parse the IRCC IL3 instrument into one chunk per Schedule item, plus a
	    preamble chunk.

	    The PDF flattens the four-column table into a linear text stream, so each
	    item runs from its numbered marker to the next; the power description and
	    the delegated positions are kept together within the item.

	    Parameters:
	        pdf_bytes: the raw bytes of the IL3 PDF.
	        src: the instrument's SOURCES entry; its "code", "act_code",
	            "act_short", "act_name" and "url" fields are read.

	    Returns:
	        A list of chunk dicts, empty when the PDF yields no pages.
	    """
	    pages = _pdf_pages(pdf_bytes)
	    if not pages:
	        return []
	    # The edition label -- a season name followed by a four-digit year -- is
	    # looked for on the first page and stamped on every chunk as "current_to".
	    # The season names are alternatives, so they are separated by a bare "|".
	    version_match = re.search(r"(?:Spring|Summer|Fall|Winter)\s+\d{4}",
	                              pages[0])
	    version = version_match.group(0) if version_match else ""

	    chunks = []
	    # The second page is taken as the preamble.
	    preamble = _norm(pages[1]) if len(pages) > 1 else ""
	    if preamble:
	        chunks.append({
	            "id": f"delegation-{src['code']}-preamble",
	            "doc_type": "delegation",
	            "act_code": src["act_code"],
	            "act_short": src["act_short"],
	            "act_name": src["act_name"],
	            "section": "",
	            "marginal_note": "Preamble — designation and delegation under IRPA s. 6",
	            "part": "",
	            "division": "",
	            "heading": "Instrument of designation and delegation under IRPA",
	            "text": preamble,
	            "history": "",
	            "last_amended": "",
	            "current_to": version,
	            "citation": f"{src['act_short']} — Preamble",
	            "source_url": src["url"],
	        })

	    # Walk every line: an all-caps line is a topical part heading; a line that
	    # opens "<n>. <A/R provision>" starts a new item. Lines before the first
	    # item (the preamble, definitions and table of contents) are skipped.
	    items, part = [], ""
	    cur_no = cur_part = None
	    cur_lines = []
	    for line in "\n".join(pages).split("\n"):
	        s = _norm(line)
	        if not s:
	            continue
	        if _IL3_PART.fullmatch(s):
	            part = s.title()
	            continue
	        m = _IL3_ITEM.match(s)
	        if m:
	            if cur_no is not None:  # close the item in progress
	                items.append((cur_part, cur_no, cur_lines))
	            cur_no, cur_part, cur_lines = m.group(1), part, [s]
	        elif cur_no is not None:
	            cur_lines.append(s)
	    if cur_no is not None:  # close the final item
	        items.append((cur_part, cur_no, cur_lines))

	    # Chunk ids use the running position n, not the printed item number.
	    for n, (item_part, item_no, lines) in enumerate(items, start=1):
	        body = re.sub(r"^\d+\.\s*", "", "\n".join(lines)).strip()
	        if not body:
	            continue
	        # Everything ahead of the "Delegation -"/"Designation -" marker is the
	        # list of provisions the item refers to.
	        power = _IL3_POWER.search(body)
	        refs = _normalize_refs(body[:power.start()]) if power else ""
	        chunks.append({
	            "id": f"delegation-{src['code']}-{n}",
	            "doc_type": "delegation",
	            "act_code": src["act_code"],
	            "act_short": src["act_short"],
	            "act_name": src["act_name"],
	            "section": "",
	            "marginal_note": refs or f"Item {item_no}",
	            "part": item_part or "",
	            "division": "",
	            "heading": item_part or "",
	            "text": _normalize_refs(body),
	            "history": "",
	            "last_amended": "",
	            "current_to": version,
	            "citation": (f"{src['act_short']} — {item_part}, Item {item_no}"
	                         if item_part else f"{src['act_short']}, Item {item_no}"),
	            "source_url": src["url"],
	        })
	    return chunks


	def build():
	    """Ingest every instrument in SOURCES and write the chunks to OUT.

	    Each source is fetched (or read from the raw cache), parsed by the parser
	    matching its "kind", and its chunks appended to one list that is written
	    as JSON. A source that raises is reported on stdout and skipped, so one
	    bad download does not abort the run; a source of an unrecognised kind
	    contributes no chunks.
	    """
	    # kind -> (parser, cached-file suffix, fetch through PowerShell?)
	    handlers = {
	        "html-cbsa": (parse_cbsa, ".html", False),
	        "html-cbsa-narrative": (parse_cbsa_narrative, ".html", False),
	        "pdf-ircc": (parse_ircc, ".pdf", True),
	    }

	    collected = []
	    for src in SOURCES.values():
	        print(f"Ingesting {src['act_short']} ...")
	        try:
	            handler = handlers.get(src["kind"])
	            if handler is None:
	                chunks = []
	            else:
	                parser, suffix, via_powershell = handler
	                raw = _fetch(src["url"], RAW / f"{src['code']}{suffix}",
	                             powershell=via_powershell)
	                chunks = parser(raw, src)
	        except Exception as exc:
	            # Best effort: report the failure and carry on with the rest.
	            print(f" !! {src['act_short']}: {type(exc).__name__}: {exc}")
	            continue
	        collected.extend(chunks)
	        print(f" {len(chunks)} chunks")

	    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
	    payload = json.dumps(collected, ensure_ascii=False, indent=1)
	    OUT.write_text(payload, encoding="utf-8")
	    print(f"\n{len(collected)} delegation chunks from {len(SOURCES)} "
	          f"instrument(s) -> {OUT}")


	# Script entry point: running the module (py -m canlex.delegation) rebuilds
	# delegation.json; importing it does not.
	if __name__ == "__main__":
	build()