"""Ingest instruments of delegation and designation under IRPA / IRPR. These instruments record which officer positions the Minister has delegated powers to, or designated for functions, under the Immigration and Refugee Protection Act and its Regulations. They are administrative instruments -- not enacted law, and not guidance -- so every chunk is tagged doc_type="delegation". Sources: - the CBSA "Delegation of Authority and Designations of Officers ..." instrument (HTML, cbsa-asfc.gc.ca); - the IRCC "IL3 -- Instrument of Designation and Delegation" (PDF, canada.ca). py -m canlex.delegation """ import io import json import re import subprocess import time import urllib.request from bs4 import BeautifulSoup from pypdf import PdfReader from .config import PROCESSED_DIR, RAW_DIR RAW = RAW_DIR / "delegation" OUT = PROCESSED_DIR / "delegation.json" # cbsa-asfc.gc.ca serves an ordinary client fine with a browser User-Agent; # canada.ca (the IRCC PDF) blocks Python's HTTP client at the TLS layer, so that # one is fetched via PowerShell's (.NET) HTTP stack, as agreement.py does. _UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36") # The CBSA IRPA delegation has been a moving target: the November 28, 2017 # instrument was expressly superseded by a full restatement on May 8, 2023 # (signed by Mendicino), which has itself been amended five times since. CBSA # does not publish a consolidated version, so the current effective state is the # 2023 restatement read together with its later amendments; we ingest each one # as a separate "act" so a user (or the LLM) sees the base item and any # amendments that touch it side-by-side in retrieval. _CBSA_DELEG_URL = ("https://www.cbsa-asfc.gc.ca/agency-agence/actreg-loireg/" "delegation/") _CBSA_DELEG_NAME = ("Delegation of Authority and Designations of Officers by " "the Minister of Public Safety and Emergency Preparedness " "under the Immigration and Refugee Protection Act and the " "Immigration and Refugee Protection Regulations") SOURCES = { # The 2023 restatement -- the current base instrument. "cbsa-2023-05": { "code": "cbsa-2023-05", "kind": "html-cbsa", "act_code": "CBSA-IRPA-DELEG-2023-05", "act_short": "CBSA Deleg 2023-05-08", "act_name": _CBSA_DELEG_NAME, "url": _CBSA_DELEG_URL + "irpa-lipr-2023-05-08-eng.html", "effective": "2023-05-08", }, # Amendments to the 2023 restatement, in chronological order. "cbsa-2023-09": { "code": "cbsa-2023-09", "kind": "html-cbsa", "act_code": "CBSA-IRPA-DELEG-AMEND-2023-09-08", "act_short": "CBSA Deleg Amend 2023-09-08", "act_name": "Amendment to the " + _CBSA_DELEG_NAME, "url": _CBSA_DELEG_URL + "irpa-lipr-2023-09-08-eng.html", "effective": "2023-09-08", }, "cbsa-2023-11": { "code": "cbsa-2023-11", "kind": "html-cbsa", "act_code": "CBSA-IRPA-DELEG-AMEND-2023-11-17", "act_short": "CBSA Deleg Amend 2023-11-17", "act_name": "Amendment to the " + _CBSA_DELEG_NAME, "url": _CBSA_DELEG_URL + "irpa-lipr-2023-11-17-eng.html", "effective": "2023-11-17", }, "cbsa-2024-03-05": { "code": "cbsa-2024-03-05", "kind": "html-cbsa", "act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-05", "act_short": "CBSA Deleg Amend 2024-03-05", "act_name": "Amendment to the " + _CBSA_DELEG_NAME, "url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-05-eng.html", "effective": "2024-03-05", }, "cbsa-2024-03-15": { "code": "cbsa-2024-03-15", "kind": "html-cbsa", "act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-15", "act_short": "CBSA Deleg Amend 2024-03-15", "act_name": "Amendment to the " + _CBSA_DELEG_NAME, "url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-15-eng.html", "effective": "2024-03-15", }, "cbsa-2025-07": { "code": "cbsa-2025-07", "kind": "html-cbsa", "act_code": "CBSA-IRPA-DELEG-AMEND-2025-07-10", "act_short": "CBSA Deleg Amend 2025-07-10", "act_name": "Amendment to the " + _CBSA_DELEG_NAME, "url": _CBSA_DELEG_URL + "irpa-lipr-2025-07-10-eng.html", "effective": "2025-07-10", }, # Separate authority: a peace-officer designation under IRPA s. 138(1). # Narrative prose, not a Schedule table -- needs its own parser. "cbsa-peaceofficer": { "code": "cbsa-peaceofficer", "kind": "html-cbsa-narrative", "act_code": "CBSA-IRPA-PEACEOFF-2022-08", "act_short": "CBSA Peace Officer Auth 2022-08-18", "act_name": ("Authorization to have the Authority and Powers of a " "Peace Officer under the Immigration and Refugee " "Protection Act (subsection 138(1))"), "url": _CBSA_DELEG_URL + "desig/po-ag_2022-08-eng.html", "effective": "2022-08-18", }, "ircc": { "code": "ircc", "kind": "pdf-ircc", "act_code": "IRCC-IL3-DELEG", "act_short": "IRCC IL3", "act_name": ("IL3 — Instrument of Designation and Delegation, " "Immigration and Refugee Protection Act and Regulations"), "url": ("https://www.canada.ca/content/dam/ircc/migration/ircc/english/" "resources/manuals/il/il3-eng.pdf"), }, } def _norm(text): """Collapse all whitespace -- including the non-breaking spaces these sources use heavily -- to single spaces.""" return re.sub(r"\s+", " ", (text or "").replace("\xa0", " ")).strip() def _normalize_refs(text): """Expand the instruments' provision shorthand so the section numbers are searchable as tokens: 'A55(1)' -> 'IRPA 55(1)', 'R39' -> 'IRPR 39'. Both instruments write 'A' for the Act and 'R' for the Regulations.""" text = re.sub(r"\bA(?=\d)", "IRPA ", text) text = re.sub(r"\bR(?=\d)", "IRPR ", text) return _norm(text) def _fetch(url, dest, powershell=False): """Fetch a page or file, caching the raw bytes under data/raw/delegation. canada.ca blocks Python's HTTP client, so its PDF is fetched via PowerShell.""" if dest.exists(): return dest.read_bytes() dest.parent.mkdir(parents=True, exist_ok=True) if powershell: command = (f"Invoke-WebRequest -Uri '{url}' -OutFile '{dest}' " f"-UseBasicParsing -UserAgent '{_UA}'") subprocess.run( ["powershell", "-NoProfile", "-NonInteractive", "-Command", command], check=True, capture_output=True, timeout=180) else: req = urllib.request.Request(url, headers={"User-Agent": _UA}) with urllib.request.urlopen(req, timeout=60) as resp: dest.write_bytes(resp.read()) time.sleep(0.5) # be polite to the server return dest.read_bytes() # --- CBSA instrument (HTML) --------------------------------------------------- def _delegates(cell): """Flatten a 'Delegates / Designated officials' table cell into readable lines. The cell groups officer positions under an organisation header (

: CBSA, RCMP, IRCC) and an optional sub-heading (

: a region or a headquarters branch), each followed by a