Beemer
Refresh the CBSA IRPA delegation: 2023 restatement + 5 amendments + peace-officer auth
ef6e3dc | """Ingest instruments of delegation and designation under IRPA / IRPR. | |
| These instruments record which officer positions the Minister has delegated | |
| powers to, or designated for functions, under the Immigration and Refugee | |
| Protection Act and its Regulations. They are administrative instruments -- not | |
| enacted law, and not guidance -- so every chunk is tagged doc_type="delegation". | |
| Sources: | |
| - the CBSA "Delegation of Authority and Designations of Officers ..." | |
| instrument (HTML, cbsa-asfc.gc.ca); | |
| - the IRCC "IL3 -- Instrument of Designation and Delegation" (PDF, canada.ca). | |
| py -m canlex.delegation | |
| """ | |
| import io | |
| import json | |
| import re | |
| import subprocess | |
| import time | |
| import urllib.request | |
| from bs4 import BeautifulSoup | |
| from pypdf import PdfReader | |
| from .config import PROCESSED_DIR, RAW_DIR | |
# RAW is the directory raw downloads are cached under; OUT is the JSON file the
# parsed chunks are written to.
RAW = RAW_DIR.joinpath("delegation")
OUT = PROCESSED_DIR.joinpath("delegation.json")
| # cbsa-asfc.gc.ca serves an ordinary client fine with a browser User-Agent; | |
| # canada.ca (the IRCC PDF) blocks Python's HTTP client at the TLS layer, so that | |
| # one is fetched via PowerShell's (.NET) HTTP stack, as agreement.py does. | |
| _UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " | |
| "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36") | |
| # The CBSA IRPA delegation has been a moving target: the November 28, 2017 | |
| # instrument was expressly superseded by a full restatement on May 8, 2023 | |
| # (signed by Mendicino), which has itself been amended five times since. CBSA | |
| # does not publish a consolidated version, so the current effective state is the | |
| # 2023 restatement read together with its later amendments; we ingest each one | |
| # as a separate "act" so a user (or the LLM) sees the base item and any | |
| # amendments that touch it side-by-side in retrieval. | |
| _CBSA_DELEG_URL = ("https://www.cbsa-asfc.gc.ca/agency-agence/actreg-loireg/" | |
| "delegation/") | |
| _CBSA_DELEG_NAME = ("Delegation of Authority and Designations of Officers by " | |
| "the Minister of Public Safety and Emergency Preparedness " | |
| "under the Immigration and Refugee Protection Act and the " | |
| "Immigration and Refugee Protection Regulations") | |
| SOURCES = { | |
| # The 2023 restatement -- the current base instrument. | |
| "cbsa-2023-05": { | |
| "code": "cbsa-2023-05", | |
| "kind": "html-cbsa", | |
| "act_code": "CBSA-IRPA-DELEG-2023-05", | |
| "act_short": "CBSA Deleg 2023-05-08", | |
| "act_name": _CBSA_DELEG_NAME, | |
| "url": _CBSA_DELEG_URL + "irpa-lipr-2023-05-08-eng.html", | |
| "effective": "2023-05-08", | |
| }, | |
| # Amendments to the 2023 restatement, in chronological order. | |
| "cbsa-2023-09": { | |
| "code": "cbsa-2023-09", | |
| "kind": "html-cbsa", | |
| "act_code": "CBSA-IRPA-DELEG-AMEND-2023-09-08", | |
| "act_short": "CBSA Deleg Amend 2023-09-08", | |
| "act_name": "Amendment to the " + _CBSA_DELEG_NAME, | |
| "url": _CBSA_DELEG_URL + "irpa-lipr-2023-09-08-eng.html", | |
| "effective": "2023-09-08", | |
| }, | |
| "cbsa-2023-11": { | |
| "code": "cbsa-2023-11", | |
| "kind": "html-cbsa", | |
| "act_code": "CBSA-IRPA-DELEG-AMEND-2023-11-17", | |
| "act_short": "CBSA Deleg Amend 2023-11-17", | |
| "act_name": "Amendment to the " + _CBSA_DELEG_NAME, | |
| "url": _CBSA_DELEG_URL + "irpa-lipr-2023-11-17-eng.html", | |
| "effective": "2023-11-17", | |
| }, | |
| "cbsa-2024-03-05": { | |
| "code": "cbsa-2024-03-05", | |
| "kind": "html-cbsa", | |
| "act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-05", | |
| "act_short": "CBSA Deleg Amend 2024-03-05", | |
| "act_name": "Amendment to the " + _CBSA_DELEG_NAME, | |
| "url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-05-eng.html", | |
| "effective": "2024-03-05", | |
| }, | |
| "cbsa-2024-03-15": { | |
| "code": "cbsa-2024-03-15", | |
| "kind": "html-cbsa", | |
| "act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-15", | |
| "act_short": "CBSA Deleg Amend 2024-03-15", | |
| "act_name": "Amendment to the " + _CBSA_DELEG_NAME, | |
| "url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-15-eng.html", | |
| "effective": "2024-03-15", | |
| }, | |
| "cbsa-2025-07": { | |
| "code": "cbsa-2025-07", | |
| "kind": "html-cbsa", | |
| "act_code": "CBSA-IRPA-DELEG-AMEND-2025-07-10", | |
| "act_short": "CBSA Deleg Amend 2025-07-10", | |
| "act_name": "Amendment to the " + _CBSA_DELEG_NAME, | |
| "url": _CBSA_DELEG_URL + "irpa-lipr-2025-07-10-eng.html", | |
| "effective": "2025-07-10", | |
| }, | |
| # Separate authority: a peace-officer designation under IRPA s. 138(1). | |
| # Narrative prose, not a Schedule table -- needs its own parser. | |
| "cbsa-peaceofficer": { | |
| "code": "cbsa-peaceofficer", | |
| "kind": "html-cbsa-narrative", | |
| "act_code": "CBSA-IRPA-PEACEOFF-2022-08", | |
| "act_short": "CBSA Peace Officer Auth 2022-08-18", | |
| "act_name": ("Authorization to have the Authority and Powers of a " | |
| "Peace Officer under the Immigration and Refugee " | |
| "Protection Act (subsection 138(1))"), | |
| "url": _CBSA_DELEG_URL + "desig/po-ag_2022-08-eng.html", | |
| "effective": "2022-08-18", | |
| }, | |
| "ircc": { | |
| "code": "ircc", | |
| "kind": "pdf-ircc", | |
| "act_code": "IRCC-IL3-DELEG", | |
| "act_short": "IRCC IL3", | |
| "act_name": ("IL3 — Instrument of Designation and Delegation, " | |
| "Immigration and Refugee Protection Act and Regulations"), | |
| "url": ("https://www.canada.ca/content/dam/ircc/migration/ircc/english/" | |
| "resources/manuals/il/il3-eng.pdf"), | |
| }, | |
| } | |
| def _norm(text): | |
| """Collapse all whitespace -- including the non-breaking spaces these | |
| sources use heavily -- to single spaces.""" | |
| return re.sub(r"\s+", " ", (text or "").replace("\xa0", " ")).strip() | |
def _normalize_refs(text):
    """Spell out the instruments' provision shorthand, then tidy whitespace.

    Both instruments write 'A' for the Act and 'R' for the Regulations, so
    'A55(1)' becomes 'IRPA 55(1)' and 'R39' becomes 'IRPR 39'. That leaves the
    section numbers searchable as tokens of their own. Only a word-initial
    'A' or 'R' directly followed by a digit is expanded."""
    expansions = (
        (r"\bA(?=\d)", "IRPA "),
        (r"\bR(?=\d)", "IRPR "),
    )
    for pattern, replacement in expansions:
        text = re.sub(pattern, replacement, text)
    return _norm(text)
| def _fetch(url, dest, powershell=False): | |
| """Fetch a page or file, caching the raw bytes under data/raw/delegation. | |
| canada.ca blocks Python's HTTP client, so its PDF is fetched via PowerShell.""" | |
| if dest.exists(): | |
| return dest.read_bytes() | |
| dest.parent.mkdir(parents=True, exist_ok=True) | |
| if powershell: | |
| command = (f"Invoke-WebRequest -Uri '{url}' -OutFile '{dest}' " | |
| f"-UseBasicParsing -UserAgent '{_UA}'") | |
| subprocess.run( | |
| ["powershell", "-NoProfile", "-NonInteractive", "-Command", command], | |
| check=True, capture_output=True, timeout=180) | |
| else: | |
| req = urllib.request.Request(url, headers={"User-Agent": _UA}) | |
| with urllib.request.urlopen(req, timeout=60) as resp: | |
| dest.write_bytes(resp.read()) | |
| time.sleep(0.5) # be polite to the server | |
| return dest.read_bytes() | |
| # --- CBSA instrument (HTML) --------------------------------------------------- | |
def _delegates(cell):
    """Flatten a 'Delegates / Designated officials' table cell into readable
    lines, returned as one newline-joined string.

    The cell groups officer positions under an organisation header
    (<p class="h4">: CBSA, RCMP, IRCC) and an optional sub-heading
    (<p class="h5">: a region or a headquarters branch), each followed by a
    <ul> of position titles. Every list becomes one line of the form
    "<org> — <sub>: <title>; <title>; ...". An organisation that has no list
    of its own is emitted on a line by itself, and any other paragraph is kept
    as a free-standing note.

    Only the cell's direct <p> and <ul> children are walked. When that walk
    produces nothing (bare text, or content wrapped in another element) the
    cell's flattened text is returned instead, so the delegates are not
    silently dropped."""
    lines = []
    org = sub = ""
    org_used = False
    for el in cell.find_all(["p", "ul"], recursive=False):
        if el.name == "ul":  # a list of position titles
            titles = (_norm(li.get_text())
                      for li in el.find_all("li", recursive=False))
            positions = "; ".join(title for title in titles if title)
            if positions:
                # Join only the parts that exist, so a sub-heading with no
                # organisation does not get a dangling leading dash.
                label = " — ".join(part for part in (org, sub) if part)
                lines.append(f"{label}: {positions}" if label else positions)
                org_used = True
            continue
        text = _norm(el.get_text())
        if not text:
            continue
        classes = " ".join(el.get("class") or [])
        if "h4" in classes:
            if org and not org_used:  # a previous org with no list of its own
                lines.append(org)
            org, sub, org_used = text.rstrip(": "), "", False
        elif "h5" in classes:
            sub = text
        else:
            lines.append(text)  # a free-standing note
    if org and not org_used:  # a trailing org with no list
        lines.append(org)
    if not lines:
        return _norm(cell.get_text())
    return "\n".join(lines)
def _cbsa_chunk(src, date, suffix, marginal_note, part, heading, text, citation):
    """Build one chunk record for a CBSA instrument, with the keys in the
    fixed order the output file uses."""
    return {
        "id": f"delegation-{src['code']}-{suffix}",
        "doc_type": "delegation",
        "act_code": src["act_code"],
        "act_short": src["act_short"],
        "act_name": src["act_name"],
        "section": "",
        "marginal_note": marginal_note,
        "part": part,
        "division": "",
        "heading": heading,
        "text": text,
        "history": "",
        "last_amended": "",
        "current_to": date,
        "citation": citation,
        "source_url": src["url"],
    }


def _cbsa_power_text(cell):
    """Return a power-description cell's text with one space between its
    paragraphs, falling back to the whole cell when it has no usable <p>."""
    # Empty paragraphs are dropped: otherwise they would leave doubled spaces,
    # or a whitespace-only result that defeats the fallback.
    paragraphs = [text for text in (_norm(p.get_text())
                                    for p in cell.find_all("p")) if text]
    return " ".join(paragraphs) or _norm(cell.get_text())


def parse_cbsa(html, src):
    """Parse the CBSA delegation instrument into one chunk per Schedule item,
    plus one chunk for the preamble.

    *html* is the page markup and *src* the instrument's SOURCES entry, which
    supplies the code, act metadata, URL and effective date stamped on every
    chunk. Returns a list of chunk dicts; the list is empty when the page has
    no <main> element."""
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    for sup in main.find_all("sup"):  # drop footnote-reference superscripts
        sup.decompose()
    # The effective date comes from SOURCES, not the first <time> in <main>:
    # amendment pages quote the base instrument's date ("signed on May 8, 2023")
    # in their preamble, so the first <time> on an amendment page is the base
    # instrument's date, not the amendment's own.
    date = src["effective"]
    chunks = []

    # Preamble: the paragraphs between the title and the Schedule, which set out
    # the tiers of delegation and designation and how the columns are read.
    schedule = main.find("h2", id="sch")
    if schedule is not None:
        # find_previous_siblings walks backwards; reverse into document order.
        paras = [_norm(p.get_text())
                 for p in reversed(schedule.find_previous_siblings("p"))]
        body = "\n".join(p for p in paras if p)
        if body:
            chunks.append(_cbsa_chunk(
                src, date, "preamble",
                marginal_note="Tiers of delegation and designation",
                part="",
                heading="Instrument of delegation and designation under IRPA",
                text=body,
                citation=f"{src['act_short']} — Preamble"))

    # One chunk per Schedule item. Two row shapes are accepted:
    #   (a) four <td> cells: Item | Refs | Power | Delegates (the 2023-05
    #       restatement and the 2023-09, 2023-11, 2024-03 amendments).
    #   (b) one <th> + two or three <td> cells: the <th> carries the item
    #       number and the <td>s carry Refs | Power [| Delegates]. The
    #       2025-07-10 amendment uses this layout, and may omit the Delegates
    #       column when an amendment changes only references or descriptions.
    # Each topical <h3>, if present, names the schedule section the table
    # belongs to.
    for table in main.find_all("table", class_="table-bordered"):
        h3 = table.find_previous_sibling("h3")
        section_name = _norm(h3.get_text()) if h3 else ""
        for tr in table.find_all("tr"):
            th_cells = tr.find_all("th", recursive=False)
            td_cells = tr.find_all("td", recursive=False)
            if not th_cells and len(td_cells) == 4:
                item_cell, refs_cell, power_cell, deleg_cell = td_cells
            elif len(th_cells) == 1 and len(td_cells) in (2, 3):
                item_cell, refs_cell, power_cell = (
                    th_cells[0], td_cells[0], td_cells[1])
                deleg_cell = td_cells[2] if len(td_cells) == 3 else None
            else:
                continue  # header row or a stray row

            item_no = _norm(item_cell.get_text()).rstrip(".")
            refs = _normalize_refs(_norm(refs_cell.get_text()))
            power = _cbsa_power_text(power_cell)
            delegates = _delegates(deleg_cell) if deleg_cell is not None else ""
            if not item_no or not (power or refs):
                continue

            # Join only the sections that exist, so a row without a power
            # description does not produce text that opens with blank lines.
            sections = [power]
            if refs:
                sections.append(f"Provisions (IRPA / IRPR): {refs}.")
            if delegates:
                sections.append("Delegated / designated to:\n" + delegates)
            text = "\n\n".join(section for section in sections if section)

            chunks.append(_cbsa_chunk(
                src, date, item_no,
                marginal_note=refs or f"Item {item_no}",
                part=section_name,
                heading=section_name,
                text=text,
                citation=f"{src['act_short']}, Item {item_no}"))
    return chunks
| # --- CBSA narrative-prose instrument (e.g. the peace-officer designation) ----- | |
def parse_cbsa_narrative(html, src):
    """Parse a narrative-prose CBSA designation instrument into a single chunk.

    Used for the peace-officer authorization under IRPA s. 138(1) -- plain prose
    listing the designated officer positions, not a four-column Schedule table,
    so parse_cbsa's table walker would yield nothing. The whole operative text
    is short enough to sit in one chunk.

    *html* is the page markup and *src* the instrument's SOURCES entry. Returns
    a one-element list, or an empty list when the page has no <main> element
    or no text content."""
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    for sup in main.find_all("sup"):  # drop footnote-reference superscripts
        sup.decompose()
    date = src["effective"]

    # Collect the paragraphs and lists in <main>, in document order. Each list
    # is rendered as its items joined with "; ".
    parts = []
    for el in main.find_all(["p", "ul", "ol"]):
        if el.find_parent(["ul", "ol"]):
            continue  # nested content is picked up by its enclosing list
        if el.name in ("ul", "ol"):
            # Direct <li> children only: a nested list's text is already part
            # of its parent <li>, so a recursive search would emit it twice.
            items = [_norm(li.get_text())
                     for li in el.find_all("li", recursive=False)]
            text = "; ".join(item for item in items if item)
        else:
            text = _norm(el.get_text())
        if text:
            parts.append(text)

    body = "\n".join(parts)
    if not body:
        return []
    return [{
        "id": f"delegation-{src['code']}",
        "doc_type": "delegation",
        "act_code": src["act_code"],
        "act_short": src["act_short"],
        "act_name": src["act_name"],
        "section": "",
        "marginal_note": "Peace-officer authorization — IRPA s. 138(1)",
        "part": "",
        "division": "",
        "heading": src["act_name"],
        "text": body,
        "history": "",
        "last_amended": "",
        "current_to": date,
        "citation": src["act_short"],
        "source_url": src["url"],
    }]
| # --- IRCC IL3 instrument (PDF) ------------------------------------------------ | |
| # A topical part heading -- a line in full upper case (MINISTERIAL INSTRUCTIONS, | |
| # MEDICAL, MISCELLANEOUS). Organisation acronyms (CI, CBSA, RCMP) are shorter | |
| # than the 6-character floor and so are not mistaken for headings. | |
| _IL3_PART = re.compile(r"[A-Z][A-Z &/,()'.\-]{5,}") | |
| # An item opens "<n>. <A/R provision>" -- the number must be followed by a | |
| # provision reference, which rejects ordinary numbered prose. | |
| _IL3_ITEM = re.compile(r"(\d+)\.\s+(?=[AR]\d)") | |
| # The word that introduces an item's power description, after the provisions. | |
| _IL3_POWER = re.compile(r"(?:Delegation|Designation)\s*[-–—]\s*") | |
def _pdf_pages(pdf_bytes):
    """Return the extracted text of every page in the PDF, one string per page.

    pypdf emits the printed page number as a page's first line (roman in the
    front matter, arabic later). A first line made up solely of digits and/or
    roman-numeral letters, in either case, is treated as that number and
    dropped."""
    page_number = re.compile(r"\s*[ivxlcdm\d]+\s*", re.I)
    texts = []
    for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
        lines = (page.extract_text() or "").split("\n")
        # split() always yields at least one element, so lines[0] is safe.
        start = 1 if page_number.fullmatch(lines[0]) else 0
        texts.append("\n".join(lines[start:]))
    return texts
def _il3_title(heading):
    """Title-case an upper-case part heading.

    ``str.title()`` treats an apostrophe as a word boundary and turns
    "MINISTER'S" into "Minister'S". The lone letter that follows an apostrophe
    inside a word is put back in lower case, while a name such as "O'Brien"
    is left alone."""
    return re.sub(r"(?<=[a-z])'([A-Z])\b",
                  lambda match: "'" + match.group(1).lower(),
                  heading.title())


def parse_ircc(pdf_bytes, src):
    """Parse the IRCC IL3 instrument into one chunk per Schedule item, plus a
    preamble chunk.

    The PDF flattens the four-column table into a linear text stream, so each
    item runs from its numbered marker to the next; the power description and
    the delegated positions are kept together within the item.

    *pdf_bytes* is the raw PDF and *src* the instrument's SOURCES entry.
    Returns a list of chunk dicts, empty when the PDF has no pages. The
    edition label found on the first page (a season and a year) is stamped on
    every chunk as ``current_to``; the preamble is the text of the second
    page."""
    pages = _pdf_pages(pdf_bytes)
    if not pages:
        return []
    version_match = re.search(r"(?:Spring|Summer|Fall|Winter)\s+\d{4}", pages[0])
    version = version_match.group(0) if version_match else ""

    def chunk(suffix, marginal_note, part, text, heading, citation):
        # One record in the shared chunk schema, with the keys in the fixed
        # order the output file uses.
        return {
            "id": f"delegation-{src['code']}-{suffix}",
            "doc_type": "delegation",
            "act_code": src["act_code"],
            "act_short": src["act_short"],
            "act_name": src["act_name"],
            "section": "",
            "marginal_note": marginal_note,
            "part": part,
            "division": "",
            "heading": heading,
            "text": text,
            "history": "",
            "last_amended": "",
            "current_to": version,
            "citation": citation,
            "source_url": src["url"],
        }

    chunks = []
    preamble = _norm(pages[1]) if len(pages) > 1 else ""
    if preamble:
        chunks.append(chunk(
            "preamble",
            marginal_note="Preamble — designation and delegation under IRPA s. 6",
            part="",
            text=preamble,
            heading="Instrument of designation and delegation under IRPA",
            citation=f"{src['act_short']} — Preamble"))

    # Walk every line: an all-caps line is a topical part heading; a line that
    # opens "<n>. <A/R provision>" starts a new item. Lines before the first
    # item (the preamble, definitions and table of contents) are skipped.
    items, part = [], ""
    cur_no = cur_part = None
    cur_lines = []
    for line in "\n".join(pages).split("\n"):
        s = _norm(line)
        if not s:
            continue
        if _IL3_PART.fullmatch(s):
            part = _il3_title(s)
            continue
        m = _IL3_ITEM.match(s)
        if m:
            if cur_no is not None:  # close the item that was open
                items.append((cur_part, cur_no, cur_lines))
            cur_no, cur_part, cur_lines = m.group(1), part, [s]
        elif cur_no is not None:
            cur_lines.append(s)
    if cur_no is not None:  # close the final item
        items.append((cur_part, cur_no, cur_lines))

    # Chunk ids use the running position n rather than the printed item number.
    for n, (item_part, item_no, lines) in enumerate(items, start=1):
        body = re.sub(r"^\d+\.\s*", "", "\n".join(lines)).strip()
        if not body:
            continue
        # Everything before the "Delegation -" / "Designation -" marker is the
        # item's list of provisions.
        power = _IL3_POWER.search(body)
        refs = _normalize_refs(body[:power.start()]) if power else ""
        chunks.append(chunk(
            n,
            marginal_note=refs or f"Item {item_no}",
            part=item_part or "",
            text=_normalize_refs(body),
            heading=item_part or "",
            citation=(f"{src['act_short']} — {item_part}, Item {item_no}"
                      if item_part else f"{src['act_short']}, Item {item_no}")))
    return chunks
def build():
    """Fetch, parse and chunk every delegation instrument into delegation.json.

    Each entry of SOURCES is handled independently: a source that fails to
    download or parse is reported and skipped, so the remaining instruments
    are still ingested. The output file is written even when some sources
    failed; the closing summary counts only the instruments that were
    ingested and names the ones that were not."""
    all_chunks = []
    failed = []
    for src in SOURCES.values():
        print(f"Ingesting {src['act_short']} ...")
        try:
            if src["kind"] == "html-cbsa":
                html = _fetch(src["url"], RAW / f"{src['code']}.html")
                chunks = parse_cbsa(html, src)
            elif src["kind"] == "html-cbsa-narrative":
                html = _fetch(src["url"], RAW / f"{src['code']}.html")
                chunks = parse_cbsa_narrative(html, src)
            elif src["kind"] == "pdf-ircc":
                pdf = _fetch(src["url"], RAW / f"{src['code']}.pdf",
                             powershell=True)
                chunks = parse_ircc(pdf, src)
            else:
                # Report a mistyped kind instead of quietly ingesting nothing.
                raise ValueError(f"unknown source kind {src['kind']!r}")
        except Exception as exc:
            # Top-level boundary: one bad source must not abort the whole run.
            print(f" !! {src['act_short']}: {type(exc).__name__}: {exc}")
            failed.append(src["act_short"])
            continue
        all_chunks.extend(chunks)
        print(f" {len(chunks)} chunks")
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
                   encoding="utf-8")
    print(f"\n{len(all_chunks)} delegation chunks from "
          f"{len(SOURCES) - len(failed)} instrument(s) -> {OUT}")
    if failed:
        print(f" !! {len(failed)} instrument(s) not ingested: "
              f"{', '.join(failed)}")


if __name__ == "__main__":
    build()