Spaces:

Beemer0
/

CanLex

Running

File size: 21,089 Bytes

"""Ingest instruments of delegation and designation under IRPA / IRPR.

These instruments record which officer positions the Minister has delegated
powers to, or designated for functions, under the Immigration and Refugee
Protection Act and its Regulations. They are administrative instruments -- not
enacted law, and not guidance -- so every chunk is tagged doc_type="delegation".

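Sources:
  - the CBSA "Delegation of Authority and Designations of Officers ..."
    instrument (HTML, cbsa-asfc.gc.ca): the May 8, 2023 restatement and each
    of its later amendments, ingested as separate instruments;
  - the CBSA peace-officer authorization under IRPA s. 138(1) (HTML,
    cbsa-asfc.gc.ca);
  - the IRCC "IL3 -- Instrument of Designation and Delegation" (PDF, canada.ca).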

    py -m canlex.delegation
"""
import io
import json
import re
import subprocess
import time
import urllib.request

from bs4 import BeautifulSoup
from pypdf import PdfReader

from .config import PROCESSED_DIR, RAW_DIR

# Cache directory for the downloaded instruments; build() stores one raw file
# per source here, named after the source's code.
RAW = RAW_DIR / "delegation"
# JSON file that build() writes the combined list of chunks to.
OUT = PROCESSED_DIR / "delegation.json"

# cbsa-asfc.gc.ca serves an ordinary client fine with a browser User-Agent;
# canada.ca (the IRCC PDF) blocks Python's HTTP client at the TLS layer, so that
# one is fetched via PowerShell's (.NET) HTTP stack, as agreement.py does.
# The same User-Agent string is sent on both fetch paths.
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
       "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")

# The CBSA IRPA delegation has been a moving target: the November 28, 2017
# instrument was expressly superseded by a full restatement on May 8, 2023
# (signed by Mendicino), which has itself been amended five times since. CBSA
# does not publish a consolidated version, so the current effective state is the
# 2023 restatement read together with its later amendments; we ingest each one
# as a separate "act" so a user (or the LLM) sees the base item and any
# amendments that touch it side-by-side in retrieval.

# Common URL prefix of every CBSA instrument page listed in SOURCES.
_CBSA_DELEG_URL = ("https://www.cbsa-asfc.gc.ca/agency-agence/actreg-loireg/"
                   "delegation/")
# Full title of the base instrument; amendment entries prefix it with
# "Amendment to the ".
_CBSA_DELEG_NAME = ("Delegation of Authority and Designations of Officers by "
                    "the Minister of Public Safety and Emergency Preparedness "
                    "under the Immigration and Refugee Protection Act and the "
                    "Immigration and Refugee Protection Regulations")

# Registry of the instruments to ingest, one entry per instrument. Fields:
#   code       -- same as the entry's key; used in chunk ids and in the name of
#                 the cached raw file
#   kind       -- selects the fetch path and the parser in build()
#   act_code / act_short / act_name -- identifiers copied onto every chunk
#   url        -- where the instrument is published
#   effective  -- ISO date stamped on each chunk as "current_to" (CBSA entries
#                 only; the IRCC parser reads its version from the PDF instead)
SOURCES = {
    # The 2023 restatement -- the current base instrument.
    "cbsa-2023-05": {
        "code": "cbsa-2023-05",
        "kind": "html-cbsa",
        "act_code": "CBSA-IRPA-DELEG-2023-05",
        "act_short": "CBSA Deleg 2023-05-08",
        "act_name": _CBSA_DELEG_NAME,
        "url": _CBSA_DELEG_URL + "irpa-lipr-2023-05-08-eng.html",
        "effective": "2023-05-08",
    },
    # Amendments to the 2023 restatement, in chronological order.
    "cbsa-2023-09": {
        "code": "cbsa-2023-09",
        "kind": "html-cbsa",
        "act_code": "CBSA-IRPA-DELEG-AMEND-2023-09-08",
        "act_short": "CBSA Deleg Amend 2023-09-08",
        "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
        "url": _CBSA_DELEG_URL + "irpa-lipr-2023-09-08-eng.html",
        "effective": "2023-09-08",
    },
    "cbsa-2023-11": {
        "code": "cbsa-2023-11",
        "kind": "html-cbsa",
        "act_code": "CBSA-IRPA-DELEG-AMEND-2023-11-17",
        "act_short": "CBSA Deleg Amend 2023-11-17",
        "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
        "url": _CBSA_DELEG_URL + "irpa-lipr-2023-11-17-eng.html",
        "effective": "2023-11-17",
    },
    "cbsa-2024-03-05": {
        "code": "cbsa-2024-03-05",
        "kind": "html-cbsa",
        "act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-05",
        "act_short": "CBSA Deleg Amend 2024-03-05",
        "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
        "url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-05-eng.html",
        "effective": "2024-03-05",
    },
    "cbsa-2024-03-15": {
        "code": "cbsa-2024-03-15",
        "kind": "html-cbsa",
        "act_code": "CBSA-IRPA-DELEG-AMEND-2024-03-15",
        "act_short": "CBSA Deleg Amend 2024-03-15",
        "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
        "url": _CBSA_DELEG_URL + "irpa-lipr-2024-03-15-eng.html",
        "effective": "2024-03-15",
    },
    "cbsa-2025-07": {
        "code": "cbsa-2025-07",
        "kind": "html-cbsa",
        "act_code": "CBSA-IRPA-DELEG-AMEND-2025-07-10",
        "act_short": "CBSA Deleg Amend 2025-07-10",
        "act_name": "Amendment to the " + _CBSA_DELEG_NAME,
        "url": _CBSA_DELEG_URL + "irpa-lipr-2025-07-10-eng.html",
        "effective": "2025-07-10",
    },
    # Separate authority: a peace-officer designation under IRPA s. 138(1).
    # Narrative prose, not a Schedule table -- needs its own parser.
    "cbsa-peaceofficer": {
        "code": "cbsa-peaceofficer",
        "kind": "html-cbsa-narrative",
        "act_code": "CBSA-IRPA-PEACEOFF-2022-08",
        "act_short": "CBSA Peace Officer Auth 2022-08-18",
        "act_name": ("Authorization to have the Authority and Powers of a "
                     "Peace Officer under the Immigration and Refugee "
                     "Protection Act (subsection 138(1))"),
        "url": _CBSA_DELEG_URL + "desig/po-ag_2022-08-eng.html",
        "effective": "2022-08-18",
    },
    "ircc": {
        "code": "ircc",
        "kind": "pdf-ircc",
        "act_code": "IRCC-IL3-DELEG",
        "act_short": "IRCC IL3",
        "act_name": ("IL3 — Instrument of Designation and Delegation, "
                     "Immigration and Refugee Protection Act and Regulations"),
        "url": ("https://www.canada.ca/content/dam/ircc/migration/ircc/english/"
                "resources/manuals/il/il3-eng.pdf"),
    },
}


def _norm(text):
    """Return *text* with every run of whitespace squeezed to one space and
    both ends trimmed. Non-breaking spaces (U+00A0), which these sources use
    heavily, count as whitespace; a falsy *text* (None, "") yields ""."""
    cleaned = text if text else ""
    cleaned = cleaned.replace("\xa0", " ")
    squeezed = re.sub(r"\s+", " ", cleaned)
    return squeezed.strip()


def _normalize_refs(text):
    """Spell out the provision shorthand used by both instruments -- 'A' for the
    Act, 'R' for the Regulations -- so that section numbers stand alone as
    searchable tokens: 'A55(1)' becomes 'IRPA 55(1)', 'R39' becomes 'IRPR 39'.
    Only a word-initial A or R directly followed by a digit is expanded. The
    result is whitespace-normalised."""
    shorthand = (
        (r"\bA(?=\d)", "IRPA "),   # the Act
        (r"\bR(?=\d)", "IRPR "),   # the Regulations
    )
    for pattern, expansion in shorthand:
        text = re.sub(pattern, expansion, text)
    return _norm(text)


def _ps_quote(value):
    """Render *value* as a single-quoted PowerShell string literal. Nothing is
    expanded inside single quotes; a literal quote is written by doubling it."""
    return "'" + str(value).replace("'", "''") + "'"


def _fetch(url, dest, powershell=False):
    """Return the bytes at *url*, downloading them at most once.

    The raw bytes are cached at *dest*: when that file already exists it is
    read back and no request is made. Otherwise the payload is downloaded to a
    sibling ``<name>.part`` file and moved onto *dest* only after the transfer
    has completed, so an interrupted or timed-out download is never mistaken
    for a cached copy on a later run.

    url        -- address to fetch.
    dest       -- pathlib.Path of the cache file; missing parent directories
                  are created.
    powershell -- fetch through PowerShell's Invoke-WebRequest instead of
                  urllib (canada.ca blocks Python's HTTP client, so its PDF
                  takes this path).

    Errors from urllib or subprocess (e.g. URLError, CalledProcessError,
    TimeoutExpired) propagate to the caller.
    """
    if dest.exists():
        return dest.read_bytes()
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    try:
        if powershell:
            # Every interpolated value is quoted, so an apostrophe in the URL
            # or in the cache path cannot terminate the string early.
            command = (f"Invoke-WebRequest -Uri {_ps_quote(url)} "
                       f"-OutFile {_ps_quote(partial)} "
                       f"-UseBasicParsing -UserAgent {_ps_quote(_UA)}")
            subprocess.run(
                ["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
                check=True, capture_output=True, timeout=180)
        else:
            req = urllib.request.Request(url, headers={"User-Agent": _UA})
            with urllib.request.urlopen(req, timeout=60) as resp:
                partial.write_bytes(resp.read())
        partial.replace(dest)              # publish only a complete download
    finally:
        if partial.exists():               # failed transfer: drop the fragment
            partial.unlink()
    time.sleep(0.5)   # be polite to the server
    return dest.read_bytes()


# --- CBSA instrument (HTML) ---------------------------------------------------

def _delegates(cell):
    """Render a 'Delegates / Designated officials' table cell as text, one line
    per group of positions.

    The cell's direct children are read in order: a <p class="h4"> names an
    organisation (CBSA, RCMP, IRCC), a <p class="h5"> names an optional
    sub-heading (a region or a headquarters branch), any other <p> is a
    free-standing note, and each <ul> lists position titles. A list becomes
    "<organisation> — <sub-heading>: title; title; ...". An organisation that
    never gets a list of its own is still emitted, on a line by itself."""
    out = []
    organisation = ""
    subheading = ""
    organisation_listed = False

    for node in cell.find_all(["p", "ul"], recursive=False):
        if node.name != "p":
            # A <ul>: gather the non-empty titles of its direct <li> children.
            titles = []
            for entry in node.find_all("li", recursive=False):
                title = _norm(entry.get_text())
                if title:
                    titles.append(title)
            if not titles:
                continue
            joined = "; ".join(titles)
            if subheading:
                prefix = f"{organisation} — {subheading}"
            else:
                prefix = organisation
            out.append(f"{prefix}: {joined}" if prefix else joined)
            organisation_listed = True
            continue

        content = _norm(node.get_text())
        if not content:
            continue
        css = " ".join(node.get("class") or [])
        if "h4" in css:
            # A new organisation begins; flush the previous one if it was
            # never attached to a list.
            if organisation and not organisation_listed:
                out.append(organisation)
            organisation = content.rstrip(": ")
            subheading = ""
            organisation_listed = False
        elif "h5" in css:
            subheading = content
        else:
            out.append(content)            # a free-standing note

    if organisation and not organisation_listed:
        out.append(organisation)           # trailing organisation with no list
    return "\n".join(out)


def parse_cbsa(html, src):
    """Parse a CBSA delegation instrument page into chunks.

    Produces one chunk for the preamble (when the page has an <h2 id="sch">
    Schedule heading preceded by paragraphs) and one chunk per Schedule item.

    html -- the page markup, as accepted by BeautifulSoup.
    src  -- the instrument's SOURCES entry; its "code", "act_code",
            "act_short", "act_name", "url" and "effective" fields are used.

    Returns a list of chunk dicts, or [] when the page has no <main> element.
    Raises KeyError if *src* lacks one of the fields above.
    """
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    for sup in main.find_all("sup"):       # drop footnote-reference superscripts
        sup.decompose()

    # The effective date comes from SOURCES, not the first <time> in <main>:
    # amendment pages quote the base instrument's date ("signed on May 8, 2023")
    # in their preamble, so the first <time> on an amendment page is the base
    # instrument's date, not the amendment's own.
    date = src["effective"]

    chunks = []

    # Preamble: the paragraphs between the title and the Schedule, which set out
    # the tiers of delegation and designation and how the columns are read.
    # find_previous_siblings() walks backwards from the heading, so the result
    # is reversed to restore document order.
    schedule = main.find("h2", id="sch")
    if schedule:
        paras = [_norm(p.get_text())
                 for p in reversed(schedule.find_previous_siblings("p"))]
        body = "\n".join(p for p in paras if p)
        if body:
            chunks.append({
                "id": f"delegation-{src['code']}-preamble",
                "doc_type": "delegation",
                "act_code": src["act_code"],
                "act_short": src["act_short"],
                "act_name": src["act_name"],
                "section": "",
                "marginal_note": "Tiers of delegation and designation",
                "part": "",
                "division": "",
                "heading": "Instrument of delegation and designation under IRPA",
                "text": body,
                "history": "",
                "last_amended": "",
                "current_to": date,
                "citation": f"{src['act_short']} — Preamble",
                "source_url": src["url"],
            })

    # One chunk per Schedule item. Two row shapes are accepted:
    #   (a) four <td> cells: Item | Refs | Power | Delegates (the 2023-05
    #       restatement and the 2023-09, 2023-11, 2024-03 amendments).
    #   (b) one <th> + two or three <td> cells: the <th> carries the item
    #       number and the <td>s carry Refs | Power [| Delegates]. The
    #       2025-07-10 amendment uses this layout, and may omit the Delegates
    #       column when an amendment changes only references or descriptions.
    # Each topical <h3>, if present, names the schedule section the table belongs to.
    for table in main.find_all("table", class_="table-bordered"):
        h3 = table.find_previous_sibling("h3")
        section_name = _norm(h3.get_text()) if h3 else ""
        for tr in table.find_all("tr"):
            th_cells = tr.find_all("th", recursive=False)
            td_cells = tr.find_all("td", recursive=False)
            if not th_cells and len(td_cells) == 4:
                item_cell, refs_cell, power_cell, deleg_cell = td_cells
            elif len(th_cells) == 1 and len(td_cells) in (2, 3):
                item_cell, refs_cell, power_cell = th_cells[0], td_cells[0], td_cells[1]
                deleg_cell = td_cells[2] if len(td_cells) == 3 else None
            else:
                continue                   # header row or a stray row
            item_no = _norm(item_cell.get_text()).rstrip(".")
            refs = _normalize_refs(_norm(refs_cell.get_text()))
            # Join the cell's paragraphs with spaces; fall back to the cell's
            # whole text when it contains no <p> (or only empty ones).
            power = " ".join(_norm(p.get_text())
                             for p in power_cell.find_all("p")) \
                or _norm(power_cell.get_text())
            delegates = _delegates(deleg_cell) if deleg_cell is not None else ""
            if not item_no or not (power or refs):
                continue
            # Assemble the text from whichever sections are present, so a row
            # with provisions but no power description does not begin with
            # blank lines.
            sections = [power]
            if refs:
                sections.append(f"Provisions (IRPA / IRPR): {refs}.")
            if delegates:
                sections.append("Delegated / designated to:\n" + delegates)
            text = "\n\n".join(part for part in sections if part)
            chunks.append({
                "id": f"delegation-{src['code']}-{item_no}",
                "doc_type": "delegation",
                "act_code": src["act_code"],
                "act_short": src["act_short"],
                "act_name": src["act_name"],
                "section": "",
                "marginal_note": refs or f"Item {item_no}",
                "part": section_name,
                "division": "",
                "heading": section_name,
                "text": text,
                "history": "",
                "last_amended": "",
                "current_to": date,
                "citation": f"{src['act_short']}, Item {item_no}",
                "source_url": src["url"],
            })
    return chunks


# --- CBSA narrative-prose instrument (e.g. the peace-officer designation) -----

def parse_cbsa_narrative(html, src):
    """Parse a narrative-prose CBSA designation instrument into a single chunk.

    Used for the peace-officer authorization under IRPA s. 138(1) -- plain prose
    listing 18 designated officer positions, not a four-column Schedule table,
    so parse_cbsa's table walker would yield nothing. The whole operative text
    is a few hundred words, well within a single chunk.

    html -- the page markup, as accepted by BeautifulSoup.
    src  -- the instrument's SOURCES entry; its "code", "act_code",
            "act_short", "act_name", "url" and "effective" fields are used.

    Returns a one-element list, or [] when the page has no <main> element or
    no text in it. Raises KeyError if *src* lacks one of the fields above.
    """
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    for sup in main.find_all("sup"):       # drop footnote-reference superscripts
        sup.decompose()

    date = src["effective"]

    # Collect every paragraph and list inside <main>, in document order.
    # Lists are rendered as "; "-joined items.
    parts = []
    for el in main.find_all(["p", "ul", "ol"]):
        if el.find_parent(["ul", "ol"]):
            continue   # content inside a list is picked up by the outermost list
        if el.name in ("ul", "ol"):
            # Direct <li> children only: a nested list's text is already part
            # of its parent item's text, so a recursive search would emit the
            # nested items a second time.
            items = [_norm(li.get_text())
                     for li in el.find_all("li", recursive=False)]
            joined = "; ".join(t for t in items if t)
            if joined:
                parts.append(joined)
        else:
            text = _norm(el.get_text())
            if text:
                parts.append(text)
    body = "\n".join(parts)
    if not body:
        return []

    return [{
        "id": f"delegation-{src['code']}",
        "doc_type": "delegation",
        "act_code": src["act_code"],
        "act_short": src["act_short"],
        "act_name": src["act_name"],
        "section": "",
        "marginal_note": "Peace-officer authorization — IRPA s. 138(1)",
        "part": "",
        "division": "",
        "heading": src["act_name"],
        "text": body,
        "history": "",
        "last_amended": "",
        "current_to": date,
        "citation": src["act_short"],
        "source_url": src["url"],
    }]


# --- IRCC IL3 instrument (PDF) ------------------------------------------------

# A topical part heading -- a line in full upper case (MINISTERIAL INSTRUCTIONS,
# MEDICAL, MISCELLANEOUS). Organisation acronyms (CI, CBSA, RCMP) are shorter
# than the 6-character floor and so are not mistaken for headings. Digits are
# not allowed, so a line containing a provision number never matches.
_IL3_PART = re.compile(r"[A-Z][A-Z &/,()'.\-]{5,}")
# An item opens "<n>. <A/R provision>" -- the number must be followed by a
# provision reference, which rejects ordinary numbered prose. Group 1 captures
# the item number.
_IL3_ITEM = re.compile(r"(\d+)\.\s+(?=[AR]\d)")
# The word that introduces an item's power description, after the provisions:
# "Delegation" or "Designation" followed by a hyphen, en dash or em dash.
_IL3_POWER = re.compile(r"(?:Delegation|Designation)\s*[-–—]\s*")


def _pdf_pages(pdf_bytes):
    """Return the text of every page of the PDF, one string per page.

    pypdf emits the printed page number as a page's first line (roman in the
    front matter, arabic later); when the first line consists solely of roman
    numeral letters and/or digits it is removed."""
    page_number = re.compile(r"\s*[ivxlcdm\d]+\s*", re.I)
    texts = []
    for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
        rows = (page.extract_text() or "").split("\n")
        first_is_number = bool(rows) and page_number.fullmatch(rows[0] or "")
        if first_is_number:
            rows = rows[1:]
        texts.append("\n".join(rows))
    return texts


# A run of letters, optionally followed by an apostrophe and ONE final letter
# (a possessive or contraction: MINISTER'S, DON'T). A longer tail after the
# apostrophe (O'BRIEN) is left to match as a word of its own.
_IL3_WORD = re.compile(r"[A-Za-z]+(?:'[A-Za-z]\b)?")


def _il3_title(heading):
    """Title-case an upper-case part heading. Unlike str.title(), the letter
    after a possessive apostrophe is not capitalised: "MINISTER'S OPINION"
    gives "Minister's Opinion", not "Minister'S Opinion"."""
    return _IL3_WORD.sub(lambda word: word.group(0).capitalize(), heading)


def parse_ircc(pdf_bytes, src):
    """Parse the IRCC IL3 instrument into one chunk per Schedule item, plus a
    preamble chunk.

    The PDF flattens the four-column table into a linear text stream, so each
    item runs from its numbered marker to the next; the power description and
    the delegated positions are kept together within the item.

    pdf_bytes -- the raw PDF.
    src       -- the instrument's SOURCES entry; its "code", "act_code",
                 "act_short", "act_name" and "url" fields are used.

    Returns a list of chunk dicts, or [] when the PDF has no pages. Each
    chunk's "current_to" is the season-and-year stamp found on the first page
    ("Fall 2023"), or "" when none is found.
    """
    pages = _pdf_pages(pdf_bytes)
    if not pages:
        return []
    version_match = re.search(r"(?:Spring|Summer|Fall|Winter)\s+\d{4}", pages[0])
    version = version_match.group(0) if version_match else ""

    chunks = []
    # The second page is taken as the preamble.
    preamble = _norm(pages[1]) if len(pages) > 1 else ""
    if preamble:
        chunks.append({
            "id": f"delegation-{src['code']}-preamble",
            "doc_type": "delegation",
            "act_code": src["act_code"],
            "act_short": src["act_short"],
            "act_name": src["act_name"],
            "section": "",
            "marginal_note": "Preamble — designation and delegation under IRPA s. 6",
            "part": "",
            "division": "",
            "heading": "Instrument of designation and delegation under IRPA",
            "text": preamble,
            "history": "",
            "last_amended": "",
            "current_to": version,
            "citation": f"{src['act_short']} — Preamble",
            "source_url": src["url"],
        })

    # Walk every line: an all-caps line is a topical part heading; a line that
    # opens "<n>. <A/R provision>" starts a new item. Lines before the first
    # item (the preamble, definitions and table of contents) are skipped.
    items, part = [], ""
    cur_no = cur_part = None
    cur_lines = []
    for line in "\n".join(pages).split("\n"):
        s = _norm(line)
        if not s:
            continue
        if _IL3_PART.fullmatch(s):
            part = _il3_title(s)
            continue
        m = _IL3_ITEM.match(s)
        if m:
            if cur_no is not None:
                items.append((cur_part, cur_no, cur_lines))
            cur_no, cur_part, cur_lines = m.group(1), part, [s]
        elif cur_no is not None:
            cur_lines.append(s)
    if cur_no is not None:
        items.append((cur_part, cur_no, cur_lines))

    # The chunk id uses the running position n rather than the printed item
    # number; the citation keeps the printed number.
    for n, (item_part, item_no, lines) in enumerate(items, start=1):
        body = re.sub(r"^\d+\.\s*", "", "\n".join(lines)).strip()
        if not body:
            continue
        # Everything before "Delegation –" / "Designation –" is the list of
        # provisions; it becomes the marginal note.
        power = _IL3_POWER.search(body)
        refs = _normalize_refs(body[:power.start()]) if power else ""
        chunks.append({
            "id": f"delegation-{src['code']}-{n}",
            "doc_type": "delegation",
            "act_code": src["act_code"],
            "act_short": src["act_short"],
            "act_name": src["act_name"],
            "section": "",
            "marginal_note": refs or f"Item {item_no}",
            "part": item_part or "",
            "division": "",
            "heading": item_part or "",
            "text": _normalize_refs(body),
            "history": "",
            "last_amended": "",
            "current_to": version,
            "citation": (f"{src['act_short']} — {item_part}, Item {item_no}"
                         if item_part else f"{src['act_short']}, Item {item_no}"),
            "source_url": src["url"],
        })
    return chunks


def build():
    """Fetch, parse and chunk every delegation instrument into delegation.json.

    Each SOURCES entry is fetched (from the raw cache when present) and handed
    to the parser matching its "kind". Ingestion is best-effort: a source that
    raises, or whose kind is not recognised, is reported and skipped, and the
    output file is still written from the sources that succeeded. The closing
    summary counts only those sources and names any that failed.
    """
    all_chunks = []
    failed = []
    for src in SOURCES.values():
        print(f"Ingesting {src['act_short']} ...")
        try:
            if src["kind"] == "html-cbsa":
                html = _fetch(src["url"], RAW / f"{src['code']}.html")
                chunks = parse_cbsa(html, src)
            elif src["kind"] == "html-cbsa-narrative":
                html = _fetch(src["url"], RAW / f"{src['code']}.html")
                chunks = parse_cbsa_narrative(html, src)
            elif src["kind"] == "pdf-ircc":
                pdf = _fetch(src["url"], RAW / f"{src['code']}.pdf",
                             powershell=True)
                chunks = parse_ircc(pdf, src)
            else:
                # A mistyped kind would otherwise pass as "0 chunks".
                print(f"  !! {src['act_short']}: unknown source kind "
                      f"{src['kind']!r}")
                failed.append(src["act_short"])
                continue
        except Exception as exc:
            # Top-level boundary: one broken source must not stop the rest.
            print(f"  !! {src['act_short']}: {type(exc).__name__}: {exc}")
            failed.append(src["act_short"])
            continue
        all_chunks.extend(chunks)
        print(f"  {len(chunks)} chunks")
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
                   encoding="utf-8")
    print(f"\n{len(all_chunks)} delegation chunks from "
          f"{len(SOURCES) - len(failed)} "
          f"instrument(s) -> {OUT}")
    if failed:
        print(f"!! {len(failed)} instrument(s) failed and are missing from "
              f"the output: {', '.join(failed)}")


# Script entry point: running the module (py -m canlex.delegation) rebuilds
# delegation.json.
if __name__ == "__main__":
    build()