Spaces:

Beemer0
/

CanLex

Running

File size: 5,869 Bytes

21626e7

"""Ingest Treasury Board collective agreements (HTML) into Article-level chunks.

A collective agreement is a binding contract between the Treasury Board and a
bargaining agent for one occupational group. Chunks are tagged
doc_type="agreement" so CanLex keeps them distinct from legislation and guidance.
"""
import json
import re
import subprocess
import sys
import time

from bs4 import BeautifulSoup

from .config import RAW_DIR, PROCESSED_DIR

# Directory holding the cached raw HTML, one "<code>.html" file per agreement.
AGREEMENT_DIR = RAW_DIR / "agreements"
# Single JSON file that receives the chunks of every ingested agreement.
OUT_FILE = PROCESSED_DIR / "agreements.json"
# canada.ca rejects non-browser user agents, so present a browser one.
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
       "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")

# Treasury Board collective agreements to ingest. Add an entry to ingest more.
# Keyed by the agreement code; each entry carries:
#   short - compact label, stored on every chunk as "act_short"
#   name  - full title, stored on every chunk as "act_name"
#   cite  - prefix used when building a chunk's "citation"
#   url   - page the agreement is downloaded from, stored as "source_url"
AGREEMENTS = {
    "FB": {
        "short": "FB Agreement",
        "name": "FB Group Collective Agreement (Border Services)",
        "cite": "FB Collective Agreement",
        "url": "https://www.canada.ca/en/treasury-board-secretariat/topics/pay/"
               "collective-agreements/fb.html",
    },
}

# Heading titles (compared lower-cased) that the parser skips entirely.
_SKIP_HEADINGS = {"table of contents", "note to readers", "page details",
                  "on this page"}
# Sibling tags whose text is collected into a chunk body. Lists (ul/ol) are not
# listed here because they are handled separately and rendered as "- " lines.
_CONTENT_TAGS = {"p", "dl", "table", "h4", "h5", "h6", "blockquote"}
# Matches "Article <number> <separator> <title>" at the start of a heading,
# case-insensitively. The separator is a colon, an en dash or a hyphen;
# group 1 is the article number and group 2 the title.
_ARTICLE = re.compile(r"Article\s+(\S+?)\s*[:–-]\s*(.+)", re.I)


def _norm(text):
    """Collapse each run of whitespace to one space and trim both ends.

    A falsy *text* (``None`` or the empty string) yields the empty string.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def _ps_quote(value):
    """Return *value* as a PowerShell single-quoted string literal.

    Inside single quotes PowerShell takes every character literally except the
    quote itself, which is escaped by doubling it. PowerShell also accepts the
    typographic quotes U+2018..U+201B as single-quote delimiters, so those are
    doubled as well.
    """
    escaped = re.sub("(['\u2018\u2019\u201a\u201b])", r"\1\1", str(value))
    return f"'{escaped}'"


def _fetch(url, dest, force=False):
    """Download a page. canada.ca blocks Python's HTTP client at the TLS layer,
    so fetch via PowerShell's (.NET) HTTP stack, which the site accepts.

    Args:
        url: Address of the page to download.
        dest: ``pathlib.Path`` of the cache file. When it already exists and
            *force* is false, its bytes are returned without any network call.
        force: Download again even if *dest* exists.

    Returns:
        The raw bytes of the page.

    Raises:
        subprocess.CalledProcessError: PowerShell exited with a non-zero status.
        subprocess.TimeoutExpired: The download took longer than 180 seconds.
    """
    if dest.exists() and not force:
        return dest.read_bytes()
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Download next to the target and move into place only on success, so an
    # interrupted or failed transfer never leaves a truncated file that the
    # cache check above would keep serving on later runs.
    partial = dest.with_name(dest.name + ".part")
    # Every interpolated value is quoted, so an apostrophe in a path or URL
    # cannot terminate the string literal and alter the command.
    command = (f"Invoke-WebRequest -Uri {_ps_quote(url)} -OutFile {_ps_quote(partial)} "
               f"-UseBasicParsing -UserAgent {_ps_quote(_UA)}")
    try:
        subprocess.run(["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
                       check=True, capture_output=True, timeout=180)
        partial.replace(dest)
    finally:
        # After a successful replace the partial file is gone; after a failure
        # remove whatever was written.
        if partial.exists():
            partial.unlink()
    time.sleep(0.5)  # be polite to the server
    return dest.read_bytes()


def _block_text(heading):
    """Gather the readable text that follows *heading*, one item per line.

    Walks the heading's following siblings and stops at the next h2 or h3.
    Direct ``li`` children of ul/ol siblings become "- " prefixed lines; tags
    listed in ``_CONTENT_TAGS`` contribute their normalised text; every other
    sibling is ignored. Empty items are dropped.
    """
    collected = []
    for node in heading.find_next_siblings():
        tag = node.name
        if tag == "h2" or tag == "h3":
            break  # the next heading starts another chunk
        if tag == "ul" or tag == "ol":
            entries = (_norm(li.get_text(" ", strip=True))
                       for li in node.find_all("li", recursive=False))
            collected.extend(f"- {entry}" for entry in entries if entry)
            continue
        if tag not in _CONTENT_TAGS:
            continue
        paragraph = _norm(node.get_text(" ", strip=True))
        if paragraph:
            collected.append(paragraph)
    return "\n".join(collected)


def _label_heading(title, cite):
    """Return ``(section, marginal_note, citation)`` for a heading title."""
    found = _ARTICLE.match(title)
    if found is not None:
        number = found.group(1).rstrip(":.")
        return number, found.group(2).strip(), f"{cite}, Article {number}"
    if re.match(r"Appendix\b", title, re.I):
        return title, "", f"{cite}, {title}"
    # Memoranda of Agreement/Understanding have long titles: keep them in the
    # note only, so the rendered header does not repeat them.
    return "", title, cite


def parse_agreement(html, code):
    """Split a collective agreement page into one chunk per h2/h3 heading.

    Args:
        html: Markup of the agreement page, as accepted by BeautifulSoup.
        code: Key into ``AGREEMENTS`` identifying the agreement.

    Returns:
        A list of chunk dicts in document order, or an empty list when the
        page has no ``<main>`` element. Headings that are visually hidden
        ("wb-inv"), empty, listed in ``_SKIP_HEADINGS`` or without any body
        text produce no chunk.

    Raises:
        KeyError: *code* is not a key of ``AGREEMENTS``.
    """
    meta = AGREEMENTS[code]
    root = BeautifulSoup(html, "html.parser").find("main")
    if root is None:
        return []

    # The first <time datetime> is the agreement's expiry; the dateModified one
    # is the page's date.
    expiry = next(
        (_norm(stamp.get("datetime"))
         for stamp in root.find_all("time")
         if stamp.get("datetime") and stamp.get("property") != "dateModified"),
        "",
    )

    # Flatten the wrappers so each heading and its content become siblings.
    for wrapper in root.find_all(["section", "div"]):
        wrapper.unwrap()

    chunks = []
    part = ""
    for heading in root.find_all(["h2", "h3"]):
        classes = heading.get("class") or []
        if "wb-inv" in classes:
            continue
        title = _norm(heading.get_text(" ", strip=True)).lstrip("*").strip()
        if not title or title.lower() in _SKIP_HEADINGS:
            continue
        if heading.name == "h2":
            part = title  # remembered for the h3 headings that follow
        body = _block_text(heading)
        if not body:
            continue

        number, note, citation = _label_heading(title, meta["cite"])
        sequence = len(chunks) + 1
        chunks.append({
            "id": f"agreement-{code}-{sequence}",
            "doc_type": "agreement",
            "act_code": code,
            "act_short": meta["short"],
            "act_name": meta["name"],
            "section": number,
            "marginal_note": note,
            "part": part if heading.name == "h3" else "",
            "division": "",
            "heading": "",
            "text": body,
            "history": "",
            "last_amended": "",
            "current_to": expiry,
            "citation": citation,
            "source_url": meta["url"],
        })
    return chunks


def main():
    """Fetch and parse every configured agreement, then write the chunks as JSON.

    Passing ``--force`` on the command line re-downloads pages that are already
    cached. A failure on one agreement is reported and does not stop the
    others; the output file is written with whatever was collected.
    """
    refetch = "--force" in sys.argv
    collected = []
    for code in AGREEMENTS:
        info = AGREEMENTS[code]
        print(f"Ingesting {code} ({info['short']})...")
        try:
            page = _fetch(info["url"], AGREEMENT_DIR / f"{code}.html", force=refetch)
            parsed = parse_agreement(page, code)
            print(f"  {len(parsed)} chunks")
            collected += parsed
        except Exception as exc:
            print(f"  FAILED {code}: {type(exc).__name__}: {exc}")
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(collected, ensure_ascii=False, indent=2)
    OUT_FILE.write_text(payload, encoding="utf-8")
    print(f"\n{len(collected)} chunks from {len(AGREEMENTS)} agreement(s) -> {OUT_FILE.name}")


# Script entry point: run the ingest only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()