"""Ingest Treasury Board collective agreements (HTML) into Article-level chunks. A collective agreement is a binding contract between the Treasury Board and a bargaining agent for one occupational group. Chunks are tagged doc_type="agreement" so CanLex keeps them distinct from legislation and guidance. """ import json import re import subprocess import sys import time from bs4 import BeautifulSoup from .config import RAW_DIR, PROCESSED_DIR AGREEMENT_DIR = RAW_DIR / "agreements" OUT_FILE = PROCESSED_DIR / "agreements.json" # canada.ca rejects non-browser user agents, so present a browser one. _UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36") # Treasury Board collective agreements to ingest. Add an entry to ingest more. AGREEMENTS = { "FB": { "short": "FB Agreement", "name": "FB Group Collective Agreement (Border Services)", "cite": "FB Collective Agreement", "url": "https://www.canada.ca/en/treasury-board-secretariat/topics/pay/" "collective-agreements/fb.html", }, } _SKIP_HEADINGS = {"table of contents", "note to readers", "page details", "on this page"} _CONTENT_TAGS = {"p", "dl", "table", "h4", "h5", "h6", "blockquote"} _ARTICLE = re.compile(r"Article\s+(\S+?)\s*[:–-]\s*(.+)", re.I) def _norm(text): return re.sub(r"\s+", " ", text or "").strip() def _fetch(url, dest, force=False): """Download a page. canada.ca blocks Python's HTTP client at the TLS layer, so fetch via PowerShell's (.NET) HTTP stack, which the site accepts.""" if dest.exists() and not force: return dest.read_bytes() dest.parent.mkdir(parents=True, exist_ok=True) command = (f"Invoke-WebRequest -Uri '{url}' -OutFile '{dest}' " f"-UseBasicParsing -UserAgent '{_UA}'") subprocess.run(["powershell", "-NoProfile", "-NonInteractive", "-Command", command], check=True, capture_output=True, timeout=180) time.sleep(0.5) # be polite to the server return dest.read_bytes() def _block_text(heading): """Readable text from a heading up to the next h2/h3 (sections unwrapped).""" lines = [] for sib in heading.find_next_siblings(): if sib.name in ("h2", "h3"): break if sib.name in ("ul", "ol"): for li in sib.find_all("li", recursive=False): item = _norm(li.get_text(" ", strip=True)) if item: lines.append(f"- {item}") elif sib.name in _CONTENT_TAGS: text = _norm(sib.get_text(" ", strip=True)) if text: lines.append(text) return "\n".join(lines) def parse_agreement(html, code): """Parse a collective agreement page into one chunk per Article / Appendix.""" meta = AGREEMENTS[code] soup = BeautifulSoup(html, "html.parser") main = soup.find("main") if main is None: return [] # The first