| """Ingest Treasury Board collective agreements (HTML) into Article-level chunks. |
| |
| A collective agreement is a binding contract between the Treasury Board and a |
| bargaining agent for one occupational group. Chunks are tagged |
| doc_type="agreement" so CanLex keeps them distinct from legislation and guidance. |
| """ |
| import json |
| import re |
| import subprocess |
| import sys |
| import time |
|
|
| from bs4 import BeautifulSoup |
|
|
| from .config import RAW_DIR, PROCESSED_DIR |
|
|
# Directory (under RAW_DIR) holding the downloaded agreement pages; main()
# stores each page there as "<code>.html".
AGREEMENT_DIR = RAW_DIR / "agreements"
# JSON file (under PROCESSED_DIR) that main() writes the combined chunk list to.
OUT_FILE = PROCESSED_DIR / "agreements.json"

# Browser-like User-Agent string that _fetch passes to Invoke-WebRequest.
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
       "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")


# Registry of agreements to ingest, keyed by the code used in chunk ids and as
# "act_code". Per-entry fields:
#   short -> copied to each chunk as "act_short"
#   name  -> copied to each chunk as "act_name"
#   cite  -> prefix of every chunk's "citation"
#   url   -> page that is downloaded, also recorded as "source_url"
AGREEMENTS = {
    "FB": {
        "short": "FB Agreement",
        "name": "FB Group Collective Agreement (Border Services)",
        "cite": "FB Collective Agreement",
        "url": "https://www.canada.ca/en/treasury-board-secretariat/topics/pay/"
               "collective-agreements/fb.html",
    },
}


# Heading texts (compared lower-cased) that parse_agreement skips entirely.
_SKIP_HEADINGS = {"table of contents", "note to readers", "page details",
                  "on this page"}
# Sibling tags whose text _block_text collects as-is; ul/ol are handled
# separately there and rendered as "- item" lines.
_CONTENT_TAGS = {"p", "dl", "table", "h4", "h5", "h6", "blockquote"}
# Matches "Article <number> <separator> <title>" at the start of a heading,
# case-insensitively; the separator is a colon, an en dash or a hyphen.
# Group 1 is the article number, group 2 the title.
_ARTICLE = re.compile(r"Article\s+(\S+?)\s*[:–-]\s*(.+)", re.I)
|
|
|
|
def _norm(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends.

    Any falsy input (``None``, ``""``) yields the empty string.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
|
|
|
|
def _ps_quote(value):
    """Return *value* as a PowerShell single-quoted string literal.

    Inside single quotes PowerShell performs no expansion; the only characters
    that need escaping are the quote characters themselves, which are escaped
    by doubling. PowerShell accepts the typographic single quotes as quote
    characters too, so those are doubled as well.
    """
    quote_chars = "'\u2018\u2019\u201a\u201b"
    escaped = "".join(ch * 2 if ch in quote_chars else ch for ch in str(value))
    return f"'{escaped}'"


def _fetch(url, dest, force=False):
    """Download a page. canada.ca blocks Python's HTTP client at the TLS layer,
    so fetch via PowerShell's (.NET) HTTP stack, which the site accepts.

    Args:
        url: Address of the page to download.
        dest: Path-like cache location; it is returned as-is when it already
            exists unless *force* is set.
        force: Re-download even when *dest* is already present.

    Returns:
        The page content as bytes, read back from *dest*.

    Raises:
        subprocess.CalledProcessError: PowerShell exited with a non-zero status.
        subprocess.TimeoutExpired: the download took longer than 180 seconds.
    """
    if dest.exists() and not force:
        return dest.read_bytes()
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Download next to the final location and rename on success, so a failed or
    # timed-out transfer can never leave a truncated file that the cache check
    # above would then keep serving.
    partial = dest.with_name(dest.name + ".part")
    # Every interpolated value is quoted: a quote character in the URL or in
    # the cache path must not terminate the PowerShell string early.
    command = (f"Invoke-WebRequest -Uri {_ps_quote(url)} -OutFile {_ps_quote(partial)} "
               f"-UseBasicParsing -UserAgent {_ps_quote(_UA)}")
    try:
        subprocess.run(["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
                       check=True, capture_output=True, timeout=180)
        partial.replace(dest)
    finally:
        # Best-effort cleanup of a leftover partial download; after a
        # successful rename the file is already gone.
        try:
            partial.unlink()
        except OSError:
            pass
    # Short pause between consecutive downloads.
    time.sleep(0.5)
    return dest.read_bytes()
|
|
|
|
def _block_text(heading):
    """Gather the readable text that follows *heading*, one item per line.

    Walks the heading's following siblings and stops at the next h2/h3.
    Direct ``li`` children of a ul/ol become "- item" lines; tags listed in
    _CONTENT_TAGS contribute their normalised text; everything else is
    ignored. The caller is expected to have unwrapped section/div containers
    so the content really is a sibling of the heading.

    Returns the collected lines joined with newlines ("" when nothing found).
    """
    collected = []
    for node in heading.find_next_siblings():
        tag = node.name
        if tag == "h2" or tag == "h3":
            # The next heading starts a new block.
            break
        if tag == "ul" or tag == "ol":
            bullets = (_norm(entry.get_text(" ", strip=True))
                       for entry in node.find_all("li", recursive=False))
            collected.extend(f"- {bullet}" for bullet in bullets if bullet)
            continue
        if tag not in _CONTENT_TAGS:
            continue
        paragraph = _norm(node.get_text(" ", strip=True))
        if paragraph:
            collected.append(paragraph)
    return "\n".join(collected)
|
|
|
|
def parse_agreement(html, code):
    """Split one collective-agreement page into heading-delimited chunks.

    Args:
        html: Page markup, handed to BeautifulSoup unchanged.
        code: Key into AGREEMENTS selecting the agreement's metadata.

    Returns:
        A list of chunk dicts, one per h2/h3 heading that has body text, in
        document order. Empty when the page has no ``<main>`` element.

    Raises:
        KeyError: *code* is not a key of AGREEMENTS.
    """
    meta = AGREEMENTS[code]
    main = BeautifulSoup(html, "html.parser").find("main")
    if main is None:
        return []

    # First <time> carrying a datetime attribute that is not the page's
    # dateModified stamp (presumably the agreement's expiry date); it is
    # stored on every chunk as "current_to".
    current_to = next(
        (
            _norm(stamp.get("datetime"))
            for stamp in main.find_all("time")
            if stamp.get("datetime") and stamp.get("property") != "dateModified"
        ),
        "",
    )

    # Flatten layout containers so each heading's content becomes its sibling,
    # which is what _block_text walks.
    for wrapper in main.find_all(["section", "div"]):
        wrapper.unwrap()

    chunks = []
    part_title = ""
    for heading in main.find_all(["h2", "h3"]):
        classes = heading.get("class") or []
        if "wb-inv" in classes:
            continue
        title = _norm(heading.get_text(" ", strip=True)).lstrip("*").strip()
        if not title or title.lower() in _SKIP_HEADINGS:
            continue
        is_part = heading.name == "h2"
        if is_part:
            # Remembered even when the h2 itself has no body, so the h3
            # headings below it can still report it as their part.
            part_title = title
        body = _block_text(heading)
        if not body:
            continue

        hit = _ARTICLE.match(title)
        if hit:
            section = hit.group(1).rstrip(":.")
            note = hit.group(2).strip()
            citation = f"{meta['cite']}, Article {section}"
        elif re.match(r"Appendix\b", title, re.I):
            section, note = title, ""
            citation = f"{meta['cite']}, {title}"
        else:
            # Neither an Article nor an Appendix: keep the heading text as the
            # note and cite the agreement as a whole.
            section, note = "", title
            citation = meta["cite"]

        chunk = {
            "id": f"agreement-{code}-{len(chunks) + 1}",
            "doc_type": "agreement",
            "act_code": code,
            "act_short": meta["short"],
            "act_name": meta["name"],
            "section": section,
            "marginal_note": note,
            "part": "" if is_part else part_title,
            "division": "",
            "heading": "",
            "text": body,
            "history": "",
            "last_amended": "",
            "current_to": current_to,
            "citation": citation,
            "source_url": meta["url"],
        }
        chunks.append(chunk)
    return chunks
|
|
|
|
def main():
    """Fetch and parse every registered agreement, then write the JSON output.

    Passing ``--force`` on the command line re-downloads pages that are
    already cached. A failure on one agreement is reported and skipped so the
    remaining agreements are still processed; the output file is written with
    whatever chunks were collected.
    """
    refetch = "--force" in sys.argv
    collected = []
    for code, meta in AGREEMENTS.items():
        print(f"Ingesting {code} ({meta['short']})...")
        try:
            source_url = meta["url"]
            cache_path = AGREEMENT_DIR / f"{code}.html"
            page = _fetch(source_url, cache_path, force=refetch)
            parsed = parse_agreement(page, code)
            print(f" {len(parsed)} chunks")
            collected.extend(parsed)
        except Exception as exc:
            # Top-level boundary: report the failure and move on.
            print(f" FAILED {code}: {type(exc).__name__}: {exc}")

    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(collected, ensure_ascii=False, indent=2)
    OUT_FILE.write_text(payload, encoding="utf-8")
    print(f"\n{len(collected)} chunks from {len(AGREEMENTS)} agreement(s) -> {OUT_FILE.name}")
|
|
|
|
# Script entry point: run the ingestion only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|
|