Spaces:

Beemer0
/

CanLex

Running

CanLex / canlex /agreement.py

Beemer

CanLex MCP server

21626e7 6 days ago

5.87 kB

	"""Ingest Treasury Board collective agreements (HTML) into Article-level chunks.

	A collective agreement is a binding contract between the Treasury Board and a
	bargaining agent for one occupational group. Chunks are tagged
	doc_type="agreement" so CanLex keeps them distinct from legislation and guidance.
	"""
	import json
	import re
	import subprocess
	import sys
	import time

	from bs4 import BeautifulSoup

	from .config import RAW_DIR, PROCESSED_DIR

	# Downloaded agreement pages are cached under AGREEMENT_DIR; the parsed chunks
	# for every agreement are written together to OUT_FILE.
	AGREEMENT_DIR = RAW_DIR / "agreements"
	OUT_FILE = PROCESSED_DIR / "agreements.json"

	# canada.ca rejects non-browser user agents, so identify as a desktop browser.
	_UA = (
	    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	    "AppleWebKit/537.36 (KHTML, like Gecko) "
	    "Chrome/124.0.0.0 Safari/537.36"
	)

	# Treasury Board collective agreements to ingest, keyed by group code.
	# Add another entry here to ingest another agreement.
	AGREEMENTS = {
	    "FB": {
	        "short": "FB Agreement",
	        "name": "FB Group Collective Agreement (Border Services)",
	        "cite": "FB Collective Agreement",
	        "url": (
	            "https://www.canada.ca/en/treasury-board-secretariat/"
	            "topics/pay/collective-agreements/fb.html"
	        ),
	    },
	}

	# Lower-cased heading texts that parse_agreement never turns into a chunk.
	_SKIP_HEADINGS = {
	    "table of contents",
	    "note to readers",
	    "page details",
	    "on this page",
	}

	# Tags whose text _block_text collects as the body of a heading (lists are
	# handled separately there).
	_CONTENT_TAGS = {
	    "p",
	    "dl",
	    "table",
	    "h4",
	    "h5",
	    "h6",
	    "blockquote",
	}

	# "Article <number> <separator> <title>", where the separator is a colon, an
	# en dash or a hyphen with whitespace on both sides; matched case-insensitively.
	_ARTICLE = re.compile(r"Article\s+(\S+?)\s[:–-]\s(.+)", re.IGNORECASE)


	def _norm(text):
	    """Collapse each run of whitespace in *text* to one space and trim the ends.

	    Falsy input (``None`` or an empty string) yields ``""``.
	    """
	    if not text:
	        return ""
	    squeezed = re.sub(r"\s+", " ", text)
	    return squeezed.strip()


	def _ps_quote(value):
	    """Return *value* as a PowerShell single-quoted string literal."""
	    # Nothing is expanded inside single quotes; the only character that needs
	    # escaping is the quote itself, which is written twice. PowerShell also
	    # accepts the typographic quotes U+2018-U+201B as single quotes, so those
	    # are doubled as well.
	    escaped = re.sub("(['\u2018\u2019\u201a\u201b])", r"\1\1", str(value))
	    return f"'{escaped}'"


	def _fetch(url, dest, force=False):
	    """Return the bytes of *url*, downloading it to *dest* unless already cached.

	    canada.ca blocks Python's HTTP client at the TLS layer, so the download is
	    delegated to PowerShell's (.NET) HTTP stack, which the site accepts.

	    Args:
	        url: Address of the page to download.
	        dest: Path of the cache file; its parent directory is created on demand.
	        force: Download again even when *dest* already exists.

	    Returns:
	        The contents of *dest* as bytes.

	    Raises:
	        subprocess.CalledProcessError: PowerShell exited with a non-zero status.
	        subprocess.TimeoutExpired: The download took longer than 180 seconds.
	    """
	    if dest.exists() and not force:
	        return dest.read_bytes()
	    dest.parent.mkdir(parents=True, exist_ok=True)

	    # Download next to the target and rename only on success, so a failed or
	    # timed-out run never leaves a truncated file that later runs would treat
	    # as a valid cache entry.
	    partial = dest.with_name(dest.name + ".part")
	    command = (
	        f"Invoke-WebRequest -Uri {_ps_quote(url)} -OutFile {_ps_quote(partial)} "
	        f"-UseBasicParsing -UserAgent {_ps_quote(_UA)}"
	    )
	    try:
	        subprocess.run(
	            ["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
	            check=True, capture_output=True, timeout=180,
	        )
	        partial.replace(dest)
	    finally:
	        # After a successful rename the partial file is gone; otherwise drop
	        # whatever the failed download left behind.
	        if partial.exists():
	            partial.unlink()
	    time.sleep(0.5)  # be polite to the server
	    return dest.read_bytes()


	def _block_text(heading):
	    """Collect the readable text that follows *heading* up to the next h2/h3.

	    Only the heading's following siblings are examined, so any wrapping
	    sections must already have been unwrapped by the caller. Each direct
	    ``<li>`` of a list becomes a ``- item`` line; every tag in _CONTENT_TAGS
	    contributes one line of normalised text. Lines are joined with newlines,
	    and an empty string is returned when nothing was collected.
	    """
	    collected = []
	    for node in heading.find_next_siblings():
	        tag = node.name
	        if tag == "h2" or tag == "h3":
	            break  # the next heading starts a new block
	        if tag in ("ul", "ol"):
	            bullets = (
	                _norm(entry.get_text(" ", strip=True))
	                for entry in node.find_all("li", recursive=False)
	            )
	            collected.extend(f"- {bullet}" for bullet in bullets if bullet)
	            continue
	        if tag in _CONTENT_TAGS:
	            paragraph = _norm(node.get_text(" ", strip=True))
	            if paragraph:
	                collected.append(paragraph)
	    return "\n".join(collected)


	def _agreement_expiry(root):
	    """Return the datetime of the first <time> that is not the page's dateModified."""
	    for stamp in root.find_all("time"):
	        when = stamp.get("datetime")
	        if when and stamp.get("property") != "dateModified":
	            return _norm(when)
	    return ""


	def _label_heading(title, cite):
	    """Split a heading's text into (section, marginal note, citation)."""
	    found = _ARTICLE.match(title)
	    if found:
	        number = found.group(1).rstrip(":.")
	        return number, found.group(2).strip(), f"{cite}, Article {number}"
	    if re.match(r"Appendix\b", title, re.I):
	        return title, "", f"{cite}, {title}"
	    # Memoranda of Agreement/Understanding have long titles: keep them in the
	    # note only, so the rendered header does not repeat them.
	    return "", title, cite


	def parse_agreement(html, code):
	    """Parse a collective agreement page into one chunk per Article / Appendix.

	    Args:
	        html: Markup of the agreement page, as handed to BeautifulSoup.
	        code: Key of the agreement in AGREEMENTS.

	    Returns:
	        A list of chunk dicts in document order, one for every h2/h3 heading
	        that has body text. The list is empty when the page has no ``<main>``.

	    Raises:
	        KeyError: *code* is not a key of AGREEMENTS.
	    """
	    meta = AGREEMENTS[code]
	    root = BeautifulSoup(html, "html.parser").find("main")
	    if root is None:
	        return []

	    # The first <time datetime> is the agreement's expiry; the dateModified
	    # one is the page's date.
	    expiry = _agreement_expiry(root)

	    # Flatten the page so each heading and its content become siblings.
	    for wrapper in root.find_all(["section", "div"]):
	        wrapper.unwrap()

	    chunks = []
	    current_part = ""
	    for heading in root.find_all(["h2", "h3"]):
	        classes = heading.get("class") or []
	        if "wb-inv" in classes:
	            continue
	        title = _norm(heading.get_text(" ", strip=True)).lstrip("*").strip()
	        if not title or title.lower() in _SKIP_HEADINGS:
	            continue

	        is_part = heading.name == "h2"
	        if is_part:
	            # Remembered even when this heading has no body of its own, so
	            # the h3 headings that follow are filed under it.
	            current_part = title

	        body = _block_text(heading)
	        if not body:
	            continue

	        section, note, citation = _label_heading(title, meta["cite"])
	        chunks.append({
	            "id": f"agreement-{code}-{len(chunks) + 1}",
	            "doc_type": "agreement",
	            "act_code": code,
	            "act_short": meta["short"],
	            "act_name": meta["name"],
	            "section": section,
	            "marginal_note": note,
	            "part": "" if is_part else current_part,
	            "division": "",
	            "heading": "",
	            "text": body,
	            "history": "",
	            "last_amended": "",
	            "current_to": expiry,
	            "citation": citation,
	            "source_url": meta["url"],
	        })
	    return chunks


	def main():
	    """Fetch and parse every configured agreement, then write the chunks to OUT_FILE.

	    Passing ``--force`` on the command line re-downloads pages that are already
	    cached. A failure on one agreement is reported on stdout and does not stop
	    the run; the output file is written with whatever was ingested.
	    """
	    force = any(arg == "--force" for arg in sys.argv)
	    all_chunks = []
	    for code, meta in AGREEMENTS.items():
	        print(f"Ingesting {code} ({meta['short']})...")
	        try:
	            page = _fetch(meta["url"], AGREEMENT_DIR / f"{code}.html", force=force)
	            chunks = parse_agreement(page, code)
	            print(f" {len(chunks)} chunks")
	            all_chunks += chunks
	        except Exception as exc:
	            # Best effort: report the problem and carry on with the next one.
	            print(f" FAILED {code}: {type(exc).__name__}: {exc}")

	    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
	    payload = json.dumps(all_chunks, ensure_ascii=False, indent=2)
	    OUT_FILE.write_text(payload, encoding="utf-8")
	    print(f"\n{len(all_chunks)} chunks from {len(AGREEMENTS)} agreement(s) -> {OUT_FILE.name}")


	# Run the ingest only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()