"""Ingest National Joint Council directives (HTML) into section-level chunks.
NJC directives are negotiated by employer and bargaining-agent representatives;
their provisions form part of collective agreements (and the rate tables in
their appendices apply too). Chunks are tagged doc_type="directive".
"""
import json
import re
import subprocess
import sys
import time
from bs4 import BeautifulSoup
from .config import RAW_DIR, PROCESSED_DIR
# Index page listing the directives; fetched and parsed by directive_links().
INDEX_URL = "https://www.njc-cnm.gc.ca/directive/en"
# Site origin, prepended to hrefs that do not already start with "http".
BASE = "https://www.njc-cnm.gc.ca"
# Directory where fetched HTML pages are cached on disk.
DIRECTIVE_DIR = RAW_DIR / "directives"
# JSON file that main() writes the collected chunks to.
OUT_FILE = PROCESSED_DIR / "directives.json"
# User-Agent string passed to Invoke-WebRequest by _fetch().
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
# Tag names whose text _block_text() collects verbatim (ul/ol are handled
# separately there, one "- " bullet line per direct <li>).
_CONTENT_TAGS = {"p", "dl", "table", "h4", "h5", "h6", "blockquote"}
# Heading that starts with a dotted number: group 1 is the number
# (e.g. "1.2.3"), group 2 is the remaining heading text.
_NUMBERED = re.compile(r"^(\d+(?:\.\d+)*)\s+(.+)")
# Captures the path segment that follows "/directive/" in a URL; used as the
# directive code.
_CODE = re.compile(r"/directive/([^/]+)/")
def _norm(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends.

    Any falsy input (``None``, ``""``) yields the empty string.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def _ps_quote(value):
    """Return *value* rendered as a PowerShell single-quoted string literal."""
    # Nothing is expanded inside '...'; the only characters that need escaping
    # are the quote itself and the typographic quotes PowerShell accepts as
    # equivalents. Each one is escaped by doubling it.
    escaped = re.sub("['\u2018\u2019\u201a\u201b]",
                     lambda match: match.group(0) * 2, str(value))
    return f"'{escaped}'"


def _fetch(url, dest, force=False):
    """Return the bytes of *url*, using *dest* as an on-disk cache.

    Fetches via PowerShell's (.NET) HTTP stack -- some government sites block
    Python's HTTP client at the TLS layer.

    Args:
        url: Address to download.
        dest: ``pathlib.Path`` of the cache file. If it already exists and
            *force* is false, its contents are returned without any request.
        force: Re-download even when *dest* exists.

    Returns:
        The raw bytes of the (cached or freshly downloaded) page.

    Raises:
        subprocess.CalledProcessError: PowerShell exited with a non-zero status.
        subprocess.TimeoutExpired: The download took longer than 180 seconds.
    """
    if dest.exists() and not force:
        return dest.read_bytes()
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Download next to the target and rename only on success, so a failed or
    # timed-out request never leaves a truncated file that later runs would
    # treat as a valid cache entry (and a forced refresh keeps the old copy).
    partial = dest.with_name(dest.name + ".part")
    # URLs come from scraped hrefs: quote every interpolated value so a stray
    # quote character cannot terminate the literal and alter the command.
    command = (f"Invoke-WebRequest -Uri {_ps_quote(url)} "
               f"-OutFile {_ps_quote(partial)} "
               f"-UseBasicParsing -UserAgent {_ps_quote(_UA)}")
    try:
        subprocess.run(
            ["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
            check=True, capture_output=True, timeout=180)
        partial.replace(dest)
    finally:
        if partial.exists():
            partial.unlink()
    time.sleep(0.5)  # be polite to the server
    return dest.read_bytes()
def directive_links(force=False):
    """Return [(url, title, date), ...] for the current NJC directives.

    The index page is fetched through ``_fetch`` (cached as ``_index.html``
    under ``DIRECTIVE_DIR``). Each direct ``<li>`` of the
    ``<ul class="directive-list">`` contributes one tuple:

    * url   -- the item's first link, resolved to an absolute URL;
    * title -- the link text, whitespace-normalised;
    * date  -- text of the item's ``<span class="date">``, or "" if absent.

    Args:
        force: Passed to ``_fetch``; re-download the index even if cached.

    Returns:
        A list of ``(url, title, date)`` tuples; empty when the list element
        is not found on the page.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    html = _fetch(INDEX_URL, DIRECTIVE_DIR / "_index.html", force=force)
    soup = BeautifulSoup(html, "html.parser")
    listing = soup.find("ul", class_="directive-list")
    if listing is None:
        return []
    links = []
    for item in listing.find_all("li", recursive=False):
        anchor = item.find("a", href=True)  # the first is the current directive
        if anchor is None:
            continue
        # urljoin leaves absolute hrefs alone and resolves root-relative,
        # path-relative and protocol-relative ("//host/...") ones against the
        # index page, where plain string concatenation would yield a bad URL.
        url = urljoin(INDEX_URL, anchor["href"].strip())
        date_span = item.find("span", class_="date")
        title = _norm(anchor.get_text(" ", strip=True))
        date = _norm(date_span.get_text()) if date_span else ""
        links.append((url, title, date))
    return links
def _block_text(heading):
    """Gather the readable text that follows *heading* up to the next h2/h3.

    Only siblings of *heading* are examined, so the caller is expected to have
    unwrapped section/div containers beforehand (parse_directive does this).
    Lists become one "- item" line per direct <li>; elements whose tag is in
    _CONTENT_TAGS become one line each; everything else is ignored. Lines are
    joined with newlines.
    """
    collected = []
    for node in heading.find_next_siblings():
        tag = node.name
        if tag == "h2" or tag == "h3":
            break  # the next section starts here
        if tag in ("ul", "ol"):
            bullets = (_norm(entry.get_text(" ", strip=True))
                       for entry in node.find_all("li", recursive=False))
            collected.extend(f"- {bullet}" for bullet in bullets if bullet)
            continue
        if tag not in _CONTENT_TAGS:
            continue
        paragraph = _norm(node.get_text(" ", strip=True))
        if paragraph:
            collected.append(paragraph)
    return "\n".join(collected)
def parse_directive(html, url, title, date):
    """Split one NJC directive page into a list of chunk dicts, one per h2/h3.

    Headings with no text or no body text are skipped. Returns [] when the page
    has no <main> element. *url*, *title* and *date* are copied into each chunk
    (source_url, act_short / act_name / citation, current_to).
    """
    document = BeautifulSoup(html, "html.parser")
    root = document.find("main")
    if root is None:
        return []

    # Flatten wrappers so each heading and its content become siblings.
    for wrapper in root.find_all(["section", "div"]):
        wrapper.unwrap()

    found = _CODE.search(url)
    code = url if found is None else found.group(1)

    records = []
    part_title = ""
    for heading in root.find_all(["h2", "h3"]):
        label = _norm(heading.get_text(" ", strip=True))
        if not label:
            continue
        is_part = heading.name == "h2"
        if is_part:
            part_title = label
        content = _block_text(heading)
        if not content:
            continue

        hit = _NUMBERED.match(label)
        if hit is None:
            # Un-numbered heading: keep the text in the note only, so the
            # rendered header (citation + note) does not repeat it.
            section_no = ""
            margin = label
            cite = title
        else:
            section_no = hit.group(1)
            margin = hit.group(2).strip()
            cite = f"{title}, s. {section_no}"

        record = {
            "id": f"directive-{code}-{len(records) + 1}",
            "doc_type": "directive",
            "act_code": code,
            "act_short": title,
            "act_name": f"NJC {title}",
            "section": section_no or label,
            "marginal_note": margin,
            # Only h3 sections carry the enclosing h2 title as their part.
            "part": "" if is_part else part_title,
            "division": "",
            "heading": "",
            "text": content,
            "history": "",
            "last_amended": "",
            "current_to": date,
            "citation": cite,
            "source_url": url,
        }
        records.append(record)
    return records
def _print_link(html):
    """Locate the 'Print Full Directive' / 'Print all FSDs' link on a TOC page.

    Returns the first matching link's target as an absolute URL (BASE is
    prepended unless the href already starts with "http"), or None when no
    link text matches.
    """
    page = BeautifulSoup(html, "html.parser")
    wanted = ("print full directive", "print all fsd")
    for anchor in page.find_all("a", href=True):
        label = _norm(anchor.get_text(" ", strip=True)).lower()
        if not any(phrase in label for phrase in wanted):
            continue
        target = anchor["href"]
        if target.startswith("http"):
            return target
        return BASE + target
    return None
def main():
    """Fetch every listed NJC directive, chunk it, and write the chunks as JSON.

    Command-line flags (read from ``sys.argv``):
        --force     re-download pages even when a cached copy exists
        --limit=N   process only the first N directives from the index

    Per-directive errors are collected and reported at the end instead of
    aborting the run; the output file is written regardless.
    """
    force = "--force" in sys.argv
    limit = next((int(arg.split("=", 1)[1]) for arg in sys.argv[1:]
                  if arg.startswith("--limit=")), None)
    directives = directive_links(force=force)
    # Compare against None: with a plain truthiness test, --limit=0 would be
    # indistinguishable from "no limit" and ingest everything.
    if limit is not None:
        directives = directives[:limit]
    print(f"Ingesting {len(directives)} NJC directives...")
    all_chunks, failures = [], []
    for url, title, date in directives:
        code_match = _CODE.search(url)
        if code_match:
            code = code_match.group(1)
        else:
            # Derive a per-URL cache name. A single shared fallback name would
            # make every such directive reuse the first one's cached page.
            code = re.sub(r"[^A-Za-z0-9]+", "-", url).strip("-") or "x"
        try:
            html = _fetch(url, DIRECTIVE_DIR / f"{code}.html", force=force)
            chunks = parse_directive(html, url, title, date)
            if not chunks:
                # Multi-page directive: the landing page is only a table of
                # contents -- follow its "Print Full Directive" link.
                print_url = _print_link(html)
                if print_url:
                    full = _fetch(print_url, DIRECTIVE_DIR / f"{code}-full.html",
                                  force=force)
                    chunks = parse_directive(full, url, title, date)
            if chunks:
                all_chunks.extend(chunks)
                print(f" {title}: {len(chunks)} chunks")
            else:
                failures.append((title, "no content parsed"))
        except Exception as exc:
            # Top-level boundary: record the failure and continue with the
            # remaining directives; it is reported in the summary below.
            failures.append((title, f"{type(exc).__name__}: {exc}"))
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT_FILE.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=2),
                        encoding="utf-8")
    print(f"\n{len(all_chunks)} chunks from {len(directives) - len(failures)} "
          f"directives -> {OUT_FILE.name}")
    for title, why in failures:
        print(f" FAILED {title}: {why}")


if __name__ == "__main__":
    main()