Spaces:

Beemer0
/

CanLex

Running

File size: 7,019 Bytes

21626e7

"""Ingest National Joint Council directives (HTML) into section-level chunks.

NJC directives are negotiated by employer and bargaining-agent representatives;
their provisions form part of collective agreements (and the rate tables in
their appendices apply too). Chunks are tagged doc_type="directive".
"""
import json
import re
import subprocess
import sys
import time

from bs4 import BeautifulSoup

from .config import RAW_DIR, PROCESSED_DIR

# Index page listing the directives; fetched and parsed by directive_links().
INDEX_URL = "https://www.njc-cnm.gc.ca/directive/en"
# Prefix applied to any scraped href that does not already start with "http".
BASE = "https://www.njc-cnm.gc.ca"
# Directory where the downloaded HTML pages are cached (one file per page).
DIRECTIVE_DIR = RAW_DIR / "directives"
# JSON file that main() writes the combined chunk list to.
OUT_FILE = PROCESSED_DIR / "directives.json"
# User-Agent string handed to Invoke-WebRequest by _fetch().
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
       "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
# Sibling tags whose text _block_text() keeps as section body; <ul>/<ol> are
# handled separately there, and any other tag is ignored.
_CONTENT_TAGS = {"p", "dl", "table", "h4", "h5", "h6", "blockquote"}
# Heading that starts with a dotted number ("1", "1.2", "3.4.5", ...) followed
# by whitespace and text: group 1 is the number, group 2 the remaining text.
_NUMBERED = re.compile(r"^(\d+(?:\.\d+)*)\s+(.+)")
# Captures the path segment that follows "/directive/" in a URL; callers use it
# as the directive code.
_CODE = re.compile(r"/directive/([^/]+)/")


def _norm(text):
    """Collapse each run of whitespace to a single space and trim both ends.

    A falsy ``text`` (``None`` or ``""``) yields the empty string.
    """
    raw = text if text else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()


def _ps_quote(value):
    """Return ``value`` as a PowerShell single-quoted string literal.

    Inside single quotes PowerShell performs no expansion; the only character
    that needs escaping is the quote itself, which is doubled. PowerShell also
    accepts the typographic quotes U+2018/2019/201A/201B as single-quote
    delimiters, so those are doubled as well.
    """
    escaped = re.sub("(['\u2018\u2019\u201a\u201b])", r"\1\1", str(value))
    return f"'{escaped}'"


def _fetch(url, dest, force=False):
    """Fetch via PowerShell's (.NET) HTTP stack -- some government sites block
    Python's HTTP client at the TLS layer.

    Args:
        url: Address to download. It may come from scraped HTML, so it is
            quoted as a literal before being placed in the PowerShell command.
        dest: ``pathlib.Path`` of the cache file for this page.
        force: When true, download again even if ``dest`` already exists.

    Returns:
        The page content as bytes, read from ``dest``.

    Raises:
        subprocess.CalledProcessError: PowerShell exited with a non-zero status.
        subprocess.TimeoutExpired: The download took longer than 180 seconds.
        OSError: PowerShell could not be started or the file could not be
            written/read.
    """
    if dest.exists() and not force:
        return dest.read_bytes()
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Download next to the final location and rename only on success, so an
    # interrupted transfer never leaves a truncated file that later runs would
    # mistake for a valid cache entry (and a failed --force run keeps the old
    # copy intact).
    partial = dest.with_name(dest.name + ".part")
    command = (f"Invoke-WebRequest -Uri {_ps_quote(url)} "
               f"-OutFile {_ps_quote(partial)} "
               f"-UseBasicParsing -UserAgent {_ps_quote(_UA)}")
    try:
        subprocess.run(["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
                       check=True, capture_output=True, timeout=180)
    except (subprocess.SubprocessError, OSError):
        if partial.exists():
            partial.unlink()
        raise
    partial.replace(dest)
    time.sleep(0.5)  # be polite to the server
    return dest.read_bytes()


def directive_links(force=False):
    """List the current NJC directives found on the index page.

    The index page is fetched (or read from its cached copy unless ``force``
    is true) and every direct ``<li>`` child of ``<ul class="directive-list">``
    contributes one entry.

    Returns:
        A list of ``(url, title, date)`` tuples; ``date`` is ``""`` when the
        item has no ``<span class="date">``. An empty list is returned when
        the directive list is not present on the page.
    """
    page = _fetch(INDEX_URL, DIRECTIVE_DIR / "_index.html", force=force)
    listing = BeautifulSoup(page, "html.parser").find("ul", class_="directive-list")
    if listing is None:
        return []

    links = []
    for item in listing.find_all("li", recursive=False):
        # Only the first link of an item is used: it is the current directive.
        anchor = item.find("a", href=True)
        if not anchor:
            continue
        href = anchor["href"]
        if not href.startswith("http"):
            href = BASE + href
        title = _norm(anchor.get_text(" ", strip=True))
        date_span = item.find("span", class_="date")
        date = _norm(date_span.get_text()) if date_span else ""
        links.append((href, title, date))
    return links


def _block_text(heading):
    """Gather the readable text that follows ``heading`` up to the next h2/h3.

    Walks the heading's following siblings and stops at the first ``h2`` or
    ``h3``. Direct ``<li>`` children of ``<ul>``/``<ol>`` become ``"- item"``
    lines, tags listed in ``_CONTENT_TAGS`` contribute their normalised text,
    and every other sibling is skipped. Empty results are dropped.

    Returns:
        The collected lines joined with newlines (``""`` when nothing is found).
    """
    collected = []
    for node in heading.find_next_siblings():
        tag = node.name
        if tag == "h2" or tag == "h3":
            break
        if tag == "ul" or tag == "ol":
            bullets = (_norm(entry.get_text(" ", strip=True))
                       for entry in node.find_all("li", recursive=False))
            collected.extend(f"- {bullet}" for bullet in bullets if bullet)
            continue
        if tag not in _CONTENT_TAGS:
            continue
        paragraph = _norm(node.get_text(" ", strip=True))
        if paragraph:
            collected.append(paragraph)
    return "\n".join(collected)


def parse_directive(html, url, title, date):
    """Split one NJC directive page into a chunk per h2/h3 section.

    Args:
        html: Page markup handed to BeautifulSoup.
        url: Page address; stored as ``source_url`` and used to derive the
            directive code (the whole URL is used when no code is found).
        title: Directive title; used for ``act_short``, ``act_name`` and the
            citation.
        date: Stored unchanged as ``current_to``.

    Returns:
        A list of chunk dicts in document order. Headings with no text or no
        body are skipped, and the list is empty when the page has no
        ``<main>`` element.
    """
    root = BeautifulSoup(html, "html.parser").find("main")
    if root is None:
        return []
    # Remove every <section>/<div> wrapper (keeping its children) so that each
    # heading ends up as a sibling of the content that follows it.
    for wrapper in root.find_all(["section", "div"]):
        wrapper.unwrap()

    found = _CODE.search(url)
    code = url if found is None else found.group(1)

    chunks = []
    latest_h2 = ""
    for heading in root.find_all(["h2", "h3"]):
        label = _norm(heading.get_text(" ", strip=True))
        if not label:
            continue
        level = heading.name
        if level == "h2":
            latest_h2 = label
        body = _block_text(heading)
        if not body:
            continue

        hit = _NUMBERED.match(label)
        if hit is None:
            # Un-numbered heading: its text lives in the note only, so a
            # rendered header (citation + note) does not show it twice.
            number = ""
            note = label
            citation = title
        else:
            number = hit.group(1)
            note = hit.group(2).strip()
            citation = f"{title}, s. {number}"

        sequence = len(chunks) + 1
        chunk = {
            "id": f"directive-{code}-{sequence}",
            "doc_type": "directive",
            "act_code": code,
            "act_short": title,
            "act_name": f"NJC {title}",
            "section": number or label,
            "marginal_note": note,
            # Only h3 sections record the enclosing h2 as their part.
            "part": latest_h2 if level == "h3" else "",
            "division": "",
            "heading": "",
            "text": body,
            "history": "",
            "last_amended": "",
            "current_to": date,
            "citation": citation,
            "source_url": url,
        }
        chunks.append(chunk)
    return chunks


def _print_link(html):
    """Locate the 'Print Full Directive' / 'Print all FSDs' link on a page.

    Returns:
        The absolute URL of the first ``<a href>`` whose lower-cased text
        contains one of the two phrases (``BASE`` is prepended unless the href
        starts with "http"), or ``None`` when no such link exists.
    """
    wanted = ("print full directive", "print all fsd")
    for anchor in BeautifulSoup(html, "html.parser").find_all("a", href=True):
        label = _norm(anchor.get_text(" ", strip=True)).lower()
        if not any(phrase in label for phrase in wanted):
            continue
        target = anchor["href"]
        return target if target.startswith("http") else BASE + target
    return None


def main():
    """Download, parse and save every directive listed on the NJC index page.

    Command-line flags (read from ``sys.argv``):
        --force     re-download pages even when a cached copy exists.
        --limit=N   only process the first N directives (0 means no limit).

    All chunks are written as one JSON array to ``OUT_FILE``. A directive that
    raises or yields no chunks does not abort the run; it is recorded and
    reported in the summary printed at the end.
    """
    import hashlib  # local import: only needed for the cache-name fallback

    force = "--force" in sys.argv
    limit = next((int(a.split("=", 1)[1]) for a in sys.argv[1:]
                  if a.startswith("--limit=")), None)
    directives = directive_links(force=force)
    if limit:
        directives = directives[:limit]
    print(f"Ingesting {len(directives)} NJC directives...")
    all_chunks, failures = [], []
    for url, title, date in directives:
        code_match = _CODE.search(url)
        if code_match:
            code = code_match.group(1)
        else:
            # No directive code in the URL. A fixed placeholder would make all
            # such URLs share one cache file, so later ones would silently be
            # parsed from the first one's cached page; use a per-URL digest.
            digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:12]
            code = f"x-{digest}"
        try:
            html = _fetch(url, DIRECTIVE_DIR / f"{code}.html", force=force)
            chunks = parse_directive(html, url, title, date)
            if not chunks:
                # Multi-page directive: the landing page is only a table of
                # contents -- follow its "Print Full Directive" link.
                print_url = _print_link(html)
                if print_url:
                    full = _fetch(print_url, DIRECTIVE_DIR / f"{code}-full.html",
                                  force=force)
                    chunks = parse_directive(full, url, title, date)
            if chunks:
                all_chunks.extend(chunks)
                print(f"  {title}: {len(chunks)} chunks")
            else:
                failures.append((title, "no content parsed"))
        except Exception as exc:
            # Top-level boundary: one bad directive must not stop the batch;
            # the error is kept and reported below.
            failures.append((title, f"{type(exc).__name__}: {exc}"))
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT_FILE.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=2),
                        encoding="utf-8")
    print(f"\n{len(all_chunks)} chunks from {len(directives) - len(failures)} "
          f"directives -> {OUT_FILE.name}")
    for title, why in failures:
        print(f"  FAILED {title}: {why}")


# Script entry point: run the full ingest when executed directly.
# NOTE(review): the module uses a relative import (`from .config import ...`),
# so it presumably has to be launched as a package module (`python -m ...`)
# rather than as a bare file -- confirm against the project layout.
if __name__ == "__main__":
    main()