"""Ingest National Joint Council directives (HTML) into section-level chunks.

NJC directives are negotiated by employer and bargaining-agent representatives;
their provisions form part of collective agreements (and the rate tables in
their appendices apply too). Chunks are tagged doc_type="directive".
"""
import json
import re
import subprocess
import sys
import time
from bs4 import BeautifulSoup
from .config import RAW_DIR, PROCESSED_DIR
# Index page listing the NJC directives; fetched and parsed by directive_links().
INDEX_URL = "https://www.njc-cnm.gc.ca/directive/en"
# Site origin, prepended to any href that does not already start with "http".
BASE = "https://www.njc-cnm.gc.ca"
# Directory (under RAW_DIR) where fetched HTML pages are stored and re-read.
DIRECTIVE_DIR = RAW_DIR / "directives"
# JSON file that main() writes the collected chunks to.
OUT_FILE = PROCESSED_DIR / "directives.json"
# User-Agent string handed to Invoke-WebRequest by _fetch().
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
# Tags whose text _block_text() keeps as a single line each; <ul>/<ol> are
# handled separately there, one line per direct <li>.
_CONTENT_TAGS = {"p", "dl", "table", "h4", "h5", "h6", "blockquote"}
# Heading that starts with a dotted number ("1", "1.2", "1.2.3") followed by
# whitespace and more text: group 1 is the number, group 2 the remaining text.
_NUMBERED = re.compile(r"^(\d+(?:\.\d+)*)\s+(.+)")
# Captures the path segment directly after "/directive/" (a trailing "/" is
# required); the callers use it as the directive code.
_CODE = re.compile(r"/directive/([^/]+)/")
def _norm(text):
    """Collapse each whitespace run in *text* to one space and trim both ends.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def _fetch(url, dest, force=False):
    """Return the bytes of *url*, downloading it to *dest* unless already cached.

    The download goes through PowerShell's (.NET) HTTP stack -- some government
    sites block Python's HTTP client at the TLS layer.

    Args:
        url: Address to download.
        dest: ``pathlib.Path`` used as the on-disk cache for this URL.
        force: When true, download again even if *dest* already exists.

    Returns:
        The content of *dest* as bytes.

    Raises:
        subprocess.CalledProcessError: PowerShell exited with a non-zero status.
        subprocess.TimeoutExpired: The download took longer than 180 seconds.
        FileNotFoundError: PowerShell reported success but wrote no file.
    """
    import os  # local import: only this function needs the process environment

    if dest.exists() and not force:
        return dest.read_bytes()
    dest.parent.mkdir(parents=True, exist_ok=True)

    # Download next to the target and rename on success, so an interrupted or
    # failed transfer can never be mistaken for a valid cache entry later.
    partial = dest.with_name(dest.name + ".part")

    # URLs come from scraped HTML. Passing them (and the path) through the
    # environment instead of splicing them into the command text means a quote
    # character in either cannot break out of the PowerShell command.
    env = dict(os.environ,
               NJC_FETCH_URL=url,
               NJC_FETCH_OUT=str(partial),
               NJC_FETCH_UA=_UA)
    command = ("Invoke-WebRequest -Uri $env:NJC_FETCH_URL "
               "-OutFile $env:NJC_FETCH_OUT "
               "-UseBasicParsing -UserAgent $env:NJC_FETCH_UA")
    try:
        subprocess.run(
            ["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
            check=True, capture_output=True, timeout=180, env=env)
        partial.replace(dest)
    finally:
        # After a successful rename this is a no-op; after a failure it removes
        # whatever was partially written.
        if partial.exists():
            partial.unlink()
    time.sleep(0.5)  # be polite to the server
    return dest.read_bytes()
def directive_links(force=False):
    """Return ``[(url, title, date), ...]`` for the current NJC directives.

    The index page is fetched (or read from the cache; *force* re-downloads it)
    and every direct ``<li>`` of the ``<ul class="directive-list">`` contributes
    one tuple. An index without that list yields an empty list.
    """
    index_html = _fetch(INDEX_URL, DIRECTIVE_DIR / "_index.html", force=force)
    listing = BeautifulSoup(index_html, "html.parser").find(
        "ul", class_="directive-list")
    if listing is None:
        return []

    links = []
    for item in listing.find_all("li", recursive=False):
        anchor = item.find("a", href=True)  # the first <a> is the current directive
        if not anchor:
            continue
        href = anchor["href"]
        if not href.startswith("http"):
            href = BASE + href
        date_span = item.find("span", class_="date")
        title = _norm(anchor.get_text(" ", strip=True))
        date = _norm(date_span.get_text()) if date_span else ""
        links.append((href, title, date))
    return links
def _block_text(heading):
    """Readable text from *heading* up to the next h2/h3 (sections unwrapped).

    Walks the siblings that follow *heading*. Direct ``<li>`` children of a
    ``<ul>``/``<ol>`` become ``"- item"`` lines, tags listed in
    ``_CONTENT_TAGS`` become one line each, and anything else is skipped.
    Lines are joined with newlines.
    """
    collected = []
    for node in heading.find_next_siblings():
        tag = node.name
        if tag in ("h2", "h3"):
            break  # the next section starts here
        if tag in ("ul", "ol"):
            bullets = (_norm(entry.get_text(" ", strip=True))
                       for entry in node.find_all("li", recursive=False))
            collected.extend(f"- {bullet}" for bullet in bullets if bullet)
        elif tag in _CONTENT_TAGS:
            paragraph = _norm(node.get_text(" ", strip=True))
            if paragraph:
                collected.append(paragraph)
    return "\n".join(collected)
def parse_directive(html, url, title, date):
    """Parse one NJC directive page into one chunk per h2/h3 section.

    Args:
        html: Markup of the directive page.
        url: Page address; stored as ``source_url`` and used to derive the
            directive code (the whole URL is used when ``_CODE`` does not match).
        title: Directive title, used for ``act_short``, ``act_name`` and the
            citation.
        date: Stored unchanged as ``current_to``.

    Returns:
        A list of chunk dicts. Headings with no text or no body text are
        skipped; a page without a ``<main>`` element yields an empty list.
    """
    content = BeautifulSoup(html, "html.parser").find("main")
    if content is None:
        return []

    # Flatten wrappers so each heading and its content become siblings.
    for wrapper in content.find_all(["section", "div"]):
        wrapper.unwrap()

    found = _CODE.search(url)
    code = url if found is None else found.group(1)

    chunks = []
    part = ""
    for heading in content.find_all(["h2", "h3"]):
        label = _norm(heading.get_text(" ", strip=True))
        if not label:
            continue
        is_part_heading = heading.name == "h2"
        if is_part_heading:
            part = label  # remembered for the h3 sections that follow
        body = _block_text(heading)
        if not body:
            continue

        hit = _NUMBERED.match(label)
        if hit is None:
            # Un-numbered heading: keep the text in the note only, so the
            # rendered header (citation + note) does not repeat it.
            number, note, citation = "", label, title
        else:
            number = hit.group(1)
            note = hit.group(2).strip()
            citation = f"{title}, s. {number}"

        chunk = {
            "id": f"directive-{code}-{len(chunks) + 1}",
            "doc_type": "directive",
            "act_code": code,
            "act_short": title,
            "act_name": f"NJC {title}",
            "section": number or label,
            "marginal_note": note,
            "part": "" if is_part_heading else part,
            "division": "",
            "heading": "",
            "text": body,
            "history": "",
            "last_amended": "",
            "current_to": date,
            "citation": citation,
            "source_url": url,
        }
        chunks.append(chunk)
    return chunks
def _print_link(html):
    """Find a 'Print Full Directive' / 'Print all FSDs' link on a TOC page.

    Returns the absolute URL of the first matching ``<a href>`` (matching is
    case-insensitive on the link text), or ``None`` when there is none.
    """
    wanted = ("print full directive", "print all fsd")
    for anchor in BeautifulSoup(html, "html.parser").find_all("a", href=True):
        label = _norm(anchor.get_text(" ", strip=True)).lower()
        if not any(phrase in label for phrase in wanted):
            continue
        href = anchor["href"]
        return href if href.startswith("http") else BASE + href
    return None
def main():
    """Fetch every listed NJC directive, parse it and write the chunks as JSON.

    Command-line flags (read from ``sys.argv``):
        --force     re-download pages even when a cached copy exists
        --limit=N   only process the first N directives (``--limit=0`` means none)

    A directive that raises or yields no chunks is recorded as a failure and
    reported at the end; it does not stop the run.
    """
    force = "--force" in sys.argv
    limit = next((int(arg.split("=", 1)[1]) for arg in sys.argv[1:]
                  if arg.startswith("--limit=")), None)
    directives = directive_links(force=force)
    # Compare against None so an explicit --limit=0 is honoured instead of
    # being treated as "no limit"; negative values are clamped to 0.
    if limit is not None:
        directives = directives[:max(limit, 0)]
    print(f"Ingesting {len(directives)} NJC directives...")
    all_chunks, failures = [], []
    for url, title, date in directives:
        code_match = _CODE.search(url)
        if code_match:
            code = code_match.group(1)
        else:
            # Derive the cache name from the URL itself: a shared constant
            # would make every unmatched directive reuse one cached file.
            code = re.sub(r"[^A-Za-z0-9]+", "-", url).strip("-") or "x"
        try:
            html = _fetch(url, DIRECTIVE_DIR / f"{code}.html", force=force)
            chunks = parse_directive(html, url, title, date)
            if not chunks:
                # Multi-page directive: the landing page is only a table of
                # contents -- follow its "Print Full Directive" link.
                print_url = _print_link(html)
                if print_url:
                    full = _fetch(print_url, DIRECTIVE_DIR / f"{code}-full.html",
                                  force=force)
                    chunks = parse_directive(full, url, title, date)
            if chunks:
                all_chunks.extend(chunks)
                print(f" {title}: {len(chunks)} chunks")
            else:
                failures.append((title, "no content parsed"))
        except Exception as exc:
            # Top-level boundary: one bad directive must not abort the batch.
            failures.append((title, f"{type(exc).__name__}: {exc}"))
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT_FILE.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=2),
                        encoding="utf-8")
    print(f"\n{len(all_chunks)} chunks from {len(directives) - len(failures)} "
          f"directives -> {OUT_FILE.name}")
    for title, why in failures:
        print(f" FAILED {title}: {why}")


if __name__ == "__main__":
    main()
|