"""Ingest CBSA D-Memoranda (HTML) into structured, section-level chunks.

D-Memoranda are CBSA's administrative guidance on how it applies the Customs Act
and related law. They are persuasive, not binding -- every chunk is tagged
doc_type="memorandum" so the rest of CanLex can keep them distinct from statute.
"""
import io
import json
import re
import sys
import time
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from pypdf import PdfReader
from .config import RAW_DIR, PROCESSED_DIR
# CBSA index page from which the individual memo links are harvested.
INDEX_URL = "https://www.cbsa-asfc.gc.ca/publications/dm-md/d1-d23-eng.html"
# Local cache directory for downloaded pages (PDFs are cached under its "pdf" subfolder).
DMEMO_DIR = RAW_DIR / "dmemos"
# JSON file that receives the full list of parsed chunks.
OUT_FILE = PROCESSED_DIR / "dmemos.json"
# <h2> headings that are page boilerplate rather than memo content.
# Entries are lowercase because headings are lowercased before the lookup.
_SKIP_HEADINGS = {"contact us", "related links"}
# Matches the href of an individual English memo page, e.g. "/dm-md/d1/d1-2-3-eng.html".
_MEMO_HREF = re.compile(r"/dm-md/d\d+/d[\d-]+-eng\.html")
# Captures the memo number (e.g. "d1-2-3") from a memo page URL; callers upper-case it.
_URL_NUMBER = re.compile(r"/(d\d+-[\d-]+)-eng\.html")
def _norm(text):
return re.sub(r"\s+", " ", text or "").strip()
def _fetch(url, dest, force=False):
if dest.exists() and not force:
return dest.read_bytes()
req = urllib.request.Request(url, headers={"User-Agent": "CanLex/0.1"})
with urllib.request.urlopen(req, timeout=60) as resp:
data = resp.read()
dest.parent.mkdir(parents=True, exist_ok=True)
dest.write_bytes(data)
time.sleep(0.5) # be polite to the CBSA server
return data
def memo_urls(force=False):
    """Return the absolute URL of every D-memo linked from the CBSA index page.

    The index page is obtained through the cached fetch (re-downloaded when
    *force* is true). Links are kept in page order with duplicates removed.
    """
    index_html = _fetch(INDEX_URL, DMEMO_DIR / "_index.html", force=force)
    anchors = BeautifulSoup(index_html, "html.parser").find_all("a", href=True)
    absolute = (
        urljoin(INDEX_URL, anchor["href"])
        for anchor in anchors
        if _MEMO_HREF.search(anchor["href"])
    )
    # dict keys keep insertion order, so this de-duplicates without reordering.
    return list(dict.fromkeys(absolute))
def _render_section(h2):
    """Render the content that follows *h2* as newline-joined plain text.

    Walks the heading's following siblings until the next <h2> or the
    element with id "wb-dtmd". Direct <li> children of a <ul>/<ol> become
    "- item" lines; any other element contributes its normalized text as a
    single line. Empty lines are dropped. Assumes <section> wrappers were
    already unwrapped so a heading and its content are siblings.
    """
    rendered = []
    for node in h2.find_next_siblings():
        # NOTE(review): "wb-dtmd" is presumably the page's date-modified
        # footer block -- confirm against a live page.
        if node.name == "h2" or node.get("id") == "wb-dtmd":
            break
        if node.name not in ("ul", "ol"):
            paragraph = _norm(node.get_text(" ", strip=True))
            if paragraph:
                rendered.append(paragraph)
            continue
        # Only top-level items: nested lists are folded into their parent's text.
        bullets = (_norm(li.get_text(" ", strip=True))
                   for li in node.find_all("li", recursive=False))
        rendered.extend(f"- {bullet}" for bullet in bullets if bullet)
    return "\n".join(rendered)
def parse_memo(html, url):
    """Turn one D-memo HTML page into a list of chunk dicts, one per <h2> section.

    Returns an empty list when the page has no <main> element or when no
    <h2> section produces any text. Headings listed in ``_SKIP_HEADINGS``
    and sections whose rendered body is empty are left out. Chunk ids are
    numbered from 1 in document order, counting only the chunks kept.

    The memo number comes from *url* (upper-cased); when the URL does not
    match the expected pattern the whole URL is used in its place.
    """
    document = BeautifulSoup(html, "html.parser")
    main = document.find("main")
    if main is None:
        return []

    # Dissolve <section> wrappers so each <h2> sits beside its own content.
    for wrapper in main.find_all("section"):
        wrapper.unwrap()

    found = _URL_NUMBER.search(url)
    number = found.group(1).upper() if found else url

    topic = ""
    h1 = main.find("h1")
    if h1:
        # Pages vary: most carry the memo title in <h1><small>, others as plain
        # "Memorandum DNN-N-N: Title" h1 text. Prefer the <small>, fall back to
        # the h1 text, and strip a leading memo-number prefix either way.
        small = h1.find("small")
        title_node = small if small else h1
        title = _norm(title_node.get_text(" ", strip=True))
        topic = re.sub(r"^Memorandum\s+D[\w-]+\s*[:–-]\s*", "", title, flags=re.I)

    date = ""
    stamp = main.find("time", attrs={"property": "dateModified"})
    if stamp:
        # Prefer the datetime attribute; fall back to the element's text.
        date = _norm(stamp.get("datetime") or stamp.get_text())

    chunks = []
    for h2 in main.find_all("h2"):
        heading = _norm(h2.get_text(" ", strip=True))
        if heading and heading.lower() not in _SKIP_HEADINGS:
            body = _render_section(h2)
            if body:
                chunks.append({
                    "id": f"dmemo-{number}-{len(chunks) + 1}",
                    "doc_type": "memorandum",
                    "act_code": "D-Memo",
                    "act_short": "D-Memo",
                    "act_name": "CBSA D-Memoranda",
                    "section": number,
                    "marginal_note": heading,
                    "part": topic,
                    "division": "",
                    "heading": "",
                    "text": body,
                    "history": "",
                    "last_amended": date,
                    "current_to": date,
                    "citation": f"Memorandum {number}",
                    "source_url": url,
                })
    return chunks
def _pdf_clean(text):
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r"\n[ \t]+", "\n", text)
return re.sub(r"\n{3,}", "\n\n", text).strip()
def _pdf_text(pdf_bytes):
try:
reader = PdfReader(io.BytesIO(pdf_bytes))
return _pdf_clean("\n".join((p.extract_text() or "") for p in reader.pages))
except Exception:
return ""
def _split(text, target=3000):
"""Split long PDF text into ~target-sized pieces at line boundaries."""
if len(text) <= target:
return [text]
parts, buf, size = [], [], 0
for line in text.split("\n"):
if size + len(line) > target and buf:
parts.append("\n".join(buf))
buf, size = [], 0
buf.append(line)
size += len(line) + 1
if buf:
parts.append("\n".join(buf))
return parts
def parse_pdf_memo(html, url, force=False):
    """Fallback for memos whose HTML page is only a stub linking to a PDF.

    Finds the first link inside <main> whose href ends in ".pdf", downloads
    it through the cached fetch (stored under ``DMEMO_DIR / "pdf"``),
    extracts its text and splits it into chunks.

    Args:
        html: The memo's HTML stub page.
        url: URL of that page; used to resolve the PDF link, derive the memo
            number, and as each chunk's ``source_url``.
        force: When true, re-download the PDF even if a cached copy exists.
            Defaults to false, which reuses the cache.

    Returns:
        A list of chunk dicts (ids ``dmemo-<number>-pdf<i>``), or an empty
        list when there is no <main>, no PDF link, or no extractable text.
    """
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    pdf_href = next((a["href"] for a in main.find_all("a", href=True)
                     if a["href"].lower().endswith(".pdf")), None)
    if not pdf_href:
        return []
    pdf_url = urljoin(url, pdf_href)
    match = _URL_NUMBER.search(url)
    number = match.group(1).upper() if match else url
    h1 = main.find("h1")
    topic = _norm(h1.get_text(" ", strip=True)) if h1 else ""
    # Strip a leading "Memorandum DNN-N-N:" prefix so only the title remains.
    topic = re.sub(r"^Memorandum\s+D[\w-]+\s*[:–-]\s*", "", topic, flags=re.I)
    dm = main.find("time", attrs={"property": "dateModified"})
    date = _norm(dm.get("datetime") or dm.get_text()) if dm else ""
    # Honour ``force`` here too; otherwise a forced re-ingest would refresh the
    # HTML pages but keep serving stale cached PDFs.
    pdf_bytes = _fetch(pdf_url, DMEMO_DIR / "pdf" / pdf_url.rsplit("/", 1)[-1],
                       force=force)
    text = _pdf_text(pdf_bytes)
    if not text:
        return []
    parts = _split(text)
    chunks = []
    for i, part in enumerate(parts, 1):
        label = topic or number
        if len(parts) > 1:
            label = f"{label} (part {i})"
        chunks.append({
            "id": f"dmemo-{number}-pdf{i}",
            "doc_type": "memorandum",
            "act_code": "D-Memo",
            "act_short": "D-Memo",
            "act_name": "CBSA D-Memoranda",
            "section": number,
            "marginal_note": label,
            "part": topic,
            "division": "",
            "heading": "",
            "text": part,
            "history": "",
            "last_amended": date,
            "current_to": date,
            "citation": f"Memorandum {number}",
            "source_url": url,
        })
    return chunks


def ingest(force=False, limit=None):
    """Download and parse every D-memo, then write all chunks to ``OUT_FILE``.

    Each memo page is parsed as HTML first; if that yields nothing, the PDF
    fallback is tried. A memo that raises or yields no chunks is recorded as
    a failure and does not stop the run. Progress is printed every 50 memos
    and a summary (with up to 15 failures) at the end.

    Args:
        force: Re-download the index, the memo pages and any linked PDFs
            instead of using cached copies.
        limit: Process only the first *limit* memo URLs. A falsy value
            (``None`` or ``0``) means no limit.
    """
    urls = memo_urls(force=force)
    if limit:
        urls = urls[:limit]
    print(f"Ingesting {len(urls)} D-Memoranda...")
    all_chunks, failures = [], []
    for i, url in enumerate(urls, 1):
        try:
            html = _fetch(url, DMEMO_DIR / url.rsplit("/", 1)[-1], force=force)
            # Pass ``force`` to the PDF fallback so cached PDFs are refreshed too.
            chunks = (parse_memo(html, url)
                      or parse_pdf_memo(html, url, force=force))
            if chunks:
                all_chunks.extend(chunks)
            else:
                failures.append((url, "no content parsed"))
        except Exception as exc:
            # Best-effort batch job: record the failure and keep going.
            failures.append((url, f"{type(exc).__name__}: {exc}"))
        if i % 50 == 0:
            print(f"  {i}/{len(urls)} ...")
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT_FILE.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"  {len(all_chunks)} section-chunks from {len(urls) - len(failures)} memos "
          f"-> {OUT_FILE.name}")
    if failures:
        print(f"  {len(failures)} memos with no content / errors:")
        for url, why in failures[:15]:
            print(f"    - {url.rsplit('/', 1)[-1]}: {why}")
def main():
    """Command-line entry point.

    Recognised arguments: ``--force`` (re-download instead of using the
    cache) and ``--limit=N`` (only the first N memos; the first such
    argument wins). A non-integer N raises ``ValueError``.
    """
    force = "--force" in sys.argv
    limit = None
    for arg in sys.argv[1:]:
        if arg.startswith("--limit="):
            limit = int(arg.split("=", 1)[1])
            break
    ingest(force=force, limit=limit)
# Run the ingest only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|