"""Ingest Chapters 98 and 99 of the Customs Tariff Schedule.

The Customs Tariff's Schedule is the Harmonized System classification of
goods -- chapters 1-97 categorise every imported good for duty purposes.
Those 97 chapters are huge and outside CanLex's scope, but **chapters 98 and
99 are different**: they carry Canada's "special classification" provisions,
which matter for almost every CBSA border interaction:

  - Chapter 98 (non-commercial): traveller exemptions, settler's effects,
    Canadian goods returned, ancestral household effects, conveyances
    temporarily imported by a resident, etc.
  - Chapter 99 (commercial): temporary importations, end-use programs,
    government imports, reduced-rate goods for specific industries.

Source: the CBSA's HTML edition of the current Customs Tariff. The Justice
Laws XML for the Act (C-54.011) does NOT include the Schedule.

Chunking is one chunk per 4-digit HEADING (98.01, 98.02, ...) plus one chunk
per chapter for its Notes and Subheading Notes -- a heading is the natural
unit of legal classification (the eight- and ten-digit items below it are the
same rule with finer rate granularity).

    py -m canlex.tariff_schedule
"""

import json
import re
import time
import urllib.request
from collections import defaultdict

from bs4 import BeautifulSoup

from .config import PROCESSED_DIR, RAW_DIR

# Directory holding the downloaded chapter HTML (one cached file per chapter).
RAW = RAW_DIR / "tariff_schedule"
# JSON file that receives the combined list of chunks for both chapters.
OUT = PROCESSED_DIR / "tariff_schedule.json"

# User-Agent header sent with every chapter download request.
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
       "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")

# The 2026 edition is the current Customs Tariff at the time of writing. To
# refresh: bump the year in the URL and `edition` once CBSA publishes the next.
SOURCES = {
    "ch98": {
        "code": "ch98",
        "chapter": "98",
        "title": "Special classification provisions — non-commercial",
        "url": ("https://www.cbsa-asfc.gc.ca/trade-commerce/tariff-tarif/"
                "2026/html/00/ch98-eng.html"),
        "edition": "2026",
    },
    "ch99": {
        "code": "ch99",
        "chapter": "99",
        "title": "Special classification provisions — commercial",
        "url": ("https://www.cbsa-asfc.gc.ca/trade-commerce/tariff-tarif/"
                "2026/html/00/ch99-eng.html"),
        "edition": "2026",
    },
}


def _norm(text):
    """Collapse every whitespace run (including NBSP) to one space and trim.

    ``None`` and the empty string both yield ``""``.
    """
    return re.sub(r"\s+", " ", (text or "").replace("\xa0", " ")).strip()


def _cache_name(src):
    """Return the raw-cache file name for one ``SOURCES`` entry.

    The edition is part of the name so that bumping ``edition`` (and the URL)
    triggers a fresh download instead of silently re-parsing the previous
    edition's cached HTML. Cache files written under the older
    ``<code>.html`` naming are simply ignored.
    """
    return f"{src['code']}-{src['edition']}.html"


def _fetch(url, dest):
    """Return the bytes at *url*, using *dest* as an on-disk cache.

    Args:
        url: Address to download when no cached copy exists.
        dest: ``pathlib.Path`` of the cache file; returned as-is if present.

    Returns:
        The cached or freshly downloaded payload as ``bytes``.

    Raises:
        ValueError: If the server answers with an empty body.
        OSError: On network or filesystem failure (``urllib.error.URLError``
            is an ``OSError`` subclass).
    """
    if dest.exists():
        return dest.read_bytes()

    dest.parent.mkdir(parents=True, exist_ok=True)
    req = urllib.request.Request(url, headers={"User-Agent": _UA})
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = resp.read()
    if not data:
        # Never cache an empty payload: it would be served forever after.
        raise ValueError(f"empty response from {url}")

    # Write to a sibling temp file and rename, so an interrupted write can
    # never leave a truncated file that later runs would trust as the cache.
    tmp = dest.with_name(dest.name + ".part")
    tmp.write_bytes(data)
    tmp.replace(dest)

    # Pause after a real download (the connection is already closed here).
    time.sleep(0.5)
    return data


def _heading_of(item):
    """Map a tariff-item code to its 4-digit heading.

    The schedule uses three levels:
      - '98.01'      (4 digits) -- heading
      - '9801.10'    (6 digits) -- subheading
      - '9801.10.10' (8 digits) -- tariff item
    All three roll up to heading '98.01'.

    Returns:
        The heading as ``'NN.NN'``, or ``None`` when *item* contains fewer
        than four digits.
    """
    digits = re.sub(r"\D", "", item)
    if len(digits) < 4:
        return None
    return f"{digits[:2]}.{digits[2:4]}"


def _collect_notes(main):
    """Return the chapter's Notes + Subheading Notes as a single text block.

    For each of the two labels, the first matching ``<h2>`` under *main* is
    located and the text of every following sibling is gathered until the
    next ``<h2>``. Sections are emitted as ``"<label>:\\n<lines>"`` and joined
    by a blank line; the result is ``""`` when neither section has text.
    """
    out = []
    for label in ("Notes", "Subheading Notes"):
        # "Notes" is a substring of "Subheading Notes", so the plain-Notes
        # lookup must explicitly reject headings that mention "Subheading".
        # `lbl=label` binds the current label at definition time.
        h2 = main.find(
            "h2",
            string=lambda s, lbl=label: s and lbl in s and (
                lbl != "Notes" or "Subheading" not in s))
        if not h2:
            continue
        parts = []
        sib = h2.find_next_sibling()
        while sib and sib.name != "h2":
            t = _norm(sib.get_text(" ", strip=True))
            if t:
                parts.append(t)
            sib = sib.find_next_sibling()
        if parts:
            out.append(f"{label}:\n" + "\n".join(parts))
    return "\n\n".join(out)


def _make_chunk(src, *, chunk_id, section, marginal_note, text, citation):
    """Build one output chunk dict with the fixed Customs Tariff metadata.

    Key order is significant: it is the order the keys appear in the JSON
    written by ``build``.
    """
    return {
        "id": chunk_id,
        "doc_type": "legislation",
        "act_code": "C-54.011",
        "act_short": "Customs Tariff",
        "act_name": "Customs Tariff",
        "section": section,
        "marginal_note": marginal_note,
        "part": f"Schedule, Chapter {src['chapter']}",
        "division": "",
        "heading": src["title"],
        "text": text,
        "history": "",
        "last_amended": "",
        "current_to": src["edition"],
        "citation": citation,
        "source_url": src["url"],
    }


def _format_item_line(row, heading_desc):
    """Render one subheading/tariff-item row as a single readable line.

    *row* is ``(item, ss, desc, unit, mfn, pref)``. The row description is
    left out when it merely repeats *heading_desc* verbatim (very common in
    this Schedule), and a unit of ``"-"`` is treated as "no unit".
    """
    item, ss, item_desc, unit, mfn, pref = row
    label = f" {item}"
    if ss:
        label += f" (SS {ss})"

    extras = []
    if item_desc and item_desc != heading_desc:
        extras.append(item_desc)

    rate = []
    if mfn:
        rate.append(f"MFN {mfn}")
    if pref:
        rate.append(f"Pref: {pref}")
    if unit and unit != "-":
        rate.append(f"Unit {unit}")
    if rate:
        extras.append(" | ".join(rate))

    if not extras:
        return label
    return label + " — " + "; ".join(extras)


def _heading_body(heading, desc, rows):
    """Compose a heading chunk's text: description first, then its items."""
    lines = [
        _format_item_line(row, desc)
        for row in rows
        # Bare 4-digit rows are the heading itself, already carried by `desc`.
        if len(re.sub(r"\D", "", row[0])) != 4
    ]
    body = f"Heading {heading} — {desc}"
    if lines:
        body += "\n\nTariff items:\n" + "\n".join(lines)
    return body


def parse_chapter(html, src):
    """Parse one Customs Tariff Schedule chapter into chunks.

    Args:
        html: Chapter page markup (``str`` or ``bytes``) for BeautifulSoup.
        src: One ``SOURCES`` entry (``chapter``, ``title``, ``url``,
            ``edition``).

    Returns:
        A list of chunk dicts: an optional Notes chunk for the chapter,
        followed by one chunk per 4-digit heading in sorted order. Empty when
        the page has no ``<main>`` element; Notes-only when ``<main>`` holds
        no ``<table>``.
    """
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    # Drop <sup> elements so their text does not leak into the extracted text.
    for sup in main.find_all("sup"):
        sup.decompose()

    chunks = []
    chapter = src["chapter"]
    citation_root = f"Customs Tariff, Sched., Ch. {chapter}"

    # Chapter Notes + Subheading Notes -- one chunk per chapter.
    notes_body = _collect_notes(main)
    if notes_body:
        chunks.append(_make_chunk(
            src,
            chunk_id=f"tariff-sched-ch{chapter}-notes",
            section=f"Sch-Ch{chapter}-Notes",
            marginal_note=f"Chapter {chapter} Notes — {src['title']}",
            text=notes_body,
            citation=f"{citation_root}, Notes",
        ))

    # Walk every row in the schedule table, grouping by 4-digit heading.
    table = main.find("table")
    if table is None:
        return chunks

    # heading -> list of (item, ss, desc, unit, mfn, pref)
    rows_by_heading = defaultdict(list)
    # heading -> the 4-digit row's description
    heading_desc = {}
    for tr in table.find_all("tr"):
        cells = tr.find_all(["td", "th"], recursive=False)
        if not cells:
            continue
        first = _norm(cells[0].get_text(" ", strip=True))
        if not first or first == "Tariff Item":
            continue  # header row or blank
        heading = _heading_of(first)
        if heading is None:
            continue

        # Columns 1..5 are SS, description, unit, MFN and preferential rate;
        # short rows are padded with "" so the unpacking below always works.
        rest = [_norm(c.get_text(" ", strip=True)) for c in cells[1:6]]
        rest += [""] * (5 - len(rest))
        ss, desc, unit, mfn, pref = rest

        # A 4-digit row carries only the heading number and description --
        # store it once. Every row (heading rows included) is kept so the
        # description fallback below can still see it.
        is_heading_row = len(re.sub(r"\D", "", first)) == 4
        if is_heading_row and desc and heading not in heading_desc:
            heading_desc[heading] = desc
        rows_by_heading[heading].append((first, ss, desc, unit, mfn, pref))

    for heading in sorted(rows_by_heading):
        rows = rows_by_heading[heading]
        desc = heading_desc.get(heading, "")
        if not desc:
            # No bare 4-digit row -- fall back to the first row's description.
            desc = next((r[2] for r in rows if r[2]), "")
        chunks.append(_make_chunk(
            src,
            chunk_id=f"tariff-sched-{heading.replace('.', '-')}",
            section=f"Sch-{heading}",
            marginal_note=desc[:200],
            text=_heading_body(heading, desc, rows),
            citation=f"Customs Tariff, Sched., heading {heading}",
        ))
    return chunks


def build():
    """Download (or reuse cached) chapters, parse them, and write ``OUT``.

    Each chapter is processed independently: a failure is reported and the
    remaining chapters still run. If nothing at all was produced and a
    previous output file exists, that file is left untouched rather than
    being overwritten with an empty list.
    """
    all_chunks = []
    for src in SOURCES.values():
        print(f"Ingesting Customs Tariff Schedule {src['code']} ...")
        try:
            html = _fetch(src["url"], RAW / _cache_name(src))
            chunks = parse_chapter(html, src)
        except Exception as exc:
            # Deliberate best-effort boundary: report and move on.
            print(f" !! {src['code']}: {type(exc).__name__}: {exc}")
            continue
        all_chunks.extend(chunks)
        print(f" {len(chunks)} chunks")

    if not all_chunks and OUT.exists():
        # Every chapter failed or parsed to nothing -- keep the last good
        # output instead of clobbering it with "[]".
        print(f" !! no chunks produced; leaving existing {OUT} untouched")
        return

    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
                   encoding="utf-8")
    print(f"\n{len(all_chunks)} tariff-schedule chunks from {len(SOURCES)} "
          f"chapter(s) -> {OUT}")


if __name__ == "__main__":
    build()