| """Ingest Chapters 98 and 99 of the Customs Tariff Schedule. |
| |
| The Customs Tariff's Schedule is the Harmonized System classification of goods |
| -- chapters 1-97 categorise every imported good for duty purposes. Those 97 |
| chapters are huge and outside CanLex's scope, but **chapters 98 and 99 are |
| different**: they carry Canada's "special classification" provisions, which |
| matter for almost every CBSA border interaction: |
| |
| - Chapter 98 (non-commercial): traveller exemptions, settler's effects, |
| Canadian goods returned, ancestral household effects, conveyances |
| temporarily imported by a resident, etc. |
| - Chapter 99 (commercial): temporary importations, end-use programs, |
| government imports, reduced-rate goods for specific industries. |
| |
| Source: the CBSA's HTML edition of the current Customs Tariff. The Justice |
| Laws XML for the Act (C-54.011) does NOT include the Schedule. |
| |
| Chunking is one chunk per 4-digit HEADING (98.01, 98.02, ...) plus one chunk |
| per chapter for its Notes and Subheading Notes -- a heading is the natural |
| unit of legal classification (the eight- and ten-digit items below it are the |
| same rule with finer rate granularity). |
| |
| py -m canlex.tariff_schedule |
| """ |
| import json |
| import re |
| import time |
| import urllib.request |
| from collections import defaultdict |
|
|
| from bs4 import BeautifulSoup |
|
|
| from .config import PROCESSED_DIR, RAW_DIR |
|
|
# Directory holding the cached chapter HTML that _fetch() downloads.
RAW = RAW_DIR / "tariff_schedule"
# JSON file that build() writes the parsed chunks to.
OUT = PROCESSED_DIR / "tariff_schedule.json"

# Browser-like User-Agent header sent with every download request.
_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)
|
|
| |
| |
# Chapters to ingest, keyed by their short code.  Each entry carries the
# chapter number, its title, the CBSA page to download and the edition label
# recorded on every chunk produced from that page.
SOURCES = {
    entry["code"]: entry
    for entry in (
        dict(
            code="ch98",
            chapter="98",
            title="Special classification provisions — non-commercial",
            url=("https://www.cbsa-asfc.gc.ca/trade-commerce/tariff-tarif/"
                 "2026/html/00/ch98-eng.html"),
            edition="2026",
        ),
        dict(
            code="ch99",
            chapter="99",
            title="Special classification provisions — commercial",
            url=("https://www.cbsa-asfc.gc.ca/trade-commerce/tariff-tarif/"
                 "2026/html/00/ch99-eng.html"),
            edition="2026",
        ),
    )
}
|
|
|
|
| def _norm(text): |
| return re.sub(r"\s+", " ", (text or "").replace("\xa0", " ")).strip() |
|
|
|
|
| def _fetch(url, dest): |
| if dest.exists(): |
| return dest.read_bytes() |
| dest.parent.mkdir(parents=True, exist_ok=True) |
| req = urllib.request.Request(url, headers={"User-Agent": _UA}) |
| with urllib.request.urlopen(req, timeout=60) as resp: |
| dest.write_bytes(resp.read()) |
| time.sleep(0.5) |
| return dest.read_bytes() |
|
|
|
|
| def _heading_of(item): |
| """Map a tariff-item code to its 4-digit heading. The schedule uses three |
| levels: |
| - '98.01' (4 digits) -- heading |
| - '9801.10' (6 digits) -- subheading |
| - '9801.10.10' (8 digits) -- tariff item |
| All three roll up to heading '98.01'.""" |
| digits = re.sub(r"\D", "", item) |
| if len(digits) < 4: |
| return None |
| return f"{digits[:2]}.{digits[2:4]}" |
|
|
|
|
def _collect_notes(main):
    """Gather the chapter's "Notes" and "Subheading Notes" into one string.

    For each of the two labels, the first ``<h2>`` whose string contains the
    label is located, and the text of each following sibling is collected
    until the next ``<h2>`` (or the end of the siblings).  A section that
    yields text is rendered as ``"<label>:"`` followed by one line per
    non-empty sibling; sections are joined with a blank line.  Returns ``""``
    when neither section produces any text.
    """
    def _label_filter(label):
        # "Notes" is a substring of "Subheading Notes", so the plain "Notes"
        # lookup has to reject any heading that mentions "Subheading".
        def check(s):
            return s and label in s and (
                label != "Notes" or "Subheading" not in s)
        return check

    def _texts_until_next_h2(header):
        node = header.find_next_sibling()
        while node and node.name != "h2":
            text = _norm(node.get_text(" ", strip=True))
            if text:
                yield text
            node = node.find_next_sibling()

    sections = []
    for label in ("Notes", "Subheading Notes"):
        header = main.find("h2", string=_label_filter(label))
        if not header:
            continue
        lines = list(_texts_until_next_h2(header))
        if lines:
            sections.append(f"{label}:\n" + "\n".join(lines))
    return "\n\n".join(sections)
|
|
|
|
def _schedule_chunk(src, *, chunk_id, section, marginal_note, text, citation):
    """Build one chunk record for the Customs Tariff Schedule.

    Only the keyword arguments vary between chunks; the remaining fields are
    constants or are read from *src*.  Keys are inserted in a fixed order so
    every record serialises with the same layout.
    """
    return {
        "id": chunk_id,
        "doc_type": "legislation",
        "act_code": "C-54.011",
        "act_short": "Customs Tariff",
        "act_name": "Customs Tariff",
        "section": section,
        "marginal_note": marginal_note,
        "part": f"Schedule, Chapter {src['chapter']}",
        "division": "",
        "heading": src["title"],
        "text": text,
        "history": "",
        "last_amended": "",
        "current_to": src["edition"],
        "citation": citation,
        "source_url": src["url"],
    }


def _item_line(row, heading_desc):
    """Render one row below a heading as a single line of chunk text.

    *row* is the ``(item, ss, description, unit, mfn, pref)`` tuple stored by
    parse_chapter.  The description is left out when it merely repeats the
    heading's own description, and a unit of ``"-"`` is treated as absent.
    """
    item, ss, item_desc, unit, mfn, pref = row
    label = f" {item}"
    if ss:
        label += f" (SS {ss})"

    rate_bits = [
        text
        for keep, text in (
            (mfn, f"MFN {mfn}"),
            (pref, f"Pref: {pref}"),
            (unit and unit != "-", f"Unit {unit}"),
        )
        if keep
    ]

    details = []
    if item_desc and item_desc != heading_desc:
        details.append(item_desc)
    if rate_bits:
        details.append(" | ".join(rate_bits))

    if not details:
        return label
    return label + " — " + "; ".join(details)


def parse_chapter(html, src):
    """Parse one Customs Tariff Schedule chapter into chunks.

    *html* is the chapter page and *src* its entry from SOURCES.  The result
    is a list of chunk dicts: first an optional chunk holding the chapter's
    Notes, then one chunk per 4-digit heading found in the first ``<table>``
    inside ``<main>``, in sorted heading order.  Returns ``[]`` when the page
    has no ``<main>`` element, and only the notes chunk (if any) when there
    is no table.
    """
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    # Remove <sup> elements so their text does not end up in the extracted text.
    for marker in main.find_all("sup"):
        marker.decompose()

    chapter = src["chapter"]
    chunks = []

    # --- chapter-level Notes / Subheading Notes ---------------------------
    notes_text = _collect_notes(main)
    if notes_text:
        chunks.append(_schedule_chunk(
            src,
            chunk_id=f"tariff-sched-ch{chapter}-notes",
            section=f"Sch-Ch{chapter}-Notes",
            marginal_note=f"Chapter {chapter} Notes — {src['title']}",
            text=notes_text,
            citation=f"Customs Tariff, Sched., Ch. {chapter}, Notes",
        ))

    # --- tariff table: group rows under their 4-digit heading -------------
    table = main.find("table")
    if table is None:
        return chunks

    heading_desc = {}
    rows_by_heading = defaultdict(list)
    for tr in table.find_all("tr"):
        cells = tr.find_all(["td", "th"], recursive=False)
        if not cells:
            continue
        code = _norm(cells[0].get_text(" ", strip=True))
        if code in ("", "Tariff Item"):
            # Blank first cell or the table's header row.
            continue
        heading = _heading_of(code)
        if heading is None:
            continue

        # Columns 1-5 of the row; short rows are padded with empty strings.
        rest = [_norm(c.get_text(" ", strip=True)) for c in cells[1:6]]
        rest += [""] * (5 - len(rest))
        ss, desc, unit, mfn, pref = rest

        # A row whose code has exactly four digits is the heading row itself;
        # the first such row with a description names the heading.
        is_heading_row = len(re.sub(r"\D", "", code)) == 4
        if is_heading_row and desc:
            heading_desc.setdefault(heading, desc)
        rows_by_heading[heading].append((code, ss, desc, unit, mfn, pref))

    # --- one chunk per heading ---------------------------------------------
    for heading in sorted(rows_by_heading):
        rows = rows_by_heading[heading]
        # Fall back to the first non-empty description among the rows when
        # the table had no described heading row.
        desc = heading_desc.get(heading) or next(
            (r[2] for r in rows if r[2]), "")

        # Heading rows are already represented by the chunk's first line.
        item_lines = [
            _item_line(row, desc)
            for row in rows
            if len(re.sub(r"\D", "", row[0])) != 4
        ]

        body = f"Heading {heading} — {desc}"
        if item_lines:
            body += "\n\nTariff items:\n" + "\n".join(item_lines)

        chunks.append(_schedule_chunk(
            src,
            chunk_id=f"tariff-sched-{heading.replace('.', '-')}",
            section=f"Sch-{heading}",
            marginal_note=desc[:200],
            text=body,
            citation=f"Customs Tariff, Sched., heading {heading}",
        ))

    return chunks
|
|
|
|
def build():
    """Download, parse and serialise every chapter listed in SOURCES.

    Each chapter page is fetched (or read from the cache under RAW) and
    parsed into chunks; the combined list is written to OUT as UTF-8 JSON.
    A chapter that fails to download or parse is reported on stdout and
    skipped, so the remaining chapters are still ingested.  The closing
    summary counts only the chapters that were actually ingested.
    """
    all_chunks = []
    ingested = 0
    for src in SOURCES.values():
        print(f"Ingesting Customs Tariff Schedule {src['code']} ...")
        try:
            html = _fetch(src["url"], RAW / f"{src['code']}.html")
            chunks = parse_chapter(html, src)
        except Exception as exc:
            # Best-effort ingest: report the failure and try the next chapter.
            print(f" !! {src['code']}: {type(exc).__name__}: {exc}")
            continue
        all_chunks.extend(chunks)
        ingested += 1
        print(f" {len(chunks)} chunks")
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
                   encoding="utf-8")
    print(f"\n{len(all_chunks)} tariff-schedule chunks from {ingested} "
          f"chapter(s) -> {OUT}")
|
|
|
|
# Running the module as a script performs the full ingest.
if __name__ == "__main__":
    build()
|
|