Spaces:

Beemer0
/

CanLex

Running

File size: 9,447 Bytes

8c9bc18

"""Ingest Chapters 98 and 99 of the Customs Tariff Schedule.

The Customs Tariff's Schedule is the Harmonized System classification of goods
-- chapters 1-97 categorise every imported good for duty purposes. Those 97
chapters are huge and outside CanLex's scope, but **chapters 98 and 99 are
different**: they carry Canada's "special classification" provisions, which
matter for almost every CBSA border interaction:

  - Chapter 98 (non-commercial): traveller exemptions, settler's effects,
    Canadian goods returned, ancestral household effects, conveyances
    temporarily imported by a resident, etc.
  - Chapter 99 (commercial): temporary importations, end-use programs,
    government imports, reduced-rate goods for specific industries.

Source: the CBSA's HTML edition of the current Customs Tariff. The Justice
Laws XML for the Act (C-54.011) does NOT include the Schedule.

Chunking is one chunk per 4-digit HEADING (98.01, 98.02, ...) plus one chunk
per chapter for its Notes and Subheading Notes -- a heading is the natural
unit of legal classification (the eight- and ten-digit items below it are the
same rule with finer rate granularity).

Run as a module to fetch (or reuse the cached) HTML and rebuild the JSON output:

    py -m canlex.tariff_schedule
"""
import json
import re
import time
import urllib.request
from collections import defaultdict

from bs4 import BeautifulSoup

from .config import PROCESSED_DIR, RAW_DIR

# Directory holding the downloaded chapter HTML; build() stores one
# "<code>.html" file per chapter here and reuses it on later runs.
RAW = RAW_DIR / "tariff_schedule"
# JSON file that build() writes the combined list of chunks to.
OUT = PROCESSED_DIR / "tariff_schedule.json"

# User-Agent header sent with every download request; it imitates a desktop
# Chrome browser. NOTE(review): presumably the CBSA site rejects urllib's
# default agent -- confirm before changing or removing it.
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
       "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")

# Edition (publication year) of the Customs Tariff to ingest. The year is part
# of every download URL and is also recorded as each chunk's `current_to`, so
# it is defined exactly once. To refresh: bump it once CBSA publishes the next
# edition (and clear the cached HTML so the new pages are downloaded).
_EDITION = "2026"

# Both chapter pages share one URL pattern; only the edition and the chapter
# number vary.
_CHAPTER_URL = ("https://www.cbsa-asfc.gc.ca/trade-commerce/tariff-tarif/"
                "{edition}/html/00/ch{chapter}-eng.html")

# Chapter number -> chapter title, in ingestion order.
_CHAPTER_TITLES = {
    "98": "Special classification provisions — non-commercial",
    "99": "Special classification provisions — commercial",
}

# One entry per ingested chapter, keyed by its short code ("ch98", "ch99").
SOURCES = {
    f"ch{chapter}": {
        "code": f"ch{chapter}",
        "chapter": chapter,
        "title": title,
        "url": _CHAPTER_URL.format(edition=_EDITION, chapter=chapter),
        "edition": _EDITION,
    }
    for chapter, title in _CHAPTER_TITLES.items()
}


def _norm(text):
    """Collapse every run of whitespace in *text* to one space and trim it.

    Non-breaking spaces are first turned into ordinary spaces. A falsy value
    (``None`` or ``""``) yields the empty string.
    """
    raw = text if text else ""
    spaced = raw.replace("\xa0", " ")
    collapsed = re.sub(r"\s+", " ", spaced)
    return collapsed.strip()


def _fetch(url, dest):
    """Return the bytes served at *url*, using *dest* as an on-disk cache.

    Args:
        url: Address to download when no cached copy exists.
        dest: Path of the cache file. If it already exists its contents are
            returned and no request is made.

    Returns:
        The raw response body (or the cached file's contents) as ``bytes``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on a failed request
        (e.g. ``urllib.error.URLError``), or ``OSError`` if the cache file
        cannot be written.
    """
    if dest.exists():
        return dest.read_bytes()
    dest.parent.mkdir(parents=True, exist_ok=True)
    req = urllib.request.Request(url, headers={"User-Agent": _UA})
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = resp.read()
    # Write to a sibling temp file and rename it into place: an interrupted
    # write must never leave a truncated `dest` behind, because any existing
    # `dest` is trusted as a complete cached copy on every later run.
    tmp = dest.with_name(dest.name + ".part")
    tmp.write_bytes(data)
    tmp.replace(dest)
    # Short pause after each real download (cache hits return before this);
    # presumably to avoid hammering the server with back-to-back requests.
    time.sleep(0.5)
    return data


def _heading_of(item):
    """Return the 4-digit heading ('NN.NN') that a tariff code belongs to.

    The schedule writes codes at three depths, all of which share their first
    four digits with the heading they sit under:
      - '98.01'       heading      (4 digits)
      - '9801.10'     subheading   (6 digits)
      - '9801.10.10'  tariff item  (8 digits)
    Punctuation and any other non-digit characters are ignored. ``None`` is
    returned when *item* contains fewer than four digits.
    """
    digits = "".join(re.findall(r"\d", item))
    if len(digits) >= 4:
        chapter, position = digits[:2], digits[2:4]
        return f"{chapter}.{position}"
    return None


def _collect_notes(main):
    """Gather the chapter's "Notes" and "Subheading Notes" into one string.

    For each of the two labels, the first ``<h2>`` whose string contains the
    label is located and the normalised text of every following sibling is
    collected until the next ``<h2>``. Each non-empty section is rendered as
    ``"<label>:\\n<one line per sibling>"``; sections are separated by a blank
    line. Returns ``""`` when neither section yields any text.
    """
    def heading_test(label):
        def accepts(text):
            if not text or label not in text:
                return False
            # "Notes" is a substring of "Subheading Notes"; the plain label
            # must not claim the subheading section's <h2>.
            return not (label == "Notes" and "Subheading" in text)
        return accepts

    sections = []
    for label in ("Notes", "Subheading Notes"):
        start = main.find("h2", string=heading_test(label))
        if not start:
            continue
        paragraphs = []
        node = start.find_next_sibling()
        while node and node.name != "h2":
            text = _norm(node.get_text(" ", strip=True))
            if text:
                paragraphs.append(text)
            node = node.find_next_sibling()
        if paragraphs:
            sections.append(f"{label}:\n" + "\n".join(paragraphs))
    return "\n\n".join(sections)


def _chunk_record(src, *, chunk_id, section, marginal_note, text, citation):
    """Build one output chunk, filling in the fields every Schedule chunk shares."""
    return {
        "id": chunk_id,
        "doc_type": "legislation",
        "act_code": "C-54.011",
        "act_short": "Customs Tariff",
        "act_name": "Customs Tariff",
        "section": section,
        "marginal_note": marginal_note,
        "part": f"Schedule, Chapter {src['chapter']}",
        "division": "",
        "heading": src["title"],
        "text": text,
        "history": "",
        "last_amended": "",
        "current_to": src["edition"],
        "citation": citation,
        "source_url": src["url"],
    }


def _is_heading_code(item):
    """Return True when *item* has exactly four digits, i.e. is a bare heading."""
    return len(re.sub(r"\D", "", item)) == 4


def _group_rows(table):
    """Group the schedule table's rows by their 4-digit heading.

    Returns ``(rows_by_heading, heading_desc)``:
      - rows_by_heading: heading -> list of (item, ss, desc, unit, mfn, pref)
        tuples in document order (the bare heading row included).
      - heading_desc: heading -> description of the first bare 4-digit row
        that has a non-empty description.
    Rows with no cells, an empty first cell, the "Tariff Item" header, or a
    first cell with fewer than four digits are ignored.
    """
    rows_by_heading = defaultdict(list)
    heading_desc = {}
    for tr in table.find_all("tr"):
        cells = tr.find_all(["td", "th"], recursive=False)
        if not cells:
            continue
        first = _norm(cells[0].get_text(" ", strip=True))
        if not first or first == "Tariff Item":
            continue                       # header row or blank
        heading = _heading_of(first)
        if heading is None:
            continue
        # Short rows are padded with "" so the five remaining columns
        # (SS, description, unit, MFN, preferential) always unpack.
        rest = [_norm(cell.get_text(" ", strip=True)) for cell in cells[1:6]]
        rest += [""] * (5 - len(rest))
        ss, desc, unit, mfn, pref = rest
        if _is_heading_code(first) and desc and heading not in heading_desc:
            heading_desc[heading] = desc
        rows_by_heading[heading].append((first, ss, desc, unit, mfn, pref))
    return rows_by_heading, heading_desc


def _format_item_line(row, heading_desc):
    """Render one subheading/item row as an indented line of the chunk text."""
    item, ss, item_desc, unit, mfn, pref = row
    label = f"  {item}"
    if ss:
        label += f" (SS {ss})"
    extras = []
    # A description that merely repeats the heading's adds nothing; drop it.
    if item_desc and item_desc != heading_desc:
        extras.append(item_desc)
    rate = []
    if mfn:
        rate.append(f"MFN {mfn}")
    if pref:
        rate.append(f"Pref: {pref}")
    if unit and unit != "-":
        rate.append(f"Unit {unit}")
    if rate:
        extras.append(" | ".join(rate))
    if not extras:
        return label
    return label + " — " + "; ".join(extras)


def parse_chapter(html, src):
    """Parse one Customs Tariff Schedule chapter into chunks.

    Args:
        html: The chapter page's markup, as accepted by ``BeautifulSoup``.
        src: Chapter descriptor providing the ``chapter``, ``title``,
            ``edition`` and ``url`` keys.

    Returns:
        A list of chunk dicts: at most one "Notes" chunk for the chapter,
        followed by one chunk per 4-digit heading in sorted heading order.
        The list is empty when the page has no ``<main>`` element, and holds
        only the Notes chunk (if any) when ``<main>`` contains no table.
    """
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []
    # Drop superscripts (presumably footnote markers) so they do not leak
    # into codes and descriptions.
    for sup in main.find_all("sup"):
        sup.decompose()

    chunks = []
    chapter = src["chapter"]

    # Chapter Notes + Subheading Notes -- one chunk per chapter.
    notes_body = _collect_notes(main)
    if notes_body:
        chunks.append(_chunk_record(
            src,
            chunk_id=f"tariff-sched-ch{chapter}-notes",
            section=f"Sch-Ch{chapter}-Notes",
            marginal_note=f"Chapter {chapter} Notes — {src['title']}",
            text=notes_body,
            citation=f"Customs Tariff, Sched., Ch. {chapter}, Notes",
        ))

    # Only the first table inside <main> is read.
    table = main.find("table")
    if table is None:
        return chunks

    rows_by_heading, heading_desc = _group_rows(table)
    for heading in sorted(rows_by_heading):
        rows = rows_by_heading[heading]
        desc = heading_desc.get(heading, "")
        if not desc:
            # No bare 4-digit row -- fall back to the first row's description.
            desc = next((r[2] for r in rows if r[2]), "")

        # Heading description first, then the items as a readable list; the
        # heading's own row is already represented by `desc`.
        lines = [_format_item_line(row, desc)
                 for row in rows if not _is_heading_code(row[0])]
        body = f"Heading {heading} — {desc}"
        if lines:
            body += "\n\nTariff items:\n" + "\n".join(lines)

        chunks.append(_chunk_record(
            src,
            chunk_id=f"tariff-sched-{heading.replace('.', '-')}",
            section=f"Sch-{heading}",
            marginal_note=desc[:200],
            text=body,
            citation=f"Customs Tariff, Sched., heading {heading}",
        ))

    return chunks


def build():
    """Fetch and parse every chapter in SOURCES and write the chunks to OUT.

    Each chapter is handled best-effort: a failure while fetching or parsing
    one chapter is printed and that chapter is skipped, so the others are
    still ingested. The combined list is written to OUT as UTF-8 JSON.

    If the run produced no chunks at all and OUT already exists, the existing
    file is left untouched rather than being replaced by an empty list.
    """
    all_chunks = []
    for src in SOURCES.values():
        print(f"Ingesting Customs Tariff Schedule {src['code']} ...")
        try:
            html = _fetch(src["url"], RAW / f"{src['code']}.html")
            chunks = parse_chapter(html, src)
        except Exception as exc:
            # Top-level boundary: report the failure and carry on with the
            # remaining chapters.
            print(f"  !! {src['code']}: {type(exc).__name__}: {exc}")
            continue
        all_chunks.extend(chunks)
        print(f"  {len(chunks)} chunks")

    # A run that yields nothing (every download failed, or the page layout
    # changed) must not wipe out a previously built, good output file.
    if not all_chunks and OUT.exists():
        print(f"\n!! no chunks produced; keeping existing {OUT}")
        return

    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
                   encoding="utf-8")
    print(f"\n{len(all_chunks)} tariff-schedule chunks from {len(SOURCES)} "
          f"chapter(s) -> {OUT}")


# Entry point when the module is executed directly (e.g. via `-m`): run the
# full ingest.
if __name__ == "__main__":
    build()