Spaces:

Beemer0
/

CanLex

Running

Beemer Claude Opus 4.7 committed 4 days ago

Commit

8c9bc18

1 Parent(s): ef6e3dc

Ingest Customs Tariff Schedule Chapters 98 and 99

The CBSA special-classification provisions: Chapter 98 (non-commercial -- traveller
exemptions, settler's effects, returning Canadians, conveyances temporarily imported,
charitable donations, prohibited firearms) and Chapter 99 (commercial -- end-use
programs, temporary importations for industry). 96 chunks, one per 4-digit heading
plus a notes chunk per chapter. Source: CBSA's 2026 HTML edition of the Customs
Tariff (the Schedule is not in the Justice Laws XML); bump the year in SOURCES to refresh.

Tagged doc_type='legislation', act_code='C-54.011'. Five new tariff-schedule gold
eval questions added (settler's effects, alcohol/tobacco allowance, non-resident
conveyance, prohibited firearms, foreign-country representatives).

134-question eval: Hit@1 0.75 / Hit@3 0.89 / Hit@5 0.93 / Hit@10 0.96 / MRR 0.83
(vs pre-add 129-Q baseline 0.74 / 0.88 / 0.92 / 0.96 / 0.82 -- flat or up everywhere).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (3) hide show

canlex/tariff_schedule.py +259 -0
data/eval/questions.json +5 -0
data/processed/tariff_schedule.json +0 -0

canlex/tariff_schedule.py ADDED Viewed

	@@ -0,0 +1,259 @@

+"""Ingest Chapters 98 and 99 of the Customs Tariff Schedule.
+The Customs Tariff's Schedule is the Harmonized System classification of goods
+-- chapters 1-97 categorise every imported good for duty purposes. Those 97
+chapters are huge and outside CanLex's scope, but **chapters 98 and 99 are
+different**: they carry Canada's "special classification" provisions, which
+matter for almost every CBSA border interaction:
+  - Chapter 98 (non-commercial): traveller exemptions, settler's effects,
+    Canadian goods returned, ancestral household effects, conveyances
+    temporarily imported by a resident, etc.
+  - Chapter 99 (commercial): temporary importations, end-use programs,
+    government imports, reduced-rate goods for specific industries.
+Source: the CBSA's HTML edition of the current Customs Tariff. The Justice
+Laws XML for the Act (C-54.011) does NOT include the Schedule.
+Chunking is one chunk per 4-digit HEADING (98.01, 98.02, ...) plus one chunk
+per chapter for its Notes and Subheading Notes -- a heading is the natural
+unit of legal classification (the eight- and ten-digit items below it are the
+same rule with finer rate granularity).
+    py -m canlex.tariff_schedule
+"""
+import json
+import re
+import time
+import urllib.request
+from collections import defaultdict
+from bs4 import BeautifulSoup
+from .config import PROCESSED_DIR, RAW_DIR
+# RAW is the directory the downloaded chapter HTML is cached in; OUT is the
+# JSON file the parsed chunks are written to.
+RAW = RAW_DIR / "tariff_schedule"
+OUT = PROCESSED_DIR / "tariff_schedule.json"
+# Browser-style User-Agent header sent with every download request.
+_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+       "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
+# The 2026 edition is the current Customs Tariff at the time of writing. To
+# refresh: bump the year in the URL and `edition` once CBSA publishes the next.
+# One entry per ingested chapter:
+#   code    -- identifies the chapter in log output and cache file names
+#   chapter -- chapter number, used in chunk ids, sections and citations
+#   title   -- chapter title, stored as each chunk's `heading`
+#   url     -- the CBSA page that is downloaded and parsed
+#   edition -- recorded as each chunk's `current_to`
+SOURCES = {
+    "ch98": {
+        "code": "ch98",
+        "chapter": "98",
+        "title": "Special classification provisions — non-commercial",
+        "url": ("https://www.cbsa-asfc.gc.ca/trade-commerce/tariff-tarif/"
+                "2026/html/00/ch98-eng.html"),
+        "edition": "2026",
+    },
+    "ch99": {
+        "code": "ch99",
+        "chapter": "99",
+        "title": "Special classification provisions — commercial",
+        "url": ("https://www.cbsa-asfc.gc.ca/trade-commerce/tariff-tarif/"
+                "2026/html/00/ch99-eng.html"),
+        "edition": "2026",
+    },
+}
+def _norm(text):
+    """Collapse every run of whitespace (non-breaking spaces included) into a
+    single space and trim both ends. A falsy `text` yields ''."""
+    if not text:
+        return ""
+    without_nbsp = text.replace("\xa0", " ")
+    collapsed = re.sub(r"\s+", " ", without_nbsp)
+    return collapsed.strip()
+def _fetch(url, dest):
+    """Return the bytes served at `url`, using the path `dest` as a disk cache.
+    If `dest` already exists its contents are returned and no request is made.
+    Otherwise the page is downloaded with the module's browser User-Agent,
+    stored at `dest` (parent directories are created as needed) and returned.
+    The download is written to a temporary sibling file and then renamed into
+    place, so an interrupted write cannot leave a truncated file that later
+    runs would mistake for a valid cache entry. Network and file errors
+    propagate to the caller."""
+    if dest.exists():
+        return dest.read_bytes()
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    req = urllib.request.Request(url, headers={"User-Agent": _UA})
+    with urllib.request.urlopen(req, timeout=60) as resp:
+        data = resp.read()
+    # Write-then-rename keeps `dest` either absent or complete.
+    tmp = dest.with_name(dest.name + ".part")
+    tmp.write_bytes(data)
+    tmp.replace(dest)
+    # Short pause after each real download (presumably to go easy on the
+    # CBSA server -- cached reads above skip it).
+    time.sleep(0.5)
+    return data
+def _heading_of(item):
+    """Return the 4-digit heading ('NN.NN') that a schedule code rolls up to.
+    Codes in the schedule come at three depths that share their first four
+    digits:
+      - '98.01'       heading      (4 digits)
+      - '9801.10'     subheading   (6 digits)
+      - '9801.10.10'  tariff item  (8 digits)
+    Non-digit characters are ignored. A value with fewer than four digits has
+    no heading and yields None."""
+    only_digits = re.sub(r"\D", "", item)
+    if len(only_digits) >= 4:
+        chapter_part, position_part = only_digits[:2], only_digits[2:4]
+        return f"{chapter_part}.{position_part}"
+    return None
+def _collect_notes(main):
+    """Gather the chapter's "Notes" and "Subheading Notes" sections from
+    `main` and return them as one text block ('' when neither is found).
+    Each section is located by its <h2> title and runs over the following
+    siblings up to the next <h2>. Sections are emitted as 'Label:' followed by
+    one line per non-empty sibling, and separated by a blank line."""
+    def title_test(label):
+        # Predicate over an <h2>'s string: it must contain `label`, and the
+        # plain "Notes" label must not match the "Subheading Notes" title.
+        def check(s):
+            if not s or label not in s:
+                return False
+            return label != "Notes" or "Subheading" not in s
+        return check
+    sections = []
+    for label in ("Notes", "Subheading Notes"):
+        title = main.find("h2", string=title_test(label))
+        if not title:
+            continue
+        paragraphs = []
+        node = title.find_next_sibling()
+        while node and node.name != "h2":
+            text = _norm(node.get_text(" ", strip=True))
+            if text:
+                paragraphs.append(text)
+            node = node.find_next_sibling()
+        if paragraphs:
+            sections.append(f"{label}:\n" + "\n".join(paragraphs))
+    return "\n\n".join(sections)
+def parse_chapter(html, src):
+    """Parse one Customs Tariff Schedule chapter into chunks.
+    `html` is the chapter page markup and `src` is one SOURCES entry; its
+    'chapter', 'title', 'url' and 'edition' values are used here.
+    Returns a list of chunk dicts: the chapter notes chunk first (when any
+    notes text was found), then one chunk per 4-digit heading in sorted
+    order. Returns [] when the page has no <main> element, and only the notes
+    chunk (if any) when <main> contains no <table>."""
+    soup = BeautifulSoup(html, "html.parser")
+    main = soup.find("main")
+    if main is None:
+        return []
+    # Remove <sup> elements before any text is extracted (presumably footnote
+    # markers -- confirm against the CBSA markup).
+    for sup in main.find_all("sup"):
+        sup.decompose()
+    chunks = []
+    chapter = src["chapter"]
+    citation_root = f"Customs Tariff, Sched., Ch. {chapter}"
+    # Chapter Notes + Subheading Notes -- one chunk per chapter.
+    notes_body = _collect_notes(main)
+    if notes_body:
+        chunks.append({
+            "id": f"tariff-sched-ch{chapter}-notes",
+            "doc_type": "legislation",
+            "act_code": "C-54.011",
+            "act_short": "Customs Tariff",
+            "act_name": "Customs Tariff",
+            "section": f"Sch-Ch{chapter}-Notes",
+            "marginal_note": (f"Chapter {chapter} Notes — "
+                              f"{src['title']}"),
+            "part": f"Schedule, Chapter {chapter}",
+            "division": "",
+            "heading": src["title"],
+            "text": notes_body,
+            "history": "",
+            "last_amended": "",
+            "current_to": src["edition"],
+            "citation": f"{citation_root}, Notes",
+            "source_url": src["url"],
+        })
+    # Walk every row in the schedule table, grouping by 4-digit heading.
+    # Only the first <table> inside <main> is read.
+    table = main.find("table")
+    if table is None:
+        return chunks
+    rows_by_heading = defaultdict(list)   # heading -> list of (item, ss, desc, unit, mfn, pref)
+    heading_desc = {}                     # heading -> the 4-digit row's description
+    for tr in table.find_all("tr"):
+        # Direct child cells only, so cells of any nested table are not mixed in.
+        cells = tr.find_all(["td", "th"], recursive=False)
+        if not cells:
+            continue
+        first = _norm(cells[0].get_text(" ", strip=True))
+        if not first or first == "Tariff Item":
+            continue                       # header row or blank
+        heading = _heading_of(first)
+        if heading is None:
+            continue
+        # Normalised text of cell i, or "" when the row has fewer cells.
+        def col(i):
+            return _norm(cells[i].get_text(" ", strip=True)) if i < len(cells) else ""
+        # Column order after the code cell is taken to be: ss, description,
+        # unit, MFN rate, preferential rate -- verify if the page layout changes.
+        ss, desc, unit, mfn, pref = col(1), col(2), col(3), col(4), col(5)
+        # A 4-digit row carries only the heading number and description -- its
+        # description is stored once (first one wins). Every row, heading rows
+        # included, is then appended; heading rows are skipped later when the
+        # item list is built.
+        digits = re.sub(r"\D", "", first)
+        if len(digits) == 4 and desc and heading not in heading_desc:
+            heading_desc[heading] = desc
+        rows_by_heading[heading].append((first, ss, desc, unit, mfn, pref))
+    for heading in sorted(rows_by_heading):
+        desc = heading_desc.get(heading, "")
+        rows = rows_by_heading[heading]
+        if not desc:
+            # No bare 4-digit row -- fall back to the first row's description.
+            desc = next((r[2] for r in rows if r[2]), "")
+        # Format the chunk: heading description first, then the items as a
+        # readable list. Suppress description on item/subheading rows that just
+        # repeat the heading desc verbatim (very common in this Schedule).
+        lines = []
+        for item, ss, item_desc, unit, mfn, pref in rows:
+            d = re.sub(r"\D", "", item)
+            if len(d) == 4:
+                continue                   # the heading row, already in desc
+            label = f"  {item}"
+            if ss:
+                label += f" (SS {ss})"
+            extras = []
+            if item_desc and item_desc != desc:
+                extras.append(item_desc)
+            # Rate details are joined with " | "; a unit of "-" is treated as
+            # no unit and left out.
+            rate = []
+            if mfn:
+                rate.append(f"MFN {mfn}")
+            if pref:
+                rate.append(f"Pref: {pref}")
+            if unit and unit != "-":
+                rate.append(f"Unit {unit}")
+            if rate:
+                extras.append(" | ".join(rate))
+            # Rows with nothing extra to say are listed by code alone.
+            tail = " — " + "; ".join(extras) if extras else ""
+            lines.append(label + tail)
+        body = f"Heading {heading} — {desc}"
+        if lines:
+            body += "\n\nTariff items:\n" + "\n".join(lines)
+        chunks.append({
+            "id": f"tariff-sched-{heading.replace('.', '-')}",
+            "doc_type": "legislation",
+            "act_code": "C-54.011",
+            "act_short": "Customs Tariff",
+            "act_name": "Customs Tariff",
+            "section": f"Sch-{heading}",
+            # The marginal note is the heading description capped at 200 chars.
+            "marginal_note": desc[:200],
+            "part": f"Schedule, Chapter {chapter}",
+            "division": "",
+            "heading": src["title"],
+            "text": body,
+            "history": "",
+            "last_amended": "",
+            "current_to": src["edition"],
+            "citation": f"Customs Tariff, Sched., heading {heading}",
+            "source_url": src["url"],
+        })
+    return chunks
+def build():
+    """Fetch, parse and write out every chapter listed in SOURCES.
+    Each chapter's HTML is cached under RAW in a file named after both the
+    chapter code and the edition, so bumping a source's `edition` triggers a
+    fresh download instead of silently reusing the previous edition's page.
+    A chapter that fails to download or parse is reported and skipped; the
+    chunks from the remaining chapters are still written to OUT as JSON, and
+    the closing summary counts only the chapters that succeeded and names the
+    ones that failed."""
+    all_chunks = []
+    failed = []
+    for src in SOURCES.values():
+        print(f"Ingesting Customs Tariff Schedule {src['code']} ...")
+        # Key the cache on the edition as well as the chapter code.
+        cache = RAW / f"{src['code']}-{src['edition']}.html"
+        try:
+            html = _fetch(src["url"], cache)
+            chunks = parse_chapter(html, src)
+        except Exception as exc:
+            # Best-effort: report the failure and carry on with the rest.
+            print(f"  !! {src['code']}: {type(exc).__name__}: {exc}")
+            failed.append(src["code"])
+            continue
+        all_chunks.extend(chunks)
+        print(f"  {len(chunks)} chunks")
+    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
+    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
+                   encoding="utf-8")
+    succeeded = len(SOURCES) - len(failed)
+    print(f"\n{len(all_chunks)} tariff-schedule chunks from {succeeded} "
+          f"chapter(s) -> {OUT}")
+    if failed:
+        print(f"  !! {len(failed)} chapter(s) failed and are missing from the "
+              f"output: {', '.join(failed)}")
+# Script entry point: `py -m canlex.tariff_schedule` runs the full ingest.
+if __name__ == "__main__":
+    build()

data/eval/questions.json CHANGED Viewed

@@ -30,6 +30,11 @@
   {"query": "Advance information about commercial goods before they arrive in Canada", "answers": [["Customs Act", "12.1"]]},
   {"query": "How is the value for duty of imported goods determined?", "answers": [["Customs Act", "46"], ["Customs Act", "47"], ["Customs Act", "48"]]},
   {"query": "How are imported goods classified under the Customs Tariff?", "answers": [["Customs Tariff", "10"]]},
   {"query": "Must travellers report large amounts of currency when crossing the border?", "answers": [["PCMLTFA", "12"]]},
   {"query": "Can an officer seize currency that was not reported at the border?", "answers": [["PCMLTFA", "18"]]},
   {"query": "How does someone appeal the forfeiture of seized currency to the Federal Court?", "answers": [["PCMLTFA", "30"]]},

   {"query": "Advance information about commercial goods before they arrive in Canada", "answers": [["Customs Act", "12.1"]]},
   {"query": "How is the value for duty of imported goods determined?", "answers": [["Customs Act", "46"], ["Customs Act", "47"], ["Customs Act", "48"]]},
   {"query": "How are imported goods classified under the Customs Tariff?", "answers": [["Customs Tariff", "10"]]},
+  {"query": "Which tariff heading in the Customs Tariff Schedule covers a settler's effects on first arrival?", "answers": [["Customs Tariff", "Sch-98.07"]]},
+  {"query": "What allowance of alcohol and tobacco may a traveller bring back to Canada duty free?", "answers": [["Customs Tariff", "Sch-98.27"]]},
+  {"query": "Conveyance temporarily imported by a non-resident visiting Canada -- customs tariff treatment", "answers": [["Customs Tariff", "Sch-98.03"]]},
+  {"query": "Tariff classification of prohibited firearms and weapons under the Customs Tariff", "answers": [["Customs Tariff", "Sch-98.98"]]},
+  {"query": "Tariff treatment of articles imported for personal use of representatives of foreign countries", "answers": [["Customs Tariff", "Sch-98.08"]]},
   {"query": "Must travellers report large amounts of currency when crossing the border?", "answers": [["PCMLTFA", "12"]]},
   {"query": "Can an officer seize currency that was not reported at the border?", "answers": [["PCMLTFA", "18"]]},
   {"query": "How does someone appeal the forfeiture of seized currency to the Federal Court?", "answers": [["PCMLTFA", "30"]]},

data/processed/tariff_schedule.json ADDED Viewed

The diff for this file is too large to render. See raw diff