Spaces:

Beemer0
/

CanLex

Running

CanLex / canlex /tariff_schedule.py

Beemer

Ingest Customs Tariff Schedule Chapters 98 and 99

8c9bc18 4 days ago

9.45 kB

	"""Ingest Chapters 98 and 99 of the Customs Tariff Schedule.

	The Customs Tariff's Schedule is the Harmonized System classification of goods
	-- chapters 1-97 categorise every imported good for duty purposes. Those 97
	chapters are huge and outside CanLex's scope, but **chapters 98 and 99 are
	different**: they carry Canada's "special classification" provisions, which
	matter for almost every CBSA border interaction:

	- Chapter 98 (non-commercial): traveller exemptions, settler's effects,
	Canadian goods returned, ancestral household effects, conveyances
	temporarily imported by a resident, etc.
	- Chapter 99 (commercial): temporary importations, end-use programs,
	government imports, reduced-rate goods for specific industries.

	Source: the CBSA's HTML edition of the current Customs Tariff. The Justice
	Laws XML for the Act (C-54.011) does NOT include the Schedule.

	Chunking is one chunk per 4-digit HEADING (98.01, 98.02, ...) plus one chunk
	per chapter for its Notes and Subheading Notes -- a heading is the natural
	unit of legal classification (the eight- and ten-digit items below it are the
	same rule with finer rate granularity).

	Usage: py -m canlex.tariff_schedule
	(use ``python -m canlex.tariff_schedule`` where the ``py`` launcher is not installed)
	"""
	import json
	import re
	import time
	import urllib.request
	from collections import defaultdict

	from bs4 import BeautifulSoup

	from .config import PROCESSED_DIR, RAW_DIR

	# Browser-like User-Agent string; _fetch sends it as the User-Agent header.
	_UA = (
	    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
	    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
	)

	# build() writes the combined list of chunks here as JSON.
	OUT = PROCESSED_DIR / "tariff_schedule.json"
	# build() caches each chapter's downloaded HTML in this directory.
	RAW = RAW_DIR / "tariff_schedule"

	# Edition (year) of the Customs Tariff being ingested. The 2026 edition is the
	# current one at the time of writing. To refresh: bump this single value once
	# CBSA publishes the next edition -- every chapter URL and every entry's
	# `edition` field is derived from it, so they cannot drift apart.
	_EDITION = "2026"

	# URL of one chapter's English HTML page within a given edition.
	_CHAPTER_URL = (
	    "https://www.cbsa-asfc.gc.ca/trade-commerce/tariff-tarif/"
	    "{edition}/html/00/ch{chapter}-eng.html"
	)

	# Chapter number -> chapter title, in ingestion order.
	_CHAPTER_TITLES = {
	    "98": "Special classification provisions — non-commercial",
	    "99": "Special classification provisions — commercial",
	}

	# One entry per ingested chapter, keyed by its source code ("ch98", "ch99").
	# Each entry carries: code, chapter, title, url, edition (all strings).
	SOURCES = {
	    f"ch{chapter}": {
	        "code": f"ch{chapter}",
	        "chapter": chapter,
	        "title": title,
	        "url": _CHAPTER_URL.format(edition=_EDITION, chapter=chapter),
	        "edition": _EDITION,
	    }
	    for chapter, title in _CHAPTER_TITLES.items()
	}


	def _norm(text):
	    """Collapse whitespace in *text* to single spaces and trim both ends.

	    ``None`` (or any other falsy value) is treated as the empty string, and
	    non-breaking spaces (U+00A0) are turned into ordinary spaces before the
	    whitespace runs are collapsed.
	    """
	    raw = text if text else ""
	    without_nbsp = raw.replace("\xa0", " ")
	    collapsed = re.sub(r"\s+", " ", without_nbsp)
	    return collapsed.strip()


	def _fetch(url, dest):
	    """Return the bytes at *url*, using the file *dest* as an on-disk cache.

	    If *dest* already exists, its contents are returned and no request is
	    made. Otherwise *url* is downloaded (60 s timeout, browser-like
	    User-Agent), stored at *dest* -- creating parent directories as needed --
	    and the downloaded bytes are returned.

	    The download is first written to a sibling ``<name>.part`` file and then
	    renamed over *dest*, so an interrupted write cannot leave a truncated
	    file that later runs would mistake for a valid cache entry.

	    *dest* is expected to behave like a ``pathlib.Path``. Errors from the
	    request or from the file operations are not handled here and propagate
	    to the caller.
	    """
	    if dest.exists():
	        return dest.read_bytes()
	    dest.parent.mkdir(parents=True, exist_ok=True)
	    req = urllib.request.Request(url, headers={"User-Agent": _UA})
	    with urllib.request.urlopen(req, timeout=60) as resp:
	        data = resp.read()
	    # Write-then-rename: *dest* only ever appears with its full contents.
	    partial = dest.with_name(dest.name + ".part")
	    partial.write_bytes(data)
	    partial.replace(dest)
	    # Pause after a real download (cache hits return before reaching here).
	    time.sleep(0.5)
	    return data


	def _heading_of(item):
	    """Return the 4-digit heading ('NN.NN') that a tariff code rolls up to.

	    Codes in the schedule come at three levels of detail, all of which map to
	    the same heading:
	    - '98.01' (4 digits) -- heading
	    - '9801.10' (6 digits) -- subheading
	    - '9801.10.10' (8 digits) -- tariff item

	    Only the digits of *item* are considered; punctuation and other
	    characters are ignored. Returns None when *item* contains fewer than four
	    digits.
	    """
	    digits = "".join(re.findall(r"\d", item))
	    if len(digits) >= 4:
	        chapter_part, position_part = digits[:2], digits[2:4]
	        return f"{chapter_part}.{position_part}"
	    return None


	def _collect_notes(main):
	    """Gather the chapter's "Notes" and "Subheading Notes" sections as text.

	    For each of the two labels, the first ``<h2>`` whose string contains the
	    label is located (for the plain "Notes" label, an ``<h2>`` that also
	    mentions "Subheading" is not accepted). The normalised text of each
	    following sibling element, up to the next ``<h2>``, is placed one per
	    line under a ``"<label>:"`` line. Sections that are missing or yield no
	    text are left out; the remaining ones are separated by a blank line.

	    *main* is expected to be a BeautifulSoup element (it is searched with
	    ``find``). Returns an empty string when neither section yields any text.
	    """
	    def heading_filter(label):
	        # Build the ``string=`` predicate for one label. The extra clause
	        # stops the plain "Notes" search from matching "Subheading Notes".
	        def accepts(s):
	            return s and label in s and (
	                label != "Notes" or "Subheading" not in s)
	        return accepts

	    sections = []
	    for label in ("Notes", "Subheading Notes"):
	        h2 = main.find("h2", string=heading_filter(label))
	        if not h2:
	            continue
	        texts = []
	        for sibling in h2.find_next_siblings():
	            if sibling.name == "h2":
	                break  # reached the next section
	            text = _norm(sibling.get_text(" ", strip=True))
	            if text:
	                texts.append(text)
	        if texts:
	            sections.append(f"{label}:\n" + "\n".join(texts))
	    return "\n\n".join(sections)


	def parse_chapter(html, src):
	    """Parse one Customs Tariff Schedule chapter into chunks.

	    Parameters:
	        html: the chapter page markup, as accepted by BeautifulSoup.
	        src: one SOURCES entry; its "chapter", "title", "edition" and "url"
	            values are copied into the chunks.

	    Returns a list of chunk dicts:
	    - one "notes" chunk holding the chapter's Notes / Subheading Notes, if
	      any text was found for them;
	    - one chunk per 4-digit heading found in the schedule table, in sorted
	      heading order, whose text is the heading description followed by a
	      list of its subheading / tariff-item rows.

	    Returns an empty list when the page has no ``<main>`` element, and only
	    the notes chunk (if any) when ``<main>`` contains no ``<table>``.
	    NOTE(review): only the first ``<table>`` inside ``<main>`` is read --
	    confirm the chapter pages never split the schedule across tables.
	    """
	    soup = BeautifulSoup(html, "html.parser")
	    main = soup.find("main")
	    if main is None:
	        return []
	    # Drop superscript markers so they do not end up inside the text.
	    for sup in main.find_all("sup"):
	        sup.decompose()

	    chapter = src["chapter"]
	    chunks = []

	    # Chapter Notes + Subheading Notes -- one chunk per chapter.
	    notes_body = _collect_notes(main)
	    if notes_body:
	        chunks.append(_schedule_chunk(
	            src,
	            chunk_id=f"tariff-sched-ch{chapter}-notes",
	            section=f"Sch-Ch{chapter}-Notes",
	            marginal_note=f"Chapter {chapter} Notes — {src['title']}",
	            text=notes_body,
	            citation=f"Customs Tariff, Sched., Ch. {chapter}, Notes",
	        ))

	    # Walk every row in the schedule table, grouping by 4-digit heading.
	    table = main.find("table")
	    if table is None:
	        return chunks

	    rows_by_heading = defaultdict(list)  # heading -> [(item, ss, desc, unit, mfn, pref)]
	    heading_desc = {}  # heading -> the 4-digit row's description

	    for tr in table.find_all("tr"):
	        cells = tr.find_all(["td", "th"], recursive=False)
	        if not cells:
	            continue
	        code = _norm(cells[0].get_text(" ", strip=True))
	        if not code or code == "Tariff Item":
	            continue  # header row or blank
	        heading = _heading_of(code)
	        if heading is None:
	            continue

	        # Columns 1-5 after the code; a row with fewer cells reads as "".
	        rest = [_norm(cell.get_text(" ", strip=True)) for cell in cells[1:6]]
	        rest += [""] * (5 - len(rest))
	        ss, desc, unit, mfn, pref = rest

	        # A 4-digit row carries the heading's own description -- keep the
	        # first non-empty one. Every row (heading rows included) is recorded.
	        if _is_heading_row(code) and desc and heading not in heading_desc:
	            heading_desc[heading] = desc
	        rows_by_heading[heading].append((code, ss, desc, unit, mfn, pref))

	    for heading in sorted(rows_by_heading):
	        rows = rows_by_heading[heading]
	        desc = heading_desc.get(heading, "")
	        if not desc:
	            # No bare 4-digit row -- fall back to the first row's description.
	            desc = next((row[2] for row in rows if row[2]), "")

	        # Heading rows are already represented by `desc`; list the rest.
	        lines = [_item_line(row, desc) for row in rows
	                 if not _is_heading_row(row[0])]

	        body = f"Heading {heading} — {desc}"
	        if lines:
	            body += "\n\nTariff items:\n" + "\n".join(lines)

	        chunks.append(_schedule_chunk(
	            src,
	            chunk_id=f"tariff-sched-{heading.replace('.', '-')}",
	            section=f"Sch-{heading}",
	            marginal_note=desc[:200],
	            text=body,
	            citation=f"Customs Tariff, Sched., heading {heading}",
	        ))

	    return chunks


	def _is_heading_row(code):
	    """Return True when *code* contains exactly four digits (a heading row)."""
	    return len(re.sub(r"\D", "", code)) == 4


	def _item_line(row, heading_desc):
	    """Format one subheading / tariff-item row as a line of the chunk text.

	    The row's description is omitted when it merely repeats *heading_desc*
	    verbatim (very common in this Schedule). Rates and unit, when present,
	    are joined with a plain " | " separator.
	    """
	    item, ss, item_desc, unit, mfn, pref = row
	    label = f" {item}"
	    if ss:
	        label += f" (SS {ss})"
	    extras = []
	    if item_desc and item_desc != heading_desc:
	        extras.append(item_desc)
	    rate = []
	    if mfn:
	        rate.append(f"MFN {mfn}")
	    if pref:
	        rate.append(f"Pref: {pref}")
	    if unit and unit != "-":
	        rate.append(f"Unit {unit}")
	    if rate:
	        # "|" needs no escaping in a Python string literal.
	        extras.append(" | ".join(rate))
	    if not extras:
	        return label
	    return label + " — " + "; ".join(extras)


	def _schedule_chunk(src, *, chunk_id, section, marginal_note, text, citation):
	    """Assemble one output chunk, filling in the fields every chunk shares."""
	    return {
	        "id": chunk_id,
	        "doc_type": "legislation",
	        "act_code": "C-54.011",
	        "act_short": "Customs Tariff",
	        "act_name": "Customs Tariff",
	        "section": section,
	        "marginal_note": marginal_note,
	        "part": f"Schedule, Chapter {src['chapter']}",
	        "division": "",
	        "heading": src["title"],
	        "text": text,
	        "history": "",
	        "last_amended": "",
	        "current_to": src["edition"],
	        "citation": citation,
	        "source_url": src["url"],
	    }


	def build():
	    """Ingest every chapter in SOURCES and write the combined chunks to OUT.

	    Each chapter is fetched (or read from the raw cache) and parsed on a
	    best-effort basis: if a chapter raises, the error is printed and the run
	    carries on with the remaining chapters. The JSON output is written in
	    every case, containing the chunks of the chapters that succeeded.

	    The closing summary counts only the chapters that were actually
	    ingested, so a skipped chapter is not reported as processed.
	    """
	    all_chunks = []
	    ingested = 0
	    for src in SOURCES.values():
	        print(f"Ingesting Customs Tariff Schedule {src['code']} ...")
	        try:
	            html = _fetch(src["url"], RAW / f"{src['code']}.html")
	            chunks = parse_chapter(html, src)
	        except Exception as exc:
	            # Deliberate best effort: report the failure, keep going.
	            print(f" !! {src['code']}: {type(exc).__name__}: {exc}")
	            continue
	        ingested += 1
	        all_chunks.extend(chunks)
	        print(f" {len(chunks)} chunks")
	    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
	    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
	                   encoding="utf-8")
	    print(f"\n{len(all_chunks)} tariff-schedule chunks from {ingested} "
	          f"chapter(s) -> {OUT}")


	if __name__ == "__main__":
	    build()