Beemer committed on
Commit ·
bf89be4
1
Parent(s): 5527c63
Add Phase 2 case law: 20 Federal Court of Appeal and Federal Court decisions
Browse files
- canlex/caselaw.py +133 -52
- canlex/server.py +6 -6
- data/processed/caselaw.json +0 -0
canlex/caselaw.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
-
"""Ingest leading
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
|
| 8 |
py -m canlex.caselaw
|
| 9 |
"""
|
|
@@ -16,13 +18,23 @@ from bs4 import BeautifulSoup
|
|
| 16 |
|
| 17 |
from .config import PROCESSED_DIR, RAW_DIR
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
OUT = PROCESSED_DIR / "caselaw.json"
|
| 22 |
|
| 23 |
-
# A normal browser User-Agent: the
|
| 24 |
-
#
|
| 25 |
-
# throttle below and from caching every fetched page on disk.
|
| 26 |
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
| 27 |
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
| 28 |
_THROTTLE = 2.0 # seconds between live fetches
|
|
@@ -32,79 +44,146 @@ _CHUNK_CHARS = 1800 # target characters per chunk
|
|
| 32 |
# which is not part of the judgment's reasons.
|
| 33 |
_APPARATUS = re.compile(r"^\s*(APPENDIX\b|Solicitors?\s+for\b)", re.I)
|
| 34 |
|
| 35 |
-
# Curated leading
|
| 36 |
-
# 'id' is the verified
|
| 37 |
-
# curated. The case name, citation and date are parsed
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
| 40 |
"topic": "Standard of review on judicial review; the reasonableness "
|
| 41 |
"standard for administrative decisions"},
|
| 42 |
-
{"id": 20081, "short": "Mason",
|
| 43 |
"topic": "Inadmissibility under IRPA s. 34(1)(e) for acts of violence "
|
| 44 |
"endangering safety in Canada; reasonableness review"},
|
| 45 |
-
{"id": 16803, "short": "Tran",
|
| 46 |
"topic": "Serious criminality and inadmissibility under IRPA s. 36; the "
|
| 47 |
"meaning of a term of imprisonment and an offence punishable by"},
|
| 48 |
-
{"id": 15647, "short": "B010",
|
| 49 |
"topic": "Inadmissibility for people smuggling under IRPA s. 37(1)(b); "
|
| 50 |
"organized criminality"},
|
| 51 |
-
{"id": 15648, "short": "Appulonappa",
|
| 52 |
"topic": "The human smuggling offence in IRPA s. 117; constitutional "
|
| 53 |
"overbreadth and humanitarian aid to asylum seekers"},
|
| 54 |
-
{"id": 14419, "short": "Febles",
|
| 55 |
"topic": "Exclusion from refugee protection for a serious non-political "
|
| 56 |
"crime under Article 1F(b) of the Refugee Convention"},
|
| 57 |
-
{"id": 13184, "short": "Ezokola",
|
| 58 |
"topic": "Complicity and exclusion from refugee protection for "
|
| 59 |
"international crimes under Article 1F(a)"},
|
| 60 |
-
{"id": 15665, "short": "Kanthasamy",
|
| 61 |
"topic": "Humanitarian and compassionate relief under IRPA s. 25; the "
|
| 62 |
"best interests of a child"},
|
| 63 |
-
{"id": 13137, "short": "Agraira",
|
| 64 |
"topic": "Ministerial relief from inadmissibility on security grounds; "
|
| 65 |
"the national interest under IRPA"},
|
| 66 |
-
{"id": 6901, "short": "Khosa",
|
| 67 |
"topic": "Standard of review of immigration decisions; judicial review "
|
| 68 |
"of a removal order"},
|
| 69 |
-
{"id": 2345, "short": "Charkaoui",
|
| 70 |
"topic": "Security certificates and immigration detention; the Charter "
|
| 71 |
"and procedural fairness"},
|
| 72 |
-
{"id": 1937, "short": "Suresh",
|
| 73 |
"topic": "Deportation to a risk of torture; Charter s. 7 and removal on "
|
| 74 |
"security grounds"},
|
| 75 |
-
{"id": 17759, "short": "Chhina",
|
| 76 |
"topic": "Habeas corpus as a remedy for immigration detention; review of "
|
| 77 |
"lengthy detention"},
|
| 78 |
-
{"id": 1717, "short": "Baker",
|
| 79 |
"topic": "Procedural fairness in administrative decisions; humanitarian "
|
| 80 |
"and compassionate review; the duty to give reasons"},
|
| 81 |
-
{"id": 39, "short": "Singh",
|
| 82 |
"topic": "Charter s. 7 rights of refugee claimants; the right to an oral "
|
| 83 |
"hearing"},
|
| 84 |
-
{"id": 377, "short": "Simmons",
|
| 85 |
"topic": "Customs searches at the border; Charter s. 8 and the reasonable "
|
| 86 |
"expectation of privacy on entry to Canada"},
|
| 87 |
-
{"id": 1694, "short": "Monney",
|
| 88 |
"topic": "Border detention for a customs search; reasonable suspicion "
|
| 89 |
"and the Customs Act"},
|
| 90 |
-
{"id": 986, "short": "Dehghani",
|
| 91 |
"topic": "Charter rights at a port of entry; secondary examination and "
|
| 92 |
"the right to counsel"},
|
| 93 |
-
{"id": 1627, "short": "Pushpanathan",
|
| 94 |
"topic": "Exclusion from refugee protection under Article 1F(c) for acts "
|
| 95 |
"contrary to the purposes of the United Nations"},
|
| 96 |
-
{"id": 1023, "short": "Ward",
|
| 97 |
"topic": "The refugee definition; a particular social group; the "
|
| 98 |
"availability of state protection"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
]
|
| 100 |
|
| 101 |
|
| 102 |
-
def _fetch(item_id):
|
| 103 |
"""Return a decision's iframe HTML, caching the raw page under data/raw."""
|
| 104 |
-
cache = _RAW / f"{item_id}.html"
|
| 105 |
if cache.exists():
|
| 106 |
return cache.read_text(encoding="utf-8")
|
| 107 |
-
url =
|
| 108 |
req = urllib.request.Request(url, headers={"User-Agent": _UA})
|
| 109 |
time.sleep(_THROTTLE)
|
| 110 |
with urllib.request.urlopen(req, timeout=60) as resp:
|
|
@@ -146,10 +225,10 @@ def _body(soup):
|
|
| 146 |
def _paragraphs(soup):
|
| 147 |
"""Return (is_numbered, [(label, text), ...]) for the judgment body.
|
| 148 |
|
| 149 |
-
Modern
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
<p> between one numbered opener and the next belongs to that paragraph.
|
| 153 |
Older, unnumbered decisions fall back to taking every <p> in document order.
|
| 154 |
"""
|
| 155 |
blocks = [p for p in _body(soup).find_all("p")
|
|
@@ -238,13 +317,14 @@ def _chunk(paras):
|
|
| 238 |
|
| 239 |
def _decision_chunks(case, soup):
|
| 240 |
"""Build CanLex chunk dicts for one decision."""
|
|
|
|
| 241 |
name, fields = _metadata(soup)
|
| 242 |
name = name or case["short"]
|
| 243 |
cite = fields.get("neutral citation") or fields.get("report") or ""
|
| 244 |
report = fields.get("report", "")
|
| 245 |
date = fields.get("date", "")
|
| 246 |
citation = f"{name}, {cite}" if cite else name
|
| 247 |
-
item_url =
|
| 248 |
modern, paras = _paragraphs(soup)
|
| 249 |
chunks = []
|
| 250 |
for i, group in enumerate(_chunk(paras), start=1):
|
|
@@ -255,16 +335,16 @@ def _decision_chunks(case, soup):
|
|
| 255 |
else:
|
| 256 |
locator = f"excerpt {i}"
|
| 257 |
chunks.append({
|
| 258 |
-
"id": f"
|
| 259 |
"doc_type": "caselaw",
|
| 260 |
-
"act_code": cite or f"
|
| 261 |
"act_short": case["short"],
|
| 262 |
"act_name": name,
|
| 263 |
"section": "",
|
| 264 |
"citation": citation,
|
| 265 |
"marginal_note": locator,
|
| 266 |
"heading": case["topic"],
|
| 267 |
-
"part":
|
| 268 |
"division": "",
|
| 269 |
"text": "\n\n".join(t for _, t in group),
|
| 270 |
"current_to": date,
|
|
@@ -276,28 +356,29 @@ def _decision_chunks(case, soup):
|
|
| 276 |
|
| 277 |
|
| 278 |
def build():
|
| 279 |
-
"""Fetch, parse and chunk every curated
|
| 280 |
all_chunks = []
|
| 281 |
-
for case in
|
| 282 |
try:
|
| 283 |
-
soup = BeautifulSoup(_fetch(case["
|
|
|
|
| 284 |
except Exception as exc:
|
| 285 |
print(f" !! {case['short']}: fetch failed -- "
|
| 286 |
f"{type(exc).__name__}: {exc}")
|
| 287 |
continue
|
| 288 |
chunks, citation, n_paras = _decision_chunks(case, soup)
|
| 289 |
if not chunks:
|
| 290 |
-
print(f" !! {case['short']} (
|
| 291 |
-
f"0 chunks -- check parsing")
|
| 292 |
continue
|
| 293 |
all_chunks.extend(chunks)
|
| 294 |
-
print(f" {case['short']:
|
| 295 |
f"{len(chunks):3d} chunks {citation}")
|
| 296 |
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
|
| 297 |
OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
|
| 298 |
encoding="utf-8")
|
| 299 |
print(f"\n{len(all_chunks)} case-law chunks from "
|
| 300 |
-
f"{len(
|
| 301 |
|
| 302 |
|
| 303 |
if __name__ == "__main__":
|
|
|
|
| 1 |
+
"""Ingest leading Canadian court decisions as section-style chunks.
|
| 2 |
|
| 3 |
+
Sources: the official Lexum decisions databases of the Supreme Court of Canada
|
| 4 |
+
(decisions.scc-csc.ca), the Federal Court of Appeal (decisions.fca-caf.gc.ca)
|
| 5 |
+
and the Federal Court (decisions.fct-cf.gc.ca). A decision's text sits inside an
|
| 6 |
+
iframe, so each item is fetched by appending ?iframe=true to its URL. This
|
| 7 |
+
ingests a *curated* set of leading cases -- it is deliberately not a
|
| 8 |
+
comprehensive scrape.
|
| 9 |
|
| 10 |
py -m canlex.caselaw
|
| 11 |
"""
|
|
|
|
| 18 |
|
| 19 |
from .config import PROCESSED_DIR, RAW_DIR
|
| 20 |
|
| 21 |
+
# Each court's official Lexum database: (display name, item-URL template). All
|
| 22 |
+
# three sites behave identically -- same iframe trick, metadata block and
|
| 23 |
+
# bracketed paragraph numbers -- so one parser serves them all.
|
| 24 |
+
COURTS = {
|
| 25 |
+
"scc": ("Supreme Court of Canada",
|
| 26 |
+
"https://decisions.scc-csc.ca/scc-csc/scc-csc/en/item/{id}/index.do"),
|
| 27 |
+
"fca": ("Federal Court of Appeal",
|
| 28 |
+
"https://decisions.fca-caf.gc.ca/fca-caf/decisions/en/item/{id}/index.do"),
|
| 29 |
+
"fc": ("Federal Court",
|
| 30 |
+
"https://decisions.fct-cf.gc.ca/fc-cf/decisions/en/item/{id}/index.do"),
|
| 31 |
+
}
|
| 32 |
+
_RAW = RAW_DIR / "caselaw"
|
| 33 |
OUT = PROCESSED_DIR / "caselaw.json"
|
| 34 |
|
| 35 |
+
# A normal browser User-Agent: the courts' Lexum sites denylist a few crawler
|
| 36 |
+
# UAs, while robots.txt otherwise permits the decision pages. Politeness comes
|
| 37 |
+
# from the throttle below and from caching every fetched page on disk.
|
| 38 |
_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
| 39 |
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
| 40 |
_THROTTLE = 2.0 # seconds between live fetches
|
|
|
|
| 44 |
# which is not part of the judgment's reasons.
|
| 45 |
_APPARATUS = re.compile(r"^\s*(APPENDIX\b|Solicitors?\s+for\b)", re.I)
|
| 46 |
|
| 47 |
+
# Curated leading cases on Canadian border / immigration / customs / Charter
|
| 48 |
+
# law. 'court' selects a database in COURTS; 'id' is the verified Lexum item ID;
|
| 49 |
+
# 'short' and 'topic' are curated. The case name, citation and date are parsed
|
| 50 |
+
# from the page. FCA decisions later reversed by the SCC are deliberately
|
| 51 |
+
# omitted -- CanLex serves good law, and the SCC versions are already listed.
|
| 52 |
+
CASES = [
|
| 53 |
+
# --- Supreme Court of Canada ---
|
| 54 |
+
{"court": "scc", "id": 18078, "short": "Vavilov",
|
| 55 |
"topic": "Standard of review on judicial review; the reasonableness "
|
| 56 |
"standard for administrative decisions"},
|
| 57 |
+
{"court": "scc", "id": 20081, "short": "Mason",
|
| 58 |
"topic": "Inadmissibility under IRPA s. 34(1)(e) for acts of violence "
|
| 59 |
"endangering safety in Canada; reasonableness review"},
|
| 60 |
+
{"court": "scc", "id": 16803, "short": "Tran",
|
| 61 |
"topic": "Serious criminality and inadmissibility under IRPA s. 36; the "
|
| 62 |
"meaning of a term of imprisonment and an offence punishable by"},
|
| 63 |
+
{"court": "scc", "id": 15647, "short": "B010",
|
| 64 |
"topic": "Inadmissibility for people smuggling under IRPA s. 37(1)(b); "
|
| 65 |
"organized criminality"},
|
| 66 |
+
{"court": "scc", "id": 15648, "short": "Appulonappa",
|
| 67 |
"topic": "The human smuggling offence in IRPA s. 117; constitutional "
|
| 68 |
"overbreadth and humanitarian aid to asylum seekers"},
|
| 69 |
+
{"court": "scc", "id": 14419, "short": "Febles",
|
| 70 |
"topic": "Exclusion from refugee protection for a serious non-political "
|
| 71 |
"crime under Article 1F(b) of the Refugee Convention"},
|
| 72 |
+
{"court": "scc", "id": 13184, "short": "Ezokola",
|
| 73 |
"topic": "Complicity and exclusion from refugee protection for "
|
| 74 |
"international crimes under Article 1F(a)"},
|
| 75 |
+
{"court": "scc", "id": 15665, "short": "Kanthasamy",
|
| 76 |
"topic": "Humanitarian and compassionate relief under IRPA s. 25; the "
|
| 77 |
"best interests of a child"},
|
| 78 |
+
{"court": "scc", "id": 13137, "short": "Agraira",
|
| 79 |
"topic": "Ministerial relief from inadmissibility on security grounds; "
|
| 80 |
"the national interest under IRPA"},
|
| 81 |
+
{"court": "scc", "id": 6901, "short": "Khosa",
|
| 82 |
"topic": "Standard of review of immigration decisions; judicial review "
|
| 83 |
"of a removal order"},
|
| 84 |
+
{"court": "scc", "id": 2345, "short": "Charkaoui",
|
| 85 |
"topic": "Security certificates and immigration detention; the Charter "
|
| 86 |
"and procedural fairness"},
|
| 87 |
+
{"court": "scc", "id": 1937, "short": "Suresh",
|
| 88 |
"topic": "Deportation to a risk of torture; Charter s. 7 and removal on "
|
| 89 |
"security grounds"},
|
| 90 |
+
{"court": "scc", "id": 17759, "short": "Chhina",
|
| 91 |
"topic": "Habeas corpus as a remedy for immigration detention; review of "
|
| 92 |
"lengthy detention"},
|
| 93 |
+
{"court": "scc", "id": 1717, "short": "Baker",
|
| 94 |
"topic": "Procedural fairness in administrative decisions; humanitarian "
|
| 95 |
"and compassionate review; the duty to give reasons"},
|
| 96 |
+
{"court": "scc", "id": 39, "short": "Singh",
|
| 97 |
"topic": "Charter s. 7 rights of refugee claimants; the right to an oral "
|
| 98 |
"hearing"},
|
| 99 |
+
{"court": "scc", "id": 377, "short": "Simmons",
|
| 100 |
"topic": "Customs searches at the border; Charter s. 8 and the reasonable "
|
| 101 |
"expectation of privacy on entry to Canada"},
|
| 102 |
+
{"court": "scc", "id": 1694, "short": "Monney",
|
| 103 |
"topic": "Border detention for a customs search; reasonable suspicion "
|
| 104 |
"and the Customs Act"},
|
| 105 |
+
{"court": "scc", "id": 986, "short": "Dehghani",
|
| 106 |
"topic": "Charter rights at a port of entry; secondary examination and "
|
| 107 |
"the right to counsel"},
|
| 108 |
+
{"court": "scc", "id": 1627, "short": "Pushpanathan",
|
| 109 |
"topic": "Exclusion from refugee protection under Article 1F(c) for acts "
|
| 110 |
"contrary to the purposes of the United Nations"},
|
| 111 |
+
{"court": "scc", "id": 1023, "short": "Ward",
|
| 112 |
"topic": "The refugee definition; a particular social group; the "
|
| 113 |
"availability of state protection"},
|
| 114 |
+
|
| 115 |
+
# --- Federal Court of Appeal ---
|
| 116 |
+
{"court": "fca", "id": 143136, "short": "Huruglica",
|
| 117 |
+
"topic": "The Refugee Appeal Division reviews the Refugee Protection "
|
| 118 |
+
"Division for correctness and makes its own independent assessment"},
|
| 119 |
+
{"court": "fca", "id": 143152, "short": "Singh (new evidence)",
|
| 120 |
+
"topic": "Admitting new evidence at the Refugee Appeal Division under "
|
| 121 |
+
"IRPA s. 110(4)"},
|
| 122 |
+
{"court": "fca", "id": 37663, "short": "Hernandez Febles",
|
| 123 |
+
"topic": "Exclusion from refugee protection for a serious non-political "
|
| 124 |
+
"crime, Refugee Convention Article 1F(b)"},
|
| 125 |
+
{"court": "fca", "id": 36253, "short": "Jayasekara",
|
| 126 |
+
"topic": "The test for a serious crime under Article 1F(b) exclusion "
|
| 127 |
+
"from refugee protection"},
|
| 128 |
+
{"court": "fca", "id": 35786, "short": "Raza",
|
| 129 |
+
"topic": "Pre-removal risk assessment; the test for new evidence under "
|
| 130 |
+
"IRPA s. 113(a)"},
|
| 131 |
+
{"court": "fca", "id": 99694, "short": "Najafi",
|
| 132 |
+
"topic": "Security inadmissibility under IRPA s. 34(1)(b), subversion "
|
| 133 |
+
"against a government"},
|
| 134 |
+
{"court": "fca", "id": 108889, "short": "Kanagendren",
|
| 135 |
+
"topic": "Security inadmissibility for membership under IRPA s. 34(1)(f); "
|
| 136 |
+
"membership requires no complicity analysis"},
|
| 137 |
+
{"court": "fca", "id": 35313, "short": "Sittampalam",
|
| 138 |
+
"topic": "Organized criminality under IRPA s. 37(1)(a); the broad meaning "
|
| 139 |
+
"of a criminal organization"},
|
| 140 |
+
{"court": "fca", "id": 31607, "short": "Thanabalasingham",
|
| 141 |
+
"topic": "Immigration detention review; the Minister's onus and the "
|
| 142 |
+
"weight owed to prior detention decisions"},
|
| 143 |
+
{"court": "fca", "id": 305100, "short": "Lunyamila",
|
| 144 |
+
"topic": "Immigration detention where a detainee will not cooperate with "
|
| 145 |
+
"their own removal"},
|
| 146 |
+
{"court": "fca", "id": 483607, "short": "Brown",
|
| 147 |
+
"topic": "The constitutionality of immigration detention; the Charter "
|
| 148 |
+
"and lengthy detention"},
|
| 149 |
+
{"court": "fca", "id": 36347, "short": "Baron",
|
| 150 |
+
"topic": "Deferral of removal by an enforcement officer; the narrow "
|
| 151 |
+
"scope of the discretion to defer"},
|
| 152 |
+
{"court": "fca", "id": 520921, "short": "Galindo Camayo",
|
| 153 |
+
"topic": "Reasonableness review of an immigration decision and the role "
|
| 154 |
+
"of certified questions on appeal"},
|
| 155 |
+
{"court": "fca", "id": 501244, "short": "Best Buy",
|
| 156 |
+
"topic": "Customs tariff classification; methodology and appeals from "
|
| 157 |
+
"the Canadian International Trade Tribunal"},
|
| 158 |
+
{"court": "fca", "id": 466027, "short": "Honey Fashions",
|
| 159 |
+
"topic": "Customs; judicial review of the Canada Border Services "
|
| 160 |
+
"Agency's exercise of discretion"},
|
| 161 |
+
{"court": "fca", "id": 419470, "short": "Hociung",
|
| 162 |
+
"topic": "Customs Act seizure and forfeiture; the meaning of imported "
|
| 163 |
+
"goods and their valuation"},
|
| 164 |
+
|
| 165 |
+
# --- Federal Court ---
|
| 166 |
+
{"court": "fc", "id": 64594, "short": "Goburdhun",
|
| 167 |
+
"topic": "Inadmissibility for misrepresentation under IRPA s. 40; a "
|
| 168 |
+
"frequently-cited summary of the governing principles"},
|
| 169 |
+
{"court": "fc", "id": 492842, "short": "Garcia",
|
| 170 |
+
"topic": "Criminal inadmissibility; the effect of withdrawn charges and "
|
| 171 |
+
"a claim of self-defence"},
|
| 172 |
+
{"court": "fc", "id": 483303, "short": "Kaur",
|
| 173 |
+
"topic": "Procedural fairness; the right to a meaningful opportunity to "
|
| 174 |
+
"answer a visa officer's concerns"},
|
| 175 |
+
{"court": "fc", "id": 56900, "short": "Nguyen",
|
| 176 |
+
"topic": "Customs Act; judicial review of the amount set for a penalty "
|
| 177 |
+
"or ascertained forfeiture"},
|
| 178 |
]
|
| 179 |
|
| 180 |
|
| 181 |
+
def _fetch(court, item_id):
|
| 182 |
"""Return a decision's iframe HTML, caching the raw page under data/raw."""
|
| 183 |
+
cache = _RAW / f"{court}-{item_id}.html"
|
| 184 |
if cache.exists():
|
| 185 |
return cache.read_text(encoding="utf-8")
|
| 186 |
+
url = COURTS[court][1].format(id=item_id) + "?iframe=true"
|
| 187 |
req = urllib.request.Request(url, headers={"User-Agent": _UA})
|
| 188 |
time.sleep(_THROTTLE)
|
| 189 |
with urllib.request.urlopen(req, timeout=60) as resp:
|
|
|
|
| 225 |
def _paragraphs(soup):
|
| 226 |
"""Return (is_numbered, [(label, text), ...]) for the judgment body.
|
| 227 |
|
| 228 |
+
Modern judgments open each paragraph with a bracketed number "[N]". They are
|
| 229 |
+
detected by content -- a run of sequentially numbered <p> blocks -- so the
|
| 230 |
+
parser does not depend on Word style names, which vary by court and era.
|
| 231 |
+
Every <p> between one numbered opener and the next belongs to that paragraph.
|
| 232 |
Older, unnumbered decisions fall back to taking every <p> in document order.
|
| 233 |
"""
|
| 234 |
blocks = [p for p in _body(soup).find_all("p")
|
|
|
|
| 317 |
|
| 318 |
def _decision_chunks(case, soup):
|
| 319 |
"""Build CanLex chunk dicts for one decision."""
|
| 320 |
+
court_name, item_tmpl = COURTS[case["court"]]
|
| 321 |
name, fields = _metadata(soup)
|
| 322 |
name = name or case["short"]
|
| 323 |
cite = fields.get("neutral citation") or fields.get("report") or ""
|
| 324 |
report = fields.get("report", "")
|
| 325 |
date = fields.get("date", "")
|
| 326 |
citation = f"{name}, {cite}" if cite else name
|
| 327 |
+
item_url = item_tmpl.format(id=case["id"])
|
| 328 |
modern, paras = _paragraphs(soup)
|
| 329 |
chunks = []
|
| 330 |
for i, group in enumerate(_chunk(paras), start=1):
|
|
|
|
| 335 |
else:
|
| 336 |
locator = f"excerpt {i}"
|
| 337 |
chunks.append({
|
| 338 |
+
"id": f"{case['court']}-{case['id']}-{i}",
|
| 339 |
"doc_type": "caselaw",
|
| 340 |
+
"act_code": cite or f"{case['court'].upper()} item {case['id']}",
|
| 341 |
"act_short": case["short"],
|
| 342 |
"act_name": name,
|
| 343 |
"section": "",
|
| 344 |
"citation": citation,
|
| 345 |
"marginal_note": locator,
|
| 346 |
"heading": case["topic"],
|
| 347 |
+
"part": court_name,
|
| 348 |
"division": "",
|
| 349 |
"text": "\n\n".join(t for _, t in group),
|
| 350 |
"current_to": date,
|
|
|
|
| 356 |
|
| 357 |
|
| 358 |
def build():
    """Fetch, parse and chunk every curated decision into caselaw.json.

    Walks CASES in order. Each decision's HTML is obtained through _fetch,
    parsed with BeautifulSoup's "html.parser", and turned into chunk dicts by
    _decision_chunks. A case whose fetch/parse raises, or that produces no
    chunks, is reported on stdout and skipped so that one bad page does not
    abort the whole run. The combined chunk list is written to OUT as UTF-8
    JSON, and a summary line reports how many of the curated decisions
    actually contributed chunks.
    """
    all_chunks = []
    ingested = 0  # decisions that really ended up in the output file
    for case in CASES:
        try:
            soup = BeautifulSoup(_fetch(case["court"], case["id"]),
                                 "html.parser")
        except Exception as exc:
            # Deliberate best-effort boundary: report the failure and move on
            # to the next curated case instead of losing the whole build.
            print(f" !! {case['short']}: fetch failed -- "
                  f"{type(exc).__name__}: {exc}")
            continue
        chunks, citation, n_paras = _decision_chunks(case, soup)
        if not chunks:
            # The page was fetched but nothing usable came out of the parser.
            print(f" !! {case['short']} ({case['court']} item "
                  f"{case['id']}): 0 chunks -- check parsing")
            continue
        all_chunks.extend(chunks)
        ingested += 1
        print(f" {case['court']:4s} {case['short']:20s} {n_paras:4d} paras -> "
              f"{len(chunks):3d} chunks {citation}")
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
                   encoding="utf-8")
    # Report ingested vs. curated: skipped cases must not be counted as if
    # their text were present in the output.
    print(f"\n{len(all_chunks)} case-law chunks from "
          f"{ingested} of {len(CASES)} decisions -> {OUT}")
|
| 382 |
|
| 383 |
|
| 384 |
if __name__ == "__main__":
|
canlex/server.py
CHANGED
|
@@ -125,8 +125,8 @@ class SearchInput(BaseModel):
|
|
| 125 |
default=None,
|
| 126 |
description="Optional filter by source type: 'legislation' (Acts and "
|
| 127 |
"regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
|
| 128 |
-
"agreements), 'directive' (NJC directives), or 'caselaw' (Supreme Court "
|
| 129 |
-
"of
|
| 130 |
)
|
| 131 |
|
| 132 |
|
|
@@ -144,7 +144,7 @@ class GetSectionInput(BaseModel):
|
|
| 144 |
annotations={"title": "Search Canadian Legislation", **_READONLY})
|
| 145 |
def canlex_search_legislation(params: SearchInput) -> str:
|
| 146 |
"""Search Canadian federal law, CBSA D-Memoranda, agreements, NJC directives,
|
| 147 |
-
and leading
|
| 148 |
|
| 149 |
The CanLex corpus has five kinds of source: 31 federal Acts and regulations
|
| 150 |
(immigration, customs, criminal, drugs, food/health, labour, privacy and more);
|
|
@@ -152,8 +152,8 @@ def canlex_search_legislation(params: SearchInput) -> str:
|
|
| 152 |
how it applies customs and border law); Treasury Board collective agreements
|
| 153 |
(currently the FB / Border Services group); National Joint Council directives
|
| 154 |
(travel, relocation, isolated posts and more); and leading Supreme Court of
|
| 155 |
-
Canada
|
| 156 |
-
for ANY question about that material. It ranks results by relevance and returns
|
| 157 |
their full text so the answer can cite the actual wording; an explicit section
|
| 158 |
reference (e.g. "section 34") is always surfaced. Each result is marked with its
|
| 159 |
source type.
|
|
@@ -297,7 +297,7 @@ def canlex_list_acts() -> str:
|
|
| 297 |
lines.append(f"- **{a['short']}**: {a['count']} sections, "
|
| 298 |
f"effective {a['current_to'] or 'n/a'}")
|
| 299 |
if cases:
|
| 300 |
-
lines += ["", "## Case law
|
| 301 |
for cite, a in sorted(cases.items(), key=lambda kv: kv[1]["decided"]):
|
| 302 |
lines.append(f"- **{a['name']}**, {cite}: {a['count']} excerpts, "
|
| 303 |
f"decided {a['decided'] or 'n/a'}")
|
|
|
|
| 125 |
default=None,
|
| 126 |
description="Optional filter by source type: 'legislation' (Acts and "
|
| 127 |
"regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
|
| 128 |
+
"agreements), 'directive' (NJC directives), or 'caselaw' (Supreme Court, "
|
| 129 |
+
"Federal Court of Appeal and Federal Court decisions). Omit to search all.",
|
| 130 |
)
|
| 131 |
|
| 132 |
|
|
|
|
| 144 |
annotations={"title": "Search Canadian Legislation", **_READONLY})
|
| 145 |
def canlex_search_legislation(params: SearchInput) -> str:
|
| 146 |
"""Search Canadian federal law, CBSA D-Memoranda, agreements, NJC directives,
|
| 147 |
+
and leading court decisions.
|
| 148 |
|
| 149 |
The CanLex corpus has five kinds of source: 31 federal Acts and regulations
|
| 150 |
(immigration, customs, criminal, drugs, food/health, labour, privacy and more);
|
|
|
|
| 152 |
how it applies customs and border law); Treasury Board collective agreements
|
| 153 |
(currently the FB / Border Services group); National Joint Council directives
|
| 154 |
(travel, relocation, isolated posts and more); and leading Supreme Court of
|
| 155 |
+
Canada, Federal Court of Appeal and Federal Court decisions on immigration,
|
| 156 |
+
customs and Charter law. Use this for ANY question about that material. It ranks results by relevance and returns
|
| 157 |
their full text so the answer can cite the actual wording; an explicit section
|
| 158 |
reference (e.g. "section 34") is always surfaced. Each result is marked with its
|
| 159 |
source type.
|
|
|
|
| 297 |
lines.append(f"- **{a['short']}**: {a['count']} sections, "
|
| 298 |
f"effective {a['current_to'] or 'n/a'}")
|
| 299 |
if cases:
|
| 300 |
+
lines += ["", "## Case law"]
|
| 301 |
for cite, a in sorted(cases.items(), key=lambda kv: kv[1]["decided"]):
|
| 302 |
lines.append(f"- **{a['name']}**, {cite}: {a['count']} excerpts, "
|
| 303 |
f"decided {a['decided'] or 'n/a'}")
|
data/processed/caselaw.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|