Beemer committed on
Commit
b8c217b
·
1 Parent(s): 4305228

Add Phase 3: IRB jurisprudential guides and citation-based citator lookup

Browse files
canlex/caselaw.py CHANGED
@@ -1,11 +1,12 @@
1
- """Ingest leading Canadian court decisions as section-style chunks.
2
 
3
  Sources: the official Lexum decisions databases of the Supreme Court of Canada
4
  (decisions.scc-csc.ca), the Federal Court of Appeal (decisions.fca-caf.gc.ca)
5
- and the Federal Court (decisions.fct-cf.gc.ca). A decision's text sits inside an
6
- iframe, so each item is fetched by appending ?iframe=true to its URL. This
7
- ingests a *curated* set of leading cases -- it is deliberately not a
8
- comprehensive scrape.
 
9
 
10
  py -m canlex.caselaw
11
  """
@@ -201,13 +202,32 @@ CASES = [
201
  "where only part is shown to be of legitimate origin"},
202
  ]
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
- def _fetch(court, item_id):
206
- """Return a decision's iframe HTML, caching the raw page under data/raw."""
207
- cache = _RAW / f"{court}-{item_id}.html"
 
208
  if cache.exists():
209
  return cache.read_text(encoding="utf-8")
210
- url = COURTS[court][1].format(id=item_id) + "?iframe=true"
211
  req = urllib.request.Request(url, headers={"User-Agent": _UA})
212
  time.sleep(_THROTTLE)
213
  with urllib.request.urlopen(req, timeout=60) as resp:
@@ -217,11 +237,36 @@ def _fetch(court, item_id):
217
  return text
218
 
219
 
 
 
 
 
 
 
220
  def _norm(text):
221
  """Collapse all whitespace, including non-breaking spaces."""
222
  return re.sub(r"\s+", " ", text.replace("\xa0", " ")).strip()
223
 
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  def _metadata(soup):
226
  """Return (case_name, {label: value}) from the decision's metadata block."""
227
  box = soup.find("div", class_="metadata")
@@ -239,10 +284,16 @@ def _metadata(soup):
239
 
240
 
241
  def _body(soup):
242
- """Locate the container holding the judgment text."""
 
 
 
 
243
  return (soup.find(id="document-content")
244
  or soup.find("div", class_="documentcontent")
245
  or soup.find("div", class_="WordSection1")
 
 
246
  or soup.body or soup)
247
 
248
 
@@ -255,7 +306,10 @@ def _paragraphs(soup):
255
  Every <p> between one numbered opener and the next belongs to that paragraph.
256
  Older, unnumbered decisions fall back to taking every <p> in document order.
257
  """
258
- blocks = [p for p in _body(soup).find_all("p")
 
 
 
259
  if "MsoFootnoteText" not in (p.get("class") or [])]
260
  texts = [p.get_text() for p in blocks]
261
 
@@ -379,6 +433,40 @@ def _decision_chunks(case, soup):
379
  return chunks, citation, len(paras)
380
 
381
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
  def build():
383
  """Fetch, parse and chunk every curated decision into caselaw.json."""
384
  all_chunks = []
@@ -398,11 +486,26 @@ def build():
398
  all_chunks.extend(chunks)
399
  print(f" {case['court']:4s} {case['short']:20s} {n_paras:4d} paras -> "
400
  f"{len(chunks):3d} chunks {citation}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
402
  OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
403
  encoding="utf-8")
404
  print(f"\n{len(all_chunks)} case-law chunks from "
405
- f"{len(CASES)} decisions -> {OUT}")
406
 
407
 
408
  if __name__ == "__main__":
 
1
+ """Ingest leading Canadian court and tribunal decisions as section-level chunks.
2
 
3
  Sources: the official Lexum decisions databases of the Supreme Court of Canada
4
  (decisions.scc-csc.ca), the Federal Court of Appeal (decisions.fca-caf.gc.ca)
5
+ and the Federal Court (decisions.fct-cf.gc.ca) -- a decision's text sits inside
6
+ an iframe, fetched by appending ?iframe=true to its URL -- plus the in-force
7
+ jurisprudential guides the Immigration and Refugee Board publishes on its own
8
+ site (irb-cisr.gc.ca). This ingests a *curated* set of leading decisions; it is
9
+ deliberately not a comprehensive scrape.
10
 
11
  py -m canlex.caselaw
12
  """
 
202
  "where only part is shown to be of legitimate origin"},
203
  ]
204
 
205
+ # In-force jurisprudential guides of the Immigration and Refugee Board's Refugee
206
+ # Appeal Division -- decisions the Board designates as models that members apply
207
+ # to similar cases or explain why not. Full text is hosted on the IRB's own
208
+ # site; 'topic' is curated, while the date and paragraphs are parsed from the
209
+ # page. (A fourth in-force guide, TC1-05038, is hosted only on CanLII.)
210
+ IRB_GUIDES = [
211
+ {"file": "MB8-00025",
212
+ "url": "https://www.irb-cisr.gc.ca/en/decisions/Pages/mb8-00025.aspx",
213
+ "topic": "Exclusion from refugee protection under Article 1E where the "
214
+ "claimant has protected status in a third country (Haiti / Brazil)"},
215
+ {"file": "TB7-01837",
216
+ "url": "https://www.irb-cisr.gc.ca/en/decisions/Pages/TB7-01837.aspx",
217
+ "topic": "Persecution of Ahmadis in Pakistan; state protection and the "
218
+ "availability of an internal flight alternative"},
219
+ {"file": "TB4-05778",
220
+ "url": "https://www.irb-cisr.gc.ca/en/decisions/Pages/TB4-05778.aspx",
221
+ "topic": "Whether a North Korean refugee claimant has deemed citizenship "
222
+ "of South Korea and protection available there"},
223
+ ]
224
 
225
+
226
+ def _get(url, cache_name):
227
+ """Fetch a page, caching the raw HTML under data/raw/caselaw."""
228
+ cache = _RAW / cache_name
229
  if cache.exists():
230
  return cache.read_text(encoding="utf-8")
 
231
  req = urllib.request.Request(url, headers={"User-Agent": _UA})
232
  time.sleep(_THROTTLE)
233
  with urllib.request.urlopen(req, timeout=60) as resp:
 
237
  return text
238
 
239
 
240
def _fetch(court, item_id):
    """Return the iframe HTML of one Lexum court decision.

    The decision page URL is built from the court's URL template in COURTS
    (filled in with *item_id*); the judgment text itself is served by the
    '?iframe=true' variant of that page. Downloading and caching are delegated
    to _get, which stores the page under the name '<court>-<item_id>.html'.
    """
    page_url = COURTS[court][1].format(id=item_id)
    return _get(f"{page_url}?iframe=true", f"{court}-{item_id}.html")
244
+
245
+
246
  def _norm(text):
247
  """Collapse all whitespace, including non-breaking spaces."""
248
  return re.sub(r"\s+", " ", text.replace("\xa0", " ")).strip()
249
 
250
 
251
+ _MONTHS = {m: i for i, m in enumerate(
252
+ ["january", "february", "march", "april", "may", "june", "july", "august",
253
+ "september", "october", "november", "december"], start=1)}
254
+
255
+
256
+ def _irb_date(texts):
257
+ """Pull the ISO 'Date of decision' from an IRB decision's front matter.
258
+
259
+ The label and the date sometimes sit in separate elements, so the search
260
+ runs over the joined text rather than block by block.
261
+ """
262
+ m = re.search(r"Date of decision:?\s*([A-Za-z]+)\s+(\d{1,2}),?\s*(\d{4})",
263
+ " ".join(texts))
264
+ if m and m.group(1).lower() in _MONTHS:
265
+ return (f"{m.group(3)}-{_MONTHS[m.group(1).lower()]:02d}-"
266
+ f"{int(m.group(2)):02d}")
267
+ return ""
268
+
269
+
270
  def _metadata(soup):
271
  """Return (case_name, {label: value}) from the decision's metadata block."""
272
  box = soup.find("div", class_="metadata")
 
284
 
285
 
286
  def _body(soup):
287
+ """Locate the container holding the decision text.
288
+
289
+ Handles the Lexum court pages and the IRB's SharePoint pages, whose text
290
+ sits in a 'RichHtmlField' rich-text div.
291
+ """
292
  return (soup.find(id="document-content")
293
  or soup.find("div", class_="documentcontent")
294
  or soup.find("div", class_="WordSection1")
295
+ or soup.find("div", id=lambda v: v and "RichHtmlField" in v)
296
+ or soup.find("div", class_="ms-rtestate-field")
297
  or soup.body or soup)
298
 
299
 
 
306
  Every <p> between one numbered opener and the next belongs to that paragraph.
307
  Older, unnumbered decisions fall back to taking every <p> in document order.
308
  """
309
+ body = _body(soup)
310
+ for aside in body.find_all("aside", class_="wb-fnote"):
311
+ aside.decompose() # drop IRB/WET footnote blocks
312
+ blocks = [p for p in body.find_all("p")
313
  if "MsoFootnoteText" not in (p.get("class") or [])]
314
  texts = [p.get_text() for p in blocks]
315
 
 
433
  return chunks, citation, len(paras)
434
 
435
 
436
def _irb_chunks(guide, soup):
    """Turn one IRB jurisprudential guide page into CanLex chunk dicts.

    Args:
        guide: an IRB_GUIDES entry; its 'file', 'url' and 'topic' keys are read.
        soup: the parsed HTML of the guide's page.

    Returns:
        (chunks, citation, n_paragraphs) -- the chunk dicts in document order,
        the citation string shared by all of them, and the number of
        paragraphs parsed from the page.
    """
    file_no = guide["file"]
    citation = f"IRB Jurisprudential Guide {file_no}"
    # The date is searched over every <p> of the body, before _paragraphs()
    # is called on the same soup.
    decided = _irb_date(p.get_text() for p in _body(soup).find_all("p"))
    modern, paras = _paragraphs(soup)

    def _locator(index, group):
        # Numbered decisions are located by paragraph range; unnumbered ones
        # only by the running excerpt index.
        if not modern:
            return f"excerpt {index}"
        first, last = group[0][0], group[-1][0]
        if first == last:
            return f"para {first}"
        return f"paras {first}–{last}"

    chunks = [
        {
            "id": f"irb-{file_no}-{index}",
            "doc_type": "caselaw",
            "act_code": file_no,
            "act_short": file_no,
            "act_name": citation,
            "section": "",
            "citation": citation,
            "marginal_note": _locator(index, group),
            "heading": guide["topic"],
            "part": "Immigration and Refugee Board — Refugee Appeal Division",
            "division": "",
            "text": "\n\n".join(body for _, body in group),
            "current_to": decided,
            "last_amended": "",
            "history": "",
            "source_url": guide["url"],
        }
        for index, group in enumerate(_chunk(paras), start=1)
    ]
    return chunks, citation, len(paras)
468
+
469
+
470
  def build():
471
  """Fetch, parse and chunk every curated decision into caselaw.json."""
472
  all_chunks = []
 
486
  all_chunks.extend(chunks)
487
  print(f" {case['court']:4s} {case['short']:20s} {n_paras:4d} paras -> "
488
  f"{len(chunks):3d} chunks {citation}")
489
+ for guide in IRB_GUIDES:
490
+ try:
491
+ soup = BeautifulSoup(_get(guide["url"], f"irb-{guide['file']}.html"),
492
+ "html.parser")
493
+ except Exception as exc:
494
+ print(f" !! {guide['file']}: fetch failed -- "
495
+ f"{type(exc).__name__}: {exc}")
496
+ continue
497
+ chunks, citation, n_paras = _irb_chunks(guide, soup)
498
+ if not chunks:
499
+ print(f" !! {guide['file']}: 0 chunks -- check parsing")
500
+ continue
501
+ all_chunks.extend(chunks)
502
+ print(f" irb {guide['file']:20s} {n_paras:4d} paras -> "
503
+ f"{len(chunks):3d} chunks {citation}")
504
  PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
505
  OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
506
  encoding="utf-8")
507
  print(f"\n{len(all_chunks)} case-law chunks from "
508
+ f"{len(CASES) + len(IRB_GUIDES)} decisions -> {OUT}")
509
 
510
 
511
  if __name__ == "__main__":
canlex/citator.py CHANGED
@@ -24,6 +24,25 @@ _MAX_LIST = 20 # items shown per citator list (lists can run to thousands)
24
  _CASE_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/doc/\d+/([a-z0-9-]+)")
25
  _DB_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/?$")
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  def api_key():
29
  """Return the configured CanLII API key, or '' if not set.
@@ -92,13 +111,21 @@ class Citator:
92
  return {"total": len(items), "items": items[:_MAX_LIST]}
93
 
94
  def case_report(self, case_url):
95
- """Return a citation-graph report for a case, given its canlii.org URL."""
 
 
 
 
 
 
 
96
  if case_url in self._cache:
97
  return self._cache[case_url]
98
  match = _CASE_URL.search(case_url)
99
  if not match:
100
- return {"error": "Provide a full canlii.org case URL, e.g. "
101
- "https://www.canlii.org/en/ca/scc/doc/2019/2019scc65/2019scc65.html"}
 
102
  self._ensure_dbmap()
103
  segment, case_id = match.group(1), match.group(2)
104
  db = self._dbmap.get(segment)
 
24
  _CASE_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/doc/\d+/([a-z0-9-]+)")
25
  _DB_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/?$")
26
 
27
# A neutral citation, e.g. "2019 SCC 65" (or its French form "2019 CSC 65"),
# and the CanLII URL segment per court.
_NEUTRAL = re.compile(r"\b(\d{4})\s+(SCC|CSC|FCA|CAF|FC|CF)\s+(\d+)\b",
                      re.IGNORECASE)
_CANLII_SEG = {"scc": "scc", "fca": "fca", "fc": "fct"}
# French court designators -> the English code used in the /en/ document id.
_FRENCH_COURT = {"csc": "scc", "caf": "fca", "cf": "fc"}


def canlii_url_from_citation(text):
    """Build a canlii.org case URL from a neutral citation, or '' if none found.

    Works for Supreme Court, Federal Court of Appeal and Federal Court neutral
    citations -- e.g. "2019 SCC 65" -> .../en/ca/scc/doc/2019/2019scc65/...
    The French designators (CSC, CAF, CF) are accepted and mapped to the same
    English-language URL. The citation may be embedded in longer text; the
    first one found is used.

    Args:
        text: a string that may contain a neutral citation; None or '' is
            treated as "no citation".

    Returns:
        The English canlii.org URL of the decision, or '' when *text* holds no
        recognised neutral citation.
    """
    m = _NEUTRAL.search(text or "")
    if not m:
        return ""
    year = m.group(1)
    court = m.group(2).lower()
    court = _FRENCH_COURT.get(court, court)
    # Normalise the decision number so "2019 SCC 065" still yields 2019scc65.
    num = int(m.group(3))
    doc = f"{year}{court}{num}"
    return (f"https://www.canlii.org/en/ca/{_CANLII_SEG[court]}/doc/"
            f"{year}/{doc}/{doc}.html")
45
+
46
 
47
  def api_key():
48
  """Return the configured CanLII API key, or '' if not set.
 
111
  return {"total": len(items), "items": items[:_MAX_LIST]}
112
 
113
  def case_report(self, case_url):
114
+ """Return a citation-graph report for a case.
115
+
116
+ Accepts a full canlii.org case URL, or a neutral citation (e.g.
117
+ "2019 SCC 65") for a Supreme Court / Federal Court of Appeal / Federal
118
+ Court decision.
119
+ """
120
+ if not _CASE_URL.search(case_url):
121
+ case_url = canlii_url_from_citation(case_url) or case_url
122
  if case_url in self._cache:
123
  return self._cache[case_url]
124
  match = _CASE_URL.search(case_url)
125
  if not match:
126
+ return {"error": "Provide a full canlii.org case URL, or a neutral "
127
+ "citation such as '2019 SCC 65' (Supreme Court, Federal "
128
+ "Court of Appeal, or Federal Court)."}
129
  self._ensure_dbmap()
130
  segment, case_id = match.group(1), match.group(2)
131
  db = self._dbmap.get(segment)
canlex/server.py CHANGED
@@ -38,7 +38,9 @@ GROUNDING_NOTE = (
38
  "instruments for a bargaining unit; court decisions interpret and apply the "
39
  "law and are binding precedent depending on the court and jurisdiction -- "
40
  "name the deciding court and the date, and do not assume a decision is still "
41
- "good law if it may have been overtaken. State the "
 
 
42
  "'current to', modified, or in-force date when stating the law. If the material "
43
  "below does not fully resolve the question -- including where it turns on case "
44
  "law or facts not present here -- say so explicitly. This is legal information, "
@@ -76,9 +78,15 @@ def _format_section(c: dict) -> str:
76
  "agreements; binding for the matters it covers._")
77
  lines.append(f"(effective {c['current_to'] or 'n/a'})")
78
  elif doc_type == "caselaw":
79
- lines.append("_Court decision binding precedent depending on the court "
80
- "and jurisdiction; confirm it has not been overturned on "
81
- "appeal or overtaken by later authority._")
 
 
 
 
 
 
82
  lines.append(f"(decided {c['current_to'] or 'n/a'})")
83
  if c["heading"]:
84
  lines.append(f"Subject: {c['heading']}")
@@ -326,10 +334,12 @@ class CaseInput(BaseModel):
326
 
327
  case_url: str = Field(
328
  ...,
329
- description="A full canlii.org case URL, e.g. "
330
- "'https://www.canlii.org/en/ca/scc/doc/2019/2019scc65/2019scc65.html'. "
331
- "Find it by web search if you only have the case name.",
332
- min_length=10, max_length=400,
 
 
333
  )
334
 
335
 
@@ -386,19 +396,22 @@ def _format_case(report: dict) -> str:
386
  "destructiveHint": False, "idempotentHint": True,
387
  "openWorldHint": True})
388
  def canlex_case(params: CaseInput) -> str:
389
- """Look up a Canadian court case on CanLII and return its citation graph.
390
 
391
- Given a case's full canlii.org URL, returns the case's metadata plus its
392
- citator: the cases it cites, the cases that cite it (its treatment and how
393
- leading it is), and the legislation it cites -- live from the CanLII API.
 
394
 
395
- The CanLII API has no case search, so the case's full canlii.org URL must be
396
- supplied (find it by web search if you only have the case name). This returns
 
397
  metadata and the citation graph only, NOT the judgment text -- follow the
398
  CanLII link for that. A call takes ~15-20 seconds (the API is rate-limited).
399
 
400
  Args:
401
- params (CaseInput): contains case_url -- a full canlii.org case URL.
 
402
 
403
  Returns:
404
  str: Markdown -- the case's title, neutral citation, date, docket and
 
38
  "instruments for a bargaining unit; court decisions interpret and apply the "
39
  "law and are binding precedent depending on the court and jurisdiction -- "
40
  "name the deciding court and the date, and do not assume a decision is still "
41
+ "good law if it may have been overtaken (the canlex_case tool checks a "
42
+ "decision's later treatment on CanLII -- give it the neutral citation). "
43
+ "State the "
44
  "'current to', modified, or in-force date when stating the law. If the material "
45
  "below does not fully resolve the question -- including where it turns on case "
46
  "law or facts not present here -- say so explicitly. This is legal information, "
 
78
  "agreements; binding for the matters it covers._")
79
  lines.append(f"(effective {c['current_to'] or 'n/a'})")
80
  elif doc_type == "caselaw":
81
+ if "Immigration and Refugee Board" in c["part"]:
82
+ lines.append("_Immigration and Refugee Board jurisprudential guide "
83
+ " IRB members apply its reasoning to similar cases or "
84
+ "explain why not; persuasive, and subject to revocation "
85
+ "or to review by the Federal Court._")
86
+ else:
87
+ lines.append("_Court decision — binding precedent depending on the "
88
+ "court and jurisdiction; confirm it has not been "
89
+ "overturned on appeal or overtaken by later authority._")
90
  lines.append(f"(decided {c['current_to'] or 'n/a'})")
91
  if c["heading"]:
92
  lines.append(f"Subject: {c['heading']}")
 
334
 
335
  case_url: str = Field(
336
  ...,
337
+ description="A Canadian case, given either as a full canlii.org URL or "
338
+ "-- for a Supreme Court, Federal Court of Appeal or Federal Court "
339
+ "decision -- its neutral citation (e.g. '2019 SCC 65' or '2016 FCA 93'). "
340
+ "For other courts, supply the canlii.org URL; find it by web search if "
341
+ "you only have the case name.",
342
+ min_length=8, max_length=400,
343
  )
344
 
345
 
 
396
  "destructiveHint": False, "idempotentHint": True,
397
  "openWorldHint": True})
398
  def canlex_case(params: CaseInput) -> str:
399
+ """Look up a Canadian case on CanLII and return its citation graph.
400
 
401
+ Returns the case's metadata plus its citator: the cases it cites, the cases
402
+ that cite it (its treatment and how leading it is), and the legislation it
403
+ cites -- live from the CanLII API. Use it to gauge whether a decision is
404
+ still good law -- how heavily and how recently it has been cited.
405
 
406
+ Supply either a canlii.org URL or, for a Supreme Court / Federal Court of
407
+ Appeal / Federal Court decision, its neutral citation (e.g. '2019 SCC 65') --
408
+ the citation a canlex_search_legislation result already shows. This returns
409
  metadata and the citation graph only, NOT the judgment text -- follow the
410
  CanLII link for that. A call takes ~15-20 seconds (the API is rate-limited).
411
 
412
  Args:
413
+ params (CaseInput): contains case_url -- a canlii.org URL or a neutral
414
+ citation.
415
 
416
  Returns:
417
  str: Markdown -- the case's title, neutral citation, date, docket and
data/processed/caselaw.json CHANGED
The diff for this file is too large to render. See raw diff