Spaces:

Beemer0
/

CanLex

Running

App Files Community

Beemer committed 6 days ago

Commit

b8c217b

1 Parent(s): 4305228

Add Phase 3: IRB jurisprudential guides and citation-based citator lookup

Browse files

Files changed (4) hide show

canlex/caselaw.py +115 -12
canlex/citator.py +30 -3
canlex/server.py +28 -15
data/processed/caselaw.json +0 -0

canlex/caselaw.py CHANGED Viewed

@@ -1,11 +1,12 @@
-"""Ingest leading Canadian court decisions as section-style chunks.
 Sources: the official Lexum decisions databases of the Supreme Court of Canada
 (decisions.scc-csc.ca), the Federal Court of Appeal (decisions.fca-caf.gc.ca)
-and the Federal Court (decisions.fct-cf.gc.ca). A decision's text sits inside an
-iframe, so each item is fetched by appending ?iframe=true to its URL. This
-ingests a *curated* set of leading cases -- it is deliberately not a
-comprehensive scrape.
     py -m canlex.caselaw
 """
@@ -201,13 +202,32 @@ CASES = [
               "where only part is shown to be of legitimate origin"},
 ]
-def _fetch(court, item_id):
-    """Return a decision's iframe HTML, caching the raw page under data/raw."""
-    cache = _RAW / f"{court}-{item_id}.html"
     if cache.exists():
         return cache.read_text(encoding="utf-8")
-    url = COURTS[court][1].format(id=item_id) + "?iframe=true"
     req = urllib.request.Request(url, headers={"User-Agent": _UA})
     time.sleep(_THROTTLE)
     with urllib.request.urlopen(req, timeout=60) as resp:
@@ -217,11 +237,36 @@ def _fetch(court, item_id):
     return text
 def _norm(text):
     """Collapse all whitespace, including non-breaking spaces."""
     return re.sub(r"\s+", " ", text.replace("\xa0", " ")).strip()
 def _metadata(soup):
     """Return (case_name, {label: value}) from the decision's metadata block."""
     box = soup.find("div", class_="metadata")
@@ -239,10 +284,16 @@ def _metadata(soup):
 def _body(soup):
-    """Locate the container holding the judgment text."""
     return (soup.find(id="document-content")
             or soup.find("div", class_="documentcontent")
             or soup.find("div", class_="WordSection1")
             or soup.body or soup)
@@ -255,7 +306,10 @@ def _paragraphs(soup):
     Every <p> between one numbered opener and the next belongs to that paragraph.
     Older, unnumbered decisions fall back to taking every <p> in document order.
     """
-    blocks = [p for p in _body(soup).find_all("p")
               if "MsoFootnoteText" not in (p.get("class") or [])]
     texts = [p.get_text() for p in blocks]
@@ -379,6 +433,40 @@ def _decision_chunks(case, soup):
     return chunks, citation, len(paras)
 def build():
     """Fetch, parse and chunk every curated decision into caselaw.json."""
     all_chunks = []
@@ -398,11 +486,26 @@ def build():
         all_chunks.extend(chunks)
         print(f"  {case['court']:4s} {case['short']:20s} {n_paras:4d} paras -> "
               f"{len(chunks):3d} chunks   {citation}")
     PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
     OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
                    encoding="utf-8")
     print(f"\n{len(all_chunks)} case-law chunks from "
-          f"{len(CASES)} decisions -> {OUT}")
 if __name__ == "__main__":

+"""Ingest leading Canadian court and tribunal decisions as section-level chunks.
 Sources: the official Lexum decisions databases of the Supreme Court of Canada
 (decisions.scc-csc.ca), the Federal Court of Appeal (decisions.fca-caf.gc.ca)
+and the Federal Court (decisions.fct-cf.gc.ca) -- a decision's text sits inside
+an iframe, fetched by appending ?iframe=true to its URL -- plus the in-force
+jurisprudential guides the Immigration and Refugee Board publishes on its own
+site (irb-cisr.gc.ca). This ingests a *curated* set of leading decisions; it is
+deliberately not a comprehensive scrape.
     py -m canlex.caselaw
 """
               "where only part is shown to be of legitimate origin"},
 ]
+# In-force jurisprudential guides of the Immigration and Refugee Board's Refugee
+# Appeal Division -- decisions the Board designates as models that members apply
+# to similar cases or explain why not. Full text is hosted on the IRB's own
+# site; 'topic' is curated, while the date and paragraphs are parsed from the
+# page. (A fourth in-force guide, TC1-05038, is hosted only on CanLII.)
+IRB_GUIDES = [
+    {"file": "MB8-00025",
+     "url": "https://www.irb-cisr.gc.ca/en/decisions/Pages/mb8-00025.aspx",
+     "topic": "Exclusion from refugee protection under Article 1E where the "
+              "claimant has protected status in a third country (Haiti / Brazil)"},
+    {"file": "TB7-01837",
+     "url": "https://www.irb-cisr.gc.ca/en/decisions/Pages/TB7-01837.aspx",
+     "topic": "Persecution of Ahmadis in Pakistan; state protection and the "
+              "availability of an internal flight alternative"},
+    {"file": "TB4-05778",
+     "url": "https://www.irb-cisr.gc.ca/en/decisions/Pages/TB4-05778.aspx",
+     "topic": "Whether a North Korean refugee claimant has deemed citizenship "
+              "of South Korea and protection available there"},
+]
+def _get(url, cache_name):
+    """Fetch a page, caching the raw HTML under data/raw/caselaw."""
+    cache = _RAW / cache_name
     if cache.exists():
         return cache.read_text(encoding="utf-8")
     req = urllib.request.Request(url, headers={"User-Agent": _UA})
     time.sleep(_THROTTLE)
     with urllib.request.urlopen(req, timeout=60) as resp:
     return text
+def _fetch(court, item_id):
+    """Return a Lexum court decision's iframe HTML."""
+    url = COURTS[court][1].format(id=item_id) + "?iframe=true"
+    return _get(url, f"{court}-{item_id}.html")
 def _norm(text):
     """Collapse all whitespace, including non-breaking spaces."""
     return re.sub(r"\s+", " ", text.replace("\xa0", " ")).strip()
+_MONTHS = {m: i for i, m in enumerate(
+    ["january", "february", "march", "april", "may", "june", "july", "august",
+     "september", "october", "november", "december"], start=1)}
+def _irb_date(texts):
+    """Pull the ISO 'Date of decision' from an IRB decision's front matter.
+    The label and the date sometimes sit in separate elements, so the search
+    runs over the joined text rather than block by block.
+    """
+    m = re.search(r"Date of decision:?\s*([A-Za-z]+)\s+(\d{1,2}),?\s*(\d{4})",
+                  " ".join(texts))
+    if m and m.group(1).lower() in _MONTHS:
+        return (f"{m.group(3)}-{_MONTHS[m.group(1).lower()]:02d}-"
+                f"{int(m.group(2)):02d}")
+    return ""
 def _metadata(soup):
     """Return (case_name, {label: value}) from the decision's metadata block."""
     box = soup.find("div", class_="metadata")
 def _body(soup):
+    """Locate the container holding the decision text.
+    Handles the Lexum court pages and the IRB's SharePoint pages, whose text
+    sits in a 'RichHtmlField' rich-text div.
+    """
     return (soup.find(id="document-content")
             or soup.find("div", class_="documentcontent")
             or soup.find("div", class_="WordSection1")
+            or soup.find("div", id=lambda v: v and "RichHtmlField" in v)
+            or soup.find("div", class_="ms-rtestate-field")
             or soup.body or soup)
     Every <p> between one numbered opener and the next belongs to that paragraph.
     Older, unnumbered decisions fall back to taking every <p> in document order.
     """
+    body = _body(soup)
+    for aside in body.find_all("aside", class_="wb-fnote"):
+        aside.decompose()                        # drop IRB/WET footnote blocks
+    blocks = [p for p in body.find_all("p")
               if "MsoFootnoteText" not in (p.get("class") or [])]
     texts = [p.get_text() for p in blocks]
     return chunks, citation, len(paras)
+def _irb_chunks(guide, soup):
+    """Build CanLex chunk dicts for one IRB jurisprudential guide."""
+    cite = f"IRB Jurisprudential Guide {guide['file']}"
+    date = _irb_date(p.get_text() for p in _body(soup).find_all("p"))
+    modern, paras = _paragraphs(soup)
+    chunks = []
+    for i, group in enumerate(_chunk(paras), start=1):
+        if modern:
+            first, last = group[0][0], group[-1][0]
+            locator = (f"para {first}" if first == last
+                       else f"paras {first}–{last}")
+        else:
+            locator = f"excerpt {i}"
+        chunks.append({
+            "id": f"irb-{guide['file']}-{i}",
+            "doc_type": "caselaw",
+            "act_code": guide["file"],
+            "act_short": guide["file"],
+            "act_name": cite,
+            "section": "",
+            "citation": cite,
+            "marginal_note": locator,
+            "heading": guide["topic"],
+            "part": "Immigration and Refugee Board — Refugee Appeal Division",
+            "division": "",
+            "text": "\n\n".join(t for _, t in group),
+            "current_to": date,
+            "last_amended": "",
+            "history": "",
+            "source_url": guide["url"],
+        })
+    return chunks, cite, len(paras)
 def build():
     """Fetch, parse and chunk every curated decision into caselaw.json."""
     all_chunks = []
         all_chunks.extend(chunks)
         print(f"  {case['court']:4s} {case['short']:20s} {n_paras:4d} paras -> "
               f"{len(chunks):3d} chunks   {citation}")
+    for guide in IRB_GUIDES:
+        try:
+            soup = BeautifulSoup(_get(guide["url"], f"irb-{guide['file']}.html"),
+                                 "html.parser")
+        except Exception as exc:
+            print(f"  !! {guide['file']}: fetch failed -- "
+                  f"{type(exc).__name__}: {exc}")
+            continue
+        chunks, citation, n_paras = _irb_chunks(guide, soup)
+        if not chunks:
+            print(f"  !! {guide['file']}: 0 chunks -- check parsing")
+            continue
+        all_chunks.extend(chunks)
+        print(f"  irb  {guide['file']:20s} {n_paras:4d} paras -> "
+              f"{len(chunks):3d} chunks   {citation}")
     PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
     OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
                    encoding="utf-8")
     print(f"\n{len(all_chunks)} case-law chunks from "
+          f"{len(CASES) + len(IRB_GUIDES)} decisions -> {OUT}")
 if __name__ == "__main__":

canlex/citator.py CHANGED Viewed

@@ -24,6 +24,25 @@ _MAX_LIST = 20      # items shown per citator list (lists can run to thousands)
 _CASE_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/doc/\d+/([a-z0-9-]+)")
 _DB_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/?$")
 def api_key():
     """Return the configured CanLII API key, or '' if not set.
@@ -92,13 +111,21 @@ class Citator:
         return {"total": len(items), "items": items[:_MAX_LIST]}
     def case_report(self, case_url):
-        """Return a citation-graph report for a case, given its canlii.org URL."""
         if case_url in self._cache:
             return self._cache[case_url]
         match = _CASE_URL.search(case_url)
         if not match:
-            return {"error": "Provide a full canlii.org case URL, e.g. "
-                    "https://www.canlii.org/en/ca/scc/doc/2019/2019scc65/2019scc65.html"}
         self._ensure_dbmap()
         segment, case_id = match.group(1), match.group(2)
         db = self._dbmap.get(segment)

 _CASE_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/doc/\d+/([a-z0-9-]+)")
 _DB_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/?$")
+# A neutral citation, e.g. "2019 SCC 65", and the CanLII URL segment per court.
+_NEUTRAL = re.compile(r"\b(\d{4})\s+(SCC|FCA|FC)\s+(\d+)\b", re.IGNORECASE)
+_CANLII_SEG = {"scc": "scc", "fca": "fca", "fc": "fct"}
+def canlii_url_from_citation(text):
+    """Build a canlii.org case URL from a neutral citation, or '' if none found.
+    Works for Supreme Court, Federal Court of Appeal and Federal Court neutral
+    citations -- e.g. "2019 SCC 65" -> .../en/ca/scc/doc/2019/2019scc65/...
+    """
+    m = _NEUTRAL.search(text)
+    if not m:
+        return ""
+    year, court, num = m.group(1), m.group(2).lower(), m.group(3)
+    doc = f"{year}{court}{num}"
+    return (f"https://www.canlii.org/en/ca/{_CANLII_SEG[court]}/doc/"
+            f"{year}/{doc}/{doc}.html")
 def api_key():
     """Return the configured CanLII API key, or '' if not set.
         return {"total": len(items), "items": items[:_MAX_LIST]}
     def case_report(self, case_url):
+        """Return a citation-graph report for a case.
+        Accepts a full canlii.org case URL, or a neutral citation (e.g.
+        "2019 SCC 65") for a Supreme Court / Federal Court of Appeal / Federal
+        Court decision.
+        """
+        if not _CASE_URL.search(case_url):
+            case_url = canlii_url_from_citation(case_url) or case_url
         if case_url in self._cache:
             return self._cache[case_url]
         match = _CASE_URL.search(case_url)
         if not match:
+            return {"error": "Provide a full canlii.org case URL, or a neutral "
+                    "citation such as '2019 SCC 65' (Supreme Court, Federal "
+                    "Court of Appeal, or Federal Court)."}
         self._ensure_dbmap()
         segment, case_id = match.group(1), match.group(2)
         db = self._dbmap.get(segment)

canlex/server.py CHANGED Viewed

@@ -38,7 +38,9 @@ GROUNDING_NOTE = (
     "instruments for a bargaining unit; court decisions interpret and apply the "
     "law and are binding precedent depending on the court and jurisdiction -- "
     "name the deciding court and the date, and do not assume a decision is still "
-    "good law if it may have been overtaken. State the "
     "'current to', modified, or in-force date when stating the law. If the material "
     "below does not fully resolve the question -- including where it turns on case "
     "law or facts not present here -- say so explicitly. This is legal information, "
@@ -76,9 +78,15 @@ def _format_section(c: dict) -> str:
                      "agreements; binding for the matters it covers._")
         lines.append(f"(effective {c['current_to'] or 'n/a'})")
     elif doc_type == "caselaw":
-        lines.append("_Court decision — binding precedent depending on the court "
-                     "and jurisdiction; confirm it has not been overturned on "
-                     "appeal or overtaken by later authority._")
         lines.append(f"(decided {c['current_to'] or 'n/a'})")
         if c["heading"]:
             lines.append(f"Subject: {c['heading']}")
@@ -326,10 +334,12 @@ class CaseInput(BaseModel):
     case_url: str = Field(
         ...,
-        description="A full canlii.org case URL, e.g. "
-        "'https://www.canlii.org/en/ca/scc/doc/2019/2019scc65/2019scc65.html'. "
-        "Find it by web search if you only have the case name.",
-        min_length=10, max_length=400,
     )
@@ -386,19 +396,22 @@ def _format_case(report: dict) -> str:
                        "destructiveHint": False, "idempotentHint": True,
                        "openWorldHint": True})
 def canlex_case(params: CaseInput) -> str:
-    """Look up a Canadian court case on CanLII and return its citation graph.
-    Given a case's full canlii.org URL, returns the case's metadata plus its
-    citator: the cases it cites, the cases that cite it (its treatment and how
-    leading it is), and the legislation it cites -- live from the CanLII API.
-    The CanLII API has no case search, so the case's full canlii.org URL must be
-    supplied (find it by web search if you only have the case name). This returns
     metadata and the citation graph only, NOT the judgment text -- follow the
     CanLII link for that. A call takes ~15-20 seconds (the API is rate-limited).
     Args:
-        params (CaseInput): contains case_url -- a full canlii.org case URL.
     Returns:
         str: Markdown -- the case's title, neutral citation, date, docket and

     "instruments for a bargaining unit; court decisions interpret and apply the "
     "law and are binding precedent depending on the court and jurisdiction -- "
     "name the deciding court and the date, and do not assume a decision is still "
+    "good law if it may have been overtaken (the canlex_case tool checks a "
+    "decision's later treatment on CanLII -- give it the neutral citation). "
+    "State the "
     "'current to', modified, or in-force date when stating the law. If the material "
     "below does not fully resolve the question -- including where it turns on case "
     "law or facts not present here -- say so explicitly. This is legal information, "
                      "agreements; binding for the matters it covers._")
         lines.append(f"(effective {c['current_to'] or 'n/a'})")
     elif doc_type == "caselaw":
+        if "Immigration and Refugee Board" in c["part"]:
+            lines.append("_Immigration and Refugee Board jurisprudential guide "
+                         "— IRB members apply its reasoning to similar cases or "
+                         "explain why not; persuasive, and subject to revocation "
+                         "or to review by the Federal Court._")
+        else:
+            lines.append("_Court decision — binding precedent depending on the "
+                         "court and jurisdiction; confirm it has not been "
+                         "overturned on appeal or overtaken by later authority._")
         lines.append(f"(decided {c['current_to'] or 'n/a'})")
         if c["heading"]:
             lines.append(f"Subject: {c['heading']}")
     case_url: str = Field(
         ...,
+        description="A Canadian case, given either as a full canlii.org URL or "
+        "-- for a Supreme Court, Federal Court of Appeal or Federal Court "
+        "decision -- its neutral citation (e.g. '2019 SCC 65' or '2016 FCA 93'). "
+        "For other courts, supply the canlii.org URL; find it by web search if "
+        "you only have the case name.",
+        min_length=8, max_length=400,
     )
                        "destructiveHint": False, "idempotentHint": True,
                        "openWorldHint": True})
 def canlex_case(params: CaseInput) -> str:
+    """Look up a Canadian case on CanLII and return its citation graph.
+    Returns the case's metadata plus its citator: the cases it cites, the cases
+    that cite it (its treatment and how leading it is), and the legislation it
+    cites -- live from the CanLII API. Use it to gauge whether a decision is
+    still good law -- how heavily and how recently it has been cited.
+    Supply either a canlii.org URL or, for a Supreme Court / Federal Court of
+    Appeal / Federal Court decision, its neutral citation (e.g. '2019 SCC 65') --
+    the citation a canlex_search_legislation result already shows. This returns
     metadata and the citation graph only, NOT the judgment text -- follow the
     CanLII link for that. A call takes ~15-20 seconds (the API is rate-limited).
     Args:
+        params (CaseInput): contains case_url -- a canlii.org URL or a neutral
+            citation.
     Returns:
         str: Markdown -- the case's title, neutral citation, date, docket and

data/processed/caselaw.json CHANGED Viewed

The diff for this file is too large to render. See raw diff