Spaces:

Beemer0
/

CanLex

Running

App Files Files Community

Beemer committed 6 days ago

Commit

5527c63

1 Parent(s): 21626e7

Add Phase 1 case law: 20 leading Supreme Court of Canada decisions

Browse files

Files changed (3) hide show

canlex/caselaw.py +304 -0
canlex/server.py +46 -16
data/processed/caselaw.json +0 -0

canlex/caselaw.py ADDED Viewed

	@@ -0,0 +1,304 @@

+"""Ingest leading Supreme Court of Canada decisions as section-style chunks.
+Source: the SCC's official decisions database (decisions.scc-csc.ca, the Lexum
+"Norma" platform). A decision's text sits inside an iframe, so each item is
+fetched by appending ?iframe=true to its URL. This ingests a *curated* set of
+leading cases -- it is deliberately not a comprehensive scrape.
+    py -m canlex.caselaw
+"""
+import json
+import re
+import time
+import urllib.request
+from bs4 import BeautifulSoup
+from .config import PROCESSED_DIR, RAW_DIR
+# URL template for one decision page; {id} is the site's numeric item ID.
+ITEM_URL = "https://decisions.scc-csc.ca/scc-csc/scc-csc/en/item/{id}/index.do"
+# On-disk cache of fetched pages: _fetch stores one <id>.html file per decision.
+_RAW = RAW_DIR / "scc"
+# Destination of the chunk list that build() serialises as JSON.
+OUT = PROCESSED_DIR / "caselaw.json"
+# A normal browser User-Agent: the SCC site denylists a few crawler UAs, while
+# its robots.txt otherwise permits the decision pages. Politeness comes from the
+# throttle below and from caching every fetched page on disk.
+_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+       "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+_THROTTLE = 2.0       # seconds between live fetches
+_CHUNK_CHARS = 1800   # target characters per chunk
+# Marks the post-reasons apparatus (appended legislation, solicitors list),
+# which is not part of the judgment's reasons. Matched case-insensitively
+# against the start of a paragraph's text.
+_APPARATUS = re.compile(r"^\s*(APPENDIX\b|Solicitors?\s+for\b)", re.I)
+# Curated leading SCC cases on border / immigration / customs / Charter law.
+# 'id' is the verified decisions.scc-csc.ca item ID; 'short' and 'topic' are
+# curated. The case name, citation and date are parsed from the page itself.
+SCC_CASES = [
+    {"id": 18078, "short": "Vavilov",
+     "topic": "Standard of review on judicial review; the reasonableness "
+              "standard for administrative decisions"},
+    {"id": 20081, "short": "Mason",
+     "topic": "Inadmissibility under IRPA s. 34(1)(e) for acts of violence "
+              "endangering safety in Canada; reasonableness review"},
+    {"id": 16803, "short": "Tran",
+     "topic": "Serious criminality and inadmissibility under IRPA s. 36; the "
+              "meaning of a term of imprisonment and an offence punishable by"},
+    {"id": 15647, "short": "B010",
+     "topic": "Inadmissibility for people smuggling under IRPA s. 37(1)(b); "
+              "organized criminality"},
+    {"id": 15648, "short": "Appulonappa",
+     "topic": "The human smuggling offence in IRPA s. 117; constitutional "
+              "overbreadth and humanitarian aid to asylum seekers"},
+    {"id": 14419, "short": "Febles",
+     "topic": "Exclusion from refugee protection for a serious non-political "
+              "crime under Article 1F(b) of the Refugee Convention"},
+    {"id": 13184, "short": "Ezokola",
+     "topic": "Complicity and exclusion from refugee protection for "
+              "international crimes under Article 1F(a)"},
+    {"id": 15665, "short": "Kanthasamy",
+     "topic": "Humanitarian and compassionate relief under IRPA s. 25; the "
+              "best interests of a child"},
+    {"id": 13137, "short": "Agraira",
+     "topic": "Ministerial relief from inadmissibility on security grounds; "
+              "the national interest under IRPA"},
+    {"id": 6901, "short": "Khosa",
+     "topic": "Standard of review of immigration decisions; judicial review "
+              "of a removal order"},
+    {"id": 2345, "short": "Charkaoui",
+     "topic": "Security certificates and immigration detention; the Charter "
+              "and procedural fairness"},
+    {"id": 1937, "short": "Suresh",
+     "topic": "Deportation to a risk of torture; Charter s. 7 and removal on "
+              "security grounds"},
+    {"id": 17759, "short": "Chhina",
+     "topic": "Habeas corpus as a remedy for immigration detention; review of "
+              "lengthy detention"},
+    {"id": 1717, "short": "Baker",
+     "topic": "Procedural fairness in administrative decisions; humanitarian "
+              "and compassionate review; the duty to give reasons"},
+    {"id": 39, "short": "Singh",
+     "topic": "Charter s. 7 rights of refugee claimants; the right to an oral "
+              "hearing"},
+    {"id": 377, "short": "Simmons",
+     "topic": "Customs searches at the border; Charter s. 8 and the reasonable "
+              "expectation of privacy on entry to Canada"},
+    {"id": 1694, "short": "Monney",
+     "topic": "Border detention for a customs search; reasonable suspicion "
+              "and the Customs Act"},
+    {"id": 986, "short": "Dehghani",
+     "topic": "Charter rights at a port of entry; secondary examination and "
+              "the right to counsel"},
+    {"id": 1627, "short": "Pushpanathan",
+     "topic": "Exclusion from refugee protection under Article 1F(c) for acts "
+              "contrary to the purposes of the United Nations"},
+    {"id": 1023, "short": "Ward",
+     "topic": "The refugee definition; a particular social group; the "
+              "availability of state protection"},
+]
+def _fetch(item_id):
+    """Return a decision's iframe HTML, caching the raw page under data/raw."""
+    cache = _RAW / f"{item_id}.html"
+    if cache.exists():
+        return cache.read_text(encoding="utf-8")
+    url = ITEM_URL.format(id=item_id) + "?iframe=true"
+    req = urllib.request.Request(url, headers={"User-Agent": _UA})
+    time.sleep(_THROTTLE)
+    with urllib.request.urlopen(req, timeout=60) as resp:
+        text = resp.read().decode("utf-8", errors="replace")
+    _RAW.mkdir(parents=True, exist_ok=True)
+    cache.write_text(text, encoding="utf-8")
+    return text
+def _norm(text):
+    """Collapse all whitespace, including non-breaking spaces."""
+    return re.sub(r"\s+", " ", text.replace("\xa0", " ")).strip()
+def _metadata(soup):
+    """Return (case_name, {label: value}) from the decision's metadata block."""
+    box = soup.find("div", class_="metadata")
+    if not box:
+        return "", {}
+    title = box.find("h3", class_="title")
+    name = _norm(title.get_text()) if title else ""
+    fields = {}
+    for row in box.find_all("tr"):
+        label = row.find("td", class_="label")
+        value = row.find("td", class_="metadata")
+        if label and value:
+            fields[_norm(label.get_text()).lower()] = _norm(value.get_text())
+    return name, fields
+def _body(soup):
+    """Locate the container holding the judgment text."""
+    return (soup.find(id="document-content")
+            or soup.find("div", class_="documentcontent")
+            or soup.find("div", class_="WordSection1")
+            or soup.body or soup)
+def _paragraphs(soup):
+    """Return (is_numbered, [(label, text), ...]) for the judgment body.
+    Modern SCC judgments open each paragraph with a bracketed number "[N]".
+    They are detected by content -- a run of sequentially numbered <p> blocks --
+    so the parser does not depend on Word style names, which vary by era. Every
+    <p> between one numbered opener and the next belongs to that paragraph.
+    Older, unnumbered decisions fall back to taking every <p> in document order.
+    Args:
+        soup: the parsed decision page; the container to scan is chosen by
+            _body().
+    Returns:
+        A pair. The first item is True when at least five sequential paragraph
+        openers were found, in which case each label is the paragraph number
+        as a string. Otherwise it is False and each label is the 1-based
+        position of the <p> block as a string. Texts are whitespace-normalised
+        and empty ones are dropped.
+    """
+    # Footnote bodies (Word's "MsoFootnoteText" class) are left out entirely.
+    blocks = [p for p in _body(soup).find_all("p")
+              if "MsoFootnoteText" not in (p.get("class") or [])]
+    texts = [p.get_text() for p in blocks]
+    # Drop the post-reasons apparatus (appended legislation, solicitors list);
+    # it is not part of the reasons and would otherwise swell the last paragraph.
+    # Everything from the first matching block onward is discarded.
+    for i, raw in enumerate(texts):
+        if _APPARATUS.match(raw):
+            texts = texts[:i]
+            break
+    # A paragraph opens with its number: "[12]" (most decisions) or a bare "12"
+    # followed by wide tab spacing (pre-2009 decisions). The brackets are
+    # self-identifying; a bare number must have 2+ trailing spaces, which
+    # rejects quoted enumerations ("2. ..."). The sequential check rejects
+    # stray bracketed years like "[1998]".
+    openers = {}          # block index -> paragraph number
+    expected = 1
+    for i, raw in enumerate(texts):
+        match = (re.match(r"\s*\[\s*(\d+)\s*\]", raw)
+                 or re.match(r"\s*(\d+)\s{2,}\S", raw))
+        if match:
+            n = int(match.group(1))
+            if expected <= n <= expected + 2:    # sequential, small-gap tolerant
+                openers[i] = n
+                expected = n + 1
+    if len(openers) < 5:                         # unnumbered: take every <p>
+        # Labels are assigned before filtering, so dropping a near-empty block
+        # (normalised length <= 1) leaves a gap in the label sequence.
+        paras = [(str(j), _norm(t)) for j, t in enumerate(texts, start=1)]
+        return False, [(n, t) for n, t in paras if len(t) > 1]
+    # Numbered decision: gather each opener plus its continuation blocks.
+    paras, num, buf = [], None, []
+    for i, raw in enumerate(texts):
+        if i in openers:
+            if num is not None:
+                # Flush the paragraph collected so far before starting the next.
+                paras.append((str(num), _norm(" ".join(buf))))
+            num = openers[i]
+            # Strip the leading number, bracketed or bare, from the opener text.
+            buf = [re.sub(r"^\s*\[?\s*\d+\s*\]?\s*", "", raw)]
+        elif num is not None:
+            # Continuation of the current paragraph. Blocks that come before
+            # the first opener never reach this branch and are dropped.
+            buf.append(raw)
+    if num is not None:
+        paras.append((str(num), _norm(" ".join(buf))))
+    return True, [(n, t) for n, t in paras if t]
+def _split_text(text, limit):
+    """Split text longer than `limit` into pieces, breaking on a sentence or
+    word boundary so no single chunk blows the retrieval/reranker budget."""
+    if len(text) <= limit:
+        return [text]
+    pieces, start = [], 0
+    while start < len(text):
+        if len(text) - start <= limit:
+            pieces.append(text[start:])
+            break
+        window = text[start:start + limit]
+        cut = window.rfind(". ")
+        cut = cut + 1 if cut > limit // 2 else window.rfind(" ")
+        if cut <= 0:
+            cut = limit
+        pieces.append(text[start:start + cut])
+        start += cut
+    return [p.strip() for p in pieces if p.strip()]
+def _chunk(paras):
+    """Group consecutive paragraphs into ~_CHUNK_CHARS-sized chunks, first
+    splitting any single paragraph that on its own exceeds the budget."""
+    units = []
+    for label, text in paras:
+        for piece in _split_text(text, _CHUNK_CHARS):
+            units.append((label, piece))
+    chunks, buf, size = [], [], 0
+    for label, text in units:
+        if buf and size + len(text) > _CHUNK_CHARS:
+            chunks.append(buf)
+            buf, size = [], 0
+        buf.append((label, text))
+        size += len(text)
+    if buf:
+        chunks.append(buf)
+    return chunks
+def _decision_chunks(case, soup):
+    """Build CanLex chunk dicts for one decision."""
+    name, fields = _metadata(soup)
+    name = name or case["short"]
+    cite = fields.get("neutral citation") or fields.get("report") or ""
+    report = fields.get("report", "")
+    date = fields.get("date", "")
+    citation = f"{name}, {cite}" if cite else name
+    item_url = ITEM_URL.format(id=case["id"])
+    modern, paras = _paragraphs(soup)
+    chunks = []
+    for i, group in enumerate(_chunk(paras), start=1):
+        if modern:
+            first, last = group[0][0], group[-1][0]
+            locator = (f"para {first}" if first == last
+                       else f"paras {first}–{last}")
+        else:
+            locator = f"excerpt {i}"
+        chunks.append({
+            "id": f"scc-{case['id']}-{i}",
+            "doc_type": "caselaw",
+            "act_code": cite or f"SCC item {case['id']}",
+            "act_short": case["short"],
+            "act_name": name,
+            "section": "",
+            "citation": citation,
+            "marginal_note": locator,
+            "heading": case["topic"],
+            "part": "Supreme Court of Canada",
+            "division": "",
+            "text": "\n\n".join(t for _, t in group),
+            "current_to": date,
+            "last_amended": "",
+            "history": report if report and report != cite else "",
+            "source_url": item_url,
+        })
+    return chunks, citation, len(paras)
+def build():
+    """Fetch, parse and chunk every curated SCC decision into caselaw.json."""
+    all_chunks = []
+    for case in SCC_CASES:
+        try:
+            soup = BeautifulSoup(_fetch(case["id"]), "html.parser")
+        except Exception as exc:
+            print(f"  !! {case['short']}: fetch failed -- "
+                  f"{type(exc).__name__}: {exc}")
+            continue
+        chunks, citation, n_paras = _decision_chunks(case, soup)
+        if not chunks:
+            print(f"  !! {case['short']} (item {case['id']}): "
+                  f"0 chunks -- check parsing")
+            continue
+        all_chunks.extend(chunks)
+        print(f"  {case['short']:13s} {n_paras:4d} paras -> "
+              f"{len(chunks):3d} chunks   {citation}")
+    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
+    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
+                   encoding="utf-8")
+    print(f"\n{len(all_chunks)} case-law chunks from "
+          f"{len(SCC_CASES)} SCC decisions -> {OUT}")
+if __name__ == "__main__":
+    build()

canlex/server.py CHANGED Viewed

@@ -35,7 +35,10 @@ GROUNDING_NOTE = (
     "CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
     "and a court may disagree with them; collective agreements and the National "
     "Joint Council directives they incorporate are binding employment-terms "
-    "instruments for a bargaining unit. State the "
     "'current to', modified, or in-force date when stating the law. If the material "
     "below does not fully resolve the question -- including where it turns on case "
     "law or facts not present here -- say so explicitly. This is legal information, "
@@ -72,6 +75,13 @@ def _format_section(c: dict) -> str:
         lines.append("_National Joint Council directive — forms part of collective "
                      "agreements; binding for the matters it covers._")
         lines.append(f"(effective {c['current_to'] or 'n/a'})")
     else:
         meta = [f"current to {c['current_to'] or 'n/a'}"]
         if c["last_amended"]:
@@ -81,7 +91,12 @@ def _format_section(c: dict) -> str:
     lines.append(c["text"])
     lines.append("")
     if c["history"]:
-        lines.append(f"Amendment history: {c['history']}")
     lines.append(f"Source: {c['source_url']}")
     return "\n".join(lines)
@@ -110,7 +125,8 @@ class SearchInput(BaseModel):
         default=None,
         description="Optional filter by source type: 'legislation' (Acts and "
         "regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
-        "agreements), or 'directive' (NJC directives). Omit to search all.",
     )
@@ -127,25 +143,28 @@ class GetSectionInput(BaseModel):
 @mcp.tool(name="canlex_search_legislation",
           annotations={"title": "Search Canadian Legislation", **_READONLY})
 def canlex_search_legislation(params: SearchInput) -> str:
-    """Search Canadian federal law, CBSA D-Memoranda, agreements, and NJC directives.
-    The CanLex corpus has four kinds of source: 31 federal Acts and regulations
     (immigration, customs, criminal, drugs, food/health, labour, privacy and more);
     CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
     how it applies customs and border law); Treasury Board collective agreements
-    (currently the FB / Border Services group); and National Joint Council directives
-    (travel, relocation, isolated posts and more). Use this for ANY question about
-    that material. It ranks results by relevance and returns their full text so the
-    answer can cite the actual wording; an explicit section reference (e.g. "section
-    34") is always surfaced. Each result is marked with its source type.
     Args:
         params (SearchInput): Validated input containing:
             - query (str): Legal question or keywords to search for.
             - top_k (int): How many sections to return, 1-20 (default 6).
             - act (Optional[str]): Restrict to one Act by short name/code, or omit for all.
-            - doc_type (Optional[str]): 'legislation', 'memorandum', 'agreement', or
-              'directive' to restrict to one source type; omit to search all.
     Returns:
         str: Markdown with answering instructions followed by the matching sections.
@@ -213,7 +232,7 @@ def canlex_get_section(params: GetSectionInput) -> str:
           annotations={"title": "List Loaded Legislation", **_READONLY})
 def canlex_list_acts() -> str:
     """List what the CanLex corpus contains -- Acts and regulations, CBSA
-    D-Memoranda, collective agreements, and NJC directives.
     Use this to learn the scope and currency of the corpus before searching, or to
     report it to the user.
@@ -225,6 +244,7 @@ def canlex_list_acts() -> str:
     acts: dict[str, dict] = {}
     agreements: dict[str, dict] = {}
     directives: dict[str, dict] = {}
     memo_numbers: set[str] = set()
     memo_chunks = 0
     memo_date = ""
@@ -245,6 +265,11 @@ def canlex_list_acts() -> str:
                 "short": c["act_short"], "current_to": c["current_to"], "count": 0,
             })
             entry["count"] += 1
         else:
             entry = acts.setdefault(c["act_code"], {
                 "short": c["act_short"], "name": c["act_name"],
@@ -271,10 +296,15 @@ def canlex_list_acts() -> str:
         for a in sorted(directives.values(), key=lambda x: x["short"]):
             lines.append(f"- **{a['short']}**: {a['count']} sections, "
                          f"effective {a['current_to'] or 'n/a'}")
     lines += ["", "Search with canlex_search_legislation; filter by doc_type "
-              "(legislation / memorandum / agreement / directive). Fetch a known "
-              "provision with canlex_get_section, or a case's citations with "
-              "canlex_case."]
     return "\n".join(lines)

     "CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
     "and a court may disagree with them; collective agreements and the National "
     "Joint Council directives they incorporate are binding employment-terms "
+    "instruments for a bargaining unit; court decisions interpret and apply the "
+    "law and are binding precedent depending on the court and jurisdiction -- "
+    "name the deciding court and the date, and do not assume a decision is still "
+    "good law if it may have been overtaken. State the "
     "'current to', modified, or in-force date when stating the law. If the material "
     "below does not fully resolve the question -- including where it turns on case "
     "law or facts not present here -- say so explicitly. This is legal information, "
         lines.append("_National Joint Council directive — forms part of collective "
                      "agreements; binding for the matters it covers._")
         lines.append(f"(effective {c['current_to'] or 'n/a'})")
+    elif doc_type == "caselaw":
+        lines.append("_Court decision — binding precedent depending on the court "
+                     "and jurisdiction; confirm it has not been overturned on "
+                     "appeal or overtaken by later authority._")
+        lines.append(f"(decided {c['current_to'] or 'n/a'})")
+        if c["heading"]:
+            lines.append(f"Subject: {c['heading']}")
     else:
         meta = [f"current to {c['current_to'] or 'n/a'}"]
         if c["last_amended"]:
     lines.append(c["text"])
     lines.append("")
     if c["history"]:
+        if doc_type == "caselaw":
+            lines.append(f"Also reported: {c['history']}")
+        elif doc_type == "legislation":
+            lines.append(f"Amendment history: {c['history']}")
+        else:
+            lines.append(f"History: {c['history']}")
     lines.append(f"Source: {c['source_url']}")
     return "\n".join(lines)
         default=None,
         description="Optional filter by source type: 'legislation' (Acts and "
         "regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
+        "agreements), 'directive' (NJC directives), or 'caselaw' (Supreme Court "
+        "of Canada decisions). Omit to search all.",
     )
 @mcp.tool(name="canlex_search_legislation",
           annotations={"title": "Search Canadian Legislation", **_READONLY})
 def canlex_search_legislation(params: SearchInput) -> str:
+    """Search Canadian federal law, CBSA D-Memoranda, agreements, NJC directives,
+    and leading Supreme Court of Canada cases.
+    The CanLex corpus has five kinds of source: 31 federal Acts and regulations
     (immigration, customs, criminal, drugs, food/health, labour, privacy and more);
     CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
     how it applies customs and border law); Treasury Board collective agreements
+    (currently the FB / Border Services group); National Joint Council directives
+    (travel, relocation, isolated posts and more); and leading Supreme Court of
+    Canada decisions on immigration, customs and Charter-at-the-border law. Use this
+    for ANY question about that material. It ranks results by relevance and returns
+    their full text so the answer can cite the actual wording; an explicit section
+    reference (e.g. "section 34") is always surfaced. Each result is marked with its
+    source type.
     Args:
         params (SearchInput): Validated input containing:
             - query (str): Legal question or keywords to search for.
             - top_k (int): How many sections to return, 1-20 (default 6).
             - act (Optional[str]): Restrict to one Act by short name/code, or omit for all.
+            - doc_type (Optional[str]): 'legislation', 'memorandum', 'agreement',
+              'directive', or 'caselaw' to restrict to one source type; omit for all.
     Returns:
         str: Markdown with answering instructions followed by the matching sections.
           annotations={"title": "List Loaded Legislation", **_READONLY})
 def canlex_list_acts() -> str:
     """List what the CanLex corpus contains -- Acts and regulations, CBSA
+    D-Memoranda, collective agreements, NJC directives, and leading cases.
     Use this to learn the scope and currency of the corpus before searching, or to
     report it to the user.
     acts: dict[str, dict] = {}
     agreements: dict[str, dict] = {}
     directives: dict[str, dict] = {}
+    cases: dict[str, dict] = {}
     memo_numbers: set[str] = set()
     memo_chunks = 0
     memo_date = ""
                 "short": c["act_short"], "current_to": c["current_to"], "count": 0,
             })
             entry["count"] += 1
+        elif doc_type == "caselaw":
+            entry = cases.setdefault(c["act_code"], {
+                "name": c["act_name"], "decided": c["current_to"], "count": 0,
+            })
+            entry["count"] += 1
         else:
             entry = acts.setdefault(c["act_code"], {
                 "short": c["act_short"], "name": c["act_name"],
         for a in sorted(directives.values(), key=lambda x: x["short"]):
             lines.append(f"- **{a['short']}**: {a['count']} sections, "
                          f"effective {a['current_to'] or 'n/a'}")
+    if cases:
+        lines += ["", "## Case law (Supreme Court of Canada)"]
+        for cite, a in sorted(cases.items(), key=lambda kv: kv[1]["decided"]):
+            lines.append(f"- **{a['name']}**, {cite}: {a['count']} excerpts, "
+                         f"decided {a['decided'] or 'n/a'}")
     lines += ["", "Search with canlex_search_legislation; filter by doc_type "
+              "(legislation / memorandum / agreement / directive / caselaw). Fetch "
+              "a known provision with canlex_get_section, or a case's citations "
+              "with canlex_case."]
     return "\n".join(lines)

data/processed/caselaw.json ADDED Viewed

The diff for this file is too large to render. See raw diff