Spaces:

Beemer0
/

CanLex

Running

File size: 6,497 Bytes

"""CanLII case citator -- live lookup of a Canadian case's citation graph.

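Uses the CanLII API. The key is read from the CANLII_API_KEY environment
variable, falling back to canlii_key.txt. The API has no name/topic search, so a
case is identified by its full canlii.org URL or by a neutral citation such as
"2019 SCC 65" (Supreme Court, Federal Court of Appeal, or Federal Court only).
Responses are cached on disk and calls are throttled, because the API
rate-limits aggressively.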
"""
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request

from .config import ROOT, DATA_DIR

# Base URL of the CanLII REST API; request paths and the api_key query
# parameter are appended to it by Citator._get().
API = "https://api.canlii.org/v1"
# Local key file, used by api_key() when CANLII_API_KEY is not set.
KEY_FILE = ROOT / "canlii_key.txt"
# Template text of an unfilled key; api_key() treats it as "no key configured".
_PLACEHOLDER = "PASTE-YOUR-CANLII-API-KEY-ON-THIS-LINE"
# On-disk JSON stores: court URL segment -> databaseId, and case URL -> report.
_DBMAP_FILE = DATA_DIR / "citator_dbmap.json"
_CACHE_FILE = DATA_DIR / "citator_cache.json"
_THROTTLE = 3.0     # seconds between CanLII API calls (the API rate-limits hard)
_MAX_LIST = 20      # items shown per citator list (lists can run to thousands)
# Case URL on canlii.org: group 1 is the court segment, group 2 the case id.
# The pattern is lower-case only and is applied with .search(), so it may match
# anywhere inside a longer string.
_CASE_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/doc/\d+/([a-z0-9-]+)")
# Database (court) URL: group 1 is the trailing court segment, with an optional
# final slash.
_DB_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/?$")

# Neutral citations (e.g. "2019 SCC 65") for the courts this module can turn
# into a URL, and the path segment canlii.org uses for each court. Note that the
# Federal Court ("FC") lives under the "fct" segment.
_NEUTRAL = re.compile(r"\b(\d{4})\s+(SCC|FCA|FC)\s+(\d+)\b", re.IGNORECASE)
_CANLII_SEG = {"scc": "scc", "fca": "fca", "fc": "fct"}


def canlii_url_from_citation(text):
    """Return the canlii.org URL for the first neutral citation in *text*.

    Only Supreme Court (SCC), Federal Court of Appeal (FCA) and Federal Court
    (FC) citations are recognised, in any letter case and anywhere inside
    *text*. For example "2019 SCC 65" becomes
    https://www.canlii.org/en/ca/scc/doc/2019/2019scc65/2019scc65.html.

    Returns '' when *text* contains no such citation.
    """
    hit = _NEUTRAL.search(text)
    if hit is None:
        return ""
    year, court, number = hit.groups()
    court = court.lower()
    # CanLII's document id is the citation squeezed together, e.g. "2019scc65".
    doc_id = f"{year}{court}{number}"
    segment = _CANLII_SEG[court]
    return (
        f"https://www.canlii.org/en/ca/{segment}/doc/{year}/{doc_id}/{doc_id}.html"
    )


def api_key():
    """Look up the CanLII API key; return '' when none is configured.

    Sources are tried in order:

    1. the CANLII_API_KEY environment variable (how the remote deployment
       injects the key, as a secret, instead of shipping a file);
    2. the canlii_key.txt file, for local runs.

    In both cases surrounding whitespace is stripped, and an empty value or the
    untouched placeholder text counts as "not set".
    """
    unset = ("", _PLACEHOLDER)

    from_env = os.environ.get("CANLII_API_KEY", "").strip()
    if from_env not in unset:
        return from_env

    if not KEY_FILE.exists():
        return ""
    from_file = KEY_FILE.read_text(encoding="utf-8").strip()
    if from_file in unset:
        return ""
    return from_file


def _load_json(path):
    """Read a JSON object from *path*, falling back to an empty dict.

    Args:
        path: a ``pathlib.Path``-like object exposing ``read_text``.

    Returns:
        The decoded dict. ``{}`` is returned when the file is missing or
        unreadable, is not valid UTF-8/JSON, or holds JSON that is not an
        object (list, string, number, null) -- callers index the result as a
        mapping, so anything else would only fail later and less clearly.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # OSError: missing/unreadable file. ValueError covers both
        # UnicodeDecodeError and json.JSONDecodeError. RecursionError: absurdly
        # nested input. Loading is best-effort, so all of these mean "start
        # empty"; genuine programming errors are no longer swallowed.
        return {}
    return data if isinstance(data, dict) else {}


class Citator:
    """Live CanLII citator with on-disk caching and rate-limit throttling.

    Two JSON files under DATA_DIR back the instance:

    * the database map (court URL segment -> CanLII databaseId), fetched once
      from the API and then reused;
    * the report cache (case URL -> report), extended after every successful
      lookup.

    NOTE: the API key travels as a query parameter, so the ``url`` attribute of
    any ``urllib.error.HTTPError`` raised here contains it -- avoid logging it.
    """

    def __init__(self):
        """Load the API key and both on-disk stores.

        Raises:
            RuntimeError: if no API key is configured.
        """
        self.key = api_key()
        if not self.key:
            raise RuntimeError(f"No CanLII API key -- put your key in {KEY_FILE}.")
        self._dbmap = _load_json(_DBMAP_FILE)   # URL court segment -> databaseId
        self._cache = _load_json(_CACHE_FILE)   # case URL -> report

    @staticmethod
    def _write_atomic(path, text):
        """Write *text* to *path* through a sibling temp file plus a rename.

        An interrupted write can then never leave a half-written JSON file
        behind (which would be read back as an empty store on the next start).
        The parent directory is created if needed.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        tmp = path.with_name(path.name + ".tmp")
        tmp.write_text(text, encoding="utf-8")
        tmp.replace(path)

    def _get(self, path):
        """Throttled GET against the CanLII API, retrying on HTTP 429.

        Every call sleeps _THROTTLE seconds before its first attempt and 15
        seconds before each of up to two retries.

        Args:
            path: API path relative to the base URL, with or without a query
                string.

        Returns:
            The decoded JSON response body.

        Raises:
            urllib.error.HTTPError: immediately for any status other than 429,
                or after the third consecutive 429 -- in that case the error's
                message is replaced by a rate-limit hint.
            Other urllib / JSON errors propagate unchanged.
        """
        sep = "&" if "?" in path else "?"
        url = f"{API}/{path}{sep}api_key={self.key}"
        rate_limited = None
        for attempt in range(3):
            time.sleep(_THROTTLE if attempt == 0 else 15.0)
            try:
                with urllib.request.urlopen(url, timeout=45) as resp:
                    return json.loads(resp.read().decode("utf-8"))
            except urllib.error.HTTPError as exc:
                if exc.code != 429:
                    raise
                rate_limited = exc
        # Only reached after three 429 responses. The exception type is kept
        # (callers may already handle HTTPError); only its message is made
        # actionable. Previously this hint sat on an unreachable line.
        rate_limited.msg = "CanLII API rate limit reached; retry shortly."
        raise rate_limited

    def _ensure_dbmap(self):
        """Populate the court-segment -> databaseId map if it is still empty.

        The map is fetched from the ``caseBrowse`` endpoint, kept on the
        instance and persisted to _DBMAP_FILE. Nothing happens when a non-empty
        map is already loaded.
        """
        if self._dbmap:
            return
        data = self._get("caseBrowse/en/")
        dbmap = {}
        # ``or []`` also tolerates an explicit JSON null for the list.
        for db in data.get("caseDatabases") or []:
            match = _DB_URL.search(db.get("url") or "")
            if match and db.get("databaseId"):
                dbmap[match.group(1)] = db["databaseId"]
        self._dbmap = dbmap
        self._write_atomic(_DBMAP_FILE, json.dumps(dbmap))

    def _citator(self, db, case_id, kind):
        """Fetch one citator list for a case.

        Args:
            db: CanLII databaseId of the court.
            case_id: CanLII case id within that database.
            kind: list name, used both as the endpoint suffix and as the
                response key ("citingCases", "citedCases", "citedLegislations").

        Returns:
            ``{"total": <length of the returned list>, "items": <first
            _MAX_LIST entries>}``.
        """
        data = self._get(f"caseCitator/en/{db}/{case_id}/{kind}")
        # A missing key and an explicit null both mean "no entries".
        items = data.get(kind) or []
        return {"total": len(items), "items": items[:_MAX_LIST]}

    def case_report(self, case_url):
        """Return a citation-graph report for a case.

        Accepts a full canlii.org case URL, or a neutral citation (e.g.
        "2019 SCC 65") for a Supreme Court / Federal Court of Appeal / Federal
        Court decision.

        Returns:
            A dict with the keys "meta", "citingCases", "citedCases" and
            "citedLegislations" on success (served from the cache when
            available), or ``{"error": <message>}`` when the input cannot be
            resolved to a known court and case. Error results are not cached.

        Raises:
            urllib.error.HTTPError and other network errors from the API calls.
        """
        # Pasted input often carries stray whitespace; without this the same
        # case would be cached (and fetched) once per variant.
        case_url = case_url.strip()
        if not _CASE_URL.search(case_url):
            case_url = canlii_url_from_citation(case_url) or case_url
        if case_url in self._cache:
            return self._cache[case_url]
        match = _CASE_URL.search(case_url)
        if not match:
            return {"error": "Provide a full canlii.org case URL, or a neutral "
                    "citation such as '2019 SCC 65' (Supreme Court, Federal "
                    "Court of Appeal, or Federal Court)."}
        self._ensure_dbmap()
        segment, case_id = match.group(1), match.group(2)
        db = self._dbmap.get(segment)
        if not db:
            return {"error": f"Unrecognized CanLII court segment '{segment}'."}
        # Four sequential, throttled API calls; any failure aborts the report
        # before anything is cached.
        report = {
            "meta": self._get(f"caseBrowse/en/{db}/{case_id}/"),
            "citingCases": self._citator(db, case_id, "citingCases"),
            "citedCases": self._citator(db, case_id, "citedCases"),
            "citedLegislations": self._citator(db, case_id, "citedLegislations"),
        }
        self._cache[case_url] = report
        self._write_atomic(_CACHE_FILE,
                           json.dumps(self._cache, ensure_ascii=False))
        return report


def main():
    """Command-line entry point: print a short citation summary for one case.

    Reads the case reference from the first command-line argument. Without an
    argument a usage line is printed; when the lookup yields an error report
    the error is printed instead. Otherwise the output is a header line, the
    three list totals and up to six cited pieces of legislation.
    """
    args = sys.argv[1:]
    if not args:
        print('usage: python -m canlex.citator "<canlii-case-url>"')
        return

    report = Citator().case_report(args[0])
    if "error" in report:
        print("ERROR:", report["error"])
        return

    meta = report["meta"]
    title = meta.get('title')
    citation = meta.get('citation')
    decided = meta.get('decisionDate')
    print(f"{title} -- {citation}  ({decided})")

    # Label text is padded by hand so the first two counts line up.
    summary_rows = (
        ("  cited by:  ", "citingCases"),
        ("  cites:     ", "citedCases"),
        ("  legislation cited: ", "citedLegislations"),
    )
    for label, kind in summary_rows:
        print(f"{label}{report[kind]['total']}")

    legislation = report["citedLegislations"]["items"]
    for entry in legislation[:6]:
        print(f"    - {entry.get('title')} ({entry.get('citation')})")


# Run the command-line interface only when this module is executed as a script
# (for example with ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()