CanLex / canlex /citator.py
Beemer
Add Phase 3: IRB jurisprudential guides and citation-based citator lookup
b8c217b
"""CanLII case citator -- live lookup of a Canadian case's citation graph.
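Uses the CanLII API (key from the CANLII_API_KEY environment variable, or from
canlii_key.txt). The API has no name/topic search, so a case is identified by
its full canlii.org URL, or by a neutral citation (SCC, FCA or FC) from which
that URL is built. Responses are cached on disk and calls are throttled,
because the API rate-limits aggressively.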
"""
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request
from .config import ROOT, DATA_DIR
# Base URL of the CanLII REST API; request paths are appended to it.
API = "https://api.canlii.org/v1"
# Local fallback for the API key when CANLII_API_KEY is not set in the environment.
KEY_FILE = ROOT / "canlii_key.txt"
# Placeholder value; api_key() treats it the same as "no key configured".
_PLACEHOLDER = "PASTE-YOUR-CANLII-API-KEY-ON-THIS-LINE"
# On-disk caches: court URL segment -> databaseId, and case URL -> report.
_DBMAP_FILE = DATA_DIR / "citator_dbmap.json"
_CACHE_FILE = DATA_DIR / "citator_cache.json"
_THROTTLE = 3.0 # seconds between CanLII API calls (the API rate-limits hard)
_MAX_LIST = 20 # items shown per citator list (lists can run to thousands)
# Case page URL: group 1 is the court segment, group 2 is the case id.
_CASE_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/doc/\d+/([a-z0-9-]+)")
# Court database URL (nothing after the court segment): group 1 is the court segment.
_DB_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/?$")
# A neutral citation, e.g. "2019 SCC 65", and the CanLII URL segment per court.
# Groups of _NEUTRAL: (year, court abbreviation, decision number); matching is
# case-insensitive, and _CANLII_SEG is keyed by the lower-cased abbreviation.
_NEUTRAL = re.compile(r"\b(\d{4})\s+(SCC|FCA|FC)\s+(\d+)\b", re.IGNORECASE)
_CANLII_SEG = {"scc": "scc", "fca": "fca", "fc": "fct"}
def canlii_url_from_citation(text):
    """Derive a canlii.org case URL from the first neutral citation in *text*.

    Only Supreme Court (SCC), Federal Court of Appeal (FCA) and Federal Court
    (FC) neutral citations are recognised, e.g. "2019 SCC 65" becomes
    .../en/ca/scc/doc/2019/2019scc65/2019scc65.html.

    Returns:
        The English-language case URL, or '' when *text* contains no
        recognised neutral citation.
    """
    found = _NEUTRAL.search(text)
    if found is None:
        return ""
    year, court, number = found.groups()
    court = court.lower()
    # The document slug uses the citation's own court abbreviation, while the
    # path segment is CanLII's name for that court (looked up separately).
    slug = f"{year}{court}{number}"
    segment = _CANLII_SEG[court]
    return f"https://www.canlii.org/en/ca/{segment}/doc/{year}/{slug}/{slug}.html"
def api_key():
    """Look up the CanLII API key; return '' when none is configured.

    Lookup order:
      1. the CANLII_API_KEY environment variable (the remote deployment
         injects the key as a secret instead of shipping the key file);
      2. the canlii_key.txt file, for local runs.

    A blank value, or one still equal to the placeholder text, counts as
    "not configured" in both places.
    """
    unset = ("", _PLACEHOLDER)

    from_env = os.environ.get("CANLII_API_KEY", "").strip()
    if from_env not in unset:
        return from_env

    if not KEY_FILE.exists():
        return ""
    from_file = KEY_FILE.read_text(encoding="utf-8").strip()
    if from_file in unset:
        return ""
    return from_file
def _load_json(path):
    """Best-effort load of a JSON object from *path*.

    Args:
        path: a pathlib.Path-like object exposing ``read_text``.

    Returns:
        The decoded dict, or an empty dict when the file is missing,
        unreadable, not valid JSON, or holds something other than a JSON
        object. Callers use the result as a mapping, so a stray list or
        ``null`` on disk must not leak through.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}
    return data if isinstance(data, dict) else {}
class Citator:
    """Live CanLII citator with on-disk caching and rate-limit throttling.

    Two pieces of state are loaded from disk at construction and written back
    when they change:
      * ``_dbmap``: URL court segment -> CanLII databaseId
      * ``_cache``: case URL -> previously built report
    """

    def __init__(self):
        """Read the API key and load the on-disk caches.

        Raises:
            RuntimeError: if no CanLII API key is configured.
        """
        self.key = api_key()
        if not self.key:
            raise RuntimeError(f"No CanLII API key -- put your key in {KEY_FILE}.")
        self._dbmap = _load_json(_DBMAP_FILE)  # URL court segment -> databaseId
        self._cache = _load_json(_CACHE_FILE)  # case URL -> report

    def _get(self, path):
        """Throttled GET against the CanLII API, retrying on HTTP 429.

        Sleeps ``_THROTTLE`` seconds before the first attempt and 15 seconds
        before each of up to two retries. The key is sent as the ``api_key``
        query parameter.

        Args:
            path: API path relative to ``API``; may already contain a query.

        Returns:
            The decoded JSON response body.

        Raises:
            RuntimeError: if all three attempts were answered with HTTP 429.
            urllib.error.HTTPError: for any other HTTP error status.
        """
        # The URL does not change between attempts, so build it once.
        sep = "&" if "?" in path else "?"
        url = f"{API}/{path}{sep}api_key={self.key}"
        rate_limited = None
        for attempt in range(3):
            time.sleep(_THROTTLE if attempt == 0 else 15.0)
            try:
                with urllib.request.urlopen(url, timeout=45) as resp:
                    return json.loads(resp.read().decode("utf-8"))
            except urllib.error.HTTPError as exc:
                if exc.code != 429:
                    raise
                # Keep a reference: ``exc`` is unbound once the handler exits.
                rate_limited = exc
        # Reached only when every attempt was rate-limited. Previously the
        # last 429 escaped as a raw HTTPError and this message was dead code.
        raise RuntimeError(
            "CanLII API rate limit reached; retry shortly."
        ) from rate_limited

    def _ensure_dbmap(self):
        """Populate the court-segment -> databaseId map if it is empty.

        Fetches the case database list once, keeps every entry that has both
        a recognisable URL and a databaseId, and persists the map to disk.
        """
        if self._dbmap:
            return
        data = self._get("caseBrowse/en/")
        dbmap = {}
        for db in data.get("caseDatabases", []):
            match = _DB_URL.search(db.get("url") or "")
            if match and db.get("databaseId"):
                dbmap[match.group(1)] = db["databaseId"]
        self._dbmap = dbmap
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        _DBMAP_FILE.write_text(json.dumps(dbmap), encoding="utf-8")

    def _citator(self, db, case_id, kind):
        """Fetch one citator list for a case.

        Args:
            db: CanLII databaseId.
            case_id: CanLII case id.
            kind: list name, used both as the endpoint suffix and as the
                response key (e.g. "citingCases").

        Returns:
            ``{"total": <number of items returned>, "items": <first _MAX_LIST>}``.
        """
        data = self._get(f"caseCitator/en/{db}/{case_id}/{kind}")
        # ``or []`` also covers a key that is present but null.
        items = data.get(kind) or []
        return {"total": len(items), "items": items[:_MAX_LIST]}

    def case_report(self, case_url):
        """Return a citation-graph report for a case.

        Accepts a full canlii.org case URL, or a neutral citation (e.g.
        "2019 SCC 65") for a Supreme Court / Federal Court of Appeal / Federal
        Court decision.

        Returns:
            A dict with keys "meta", "citingCases", "citedCases" and
            "citedLegislations", or ``{"error": <message>}`` when the input
            cannot be resolved to a known court database. Successful reports
            are cached on disk under the resolved case URL; errors are not.
        """
        if not _CASE_URL.search(case_url):
            # Not a case URL: try to read it as a neutral citation instead.
            case_url = canlii_url_from_citation(case_url) or case_url
        if case_url in self._cache:
            return self._cache[case_url]
        match = _CASE_URL.search(case_url)
        if not match:
            return {"error": "Provide a full canlii.org case URL, or a neutral "
                             "citation such as '2019 SCC 65' (Supreme Court, Federal "
                             "Court of Appeal, or Federal Court)."}
        self._ensure_dbmap()
        segment, case_id = match.group(1), match.group(2)
        db = self._dbmap.get(segment)
        if not db:
            return {"error": f"Unrecognized CanLII court segment '{segment}'."}
        report = {
            "meta": self._get(f"caseBrowse/en/{db}/{case_id}/"),
            "citingCases": self._citator(db, case_id, "citingCases"),
            "citedCases": self._citator(db, case_id, "citedCases"),
            "citedLegislations": self._citator(db, case_id, "citedLegislations"),
        }
        self._cache[case_url] = report
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        _CACHE_FILE.write_text(json.dumps(self._cache, ensure_ascii=False),
                               encoding="utf-8")
        return report
def main():
    """Command-line entry point: print a short citator summary for one case.

    Takes the case reference from the first command-line argument, prints a
    usage line when it is missing, and prints ``ERROR: ...`` when the report
    comes back with an "error" key.
    """
    args = sys.argv[1:]
    if not args:
        print('usage: python -m canlex.citator "<canlii-case-url>"')
        return

    report = Citator().case_report(args[0])
    if "error" in report:
        print("ERROR:", report["error"])
        return

    meta = report["meta"]
    print(f"{meta.get('title')} -- {meta.get('citation')} ({meta.get('decisionDate')})")

    # One count line per citator list, in a fixed order.
    summary_rows = (
        ("cited by", "citingCases"),
        ("cites", "citedCases"),
        ("legislation cited", "citedLegislations"),
    )
    for label, list_name in summary_rows:
        print(f" {label}: {report[list_name]['total']}")

    # Show at most the first six pieces of cited legislation.
    legislation = report["citedLegislations"]["items"]
    for entry in legislation[:6]:
        print(f" - {entry.get('title')} ({entry.get('citation')})")


if __name__ == "__main__":
    main()