Spaces:

Beemer0
/

CanLex

Running

Beemer

Add Phase 3: IRB jurisprudential guides and citation-based citator lookup

b8c217b 7 days ago

6.5 kB

	"""CanLII case citator -- live lookup of a Canadian case's citation graph.

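	Uses the CanLII API. The key comes from the CANLII_API_KEY environment
	variable, falling back to canlii_key.txt. The API has no name/topic search,
	so a case is identified by its full canlii.org URL or, for Supreme Court,
	Federal Court of Appeal and Federal Court decisions, by a neutral citation
	such as "2019 SCC 65". Responses are cached on disk and calls are throttled,
	because the API rate-limits aggressively.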
	"""
	import json
	import os
	import re
	import sys
	import time
	import urllib.error
	import urllib.request

	from .config import ROOT, DATA_DIR

	# Base URL of the CanLII REST API; request paths are appended to it.
	API = "https://api.canlii.org/v1"
	# Local key file, read by api_key() when CANLII_API_KEY yields no usable key.
	KEY_FILE = ROOT / "canlii_key.txt"
	# Sentinel text meaning "no key configured"; api_key() treats it like an empty key.
	_PLACEHOLDER = "PASTE-YOUR-CANLII-API-KEY-ON-THIS-LINE"
	# On-disk JSON stores: court URL segment -> databaseId, and case URL -> report.
	_DBMAP_FILE = DATA_DIR / "citator_dbmap.json"
	_CACHE_FILE = DATA_DIR / "citator_cache.json"
	_THROTTLE = 3.0 # seconds between CanLII API calls (the API rate-limits hard)
	_MAX_LIST = 20 # items shown per citator list (lists can run to thousands)
	# Case URL: group 1 is the court segment, group 2 the case id, e.g.
	# ".../en/ca/scc/doc/2019/2019scc65/..." -> ("scc", "2019scc65").
	# Lower-case only: the pattern is compiled without re.IGNORECASE.
	_CASE_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/doc/\d+/([a-z0-9-]+)")
	# Database URL: group 1 is the trailing court segment (optional final slash).
	_DB_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/?$")

	# A neutral citation, e.g. "2019 SCC 65", and the CanLII URL segment per court.
	# The alternation must use a bare "|": an escaped "\|" matches a literal pipe
	# character, which makes the pattern reject every real citation.
	_NEUTRAL = re.compile(r"\b(\d{4})\s+(SCC|FCA|FC)\s+(\d+)\b", re.IGNORECASE)
	# Lower-cased court code -> court segment used in the canlii.org URL path.
	_CANLII_SEG = {"scc": "scc", "fca": "fca", "fc": "fct"}


	def canlii_url_from_citation(text):
	    """Build a canlii.org case URL from a neutral citation, or '' if none found.

	    Works for Supreme Court, Federal Court of Appeal and Federal Court neutral
	    citations -- e.g. "2019 SCC 65" -> .../en/ca/scc/doc/2019/2019scc65/...

	    Args:
	        text: Any string; the first neutral citation found in it is used.
	            The court code is matched case-insensitively.

	    Returns:
	        The English-language case URL, or '' when *text* contains no
	        recognized neutral citation.
	    """
	    m = _NEUTRAL.search(text)
	    if not m:
	        return ""
	    # Lower-case the court code: it is both the _CANLII_SEG key and part of
	    # the document id (e.g. "2019scc65").
	    year, court, num = m.group(1), m.group(2).lower(), m.group(3)
	    doc = f"{year}{court}{num}"
	    return (f"https://www.canlii.org/en/ca/{_CANLII_SEG[court]}/doc/"
	            f"{year}/{doc}/{doc}.html")


	def api_key():
	    """Look up the CanLII API key; return '' when none is configured.

	    Lookup order:
	      1. The CANLII_API_KEY environment variable (the remote deployment
	         injects the key as a secret instead of shipping the key file).
	      2. The contents of canlii_key.txt, for local runs.

	    Surrounding whitespace is stripped, and the unedited placeholder text
	    counts as "not set" in both places.
	    """
	    from_env = os.environ.get("CANLII_API_KEY", "").strip()
	    if from_env not in ("", _PLACEHOLDER):
	        return from_env
	    if KEY_FILE.exists():
	        from_file = KEY_FILE.read_text(encoding="utf-8").strip()
	        if from_file and from_file != _PLACEHOLDER:
	            return from_file
	    return ""


	def _load_json(path):
	    """Best-effort load of a JSON object from *path*.

	    Args:
	        path: A pathlib.Path-like object exposing read_text().

	    Returns:
	        The decoded dict, or {} when the file is missing or unreadable, is
	        not valid UTF-8 / JSON, or holds a top-level value that is not a
	        JSON object. Callers use the result as a mapping, so a stray list
	        or null must not leak through.
	    """
	    try:
	        data = json.loads(path.read_text(encoding="utf-8"))
	    except (OSError, ValueError):
	        # OSError: missing or unreadable file. ValueError covers both
	        # json.JSONDecodeError and UnicodeDecodeError.
	        return {}
	    return data if isinstance(data, dict) else {}


	class Citator:
	    """Live CanLII citator with on-disk caching and rate-limit throttling.

	    Construction resolves the API key and loads two JSON stores from disk:
	    the court-segment -> databaseId map and the case-URL -> report cache.

	    Raises:
	        RuntimeError: from the constructor when no API key is configured.
	    """

	    def __init__(self):
	        self.key = api_key()
	        if not self.key:
	            raise RuntimeError(f"No CanLII API key -- put your key in {KEY_FILE}.")
	        self._dbmap = _load_json(_DBMAP_FILE)  # URL court segment -> databaseId
	        self._cache = _load_json(_CACHE_FILE)  # case URL -> report

	    def _get(self, path):
	        """Throttled GET against the CanLII API, retrying on HTTP 429.

	        Sleeps _THROTTLE seconds before the first attempt and 15 seconds
	        before each retry; at most three attempts are made.

	        Args:
	            path: API path relative to the API base URL, with or without a
	                query string; the api_key parameter is appended to it.

	        Returns:
	            The decoded JSON response body.

	        Raises:
	            urllib.error.HTTPError: for any HTTP error status other than 429.
	            RuntimeError: when the API still answers 429 on the last attempt
	                (chained from the final HTTPError).
	        """
	        # The URL does not change between attempts, so build it once.
	        sep = "&" if "?" in path else "?"
	        url = f"{API}/{path}{sep}api_key={self.key}"
	        rate_limited = None
	        for attempt in range(3):
	            time.sleep(_THROTTLE if attempt == 0 else 15.0)
	            try:
	                with urllib.request.urlopen(url, timeout=45) as resp:
	                    return json.loads(resp.read().decode("utf-8"))
	            except urllib.error.HTTPError as exc:
	                if exc.code != 429:
	                    raise
	                # Keep the error: the name bound by "as" is cleared when the
	                # except block ends, and it is needed for chaining below.
	                rate_limited = exc
	        # Reached only when every attempt was rate-limited.
	        raise RuntimeError(
	            "CanLII API rate limit reached; retry shortly.") from rate_limited

	    def _ensure_dbmap(self):
	        """Fetch and persist the court-segment -> databaseId map if it is empty.

	        Does nothing when a non-empty map is already loaded. Otherwise lists
	        the case databases through the API, keeps every entry that has both
	        a parseable URL and a databaseId, and writes the result to
	        _DBMAP_FILE.
	        """
	        if self._dbmap:
	            return
	        data = self._get("caseBrowse/en/")
	        dbmap = {}
	        for db in data.get("caseDatabases", []):
	            # "url" may be absent or None; fall back to '' so search() is safe.
	            match = _DB_URL.search(db.get("url") or "")
	            if match and db.get("databaseId"):
	                dbmap[match.group(1)] = db["databaseId"]
	        self._dbmap = dbmap
	        DATA_DIR.mkdir(parents=True, exist_ok=True)
	        _DBMAP_FILE.write_text(json.dumps(dbmap), encoding="utf-8")

	    def _citator(self, db, case_id, kind):
	        """Fetch one citator list for a case.

	        Args:
	            db: CanLII databaseId of the court.
	            case_id: Case identifier within that database.
	            kind: List name; used both as the last URL path component and
	                as the key of the list in the response.

	        Returns:
	            {"total": <length of the returned list>,
	             "items": <its first _MAX_LIST entries>}.
	        """
	        data = self._get(f"caseCitator/en/{db}/{case_id}/{kind}")
	        items = data.get(kind, [])
	        return {"total": len(items), "items": items[:_MAX_LIST]}

	    def case_report(self, case_url):
	        """Return a citation-graph report for a case.

	        Accepts a full canlii.org case URL, or a neutral citation (e.g.
	        "2019 SCC 65") for a Supreme Court / Federal Court of Appeal / Federal
	        Court decision.

	        Returns:
	            A dict with keys "meta", "citingCases", "citedCases" and
	            "citedLegislations" on success, or {"error": <message>} when the
	            input cannot be resolved to a known court database. Successful
	            reports are cached in memory and written to _CACHE_FILE; error
	            results are not cached.
	        """
	        if not _CASE_URL.search(case_url):
	            # Not a case URL: try to read it as a neutral citation. If that
	            # fails too, keep the input so the error branch below reports it.
	            case_url = canlii_url_from_citation(case_url) or case_url
	        if case_url in self._cache:
	            return self._cache[case_url]
	        match = _CASE_URL.search(case_url)
	        if not match:
	            return {"error": "Provide a full canlii.org case URL, or a neutral "
	                             "citation such as '2019 SCC 65' (Supreme Court, Federal "
	                             "Court of Appeal, or Federal Court)."}
	        self._ensure_dbmap()
	        segment, case_id = match.group(1), match.group(2)
	        db = self._dbmap.get(segment)
	        if not db:
	            return {"error": f"Unrecognized CanLII court segment '{segment}'."}
	        # Four throttled API calls: case metadata plus the three citator lists.
	        report = {
	            "meta": self._get(f"caseBrowse/en/{db}/{case_id}/"),
	            "citingCases": self._citator(db, case_id, "citingCases"),
	            "citedCases": self._citator(db, case_id, "citedCases"),
	            "citedLegislations": self._citator(db, case_id, "citedLegislations"),
	        }
	        self._cache[case_url] = report
	        DATA_DIR.mkdir(parents=True, exist_ok=True)
	        _CACHE_FILE.write_text(json.dumps(self._cache, ensure_ascii=False),
	                               encoding="utf-8")
	        return report


	def main():
	    """Command-line entry point: print a short citation report for one case.

	    Reads the case reference from the first command-line argument. With no
	    argument it prints a usage line and returns; when the lookup yields an
	    error it prints that error instead of a report.
	    """
	    args = sys.argv[1:]
	    if not args:
	        print('usage: python -m canlex.citator "<canlii-case-url>"')
	        return

	    report = Citator().case_report(args[0])
	    if "error" in report:
	        print("ERROR:", report["error"])
	        return

	    # Header line: title, citation and decision date from the case metadata.
	    info = report["meta"]
	    title, citation, decided = (
	        info.get(field) for field in ("title", "citation", "decisionDate"))
	    print(f"{title} -- {citation} ({decided})")

	    # One count line per citator list, in a fixed order.
	    counts = (
	        ("cited by", "citingCases"),
	        ("cites", "citedCases"),
	        ("legislation cited", "citedLegislations"),
	    )
	    for label, section in counts:
	        print(f" {label}: {report[section]['total']}")

	    # Show only the first six pieces of cited legislation.
	    for law in report["citedLegislations"]["items"][:6]:
	        print(f" - {law.get('title')} ({law.get('citation')})")


	if __name__ == "__main__":
	    main()