File size: 6,497 Bytes
21626e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8c217b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21626e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8c217b
 
 
 
 
 
 
 
21626e7
 
 
 
b8c217b
 
 
21626e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""CanLII case citator -- live lookup of a Canadian case's citation graph.

Uses the CanLII API (key in canlii_key.txt). The API has no name/topic search,
so a case is identified by its full canlii.org URL. Responses are cached on disk
and calls are throttled, because the API rate-limits aggressively.
"""
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request

from .config import ROOT, DATA_DIR

# Base URL of the CanLII REST API; request paths are appended to it.
API = "https://api.canlii.org/v1"
# Local key file, read by api_key() when CANLII_API_KEY is not set.
KEY_FILE = ROOT / "canlii_key.txt"
# Placeholder value; api_key() treats it the same as "no key configured".
_PLACEHOLDER = "PASTE-YOUR-CANLII-API-KEY-ON-THIS-LINE"
# On-disk copy of the court-segment -> databaseId map (see Citator._ensure_dbmap).
_DBMAP_FILE = DATA_DIR / "citator_dbmap.json"
# On-disk cache of case reports, keyed by case URL (see Citator.case_report).
_CACHE_FILE = DATA_DIR / "citator_cache.json"
_THROTTLE = 3.0     # seconds between CanLII API calls (the API rate-limits hard)
_MAX_LIST = 20      # items shown per citator list (lists can run to thousands)
# Matches a canlii.org case URL: group 1 is the court segment, group 2 the case
# id. Only lowercase URLs match.
_CASE_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/doc/\d+/([a-z0-9-]+)")
# Matches a canlii.org court-database URL: group 1 is the court segment.
_DB_URL = re.compile(r"canlii\.org/[a-z]{2}/[a-z]+/([a-z0-9-]+)/?$")

# A neutral citation, e.g. "2019 SCC 65", and the CanLII URL segment per court.
# Groups of _NEUTRAL: (1) year, (2) court code, (3) decision number. "FCA" is
# listed before "FC" so the longer code is tried first.
_NEUTRAL = re.compile(r"\b(\d{4})\s+(SCC|FCA|FC)\s+(\d+)\b", re.IGNORECASE)
# Lowercased court code -> court segment used in canlii.org URLs.
_CANLII_SEG = {"scc": "scc", "fca": "fca", "fc": "fct"}


def canlii_url_from_citation(text):
    """Translate a neutral citation found in *text* into a canlii.org case URL.

    The first Supreme Court (SCC), Federal Court of Appeal (FCA) or Federal
    Court (FC) neutral citation in *text* is used, matched case-insensitively:
    "2019 SCC 65" -> .../en/ca/scc/doc/2019/2019scc65/2019scc65.html

    Returns '' when *text* contains no such citation.
    """
    found = _NEUTRAL.search(text)
    if found is None:
        return ""
    year, court_code, number = found.groups()
    court_code = court_code.lower()
    # The document id is the citation lowercased with the spaces removed.
    doc_id = "".join((year, court_code, number))
    prefix = f"https://www.canlii.org/en/ca/{_CANLII_SEG[court_code]}/doc/"
    return prefix + f"{year}/{doc_id}/{doc_id}.html"


def api_key():
    """Return the configured CanLII API key, or '' if not set.

    The CANLII_API_KEY environment variable is checked first -- the remote
    deployment injects the key as a secret rather than shipping the file.
    A local run falls back to canlii_key.txt.

    In both places surrounding whitespace is ignored and the placeholder
    text counts as "not set". A missing key file is not an error; other
    read failures (e.g. permission denied) propagate.
    """
    env_key = os.environ.get("CANLII_API_KEY", "").strip()
    if env_key and env_key != _PLACEHOLDER:
        return env_key
    try:
        # utf-8-sig reads plain UTF-8 too, but also drops the byte-order mark
        # some editors (notably Windows Notepad) prepend. str.strip() does not
        # remove a BOM, so it would otherwise end up inside the key.
        key = KEY_FILE.read_text(encoding="utf-8-sig").strip()
    except (FileNotFoundError, NotADirectoryError):
        # No key file at all: same outcome as an empty one.
        return ""
    return "" if not key or key == _PLACEHOLDER else key


def _load_json(path):
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        return {}


class Citator:
    """Live CanLII citator with on-disk caching and rate-limit throttling.

    Construction reads the API key plus two JSON files: the court-segment ->
    databaseId map and the case-URL -> report cache. Each file is rewritten
    when it gains new content.
    """

    def __init__(self):
        """Load the API key, the database map and the report cache.

        Raises:
            RuntimeError: if no CanLII API key is configured.
        """
        self.key = api_key()
        if not self.key:
            # api_key() consults the environment before the key file, so the
            # hint names both places a key can come from.
            raise RuntimeError(
                "No CanLII API key -- set the CANLII_API_KEY environment "
                f"variable or put your key in {KEY_FILE}.")
        self._dbmap = _load_json(_DBMAP_FILE)   # URL court segment -> databaseId
        self._cache = _load_json(_CACHE_FILE)   # case URL -> report

    def _get(self, path):
        """Throttled GET against the CanLII API, retrying on HTTP 429.

        Every attempt is preceded by a pause: _THROTTLE seconds before the
        first one, 15 seconds before each of up to two retries.

        Args:
            path: request path relative to API, with or without a query
                string; the API key is appended as a query parameter.

        Returns:
            The decoded JSON response body.

        Raises:
            urllib.error.HTTPError: for any non-429 error status, or when the
                third attempt is still answered with 429. Other network and
                JSON decoding errors propagate unchanged.
        """
        sep = "&" if "?" in path else "?"
        url = f"{API}/{path}{sep}api_key={self.key}"
        for attempt in range(3):
            time.sleep(_THROTTLE if attempt == 0 else 15.0)
            try:
                with urllib.request.urlopen(url, timeout=45) as resp:
                    return json.loads(resp.read().decode("utf-8"))
            except urllib.error.HTTPError as exc:
                if exc.code == 429 and attempt < 2:
                    # HTTPError doubles as the response object; close it so
                    # the connection is released before the long back-off.
                    exc.close()
                    continue
                raise
        # Not reached: every iteration above returns, retries or raises.
        # Kept as a guard should the retry condition ever change.
        raise RuntimeError("CanLII API rate limit reached; retry shortly.")

    def _ensure_dbmap(self):
        """Fetch and persist the court-segment -> databaseId map if empty.

        Does nothing when a non-empty map is already loaded. Otherwise lists
        the case databases through the API, keys each databaseId by the court
        segment of its URL, and writes the map to _DBMAP_FILE.
        """
        if self._dbmap:
            return
        data = self._get("caseBrowse/en/")
        dbmap = {}
        for db in data.get("caseDatabases", []):
            match = _DB_URL.search(db.get("url") or "")
            if match and db.get("databaseId"):
                dbmap[match.group(1)] = db["databaseId"]
        self._dbmap = dbmap
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        _DBMAP_FILE.write_text(json.dumps(dbmap), encoding="utf-8")

    def _citator(self, db, case_id, kind):
        """Fetch one citator list for a case.

        Args:
            db: CanLII databaseId of the court.
            case_id: CanLII case id within that database.
            kind: list to fetch; also the key of that list in the response
                ("citingCases", "citedCases" or "citedLegislations").

        Returns:
            {"total": number of entries in the response,
             "items": the first _MAX_LIST entries}.
        """
        data = self._get(f"caseCitator/en/{db}/{case_id}/{kind}")
        # A missing key and an explicit null both mean "nothing listed".
        items = data.get(kind) or []
        return {"total": len(items), "items": items[:_MAX_LIST]}

    def case_report(self, case_url):
        """Return a citation-graph report for a case.

        Accepts a full canlii.org case URL, or a neutral citation (e.g.
        "2019 SCC 65") for a Supreme Court / Federal Court of Appeal / Federal
        Court decision. Leading and trailing whitespace is ignored.

        Returns:
            A dict with "meta", "citingCases", "citedCases" and
            "citedLegislations" on success (served from the cache when the
            case was looked up before), or {"error": message} when the input
            is not recognized. Error results are not cached.
        """
        # Ignore stray whitespace (pasted text, trailing newline) so the same
        # case always maps to the same cache key.
        case_url = case_url.strip()
        if not _CASE_URL.search(case_url):
            case_url = canlii_url_from_citation(case_url) or case_url
        if case_url in self._cache:
            return self._cache[case_url]
        match = _CASE_URL.search(case_url)
        if not match:
            return {"error": "Provide a full canlii.org case URL, or a neutral "
                    "citation such as '2019 SCC 65' (Supreme Court, Federal "
                    "Court of Appeal, or Federal Court)."}
        self._ensure_dbmap()
        segment, case_id = match.group(1), match.group(2)
        db = self._dbmap.get(segment)
        if not db:
            return {"error": f"Unrecognized CanLII court segment '{segment}'."}
        report = {
            "meta": self._get(f"caseBrowse/en/{db}/{case_id}/"),
            "citingCases": self._citator(db, case_id, "citingCases"),
            "citedCases": self._citator(db, case_id, "citedCases"),
            "citedLegislations": self._citator(db, case_id, "citedLegislations"),
        }
        self._cache[case_url] = report
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        _CACHE_FILE.write_text(json.dumps(self._cache, ensure_ascii=False),
                               encoding="utf-8")
        return report


def main():
    """Command-line entry point: print a citation summary for one case.

    The case is taken from the command line. All arguments are joined with
    single spaces, so a neutral citation works quoted or unquoted
    (``python -m canlex.citator 2019 SCC 65``); a canlii.org case URL is
    passed as a single argument.

    Prints a usage line when no argument is given, ``ERROR: ...`` when the
    lookup reports an error, and otherwise the case title, citation and
    decision date, the three citator totals, and up to six cited
    legislation entries.
    """
    if len(sys.argv) < 2:
        print('usage: python -m canlex.citator '
              '"<canlii-case-url or neutral citation>"')
        return
    # Join the arguments so that an unquoted citation, which the shell splits
    # into "2019", "SCC", "65", is looked up as a whole.
    report = Citator().case_report(" ".join(sys.argv[1:]))
    if "error" in report:
        print("ERROR:", report["error"])
        return
    meta = report["meta"]
    print(f"{meta.get('title')} -- {meta.get('citation')}  ({meta.get('decisionDate')})")
    print(f"  cited by:  {report['citingCases']['total']}")
    print(f"  cites:     {report['citedCases']['total']}")
    print(f"  legislation cited: {report['citedLegislations']['total']}")
    for item in report["citedLegislations"]["items"][:6]:
        print(f"    - {item.get('title')} ({item.get('citation')})")


if __name__ == "__main__":
    main()