diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..28250218db8247c0c2e307d7710dc69b83e89765 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,21 @@ +# Keep the build context small and keep secrets / regenerated files out. +.git/ +.venv/ +venv/ +__pycache__/ +**/__pycache__/ +*.pyc + +# Raw downloads are not needed -- only data/processed/*.json is. +data/raw/ + +# Regenerated inside the image by 'python -m canlex.embed'. +data/processed/embeddings.npz + +# Local-only: the CanLII key (injected as a runtime secret) and disk caches. +canlii_key.txt +data/citator_dbmap.json +data/citator_cache.json + +# Local MCP client config; not used by the remote server. +.mcp.json diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..70d85f50798e0b3a9eedb9ecba8d3a0f0508d1d3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +*.pyc +.venv/ +venv/ +data/raw/ +data/processed/embeddings.npz +canlii_key.txt +data/citator_*.json diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 0000000000000000000000000000000000000000..8b1a89c86448aa1a777a0a2dd768940deaadc426 --- /dev/null +++ b/.mcp.json @@ -0,0 +1,8 @@ +{ + "mcpServers": { + "canlex": { + "command": "C:\\projects\\CanLex\\.venv\\Scripts\\python.exe", + "args": ["C:\\projects\\CanLex\\canlex\\server.py"] + } + } +} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..7d5538b735b08c6b1a02738e3ea6e0c4aec2f184 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,42 @@ +# CanLex MCP server -- remote (streamable-HTTP) image. +# +# Builds anywhere: Hugging Face Spaces, Google Cloud Run, Fly.io, plain Docker. +# Retrieval is fully local and key-free; the optional CanLII case citator reads +# its key from the CANLII_API_KEY environment variable (supplied as a host +# secret -- the key is never copied into the image). 
+FROM python:3.12-slim + +# libgomp1 is the OpenMP runtime that onnxruntime (the reranker) links against. +RUN apt-get update \ + && apt-get install -y --no-install-recommends libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +# Run as a non-root user (UID 1000) -- required by Hugging Face Spaces. +RUN useradd --create-home --home-dir /app --uid 1000 app +WORKDIR /app + +# Python dependencies first, so this layer caches across code changes. +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Application code and the processed corpus (section-chunk JSON). +COPY --chown=app:app canlex/ ./canlex/ +COPY --chown=app:app data/processed/*.json ./data/processed/ + +USER app +ENV HOME=/app \ + HF_HOME=/app/.hf_cache \ + CANLEX_HTTP=1 \ + PORT=7860 \ + PYTHONUNBUFFERED=1 + +# Build the semantic embeddings and pre-fetch the cross-encoder model, so the +# model cache is baked into the image and the first request needs no network. +RUN python -m canlex.embed \ + && python -c "from canlex.rerank import Reranker; Reranker()" + +# From here on, model files are served from the baked cache, never fetched. +ENV HF_HUB_OFFLINE=1 + +EXPOSE 7860 +CMD ["python", "-m", "canlex.server"] diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1954cb66518ca2c648ac12a57f302afbb4570dd1 --- /dev/null +++ b/README.md @@ -0,0 +1,25 @@ +--- +title: CanLex MCP +sdk: docker +app_port: 7860 +pinned: false +--- + +# CanLex MCP + +A remote [Model Context Protocol](https://modelcontextprotocol.io) server for +Canadian legal research. It exposes four read-only tools over streamable HTTP: + +- **canlex_search_legislation** — hybrid (BM25 + semantic) search with a + cross-encoder reranker over federal border, customs, criminal, drug, + food/health, labour and privacy legislation, CBSA D-Memoranda, Treasury Board + collective agreements, and National Joint Council directives. 
+- **canlex_get_section** — fetch one provision verbatim by Act and section. +- **canlex_list_acts** — list the corpus and its currency. +- **canlex_case** — live CanLII case citator (citation graph for a case URL). + +Retrieval runs fully locally inside the container and needs no API key. The +optional case citator reads a CanLII key from the `CANLII_API_KEY` environment +variable. + +Add this Space's `/mcp` URL as a custom connector in any Claude client. diff --git a/canlex/__init__.py b/canlex/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbea6f09c1f6f7e4125c54e1ea0e18dafecebc9 --- /dev/null +++ b/canlex/__init__.py @@ -0,0 +1 @@ +"""CanLex - Canadian legal retrieval for Claude.""" diff --git a/canlex/agreement.py b/canlex/agreement.py new file mode 100644 index 0000000000000000000000000000000000000000..39ac5f99c7af034130ff92625a93dd8454a25276 --- /dev/null +++ b/canlex/agreement.py @@ -0,0 +1,163 @@ +"""Ingest Treasury Board collective agreements (HTML) into Article-level chunks. + +A collective agreement is a binding contract between the Treasury Board and a +bargaining agent for one occupational group. Chunks are tagged +doc_type="agreement" so CanLex keeps them distinct from legislation and guidance. +""" +import json +import re +import subprocess +import sys +import time + +from bs4 import BeautifulSoup + +from .config import RAW_DIR, PROCESSED_DIR + +AGREEMENT_DIR = RAW_DIR / "agreements" +OUT_FILE = PROCESSED_DIR / "agreements.json" +# canada.ca rejects non-browser user agents, so present a browser one. +_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36") + +# Treasury Board collective agreements to ingest. Add an entry to ingest more. 
AGREEMENTS = {
    "FB": {
        "short": "FB Agreement",
        "name": "FB Group Collective Agreement (Border Services)",
        "cite": "FB Collective Agreement",
        "url": "https://www.canada.ca/en/treasury-board-secretariat/topics/pay/"
               "collective-agreements/fb.html",
    },
}

_SKIP_HEADINGS = {"table of contents", "note to readers", "page details",
                  "on this page"}
_CONTENT_TAGS = {"p", "dl", "table", "h4", "h5", "h6", "blockquote"}
_ARTICLE = re.compile(r"Article\s+(\S+?)\s*[:–-]\s*(.+)", re.I)


def _norm(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends.

    A falsy *text* (``None`` or ``""``) yields ``""``.
    """
    return re.sub(r"\s+", " ", text or "").strip()


def _ps_quote(value):
    """Return *value* as a PowerShell single-quoted string literal.

    Inside single quotes PowerShell expands nothing; the only character that
    needs escaping is the single quote itself, which is written doubled.
    """
    return "'" + str(value).replace("'", "''") + "'"


def _fetch(url, dest, force=False):
    """Download a page. canada.ca blocks Python's HTTP client at the TLS layer,
    so fetch via PowerShell's (.NET) HTTP stack, which the site accepts.

    Args:
        url: Address of the page to download.
        dest: Path-like object (``exists``, ``read_bytes``, ``parent``) used
            as the on-disk cache for the page.
        force: When true, download again even if *dest* already exists.

    Returns:
        The raw bytes of *dest* (cached or freshly downloaded).

    Raises:
        subprocess.CalledProcessError: PowerShell exited with a non-zero status.
        subprocess.TimeoutExpired: The download took longer than 180 seconds.
    """
    if dest.exists() and not force:
        return dest.read_bytes()
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Every interpolated value goes through _ps_quote so that an apostrophe in
    # the URL or the destination path cannot terminate the literal early and
    # be interpreted as PowerShell code.
    command = (f"Invoke-WebRequest -Uri {_ps_quote(url)} "
               f"-OutFile {_ps_quote(dest)} "
               f"-UseBasicParsing -UserAgent {_ps_quote(_UA)}")
    subprocess.run(
        ["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
        check=True, capture_output=True, timeout=180)
    time.sleep(0.5)  # be polite to the server
    return dest.read_bytes()


def _block_text(heading):
    """Readable text from a heading up to the next h2/h3 (sections unwrapped).

    Walks the siblings that follow *heading* and stops at the first ``h2`` or
    ``h3``. Direct ``li`` children of ``ul``/``ol`` siblings become ``- item``
    lines; siblings whose tag is in ``_CONTENT_TAGS`` become one normalised
    line each; any other sibling is ignored. Empty text is dropped. The lines
    are joined with newlines.
    """
    lines = []
    for sib in heading.find_next_siblings():
        if sib.name in ("h2", "h3"):
            break
        if sib.name in ("ul", "ol"):
            # recursive=False: only this list's own items, not nested lists'.
            for li in sib.find_all("li", recursive=False):
                item = _norm(li.get_text(" ", strip=True))
                if item:
                    lines.append(f"- {item}")
        elif sib.name in _CONTENT_TAGS:
            text = _norm(sib.get_text(" ", strip=True))
            if text:
                lines.append(text)
    return "\n".join(lines)


def parse_agreement(html, code):
    """Parse a collective agreement page into one chunk per Article / Appendix."""
    meta = AGREEMENTS[code]
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main")
    if main is None:
        return []

    # The first