Spaces:

Beemer0
/

CanLex

Running

Beemer Claude Opus 4.7 committed 4 days ago

Commit

589d46e

1 Parent(s): 51e7e37

Add IRPA delegation instruments, R. v. Carignan, and retrieval polish

New 'delegation' doc_type and canlex/delegation.py, ingesting two
instruments of delegation and designation under IRPA/IRPR: the CBSA
"Delegation of Authority and Designations of Officers" (HTML, 86 items)
and the IRCC IL3 "Instrument of Designation and Delegation" (PDF, 221
items) -- 307 chunks recording which officials may exercise which
ministerial powers. Wired through index.py (delegation items are
uncapped primary instruments) and server.py (rendering, the doc_type
filter, canlex_list_acts).

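R. v. Carignan, 2025 SCC 43 -- a leading SCC decision on the
warrantless-arrest power in Criminal Code s. 495 and the Charter s. 9
right against arbitrary arrest (caselaw.py, +71 chunks). The same
caselaw.py change also registers Rodriguez Anzola (FCA), on criminal
inadmissibility under IRPA s. 36(1)(b) for a conviction abroad.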

Retrieval-polish batch: the diversity cap no longer collapses
agreements and directives; an idf-weighted title-match boost;
Act-over-regulation and agreement-back-matter fusion penalties;
dmemo.py now fills the 'part' title for every memo and embed.py uses
it; the eval set is broadened to 129 questions; a 25-test stdlib
unittest suite under tests/.

129-question eval: Hit@1 0.74, Hit@3 0.89, Hit@5 0.93, Hit@10 0.97,
MRR 0.82 -- flat against the pre-Carignan/-delegation baseline of
0.74 / 0.90 / 0.94 / 0.96 / 0.83; the 378 new chunks introduce no
meaningful regression.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (14) hide show

canlex/caselaw.py +10 -0
canlex/delegation.py +356 -0
canlex/dmemo.py +10 -3
canlex/embed.py +8 -3
canlex/index.py +76 -4
canlex/server.py +32 -10
data/eval/questions.json +36 -1
data/processed/caselaw.json +0 -0
data/processed/delegation.json +0 -0
data/processed/dmemos.json +0 -0
tests/__init__.py +1 -0
tests/test_embed.py +33 -0
tests/test_index.py +170 -0
tests/test_synonyms.py +27 -0

canlex/caselaw.py CHANGED Viewed

@@ -152,6 +152,11 @@ CASES = [
     {"court": "scc", "id": 2198, "short": "Martineau",
      "topic": "Whether an ascertained-forfeiture proceeding under the Customs "
               "Act is penal and engages the right against self-incrimination"},
     # --- Federal Court of Appeal ---
     {"court": "fca", "id": 143136, "short": "Huruglica",
@@ -217,6 +222,11 @@ CASES = [
     {"court": "fca", "id": 31447, "short": "Rahaman",
      "topic": "Refugee claims and the meaning of a 'no credible basis' "
               "finding under IRPA"},
     # --- Federal Court ---
     {"court": "fc", "id": 64594, "short": "Goburdhun",

     {"court": "scc", "id": 2198, "short": "Martineau",
      "topic": "Whether an ascertained-forfeiture proceeding under the Customs "
               "Act is penal and engages the right against self-incrimination"},
+    {"court": "scc", "id": 21317, "short": "Carignan",
+     "topic": "Powers of arrest without warrant under Criminal Code s. 495; an "
+              "arrest that contravenes the s. 495(2) limits can ground a breach "
+              "of the Charter s. 9 right against arbitrary arrest, despite "
+              "s. 495(3)"},
     # --- Federal Court of Appeal ---
     {"court": "fca", "id": 143136, "short": "Huruglica",
     {"court": "fca", "id": 31447, "short": "Rahaman",
      "topic": "Refugee claims and the meaning of a 'no credible basis' "
               "finding under IRPA"},
+    {"court": "fca", "id": 521840, "short": "Rodriguez Anzola",
+     "topic": "Criminal inadmissibility under IRPA s. 36(1)(b) for a "
+              "conviction abroad; whether an immigration decision-maker may "
+              "consider that the defence of duress was effectively unavailable "
+              "in the foreign jurisdiction"},
     # --- Federal Court ---
     {"court": "fc", "id": 64594, "short": "Goburdhun",

canlex/delegation.py ADDED Viewed

	@@ -0,0 +1,356 @@

+"""Ingest instruments of delegation and designation under IRPA / IRPR.
+These instruments record which officer positions the Minister has delegated
+powers to, or designated for functions, under the Immigration and Refugee
+Protection Act and its Regulations. They are administrative instruments -- not
+enacted law, and not guidance -- so every chunk is tagged doc_type="delegation".
+Sources:
+  - the CBSA "Delegation of Authority and Designations of Officers ..."
+    instrument (HTML, cbsa-asfc.gc.ca);
+  - the IRCC "IL3 -- Instrument of Designation and Delegation" (PDF, canada.ca).
+    py -m canlex.delegation
+"""
+import io
+import json
+import re
+import subprocess
+import time
+import urllib.request
+from bs4 import BeautifulSoup
+from pypdf import PdfReader
+from .config import PROCESSED_DIR, RAW_DIR
+RAW = RAW_DIR / "delegation"
+OUT = PROCESSED_DIR / "delegation.json"
+# cbsa-asfc.gc.ca serves an ordinary client fine with a browser User-Agent;
+# canada.ca (the IRCC PDF) blocks Python's HTTP client at the TLS layer, so that
+# one is fetched via PowerShell's (.NET) HTTP stack, as agreement.py does.
+_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+       "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
+SOURCES = {
+    "cbsa": {
+        "code": "cbsa",
+        "kind": "html-cbsa",
+        "act_code": "CBSA-IRPA-DELEG",
+        "act_short": "CBSA Delegation",
+        "act_name": ("Delegation of Authority and Designations of Officers by "
+                     "the Minister of Public Safety and Emergency Preparedness "
+                     "under the Immigration and Refugee Protection Act and the "
+                     "Immigration and Refugee Protection Regulations"),
+        "url": ("https://www.cbsa-asfc.gc.ca/agency-agence/actreg-loireg/"
+                "delegation/irpa-lipr-2016-07-eng.html"),
+    },
+    "ircc": {
+        "code": "ircc",
+        "kind": "pdf-ircc",
+        "act_code": "IRCC-IL3-DELEG",
+        "act_short": "IRCC IL3",
+        "act_name": ("IL3 — Instrument of Designation and Delegation, "
+                     "Immigration and Refugee Protection Act and Regulations"),
+        "url": ("https://www.canada.ca/content/dam/ircc/migration/ircc/english/"
+                "resources/manuals/il/il3-eng.pdf"),
+    },
+}
+def _norm(text):
+    """Collapse all whitespace -- including the non-breaking spaces these
+    sources use heavily -- to single spaces.
+    A falsy ``text`` (None or "") yields ""; leading and trailing whitespace
+    is stripped from the result."""
+    # The explicit replace makes the non-breaking-space handling visible; the
+    # \s+ pass then folds every remaining run of whitespace to one space.
+    return re.sub(r"\s+", " ", (text or "").replace("\xa0", " ")).strip()
+def _normalize_refs(text):
+    """Expand the instruments' provision shorthand so the section numbers are
+    searchable as tokens: 'A55(1)' -> 'IRPA 55(1)', 'R39' -> 'IRPR 39'. Both
+    instruments write 'A' for the Act and 'R' for the Regulations.
+    Returns the rewritten text with its whitespace collapsed by _norm()."""
+    # \b limits the rewrite to an 'A'/'R' that starts a token and is directly
+    # followed by a digit, so the letters inside a longer word are left alone.
+    # NOTE(review): any other token of that shape (e.g. 'A4') is expanded too;
+    # presumably none occur in the instruments -- confirm against the sources.
+    text = re.sub(r"\bA(?=\d)", "IRPA ", text)
+    text = re.sub(r"\bR(?=\d)", "IRPR ", text)
+    return _norm(text)
+def _fetch(url, dest, powershell=False):
+    """Fetch a page or file, caching the raw bytes under data/raw/delegation.
+    canada.ca blocks Python's HTTP client, so its PDF is fetched via PowerShell.
+    Args:
+        url: address to download.
+        dest: pathlib path of the cache file. If it already exists its bytes
+            are returned without any network access; otherwise it is created,
+            along with its parent directories.
+        powershell: download with PowerShell's Invoke-WebRequest instead of
+            urllib.
+    Returns:
+        The bytes of the cached file.
+    Raises:
+        subprocess.CalledProcessError or subprocess.TimeoutExpired if the
+        PowerShell download fails; urllib.error.URLError (or another OSError)
+        if the urllib download fails."""
+    if dest.exists():
+        return dest.read_bytes()
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    # Download to a sibling ".part" file and rename it only once complete, so
+    # an interrupted download is never mistaken for a cached copy on the next
+    # run (the cache check above trusts whatever sits at ``dest``).
+    partial = dest.with_name(dest.name + ".part")
+    if powershell:
+        # Inside a single-quoted PowerShell string the quote itself is the only
+        # special character and is escaped by doubling it. Without this, a
+        # cache path containing an apostrophe would break the command.
+        quoted_url = url.replace("'", "''")
+        quoted_out = str(partial).replace("'", "''")
+        command = (f"Invoke-WebRequest -Uri '{quoted_url}' "
+                   f"-OutFile '{quoted_out}' "
+                   f"-UseBasicParsing -UserAgent '{_UA}'")
+        subprocess.run(
+            ["powershell", "-NoProfile", "-NonInteractive", "-Command", command],
+            check=True, capture_output=True, timeout=180)
+    else:
+        req = urllib.request.Request(url, headers={"User-Agent": _UA})
+        with urllib.request.urlopen(req, timeout=60) as resp:
+            partial.write_bytes(resp.read())
+    partial.replace(dest)
+    time.sleep(0.5)   # be polite to the server
+    return dest.read_bytes()
+# --- CBSA instrument (HTML) ---------------------------------------------------
+def _delegates(cell):
+    """Flatten a 'Delegates / Designated officials' table cell into readable
+    lines. The cell groups officer positions under an organisation header
+    (<p class="h4">: CBSA, RCMP, IRCC) and an optional sub-heading
+    (<p class="h5">: a region or a headquarters branch), each followed by a
+    <ul> of position titles.
+    Returns one newline-joined string: a line '<org> — <sub>: <positions>'
+    (or '<org>: <positions>') per list, the bare organisation name for an
+    organisation that has no list, and any other paragraph verbatim."""
+    # org / sub track the current header and sub-heading; org_used records
+    # whether the current organisation has already labelled a list.
+    lines, org, sub, org_used = [], "", "", False
+    # Only the cell's direct <p>/<ul> children are walked, in document order.
+    for el in cell.find_all(["p", "ul"], recursive=False):
+        if el.name == "p":
+            classes = " ".join(el.get("class") or [])
+            text = _norm(el.get_text())
+            if not text:
+                continue
+            # NOTE(review): these are substring tests on the joined class
+            # string, so a class merely containing 'h4'/'h5' would also match.
+            if "h4" in classes:
+                if org and not org_used:   # a previous org with no list of its own
+                    lines.append(org)
+                # A new organisation resets the sub-heading.
+                org, sub, org_used = text.rstrip(": "), "", False
+            elif "h5" in classes:
+                sub = text
+            else:
+                lines.append(text)         # a free-standing note
+        else:                              # <ul> of position titles
+            positions = "; ".join(
+                t for t in (_norm(li.get_text())
+                            for li in el.find_all("li", recursive=False)) if t)
+            if positions:
+                label = f"{org} — {sub}" if sub else org
+                lines.append(f"{label}: {positions}" if label else positions)
+                org_used = True
+    if org and not org_used:               # a trailing org with no list
+        lines.append(org)
+    return "\n".join(lines)
+def parse_cbsa(html, src):
+    """Parse the CBSA delegation instrument into one chunk per Schedule item,
+    plus one chunk for the preamble.
+    Args:
+        html: the instrument page's markup, as accepted by BeautifulSoup.
+        src: the instrument's SOURCES entry; its 'code', 'act_code',
+            'act_short', 'act_name' and 'url' values are copied into each chunk.
+    Returns:
+        A list of chunk dicts (the preamble first, when one is found), or []
+        when the page has no <main> element."""
+    soup = BeautifulSoup(html, "html.parser")
+    main = soup.find("main")
+    if main is None:
+        return []
+    for sup in main.find_all("sup"):       # drop footnote-reference superscripts
+        sup.decompose()
+    # The first <time> element in <main> supplies every chunk's 'current_to'.
+    time_el = main.find("time")
+    date = _norm(time_el.get("datetime") or time_el.get_text()) if time_el else ""
+    chunks = []
+    # Preamble: the paragraphs between the title and the Schedule, which set out
+    # the tiers of delegation and designation and how the columns are read.
+    schedule = main.find("h2", id="sch")
+    if schedule:
+        # find_previous_siblings() walks backwards from the heading, so the
+        # result is reversed to restore document order.
+        paras = [_norm(p.get_text())
+                 for p in reversed(schedule.find_previous_siblings("p"))]
+        body = "\n".join(p for p in paras if p)
+        if body:
+            chunks.append({
+                "id": f"delegation-{src['code']}-preamble",
+                "doc_type": "delegation",
+                "act_code": src["act_code"],
+                "act_short": src["act_short"],
+                "act_name": src["act_name"],
+                "section": "",
+                "marginal_note": "Tiers of delegation and designation",
+                "part": "",
+                "division": "",
+                "heading": "Instrument of delegation and designation under IRPA",
+                "text": body,
+                "history": "",
+                "last_amended": "",
+                "current_to": date,
+                "citation": f"{src['act_short']} — Preamble",
+                "source_url": src["url"],
+            })
+    # One chunk per Schedule item. Each topical <h3> is followed by a four-column
+    # table: Item | Act/Regulations reference | Description of power | Delegates.
+    for table in main.find_all("table", class_="table-bordered"):
+        h3 = table.find_previous_sibling("h3")
+        section_name = _norm(h3.get_text()) if h3 else ""
+        for tr in table.find_all("tr"):
+            cells = tr.find_all("td", recursive=False)
+            if len(cells) != 4:
+                continue                   # the header row (<th>) or a stray row
+            item_no = _norm(cells[0].get_text()).rstrip(".")
+            refs = _normalize_refs(_norm(cells[1].get_text()))
+            # Join the description's paragraphs; a cell with no <p> falls back
+            # to its whole text.
+            power = " ".join(_norm(p.get_text())
+                             for p in cells[2].find_all("p")) \
+                or _norm(cells[2].get_text())
+            delegates = _delegates(cells[3])
+            # A row needs an item number and at least a power or a reference.
+            if not item_no or not (power or refs):
+                continue
+            text = power
+            if refs:
+                text += f"\n\nProvisions (IRPA / IRPR): {refs}."
+            if delegates:
+                text += "\n\nDelegated / designated to:\n" + delegates
+            # NOTE(review): the id is built from the item number alone, so it
+            # assumes numbering does not restart between tables -- confirm.
+            chunks.append({
+                "id": f"delegation-{src['code']}-{item_no}",
+                "doc_type": "delegation",
+                "act_code": src["act_code"],
+                "act_short": src["act_short"],
+                "act_name": src["act_name"],
+                "section": "",
+                "marginal_note": refs or f"Item {item_no}",
+                "part": section_name,
+                "division": "",
+                "heading": section_name,
+                "text": text,
+                "history": "",
+                "last_amended": "",
+                "current_to": date,
+                "citation": f"{src['act_short']}, Item {item_no}",
+                "source_url": src["url"],
+            })
+    return chunks
+# --- IRCC IL3 instrument (PDF) ------------------------------------------------
+# A topical part heading -- a line in full upper case (MINISTERIAL INSTRUCTIONS,
+# MEDICAL, MISCELLANEOUS). Organisation acronyms (CI, CBSA, RCMP) are shorter
+# than the 6-character floor and so are not mistaken for headings.
+_IL3_PART = re.compile(r"[A-Z][A-Z &/,()'.\-]{5,}")
+# An item opens "<n>. <A/R provision>" -- the number must be followed by a
+# provision reference, which rejects ordinary numbered prose.
+_IL3_ITEM = re.compile(r"(\d+)\.\s+(?=[AR]\d)")
+# The word that introduces an item's power description, after the provisions.
+_IL3_POWER = re.compile(r"(?:Delegation|Designation)\s*[-–—]\s*")
+def _pdf_pages(pdf_bytes):
+    """Extract each page's text, dropping the printed page number that pypdf
+    emits as the page's first line (roman in the front matter, arabic later).
+    Returns a list with one string per page, in page order; a page with no
+    extractable text contributes an empty string."""
+    reader = PdfReader(io.BytesIO(pdf_bytes))
+    pages = []
+    for page in reader.pages:
+        lines = (page.extract_text() or "").split("\n")
+        # NOTE(review): the pattern accepts any run of digits and roman-numeral
+        # letters, so a first line that is an ordinary word made only of those
+        # letters (e.g. "mild", "civil") would be dropped as well.
+        if lines and re.fullmatch(r"\s*[ivxlcdm\d]+\s*", lines[0] or "", re.I):
+            lines = lines[1:]
+        pages.append("\n".join(lines))
+    return pages
+def parse_ircc(pdf_bytes, src):
+    """Parse the IRCC IL3 instrument into one chunk per Schedule item, plus a
+    preamble chunk. The PDF flattens the four-column table into a linear text
+    stream, so each item runs from its numbered marker to the next; the power
+    description and the delegated positions are kept together within the item.
+    Args:
+        pdf_bytes: the raw bytes of the IL3 PDF.
+        src: the instrument's SOURCES entry; its 'code', 'act_code',
+            'act_short', 'act_name' and 'url' values are copied into each chunk.
+    Returns:
+        A list of chunk dicts (the preamble first, when present), or [] when
+        the PDF yields no pages."""
+    pages = _pdf_pages(pdf_bytes)
+    if not pages:
+        return []
+    # The edition label ("<Season> <year>") on the first page is stored as
+    # every chunk's 'current_to'.
+    version_match = re.search(r"(?:Spring|Summer|Fall|Winter)\s+\d{4}", pages[0])
+    version = version_match.group(0) if version_match else ""
+    chunks = []
+    # The whole of the second page is taken as the preamble text.
+    preamble = _norm(pages[1]) if len(pages) > 1 else ""
+    if preamble:
+        chunks.append({
+            "id": f"delegation-{src['code']}-preamble",
+            "doc_type": "delegation",
+            "act_code": src["act_code"],
+            "act_short": src["act_short"],
+            "act_name": src["act_name"],
+            "section": "",
+            "marginal_note": "Preamble — designation and delegation under IRPA s. 6",
+            "part": "",
+            "division": "",
+            "heading": "Instrument of designation and delegation under IRPA",
+            "text": preamble,
+            "history": "",
+            "last_amended": "",
+            "current_to": version,
+            "citation": f"{src['act_short']} — Preamble",
+            "source_url": src["url"],
+        })
+    # Walk every line: an all-caps line is a topical part heading; a line that
+    # opens "<n>. <A/R provision>" starts a new item. Lines before the first
+    # item (the preamble, definitions and table of contents) are skipped.
+    items, part = [], ""
+    cur_no = cur_part = None
+    cur_lines = []
+    for line in "\n".join(pages).split("\n"):
+        s = _norm(line)
+        if not s:
+            continue
+        if _IL3_PART.fullmatch(s):
+            # A heading is recorded for the items that follow and is never
+            # appended to the item currently being collected.
+            part = s.title()
+            continue
+        m = _IL3_ITEM.match(s)
+        if m:
+            if cur_no is not None:
+                items.append((cur_part, cur_no, cur_lines))
+            cur_no, cur_part, cur_lines = m.group(1), part, [s]
+        elif cur_no is not None:
+            cur_lines.append(s)
+    if cur_no is not None:                 # flush the final item
+        items.append((cur_part, cur_no, cur_lines))
+    # Chunk ids use the item's position in the stream (n), while the citation
+    # and fallback marginal note use the item number printed in the PDF.
+    for n, (item_part, item_no, lines) in enumerate(items, start=1):
+        body = re.sub(r"^\d+\.\s*", "", "\n".join(lines)).strip()
+        if not body:
+            continue
+        # Everything before the "Delegation -" / "Designation -" marker is the
+        # item's provision references.
+        power = _IL3_POWER.search(body)
+        refs = _normalize_refs(body[:power.start()]) if power else ""
+        chunks.append({
+            "id": f"delegation-{src['code']}-{n}",
+            "doc_type": "delegation",
+            "act_code": src["act_code"],
+            "act_short": src["act_short"],
+            "act_name": src["act_name"],
+            "section": "",
+            "marginal_note": refs or f"Item {item_no}",
+            "part": item_part or "",
+            "division": "",
+            "heading": item_part or "",
+            "text": _normalize_refs(body),
+            "history": "",
+            "last_amended": "",
+            "current_to": version,
+            "citation": (f"{src['act_short']} — {item_part}, Item {item_no}"
+                         if item_part else f"{src['act_short']}, Item {item_no}"),
+            "source_url": src["url"],
+        })
+    return chunks
+def build():
+    """Fetch, parse and chunk every delegation instrument into delegation.json.
+    A source that raises while being fetched or parsed is reported on stdout
+    and skipped; the output file is still written, holding only the chunks of
+    the sources that succeeded (possibly none)."""
+    all_chunks = []
+    for src in SOURCES.values():
+        print(f"Ingesting {src['act_short']} ...")
+        try:
+            # Dispatch on the source's 'kind'; an unrecognised kind yields
+            # no chunks rather than an error.
+            if src["kind"] == "html-cbsa":
+                html = _fetch(src["url"], RAW / f"{src['code']}.html")
+                chunks = parse_cbsa(html, src)
+            elif src["kind"] == "pdf-ircc":
+                pdf = _fetch(src["url"], RAW / f"{src['code']}.pdf",
+                             powershell=True)
+                chunks = parse_ircc(pdf, src)
+            else:
+                chunks = []
+        except Exception as exc:
+            # Best-effort: one failing instrument must not abort the others.
+            print(f"  !! {src['act_short']}: {type(exc).__name__}: {exc}")
+            continue
+        all_chunks.extend(chunks)
+        print(f"  {len(chunks)} chunks")
+    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
+    OUT.write_text(json.dumps(all_chunks, ensure_ascii=False, indent=1),
+                   encoding="utf-8")
+    # NOTE(review): the summary counts configured sources, not the ones that
+    # were ingested successfully.
+    print(f"\n{len(all_chunks)} delegation chunks from {len(SOURCES)} "
+          f"instrument(s) -> {OUT}")
+if __name__ == "__main__":
+    build()

canlex/dmemo.py CHANGED Viewed

@@ -88,8 +88,15 @@ def parse_memo(html, url):
     number = match.group(1).upper() if match else url
     h1 = main.find("h1")
     topic = ""
-    if h1 and h1.find("small"):
-        topic = _norm(h1.find("small").get_text(" ", strip=True))
     dm = main.find("time", attrs={"property": "dateModified"})
     date = _norm(dm.get("datetime") or dm.get_text()) if dm else ""
@@ -191,7 +198,7 @@ def parse_pdf_memo(html, url):
             "act_name": "CBSA D-Memoranda",
             "section": number,
             "marginal_note": label,
-            "part": "",
             "division": "",
             "heading": "",
             "text": part,

     number = match.group(1).upper() if match else url
     h1 = main.find("h1")
     topic = ""
+    if h1:
+        # Pages vary: most carry the memo title in <h1><small>, others as plain
+        # "Memorandum DNN-N-N: Title" h1 text. Use the <small> if present, else
+        # the h1 text, and strip any leading memo-number prefix either way.
+        small = h1.find("small")
+        raw = (small.get_text(" ", strip=True) if small
+               else h1.get_text(" ", strip=True))
+        topic = re.sub(r"^Memorandum\s+D[\w-]+\s*[:–-]\s*", "",
+                       _norm(raw), flags=re.I)
     dm = main.find("time", attrs={"property": "dateModified"})
     date = _norm(dm.get("datetime") or dm.get_text()) if dm else ""
             "act_name": "CBSA D-Memoranda",
             "section": number,
             "marginal_note": label,
+            "part": topic,
             "division": "",
             "heading": "",
             "text": part,

canlex/embed.py CHANGED Viewed

@@ -32,10 +32,15 @@ def load_chunks():
 def embed_text(chunk):
     """Compact, retrieval-focused representation of one section."""
-    note = chunk["marginal_note"]
     body = chunk["text"][:_MAX_BODY]
-    # The marginal note (section title) is the strongest topical signal, so it
-    # is repeated to emphasise it.
     parts = [chunk["act_short"], note, note, chunk["heading"], body]
     return " . ".join(p for p in parts if p)

 def embed_text(chunk):
     """Compact, retrieval-focused representation of one section."""
+    # The section title is the strongest topical signal, so it is repeated to
+    # emphasise it. For D-memoranda the marginal note is only a generic section
+    # label ('Guidelines', 'Legislation'); the memo's actual subject lives in
+    # the 'part' field, so that is used as the title instead.
+    if chunk.get("doc_type") == "memorandum":
+        note = chunk.get("part") or chunk["marginal_note"]
+    else:
+        note = chunk["marginal_note"]
     body = chunk["text"][:_MAX_BODY]
     parts = [chunk["act_short"], note, note, chunk["heading"], body]
     return " . ".join(p for p in parts if p)

canlex/index.py CHANGED Viewed

@@ -16,7 +16,21 @@ RRF_K = 60          # reciprocal-rank-fusion damping constant
 W_SEM = 2.0         # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
 CANDIDATES = 80     # hits each retriever contributes to the fusion
 RERANK_POOL = 50    # top fused candidates the cross-encoder rescores
-SOURCE_CAP = 2      # max chunks one case/memo/agreement/directive may contribute
 _TOKEN = re.compile(r"[a-z0-9]+")
 _SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
@@ -102,6 +116,7 @@ class LegislationIndex:
             raise RuntimeError(
                 f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
         self._build_bm25()
         self._build_xref()
         self._load_semantic()
         self._load_reranker()
@@ -127,6 +142,32 @@ class LegislationIndex:
         self.avgdl = sum(self.doc_len) / n
         self.idf = {t: math.log(1 + (n - d + 0.5) / (d + 0.5)) for t, d in df.items()}
     def _load_semantic(self):
         """Load precomputed embeddings and the query embedder.
@@ -202,15 +243,16 @@ class LegislationIndex:
     def _source_key(self, idx):
         """The parent document a chunk belongs to, for diversity capping. Returns
-        None for legislation -- each section is a distinct provision and is never
         capped; case law is keyed by citation, memoranda by memo number."""
         c = self.chunks[idx]
         doc_type = c.get("doc_type", "legislation")
-        if doc_type == "legislation":
             return None
         if doc_type == "memorandum":
             return ("memorandum", c["section"])   # act_code is a shared constant
-        return (doc_type, c["act_code"])          # caselaw / agreement / directive
     def _diversify(self, ordered):
         """Reorder so no single case, memorandum, agreement or directive can
@@ -299,6 +341,36 @@ class LegislationIndex:
                 if c["section"] in refs and idx not in fused:
                     fused[idx] = 0.0
         def allowed(idx):
             c = self.chunks[idx]
             if act and act.lower() not in (c["act_short"].lower(), c["act_code"].lower()):

 W_SEM = 2.0         # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
 CANDIDATES = 80     # hits each retriever contributes to the fusion
 RERANK_POOL = 50    # top fused candidates the cross-encoder rescores
+MN_WEIGHT = 0.0024  # title-match boost per unit of idf-weighted overlap between
+                    # the query and a candidate's marginal note (section title)
+MN_CAP = 0.012      # ceiling on the title-match boost -- it nudges the ranking
+                    # without overriding a strong base score
+REG_PENALTY = 0.008 # small fusion penalty on regulation sections, so the Act
+                    # that creates a duty outranks the regulation elaborating it
+BACKMATTER_PENALTY = 0.008  # likewise for a collective agreement's back-matter
+                    # (memoranda, letters of understanding) vs its numbered articles
+SOURCE_CAP = 2      # max chunks one case or memorandum may contribute
+# Primary instruments -- enacted law, collective agreements, the NJC directives
+# incorporated into them, and the IRPA delegation instruments. Their sections or
+# items are distinct provisions, so (like legislation) they are never collapsed
+# under the diversity cap.
+PRIMARY_DOC_TYPES = frozenset({"legislation", "agreement", "directive", "delegation"})
 _TOKEN = re.compile(r"[a-z0-9]+")
 _SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
             raise RuntimeError(
                 f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
         self._build_bm25()
+        self._build_note_tokens()
         self._build_xref()
         self._load_semantic()
         self._load_reranker()
         self.avgdl = sum(self.doc_len) / n
         self.idf = {t: math.log(1 + (n - d + 0.5) / (d + 0.5)) for t, d in df.items()}
+    def _build_note_tokens(self):
+        """Pre-tokenise each chunk's topical title, for the title-match boost in
+        search(). For legislation, agreements and directives the title is the
+        marginal note (the section heading). A D-memorandum's marginal note is
+        generic ('Legislation', 'Guidelines and General Information', or a stray
+        page banner), so the memo's subject -- carried in its 'part' field -- is
+        used instead. Each chunk is also flagged as a regulation (act codes
+        beginning SOR/C.R.C.) for the Act-over-regulation preference, and as
+        collective-agreement back-matter (memoranda and letters with no article
+        number) for the back-matter penalty."""
+        self._note_tokens = []
+        self._is_regulation = []
+        self._is_backmatter = []
+        for c in self.chunks:
+            if c.get("doc_type") == "memorandum":
+                title = c.get("part") or c["marginal_note"]
+            else:
+                title = c["marginal_note"]
+            self._note_tokens.append(set(tokenize(title)))
+            self._is_regulation.append(
+                c.get("doc_type", "legislation") == "legislation"
+                and c["act_code"].startswith(("SOR", "C.R.C")))
+            self._is_backmatter.append(
+                c.get("doc_type") == "agreement"
+                and not str(c["section"])[:1].isdigit())
     def _load_semantic(self):
         """Load precomputed embeddings and the query embedder.
     def _source_key(self, idx):
         """The parent document a chunk belongs to, for diversity capping. Returns
+        None for primary instruments -- legislation, collective agreements,
+        directives and delegation instruments (PRIMARY_DOC_TYPES) -- whose
+        sections or items are distinct provisions and are never capped; case
+        law is keyed by citation, memoranda by memo number."""
         c = self.chunks[idx]
         doc_type = c.get("doc_type", "legislation")
+        if doc_type in PRIMARY_DOC_TYPES:
             return None
         if doc_type == "memorandum":
             return ("memorandum", c["section"])   # act_code is a shared constant
+        return (doc_type, c["act_code"])          # one decision, keyed by citation
     def _diversify(self, ordered):
         """Reorder so no single case, memorandum, agreement or directive can
                 if c["section"] in refs and idx not in fused:
                     fused[idx] = 0.0
+        # Title-match boost: the marginal note is a section's canonical subject.
+        # Reward a candidate by how completely and how specifically the query
+        # matches its marginal note. The overlap is idf-weighted (matching a
+        # distinctive title like "hours of work" counts far more than a generic
+        # one like "Decision"), scaled by coverage, and capped -- so it nudges
+        # ranking toward the provision a question names by topic without
+        # overriding a strong base score.
+        q_tokens = set(tokenize(expanded))
+        for idx in list(fused):
+            note_tokens = self._note_tokens[idx]
+            total = sum(self.idf.get(t, 0.0) for t in note_tokens)
+            if total <= 0:
+                continue
+            matched = sum(self.idf.get(t, 0.0)
+                          for t in note_tokens if t in q_tokens)
+            if matched > 0:
+                fused[idx] += min(MN_WEIGHT * matched * matched / total, MN_CAP)
+        # Hierarchy penalties: a topical question should surface the governing
+        # provision, not the supplementary material around it. An Act creates a
+        # duty while a regulation only elaborates procedure; a collective
+        # agreement's numbered articles are its substance while its memoranda
+        # and letters of understanding are back-matter. Both take a small
+        # fusion penalty so the governing provision wins a close contest.
+        for idx in list(fused):
+            if self._is_regulation[idx]:
+                fused[idx] -= REG_PENALTY
+            elif self._is_backmatter[idx]:
+                fused[idx] -= BACKMATTER_PENALTY
         def allowed(idx):
             c = self.chunks[idx]
             if act and act.lower() not in (c["act_short"].lower(), c["act_code"].lower()):

canlex/server.py CHANGED Viewed

@@ -110,6 +110,12 @@ def _format_section(c: dict, related=None) -> str:
         lines.append(f"(decided {c['current_to'] or 'n/a'})")
         if c["heading"]:
             lines.append(f"Subject: {c['heading']}")
     else:
         meta = [f"in force; text current to {c['current_to'] or 'n/a'}"]
         if c["last_amended"]:
@@ -176,8 +182,9 @@ class SearchInput(BaseModel):
         default=None,
         description="Optional filter by source type: 'legislation' (Acts and "
         "regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
-        "agreements), 'directive' (NJC directives), or 'caselaw' (court and "
-        "tribunal decisions). Omit to search all.",
     )
@@ -197,15 +204,16 @@ def canlex_search_legislation(params: SearchInput) -> str:
     """Search Canadian federal law, CBSA D-Memoranda, agreements, NJC directives,
     and leading court decisions.
-    The CanLex corpus has five kinds of source: 31 federal Acts and regulations
     (immigration, customs, criminal, drugs, food/health, labour, privacy and more);
     CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
     how it applies customs and border law); Treasury Board collective agreements
     (currently the FB / Border Services group); National Joint Council directives
-    (travel, relocation, isolated posts and more); and leading decisions of the
     courts and federal tribunals: the Supreme Court, Federal Court of Appeal and
     Federal Court, the Immigration and Refugee Board, and the FPSLREB and CIRB
-    labour boards. Use this for ANY question about that material. It ranks results by relevance and returns
     their full text so the answer can cite the actual wording; an explicit section
     reference (e.g. "section 34") is always surfaced. Each result is marked with its
     source type.
@@ -216,7 +224,8 @@ def canlex_search_legislation(params: SearchInput) -> str:
             - top_k (int): How many sections to return, 1-20 (default 6).
             - act (Optional[str]): Restrict to one Act by short name/code, or omit for all.
             - doc_type (Optional[str]): 'legislation', 'memorandum', 'agreement',
-              'directive', or 'caselaw' to restrict to one source type; omit for all.
     Returns:
         str: Markdown with answering instructions followed by the matching sections.
@@ -288,7 +297,8 @@ def canlex_get_section(params: GetSectionInput) -> str:
           annotations={"title": "List Loaded Legislation", **_READONLY})
 def canlex_list_acts() -> str:
     """List what the CanLex corpus contains -- Acts and regulations, CBSA
-    D-Memoranda, collective agreements, NJC directives, and leading cases.
     Use this to learn the scope and currency of the corpus before searching, or to
     report it to the user.
@@ -301,6 +311,7 @@ def canlex_list_acts() -> str:
     agreements: dict[str, dict] = {}
     directives: dict[str, dict] = {}
     cases: dict[str, dict] = {}
     memo_numbers: set[str] = set()
     memo_chunks = 0
     memo_date = ""
@@ -326,6 +337,12 @@ def canlex_list_acts() -> str:
                 "name": c["act_name"], "decided": c["current_to"], "count": 0,
             })
             entry["count"] += 1
         else:
             entry = acts.setdefault(c["act_code"], {
                 "short": c["act_short"], "name": c["act_name"],
@@ -357,10 +374,15 @@ def canlex_list_acts() -> str:
         for cite, a in sorted(cases.items(), key=lambda kv: kv[1]["decided"]):
             lines.append(f"- **{a['name']}**, {cite}: {a['count']} excerpts, "
                          f"decided {a['decided'] or 'n/a'}")
     lines += ["", "Search with canlex_search_legislation; filter by doc_type "
-              "(legislation / memorandum / agreement / directive / caselaw). Fetch "
-              "a known provision with canlex_get_section, or a case's citations "
-              "with canlex_case."]
     return "\n".join(lines)

         lines.append(f"(decided {c['current_to'] or 'n/a'})")
         if c["heading"]:
             lines.append(f"Subject: {c['heading']}")
+    elif doc_type == "delegation":
+        lines.append("_Instrument of delegation and designation — it records "
+                     "which officials the Minister has delegated powers to, or "
+                     "designated for functions, under IRPA and the IRPR. "
+                     "Administrative; confirm it is still the current version._")
+        lines.append(f"(dated {c['current_to'] or 'n/a'})")
     else:
         meta = [f"in force; text current to {c['current_to'] or 'n/a'}"]
         if c["last_amended"]:
         default=None,
         description="Optional filter by source type: 'legislation' (Acts and "
         "regulations), 'memorandum' (CBSA D-Memoranda), 'agreement' (collective "
+        "agreements), 'directive' (NJC directives), 'caselaw' (court and "
+        "tribunal decisions), or 'delegation' (IRPA/IRPR delegation and "
+        "designation instruments). Omit to search all.",
     )
     """Search Canadian federal law, CBSA D-Memoranda, agreements, NJC directives,
     and leading court decisions.
+    The CanLex corpus has six kinds of source: 31 federal Acts and regulations
     (immigration, customs, criminal, drugs, food/health, labour, privacy and more);
     CBSA D-Memoranda (the Canada Border Services Agency's administrative guidance on
     how it applies customs and border law); Treasury Board collective agreements
     (currently the FB / Border Services group); National Joint Council directives
+    (travel, relocation, isolated posts and more); leading decisions of the
     courts and federal tribunals: the Supreme Court, Federal Court of Appeal and
     Federal Court, the Immigration and Refugee Board, and the FPSLREB and CIRB
+    labour boards; and instruments of delegation and designation under IRPA and
+    the IRPR (which officials the Minister has authorized to exercise which powers). Use this for ANY question about that material. It ranks results by relevance and returns
     their full text so the answer can cite the actual wording; an explicit section
     reference (e.g. "section 34") is always surfaced. Each result is marked with its
     source type.
             - top_k (int): How many sections to return, 1-20 (default 6).
             - act (Optional[str]): Restrict to one Act by short name/code, or omit for all.
             - doc_type (Optional[str]): 'legislation', 'memorandum', 'agreement',
+              'directive', 'caselaw', or 'delegation' to restrict to one source
+              type; omit for all.
     Returns:
         str: Markdown with answering instructions followed by the matching sections.
           annotations={"title": "List Loaded Legislation", **_READONLY})
 def canlex_list_acts() -> str:
     """List what the CanLex corpus contains -- Acts and regulations, CBSA
+    D-Memoranda, collective agreements, NJC directives, leading cases, and
+    delegation instruments.
     Use this to learn the scope and currency of the corpus before searching, or to
     report it to the user.
     agreements: dict[str, dict] = {}
     directives: dict[str, dict] = {}
     cases: dict[str, dict] = {}
+    delegations: dict[str, dict] = {}
     memo_numbers: set[str] = set()
     memo_chunks = 0
     memo_date = ""
                 "name": c["act_name"], "decided": c["current_to"], "count": 0,
             })
             entry["count"] += 1
+        elif doc_type == "delegation":
+            entry = delegations.setdefault(c["act_code"], {
+                "short": c["act_short"], "name": c["act_name"],
+                "current_to": c["current_to"], "count": 0,
+            })
+            entry["count"] += 1
         else:
             entry = acts.setdefault(c["act_code"], {
                 "short": c["act_short"], "name": c["act_name"],
         for cite, a in sorted(cases.items(), key=lambda kv: kv[1]["decided"]):
             lines.append(f"- **{a['name']}**, {cite}: {a['count']} excerpts, "
                          f"decided {a['decided'] or 'n/a'}")
+    if delegations:
+        lines += ["", "## Delegation instruments"]
+        for a in sorted(delegations.values(), key=lambda x: x["short"]):
+            lines.append(f"- **{a['short']}** — {a['name']}: {a['count']} items, "
+                         f"dated {a['current_to'] or 'n/a'}")
     lines += ["", "Search with canlex_search_legislation; filter by doc_type "
+              "(legislation / memorandum / agreement / directive / caselaw / "
+              "delegation). Fetch a known provision with canlex_get_section, or "
+              "a case's citations with canlex_case."]
     return "\n".join(lines)

data/eval/questions.json CHANGED Viewed

@@ -92,5 +92,40 @@
   {"query": "What is the burden and standard of proof when an employer defends a disciplinary grievance?", "answers": [["Basra (2007)", ""]]},
   {"query": "What factors determine whether an employer had just cause to discipline or terminate an employee?", "answers": [["Pepper", ""]]},
   {"query": "What is the test for whether a union breached its duty of fair representation?", "answers": [["McRaeJackson", ""]]},
-  {"query": "Can a union breach the duty of fair representation in how it handles a vaccination-policy grievance?", "answers": [["Watson", ""]]}
 ]

   {"query": "What is the burden and standard of proof when an employer defends a disciplinary grievance?", "answers": [["Basra (2007)", ""]]},
   {"query": "What factors determine whether an employer had just cause to discipline or terminate an employee?", "answers": [["Pepper", ""]]},
   {"query": "What is the test for whether a union breached its duty of fair representation?", "answers": [["McRaeJackson", ""]]},
+  {"query": "Can a union breach the duty of fair representation in how it handles a vaccination-policy grievance?", "answers": [["Watson", ""]]},
+  {"query": "Is it consistent with the Charter of Rights to deport a permanent resident convicted of serious crimes?", "answers": [["Chiarelli", ""]]},
+  {"query": "When the Immigration Appeal Division hears an appeal from a removal order, may it weigh the hardship the person would face in the country of removal?", "answers": [["Chieu", ""]]},
+  {"query": "Does a non-citizen have a Charter right to remain in Canada?", "answers": [["Medovarski", ""]]},
+  {"query": "What must be established for inadmissibility for incitement to genocide or a crime against humanity?", "answers": [["Mugesera", ""]]},
+  {"query": "Is the IRPA security-certificate regime, with its special advocate scheme, consistent with the Charter?", "answers": [["Harkat", ""]]},
+  {"query": "How does the principle of non-refoulement constrain Canada when it extradites a person who holds refugee protection?", "answers": [["Németh", ""]]},
+  {"query": "What procedural fairness is owed to a sponsor before the government enforces a sponsorship-undertaking debt?", "answers": [["Mavi", ""]]},
+  {"query": "Can a sentencing court take the immigration consequences of a sentence into account?", "answers": [["Pham", ""]]},
+  {"query": "Can a fear of coerced sterilization under a one-child population-control policy support a refugee claim?", "answers": [["Chan", ""]]},
+  {"query": "Can a customs officer stop and search a vehicle near the border on reasonable suspicion of a customs offence?", "answers": [["Jacques", ""]]},
+  {"query": "Is an ascertained-forfeiture proceeding under the Customs Act penal enough to engage the right against self-incrimination?", "answers": [["Martineau", ""]]},
+  {"query": "Does a person's age at the time of involvement matter to inadmissibility for membership in a terrorist organization?", "answers": [["Poshteh", ""]]},
+  {"query": "Can a deserter from the United States military succeed in a refugee or humanitarian and compassionate claim in Canada?", "answers": [["Hinzman", ""]]},
+  {"query": "Does a Refugee Protection Division guideline that sets the order of questioning improperly fetter a member's discretion?", "answers": [["Thamotharem", ""]]},
+  {"query": "Can a person with no immigration status obtain coverage under the Interim Federal Health Program?", "answers": [["Toussaint", ""]]},
+  {"query": "What does a finding of no credible basis mean for a refugee claim?", "answers": [["Rahaman", ""]]},
+  {"query": "How is the reasonableness of a security certificate assessed on a re-determination after Charkaoui?", "answers": [["Almrei", ""]]},
+  {"query": "Can a permanent resident be found inadmissible for acts of violence that endanger safety in Canada even without a conviction?", "answers": [["Mason", ""]]},
+  {"query": "How much deference does the Immigration Appeal Division receive when it declines to grant discretionary relief from removal?", "answers": [["Khosa", ""]]},
+  {"query": "Is lengthy immigration detention consistent with the Charter protection against arbitrary detention?", "answers": [["Brown", ""]]},
+  {"query": "What meal and incidental allowances can a federal employee claim while travelling on government business?", "answers": [["Travel Directive", ""]]},
+  {"query": "What is a federal employee entitled to when the employer requires them to wear a uniform?", "answers": [["Uniforms Directive", ""]]},
+  {"query": "What protections does an indeterminate employee have when their position is declared surplus?", "answers": [["Work Force Adjustment Directive", ""]]},
+  {"query": "Is there an allowance for an employee required to administer first aid to the general public?", "answers": [["First Aid to the General Public - Allowance for Employees", ""]]},
+  {"query": "What allowances are available to a federal employee posted at an isolated post?", "answers": [["Isolated Posts and Government Housing Directive", ""]]},
+  {"query": "What relocation expenses are reimbursed when a federal employee must move for work?", "answers": [["NJC Relocation Directive", ""]]},
+  {"query": "What occupational health and safety obligations does the NJC directive place on the employer?", "answers": [["Occupational Health and Safety Directive", ""]]},
+  {"query": "What coverage does the Public Service Health Care Plan provide?", "answers": [["Public Service Health Care Plan Directive", ""]]},
+  {"query": "What support do the Foreign Service Directives give an employee posted outside Canada?", "answers": [["Foreign Service Directives", ""]]},
+  {"query": "Who has a right to appeal a decision to the Immigration Appeal Division?", "answers": [["IRPA", "63"]]},
+  {"query": "What are the objectives of the Immigration and Refugee Protection Act?", "answers": [["IRPA", "3"]]},
+  {"query": "Can a non-unionized federally regulated employee complain that they were unjustly dismissed?", "answers": [["Canada Labour Code", "240"]]},
+  {"query": "What does CBSA disclose about normal values and export prices under the Special Import Measures Act?", "answers": [["D-Memo", "D14-1-2"]]},
+  {"query": "What are CBSA's requirements for importing or exporting cultural property?", "answers": [["D-Memo", "D19-4-1"]]},
+  {"query": "When assessing criminal inadmissibility for a conviction abroad, must a decision-maker consider whether the defence of duress was practically available in that country?", "answers": [["Rodriguez Anzola", ""]]}
 ]

data/processed/caselaw.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/processed/delegation.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/processed/dmemos.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

tests/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


+"""CanLex unit-test suite."""

tests/test_embed.py ADDED Viewed

	@@ -0,0 +1,33 @@

+"""Unit tests for the embedding-text builder (canlex/embed.py)."""
+import unittest
+from canlex.embed import embed_text
+def _chunk(**kw):
+    base = {"doc_type": "legislation", "act_short": "X", "marginal_note": "",
+            "part": "", "heading": "", "text": ""}
+    base.update(kw)
+    return base
+class EmbedTextTests(unittest.TestCase):
+    def test_memorandum_title_comes_from_part(self):
+        # A memo's marginal note is generic; its subject lives in 'part'.
+        out = embed_text(_chunk(doc_type="memorandum", act_short="D-Memo",
+                                marginal_note="Guidelines",
+                                part="Value for Duty", text="body"))
+        self.assertIn("Value for Duty", out)
+    def test_legislation_title_comes_from_marginal_note(self):
+        out = embed_text(_chunk(marginal_note="Application for protection",
+                                part="PART 2", text="body"))
+        self.assertIn("Application for protection", out)
+    def test_title_is_repeated_for_emphasis(self):
+        out = embed_text(_chunk(marginal_note="UNIQUEWORD", text="b"))
+        self.assertEqual(out.count("UNIQUEWORD"), 2)
+if __name__ == "__main__":
+    unittest.main()

tests/test_index.py ADDED Viewed

	@@ -0,0 +1,170 @@

+"""Unit tests for the retrieval pipeline (canlex/index.py).
+Fast, offline tests of the pure retrieval logic -- tokenisation, section-
+reference parsing, the diversity cap, the result-set guarantee and the doc-type
+flags. They build a bare LegislationIndex via __new__, so no corpus, embeddings
+or reranker are loaded.
+    python -m unittest discover -s tests
+"""
+import unittest
+from canlex.index import (
+    LegislationIndex, SOURCE_CAP, tokenize, _section_refs, _provision_units,
+)
+def chunk(doc_type="legislation", act_code="I-2.5", section="1",
+          marginal_note="Title", part="", **extra):
+    """A minimal corpus chunk carrying the fields the index logic reads."""
+    c = {"doc_type": doc_type, "act_code": act_code, "section": section,
+         "marginal_note": marginal_note, "part": part, "heading": "",
+         "act_short": "X", "text": ""}
+    c.update(extra)
+    return c
+def bare_index(chunks):
+    """A LegislationIndex with only .chunks set -- enough for the pure methods."""
+    idx = LegislationIndex.__new__(LegislationIndex)
+    idx.chunks = chunks
+    return idx
+class TokenizeTests(unittest.TestCase):
+    def test_case_insensitive(self):
+        self.assertEqual(tokenize("REPORT goods"), tokenize("report Goods"))
+    def test_stemming_unifies_word_forms(self):
+        # The point of stemming: different forms collapse to one token.
+        self.assertEqual(tokenize("reporting"), tokenize("reported"))
+        self.assertEqual(tokenize("importation"), tokenize("import"))
+    def test_splits_on_non_alphanumeric(self):
+        self.assertEqual(tokenize("s.34(1)(a)"), ["s", "34", "1", "a"])
+    def test_empty(self):
+        self.assertEqual(tokenize(""), [])
+class SectionRefTests(unittest.TestCase):
+    def test_plain_section(self):
+        self.assertEqual(_section_refs("inadmissible under section 34"), {"34"})
+    def test_decimal_and_abbreviated(self):
+        self.assertEqual(_section_refs("see s. 20.1 and section 5"), {"20.1", "5"})
+    def test_no_reference(self):
+        self.assertEqual(_section_refs("what is a pre-removal risk assessment"),
+                         set())
+class ProvisionUnitsTests(unittest.TestCase):
+    def test_structured_provision_yields_units(self):
+        text = "(1) The chapeau.\n(a) first paragraph\n(b) second paragraph"
+        self.assertTrue(_provision_units(text))
+    def test_flat_provision_yields_nothing(self):
+        self.assertEqual(_provision_units("A flat provision with no markers."),
+                         [])
+class SourceKeyTests(unittest.TestCase):
+    """_source_key decides what the diversity cap collapses."""
+    def test_primary_instruments_are_never_capped(self):
+        idx = bare_index([
+            chunk(doc_type="legislation"),
+            chunk(doc_type="agreement", act_code="FB"),
+            chunk(doc_type="directive", act_code="d1"),
+        ])
+        for i in range(3):
+            self.assertIsNone(idx._source_key(i))
+    def test_caselaw_and_memoranda_are_keyed(self):
+        idx = bare_index([
+            chunk(doc_type="memorandum", act_code="D-Memo", section="D1-1-1"),
+            chunk(doc_type="caselaw", act_code="2019 SCC 65"),
+        ])
+        self.assertEqual(idx._source_key(0), ("memorandum", "D1-1-1"))
+        self.assertEqual(idx._source_key(1), ("caselaw", "2019 SCC 65"))
+class DiversifyTests(unittest.TestCase):
+    def test_caps_caselaw_per_decision(self):
+        n = SOURCE_CAP + 2
+        chunks = [chunk(doc_type="caselaw", act_code="2019 SCC 65")
+                  for _ in range(n)]
+        chunks.append(chunk(doc_type="legislation"))   # index n
+        idx = bare_index(chunks)
+        out = idx._diversify(list(range(n + 1)))
+        kept, deferred = out[:SOURCE_CAP + 1], out[SOURCE_CAP + 1:]
+        self.assertIn(n, kept)                          # legislation never capped
+        self.assertEqual(
+            sum(1 for i in kept if idx.chunks[i]["doc_type"] == "caselaw"),
+            SOURCE_CAP)
+        self.assertEqual(len(deferred), n - SOURCE_CAP)
+    def test_does_not_cap_agreements(self):
+        n = SOURCE_CAP + 3
+        idx = bare_index([chunk(doc_type="agreement", act_code="FB",
+                                section=str(i)) for i in range(n)])
+        out = idx._diversify(list(range(n)))
+        self.assertEqual(out, list(range(n)))           # uncapped: order intact
+class EnsureLegislationTests(unittest.TestCase):
+    def test_pulls_legislation_into_a_caselaw_dominated_top_k(self):
+        idx = bare_index([
+            chunk(doc_type="caselaw", act_code="A"),
+            chunk(doc_type="caselaw", act_code="B"),
+            chunk(doc_type="caselaw", act_code="C"),
+            chunk(doc_type="legislation"),
+            chunk(doc_type="legislation"),
+        ])
+        out = idx._ensure_legislation([0, 1, 2, 3, 4], top_k=3)
+        top = out[:3]
+        n_leg = sum(1 for i in top
+                    if idx.chunks[i]["doc_type"] == "legislation")
+        self.assertGreaterEqual(n_leg, 2)
+        self.assertEqual(out[0], 0)                     # the #1 hit is preserved
+    def test_no_op_when_legislation_already_present(self):
+        idx = bare_index([
+            chunk(doc_type="legislation"),
+            chunk(doc_type="legislation"),
+            chunk(doc_type="caselaw", act_code="A"),
+        ])
+        self.assertEqual(idx._ensure_legislation([0, 1, 2], top_k=3), [0, 1, 2])
+class DocTypeFlagTests(unittest.TestCase):
+    """_build_note_tokens also flags regulations and agreement back-matter."""
+    def setUp(self):
+        self.idx = bare_index([
+            chunk(doc_type="legislation", act_code="I-2.5"),
+            chunk(doc_type="legislation", act_code="SOR-2002-227"),
+            chunk(doc_type="legislation", act_code="C.R.C.,_c._1041"),
+            chunk(doc_type="agreement", act_code="FB", section="17"),
+            chunk(doc_type="agreement", act_code="FB", section=""),
+            chunk(doc_type="memorandum", act_code="D-Memo", section="D1-1-1",
+                  marginal_note="Guidelines", part="Importing goods"),
+        ])
+        self.idx._build_note_tokens()
+    def test_regulation_flag(self):
+        self.assertEqual(self.idx._is_regulation,
+                         [False, True, True, False, False, False])
+    def test_agreement_backmatter_flag(self):
+        self.assertEqual(self.idx._is_backmatter,
+                         [False, False, False, False, True, False])
+    def test_memorandum_title_tokens_come_from_part(self):
+        # A memo's marginal note is generic; its title is the 'part' field.
+        self.assertEqual(self.idx._note_tokens[5], set(tokenize("Importing goods")))
+if __name__ == "__main__":
+    unittest.main()

tests/test_synonyms.py ADDED Viewed

	@@ -0,0 +1,27 @@

+"""Unit tests for legal-abbreviation query expansion (canlex/synonyms.py)."""
+import unittest
+from canlex.synonyms import expand_query
+class ExpandQueryTests(unittest.TestCase):
+    def test_keeps_the_original_query(self):
+        self.assertTrue(
+            expand_query("PRRA eligibility").startswith("PRRA eligibility"))
+    def test_expands_a_known_abbreviation(self):
+        # 'PRRA' should pull in the statutory wording the Act actually uses.
+        self.assertIn("application for protection",
+                      expand_query("PRRA eligibility"))
+    def test_case_insensitive(self):
+        self.assertIn("humanitarian and compassionate",
+                      expand_query("an H&C application"))
+    def test_unknown_query_is_unchanged(self):
+        q = "what are the standard hours of work"
+        self.assertEqual(expand_query(q), q)
+if __name__ == "__main__":
+    unittest.main()