Spaces:

Beemer0
/

CanLex

Running

Beemer Claude Opus 4.7 committed 6 days ago

Commit

1e58371

1 Parent(s): 2966f10

Add output-quality features: highlighting, hedging, linking, currency

Retrieval and presentation improvements for better-grounded answers:

- Legal-abbreviation query expansion (new canlex/synonyms.py): PRRA,
H&C, RAD, CBSA and similar shorthand expand to statutory wording
before BM25 and semantic retrieval.
- Diversity cap: no single case or memo may take more than two result
slots, so one document cannot monopolise a topical query.
- Legislation guarantee: when a result set is dominated by case law,
the governing statute is pulled into it.
- Pinpoint highlighting: each long result flags the subsection or
paragraph most on point (e.g. s. 34(1)(c)), scored by the
cross-encoder.
- Low-confidence hedging: when the top semantic match is weak, the
search tool warns that the corpus may not address the question.
- Currency signalling: every legislation result carries a prominent
Currency line; the grounding note demands a dated answer.
- Cross-reference linking: a result also surfaces the regulations made
under its Act (and a regulation its enabling Act) and the CBSA
D-memoranda that cite the provision.

Eval: Hit@3 0.74 -> 0.77, Hit@5 holds at 0.89, no regression.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (3) hide show

canlex/index.py +193 -17
canlex/server.py +47 -11
canlex/synonyms.py +62 -0

canlex/index.py CHANGED Viewed

@@ -8,12 +8,14 @@ from collections import Counter, defaultdict
 import snowballstemmer
 from .config import PROCESSED_DIR
 K1 = 1.5
 B = 0.75
 RRF_K = 60          # reciprocal-rank-fusion damping constant
 CANDIDATES = 80     # hits each retriever contributes to the fusion
 RERANK_POOL = 50    # top fused candidates the cross-encoder rescores
 _TOKEN = re.compile(r"[a-z0-9]+")
 _SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
@@ -23,6 +25,16 @@ _XREF = re.compile(
     r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)",
     re.IGNORECASE)
 _STEMMER = snowballstemmer.stemmer("english")
 _STEM_CACHE = {}
@@ -49,6 +61,37 @@ def _section_refs(query):
     return set(_SECTION_REF.findall(query.lower()))
 class LegislationIndex:
     def __init__(self):
         self.chunks = []
@@ -148,20 +191,104 @@ class LegislationIndex:
         qv = self.embedder.encode_query(query)
         sims = self.vectors @ qv
         order = self._np.argsort(sims)[::-1][:CANDIDATES]
-        return [int(i) for i in order]
     def _rerank_doc(self, idx):
         c = self.chunks[idx]
         return f"{c['citation']} — {c['marginal_note']}\n{c['text']}"
     def search(self, query, top_k=6, act=None, doc_type=None):
         """Hybrid candidate fusion (BM25 + semantic), then cross-encoder rerank."""
         fused = defaultdict(float)
-        bm25 = self._bm25_scores(query)
         for rank, idx in enumerate(sorted(bm25, key=bm25.get, reverse=True)[:CANDIDATES]):
             fused[idx] += 1.0 / (RRF_K + rank)
         if self.semantic:
-            for rank, idx in enumerate(self._semantic_ranking(query)):
                 fused[idx] += 1.0 / (RRF_K + rank)
         # Ensure explicitly-referenced sections are retrieved even if recall missed them.
@@ -208,7 +335,20 @@ class LegislationIndex:
                 pinned_set = set(pinned)
                 candidates = pinned + [i for i in candidates if i not in pinned_set]
-        return [{**self.chunks[i], "score": round(scores[i], 4)} for i in candidates[:top_k]]
     def get_section(self, act, section):
         act = act.lower()
@@ -218,10 +358,12 @@ class LegislationIndex:
         return None
     def _build_xref(self):
-        """Index legislation by (act, section) and locate each Act's definitions
-        section, to support cross-reference lookup."""
         self._by_section = {}
         self._defs_section = {}
         for c in self.chunks:
             if c.get("doc_type", "legislation") != "legislation":
                 continue
@@ -230,20 +372,48 @@ class LegislationIndex:
                     c["marginal_note"].strip().lower() in (
                         "definitions", "definition", "interpretation")):
                 self._defs_section[c["act_code"]] = c
     def related(self, chunk):
-        """Return [(section, marginal_note), ...]: provisions of the same Act
-        that this one cross-references, plus the Act's definitions section.
-        Legislation chunks only; returns [] for case law, memoranda, etc.
-        """
         if chunk.get("doc_type", "legislation") != "legislation":
-            return []
         act = chunk["act_code"]
-        out, seen = [], {chunk["section"]}
         defs = self._defs_section.get(act)
         if defs and defs["section"] not in seen:
-            out.append((defs["section"], defs["marginal_note"]))
             seen.add(defs["section"])
         for match in _XREF.finditer(chunk["text"]):
             sec = match.group(1)
@@ -251,11 +421,17 @@ class LegislationIndex:
                 continue
             target = self._by_section.get((act, sec))
             if target:
-                out.append((sec, target["marginal_note"]))
                 seen.add(sec)
-            if len(out) >= 8:
                 break
-        return out
 def main():

 import snowballstemmer
 from .config import PROCESSED_DIR
+from .synonyms import expand_query
 K1 = 1.5
 B = 0.75
 RRF_K = 60          # reciprocal-rank-fusion damping constant
 CANDIDATES = 80     # hits each retriever contributes to the fusion
 RERANK_POOL = 50    # top fused candidates the cross-encoder rescores
+SOURCE_CAP = 2      # max chunks one case/memo/agreement/directive may contribute
 _TOKEN = re.compile(r"[a-z0-9]+")
 _SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
     r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)",
     re.IGNORECASE)
+# A line opening with "(1)", "(a)" or "(b.1)" -- a citable subdivision
+# (subsection, paragraph or subparagraph) of a provision.
+_MARKER = re.compile(r"(?m)^\(([0-9a-zA-Z]+(?:\.\d+)?)\)")
+# A D-memorandum's reference to a provision -- "section 32 of the Customs Act",
+# or "section 32 of the Act" (the Act a D-memo administers -- the Customs Act).
+_MEMO_CITE = re.compile(
+    r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
+    r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)
 _STEMMER = snowballstemmer.stemmer("english")
 _STEM_CACHE = {}
     return set(_SECTION_REF.findall(query.lower()))
+def _provision_units(text):
+    """Split a provision into its citable parts for pinpoint scoring.
+
+    Returns a list of (citation_suffix, scoring_text, snippet) tuples: one
+    entry per paragraph, with its subsection chapeau prepended to scoring_text
+    for context, plus one per paragraph-less subsection. Subdivisions whose
+    text opens with "[Repealed" are skipped. Returns [] when the provision is
+    too flat to pinpoint (fewer than two subdivision markers).
+    """
+    marks = list(_MARKER.finditer(text))
+    if len(marks) < 2:
+        return []
+    # Slice the text into (token, body) spans, each running from one marker to
+    # the next (or to the end of the text for the last marker).
+    spans = []
+    for i, m in enumerate(marks):
+        end = marks[i + 1].start() if i + 1 < len(marks) else len(text)
+        spans.append((m.group(1), text[m.start():end].strip()))
+    # cur_sub is the enclosing subsection label, e.g. "(1)"; cur_intro is that
+    # subsection's own text, reused as context for the paragraphs under it.
+    units, cur_sub, cur_intro = [], "", ""
+    for j, (token, body) in enumerate(spans):
+        if "[Repealed" in body[:40]:
+            # A repealed subsection still resets the context, so paragraphs
+            # after it are not attributed to the previous subsection.
+            if token[0].isdigit():
+                cur_sub, cur_intro = f"({token})", ""
+            continue
+        if token[0].isdigit():
+            # Numeric token: a subsection. Emit it on its own only when the
+            # next marker is another subsection or there is no next marker.
+            cur_sub, cur_intro = f"({token})", body
+            nxt = spans[j + 1][0] if j + 1 < len(spans) else ""
+            if not nxt or nxt[0].isdigit():
+                units.append((cur_sub, body, body))   # subsection has no paragraphs
+            # otherwise the chapeau is emitted via its paragraphs below
+        else:
+            # NOTE(review): every non-numeric token is labelled as sitting
+            # directly under the current subsection, so a nested subparagraph
+            # marker such as "(i)" would presumably lose its parent paragraph
+            # letter -- confirm against real provisions.
+            label = f"{cur_sub}({token})" if cur_sub else f"({token})"
+            units.append((label, f"{cur_intro} {body}".strip(), body))
+    return units
 class LegislationIndex:
     def __init__(self):
         self.chunks = []
         qv = self.embedder.encode_query(query)
         sims = self.vectors @ qv
         order = self._np.argsort(sims)[::-1][:CANDIDATES]
+        # The top cosine similarity doubles as a corpus-coverage signal: a query
+        # the corpus cannot answer has no passage close to it.
+        return [int(i) for i in order], float(sims.max())
     def _rerank_doc(self, idx):
         c = self.chunks[idx]
         return f"{c['citation']} — {c['marginal_note']}\n{c['text']}"
+    def _source_key(self, idx):
+        """The parent document a chunk belongs to, for diversity capping. Returns
+        None for legislation -- each section is a distinct provision and is never
+        capped; case law is keyed by citation, memoranda by memo number."""
+        c = self.chunks[idx]
+        doc_type = c.get("doc_type", "legislation")
+        if doc_type == "legislation":
+            return None
+        if doc_type == "memorandum":
+            return ("memorandum", c["section"])   # act_code is a shared constant
+        return (doc_type, c["act_code"])          # caselaw / agreement / directive
+    def _diversify(self, ordered):
+        """Reorder so no single case, memorandum, agreement or directive can
+        monopolise the results: once a source has contributed SOURCE_CAP chunks,
+        its remaining chunks are deferred below every other candidate. This stops
+        a heavily paragraph-chunked decision from crowding out the statute it
+        interprets. Legislation is never capped."""
+        kept, deferred, counts = [], [], defaultdict(int)
+        for idx in ordered:
+            key = self._source_key(idx)
+            if key is None:
+                kept.append(idx)
+                continue
+            counts[key] += 1
+            (kept if counts[key] <= SOURCE_CAP else deferred).append(idx)
+        return kept + deferred
+    def _ensure_legislation(self, ordered, top_k):
+        """Guarantee the governing statute is surfaced: when the natural top_k is
+        monopolised by case law or memoranda, pull the best legislation results
+        up to just below the top hit, displacing the lowest-ranked secondary
+        sources. The single best match is always kept in place."""
+        if top_k < 3:
+            return ordered
+        def is_leg(i):
+            return self.chunks[i].get("doc_type", "legislation") == "legislation"
+        top, rest = ordered[:top_k], ordered[top_k:]
+        need = 2 - sum(1 for i in top if is_leg(i))
+        if need <= 0:
+            return ordered
+        promote = [i for i in rest if is_leg(i)][:need]
+        drop = [i for i in reversed(top) if not is_leg(i)][:len(promote)]
+        if not drop:
+            return ordered
+        promote = promote[:len(drop)]
+        dropped, promoted = set(drop), set(promote)
+        kept = [i for i in top if i not in dropped]
+        return kept[:1] + promote + kept[1:] + drop + [
+            i for i in rest if i not in promoted]
+    def _highlight(self, query, indices):
+        """For each result chunk, the subsection or paragraph most on point for
+        the query: {result_position: (citation_suffix, snippet)}. Uses the
+        cross-encoder; returns {} if it is unavailable or nothing is structured.
+        Only the first results are scored -- a pinpoint deep in the list is not
+        worth the cross-encoder cost."""
+        if not self.reranker:
+            return {}
+        jobs = []   # (result_position, label, scoring_text, snippet)
+        for pos, idx in enumerate(indices[:8]):
+            c = self.chunks[idx]
+            if c.get("doc_type", "legislation") != "legislation":
+                continue
+            note = c["marginal_note"]
+            for label, scoring, snippet in _provision_units(c["text"]):
+                jobs.append((pos, label, f"{note}. {scoring}", snippet))
+        if not jobs:
+            return {}
+        best = {}   # result_position -> (score, label, snippet)
+        for (pos, label, _, snippet), score in zip(
+                jobs, self.reranker.score(query, [j[2] for j in jobs])):
+            if pos not in best or score > best[pos][0]:
+                best[pos] = (score, label, snippet)
+        return {pos: (label, " ".join(snippet[:240].split()))
+                for pos, (score, label, snippet) in best.items()}
     def search(self, query, top_k=6, act=None, doc_type=None):
         """Hybrid candidate fusion (BM25 + semantic), then cross-encoder rerank."""
+        # Expand legal abbreviations (PRRA, H&C, ...) into statutory wording for
+        # the recall stages; the reranker still sees the user's original query.
+        expanded = expand_query(query)
+        confidence = None
         fused = defaultdict(float)
+        bm25 = self._bm25_scores(expanded)
         for rank, idx in enumerate(sorted(bm25, key=bm25.get, reverse=True)[:CANDIDATES]):
             fused[idx] += 1.0 / (RRF_K + rank)
         if self.semantic:
+            sem_order, confidence = self._semantic_ranking(expanded)
+            for rank, idx in enumerate(sem_order):
                 fused[idx] += 1.0 / (RRF_K + rank)
         # Ensure explicitly-referenced sections are retrieved even if recall missed them.
                 pinned_set = set(pinned)
                 candidates = pinned + [i for i in candidates if i not in pinned_set]
+        # Cap one-source monopolies, then guarantee the statute is represented.
+        candidates = self._diversify(candidates)
+        candidates = self._ensure_legislation(candidates, top_k)
+        top = candidates[:top_k]
+        highlights = self._highlight(query, top)
+        results = []
+        for pos, i in enumerate(top):
+            result = {**self.chunks[i], "score": round(scores[i], 4),
+                      "confidence": confidence}
+            if pos in highlights:
+                result["highlight"] = highlights[pos]
+            results.append(result)
+        return results
     def get_section(self, act, section):
         act = act.lower()
         return None
     def _build_xref(self):
+        """Index legislation by (act, section); find each Act's definitions
+        section; link every regulation to its enabling Act and every
+        D-memorandum to the provisions it cites -- all for cross-referencing."""
         self._by_section = {}
         self._defs_section = {}
+        acts, regs = {}, {}        # act_code -> (act_short, act_name)
         for c in self.chunks:
             if c.get("doc_type", "legislation") != "legislation":
                 continue
                     c["marginal_note"].strip().lower() in (
                         "definitions", "definition", "interpretation")):
                 self._defs_section[c["act_code"]] = c
+            bucket = regs if c["act_code"].startswith(("SOR", "C.R.C")) else acts
+            bucket.setdefault(c["act_code"], (c["act_short"], c["act_name"]))
+        # Link a regulation to the Act it is made under by matching their names
+        # ("X Regulations" <-> "X Act").
+        self._enabling_act = {}                 # reg code -> (act_short, act_name)
+        self._regulations = defaultdict(list)   # act code -> [(reg_short, reg_name)]
+        def base(name):
+            return re.sub(r"\b(?:Act|Regulations)\b", "", name).strip().lower()
+        act_by_base = {base(n): (code, s, n) for code, (s, n) in acts.items()}
+        for rcode, (rshort, rname) in regs.items():
+            hit = act_by_base.get(base(rname))
+            if hit:
+                self._enabling_act[rcode] = (hit[1], hit[2])
+                self._regulations[hit[0]].append((rshort, rname))
+        # Link D-memoranda to the Customs Act / Customs Tariff provisions they
+        # cite; an unqualified "the Act" in a D-memo means the Customs Act.
+        by_short = {s.lower(): code for code, (s, n) in acts.items()}
+        customs, tariff = by_short.get("customs act"), by_short.get("customs tariff")
+        self._memos_for_section = defaultdict(set)   # (act_code, section) -> memos
+        for c in self.chunks:
+            if c.get("doc_type") != "memorandum":
+                continue
+            for num, which in _MEMO_CITE.findall(c["text"]):
+                code = tariff if which.lower() == "customs tariff" else customs
+                if code:
+                    self._memos_for_section[(code, num)].add(c["section"])
     def related(self, chunk):
+        """Cross-references for a legislation result, as a dict: 'provisions'
+        (intra-Act sections it cites, plus the definitions section),
+        'regulations' (made under this Act), 'enabling_act' (for a regulation,
+        the Act it is made under) and 'memoranda' (D-memo numbers citing this
+        section). Empty dict for case law, memoranda, etc."""
         if chunk.get("doc_type", "legislation") != "legislation":
+            return {}
         act = chunk["act_code"]
+        provisions, seen = [], {chunk["section"]}
         defs = self._defs_section.get(act)
         if defs and defs["section"] not in seen:
+            provisions.append((defs["section"], defs["marginal_note"]))
             seen.add(defs["section"])
         for match in _XREF.finditer(chunk["text"]):
             sec = match.group(1)
                 continue
             target = self._by_section.get((act, sec))
             if target:
+                provisions.append((sec, target["marginal_note"]))
                 seen.add(sec)
+            if len(provisions) >= 8:
                 break
+        return {
+            "provisions": provisions,
+            "regulations": self._regulations.get(act, []),
+            "enabling_act": self._enabling_act.get(act),
+            "memoranda": sorted(self._memos_for_section.get(
+                (act, chunk["section"]), []))[:6],
+        }
 def main():

canlex/server.py CHANGED Viewed

@@ -31,9 +31,10 @@ _READONLY = {
 GROUNDING_NOTE = (
     "ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
     "specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
-    "When a result lists Related provisions, fetch any that bear on the question "
-    "-- the definitions section, an exception, a cross-referenced rule -- with "
-    "canlex_get_section before answering. "
     "Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
     "CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
     "and a court may disagree with them; collective agreements and the National "
@@ -43,13 +44,24 @@ GROUNDING_NOTE = (
     "name the deciding court and the date, and do not assume a decision is still "
     "good law if it may have been overtaken (the canlex_case tool checks a "
     "decision's later treatment on CanLII -- give it the neutral citation). "
-    "State the "
-    "'current to', modified, or in-force date when stating the law. If the material "
     "below does not fully resolve the question -- including where it turns on case "
     "law or facts not present here -- say so explicitly. This is legal information, "
     "not legal advice."
 )
 _INDEX: Optional[LegislationIndex] = None
@@ -99,16 +111,36 @@ def _format_section(c: dict, related=None) -> str:
         if c["heading"]:
             lines.append(f"Subject: {c['heading']}")
     else:
-        meta = [f"current to {c['current_to'] or 'n/a'}"]
         if c["last_amended"]:
             meta.append(f"last amended {c['last_amended']}")
-        lines.append(f"({'; '.join(meta)})")
     lines.append("")
     lines.append(c["text"])
     lines.append("")
     if related:
-        refs = "; ".join(f"s. {s} ({n})" if n else f"s. {s}" for s, n in related)
-        lines.append(f"Related provisions in this Act: {refs}")
     if c["history"]:
         if doc_type == "caselaw":
             lines.append(f"Also reported: {c['history']}")
@@ -205,8 +237,12 @@ def canlex_search_legislation(params: SearchInput) -> str:
         return (f"No results matched '{params.query}'{scope}. "
                 f"Try broader or different keywords, or call canlex_list_acts to see "
                 f"what is currently loaded.")
-    blocks = [GROUNDING_NOTE, "",
-              f'{len(results)} relevant section(s) for: "{params.query}"']
     for c in results:
         blocks.append("")
         blocks.append("---")

 GROUNDING_NOTE = (
     "ANSWERING INSTRUCTIONS: Base the answer only on the material below. Cite "
     "specific provisions and quote key operative words (e.g. 'IRPA s. 34(1)(c)'). "
+    "When a result lists related provisions, regulations or D-memoranda, fetch "
+    "any that bear on the question -- the definitions section, an exception, a "
+    "cross-referenced rule, the regulation that adds detail -- with "
+    "canlex_get_section or canlex_search_legislation before answering. "
     "Distinguish the kinds of source: enacted law (Acts and regulations) is binding; "
     "CBSA D-Memoranda are administrative guidance -- persuasive only, not binding, "
     "and a court may disagree with them; collective agreements and the National "
     "name the deciding court and the date, and do not assume a decision is still "
     "good law if it may have been overtaken (the canlex_case tool checks a "
     "decision's later treatment on CanLII -- give it the neutral citation). "
+    "Always state the date the source is current to, and that the answer "
+    "reflects the law only as of that date -- for a time-sensitive matter, tell "
+    "the reader to verify no amendment has come into force since. If the material "
     "below does not fully resolve the question -- including where it turns on case "
     "law or facts not present here -- say so explicitly. This is legal information, "
     "not legal advice."
 )
+# When the first result's "confidence" value falls below this threshold, the
+# search output is prefixed with WEAK_MATCH_NOTE.
+HEDGE_THRESHOLD = 0.72
+# Caution shown ahead of the grounding note for a weakly matched query.
+WEAK_MATCH_NOTE = (
+    "RETRIEVAL CAUTION: the material below is only a weak match for this query "
+    "— CanLex may not contain a provision or decision that directly answers it. "
+    "Read it critically; if it does not actually address the question, say so "
+    "plainly rather than stretching it to fit, and consider canlex_list_acts to "
+    "check what the corpus covers."
+)
 _INDEX: Optional[LegislationIndex] = None
         if c["heading"]:
             lines.append(f"Subject: {c['heading']}")
     else:
+        meta = [f"in force; text current to {c['current_to'] or 'n/a'}"]
         if c["last_amended"]:
             meta.append(f"last amended {c['last_amended']}")
+        lines.append(f"**Currency:** {'; '.join(meta)}. Does not reflect any "
+                     f"amendment that came into force after the 'current to' date.")
+    hl = c.get("highlight")
+    if hl:
+        label, snippet = hl
+        lines.append(f"**Most on point for this query:** "
+                     f"{c['citation']}{label} — {snippet}")
     lines.append("")
     lines.append(c["text"])
     lines.append("")
     if related:
+        provisions = related.get("provisions")
+        if provisions:
+            refs = "; ".join(f"s. {s} ({n})" if n else f"s. {s}"
+                             for s, n in provisions)
+            lines.append(f"Related provisions in this Act: {refs}")
+        regs = related.get("regulations")
+        if regs:
+            lines.append("Regulations made under this Act: "
+                         + "; ".join(f"{n} ({s})" for s, n in regs))
+        enabling = related.get("enabling_act")
+        if enabling:
+            lines.append(f"Made under: {enabling[1]} ({enabling[0]})")
+        memos = related.get("memoranda")
+        if memos:
+            lines.append("CBSA D-memoranda citing this section (guidance, not "
+                         "binding): " + ", ".join(memos))
     if c["history"]:
         if doc_type == "caselaw":
             lines.append(f"Also reported: {c['history']}")
         return (f"No results matched '{params.query}'{scope}. "
                 f"Try broader or different keywords, or call canlex_list_acts to see "
                 f"what is currently loaded.")
+    blocks = []
+    weak = results[0].get("confidence")
+    if weak is not None and weak < HEDGE_THRESHOLD:
+        blocks += [WEAK_MATCH_NOTE, ""]
+    blocks += [GROUNDING_NOTE, "",
+               f'{len(results)} relevant section(s) for: "{params.query}"']
     for c in results:
         blocks.append("")
         blocks.append("---")

canlex/synonyms.py ADDED Viewed

	@@ -0,0 +1,62 @@

+"""Query-side expansion of legal abbreviations and informal terms.
+Statutes use formal wording -- "application for protection", "removal order" --
+but users (and a model drafting a search) reach for everyday shorthand: "PRRA",
+"H&C", "deportation". Before retrieval, expand_query() appends the canonical
+statutory terms for any abbreviation or nickname it recognises, so the BM25 and
+semantic stages can match the provision's actual language. It only ever ADDS
+words -- the user's own phrasing is left untouched -- and the cross-encoder
+reranker still sees the original query, so precision is unaffected.
+    python -m canlex.synonyms "PRRA eligibility and an H&C application"
+"""
+import re
+import sys
+# (trigger, canonical terms to append). The trigger is a regex fragment matched
+# case-insensitively as a whole word. Keep this list high-precision: an entry
+# earns its place only when the shorthand is unambiguous in Canadian border,
+# immigration, customs, financial-crime or labour law.
+_SYNONYMS = [
+    # Immigration and refugee law
+    (r"prra", "pre-removal risk assessment application for protection"),
+    (r"pre[- ]removal risk assessment", "application for protection"),
+    (r"h\s*&\s*c", "humanitarian and compassionate"),
+    (r"rad", "refugee appeal division"),
+    (r"rpd", "refugee protection division"),
+    (r"iad", "immigration appeal division"),
+    (r"irb", "immigration and refugee board"),
+    (r"trp", "temporary resident permit"),
+    (r"deportation", "removal order"),
+    (r"misrep", "misrepresentation"),
+    (r"ircc", "immigration refugees and citizenship canada"),
+    # Border and customs
+    (r"cbsa", "canada border services agency"),
+    (r"bsos?", "border services officer"),
+    (r"amps", "administrative monetary penalty system"),
+    # Financial-crime and labour
+    (r"fintrac", "financial transactions and reports analysis centre"),
+    (r"njc", "national joint council"),
+]
+# Compiled once at import: each trigger is wrapped in \b...\b so it matches
+# only as a whole word, and is matched case-insensitively.
+_COMPILED = [(re.compile(rf"\b{trigger}\b", re.IGNORECASE), expansion)
+             for trigger, expansion in _SYNONYMS]
+def expand_query(query):
+    """Return `query` with canonical statutory terms appended for every legal
+    abbreviation it contains; return it unchanged if it contains none."""
+    additions = [exp for pattern, exp in _COMPILED if pattern.search(query)]
+    if not additions:
+        return query
+    return f"{query} {' '.join(additions)}"
+def main():
+    query = " ".join(sys.argv[1:]) or "PRRA eligibility and an H&C application"
+    print(f"query:    {query}")
+    print(f"expanded: {expand_query(query)}")
+if __name__ == "__main__":
+    main()