# Spaces: Beemer0 / CanLex (Running) -- file size: 32,825 bytes

"""Hybrid retrieval (BM25 + semantic) with cross-encoder reranking."""
import json
import math
import os
import re
import sys
from collections import Counter, defaultdict

import snowballstemmer

from .config import PROCESSED_DIR
from .synonyms import expand_query

# --- Retrieval tuning constants --------------------------------------------
# The MN_*/..._PENALTY values can be overridden through the environment
# variables named in their os.environ.get() calls; the rest are fixed here.
K1 = 1.5            # BM25 term-frequency saturation (k1 in _bm25_scores)
B = 0.75            # BM25 document-length normalisation strength (b in _bm25_scores)
RRF_K = 60          # reciprocal-rank-fusion damping constant
W_SEM = 2.0         # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
CANDIDATES = 80     # hits each retriever contributes to the fusion
RERANK_POOL = 50    # top fused candidates the cross-encoder rescores
MN_WEIGHT = float(os.environ.get("CANLEX_MN_WEIGHT", "0.0024"))
                    # title-match boost per unit of idf-weighted overlap between
                    # the query and a candidate's marginal note (section title)
MN_CAP = float(os.environ.get("CANLEX_MN_CAP", "0.012"))
                    # ceiling on the title-match boost -- it nudges the ranking
                    # without overriding a strong base score
REG_PENALTY = float(os.environ.get("CANLEX_REG_PENALTY", "0.004"))
                    # small fusion penalty on regulation sections, so the Act
                    # that creates a duty outranks the regulation elaborating it
                    # (sweep-tuned 2026-05-23 from 0.008 -> 0.004; see sweep.log)
BACKMATTER_PENALTY = float(os.environ.get("CANLEX_BACKMATTER_PENALTY", "0.004"))
                    # likewise for a collective agreement's back-matter
                    # (memoranda, letters of understanding) vs its numbered articles
                    # (sweep-tuned 2026-05-23 from 0.008 -> 0.004)
SOURCE_CAP = 2      # max chunks one case or memorandum may contribute
APPENDIX_CAP = 3    # max referenced appendices co-surfaced into a result set

# Primary instruments -- enacted law, collective agreements, the NJC directives
# incorporated into them, and the IRPA delegation instruments. Their sections or
# items are distinct provisions, so (like legislation) they are never collapsed
# under the diversity cap. A chunk with no doc_type is treated as legislation
# by the code that consults this set.
PRIMARY_DOC_TYPES = frozenset({"legislation", "agreement", "directive", "delegation"})

# A token is a maximal run of lower-case ASCII letters and digits; tokenize()
# lower-cases the text before applying this, so case never matters.
_TOKEN = re.compile(r"[a-z0-9]+")
# An explicit section number in a query -- "s 34", "s. 20.1", "sec 34",
# "section 34", "sections 34" -- capturing the number. The pattern is
# case-sensitive; _section_refs() lower-cases the query first.
_SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
# A cross-reference to another provision -- "section 34", "subsection 25(1)",
# "paragraph 36(1)(a)", "s. 34" -- capturing the top-level section number.
_XREF = re.compile(
    r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)",
    re.IGNORECASE)

# A line opening with "(1)", "(a)" or "(b.1)" -- a citable subdivision
# (subsection, paragraph or subparagraph) of a provision. Multiline mode, so
# the marker must sit at the very start of a line.
_MARKER = re.compile(r"(?m)^\(([0-9a-zA-Z]+(?:\.\d+)?)\)")

# A D-memorandum's reference to a provision -- "section 32 of the Customs Act",
# or "section 32 of the Act" (the Act a D-memo administers -- the Customs Act).
# Captures (section number, instrument name); any "(1)(a)" suffix is skipped.
_MEMO_CITE = re.compile(
    r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
    r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)

# A directive section's reference to an appendix of the same directive --
# "as specified in Appendix C". A trailing "of" ("Appendix C of the NJC Travel
# Directive") marks a cross-directive citation and is deliberately left alone.
_APPENDIX_REF = re.compile(r"\bAppendi(?:x|ces)\s+([A-Za-z])\b(?!\s+of\b)",
                           re.IGNORECASE)
# A directive chunk that *is* an appendix: its title opens "Appendix C ...".
# Used with .match(), so the title must begin with the word "Appendix".
_APPENDIX_HEAD = re.compile(r"Appendix\s+([A-Za-z])\b", re.IGNORECASE)


# One shared English Snowball stemmer, plus a process-wide memo of
# word -> normalised stem filled in lazily by _stem().
_STEMMER = snowballstemmer.stemmer("english")
_STEM_CACHE = {}


# Stem pairs Snowball does not merge but that share a legal meaning, so a
# query naming the verb still matches a provision titled with the noun (or
# vice versa). Mapped to the verb form on both index and query sides, which
# is consistent and arbitrary -- the merge is what matters.
# Keys are Snowball *stems* (the stemmer's output), not surface words.
_STEM_NORMALIZE = {
    "seizur": "seiz",            # seizure -> seize
    "forfeitur": "forfeit",      # forfeiture -> forfeit
    "appel": "appeal",           # appellate/appellant -> appeal
    "detent": "detain",          # detention -> detain
    "exclus": "exclud",          # exclusion -> exclude
    "admiss": "admit",           # admission/admissibility -> admit
    "applic": "appli",           # application -> apply
    "complianc": "compli",       # compliance -> comply
    "grievanc": "griev",         # grievance -> grieve
}


def _stem(word):
    """Return the normalised Snowball stem of ``word``.

    The stem is looked up in ``_STEM_CACHE`` first; on a miss the word is
    stemmed, passed through ``_STEM_NORMALIZE`` (which folds a handful of
    verb/noun stems such as 'seizur' -> 'seiz' together) and remembered.
    """
    cached = _STEM_CACHE.get(word)
    if cached is not None:
        return cached
    raw = _STEMMER.stemWord(word)
    result = _STEM_NORMALIZE.get(raw, raw)
    _STEM_CACHE[word] = result
    return result


def tokenize(text):
    """Turn ``text`` into a list of normalised stems, in reading order.

    The text is lower-cased, cut into alphanumeric runs by ``_TOKEN`` and each
    run is stemmed via ``_stem``, so differing word forms -- 'possession' and
    'possess', 'reporting' and 'report' -- land on the same token.
    Duplicates are kept; callers that need term counts rely on them.
    """
    words = _TOKEN.findall(text.lower())
    return list(map(_stem, words))


def _section_refs(query):
    """Return the set of section numbers (as strings) a query names explicitly,
    e.g. {'34'} for 'section 34' or {'20.1'} for 's. 20.1'."""
    lowered = query.lower()
    return {hit.group(1) for hit in _SECTION_REF.finditer(lowered)}


def topical_title(chunk):
    """Pick the field of ``chunk`` that names its topic.

    This is the string treated as a section's 'title' for BM25 indexing and
    the title-match boost. Which field carries the topic depends on doc_type:

    * memorandum -- 'part' (the marginal_note is a generic banner);
    * caselaw    -- 'heading' (the marginal_note is only a paragraph range
      such as 'paras 11-13');
    * anything else -- 'marginal_note', the section heading.

    When the preferred field is missing or empty, 'marginal_note' is used.
    """
    preferred_field = {
        "memorandum": "part",
        "caselaw": "heading",
    }.get(chunk.get("doc_type"))
    if preferred_field is not None:
        preferred = chunk.get(preferred_field)
        if preferred:
            return preferred
    return chunk["marginal_note"]


def _provision_units(text):
    """Split a provision into its citable parts for pinpoint scoring.

    Returns a list of ``(citation_suffix, scoring_text, snippet)`` tuples:
    one per lettered paragraph (its scoring_text is the enclosing subsection's
    opening text followed by the paragraph), plus one per numbered subsection
    that has no paragraphs of its own. Subdivisions whose first 40 characters
    contain '[Repealed' are skipped. Returns [] when fewer than two
    subdivision markers are found -- the provision is too flat to pinpoint.
    """
    markers = list(_MARKER.finditer(text))
    if len(markers) < 2:
        return []

    # Each marker owns the text from itself up to the next marker (or the end).
    starts = [m.start() for m in markers]
    stops = starts[1:] + [len(text)]
    pieces = [(m.group(1), text[lo:hi].strip())
              for m, lo, hi in zip(markers, starts, stops)]

    found = []
    subsection = ""   # label of the enclosing subsection, e.g. "(2)"
    chapeau = ""      # that subsection's own text, prepended to its paragraphs
    for pos, (marker, segment) in enumerate(pieces):
        numbered = marker[0].isdigit()   # digit => subsection, letter => paragraph
        if "[Repealed" in segment[:40]:
            # A repealed subsection still resets the context for what follows.
            if numbered:
                subsection, chapeau = f"({marker})", ""
            continue
        if not numbered:
            found.append((f"{subsection}({marker})",
                          f"{chapeau} {segment}".strip(),
                          segment))
            continue
        subsection, chapeau = f"({marker})", segment
        following = pieces[pos + 1][0] if pos + 1 < len(pieces) else ""
        # Emit the subsection itself only when no paragraph follows it;
        # otherwise its text reaches the output through those paragraphs.
        if not following or following[0].isdigit():
            found.append((subsection, segment, segment))
    return found


class LegislationIndex:
    def __init__(self):
        """Load every processed chunk file and build all retrieval structures.

        Reads each ``*.json`` file in PROCESSED_DIR (in sorted path order) and
        concatenates their contents into ``self.chunks``, then builds the BM25
        index, title tokens, cross-reference tables and appendix index, and
        finally tries to load the semantic embeddings and the reranker.

        Raises:
            RuntimeError: if no chunks were found at all.
        """
        sources = sorted(PROCESSED_DIR.glob("*.json"))
        self.chunks = [
            record
            for source in sources
            for record in json.loads(source.read_text(encoding="utf-8"))
        ]
        if not self.chunks:
            raise RuntimeError(
                f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
        # Order matters only in that every step needs self.chunks; the two
        # loaders come last and degrade gracefully on failure.
        for prepare in (self._build_bm25,
                        self._build_note_tokens,
                        self._build_xref,
                        self._build_appendix_index,
                        self._load_semantic,
                        self._load_reranker):
            prepare()

    def _build_bm25(self):
        """Build the BM25 inverted index over ``self.chunks``.

        Sets ``self.doc_len`` (token count per chunk), ``self.postings``
        (term -> [(chunk index, term frequency), ...]), ``self.avgdl`` (mean
        document length) and ``self.idf`` (term -> inverse document frequency).
        """
        self.doc_len = []
        self.postings = defaultdict(list)
        doc_freq = Counter()
        for doc_idx, chunk in enumerate(self.chunks):
            # The topical title appears twice so it outweighs body text. The
            # Act name, code and section number are indexed as well, so an
            # Act's own vocabulary and identifiers are searchable even when a
            # section's text omits them. topical_title() is doc_type-aware.
            title = topical_title(chunk)
            fields = [title, title]
            fields.extend(chunk[key] for key in (
                "heading", "part", "division", "act_name", "act_code",
                "section", "text"))
            term_counts = Counter(tokenize(" ".join(fields)))
            self.doc_len.append(sum(term_counts.values()))
            for term, freq in term_counts.items():
                self.postings[term].append((doc_idx, freq))
            doc_freq.update(term_counts.keys())   # one document per distinct term
        n = len(self.chunks)
        self.avgdl = sum(self.doc_len) / n
        self.idf = {}
        for term, d in doc_freq.items():
            self.idf[term] = math.log(1 + (n - d + 0.5) / (d + 0.5))

    def _build_note_tokens(self):
        """Pre-tokenise each chunk's topical title (see topical_title) for the
        title-match boost in search(). Each chunk is also flagged as a
        regulation (act codes beginning SOR/C.R.C.) for the Act-over-regulation
        preference, and as collective-agreement back-matter (memoranda and
        letters with no article number) for the back-matter penalty."""
        self._note_tokens = []
        self._is_regulation = []
        self._is_backmatter = []
        for c in self.chunks:
            self._note_tokens.append(set(tokenize(topical_title(c))))
            self._is_regulation.append(
                c.get("doc_type", "legislation") == "legislation"
                and c["act_code"].startswith(("SOR", "C.R.C")))
            self._is_backmatter.append(
                c.get("doc_type") == "agreement"
                and not str(c["section"])[:1].isdigit())

    def _build_appendix_index(self):
        """Index directive appendices by (act_code, letter), so a directive
        section that cites 'Appendix C' can pull that appendix into the result
        set -- a directive's rate-table appendices are bare numbers and rank
        poorly on a natural-language query, yet the section citing them is of
        little use without them."""
        self._appendix = defaultdict(list)
        for idx, c in enumerate(self.chunks):
            if c.get("doc_type") != "directive":
                continue
            m = _APPENDIX_HEAD.match(c["marginal_note"])
            if m:
                self._appendix[(c["act_code"], m.group(1).upper())].append(idx)

    def _load_semantic(self):
        """Load precomputed embeddings and the query embedder.

        Reads the ``ids`` and ``vectors`` arrays from ``embeddings.npz`` in
        PROCESSED_DIR and lines up one vector per chunk, in ``self.chunks``
        order. A chunk with no stored embedding gets an all-zero row and is
        counted in a stderr warning. On success this sets ``self.vectors``,
        ``self.embedder``, ``self._np`` and ``self.semantic = True``.

        Any failure (missing embeddings, or numpy/model unavailable) degrades the
        index to BM25-only rather than breaking retrieval: ``self.semantic``
        stays False and the reason is reported on stderr.
        """
        self.semantic = False
        emb_path = PROCESSED_DIR / "embeddings.npz"
        if not emb_path.exists():
            print("CanLex index: no embeddings.npz; using BM25 only "
                  "(run 'canlex.embed' to enable semantic search).", file=sys.stderr)
            return
        try:
            import numpy as np
            from .embed import Embedder
            with np.load(emb_path) as data:
                # An .npz archive re-reads a member from disk on every key
                # access, so pull each array out exactly once.
                ids = data["ids"].tolist()
                vectors = data["vectors"]
            dim = int(vectors.shape[1])
            id_to_vec = dict(zip(ids, vectors))
            missing = 0
            rows = []
            for c in self.chunks:
                vec = id_to_vec.get(c["id"])
                if vec is None:
                    # Placeholder row keeps self.vectors aligned with self.chunks.
                    missing += 1
                    rows.append(np.zeros(dim, dtype=np.float32))
                else:
                    rows.append(vec)
            self._np = np
            self.vectors = np.vstack(rows)
            self.embedder = Embedder()
            self.semantic = True
            if missing:
                print(f"CanLex index: {missing}/{len(self.chunks)} sections lack "
                      f"embeddings; re-run 'canlex.embed' to refresh.", file=sys.stderr)
        except Exception as exc:
            # Deliberately broad: semantic search is optional, so any load
            # problem is reported and retrieval carries on with BM25 alone.
            print(f"CanLex index: semantic search disabled ({type(exc).__name__}: "
                  f"{exc}); using BM25 only.", file=sys.stderr)
            self.semantic = False

    def _load_reranker(self):
        """Load the cross-encoder reranker; degrade to the fusion order on failure."""
        self.reranker = None
        try:
            from .rerank import Reranker
            self.reranker = Reranker()
        except Exception as exc:
            print(f"CanLex index: reranker disabled ({type(exc).__name__}: {exc}); "
                  f"using hybrid fusion order.", file=sys.stderr)

    def _bm25_scores(self, query):
        """Score chunks against ``query`` with BM25.

        Returns a mapping of chunk index -> score containing only chunks that
        share at least one indexed term with the query. Each distinct query
        term counts once, however often the query repeats it.
        """
        totals = defaultdict(float)
        mean_len = self.avgdl
        for term in set(tokenize(query)):
            weight = self.idf.get(term)
            if weight is None:      # term never seen at index time
                continue
            for doc, freq in self.postings[term]:
                length_norm = K1 * (1 - B + B * self.doc_len[doc] / mean_len)
                totals[doc] += weight * freq * (K1 + 1) / (freq + length_norm)
        return totals

    def _semantic_ranking(self, query):
        """Rank chunks by embedding similarity to ``query``.

        Returns ``(indices, best)``: the CANDIDATES chunk indices with the
        highest dot product against the query vector, best first, and the
        single highest similarity as a float. That top value doubles as a
        corpus-coverage signal -- a query the corpus cannot answer has no
        passage close to it. (It is a cosine only if the stored and query
        vectors are unit-normalised -- TODO confirm against the embedder.)
        """
        query_vec = self.embedder.encode_query(query)
        similarity = self.vectors @ query_vec
        descending = self._np.argsort(similarity)[::-1]
        best_first = [int(i) for i in descending[:CANDIDATES]]
        return best_first, float(similarity.max())

    def _rerank_doc(self, idx):
        c = self.chunks[idx]
        return f"{c['citation']} — {c['marginal_note']}\n{c['text']}"

    def _source_key(self, idx):
        """Identify the parent document of chunk ``idx`` for diversity capping.

        Returns None for every doc_type in PRIMARY_DOC_TYPES (legislation,
        agreements, directives, delegation instruments; a chunk without a
        doc_type counts as legislation) -- their sections are distinct
        provisions and are never capped. Otherwise returns a
        ``(doc_type, identifier)`` tuple: memoranda are identified by their
        'section' (the memo number, since act_code is a shared constant),
        everything else by 'act_code' (one decision, keyed by citation).
        """
        chunk = self.chunks[idx]
        kind = chunk.get("doc_type", "legislation")
        if kind in PRIMARY_DOC_TYPES:
            return None
        identifier_field = "section" if kind == "memorandum" else "act_code"
        return (kind, chunk[identifier_field])

    def _diversify(self, ordered):
        """Reorder so no single case, memorandum, agreement or directive can
        monopolise the results: once a source has contributed SOURCE_CAP chunks,
        its remaining chunks are deferred below every other candidate. This stops
        a heavily paragraph-chunked decision from crowding out the statute it
        interprets. Legislation is never capped."""
        kept, deferred, counts = [], [], defaultdict(int)
        for idx in ordered:
            key = self._source_key(idx)
            if key is None:
                kept.append(idx)
                continue
            counts[key] += 1
            (kept if counts[key] <= SOURCE_CAP else deferred).append(idx)
        return kept + deferred

    def _ensure_primary(self, ordered, top_k, q_tokens):
        """Guarantee the governing primary instrument is surfaced: when the
        natural top_k is monopolised by case law or D-memoranda that interpret
        a statute, pull the most topically-on-target Act/agreement/directive/
        delegation section into the top_k, displacing the lowest-ranked
        secondary sources. The single best match is always kept in place.

        Two changes from the older 'ensure_legislation' guarantee: (i) all
        primary instruments count, not only legislation -- so an FB-Agreement
        query that surfaces only FPSLREB case law gets the agreement article
        pulled in too; (ii) the candidate to promote is chosen by title-match
        against the query (the section whose marginal_note covers the most of
        the query's distinctive vocabulary), not by raw fusion rank. The
        fusion rank surfaces tangentially-on-topic sections that share the
        Act's general vocabulary; the title-match scorer surfaces the section
        actually about the topic ('Seizure and forfeiture' over 'Report to
        President' for a 'seize currency' query)."""
        if top_k < 3:
            return ordered
        def is_primary(i):
            return self.chunks[i].get("doc_type", "legislation") in PRIMARY_DOC_TYPES
        top, rest = ordered[:top_k], ordered[top_k:]
        need = 2 - sum(1 for i in top if is_primary(i))
        if need <= 0:
            return ordered
        primary_in_rest = [i for i in rest if is_primary(i)]
        if not primary_in_rest:
            return ordered
        if q_tokens:
            def title_score(idx):
                note_tokens = self._note_tokens[idx]
                if not note_tokens:
                    return 0.0
                matched = sum(self.idf.get(t, 0.0)
                              for t in note_tokens if t in q_tokens)
                total = sum(self.idf.get(t, 0.0) for t in note_tokens) or 1.0
                score = matched * matched / total
                # Mirror the fusion-stage hierarchy preferences for tiebreaks:
                # the governing Act beats its regulation, and numbered
                # agreement articles beat their back-matter, when both have
                # identical titles (e.g. IRPA s. 112 and IRPR s. 160 both
                # marginal-noted 'Application for protection').
                if self._is_regulation[idx]:
                    score -= REG_PENALTY
                if self._is_backmatter[idx]:
                    score -= BACKMATTER_PENALTY
                return score
            # Sort by title-match descending, then by original fusion order as
            # a tiebreak (stable sort: keep the original rest order).
            primary_in_rest.sort(key=title_score, reverse=True)
        promote = primary_in_rest[:need]
        drop = [i for i in reversed(top) if not is_primary(i)][:len(promote)]
        if not drop:
            return ordered
        promote = promote[:len(drop)]
        dropped, promoted = set(drop), set(promote)
        kept = [i for i in top if i not in dropped]
        return kept[:1] + promote + kept[1:] + drop + [
            i for i in rest if i not in promoted]

    def _cosurface_appendices(self, top):
        """Extend ``top`` with appendices its directive results cite.

        For every directive chunk in ``top``, each same-directive appendix
        reference in its text ('Appendix C', but not 'Appendix C of ...') is
        resolved through ``self._appendix``. Appendices not already in ``top``
        receive one vote per citing result, and the APPENDIX_CAP most-cited
        ones are appended -- so a lone off-topic result cannot outvote the
        relevant ones. A rate-table appendix ranks poorly on a
        natural-language query, yet the section citing it is of little use
        without it.

        Returns a new list: ``top`` followed by at most APPENDIX_CAP indices.
        """
        already = set(top)
        votes = Counter()
        directive_hits = (self.chunks[i] for i in top
                          if self.chunks[i].get("doc_type") == "directive")
        for chunk in directive_hits:
            counted = set()     # each appendix gets one vote per citing result
            letters = (ref.group(1).upper()
                       for ref in _APPENDIX_REF.finditer(chunk["text"]))
            for letter in letters:
                for app_idx in self._appendix.get((chunk["act_code"], letter), ()):
                    if app_idx in already or app_idx in counted:
                        continue
                    counted.add(app_idx)
                    votes[app_idx] += 1
        extras = [app_idx for app_idx, _ in votes.most_common(APPENDIX_CAP)]
        return top + extras

    def _highlight(self, query, indices):
        """For each result chunk, the subsection or paragraph most on point for
        the query: {result_position: (citation_suffix, snippet)}. Uses the
        cross-encoder; returns {} if it is unavailable or nothing is structured.
        Only the first results are scored -- a pinpoint deep in the list is not
        worth the cross-encoder cost."""
        if not self.reranker:
            return {}
        jobs = []   # (result_position, label, scoring_text, snippet)
        for pos, idx in enumerate(indices[:8]):
            c = self.chunks[idx]
            if c.get("doc_type", "legislation") != "legislation":
                continue
            note = c["marginal_note"]
            for label, scoring, snippet in _provision_units(c["text"]):
                jobs.append((pos, label, f"{note}. {scoring}", snippet))
        if not jobs:
            return {}
        best = {}   # result_position -> (score, label, snippet)
        for (pos, label, _, snippet), score in zip(
                jobs, self.reranker.score(query, [j[2] for j in jobs])):
            if pos not in best or score > best[pos][0]:
                best[pos] = (score, label, snippet)
        return {pos: (label, " ".join(snippet[:240].split()))
                for pos, (score, label, snippet) in best.items()}

    def search(self, query, top_k=6, act=None, doc_type=None):
        """Hybrid candidate fusion (BM25 + semantic), then cross-encoder rerank.

        Stages, in order: abbreviation expansion; reciprocal-rank fusion of
        the BM25 and (if loaded) semantic rankings; recall of explicitly
        cited sections; title-match boost; hierarchy penalties; act/doc_type
        filtering; promote-only reranking; pinning of cited sections;
        per-source diversity cap; primary-instrument guarantee; appendix
        co-surfacing; pinpoint highlights.

        Args:
            query: the user's question, as typed.
            top_k: number of ranked results wanted. Co-surfaced appendices
                are appended on top of this, so the returned list can be
                longer by up to APPENDIX_CAP.
            act: optional filter; keeps only chunks whose act_short or
                act_code equals it, compared case-insensitively.
            doc_type: optional filter; keeps only chunks of that doc_type
                (a chunk without one counts as 'legislation').

        Returns:
            A list of dicts, best first. Each is a copy of the chunk plus
            'score' (the fused score rounded to 4 places; 0.0 for a
            co-surfaced appendix that was never a candidate), 'confidence'
            (the top semantic similarity, or None when semantic search is
            off) and, where one was found, 'highlight' as
            (citation_suffix, snippet). Returns [] when no candidate passes
            the filters.
        """
        # Expand legal abbreviations (PRRA, H&C, ...) into statutory wording for
        # the recall stages; the reranker still sees the user's original query.
        expanded = expand_query(query)
        confidence = None
        fused = defaultdict(float)
        bm25 = self._bm25_scores(expanded)
        for rank, idx in enumerate(sorted(bm25, key=bm25.get, reverse=True)[:CANDIDATES]):
            fused[idx] += 1.0 / (RRF_K + rank)
        if self.semantic:
            sem_order, confidence = self._semantic_ranking(expanded)
            for rank, idx in enumerate(sem_order):
                fused[idx] += W_SEM / (RRF_K + rank)

        # Ensure explicitly-referenced sections are retrieved even if recall
        # missed them -- but only for Acts the query actually names. A query
        # like "IRPA s. 40 misrepresentation defence" uses the section number
        # topically; pulling every Act's s. 40 into the pool would drown out
        # the case law that interprets the section the user meant. Substring
        # check rather than token-overlap because act_codes split into trivial
        # tokens ("A-8.8" -> {a, 8}) that spuriously match common query words.
        refs = _section_refs(query)
        q_lc = query.lower()
        def _act_in_query(c):
            short = c["act_short"].lower()
            code = c["act_code"].lower()
            return ((short and short in q_lc)
                    or (code and len(code) >= 3 and code in q_lc))
        if refs:
            for idx, c in enumerate(self.chunks):
                if (c["section"] in refs and idx not in fused
                        and _act_in_query(c)):
                    # Enters the pool with no retrieval score of its own; the
                    # pinning step further down is what lifts it to the top.
                    fused[idx] = 0.0

        # Title-match boost: the marginal note is a section's canonical subject.
        # Reward a candidate by how completely and how specifically the query
        # matches its marginal note. The overlap is idf-weighted (matching a
        # distinctive title like "hours of work" counts far more than a generic
        # one like "Decision"), scaled by coverage, and capped -- so it nudges
        # ranking toward the provision a question names by topic without
        # overriding a strong base score.
        q_tokens = set(tokenize(expanded))
        for idx in list(fused):
            note_tokens = self._note_tokens[idx]
            total = sum(self.idf.get(t, 0.0) for t in note_tokens)
            if total <= 0:
                continue
            matched = sum(self.idf.get(t, 0.0)
                          for t in note_tokens if t in q_tokens)
            if matched > 0:
                fused[idx] += min(MN_WEIGHT * matched * matched / total, MN_CAP)

        # Hierarchy penalties: a topical question should surface the governing
        # provision, not the supplementary material around it. An Act creates a
        # duty while a regulation only elaborates procedure; a collective
        # agreement's numbered articles are its substance while its memoranda
        # and letters of understanding are back-matter. Both take a small
        # fusion penalty so the governing provision wins a close contest.
        for idx in list(fused):
            if self._is_regulation[idx]:
                fused[idx] -= REG_PENALTY
            elif self._is_backmatter[idx]:
                fused[idx] -= BACKMATTER_PENALTY

        def allowed(idx):
            c = self.chunks[idx]
            if act and act.lower() not in (c["act_short"].lower(), c["act_code"].lower()):
                return False
            if doc_type and c.get("doc_type", "legislation") != doc_type:
                return False
            return True

        candidates = [i for i in sorted(fused, key=fused.get, reverse=True) if allowed(i)]
        if not candidates:
            return []
        # Snapshot the fused scores now: the reported 'score' is the fusion
        # score, unaffected by the reordering stages below.
        scores = {i: fused[i] for i in candidates}

        # Precision stage: the cross-encoder rescores the top candidate pool, but
        # may only PROMOTE -- each pooled candidate is placed at the better of its
        # fusion rank and its rerank rank, never below its fusion rank. The
        # reranker reliably surfaces a strong answer the fusion ranked low, yet is
        # unreliable on long statutory text (it can score the right section
        # negative), so its power to demote a candidate is deliberately removed.
        if self.reranker:
            pool = candidates[:RERANK_POOL]
            ce = dict(zip(pool, self.reranker.score(
                query, [self._rerank_doc(i) for i in pool])))
            fusion_rank = {idx: r for r, idx in enumerate(pool)}
            rerank_rank = {idx: r for r, idx in enumerate(
                sorted(pool, key=ce.get, reverse=True))}
            # Ties on the better rank fall back to fusion order.
            pool.sort(key=lambda i: (min(fusion_rank[i], rerank_rank[i]),
                                     fusion_rank[i]))
            candidates = pool + candidates[RERANK_POOL:]

        # Explicit section references are pinned to the very top -- using the
        # same Act-mentioned constraint as the recall step above, for the same
        # reason: a bare "s. 40" without an Act name is usually topical
        # (e.g. "the IRPA s. 40 misrepresentation defence"), not a lookup.
        if refs:
            pinned = [i for i in candidates
                      if self.chunks[i]["section"] in refs
                      and _act_in_query(self.chunks[i])]
            if pinned:
                pinned_set = set(pinned)
                candidates = pinned + [i for i in candidates if i not in pinned_set]

        # Cap one-source monopolies, then guarantee a primary instrument on
        # the topic is represented. The guarantee operates on a fixed visible
        # window of min(top_k, 5), not the full top_k -- with top_k=20 (the
        # eval default) the larger window almost always contains incidental
        # legislation, so the guarantee never fires even when the governing
        # provision is buried at rank 10+.
        candidates = self._diversify(candidates)
        candidates = self._ensure_primary(candidates, min(top_k, 5), q_tokens)

        top = self._cosurface_appendices(candidates[:top_k])
        # Highlights are keyed by position in `top`, not by chunk index.
        highlights = self._highlight(query, top)
        results = []
        for pos, i in enumerate(top):
            result = {**self.chunks[i], "score": round(scores.get(i, 0.0), 4),
                      "confidence": confidence}
            if pos in highlights:
                result["highlight"] = highlights[pos]
            results.append(result)
        return results

    def get_section(self, act, section):
        act = act.lower()
        for c in self.chunks:
            if c["section"] == section and act in (c["act_short"].lower(), c["act_code"].lower()):
                return c
        return None

    def _build_xref(self):
        """Index legislation by (act, section); find each Act's definitions
        section; link every regulation to its enabling Act and every
        D-memorandum to the provisions it cites -- all for cross-referencing."""
        self._by_section = {}
        self._defs_section = {}
        acts, regs = {}, {}        # act_code -> (act_short, act_name)
        for c in self.chunks:
            if c.get("doc_type", "legislation") != "legislation":
                continue
            self._by_section[(c["act_code"], c["section"])] = c
            if c["act_code"] not in self._defs_section and (
                    c["marginal_note"].strip().lower() in (
                        "definitions", "definition", "interpretation")):
                self._defs_section[c["act_code"]] = c
            bucket = regs if c["act_code"].startswith(("SOR", "C.R.C")) else acts
            bucket.setdefault(c["act_code"], (c["act_short"], c["act_name"]))

        # Link a regulation to the Act it is made under by matching their names
        # ("X Regulations" <-> "X Act").
        self._enabling_act = {}                 # reg code -> (act_short, act_name)
        self._regulations = defaultdict(list)   # act code -> [(reg_short, reg_name)]
        def base(name):
            return re.sub(r"\b(?:Act|Regulations)\b", "", name).strip().lower()
        act_by_base = {base(n): (code, s, n) for code, (s, n) in acts.items()}
        for rcode, (rshort, rname) in regs.items():
            hit = act_by_base.get(base(rname))
            if hit:
                self._enabling_act[rcode] = (hit[1], hit[2])
                self._regulations[hit[0]].append((rshort, rname))

        # Link D-memoranda to the Customs Act / Customs Tariff provisions they
        # cite; an unqualified "the Act" in a D-memo means the Customs Act.
        by_short = {s.lower(): code for code, (s, n) in acts.items()}
        customs, tariff = by_short.get("customs act"), by_short.get("customs tariff")
        self._memos_for_section = defaultdict(set)   # (act_code, section) -> memos
        for c in self.chunks:
            if c.get("doc_type") != "memorandum":
                continue
            for num, which in _MEMO_CITE.findall(c["text"]):
                code = tariff if which.lower() == "customs tariff" else customs
                if code:
                    self._memos_for_section[(code, num)].add(c["section"])

    def related(self, chunk):
        """Cross-references for a legislation result, as a dict: 'provisions'
        (intra-Act sections it cites, plus the definitions section),
        'regulations' (made under this Act), 'enabling_act' (for a regulation,
        the Act it is made under) and 'memoranda' (D-memo numbers citing this
        section). Empty dict for case law, memoranda, etc."""
        if chunk.get("doc_type", "legislation") != "legislation":
            return {}
        act = chunk["act_code"]
        provisions, seen = [], {chunk["section"]}
        defs = self._defs_section.get(act)
        if defs and defs["section"] not in seen:
            provisions.append((defs["section"], defs["marginal_note"]))
            seen.add(defs["section"])
        for match in _XREF.finditer(chunk["text"]):
            sec = match.group(1)
            if sec in seen:
                continue
            target = self._by_section.get((act, sec))
            if target:
                provisions.append((sec, target["marginal_note"]))
                seen.add(sec)
            if len(provisions) >= 8:
                break
        return {
            "provisions": provisions,
            "regulations": self._regulations.get(act, []),
            "enabling_act": self._enabling_act.get(act),
            "memoranda": sorted(self._memos_for_section.get(
                (act, chunk["section"]), []))[:6],
        }


def main():
    """Command-line entry point: run one query and print the ranked results.

    All command-line arguments are joined into a single query. With no
    arguments a usage line is printed and nothing else happens. Each result
    is printed as its score, citation and marginal note, followed by the
    first 160 characters of its text with newlines flattened to spaces.
    """
    words = sys.argv[1:]
    if not words:
        print('usage: python -m canlex.index "your query"')
        return
    query = " ".join(words)
    index = LegislationIndex()

    # Describe the strongest retrieval mode that actually loaded.
    mode = "BM25 only"
    if index.semantic:
        mode = "hybrid (BM25 + semantic)"
    if index.reranker:
        mode = "hybrid + cross-encoder rerank"
    print(f"{len(index.chunks)} sections indexed - {mode}. Query: {query!r}\n")

    for hit in index.search(query):
        flattened = hit['text'].replace(chr(10), ' ')
        print(f"[{hit['score']:.3f}] {hit['citation']} - {hit['marginal_note']}")
        print(f"          {flattened[:160]}")
    print()


if __name__ == "__main__":
    main()