"""Hybrid retrieval (BM25 + semantic) with cross-encoder reranking.""" import json import math import os import re import sys from collections import Counter, defaultdict import snowballstemmer from .config import PROCESSED_DIR from .synonyms import expand_query K1 = 1.5 B = 0.75 RRF_K = 60 # reciprocal-rank-fusion damping constant W_SEM = 2.0 # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned) CANDIDATES = 80 # hits each retriever contributes to the fusion RERANK_POOL = 50 # top fused candidates the cross-encoder rescores MN_WEIGHT = float(os.environ.get("CANLEX_MN_WEIGHT", "0.0024")) # title-match boost per unit of idf-weighted overlap between # the query and a candidate's marginal note (section title) MN_CAP = float(os.environ.get("CANLEX_MN_CAP", "0.012")) # ceiling on the title-match boost -- it nudges the ranking # without overriding a strong base score REG_PENALTY = float(os.environ.get("CANLEX_REG_PENALTY", "0.004")) # small fusion penalty on regulation sections, so the Act # that creates a duty outranks the regulation elaborating it # (sweep-tuned 2026-05-23 from 0.008 -> 0.004; see sweep.log) BACKMATTER_PENALTY = float(os.environ.get("CANLEX_BACKMATTER_PENALTY", "0.004")) # likewise for a collective agreement's back-matter # (memoranda, letters of understanding) vs its numbered articles # (sweep-tuned 2026-05-23 from 0.008 -> 0.004) SOURCE_CAP = 2 # max chunks one case or memorandum may contribute APPENDIX_CAP = 3 # max referenced appendices co-surfaced into a result set # Primary instruments -- enacted law, collective agreements, the NJC directives # incorporated into them, and the IRPA delegation instruments. Their sections or # items are distinct provisions, so (like legislation) they are never collapsed # under the diversity cap. 
PRIMARY_DOC_TYPES = frozenset({"legislation", "agreement", "directive", "delegation"})

# A token: a run of lower-case ASCII letters and digits (tokenize() lower-cases
# the text before applying it).
_TOKEN = re.compile(r"[a-z0-9]+")
# An explicit section number in a query -- "section 34", "s. 20.1" -- applied
# to the lower-cased query by _section_refs().
_SECTION_REF = re.compile(r"\bs(?:ec(?:tion)?)?s?\.?\s*(\d+(?:\.\d+)?)")
# A cross-reference to another provision -- "section 34", "subsection 25(1)",
# "paragraph 36(1)(a)", "s. 34" -- capturing the top-level section number.
_XREF = re.compile(
    r"\b(?:sections?|subsections?|paragraphs?|ss?\.)\s*(\d+(?:\.\d+)?)", re.IGNORECASE)
# A line opening with "(1)", "(a)" or "(b.1)" -- a citable subdivision
# (subsection, paragraph or subparagraph) of a provision.
_MARKER = re.compile(r"(?m)^\(([0-9a-zA-Z]+(?:\.\d+)?)\)")
# A D-memorandum's reference to a provision -- "section 32 of the Customs Act",
# or "section 32 of the Act" (the Act a D-memo administers -- the Customs Act).
_MEMO_CITE = re.compile(
    r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
    r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)
# A directive section's reference to an appendix of the same directive --
# "as specified in Appendix C". A trailing "of" ("Appendix C of the NJC Travel
# Directive") marks a cross-directive citation and is deliberately left alone.
_APPENDIX_REF = re.compile(r"\bAppendi(?:x|ces)\s+([A-Za-z])\b(?!\s+of\b)", re.IGNORECASE)
# A directive chunk that *is* an appendix: its title opens "Appendix C ...".
_APPENDIX_HEAD = re.compile(r"Appendix\s+([A-Za-z])\b", re.IGNORECASE)

_STEMMER = snowballstemmer.stemmer("english")
_STEM_CACHE = {}  # word -> normalized stem, filled lazily by _stem()

# Stem pairs Snowball does not merge but that share a legal meaning, so a
# query naming the verb still matches a provision titled with the noun (or
# vice versa). Mapped to the verb form on both index and query sides, which
# is consistent and arbitrary -- the merge is what matters.
_STEM_NORMALIZE = {
    "seizur": "seiz",        # seizure -> seize
    "forfeitur": "forfeit",  # forfeiture -> forfeit
    "appel": "appeal",       # appellate/appellant -> appeal
    "detent": "detain",      # detention -> detain
    "exclus": "exclud",      # exclusion -> exclude
    "admiss": "admit",       # admission/admissibility -> admit
    "applic": "appli",       # application -> apply
    "complianc": "compli",   # compliance -> comply
    "grievanc": "griev",     # grievance -> grieve
}


def _stem(word):
    """Snowball-stem a word, memoised -- legal text repeats terms heavily.

    A small post-stem normalization merges a few verb/noun pairs Snowball
    leaves apart ('seize'/'seizure', 'forfeit'/'forfeiture')."""
    stemmed = _STEM_CACHE.get(word)
    if stemmed is None:
        stemmed = _STEMMER.stemWord(word)
        stemmed = _STEM_NORMALIZE.get(stemmed, stemmed)
        _STEM_CACHE[word] = stemmed
    return stemmed


def tokenize(text):
    """Lower-case, split on word characters, and Snowball-stem each token, so
    a query matches a provision even when their word forms differ --
    'possession' vs 'possess', 'reporting' vs 'report', 'importation' vs
    'import'."""
    return [_stem(w) for w in _TOKEN.findall(text.lower())]


def _section_refs(query):
    """Pull explicit section numbers from a query, e.g. 'section 34', 's. 20.1'.

    Returns them as a set of strings."""
    return set(_SECTION_REF.findall(query.lower()))


def topical_title(chunk):
    """Return the chunk's topic-bearing string, used wherever a section's
    'title' is weighted for retrieval -- BM25 indexing, the title-match boost,
    and the semantic embedding.

    Differs by doc_type because the field that carries the topic differs:
    legislation/agreement/directive/delegation use the marginal_note (section
    heading); D-memoranda use 'part' because their marginal_note is a generic
    banner; case-law uses 'heading' because its marginal_note is just the
    paragraph range ('paras 11-13') and the case proposition lives in heading.

    A missing or empty 'part'/'heading' falls back to the marginal_note."""
    doc_type = chunk.get("doc_type")
    if doc_type == "memorandum":
        return chunk.get("part") or chunk["marginal_note"]
    if doc_type == "caselaw":
        return chunk.get("heading") or chunk["marginal_note"]
    return chunk["marginal_note"]


def _provision_units(text):
    """Citable parts of a provision, for pinpoint scoring -- a list of
    (citation_suffix, scoring_text, snippet).

    One entry per paragraph, with its subsection chapeau prepended to
    scoring_text for context, plus one per paragraph-less subsection.
    Subdivisions whose opening text contains '[Repealed' are skipped.
    Returns [] when the provision is too flat to pinpoint (fewer than two
    subdivisions)."""
    marks = list(_MARKER.finditer(text))
    if len(marks) < 2:
        return []
    # Each span runs from its marker to the next marker (or the end of text).
    spans = []
    for i, m in enumerate(marks):
        end = marks[i + 1].start() if i + 1 < len(marks) else len(text)
        spans.append((m.group(1), text[m.start():end].strip()))
    units, cur_sub, cur_intro = [], "", ""
    for j, (token, body) in enumerate(spans):
        if "[Repealed" in body[:40]:
            # A repealed subsection still resets the context, so following
            # paragraphs are not attributed to the previous subsection.
            if token[0].isdigit():
                cur_sub, cur_intro = f"({token})", ""
            continue
        if token[0].isdigit():
            # A marker starting with a digit opens a new subsection.
            cur_sub, cur_intro = f"({token})", body
            nxt = spans[j + 1][0] if j + 1 < len(spans) else ""
            if not nxt or nxt[0].isdigit():
                units.append((cur_sub, body, body))  # subsection has no paragraphs
            # otherwise the chapeau is emitted via its paragraphs below
        else:
            label = f"{cur_sub}({token})" if cur_sub else f"({token})"
            units.append((label, f"{cur_intro} {body}".strip(), body))
    return units


class LegislationIndex:
    """In-memory retrieval index over the processed chunks.

    Construction loads every ``*.json`` file in PROCESSED_DIR, builds the BM25
    postings, title tokens, cross-reference tables and the appendix lookup,
    then tries to load the embeddings and the cross-encoder reranker (both
    optional -- see _load_semantic and _load_reranker)."""

    def __init__(self):
        """Load the processed chunks and build every lookup structure.

        Raises RuntimeError when PROCESSED_DIR holds no chunks."""
        self.chunks = []
        for path in sorted(PROCESSED_DIR.glob("*.json")):
            self.chunks.extend(json.loads(path.read_text(encoding="utf-8")))
        if not self.chunks:
            raise RuntimeError(
                f"No processed legislation in {PROCESSED_DIR}. Run 'canlex.ingest' first.")
        self._build_bm25()
        self._build_note_tokens()
        self._build_xref()
        self._build_appendix_index()
        self._load_semantic()
        self._load_reranker()

    def _build_bm25(self):
        """Build the BM25 structures: per-document length (doc_len), postings
        (term -> [(doc_idx, tf)]), the average document length (avgdl) and
        the idf table."""
        self.doc_len = []
        self.postings = defaultdict(list)  # term -> [(doc_idx, term_frequency), ...]
        df = defaultdict(int)
        for idx, c in enumerate(self.chunks):
            # The topical title is repeated to weight it above body text;
            # the Act name, code and section are indexed too, so an Act's own
            # terminology (e.g. "controlled substance") and its codes/numbers
            # are searchable even when a section's text omits them. The title
            # is doc_type-aware via topical_title -- for case-law it picks
            # the case proposition (heading), not the paragraph range
            # (marginal_note), so a leading case surfaces on a topical query.
            title = topical_title(c)
            # 'heading' and 'part' are optional metadata (topical_title reads
            # them with .get), so a missing or null value indexes as empty
            # text instead of aborting index construction.
            blob = " ".join((title, title,
                             c.get("heading") or "", c.get("part") or "",
                             c["division"], c["act_name"], c["act_code"],
                             c["section"], c["text"]))
            counts = Counter(tokenize(blob))
            self.doc_len.append(sum(counts.values()))
            for term, tf in counts.items():
                self.postings[term].append((idx, tf))
                df[term] += 1
        n = len(self.chunks)  # non-zero: __init__ raises on an empty corpus
        self.avgdl = sum(self.doc_len) / n
        self.idf = {t: math.log(1 + (n - d + 0.5) / (d + 0.5)) for t, d in df.items()}

    def _build_note_tokens(self):
        """Pre-tokenise each chunk's topical title (see topical_title) for the
        title-match boost in search().

        Each chunk is also flagged as a regulation (act codes beginning
        SOR/C.R.C.) for the Act-over-regulation preference, and as
        collective-agreement back-matter (memoranda and letters with no
        article number) for the back-matter penalty."""
        self._note_tokens = []
        self._is_regulation = []
        self._is_backmatter = []
        for c in self.chunks:
            self._note_tokens.append(set(tokenize(topical_title(c))))
            self._is_regulation.append(
                c.get("doc_type", "legislation") == "legislation"
                and c["act_code"].startswith(("SOR", "C.R.C")))
            self._is_backmatter.append(
                c.get("doc_type") == "agreement"
                and not str(c["section"])[:1].isdigit())

    def _build_appendix_index(self):
        """Index directive appendices by (act_code, letter), so a directive
        section that cites 'Appendix C' can pull that appendix into the result
        set -- a directive's rate-table appendices are bare numbers and rank
        poorly on a natural-language query, yet the section citing them is of
        little use without them."""
        self._appendix = defaultdict(list)
        for idx, c in enumerate(self.chunks):
            if c.get("doc_type") != "directive":
                continue
            m = _APPENDIX_HEAD.match(c["marginal_note"])
            if m:
                self._appendix[(c["act_code"], m.group(1).upper())].append(idx)

    def _load_semantic(self):
        """Load precomputed embeddings and the query embedder.

        Any failure (missing embeddings, or numpy/model unavailable) degrades
        the index to BM25-only rather than breaking retrieval. Chunks without
        a stored embedding get a zero vector and are reported on stderr."""
        self.semantic = False
        emb_path = PROCESSED_DIR / "embeddings.npz"
        if not emb_path.exists():
            print("CanLex index: no embeddings.npz; using BM25 only "
                  "(run 'canlex.embed' to enable semantic search).", file=sys.stderr)
            return
        try:
            import numpy as np
            from .embed import Embedder
            with np.load(emb_path) as data:
                id_to_vec = dict(zip(data["ids"].tolist(), data["vectors"]))
                dim = int(data["vectors"].shape[1])
            missing = 0
            rows = []
            for c in self.chunks:
                vec = id_to_vec.get(c["id"])
                if vec is None:
                    missing += 1
                    rows.append(np.zeros(dim, dtype=np.float32))
                else:
                    rows.append(vec)
            self._np = np
            self.vectors = np.vstack(rows)  # row i belongs to self.chunks[i]
            self.embedder = Embedder()
            self.semantic = True
            if missing:
                print(f"CanLex index: {missing}/{len(self.chunks)} sections lack "
                      f"embeddings; re-run 'canlex.embed' to refresh.", file=sys.stderr)
        except Exception as exc:
            # Deliberate best-effort boundary: report and fall back to BM25.
            print(f"CanLex index: semantic search disabled ({type(exc).__name__}: "
                  f"{exc}); using BM25 only.", file=sys.stderr)
            self.semantic = False

    def _load_reranker(self):
        """Load the cross-encoder reranker; degrade to the fusion order on failure."""
        self.reranker = None
        try:
            from .rerank import Reranker
            self.reranker = Reranker()
        except Exception as exc:
            # Deliberate best-effort boundary: report and carry on without it.
            print(f"CanLex index: reranker disabled ({type(exc).__name__}: {exc}); "
                  f"using hybrid fusion order.", file=sys.stderr)

    def _bm25_scores(self, query):
        """Return {doc_idx: BM25 score} for every chunk sharing a term with
        the query. Each distinct query term counts once; terms absent from
        the index are ignored."""
        scores = defaultdict(float)
        for term in set(tokenize(query)):
            idf = self.idf.get(term)
            if idf is None:
                continue
            for idx, tf in self.postings[term]:
                dl = self.doc_len[idx]
                denom = tf + K1 * (1 - B + B * dl / self.avgdl)
                scores[idx] += idf * tf * (K1 + 1) / denom
        return scores

    def _semantic_ranking(self, query):
        """Return (top CANDIDATES chunk indices by similarity to the query,
        highest similarity value). Requires _load_semantic to have succeeded."""
        qv = self.embedder.encode_query(query)
        sims = self.vectors @ qv
        order = self._np.argsort(sims)[::-1][:CANDIDATES]
        # The top cosine similarity doubles as a corpus-coverage signal: a query
        # the corpus cannot answer has no passage close to it.
        return [int(i) for i in order], float(sims.max())

    def _rerank_doc(self, idx):
        """Text handed to the cross-encoder for chunk idx: its citation and
        marginal note on one line, then the chunk text."""
        c = self.chunks[idx]
        return f"{c['citation']} — {c['marginal_note']}\n{c['text']}"

    def _source_key(self, idx):
        """The parent document a chunk belongs to, for diversity capping.

        Returns None for primary instruments -- every doc_type in
        PRIMARY_DOC_TYPES (legislation, collective agreements, directives and
        delegation instruments) -- whose sections are distinct provisions and
        are never capped; memoranda are keyed by memo number, and every other
        doc_type (case law) by its act_code."""
        c = self.chunks[idx]
        doc_type = c.get("doc_type", "legislation")
        if doc_type in PRIMARY_DOC_TYPES:
            return None
        if doc_type == "memorandum":
            return ("memorandum", c["section"])  # act_code is a shared constant
        return (doc_type, c["act_code"])  # one decision, keyed by citation

    def _diversify(self, ordered):
        """Reorder so no single secondary source (a case or memorandum) can
        monopolise the results: once a source has contributed SOURCE_CAP
        chunks, its remaining chunks are deferred below every other candidate.

        This stops a heavily paragraph-chunked decision from crowding out the
        statute it interprets. Primary instruments (those for which
        _source_key returns None) are never capped. Relative order is
        preserved within both the kept and the deferred group."""
        kept, deferred, counts = [], [], defaultdict(int)
        for idx in ordered:
            key = self._source_key(idx)
            if key is None:
                kept.append(idx)
                continue
            counts[key] += 1
            (kept if counts[key] <= SOURCE_CAP else deferred).append(idx)
        return kept + deferred

    def _ensure_primary(self, ordered, top_k, q_tokens):
        """Guarantee the governing primary instrument is surfaced: when the
        natural top_k is monopolised by case law or D-memoranda that interpret
        a statute, pull the most topically-on-target Act/agreement/directive/
        delegation section into the top_k, displacing the lowest-ranked
        secondary sources. The single best match is always kept in place.

        Two changes from the older 'ensure_legislation' guarantee:
        (i) all primary instruments count, not only legislation -- so an
        FB-Agreement query that surfaces only FPSLREB case law gets the
        agreement article pulled in too; (ii) the candidate to promote is
        chosen by title-match against the query (the section whose
        marginal_note covers the most of the query's distinctive vocabulary),
        not by raw fusion rank. The fusion rank surfaces tangentially-on-topic
        sections that share the Act's general vocabulary; the title-match
        scorer surfaces the section actually about the topic ('Seizure and
        forfeiture' over 'Report to President' for a 'seize currency' query).

        The window is topped up to at most two primary instruments. Returns
        `ordered` unchanged when top_k < 3, when the window already holds two
        primaries, or when no primary candidate lies below the window;
        otherwise a reordered list of the same elements."""
        if top_k < 3:
            return ordered

        def is_primary(i):
            return self.chunks[i].get("doc_type", "legislation") in PRIMARY_DOC_TYPES

        top, rest = ordered[:top_k], ordered[top_k:]
        need = 2 - sum(1 for i in top if is_primary(i))
        if need <= 0:
            return ordered
        primary_in_rest = [i for i in rest if is_primary(i)]
        if not primary_in_rest:
            return ordered
        if q_tokens:
            def title_score(idx):
                note_tokens = self._note_tokens[idx]
                if not note_tokens:
                    return 0.0
                matched = sum(self.idf.get(t, 0.0) for t in note_tokens if t in q_tokens)
                total = sum(self.idf.get(t, 0.0) for t in note_tokens) or 1.0
                score = matched * matched / total
                # Mirror the fusion-stage hierarchy preferences for tiebreaks:
                # the governing Act beats its regulation, and numbered
                # agreement articles beat their back-matter, when both have
                # identical titles (e.g. IRPA s. 112 and IRPR s. 160 both
                # marginal-noted 'Application for protection').
                if self._is_regulation[idx]:
                    score -= REG_PENALTY
                if self._is_backmatter[idx]:
                    score -= BACKMATTER_PENALTY
                return score

            # Sort by title-match descending, then by original fusion order as
            # a tiebreak (stable sort: keep the original rest order).
            primary_in_rest.sort(key=title_score, reverse=True)
        promote = primary_in_rest[:need]
        # Victims are the lowest-ranked secondary sources in the window.
        drop = [i for i in reversed(top) if not is_primary(i)][:len(promote)]
        if not drop:
            return ordered
        promote = promote[:len(drop)]
        dropped, promoted = set(drop), set(promote)
        kept = [i for i in top if i not in dropped]
        return kept[:1] + promote + kept[1:] + drop + [
            i for i in rest if i not in promoted]

    def _cosurface_appendices(self, top):
        """Append the appendices the directive results cite but that retrieval
        missed.

        A directive's rate tables ('Appendix C') rank poorly on a
        natural-language query, yet a section that cites them is of little use
        without them -- so the appendix travels with it. When more appendices
        are cited than APPENDIX_CAP allows, the ones cited by the most results
        win, so a lone off-topic result cannot outvote the relevant ones.
        Returns `top` extended by up to APPENDIX_CAP appendix chunks."""
        have = set(top)
        cited = Counter()
        for idx in top:
            c = self.chunks[idx]
            if c.get("doc_type") != "directive":
                continue
            seen = set()  # count an appendix once per citing result
            for m in _APPENDIX_REF.finditer(c["text"]):
                key = (c["act_code"], m.group(1).upper())
                for app in self._appendix.get(key, ()):
                    if app not in have and app not in seen:
                        seen.add(app)
                        cited[app] += 1
        return top + [app for app, _ in cited.most_common(APPENDIX_CAP)]

    def _highlight(self, query, indices):
        """For each result chunk, the subsection or paragraph most on point
        for the query: {result_position: (citation_suffix, snippet)}.

        Uses the cross-encoder; returns {} if it is unavailable or nothing is
        structured. Only legislation chunks are considered, and only the first
        results are scored -- a pinpoint deep in the list is not worth the
        cross-encoder cost. Snippets are cut to 240 characters and
        whitespace-collapsed."""
        if not self.reranker:
            return {}
        jobs = []  # (result_position, label, scoring_text, snippet)
        for pos, idx in enumerate(indices[:8]):
            c = self.chunks[idx]
            if c.get("doc_type", "legislation") != "legislation":
                continue
            note = c["marginal_note"]
            for label, scoring, snippet in _provision_units(c["text"]):
                jobs.append((pos, label, f"{note}. {scoring}", snippet))
        if not jobs:
            return {}
        best = {}  # result_position -> (score, label, snippet)
        for (pos, label, _, snippet), score in zip(
                jobs, self.reranker.score(query, [j[2] for j in jobs])):
            if pos not in best or score > best[pos][0]:
                best[pos] = (score, label, snippet)
        return {pos: (label, " ".join(snippet[:240].split()))
                for pos, (score, label, snippet) in best.items()}

    def search(self, query, top_k=6, act=None, doc_type=None):
        """Hybrid candidate fusion (BM25 + semantic), then cross-encoder rerank.

        query    -- the user's question.
        top_k    -- number of ranked results wanted; co-surfaced appendices
                    may be appended beyond it. A non-positive value returns [].
        act      -- optional filter, matched case-insensitively against a
                    chunk's act_short or act_code.
        doc_type -- optional filter on the chunk's doc_type (a chunk without
                    one counts as "legislation").

        Returns a list of dicts: each chunk's own fields plus 'score' (the
        fused score rounded to 4 places; 0.0 for a chunk that was not among
        the candidates), 'confidence' (the top semantic similarity, or None
        when semantic search is off) and, where available, 'highlight'."""
        if top_k <= 0:
            # A negative top_k would otherwise slice candidates from the end.
            return []
        # Expand legal abbreviations (PRRA, H&C, ...) into statutory wording for
        # the recall stages; the reranker still sees the user's original query.
        expanded = expand_query(query)
        confidence = None
        fused = defaultdict(float)
        bm25 = self._bm25_scores(expanded)
        for rank, idx in enumerate(sorted(bm25, key=bm25.get, reverse=True)[:CANDIDATES]):
            fused[idx] += 1.0 / (RRF_K + rank)
        if self.semantic:
            sem_order, confidence = self._semantic_ranking(expanded)
            for rank, idx in enumerate(sem_order):
                fused[idx] += W_SEM / (RRF_K + rank)

        # Ensure explicitly-referenced sections are retrieved even if recall
        # missed them -- but only for Acts the query actually names. A query
        # like "IRPA s. 40 misrepresentation defence" uses the section number
        # topically; pulling every Act's s. 40 into the pool would drown out
        # the case law that interprets the section the user meant. Substring
        # check rather than token-overlap because act_codes split into trivial
        # tokens ("A-8.8" -> {a, 8}) that spuriously match common query words.
        refs = _section_refs(query)
        q_lc = query.lower()

        def _act_in_query(c):
            short = c["act_short"].lower()
            code = c["act_code"].lower()
            return ((short and short in q_lc)
                    or (code and len(code) >= 3 and code in q_lc))

        if refs:
            for idx, c in enumerate(self.chunks):
                if (c["section"] in refs and idx not in fused
                        and _act_in_query(c)):
                    fused[idx] = 0.0

        # Title-match boost: the marginal note is a section's canonical subject.
        # Reward a candidate by how completely and how specifically the query
        # matches its marginal note. The overlap is idf-weighted (matching a
        # distinctive title like "hours of work" counts far more than a generic
        # one like "Decision"), scaled by coverage, and capped -- so it nudges
        # ranking toward the provision a question names by topic without
        # overriding a strong base score.
        q_tokens = set(tokenize(expanded))
        for idx in list(fused):
            note_tokens = self._note_tokens[idx]
            total = sum(self.idf.get(t, 0.0) for t in note_tokens)
            if total <= 0:
                continue
            matched = sum(self.idf.get(t, 0.0) for t in note_tokens if t in q_tokens)
            if matched > 0:
                fused[idx] += min(MN_WEIGHT * matched * matched / total, MN_CAP)

        # Hierarchy penalties: a topical question should surface the governing
        # provision, not the supplementary material around it. An Act creates a
        # duty while a regulation only elaborates procedure; a collective
        # agreement's numbered articles are its substance while its memoranda
        # and letters of understanding are back-matter. Both take a small
        # fusion penalty so the governing provision wins a close contest.
        for idx in list(fused):
            if self._is_regulation[idx]:
                fused[idx] -= REG_PENALTY
            elif self._is_backmatter[idx]:
                fused[idx] -= BACKMATTER_PENALTY

        def allowed(idx):
            c = self.chunks[idx]
            if act and act.lower() not in (c["act_short"].lower(), c["act_code"].lower()):
                return False
            if doc_type and c.get("doc_type", "legislation") != doc_type:
                return False
            return True

        candidates = [i for i in sorted(fused, key=fused.get, reverse=True) if allowed(i)]
        if not candidates:
            return []
        scores = {i: fused[i] for i in candidates}

        # Precision stage: the cross-encoder rescores the top candidate pool, but
        # may only PROMOTE -- each pooled candidate is placed at the better of its
        # fusion rank and its rerank rank, never below its fusion rank. The
        # reranker reliably surfaces a strong answer the fusion ranked low, yet is
        # unreliable on long statutory text (it can score the right section
        # negative), so its power to demote a candidate is deliberately removed.
        if self.reranker:
            pool = candidates[:RERANK_POOL]
            ce = dict(zip(pool, self.reranker.score(
                query, [self._rerank_doc(i) for i in pool])))
            fusion_rank = {idx: r for r, idx in enumerate(pool)}
            rerank_rank = {idx: r for r, idx in enumerate(
                sorted(pool, key=ce.get, reverse=True))}
            pool.sort(key=lambda i: (min(fusion_rank[i], rerank_rank[i]), fusion_rank[i]))
            candidates = pool + candidates[RERANK_POOL:]

        # Explicit section references are pinned to the very top -- using the
        # same Act-mentioned constraint as the recall step above, for the same
        # reason: a bare "s. 40" without an Act name is usually topical
        # (e.g. "the IRPA s. 40 misrepresentation defence"), not a lookup.
        if refs:
            pinned = [i for i in candidates
                      if self.chunks[i]["section"] in refs
                      and _act_in_query(self.chunks[i])]
            if pinned:
                pinned_set = set(pinned)
                candidates = pinned + [i for i in candidates if i not in pinned_set]

        # Cap one-source monopolies, then guarantee a primary instrument on
        # the topic is represented. The guarantee operates on a fixed visible
        # window of min(top_k, 5), not the full top_k -- with top_k=20 (the
        # eval default) the larger window almost always contains incidental
        # legislation, so the guarantee never fires even when the governing
        # provision is buried at rank 10+.
        candidates = self._diversify(candidates)
        candidates = self._ensure_primary(candidates, min(top_k, 5), q_tokens)
        top = self._cosurface_appendices(candidates[:top_k])
        highlights = self._highlight(query, top)
        results = []
        for pos, i in enumerate(top):
            result = {**self.chunks[i], "score": round(scores.get(i, 0.0), 4),
                      "confidence": confidence}
            if pos in highlights:
                result["highlight"] = highlights[pos]
            results.append(result)
        return results

    def get_section(self, act, section):
        """Return the first chunk whose section equals `section` and whose
        act_short or act_code equals `act` (case-insensitively), or None."""
        act = act.lower()
        for c in self.chunks:
            if c["section"] == section and act in (c["act_short"].lower(),
                                                   c["act_code"].lower()):
                return c
        return None

    def _build_xref(self):
        """Index legislation by (act, section); find each Act's definitions
        section; link every regulation to its enabling Act and every
        D-memorandum to the provisions it cites -- all for cross-referencing."""
        self._by_section = {}
        self._defs_section = {}
        acts, regs = {}, {}  # act_code -> (act_short, act_name)
        for c in self.chunks:
            if c.get("doc_type", "legislation") != "legislation":
                continue
            self._by_section[(c["act_code"], c["section"])] = c
            # The first section titled as definitions/interpretation wins.
            if c["act_code"] not in self._defs_section and (
                    c["marginal_note"].strip().lower() in (
                        "definitions", "definition", "interpretation")):
                self._defs_section[c["act_code"]] = c
            bucket = regs if c["act_code"].startswith(("SOR", "C.R.C")) else acts
            bucket.setdefault(c["act_code"], (c["act_short"], c["act_name"]))

        # Link a regulation to the Act it is made under by matching their names
        # ("X Regulations" <-> "X Act").
        self._enabling_act = {}  # reg code -> (act_short, act_name)
        self._regulations = defaultdict(list)  # act code -> [(reg_short, reg_name)]

        def base(name):
            return re.sub(r"\b(?:Act|Regulations)\b", "", name).strip().lower()

        act_by_base = {base(n): (code, s, n) for code, (s, n) in acts.items()}
        for rcode, (rshort, rname) in regs.items():
            hit = act_by_base.get(base(rname))
            if hit:
                self._enabling_act[rcode] = (hit[1], hit[2])
                self._regulations[hit[0]].append((rshort, rname))

        # Link D-memoranda to the Customs Act / Customs Tariff provisions they
        # cite; an unqualified "the Act" in a D-memo means the Customs Act.
        by_short = {s.lower(): code for code, (s, n) in acts.items()}
        customs, tariff = by_short.get("customs act"), by_short.get("customs tariff")
        self._memos_for_section = defaultdict(set)  # (act_code, section) -> memos
        for c in self.chunks:
            if c.get("doc_type") != "memorandum":
                continue
            for num, which in _MEMO_CITE.findall(c["text"]):
                code = tariff if which.lower() == "customs tariff" else customs
                if code:
                    self._memos_for_section[(code, num)].add(c["section"])

    def related(self, chunk):
        """Cross-references for a legislation result, as a dict: 'provisions'
        (intra-Act sections it cites, plus the definitions section),
        'regulations' (made under this Act), 'enabling_act' (for a regulation,
        the Act it is made under) and 'memoranda' (D-memo numbers citing this
        section).

        'provisions' holds at most 8 (section, marginal_note) pairs and
        'memoranda' at most 6 sorted entries. Empty dict for case law,
        memoranda, etc."""
        if chunk.get("doc_type", "legislation") != "legislation":
            return {}
        act = chunk["act_code"]
        provisions, seen = [], {chunk["section"]}
        defs = self._defs_section.get(act)
        if defs and defs["section"] not in seen:
            provisions.append((defs["section"], defs["marginal_note"]))
            seen.add(defs["section"])
        for match in _XREF.finditer(chunk["text"]):
            sec = match.group(1)
            if sec in seen:
                continue
            target = self._by_section.get((act, sec))
            if target:
                provisions.append((sec, target["marginal_note"]))
                seen.add(sec)
                if len(provisions) >= 8:
                    break
        return {
            "provisions": provisions,
            "regulations": self._regulations.get(act, []),
            "enabling_act": self._enabling_act.get(act),
            "memoranda": sorted(self._memos_for_section.get(
                (act, chunk["section"]), []))[:6],
        }


def main():
    """Command-line entry point: build the index, run the query given on the
    command line, and print each result's score, citation, title and the
    start of its text."""
    if len(sys.argv) < 2:
        print('usage: python -m canlex.index "your query"')
        return
    query = " ".join(sys.argv[1:])
    index = LegislationIndex()
    if index.reranker:
        mode = "hybrid + cross-encoder rerank"
    elif index.semantic:
        mode = "hybrid (BM25 + semantic)"
    else:
        mode = "BM25 only"
    print(f"{len(index.chunks)} sections indexed - {mode}. Query: {query!r}\n")
    for r in index.search(query):
        print(f"[{r['score']:.3f}] {r['citation']} - {r['marginal_note']}")
        print(f" {r['text'].replace(chr(10), ' ')[:160]}")
        print()


if __name__ == "__main__":
    main()