Co-surface a directive's cited appendices into search results
Browse files
A directive section that cites a rate-table appendix ("as specified in
Appendix C") is of little use without it, but the appendix -- bare
numbers -- ranks poorly on a natural-language query. search() now
appends any appendix a directive result cites: index.py builds an
(act_code, letter) -> appendix map and _cosurface_appendices() pulls
the cited appendices in. When more are cited than the cap allows, the
ones cited by the most results win, so a lone off-topic result cannot
crowd out the relevant ones. Cross-directive citations ("Appendix C of
the NJC Travel Directive") are left alone.
This fixes the web app under-claiming -- it had told users to consult
Appendix C for dollar figures CanLex actually holds.
129-question eval: Hit@1 0.74, Hit@3 0.89, Hit@5 0.93, Hit@10 0.97,
MRR 0.82 -- identical to the pre-change baseline.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- canlex/index.py +49 -2
- tests/test_index.py +59 -1
|
@@ -25,6 +25,7 @@ REG_PENALTY = 0.008 # small fusion penalty on regulation sections, so the Act
|
|
| 25 |
BACKMATTER_PENALTY = 0.008 # likewise for a collective agreement's back-matter
|
| 26 |
# (memoranda, letters of understanding) vs its numbered articles
|
| 27 |
SOURCE_CAP = 2 # max chunks one case or memorandum may contribute
|
|
|
|
| 28 |
|
| 29 |
# Primary instruments -- enacted law, collective agreements, the NJC directives
|
| 30 |
# incorporated into them, and the IRPA delegation instruments. Their sections or
|
|
@@ -50,6 +51,14 @@ _MEMO_CITE = re.compile(
|
|
| 50 |
r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
|
| 51 |
r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
_STEMMER = snowballstemmer.stemmer("english")
|
| 55 |
_STEM_CACHE = {}
|
|
@@ -118,6 +127,7 @@ class LegislationIndex:
|
|
| 118 |
self._build_bm25()
|
| 119 |
self._build_note_tokens()
|
| 120 |
self._build_xref()
|
|
|
|
| 121 |
self._load_semantic()
|
| 122 |
self._load_reranker()
|
| 123 |
|
|
@@ -168,6 +178,20 @@ class LegislationIndex:
|
|
| 168 |
c.get("doc_type") == "agreement"
|
| 169 |
and not str(c["section"])[:1].isdigit())
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
def _load_semantic(self):
|
| 172 |
"""Load precomputed embeddings and the query embedder.
|
| 173 |
|
|
@@ -293,6 +317,29 @@ class LegislationIndex:
|
|
| 293 |
return kept[:1] + promote + kept[1:] + drop + [
|
| 294 |
i for i in rest if i not in promoted]
|
| 295 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
def _highlight(self, query, indices):
|
| 297 |
"""For each result chunk, the subsection or paragraph most on point for
|
| 298 |
the query: {result_position: (citation_suffix, snippet)}. Uses the
|
|
@@ -412,11 +459,11 @@ class LegislationIndex:
|
|
| 412 |
candidates = self._diversify(candidates)
|
| 413 |
candidates = self._ensure_legislation(candidates, top_k)
|
| 414 |
|
| 415 |
-
top = candidates[:top_k]
|
| 416 |
highlights = self._highlight(query, top)
|
| 417 |
results = []
|
| 418 |
for pos, i in enumerate(top):
|
| 419 |
-
result = {**self.chunks[i], "score": round(scores
|
| 420 |
"confidence": confidence}
|
| 421 |
if pos in highlights:
|
| 422 |
result["highlight"] = highlights[pos]
|
|
|
|
| 25 |
BACKMATTER_PENALTY = 0.008 # likewise for a collective agreement's back-matter
|
| 26 |
# (memoranda, letters of understanding) vs its numbered articles
|
| 27 |
SOURCE_CAP = 2 # max chunks one case or memorandum may contribute
|
| 28 |
+
APPENDIX_CAP = 3 # max referenced appendices co-surfaced into a result set
|
| 29 |
|
| 30 |
# Primary instruments -- enacted law, collective agreements, the NJC directives
|
| 31 |
# incorporated into them, and the IRPA delegation instruments. Their sections or
|
|
|
|
| 51 |
r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
|
| 52 |
r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)
|
| 53 |
|
| 54 |
+
# A directive section's reference to an appendix of the same directive --
|
| 55 |
+
# "as specified in Appendix C". A trailing "of" ("Appendix C of the NJC Travel
|
| 56 |
+
# Directive") marks a cross-directive citation and is deliberately left alone.
|
| 57 |
+
_APPENDIX_REF = re.compile(r"\bAppendi(?:x|ces)\s+([A-Za-z])\b(?!\s+of\b)",
|
| 58 |
+
re.IGNORECASE)
|
| 59 |
+
# A directive chunk that *is* an appendix: its title opens "Appendix C ...".
|
| 60 |
+
_APPENDIX_HEAD = re.compile(r"Appendix\s+([A-Za-z])\b", re.IGNORECASE)
|
| 61 |
+
|
| 62 |
|
| 63 |
_STEMMER = snowballstemmer.stemmer("english")
|
| 64 |
_STEM_CACHE = {}
|
|
|
|
| 127 |
self._build_bm25()
|
| 128 |
self._build_note_tokens()
|
| 129 |
self._build_xref()
|
| 130 |
+
self._build_appendix_index()
|
| 131 |
self._load_semantic()
|
| 132 |
self._load_reranker()
|
| 133 |
|
|
|
|
| 178 |
c.get("doc_type") == "agreement"
|
| 179 |
and not str(c["section"])[:1].isdigit())
|
| 180 |
|
| 181 |
+
def _build_appendix_index(self):
    """Build ``self._appendix``: (act_code, letter) -> appendix chunk indices.

    Only chunks whose ``doc_type`` is "directive" are considered. Such a
    chunk counts as an appendix when its marginal note opens with
    "Appendix <letter>" (``_APPENDIX_HEAD``, matched at the start of the
    note); the letter is stored upper-cased. Indices are appended in chunk
    order, so an appendix spread over several chunks keeps all of them.

    Why this exists: a directive's rate-table appendices are bare numbers
    and rank poorly on a natural-language query, yet a section citing
    'Appendix C' is of little use without that appendix.
    """
    self._appendix = defaultdict(list)
    for idx, c in enumerate(self.chunks):
        if c.get("doc_type") != "directive":
            continue
        # A missing or None marginal note cannot be an appendix heading;
        # treat it as empty instead of letting re.match raise mid-build.
        m = _APPENDIX_HEAD.match(c.get("marginal_note") or "")
        if m:
            self._appendix[(c["act_code"], m.group(1).upper())].append(idx)
|
| 194 |
+
|
| 195 |
def _load_semantic(self):
|
| 196 |
"""Load precomputed embeddings and the query embedder.
|
| 197 |
|
|
|
|
| 317 |
return kept[:1] + promote + kept[1:] + drop + [
|
| 318 |
i for i in rest if i not in promoted]
|
| 319 |
|
| 320 |
+
def _cosurface_appendices(self, top):
    """Return `top` followed by the appendices its directive results cite.

    For every chunk index in `top` whose ``doc_type`` is "directive", the
    chunk text is scanned with ``_APPENDIX_REF`` and each cited letter is
    looked up in ``self._appendix`` under the chunk's own ``act_code``.
    Appendix chunks already in `top` are ignored. Each remaining appendix
    chunk gets one vote per citing result, however often that result
    mentions it, and the ``APPENDIX_CAP`` highest-voted ones are appended,
    most-cited first -- so a lone off-topic result cannot outvote the
    appendices the relevant results agree on.

    `top` itself is not modified; a new list is returned.
    """
    present = set(top)
    votes = Counter()
    for position in top:
        entry = self.chunks[position]
        if entry.get("doc_type") != "directive":
            continue
        # dict.fromkeys drops repeats while keeping first-seen order, so an
        # appendix chunk is voted for at most once by this citing result.
        pulled = dict.fromkeys(
            app
            for ref in _APPENDIX_REF.finditer(entry["text"])
            for app in self._appendix.get(
                (entry["act_code"], ref.group(1).upper()), ())
            if app not in present)
        for app in pulled:
            votes[app] += 1
    extras = [app for app, _count in votes.most_common(APPENDIX_CAP)]
    return top + extras
|
| 342 |
+
|
| 343 |
def _highlight(self, query, indices):
|
| 344 |
"""For each result chunk, the subsection or paragraph most on point for
|
| 345 |
the query: {result_position: (citation_suffix, snippet)}. Uses the
|
|
|
|
| 459 |
candidates = self._diversify(candidates)
|
| 460 |
candidates = self._ensure_legislation(candidates, top_k)
|
| 461 |
|
| 462 |
+
top = self._cosurface_appendices(candidates[:top_k])
|
| 463 |
highlights = self._highlight(query, top)
|
| 464 |
results = []
|
| 465 |
for pos, i in enumerate(top):
|
| 466 |
+
result = {**self.chunks[i], "score": round(scores.get(i, 0.0), 4),
|
| 467 |
"confidence": confidence}
|
| 468 |
if pos in highlights:
|
| 469 |
result["highlight"] = highlights[pos]
|
|
@@ -10,7 +10,8 @@ or reranker are loaded.
|
|
| 10 |
import unittest
|
| 11 |
|
| 12 |
from canlex.index import (
|
| 13 |
-
LegislationIndex, SOURCE_CAP, tokenize, _section_refs,
|
|
|
|
| 14 |
)
|
| 15 |
|
| 16 |
|
|
@@ -166,5 +167,62 @@ class DocTypeFlagTests(unittest.TestCase):
|
|
| 166 |
self.assertEqual(self.idx._note_tokens[5], set(tokenize("Importing goods")))
|
| 167 |
|
| 168 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
if __name__ == "__main__":
|
| 170 |
unittest.main()
|
|
|
|
| 10 |
import unittest
|
| 11 |
|
| 12 |
from canlex.index import (
|
| 13 |
+
LegislationIndex, SOURCE_CAP, APPENDIX_CAP, tokenize, _section_refs,
|
| 14 |
+
_provision_units,
|
| 15 |
)
|
| 16 |
|
| 17 |
|
|
|
|
| 167 |
self.assertEqual(self.idx._note_tokens[5], set(tokenize("Importing goods")))
|
| 168 |
|
| 169 |
|
| 170 |
+
class CosurfaceAppendixTests(unittest.TestCase):
    """Checks that _cosurface_appendices adds a directive appendix to the
    result set when a directive result cites it and retrieval left it out."""

    def _idx(self):
        """Three directive chunks: a citing section (0), Appendix C (1) and
        Appendix B (2), with the appendix index already built."""
        citing_section = chunk(
            doc_type="directive", act_code="d10", marginal_note="Meals",
            text="paid the meal allowance at the rates in Appendix C.")
        appendix_c = chunk(
            doc_type="directive", act_code="d10",
            marginal_note="Appendix C - Allowances", text="rate tables")
        appendix_b = chunk(
            doc_type="directive", act_code="d10",
            marginal_note="Appendix B - Kilometric Rates", text="km rates")
        built = bare_index([citing_section, appendix_c, appendix_b])
        built._build_appendix_index()
        return built

    def test_cited_appendix_is_pulled_in(self):
        extended = self._idx()._cosurface_appendices([0])
        self.assertEqual(extended, [0, 1])

    def test_no_duplicate_when_already_present(self):
        extended = self._idx()._cosurface_appendices([0, 1])
        self.assertEqual(extended, [0, 1])

    def test_uncited_appendix_is_left_out(self):
        # Result 0 cites only Appendix C, so Appendix B (index 2) stays out.
        extended = self._idx()._cosurface_appendices([0])
        self.assertNotIn(2, extended)

    def test_cross_directive_citation_is_left_alone(self):
        # "Appendix C of the ..." points at another directive, so the local
        # Appendix C (index 1) must not be pulled in.
        citing_section = chunk(
            doc_type="directive", act_code="d10", marginal_note="A section",
            text="see Appendix C of the NJC Travel Directive")
        local_appendix = chunk(
            doc_type="directive", act_code="d10",
            marginal_note="Appendix C - Allowances", text="tables")
        built = bare_index([citing_section, local_appendix])
        built._build_appendix_index()
        self.assertEqual(built._cosurface_appendices([0]), [0])

    def test_cap_keeps_the_most_cited_appendix(self):
        # Four appendices are cited; Appendix A by two sections, the rest once.
        # With the cap exceeded, the twice-cited appendix must survive.
        sections = [
            chunk(doc_type="directive", act_code="d1", marginal_note="S1",
                  text="see Appendix A"),
            chunk(doc_type="directive", act_code="d1", marginal_note="S2",
                  text="see Appendix A; see Appendix B"),
            chunk(doc_type="directive", act_code="d1", marginal_note="S3",
                  text="see Appendix C; see Appendix D"),
        ]
        appendices = [
            chunk(doc_type="directive", act_code="d1", marginal_note=note)
            for note in ("Appendix A", "Appendix B", "Appendix C", "Appendix D")
        ]
        built = bare_index(sections + appendices)
        built._build_appendix_index()
        out = built._cosurface_appendices([0, 1, 2])
        self.assertEqual(len(out), 3 + APPENDIX_CAP)  # cap respected
        self.assertIn(3, out)  # Appendix A survives
|
| 225 |
+
|
| 226 |
+
|
| 227 |
if __name__ == "__main__":
|
| 228 |
unittest.main()
|