Beemer Claude Opus 4.7 committed on
Commit
58fc4b4
·
1 Parent(s): b105ad3

Co-surface a directive's cited appendices into search results

Browse files

A directive section that cites a rate-table appendix ("as specified in
Appendix C") is of little use without it, but the appendix -- bare
numbers -- ranks poorly on a natural-language query. search() now
appends any appendix a directive result cites: index.py builds an
(act_code, letter) -> appendix map and _cosurface_appendices() pulls
the cited appendices in. When more are cited than the cap allows, the
ones cited by the most results win, so a lone off-topic result cannot
crowd out the relevant ones. Cross-directive citations ("Appendix C of
the NJC Travel Directive") are left alone.

This fixes the web app under-claiming -- it had told users to consult
Appendix C for dollar figures CanLex actually holds.

129-question eval: Hit@1 0.74, Hit@3 0.89, Hit@5 0.93, Hit@10 0.97,
MRR 0.82 -- identical to the pre-change baseline.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (2) hide show
  1. canlex/index.py +49 -2
  2. tests/test_index.py +59 -1
canlex/index.py CHANGED
@@ -25,6 +25,7 @@ REG_PENALTY = 0.008 # small fusion penalty on regulation sections, so the Act
25
  BACKMATTER_PENALTY = 0.008 # likewise for a collective agreement's back-matter
26
  # (memoranda, letters of understanding) vs its numbered articles
27
  SOURCE_CAP = 2 # max chunks one case or memorandum may contribute
 
28
 
29
  # Primary instruments -- enacted law, collective agreements, the NJC directives
30
  # incorporated into them, and the IRPA delegation instruments. Their sections or
@@ -50,6 +51,14 @@ _MEMO_CITE = re.compile(
50
  r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
51
  r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)
52
 
 
 
 
 
 
 
 
 
53
 
54
  _STEMMER = snowballstemmer.stemmer("english")
55
  _STEM_CACHE = {}
@@ -118,6 +127,7 @@ class LegislationIndex:
118
  self._build_bm25()
119
  self._build_note_tokens()
120
  self._build_xref()
 
121
  self._load_semantic()
122
  self._load_reranker()
123
 
@@ -168,6 +178,20 @@ class LegislationIndex:
168
  c.get("doc_type") == "agreement"
169
  and not str(c["section"])[:1].isdigit())
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  def _load_semantic(self):
172
  """Load precomputed embeddings and the query embedder.
173
 
@@ -293,6 +317,29 @@ class LegislationIndex:
293
  return kept[:1] + promote + kept[1:] + drop + [
294
  i for i in rest if i not in promoted]
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  def _highlight(self, query, indices):
297
  """For each result chunk, the subsection or paragraph most on point for
298
  the query: {result_position: (citation_suffix, snippet)}. Uses the
@@ -412,11 +459,11 @@ class LegislationIndex:
412
  candidates = self._diversify(candidates)
413
  candidates = self._ensure_legislation(candidates, top_k)
414
 
415
- top = candidates[:top_k]
416
  highlights = self._highlight(query, top)
417
  results = []
418
  for pos, i in enumerate(top):
419
- result = {**self.chunks[i], "score": round(scores[i], 4),
420
  "confidence": confidence}
421
  if pos in highlights:
422
  result["highlight"] = highlights[pos]
 
25
  BACKMATTER_PENALTY = 0.008 # likewise for a collective agreement's back-matter
26
  # (memoranda, letters of understanding) vs its numbered articles
27
  SOURCE_CAP = 2 # max chunks one case or memorandum may contribute
28
+ APPENDIX_CAP = 3 # max referenced appendices co-surfaced into a result set
29
 
30
  # Primary instruments -- enacted law, collective agreements, the NJC directives
31
  # incorporated into them, and the IRPA delegation instruments. Their sections or
 
51
  r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
52
  r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)
53
 
54
# Same-directive appendix citation inside a section's text, e.g. "as specified
# in Appendix C"; group 1 is the appendix letter. The negative lookahead skips
# a citation that is followed by "of" ("Appendix C of the NJC Travel
# Directive"): that marks a cross-directive citation and is deliberately left
# alone.
_APPENDIX_REF = re.compile(
    r"\bAppendi(?:x|ces)\s+([A-Za-z])\b(?!\s+of\b)", flags=re.IGNORECASE)
# Title of a directive chunk that itself *is* an appendix ("Appendix C ...");
# group 1 is the appendix letter.
_APPENDIX_HEAD = re.compile(r"Appendix\s+([A-Za-z])\b", flags=re.IGNORECASE)
61
+
62
 
63
  _STEMMER = snowballstemmer.stemmer("english")
64
  _STEM_CACHE = {}
 
127
  self._build_bm25()
128
  self._build_note_tokens()
129
  self._build_xref()
130
+ self._build_appendix_index()
131
  self._load_semantic()
132
  self._load_reranker()
133
 
 
178
  c.get("doc_type") == "agreement"
179
  and not str(c["section"])[:1].isdigit())
180
 
181
def _build_appendix_index(self):
    """Build ``self._appendix``, the lookup of directive appendices.

    Keys are ``(act_code, LETTER)`` with the letter upper-cased; each value
    is the list of positions in ``self.chunks`` whose marginal note opens
    with "Appendix <letter>". Only chunks whose ``doc_type`` is
    "directive" are considered.

    Why: a directive's rate-table appendices are bare numbers and rank
    poorly on a natural-language query, yet a section citing 'Appendix C'
    is of little use without that appendix -- this map lets the cited
    appendix be pulled into the result set.
    """
    # Publish the (empty) map first, then fill it in place.
    by_letter = self._appendix = defaultdict(list)
    for position, entry in enumerate(self.chunks):
        if entry.get("doc_type") == "directive":
            # .match() anchors at the start: the title must *open* with
            # the appendix heading, not merely mention one.
            head = _APPENDIX_HEAD.match(entry["marginal_note"])
            if head is not None:
                letter = head.group(1).upper()
                by_letter[(entry["act_code"], letter)].append(position)
194
+
195
  def _load_semantic(self):
196
  """Load precomputed embeddings and the query embedder.
197
 
 
317
  return kept[:1] + promote + kept[1:] + drop + [
318
  i for i in rest if i not in promoted]
319
 
320
def _cosurface_appendices(self, top):
    """Return `top` followed by the appendices its directive results cite.

    `top` is a list of chunk positions. Every result whose ``doc_type`` is
    "directive" has its text scanned for same-directive appendix citations;
    each cited appendix chunk not already in `top` receives one vote per
    citing result, however often that result repeats the citation. The
    APPENDIX_CAP chunks with the most votes are appended, so when more
    appendices are cited than the cap allows, a lone off-topic result
    cannot outvote the ones the relevant results agree on.

    Why: a directive's rate tables ('Appendix C') rank poorly on a
    natural-language query, yet a section that cites them is of little use
    without them -- so the appendix travels with the section.
    """
    present = set(top)
    votes = Counter()  # appendix chunk position -> number of citing results
    for result_idx in top:
        chunk = self.chunks[result_idx]
        if chunk.get("doc_type") == "directive":
            # Dict keys keep first-seen order and collapse repeats, so one
            # result casts at most one vote per appendix chunk.
            ballot = {}
            for ref in _APPENDIX_REF.finditer(chunk["text"]):
                letter = ref.group(1).upper()
                hits = self._appendix.get((chunk["act_code"], letter), ())
                ballot.update(
                    (hit, None) for hit in hits if hit not in present)
            for appendix_idx in ballot:
                votes[appendix_idx] += 1
    extras = [idx for idx, _count in votes.most_common(APPENDIX_CAP)]
    return top + extras
342
+
343
  def _highlight(self, query, indices):
344
  """For each result chunk, the subsection or paragraph most on point for
345
  the query: {result_position: (citation_suffix, snippet)}. Uses the
 
459
  candidates = self._diversify(candidates)
460
  candidates = self._ensure_legislation(candidates, top_k)
461
 
462
+ top = self._cosurface_appendices(candidates[:top_k])
463
  highlights = self._highlight(query, top)
464
  results = []
465
  for pos, i in enumerate(top):
466
+ result = {**self.chunks[i], "score": round(scores.get(i, 0.0), 4),
467
  "confidence": confidence}
468
  if pos in highlights:
469
  result["highlight"] = highlights[pos]
tests/test_index.py CHANGED
@@ -10,7 +10,8 @@ or reranker are loaded.
10
  import unittest
11
 
12
  from canlex.index import (
13
- LegislationIndex, SOURCE_CAP, tokenize, _section_refs, _provision_units,
 
14
  )
15
 
16
 
@@ -166,5 +167,62 @@ class DocTypeFlagTests(unittest.TestCase):
166
  self.assertEqual(self.idx._note_tokens[5], set(tokenize("Importing goods")))
167
 
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  if __name__ == "__main__":
170
  unittest.main()
 
10
  import unittest
11
 
12
  from canlex.index import (
13
+ LegislationIndex, SOURCE_CAP, APPENDIX_CAP, tokenize, _section_refs,
14
+ _provision_units,
15
  )
16
 
17
 
 
167
  self.assertEqual(self.idx._note_tokens[5], set(tokenize("Importing goods")))
168
 
169
 
170
class CosurfaceAppendixTests(unittest.TestCase):
    """Checks on _cosurface_appendices: an appendix cited by a directive
    result is appended to the result set when retrieval did not already
    return it."""

    @staticmethod
    def _indexed(chunks):
        """A bare index over `chunks` with its appendix map already built."""
        index = bare_index(chunks)
        index._build_appendix_index()
        return index

    def _meals_index(self):
        """One section citing Appendix C, plus Appendices C and B."""
        return self._indexed([
            chunk(doc_type="directive", act_code="d10", marginal_note="Meals",
                  text="paid the meal allowance at the rates in Appendix C."),
            chunk(doc_type="directive", act_code="d10",
                  marginal_note="Appendix C - Allowances", text="rate tables"),
            chunk(doc_type="directive", act_code="d10",
                  marginal_note="Appendix B - Kilometric Rates",
                  text="km rates"),
        ])

    def test_cited_appendix_is_pulled_in(self):
        extended = self._meals_index()._cosurface_appendices([0])
        self.assertEqual(extended, [0, 1])

    def test_no_duplicate_when_already_present(self):
        extended = self._meals_index()._cosurface_appendices([0, 1])
        self.assertEqual(extended, [0, 1])

    def test_uncited_appendix_is_left_out(self):
        # Result 0 cites only Appendix C, so Appendix B (index 2) stays out.
        extended = self._meals_index()._cosurface_appendices([0])
        self.assertNotIn(2, extended)

    def test_cross_directive_citation_is_left_alone(self):
        index = self._indexed([
            chunk(doc_type="directive", act_code="d10",
                  marginal_note="A section",
                  text="see Appendix C of the NJC Travel Directive"),
            chunk(doc_type="directive", act_code="d10",
                  marginal_note="Appendix C - Allowances", text="tables"),
        ])
        self.assertEqual(index._cosurface_appendices([0]), [0])

    def test_cap_keeps_the_most_cited_appendix(self):
        # Four appendices are cited: Appendix A by two sections, the others
        # by one each. Once the cap is exceeded the twice-cited appendix
        # must be among the survivors.
        sections = [
            chunk(doc_type="directive", act_code="d1", marginal_note=note,
                  text=text)
            for note, text in (
                ("S1", "see Appendix A"),
                ("S2", "see Appendix A; see Appendix B"),
                ("S3", "see Appendix C; see Appendix D"),
            )
        ]
        appendices = [
            chunk(doc_type="directive", act_code="d1", marginal_note=note)
            for note in ("Appendix A", "Appendix B", "Appendix C",
                         "Appendix D")
        ]
        index = self._indexed(sections + appendices)
        extended = index._cosurface_appendices([0, 1, 2])
        self.assertEqual(len(extended), 3 + APPENDIX_CAP)  # cap respected
        self.assertIn(3, extended)  # Appendix A survives
225
+
226
+
227
  if __name__ == "__main__":
228
  unittest.main()