Spaces:

Beemer0
/

CanLex

Running

Beemer Claude Opus 4.7 committed 4 days ago

Commit

58fc4b4

1 Parent(s): b105ad3

Co-surface a directive's cited appendices into search results

A directive section that cites a rate-table appendix ("as specified in
Appendix C") is of little use without it, but the appendix -- bare
numbers -- ranks poorly on a natural-language query. search() now
appends any appendix a directive result cites: index.py builds an
(act_code, letter) -> appendix map and _cosurface_appendices() pulls
the cited appendices in. When more are cited than the cap allows, the
ones cited by the most results win, so a lone off-topic result cannot
crowd out the relevant ones. Cross-directive citations ("Appendix C of
the NJC Travel Directive") are left alone.

This fixes the web app under-claiming -- it had told users to consult
Appendix C for dollar figures CanLex actually holds.

129-question eval: Hit@1 0.74, Hit@3 0.89, Hit@5 0.93, Hit@10 0.97,
MRR 0.82 -- identical to the pre-change baseline.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (2) hide show

canlex/index.py +49 -2
tests/test_index.py +59 -1

canlex/index.py CHANGED Viewed

@@ -25,6 +25,7 @@ REG_PENALTY = 0.008 # small fusion penalty on regulation sections, so the Act
 BACKMATTER_PENALTY = 0.008  # likewise for a collective agreement's back-matter
                     # (memoranda, letters of understanding) vs its numbered articles
 SOURCE_CAP = 2      # max chunks one case or memorandum may contribute
 # Primary instruments -- enacted law, collective agreements, the NJC directives
 # incorporated into them, and the IRPA delegation instruments. Their sections or
@@ -50,6 +51,14 @@ _MEMO_CITE = re.compile(
     r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
     r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)
 _STEMMER = snowballstemmer.stemmer("english")
 _STEM_CACHE = {}
@@ -118,6 +127,7 @@ class LegislationIndex:
         self._build_bm25()
         self._build_note_tokens()
         self._build_xref()
         self._load_semantic()
         self._load_reranker()
@@ -168,6 +178,20 @@ class LegislationIndex:
                 c.get("doc_type") == "agreement"
                 and not str(c["section"])[:1].isdigit())
     def _load_semantic(self):
         """Load precomputed embeddings and the query embedder.
@@ -293,6 +317,29 @@ class LegislationIndex:
         return kept[:1] + promote + kept[1:] + drop + [
             i for i in rest if i not in promoted]
     def _highlight(self, query, indices):
         """For each result chunk, the subsection or paragraph most on point for
         the query: {result_position: (citation_suffix, snippet)}. Uses the
@@ -412,11 +459,11 @@ class LegislationIndex:
         candidates = self._diversify(candidates)
         candidates = self._ensure_legislation(candidates, top_k)
-        top = candidates[:top_k]
         highlights = self._highlight(query, top)
         results = []
         for pos, i in enumerate(top):
-            result = {**self.chunks[i], "score": round(scores[i], 4),
                       "confidence": confidence}
             if pos in highlights:
                 result["highlight"] = highlights[pos]

 BACKMATTER_PENALTY = 0.008  # likewise for a collective agreement's back-matter
                     # (memoranda, letters of understanding) vs its numbered articles
 SOURCE_CAP = 2      # max chunks one case or memorandum may contribute
+APPENDIX_CAP = 3    # max referenced appendices co-surfaced into a result set
 # Primary instruments -- enacted law, collective agreements, the NJC directives
 # incorporated into them, and the IRPA delegation instruments. Their sections or
     r"\b(?:sub)?sections?\s+(\d+(?:\.\d+)?)(?:\([^)]+\))*\s+of\s+the\s+"
     r"(Customs Act|Customs Tariff|Act)\b", re.IGNORECASE)
+# A directive section's reference to an appendix of the same directive --
+# "as specified in Appendix C". A trailing "of" ("Appendix C of the NJC Travel
+# Directive") marks a cross-directive citation and is deliberately left alone.
+_APPENDIX_REF = re.compile(r"\bAppendi(?:x|ces)\s+([A-Za-z])\b(?!\s+of\b)",
+                           re.IGNORECASE)
+# A directive chunk that *is* an appendix: its title opens "Appendix C ...".
+_APPENDIX_HEAD = re.compile(r"Appendix\s+([A-Za-z])\b", re.IGNORECASE)
 _STEMMER = snowballstemmer.stemmer("english")
 _STEM_CACHE = {}
         self._build_bm25()
         self._build_note_tokens()
         self._build_xref()
+        self._build_appendix_index()
         self._load_semantic()
         self._load_reranker()
                 c.get("doc_type") == "agreement"
                 and not str(c["section"])[:1].isdigit())
+    def _build_appendix_index(self):
+        """Index directive appendices by (act_code, letter), so a directive
+        section that cites 'Appendix C' can pull that appendix into the result
+        set -- a directive's rate-table appendices are bare numbers and rank
+        poorly on a natural-language query, yet the section citing them is of
+        little use without them."""
+        self._appendix = defaultdict(list)
+        for idx, c in enumerate(self.chunks):
+            if c.get("doc_type") != "directive":
+                continue
+            m = _APPENDIX_HEAD.match(c["marginal_note"])
+            if m:
+                self._appendix[(c["act_code"], m.group(1).upper())].append(idx)
     def _load_semantic(self):
         """Load precomputed embeddings and the query embedder.
         return kept[:1] + promote + kept[1:] + drop + [
             i for i in rest if i not in promoted]
+    def _cosurface_appendices(self, top):
+        """Append the appendices the directive results cite but that retrieval
+        missed. A directive's rate tables ('Appendix C') rank poorly on a
+        natural-language query, yet a section that cites them is of little use
+        without them -- so the appendix travels with it. When more appendices
+        are cited than APPENDIX_CAP allows, the ones cited by the most results
+        win, so a lone off-topic result cannot outvote the relevant ones.
+        Returns `top` extended by up to APPENDIX_CAP appendix chunks."""
+        have = set(top)
+        cited = Counter()
+        for idx in top:
+            c = self.chunks[idx]
+            if c.get("doc_type") != "directive":
+                continue
+            seen = set()        # count an appendix once per citing result
+            for m in _APPENDIX_REF.finditer(c["text"]):
+                key = (c["act_code"], m.group(1).upper())
+                for app in self._appendix.get(key, ()):
+                    if app not in have and app not in seen:
+                        seen.add(app)
+                        cited[app] += 1
+        return top + [app for app, _ in cited.most_common(APPENDIX_CAP)]
     def _highlight(self, query, indices):
         """For each result chunk, the subsection or paragraph most on point for
         the query: {result_position: (citation_suffix, snippet)}. Uses the
         candidates = self._diversify(candidates)
         candidates = self._ensure_legislation(candidates, top_k)
+        top = self._cosurface_appendices(candidates[:top_k])
         highlights = self._highlight(query, top)
         results = []
         for pos, i in enumerate(top):
+            result = {**self.chunks[i], "score": round(scores.get(i, 0.0), 4),
                       "confidence": confidence}
             if pos in highlights:
                 result["highlight"] = highlights[pos]

tests/test_index.py CHANGED Viewed

@@ -10,7 +10,8 @@ or reranker are loaded.
 import unittest
 from canlex.index import (
-    LegislationIndex, SOURCE_CAP, tokenize, _section_refs, _provision_units,
 )
@@ -166,5 +167,62 @@ class DocTypeFlagTests(unittest.TestCase):
         self.assertEqual(self.idx._note_tokens[5], set(tokenize("Importing goods")))
 if __name__ == "__main__":
     unittest.main()

 import unittest
 from canlex.index import (
+    LegislationIndex, SOURCE_CAP, APPENDIX_CAP, tokenize, _section_refs,
+    _provision_units,
 )
         self.assertEqual(self.idx._note_tokens[5], set(tokenize("Importing goods")))
+class CosurfaceAppendixTests(unittest.TestCase):
+    """_cosurface_appendices pulls a directive appendix into the result set
+    when a directive result cites it but retrieval missed it."""
+    def _idx(self):
+        idx = bare_index([
+            chunk(doc_type="directive", act_code="d10", marginal_note="Meals",
+                  text="paid the meal allowance at the rates in Appendix C."),
+            chunk(doc_type="directive", act_code="d10",
+                  marginal_note="Appendix C - Allowances", text="rate tables"),
+            chunk(doc_type="directive", act_code="d10",
+                  marginal_note="Appendix B - Kilometric Rates", text="km rates"),
+        ])
+        idx._build_appendix_index()
+        return idx
+    def test_cited_appendix_is_pulled_in(self):
+        self.assertEqual(self._idx()._cosurface_appendices([0]), [0, 1])
+    def test_no_duplicate_when_already_present(self):
+        self.assertEqual(self._idx()._cosurface_appendices([0, 1]), [0, 1])
+    def test_uncited_appendix_is_left_out(self):
+        # result 0 cites only Appendix C, so Appendix B (index 2) stays out.
+        self.assertNotIn(2, self._idx()._cosurface_appendices([0]))
+    def test_cross_directive_citation_is_left_alone(self):
+        idx = bare_index([
+            chunk(doc_type="directive", act_code="d10", marginal_note="A section",
+                  text="see Appendix C of the NJC Travel Directive"),
+            chunk(doc_type="directive", act_code="d10",
+                  marginal_note="Appendix C - Allowances", text="tables"),
+        ])
+        idx._build_appendix_index()
+        self.assertEqual(idx._cosurface_appendices([0]), [0])
+    def test_cap_keeps_the_most_cited_appendix(self):
+        # Four appendices are cited; Appendix A by two sections, the rest once.
+        # With the cap exceeded, the twice-cited appendix must survive.
+        idx = bare_index([
+            chunk(doc_type="directive", act_code="d1", marginal_note="S1",
+                  text="see Appendix A"),
+            chunk(doc_type="directive", act_code="d1", marginal_note="S2",
+                  text="see Appendix A; see Appendix B"),
+            chunk(doc_type="directive", act_code="d1", marginal_note="S3",
+                  text="see Appendix C; see Appendix D"),
+            chunk(doc_type="directive", act_code="d1", marginal_note="Appendix A"),
+            chunk(doc_type="directive", act_code="d1", marginal_note="Appendix B"),
+            chunk(doc_type="directive", act_code="d1", marginal_note="Appendix C"),
+            chunk(doc_type="directive", act_code="d1", marginal_note="Appendix D"),
+        ])
+        idx._build_appendix_index()
+        out = idx._cosurface_appendices([0, 1, 2])
+        self.assertEqual(len(out), 3 + APPENDIX_CAP)   # cap respected
+        self.assertIn(3, out)                          # Appendix A survives
 if __name__ == "__main__":
     unittest.main()