"""Unit tests for the retrieval pipeline (canlex/index.py).

Fast, offline tests of the pure retrieval logic -- tokenisation, section-
reference parsing, the diversity cap, the result-set guarantee and the doc-type
flags. They build a bare LegislationIndex via __new__, so no corpus, embeddings
or reranker are loaded.

Run with:

    python -m unittest discover -s tests
"""
import unittest
from canlex.index import (
LegislationIndex, SOURCE_CAP, APPENDIX_CAP, tokenize, _section_refs,
_provision_units,
)
def chunk(doc_type="legislation", act_code="I-2.5", section="1",
          marginal_note="Title", part="", **extra):
    """Build a minimal corpus chunk dict for the index logic under test.

    The named parameters fill the keys of the same name; ``heading``,
    ``act_short`` and ``text`` receive fixed placeholder values. Keywords in
    ``extra`` are applied last, so they may add new keys or override any of
    the defaults (for example ``text=...``).
    """
    return {
        "doc_type": doc_type,
        "act_code": act_code,
        "section": section,
        "marginal_note": marginal_note,
        "part": part,
        "heading": "",
        "act_short": "X",
        "text": "",
        # Unpacked last: overrides keep their original position, new keys
        # are appended -- the same outcome as dict.update().
        **extra,
    }
def bare_index(chunks):
    """Return a LegislationIndex that has nothing but ``chunks`` assigned.

    The instance is allocated through ``__new__``, so ``__init__`` is never
    run; only the ``chunks`` attribute is set, which is all the pure methods
    exercised by these tests need up front.
    """
    index = LegislationIndex.__new__(LegislationIndex)
    index.chunks = chunks
    return index
class TokenizeTests(unittest.TestCase):
    """Checks on ``tokenize``: casing, word forms, splitting, empty input."""

    def test_case_insensitive(self):
        shouted = tokenize("REPORT goods")
        mixed = tokenize("report Goods")
        self.assertEqual(shouted, mixed)

    def test_stemming_unifies_word_forms(self):
        # Stemming is there so that different forms of a word become the
        # same token; each pair below must tokenise identically.
        pairs = (
            ("reporting", "reported"),
            ("importation", "import"),
        )
        for left, right in pairs:
            self.assertEqual(tokenize(left), tokenize(right))

    def test_splits_on_non_alphanumeric(self):
        expected = ["s", "34", "1", "a"]
        self.assertEqual(tokenize("s.34(1)(a)"), expected)

    def test_empty(self):
        tokens = tokenize("")
        self.assertEqual(tokens, [])
class SectionRefTests(unittest.TestCase):
    """Checks that ``_section_refs`` yields the set of cited section numbers."""

    def test_plain_section(self):
        refs = _section_refs("inadmissible under section 34")
        self.assertEqual(refs, {"34"})

    def test_decimal_and_abbreviated(self):
        # Both the abbreviated "s." form with a decimal number and the
        # spelled-out "section" form are expected to be picked up.
        refs = _section_refs("see s. 20.1 and section 5")
        self.assertEqual(refs, {"20.1", "5"})

    def test_no_reference(self):
        refs = _section_refs("what is a pre-removal risk assessment")
        self.assertEqual(refs, set())
class ProvisionUnitsTests(unittest.TestCase):
    """Checks on ``_provision_units`` for structured versus flat text."""

    def test_structured_provision_yields_units(self):
        # A subsection marker followed by two lettered paragraphs.
        lines = [
            "(1) The chapeau.",
            "(a) first paragraph",
            "(b) second paragraph",
        ]
        units = _provision_units("\n".join(lines))
        self.assertTrue(units)

    def test_flat_provision_yields_nothing(self):
        units = _provision_units("A flat provision with no markers.")
        self.assertEqual(units, [])
class SourceKeyTests(unittest.TestCase):
    """``_source_key`` determines which results the diversity cap groups."""

    def test_primary_instruments_are_never_capped(self):
        primary = [
            chunk(doc_type="legislation"),
            chunk(doc_type="agreement", act_code="FB"),
            chunk(doc_type="directive", act_code="d1"),
        ]
        idx = bare_index(primary)
        # A key of None is what these three doc types must produce.
        for position, _ in enumerate(primary):
            self.assertIsNone(idx._source_key(position))

    def test_caselaw_and_memoranda_are_keyed(self):
        memo = chunk(doc_type="memorandum", act_code="D-Memo",
                     section="D1-1-1")
        decision = chunk(doc_type="caselaw", act_code="2019 SCC 65")
        idx = bare_index([memo, decision])
        # The memorandum is keyed on its section, the decision on its
        # act_code.
        expected_keys = [
            ("memorandum", "D1-1-1"),
            ("caselaw", "2019 SCC 65"),
        ]
        for position, key in enumerate(expected_keys):
            self.assertEqual(idx._source_key(position), key)
class DiversifyTests(unittest.TestCase):
    """Checks on ``_diversify``, which applies the per-source cap."""

    def test_caps_caselaw_per_decision(self):
        # Two more chunks of one decision than the cap allows, followed by
        # a single legislation chunk at position n.
        n = SOURCE_CAP + 2
        same_decision = [chunk(doc_type="caselaw", act_code="2019 SCC 65")
                         for _ in range(n)]
        idx = bare_index(same_decision + [chunk(doc_type="legislation")])

        ranked = idx._diversify(list(range(n + 1)))

        # The leading slice is sized for the capped case law plus the one
        # legislation chunk; whatever follows was deferred.
        split = SOURCE_CAP + 1
        kept = ranked[:split]
        deferred = ranked[split:]

        self.assertIn(n, kept)  # legislation never capped
        caselaw_kept = [i for i in kept
                        if idx.chunks[i]["doc_type"] == "caselaw"]
        self.assertEqual(len(caselaw_kept), SOURCE_CAP)
        self.assertEqual(len(deferred), n - SOURCE_CAP)

    def test_does_not_cap_agreements(self):
        n = SOURCE_CAP + 3
        articles = [chunk(doc_type="agreement", act_code="FB", section=str(i))
                    for i in range(n)]
        idx = bare_index(articles)
        expected = list(range(n))
        # A separate list is passed in so the comparison stays meaningful
        # even if the method were to modify its argument.
        ranked = idx._diversify(list(range(n)))
        self.assertEqual(ranked, expected)  # uncapped: order intact
class EnsurePrimaryTests(unittest.TestCase):
    """Checks on ``_ensure_primary`` and the make-up of the top_k results."""

    def test_pulls_primary_into_a_caselaw_dominated_top_k(self):
        decisions = [chunk(doc_type="caselaw", act_code=code)
                     for code in ("A", "B", "C")]
        statutes = [chunk(doc_type="legislation") for _ in range(2)]
        idx = bare_index(decisions + statutes)

        ranked = idx._ensure_primary([0, 1, 2, 3, 4], top_k=3,
                                     q_tokens=set())

        head_types = [idx.chunks[i]["doc_type"] for i in ranked[:3]]
        self.assertGreaterEqual(head_types.count("legislation"), 2)
        self.assertEqual(ranked[0], 0)  # the #1 hit is preserved

    def test_no_op_when_primary_already_present(self):
        idx = bare_index([
            chunk(doc_type="legislation"),
            chunk(doc_type="legislation"),
            chunk(doc_type="caselaw", act_code="A"),
        ])
        ranked = idx._ensure_primary([0, 1, 2], top_k=3, q_tokens=set())
        self.assertEqual(ranked, [0, 1, 2])

    def test_counts_agreements_as_primary(self):
        # When only case law occupies the top_k of an agreement query, the
        # agreement articles must be pulled in -- agreements count as
        # primary, not just legislation.
        decisions = [chunk(doc_type="caselaw", act_code=code)
                     for code in ("A", "B", "C")]
        articles = [
            chunk(doc_type="agreement", act_code="FB", section="17",
                  marginal_note="discipline"),
            chunk(doc_type="agreement", act_code="FB", section="25",
                  marginal_note="hours of work"),
        ]
        idx = bare_index(decisions + articles)

        ranked = idx._ensure_primary([0, 1, 2, 3, 4], top_k=3,
                                     q_tokens=set())

        agreements_in_head = [i for i in ranked[:3]
                              if idx.chunks[i]["doc_type"] == "agreement"]
        self.assertGreaterEqual(len(agreements_in_head), 2)
class DocTypeFlagTests(unittest.TestCase):
    """Per-chunk flags and title tokens produced by ``_build_note_tokens``."""

    def setUp(self):
        fixtures = [
            # 0: legislation with act code I-2.5 -- expected unflagged
            chunk(doc_type="legislation", act_code="I-2.5"),
            # 1, 2: legislation whose act codes are expected to be flagged
            # as regulations
            chunk(doc_type="legislation", act_code="SOR-2002-227"),
            chunk(doc_type="legislation", act_code="C.R.C.,_c._1041"),
            # 3: agreement article with a section number
            chunk(doc_type="agreement", act_code="FB", section="17"),
            # 4: agreement chunk with an empty section -- expected to be
            # flagged as back-matter
            chunk(doc_type="agreement", act_code="FB", section=""),
            # 5: memorandum with a generic marginal note and a titled part
            chunk(doc_type="memorandum", act_code="D-Memo", section="D1-1-1",
                  marginal_note="Guidelines", part="Importing goods"),
        ]
        self.idx = bare_index(fixtures)
        self.idx._build_note_tokens()

    def test_regulation_flag(self):
        expected = [False, True, True, False, False, False]
        self.assertEqual(self.idx._is_regulation, expected)

    def test_agreement_backmatter_flag(self):
        expected = [False, False, False, False, True, False]
        self.assertEqual(self.idx._is_backmatter, expected)

    def test_memorandum_title_tokens_come_from_part(self):
        # For a memo the marginal note is generic, so the title tokens are
        # taken from the 'part' field instead.
        title_tokens = set(tokenize("Importing goods"))
        self.assertEqual(self.idx._note_tokens[5], title_tokens)
class CosurfaceAppendixTests(unittest.TestCase):
    """Checks on ``_cosurface_appendices``.

    The method is expected to add a directive appendix to the result set when
    a directive result cites it but retrieval did not return it.
    """

    def _idx(self):
        """Index of one d10 section citing Appendix C, plus Appendices C and B."""
        citing_section = chunk(
            doc_type="directive", act_code="d10", marginal_note="Meals",
            text="paid the meal allowance at the rates in Appendix C.")
        appendix_c = chunk(
            doc_type="directive", act_code="d10",
            marginal_note="Appendix C - Allowances", text="rate tables")
        appendix_b = chunk(
            doc_type="directive", act_code="d10",
            marginal_note="Appendix B - Kilometric Rates", text="km rates")
        index = bare_index([citing_section, appendix_c, appendix_b])
        index._build_appendix_index()
        return index

    def test_cited_appendix_is_pulled_in(self):
        result = self._idx()._cosurface_appendices([0])
        self.assertEqual(result, [0, 1])

    def test_no_duplicate_when_already_present(self):
        result = self._idx()._cosurface_appendices([0, 1])
        self.assertEqual(result, [0, 1])

    def test_uncited_appendix_is_left_out(self):
        # Only Appendix C is cited by result 0, so Appendix B (index 2)
        # must not appear.
        result = self._idx()._cosurface_appendices([0])
        self.assertNotIn(2, result)

    def test_cross_directive_citation_is_left_alone(self):
        # The citation names Appendix C "of the NJC Travel Directive", so
        # the citing directive's own Appendix C must not be pulled in.
        citing_section = chunk(
            doc_type="directive", act_code="d10", marginal_note="A section",
            text="see Appendix C of the NJC Travel Directive")
        own_appendix = chunk(
            doc_type="directive", act_code="d10",
            marginal_note="Appendix C - Allowances", text="tables")
        idx = bare_index([citing_section, own_appendix])
        idx._build_appendix_index()
        self.assertEqual(idx._cosurface_appendices([0]), [0])

    def test_cap_keeps_the_most_cited_appendix(self):
        # Four appendices are cited: Appendix A by two sections, each of
        # the others by one. Once the cap is exceeded, the appendix cited
        # twice has to be among the survivors.
        citing = [
            ("S1", "see Appendix A"),
            ("S2", "see Appendix A; see Appendix B"),
            ("S3", "see Appendix C; see Appendix D"),
        ]
        sections = [chunk(doc_type="directive", act_code="d1",
                          marginal_note=note, text=text)
                    for note, text in citing]
        appendices = [chunk(doc_type="directive", act_code="d1",
                            marginal_note=f"Appendix {letter}")
                      for letter in "ABCD"]
        idx = bare_index(sections + appendices)
        idx._build_appendix_index()

        result = idx._cosurface_appendices([0, 1, 2])

        self.assertEqual(len(result), 3 + APPENDIX_CAP)  # cap respected
        self.assertIn(3, result)  # Appendix A survives
# Run the whole suite when this file is executed directly as a script;
# importing the module (e.g. through test discovery) does not trigger it.
if __name__ == "__main__":
    unittest.main()