"""Unit tests for the retrieval pipeline (canlex/index.py).

Fast, offline tests of the pure retrieval logic -- tokenisation,
section-reference parsing, the diversity cap, the result-set guarantee
and the doc-type flags. They build a bare LegislationIndex via __new__,
so no corpus, embeddings or reranker are loaded.

    python -m unittest discover -s tests
"""
import unittest

from canlex.index import (
    LegislationIndex,
    SOURCE_CAP,
    APPENDIX_CAP,
    tokenize,
    _section_refs,
    _provision_units,
)


def chunk(doc_type="legislation", act_code="I-2.5", section="1",
          marginal_note="Title", part="", **extra):
    """Return a minimal corpus chunk dict for use as an index fixture.

    The named parameters populate the matching keys; ``heading``,
    ``act_short`` and ``text`` get fixed placeholder values. Any further
    keyword arguments are merged in last, so they may add new keys or
    override the placeholders (e.g. ``text=...``).
    """
    fields = {
        "doc_type": doc_type,
        "act_code": act_code,
        "section": section,
        "marginal_note": marginal_note,
        "part": part,
        "heading": "",
        "act_short": "X",
        "text": "",
    }
    return {**fields, **extra}


def bare_index(chunks):
    """Return a LegislationIndex created via ``__new__`` with only ``.chunks`` set.

    ``__init__`` is deliberately bypassed, so the instance carries nothing
    but the chunk list handed in here.
    """
    index = LegislationIndex.__new__(LegislationIndex)
    index.chunks = chunks
    return index


class TokenizeTests(unittest.TestCase):
    """Checks on ``tokenize``."""

    def test_case_insensitive(self):
        shouting = tokenize("REPORT goods")
        mixed = tokenize("report Goods")
        self.assertEqual(shouting, mixed)

    def test_stemming_unifies_word_forms(self):
        # Stemming exists so that different forms of a word share one token.
        word_pairs = (
            ("reporting", "reported"),
            ("importation", "import"),
        )
        for left, right in word_pairs:
            self.assertEqual(tokenize(left), tokenize(right))

    def test_splits_on_non_alphanumeric(self):
        expected = ["s", "34", "1", "a"]
        self.assertEqual(tokenize("s.34(1)(a)"), expected)

    def test_empty(self):
        self.assertEqual(tokenize(""), [])


class SectionRefTests(unittest.TestCase):
    """Checks on ``_section_refs``."""

    def test_plain_section(self):
        query = "inadmissible under section 34"
        self.assertEqual(_section_refs(query), {"34"})

    def test_decimal_and_abbreviated(self):
        query = "see s. 20.1 and section 5"
        self.assertEqual(_section_refs(query), {"20.1", "5"})

    def test_no_reference(self):
        query = "what is a pre-removal risk assessment"
        self.assertEqual(_section_refs(query), set())


class ProvisionUnitsTests(unittest.TestCase):
    """Checks on ``_provision_units``."""

    def test_structured_provision_yields_units(self):
        provision = "(1) The chapeau.\n(a) first paragraph\n(b) second paragraph"
        units = _provision_units(provision)
        self.assertTrue(units)

    def test_flat_provision_yields_nothing(self):
        units = _provision_units("A flat provision with no markers.")
        self.assertEqual(units, [])


class SourceKeyTests(unittest.TestCase):
    """_source_key determines which results the diversity cap groups together."""

    def test_primary_instruments_are_never_capped(self):
        primaries = [
            chunk(doc_type="legislation"),
            chunk(doc_type="agreement", act_code="FB"),
            chunk(doc_type="directive", act_code="d1"),
        ]
        idx = bare_index(primaries)
        for position in range(len(primaries)):
            self.assertIsNone(idx._source_key(position))

    def test_caselaw_and_memoranda_are_keyed(self):
        memo = chunk(doc_type="memorandum", act_code="D-Memo",
                     section="D1-1-1")
        decision = chunk(doc_type="caselaw", act_code="2019 SCC 65")
        idx = bare_index([memo, decision])
        self.assertEqual(idx._source_key(0), ("memorandum", "D1-1-1"))
        self.assertEqual(idx._source_key(1), ("caselaw", "2019 SCC 65"))


class DiversifyTests(unittest.TestCase):
    """Checks on ``_diversify`` and the SOURCE_CAP limit."""

    def test_caps_caselaw_per_decision(self):
        n_caselaw = SOURCE_CAP + 2
        corpus = [
            chunk(doc_type="caselaw", act_code="2019 SCC 65")
            for _ in range(n_caselaw)
        ]
        corpus.append(chunk(doc_type="legislation"))
        legislation_pos = n_caselaw  # the legislation chunk sits last
        idx = bare_index(corpus)

        ranked = idx._diversify(list(range(n_caselaw + 1)))
        split_at = SOURCE_CAP + 1
        kept = ranked[:split_at]
        deferred = ranked[split_at:]

        # Legislation is never capped, so it must be in the leading slice.
        self.assertIn(legislation_pos, kept)
        caselaw_kept = [
            i for i in kept if idx.chunks[i]["doc_type"] == "caselaw"
        ]
        self.assertEqual(len(caselaw_kept), SOURCE_CAP)
        self.assertEqual(len(deferred), n_caselaw - SOURCE_CAP)

    def test_does_not_cap_agreements(self):
        n_articles = SOURCE_CAP + 3
        articles = [
            chunk(doc_type="agreement", act_code="FB", section=str(number))
            for number in range(n_articles)
        ]
        idx = bare_index(articles)
        original_order = list(range(n_articles))
        ranked = idx._diversify(list(range(n_articles)))
        # Agreements are uncapped, so the input order comes back unchanged.
        self.assertEqual(ranked, original_order)


class EnsurePrimaryTests(unittest.TestCase):
    """Checks on ``_ensure_primary``."""

    def test_pulls_primary_into_a_caselaw_dominated_top_k(self):
        corpus = [
            chunk(doc_type="caselaw", act_code="A"),
            chunk(doc_type="caselaw", act_code="B"),
            chunk(doc_type="caselaw", act_code="C"),
            chunk(doc_type="legislation"),
            chunk(doc_type="legislation"),
        ]
        idx = bare_index(corpus)
        ranked = idx._ensure_primary([0, 1, 2, 3, 4], top_k=3, q_tokens=set())
        leading = ranked[:3]
        legislation_hits = [
            i for i in leading if idx.chunks[i]["doc_type"] == "legislation"
        ]
        self.assertGreaterEqual(len(legislation_hits), 2)
        # The top-ranked hit must stay in first place.
        self.assertEqual(ranked[0], 0)

    def test_no_op_when_primary_already_present(self):
        corpus = [
            chunk(doc_type="legislation"),
            chunk(doc_type="legislation"),
            chunk(doc_type="caselaw", act_code="A"),
        ]
        idx = bare_index(corpus)
        ranked = idx._ensure_primary([0, 1, 2], top_k=3, q_tokens=set())
        self.assertEqual(ranked, [0, 1, 2])

    def test_counts_agreements_as_primary(self):
        # When only case-law lands in top_k for an agreement query, the
        # agreement articles should be pulled in -- not just legislation.
        corpus = [
            chunk(doc_type="caselaw", act_code="A"),
            chunk(doc_type="caselaw", act_code="B"),
            chunk(doc_type="caselaw", act_code="C"),
            chunk(doc_type="agreement", act_code="FB", section="17",
                  marginal_note="discipline"),
            chunk(doc_type="agreement", act_code="FB", section="25",
                  marginal_note="hours of work"),
        ]
        idx = bare_index(corpus)
        ranked = idx._ensure_primary([0, 1, 2, 3, 4], top_k=3, q_tokens=set())
        agreement_hits = sum(
            1 for i in ranked[:3] if idx.chunks[i]["doc_type"] == "agreement"
        )
        self.assertGreaterEqual(agreement_hits, 2)


class DocTypeFlagTests(unittest.TestCase):
    """_build_note_tokens also flags regulations and agreement back-matter."""

    def setUp(self):
        corpus = [
            chunk(doc_type="legislation", act_code="I-2.5"),
            chunk(doc_type="legislation", act_code="SOR-2002-227"),
            chunk(doc_type="legislation", act_code="C.R.C.,_c._1041"),
            chunk(doc_type="agreement", act_code="FB", section="17"),
            chunk(doc_type="agreement", act_code="FB", section=""),
            chunk(doc_type="memorandum", act_code="D-Memo", section="D1-1-1",
                  marginal_note="Guidelines", part="Importing goods"),
        ]
        self.idx = bare_index(corpus)
        self.idx._build_note_tokens()

    def test_regulation_flag(self):
        # Only the chunks at positions 1 and 2 are expected to be flagged.
        expected = [False, True, True, False, False, False]
        self.assertEqual(self.idx._is_regulation, expected)

    def test_agreement_backmatter_flag(self):
        # Only the agreement chunk with an empty section (position 4).
        expected = [False, False, False, False, True, False]
        self.assertEqual(self.idx._is_backmatter, expected)

    def test_memorandum_title_tokens_come_from_part(self):
        # The memo's marginal note is generic; its title is the 'part' field.
        title_tokens = set(tokenize("Importing goods"))
        self.assertEqual(self.idx._note_tokens[5], title_tokens)


class CosurfaceAppendixTests(unittest.TestCase):
    """_cosurface_appendices adds a directive appendix to the result set
    when a directive result cites it but retrieval missed it."""

    def _idx(self):
        corpus = [
            chunk(doc_type="directive", act_code="d10",
                  marginal_note="Meals",
                  text="paid the meal allowance at the rates in Appendix C."),
            chunk(doc_type="directive", act_code="d10",
                  marginal_note="Appendix C - Allowances",
                  text="rate tables"),
            chunk(doc_type="directive", act_code="d10",
                  marginal_note="Appendix B - Kilometric Rates",
                  text="km rates"),
        ]
        index = bare_index(corpus)
        index._build_appendix_index()
        return index

    def test_cited_appendix_is_pulled_in(self):
        results = self._idx()._cosurface_appendices([0])
        self.assertEqual(results, [0, 1])

    def test_no_duplicate_when_already_present(self):
        results = self._idx()._cosurface_appendices([0, 1])
        self.assertEqual(results, [0, 1])

    def test_uncited_appendix_is_left_out(self):
        # Result 0 cites only Appendix C, so Appendix B (index 2) stays out.
        results = self._idx()._cosurface_appendices([0])
        self.assertNotIn(2, results)

    def test_cross_directive_citation_is_left_alone(self):
        corpus = [
            chunk(doc_type="directive", act_code="d10",
                  marginal_note="A section",
                  text="see Appendix C of the NJC Travel Directive"),
            chunk(doc_type="directive", act_code="d10",
                  marginal_note="Appendix C - Allowances",
                  text="tables"),
        ]
        idx = bare_index(corpus)
        idx._build_appendix_index()
        self.assertEqual(idx._cosurface_appendices([0]), [0])

    def test_cap_keeps_the_most_cited_appendix(self):
        # Four appendices are cited: Appendix A by two sections, the others
        # by one each. Once the cap is exceeded, the twice-cited appendix
        # must be among the survivors.
        citing_sections = [
            chunk(doc_type="directive", act_code="d1", marginal_note="S1",
                  text="see Appendix A"),
            chunk(doc_type="directive", act_code="d1", marginal_note="S2",
                  text="see Appendix A; see Appendix B"),
            chunk(doc_type="directive", act_code="d1", marginal_note="S3",
                  text="see Appendix C; see Appendix D"),
        ]
        appendices = [
            chunk(doc_type="directive", act_code="d1",
                  marginal_note="Appendix A"),
            chunk(doc_type="directive", act_code="d1",
                  marginal_note="Appendix B"),
            chunk(doc_type="directive", act_code="d1",
                  marginal_note="Appendix C"),
            chunk(doc_type="directive", act_code="d1",
                  marginal_note="Appendix D"),
        ]
        idx = bare_index(citing_sections + appendices)
        idx._build_appendix_index()
        results = idx._cosurface_appendices([0, 1, 2])
        self.assertEqual(len(results), 3 + APPENDIX_CAP)  # cap respected
        self.assertIn(3, results)  # Appendix A (index 3) survives


if __name__ == "__main__":
    unittest.main()