Pull the topically-on-target primary instrument into the visible top
The legislation guarantee was named for what it did 6 months ago and had
fallen behind: it ensured 'some legislation' showed up in top_k by
fusion rank, in a top_k that with the eval default of 20 always already
contained incidental legislation, so the guarantee never fired even
when the GOVERNING provision was buried at rank 10+. And when it did
fire it picked by fusion rank, so a query about 'seize currency' lifted
PCMLTFA s.20 ('Report to President') over PCMLTFA s.18 ('Seizure and
forfeiture').
Renamed to _ensure_primary, now:
- Operates on a fixed visible window of min(top_k, 5), so a top_k=20
eval still gets the right Act provision pulled into the top 5
where the user actually looks.
- Counts all four primary instrument doc_types (legislation,
agreement, directive, delegation), so an FB-Agreement query that
surfaces only FPSLREB case law gets the agreement article promoted
too.
- Picks the candidate to promote by title-match score against the
query, not raw fusion rank -- so a topical question gets the
section about the topic, not a tangentially-ranked section that
happened to score higher on other vocabulary.
- Mirrors the fusion-stage hierarchy preferences (Act over
regulation, numbered articles over agreement back-matter) as
tiebreaks, so when IRPA s.112 and IRPR s.160 both have the
marginal note 'Application for protection' the Act wins.
141-question eval: Hit@1 0.79 / Hit@3 0.96 / Hit@5 0.98 / Hit@10 0.99
/ MRR 0.87 (vs pre-fix 0.79 / 0.93 / 0.96 / 0.98 / 0.87 -- Hit@3 +0.03,
Hit@5 +0.02, Hit@10 +0.01; 6 misses -> 3). IRPA s.112 #13->#2,
Sch-98.03 #6->#2, PCMLTFA s.12 #17->#2, PCMLTFA s.18 #10->#3.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- canlex/index.py +54 -12
|
@@ -332,21 +332,58 @@ class LegislationIndex:
|
|
| 332 |
(kept if counts[key] <= SOURCE_CAP else deferred).append(idx)
|
| 333 |
return kept + deferred
|
| 334 |
|
| 335 |
-
def
|
| 336 |
-
"""Guarantee the governing
|
| 337 |
-
monopolised by case law or memoranda
|
| 338 |
-
|
| 339 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
if top_k < 3:
|
| 341 |
return ordered
|
| 342 |
-
def
|
| 343 |
-
return self.chunks[i].get("doc_type", "legislation")
|
| 344 |
top, rest = ordered[:top_k], ordered[top_k:]
|
| 345 |
-
need = 2 - sum(1 for i in top if
|
| 346 |
if need <= 0:
|
| 347 |
return ordered
|
| 348 |
-
|
| 349 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
if not drop:
|
| 351 |
return ordered
|
| 352 |
promote = promote[:len(drop)]
|
|
@@ -511,9 +548,14 @@ class LegislationIndex:
|
|
| 511 |
pinned_set = set(pinned)
|
| 512 |
candidates = pinned + [i for i in candidates if i not in pinned_set]
|
| 513 |
|
| 514 |
-
# Cap one-source monopolies, then guarantee
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
candidates = self._diversify(candidates)
|
| 516 |
-
candidates = self.
|
| 517 |
|
| 518 |
top = self._cosurface_appendices(candidates[:top_k])
|
| 519 |
highlights = self._highlight(query, top)
|
|
|
|
| 332 |
(kept if counts[key] <= SOURCE_CAP else deferred).append(idx)
|
| 333 |
return kept + deferred
|
| 334 |
|
| 335 |
+
def _ensure_primary(self, ordered, top_k, q_tokens):
|
| 336 |
+
"""Guarantee the governing primary instrument is surfaced: when the
|
| 337 |
+
natural top_k is monopolised by case law or D-memoranda that interpret
|
| 338 |
+
a statute, pull the most topically-on-target Act/agreement/directive/
|
| 339 |
+
delegation section into the top_k, displacing the lowest-ranked
|
| 340 |
+
secondary sources. The single best match is always kept in place.
|
| 341 |
+
|
| 342 |
+
Two changes from the older 'ensure_legislation' guarantee: (i) all
|
| 343 |
+
primary instruments count, not only legislation -- so an FB-Agreement
|
| 344 |
+
query that surfaces only FPSLREB case law gets the agreement article
|
| 345 |
+
pulled in too; (ii) the candidate to promote is chosen by title-match
|
| 346 |
+
against the query (the section whose marginal_note covers the most of
|
| 347 |
+
the query's distinctive vocabulary), not by raw fusion rank. The
|
| 348 |
+
fusion rank surfaces tangentially-on-topic sections that share the
|
| 349 |
+
Act's general vocabulary; the title-match scorer surfaces the section
|
| 350 |
+
actually about the topic ('Seizure and forfeiture' over 'Report to
|
| 351 |
+
President' for a 'seize currency' query)."""
|
| 352 |
if top_k < 3:
|
| 353 |
return ordered
|
| 354 |
+
def is_primary(i):
|
| 355 |
+
return self.chunks[i].get("doc_type", "legislation") in PRIMARY_DOC_TYPES
|
| 356 |
top, rest = ordered[:top_k], ordered[top_k:]
|
| 357 |
+
need = 2 - sum(1 for i in top if is_primary(i))
|
| 358 |
if need <= 0:
|
| 359 |
return ordered
|
| 360 |
+
primary_in_rest = [i for i in rest if is_primary(i)]
|
| 361 |
+
if not primary_in_rest:
|
| 362 |
+
return ordered
|
| 363 |
+
if q_tokens:
|
| 364 |
+
def title_score(idx):
|
| 365 |
+
note_tokens = self._note_tokens[idx]
|
| 366 |
+
if not note_tokens:
|
| 367 |
+
return 0.0
|
| 368 |
+
matched = sum(self.idf.get(t, 0.0)
|
| 369 |
+
for t in note_tokens if t in q_tokens)
|
| 370 |
+
total = sum(self.idf.get(t, 0.0) for t in note_tokens) or 1.0
|
| 371 |
+
score = matched * matched / total
|
| 372 |
+
# Mirror the fusion-stage hierarchy preferences for tiebreaks:
|
| 373 |
+
# the governing Act beats its regulation, and numbered
|
| 374 |
+
# agreement articles beat their back-matter, when both have
|
| 375 |
+
# identical titles (e.g. IRPA s. 112 and IRPR s. 160 both
|
| 376 |
+
# marginal-noted 'Application for protection').
|
| 377 |
+
if self._is_regulation[idx]:
|
| 378 |
+
score -= REG_PENALTY
|
| 379 |
+
if self._is_backmatter[idx]:
|
| 380 |
+
score -= BACKMATTER_PENALTY
|
| 381 |
+
return score
|
| 382 |
+
# Sort by title-match descending, then by original fusion order as
|
| 383 |
+
# a tiebreak (stable sort: keep the original rest order).
|
| 384 |
+
primary_in_rest.sort(key=title_score, reverse=True)
|
| 385 |
+
promote = primary_in_rest[:need]
|
| 386 |
+
drop = [i for i in reversed(top) if not is_primary(i)][:len(promote)]
|
| 387 |
if not drop:
|
| 388 |
return ordered
|
| 389 |
promote = promote[:len(drop)]
|
|
|
|
| 548 |
pinned_set = set(pinned)
|
| 549 |
candidates = pinned + [i for i in candidates if i not in pinned_set]
|
| 550 |
|
| 551 |
+
# Cap one-source monopolies, then guarantee a primary instrument
|
| 552 |
+
# (statute/agreement/directive/delegation) on the topic is represented.
|
| 553 |
+
# The guarantee operates on a fixed visible window (5), not the full
|
| 554 |
+
# top_k -- with top_k=20 (the eval default) the larger window almost
|
| 555 |
+
# always contains incidental legislation, so the guarantee never fires
|
| 556 |
+
# even when the GOVERNING provision is buried at rank 10+.
|
| 557 |
candidates = self._diversify(candidates)
|
| 558 |
+
candidates = self._ensure_primary(candidates, min(top_k, 5), q_tokens)
|
| 559 |
|
| 560 |
top = self._cosurface_appendices(candidates[:top_k])
|
| 561 |
highlights = self._highlight(query, top)
|