Beemer Claude Opus 4.7 committed on
Commit ·
d33c8fb
1
Parent(s): 666cd44
Make the four retrieval-tuning knobs env-overridable
Browse files
MN_WEIGHT, MN_CAP, REG_PENALTY, BACKMATTER_PENALTY now read from
CANLEX_<NAME> env vars with the current literals as defaults, so a
tuning sweep can vary them without code edits. No behaviour change
unless an env var is set.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- canlex/index.py +9 -4
canlex/index.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
"""Hybrid retrieval (BM25 + semantic) with cross-encoder reranking."""
|
| 2 |
import json
|
| 3 |
import math
|
|
|
|
| 4 |
import re
|
| 5 |
import sys
|
| 6 |
from collections import Counter, defaultdict
|
|
@@ -16,13 +17,17 @@ RRF_K = 60 # reciprocal-rank-fusion damping constant
|
|
| 16 |
W_SEM = 2.0 # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
|
| 17 |
CANDIDATES = 80 # hits each retriever contributes to the fusion
|
| 18 |
RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
|
| 19 |
-
MN_WEIGHT = 0.0024
|
|
|
|
| 20 |
# the query and a candidate's marginal note (section title)
|
| 21 |
-
MN_CAP = 0.012
|
|
|
|
| 22 |
# without overriding a strong base score
|
| 23 |
-
REG_PENALTY = 0.008
|
|
|
|
| 24 |
# that creates a duty outranks the regulation elaborating it
|
| 25 |
-
BACKMATTER_PENALTY = 0.008
|
|
|
|
| 26 |
# (memoranda, letters of understanding) vs its numbered articles
|
| 27 |
SOURCE_CAP = 2 # max chunks one case or memorandum may contribute
|
| 28 |
APPENDIX_CAP = 3 # max referenced appendices co-surfaced into a result set
|
|
|
|
| 1 |
"""Hybrid retrieval (BM25 + semantic) with cross-encoder reranking."""
|
| 2 |
import json
|
| 3 |
import math
|
| 4 |
+
import os
|
| 5 |
import re
|
| 6 |
import sys
|
| 7 |
from collections import Counter, defaultdict
|
|
|
|
| 17 |
W_SEM = 2.0 # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
|
| 18 |
CANDIDATES = 80 # hits each retriever contributes to the fusion
|
| 19 |
RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
|
| 20 |
+
MN_WEIGHT = float(os.environ.get("CANLEX_MN_WEIGHT", "0.0024"))
|
| 21 |
+
# title-match boost per unit of idf-weighted overlap between
|
| 22 |
# the query and a candidate's marginal note (section title)
|
| 23 |
+
MN_CAP = float(os.environ.get("CANLEX_MN_CAP", "0.012"))
|
| 24 |
+
# ceiling on the title-match boost -- it nudges the ranking
|
| 25 |
# without overriding a strong base score
|
| 26 |
+
REG_PENALTY = float(os.environ.get("CANLEX_REG_PENALTY", "0.008"))
|
| 27 |
+
# small fusion penalty on regulation sections, so the Act
|
| 28 |
# that creates a duty outranks the regulation elaborating it
|
| 29 |
+
BACKMATTER_PENALTY = float(os.environ.get("CANLEX_BACKMATTER_PENALTY", "0.008"))
|
| 30 |
+
# likewise for a collective agreement's back-matter
|
| 31 |
# (memoranda, letters of understanding) vs its numbered articles
|
| 32 |
SOURCE_CAP = 2 # max chunks one case or memorandum may contribute
|
| 33 |
APPENDIX_CAP = 3 # max referenced appendices co-surfaced into a result set
|