Beemer Claude Opus 4.7 committed on
Commit
d33c8fb
·
1 Parent(s): 666cd44

Make the four retrieval-tuning knobs env-overridable

Browse files

MN_WEIGHT, MN_CAP, REG_PENALTY, BACKMATTER_PENALTY now read from
CANLEX_<NAME> env vars with the current literals as defaults, so a
tuning sweep can vary them without code edits. No behaviour change
unless an env var is set.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (1) hide show
  1. canlex/index.py +9 -4
canlex/index.py CHANGED
@@ -1,6 +1,7 @@
1
  """Hybrid retrieval (BM25 + semantic) with cross-encoder reranking."""
2
  import json
3
  import math
 
4
  import re
5
  import sys
6
  from collections import Counter, defaultdict
@@ -16,13 +17,17 @@ RRF_K = 60 # reciprocal-rank-fusion damping constant
16
  W_SEM = 2.0 # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
17
  CANDIDATES = 80 # hits each retriever contributes to the fusion
18
  RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
19
- MN_WEIGHT = 0.0024 # title-match boost per unit of idf-weighted overlap between
 
20
  # the query and a candidate's marginal note (section title)
21
- MN_CAP = 0.012 # ceiling on the title-match boost -- it nudges the ranking
 
22
  # without overriding a strong base score
23
- REG_PENALTY = 0.008 # small fusion penalty on regulation sections, so the Act
 
24
  # that creates a duty outranks the regulation elaborating it
25
- BACKMATTER_PENALTY = 0.008 # likewise for a collective agreement's back-matter
 
26
  # (memoranda, letters of understanding) vs its numbered articles
27
  SOURCE_CAP = 2 # max chunks one case or memorandum may contribute
28
  APPENDIX_CAP = 3 # max referenced appendices co-surfaced into a result set
 
1
  """Hybrid retrieval (BM25 + semantic) with cross-encoder reranking."""
2
  import json
3
  import math
4
+ import os
5
  import re
6
  import sys
7
  from collections import Counter, defaultdict
 
17
  W_SEM = 2.0 # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
18
  CANDIDATES = 80 # hits each retriever contributes to the fusion
19
  RERANK_POOL = 50 # top fused candidates the cross-encoder rescores
20
+ MN_WEIGHT = float(os.environ.get("CANLEX_MN_WEIGHT", "0.0024"))
21
+ # title-match boost per unit of idf-weighted overlap between
22
  # the query and a candidate's marginal note (section title)
23
+ MN_CAP = float(os.environ.get("CANLEX_MN_CAP", "0.012"))
24
+ # ceiling on the title-match boost -- it nudges the ranking
25
  # without overriding a strong base score
26
+ REG_PENALTY = float(os.environ.get("CANLEX_REG_PENALTY", "0.008"))
27
+ # small fusion penalty on regulation sections, so the Act
28
  # that creates a duty outranks the regulation elaborating it
29
+ BACKMATTER_PENALTY = float(os.environ.get("CANLEX_BACKMATTER_PENALTY", "0.008"))
30
+ # likewise for a collective agreement's back-matter
31
  # (memoranda, letters of understanding) vs its numbered articles
32
  SOURCE_CAP = 2 # max chunks one case or memorandum may contribute
33
  APPENDIX_CAP = 3 # max referenced appendices co-surfaced into a result set