Spaces:

Beemer0
/

CanLex

Running

Beemer Claude Opus 4.7 committed 4 days ago

Commit

d33c8fb

1 Parent(s): 666cd44

Make the four retrieval-tuning knobs env-overridable

MN_WEIGHT, MN_CAP, REG_PENALTY, BACKMATTER_PENALTY now read from
CANLEX_<NAME> env vars with the current literals as defaults, so a
tuning sweep can vary them without code edits. No behaviour change
unless an env var is set.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (1) hide show

canlex/index.py +9 -4

canlex/index.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """Hybrid retrieval (BM25 + semantic) with cross-encoder reranking."""
 import json
 import math
 import re
 import sys
 from collections import Counter, defaultdict
@@ -16,13 +17,17 @@ RRF_K = 60          # reciprocal-rank-fusion damping constant
 W_SEM = 2.0         # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
 CANDIDATES = 80     # hits each retriever contributes to the fusion
 RERANK_POOL = 50    # top fused candidates the cross-encoder rescores
-MN_WEIGHT = 0.0024  # title-match boost per unit of idf-weighted overlap between
                     # the query and a candidate's marginal note (section title)
-MN_CAP = 0.012      # ceiling on the title-match boost -- it nudges the ranking
                     # without overriding a strong base score
-REG_PENALTY = 0.008 # small fusion penalty on regulation sections, so the Act
                     # that creates a duty outranks the regulation elaborating it
-BACKMATTER_PENALTY = 0.008  # likewise for a collective agreement's back-matter
                     # (memoranda, letters of understanding) vs its numbered articles
 SOURCE_CAP = 2      # max chunks one case or memorandum may contribute
 APPENDIX_CAP = 3    # max referenced appendices co-surfaced into a result set

 """Hybrid retrieval (BM25 + semantic) with cross-encoder reranking."""
 import json
 import math
+import os
 import re
 import sys
 from collections import Counter, defaultdict
 W_SEM = 2.0         # weight on the semantic retriever in the fusion (1.0 = equal; eval-tuned)
 CANDIDATES = 80     # hits each retriever contributes to the fusion
 RERANK_POOL = 50    # top fused candidates the cross-encoder rescores
+MN_WEIGHT = float(os.environ.get("CANLEX_MN_WEIGHT", "0.0024"))
+                    # title-match boost per unit of idf-weighted overlap between
                     # the query and a candidate's marginal note (section title)
+MN_CAP = float(os.environ.get("CANLEX_MN_CAP", "0.012"))
+                    # ceiling on the title-match boost -- it nudges the ranking
                     # without overriding a strong base score
+REG_PENALTY = float(os.environ.get("CANLEX_REG_PENALTY", "0.008"))
+                    # small fusion penalty on regulation sections, so the Act
                     # that creates a duty outranks the regulation elaborating it
+BACKMATTER_PENALTY = float(os.environ.get("CANLEX_BACKMATTER_PENALTY", "0.008"))
+                    # likewise for a collective agreement's back-matter
                     # (memoranda, letters of understanding) vs its numbered articles
 SOURCE_CAP = 2      # max chunks one case or memorandum may contribute
 APPENDIX_CAP = 3    # max referenced appendices co-surfaced into a result set