CanLex / canlex /synonyms.py
Beemer
Add output-quality features: highlighting, hedging, linking, currency
1e58371
"""Query-side expansion of legal abbreviations and informal terms.
Statutes use formal wording -- "application for protection", "removal order" --
but users (and a model drafting a search) reach for everyday shorthand: "PRRA",
"H&C", "deportation". Before retrieval, expand_query() appends the canonical
statutory terms for any abbreviation or nickname it recognises, so the BM25 and
semantic stages can match the provision's actual language. It only ever ADDS
words -- the user's own phrasing is left untouched -- and the cross-encoder
reranker still sees the original query, so precision is unaffected.

Run the module from the command line to see an expansion:

    python -m canlex.synonyms "PRRA eligibility and an H&C application"
"""
import re
import sys
# (trigger, canonical terms to append). The trigger is a regex fragment matched
# case-insensitively as a whole word. Keep this list high-precision: an entry
# earns its place only when the shorthand is unambiguous in Canadian border,
# immigration, customs, financial-crime or labour law.
_SYNONYMS = [
# Immigration and refugee law
(r"prra", "pre-removal risk assessment application for protection"),
(r"pre[- ]removal risk assessment", "application for protection"),
(r"h\s*&\s*c", "humanitarian and compassionate"),
(r"rad", "refugee appeal division"),
(r"rpd", "refugee protection division"),
(r"iad", "immigration appeal division"),
(r"irb", "immigration and refugee board"),
(r"trp", "temporary resident permit"),
(r"deportation", "removal order"),
(r"misrep", "misrepresentation"),
(r"ircc", "immigration refugees and citizenship canada"),
# Border and customs
(r"cbsa", "canada border services agency"),
(r"bsos?", "border services officer"),
(r"amps", "administrative monetary penalty system"),
# Financial-crime and labour
(r"fintrac", "financial transactions and reports analysis centre"),
(r"njc", "national joint council"),
]
_COMPILED = [(re.compile(rf"\b{trigger}\b", re.IGNORECASE), expansion)
for trigger, expansion in _SYNONYMS]
def expand_query(query):
    """Append canonical statutory wording for each shorthand found in `query`.

    Every compiled synonym pattern is searched against `query`. The expansions
    of the patterns that hit are joined with single spaces, in table order,
    and added after the original text, separated from it by one space. When
    nothing matches, `query` is returned as given.
    """
    matched = []
    for regex, canonical in _COMPILED:
        if regex.search(query):
            matched.append(canonical)
    if matched:
        # The original wording always comes first; expansions only follow it.
        return query + " " + " ".join(matched)
    return query
def main():
    """Print a query and its expanded form.

    The query is the command-line arguments joined by spaces; when that is
    empty, a built-in sample query is used instead.
    """
    fallback = "PRRA eligibility and an H&C application"
    query = " ".join(sys.argv[1:])
    if not query:
        query = fallback
    print(f"query: {query}")
    print(f"expanded: {expand_query(query)}")


if __name__ == "__main__":
    main()