"""Configuration constants for SinCode v3.

Key difference from v2: no rule engine, no dictionary.
Candidate generation is fully handled by the ByT5 seq2seq model.
"""

import re

# ─── MLM Model Path ──────────────────────────────────────────────────────────
# XLM-RoBERTa fine-tuned on Sinhala — reranks ByT5 candidates by context
DEFAULT_MLM_MODEL: str = "Kalana001/xlm-roberta-base-finetuned-sinhala"

# ─── ByT5 Transliterator Model Path ──────────────────────────────────────────
# Fine-tuned on 1M Singlish→Sinhala pairs — hosted on Hugging Face Hub
DEFAULT_BYT5_MODEL: str = "Kalana001/byt5-small-singlish-sinhala"

# ─── mBart50 Transliterator Model Path ───────────────────────────────────────
# Full-sentence Singlish→Sinhala (no English retained) — Hugging Face Hub
DEFAULT_MBART_MODEL: str = "Kalana001/mbart50-large-singlish-sinhala"

# ─── Corpus ──────────────────────────────────────────────────────────────────
# Plain-text English word list used as the English vocabulary.
ENGLISH_CORPUS_URL: str = (
    "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
)

# ─── Scoring Weights ─────────────────────────────────────────────────────────
# Pure MLM — no manual weights needed

# ─── Decoding Parameters ─────────────────────────────────────────────────────
MAX_CANDIDATES: int = 5  # ByT5 beam=5 → 5 candidates per word
MIN_ENGLISH_LEN: int = 3  # Min word length for English detection

# Words >= this length that are in the English vocab are treated as unambiguous
# English loanwords and passed through without MLM scoring.
# Shorter words (< MIN_ENGLISH_PASSTHROUGH_LEN chars) are kept in the MLM path
# because they may be Singlish homophones
# (e.g. 'mage'=wizard vs 'මගේ', 'mama'=mum vs 'mama'=uncle).
# NOTE(review): this comment previously said "< 6 chars" while the value is 5;
# the text now follows the value — confirm that 5 is the intended threshold.
MIN_ENGLISH_PASSTHROUGH_LEN: int = 5

# ─── Regex ───────────────────────────────────────────────────────────────────
# Splits a token into three groups: (leading non-word chars, core, trailing
# non-word chars). The core is matched lazily so that the trailing group can
# absorb every non-word character at the end of the token.
# NOTE(review): Python's \W also matches combining marks (e.g. Sinhala vowel
# signs), so Sinhala-script input ending in a vowel sign would have that sign
# placed in the trailing group — confirm tokens are Latin-script at this point.
PUNCT_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")