"""
Configuration constants for SinCode v3.
Key difference from v2: no rule engine, no dictionary.
Candidate generation is fully handled by the ByT5 seq2seq model.
"""
import re

# ─── MLM Model Path ──────────────────────────────────────────────────────────
# XLM-RoBERTa fine-tuned on Sinhala — reranks ByT5 candidates by context.
DEFAULT_MLM_MODEL = "Kalana001/xlm-roberta-base-finetuned-sinhala"

# ─── ByT5 Transliterator Model Path ──────────────────────────────────────────
# Fine-tuned on 1M Singlish→Sinhala pairs — hosted on Hugging Face Hub.
DEFAULT_BYT5_MODEL = "Kalana001/byt5-small-singlish-sinhala"

# ─── mBart50 Transliterator Model Path ───────────────────────────────────────
# Full-sentence Singlish→Sinhala (no English retained) — Hugging Face Hub.
DEFAULT_MBART_MODEL = "Kalana001/mbart50-large-singlish-sinhala"

# ─── Corpus ──────────────────────────────────────────────────────────────────
# Word list used for English-loanword detection (one word per line).
ENGLISH_CORPUS_URL = (
    "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
)

# ─── Scoring Weights ─────────────────────────────────────────────────────────
# Pure MLM — no manual weights needed.

# ─── Decoding Parameters ─────────────────────────────────────────────────────
MAX_CANDIDATES: int = 5   # ByT5 beam=5 → 5 candidates per word
MIN_ENGLISH_LEN: int = 3  # Min word length for English detection

# Words >= this length that are in the English vocab are treated as unambiguous
# English loanwords and passed through without MLM scoring.
# Words shorter than this threshold are kept in the MLM path because they may
# be Singlish homophones (e.g. 'mage' = wizard vs Sinhala 'මගේ' = "my";
# 'mama' = mum vs Singlish 'mama' = uncle).
MIN_ENGLISH_PASSTHROUGH_LEN: int = 5

# ─── Regex ───────────────────────────────────────────────────────────────────
# Splits a token into (leading punctuation, core, trailing punctuation).
# `\W*` is greedy at both ends; the lazy `.*?` core keeps interior
# punctuation (e.g. apostrophes) attached to the word.
PUNCT_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")