# SinCode/core/constants.py
# Author: KalanaPabasara
# SinCode v3 — seq2seq pipeline, evaluation scripts, IndoNLP benchmark data
# Commit: 1fed70a
"""
SinCode v3 configuration constants.

Unlike v2 there is no rule engine and no dictionary: candidate generation
is handled entirely by the ByT5 seq2seq model, with an MLM reranker on top.
"""
import re

# ============================================================================
# Model identifiers (Hugging Face Hub)
# ============================================================================

# Context reranker: XLM-RoBERTa fine-tuned on Sinhala, scores ByT5 candidates.
DEFAULT_MLM_MODEL: str = "Kalana001/xlm-roberta-base-finetuned-sinhala"

# Word-level transliterator: ByT5 fine-tuned on 1M Singlish→Sinhala pairs.
DEFAULT_BYT5_MODEL: str = "Kalana001/byt5-small-singlish-sinhala"

# Sentence-level transliterator: mBart50, Singlish→Sinhala with no English kept.
DEFAULT_MBART_MODEL: str = "Kalana001/mbart50-large-singlish-sinhala"

# ============================================================================
# Corpus
# ============================================================================

# 20k most common English words; used for English loanword detection.
ENGLISH_CORPUS_URL: str = (
    "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
)

# ============================================================================
# Scoring / decoding
# ============================================================================
# Scoring is pure MLM — there are no manual weights to tune.

# ByT5 decodes with beam=5, yielding 5 candidate transliterations per word.
MAX_CANDIDATES: int = 5

# Minimum word length before a token is even considered for English detection.
MIN_ENGLISH_LEN: int = 3

# Words of at least this length found in the English vocab are treated as
# unambiguous English loanwords and passed through without MLM scoring.
# Shorter words (< 5 chars) stay in the MLM path because they may be
# Singlish homophones (e.g. 'mage' = English wizard vs Sinhala "my",
# 'mama' = English mum vs Sinhala "uncle").
MIN_ENGLISH_PASSTHROUGH_LEN: int = 5

# ============================================================================
# Regex
# ============================================================================

# Splits a token into (leading punctuation, core word, trailing punctuation).
PUNCT_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")