# SinCode / core/english.py
# KalanaPabasara
# SinCode v3 — ByT5 seq2seq + XLM-RoBERTa MLM reranker
# f6f45d5
"""
English vocabulary loader for SinCode v3.
Used for English passthrough detection in the decoder.
Loads purely from the 20k corpus file — no hardcoded word lists.
"""
import os
import logging
import requests
from typing import Set
from core.constants import ENGLISH_CORPUS_URL, MIN_ENGLISH_LEN
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def _resolve_english_cache_path() -> str:
    """Choose the file path used to cache the English corpus.

    A non-empty ``SINCODE_ENGLISH_CACHE`` environment variable wins
    outright and is returned without any checks. Otherwise these
    locations are tried in order and the first writable one is returned:

    1. ``$HF_HOME/english_20k.txt`` (only when ``HF_HOME`` is set),
    2. ``<current working directory>/english_20k.txt``,
    3. ``$TMPDIR`` (else ``$TEMP``, else ``/tmp``) ``/english_20k.txt``.

    Probing a location creates its parent directory and the file itself
    if they do not exist yet.

    Returns:
        The selected path, or the bare relative name ``english_20k.txt``
        when none of the locations is writable.
    """
    filename = "english_20k.txt"

    explicit = os.getenv("SINCODE_ENGLISH_CACHE")
    if explicit:
        return explicit

    hf_home = os.getenv("HF_HOME")
    temp_root = os.getenv("TMPDIR", os.getenv("TEMP", "/tmp"))
    locations = (
        os.path.join(hf_home, filename) if hf_home else "",
        os.path.join(os.getcwd(), filename),
        os.path.join(temp_root, filename),
    )

    # Empty entries (HF_HOME unset) are dropped before probing.
    for candidate in filter(None, locations):
        folder = os.path.dirname(candidate) or "."
        try:
            os.makedirs(folder, exist_ok=True)
            # Append mode checks writability without truncating a cache
            # file that may already hold the downloaded corpus.
            with open(candidate, "a", encoding="utf-8"):
                pass
        except OSError:
            continue
        else:
            return candidate

    return filename
# Cache file location, resolved once when this module is imported.
ENGLISH_CORPUS_CACHE = _resolve_english_cache_path()
def load_english_vocab() -> Set[str]:
    """Load the English vocabulary from the cached corpus file.

    When the cache file at ``ENGLISH_CORPUS_CACHE`` is missing or empty,
    the corpus is first downloaded from ``ENGLISH_CORPUS_URL`` and written
    to that path. Each line of the cache is stripped and lower-cased;
    blank lines and words shorter than ``MIN_ENGLISH_LEN`` are dropped.

    Loading is best-effort: download, write, read and decode failures are
    logged as warnings and never raised.

    Returns:
        The set of accepted words. Empty when the download fails; empty
        or partial when the cache file cannot be read or decoded.
    """
    vocab: Set[str] = set()

    # A single guarded stat replaces the exists()+getsize() pair, so a file
    # that vanishes between the two calls cannot raise out of this function.
    try:
        needs_download = os.path.getsize(ENGLISH_CORPUS_CACHE) == 0
    except OSError:
        needs_download = True

    if needs_download:
        try:
            logger.info("Downloading English corpus...")
            response = requests.get(ENGLISH_CORPUS_URL, timeout=10)
            response.raise_for_status()
            with open(ENGLISH_CORPUS_CACHE, "wb") as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as exc:
            logger.warning("Could not download English corpus: %s", exc)
            return vocab

    try:
        with open(ENGLISH_CORPUS_CACHE, "r", encoding="utf-8") as f:
            vocab.update(
                w for line in f
                if (w := line.strip().lower()) and len(w) >= MIN_ENGLISH_LEN
            )
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError: without it a
        # corrupt or non-UTF-8 cache file would raise out of this function.
        logger.warning("Could not read English corpus file: %s", exc)

    logger.info("English vocabulary loaded: %d words", len(vocab))
    return vocab
# Vocabulary is loaded once when this module is imported; load_english_vocab
# downloads the corpus first if the cache file is missing or empty.
ENGLISH_VOCAB: Set[str] = load_english_vocab()