from __future__ import annotations import unicodedata from functools import lru_cache from pathlib import Path from typing import Any import pandas as pd from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle TATOEBA_CACHE_DIR = Path(__file__).with_name("data") / "tatoeba" TATOEBA_PARQUET_PATH = TATOEBA_CACHE_DIR / "tatoeba_text.parquet" def _normalize_text_key(text: str) -> str: normalized = unicodedata.normalize("NFKC", text) normalized = " ".join(normalized.split()) return normalized.casefold().strip() def _normalize_lang(code: str) -> str | None: code = (code or "").strip() if not code: return None code = canonical_lang(code) if code in ALL_LANGS: return code return None def _coerce_source_lang(lang_code: str) -> tuple[str, str]: lang = _normalize_lang(lang_code) or lang_code.strip().lower() return lang, LANG_ISO2_TO_ISO3.get(lang, "") def build_tatoeba_text_parquet( input_path: str | Path = Path(__file__).with_name("sentences.csv"), parquet_path: str | Path = TATOEBA_PARQUET_PATH, ) -> Path: """Convert the raw Tatoeba dump into a lean inference parquet cache.""" input_path = Path(input_path) parquet_path = Path(parquet_path) parquet_path.parent.mkdir(parents=True, exist_ok=True) records: list[dict[str, Any]] = [] seen: set[tuple[str, str]] = set() with input_path.open("r", encoding="utf-8", newline="") as handle: for line in handle: line = line.rstrip("\n") if not line: continue parts = line.split("\t", 2) if len(parts) < 3: continue raw_id, raw_lang, raw_text = parts text = raw_text.strip() if not text: continue source_lang, lang_iso3 = _coerce_source_lang(raw_lang) if not source_lang: continue dedupe_key = (source_lang, _normalize_text_key(text)) if dedupe_key in seen: continue seen.add(dedupe_key) try: sentence_id = int(raw_id.strip()) except ValueError: sentence_id = -1 records.append( { "id": sentence_id, "text": text, "source_lang": source_lang, "lang_iso3": lang_iso3, "source": "tatoeba", } ) if not records: raise RuntimeError(f"No usable Tatoeba rows found in {input_path}.") frame = pd.DataFrame.from_records(records) frame = frame.sort_values(by=["source_lang", "id"], kind="stable").reset_index(drop=True) frame.to_parquet(parquet_path, index=False) print( f"Built lean Tatoeba parquet with {len(frame):,} rows " f"and {len(frame.columns)} columns at {parquet_path}." ) return parquet_path @lru_cache(maxsize=1) def load_tatoeba_table(parquet_path: str | Path = TATOEBA_PARQUET_PATH) -> pd.DataFrame: parquet_path = Path(parquet_path) if not parquet_path.exists(): raise FileNotFoundError( f"Missing Tatoeba cache at {parquet_path}. " "Run `./.venv/bin/python convert_tatoeba_sentences.py` once to build it." ) return pd.read_parquet(parquet_path) def _row_to_sentence(row: pd.Series) -> dict[str, Any]: source_lang = str(row.get("source_lang", "")).strip() lang_iso3 = str(row.get("lang_iso3", "")).strip() return { "text": str(row.get("text", "")).strip(), "source": "tatoeba", "sentence_id": int(row.get("id", -1)) if str(row.get("id", "-1")).strip().lstrip("-").isdigit() else -1, "source_lang": source_lang, "lang_iso2": source_lang, "lang_iso3": lang_iso3 or LANG_ISO2_TO_ISO3.get(source_lang, ""), "language": source_lang, } def fetch_random_tatoeba_sentence( *, attempts: int = 8, parquet_path: str | Path = TATOEBA_PARQUET_PATH, ) -> dict[str, Any]: """Fetch one random text sample, sometimes repeated within one language.""" frame = load_tatoeba_table(parquet_path) candidate_frame = frame[frame["source_lang"].isin(ALL_LANGS)] if "source_lang" in frame.columns else frame return sample_single_group_bundle( candidate_frame, group_column="source_lang", row_to_sentence=_row_to_sentence, attempts=attempts, ) def fetch_random_tatoeba_sentence_mix( *, min_sentences: int = 2, max_sentences: int = 3, parquet_path: str | Path = TATOEBA_PARQUET_PATH, ) -> dict[str, Any]: frame = load_tatoeba_table(parquet_path) candidate_frame = frame[frame["source_lang"].isin(ALL_LANGS)] if "source_lang" in frame.columns else frame bundle = sample_multi_group_bundle( candidate_frame, group_column="source_lang", row_to_sentence=_row_to_sentence, min_groups=min_sentences, max_groups=max_sentences, ) return { **bundle, "source": "tatoeba-mix", } if __name__ == "__main__": build_tatoeba_text_parquet()