DerivedFunction committed on
Commit
1d100ed
·
1 Parent(s): 3b3f566
Files changed (3) hide show
  1. app.py +30 -14
  2. convert_tatoeba_sentences.py +35 -0
  3. tatoeba.py +167 -75
app.py CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
5
 
6
  from collections import Counter, defaultdict
7
  from functools import lru_cache
 
8
  import os
9
  from typing import Any
10
 
@@ -228,6 +229,20 @@ def render_tatoeba_validation_html(validation: dict[str, Any]) -> str:
228
  return render_validation_html(validation, source_label="Tatoeba")
229
 
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def render_prediction_summary(
232
  *,
233
  text: str,
@@ -525,7 +540,7 @@ def load_random_tatoeba_mix_example() -> tuple[str, str, pd.DataFrame, dict[str,
525
 
526
  def load_random_fleurs_example() -> tuple[str, str, pd.DataFrame, dict[str, Any], dict[str, Any], str]:
527
  try:
528
- sentence = fetch_random_fleurs_sentence()
529
  except FileNotFoundError as exc:
530
  empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
531
  message = (
@@ -542,16 +557,16 @@ def load_random_fleurs_example() -> tuple[str, str, pd.DataFrame, dict[str, Any]
542
  )
543
  raw = {
544
  **raw,
545
- "source": "fleurs",
546
- "fleurs_sentence_id": sentence.get("fleurs_id"),
547
- "fleurs_split": sentence.get("split"),
548
- "fleurs_source_lang": sentence.get("source_lang"),
549
- "fleurs_model_lang": sentence.get("model_lang"),
550
- "fleurs_language": sentence.get("language"),
551
- "fleurs_lang_group": sentence.get("lang_group"),
552
- "fleurs_validation": validation,
553
  }
554
- validation_html = render_validation_html(validation, source_label="FLEURS")
 
555
  summary = render_prediction_summary(
556
  text=text,
557
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
@@ -566,7 +581,7 @@ def load_random_fleurs_example() -> tuple[str, str, pd.DataFrame, dict[str, Any]
566
 
567
  def load_random_fleurs_mix_example() -> tuple[str, str, pd.DataFrame, dict[str, Any], dict[str, Any], str]:
568
  try:
569
- mix = fetch_random_fleurs_sentence_mix()
570
  except FileNotFoundError as exc:
571
  empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
572
  message = (
@@ -583,14 +598,15 @@ def load_random_fleurs_mix_example() -> tuple[str, str, pd.DataFrame, dict[str,
583
  )
584
  raw = {
585
  **raw,
586
- "source": "fleurs-mix",
587
  "lang_count": mix["lang_count"],
588
  "sentence_langs": mix["langs"],
589
  "sentence_lang_iso3s": mix["lang_iso3s"],
590
  "sentences": mix["sentences"],
591
- "fleurs_validation": validation,
592
  }
593
- validation_html = render_validation_html(validation, source_label="FLEURS")
 
594
  summary = render_prediction_summary(
595
  text=text,
596
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
 
5
 
6
  from collections import Counter, defaultdict
7
  from functools import lru_cache
8
+ import random
9
  import os
10
  from typing import Any
11
 
 
229
  return render_validation_html(validation, source_label="Tatoeba")
230
 
231
 
232
def fetch_random_cached_sentence() -> dict[str, Any]:
    """Randomly sample a sentence from either cached source."""
    # Fair coin flip between the two local corpora (FLEURS vs. Tatoeba).
    use_fleurs = random.random() < 0.5
    if use_fleurs:
        return fetch_random_fleurs_sentence()
    return fetch_random_tatoeba_sentence()
237
+
238
+
239
def fetch_random_cached_sentence_mix() -> dict[str, Any]:
    """Randomly sample a mixed-language example from either cached source."""
    # Fair coin flip between the two local mixed-sentence generators.
    use_fleurs = random.random() < 0.5
    if use_fleurs:
        return fetch_random_fleurs_sentence_mix()
    return fetch_random_tatoeba_sentence_mix()
244
+
245
+
246
  def render_prediction_summary(
247
  *,
248
  text: str,
 
540
 
541
  def load_random_fleurs_example() -> tuple[str, str, pd.DataFrame, dict[str, Any], dict[str, Any], str]:
542
  try:
543
+ sentence = fetch_random_cached_sentence()
544
  except FileNotFoundError as exc:
545
  empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
546
  message = (
 
557
  )
558
  raw = {
559
  **raw,
560
+ "source": sentence.get("source", "fleurs"),
561
+ "cached_sentence_id": sentence.get("fleurs_id", sentence.get("sentence_id")),
562
+ "cached_split": sentence.get("split"),
563
+ "cached_source_lang": sentence.get("source_lang"),
564
+ "cached_model_lang": sentence.get("model_lang", sentence.get("lang_iso2")),
565
+ "cached_language": sentence.get("language"),
566
+ "fleurs_validation": validation if sentence.get("source") == "fleurs" else {},
 
567
  }
568
+ source_label = "FLEURS" if sentence.get("source") == "fleurs" else "Tatoeba"
569
+ validation_html = render_validation_html(validation, source_label=source_label)
570
  summary = render_prediction_summary(
571
  text=text,
572
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
 
581
 
582
  def load_random_fleurs_mix_example() -> tuple[str, str, pd.DataFrame, dict[str, Any], dict[str, Any], str]:
583
  try:
584
+ mix = fetch_random_cached_sentence_mix()
585
  except FileNotFoundError as exc:
586
  empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
587
  message = (
 
598
  )
599
  raw = {
600
  **raw,
601
+ "source": mix.get("source", "fleurs-mix"),
602
  "lang_count": mix["lang_count"],
603
  "sentence_langs": mix["langs"],
604
  "sentence_lang_iso3s": mix["lang_iso3s"],
605
  "sentences": mix["sentences"],
606
+ "fleurs_validation": validation if mix.get("source") == "fleurs-mix" else {},
607
  }
608
+ source_label = "FLEURS" if mix.get("source") == "fleurs-mix" else "Tatoeba"
609
+ validation_html = render_validation_html(validation, source_label=source_label)
610
  summary = render_prediction_summary(
611
  text=text,
612
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
convert_tatoeba_sentences.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Convert the raw Tatoeba sentence dump into a lean parquet cache."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ from pathlib import Path
8
+
9
+ from tatoeba import TATOEBA_PARQUET_PATH, build_tatoeba_text_parquet
10
+
11
+
12
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the Tatoeba conversion script."""
    default_input = Path(__file__).with_name("sentences.csv")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--input-path",
        type=Path,
        default=default_input,
        help="Path to the raw Tatoeba TSV dump.",
    )
    parser.add_argument(
        "--output-path",
        type=Path,
        default=TATOEBA_PARQUET_PATH,
        help="Where to write the lean parquet cache.",
    )
    return parser
27
+
28
+
29
def main() -> None:
    """CLI entry point: parse arguments and build the parquet cache."""
    cli_args = build_arg_parser().parse_args()
    build_tatoeba_text_parquet(cli_args.input_path, cli_args.output_path)
32
+
33
+
34
if __name__ == "__main__":
    # Script entry: convert the raw Tatoeba dump into the parquet cache.
    main()
tatoeba.py CHANGED
@@ -1,111 +1,203 @@
1
  from __future__ import annotations
2
 
3
- import json
4
  import random
 
 
 
5
  from typing import Any
6
- from urllib.error import HTTPError, URLError
7
- from urllib.parse import urlencode
8
- from urllib.request import Request, urlopen
9
 
10
- from language import ALL_LANGS, LANG_ISO2_TO_ISO3
11
-
12
- TATOEBA_SENTENCE_API = "https://api.tatoeba.org/v1/sentences"
13
- TATOEBA_TIMEOUT_SECONDS = 10.0
14
- TATOEBA_RANDOM_LANGS = [lang for lang in ALL_LANGS if lang in LANG_ISO2_TO_ISO3]
15
-
16
-
17
- def _sentence_url(lang_iso3: str) -> str:
18
- query = urlencode(
19
- {
20
- "lang": lang_iso3,
21
- "sort": "random",
22
- "limit": 1,
23
- "showtrans": "none",
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
26
- return f"{TATOEBA_SENTENCE_API}?{query}"
27
-
28
-
29
- def _fetch_random_tatoeba_sentence_for_lang(
30
- lang_iso2: str,
31
- *,
32
- timeout: float = TATOEBA_TIMEOUT_SECONDS,
33
- ) -> dict[str, Any]:
34
- lang_iso3 = LANG_ISO2_TO_ISO3.get(lang_iso2)
35
- if not lang_iso3:
36
- raise RuntimeError(f"Language {lang_iso2!r} is not available in Tatoeba mappings.")
37
-
38
- request = Request(_sentence_url(lang_iso3), headers={"accept": "application/json"})
39
- with urlopen(request, timeout=timeout) as response:
40
- payload = json.load(response)
41
 
42
- data = payload.get("data") if isinstance(payload, dict) else None
43
- if not isinstance(data, list) or not data:
44
- raise RuntimeError("Tatoeba returned no sentence data.")
45
 
46
- sentence = data[0]
47
- if not isinstance(sentence, dict):
48
- raise RuntimeError("Tatoeba returned an unexpected sentence payload.")
 
 
 
 
 
 
49
 
50
- text = sentence.get("text")
51
- if not isinstance(text, str) or not text.strip():
52
- raise RuntimeError("Tatoeba returned an empty sentence text.")
53
 
54
- sentence["text"] = text.strip()
55
- sentence["lang_iso2"] = lang_iso2
56
- sentence["lang_iso3"] = lang_iso3
57
- return sentence
58
 
59
 
60
- def fetch_random_tatoeba_sentence(*, attempts: int = 8, timeout: float = TATOEBA_TIMEOUT_SECONDS) -> dict[str, Any]:
61
- """Fetch one random sentence from Tatoeba, retrying across random languages."""
62
- if not TATOEBA_RANDOM_LANGS:
63
- raise RuntimeError("No Tatoeba-compatible languages are available.")
 
 
 
 
 
 
 
 
64
 
65
- candidates = TATOEBA_RANDOM_LANGS[:]
66
- random.shuffle(candidates)
67
- last_error: Exception | None = None
68
 
69
- for lang_iso2 in candidates[: max(1, attempts)]:
70
- try:
71
- return _fetch_random_tatoeba_sentence_for_lang(lang_iso2, timeout=timeout)
72
- except (HTTPError, URLError, TimeoutError, json.JSONDecodeError, RuntimeError) as exc:
73
- last_error = exc
74
- continue
 
 
 
 
75
 
76
- raise RuntimeError("Unable to fetch a random Tatoeba sentence.") from last_error
 
 
 
 
 
77
 
78
 
79
  def fetch_random_tatoeba_sentence_mix(
80
  *,
81
  min_sentences: int = 2,
82
  max_sentences: int = 3,
83
- timeout: float = TATOEBA_TIMEOUT_SECONDS,
84
  ) -> dict[str, Any]:
85
- """Fetch 2-3 random sentences from distinct languages and concatenate them."""
86
- if not TATOEBA_RANDOM_LANGS:
87
- raise RuntimeError("No Tatoeba-compatible languages are available.")
 
 
88
 
89
  min_sentences = max(1, min_sentences)
90
  max_sentences = max(min_sentences, max_sentences)
91
  count = random.randint(min_sentences, max_sentences)
92
- if count > len(TATOEBA_RANDOM_LANGS):
93
- count = len(TATOEBA_RANDOM_LANGS)
94
 
95
- langs = random.sample(TATOEBA_RANDOM_LANGS, k=count)
96
- sentences: list[dict[str, Any]] = []
97
- parts: list[str] = []
98
 
99
- for lang_iso2 in langs:
100
- sentence = _fetch_random_tatoeba_sentence_for_lang(lang_iso2, timeout=timeout)
101
- sentences.append(sentence)
102
- parts.append(sentence["text"])
103
 
104
- combined_text = "\n\n".join(parts)
 
 
 
 
 
 
105
  return {
106
  "text": combined_text,
107
  "sentences": sentences,
108
  "lang_count": len(sentences),
109
  "langs": [sentence["lang_iso2"] for sentence in sentences],
110
  "lang_iso3s": [sentence["lang_iso3"] for sentence in sentences],
 
111
  }
 
 
 
 
 
1
  from __future__ import annotations
2
 
 
3
  import random
4
+ import unicodedata
5
+ from functools import lru_cache
6
+ from pathlib import Path
7
  from typing import Any
 
 
 
8
 
9
+ import pandas as pd
10
+
11
+ from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
12
+
13
+
14
+ TATOEBA_CACHE_DIR = Path(__file__).with_name("data") / "tatoeba"
15
+ TATOEBA_PARQUET_PATH = TATOEBA_CACHE_DIR / "tatoeba_text.parquet"
16
+
17
+ DEFAULT_LANGUAGE_REMAPS = {
18
+ "cmn": "zh",
19
+ "yue": "zh",
20
+ "wuu": "zh",
21
+ "nan": "zh",
22
+ "nob": "no",
23
+ "nno": "no",
24
+ }
25
+
26
+
27
+ def _normalize_text_key(text: str) -> str:
28
+ normalized = unicodedata.normalize("NFKC", text)
29
+ normalized = " ".join(normalized.split())
30
+ return normalized.casefold().strip()
31
+
32
+
33
def _normalize_lang(code: str) -> str | None:
    """Map a raw Tatoeba language code onto a supported language code, or None."""
    cleaned = (code or "").strip()
    if not cleaned:
        return None
    # Collapse macro-language variants first (e.g. cmn/yue/wuu/nan -> zh).
    remapped = DEFAULT_LANGUAGE_REMAPS.get(cleaned, cleaned)
    if remapped in ALL_LANGS:
        return remapped
    return canonical_lang(remapped)
41
+
42
+
43
def _coerce_source_lang(lang_code: str) -> tuple[str, str]:
    """Return ``(source_lang, iso3_or_empty)`` for a raw dump language code."""
    normalized = _normalize_lang(lang_code)
    # Fall back to a lowercased raw code when normalization yields nothing.
    lang = normalized if normalized else lang_code.strip().lower()
    return lang, LANG_ISO2_TO_ISO3.get(lang, "")
46
+
47
+
48
def build_tatoeba_text_parquet(
    input_path: str | Path = Path(__file__).with_name("sentences.csv"),
    parquet_path: str | Path = TATOEBA_PARQUET_PATH,
) -> Path:
    """Convert the raw Tatoeba dump into a lean inference parquet cache."""
    src = Path(input_path)
    dst = Path(parquet_path)
    dst.parent.mkdir(parents=True, exist_ok=True)

    rows: list[dict[str, Any]] = []
    # Dedupe on (language, normalized text) so near-identical rows collapse.
    seen_keys: set[tuple[str, str]] = set()

    with src.open("r", encoding="utf-8", newline="") as handle:
        for raw_line in handle:
            stripped = raw_line.rstrip("\n")
            if not stripped:
                continue

            # Raw dump format is "<id>\t<lang>\t<text>"; limit the split so
            # tabs inside the sentence text survive.
            fields = stripped.split("\t", 2)
            if len(fields) < 3:
                continue

            raw_id, raw_lang, raw_text = fields
            text = raw_text.strip()
            if not text:
                continue

            source_lang, lang_iso3 = _coerce_source_lang(raw_lang)
            if not source_lang:
                continue

            dedupe_key = (source_lang, _normalize_text_key(text))
            if dedupe_key in seen_keys:
                continue
            seen_keys.add(dedupe_key)

            # Non-numeric ids are preserved as -1 rather than dropped.
            try:
                sentence_id = int(raw_id.strip())
            except ValueError:
                sentence_id = -1

            rows.append(
                {
                    "id": sentence_id,
                    "text": text,
                    "source_lang": source_lang,
                    "lang_iso3": lang_iso3,
                    "source": "tatoeba",
                }
            )

    if not rows:
        raise RuntimeError(f"No usable Tatoeba rows found in {src}.")

    frame = pd.DataFrame.from_records(rows)
    frame = frame.sort_values(by=["source_lang", "id"], kind="stable").reset_index(drop=True)
    frame.to_parquet(dst, index=False)
    print(
        f"Built lean Tatoeba parquet with {len(frame):,} rows "
        f"and {len(frame.columns)} columns at {dst}."
    )
    return dst
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
 
 
 
111
 
112
@lru_cache(maxsize=1)
def load_tatoeba_table(parquet_path: str | Path = TATOEBA_PARQUET_PATH) -> pd.DataFrame:
    """Load (and memoize) the lean Tatoeba parquet cache as a DataFrame."""
    cache_file = Path(parquet_path)
    if not cache_file.exists():
        raise FileNotFoundError(
            f"Missing Tatoeba cache at {cache_file}. "
            "Run `./.venv/bin/python convert_tatoeba_sentences.py` once to build it."
        )
    return pd.read_parquet(cache_file)
121
 
 
 
 
122
 
123
+ def _pick_random_rows(frame: pd.DataFrame, *, count: int) -> pd.DataFrame:
124
+ if frame.empty:
125
+ raise RuntimeError("Tatoeba cache has no rows.")
126
+ return frame.sample(n=min(count, len(frame)))
127
 
128
 
129
def _row_to_sentence(row: pd.Series) -> dict[str, Any]:
    """Convert one cached parquet row into the sentence-dict API shape."""
    source_lang = str(row.get("source_lang", "")).strip()
    iso3 = str(row.get("lang_iso3", "")).strip()
    # Keep the row id only when it parses as a (possibly negative) integer.
    raw_id = str(row.get("id", "-1")).strip()
    sentence_id = int(row.get("id", -1)) if raw_id.lstrip("-").isdigit() else -1
    return {
        "text": str(row.get("text", "")).strip(),
        "source": "tatoeba",
        "sentence_id": sentence_id,
        "source_lang": source_lang,
        "lang_iso2": source_lang,
        "lang_iso3": iso3 or LANG_ISO2_TO_ISO3.get(source_lang, ""),
        "language": source_lang,
    }
141
 
 
 
 
142
 
143
def fetch_random_tatoeba_sentence(
    *,
    attempts: int = 8,
    parquet_path: str | Path = TATOEBA_PARQUET_PATH,
) -> dict[str, Any]:
    """Sample one random sentence from the local Tatoeba parquet cache."""
    table = load_tatoeba_table(parquet_path)
    # Drop blank-text rows, then prefer languages the model supports.
    candidates = table[table["text"].astype(str).str.strip().ne("")]
    supported = candidates[candidates["source_lang"].isin(ALL_LANGS)]
    if not supported.empty:
        candidates = supported

    for _ in range(max(1, attempts)):
        picked = _pick_random_rows(candidates, count=1).iloc[0]
        sentence = _row_to_sentence(picked)
        if sentence["text"]:
            return sentence
    raise RuntimeError("Unable to sample a random Tatoeba sentence.")
160
 
161
 
162
def fetch_random_tatoeba_sentence_mix(
    *,
    min_sentences: int = 2,
    max_sentences: int = 3,
    parquet_path: str | Path = TATOEBA_PARQUET_PATH,
) -> dict[str, Any]:
    """Sample sentences from distinct cached languages and concatenate them."""
    table = load_tatoeba_table(parquet_path)
    # Drop blank-text rows, then prefer languages the model supports.
    candidates = table[table["text"].astype(str).str.strip().ne("")]
    supported = candidates[candidates["source_lang"].isin(ALL_LANGS)]
    if not supported.empty:
        candidates = supported

    min_sentences = max(1, min_sentences)
    max_sentences = max(min_sentences, max_sentences)
    count = random.randint(min_sentences, max_sentences)

    distinct_langs = [lang for lang in candidates["source_lang"].dropna().unique().tolist() if lang]
    if not distinct_langs:
        raise RuntimeError("No usable Tatoeba languages were found in the cache.")

    # Pick up to `count` distinct languages, one sentence from each.
    random.shuffle(distinct_langs)
    chosen_langs = distinct_langs[: min(count, len(distinct_langs))]

    picked_rows = []
    for lang in chosen_langs:
        lang_rows = candidates[candidates["source_lang"] == lang]
        picked_rows.append(_pick_random_rows(lang_rows, count=1).iloc[0])

    sentences = [_row_to_sentence(row) for row in picked_rows]
    combined_text = "\n\n".join(sentence["text"] for sentence in sentences if sentence["text"])
    return {
        "text": combined_text,
        "sentences": sentences,
        "lang_count": len(sentences),
        "langs": [sentence["lang_iso2"] for sentence in sentences],
        "lang_iso3s": [sentence["lang_iso3"] for sentence in sentences],
        "source": "tatoeba-mix",
    }
200
+
201
+
202
if __name__ == "__main__":
    # Running this module directly rebuilds the parquet cache in place.
    build_tatoeba_text_parquet()