DerivedFunction committed on
Commit
1d100ed
·
1 Parent(s): 3b3f566
Files changed (3) hide show
  1. app.py +30 -14
  2. convert_tatoeba_sentences.py +35 -0
  3. tatoeba.py +167 -75
app.py CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
5
 
6
  from collections import Counter, defaultdict
7
  from functools import lru_cache
 
8
  import os
9
  from typing import Any
10
 
@@ -228,6 +229,20 @@ def render_tatoeba_validation_html(validation: dict[str, Any]) -> str:
228
  return render_validation_html(validation, source_label="Tatoeba")
229
 
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def render_prediction_summary(
232
  *,
233
  text: str,
@@ -525,7 +540,7 @@ def load_random_tatoeba_mix_example() -> tuple[str, str, pd.DataFrame, dict[str,
525
 
526
  def load_random_fleurs_example() -> tuple[str, str, pd.DataFrame, dict[str, Any], dict[str, Any], str]:
527
  try:
528
- sentence = fetch_random_fleurs_sentence()
529
  except FileNotFoundError as exc:
530
  empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
531
  message = (
@@ -542,16 +557,16 @@ def load_random_fleurs_example() -> tuple[str, str, pd.DataFrame, dict[str, Any]
542
  )
543
  raw = {
544
  **raw,
545
- "source": "fleurs",
546
- "fleurs_sentence_id": sentence.get("fleurs_id"),
547
- "fleurs_split": sentence.get("split"),
548
- "fleurs_source_lang": sentence.get("source_lang"),
549
- "fleurs_model_lang": sentence.get("model_lang"),
550
- "fleurs_language": sentence.get("language"),
551
- "fleurs_lang_group": sentence.get("lang_group"),
552
- "fleurs_validation": validation,
553
  }
554
- validation_html = render_validation_html(validation, source_label="FLEURS")
 
555
  summary = render_prediction_summary(
556
  text=text,
557
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
@@ -566,7 +581,7 @@ def load_random_fleurs_example() -> tuple[str, str, pd.DataFrame, dict[str, Any]
566
 
567
  def load_random_fleurs_mix_example() -> tuple[str, str, pd.DataFrame, dict[str, Any], dict[str, Any], str]:
568
  try:
569
- mix = fetch_random_fleurs_sentence_mix()
570
  except FileNotFoundError as exc:
571
  empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
572
  message = (
@@ -583,14 +598,15 @@ def load_random_fleurs_mix_example() -> tuple[str, str, pd.DataFrame, dict[str,
583
  )
584
  raw = {
585
  **raw,
586
- "source": "fleurs-mix",
587
  "lang_count": mix["lang_count"],
588
  "sentence_langs": mix["langs"],
589
  "sentence_lang_iso3s": mix["lang_iso3s"],
590
  "sentences": mix["sentences"],
591
- "fleurs_validation": validation,
592
  }
593
- validation_html = render_validation_html(validation, source_label="FLEURS")
 
594
  summary = render_prediction_summary(
595
  text=text,
596
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
 
5
 
6
  from collections import Counter, defaultdict
7
  from functools import lru_cache
8
+ import random
9
  import os
10
  from typing import Any
11
 
 
229
  return render_validation_html(validation, source_label="Tatoeba")
230
 
231
 
232
def fetch_random_cached_sentence() -> dict[str, Any]:
    """Randomly sample a sentence from either cached source."""
    # Fair coin flip between the two local corpora (FLEURS vs. Tatoeba).
    use_fleurs = random.random() < 0.5
    if use_fleurs:
        return fetch_random_fleurs_sentence()
    return fetch_random_tatoeba_sentence()
237
+
238
+
239
def fetch_random_cached_sentence_mix() -> dict[str, Any]:
    """Randomly sample a mixed-language example from either cached source."""
    # Fair coin flip between the two local mixed-sentence generators.
    use_fleurs = random.random() < 0.5
    if use_fleurs:
        return fetch_random_fleurs_sentence_mix()
    return fetch_random_tatoeba_sentence_mix()
244
+
245
+
246
  def render_prediction_summary(
247
  *,
248
  text: str,
 
540
 
541
  def load_random_fleurs_example() -> tuple[str, str, pd.DataFrame, dict[str, Any], dict[str, Any], str]:
542
  try:
543
+ sentence = fetch_random_cached_sentence()
544
  except FileNotFoundError as exc:
545
  empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
546
  message = (
 
557
  )
558
  raw = {
559
  **raw,
560
+ "source": sentence.get("source", "fleurs"),
561
+ "cached_sentence_id": sentence.get("fleurs_id", sentence.get("sentence_id")),
562
+ "cached_split": sentence.get("split"),
563
+ "cached_source_lang": sentence.get("source_lang"),
564
+ "cached_model_lang": sentence.get("model_lang", sentence.get("lang_iso2")),
565
+ "cached_language": sentence.get("language"),
566
+ "fleurs_validation": validation if sentence.get("source") == "fleurs" else {},
 
567
  }
568
+ source_label = "FLEURS" if sentence.get("source") == "fleurs" else "Tatoeba"
569
+ validation_html = render_validation_html(validation, source_label=source_label)
570
  summary = render_prediction_summary(
571
  text=text,
572
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
 
581
 
582
  def load_random_fleurs_mix_example() -> tuple[str, str, pd.DataFrame, dict[str, Any], dict[str, Any], str]:
583
  try:
584
+ mix = fetch_random_cached_sentence_mix()
585
  except FileNotFoundError as exc:
586
  empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
587
  message = (
 
598
  )
599
  raw = {
600
  **raw,
601
+ "source": mix.get("source", "fleurs-mix"),
602
  "lang_count": mix["lang_count"],
603
  "sentence_langs": mix["langs"],
604
  "sentence_lang_iso3s": mix["lang_iso3s"],
605
  "sentences": mix["sentences"],
606
+ "fleurs_validation": validation if mix.get("source") == "fleurs-mix" else {},
607
  }
608
+ source_label = "FLEURS" if mix.get("source") == "fleurs-mix" else "Tatoeba"
609
+ validation_html = render_validation_html(validation, source_label=source_label)
610
  summary = render_prediction_summary(
611
  text=text,
612
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
convert_tatoeba_sentences.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Convert the raw Tatoeba sentence dump into a lean parquet cache."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ from pathlib import Path
8
+
9
+ from tatoeba import TATOEBA_PARQUET_PATH, build_tatoeba_text_parquet
10
+
11
+
12
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the Tatoeba conversion script."""
    default_input = Path(__file__).with_name("sentences.csv")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--input-path",
        type=Path,
        default=default_input,
        help="Path to the raw Tatoeba TSV dump.",
    )
    parser.add_argument(
        "--output-path",
        type=Path,
        default=TATOEBA_PARQUET_PATH,
        help="Where to write the lean parquet cache.",
    )
    return parser
27
+
28
+
29
def main() -> None:
    """CLI entry point: parse arguments and build the parquet cache."""
    cli_args = build_arg_parser().parse_args()
    build_tatoeba_text_parquet(cli_args.input_path, cli_args.output_path)
32
+
33
+
34
if __name__ == "__main__":
    # Script entry: convert the raw Tatoeba dump into the parquet cache.
    main()
tatoeba.py CHANGED
@@ -1,111 +1,203 @@
1
  from __future__ import annotations
2
 
3
- import json
4
  import random
 
 
 
5
  from typing import Any
6
- from urllib.error import HTTPError, URLError
7
- from urllib.parse import urlencode
8
- from urllib.request import Request, urlopen
9
 
10
- from language import ALL_LANGS, LANG_ISO2_TO_ISO3
11
-
12
- TATOEBA_SENTENCE_API = "https://api.tatoeba.org/v1/sentences"
13
- TATOEBA_TIMEOUT_SECONDS = 10.0
14
- TATOEBA_RANDOM_LANGS = [lang for lang in ALL_LANGS if lang in LANG_ISO2_TO_ISO3]
15
-
16
-
17
- def _sentence_url(lang_iso3: str) -> str:
18
- query = urlencode(
19
- {
20
- "lang": lang_iso3,
21
- "sort": "random",
22
- "limit": 1,
23
- "showtrans": "none",
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
26
- return f"{TATOEBA_SENTENCE_API}?{query}"
27
-
28
-
29
- def _fetch_random_tatoeba_sentence_for_lang(
30
- lang_iso2: str,
31
- *,
32
- timeout: float = TATOEBA_TIMEOUT_SECONDS,
33
- ) -> dict[str, Any]:
34
- lang_iso3 = LANG_ISO2_TO_ISO3.get(lang_iso2)
35
- if not lang_iso3:
36
- raise RuntimeError(f"Language {lang_iso2!r} is not available in Tatoeba mappings.")
37
-
38
- request = Request(_sentence_url(lang_iso3), headers={"accept": "application/json"})
39
- with urlopen(request, timeout=timeout) as response:
40
- payload = json.load(response)
41
 
42
- data = payload.get("data") if isinstance(payload, dict) else None
43
- if not isinstance(data, list) or not data:
44
- raise RuntimeError("Tatoeba returned no sentence data.")
45
 
46
- sentence = data[0]
47
- if not isinstance(sentence, dict):
48
- raise RuntimeError("Tatoeba returned an unexpected sentence payload.")
 
 
 
 
 
 
49
 
50
- text = sentence.get("text")
51
- if not isinstance(text, str) or not text.strip():
52
- raise RuntimeError("Tatoeba returned an empty sentence text.")
53
 
54
- sentence["text"] = text.strip()
55
- sentence["lang_iso2"] = lang_iso2
56
- sentence["lang_iso3"] = lang_iso3
57
- return sentence
58
 
59
 
60
- def fetch_random_tatoeba_sentence(*, attempts: int = 8, timeout: float = TATOEBA_TIMEOUT_SECONDS) -> dict[str, Any]:
61
- """Fetch one random sentence from Tatoeba, retrying across random languages."""
62
- if not TATOEBA_RANDOM_LANGS:
63
- raise RuntimeError("No Tatoeba-compatible languages are available.")
 
 
 
 
 
 
 
 
64
 
65
- candidates = TATOEBA_RANDOM_LANGS[:]
66
- random.shuffle(candidates)
67
- last_error: Exception | None = None
68
 
69
- for lang_iso2 in candidates[: max(1, attempts)]:
70
- try:
71
- return _fetch_random_tatoeba_sentence_for_lang(lang_iso2, timeout=timeout)
72
- except (HTTPError, URLError, TimeoutError, json.JSONDecodeError, RuntimeError) as exc:
73
- last_error = exc
74
- continue
 
 
 
 
75
 
76
- raise RuntimeError("Unable to fetch a random Tatoeba sentence.") from last_error
 
 
 
 
 
77
 
78
 
79
  def fetch_random_tatoeba_sentence_mix(
80
  *,
81
  min_sentences: int = 2,
82
  max_sentences: int = 3,
83
- timeout: float = TATOEBA_TIMEOUT_SECONDS,
84
  ) -> dict[str, Any]:
85
- """Fetch 2-3 random sentences from distinct languages and concatenate them."""
86
- if not TATOEBA_RANDOM_LANGS:
87
- raise RuntimeError("No Tatoeba-compatible languages are available.")
 
 
88
 
89
  min_sentences = max(1, min_sentences)
90
  max_sentences = max(min_sentences, max_sentences)
91
  count = random.randint(min_sentences, max_sentences)
92
- if count > len(TATOEBA_RANDOM_LANGS):
93
- count = len(TATOEBA_RANDOM_LANGS)
94
 
95
- langs = random.sample(TATOEBA_RANDOM_LANGS, k=count)
96
- sentences: list[dict[str, Any]] = []
97
- parts: list[str] = []
98
 
99
- for lang_iso2 in langs:
100
- sentence = _fetch_random_tatoeba_sentence_for_lang(lang_iso2, timeout=timeout)
101
- sentences.append(sentence)
102
- parts.append(sentence["text"])
103
 
104
- combined_text = "\n\n".join(parts)
 
 
 
 
 
 
105
  return {
106
  "text": combined_text,
107
  "sentences": sentences,
108
  "lang_count": len(sentences),
109
  "langs": [sentence["lang_iso2"] for sentence in sentences],
110
  "lang_iso3s": [sentence["lang_iso3"] for sentence in sentences],
 
111
  }
 
 
 
 
 
1
  from __future__ import annotations
2
 
 
3
  import random
4
+ import unicodedata
5
+ from functools import lru_cache
6
+ from pathlib import Path
7
  from typing import Any
 
 
 
8
 
9
+ import pandas as pd
10
+
11
+ from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
12
+
13
+
14
+ TATOEBA_CACHE_DIR = Path(__file__).with_name("data") / "tatoeba"
15
+ TATOEBA_PARQUET_PATH = TATOEBA_CACHE_DIR / "tatoeba_text.parquet"
16
+
17
+ DEFAULT_LANGUAGE_REMAPS = {
18
+ "cmn": "zh",
19
+ "yue": "zh",
20
+ "wuu": "zh",
21
+ "nan": "zh",
22
+ "nob": "no",
23
+ "nno": "no",
24
+ }
25
+
26
+
27
+ def _normalize_text_key(text: str) -> str:
28
+ normalized = unicodedata.normalize("NFKC", text)
29
+ normalized = " ".join(normalized.split())
30
+ return normalized.casefold().strip()
31
+
32
+
33
def _normalize_lang(code: str) -> str | None:
    """Map a raw Tatoeba language code onto a supported language code, or None."""
    cleaned = (code or "").strip()
    if not cleaned:
        return None
    # Collapse macro-language variants first (e.g. cmn/yue/wuu/nan -> zh).
    remapped = DEFAULT_LANGUAGE_REMAPS.get(cleaned, cleaned)
    if remapped in ALL_LANGS:
        return remapped
    return canonical_lang(remapped)
41
+
42
+
43
def _coerce_source_lang(lang_code: str) -> tuple[str, str]:
    """Return ``(source_lang, iso3_or_empty)`` for a raw dump language code."""
    normalized = _normalize_lang(lang_code)
    # Fall back to a lowercased raw code when normalization yields nothing.
    lang = normalized if normalized else lang_code.strip().lower()
    return lang, LANG_ISO2_TO_ISO3.get(lang, "")
46
+
47
+
48
def build_tatoeba_text_parquet(
    input_path: str | Path = Path(__file__).with_name("sentences.csv"),
    parquet_path: str | Path = TATOEBA_PARQUET_PATH,
) -> Path:
    """Convert the raw Tatoeba dump into a lean inference parquet cache."""
    src = Path(input_path)
    dst = Path(parquet_path)
    dst.parent.mkdir(parents=True, exist_ok=True)

    rows: list[dict[str, Any]] = []
    # Dedupe on (language, normalized text) so near-identical rows collapse.
    seen_keys: set[tuple[str, str]] = set()

    with src.open("r", encoding="utf-8", newline="") as handle:
        for raw_line in handle:
            stripped = raw_line.rstrip("\n")
            if not stripped:
                continue

            # Raw dump format is "<id>\t<lang>\t<text>"; limit the split so
            # tabs inside the sentence text survive.
            fields = stripped.split("\t", 2)
            if len(fields) < 3:
                continue

            raw_id, raw_lang, raw_text = fields
            text = raw_text.strip()
            if not text:
                continue

            source_lang, lang_iso3 = _coerce_source_lang(raw_lang)
            if not source_lang:
                continue

            dedupe_key = (source_lang, _normalize_text_key(text))
            if dedupe_key in seen_keys:
                continue
            seen_keys.add(dedupe_key)

            # Non-numeric ids are preserved as -1 rather than dropped.
            try:
                sentence_id = int(raw_id.strip())
            except ValueError:
                sentence_id = -1

            rows.append(
                {
                    "id": sentence_id,
                    "text": text,
                    "source_lang": source_lang,
                    "lang_iso3": lang_iso3,
                    "source": "tatoeba",
                }
            )

    if not rows:
        raise RuntimeError(f"No usable Tatoeba rows found in {src}.")

    frame = pd.DataFrame.from_records(rows)
    frame = frame.sort_values(by=["source_lang", "id"], kind="stable").reset_index(drop=True)
    frame.to_parquet(dst, index=False)
    print(
        f"Built lean Tatoeba parquet with {len(frame):,} rows "
        f"and {len(frame.columns)} columns at {dst}."
    )
    return dst
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
 
 
 
111
 
112
@lru_cache(maxsize=1)
def load_tatoeba_table(parquet_path: str | Path = TATOEBA_PARQUET_PATH) -> pd.DataFrame:
    """Load (and memoize) the lean Tatoeba parquet cache as a DataFrame."""
    cache_file = Path(parquet_path)
    if not cache_file.exists():
        raise FileNotFoundError(
            f"Missing Tatoeba cache at {cache_file}. "
            "Run `./.venv/bin/python convert_tatoeba_sentences.py` once to build it."
        )
    return pd.read_parquet(cache_file)
121
 
 
 
 
122
 
123
+ def _pick_random_rows(frame: pd.DataFrame, *, count: int) -> pd.DataFrame:
124
+ if frame.empty:
125
+ raise RuntimeError("Tatoeba cache has no rows.")
126
+ return frame.sample(n=min(count, len(frame)))
127
 
128
 
129
def _row_to_sentence(row: pd.Series) -> dict[str, Any]:
    """Convert one cached parquet row into the sentence-dict API shape."""
    source_lang = str(row.get("source_lang", "")).strip()
    iso3 = str(row.get("lang_iso3", "")).strip()
    # Keep the row id only when it parses as a (possibly negative) integer.
    raw_id = str(row.get("id", "-1")).strip()
    sentence_id = int(row.get("id", -1)) if raw_id.lstrip("-").isdigit() else -1
    return {
        "text": str(row.get("text", "")).strip(),
        "source": "tatoeba",
        "sentence_id": sentence_id,
        "source_lang": source_lang,
        "lang_iso2": source_lang,
        "lang_iso3": iso3 or LANG_ISO2_TO_ISO3.get(source_lang, ""),
        "language": source_lang,
    }
141
 
 
 
 
142
 
143
def fetch_random_tatoeba_sentence(
    *,
    attempts: int = 8,
    parquet_path: str | Path = TATOEBA_PARQUET_PATH,
) -> dict[str, Any]:
    """Sample one random sentence from the local Tatoeba parquet cache."""
    table = load_tatoeba_table(parquet_path)
    # Drop blank-text rows, then prefer languages the model supports.
    candidates = table[table["text"].astype(str).str.strip().ne("")]
    supported = candidates[candidates["source_lang"].isin(ALL_LANGS)]
    if not supported.empty:
        candidates = supported

    for _ in range(max(1, attempts)):
        picked = _pick_random_rows(candidates, count=1).iloc[0]
        sentence = _row_to_sentence(picked)
        if sentence["text"]:
            return sentence
    raise RuntimeError("Unable to sample a random Tatoeba sentence.")
160
 
161
 
162
def fetch_random_tatoeba_sentence_mix(
    *,
    min_sentences: int = 2,
    max_sentences: int = 3,
    parquet_path: str | Path = TATOEBA_PARQUET_PATH,
) -> dict[str, Any]:
    """Sample sentences from distinct cached languages and concatenate them."""
    table = load_tatoeba_table(parquet_path)
    # Drop blank-text rows, then prefer languages the model supports.
    candidates = table[table["text"].astype(str).str.strip().ne("")]
    supported = candidates[candidates["source_lang"].isin(ALL_LANGS)]
    if not supported.empty:
        candidates = supported

    min_sentences = max(1, min_sentences)
    max_sentences = max(min_sentences, max_sentences)
    count = random.randint(min_sentences, max_sentences)

    distinct_langs = [lang for lang in candidates["source_lang"].dropna().unique().tolist() if lang]
    if not distinct_langs:
        raise RuntimeError("No usable Tatoeba languages were found in the cache.")

    # Pick up to `count` distinct languages, one sentence from each.
    random.shuffle(distinct_langs)
    chosen_langs = distinct_langs[: min(count, len(distinct_langs))]

    picked_rows = []
    for lang in chosen_langs:
        lang_rows = candidates[candidates["source_lang"] == lang]
        picked_rows.append(_pick_random_rows(lang_rows, count=1).iloc[0])

    sentences = [_row_to_sentence(row) for row in picked_rows]
    combined_text = "\n\n".join(sentence["text"] for sentence in sentences if sentence["text"])
    return {
        "text": combined_text,
        "sentences": sentences,
        "lang_count": len(sentences),
        "langs": [sentence["lang_iso2"] for sentence in sentences],
        "lang_iso3s": [sentence["lang_iso3"] for sentence in sentences],
        "source": "tatoeba-mix",
    }
200
+
201
+
202
if __name__ == "__main__":
    # Running this module directly rebuilds the parquet cache in place.
    build_tatoeba_text_parquet()