DerivedFunction1 committed on
Commit
a42debc
·
1 Parent(s): b535ce8
Files changed (4) hide show
  1. README.md +14 -1
  2. app.py +110 -16
  3. data/fleurs/fleurs_text_only.parquet +3 -0
  4. requirements.txt +2 -0
README.md CHANGED
@@ -10,4 +10,17 @@ pinned: false
10
  short_description: 'Language Extractor: Polyglot Tagger'
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  short_description: 'Language Extractor: Polyglot Tagger'
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ ## Offline FLEURS cache
16
+
17
+ The demo can now pull examples from a local, text-only FLEURS parquet cache instead of relying on Tatoeba.
18
+
19
+ Build the cache once with:
20
+
21
+ ```bash
22
+ ./.venv/bin/python fleurs_cache.py
23
+ ```
24
+
25
+ That downloads the FLEURS TSV metadata, dedupes repeated sentences, drops unused columns, and writes a reusable lean parquet file at `data/fleurs/fleurs_text_only.parquet`.
26
+ Run it once while online; after that, the app reads only the local parquet and does not need the network.
app.py CHANGED
@@ -12,6 +12,7 @@ import pandas as pd
12
  import gradio as gr
13
  from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
14
 
 
15
  from language import ALL_LANGS, LANG_ISO2_TO_ISO3
16
  from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence_mix
17
 
@@ -152,11 +153,11 @@ def build_ui_state(
152
  }
153
 
154
 
155
- def build_tatoeba_validation(
156
  classifier_scores: dict[str, float],
157
  expected_langs: list[str],
158
  ) -> dict[str, Any]:
159
- """Compare derived scores against known Tatoeba source languages."""
160
  expected_langs = [lang for lang in expected_langs if lang]
161
  expected_set = set(expected_langs)
162
  top_lang = next(iter(classifier_scores), None)
@@ -181,8 +182,8 @@ def build_tatoeba_validation(
181
  }
182
 
183
 
184
- def render_tatoeba_validation_html(validation: dict[str, Any]) -> str:
185
- """Render a compact validation card for Tatoeba examples."""
186
  if not validation:
187
  return ""
188
 
@@ -199,7 +200,7 @@ def render_tatoeba_validation_html(validation: dict[str, Any]) -> str:
199
 
200
  return f"""
201
  <div class="validation-strip">
202
- <div class="validation-kicker">Tatoeba validation</div>
203
  <div class="validation-main">{validation_score:.1%}</div>
204
  <div class="validation-status {status_class}">{status_label}</div>
205
  <div class="validation-subtitle">
@@ -214,6 +215,19 @@ def render_tatoeba_validation_html(validation: dict[str, Any]) -> str:
214
  """
215
 
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  def render_prediction_summary(
218
  *,
219
  text: str,
@@ -453,7 +467,7 @@ def load_random_tatoeba_example() -> tuple[str, str, pd.DataFrame, dict[str, Any
453
  sentence = fetch_random_tatoeba_sentence()
454
  text = sentence["text"]
455
  summary, spans, raw, ui_state, _, *chip_updates = predict(text)
456
- validation = build_tatoeba_validation(
457
  raw.get("classifier_scores", {}),
458
  [sentence.get("lang_iso2", "")],
459
  )
@@ -466,7 +480,7 @@ def load_random_tatoeba_example() -> tuple[str, str, pd.DataFrame, dict[str, Any
466
  "sentence_lang_iso3": sentence.get("lang_iso3"),
467
  "tatoeba_validation": validation,
468
  }
469
- validation_html = render_tatoeba_validation_html(validation)
470
  summary = render_prediction_summary(
471
  text=text,
472
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
@@ -483,7 +497,7 @@ def load_random_tatoeba_mix_example() -> tuple[str, str, pd.DataFrame, dict[str,
483
  mix = fetch_random_tatoeba_sentence_mix()
484
  text = mix["text"]
485
  summary, spans, raw, ui_state, _, *chip_updates = predict(text)
486
- validation = build_tatoeba_validation(
487
  raw.get("classifier_scores", {}),
488
  mix.get("langs", []),
489
  )
@@ -496,7 +510,87 @@ def load_random_tatoeba_mix_example() -> tuple[str, str, pd.DataFrame, dict[str,
496
  "sentences": mix["sentences"],
497
  "tatoeba_validation": validation,
498
  }
499
- validation_html = render_tatoeba_validation_html(validation)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  summary = render_prediction_summary(
501
  text=text,
502
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
@@ -887,7 +981,7 @@ with gr.Blocks(title="Polyglot Tagger Studio") as demo:
887
  )
888
  validation_strip = gr.HTML()
889
  gr.Markdown(
890
- "Use the Tatoeba buttons for fresh examples, or paste your own text."
891
  )
892
  with gr.Row(elem_classes=["action-strip"]):
893
  with gr.Column(scale=1, min_width=0):
@@ -896,9 +990,9 @@ with gr.Blocks(title="Polyglot Tagger Studio") as demo:
896
  clear_btn = gr.Button("Clear", elem_classes=["action-btn", "action-clear"])
897
  with gr.Row(elem_classes=["action-strip", "action-stack"]):
898
  with gr.Column(scale=1, min_width=0):
899
- random_btn = gr.Button("Random sentence", elem_classes=["action-btn", "action-secondary"])
900
  with gr.Column(scale=1, min_width=0):
901
- random_mix_btn = gr.Button("Random mix", elem_classes=["action-btn", "action-secondary"])
902
  with gr.Column(scale=7):
903
  summary = gr.HTML()
904
  prediction_state = gr.State({})
@@ -929,16 +1023,16 @@ with gr.Blocks(title="Polyglot Tagger Studio") as demo:
929
  api_name="analyze",
930
  )
931
  random_btn.click(
932
- fn=load_random_tatoeba_example,
933
  inputs=None,
934
  outputs=[input_text, summary, spans, raw, prediction_state, validation_strip, chip_0, chip_1, chip_2, chip_3, chip_4, chip_5],
935
- api_name="random_tatoeba_sentence",
936
  )
937
  random_mix_btn.click(
938
- fn=load_random_tatoeba_mix_example,
939
  inputs=None,
940
  outputs=[input_text, summary, spans, raw, prediction_state, validation_strip, chip_0, chip_1, chip_2, chip_3, chip_4, chip_5],
941
- api_name="random_tatoeba_mix",
942
  )
943
  input_text.submit(
944
  fn=predict,
 
12
  import gradio as gr
13
  from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
14
 
15
+ from fleurs_cache import fetch_random_fleurs_sentence, fetch_random_fleurs_sentence_mix
16
  from language import ALL_LANGS, LANG_ISO2_TO_ISO3
17
  from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence_mix
18
 
 
153
  }
154
 
155
 
156
+ def build_example_validation(
157
  classifier_scores: dict[str, float],
158
  expected_langs: list[str],
159
  ) -> dict[str, Any]:
160
+ """Compare derived scores against known source languages."""
161
  expected_langs = [lang for lang in expected_langs if lang]
162
  expected_set = set(expected_langs)
163
  top_lang = next(iter(classifier_scores), None)
 
182
  }
183
 
184
 
185
+ def render_validation_html(validation: dict[str, Any], *, source_label: str) -> str:
186
+ """Render a compact validation card for a labeled example source."""
187
  if not validation:
188
  return ""
189
 
 
200
 
201
  return f"""
202
  <div class="validation-strip">
203
+ <div class="validation-kicker">{source_label} validation</div>
204
  <div class="validation-main">{validation_score:.1%}</div>
205
  <div class="validation-status {status_class}">{status_label}</div>
206
  <div class="validation-subtitle">
 
215
  """
216
 
217
 
218
+ def build_tatoeba_validation(
219
+ classifier_scores: dict[str, float],
220
+ expected_langs: list[str],
221
+ ) -> dict[str, Any]:
222
+ """Backward-compatible wrapper for existing Tatoeba callers."""
223
+ return build_example_validation(classifier_scores, expected_langs)
224
+
225
+
226
+ def render_tatoeba_validation_html(validation: dict[str, Any]) -> str:
227
+ """Backward-compatible wrapper for existing Tatoeba callers."""
228
+ return render_validation_html(validation, source_label="Tatoeba")
229
+
230
+
231
  def render_prediction_summary(
232
  *,
233
  text: str,
 
467
  sentence = fetch_random_tatoeba_sentence()
468
  text = sentence["text"]
469
  summary, spans, raw, ui_state, _, *chip_updates = predict(text)
470
+ validation = build_example_validation(
471
  raw.get("classifier_scores", {}),
472
  [sentence.get("lang_iso2", "")],
473
  )
 
480
  "sentence_lang_iso3": sentence.get("lang_iso3"),
481
  "tatoeba_validation": validation,
482
  }
483
+ validation_html = render_validation_html(validation, source_label="Tatoeba")
484
  summary = render_prediction_summary(
485
  text=text,
486
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
 
497
  mix = fetch_random_tatoeba_sentence_mix()
498
  text = mix["text"]
499
  summary, spans, raw, ui_state, _, *chip_updates = predict(text)
500
+ validation = build_example_validation(
501
  raw.get("classifier_scores", {}),
502
  mix.get("langs", []),
503
  )
 
510
  "sentences": mix["sentences"],
511
  "tatoeba_validation": validation,
512
  }
513
+ validation_html = render_validation_html(validation, source_label="Tatoeba")
514
+ summary = render_prediction_summary(
515
+ text=text,
516
+ selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
517
+ dominant_lang=ui_state.get("dominant_lang", raw.get("selected_lang", "")),
518
+ lang_stats=ui_state.get("lang_stats", {}),
519
+ classifier_scores=ui_state.get("classifier_scores", {}),
520
+ overall_confidence=float(ui_state.get("overall_confidence", 0.0)),
521
+ ignored_artifacts=int(ui_state.get("ignored_artifacts", 0)),
522
+ )
523
+ return text, summary, spans, raw, ui_state, validation_html, *chip_updates
524
+
525
+
526
+ def load_random_fleurs_example() -> tuple[str, str, pd.DataFrame, dict[str, Any], dict[str, Any], str]:
527
+ try:
528
+ sentence = fetch_random_fleurs_sentence()
529
+ except FileNotFoundError as exc:
530
+ empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
531
+ message = (
532
+ "<div class='empty-state'>"
533
+ f"{exc}"
534
+ "</div>"
535
+ )
536
+ return "", message, empty, {}, {}, "", *[gr.update(value="", visible=False) for _ in range(6)]
537
+ text = sentence["text"]
538
+ summary, spans, raw, ui_state, _, *chip_updates = predict(text)
539
+ validation = build_example_validation(
540
+ raw.get("classifier_scores", {}),
541
+ [sentence.get("lang_iso2", "")],
542
+ )
543
+ raw = {
544
+ **raw,
545
+ "source": "fleurs",
546
+ "fleurs_sentence_id": sentence.get("fleurs_id"),
547
+ "fleurs_split": sentence.get("split"),
548
+ "fleurs_source_lang": sentence.get("source_lang"),
549
+ "fleurs_model_lang": sentence.get("model_lang"),
550
+ "fleurs_language": sentence.get("language"),
551
+ "fleurs_lang_group": sentence.get("lang_group"),
552
+ "fleurs_validation": validation,
553
+ }
554
+ validation_html = render_validation_html(validation, source_label="FLEURS")
555
+ summary = render_prediction_summary(
556
+ text=text,
557
+ selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
558
+ dominant_lang=ui_state.get("dominant_lang", raw.get("selected_lang", "")),
559
+ lang_stats=ui_state.get("lang_stats", {}),
560
+ classifier_scores=ui_state.get("classifier_scores", {}),
561
+ overall_confidence=float(ui_state.get("overall_confidence", 0.0)),
562
+ ignored_artifacts=int(ui_state.get("ignored_artifacts", 0)),
563
+ )
564
+ return text, summary, spans, raw, ui_state, validation_html, *chip_updates
565
+
566
+
567
+ def load_random_fleurs_mix_example() -> tuple[str, str, pd.DataFrame, dict[str, Any], dict[str, Any], str]:
568
+ try:
569
+ mix = fetch_random_fleurs_sentence_mix()
570
+ except FileNotFoundError as exc:
571
+ empty = pd.DataFrame(columns=["token", "language", "score", "start", "end"])
572
+ message = (
573
+ "<div class='empty-state'>"
574
+ f"{exc}"
575
+ "</div>"
576
+ )
577
+ return "", message, empty, {}, {}, "", *[gr.update(value="", visible=False) for _ in range(6)]
578
+ text = mix["text"]
579
+ summary, spans, raw, ui_state, _, *chip_updates = predict(text)
580
+ validation = build_example_validation(
581
+ raw.get("classifier_scores", {}),
582
+ mix.get("langs", []),
583
+ )
584
+ raw = {
585
+ **raw,
586
+ "source": "fleurs-mix",
587
+ "lang_count": mix["lang_count"],
588
+ "sentence_langs": mix["langs"],
589
+ "sentence_lang_iso3s": mix["lang_iso3s"],
590
+ "sentences": mix["sentences"],
591
+ "fleurs_validation": validation,
592
+ }
593
+ validation_html = render_validation_html(validation, source_label="FLEURS")
594
  summary = render_prediction_summary(
595
  text=text,
596
  selected_lang=ui_state.get("selected_lang", raw.get("selected_lang", "")),
 
981
  )
982
  validation_strip = gr.HTML()
983
  gr.Markdown(
984
+ "Use the FLEURS buttons for fresh examples, or paste your own text."
985
  )
986
  with gr.Row(elem_classes=["action-strip"]):
987
  with gr.Column(scale=1, min_width=0):
 
990
  clear_btn = gr.Button("Clear", elem_classes=["action-btn", "action-clear"])
991
  with gr.Row(elem_classes=["action-strip", "action-stack"]):
992
  with gr.Column(scale=1, min_width=0):
993
+ random_btn = gr.Button("Random FLEURS sentence", elem_classes=["action-btn", "action-secondary"])
994
  with gr.Column(scale=1, min_width=0):
995
+ random_mix_btn = gr.Button("Random FLEURS mix", elem_classes=["action-btn", "action-secondary"])
996
  with gr.Column(scale=7):
997
  summary = gr.HTML()
998
  prediction_state = gr.State({})
 
1023
  api_name="analyze",
1024
  )
1025
  random_btn.click(
1026
+ fn=load_random_fleurs_example,
1027
  inputs=None,
1028
  outputs=[input_text, summary, spans, raw, prediction_state, validation_strip, chip_0, chip_1, chip_2, chip_3, chip_4, chip_5],
1029
+ api_name="random_fleurs_sentence",
1030
  )
1031
  random_mix_btn.click(
1032
+ fn=load_random_fleurs_mix_example,
1033
  inputs=None,
1034
  outputs=[input_text, summary, spans, raw, prediction_state, validation_strip, chip_0, chip_1, chip_2, chip_3, chip_4, chip_5],
1035
+ api_name="random_fleurs_mix",
1036
  )
1037
  input_text.submit(
1038
  fn=predict,
data/fleurs/fleurs_text_only.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f348bba789b3a7f051f586b6424ba445eb122be69d91f2b8a23db4c3bafae02
3
+ size 20278131
requirements.txt CHANGED
@@ -2,3 +2,5 @@ torch
2
  transformers
3
  gradio
4
  pandas
 
 
 
2
  transformers
3
  gradio
4
  pandas
5
+ datasets
6
+ pyarrow