Spaces:

polyglot-tagger
/

language-extractor-demo

Running

App Files Community

DerivedFunction1 committed 4 days ago

Commit

d9a3362

1 Parent(s): 92e6087

update

Browse files

Files changed (3) hide show

app.py +97 -4
init_venv.py +2 -1
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -12,6 +12,8 @@ from typing import Any
 import pandas as pd
 import gradio as gr
 import pycountry
 from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
 from fleurs_cache import fetch_random_fleurs_sentence, fetch_random_fleurs_sentence_mix
@@ -20,6 +22,8 @@ from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence
 MODEL_CHECKPOINT = "DerivedFunction/polyglot-tagger-v2"
 MIN_ARTIFACT_SPAN_CHARS = 4
 MIN_ARTIFACT_CONFIDENCE = 0.5
 ARTIFACT_SPAN_WEIGHT = 0.35
@@ -38,6 +42,13 @@ def get_pipeline():
     )
 def normalize_label(label: str) -> str:
     if label.startswith(("B-", "I-")):
         label = label[2:]
@@ -115,17 +126,41 @@ def make_lang_chip_label(lang: str, stat: dict[str, float | int], score: float)
 def build_chip_button_updates(
     ranked: list[tuple[str, dict[str, float | int]]],
     classifier_scores: dict[str, float],
     max_chips: int = 6,
 ) -> list[dict[str, Any]]:
     """Return button updates for the top-ranked languages."""
     updates: list[dict[str, Any]] = []
     for idx in range(max_chips):
-        if idx < len(ranked):
-            lang, stat = ranked[idx]
             updates.append(
                 gr.update(
-                    value=make_lang_chip_label(lang, stat, classifier_scores.get(lang, 0.0)),
                     visible=True,
                 )
             )
         else:
@@ -157,6 +192,7 @@ def build_ui_state(
 def build_example_validation(
     classifier_scores: dict[str, float],
     expected_langs: list[str],
 ) -> dict[str, Any]:
     """Compare derived scores against known source languages."""
@@ -175,10 +211,28 @@ def build_example_validation(
     precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) else 0.0
     recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) else 0.0
     validation_score = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
     return {
         "expected_langs": expected_langs,
         "predicted_langs": predicted_langs,
         "top_lang": top_lang,
         "top_score": top_score,
         "true_positive": true_positive,
@@ -190,6 +244,12 @@ def build_example_validation(
         "recall": recall,
         "top_match": false_positive == 0 and false_negative == 0,
         "validation_score": validation_score,
     }
@@ -208,6 +268,9 @@ def render_validation_html(validation: dict[str, Any], *, source_label: str) ->
     validation_score = float(validation.get("validation_score", 0.0))
     precision = float(validation.get("precision", 0.0))
     recall = float(validation.get("recall", 0.0))
     top_match = bool(validation.get("top_match"))
     status_label = "Match" if top_match else "Mismatch"
     status_class = "validation-pass" if top_match else "validation-warn"
@@ -226,6 +289,7 @@ def render_validation_html(validation: dict[str, Any], *, source_label: str) ->
         <div class="validation-subtitle">
         expected: {expected_langs}
         | predicted: {predicted_langs}
         | top: {top_lang.upper()}
         | top score: {top_score:.1%}
         | tp: {true_positive}
@@ -233,6 +297,8 @@ def render_validation_html(validation: dict[str, Any], *, source_label: str) ->
         | fn: {false_negative}
         | precision: {precision:.1%}
         | recall: {recall:.1%}
       </div>
     </div>
     """
@@ -288,6 +354,25 @@ def render_language_reference_html() -> str:
     """
 def fetch_random_cached_sentence() -> dict[str, Any]:
     """Randomly sample a sentence from either cached source."""
     if random.random() < 0.5:
@@ -444,6 +529,8 @@ def predict(text: str) -> tuple[str, pd.DataFrame, dict[str, Any], dict[str, Any
     nlp = get_pipeline()
     entities = nlp(text)
     rows: list[dict[str, Any]] = []
     token_counts: Counter[str] = Counter()
@@ -531,9 +618,11 @@ def predict(text: str) -> tuple[str, pd.DataFrame, dict[str, Any], dict[str, Any
         "entities": entities,
         "selected_lang": dominant_lang,
         "ranked_langs": [lang for lang, _ in ranked],
         "text": text,
     }
-    chip_updates = build_chip_button_updates(ranked, classifier_scores) if lang_stats else [gr.update(value="", visible=False) for _ in range(6)]
     return summary, spans, raw, ui_state, "", *chip_updates
@@ -543,6 +632,7 @@ def load_random_tatoeba_example() -> tuple[str, str, pd.DataFrame, dict[str, Any
     summary, spans, raw, ui_state, _, *chip_updates = predict(text)
     validation = build_example_validation(
         raw.get("classifier_scores", {}),
         [sentence.get("lang_iso2", "")],
     )
     raw = {
@@ -573,6 +663,7 @@ def load_random_tatoeba_mix_example() -> tuple[str, str, pd.DataFrame, dict[str,
     summary, spans, raw, ui_state, _, *chip_updates = predict(text)
     validation = build_example_validation(
         raw.get("classifier_scores", {}),
         mix.get("langs", []),
     )
     raw = {
@@ -612,6 +703,7 @@ def load_random_fleurs_example() -> tuple[str, str, pd.DataFrame, dict[str, Any]
     summary, spans, raw, ui_state, _, *chip_updates = predict(text)
     validation = build_example_validation(
         raw.get("classifier_scores", {}),
         [sentence.get("lang_iso2", "")],
     )
     raw = {
@@ -653,6 +745,7 @@ def load_random_fleurs_mix_example() -> tuple[str, str, pd.DataFrame, dict[str,
     summary, spans, raw, ui_state, _, *chip_updates = predict(text)
     validation = build_example_validation(
         raw.get("classifier_scores", {}),
         mix.get("langs", []),
     )
     raw = {

 import pandas as pd
 import gradio as gr
 import pycountry
+import fasttext
+from huggingface_hub import hf_hub_download
 from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
 from fleurs_cache import fetch_random_fleurs_sentence, fetch_random_fleurs_sentence_mix
 MODEL_CHECKPOINT = "DerivedFunction/polyglot-tagger-v2"
+FASTTEXT_MODEL_REPO = "facebook/fasttext-language-identification"
+FASTTEXT_MODEL_FILENAME = "model.bin"
 MIN_ARTIFACT_SPAN_CHARS = 4
 MIN_ARTIFACT_CONFIDENCE = 0.5
 ARTIFACT_SPAN_WEIGHT = 0.35
     )
+@lru_cache(maxsize=1)
+def get_fasttext_model():
+    """Load the reference fastText language ID model once."""
+    model_path = hf_hub_download(repo_id=FASTTEXT_MODEL_REPO, filename=FASTTEXT_MODEL_FILENAME)
+    return fasttext.load_model(model_path)
 def normalize_label(label: str) -> str:
     if label.startswith(("B-", "I-")):
         label = label[2:]
 def build_chip_button_updates(
     ranked: list[tuple[str, dict[str, float | int]]],
     classifier_scores: dict[str, float],
+    fasttext_scores: dict[str, float] | None = None,
     max_chips: int = 6,
 ) -> list[dict[str, Any]]:
     """Return button updates for the top-ranked languages."""
+    fasttext_scores = fasttext_scores or {}
+    fasttext_ranked = sorted(fasttext_scores.items(), key=lambda item: item[1], reverse=True)
+    fasttext_rank = {lang: idx for idx, (lang, _) in enumerate(fasttext_ranked)}
+    model_ranked = [lang for lang, _ in ranked]
+    union_langs = sorted(
+        set(model_ranked) | set(fasttext_scores.keys()),
+        key=lambda lang: max(classifier_scores.get(lang, 0.0), fasttext_scores.get(lang, 0.0)),
+        reverse=True,
+    )
     updates: list[dict[str, Any]] = []
     for idx in range(max_chips):
+        if idx < len(union_langs):
+            lang = union_langs[idx]
+            model_score = classifier_scores.get(lang, 0.0)
+            fast_score = fasttext_scores.get(lang, 0.0)
+            in_fasttext = lang in fasttext_scores
+            in_model = model_score > 0.0
+            if in_model and in_fasttext:
+                variant = "primary"
+            elif in_fasttext:
+                variant = "secondary"
+            else:
+                variant = "stop"
+            source_tag = "both" if in_model and in_fasttext else ("ft" if in_fasttext else "model")
+            fast_rank = fasttext_rank.get(lang)
+            fast_rank_text = f" #{fast_rank + 1}" if fast_rank is not None else ""
             updates.append(
                 gr.update(
+                    value=f"{lang.upper()} M {model_score:.0%} | FT {fast_score:.0%}{fast_rank_text}",
                     visible=True,
+                    variant=variant,
                 )
             )
         else:
 def build_example_validation(
     classifier_scores: dict[str, float],
+    reference_scores: dict[str, float] | None,
     expected_langs: list[str],
 ) -> dict[str, Any]:
     """Compare derived scores against known source languages."""
     precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) else 0.0
     recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) else 0.0
     validation_score = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
+    reference_scores = reference_scores or {}
+    reference_predicted = sorted(
+        (lang for lang, score in reference_scores.items() if score > 0.0),
+        key=lambda lang: reference_scores.get(lang, 0.0),
+        reverse=True,
+    )
+    reference_set = set(reference_predicted)
+    reference_tp = len(expected_set & reference_set)
+    reference_fp = len(reference_set - expected_set)
+    reference_fn = len(expected_set - reference_set)
+    reference_precision = reference_tp / (reference_tp + reference_fp) if (reference_tp + reference_fp) else 0.0
+    reference_recall = reference_tp / (reference_tp + reference_fn) if (reference_tp + reference_fn) else 0.0
+    reference_score = (
+        2 * reference_precision * reference_recall / (reference_precision + reference_recall)
+        if (reference_precision + reference_recall)
+        else 0.0
+    )
     return {
         "expected_langs": expected_langs,
         "predicted_langs": predicted_langs,
+        "reference_langs": reference_predicted,
         "top_lang": top_lang,
         "top_score": top_score,
         "true_positive": true_positive,
         "recall": recall,
         "top_match": false_positive == 0 and false_negative == 0,
         "validation_score": validation_score,
+        "reference_true_positive": reference_tp,
+        "reference_false_positive": reference_fp,
+        "reference_false_negative": reference_fn,
+        "reference_precision": reference_precision,
+        "reference_recall": reference_recall,
+        "reference_score": reference_score,
     }
     validation_score = float(validation.get("validation_score", 0.0))
     precision = float(validation.get("precision", 0.0))
     recall = float(validation.get("recall", 0.0))
+    reference_score = float(validation.get("reference_score", 0.0))
+    reference_precision = float(validation.get("reference_precision", 0.0))
+    reference_recall = float(validation.get("reference_recall", 0.0))
     top_match = bool(validation.get("top_match"))
     status_label = "Match" if top_match else "Mismatch"
     status_class = "validation-pass" if top_match else "validation-warn"
         <div class="validation-subtitle">
         expected: {expected_langs}
         | predicted: {predicted_langs}
+        | vs: {reference_score:.1%}
         | top: {top_lang.upper()}
         | top score: {top_score:.1%}
         | tp: {true_positive}
         | fn: {false_negative}
         | precision: {precision:.1%}
         | recall: {recall:.1%}
+        | ref precision: {reference_precision:.1%}
+        | ref recall: {reference_recall:.1%}
       </div>
     </div>
     """
     """
+def predict_fasttext(text: str, k: int = 5) -> dict[str, Any]:
+    """Return fastText language predictions for comparison."""
+    model = get_fasttext_model()
+    labels, scores = model.predict(text, k=k)
+    predictions = [
+        {
+            "lang": label.removeprefix("__label__"),
+            "score": float(score),
+        }
+        for label, score in zip(labels, scores)
+    ]
+    return {
+        "model": FASTTEXT_MODEL_REPO,
+        "predictions": predictions,
+        "top_lang": predictions[0]["lang"] if predictions else None,
+        "top_score": predictions[0]["score"] if predictions else 0.0,
+    }
 def fetch_random_cached_sentence() -> dict[str, Any]:
     """Randomly sample a sentence from either cached source."""
     if random.random() < 0.5:
     nlp = get_pipeline()
     entities = nlp(text)
+    fasttext_result = predict_fasttext(text)
+    fasttext_scores = {item["lang"]: item["score"] for item in fasttext_result.get("predictions", [])}
     rows: list[dict[str, Any]] = []
     token_counts: Counter[str] = Counter()
         "entities": entities,
         "selected_lang": dominant_lang,
         "ranked_langs": [lang for lang, _ in ranked],
+        "fasttext": fasttext_result,
+        "fasttext_scores": fasttext_scores,
         "text": text,
     }
+    chip_updates = build_chip_button_updates(ranked, classifier_scores, fasttext_scores) if lang_stats else [gr.update(value="", visible=False) for _ in range(6)]
     return summary, spans, raw, ui_state, "", *chip_updates
     summary, spans, raw, ui_state, _, *chip_updates = predict(text)
     validation = build_example_validation(
         raw.get("classifier_scores", {}),
+        raw.get("fasttext_scores", {}),
         [sentence.get("lang_iso2", "")],
     )
     raw = {
     summary, spans, raw, ui_state, _, *chip_updates = predict(text)
     validation = build_example_validation(
         raw.get("classifier_scores", {}),
+        raw.get("fasttext_scores", {}),
         mix.get("langs", []),
     )
     raw = {
     summary, spans, raw, ui_state, _, *chip_updates = predict(text)
     validation = build_example_validation(
         raw.get("classifier_scores", {}),
+        raw.get("fasttext_scores", {}),
         [sentence.get("lang_iso2", "")],
     )
     raw = {
     summary, spans, raw, ui_state, _, *chip_updates = predict(text)
     validation = build_example_validation(
         raw.get("classifier_scores", {}),
+        raw.get("fasttext_scores", {}),
         mix.get("langs", []),
     )
     raw = {

init_venv.py CHANGED Viewed

@@ -38,7 +38,8 @@ BASE_PACKAGES = [
 CUSTOM_PACKAGES = [
    "gradio",
-   "pycountry"
 ]
 # Packages for the classification server

 CUSTOM_PACKAGES = [
    "gradio",
+   "pycountry",
+   "fasttext",
 ]
 # Packages for the classification server

requirements.txt CHANGED Viewed

@@ -5,3 +5,5 @@ pandas
 datasets
 pyarrow
 pycountry

 datasets
 pyarrow
 pycountry
+fasttext
+huggingface_hub