Spaces:

permitt
/

modernbertic-demo

Sleeping

App Files Files Community

permitt committed 24 days ago

Commit

ff7e90d

1 Parent(s): 3afcada

feat: demo app

Browse files

Files changed (2) hide show

app.py +120 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,120 @@

+"""
+ModernBERTić Large - HF Space demo
+Three tabs: fill-mask, side-by-side vs BERTić, long-context fill-mask.
+"""
+import gradio as gr
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+MODEL_NAME = "permitt/galton-modernbertic-large"
+BASELINE_NAME = "classla/bcms-bertic"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.bfloat16 if device == "cuda" else torch.float32
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME, torch_dtype=dtype).to(device).eval()
+baseline_tokenizer = AutoTokenizer.from_pretrained(BASELINE_NAME)
+baseline_model = AutoModelForMaskedLM.from_pretrained(BASELINE_NAME).to(device).eval()
+OUR_MASK = tokenizer.mask_token
+THEIR_MASK = baseline_tokenizer.mask_token
+@torch.inference_mode()
+def fill_mask(text: str, tok, mdl, top_k: int = 5):
+    inputs = tok(text, return_tensors="pt", truncation=True, max_length=8192).to(device)
+    mask_id = tok.mask_token_id
+    pos = (inputs.input_ids == mask_id).nonzero(as_tuple=True)
+    if len(pos[1]) == 0:
+        return [("(no mask token in input)", 0.0)]
+    logits = mdl(**inputs).logits
+    mask_logits = logits[0, pos[1][0]]
+    probs = F.softmax(mask_logits.float(), dim=-1)
+    top_probs, top_ids = probs.topk(top_k)
+    return [(tok.decode([tid]).strip(), float(p)) for tid, p in zip(top_ids, top_probs)]
+def fmt(preds):
+    return "\n".join(f"{w:<20}  {p:.3f}" for w, p in preds)
+def predict_ours(text):
+    return fmt(fill_mask(text, tokenizer, model))
+def predict_compare(text):
+    ours = fill_mask(text, tokenizer, model)
+    bertic_text = text.replace(OUR_MASK, THEIR_MASK)
+    theirs = fill_mask(bertic_text, baseline_tokenizer, baseline_model)
+    return fmt(ours), fmt(theirs)
+with gr.Blocks(title="ModernBERTić Large", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        f"""
+        # ModernBERTić Large
+        First ModernBERT-style encoder for **Bosnian / Croatian / Montenegrin / Serbian**.
+        Pretrained on ~66B tokens with 8192 context window. Use `{OUR_MASK}` as the mask token.
+        """
+    )
+    with gr.Tab("Fill mask"):
+        inp = gr.Textbox(
+            label="Input",
+            value=f"Glavni grad Crne Gore je {OUR_MASK}.",
+            lines=2,
+        )
+        btn = gr.Button("Predict", variant="primary")
+        out = gr.Textbox(label="Top-5 predictions", lines=6)
+        gr.Examples(
+            examples=[
+                f"Glavni grad Srbije je {OUR_MASK}.",
+                f"Najveći grad u Hrvatskoj je {OUR_MASK}.",
+                f"Pisac romana 'Na Drini ćuprija' je {OUR_MASK} Andrić.",
+                f"Главни град Србије је {OUR_MASK}.",  # cyrillic
+            ],
+            inputs=inp,
+        )
+        btn.click(predict_ours, inp, out)
+    with gr.Tab("vs BERTić"):
+        gr.Markdown("Same input, both models. ModernBERTić-large vs `classla/bcms-bertic`.")
+        inp2 = gr.Textbox(
+            label="Input",
+            value=f"Najveće jezero u Crnoj Gori je {OUR_MASK} jezero.",
+            lines=2,
+        )
+        btn2 = gr.Button("Compare", variant="primary")
+        with gr.Row():
+            out_ours = gr.Textbox(label="ModernBERTić-large (ours)", lines=6)
+            out_theirs = gr.Textbox(label="BERTić (Ljubešić et al.)", lines=6)
+        btn2.click(predict_compare, inp2, [out_ours, out_theirs])
+    with gr.Tab("Long context (8192)"):
+        gr.Markdown(
+            "Paste a long passage with one mask token deep in the text. "
+            "BERTić truncates at 512 tokens. ModernBERTić handles up to 8192."
+        )
+        inp3 = gr.Textbox(
+            label="Long input",
+            lines=15,
+            placeholder=f"Paste a Wikipedia paragraph and place {OUR_MASK} somewhere late in the text...",
+        )
+        btn3 = gr.Button("Predict", variant="primary")
+        out3 = gr.Textbox(label="Top-5 predictions", lines=6)
+        btn3.click(predict_ours, inp3, out3)
+    gr.Markdown(
+        """
+        ---
+        Trained on EuroHPC Leonardo (64× A100). Paper, checkpoints and SuperGLUE-SR results: [link].
+        """
+    )
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch
+transformers>=4.48
+gradio>=4.0
+spaces