Spaces:
Sleeping
Sleeping
File size: 8,131 Bytes
ff7e90d 190c27b ff7e90d f5fc858 190c27b f5fc858 ff7e90d f5fc858 ff7e90d f5fc858 ff7e90d f5fc858 ff7e90d 190c27b ff7e90d 695d464 ff7e90d 190c27b f5fc858 190c27b ff7e90d 190c27b ff7e90d 190c27b f5fc858 190c27b ff7e90d 190c27b ff7e90d 190c27b ff7e90d 190c27b ff7e90d 190c27b ff7e90d 190c27b ff7e90d 695d464 ff7e90d 190c27b ff7e90d afde0ac 190c27b ff7e90d 190c27b ff7e90d 190c27b ff7e90d 190c27b ff7e90d 190c27b 41efb11 ff7e90d 695d464 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 | """
ModernBERTić Large - HF Space demo
Click any word in BCMS text to mask it and see what the model predicts.
"""
import os
import re
import string
import gradio as gr
import spaces
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForMaskedLM
MODEL_NAME = "permitt/galton-modernbertic-large"
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None:
raise RuntimeError(
"HF_TOKEN secret not set. Add it under Space Settings -> Variables and secrets."
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model = AutoModelForMaskedLM.from_pretrained(
MODEL_NAME, dtype=torch.bfloat16, token=HF_TOKEN
).eval()
OUR_MASK = tokenizer.mask_token
EXAMPLES = {
"Psihometrija (long context)": (
"Psihometrija je teorijska disciplina, utemeljena na statistici, "
"koja proučava mogućnosti, zakone i principe merenja psiholoških pojava, "
"konstrukcijom, standardizacijom i evaluacijom testova i drugih psiholoških "
"mernih instrumenata, kao i statističkim problemima empirijskih istraživanja. "
"Psihometrija je oblast psihologije koja se bavi teorijom i tehnikom merenja. "
"Psihometrija se generalno odnosi na specijalizovane oblasti u okviru "
"psihologije i obrazovanja posvećene testiranju, merenju, proceni i srodnim "
"aktivnostima. Psihometrija se bavi objektivnim merenjem latentnih "
"konstrukata koji se ne mogu direktno posmatrati. Primeri latentnih "
"konstrukcija uključuju inteligenciju, introvertnost, mentalne poremećaje "
"i obrazovna postignuća. Nivoi sposobnosti na neuočljivim latentnim "
"varijablama zaključuju se putem matematičkog modeliranja na osnovu onoga "
"što se posmatra iz odgovora pojedinaca na stavke na testovima i skalama."
),
"Crna Gora (geography)": (
"Crna Gora je država na jugoistoku Evrope. Glavni grad je Podgorica, "
"a istorijska prestonica je Cetinje. Crna Gora se graniči sa Hrvatskom, "
"Bosnom i Hercegovinom, Srbijom, Kosovom i Albanijom. Skadarsko jezero, "
"koje deli sa Albanijom, najveće je jezero na Balkanu."
),
"Ivo Andrić (literature)": (
"Ivo Andrić je bio jugoslovenski književnik i diplomata, rođen u Travniku "
"u centralnoj Bosni 1892. godine. Najpoznatiji je po romanu Na Drini "
"ćuprija, za koji je 1961. godine dobio Nobelovu nagradu za književnost. "
"Smatra se jednim od najvećih pisaca južnoslovenskih književnosti "
"dvadesetog veka."
),
}
DEFAULT_EXAMPLE = "Psihometrija (long context)"
def split_text(text: str):
"""Split text into a list of tokens, preserving whitespace runs as separate items."""
return [p for p in re.split(r"(\s+)", text) if p]
def make_clickable(text: str, masked_index: int | None = None):
"""Build the value list for HighlightedText: each word gets the 'word' label,
whitespace stays unlabeled, and a single position can be rendered as 'mask'."""
if not text:
return []
tokens = split_text(text)
out = []
for i, t in enumerate(tokens):
if not t.strip():
out.append((t, None))
elif i == masked_index:
out.append((OUR_MASK, "mask"))
else:
out.append((t, "word"))
return out
def strip_edge_punct(word: str):
"""Return (leading, core, trailing) so we can mask only the alphabetic core
of a token like 'Beograd.' and keep the period in place."""
leading = ""
while word and word[0] in string.punctuation:
leading += word[0]
word = word[1:]
trailing = ""
while word and word[-1] in string.punctuation:
trailing = word[-1] + trailing
word = word[:-1]
return leading, word, trailing
@spaces.GPU
@torch.inference_mode()
def _predict(text: str, top_k: int = 5):
mdl = model.to("cuda")
inputs = tokenizer(
text, return_tensors="pt", truncation=True, max_length=8192
).to("cuda")
mask_id = tokenizer.mask_token_id
pos = (inputs.input_ids == mask_id).nonzero(as_tuple=True)
if len(pos[1]) == 0:
return [("(no mask token)", 0.0)]
logits = mdl(**inputs).logits
mask_logits = logits[0, pos[1][0]]
probs = F.softmax(mask_logits.float(), dim=-1)
# Over-fetch so we can drop pure-punctuation predictions and still return top_k.
top_probs, top_ids = probs.topk(top_k * 5)
raw = [
(tokenizer.decode([tid]).strip(), float(p))
for tid, p in zip(top_ids, top_probs)
]
filtered = [
(w, p) for w, p in raw
if w and not all(c in string.punctuation for c in w)
]
return (filtered or raw)[:top_k]
def on_word_click(text: str, evt: gr.SelectData):
if evt.value is None or not str(evt.value).strip():
return gr.update(), gr.update()
tokens = split_text(text)
if evt.index >= len(tokens):
return gr.update(), "Click registered out of range. Edit the text and try again."
target = tokens[evt.index]
if not target.strip():
return gr.update(), gr.update()
leading, core, trailing = strip_edge_punct(target)
if not core:
return gr.update(), gr.update()
masked_tokens = list(tokens)
masked_tokens[evt.index] = leading + OUR_MASK + trailing
masked_text = "".join(masked_tokens)
preds = _predict(masked_text)
lines = [
f"### Masked word: `{core}`",
"",
"| Rank | Token | Probability |",
"|------|-------|-------------|",
]
for i, (w, p) in enumerate(preds, 1):
marker = " ← original" if w.lower() == core.lower() else ""
lines.append(f"| {i} | `{w}`{marker} | {p:.3f} |")
return make_clickable(text, evt.index), "\n".join(lines)
def reset_on_change(text):
return (
make_clickable(text),
"*Click any word above to mask it and see predictions.*",
)
CSS = """
.clickable-text .textspan {
cursor: pointer;
transition: opacity 0.15s;
}
.clickable-text .textspan:hover {
opacity: 0.55;
}
"""
with gr.Blocks(title="ModernBERTić Large") as demo:
gr.Markdown(
"""
# ModernBERTić Large
First ModernBERT-style encoder for **Bosnian / Croatian / Montenegrin / Serbian**.
Pretrained on ~60B tokens with **8192-token context window**.
**How to use:** pick an example below or paste your own BCMS text, then **click any word** in the highlighted view to mask it. The model will predict what fits.
"""
)
with gr.Row():
example_buttons = [gr.Button(label, size="sm") for label in EXAMPLES]
inp = gr.Textbox(
label="Input text (paste anything in BCMS)",
value=EXAMPLES[DEFAULT_EXAMPLE],
lines=8,
)
clickable = gr.HighlightedText(
label="Click any word to mask it",
value=make_clickable(EXAMPLES[DEFAULT_EXAMPLE]),
color_map={
"word": "rgba(99, 102, 241, 0.18)",
"mask": "rgba(239, 68, 68, 0.55)",
},
show_legend=False,
combine_adjacent=False,
elem_classes=["clickable-text"],
)
output = gr.Markdown(
value="*Click any word above to mask it and see predictions.*"
)
for label, btn in zip(EXAMPLES.keys(), example_buttons):
btn.click(
fn=lambda l=label: (
EXAMPLES[l],
make_clickable(EXAMPLES[l]),
"*Click any word above to mask it and see predictions.*",
),
outputs=[inp, clickable, output],
)
inp.change(reset_on_change, inp, [clickable, output])
clickable.select(on_word_click, inp, [clickable, output])
gr.Markdown(
"---\n"
"Trained on EuroHPC Leonardo (64× A100) at Recrewty. https://recrewty.com \n"
"You can find the results at SuperGLUE-SR results: https://balkanbench.com/leaderboard. \n"
"Link to blogposts and release: https://permitt.io. \n"
)
if __name__ == "__main__":
demo.queue().launch(ssr_mode=False, css=CSS)
|