Spaces:
Running on Zero
Running on Zero
Misc improvements
#1
by merve HF Staff - opened
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import dataclasses
|
| 2 |
import functools
|
|
|
|
| 3 |
import json
|
| 4 |
import math
|
| 5 |
import os
|
|
@@ -52,6 +53,7 @@ REQUIRED_MODEL_CONFIG_KEYS: Final[tuple[str, ...]] = (
|
|
| 52 |
BACKGROUND_CLASS_LABEL: Final[str] = "O"
|
| 53 |
BOUNDARY_PREFIXES: Final[tuple[str, ...]] = ("B", "I", "E", "S")
|
| 54 |
EMPTY_HIGHLIGHT_PAYLOAD = {"text": "", "entities": []}
|
|
|
|
| 55 |
SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
|
| 56 |
BACKGROUND_CLASS_LABEL,
|
| 57 |
"account_number",
|
|
@@ -63,6 +65,16 @@ SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
|
|
| 63 |
"private_url",
|
| 64 |
"secret",
|
| 65 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
NER_CLASS_NAMES: Final[tuple[str, ...]] = (BACKGROUND_CLASS_LABEL,) + tuple(
|
| 67 |
f"{prefix}-{base_label}"
|
| 68 |
for base_label in SPAN_CLASS_NAMES
|
|
@@ -80,6 +92,14 @@ VITERBI_TRANSITION_BIAS_KEYS: Final[tuple[str, ...]] = (
|
|
| 80 |
DEFAULT_VITERBI_CALIBRATION_PRESET: Final[str] = "default"
|
| 81 |
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
def validate_model_config_contract(
|
| 84 |
checkpoint_config: dict[str, object],
|
| 85 |
*,
|
|
@@ -1223,6 +1243,72 @@ def predict(text: str) -> dict[str, object]:
|
|
| 1223 |
}
|
| 1224 |
|
| 1225 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1226 |
def build_demo() -> gr.Blocks:
|
| 1227 |
config_path = MODEL_DIR / "config.json"
|
| 1228 |
checkpoint_config = json.loads(config_path.read_text(encoding="utf-8"))
|
|
@@ -1248,67 +1334,98 @@ def build_demo() -> gr.Blocks:
|
|
| 1248 |
"#808080",
|
| 1249 |
)
|
| 1250 |
with gr.Blocks(
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
|
|
|
|
|
|
|
|
|
| 1254 |
) as demo:
|
| 1255 |
gr.Markdown("# OpenAI Privacy Filter Demo")
|
| 1256 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1257 |
|
| 1258 |
with gr.Column(variant="panel"):
|
| 1259 |
-
gr.Markdown("Input text:")
|
| 1260 |
input_text = gr.Textbox(
|
| 1261 |
-
|
| 1262 |
-
|
| 1263 |
-
|
| 1264 |
-
|
|
|
|
|
|
|
|
|
|
| 1265 |
)
|
|
|
|
|
|
|
|
|
|
| 1266 |
|
| 1267 |
with gr.Column(variant="panel"):
|
| 1268 |
-
gr.Markdown("Text after masking personal identifiers:")
|
| 1269 |
output_text = gr.HighlightedText(
|
| 1270 |
-
|
| 1271 |
-
|
| 1272 |
-
label
|
| 1273 |
-
|
| 1274 |
-
|
| 1275 |
-
|
| 1276 |
-
|
| 1277 |
-
|
| 1278 |
-
|
| 1279 |
-
|
| 1280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1281 |
)
|
| 1282 |
-
|
| 1283 |
-
with gr.Row():
|
| 1284 |
-
submit_button = gr.Button("Submit", variant="primary")
|
| 1285 |
-
clear_button = gr.Button("Clear")
|
| 1286 |
-
|
| 1287 |
submit_button.click(
|
| 1288 |
-
fn=
|
| 1289 |
inputs=input_text,
|
| 1290 |
-
outputs=output_text,
|
| 1291 |
-
api_name="
|
| 1292 |
)
|
| 1293 |
input_text.submit(
|
| 1294 |
-
fn=
|
| 1295 |
inputs=input_text,
|
| 1296 |
-
outputs=output_text,
|
| 1297 |
)
|
| 1298 |
clear_button.click(
|
| 1299 |
-
lambda: ("", EMPTY_HIGHLIGHT_PAYLOAD),
|
| 1300 |
-
outputs=[input_text, output_text],
|
| 1301 |
-
show_progress="hidden",
|
| 1302 |
)
|
| 1303 |
|
|
|
|
| 1304 |
gr.Examples(
|
| 1305 |
examples=[
|
| 1306 |
["Alice was born on 1990-01-02 and lives at 1 Main St."],
|
| 1307 |
["Email me at alice@example.com or call 415-555-0101."],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1308 |
],
|
| 1309 |
inputs=input_text,
|
| 1310 |
-
outputs=output_text,
|
| 1311 |
-
fn=
|
| 1312 |
cache_examples=False,
|
| 1313 |
)
|
| 1314 |
return demo
|
|
@@ -1316,4 +1433,4 @@ def build_demo() -> gr.Blocks:
|
|
| 1316 |
|
| 1317 |
if __name__ == "__main__":
|
| 1318 |
demo = build_demo()
|
| 1319 |
-
demo.launch()
|
|
|
|
| 1 |
import dataclasses
|
| 2 |
import functools
|
| 3 |
+
import inspect
|
| 4 |
import json
|
| 5 |
import math
|
| 6 |
import os
|
|
|
|
| 53 |
BACKGROUND_CLASS_LABEL: Final[str] = "O"
|
| 54 |
BOUNDARY_PREFIXES: Final[tuple[str, ...]] = ("B", "I", "E", "S")
|
| 55 |
EMPTY_HIGHLIGHT_PAYLOAD = {"text": "", "entities": []}
|
| 56 |
+
EMPTY_SUMMARY_MARKDOWN = "_No entities detected yet._"
|
| 57 |
SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
|
| 58 |
BACKGROUND_CLASS_LABEL,
|
| 59 |
"account_number",
|
|
|
|
| 65 |
"private_url",
|
| 66 |
"secret",
|
| 67 |
)
|
| 68 |
+
REDACTION_LABEL_MAP: Final[dict[str, str]] = {
|
| 69 |
+
"account_number": "[ACCOUNT_NUMBER]",
|
| 70 |
+
"private_address": "[ADDRESS]",
|
| 71 |
+
"private_date": "[DATE]",
|
| 72 |
+
"private_email": "[EMAIL]",
|
| 73 |
+
"private_person": "[PERSON]",
|
| 74 |
+
"private_phone": "[PHONE]",
|
| 75 |
+
"private_url": "[URL]",
|
| 76 |
+
"secret": "[SECRET]",
|
| 77 |
+
}
|
| 78 |
NER_CLASS_NAMES: Final[tuple[str, ...]] = (BACKGROUND_CLASS_LABEL,) + tuple(
|
| 79 |
f"{prefix}-{base_label}"
|
| 80 |
for base_label in SPAN_CLASS_NAMES
|
|
|
|
| 92 |
DEFAULT_VITERBI_CALIBRATION_PRESET: Final[str] = "default"
|
| 93 |
|
| 94 |
|
| 95 |
+
def supported_kwargs(
    factory: object,
    **kwargs: object,
) -> dict[str, object]:
    """Return the subset of ``kwargs`` that ``factory`` accepts by keyword.

    The signature of ``factory`` is inspected and only keys naming a
    parameter that can actually be supplied as a keyword argument are kept.
    This lets callers pass optional arguments that only some versions of a
    callable declare, without triggering ``TypeError`` on the others.

    Args:
        factory: Callable (function or class) whose signature is inspected.
        **kwargs: Candidate keyword arguments.

    Returns:
        A new dict with the entries of ``kwargs`` whose key is a
        positional-or-keyword or keyword-only parameter of ``factory``.
        Keys that are not explicitly declared are dropped even when
        ``factory`` has a ``**kwargs`` catch-all.

    Raises:
        TypeError: If ``factory`` is not a callable ``inspect.signature``
            supports.
        ValueError: If no signature can be determined for ``factory``.
    """
    # Positional-only, *args and **kwargs parameters also appear in
    # ``signature.parameters``; forwarding their *names* as keywords would
    # either raise TypeError or silently land in the catch-all dict.
    keyword_kinds = (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.KEYWORD_ONLY,
    )
    parameters = inspect.signature(factory).parameters
    return {
        key: value
        for key, value in kwargs.items()
        if key in parameters and parameters[key].kind in keyword_kinds
    }
|
| 101 |
+
|
| 102 |
+
|
| 103 |
def validate_model_config_contract(
|
| 104 |
checkpoint_config: dict[str, object],
|
| 105 |
*,
|
|
|
|
| 1243 |
}
|
| 1244 |
|
| 1245 |
|
| 1246 |
+
def build_redacted_text(text: str, entities: Sequence[dict[str, object]]) -> str:
    """Replace detected entity spans in ``text`` with label placeholders.

    Each entity is a mapping read through the keys ``"start"``, ``"end"``
    and ``"entity"``. Spans are applied left to right; a span is skipped
    when its offsets are not integers, its label is not a string, it is
    empty or inverted, or it starts before the end of the previously
    applied span (overlap). Offsets beyond the end of ``text`` are clamped.

    Args:
        text: Source text to redact.
        entities: Detected spans with character offsets into ``text``.

    Returns:
        ``text`` with every accepted span replaced by its entry in
        ``REDACTION_LABEL_MAP`` (``"[REDACTED]"`` for unknown labels).
        ``text`` is returned unchanged when it or ``entities`` is empty.
    """
    if not text or not entities:
        return text

    # Validate before sorting: sorting on raw values would raise on entries
    # whose offsets are None or otherwise non-numeric, instead of skipping
    # them like every other malformed entity.
    spans: list[tuple[int, int, str]] = []
    for entity in entities:
        start_raw = entity.get("start")
        end_raw = entity.get("end")
        label_raw = entity.get("entity")
        if not isinstance(start_raw, int) or not isinstance(end_raw, int):
            continue
        if not isinstance(label_raw, str):
            continue
        spans.append((start_raw, end_raw, label_raw))
    # list.sort is stable, so spans with equal offsets keep input order.
    spans.sort(key=lambda span: (span[0], span[1]))

    text_length = len(text)
    redacted_parts: list[str] = []
    cursor = 0
    for start_raw, end_raw, label in spans:
        # Rejects negative starts (cursor >= 0), overlaps and empty spans.
        if start_raw < cursor or start_raw >= end_raw:
            continue
        start = min(start_raw, text_length)
        end = min(end_raw, text_length)
        if start >= end:
            # Span lies entirely past the end of the text.
            continue
        redacted_parts.append(text[cursor:start])
        redacted_parts.append(REDACTION_LABEL_MAP.get(label, "[REDACTED]"))
        cursor = end
    redacted_parts.append(text[cursor:])
    return "".join(redacted_parts)
|
| 1279 |
+
|
| 1280 |
+
|
| 1281 |
+
def summarize_entities_markdown(entities: Sequence[dict[str, object]]) -> str:
    """Build a Markdown bullet list counting entities per label.

    Labels are read from each entity's ``"entity"`` key; entries whose
    label is not a string are ignored. Bullets are ordered by descending
    count, ties broken alphabetically by label.

    Args:
        entities: Detected entity mappings.

    Returns:
        A Markdown string headed by ``**Detected entities**``, or
        ``EMPTY_SUMMARY_MARKDOWN`` when nothing countable was supplied.
    """
    tally: dict[str, int] = {}
    for record in entities or ():
        name = record.get("entity")
        if isinstance(name, str):
            tally[name] = tally.get(name, 0) + 1

    if not tally:
        return EMPTY_SUMMARY_MARKDOWN

    # Most frequent first; alphabetical among equal counts.
    ranked = sorted(tally, key=lambda name: (-tally[name], name))
    bullets = [f"- `{name}`: {tally[name]}" for name in ranked]
    return "\n".join(["**Detected entities**", *bullets])
|
| 1298 |
+
|
| 1299 |
+
|
| 1300 |
+
@spaces.GPU
def predict_for_demo(text: str) -> tuple[dict[str, object], str, str]:
    """Run ``predict`` and derive the redacted text and entity summary.

    Args:
        text: Raw input text.

    Returns:
        A 3-tuple of the unmodified ``predict`` result, the redacted text
        produced by ``build_redacted_text``, and the Markdown summary from
        ``summarize_entities_markdown``.
    """
    prediction = predict(text)

    # Fall back to the caller's input when the prediction carries no text.
    source_text = prediction.get("text")
    if not isinstance(source_text, str):
        source_text = text or ""

    # Anything other than a list is treated as "no entities".
    detected = prediction.get("entities")
    if not isinstance(detected, list):
        detected = []

    return (
        prediction,
        build_redacted_text(source_text, detected),
        summarize_entities_markdown(detected),
    )
|
| 1310 |
+
|
| 1311 |
+
|
| 1312 |
def build_demo() -> gr.Blocks:
|
| 1313 |
config_path = MODEL_DIR / "config.json"
|
| 1314 |
checkpoint_config = json.loads(config_path.read_text(encoding="utf-8"))
|
|
|
|
| 1334 |
"#808080",
|
| 1335 |
)
|
| 1336 |
with gr.Blocks(
|
| 1337 |
+
**supported_kwargs(
|
| 1338 |
+
gr.Blocks,
|
| 1339 |
+
title="OpenAI Privacy Filter",
|
| 1340 |
+
fill_width=True,
|
| 1341 |
+
elem_id="privacy-filter-app",
|
| 1342 |
+
)
|
| 1343 |
) as demo:
|
| 1344 |
gr.Markdown("# OpenAI Privacy Filter Demo")
|
| 1345 |
+
gr.Markdown(
|
| 1346 |
+
"Detect and redact personal identifiers using `openai/privacy-filter`.\n\n"
|
| 1347 |
+
"This demo highlights predicted spans and generates a redacted text variant "
|
| 1348 |
+
"with label placeholders."
|
| 1349 |
+
)
|
| 1350 |
|
| 1351 |
with gr.Column(variant="panel"):
|
|
|
|
| 1352 |
input_text = gr.Textbox(
|
| 1353 |
+
**supported_kwargs(
|
| 1354 |
+
gr.Textbox,
|
| 1355 |
+
lines=6,
|
| 1356 |
+
label="Input text with PII",
|
| 1357 |
+
placeholder="Paste text to detect personal identifiers and generate redacted output...",
|
| 1358 |
+
container=False,
|
| 1359 |
+
)
|
| 1360 |
)
|
| 1361 |
+
with gr.Row():
|
| 1362 |
+
submit_button = gr.Button("Detect & Redact", variant="primary")
|
| 1363 |
+
clear_button = gr.Button("Clear")
|
| 1364 |
|
| 1365 |
with gr.Column(variant="panel"):
|
|
|
|
| 1366 |
output_text = gr.HighlightedText(
|
| 1367 |
+
**supported_kwargs(
|
| 1368 |
+
gr.HighlightedText,
|
| 1369 |
+
label="Detected entities (highlighted)",
|
| 1370 |
+
value=EMPTY_HIGHLIGHT_PAYLOAD,
|
| 1371 |
+
color_map={
|
| 1372 |
+
label: web_color_palette[idx % len(web_color_palette)]
|
| 1373 |
+
for idx, label in enumerate(
|
| 1374 |
+
label for label in span_class_names if label != BACKGROUND_CLASS_LABEL
|
| 1375 |
+
)
|
| 1376 |
+
},
|
| 1377 |
+
combine_adjacent=False,
|
| 1378 |
+
show_legend=False,
|
| 1379 |
+
container=True,
|
| 1380 |
+
)
|
| 1381 |
+
)
|
| 1382 |
+
redacted_output = gr.Textbox(
|
| 1383 |
+
**supported_kwargs(
|
| 1384 |
+
gr.Textbox,
|
| 1385 |
+
label="Redacted text output",
|
| 1386 |
+
lines=6,
|
| 1387 |
+
show_copy_button=True,
|
| 1388 |
+
interactive=False,
|
| 1389 |
+
)
|
| 1390 |
+
)
|
| 1391 |
+
entity_summary = gr.Markdown(EMPTY_SUMMARY_MARKDOWN)
|
| 1392 |
+
with gr.Accordion("How to read results", open=False):
|
| 1393 |
+
gr.Markdown(
|
| 1394 |
+
"- Detects 8 span categories: person, email, phone, address, date, URL, "
|
| 1395 |
+
"account number, and secrets.\n"
|
| 1396 |
+
"- Uses sequence decoding (BIOES + constrained Viterbi) for cleaner boundaries.\n"
|
| 1397 |
+
"- Best treated as a redaction aid, not a standalone compliance or anonymization guarantee.\n"
|
| 1398 |
+
"- Official card notes strongest support is English, with limited multilingual robustness."
|
| 1399 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1400 |
submit_button.click(
|
| 1401 |
+
fn=predict_for_demo,
|
| 1402 |
inputs=input_text,
|
| 1403 |
+
outputs=[output_text, redacted_output, entity_summary],
|
| 1404 |
+
api_name="predict_and_redact",
|
| 1405 |
)
|
| 1406 |
input_text.submit(
|
| 1407 |
+
fn=predict_for_demo,
|
| 1408 |
inputs=input_text,
|
| 1409 |
+
outputs=[output_text, redacted_output, entity_summary],
|
| 1410 |
)
|
| 1411 |
clear_button.click(
|
| 1412 |
+
lambda: ("", EMPTY_HIGHLIGHT_PAYLOAD, "", EMPTY_SUMMARY_MARKDOWN),
|
| 1413 |
+
outputs=[input_text, output_text, redacted_output, entity_summary],
|
|
|
|
| 1414 |
)
|
| 1415 |
|
| 1416 |
+
gr.Markdown("### Multilingual quick examples")
|
| 1417 |
gr.Examples(
|
| 1418 |
examples=[
|
| 1419 |
["Alice was born on 1990-01-02 and lives at 1 Main St."],
|
| 1420 |
["Email me at alice@example.com or call 415-555-0101."],
|
| 1421 |
+
["Me llamo Laura Gómez y vivo en Calle de Alcalá 21, Madrid."],
|
| 1422 |
+
["Mon e-mail est jean.dupont@example.fr et mon téléphone est +33 6 12 34 56 78."],
|
| 1423 |
+
["私の名前は山田太郎です。メールはtaro.yamada@example.jpです。"],
|
| 1424 |
+
["اسمي أحمد وبريدي هو ahmed@example.com ورقم هاتفي +971501234567."],
|
| 1425 |
],
|
| 1426 |
inputs=input_text,
|
| 1427 |
+
outputs=[output_text, redacted_output, entity_summary],
|
| 1428 |
+
fn=predict_for_demo,
|
| 1429 |
cache_examples=False,
|
| 1430 |
)
|
| 1431 |
return demo
|
|
|
|
| 1433 |
|
| 1434 |
if __name__ == "__main__":
|
| 1435 |
demo = build_demo()
|
| 1436 |
+
demo.launch()
|