merve HF Staff committed on
Commit
2c0dcf8
·
verified ·
1 Parent(s): 1d92498

Misc improvements

Browse files
Files changed (1) hide show
  1. app.py +154 -37
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import dataclasses
2
  import functools
 
3
  import json
4
  import math
5
  import os
@@ -52,6 +53,7 @@ REQUIRED_MODEL_CONFIG_KEYS: Final[tuple[str, ...]] = (
52
  BACKGROUND_CLASS_LABEL: Final[str] = "O"
53
  BOUNDARY_PREFIXES: Final[tuple[str, ...]] = ("B", "I", "E", "S")
54
  EMPTY_HIGHLIGHT_PAYLOAD = {"text": "", "entities": []}
 
55
  SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
56
  BACKGROUND_CLASS_LABEL,
57
  "account_number",
@@ -63,6 +65,16 @@ SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
63
  "private_url",
64
  "secret",
65
  )
 
 
 
 
 
 
 
 
 
 
66
  NER_CLASS_NAMES: Final[tuple[str, ...]] = (BACKGROUND_CLASS_LABEL,) + tuple(
67
  f"{prefix}-{base_label}"
68
  for base_label in SPAN_CLASS_NAMES
@@ -80,6 +92,14 @@ VITERBI_TRANSITION_BIAS_KEYS: Final[tuple[str, ...]] = (
80
  DEFAULT_VITERBI_CALIBRATION_PRESET: Final[str] = "default"
81
 
82
 
 
 
 
 
 
 
 
 
83
  def validate_model_config_contract(
84
  checkpoint_config: dict[str, object],
85
  *,
@@ -1223,6 +1243,72 @@ def predict(text: str) -> dict[str, object]:
1223
  }
1224
 
1225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1226
  def build_demo() -> gr.Blocks:
1227
  config_path = MODEL_DIR / "config.json"
1228
  checkpoint_config = json.loads(config_path.read_text(encoding="utf-8"))
@@ -1248,67 +1334,98 @@ def build_demo() -> gr.Blocks:
1248
  "#808080",
1249
  )
1250
  with gr.Blocks(
1251
- title="OpenAI Privacy Filter",
1252
- fill_width=True,
1253
- elem_id="privacy-filter-app",
 
 
 
1254
  ) as demo:
1255
  gr.Markdown("# OpenAI Privacy Filter Demo")
1256
- gr.Markdown("Example of using OpenAI Privacy Filter (OPF) to mask personal identifiers.")
 
 
 
 
1257
 
1258
  with gr.Column(variant="panel"):
1259
- gr.Markdown("Input text:")
1260
  input_text = gr.Textbox(
1261
- lines=2,
1262
- placeholder="Paste text here to detect and mask personal identifiers...",
1263
- show_label=False,
1264
- container=False,
 
 
 
1265
  )
 
 
 
1266
 
1267
  with gr.Column(variant="panel"):
1268
- gr.Markdown("Text after masking personal identifiers:")
1269
  output_text = gr.HighlightedText(
1270
- value=EMPTY_HIGHLIGHT_PAYLOAD,
1271
- color_map={
1272
- label: web_color_palette[idx % len(web_color_palette)]
1273
- for idx, label in enumerate(
1274
- label for label in span_class_names if label != BACKGROUND_CLASS_LABEL
1275
- )
1276
- },
1277
- combine_adjacent=False,
1278
- show_legend=False,
1279
- show_label=False,
1280
- container=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1281
  )
1282
-
1283
- with gr.Row():
1284
- submit_button = gr.Button("Submit", variant="primary")
1285
- clear_button = gr.Button("Clear")
1286
-
1287
  submit_button.click(
1288
- fn=predict,
1289
  inputs=input_text,
1290
- outputs=output_text,
1291
- api_name="predict",
1292
  )
1293
  input_text.submit(
1294
- fn=predict,
1295
  inputs=input_text,
1296
- outputs=output_text,
1297
  )
1298
  clear_button.click(
1299
- lambda: ("", EMPTY_HIGHLIGHT_PAYLOAD),
1300
- outputs=[input_text, output_text],
1301
- show_progress="hidden",
1302
  )
1303
 
 
1304
  gr.Examples(
1305
  examples=[
1306
  ["Alice was born on 1990-01-02 and lives at 1 Main St."],
1307
  ["Email me at alice@example.com or call 415-555-0101."],
 
 
 
 
1308
  ],
1309
  inputs=input_text,
1310
- outputs=output_text,
1311
- fn=predict,
1312
  cache_examples=False,
1313
  )
1314
  return demo
@@ -1316,4 +1433,4 @@ def build_demo() -> gr.Blocks:
1316
 
1317
  if __name__ == "__main__":
1318
  demo = build_demo()
1319
- demo.launch()
 
1
  import dataclasses
2
  import functools
3
+ import inspect
4
  import json
5
  import math
6
  import os
 
53
  BACKGROUND_CLASS_LABEL: Final[str] = "O"
54
  BOUNDARY_PREFIXES: Final[tuple[str, ...]] = ("B", "I", "E", "S")
55
  EMPTY_HIGHLIGHT_PAYLOAD = {"text": "", "entities": []}
56
+ EMPTY_SUMMARY_MARKDOWN = "_No entities detected yet._"
57
  SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
58
  BACKGROUND_CLASS_LABEL,
59
  "account_number",
 
65
  "private_url",
66
  "secret",
67
  )
68
# Bracketed placeholder emitted in redacted output for each span class name.
# Labels that are absent from this table are handled by the caller's fallback.
REDACTION_LABEL_MAP: Final[dict[str, str]] = dict(
    account_number="[ACCOUNT_NUMBER]",
    private_address="[ADDRESS]",
    private_date="[DATE]",
    private_email="[EMAIL]",
    private_person="[PERSON]",
    private_phone="[PHONE]",
    private_url="[URL]",
    secret="[SECRET]",
)
78
  NER_CLASS_NAMES: Final[tuple[str, ...]] = (BACKGROUND_CLASS_LABEL,) + tuple(
79
  f"{prefix}-{base_label}"
80
  for base_label in SPAN_CLASS_NAMES
 
92
  DEFAULT_VITERBI_CALIBRATION_PRESET: Final[str] = "default"
93
 
94
 
95
def supported_kwargs(
    factory: object,
    **kwargs: object,
) -> dict[str, object]:
    """Keep only the keyword arguments that ``factory`` names in its signature.

    The signature is read with ``inspect.signature``. A keyword survives only
    when its name is literally one of the signature's parameter names, so a
    catch-all ``**extra`` parameter on ``factory`` does not cause unknown
    keywords to be kept.

    Args:
        factory: Callable whose signature decides which keywords are kept.
        **kwargs: Candidate keyword arguments.

    Returns:
        A new dict with the accepted keywords, in the order they were passed.
    """
    accepted_names = inspect.signature(factory).parameters
    filtered: dict[str, object] = {}
    for name, value in kwargs.items():
        if name in accepted_names:
            filtered[name] = value
    return filtered
101
+
102
+
103
  def validate_model_config_contract(
104
  checkpoint_config: dict[str, object],
105
  *,
 
1243
  }
1244
 
1245
 
1246
def build_redacted_text(text: str, entities: Sequence[dict[str, object]]) -> str:
    """Replace each detected span in ``text`` with a bracketed label placeholder.

    Args:
        text: Source string that the entity offsets index into.
        entities: Mappings read through the ``"start"``, ``"end"`` and
            ``"entity"`` keys. Items whose offsets are not ``int`` or whose
            label is not ``str`` are ignored.

    Returns:
        ``text`` with every usable span replaced by its ``REDACTION_LABEL_MAP``
        entry, or ``"[REDACTED]"`` when the label is missing from that map.
        ``text`` is returned unchanged when it or ``entities`` is empty.
    """
    if not text or not entities:
        return text

    # Validate before sorting. Building the sort key with int(...) on raw
    # values would raise on None or non-numeric offsets, even though such
    # entities are meant to be skipped.
    usable_spans: list[tuple[int, int, str]] = []
    for entity in entities:
        start_raw = entity.get("start")
        end_raw = entity.get("end")
        label_raw = entity.get("entity")
        if not isinstance(start_raw, int) or not isinstance(end_raw, int):
            continue
        if not isinstance(label_raw, str):
            continue
        usable_spans.append((start_raw, end_raw, label_raw))
    # Key on the offsets only so spans with equal offsets keep input order.
    usable_spans.sort(key=lambda span: span[:2])

    text_length = len(text)
    redacted_parts: list[str] = []
    cursor = 0
    for start_raw, end_raw, label in usable_spans:
        # Skip empty/inverted spans and spans that begin before the cursor
        # (negative offsets, or overlap with an already-replaced span).
        # NOTE(review): an overlapping span is dropped whole, so any part of
        # it that extends past the previous span stays in the output.
        if start_raw < cursor or start_raw >= end_raw:
            continue
        # Offsets are non-negative here, so only the upper bound needs clamping.
        start = min(start_raw, text_length)
        end = min(end_raw, text_length)
        if start >= end:
            continue
        redacted_parts.append(text[cursor:start])
        redacted_parts.append(REDACTION_LABEL_MAP.get(label, "[REDACTED]"))
        cursor = end
    redacted_parts.append(text[cursor:])
    return "".join(redacted_parts)
1279
+
1280
+
1281
def summarize_entities_markdown(entities: Sequence[dict[str, object]]) -> str:
    """Render a Markdown bullet list of how often each entity label occurs.

    Only items whose ``"entity"`` value is a ``str`` are counted. Rows are
    ordered by descending count, with ties broken by label name.

    Args:
        entities: Mappings read through the ``"entity"`` key.

    Returns:
        A ``**Detected entities**`` heading followed by one bullet per label,
        or ``EMPTY_SUMMARY_MARKDOWN`` when there is nothing to count.
    """
    string_labels = [
        candidate
        for candidate in (item.get("entity") for item in entities or ())
        if isinstance(candidate, str)
    ]
    if not string_labels:
        return EMPTY_SUMMARY_MARKDOWN

    tally: dict[str, int] = {}
    for name in string_labels:
        tally[name] = tally.get(name, 0) + 1

    ranked = sorted(tally, key=lambda name: (-tally[name], name))
    bullet_rows = [f"- `{name}`: {tally[name]}" for name in ranked]
    return "\n".join(["**Detected entities**", *bullet_rows])
1298
+
1299
+
1300
@spaces.GPU
def predict_for_demo(text: str) -> tuple[dict[str, object], str, str]:
    """Run ``predict`` and derive the extra outputs returned alongside it.

    Args:
        text: Raw input text.

    Returns:
        A 3-tuple of the unmodified ``predict`` result, the redacted text from
        ``build_redacted_text``, and the Markdown summary from
        ``summarize_entities_markdown``.
    """
    prediction = predict(text)

    # Fall back to an empty list when the payload has no list under "entities".
    raw_entities = prediction.get("entities")
    if not isinstance(raw_entities, list):
        raw_entities = []

    # Prefer the text echoed by predict(); otherwise use the caller's input.
    raw_text = prediction.get("text")
    if not isinstance(raw_text, str):
        raw_text = text or ""

    return (
        prediction,
        build_redacted_text(raw_text, raw_entities),
        summarize_entities_markdown(raw_entities),
    )
1310
+
1311
+
1312
  def build_demo() -> gr.Blocks:
1313
  config_path = MODEL_DIR / "config.json"
1314
  checkpoint_config = json.loads(config_path.read_text(encoding="utf-8"))
 
1334
  "#808080",
1335
  )
1336
  with gr.Blocks(
1337
+ **supported_kwargs(
1338
+ gr.Blocks,
1339
+ title="OpenAI Privacy Filter",
1340
+ fill_width=True,
1341
+ elem_id="privacy-filter-app",
1342
+ )
1343
  ) as demo:
1344
  gr.Markdown("# OpenAI Privacy Filter Demo")
1345
+ gr.Markdown(
1346
+ "Detect and redact personal identifiers using `openai/privacy-filter`.\n\n"
1347
+ "This demo highlights predicted spans and generates a redacted text variant "
1348
+ "with label placeholders."
1349
+ )
1350
 
1351
  with gr.Column(variant="panel"):
 
1352
  input_text = gr.Textbox(
1353
+ **supported_kwargs(
1354
+ gr.Textbox,
1355
+ lines=6,
1356
+ label="Input text with PII",
1357
+ placeholder="Paste text to detect personal identifiers and generate redacted output...",
1358
+ container=False,
1359
+ )
1360
  )
1361
+ with gr.Row():
1362
+ submit_button = gr.Button("Detect & Redact", variant="primary")
1363
+ clear_button = gr.Button("Clear")
1364
 
1365
  with gr.Column(variant="panel"):
 
1366
  output_text = gr.HighlightedText(
1367
+ **supported_kwargs(
1368
+ gr.HighlightedText,
1369
+ label="Detected entities (highlighted)",
1370
+ value=EMPTY_HIGHLIGHT_PAYLOAD,
1371
+ color_map={
1372
+ label: web_color_palette[idx % len(web_color_palette)]
1373
+ for idx, label in enumerate(
1374
+ label for label in span_class_names if label != BACKGROUND_CLASS_LABEL
1375
+ )
1376
+ },
1377
+ combine_adjacent=False,
1378
+ show_legend=False,
1379
+ container=True,
1380
+ )
1381
+ )
1382
+ redacted_output = gr.Textbox(
1383
+ **supported_kwargs(
1384
+ gr.Textbox,
1385
+ label="Redacted text output",
1386
+ lines=6,
1387
+ show_copy_button=True,
1388
+ interactive=False,
1389
+ )
1390
+ )
1391
+ entity_summary = gr.Markdown(EMPTY_SUMMARY_MARKDOWN)
1392
+ with gr.Accordion("How to read results", open=False):
1393
+ gr.Markdown(
1394
+ "- Detects 8 span categories: person, email, phone, address, date, URL, "
1395
+ "account number, and secrets.\n"
1396
+ "- Uses sequence decoding (BIOES + constrained Viterbi) for cleaner boundaries.\n"
1397
+ "- Best treated as a redaction aid, not a standalone compliance or anonymization guarantee.\n"
1398
+ "- Official card notes strongest support is English, with limited multilingual robustness."
1399
  )
 
 
 
 
 
1400
  submit_button.click(
1401
+ fn=predict_for_demo,
1402
  inputs=input_text,
1403
+ outputs=[output_text, redacted_output, entity_summary],
1404
+ api_name="predict_and_redact",
1405
  )
1406
  input_text.submit(
1407
+ fn=predict_for_demo,
1408
  inputs=input_text,
1409
+ outputs=[output_text, redacted_output, entity_summary],
1410
  )
1411
  clear_button.click(
1412
+ lambda: ("", EMPTY_HIGHLIGHT_PAYLOAD, "", EMPTY_SUMMARY_MARKDOWN),
1413
+ outputs=[input_text, output_text, redacted_output, entity_summary],
 
1414
  )
1415
 
1416
+ gr.Markdown("### Multilingual quick examples")
1417
  gr.Examples(
1418
  examples=[
1419
  ["Alice was born on 1990-01-02 and lives at 1 Main St."],
1420
  ["Email me at alice@example.com or call 415-555-0101."],
1421
+ ["Me llamo Laura Gómez y vivo en Calle de Alcalá 21, Madrid."],
1422
+ ["Mon e-mail est jean.dupont@example.fr et mon téléphone est +33 6 12 34 56 78."],
1423
+ ["私の名前は山田太郎です。メールはtaro.yamada@example.jpです。"],
1424
+ ["اسمي أحمد وبريدي هو ahmed@example.com ورقم هاتفي +971501234567."],
1425
  ],
1426
  inputs=input_text,
1427
+ outputs=[output_text, redacted_output, entity_summary],
1428
+ fn=predict_for_demo,
1429
  cache_examples=False,
1430
  )
1431
  return demo
 
1433
 
1434
  if __name__ == "__main__":
1435
  demo = build_demo()
1436
+ demo.launch()