Spaces:

openai
/

privacy-filter

Running on Zero

App Files Files Community

Misc improvements

by merve HF Staff - opened 13 days ago

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+154

-37

Files changed (1) hide show

app.py +154 -37

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import dataclasses
 import functools
 import json
 import math
 import os
@@ -52,6 +53,7 @@ REQUIRED_MODEL_CONFIG_KEYS: Final[tuple[str, ...]] = (
 BACKGROUND_CLASS_LABEL: Final[str] = "O"
 BOUNDARY_PREFIXES: Final[tuple[str, ...]] = ("B", "I", "E", "S")
 EMPTY_HIGHLIGHT_PAYLOAD = {"text": "", "entities": []}
 SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
     BACKGROUND_CLASS_LABEL,
     "account_number",
@@ -63,6 +65,16 @@ SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
     "private_url",
     "secret",
 )
 NER_CLASS_NAMES: Final[tuple[str, ...]] = (BACKGROUND_CLASS_LABEL,) + tuple(
     f"{prefix}-{base_label}"
     for base_label in SPAN_CLASS_NAMES
@@ -80,6 +92,14 @@ VITERBI_TRANSITION_BIAS_KEYS: Final[tuple[str, ...]] = (
 DEFAULT_VITERBI_CALIBRATION_PRESET: Final[str] = "default"
 def validate_model_config_contract(
     checkpoint_config: dict[str, object],
     *,
@@ -1223,6 +1243,72 @@ def predict(text: str) -> dict[str, object]:
     }
 def build_demo() -> gr.Blocks:
     config_path = MODEL_DIR / "config.json"
     checkpoint_config = json.loads(config_path.read_text(encoding="utf-8"))
@@ -1248,67 +1334,98 @@ def build_demo() -> gr.Blocks:
         "#808080",
     )
     with gr.Blocks(
-        title="OpenAI Privacy Filter",
-        fill_width=True,
-        elem_id="privacy-filter-app",
     ) as demo:
         gr.Markdown("# OpenAI Privacy Filter Demo")
-        gr.Markdown("Example of using OpenAI Privacy Filter (OPF) to mask personal identifiers.")
         with gr.Column(variant="panel"):
-            gr.Markdown("Input text:")
             input_text = gr.Textbox(
-                lines=2,
-                placeholder="Paste text here to detect and mask personal identifiers...",
-                show_label=False,
-                container=False,
             )
         with gr.Column(variant="panel"):
-            gr.Markdown("Text after masking personal identifiers:")
             output_text = gr.HighlightedText(
-                value=EMPTY_HIGHLIGHT_PAYLOAD,
-                color_map={
-                    label: web_color_palette[idx % len(web_color_palette)]
-                    for idx, label in enumerate(
-                        label for label in span_class_names if label != BACKGROUND_CLASS_LABEL
-                    )
-                },
-                combine_adjacent=False,
-                show_legend=False,
-                show_label=False,
-                container=True,
             )
-        with gr.Row():
-            submit_button = gr.Button("Submit", variant="primary")
-            clear_button = gr.Button("Clear")
         submit_button.click(
-            fn=predict,
             inputs=input_text,
-            outputs=output_text,
-            api_name="predict",
         )
         input_text.submit(
-            fn=predict,
             inputs=input_text,
-            outputs=output_text,
         )
         clear_button.click(
-            lambda: ("", EMPTY_HIGHLIGHT_PAYLOAD),
-            outputs=[input_text, output_text],
-            show_progress="hidden",
         )
         gr.Examples(
             examples=[
                 ["Alice was born on 1990-01-02 and lives at 1 Main St."],
                 ["Email me at alice@example.com or call 415-555-0101."],
             ],
             inputs=input_text,
-            outputs=output_text,
-            fn=predict,
             cache_examples=False,
         )
     return demo
@@ -1316,4 +1433,4 @@ def build_demo() -> gr.Blocks:
 if __name__ == "__main__":
     demo = build_demo()
-    demo.launch()

 import dataclasses
 import functools
+import inspect
 import json
 import math
 import os
 BACKGROUND_CLASS_LABEL: Final[str] = "O"
 BOUNDARY_PREFIXES: Final[tuple[str, ...]] = ("B", "I", "E", "S")
 EMPTY_HIGHLIGHT_PAYLOAD = {"text": "", "entities": []}
+EMPTY_SUMMARY_MARKDOWN = "_No entities detected yet._"
 SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
     BACKGROUND_CLASS_LABEL,
     "account_number",
     "private_url",
     "secret",
 )
+REDACTION_LABEL_MAP: Final[dict[str, str]] = {
+    "account_number": "[ACCOUNT_NUMBER]",
+    "private_address": "[ADDRESS]",
+    "private_date": "[DATE]",
+    "private_email": "[EMAIL]",
+    "private_person": "[PERSON]",
+    "private_phone": "[PHONE]",
+    "private_url": "[URL]",
+    "secret": "[SECRET]",
+}
 NER_CLASS_NAMES: Final[tuple[str, ...]] = (BACKGROUND_CLASS_LABEL,) + tuple(
     f"{prefix}-{base_label}"
     for base_label in SPAN_CLASS_NAMES
 DEFAULT_VITERBI_CALIBRATION_PRESET: Final[str] = "default"
def supported_kwargs(
    factory: object,
    **kwargs: object,
) -> dict[str, object]:
    """Return the subset of ``kwargs`` that ``factory`` accepts by keyword.

    The signature of ``factory`` is inspected and each option is kept only if
    the callable can actually receive it as a keyword argument.

    Args:
        factory: The callable (class or function) that will be invoked with
            the returned mapping.
        **kwargs: Candidate keyword options.

    Returns:
        A new dict with the options that are safe to pass as
        ``factory(**result)``.

    Raises:
        TypeError: If ``factory`` is not callable.
        ValueError: If no signature can be determined for ``factory``.
    """
    parameters = inspect.signature(factory).parameters.values()

    # A ``**kwargs`` catch-all means the callable takes any keyword (typically
    # forwarding it to a base class), so filtering by explicit names would
    # silently discard valid options.
    if any(param.kind is inspect.Parameter.VAR_KEYWORD for param in parameters):
        return dict(kwargs)

    # Only these kinds can be bound by keyword; the names of positional-only
    # and ``*args`` parameters would raise ``TypeError`` if passed that way.
    keyword_kinds = (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.KEYWORD_ONLY,
    )
    accepted = {param.name for param in parameters if param.kind in keyword_kinds}
    return {key: value for key, value in kwargs.items() if key in accepted}
 def validate_model_config_contract(
     checkpoint_config: dict[str, object],
     *,
     }
def build_redacted_text(text: str, entities: Sequence[dict[str, object]]) -> str:
    """Replace every detected entity span in ``text`` with a label placeholder.

    Each entity is a mapping with integer ``start``/``end`` character offsets
    and a string ``entity`` label. Entities that do not have that shape, that
    start before offset 0, or that are empty (``start >= end``) are ignored.
    Offsets past the end of ``text`` are clamped to its length.

    Args:
        text: The original text.
        entities: Detected spans, in any order.

    Returns:
        ``text`` with each span replaced by its ``REDACTION_LABEL_MAP``
        placeholder, or ``"[REDACTED]"`` for labels missing from that map.
        ``text`` is returned unchanged when it or ``entities`` is empty.
    """
    if not text or not entities:
        return text

    # Validate before sorting: a malformed entry (missing or non-integer
    # offset) must be skipped, not allowed to blow up inside the sort key.
    spans: list[tuple[int, int, str]] = []
    for entity in entities:
        start = entity.get("start")
        end = entity.get("end")
        label = entity.get("entity")
        if not isinstance(start, int) or not isinstance(end, int):
            continue
        if not isinstance(label, str):
            continue
        if start < 0 or start >= end:
            continue
        spans.append((start, end, label))
    # Sort on offsets only so equal spans keep their input order.
    spans.sort(key=lambda span: (span[0], span[1]))

    text_length = len(text)
    redacted_parts: list[str] = []
    cursor = 0
    for start, end, label in spans:
        end = min(end, text_length)
        if start >= end:
            # The span lies entirely beyond the end of the text.
            continue
        if start < cursor:
            # Overlaps the previous redaction: emit no second placeholder, but
            # extend the redacted region so the overlapping span's tail is not
            # leaked into the output.
            cursor = max(cursor, end)
            continue
        redacted_parts.append(text[cursor:start])
        redacted_parts.append(REDACTION_LABEL_MAP.get(label, "[REDACTED]"))
        cursor = end
    redacted_parts.append(text[cursor:])
    return "".join(redacted_parts)
def summarize_entities_markdown(entities: Sequence[dict[str, object]]) -> str:
    """Render a Markdown bullet list of how often each entity label occurs.

    Args:
        entities: Detected spans; only the ``entity`` key of each mapping is
            read, and entries whose label is not a string are ignored.

    Returns:
        ``EMPTY_SUMMARY_MARKDOWN`` when there is nothing to count; otherwise a
        ``**Detected entities**`` heading followed by one ``- `label`: count``
        line per label, most frequent first and alphabetical among ties.
    """
    # Function-local import keeps this block self-contained; the module's
    # import list is not fully visible from here.
    from collections import Counter

    if not entities:
        return EMPTY_SUMMARY_MARKDOWN

    counts = Counter(
        label
        for label in (entity.get("entity") for entity in entities)
        if isinstance(label, str)
    )
    if not counts:
        return EMPTY_SUMMARY_MARKDOWN

    # Explicit sort instead of Counter.most_common(): ties must be broken
    # alphabetically, not by first-seen order.
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    lines = ["**Detected entities**"]
    lines.extend(f"- `{label}`: {count}" for label, count in ranked)
    return "\n".join(lines)
@spaces.GPU
def predict_for_demo(text: str) -> tuple[dict[str, object], str, str]:
    """Run ``predict`` on ``text`` and derive the two extra demo outputs.

    Args:
        text: The user-supplied input text.

    Returns:
        A 3-tuple of the unmodified mapping returned by ``predict``, the
        redacted text produced by ``build_redacted_text``, and the Markdown
        summary produced by ``summarize_entities_markdown``.
    """
    result = predict(text)

    # Fall back to "no entities" unless the prediction carries a real list.
    raw_entities = result.get("entities")
    if isinstance(raw_entities, list):
        spans = raw_entities
    else:
        spans = []

    # Prefer the text echoed back by the prediction; otherwise use the input
    # itself, treating a falsy input as the empty string.
    raw_text = result.get("text")
    if isinstance(raw_text, str):
        shown_text = raw_text
    else:
        shown_text = text or ""

    return (
        result,
        build_redacted_text(shown_text, spans),
        summarize_entities_markdown(spans),
    )
 def build_demo() -> gr.Blocks:
     config_path = MODEL_DIR / "config.json"
     checkpoint_config = json.loads(config_path.read_text(encoding="utf-8"))
         "#808080",
     )
     with gr.Blocks(
+        **supported_kwargs(
+            gr.Blocks,
+            title="OpenAI Privacy Filter",
+            fill_width=True,
+            elem_id="privacy-filter-app",
+        )
     ) as demo:
         gr.Markdown("# OpenAI Privacy Filter Demo")
+        gr.Markdown(
+            "Detect and redact personal identifiers using `openai/privacy-filter`.\n\n"
+            "This demo highlights predicted spans and generates a redacted text variant "
+            "with label placeholders."
+        )
         with gr.Column(variant="panel"):
             input_text = gr.Textbox(
+                **supported_kwargs(
+                    gr.Textbox,
+                    lines=6,
+                    label="Input text with PII",
+                    placeholder="Paste text to detect personal identifiers and generate redacted output...",
+                    container=False,
+                )
             )
+        with gr.Row():
+            submit_button = gr.Button("Detect & Redact", variant="primary")
+            clear_button = gr.Button("Clear")
         with gr.Column(variant="panel"):
             output_text = gr.HighlightedText(
+                **supported_kwargs(
+                    gr.HighlightedText,
+                    label="Detected entities (highlighted)",
+                    value=EMPTY_HIGHLIGHT_PAYLOAD,
+                    color_map={
+                        label: web_color_palette[idx % len(web_color_palette)]
+                        for idx, label in enumerate(
+                            label for label in span_class_names if label != BACKGROUND_CLASS_LABEL
+                        )
+                    },
+                    combine_adjacent=False,
+                    show_legend=False,
+                    container=True,
+                )
+            )
+            redacted_output = gr.Textbox(
+                **supported_kwargs(
+                    gr.Textbox,
+                    label="Redacted text output",
+                    lines=6,
+                    show_copy_button=True,
+                    interactive=False,
+                )
+            )
+            entity_summary = gr.Markdown(EMPTY_SUMMARY_MARKDOWN)
+        with gr.Accordion("How to read results", open=False):
+            gr.Markdown(
+                "- Detects 8 span categories: person, email, phone, address, date, URL, "
+                "account number, and secrets.\n"
+                "- Uses sequence decoding (BIOES + constrained Viterbi) for cleaner boundaries.\n"
+                "- Best treated as a redaction aid, not a standalone compliance or anonymization guarantee.\n"
+                "- Official card notes strongest support is English, with limited multilingual robustness."
             )
         submit_button.click(
+            fn=predict_for_demo,
             inputs=input_text,
+            outputs=[output_text, redacted_output, entity_summary],
+            api_name="predict_and_redact",
         )
         input_text.submit(
+            fn=predict_for_demo,
             inputs=input_text,
+            outputs=[output_text, redacted_output, entity_summary],
         )
         clear_button.click(
+            lambda: ("", EMPTY_HIGHLIGHT_PAYLOAD, "", EMPTY_SUMMARY_MARKDOWN),
+            outputs=[input_text, output_text, redacted_output, entity_summary],
         )
+        gr.Markdown("### Multilingual quick examples")
         gr.Examples(
             examples=[
                 ["Alice was born on 1990-01-02 and lives at 1 Main St."],
                 ["Email me at alice@example.com or call 415-555-0101."],
+                ["Me llamo Laura Gómez y vivo en Calle de Alcalá 21, Madrid."],
+                ["Mon e-mail est jean.dupont@example.fr et mon téléphone est +33 6 12 34 56 78."],
+                ["私の名前は山田太郎です。メールはtaro.yamada@example.jpです。"],
+                ["اسمي أحمد وبريدي هو ahmed@example.com ورقم هاتفي +971501234567."],
             ],
             inputs=input_text,
+            outputs=[output_text, redacted_output, entity_summary],
+            fn=predict_for_demo,
             cache_examples=False,
         )
     return demo
 if __name__ == "__main__":
     demo = build_demo()
+    demo.launch()