Spaces:
Sleeping
Sleeping
| """PII redactor — find and redact common personal info / secrets in text. | |
| Regex-based, deliberately simple. Mirrors the patterns in the | |
| pii-detection-fixtures dataset. | |
| """ | |
| import re | |
| import gradio as gr | |
# Detection table: each entry is a (label, regex) pair, scanned in list order.
# The label names the finding in the report and, upper-cased, becomes the
# redaction placeholder. The patterns are intentionally loose (for example,
# "ipv4" does not range-check octets); they are heuristics, not validators.
PATTERNS = [
    ("email", r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"),
    ("phone", r"\+?\d{1,2}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"),
    ("ssn", r"\b\d{3}-\d{2}-\d{4}\b"),
    ("credit_card", r"\b(?:\d{4}[-\s]?){3}\d{4}\b"),
    ("ipv4", r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    ("aws_access_key", r"\bAKIA[0-9A-Z]{16}\b"),
    ("github_token", r"\bgh[ps]_[A-Za-z0-9]{30,}\b"),
    ("slack_token", r"\bxox[bpas]-[A-Za-z0-9-]{10,}\b"),
    ("stripe_key", r"\bsk_(?:live|test)_[A-Za-z0-9]{20,}\b"),
    ("openai_key", r"\bsk-[A-Za-z0-9]{20,}\b"),
    ("jwt", r"\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b"),
    ("mac_address", r"\b(?:[0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}\b"),
    # The user and password segments exclude whitespace. Without that, a match
    # could begin at any plain URL and run across spaces and newlines to a
    # later "x:y@z" elsewhere in the text, redacting unrelated content.
    ("credentials_url", r"\b\w+://[^\s:]+:[^\s@]+@[\w.-]+"),
]
def redact(text: str, mode: str, *, patterns=None):
    """Scan ``text`` for PII / secret patterns and build a report plus redacted text.

    Every pattern is matched against the ORIGINAL text. Overlapping hits are
    then merged into disjoint spans and substituted in a single pass, so
    everything listed in the report is covered in the redacted output, even
    when two patterns match the same region (for example a phone-shaped prefix
    of a card number, or the e-mail-shaped tail of a credentials URL).

    Args:
        text: Input to scan.
        mode: ``"Mask"`` replaces each span with ``[LABEL]``; ``"Hash"``
            replaces it with ``[LABEL:<first 8 hex chars of its SHA-256>]``;
            any other value (e.g. ``"Highlight only"``) leaves the text as is.
        patterns: Optional iterable of ``(label, regex)`` pairs to scan with.
            Defaults to the module-level ``PATTERNS`` table.

    Returns:
        A ``(report_markdown, redacted_text)`` tuple. For blank input, both
        elements are fixed placeholder messages.
    """
    if not text.strip():
        return "_Enter some text to scan._", "_(no output)_"

    active = PATTERNS if patterns is None else patterns

    # Findings are kept in pattern order, then position, for the report.
    findings = []
    for label, pattern in active:
        for m in re.finditer(pattern, text):
            findings.append(
                {"type": label, "value": m.group(0), "start": m.start(), "end": m.end()}
            )

    if mode in ("Mask", "Hash"):
        redacted = _apply_redactions(text, findings, mode)
    else:  # "Highlight only"
        redacted = text
    return _format_report(findings), redacted


def _merge_spans(findings):
    """Collapse overlapping findings into disjoint ``(start, end, label)`` spans."""
    spans = []
    # Earliest start first; at an equal start the longest hit leads, so its
    # label names the merged span. sorted() is stable, so exact ties keep
    # pattern order.
    for hit in sorted(findings, key=lambda f: (f["start"], -f["end"])):
        if spans and hit["start"] < spans[-1][1]:
            start, end, label = spans[-1]
            spans[-1] = (start, max(end, hit["end"]), label)
        else:
            spans.append((hit["start"], hit["end"], hit["type"]))
    return spans


def _placeholder(label, value, mode):
    """Return the replacement token for one redacted span."""
    if mode == "Hash":
        import hashlib  # function-scope import, as the module does not import it

        digest = hashlib.sha256(value.encode()).hexdigest()[:8]
        return f"[{label.upper()}:{digest}]"
    return f"[{label.upper()}]"


def _apply_redactions(text, findings, mode):
    """Rebuild ``text`` with every merged span swapped for its placeholder."""
    pieces = []
    cursor = 0
    for start, end, label in _merge_spans(findings):
        pieces.append(text[cursor:start])
        pieces.append(_placeholder(label, text[start:end], mode))
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


def _table_cell(value):
    """Shorten ``value`` to 40 characters and make it safe for one table cell."""
    # A raw newline would end the table row, so flatten it to a space.
    flat = value.replace("\r", " ").replace("\n", " ")
    if len(flat) > 40:
        flat = flat[:40] + "…"
    # An unescaped pipe would start a new column.
    return flat.replace("|", "\\|")


def _format_report(findings):
    """Render the findings as a Markdown summary table."""
    if not findings:
        return "✅ **No PII detected** by these patterns."
    rows = [
        f"⚠️ **{len(findings)} match(es) found:**",
        "",
        "| Type | Value | Position |",
        "|---|---|---|",
    ]
    rows.extend(
        f"| `{f['type']}` | `{_table_cell(f['value'])}` | {f['start']}–{f['end']} |"
        for f in findings
    )
    return "\n".join(rows)
# Build the Gradio UI. Components are placed in the order they are created
# inside each context manager, so statement order here defines the layout.
with gr.Blocks(title="PII Redactor", theme=gr.themes.Soft()) as demo:
    # Page header / introduction.
    gr.Markdown(
        """
# PII Redactor
Find and redact common personal info / secrets in text. Regex-based, deliberately simple — mirrors the patterns in the [`pii-detection-fixtures`](https://huggingface.co/datasets/mukunda1729/pii-detection-fixtures) dataset.
Use this before sending text into a logging system, an LLM prompt, or a third-party API.
"""
    )
    with gr.Row():
        # First column: the text to scan, the redaction mode, and the trigger button.
        with gr.Column():
            txt = gr.Textbox(
                value="Email me at alice@example.com or call +1-415-555-2024.\nMy AWS key is AKIAIOSFODNN7EXAMPLE.",
                label="Input text",
                lines=10,
            )
            mode = gr.Radio(["Mask", "Hash", "Highlight only"], value="Mask", label="Redaction mode")
            btn = gr.Button("Scan + redact", variant="primary")
        # Second column: the findings report and the redacted text.
        with gr.Column():
            findings_out = gr.Markdown(label="Findings")
            redacted_out = gr.Textbox(label="Redacted output", lines=10)
    # redact() returns (report, redacted), matching the two outputs in order.
    btn.click(redact, inputs=[txt, mode], outputs=[findings_out, redacted_out])
    # Page footer.
    gr.Markdown(
        """
---
Part of [The Agent Reliability Stack](https://mukundakatta.github.io/agent-stack/) · MIT licensed
"""
    )
# Start the server only when run as a script. It is bound to 0.0.0.0 (all
# network interfaces) on port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)