"""PII redactor — find and redact common personal info / secrets in text. Regex-based, deliberately simple. Mirrors the patterns in the pii-detection-fixtures dataset. """ import re import gradio as gr # (label, pattern, optional anchor for nicer matches) PATTERNS = [ ("email", r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"), ("phone", r"\+?\d{1,2}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"), ("ssn", r"\b\d{3}-\d{2}-\d{4}\b"), ("credit_card", r"\b(?:\d{4}[-\s]?){3}\d{4}\b"), ("ipv4", r"\b(?:\d{1,3}\.){3}\d{1,3}\b"), ("aws_access_key", r"\bAKIA[0-9A-Z]{16}\b"), ("github_token", r"\bgh[ps]_[A-Za-z0-9]{30,}\b"), ("slack_token", r"\bxox[bpas]-[A-Za-z0-9-]{10,}\b"), ("stripe_key", r"\bsk_(?:live|test)_[A-Za-z0-9]{20,}\b"), ("openai_key", r"\bsk-[A-Za-z0-9]{20,}\b"), ("jwt", r"\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b"), ("mac_address", r"\b(?:[0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}\b"), ("credentials_url", r"\b\w+://[^:]+:[^@]+@[\w.-]+"), ] def redact(text: str, mode: str): if not text.strip(): return "_Enter some text to scan._", "_(no output)_" findings = [] redacted = text for label, pattern in PATTERNS: for m in re.finditer(pattern, text): findings.append({"type": label, "value": m.group(0), "start": m.start(), "end": m.end()}) if mode == "Mask": redacted = re.sub(pattern, lambda m, l=label: f"[{l.upper()}]", redacted) elif mode == "Hash": import hashlib def _hash(m): h = hashlib.sha256(m.group(0).encode()).hexdigest()[:8] return f"[{label.upper()}:{h}]" redacted = re.sub(pattern, _hash, redacted) else: # "Highlight only" pass if not findings: report = "✅ **No PII detected** by these patterns." else: rows = [f"⚠️ **{len(findings)} match(es) found:**", ""] rows.append("| Type | Value | Position |") rows.append("|---|---|---|") for f in findings: v = f["value"][:40] + "…" if len(f["value"]) > 40 else f["value"] rows.append(f"| `{f['type']}` | `{v}` | {f['start']}–{f['end']} |") report = "\n".join(rows) return report, redacted with gr.Blocks(title="PII Redactor", theme=gr.themes.Soft()) as demo: gr.Markdown( """ # PII Redactor Find and redact common personal info / secrets in text. Regex-based, deliberately simple — mirrors the patterns in the [`pii-detection-fixtures`](https://huggingface.co/datasets/mukunda1729/pii-detection-fixtures) dataset. Use this before sending text into a logging system, an LLM prompt, or a third-party API. """ ) with gr.Row(): with gr.Column(): txt = gr.Textbox( value="Email me at alice@example.com or call +1-415-555-2024.\nMy AWS key is AKIAIOSFODNN7EXAMPLE.", label="Input text", lines=10, ) mode = gr.Radio(["Mask", "Hash", "Highlight only"], value="Mask", label="Redaction mode") btn = gr.Button("Scan + redact", variant="primary") with gr.Column(): findings_out = gr.Markdown(label="Findings") redacted_out = gr.Textbox(label="Redacted output", lines=10) btn.click(redact, inputs=[txt, mode], outputs=[findings_out, redacted_out]) gr.Markdown( """ --- Part of [The Agent Reliability Stack](https://mukundakatta.github.io/agent-stack/) · MIT licensed """ ) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)