# pii-redactor / app.py
# mukunda1729 · Initial: PII redactor with mask/hash modes · ad4f7c6 (verified)
"""PII redactor — find and redact common personal info / secrets in text.
Regex-based, deliberately simple. Mirrors the patterns in the
pii-detection-fixtures dataset.
"""
import re
import gradio as gr
# Detection table. Each entry is a (label, regex) pair -- exactly two elements,
# because redact() unpacks entries as `for label, pattern in PATTERNS`.
# The label is upper-cased to build the replacement placeholder (e.g. "[EMAIL]").
# redact() runs the patterns in list order and reports findings in that order.
PATTERNS = [
    ("email", r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"),
    # Needs a 1-2 digit prefix (optionally preceded by "+") in front of the
    # 3-3-4 digit groups, so a bare 10-digit number does not match.
    ("phone", r"\+?\d{1,2}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"),
    # NNN-NN-NNNN, dashes required.
    ("ssn", r"\b\d{3}-\d{2}-\d{4}\b"),
    # Four groups of four digits, each optionally followed by "-" or whitespace.
    ("credit_card", r"\b(?:\d{4}[-\s]?){3}\d{4}\b"),
    # Dotted quad; octets are not range-checked (999.999.999.999 matches).
    ("ipv4", r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    # "AKIA" followed by 16 uppercase letters/digits.
    ("aws_access_key", r"\bAKIA[0-9A-Z]{16}\b"),
    # "ghp_" or "ghs_" prefix followed by at least 30 alphanumerics.
    ("github_token", r"\bgh[ps]_[A-Za-z0-9]{30,}\b"),
    # "xoxb-", "xoxp-", "xoxa-" or "xoxs-" prefix.
    ("slack_token", r"\bxox[bpas]-[A-Za-z0-9-]{10,}\b"),
    # "sk_live_" or "sk_test_" prefix.
    ("stripe_key", r"\bsk_(?:live|test)_[A-Za-z0-9]{20,}\b"),
    # "sk-" followed by at least 20 alphanumerics (no "-" or "_" allowed after the prefix).
    ("openai_key", r"\bsk-[A-Za-z0-9]{20,}\b"),
    # Three dot-separated segments, the first starting with "eyJ".
    ("jwt", r"\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b"),
    # Six colon-separated hex pairs.
    ("mac_address", r"\b(?:[0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}\b"),
    # scheme://user:password@host.
    # NOTE(review): `[^:]+` and `[^@]+` also accept whitespace and newlines, so a
    # match can stretch from a plain URL to a much later ":" and "@".
    ("credentials_url", r"\b\w+://[^:]+:[^@]+@[\w.-]+"),
]
def redact(text: str, mode: str):
    """Scan *text* for PII / secrets; return a findings report and redacted text.

    Every pattern in ``PATTERNS`` is matched against the *original* input, and
    the redacted output is built from those same match spans in a single pass.
    Placeholders therefore never get re-scanned by later patterns, and the
    report and the redacted text always describe the same matches.

    Overlapping matches (e.g. an email-shaped ``pass@host`` inside a
    credentials URL) are merged into one redacted region. The region takes the
    label of the match that starts first (the longest one on a tie), and in
    Hash mode the digest covers the whole merged region.

    Args:
        text: Raw input to scan. ``None``, empty, or whitespace-only input
            short-circuits to placeholder messages.
        mode: ``"Mask"`` replaces each region with ``[LABEL]``; ``"Hash"``
            replaces it with ``[LABEL:<first 8 hex chars of its SHA-256>]``;
            any other value (e.g. ``"Highlight only"``) leaves the text as is.

    Returns:
        A ``(report, redacted)`` tuple: a Markdown findings report (one table
        row per match, in pattern order) and the redacted text.
    """
    import hashlib  # function-scope: the module does not import hashlib at top level

    if not text or not text.strip():
        return "_Enter some text to scan._", "_(no output)_"

    findings = [
        {"type": label, "value": m.group(0), "start": m.start(), "end": m.end()}
        for label, pattern in PATTERNS
        for m in re.finditer(pattern, text)
    ]

    redacted = text
    if findings and mode in ("Mask", "Hash"):
        # Merge overlapping spans so no part of a detected value survives.
        # The sort is stable, so identical spans keep PATTERNS order.
        regions = []  # each item: [start, end, label]
        for hit in sorted(findings, key=lambda h: (h["start"], -h["end"])):
            if regions and hit["start"] < regions[-1][1]:
                regions[-1][1] = max(regions[-1][1], hit["end"])
            else:
                regions.append([hit["start"], hit["end"], hit["type"]])

        pieces = []
        cursor = 0
        for start, end, label in regions:
            pieces.append(text[cursor:start])
            if mode == "Mask":
                pieces.append(f"[{label.upper()}]")
            else:
                digest = hashlib.sha256(text[start:end].encode()).hexdigest()[:8]
                pieces.append(f"[{label.upper()}:{digest}]")
            cursor = end
        pieces.append(text[cursor:])
        redacted = "".join(pieces)

    if not findings:
        return "✅ **No PII detected** by these patterns.", redacted

    rows = [
        f"⚠️ **{len(findings)} match(es) found:**",
        "",
        "| Type | Value | Position |",
        "|---|---|---|",
    ]
    for hit in findings:
        value = hit["value"]
        shown = value[:40] + "…" if len(value) > 40 else value
        # Keep the Markdown table intact: a raw newline or "|" would split the row.
        shown = re.sub(r"[\r\n]+", " ", shown).replace("|", "\\|")
        rows.append(f"| `{hit['type']}` | `{shown}` | {hit['start']}–{hit['end']} |")
    return "\n".join(rows), redacted
# --- Gradio UI ---------------------------------------------------------------
# Layout: intro text, then a two-column row (input + controls on the left,
# findings + redacted output on the right), then a footer.
with gr.Blocks(title="PII Redactor", theme=gr.themes.Soft()) as demo:
    # Intro copy, written as explicit literals so code indentation never leaks
    # into the Markdown source.
    gr.Markdown(
        "\n"
        "# PII Redactor\n"
        "Find and redact common personal info / secrets in text. Regex-based, "
        "deliberately simple — mirrors the patterns in the "
        "[`pii-detection-fixtures`](https://huggingface.co/datasets/mukunda1729/pii-detection-fixtures) "
        "dataset.\n"
        "Use this before sending text into a logging system, an LLM prompt, "
        "or a third-party API.\n"
    )

    with gr.Row():
        # Left column: what to scan and how to redact it.
        with gr.Column():
            input_box = gr.Textbox(
                label="Input text",
                lines=10,
                value="Email me at alice@example.com or call +1-415-555-2024.\nMy AWS key is AKIAIOSFODNN7EXAMPLE.",
            )
            mode_choice = gr.Radio(
                choices=["Mask", "Hash", "Highlight only"],
                value="Mask",
                label="Redaction mode",
            )
            scan_button = gr.Button("Scan + redact", variant="primary")

        # Right column: the two values returned by redact().
        with gr.Column():
            report_view = gr.Markdown(label="Findings")
            redacted_view = gr.Textbox(label="Redacted output", lines=10)

    # One click runs redact(text, mode) and fills both outputs.
    scan_button.click(
        fn=redact,
        inputs=[input_box, mode_choice],
        outputs=[report_view, redacted_view],
    )

    gr.Markdown(
        "\n"
        "---\n"
        "Part of [The Agent Reliability Stack](https://mukundakatta.github.io/agent-stack/)"
        " · MIT licensed\n"
    )


if __name__ == "__main__":
    # Bind to all interfaces on port 7860 when run as a script.
    demo.launch(server_name="0.0.0.0", server_port=7860)