""" PII Reveal - Document Privacy Explorer ======================================= Uploads a PDF/DOC/DOCX, runs the openai/privacy-filter model over the extracted text, and returns per-span character offsets + stats for an interactive reader view. Also supports building a black-bar redacted PDF. Inference path: `transformers.pipeline("token-classification", "openai/privacy-filter", aggregation_strategy="simple")` — the pipeline takes care of BIOES → char-level span aggregation for us. PDF redaction (build_redacted_pdf_bytes) is optimized for large files: per-page `needle in page_text` prefilter before page.search_for, skip apply_redactions on pages with no matches, and save with garbage=1 to avoid the expensive stream-recompression pass. """ # ── stdlib ─────────────────────────────────────────────────────── import functools import io import json import os import re import tempfile import time from pathlib import Path # ── third-party ────────────────────────────────────────────────── import gradio as gr import spaces import torch from fastapi.responses import HTMLResponse from gradio.data_classes import FileData # ── configuration ──────────────────────────────────────────────── PII_MODEL_REPO = os.getenv("MODEL_ID", "openai/privacy-filter") HF_TOKEN = os.getenv("HF_TOKEN", None) CATEGORIES_META = { "private_person": {"color": "#E24B4A", "cls": "hp", "label": "Person", "mono": False}, "private_date": {"color": "#7F77DD", "cls": "hd", "label": "Date", "mono": True}, "private_address": {"color": "#1D9E75", "cls": "ha", "label": "Address", "mono": False}, "private_email": {"color": "#378ADD", "cls": "he", "label": "Email", "mono": True}, "account_number": {"color": "#BA7517", "cls": "hac", "label": "Account", "mono": True}, "private_url": {"color": "#D85A30", "cls": "hu", "label": "URL", "mono": True}, "secret": {"color": "#D4537E", "cls": "hs", "label": "Secret", "mono": True}, "private_phone": {"color": "#639922", "cls": "hph", "label": "Phone", "mono": True}, } # ===================================================================== # MODEL INFERENCE (transformers pipeline — openai/privacy-filter) # ===================================================================== @functools.lru_cache(maxsize=1) def get_pii_pipeline(): """Lazy-load the privacy filter on the GPU. Cached so repeated calls inside a single ZeroGPU slot don't re-move weights.""" from transformers import pipeline return pipeline( task="token-classification", model=PII_MODEL_REPO, aggregation_strategy="simple", # merges BIOES tags into char-level spans device=0, torch_dtype=torch.bfloat16, token=HF_TOKEN, ) def predict_text(text: str) -> tuple[str, list[dict]]: """Returns (source_text, spans). `spans` is a list of {label, start, end, text} with character offsets into `text`.""" if not text.strip(): return text, [] pipe = get_pii_pipeline() results = pipe(text) spans = [] for r in results: label = r.get("entity_group") or r.get("entity") if not label or label == "O": continue s, e = int(r["start"]), int(r["end"]) if e <= s or s < 0 or e > len(text): continue spans.append({"label": label, "start": s, "end": e, "text": text[s:e]}) return text, spans # ===================================================================== # APPLICATION LAYER # ===================================================================== def _sniff_suffix(path: str) -> str: """Detect file type from magic bytes when the filename extension is missing (Gradio's server-side temp path often drops the suffix).""" try: with open(path, "rb") as f: header = f.read(8) except OSError: return "" if header.startswith(b"%PDF-"): return ".pdf" if header.startswith(b"PK\x03\x04"): # zip container — .docx return ".docx" if header.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"): # OLE2 — legacy .doc return ".doc" return "" def _resolve_suffix(path: str, orig_name: str) -> str: """Pick the best available suffix: orig_name → path → magic bytes.""" for candidate in (orig_name, path): s = Path(candidate or "").suffix.lower() if s: return s return _sniff_suffix(path) def extract_text(file_path: str, suffix: str | None = None) -> str: suffix = (suffix or Path(file_path).suffix).lower() if suffix == ".pdf": import fitz doc = fitz.open(file_path) pages = [page.get_text() for page in doc] doc.close() return "\n\n".join(pages) elif suffix in (".docx", ".doc"): from docx import Document doc = Document(file_path) return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip()) raise ValueError(f"Unsupported file type: {suffix}") def compute_stats(text, spans): total = len(text) pii_chars = sum(s["end"] - s["start"] for s in spans) by_cat = {} for s in spans: c = s["label"] by_cat.setdefault(c, {"count": 0, "chars": 0}) by_cat[c]["count"] += 1; by_cat[c]["chars"] += s["end"] - s["start"] return { "total_chars": total, "pii_chars": pii_chars, "pii_percentage": round(pii_chars / total * 100, 1) if total else 0, "total_spans": len(spans), "categories": by_cat, "num_categories": len(by_cat), "total_lines": text.count("\n") + 1 if total else 0, } def detect_speakers(text, spans): patterns = [r"^([A-Z][a-zA-Z ]{1,30}):\s", r"^\[([^\]]{1,30})\]\s", r"^(Speaker\s*\d+):\s"] line_sp, pos, cur = [], 0, None for line in text.split("\n"): for p in patterns: m = re.match(p, line) if m: cur = m.group(1).strip(); break line_sp.append((pos, pos + len(line), cur)); pos += len(line) + 1 result = {} for span in spans: mid = (span["start"] + span["end"]) // 2 speaker = "Document" for ls, le, sp in line_sp: if ls <= mid <= le and sp: speaker = sp; break result[speaker] = result.get(speaker, 0) + 1 return {} if list(result.keys()) == ["Document"] else result @spaces.GPU def run_pii_analysis(text: str): """GPU-accelerated PII detection.""" return predict_text(text) def build_redacted_pdf_bytes(pdf_path: str, pii_texts: list[str]) -> bytes: """ Fast PDF redaction via PyMuPDF. Perf notes vs the v5 implementation that ran for "several minutes": 1. Dedupe needles once; process longest first so full spans win over their substrings. 2. Pull each page's full text string ONCE, then do a cheap Python `needle in page_text` prefilter before ever calling page.search_for (which is the expensive call). This avoids the 100-page * 200-needle = 20k wasted search calls. 3. Skip apply_redactions on pages with no matches. 4. save(garbage=1, deflate=True) — garbage=4 in v5 recompressed every stream and dominated the save time on large docs. """ import fitz ordered = sorted( {t.strip() for t in pii_texts if t and len(t.strip()) >= 2}, key=len, reverse=True, ) if not ordered: # No needles — return the original untouched return Path(pdf_path).read_bytes() doc = fitz.open(pdf_path) try: for page in doc: page_text = page.get_text() if not page_text: continue needles = [t for t in ordered if t in page_text] if not needles: continue added = False for needle in needles: for rect in page.search_for(needle): page.add_redact_annot(rect, fill=(0, 0, 0)) added = True if added: page.apply_redactions() buf = io.BytesIO() doc.save(buf, garbage=1, deflate=True) return buf.getvalue() finally: doc.close() # ── Gradio Server ──────────────────────────────────────────────── # # We only keep one plain FastAPI route here — the homepage, which # serves the static HTML shell. The heavy lifting endpoints are # declared with @server.api, which wraps them in Gradio's queue so # they compose correctly with @spaces.GPU on ZeroGPU and with the # gradio_client / @gradio/client SDKs. server = gr.Server() @server.get("/", response_class=HTMLResponse) async def homepage(): return FRONTEND_HTML @server.api(name="analyze_document") def analyze_document_api(file: FileData) -> dict: """Extract text from an uploaded PDF/DOC/DOCX and run the OPF privacy filter over it. Returns the detected spans, stats, per-speaker counts, and the category color/label table. Called from the browser via @gradio/client: client.predict("/analyze_document", { file: handle_file(f) }) And from Python via gradio_client: client.predict("/analyze_document", file=handle_file(path)) """ path = file.get("path") or "" orig_name = file.get("orig_name") or Path(path).name suffix = _resolve_suffix(path, orig_name) if suffix not in (".pdf", ".doc", ".docx"): return {"error": f"Unsupported: {suffix or '(no extension)'}. Use PDF, DOC, or DOCX."} try: text = extract_text(path, suffix=suffix) if not text.strip(): return {"error": "No text content found."} source_text, spans = run_pii_analysis(text) stats = compute_stats(source_text, spans) speakers = detect_speakers(source_text, spans) return { "filename": orig_name, "text": source_text, "spans": spans, "stats": stats, "speakers": speakers, "categories_meta": { k: {"color": v["color"], "cls": v["cls"], "label": v["label"], "mono": v["mono"]} for k, v in CATEGORIES_META.items() }, } except Exception as e: return {"error": str(e)} @server.api(name="redact_pdf") def redact_pdf_api(file: FileData, spans: str, active: str) -> dict: """Build a black-bar-redacted PDF from an uploaded PDF plus the list of spans the browser wants redacted. `spans` and `active` are JSON strings because the JS client serializes complex objects more predictably as strings than as nested dicts. Returns {"pdf": FileData, "elapsed_ms": int} so the caller can download the file and also display timing.""" path = file.get("path") or "" orig_name = file.get("orig_name") or Path(path).name suffix = _resolve_suffix(path, orig_name) if suffix != ".pdf": return {"error": "PDF redaction only accepts PDF input."} try: span_list = json.loads(spans) active_set = set(json.loads(active)) except Exception as e: return {"error": f"Invalid payload: {e}"} pii_texts = [ s.get("text", "") for s in span_list if s.get("label") in active_set ] if not pii_texts: return {"error": "No active categories selected — nothing to redact."} try: t0 = time.perf_counter() pdf_bytes = build_redacted_pdf_bytes(path, pii_texts) elapsed_ms = int((time.perf_counter() - t0) * 1000) except Exception as e: return {"error": str(e)} stem = Path(orig_name).stem or "document" out_path = Path(tempfile.gettempdir()) / f"{stem}.redacted.pdf" out_path.write_bytes(pdf_bytes) return { "pdf": FileData(path=str(out_path)), "elapsed_ms": elapsed_ms, } @server.api(name="analyze_text") def analyze_text_api(text: str) -> dict: """Analyze raw text for PII — convenient for gradio_client users who don't want to build a PDF just to test the model.""" source_text, spans = run_pii_analysis(text) stats = compute_stats(source_text, spans) return {"text": source_text, "spans": spans, "stats": stats} # ── Frontend HTML (v6) ─────────────────────────────────────────── FRONTEND_HTML = r"""
Find every PII span in a PDF, DOC or DOCX — names, accounts, secrets and five other entity types — then export a fully redacted copy.