"""
DLP Paste-Proxy — "Pastebin with a conscience"
================================================

A sleek paste-to-share service. The author pastes PII-rich text and gets a
shareable URL. Recipients at that URL see the OPF-redacted version by
default; a separate "reveal" link guarded by an unguessable token shows the
original.

Why gr.Server? We need four HTTP surfaces that don't map cleanly onto
gr.Blocks event wiring:

  * @server.api create_paste  - accept paste, run OPF, mint IDs
                                (queued compute → @gradio/client)
  * GET /view/{id}            - public redacted view page
  * GET /view/{id}?token=...  - author's reveal page
  * GET /api/paste/{pid}      - JSON lookup of an existing paste

plus a background sweeper for auto-expiry.

The create_paste endpoint is the only one that runs the OPF model, so it is
the only one that needs Gradio's queue + ZeroGPU wiring — the other three
are pure lookup/rendering and are served as plain FastAPI routes, which the
gradio.Server blog recommends for static content.

Storage is an in-process dict. That is fine for a public demo — the point is
to illustrate the request-composition model; it is NOT a durable pastebin.
Restarting the Space clears all pastes.
"""
from __future__ import annotations

import html
import json
import os
import re
import secrets
import threading
import time
from dataclasses import dataclass
from typing import Optional

import gradio as gr
from fastapi.responses import HTMLResponse, JSONResponse

# spaces is only available on Hugging Face Spaces; degrade gracefully
# when running locally so `python app.py` still works off-GPU.
try:
    import spaces
    _HAS_SPACES = True
except ImportError:
    _HAS_SPACES = False

from opf import predict_text


# ── configuration ─────────────────────────────────────────────────
MAX_PASTE_CHARS = int(os.getenv("MAX_PASTE_CHARS", "50000"))
SWEEP_INTERVAL_SEC = int(os.getenv("SWEEP_INTERVAL_SEC", "30"))

# Accepted `ttl` values mapped to a lifetime in seconds (None = no expiry).
TTL_CHOICES: dict[str, Optional[int]] = {
    "never": None,
    "1h": 60 * 60,
    "24h": 60 * 60 * 24,
    "7d": 60 * 60 * 24 * 7,
}

# Display colour and short label for each span label the UI knows about.
CATEGORIES_META = {
    "private_person": {"color": "#E24B4A", "label": "Person"},
    "private_date": {"color": "#1E7DD1", "label": "Date"},
    "private_address": {"color": "#1D9E75", "label": "Address"},
    "private_email": {"color": "#0EA5A1", "label": "Email"},
    "account_number": {"color": "#BA7517", "label": "Account"},
    "private_url": {"color": "#D85A30", "label": "URL"},
    "secret": {"color": "#52525b", "label": "Secret"},
    "private_phone": {"color": "#639922", "label": "Phone"},
}


# ── paste store ───────────────────────────────────────────────────
@dataclass
class Paste:
    """One stored paste: original text, redacted text and bookkeeping."""

    id: str
    reveal_token: str
    original: str
    redacted: str
    spans: list[dict]
    stats: dict
    created_at: float
    expires_at: Optional[float]  # epoch seconds; None means "never"
    views: int = 0
    reveals: int = 0


PASTES: dict[str, Paste] = {}
LOCK = threading.RLock()


def _store_put(paste: Paste) -> None:
    """Insert (or overwrite) *paste* in the in-process store."""
    with LOCK:
        PASTES[paste.id] = paste


def _store_get(pid: str) -> Optional[Paste]:
    """Return the paste with id *pid*, or None if unknown or expired.

    An expired paste is removed from the store on lookup, so expiry is
    enforced even between sweeper runs.
    """
    with LOCK:
        p = PASTES.get(pid)
        if p is None:
            return None
        if p.expires_at is not None and p.expires_at <= time.time():
            PASTES.pop(pid, None)
            return None
        return p


def _sweep_loop() -> None:
    """Loop forever, deleting expired pastes every SWEEP_INTERVAL_SEC."""
    while True:
        time.sleep(SWEEP_INTERVAL_SEC)
        now = time.time()
        with LOCK:
            expired = [
                pid for pid, p in PASTES.items()
                if p.expires_at is not None and p.expires_at <= now
            ]
            for pid in expired:
                PASTES.pop(pid, None)


threading.Thread(target=_sweep_loop, daemon=True, name="paste-sweeper").start()


# ── redaction ─────────────────────────────────────────────────────
def redact(text: str, spans: list[dict]) -> str:
    """Replace each detected span in *text* with a ``<LABEL>`` placeholder.

    Each span is a dict with ``start``, ``end`` and ``label`` keys; the
    placeholder is the upper-cased label in angle brackets.

    Spans are expected to be non-overlapping, but we merge defensively:
    spans that overlap or nest are collapsed into one region covering
    their union, labelled by the earliest-starting span. Dropping one of
    two overlapping spans instead would leave part of the dropped span's
    text un-redacted in the public view. Spans that merely touch
    (``end == next start``) stay separate.
    """
    # Longest-first among equal starts so the outer span supplies the label.
    ordered = sorted(spans, key=lambda sp: (sp["start"], -sp["end"]))

    merged: list[list] = []  # [start, end, label]
    for sp in ordered:
        start, end = sp["start"], sp["end"]
        if merged and start < merged[-1][1]:
            # Overlaps the previous region: widen it rather than skip.
            merged[-1][1] = max(merged[-1][1], end)
            continue
        merged.append([start, end, sp["label"]])

    pieces: list[str] = []
    cursor = 0
    for start, end, label in merged:
        pieces.append(text[cursor:start])
        pieces.append(f"<{label.upper()}>")
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


def compute_stats(text: str, spans: list[dict]) -> dict:
    """Summarise how much of *text* the spans cover, overall and per label.

    Returns a dict with ``total_chars``, ``pii_chars``, ``pii_percentage``
    (one decimal place, 0.0 for empty text), ``total_spans`` and
    ``categories`` (label -> ``{"count", "chars"}``).
    """
    total = len(text)
    pii_chars = sum(s["end"] - s["start"] for s in spans)
    by_cat: dict[str, dict[str, int]] = {}
    for s in spans:
        c = s["label"]
        by_cat.setdefault(c, {"count": 0, "chars": 0})
        by_cat[c]["count"] += 1
        by_cat[c]["chars"] += s["end"] - s["start"]
    return {
        "total_chars": total,
        "pii_chars": pii_chars,
        "pii_percentage": round(pii_chars / total * 100, 1) if total else 0.0,
        "total_spans": len(spans),
        "categories": by_cat,
    }


# ── OPF call (GPU-gated on HF Spaces) ─────────────────────────────
if _HAS_SPACES:
    @spaces.GPU
    def analyze(text: str):
        """Run the OPF model on *text* (GPU-decorated on HF Spaces)."""
        return predict_text(text)
else:
    def analyze(text: str):
        """Run the OPF model on *text* (plain call when `spaces` is absent)."""
        return predict_text(text)


def _token_matches(supplied: Optional[str], expected: str) -> bool:
    """Return True when *supplied* equals *expected*, in constant time.

    Both values are compared as UTF-8 bytes: ``secrets.compare_digest``
    raises ``TypeError`` for ``str`` arguments containing non-ASCII
    characters, which would turn a junk ``?token=`` value into a 500
    instead of simply failing the check.
    """
    if not supplied:
        return False
    return secrets.compare_digest(
        supplied.encode("utf-8", "replace"),
        expected.encode("utf-8", "replace"),
    )


# ── gr.Server wiring ──────────────────────────────────────────────
server = gr.Server()


@server.get("/", response_class=HTMLResponse)
async def home():
    """Serve the compose (paste editor) page."""
    return HTMLResponse(_COMPOSE_HTML)


@server.api(name="create_paste")
def create_paste_api(text: str, ttl: str = "never") -> dict:
    """Scan text with the OPF model, mint a paste id + reveal token, and
    return all the metadata the caller needs to build share URLs.

    This is a @server.api route (not a plain FastAPI POST) because it runs
    the @spaces.GPU-decorated `analyze` — we want the request to go through
    Gradio's queue so concurrent callers don't stomp on each other's
    ZeroGPU allocations. Both the browser (via @gradio/client) and Python
    clients (via gradio_client) hit the same endpoint.

    Validation and inference failures are reported as ``{"error": ...}``
    rather than raised.
    """
    text = (text or "").strip()
    if not text:
        return {"error": "Paste is empty"}
    if len(text) > MAX_PASTE_CHARS:
        return {"error": f"Paste exceeds {MAX_PASTE_CHARS:,} characters"}
    if ttl not in TTL_CHOICES:
        return {"error": f"Unknown ttl {ttl!r}"}

    try:
        source_text, spans = analyze(text)
    except Exception as exc:
        return {"error": f"OPF inference failed: {exc}"}

    redacted = redact(source_text, spans)
    stats = compute_stats(source_text, spans)

    pid = secrets.token_urlsafe(6)
    reveal_token = secrets.token_urlsafe(22)
    ttl_sec = TTL_CHOICES[ttl]
    now = time.time()
    expires_at = (now + ttl_sec) if ttl_sec is not None else None

    _store_put(Paste(
        id=pid,
        reveal_token=reveal_token,
        original=source_text,
        redacted=redacted,
        spans=spans,
        stats=stats,
        created_at=now,
        expires_at=expires_at,
    ))

    return {
        "id": pid,
        "reveal_token": reveal_token,
        "view_path": f"/view/{pid}",
        "reveal_path": f"/view/{pid}?token={reveal_token}",
        "expires_at": expires_at,
        "stats": stats,
        "redacted": redacted,  # let the frontend render a preview without a second round-trip
        "categories_meta": CATEGORIES_META,
    }


@server.get("/view/{pid}", response_class=HTMLResponse)
async def view_paste(pid: str, token: Optional[str] = None):
    """Render a paste: original if *token* is the reveal token, else redacted.

    Unknown or expired ids get the 404 page. A missing or wrong token
    falls back to the redacted view and counts as a normal view.
    """
    p = _store_get(pid)
    if p is None:
        return HTMLResponse(_not_found_html(pid), status_code=404)

    revealed = _token_matches(token, p.reveal_token)
    with LOCK:
        if revealed:
            p.reveals += 1
        else:
            p.views += 1

    return HTMLResponse(_render_view(p, revealed))


@server.get("/api/paste/{pid}")
async def api_get_paste(pid: str, token: Optional[str] = None):
    """Plain FastAPI read-only lookup — no compute, no queue needed.

    The original text and spans are included only when *token* matches
    the paste's reveal token.
    """
    p = _store_get(pid)
    if p is None:
        return JSONResponse({"error": "not found or expired"}, status_code=404)

    revealed = _token_matches(token, p.reveal_token)
    payload = {
        "id": p.id,
        "created_at": p.created_at,
        "expires_at": p.expires_at,
        "stats": p.stats,
        "views": p.views,
        "reveals": p.reveals,
        "redacted": p.redacted,
    }
    if revealed:
        payload["original"] = p.original
        payload["spans"] = p.spans
    return JSONResponse(payload)


# ── HTML rendering ────────────────────────────────────────────────
def _escape(text: str) -> str:
    """HTML-escape ``&``, ``<`` and ``>`` (quotes are left untouched)."""
    return html.escape(text, quote=False)


def _highlight_html(text: str, spans: list[dict]) -> str:
    """Return HTML for text with each span wrapped in a colored mark,
    revealing the original content (used on the reveal page).

    Spans are walked in start order; a span that starts before the end of
    the previously emitted one, or that is empty, is skipped.
    """
    pieces: list[str] = []
    cursor = 0
    for sp in sorted(spans, key=lambda s: s["start"]):
        s, e = sp["start"], sp["end"]
        if s < cursor or e <= s:
            continue
        if s > cursor:
            pieces.append(_escape(text[cursor:s]))
        meta = CATEGORIES_META.get(sp["label"])
        # NOTE(review): `color` is unused below; the wrapping markup in these
        # literals looks like it was stripped from this copy — confirm.
        color = meta["color"] if meta else "#333"
        label = meta["label"] if meta else sp["label"]
        pieces.append(
            f''
            f'{_escape(text[s:e])}'
            f'{_escape(label)}'
            f''
        )
        cursor = e
    if cursor < len(text):
        pieces.append(_escape(text[cursor:]))
    return "".join(pieces)


# A redaction placeholder as produced by `redact`: a label in angle brackets.
_PLACEHOLDER_RE = re.compile(r"<([A-Za-z_]+)>")


def _redacted_html(redacted: str) -> str:
    """Render the redacted version with placeholders as colored pills so
    readers can see what kind of data was stripped.

    Only ``<LABEL>`` tokens whose lower-cased label is a CATEGORIES_META
    key become pills; everything else is escaped and shown literally.
    Matching whole placeholder tokens (rather than pairing any ``<`` with
    the next ``>``) keeps a stray ``<`` in the paste text from swallowing
    the placeholder that follows it.
    """
    out: list[str] = []
    cursor = 0
    for match in _PLACEHOLDER_RE.finditer(redacted):
        meta = CATEGORIES_META.get(match.group(1).lower())
        if meta is None:
            # Unknown label: leave it in the text run, escaped below.
            continue
        out.append(_escape(redacted[cursor:match.start()]))
        out.append(
            f''
            f'{_escape(meta["label"])}'
            f''
        )
        cursor = match.end()
    out.append(_escape(redacted[cursor:]))
    return "".join(out)


def _format_expiry(paste: Paste) -> str:
    """Return a short human-readable description of time left on *paste*."""
    if paste.expires_at is None:
        return "does not expire"
    remaining = paste.expires_at - time.time()
    if remaining <= 0:
        return "expired"
    if remaining < 3600:
        return f"expires in {int(remaining // 60)} min"
    if remaining < 86400:
        return f"expires in {int(remaining // 3600)} h"
    return f"expires in {int(remaining // 86400)} d"


def _render_view(p: Paste, revealed: bool) -> str:
    """Fill the view-page template for *p*.

    *revealed* selects the highlighted original (True) or the redacted
    rendering (False), along with the matching banner and mode label.
    """
    stats = p.stats
    badges_html = "".join(
        f''
        f''
        f'{_escape(CATEGORIES_META.get(cat, {"label": cat})["label"])}'
        f'{info["count"]}'
        f''
        for cat, info in sorted(stats["categories"].items(),
                                key=lambda kv: -kv[1]["count"])
    ) or 'No PII detected in this paste.'

    body_html = (
        _highlight_html(p.original, p.spans) if revealed
        else _redacted_html(p.redacted)
    )

    mode_banner = (
        '\n'
        'Private reveal. This URL contains the reveal token — '
        'treat it like a password. Anyone with it sees the original text.'
        '\n'
        if revealed else
        '\n'
        'Redacted view. Sensitive spans were stripped before '
        'this page was served. The original is only visible via the author\'s reveal link.'
        '\n'
    )
    view_mode_label = "Original (revealed)" if revealed else "Redacted"

    replacements = {
        "__PID__": _escape(p.id),
        "__MODE__": _escape(view_mode_label),
        "__EXPIRY__": _escape(_format_expiry(p)),
        "__CREATED__": _escape(time.strftime(
            "%Y-%m-%d %H:%M UTC", time.gmtime(p.created_at))),
        "__VIEWS__": str(p.views),
        "__REVEALS__": str(p.reveals),
        "__PCT__": str(stats["pii_percentage"]),
        "__SPANS_N__": str(stats["total_spans"]),
        "__CHARS_N__": f'{stats["total_chars"]:,}',
        "__BADGES__": badges_html,
        "__BANNER__": mode_banner,
        "__BODY__": body_html,
        "__BODY_CLASS__": "pp-body-reveal" if revealed else "pp-body-redacted",
    }
    # Substitute in ONE pass over the template. Chained str.replace calls
    # would re-scan already-inserted paste text, so a paste containing a
    # slot name (e.g. "__BODY_CLASS__") would be silently rewritten.
    slot_re = re.compile("|".join(re.escape(key) for key in replacements))
    return slot_re.sub(lambda m: replacements[m.group(0)], _VIEW_HTML)


def _not_found_html(pid: str) -> str:
    """Return the 404 page with the (escaped) requested id filled in."""
    return _NOT_FOUND_HTML.replace("{{PID}}", _escape(pid))


# ── compose page (paste editor) ───────────────────────────────────
_CATEGORIES_JSON = json.dumps(CATEGORIES_META)

_SHARED_CSS = r""" :root{ --bg: #f7f7f8; --panel: #ffffff; --panel-2: #f1f1f3; --ink: #0a0a0a; --ink-dim: #3f3f46; --ink-faint: #70707a; --line: #e4e4e7; --line-strong: #d4d4d8; --accent: #0f8a5f; --accent-ink: #ffffff; --warn: #b45309; --primary-bg: #18181b; --primary-fg: #ffffff; --radius-lg: 12px; --radius-md: 8px; --radius-sm: 5px; --shadow-xs: 0 1px 1.5px rgba(10,10,10,.04); --shadow-sm: 0 1px 3px rgba(10,10,10,.06), 0 1px 2px rgba(10,10,10,.04); --shadow-md: 0 4px 14px rgba(10,10,10,.07), 0 1px 3px rgba(10,10,10,.04); --font-sans: 'Inter', system-ui, -apple-system, 'Segoe UI', sans-serif; --font-mono: 'JetBrains Mono', ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; --font-serif: 'Instrument Serif', 'Source Serif 4', Georgia, serif; } @media (prefers-color-scheme: dark){ :root{ --bg: #0e0e11; --panel: #18181c; --panel-2: #1f1f24; --ink: #e8e8ea; --ink-dim: #a8a8ae; --ink-faint: #70707a; --line: rgba(255,255,255,0.08); --line-strong: rgba(255,255,255,0.18); --accent: #2bb77e; --accent-ink: #0e0e11; --warn: #eab308; --primary-bg: #f0f0f2; --primary-fg:
#0e0e11; --shadow-xs: none; --shadow-sm: none; --shadow-md: none; } } *,*::before,*::after{box-sizing:border-box;margin:0;padding:0} html,body{height:100%} body{ font-family:var(--font-sans); background:var(--bg); color:var(--ink); font-size:14px;line-height:1.55; -webkit-font-smoothing:antialiased; font-feature-settings:"cv11","ss01"; } a{color:inherit;text-decoration:underline;text-decoration-color:var(--line-strong);text-underline-offset:3px} a:hover{text-decoration-color:var(--ink)} button{font:inherit;color:inherit;background:transparent;border:0;cursor:pointer} .pp-shell{max-width:1060px;margin:0 auto;padding:36px 20px 56px} .pp-brand{display:flex;align-items:center;gap:10px;margin-bottom:22px} .pp-brand-mark{ width:26px;height:26px;border-radius:7px; background:var(--ink);color:var(--bg); display:grid;place-items:center; font-family:var(--font-mono);font-size:13px;font-weight:600;letter-spacing:-0.02em; } .pp-brand-name{font-size:13.5px;font-weight:500} .pp-brand-name .sub{color:var(--ink-faint);font-weight:400;margin-left:6px} .pp-caps{font-size:10.5px;font-weight:600;letter-spacing:0.09em;text-transform:uppercase;color:var(--ink-dim)} .pp-hero{margin-bottom:22px} .pp-hero h1{font-family:var(--font-serif);font-size:38px;line-height:1.08;letter-spacing:-0.015em;font-weight:500;margin-bottom:8px} .pp-hero p{color:var(--ink-dim);max-width:58ch;font-size:14px} .pp-banner{padding:10px 14px;border-radius:var(--radius-md);font-size:13px;line-height:1.5;border:0.5px solid var(--line-strong);margin-bottom:16px} .pp-banner strong{font-weight:600} .pp-banner-safe{background:color-mix(in srgb, var(--accent) 8%, transparent);border-color:color-mix(in srgb, var(--accent) 26%, var(--line-strong))} .pp-banner-reveal{background:color-mix(in srgb, var(--warn) 10%, transparent);border-color:color-mix(in srgb, var(--warn) 30%, var(--line-strong))} """

_COMPOSE_HTML = r""" DLP Paste-Proxy — Pastebin with a conscience
P
DLP Paste-Proxypastebin with a conscience

Paste sensitive text.
Share only the redacted view.

OpenAI Privacy Filter scans your paste for names, addresses, emails, phones, URLs, dates, account numbers, and secrets before minting a shareable link. Viewers see placeholders; only your private reveal link shows the original.

Compose

New paste

0 / """ + f"{MAX_PASTE_CHARS:,}" + r""" chars
line 1, col 1 · no data leaves this server except as redacted placeholders

A background sweeper deletes expired pastes on the server. Expired links 404.

Running OPF on your paste…
Paste minted

Your paste is ready.

What recipients will see
Summary
"""

# ── view page ─────────────────────────────────────────────────────
_VIEW_HTML = r""" Paste __PID__ — DLP Paste-Proxy
Paste __PID__ __MODE__ __CREATED__ · __EXPIRY__
__PCT__%PII density
__SPANS_N__spans
__CHARS_N__chars
__BANNER__
__BADGES__
__BODY__
Create your own paste →
Recipients see placeholders. The author's reveal link shows the original inline.
Views: __VIEWS__ · Reveals: __REVEALS__
"""

_NOT_FOUND_HTML = r""" Paste not found — DLP Paste-Proxy

Paste not found

{{PID}} either never existed, expired by its TTL, or was evicted by a server restart. Pastes live in process memory for the demo.

Create a new paste →
"""


# ── launch ────────────────────────────────────────────────────────
if __name__ == "__main__":
    server.launch(server_name="0.0.0.0", server_port=7860)