Spaces:
Running on Zero
Running on Zero
Upload 3 files
Browse files- app.py +1035 -0
- opf.py +557 -0
- requirements.txt +9 -0
app.py
ADDED
|
@@ -0,0 +1,1035 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DLP Paste-Proxy — "Pastebin with a conscience"
|
| 3 |
+
================================================
|
| 4 |
+
|
| 5 |
+
A sleek paste-to-share service. The author pastes PII-rich text and gets
|
| 6 |
+
a shareable URL. Recipients at that URL see the OPF-redacted version by
|
| 7 |
+
default; a separate "reveal" link guarded by an unguessable token shows
|
| 8 |
+
the original.
|
| 9 |
+
|
| 10 |
+
Why gr.Server? We need three HTTP surfaces that don't map cleanly onto
|
| 11 |
+
gr.Blocks event wiring:
|
| 12 |
+
* POST /api/paste - accept paste, run OPF, mint IDs
|
| 13 |
+
* GET /view/{id} - public redacted view page
|
| 14 |
+
* GET /view/{id}?token=... - author's reveal page
|
| 15 |
+
plus a programmable API endpoint (@server.api) for gradio-client SDK
|
| 16 |
+
users and a background sweeper for auto-expiry.
|
| 17 |
+
|
| 18 |
+
Storage is an in-process dict. That is fine for a public demo — the
|
| 19 |
+
point is to illustrate the request-composition model; it is NOT a
|
| 20 |
+
durable pastebin. Restarting the Space clears all pastes.
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
from __future__ import annotations
|
| 24 |
+
|
| 25 |
+
import html
|
| 26 |
+
import json
|
| 27 |
+
import os
|
| 28 |
+
import secrets
|
| 29 |
+
import threading
|
| 30 |
+
import time
|
| 31 |
+
from dataclasses import dataclass
|
| 32 |
+
from typing import Optional
|
| 33 |
+
|
| 34 |
+
import gradio as gr
|
| 35 |
+
from fastapi import Request
|
| 36 |
+
from fastapi.responses import HTMLResponse, JSONResponse
|
| 37 |
+
|
| 38 |
+
# spaces is only available on Hugging Face Spaces; degrade gracefully
|
| 39 |
+
# when running locally so `python app.py` still works off-GPU.
|
| 40 |
+
try:
|
| 41 |
+
import spaces
|
| 42 |
+
_HAS_SPACES = True
|
| 43 |
+
except ImportError:
|
| 44 |
+
_HAS_SPACES = False
|
| 45 |
+
|
| 46 |
+
from opf import predict_text
|
| 47 |
+
|
| 48 |
+
# ── configuration ─────────────────────────────────────────────────
|
| 49 |
+
|
| 50 |
+
# Hard cap on accepted paste size; oversized requests are rejected with 413.
MAX_PASTE_CHARS = int(os.getenv("MAX_PASTE_CHARS", "50000"))
# How often the background sweeper wakes to purge expired pastes (seconds).
SWEEP_INTERVAL_SEC = int(os.getenv("SWEEP_INTERVAL_SEC", "30"))

# User-selectable time-to-live options, mapped to seconds (None = never expire).
TTL_CHOICES: dict[str, Optional[int]] = {
    "never": None,
    "1h": 60 * 60,
    "24h": 60 * 60 * 24,
    "7d": 60 * 60 * 24 * 7,
}

# Per-category display metadata (badge color + human-readable label) for the
# span labels the OPF model emits. Unknown labels fall back to neutral styling
# in the renderers below.
CATEGORIES_META = {
    "private_person": {"color": "#E24B4A", "label": "Person"},
    "private_date": {"color": "#1E7DD1", "label": "Date"},
    "private_address": {"color": "#1D9E75", "label": "Address"},
    "private_email": {"color": "#0EA5A1", "label": "Email"},
    "account_number": {"color": "#BA7517", "label": "Account"},
    "private_url": {"color": "#D85A30", "label": "URL"},
    "secret": {"color": "#52525b", "label": "Secret"},
    "private_phone": {"color": "#639922", "label": "Phone"},
}
| 70 |
+
|
| 71 |
+
# ── paste store ───────────────────────────────────────────────────
|
| 72 |
+
|
| 73 |
+
@dataclass
class Paste:
    """One stored paste plus the metadata needed to share and reveal it."""
    id: str                      # short public identifier used in /view/{id}
    reveal_token: str            # unguessable secret granting access to `original`
    original: str                # raw text exactly as analyzed
    redacted: str                # original with spans replaced by <CATEGORY> tags
    spans: list[dict]            # OPF detections: dicts with "start", "end", "label"
    stats: dict                  # output of compute_stats() for this paste
    created_at: float            # epoch seconds at creation time
    expires_at: Optional[float]  # epoch seconds, or None for "never"
    views: int = 0               # hits on the public (redacted) view
    reveals: int = 0             # hits with a valid reveal token
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# In-process paste store: a plain dict guarded by a re-entrant lock. The module
# docstring is explicit that durability is a non-goal — restarting clears it.
PASTES: dict[str, Paste] = {}
LOCK = threading.RLock()
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _store_put(paste: Paste) -> None:
    """Insert (or overwrite) *paste* in the store, keyed by its id."""
    with LOCK:
        PASTES[paste.id] = paste
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _store_get(pid: str) -> Optional[Paste]:
    """Look up a live paste by id.

    Returns None for unknown ids. Expiry is also enforced lazily here:
    an entry found past its deadline is removed on the spot and treated
    as missing, so readers never see a paste the sweeper hasn't reaped yet.
    """
    with LOCK:
        paste = PASTES.get(pid)
        if paste is None:
            return None
        deadline = paste.expires_at
        if deadline is not None and deadline <= time.time():
            # Reap eagerly rather than waiting for the sweeper thread.
            PASTES.pop(pid, None)
            return None
        return paste
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _sweep_loop() -> None:
    """Background reaper: periodically drop pastes whose TTL has passed."""
    while True:
        time.sleep(SWEEP_INTERVAL_SEC)
        cutoff = time.time()
        with LOCK:
            dead = [key for key, entry in PASTES.items()
                    if entry.expires_at is not None and entry.expires_at <= cutoff]
            for key in dead:
                PASTES.pop(key, None)
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# Daemon thread so the sweeper never blocks interpreter shutdown.
threading.Thread(target=_sweep_loop, daemon=True, name="paste-sweeper").start()
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# ── redaction ─────────────────────────────────────────────────────
|
| 122 |
+
|
| 123 |
+
def redact(text: str, spans: list[dict]) -> str:
    """Return *text* with each detected span replaced by an <UPPERCASE-LABEL>
    placeholder.

    Substitution runs from the rightmost start backwards so offsets of
    earlier spans stay valid while later text is rewritten. The model's
    spans should already be non-overlapping, but any span that would
    collide with one just applied is skipped defensively.
    """
    result = text
    applied_start: Optional[int] = None
    for span in sorted(spans, key=lambda item: item["start"], reverse=True):
        begin, finish = span["start"], span["end"]
        if applied_start is not None and finish > applied_start:
            # Collides with the span we just rewrote (which starts later
            # in the text) — drop this one.
            continue
        result = result[:begin] + f"<{span['label'].upper()}>" + result[finish:]
        applied_start = begin
    return result
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def compute_stats(text: str, spans: list[dict]) -> dict:
    """Summarize how much of *text* the detected spans cover.

    Returns overall character/span totals, the covered percentage, and a
    per-category breakdown of span count and character count.
    """
    length = len(text)
    covered = sum(span["end"] - span["start"] for span in spans)
    per_category: dict[str, dict[str, int]] = {}
    for span in spans:
        bucket = per_category.setdefault(span["label"], {"count": 0, "chars": 0})
        bucket["count"] += 1
        bucket["chars"] += span["end"] - span["start"]
    percentage = round(covered / length * 100, 1) if length else 0.0
    return {
        "total_chars": length,
        "pii_chars": covered,
        "pii_percentage": percentage,
        "total_spans": len(spans),
        "categories": per_category,
    }
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ── OPF call (GPU-gated on HF Spaces) ─────────────────────────────
|
| 160 |
+
|
| 161 |
+
def analyze(text: str):
    """Run the OPF model over *text*; returns (source_text, spans)."""
    return predict_text(text)


if _HAS_SPACES:
    # On Hugging Face Spaces, route inference through the ZeroGPU scheduler.
    # Applying the decorator by hand keeps a single definition for both paths.
    analyze = spaces.GPU(analyze)
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ── gr.Server wiring ──────────────────────────────────────────────
|
| 171 |
+
|
| 172 |
+
# Single server instance; all HTTP routes and the SDK API endpoint hang off it.
server = gr.Server()
| 173 |
+
|
| 174 |
+
|
| 175 |
+
@server.get("/", response_class=HTMLResponse)
async def home():
    """Serve the static compose (paste editor) page."""
    return HTMLResponse(_COMPOSE_HTML)
| 178 |
+
|
| 179 |
+
|
| 180 |
+
@server.post("/api/paste")
async def create_paste(req: Request):
    """Accept a paste, run OPF redaction, and mint share links.

    Expects a JSON object body: {"text": str, "ttl": "never"|"1h"|"24h"|"7d"}.
    Returns 400 for malformed input, 413 for oversized pastes, 500 when
    inference fails; otherwise a JSON payload with the paste id, reveal
    token, view/reveal paths, expiry, and redaction stats.
    """
    try:
        body = await req.json()
    except Exception:
        return JSONResponse({"error": "Expected JSON body"}, status_code=400)
    # Guard against valid JSON that is not an object (e.g. a list or a bare
    # string): previously `body.get` raised AttributeError and surfaced as
    # an unhandled 500 instead of a clean 400.
    if not isinstance(body, dict):
        return JSONResponse({"error": "Expected JSON body"}, status_code=400)

    text = (body.get("text") or "").strip()
    ttl_key = body.get("ttl", "never")
    if not text:
        return JSONResponse({"error": "Paste is empty"}, status_code=400)
    if len(text) > MAX_PASTE_CHARS:
        return JSONResponse(
            {"error": f"Paste exceeds {MAX_PASTE_CHARS:,} characters"},
            status_code=413,
        )
    if ttl_key not in TTL_CHOICES:
        return JSONResponse({"error": f"Unknown ttl {ttl_key!r}"}, status_code=400)

    try:
        source_text, spans = analyze(text)
    except Exception as exc:  # model failure is the only realistic path here
        return JSONResponse({"error": f"OPF inference failed: {exc}"},
                            status_code=500)

    redacted = redact(source_text, spans)
    stats = compute_stats(source_text, spans)

    # Short public id; long unguessable reveal token (treated like a password).
    pid = secrets.token_urlsafe(6)
    reveal_token = secrets.token_urlsafe(22)
    ttl_sec = TTL_CHOICES[ttl_key]
    now = time.time()
    expires_at = (now + ttl_sec) if ttl_sec is not None else None

    _store_put(Paste(
        id=pid, reveal_token=reveal_token,
        original=source_text, redacted=redacted,
        spans=spans, stats=stats,
        created_at=now, expires_at=expires_at,
    ))

    return JSONResponse({
        "id": pid,
        "reveal_token": reveal_token,
        "view_path": f"/view/{pid}",
        "reveal_path": f"/view/{pid}?token={reveal_token}",
        "expires_at": expires_at,
        "stats": stats,
        "categories_meta": CATEGORIES_META,
    })
| 230 |
+
|
| 231 |
+
|
| 232 |
+
@server.get("/view/{pid}", response_class=HTMLResponse)
async def view_paste(pid: str, token: Optional[str] = None):
    """Serve a paste page: redacted by default, original with a valid token."""
    paste = _store_get(pid)
    if paste is None:
        return HTMLResponse(_not_found_html(pid), status_code=404)

    # compare_digest keeps the token comparison constant-time.
    revealed = bool(token) and secrets.compare_digest(token, paste.reveal_token)

    counter = "reveals" if revealed else "views"
    with LOCK:
        setattr(paste, counter, getattr(paste, counter) + 1)

    return HTMLResponse(_render_view(paste, revealed))
| 247 |
+
|
| 248 |
+
|
| 249 |
+
@server.get("/api/paste/{pid}")
async def api_get_paste(pid: str, token: Optional[str] = None):
    """JSON twin of the view page.

    Always returns metadata and the redacted text; the original and the
    raw spans are included only when a valid reveal token is presented.
    """
    paste = _store_get(pid)
    if paste is None:
        return JSONResponse({"error": "not found or expired"}, status_code=404)

    revealed = bool(token) and secrets.compare_digest(token, paste.reveal_token)

    payload = {
        "id": paste.id,
        "created_at": paste.created_at,
        "expires_at": paste.expires_at,
        "stats": paste.stats,
        "views": paste.views,
        "reveals": paste.reveals,
        "redacted": paste.redacted,
    }
    if revealed:
        payload["original"] = paste.original
        payload["spans"] = paste.spans
    return JSONResponse(payload)
| 268 |
+
|
| 269 |
+
|
| 270 |
+
@server.api(name="analyze_paste")
def analyze_paste_api(text: str, ttl: str = "never") -> str:
    """Programmatic endpoint for gradio-client SDK.

    Creates a paste and returns the paste id, reveal token, and stats as
    a JSON string. Callers must combine the paths with the Space's base
    URL to form shareable links. Errors are reported the same way the
    existing ttl check does: a JSON string with an "error" key.
    """
    if ttl not in TTL_CHOICES:
        return json.dumps({"error": f"Unknown ttl {ttl!r}"})
    # Mirror the HTTP route's input validation. Previously this path
    # accepted empty text and bypassed MAX_PASTE_CHARS entirely, letting
    # SDK callers store arbitrarily large pastes in memory.
    text = (text or "").strip()
    if not text:
        return json.dumps({"error": "Paste is empty"})
    if len(text) > MAX_PASTE_CHARS:
        return json.dumps({"error": f"Paste exceeds {MAX_PASTE_CHARS:,} characters"})
    source_text, spans = analyze(text)
    redacted = redact(source_text, spans)
    stats = compute_stats(source_text, spans)
    pid = secrets.token_urlsafe(6)
    reveal_token = secrets.token_urlsafe(22)
    ttl_sec = TTL_CHOICES[ttl]
    now = time.time()
    expires_at = (now + ttl_sec) if ttl_sec is not None else None
    _store_put(Paste(
        id=pid, reveal_token=reveal_token,
        original=source_text, redacted=redacted,
        spans=spans, stats=stats,
        created_at=now, expires_at=expires_at,
    ))
    return json.dumps({
        "id": pid,
        "reveal_token": reveal_token,
        "view_path": f"/view/{pid}",
        "reveal_path": f"/view/{pid}?token={reveal_token}",
        "expires_at": expires_at,
        "stats": stats,
    })
| 301 |
+
|
| 302 |
+
|
| 303 |
+
# ── HTML rendering ───────────────────────────────────��────────────
|
| 304 |
+
|
| 305 |
+
def _escape(text: str) -> str:
    """HTML-escape &, < and >; quotes pass through (quote=False)."""
    escaped = html.escape(text, quote=False)
    return escaped
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def _highlight_html(text: str, spans: list[dict]) -> str:
    """Return HTML for text with each span wrapped in a colored mark,
    revealing the original content (used on the reveal page).

    Walks spans in ascending start order, escaping the plain text between
    them. Spans that overlap an already-emitted one, or are empty, are
    skipped. Labels missing from CATEGORIES_META fall back to a neutral
    color and the raw label string.
    """
    pieces: list[str] = []
    cursor = 0  # index into `text` of the first not-yet-emitted character
    for sp in sorted(spans, key=lambda s: s["start"]):
        s, e = sp["start"], sp["end"]
        if s < cursor or e <= s:
            continue  # overlapping or empty span — drop it
        if s > cursor:
            pieces.append(_escape(text[cursor:s]))  # gap before this span
        meta = CATEGORIES_META.get(sp["label"])
        color = meta["color"] if meta else "#333"
        label = meta["label"] if meta else sp["label"]
        # Category color is passed to CSS via the --cat custom property.
        pieces.append(
            f'<mark class="pp-hi" data-cat="{_escape(sp["label"])}" '
            f'style="--cat:{color}" title="{_escape(label)}">'
            f'{_escape(text[s:e])}'
            f'<span class="pp-hi-tag">{_escape(label)}</span>'
            f'</mark>'
        )
        cursor = e
    if cursor < len(text):
        pieces.append(_escape(text[cursor:]))  # tail after the last span
    return "".join(pieces)
| 334 |
+
|
| 335 |
+
|
| 336 |
+
def _redacted_html(redacted: str) -> str:
    """Render the redacted version with <CATEGORY> placeholders as
    colored pills so readers can see what kind of data was stripped.

    Scans for <...> runs by hand: a tag whose lowercased name is a known
    category becomes a pill; any other <...> (e.g. literal angle brackets
    that were already in the paste) is escaped and shown verbatim.
    NOTE(review): a paste containing a literal "<PRIVATE_PERSON>" is
    indistinguishable from a real placeholder here and renders as a pill.
    """
    out: list[str] = []
    i = 0
    while i < len(redacted):
        lt = redacted.find("<", i)
        if lt == -1:
            # No more tags; emit the remaining plain text.
            out.append(_escape(redacted[i:]))
            break
        out.append(_escape(redacted[i:lt]))
        gt = redacted.find(">", lt + 1)
        if gt == -1:
            # Unclosed "<" — treat the rest as plain text.
            out.append(_escape(redacted[lt:]))
            break
        tag = redacted[lt + 1:gt]
        cat_key = tag.lower()
        meta = CATEGORIES_META.get(cat_key)
        if meta is None:
            # Not a known category placeholder; escape the literal <...>.
            out.append(_escape(redacted[lt:gt + 1]))
        else:
            out.append(
                f'<span class="pp-red" data-cat="{_escape(cat_key)}" '
                f'style="--cat:{meta["color"]}">'
                f'<span class="pp-red-dot"></span>{_escape(meta["label"])}'
                f'</span>'
            )
        i = gt + 1
    return "".join(out)
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def _format_expiry(paste: Paste) -> str:
    """Human-readable expiry description for the view page."""
    if paste.expires_at is None:
        return "does not expire"
    left = paste.expires_at - time.time()
    if left <= 0:
        return "expired"
    # Pick the coarsest unit that still fits.
    for limit, divisor, unit in ((3600, 60, "min"), (86400, 3600, "h")):
        if left < limit:
            return f"expires in {int(left // divisor)} {unit}"
    return f"expires in {int(left // 86400)} d"
| 378 |
+
|
| 379 |
+
|
| 380 |
+
def _render_view(p: Paste, revealed: bool) -> str:
    """Render the view page for paste *p*.

    *revealed* selects between the highlighted original (author's reveal
    link) and the pill-rendered redacted text (public link). The static
    template is filled by sequential substitution of __PLACEHOLDER__
    tokens, so ordering matters — see the note on __BODY__ below.
    """
    stats = p.stats
    badges_html = "".join(
        f'<span class="pp-badge" style="--cat:{CATEGORIES_META.get(cat, {"color": "#333"})["color"]}">'
        f'<span class="pp-badge-dot"></span>'
        f'{_escape(CATEGORIES_META.get(cat, {"label": cat})["label"])}'
        f'<span class="pp-badge-n">{info["count"]}</span>'
        f'</span>'
        for cat, info in sorted(stats["categories"].items(),
                                key=lambda kv: -kv[1]["count"])
    ) or '<span class="pp-muted">No PII detected in this paste.</span>'

    body_html = (
        _highlight_html(p.original, p.spans) if revealed
        else _redacted_html(p.redacted)
    )

    mode_banner = (
        '<div class="pp-banner pp-banner-reveal">'
        '<strong>Private reveal.</strong> This URL contains the reveal token — '
        'treat it like a password. Anyone with it sees the original text.'
        '</div>'
        if revealed else
        '<div class="pp-banner pp-banner-safe">'
        '<strong>Redacted view.</strong> Sensitive spans were stripped before '
        'this page was served. The original is only visible via the author\'s reveal link.'
        '</div>'
    )

    view_mode_label = "Original (revealed)" if revealed else "Redacted"

    # __BODY__ carries (escaped) user-controlled text and therefore MUST be
    # substituted LAST: with the old ordering, a paste containing the literal
    # token "__BODY_CLASS__" was re-expanded by the later replace() pass.
    # Dicts preserve insertion order, so placing it last fixes that.
    replacements = {
        "__PID__": _escape(p.id),
        "__MODE__": _escape(view_mode_label),
        "__EXPIRY__": _escape(_format_expiry(p)),
        "__CREATED__": _escape(time.strftime(
            "%Y-%m-%d %H:%M UTC", time.gmtime(p.created_at))),
        "__VIEWS__": str(p.views),
        "__REVEALS__": str(p.reveals),
        "__PCT__": str(stats["pii_percentage"]),
        "__SPANS_N__": str(stats["total_spans"]),
        "__CHARS_N__": f'{stats["total_chars"]:,}',
        "__BADGES__": badges_html,
        "__BANNER__": mode_banner,
        "__BODY_CLASS__": "pp-body-reveal" if revealed else "pp-body-redacted",
        "__BODY__": body_html,
    }
    out = _VIEW_HTML
    for k, v in replacements.items():
        out = out.replace(k, v)
    return out
| 431 |
+
|
| 432 |
+
|
| 433 |
+
def _not_found_html(pid: str) -> str:
    """404 page body with the requested id interpolated (HTML-escaped)."""
    safe_pid = _escape(pid)
    return _NOT_FOUND_HTML.replace("{{PID}}", safe_pid)
| 435 |
+
|
| 436 |
+
|
| 437 |
+
# ── compose page (paste editor) ───────────────────────────────────
|
| 438 |
+
|
| 439 |
+
# Category metadata serialized once — presumably embedded in the compose
# page's script (its consumer lives in the template below); verify there.
_CATEGORIES_JSON = json.dumps(CATEGORIES_META)
| 440 |
+
|
| 441 |
+
_SHARED_CSS = r"""
|
| 442 |
+
:root{
|
| 443 |
+
--bg: #f7f7f8;
|
| 444 |
+
--panel: #ffffff;
|
| 445 |
+
--panel-2: #f1f1f3;
|
| 446 |
+
--ink: #0a0a0a;
|
| 447 |
+
--ink-dim: #3f3f46;
|
| 448 |
+
--ink-faint: #70707a;
|
| 449 |
+
--line: #e4e4e7;
|
| 450 |
+
--line-strong: #d4d4d8;
|
| 451 |
+
--accent: #0f8a5f;
|
| 452 |
+
--accent-ink: #ffffff;
|
| 453 |
+
--warn: #b45309;
|
| 454 |
+
--primary-bg: #18181b;
|
| 455 |
+
--primary-fg: #ffffff;
|
| 456 |
+
--radius-lg: 12px;
|
| 457 |
+
--radius-md: 8px;
|
| 458 |
+
--radius-sm: 5px;
|
| 459 |
+
--shadow-xs: 0 1px 1.5px rgba(10,10,10,.04);
|
| 460 |
+
--shadow-sm: 0 1px 3px rgba(10,10,10,.06), 0 1px 2px rgba(10,10,10,.04);
|
| 461 |
+
--shadow-md: 0 4px 14px rgba(10,10,10,.07), 0 1px 3px rgba(10,10,10,.04);
|
| 462 |
+
--font-sans: 'Inter', system-ui, -apple-system, 'Segoe UI', sans-serif;
|
| 463 |
+
--font-mono: 'JetBrains Mono', ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
|
| 464 |
+
--font-serif: 'Instrument Serif', 'Source Serif 4', Georgia, serif;
|
| 465 |
+
}
|
| 466 |
+
@media (prefers-color-scheme: dark){
|
| 467 |
+
:root{
|
| 468 |
+
--bg: #0e0e11;
|
| 469 |
+
--panel: #18181c;
|
| 470 |
+
--panel-2: #1f1f24;
|
| 471 |
+
--ink: #e8e8ea;
|
| 472 |
+
--ink-dim: #a8a8ae;
|
| 473 |
+
--ink-faint: #70707a;
|
| 474 |
+
--line: rgba(255,255,255,0.08);
|
| 475 |
+
--line-strong: rgba(255,255,255,0.18);
|
| 476 |
+
--accent: #2bb77e;
|
| 477 |
+
--accent-ink: #0e0e11;
|
| 478 |
+
--warn: #eab308;
|
| 479 |
+
--primary-bg: #f0f0f2;
|
| 480 |
+
--primary-fg: #0e0e11;
|
| 481 |
+
--shadow-xs: none;
|
| 482 |
+
--shadow-sm: none;
|
| 483 |
+
--shadow-md: none;
|
| 484 |
+
}
|
| 485 |
+
}
|
| 486 |
+
*,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
|
| 487 |
+
html,body{height:100%}
|
| 488 |
+
body{
|
| 489 |
+
font-family:var(--font-sans);
|
| 490 |
+
background:var(--bg);
|
| 491 |
+
color:var(--ink);
|
| 492 |
+
font-size:14px;line-height:1.55;
|
| 493 |
+
-webkit-font-smoothing:antialiased;
|
| 494 |
+
font-feature-settings:"cv11","ss01";
|
| 495 |
+
}
|
| 496 |
+
a{color:inherit;text-decoration:underline;text-decoration-color:var(--line-strong);text-underline-offset:3px}
|
| 497 |
+
a:hover{text-decoration-color:var(--ink)}
|
| 498 |
+
button{font:inherit;color:inherit;background:transparent;border:0;cursor:pointer}
|
| 499 |
+
.pp-shell{max-width:1060px;margin:0 auto;padding:36px 20px 56px}
|
| 500 |
+
.pp-brand{display:flex;align-items:center;gap:10px;margin-bottom:22px}
|
| 501 |
+
.pp-brand-mark{
|
| 502 |
+
width:26px;height:26px;border-radius:7px;
|
| 503 |
+
background:var(--ink);color:var(--bg);
|
| 504 |
+
display:grid;place-items:center;
|
| 505 |
+
font-family:var(--font-mono);font-size:13px;font-weight:600;letter-spacing:-0.02em;
|
| 506 |
+
}
|
| 507 |
+
.pp-brand-name{font-size:13.5px;font-weight:500}
|
| 508 |
+
.pp-brand-name .sub{color:var(--ink-faint);font-weight:400;margin-left:6px}
|
| 509 |
+
.pp-caps{font-size:10.5px;font-weight:600;letter-spacing:0.09em;text-transform:uppercase;color:var(--ink-dim)}
|
| 510 |
+
.pp-hero{margin-bottom:22px}
|
| 511 |
+
.pp-hero h1{font-family:var(--font-serif);font-size:38px;line-height:1.08;letter-spacing:-0.015em;font-weight:500;margin-bottom:8px}
|
| 512 |
+
.pp-hero p{color:var(--ink-dim);max-width:58ch;font-size:14px}
|
| 513 |
+
.pp-banner{padding:10px 14px;border-radius:var(--radius-md);font-size:13px;line-height:1.5;border:0.5px solid var(--line-strong);margin-bottom:16px}
|
| 514 |
+
.pp-banner strong{font-weight:600}
|
| 515 |
+
.pp-banner-safe{background:color-mix(in srgb, var(--accent) 8%, transparent);border-color:color-mix(in srgb, var(--accent) 26%, var(--line-strong))}
|
| 516 |
+
.pp-banner-reveal{background:color-mix(in srgb, var(--warn) 10%, transparent);border-color:color-mix(in srgb, var(--warn) 30%, var(--line-strong))}
|
| 517 |
+
"""
|
| 518 |
+
|
| 519 |
+
_COMPOSE_HTML = r"""<!DOCTYPE html>
|
| 520 |
+
<html lang="en">
|
| 521 |
+
<head>
|
| 522 |
+
<meta charset="UTF-8">
|
| 523 |
+
<meta name="viewport" content="width=device-width,initial-scale=1">
|
| 524 |
+
<title>DLP Paste-Proxy — Pastebin with a conscience</title>
|
| 525 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 526 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 527 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&family=Instrument+Serif:ital@0;1&display=swap" rel="stylesheet">
|
| 528 |
+
<style>
|
| 529 |
+
""" + _SHARED_CSS + r"""
|
| 530 |
+
|
| 531 |
+
.pp-card{
|
| 532 |
+
background:var(--panel);
|
| 533 |
+
border:0.5px solid var(--line);
|
| 534 |
+
border-radius:var(--radius-lg);
|
| 535 |
+
box-shadow:var(--shadow-md);
|
| 536 |
+
overflow:hidden;
|
| 537 |
+
}
|
| 538 |
+
.pp-card-head{
|
| 539 |
+
padding:14px 18px;
|
| 540 |
+
border-bottom:0.5px solid var(--line);
|
| 541 |
+
display:flex;align-items:center;gap:10px;flex-wrap:wrap;
|
| 542 |
+
}
|
| 543 |
+
.pp-card-head h2{font-size:13.5px;font-weight:500;letter-spacing:-0.005em}
|
| 544 |
+
.pp-spacer{flex:1}
|
| 545 |
+
.pp-grid{
|
| 546 |
+
display:grid;
|
| 547 |
+
grid-template-columns:minmax(0,1fr) 280px;
|
| 548 |
+
gap:0;
|
| 549 |
+
}
|
| 550 |
+
.pp-pane{padding:18px 20px 22px}
|
| 551 |
+
.pp-pane + .pp-pane{border-left:0.5px solid var(--line);background:var(--panel-2)}
|
| 552 |
+
.pp-textarea{
|
| 553 |
+
width:100%;min-height:320px;
|
| 554 |
+
font-family:var(--font-mono);font-size:13px;line-height:1.55;
|
| 555 |
+
color:var(--ink);background:transparent;
|
| 556 |
+
border:1px solid var(--line);border-radius:var(--radius-md);
|
| 557 |
+
padding:14px 14px;resize:vertical;
|
| 558 |
+
transition:border-color .15s,background .15s;
|
| 559 |
+
}
|
| 560 |
+
.pp-textarea::placeholder{color:var(--ink-faint)}
|
| 561 |
+
.pp-textarea:focus{outline:none;border-color:var(--line-strong);background:color-mix(in srgb, var(--ink) 1.5%, transparent)}
|
| 562 |
+
.pp-sub{color:var(--ink-faint);font-size:11.5px;font-family:var(--font-mono);margin-top:8px;display:flex;align-items:center;gap:10px;flex-wrap:wrap}
|
| 563 |
+
.pp-sub .sep{opacity:.4}
|
| 564 |
+
.pp-label{display:block;font-size:11px;font-weight:600;letter-spacing:0.07em;text-transform:uppercase;color:var(--ink-dim);margin:0 0 8px}
|
| 565 |
+
.pp-ttl{display:flex;gap:4px;background:var(--panel);border:0.5px solid var(--line);padding:3px;border-radius:var(--radius-md)}
|
| 566 |
+
.pp-ttl button{
|
| 567 |
+
flex:1;padding:7px 0;font-size:12px;font-weight:500;color:var(--ink-dim);
|
| 568 |
+
border-radius:5px;transition:background .12s,color .12s;
|
| 569 |
+
}
|
| 570 |
+
.pp-ttl button[aria-pressed="true"]{background:var(--ink);color:var(--bg)}
|
| 571 |
+
.pp-ttl button:hover:not([aria-pressed="true"]){background:color-mix(in srgb, var(--ink) 4%, transparent);color:var(--ink)}
|
| 572 |
+
.pp-hint{font-size:12px;color:var(--ink-faint);margin-top:8px;line-height:1.45}
|
| 573 |
+
.pp-btn{
|
| 574 |
+
font-size:13px;font-weight:500;padding:10px 14px;
|
| 575 |
+
border:0.5px solid var(--line-strong);
|
| 576 |
+
border-radius:var(--radius-md);
|
| 577 |
+
background:var(--panel);color:var(--ink);
|
| 578 |
+
display:inline-flex;align-items:center;justify-content:center;gap:8px;
|
| 579 |
+
transition:background .12s,border-color .12s;
|
| 580 |
+
}
|
| 581 |
+
.pp-btn:hover:not(:disabled){background:color-mix(in srgb, var(--ink) 4%, var(--panel));border-color:var(--ink-dim)}
|
| 582 |
+
.pp-btn:disabled{opacity:.55;cursor:not-allowed}
|
| 583 |
+
.pp-btn-primary{background:var(--primary-bg);color:var(--primary-fg);border-color:var(--primary-bg);width:100%}
|
| 584 |
+
.pp-btn-primary:hover:not(:disabled){background:color-mix(in srgb, var(--primary-bg) 88%, var(--ink));border-color:var(--primary-bg)}
|
| 585 |
+
.pp-btn-arr{font-family:var(--font-mono);font-size:11px;opacity:.7}
|
| 586 |
+
|
| 587 |
+
.pp-success{
|
| 588 |
+
display:none;margin-top:24px;padding:22px 22px 24px;
|
| 589 |
+
background:var(--panel);border:0.5px solid var(--line);border-radius:var(--radius-lg);box-shadow:var(--shadow-md);
|
| 590 |
+
}
|
| 591 |
+
.pp-success.on{display:block}
|
| 592 |
+
.pp-success h3{font-family:var(--font-serif);font-size:22px;line-height:1.15;font-weight:500;margin-bottom:4px;letter-spacing:-0.01em}
|
| 593 |
+
.pp-success .pp-caps{margin-bottom:14px;display:block}
|
| 594 |
+
.pp-link{
|
| 595 |
+
display:flex;align-items:stretch;gap:0;margin:8px 0 14px;
|
| 596 |
+
border:0.5px solid var(--line);border-radius:var(--radius-md);overflow:hidden;background:var(--panel-2);
|
| 597 |
+
}
|
| 598 |
+
.pp-link input{
|
| 599 |
+
flex:1;border:0;background:transparent;padding:10px 12px;
|
| 600 |
+
font-family:var(--font-mono);font-size:12px;color:var(--ink);min-width:0;outline:none;
|
| 601 |
+
}
|
| 602 |
+
.pp-link button{
|
| 603 |
+
border-left:0.5px solid var(--line);background:var(--panel);
|
| 604 |
+
padding:0 14px;font-size:12px;font-weight:500;color:var(--ink-dim);
|
| 605 |
+
transition:background .12s,color .12s;
|
| 606 |
+
}
|
| 607 |
+
.pp-link button:hover{background:color-mix(in srgb, var(--ink) 4%, var(--panel));color:var(--ink)}
|
| 608 |
+
.pp-link-label{display:flex;align-items:baseline;gap:8px;font-size:13px;font-weight:500;margin-top:14px}
|
| 609 |
+
.pp-link-label .hint{font-weight:400;color:var(--ink-faint);font-size:12px}
|
| 610 |
+
.pp-link-label:first-of-type{margin-top:0}
|
| 611 |
+
.pp-link-label .priv{
|
| 612 |
+
font-family:var(--font-mono);font-size:10px;font-weight:600;letter-spacing:.06em;
|
| 613 |
+
padding:2px 7px;border-radius:4px;
|
| 614 |
+
background:color-mix(in srgb, var(--warn) 18%, transparent);
|
| 615 |
+
color:color-mix(in srgb, var(--warn) 70%, var(--ink));
|
| 616 |
+
text-transform:uppercase;
|
| 617 |
+
}
|
| 618 |
+
.pp-preview-row{display:grid;grid-template-columns:1fr 1fr;gap:12px;margin-top:16px}
|
| 619 |
+
.pp-preview{background:var(--panel-2);border:0.5px solid var(--line);border-radius:var(--radius-md);padding:12px 14px 14px;font-family:var(--font-serif);font-size:14.5px;line-height:1.55;min-height:130px;max-height:260px;overflow:auto}
|
| 620 |
+
.pp-preview .pp-caps{display:block;margin-bottom:8px;font-family:var(--font-sans);font-size:10px;color:var(--ink-faint)}
|
| 621 |
+
.pp-err{display:none;margin-top:12px;padding:10px 12px;border-radius:var(--radius-md);background:color-mix(in srgb, #dc2626 9%, transparent);border:0.5px solid color-mix(in srgb, #dc2626 30%, var(--line-strong));color:#991b1b;font-size:13px}
|
| 622 |
+
.pp-err.on{display:block}
|
| 623 |
+
.pp-err code{font-family:var(--font-mono);font-size:12px}
|
| 624 |
+
.pp-loading{display:none;align-items:center;gap:8px;color:var(--ink-dim);font-size:13px;margin-top:12px}
|
| 625 |
+
.pp-loading.on{display:inline-flex}
|
| 626 |
+
.pp-spin{width:12px;height:12px;border:1.5px solid color-mix(in srgb, var(--ink) 25%, transparent);border-top-color:var(--ink);border-radius:50%;animation:pp-spin 0.8s linear infinite}
|
| 627 |
+
@keyframes pp-spin{to{transform:rotate(360deg)}}
|
| 628 |
+
|
| 629 |
+
.pp-footer{
|
| 630 |
+
margin-top:28px;padding-top:22px;border-top:0.5px solid var(--line);
|
| 631 |
+
display:flex;justify-content:space-between;gap:16px;color:var(--ink-faint);font-size:12px;flex-wrap:wrap;
|
| 632 |
+
}
|
| 633 |
+
.pp-footer a{color:var(--ink-dim)}
|
| 634 |
+
|
| 635 |
+
/* Pills & highlights used on view page (scoped so compose page can
|
| 636 |
+
reuse the preview rendering to show what the redacted version
|
| 637 |
+
looks like before the user commits) */
|
| 638 |
+
.pp-red{
|
| 639 |
+
display:inline-flex;align-items:center;gap:4px;
|
| 640 |
+
font-family:var(--font-sans);font-size:12px;font-weight:500;
|
| 641 |
+
padding:1px 7px 1px 6px;margin:0 1px;border-radius:3px;
|
| 642 |
+
background:color-mix(in srgb, var(--cat, #666) 14%, transparent);
|
| 643 |
+
color:color-mix(in srgb, var(--cat, #666) 62%, var(--ink));
|
| 644 |
+
vertical-align:baseline;letter-spacing:-0.002em;
|
| 645 |
+
border:0.5px solid color-mix(in srgb, var(--cat, #666) 28%, transparent);
|
| 646 |
+
}
|
| 647 |
+
.pp-red-dot{width:5px;height:5px;border-radius:50%;background:var(--cat,#666);flex:none}
|
| 648 |
+
|
| 649 |
+
@media (max-width:820px){
|
| 650 |
+
.pp-grid{grid-template-columns:1fr}
|
| 651 |
+
.pp-pane + .pp-pane{border-left:0;border-top:0.5px solid var(--line)}
|
| 652 |
+
.pp-preview-row{grid-template-columns:1fr}
|
| 653 |
+
}
|
| 654 |
+
</style>
|
| 655 |
+
</head>
|
| 656 |
+
<body>
|
| 657 |
+
<div class="pp-shell">
|
| 658 |
+
|
| 659 |
+
<div class="pp-brand">
|
| 660 |
+
<div class="pp-brand-mark">P</div>
|
| 661 |
+
<div class="pp-brand-name">DLP Paste-Proxy<span class="sub">pastebin with a conscience</span></div>
|
| 662 |
+
</div>
|
| 663 |
+
|
| 664 |
+
<div class="pp-hero">
|
| 665 |
+
<h1>Paste sensitive text.<br>Share only the redacted view.</h1>
|
| 666 |
+
<p>OpenAI Privacy Filter scans your paste for names, addresses, emails, phones, URLs, dates, account numbers, and secrets before minting a shareable link. Viewers see placeholders; only your private reveal link shows the original.</p>
|
| 667 |
+
</div>
|
| 668 |
+
|
| 669 |
+
<div class="pp-card">
|
| 670 |
+
<div class="pp-card-head">
|
| 671 |
+
<span class="pp-caps">Compose</span>
|
| 672 |
+
<h2>New paste</h2>
|
| 673 |
+
<span class="pp-spacer"></span>
|
| 674 |
+
<span class="pp-sub" id="pp-char-count">0 / """ + f"{MAX_PASTE_CHARS:,}" + r""" chars</span>
|
| 675 |
+
</div>
|
| 676 |
+
<div class="pp-grid">
|
| 677 |
+
<div class="pp-pane">
|
| 678 |
+
<label class="pp-label" for="pp-text">Paste body</label>
|
| 679 |
+
<textarea id="pp-text" class="pp-textarea" spellcheck="false"
|
| 680 |
+
placeholder="Paste anything — a DM thread, a log line, an email, a support ticket. The OPF model labels each character span; placeholders replace the private parts before the URL is minted."></textarea>
|
| 681 |
+
<div class="pp-sub">
|
| 682 |
+
<span id="pp-cursor">line 1, col 1</span>
|
| 683 |
+
<span class="sep">·</span>
|
| 684 |
+
<span>no data leaves this server except as redacted placeholders</span>
|
| 685 |
+
</div>
|
| 686 |
+
</div>
|
| 687 |
+
<div class="pp-pane">
|
| 688 |
+
<label class="pp-label">Auto-expiry</label>
|
| 689 |
+
<div class="pp-ttl" id="pp-ttl" role="tablist" aria-label="Expiration">
|
| 690 |
+
<button type="button" data-ttl="never" aria-pressed="true">Never</button>
|
| 691 |
+
<button type="button" data-ttl="1h" aria-pressed="false">1h</button>
|
| 692 |
+
<button type="button" data-ttl="24h" aria-pressed="false">24h</button>
|
| 693 |
+
<button type="button" data-ttl="7d" aria-pressed="false">7d</button>
|
| 694 |
+
</div>
|
| 695 |
+
<p class="pp-hint">A background sweeper deletes expired pastes on the server. Expired links 404.</p>
|
| 696 |
+
|
| 697 |
+
<label class="pp-label" style="margin-top:20px">Create</label>
|
| 698 |
+
<button type="button" id="pp-create" class="pp-btn pp-btn-primary">
|
| 699 |
+
<span>Scan & mint link</span>
|
| 700 |
+
<span class="pp-btn-arr">↵</span>
|
| 701 |
+
</button>
|
| 702 |
+
<div class="pp-loading" id="pp-loading">
|
| 703 |
+
<span class="pp-spin"></span><span>Running OPF on your paste…</span>
|
| 704 |
+
</div>
|
| 705 |
+
<div class="pp-err" id="pp-err"></div>
|
| 706 |
+
</div>
|
| 707 |
+
</div>
|
| 708 |
+
</div>
|
| 709 |
+
|
| 710 |
+
<section class="pp-success" id="pp-success">
|
| 711 |
+
<span class="pp-caps">Paste minted</span>
|
| 712 |
+
<h3>Your paste is ready.</h3>
|
| 713 |
+
|
| 714 |
+
<div class="pp-link-label">
|
| 715 |
+
Shareable view link
|
| 716 |
+
<span class="hint">redacted — give to recipients</span>
|
| 717 |
+
</div>
|
| 718 |
+
<div class="pp-link">
|
| 719 |
+
<input id="pp-view-url" readonly value="">
|
| 720 |
+
<button type="button" data-copy="pp-view-url">Copy</button>
|
| 721 |
+
</div>
|
| 722 |
+
|
| 723 |
+
<div class="pp-link-label">
|
| 724 |
+
Private reveal link
|
| 725 |
+
<span class="priv">author only</span>
|
| 726 |
+
<span class="hint">shows original — keep it to yourself</span>
|
| 727 |
+
</div>
|
| 728 |
+
<div class="pp-link">
|
| 729 |
+
<input id="pp-reveal-url" readonly value="">
|
| 730 |
+
<button type="button" data-copy="pp-reveal-url">Copy</button>
|
| 731 |
+
</div>
|
| 732 |
+
|
| 733 |
+
<div class="pp-preview-row">
|
| 734 |
+
<div class="pp-preview">
|
| 735 |
+
<span class="pp-caps">What recipients will see</span>
|
| 736 |
+
<div id="pp-preview-redacted"></div>
|
| 737 |
+
</div>
|
| 738 |
+
<div class="pp-preview" style="font-family:var(--font-sans);font-size:12.5px;line-height:1.5">
|
| 739 |
+
<span class="pp-caps">Summary</span>
|
| 740 |
+
<div id="pp-preview-summary"></div>
|
| 741 |
+
</div>
|
| 742 |
+
</div>
|
| 743 |
+
</section>
|
| 744 |
+
|
| 745 |
+
<footer class="pp-footer">
|
| 746 |
+
<div>Powered by <a href="https://huggingface.co/charles-first-org/second-model" target="_blank" rel="noopener">OpenAI Privacy Filter</a> · 1.5B params, 50M active, 128k context</div>
|
| 747 |
+
<div><a href="#" id="pp-about">How this works →</a></div>
|
| 748 |
+
</footer>
|
| 749 |
+
</div>
|
| 750 |
+
|
| 751 |
+
<script>
|
| 752 |
+
const CATS = """ + _CATEGORIES_JSON + r""";
|
| 753 |
+
const MAX = """ + str(MAX_PASTE_CHARS) + r""";
|
| 754 |
+
|
| 755 |
+
const $text = document.getElementById('pp-text');
|
| 756 |
+
const $cc = document.getElementById('pp-char-count');
|
| 757 |
+
const $cur = document.getElementById('pp-cursor');
|
| 758 |
+
const $ttl = document.getElementById('pp-ttl');
|
| 759 |
+
const $btn = document.getElementById('pp-create');
|
| 760 |
+
const $load = document.getElementById('pp-loading');
|
| 761 |
+
const $err = document.getElementById('pp-err');
|
| 762 |
+
const $ok = document.getElementById('pp-success');
|
| 763 |
+
|
| 764 |
+
function updateCount(){
|
| 765 |
+
const n = $text.value.length;
|
| 766 |
+
$cc.textContent = n.toLocaleString() + ' / ' + MAX.toLocaleString() + ' chars';
|
| 767 |
+
$cc.style.color = n > MAX ? '#b45309' : '';
|
| 768 |
+
}
|
| 769 |
+
function updateCursor(){
|
| 770 |
+
const pos = $text.selectionStart;
|
| 771 |
+
const lines = $text.value.slice(0, pos).split('\n');
|
| 772 |
+
$cur.textContent = 'line ' + lines.length + ', col ' + (lines[lines.length-1].length + 1);
|
| 773 |
+
}
|
| 774 |
+
$text.addEventListener('input', updateCount);
|
| 775 |
+
['keyup','click','focus','mouseup'].forEach(e => $text.addEventListener(e, updateCursor));
|
| 776 |
+
|
| 777 |
+
let ttl = 'never';
|
| 778 |
+
$ttl.addEventListener('click', (e) => {
|
| 779 |
+
const b = e.target.closest('button'); if (!b) return;
|
| 780 |
+
[...$ttl.querySelectorAll('button')].forEach(x => x.setAttribute('aria-pressed', x === b ? 'true' : 'false'));
|
| 781 |
+
ttl = b.dataset.ttl;
|
| 782 |
+
});
|
| 783 |
+
|
| 784 |
+
function renderRedacted(redacted){
|
| 785 |
+
let html = '';
|
| 786 |
+
let i = 0;
|
| 787 |
+
while (i < redacted.length){
|
| 788 |
+
const lt = redacted.indexOf('<', i);
|
| 789 |
+
if (lt === -1){ html += escapeHtml(redacted.slice(i)); break; }
|
| 790 |
+
html += escapeHtml(redacted.slice(i, lt));
|
| 791 |
+
const gt = redacted.indexOf('>', lt + 1);
|
| 792 |
+
if (gt === -1){ html += escapeHtml(redacted.slice(lt)); break; }
|
| 793 |
+
const tag = redacted.slice(lt+1, gt);
|
| 794 |
+
const key = tag.toLowerCase();
|
| 795 |
+
const meta = CATS[key];
|
| 796 |
+
if (!meta){ html += escapeHtml(redacted.slice(lt, gt+1)); }
|
| 797 |
+
else {
|
| 798 |
+
html += '<span class="pp-red" data-cat="'+escapeHtml(key)+'" style="--cat:'+meta.color+'">'+
|
| 799 |
+
'<span class="pp-red-dot"></span>'+escapeHtml(meta.label)+'</span>';
|
| 800 |
+
}
|
| 801 |
+
i = gt + 1;
|
| 802 |
+
}
|
| 803 |
+
return html;
|
| 804 |
+
}
|
| 805 |
+
function escapeHtml(s){ return s.replace(/[&<>"']/g, c => ({'&':'&','<':'<','>':'>','"':'"',"'":'''}[c])); }
|
| 806 |
+
|
| 807 |
+
async function createPaste(){
|
| 808 |
+
const text = $text.value.trim();
|
| 809 |
+
$err.classList.remove('on'); $err.textContent = '';
|
| 810 |
+
if (!text){ $err.classList.add('on'); $err.textContent = 'Paste is empty.'; return; }
|
| 811 |
+
if (text.length > MAX){ $err.classList.add('on'); $err.textContent = 'Paste exceeds ' + MAX.toLocaleString() + ' characters.'; return; }
|
| 812 |
+
|
| 813 |
+
$btn.disabled = true; $load.classList.add('on'); $ok.classList.remove('on');
|
| 814 |
+
try{
|
| 815 |
+
const r = await fetch('/api/paste', {
|
| 816 |
+
method: 'POST',
|
| 817 |
+
headers: {'Content-Type': 'application/json'},
|
| 818 |
+
body: JSON.stringify({text, ttl}),
|
| 819 |
+
});
|
| 820 |
+
const data = await r.json();
|
| 821 |
+
if (!r.ok) throw new Error(data.error || ('HTTP ' + r.status));
|
| 822 |
+
|
| 823 |
+
const origin = window.location.origin;
|
| 824 |
+
document.getElementById('pp-view-url').value = origin + data.view_path;
|
| 825 |
+
document.getElementById('pp-reveal-url').value = origin + data.reveal_path;
|
| 826 |
+
|
| 827 |
+
// Fetch public redacted version to preview
|
| 828 |
+
const pv = await fetch('/api/paste/' + data.id).then(x => x.json());
|
| 829 |
+
document.getElementById('pp-preview-redacted').innerHTML = renderRedacted(pv.redacted);
|
| 830 |
+
|
| 831 |
+
const s = data.stats;
|
| 832 |
+
const cats = Object.entries(s.categories).sort((a,b) => b[1].count - a[1].count);
|
| 833 |
+
const catHtml = cats.length
|
| 834 |
+
? cats.map(([k,v]) => {
|
| 835 |
+
const m = CATS[k] || {label:k, color:'#333'};
|
| 836 |
+
return '<span class="pp-red" style="--cat:'+m.color+';margin:2px 4px 2px 0"><span class="pp-red-dot"></span>'+escapeHtml(m.label)+' × '+v.count+'</span>';
|
| 837 |
+
}).join('')
|
| 838 |
+
: '<em style="color:var(--ink-faint)">No PII found in this paste.</em>';
|
| 839 |
+
document.getElementById('pp-preview-summary').innerHTML =
|
| 840 |
+
'<div style="display:flex;gap:18px;margin-bottom:10px;align-items:baseline"><div><div style="font-family:var(--font-serif);font-size:26px;letter-spacing:-0.02em;line-height:1">'+s.pii_percentage+'%</div><div class="pp-caps" style="margin-top:3px">PII density</div></div>'+
|
| 841 |
+
'<div><div style="font-family:var(--font-serif);font-size:26px;letter-spacing:-0.02em;line-height:1">'+s.total_spans+'</div><div class="pp-caps" style="margin-top:3px">spans</div></div>'+
|
| 842 |
+
'<div><div style="font-family:var(--font-serif);font-size:26px;letter-spacing:-0.02em;line-height:1">'+s.total_chars.toLocaleString()+'</div><div class="pp-caps" style="margin-top:3px">chars</div></div></div>'+
|
| 843 |
+
'<div>'+catHtml+'</div>';
|
| 844 |
+
|
| 845 |
+
$ok.classList.add('on');
|
| 846 |
+
$ok.scrollIntoView({behavior:'smooth', block:'start'});
|
| 847 |
+
} catch (e) {
|
| 848 |
+
$err.classList.add('on');
|
| 849 |
+
$err.textContent = e.message || 'Failed to create paste.';
|
| 850 |
+
} finally {
|
| 851 |
+
$btn.disabled = false; $load.classList.remove('on');
|
| 852 |
+
}
|
| 853 |
+
}
|
| 854 |
+
|
| 855 |
+
$btn.addEventListener('click', createPaste);
|
| 856 |
+
$text.addEventListener('keydown', (e) => {
|
| 857 |
+
if ((e.metaKey || e.ctrlKey) && e.key === 'Enter'){ e.preventDefault(); createPaste(); }
|
| 858 |
+
});
|
| 859 |
+
|
| 860 |
+
document.addEventListener('click', (e) => {
|
| 861 |
+
const b = e.target.closest('[data-copy]'); if (!b) return;
|
| 862 |
+
const inp = document.getElementById(b.dataset.copy);
|
| 863 |
+
inp.select(); navigator.clipboard.writeText(inp.value);
|
| 864 |
+
const prev = b.textContent; b.textContent = 'Copied'; setTimeout(() => b.textContent = prev, 1200);
|
| 865 |
+
});
|
| 866 |
+
|
| 867 |
+
updateCount(); updateCursor();
|
| 868 |
+
</script>
|
| 869 |
+
</body>
|
| 870 |
+
</html>
|
| 871 |
+
"""
|
| 872 |
+
|
| 873 |
+
# ── view page ─────────────────────────────────────────────────────
|
| 874 |
+
|
| 875 |
+
_VIEW_HTML = r"""<!DOCTYPE html>
|
| 876 |
+
<html lang="en">
|
| 877 |
+
<head>
|
| 878 |
+
<meta charset="UTF-8">
|
| 879 |
+
<meta name="viewport" content="width=device-width,initial-scale=1">
|
| 880 |
+
<title>Paste __PID__ — DLP Paste-Proxy</title>
|
| 881 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 882 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 883 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&family=Instrument+Serif:ital@0;1&display=swap" rel="stylesheet">
|
| 884 |
+
<style>
|
| 885 |
+
""" + _SHARED_CSS + r"""
|
| 886 |
+
|
| 887 |
+
.pp-view-head{
|
| 888 |
+
display:flex;align-items:center;gap:10px;flex-wrap:wrap;margin-bottom:18px;
|
| 889 |
+
padding:14px 16px;background:var(--panel);border:0.5px solid var(--line);
|
| 890 |
+
border-radius:var(--radius-lg);box-shadow:var(--shadow-sm);
|
| 891 |
+
}
|
| 892 |
+
.pp-view-id{font-family:var(--font-mono);font-size:12.5px;color:var(--ink-dim);padding:3px 8px;background:var(--panel-2);border:0.5px solid var(--line);border-radius:5px}
|
| 893 |
+
.pp-view-mode{font-size:11px;font-weight:600;letter-spacing:0.06em;text-transform:uppercase;padding:3px 8px;border-radius:4px;background:color-mix(in srgb,var(--ink) 8%,transparent);color:var(--ink-dim)}
|
| 894 |
+
.pp-view-expiry{font-family:var(--font-mono);font-size:11.5px;color:var(--ink-faint)}
|
| 895 |
+
|
| 896 |
+
.pp-stat-row{display:flex;gap:26px;flex-wrap:wrap;margin-left:auto;margin-right:0}
|
| 897 |
+
.pp-stat{text-align:right}
|
| 898 |
+
.pp-stat b{font-family:var(--font-serif);font-weight:500;font-size:22px;letter-spacing:-0.01em;display:block;line-height:1}
|
| 899 |
+
.pp-stat span{font-size:10.5px;letter-spacing:0.08em;text-transform:uppercase;color:var(--ink-faint);font-weight:500}
|
| 900 |
+
|
| 901 |
+
.pp-view-body{
|
| 902 |
+
background:var(--panel);border:0.5px solid var(--line);border-radius:var(--radius-lg);
|
| 903 |
+
box-shadow:var(--shadow-md);padding:28px 32px 30px;
|
| 904 |
+
}
|
| 905 |
+
.pp-body-redacted, .pp-body-reveal{
|
| 906 |
+
font-family:var(--font-serif);font-size:17px;line-height:1.7;
|
| 907 |
+
color:var(--ink);
|
| 908 |
+
white-space:pre-wrap;word-wrap:break-word;
|
| 909 |
+
}
|
| 910 |
+
|
| 911 |
+
/* highlight (reveal mode) */
|
| 912 |
+
.pp-hi{
|
| 913 |
+
background:color-mix(in srgb, var(--cat,#666) 18%, transparent);
|
| 914 |
+
color:var(--ink);
|
| 915 |
+
border-radius:3px;padding:1px 3px;margin:0 1px;
|
| 916 |
+
border:0.5px solid color-mix(in srgb, var(--cat,#666) 30%, transparent);
|
| 917 |
+
position:relative;
|
| 918 |
+
}
|
| 919 |
+
.pp-hi-tag{
|
| 920 |
+
font-family:var(--font-sans);font-size:9.5px;letter-spacing:0.07em;text-transform:uppercase;
|
| 921 |
+
font-weight:600;color:var(--cat,#666);margin-left:4px;opacity:.72;
|
| 922 |
+
}
|
| 923 |
+
|
| 924 |
+
.pp-badges{display:flex;gap:8px;flex-wrap:wrap;margin-bottom:18px;padding-bottom:16px;border-bottom:0.5px solid var(--line)}
|
| 925 |
+
.pp-badge{
|
| 926 |
+
display:inline-flex;align-items:center;gap:6px;
|
| 927 |
+
font-size:12px;font-weight:500;padding:4px 9px 4px 8px;
|
| 928 |
+
border-radius:4px;background:color-mix(in srgb, var(--cat,#666) 10%, transparent);
|
| 929 |
+
border:0.5px solid color-mix(in srgb, var(--cat,#666) 22%, transparent);
|
| 930 |
+
color:var(--ink-dim);
|
| 931 |
+
}
|
| 932 |
+
.pp-badge-dot{width:6px;height:6px;border-radius:50%;background:var(--cat,#666)}
|
| 933 |
+
.pp-badge-n{font-family:var(--font-mono);font-size:11px;color:var(--cat,#666);font-weight:600;margin-left:2px}
|
| 934 |
+
.pp-muted{color:var(--ink-faint);font-size:13px}
|
| 935 |
+
|
| 936 |
+
.pp-actions{display:flex;gap:10px;margin-top:16px;flex-wrap:wrap}
|
| 937 |
+
.pp-btn{
|
| 938 |
+
font-size:12.5px;font-weight:500;padding:8px 14px;
|
| 939 |
+
border:0.5px solid var(--line-strong);border-radius:var(--radius-md);
|
| 940 |
+
background:var(--panel);color:var(--ink);display:inline-flex;align-items:center;gap:8px;
|
| 941 |
+
transition:background .12s,border-color .12s;
|
| 942 |
+
}
|
| 943 |
+
.pp-btn:hover{background:color-mix(in srgb, var(--ink) 4%, var(--panel));border-color:var(--ink-dim)}
|
| 944 |
+
|
| 945 |
+
.pp-footer{margin-top:28px;padding-top:22px;border-top:0.5px solid var(--line);display:flex;justify-content:space-between;gap:16px;color:var(--ink-faint);font-size:12px;flex-wrap:wrap}
|
| 946 |
+
</style>
|
| 947 |
+
</head>
|
| 948 |
+
<body>
|
| 949 |
+
<div class="pp-shell">
|
| 950 |
+
|
| 951 |
+
<div class="pp-brand">
|
| 952 |
+
<a href="/" style="text-decoration:none;display:flex;align-items:center;gap:10px">
|
| 953 |
+
<div class="pp-brand-mark">P</div>
|
| 954 |
+
<div class="pp-brand-name">DLP Paste-Proxy<span class="sub">pastebin with a conscience</span></div>
|
| 955 |
+
</a>
|
| 956 |
+
</div>
|
| 957 |
+
|
| 958 |
+
<div class="pp-view-head">
|
| 959 |
+
<span class="pp-caps">Paste</span>
|
| 960 |
+
<span class="pp-view-id">__PID__</span>
|
| 961 |
+
<span class="pp-view-mode">__MODE__</span>
|
| 962 |
+
<span class="pp-view-expiry">__CREATED__ · __EXPIRY__</span>
|
| 963 |
+
|
| 964 |
+
<div class="pp-stat-row">
|
| 965 |
+
<div class="pp-stat"><b>__PCT__%</b><span>PII density</span></div>
|
| 966 |
+
<div class="pp-stat"><b>__SPANS_N__</b><span>spans</span></div>
|
| 967 |
+
<div class="pp-stat"><b>__CHARS_N__</b><span>chars</span></div>
|
| 968 |
+
</div>
|
| 969 |
+
</div>
|
| 970 |
+
|
| 971 |
+
__BANNER__
|
| 972 |
+
|
| 973 |
+
<div class="pp-view-body">
|
| 974 |
+
<div class="pp-badges">__BADGES__</div>
|
| 975 |
+
<div class="__BODY_CLASS__">__BODY__</div>
|
| 976 |
+
|
| 977 |
+
<div class="pp-actions">
|
| 978 |
+
<button type="button" class="pp-btn" onclick="navigator.clipboard.writeText(window.location.href); this.textContent='Copied this link'">Copy this link</button>
|
| 979 |
+
<a class="pp-btn" href="/">Create your own paste →</a>
|
| 980 |
+
</div>
|
| 981 |
+
</div>
|
| 982 |
+
|
| 983 |
+
<footer class="pp-footer">
|
| 984 |
+
<div>Recipients see placeholders. The author's reveal link shows the original inline.</div>
|
| 985 |
+
<div>Views: __VIEWS__ · Reveals: __REVEALS__</div>
|
| 986 |
+
</footer>
|
| 987 |
+
</div>
|
| 988 |
+
</body>
|
| 989 |
+
</html>
|
| 990 |
+
"""
|
| 991 |
+
|
| 992 |
+
_NOT_FOUND_HTML = r"""<!DOCTYPE html>
|
| 993 |
+
<html lang="en">
|
| 994 |
+
<head>
|
| 995 |
+
<meta charset="UTF-8">
|
| 996 |
+
<meta name="viewport" content="width=device-width,initial-scale=1">
|
| 997 |
+
<title>Paste not found — DLP Paste-Proxy</title>
|
| 998 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 999 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 1000 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&family=Instrument+Serif:ital@0;1&display=swap" rel="stylesheet">
|
| 1001 |
+
<style>
|
| 1002 |
+
""" + _SHARED_CSS + r"""
|
| 1003 |
+
.pp-404{
|
| 1004 |
+
background:var(--panel);border:0.5px solid var(--line);border-radius:var(--radius-lg);
|
| 1005 |
+
box-shadow:var(--shadow-md);padding:56px 40px;text-align:center;
|
| 1006 |
+
}
|
| 1007 |
+
.pp-404 h1{font-family:var(--font-serif);font-size:48px;font-weight:500;letter-spacing:-0.02em;line-height:1;margin-bottom:10px}
|
| 1008 |
+
.pp-404 p{color:var(--ink-dim);margin-bottom:22px;max-width:44ch;margin-left:auto;margin-right:auto}
|
| 1009 |
+
.pp-404 code{font-family:var(--font-mono);font-size:12.5px;background:var(--panel-2);padding:2px 8px;border-radius:4px}
|
| 1010 |
+
.pp-btn{font-size:13px;font-weight:500;padding:10px 16px;border:0.5px solid var(--line-strong);border-radius:var(--radius-md);background:var(--primary-bg);color:var(--primary-fg);display:inline-flex;align-items:center;gap:8px}
|
| 1011 |
+
</style>
|
| 1012 |
+
</head>
|
| 1013 |
+
<body>
|
| 1014 |
+
<div class="pp-shell">
|
| 1015 |
+
<div class="pp-brand">
|
| 1016 |
+
<a href="/" style="text-decoration:none;display:flex;align-items:center;gap:10px">
|
| 1017 |
+
<div class="pp-brand-mark">P</div>
|
| 1018 |
+
<div class="pp-brand-name">DLP Paste-Proxy<span class="sub">pastebin with a conscience</span></div>
|
| 1019 |
+
</a>
|
| 1020 |
+
</div>
|
| 1021 |
+
<div class="pp-404">
|
| 1022 |
+
<h1>Paste not found</h1>
|
| 1023 |
+
<p><code>{{PID}}</code> either never existed, expired by its TTL, or was evicted by a server restart. Pastes live in process memory for the demo.</p>
|
| 1024 |
+
<a class="pp-btn" href="/">Create a new paste →</a>
|
| 1025 |
+
</div>
|
| 1026 |
+
</div>
|
| 1027 |
+
</body>
|
| 1028 |
+
</html>
|
| 1029 |
+
"""
|
| 1030 |
+
|
| 1031 |
+
|
| 1032 |
+
# ── launch ────────────────────────────────────────────────────────
|
| 1033 |
+
|
| 1034 |
+
# Entry point: bind on all interfaces at port 7860 (the port Hugging Face
# Spaces routes to). `server` is presumably the app object built earlier
# in app.py — it is defined outside this view.
if __name__ == "__main__":
    server.launch(server_name="0.0.0.0", server_port=7860)
|
opf.py
ADDED
|
@@ -0,0 +1,557 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenAI Privacy Filter — inference module for DLP Paste-Proxy.
|
| 3 |
+
|
| 4 |
+
This is a focused subset of the reference implementation used in
|
| 5 |
+
app_v6.py: architecture (Transformer + Viterbi decoder), span decoding,
|
| 6 |
+
and a single public entrypoint `predict_text(text) -> (source_text, spans)`.
|
| 7 |
+
|
| 8 |
+
The numerics and the config contract are deliberately identical to v6 so
|
| 9 |
+
any future model-level tweaks made upstream can be ported in directly.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import dataclasses
|
| 15 |
+
import functools
|
| 16 |
+
import json
|
| 17 |
+
import math
|
| 18 |
+
import os
|
| 19 |
+
from bisect import bisect_left, bisect_right
|
| 20 |
+
from dataclasses import dataclass
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from typing import Final
|
| 23 |
+
|
| 24 |
+
import tiktoken
|
| 25 |
+
import torch
|
| 26 |
+
import torch.nn.functional as F
|
| 27 |
+
from huggingface_hub import snapshot_download
|
| 28 |
+
from safetensors import safe_open
|
| 29 |
+
|
| 30 |
+
# ── checkpoint download ───────────────────────────────────────────
# NOTE(review): snapshot_download runs at import time, so importing this
# module eagerly fetches the checkpoint repo (network on first run,
# local HF cache afterwards).
MODEL_REPO = os.getenv("MODEL_ID", "charles-first-org/second-model")
HF_TOKEN = os.getenv("HF_TOKEN", None)
MODEL_DIR = Path(snapshot_download(MODEL_REPO, token=HF_TOKEN))

# ── checkpoint config contract ────────────────────────────────────
# Expected model_type and the keys every checkpoint config must carry;
# enforced by validate_model_config_contract.
PRIVACY_FILTER_MODEL_TYPE: Final[str] = "privacy_filter"
REQUIRED_MODEL_CONFIG_KEYS: Final[tuple[str, ...]] = (
    "model_type", "encoding", "num_hidden_layers", "num_experts",
    "experts_per_token", "vocab_size", "num_labels", "hidden_size",
    "intermediate_size", "head_dim", "num_attention_heads",
    "num_key_value_heads", "sliding_window", "bidirectional_context",
    "bidirectional_left_context", "bidirectional_right_context",
    "default_n_ctx", "initial_context_length", "rope_theta",
    "rope_scaling_factor", "rope_ntk_alpha", "rope_ntk_beta", "param_dtype",
)

# ── label space ───────────────────────────────────────────────────
# "O" marks non-PII background; each of the other eight span classes is
# expanded with B/I/E/S boundary prefixes: 1 + 8*4 = 33 labels, matching
# the num_labels == 33 contract check in validate_model_config_contract.
BACKGROUND_CLASS_LABEL: Final[str] = "O"
BOUNDARY_PREFIXES: Final[tuple[str, ...]] = ("B", "I", "E", "S")
SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
    BACKGROUND_CLASS_LABEL,
    "account_number", "private_address", "private_date", "private_email",
    "private_person", "private_phone", "private_url", "secret",
)
NER_CLASS_NAMES: Final[tuple[str, ...]] = (BACKGROUND_CLASS_LABEL,) + tuple(
    f"{prefix}-{base}"
    for base in SPAN_CLASS_NAMES if base != BACKGROUND_CLASS_LABEL
    for prefix in BOUNDARY_PREFIXES
)

# ── Viterbi decoder calibration ───────────────────────────────────
# Names of the transition-bias knobs consumed by the Viterbi decoder
# (presumably loaded from a named calibration preset — the decoder itself
# is defined later in this module; confirm against that code).
VITERBI_TRANSITION_BIAS_KEYS: Final[tuple[str, ...]] = (
    "transition_bias_background_stay", "transition_bias_background_to_start",
    "transition_bias_inside_to_continue", "transition_bias_inside_to_end",
    "transition_bias_end_to_background", "transition_bias_end_to_start",
)
DEFAULT_VITERBI_CALIBRATION_PRESET: Final[str] = "default"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def validate_model_config_contract(cfg: dict, *, context: str) -> None:
    """Fail fast if a checkpoint config violates the privacy-filter contract.

    Raises ValueError (with `context` naming the config's origin) on the
    first violation found; returns None when every check passes.
    """
    missing = [key for key in REQUIRED_MODEL_CONFIG_KEYS if key not in cfg]
    if missing:
        raise ValueError(f"{context} missing keys: {', '.join(missing)}")

    if cfg.get("model_type") != PRIVACY_FILTER_MODEL_TYPE:
        raise ValueError(f"{context} model_type must be {PRIVACY_FILTER_MODEL_TYPE!r}")

    # The model is strictly bidirectional; a causal checkpoint is unusable here.
    if cfg.get("bidirectional_context") is not True:
        raise ValueError(f"{context} must use bidirectional_context=true")

    left = cfg.get("bidirectional_left_context")
    right = cfg.get("bidirectional_right_context")
    symmetric = (
        isinstance(left, int)
        and isinstance(right, int)
        and left == right
        and left >= 0
    )
    if not symmetric:
        raise ValueError(f"{context} bidirectional context must be equal non-negative ints")

    # The attention window covers the token itself plus `left` neighbours
    # on each side.
    if cfg.get("sliding_window") != 2 * left + 1:
        raise ValueError(f"{context} sliding_window must equal 2*context+1")

    if cfg["num_labels"] != 33:
        raise ValueError(f"{context} num_labels must be 33")
    if cfg["param_dtype"] != "bfloat16":
        raise ValueError(f"{context} param_dtype must be bfloat16")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def expert_linear(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None) -> torch.Tensor:
|
| 85 |
+
n, e, k = x.shape
|
| 86 |
+
_, _, _, o = weight.shape
|
| 87 |
+
out = torch.bmm(x.reshape(n * e, 1, k), weight.reshape(n * e, k, o)).reshape(n, e, o)
|
| 88 |
+
return out + bias if bias is not None else out
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@dataclass
class ModelConfig:
    """Architecture hyper-parameters extracted from a checkpoint config."""

    num_hidden_layers: int
    num_experts: int
    experts_per_token: int
    vocab_size: int
    num_labels: int
    hidden_size: int
    intermediate_size: int
    head_dim: int
    num_attention_heads: int
    num_key_value_heads: int
    bidirectional_context_size: int
    initial_context_length: int
    rope_theta: float
    rope_scaling_factor: float
    rope_ntk_alpha: float
    rope_ntk_beta: float

    @classmethod
    def from_checkpoint_config(cls, cfg: dict, *, context: str) -> "ModelConfig":
        """Build a ModelConfig from a raw checkpoint config dict.

        Unknown keys are silently dropped; `bidirectional_left_context` is
        renamed to `bidirectional_context_size`. `context` names the config's
        origin for interface parity with the validator and is unused here.
        """
        raw = dict(cfg)  # don't mutate the caller's dict
        raw["bidirectional_context_size"] = raw["bidirectional_left_context"]
        known = {field.name for field in dataclasses.fields(cls)}
        kwargs = {key: value for key, value in raw.items() if key in known}
        return cls(**kwargs)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalization with a learned per-feature scale.

    Statistics are computed in float32 for stability; the result is cast
    back to the input dtype.
    """

    def __init__(self, n: int, eps: float = 1e-5, device=None):
        super().__init__()
        self.eps = eps
        # Learned gain, initialized to identity, kept in float32.
        self.scale = torch.nn.Parameter(torch.ones(n, device=device, dtype=torch.float32))

    def forward(self, x):
        xf = x.float()
        inv_rms = torch.rsqrt(xf.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return (xf * inv_rms * self.scale).to(x.dtype)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def apply_rope(x, cos, sin):
    """Rotate interleaved even/odd feature pairs of x by per-position angles.

    cos/sin are broadcast over the head axis (an axis is inserted at -2) and
    cast to x's dtype; the rotated pairs are re-interleaved so the output has
    exactly x's shape.
    """
    c = cos.unsqueeze(-2).to(x.dtype)
    s = sin.unsqueeze(-2).to(x.dtype)
    even = x[..., 0::2]
    odd = x[..., 1::2]
    rotated_even = even * c - odd * s
    rotated_odd = odd * c + even * s
    # stack(..., dim=-1) + reshape re-interleaves the pairs back in place.
    return torch.stack((rotated_even, rotated_odd), dim=-1).reshape(x.shape)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class RotaryEmbedding(torch.nn.Module):
    """Rotary position embedding with optional long-context frequency scaling.

    cos/sin tables are precomputed once for the maximum expected position
    count and cached as non-persistent buffers; `forward` regrows the cache
    on demand when a longer sequence arrives and migrates it to the inputs'
    device as needed.
    """

    def __init__(self, head_dim, base, dtype, *, initial_context_length=4096,
                 scaling_factor=1.0, ntk_alpha=1.0, ntk_beta=32.0, device=None):
        super().__init__()
        self.head_dim, self.base, self.dtype = head_dim, base, dtype
        self.initial_context_length = initial_context_length
        self.scaling_factor, self.ntk_alpha, self.ntk_beta = scaling_factor, ntk_alpha, ntk_beta
        self.device = device
        # Cache covers the scaled context length, never less than the
        # initial (unscaled) length.
        mp = max(int(initial_context_length * scaling_factor), initial_context_length)
        self.max_position_embeddings = mp
        # Tables are built on CPU first, then moved to the target device.
        cos, sin = self._compute(mp, device=torch.device("cpu"))
        target = device or torch.device("cpu")
        # Non-persistent: excluded from state_dict, rebuilt per process.
        self.register_buffer("cos_cache", cos.to(target), persistent=False)
        self.register_buffer("sin_cache", sin.to(target), persistent=False)

    def _inv_freq(self, device=None):
        # Inverse wavelength per even feature dimension (head_dim/2 values).
        device = device or self.device
        freq = self.base ** (torch.arange(0, self.head_dim, 2, dtype=torch.float, device=device) / self.head_dim)
        if self.scaling_factor > 1.0:
            # NTK-by-parts-style blend: low-frequency channels are
            # position-interpolated (divided by scaling_factor), high-frequency
            # channels extrapolate unchanged, with a linear ramp between the
            # ntk_beta/ntk_alpha wavelength cutoffs.
            d_half = self.head_dim / 2
            low = d_half * math.log(self.initial_context_length / (self.ntk_beta * 2 * math.pi)) / math.log(self.base)
            high = d_half * math.log(self.initial_context_length / (self.ntk_alpha * 2 * math.pi)) / math.log(self.base)
            interp = 1.0 / (self.scaling_factor * freq)
            extrap = 1.0 / freq
            ramp = (torch.arange(d_half, dtype=torch.float32, device=device) - low) / (high - low)
            mask = 1 - ramp.clamp(0, 1)
            return interp * (1 - mask) + extrap * mask
        return 1.0 / freq

    def _compute(self, n, device=None):
        # Build cos/sin tables for positions [0, n) in the configured dtype.
        inv_freq = self._inv_freq(device)
        t = torch.arange(n, dtype=torch.float32, device=device or self.device)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        # Magnitude correction applied only when long-context scaling is
        # active (presumably a YaRN-style attention-temperature factor —
        # TODO confirm against the upstream reference implementation).
        c = 0.1 * math.log(self.scaling_factor) + 1.0 if self.scaling_factor > 1.0 else 1.0
        return (freqs.cos() * c).to(self.dtype), (freqs.sin() * c).to(self.dtype)

    def forward(self, q, k):
        # q/k: position index is dim 0; trailing dims must be reshapeable to
        # (n, -1, head_dim) — assumes (seq, heads*head_dim)-like layouts.
        n = q.shape[0]
        if n > self.cos_cache.shape[0]:
            # Sequence exceeds the cached table: recompute a larger one on
            # CPU, then move it to the inputs' device.
            cos, sin = self._compute(n, torch.device("cpu"))
            self.cos_cache, self.sin_cache = cos.to(q.device), sin.to(q.device)
        # Lazily migrate the cached tables if the inputs moved device.
        cc = self.cos_cache.to(q.device) if self.cos_cache.device != q.device else self.cos_cache
        sc = self.sin_cache.to(q.device) if self.sin_cache.device != q.device else self.sin_cache
        cos, sin = cc[:n], sc[:n]
        q = apply_rope(q.view(n, -1, self.head_dim), cos, sin).reshape(q.shape)
        k = apply_rope(k.view(n, -1, self.head_dim), cos, sin).reshape(k.shape)
        return q, k
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def sdpa(Q, K, V, S, sm_scale, ctx):
    """Sliding-window bidirectional attention with per-head sink logits.

    Q: (n, nh, qm, hd) grouped queries; K, V: (n, nh, hd).  Each position
    attends to positions within +/- ctx of itself.  S holds one sink logit
    per attention head; its softmax mass is discarded afterwards, so it
    only dampens the real attention weights.  Returns (n, nh*qm*hd).
    """
    n, nh, qm, hd = Q.shape
    w = 2 * ctx + 1  # full window width
    # Pad the sequence axis so every position has a complete window, then
    # materialize a per-position window view of K and V.
    Kp = F.pad(K, (0, 0, 0, 0, ctx, ctx)); Vp = F.pad(V, (0, 0, 0, 0, ctx, ctx))
    Kw = Kp.unfold(0, w, 1).permute(0, 3, 1, 2); Vw = Vp.unfold(0, w, 1).permute(0, 3, 1, 2)
    # Mark window slots that fall outside the real sequence as invalid.
    idx = torch.arange(w, device=Q.device) - ctx
    pos = torch.arange(n, device=Q.device)[:, None] + idx[None, :]
    valid = (pos >= 0) & (pos < n)
    scores = torch.einsum("nhqd,nwhd->nhqw", Q, Kw).float() * sm_scale
    scores = scores.masked_fill(~valid[:, None, None, :], -float("inf"))
    # Sink logit appended as one extra softmax slot per (pos, head, group);
    # the ln(2) factor suggests the checkpoint stores sinks base-2 — TODO confirm.
    sink = (S * math.log(2.0)).reshape(nh, qm)[None, :, :, None].expand(n, -1, -1, 1)
    scores = torch.cat([scores, sink], dim=-1)
    # Drop the sink column after the softmax: weights no longer sum to 1.
    wt = torch.softmax(scores, dim=-1)[..., :-1].to(V.dtype)
    return torch.einsum("nhqw,nwhd->nhqd", wt, Vw).reshape(n, -1)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
class AttentionBlock(torch.nn.Module):
    """Pre-norm sliding-window attention with grouped KV heads and a residual add."""

    def __init__(self, cfg: ModelConfig, device=None):
        super().__init__()
        dt = torch.bfloat16
        self.head_dim, self.nah, self.nkv = cfg.head_dim, cfg.num_attention_heads, cfg.num_key_value_heads
        # Half-width of the bidirectional attention window (positions per side).
        self.ctx = int(cfg.bidirectional_context_size)
        # One learned sink logit per attention head (filled from the checkpoint).
        self.sinks = torch.nn.Parameter(torch.empty(cfg.num_attention_heads, device=device, dtype=torch.float32))
        self.norm = RMSNorm(cfg.hidden_size, device=device)
        # Fused Q/K/V projection: nah query heads plus nkv key and nkv value heads.
        qkv_d = cfg.head_dim * (cfg.num_attention_heads + 2 * cfg.num_key_value_heads)
        self.qkv = torch.nn.Linear(cfg.hidden_size, qkv_d, device=device, dtype=dt)
        self.out = torch.nn.Linear(cfg.head_dim * cfg.num_attention_heads, cfg.hidden_size, device=device, dtype=dt)
        # d^{-1/4}: applied to BOTH q and k below, so their product carries 1/sqrt(d).
        self.qk_scale = 1 / math.sqrt(math.sqrt(cfg.head_dim))
        self.rope = RotaryEmbedding(cfg.head_dim, int(cfg.rope_theta), torch.float32,
                                    initial_context_length=cfg.initial_context_length,
                                    scaling_factor=cfg.rope_scaling_factor,
                                    ntk_alpha=cfg.rope_ntk_alpha, ntk_beta=cfg.rope_ntk_beta, device=device)

    def forward(self, x):
        # x is 2-D here — (seq_len, hidden_size), token-major with no batch axis.
        t = self.norm(x).to(self.qkv.weight.dtype)
        qkv = F.linear(t, self.qkv.weight, self.qkv.bias)
        hd, nah, nkv = self.head_dim, self.nah, self.nkv
        # Split the fused projection into its q, k, v segments.
        q = qkv[:, :nah * hd].contiguous()
        k = qkv[:, nah * hd:(nah + nkv) * hd].contiguous()
        v = qkv[:, (nah + nkv) * hd:(nah + 2 * nkv) * hd].contiguous()
        q, k = self.rope(q, k)
        q, k = q * self.qk_scale, k * self.qk_scale
        n = q.shape[0]
        # Group query heads by their shared KV head: (n, nkv, nah // nkv, hd).
        q = q.view(n, nkv, nah // nkv, hd); k = k.view(n, nkv, hd); v = v.view(n, nkv, hd)
        # sm_scale=1.0 because 1/sqrt(d) is already baked into q and k above.
        ao = sdpa(q, k, v, self.sinks, 1.0, self.ctx).to(self.out.weight.dtype)
        return x + F.linear(ao, self.out.weight, self.out.bias).to(x.dtype)
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def swiglu(x, alpha=1.702, limit=7.0):
    """Clamped SwiGLU: first half of the last dim gates, second half is linear.

    The gate half is clamped from above only; the linear half is clamped
    symmetrically and shifted by +1 before multiplying.
    """
    gate, lin = x.chunk(2, dim=-1)
    gate = gate.clamp(max=limit)
    lin = lin.clamp(min=-limit, max=limit)
    return gate * torch.sigmoid(alpha * gate) * (lin + 1)
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class MLPBlock(torch.nn.Module):
    """Pre-norm mixture-of-experts MLP with top-k routing and a residual add."""

    def __init__(self, cfg: ModelConfig, device=None):
        super().__init__()
        dt = torch.bfloat16
        self.ne, self.ept = cfg.num_experts, cfg.experts_per_token
        self.norm = RMSNorm(cfg.hidden_size, device=device)
        # Router producing one logit per expert.
        self.gate = torch.nn.Linear(cfg.hidden_size, cfg.num_experts, device=device, dtype=dt)
        # Per-expert weights; mlp1 doubles the width for the SwiGLU gate/linear split.
        self.mlp1_weight = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.hidden_size, cfg.intermediate_size * 2, device=device, dtype=dt))
        self.mlp1_bias = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.intermediate_size * 2, device=device, dtype=dt))
        self.mlp2_weight = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.intermediate_size, cfg.hidden_size, device=device, dtype=dt))
        self.mlp2_bias = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.hidden_size, device=device, dtype=dt))

    def forward(self, x):
        t = self.norm(x)
        # Route in float32 for numerical stability.
        gs = F.linear(t.float(), self.gate.weight.float(), self.gate.bias.float())
        top = torch.topk(gs, k=self.ept, dim=-1, sorted=True)
        # Divided by ept here and multiplied back by ept inside _chunk, so the
        # net effect is plain softmax weights over the selected experts.
        ew = torch.softmax(top.values, dim=-1) / self.ept
        ei = top.indices
        ept = self.ept

        def _chunk(tc, eic, ewc):
            # Run each token through its selected experts and mix the outputs.
            # NOTE(review): expert_linear is defined elsewhere — presumably a
            # batched per-expert affine transform; confirm against its definition.
            o = expert_linear(tc.float().unsqueeze(1).expand(-1, eic.shape[1], -1),
                              self.mlp1_weight[eic].float(), self.mlp1_bias[eic].float())
            o = swiglu(o)
            o = expert_linear(o.float(), self.mlp2_weight[eic].float(), self.mlp2_bias[eic].float())
            return (torch.einsum("bec,be->bc", o.to(ewc.dtype), ewc) * ept).to(x.dtype)

        # Process tokens in chunks of 32 — presumably to bound the peak memory
        # of the gathered per-expert weight slices.
        cs = 32
        if t.shape[0] > cs:
            parts = [_chunk(t[s:s+cs], ei[s:s+cs], ew[s:s+cs]) for s in range(0, t.shape[0], cs)]
            return x + torch.cat(parts, 0)
        return x + _chunk(t, ei, ew)
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
class TransformerBlock(torch.nn.Module):
    """One transformer layer: sliding-window attention followed by the MoE MLP.

    Both sub-blocks apply their own pre-norm and residual connection.
    """

    def __init__(self, cfg, device=None):
        super().__init__()
        self.attn = AttentionBlock(cfg, device=device)
        self.mlp = MLPBlock(cfg, device=device)

    def forward(self, x):
        h = self.attn(x)
        return self.mlp(h)
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
class Checkpoint:
    """Lazy reader over a directory of .safetensors shards.

    Translates this module's parameter names to the on-disk tensor names
    (the checkpoint stores MoE weights under "swiglu"/"out") and records
    which shard holds each tensor so ``get`` can open just that file.
    """

    @staticmethod
    def build_param_name_map(n):
        """Model parameter name -> checkpoint tensor name, for n layers."""
        renames = {
            "mlp1_bias": "swiglu.bias",
            "mlp1_weight": "swiglu.weight",
            "mlp2_bias": "out.bias",
            "mlp2_weight": "out.weight",
        }
        return {f"block.{i}.mlp.{src}": f"block.{i}.mlp.{dst}"
                for i in range(n) for src, dst in renames.items()}

    def __init__(self, path, device, num_hidden_layers):
        self.pnm = self.build_param_name_map(num_hidden_layers)
        if device.index is None:
            self.ds = device.type
        else:
            self.ds = f"{device.type}:{device.index}"
        # Index every tensor key to the shard file that contains it.
        self.map = {}
        for fname in os.listdir(path):
            if not fname.endswith(".safetensors"):
                continue
            shard = os.path.join(path, fname)
            with safe_open(shard, framework="pt", device=self.ds) as handle:
                for key in handle.keys():
                    self.map[key] = shard

    def get(self, name):
        """Load the tensor for model parameter ``name`` from its shard."""
        mapped = self.pnm.get(name, name)
        with safe_open(self.map[mapped], framework="pt", device=self.ds) as handle:
            return handle.get_tensor(mapped)
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
class Transformer(torch.nn.Module):
    """Token-classification transformer: embedding -> blocks -> RMSNorm -> label head."""

    def __init__(self, cfg, device):
        super().__init__()
        dt = torch.bfloat16
        self.embedding = torch.nn.Embedding(cfg.vocab_size, cfg.hidden_size, device=device, dtype=dt)
        self.block = torch.nn.ModuleList([TransformerBlock(cfg, device=device) for _ in range(cfg.num_hidden_layers)])
        self.norm = RMSNorm(cfg.hidden_size, device=device)
        # Projects to num_labels classes — a label head, not a vocab unembedding.
        self.unembedding = torch.nn.Linear(cfg.hidden_size, cfg.num_labels, bias=False, device=device, dtype=dt)

    def forward(self, token_ids):
        # token_ids: 1-D sequence of token ids; returns (seq_len, num_labels) logits.
        x = self.embedding(token_ids)
        for blk in self.block:
            x = blk(x)
        return F.linear(self.norm(x), self.unembedding.weight, None)

    @classmethod
    def from_checkpoint(cls, checkpoint_dir, *, device):
        """Build an eval-mode model from config.json + .safetensors in checkpoint_dir.

        Disables TF32 globally so inference is reproducible across matmul
        backends.  Raises ValueError on any parameter shape mismatch.
        """
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cudnn.allow_tf32 = False
        torch.set_float32_matmul_precision("highest")
        cp = json.loads((Path(checkpoint_dir) / "config.json").read_text())
        validate_model_config_contract(cp, context=str(checkpoint_dir))
        cfg = ModelConfig.from_checkpoint_config(cp, context=str(checkpoint_dir))
        ckpt = Checkpoint(checkpoint_dir, device, cfg.num_hidden_layers)
        m = cls(cfg, device); m.eval()
        # Copy every parameter in place from its shard, verifying shapes.
        for name, param in m.named_parameters():
            loaded = ckpt.get(name)
            if param.shape != loaded.shape:
                raise ValueError(f"Shape mismatch {name}: {param.shape} vs {loaded.shape}")
            param.data.copy_(loaded)
        return m
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
@dataclass(frozen=True)
class LabelInfo:
    """Static label-space metadata shared by the span encoding/decoding helpers."""
    # span-class name -> boundary tag ("B"/"I"/"E"/"S") -> token-label id
    boundary_label_lookup: dict[str, dict[str, int]]
    # token-label id -> span-class id
    token_to_span_label: dict[int, int]
    # token-label id -> its boundary tag, or None for the background label
    token_boundary_tags: dict[int, str | None]
    # span-class id -> class name (index 0 is the background class)
    span_class_names: tuple[str, ...]
    # span-class name -> span-class id
    span_label_lookup: dict[str, int]
    # token-label id of the background class
    background_token_label: int
    # span-class id of the background class
    background_span_label: int
|
| 337 |
+
|
| 338 |
+
def labels_to_spans(labels_by_index, label_info):
    """Merge per-token label ids into (span_label, start, end) token spans.

    labels_by_index maps token index -> token-label id; indices need not be
    contiguous — a gap closes any open span.  Boundary tags follow a BIES
    scheme (B begins, I continues, E ends, S is a single-token span); the
    background label and unknown ids act as separators.  ``end`` is exclusive.
    """
    spans, cur_label, start_idx, prev_idx = [], None, None, None
    bg = label_info.background_span_label
    for ti in sorted(labels_by_index):
        lid = labels_by_index[ti]
        sl = label_info.token_to_span_label.get(lid)
        bt = label_info.token_boundary_tags.get(lid)
        # A gap in token indices force-closes any span in progress.
        if prev_idx is not None and ti != prev_idx + 1:
            if cur_label is not None and start_idx is not None:
                spans.append((cur_label, start_idx, prev_idx + 1))
            cur_label = start_idx = None
        if sl is None:
            # Unknown label id: acts as a separator, emits nothing.
            prev_idx = ti; continue
        if sl == bg:
            # Background closes any open span at this position.
            if cur_label is not None and start_idx is not None:
                spans.append((cur_label, start_idx, ti))
            cur_label = start_idx = None; prev_idx = ti; continue
        if bt == "S":
            # Single-token span; flush any open span first.
            if cur_label is not None and start_idx is not None and prev_idx is not None:
                spans.append((cur_label, start_idx, prev_idx + 1))
            spans.append((sl, ti, ti + 1)); cur_label = start_idx = None
        elif bt == "B":
            # Begin: close the previous span (if any) and open a new one here.
            if cur_label is not None and start_idx is not None and prev_idx is not None:
                spans.append((cur_label, start_idx, prev_idx + 1))
            cur_label, start_idx = sl, ti
        elif bt == "I":
            # Inside: continue only if the class matches; a mismatched I is
            # treated as an implicit begin of a new span.
            if cur_label is None or cur_label != sl:
                if cur_label is not None and start_idx is not None and prev_idx is not None:
                    spans.append((cur_label, start_idx, prev_idx + 1))
                cur_label, start_idx = sl, ti
        elif bt == "E":
            # End: close the open span including this token; a mismatched E
            # degrades to a single-token span.
            if cur_label is None or cur_label != sl or start_idx is None:
                if cur_label is not None and start_idx is not None and prev_idx is not None:
                    spans.append((cur_label, start_idx, prev_idx + 1))
                spans.append((sl, ti, ti + 1)); cur_label = start_idx = None
            else:
                spans.append((cur_label, start_idx, ti + 1)); cur_label = start_idx = None
        else:
            # Unrecognized boundary tag: close any open span and reset state.
            if cur_label is not None and start_idx is not None and prev_idx is not None:
                spans.append((cur_label, start_idx, prev_idx + 1))
            cur_label = start_idx = None
        prev_idx = ti
    # Flush a span left open at the end of the sequence.
    if cur_label is not None and start_idx is not None and prev_idx is not None:
        spans.append((cur_label, start_idx, prev_idx + 1))
    return spans
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
def token_spans_to_char_spans(spans, cs, ce):
    """Convert (label, tok_start, tok_end) spans into character offsets.

    ``cs[i]``/``ce[i]`` are the character start/end of token i.  Spans with
    out-of-range token indices or an empty character extent are dropped.
    """
    result = []
    for label, tok_start, tok_end in spans:
        in_range = 0 <= tok_start < tok_end <= len(cs)
        if not in_range:
            continue
        char_start = cs[tok_start]
        char_end = ce[tok_end - 1]
        if char_end > char_start:
            result.append((label, char_start, char_end))
    return result
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
def trim_char_spans_whitespace(spans, text):
    """Shrink each (label, start, end) span to exclude leading and trailing
    whitespace; drop spans that are out of range or become empty."""
    trimmed = []
    for label, start, end in spans:
        if start < 0 or end > len(text) or start >= end:
            continue
        while start < end and text[start].isspace():
            start += 1
        while end > start and text[end - 1].isspace():
            end -= 1
        if end > start:
            trimmed.append((label, start, end))
    return trimmed
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
@functools.lru_cache(maxsize=1)
def get_viterbi_transition_biases():
    """Load Viterbi transition biases from the model's calibration file.

    Returns a dict with exactly the keys in VITERBI_TRANSITION_BIAS_KEYS.
    The function already degrades to all-zero biases when the file is
    missing or the payload has an unexpected shape; this version extends
    that to a corrupt/unreadable file and non-numeric bias values, so a
    bad sidecar file can never crash (and get cached into) decoding.
    """
    cp = MODEL_DIR / "viterbi_calibration.json"
    default = {k: 0.0 for k in VITERBI_TRANSITION_BIAS_KEYS}
    if not cp.is_file():
        return default
    try:
        payload = json.loads(cp.read_text())
    except (OSError, json.JSONDecodeError):
        # Fixed: a corrupt or unreadable file used to raise out of an
        # lru_cached function; treat it like a missing file instead.
        return default
    raw = payload
    # Prefer the configured operating point's biases when present.
    # (isinstance guard also covers a non-dict top-level payload, which
    # previously crashed on payload.get.)
    ops = payload.get("operating_points") if isinstance(payload, dict) else None
    if isinstance(ops, dict):
        preset = ops.get(DEFAULT_VITERBI_CALIBRATION_PRESET)
        if isinstance(preset, dict):
            raw = preset.get("biases", raw)
    if not isinstance(raw, dict):
        return default

    def _as_float(value):
        # Non-numeric entries fall back to the neutral 0.0 bias.
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    return {k: _as_float(raw.get(k, 0.0)) for k in VITERBI_TRANSITION_BIAS_KEYS}
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
class Decoder:
    """Viterbi decoder enforcing valid BIES transitions over token labels.

    Builds start/end/transition score tables once from the label space —
    invalid moves get -1e9, valid ones a calibrated bias — and then
    decodes per-token log-probabilities into the best legal label path.
    """

    def __init__(self, label_info):
        nc = len(label_info.token_to_span_label)
        self._start = torch.full((nc,), -1e9, dtype=torch.float32)
        self._end = torch.full((nc,), -1e9, dtype=torch.float32)
        self._trans = torch.full((nc, nc), -1e9, dtype=torch.float32)
        biases = get_viterbi_transition_biases()
        bg_tok, bg_sp = label_info.background_token_label, label_info.background_span_label
        ttsl, tbt = label_info.token_to_span_label, label_info.token_boundary_tags
        for i in range(nc):
            tag, sl = tbt.get(i), ttsl.get(i)
            # A path may start on B/S (span opener) or background, and must
            # end on E/S (span closer) or background.
            if tag in {"B", "S"} or i == bg_tok: self._start[i] = 0.0
            if tag in {"E", "S"} or i == bg_tok: self._end[i] = 0.0
            for j in range(nc):
                nt, ns = tbt.get(j), ttsl.get(j)
                if self._valid(tag, sl, nt, ns, bg_tok, bg_sp, j):
                    self._trans[i, j] = self._bias(tag, sl, nt, ns, bg_sp, biases)

    @staticmethod
    def _valid(pt, ps, nt, ns, bti, bsi, ni):
        """True if label ni (tag nt, span ns) may follow a label with tag pt / span ps."""
        nb = ns == bsi or ni == bti  # next label is background
        if (ns is None or nt is None) and not nb: return False
        # From nothing / background / a closed span: only background or a new opener.
        if pt is None or ps is None: return nb or nt in {"B", "S"}
        if ps == bsi or pt in {"E", "S"}: return nb or nt in {"B", "S"}
        # Mid-span (after B/I): must continue (I) or close (E) with the SAME class.
        if pt in {"B", "I"}: return ps == ns and nt in {"I", "E"}
        return False

    @staticmethod
    def _bias(pt, ps, nt, ns, bsi, b):
        """Calibrated additive bias for a valid transition (keys match the calibration file)."""
        nb, pb = ns == bsi, ps == bsi
        if pb: return b["transition_bias_background_stay"] if nb else b["transition_bias_background_to_start"]
        if pt in {"B", "I"}: return b["transition_bias_inside_to_continue"] if nt == "I" else b["transition_bias_inside_to_end"]
        return b["transition_bias_end_to_background"] if nb else b["transition_bias_end_to_start"]

    def decode(self, lp):
        """Viterbi-decode lp (seq_len, num_classes) log-probs into a label-id path.

        Falls back to per-token argmax when no finite-scoring path remains.
        """
        sl, nc = lp.shape
        if sl == 0: return []
        st = self._start.to(lp.device, lp.dtype)
        en = self._end.to(lp.device, lp.dtype)
        tr = self._trans.to(lp.device, lp.dtype)
        scores = lp[0] + st
        bp = torch.empty((sl - 1, nc), device=lp.device, dtype=torch.int64)
        # Forward pass: best predecessor score and its index for each state.
        for i in range(1, sl):
            t = scores.unsqueeze(1) + tr
            bs, bi = t.max(dim=0)
            scores = bs + lp[i]; bp[i - 1] = bi
        if not torch.isfinite(scores).any(): return lp.argmax(dim=1).tolist()
        scores = scores + en
        path = torch.empty(sl, device=lp.device, dtype=torch.int64)
        path[-1] = scores.argmax()
        # Backtrack through the stored argmax pointers.
        for i in range(sl - 2, -1, -1): path[i] = bp[i, path[i + 1]]
        return path.tolist()
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
@dataclass(frozen=True)
class InferenceRuntime:
    """Everything one prediction needs: model, tokenizer, label space, device."""
    model: Transformer
    encoding: tiktoken.Encoding
    label_info: LabelInfo
    device: torch.device
    n_ctx: int
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
@functools.lru_cache(maxsize=1)
def get_runtime():
    """Build the process-wide inference runtime (cached singleton).

    Loads config.json, resolves the tiktoken encoding, derives the label
    space from NER_CLASS_NAMES (entries are either the background label or
    "<boundary>-<class>" names), and loads the model weights.
    """
    cp = MODEL_DIR
    cfg = json.loads((cp / "config.json").read_text())
    validate_model_config_contract(cfg, context=str(cp))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoding = tiktoken.get_encoding(str(cfg["encoding"]).strip())
    # Span-class tables; index 0 is reserved for the background class.
    scn = [BACKGROUND_CLASS_LABEL]; sll = {BACKGROUND_CLASS_LABEL: 0}
    bll, ttsl, tbt = {}, {}, {}
    bg_idx = None
    for idx, name in enumerate(NER_CLASS_NAMES):
        if name == BACKGROUND_CLASS_LABEL:
            bg_idx = idx; ttsl[idx] = 0; tbt[idx] = None; continue
        # Names look like "B-<class>": boundary tag, dash, span-class name.
        bnd, base = name.split("-", 1)
        si = sll.get(base)
        if si is None:
            # First time this span class is seen: assign the next class id.
            si = len(scn); scn.append(base); sll[base] = si
        ttsl[idx] = si; tbt[idx] = bnd
        bll.setdefault(base, {})[bnd] = idx
    li = LabelInfo(bll, ttsl, tbt, tuple(scn), sll, bg_idx, 0)
    m = Transformer.from_checkpoint(str(cp), device=device)
    return InferenceRuntime(m, encoding, li, device, int(cfg["default_n_ctx"]))
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
@functools.lru_cache(maxsize=1)
def get_decoder():
    """Singleton Viterbi decoder built from the runtime's label space."""
    runtime = get_runtime()
    return Decoder(label_info=runtime.label_info)
|
| 513 |
+
|
| 514 |
+
|
| 515 |
+
@torch.inference_mode()
def _predict_with_runtime(runtime, text, decoder):
    """Tokenize ``text``, run the model chunk-by-chunk, Viterbi-decode the
    labels, and map token spans back to character offsets.

    Returns (source_text, spans) where spans is a list of
    {"label", "start", "end", "text"} dicts.  source_text is the
    round-tripped decode of the tokens; it equals the input unless the
    tokenizer normalized invalid UTF-8.
    """
    tids = tuple(int(t) for t in runtime.encoding.encode(text, allowed_special="all"))
    if not tids: return text, []
    # Run the model in n_ctx-sized windows and collect per-token log-probs.
    chunks = []
    for s in range(0, len(tids), runtime.n_ctx):
        e = min(s + runtime.n_ctx, len(tids))
        wt = torch.tensor(tids[s:e], device=runtime.device, dtype=torch.int32)
        lp = F.log_softmax(runtime.model(wt).float(), dim=-1)
        chunks.append(lp)
    stacked = chunks[0] if len(chunks) == 1 else torch.cat(chunks, dim=0)
    dl = decoder.decode(stacked)
    # Defensive: a wrong-length decoded path falls back to per-token argmax.
    if len(dl) != len(tids): dl = stacked.argmax(dim=1).tolist()
    pli = {i: int(l) for i, l in enumerate(dl)}
    pts = labels_to_spans(pli, runtime.label_info)
    # Round-trip the tokens to bytes/text so offsets refer to the decoded text.
    tb = [runtime.encoding.decode_single_token_bytes(t) for t in tids]
    dt = b"".join(tb).decode("utf-8", errors="replace")
    # Per-character UTF-8 byte start/end offsets of the decoded text.
    cbs, cbe = [], []
    bc = 0
    for ch in dt: cbs.append(bc); bc += len(ch.encode("utf-8")); cbe.append(bc)
    # Per-token character start/end, derived from each token's byte extent:
    # start = first char whose byte-end exceeds the token's byte-start;
    # end = first char whose byte-start reaches the token's byte-end.
    cs, ce = [], []
    tbc = 0
    for rb in tb:
        tbs = tbc; tbe = tbs + len(rb); tbc = tbe
        cs.append(bisect_right(cbe, tbs)); ce.append(bisect_left(cbs, tbe))
    pcs = token_spans_to_char_spans(pts, cs, ce)
    pcs = trim_char_spans_whitespace(pcs, dt if dt != text else text)
    # Report offsets against the decoded text when it differs from the input.
    src = dt if dt != text else text
    detected = []
    for li, s, e in pcs:
        if 0 <= li < len(runtime.label_info.span_class_names):
            lbl = runtime.label_info.span_class_names[li]
        else:
            # Out-of-range class id: emit a synthetic label rather than crash.
            lbl = f"label_{li}"
        detected.append({"label": lbl, "start": s, "end": e, "text": src[s:e]})
    return src, detected
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
def predict_text(text: str) -> tuple[str, list[dict]]:
    """Run NER over ``text``.

    Returns ``(source_text, spans)``.  ``source_text`` may differ from the
    input only when the tokenizer's decode path normalizes invalid UTF-8;
    span offsets are character positions into ``source_text``.
    """
    runtime = get_runtime()
    decoder = get_decoder()
    return _predict_with_runtime(runtime, text, decoder)
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tiktoken
|
| 2 |
+
sentencepiece
|
| 3 |
+
torch
|
| 4 |
+
safetensors
|
| 5 |
+
huggingface_hub
|
| 6 |
+
gradio[mcp]>=5.29.0
|
| 7 |
+
accelerate
|
| 8 |
+
spaces
|
| 9 |
+
python-multipart
|