ysharma (HF Staff) committed on
Commit 754de8e · verified · 1 Parent(s): 3580321

Upload 3 files

Files changed (3)
  1. app.py +1035 -0
  2. opf.py +557 -0
  3. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,1035 @@
1
+ """
2
+ DLP Paste-Proxy — "Pastebin with a conscience"
3
+ ================================================
4
+
5
+ A sleek paste-to-share service. The author pastes PII-rich text and gets
6
+ a shareable URL. Recipients at that URL see the OPF-redacted version by
7
+ default; a separate "reveal" link guarded by an unguessable token shows
8
+ the original.
9
+
10
+ Why gr.Server? We need three HTTP surfaces that don't map cleanly onto
11
+ gr.Blocks event wiring:
12
+ * POST /api/paste - accept paste, run OPF, mint IDs
13
+ * GET /view/{id} - public redacted view page
14
+ * GET /view/{id}?token=... - author's reveal page
15
+ plus a programmable API endpoint (@server.api) for gradio-client SDK
16
+ users and a background sweeper for auto-expiry.
17
+
18
+ Storage is an in-process dict. That is fine for a public demo — the
19
+ point is to illustrate the request-composition model; it is NOT a
20
+ durable pastebin. Restarting the Space clears all pastes.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import html
26
+ import json
27
+ import os
28
+ import secrets
29
+ import threading
30
+ import time
31
+ from dataclasses import dataclass
32
+ from typing import Optional
33
+
34
+ import gradio as gr
35
+ from fastapi import Request
36
+ from fastapi.responses import HTMLResponse, JSONResponse
37
+
38
+ # spaces is only available on Hugging Face Spaces; degrade gracefully
39
+ # when running locally so `python app.py` still works off-GPU.
40
+ try:
41
+ import spaces
42
+ _HAS_SPACES = True
43
+ except ImportError:
44
+ _HAS_SPACES = False
45
+
46
+ from opf import predict_text
47
+
48
+ # ── configuration ─────────────────────────────────────────────────
49
+
50
+ MAX_PASTE_CHARS = int(os.getenv("MAX_PASTE_CHARS", "50000"))
51
+ SWEEP_INTERVAL_SEC = int(os.getenv("SWEEP_INTERVAL_SEC", "30"))
52
+
53
+ TTL_CHOICES: dict[str, Optional[int]] = {
54
+ "never": None,
55
+ "1h": 60 * 60,
56
+ "24h": 60 * 60 * 24,
57
+ "7d": 60 * 60 * 24 * 7,
58
+ }
59
+
60
+ CATEGORIES_META = {
61
+ "private_person": {"color": "#E24B4A", "label": "Person"},
62
+ "private_date": {"color": "#1E7DD1", "label": "Date"},
63
+ "private_address": {"color": "#1D9E75", "label": "Address"},
64
+ "private_email": {"color": "#0EA5A1", "label": "Email"},
65
+ "account_number": {"color": "#BA7517", "label": "Account"},
66
+ "private_url": {"color": "#D85A30", "label": "URL"},
67
+ "secret": {"color": "#52525b", "label": "Secret"},
68
+ "private_phone": {"color": "#639922", "label": "Phone"},
69
+ }
70
+
71
+ # ── paste store ───────────────────────────────────────────────────
72
+
73
+ @dataclass
74
+ class Paste:
75
+ id: str
76
+ reveal_token: str
77
+ original: str
78
+ redacted: str
79
+ spans: list[dict]
80
+ stats: dict
81
+ created_at: float
82
+ expires_at: Optional[float]
83
+ views: int = 0
84
+ reveals: int = 0
85
+
86
+
87
+ PASTES: dict[str, Paste] = {}
88
+ LOCK = threading.RLock()
89
+
90
+
91
+ def _store_put(paste: Paste) -> None:
92
+ with LOCK:
93
+ PASTES[paste.id] = paste
94
+
95
+
96
+ def _store_get(pid: str) -> Optional[Paste]:
97
+ with LOCK:
98
+ p = PASTES.get(pid)
99
+ if p is None:
100
+ return None
101
+ if p.expires_at is not None and p.expires_at <= time.time():
102
+ PASTES.pop(pid, None)
103
+ return None
104
+ return p
105
+
106
+
107
+ def _sweep_loop() -> None:
108
+ while True:
109
+ time.sleep(SWEEP_INTERVAL_SEC)
110
+ now = time.time()
111
+ with LOCK:
112
+ expired = [pid for pid, p in PASTES.items()
113
+ if p.expires_at is not None and p.expires_at <= now]
114
+ for pid in expired:
115
+ PASTES.pop(pid, None)
116
+
117
+
118
+ threading.Thread(target=_sweep_loop, daemon=True, name="paste-sweeper").start()
119
+
120
+
121
+ # ── redaction ─────────────────────────────────────────────────────
122
+
123
+ def redact(text: str, spans: list[dict]) -> str:
124
+ """Replace each detected span with <CATEGORY> right-to-left.
125
+
126
+ Right-to-left preserves indices for earlier spans while we rewrite
127
+ later ones (the v6 model output is non-overlapping, but we still
128
+ sort defensively and drop any that would nest)."""
129
+ out = text
130
+ last_start: Optional[int] = None
131
+ for sp in sorted(spans, key=lambda s: s["start"], reverse=True):
132
+ s, e = sp["start"], sp["end"]
133
+ if last_start is not None and e > last_start:
134
+ continue # overlaps the previously handled (later-in-text) span; skip
135
+ placeholder = f"<{sp['label'].upper()}>"
136
+ out = out[:s] + placeholder + out[e:]
137
+ last_start = s
138
+ return out
139
+
140
+
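# A minimal worked example of `redact` (editorial illustration; the sample
# text, offsets, and spans below are assumptions shaped the way this file
# consumes OPF output, i.e. dicts with "start", "end", "label"):
#
#   text = "Email jane@example.com by Friday"
#   spans = [
#       {"start": 6, "end": 22, "label": "private_email"},
#       {"start": 26, "end": 32, "label": "private_date"},
#   ]
#   redact(text, spans)
#   # -> "Email <PRIVATE_EMAIL> by <PRIVATE_DATE>"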
141
+ def compute_stats(text: str, spans: list[dict]) -> dict:
142
+ total = len(text)
143
+ pii_chars = sum(s["end"] - s["start"] for s in spans)
144
+ by_cat: dict[str, dict[str, int]] = {}
145
+ for s in spans:
146
+ c = s["label"]
147
+ by_cat.setdefault(c, {"count": 0, "chars": 0})
148
+ by_cat[c]["count"] += 1
149
+ by_cat[c]["chars"] += s["end"] - s["start"]
150
+ return {
151
+ "total_chars": total,
152
+ "pii_chars": pii_chars,
153
+ "pii_percentage": round(pii_chars / total * 100, 1) if total else 0.0,
154
+ "total_spans": len(spans),
155
+ "categories": by_cat,
156
+ }
157
+
158
+
159
+ # ── OPF call (GPU-gated on HF Spaces) ─────────────────────────────
160
+
161
+ if _HAS_SPACES:
162
+ @spaces.GPU
163
+ def analyze(text: str):
164
+ return predict_text(text)
165
+ else:
166
+ def analyze(text: str):
167
+ return predict_text(text)
168
+
169
+
170
+ # ── gr.Server wiring ──────────────────────────────────────────────
171
+
172
+ server = gr.Server()
173
+
174
+
175
+ @server.get("/", response_class=HTMLResponse)
176
+ async def home():
177
+ return HTMLResponse(_COMPOSE_HTML)
178
+
179
+
180
+ @server.post("/api/paste")
181
+ async def create_paste(req: Request):
182
+ try:
183
+ body = await req.json()
184
+ except Exception:
185
+ return JSONResponse({"error": "Expected JSON body"}, status_code=400)
186
+
187
+ text = (body.get("text") or "").strip()
188
+ ttl_key = body.get("ttl", "never")
189
+ if not text:
190
+ return JSONResponse({"error": "Paste is empty"}, status_code=400)
191
+ if len(text) > MAX_PASTE_CHARS:
192
+ return JSONResponse(
193
+ {"error": f"Paste exceeds {MAX_PASTE_CHARS:,} characters"},
194
+ status_code=413,
195
+ )
196
+ if ttl_key not in TTL_CHOICES:
197
+ return JSONResponse({"error": f"Unknown ttl {ttl_key!r}"}, status_code=400)
198
+
199
+ try:
200
+ source_text, spans = analyze(text)
201
+ except Exception as exc: # model failure is the only realistic path here
202
+ return JSONResponse({"error": f"OPF inference failed: {exc}"},
203
+ status_code=500)
204
+
205
+ redacted = redact(source_text, spans)
206
+ stats = compute_stats(source_text, spans)
207
+
208
+ pid = secrets.token_urlsafe(6)
209
+ reveal_token = secrets.token_urlsafe(22)
210
+ ttl_sec = TTL_CHOICES[ttl_key]
211
+ now = time.time()
212
+ expires_at = (now + ttl_sec) if ttl_sec is not None else None
213
+
214
+ _store_put(Paste(
215
+ id=pid, reveal_token=reveal_token,
216
+ original=source_text, redacted=redacted,
217
+ spans=spans, stats=stats,
218
+ created_at=now, expires_at=expires_at,
219
+ ))
220
+
221
+ return JSONResponse({
222
+ "id": pid,
223
+ "reveal_token": reveal_token,
224
+ "view_path": f"/view/{pid}",
225
+ "reveal_path": f"/view/{pid}?token={reveal_token}",
226
+ "expires_at": expires_at,
227
+ "stats": stats,
228
+ "categories_meta": CATEGORIES_META,
229
+ })
230
+
231
+
232
+ @server.get("/view/{pid}", response_class=HTMLResponse)
233
+ async def view_paste(pid: str, token: Optional[str] = None):
234
+ p = _store_get(pid)
235
+ if p is None:
236
+ return HTMLResponse(_not_found_html(pid), status_code=404)
237
+
238
+ revealed = bool(token) and secrets.compare_digest(token, p.reveal_token)
239
+
240
+ with LOCK:
241
+ if revealed:
242
+ p.reveals += 1
243
+ else:
244
+ p.views += 1
245
+
246
+ return HTMLResponse(_render_view(p, revealed))
247
+
248
+
249
+ @server.get("/api/paste/{pid}")
250
+ async def api_get_paste(pid: str, token: Optional[str] = None):
251
+ p = _store_get(pid)
252
+ if p is None:
253
+ return JSONResponse({"error": "not found or expired"}, status_code=404)
254
+ revealed = bool(token) and secrets.compare_digest(token, p.reveal_token)
255
+ payload = {
256
+ "id": p.id,
257
+ "created_at": p.created_at,
258
+ "expires_at": p.expires_at,
259
+ "stats": p.stats,
260
+ "views": p.views,
261
+ "reveals": p.reveals,
262
+ "redacted": p.redacted,
263
+ }
264
+ if revealed:
265
+ payload["original"] = p.original
266
+ payload["spans"] = p.spans
267
+ return JSONResponse(payload)
268
+
269
+
270
+ @server.api(name="analyze_paste")
271
+ def analyze_paste_api(text: str, ttl: str = "never") -> str:
272
+ """Programmatic endpoint for gradio-client SDK.
273
+
274
+ Creates a paste and returns the paste id, reveal token, and stats as
275
+ a JSON string. Callers must combine the paths with the Space's base
276
+ URL to form shareable links."""
277
+ if ttl not in TTL_CHOICES:
278
+ return json.dumps({"error": f"Unknown ttl {ttl!r}"})
279
+ source_text, spans = analyze(text)
280
+ redacted = redact(source_text, spans)
281
+ stats = compute_stats(source_text, spans)
282
+ pid = secrets.token_urlsafe(6)
283
+ reveal_token = secrets.token_urlsafe(22)
284
+ ttl_sec = TTL_CHOICES[ttl]
285
+ now = time.time()
286
+ expires_at = (now + ttl_sec) if ttl_sec is not None else None
287
+ _store_put(Paste(
288
+ id=pid, reveal_token=reveal_token,
289
+ original=source_text, redacted=redacted,
290
+ spans=spans, stats=stats,
291
+ created_at=now, expires_at=expires_at,
292
+ ))
293
+ return json.dumps({
294
+ "id": pid,
295
+ "reveal_token": reveal_token,
296
+ "view_path": f"/view/{pid}",
297
+ "reveal_path": f"/view/{pid}?token={reveal_token}",
298
+ "expires_at": expires_at,
299
+ "stats": stats,
300
+ })
301
+
302
+
303
+ # ── HTML rendering ───────────────────────────────────────────────
304
+
305
+ def _escape(text: str) -> str:
306
+ return html.escape(text, quote=False)
307
+
308
+
309
+ def _highlight_html(text: str, spans: list[dict]) -> str:
310
+ """Return HTML for text with each span wrapped in a colored mark,
311
+ revealing the original content (used on the reveal page)."""
312
+ pieces: list[str] = []
313
+ cursor = 0
314
+ for sp in sorted(spans, key=lambda s: s["start"]):
315
+ s, e = sp["start"], sp["end"]
316
+ if s < cursor or e <= s:
317
+ continue
318
+ if s > cursor:
319
+ pieces.append(_escape(text[cursor:s]))
320
+ meta = CATEGORIES_META.get(sp["label"])
321
+ color = meta["color"] if meta else "#333"
322
+ label = meta["label"] if meta else sp["label"]
323
+ pieces.append(
324
+ f'<mark class="pp-hi" data-cat="{_escape(sp["label"])}" '
325
+ f'style="--cat:{color}" title="{_escape(label)}">'
326
+ f'{_escape(text[s:e])}'
327
+ f'<span class="pp-hi-tag">{_escape(label)}</span>'
328
+ f'</mark>'
329
+ )
330
+ cursor = e
331
+ if cursor < len(text):
332
+ pieces.append(_escape(text[cursor:]))
333
+ return "".join(pieces)
334
+
335
+
336
+ def _redacted_html(redacted: str) -> str:
337
+ """Render the redacted version with <CATEGORY> placeholders as
338
+ colored pills so readers can see what kind of data was stripped."""
339
+ out: list[str] = []
340
+ i = 0
341
+ while i < len(redacted):
342
+ lt = redacted.find("<", i)
343
+ if lt == -1:
344
+ out.append(_escape(redacted[i:]))
345
+ break
346
+ out.append(_escape(redacted[i:lt]))
347
+ gt = redacted.find(">", lt + 1)
348
+ if gt == -1:
349
+ out.append(_escape(redacted[lt:]))
350
+ break
351
+ tag = redacted[lt + 1:gt]
352
+ cat_key = tag.lower()
353
+ meta = CATEGORIES_META.get(cat_key)
354
+ if meta is None:
355
+ out.append(_escape(redacted[lt:gt + 1]))
356
+ else:
357
+ out.append(
358
+ f'<span class="pp-red" data-cat="{_escape(cat_key)}" '
359
+ f'style="--cat:{meta["color"]}">'
360
+ f'<span class="pp-red-dot"></span>{_escape(meta["label"])}'
361
+ f'</span>'
362
+ )
363
+ i = gt + 1
364
+ return "".join(out)
365
+
366
+
367
+ def _format_expiry(paste: Paste) -> str:
368
+ if paste.expires_at is None:
369
+ return "does not expire"
370
+ remaining = paste.expires_at - time.time()
371
+ if remaining <= 0:
372
+ return "expired"
373
+ if remaining < 3600:
374
+ return f"expires in {int(remaining // 60)} min"
375
+ if remaining < 86400:
376
+ return f"expires in {int(remaining // 3600)} h"
377
+ return f"expires in {int(remaining // 86400)} d"
378
+
379
+
380
+ def _render_view(p: Paste, revealed: bool) -> str:
381
+ stats = p.stats
382
+ badges_html = "".join(
383
+ f'<span class="pp-badge" style="--cat:{CATEGORIES_META.get(cat, {"color": "#333"})["color"]}">'
384
+ f'<span class="pp-badge-dot"></span>'
385
+ f'{_escape(CATEGORIES_META.get(cat, {"label": cat})["label"])}'
386
+ f'<span class="pp-badge-n">{info["count"]}</span>'
387
+ f'</span>'
388
+ for cat, info in sorted(stats["categories"].items(),
389
+ key=lambda kv: -kv[1]["count"])
390
+ ) or '<span class="pp-muted">No PII detected in this paste.</span>'
391
+
392
+ body_html = (
393
+ _highlight_html(p.original, p.spans) if revealed
394
+ else _redacted_html(p.redacted)
395
+ )
396
+
397
+ mode_banner = (
398
+ '<div class="pp-banner pp-banner-reveal">'
399
+ '<strong>Private reveal.</strong> This URL contains the reveal token — '
400
+ 'treat it like a password. Anyone with it sees the original text.'
401
+ '</div>'
402
+ if revealed else
403
+ '<div class="pp-banner pp-banner-safe">'
404
+ '<strong>Redacted view.</strong> Sensitive spans were stripped before '
405
+ 'this page was served. The original is only visible via the author\'s reveal link.'
406
+ '</div>'
407
+ )
408
+
409
+ view_mode_label = "Original (revealed)" if revealed else "Redacted"
410
+
411
+ replacements = {
412
+ "__PID__": _escape(p.id),
413
+ "__MODE__": _escape(view_mode_label),
414
+ "__EXPIRY__": _escape(_format_expiry(p)),
415
+ "__CREATED__": _escape(time.strftime(
416
+ "%Y-%m-%d %H:%M UTC", time.gmtime(p.created_at))),
417
+ "__VIEWS__": str(p.views),
418
+ "__REVEALS__": str(p.reveals),
419
+ "__PCT__": str(stats["pii_percentage"]),
420
+ "__SPANS_N__": str(stats["total_spans"]),
421
+ "__CHARS_N__": f'{stats["total_chars"]:,}',
422
+ "__BADGES__": badges_html,
423
+ "__BANNER__": mode_banner,
424
+ "__BODY__": body_html,
425
+ "__BODY_CLASS__": "pp-body-reveal" if revealed else "pp-body-redacted",
426
+ }
427
+ out = _VIEW_HTML
428
+ for k, v in replacements.items():
429
+ out = out.replace(k, v)
430
+ return out
431
+
432
+
433
+ def _not_found_html(pid: str) -> str:
434
+ return _NOT_FOUND_HTML.replace("{{PID}}", _escape(pid))
435
+
436
+
437
+ # ── compose page (paste editor) ───────────────────────────────────
438
+
439
+ _CATEGORIES_JSON = json.dumps(CATEGORIES_META)
440
+
441
+ _SHARED_CSS = r"""
442
+ :root{
443
+ --bg: #f7f7f8;
444
+ --panel: #ffffff;
445
+ --panel-2: #f1f1f3;
446
+ --ink: #0a0a0a;
447
+ --ink-dim: #3f3f46;
448
+ --ink-faint: #70707a;
449
+ --line: #e4e4e7;
450
+ --line-strong: #d4d4d8;
451
+ --accent: #0f8a5f;
452
+ --accent-ink: #ffffff;
453
+ --warn: #b45309;
454
+ --primary-bg: #18181b;
455
+ --primary-fg: #ffffff;
456
+ --radius-lg: 12px;
457
+ --radius-md: 8px;
458
+ --radius-sm: 5px;
459
+ --shadow-xs: 0 1px 1.5px rgba(10,10,10,.04);
460
+ --shadow-sm: 0 1px 3px rgba(10,10,10,.06), 0 1px 2px rgba(10,10,10,.04);
461
+ --shadow-md: 0 4px 14px rgba(10,10,10,.07), 0 1px 3px rgba(10,10,10,.04);
462
+ --font-sans: 'Inter', system-ui, -apple-system, 'Segoe UI', sans-serif;
463
+ --font-mono: 'JetBrains Mono', ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
464
+ --font-serif: 'Instrument Serif', 'Source Serif 4', Georgia, serif;
465
+ }
466
+ @media (prefers-color-scheme: dark){
467
+ :root{
468
+ --bg: #0e0e11;
469
+ --panel: #18181c;
470
+ --panel-2: #1f1f24;
471
+ --ink: #e8e8ea;
472
+ --ink-dim: #a8a8ae;
473
+ --ink-faint: #70707a;
474
+ --line: rgba(255,255,255,0.08);
475
+ --line-strong: rgba(255,255,255,0.18);
476
+ --accent: #2bb77e;
477
+ --accent-ink: #0e0e11;
478
+ --warn: #eab308;
479
+ --primary-bg: #f0f0f2;
480
+ --primary-fg: #0e0e11;
481
+ --shadow-xs: none;
482
+ --shadow-sm: none;
483
+ --shadow-md: none;
484
+ }
485
+ }
486
+ *,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
487
+ html,body{height:100%}
488
+ body{
489
+ font-family:var(--font-sans);
490
+ background:var(--bg);
491
+ color:var(--ink);
492
+ font-size:14px;line-height:1.55;
493
+ -webkit-font-smoothing:antialiased;
494
+ font-feature-settings:"cv11","ss01";
495
+ }
496
+ a{color:inherit;text-decoration:underline;text-decoration-color:var(--line-strong);text-underline-offset:3px}
497
+ a:hover{text-decoration-color:var(--ink)}
498
+ button{font:inherit;color:inherit;background:transparent;border:0;cursor:pointer}
499
+ .pp-shell{max-width:1060px;margin:0 auto;padding:36px 20px 56px}
500
+ .pp-brand{display:flex;align-items:center;gap:10px;margin-bottom:22px}
501
+ .pp-brand-mark{
502
+ width:26px;height:26px;border-radius:7px;
503
+ background:var(--ink);color:var(--bg);
504
+ display:grid;place-items:center;
505
+ font-family:var(--font-mono);font-size:13px;font-weight:600;letter-spacing:-0.02em;
506
+ }
507
+ .pp-brand-name{font-size:13.5px;font-weight:500}
508
+ .pp-brand-name .sub{color:var(--ink-faint);font-weight:400;margin-left:6px}
509
+ .pp-caps{font-size:10.5px;font-weight:600;letter-spacing:0.09em;text-transform:uppercase;color:var(--ink-dim)}
510
+ .pp-hero{margin-bottom:22px}
511
+ .pp-hero h1{font-family:var(--font-serif);font-size:38px;line-height:1.08;letter-spacing:-0.015em;font-weight:500;margin-bottom:8px}
512
+ .pp-hero p{color:var(--ink-dim);max-width:58ch;font-size:14px}
513
+ .pp-banner{padding:10px 14px;border-radius:var(--radius-md);font-size:13px;line-height:1.5;border:0.5px solid var(--line-strong);margin-bottom:16px}
514
+ .pp-banner strong{font-weight:600}
515
+ .pp-banner-safe{background:color-mix(in srgb, var(--accent) 8%, transparent);border-color:color-mix(in srgb, var(--accent) 26%, var(--line-strong))}
516
+ .pp-banner-reveal{background:color-mix(in srgb, var(--warn) 10%, transparent);border-color:color-mix(in srgb, var(--warn) 30%, var(--line-strong))}
517
+ """
518
+
519
+ _COMPOSE_HTML = r"""<!DOCTYPE html>
520
+ <html lang="en">
521
+ <head>
522
+ <meta charset="UTF-8">
523
+ <meta name="viewport" content="width=device-width,initial-scale=1">
524
+ <title>DLP Paste-Proxy — Pastebin with a conscience</title>
525
+ <link rel="preconnect" href="https://fonts.googleapis.com">
526
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
527
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&family=Instrument+Serif:ital@0;1&display=swap" rel="stylesheet">
528
+ <style>
529
+ """ + _SHARED_CSS + r"""
530
+
531
+ .pp-card{
532
+ background:var(--panel);
533
+ border:0.5px solid var(--line);
534
+ border-radius:var(--radius-lg);
535
+ box-shadow:var(--shadow-md);
536
+ overflow:hidden;
537
+ }
538
+ .pp-card-head{
539
+ padding:14px 18px;
540
+ border-bottom:0.5px solid var(--line);
541
+ display:flex;align-items:center;gap:10px;flex-wrap:wrap;
542
+ }
543
+ .pp-card-head h2{font-size:13.5px;font-weight:500;letter-spacing:-0.005em}
544
+ .pp-spacer{flex:1}
545
+ .pp-grid{
546
+ display:grid;
547
+ grid-template-columns:minmax(0,1fr) 280px;
548
+ gap:0;
549
+ }
550
+ .pp-pane{padding:18px 20px 22px}
551
+ .pp-pane + .pp-pane{border-left:0.5px solid var(--line);background:var(--panel-2)}
552
+ .pp-textarea{
553
+ width:100%;min-height:320px;
554
+ font-family:var(--font-mono);font-size:13px;line-height:1.55;
555
+ color:var(--ink);background:transparent;
556
+ border:1px solid var(--line);border-radius:var(--radius-md);
557
+ padding:14px 14px;resize:vertical;
558
+ transition:border-color .15s,background .15s;
559
+ }
560
+ .pp-textarea::placeholder{color:var(--ink-faint)}
561
+ .pp-textarea:focus{outline:none;border-color:var(--line-strong);background:color-mix(in srgb, var(--ink) 1.5%, transparent)}
562
+ .pp-sub{color:var(--ink-faint);font-size:11.5px;font-family:var(--font-mono);margin-top:8px;display:flex;align-items:center;gap:10px;flex-wrap:wrap}
563
+ .pp-sub .sep{opacity:.4}
564
+ .pp-label{display:block;font-size:11px;font-weight:600;letter-spacing:0.07em;text-transform:uppercase;color:var(--ink-dim);margin:0 0 8px}
565
+ .pp-ttl{display:flex;gap:4px;background:var(--panel);border:0.5px solid var(--line);padding:3px;border-radius:var(--radius-md)}
566
+ .pp-ttl button{
567
+ flex:1;padding:7px 0;font-size:12px;font-weight:500;color:var(--ink-dim);
568
+ border-radius:5px;transition:background .12s,color .12s;
569
+ }
570
+ .pp-ttl button[aria-pressed="true"]{background:var(--ink);color:var(--bg)}
571
+ .pp-ttl button:hover:not([aria-pressed="true"]){background:color-mix(in srgb, var(--ink) 4%, transparent);color:var(--ink)}
572
+ .pp-hint{font-size:12px;color:var(--ink-faint);margin-top:8px;line-height:1.45}
573
+ .pp-btn{
574
+ font-size:13px;font-weight:500;padding:10px 14px;
575
+ border:0.5px solid var(--line-strong);
576
+ border-radius:var(--radius-md);
577
+ background:var(--panel);color:var(--ink);
578
+ display:inline-flex;align-items:center;justify-content:center;gap:8px;
579
+ transition:background .12s,border-color .12s;
580
+ }
581
+ .pp-btn:hover:not(:disabled){background:color-mix(in srgb, var(--ink) 4%, var(--panel));border-color:var(--ink-dim)}
582
+ .pp-btn:disabled{opacity:.55;cursor:not-allowed}
583
+ .pp-btn-primary{background:var(--primary-bg);color:var(--primary-fg);border-color:var(--primary-bg);width:100%}
584
+ .pp-btn-primary:hover:not(:disabled){background:color-mix(in srgb, var(--primary-bg) 88%, var(--ink));border-color:var(--primary-bg)}
585
+ .pp-btn-arr{font-family:var(--font-mono);font-size:11px;opacity:.7}
586
+
587
+ .pp-success{
588
+ display:none;margin-top:24px;padding:22px 22px 24px;
589
+ background:var(--panel);border:0.5px solid var(--line);border-radius:var(--radius-lg);box-shadow:var(--shadow-md);
590
+ }
591
+ .pp-success.on{display:block}
592
+ .pp-success h3{font-family:var(--font-serif);font-size:22px;line-height:1.15;font-weight:500;margin-bottom:4px;letter-spacing:-0.01em}
593
+ .pp-success .pp-caps{margin-bottom:14px;display:block}
594
+ .pp-link{
595
+ display:flex;align-items:stretch;gap:0;margin:8px 0 14px;
596
+ border:0.5px solid var(--line);border-radius:var(--radius-md);overflow:hidden;background:var(--panel-2);
597
+ }
598
+ .pp-link input{
599
+ flex:1;border:0;background:transparent;padding:10px 12px;
600
+ font-family:var(--font-mono);font-size:12px;color:var(--ink);min-width:0;outline:none;
601
+ }
602
+ .pp-link button{
603
+ border-left:0.5px solid var(--line);background:var(--panel);
604
+ padding:0 14px;font-size:12px;font-weight:500;color:var(--ink-dim);
605
+ transition:background .12s,color .12s;
606
+ }
607
+ .pp-link button:hover{background:color-mix(in srgb, var(--ink) 4%, var(--panel));color:var(--ink)}
608
+ .pp-link-label{display:flex;align-items:baseline;gap:8px;font-size:13px;font-weight:500;margin-top:14px}
609
+ .pp-link-label .hint{font-weight:400;color:var(--ink-faint);font-size:12px}
610
+ .pp-link-label:first-of-type{margin-top:0}
611
+ .pp-link-label .priv{
612
+ font-family:var(--font-mono);font-size:10px;font-weight:600;letter-spacing:.06em;
613
+ padding:2px 7px;border-radius:4px;
614
+ background:color-mix(in srgb, var(--warn) 18%, transparent);
615
+ color:color-mix(in srgb, var(--warn) 70%, var(--ink));
616
+ text-transform:uppercase;
617
+ }
618
+ .pp-preview-row{display:grid;grid-template-columns:1fr 1fr;gap:12px;margin-top:16px}
619
+ .pp-preview{background:var(--panel-2);border:0.5px solid var(--line);border-radius:var(--radius-md);padding:12px 14px 14px;font-family:var(--font-serif);font-size:14.5px;line-height:1.55;min-height:130px;max-height:260px;overflow:auto}
620
+ .pp-preview .pp-caps{display:block;margin-bottom:8px;font-family:var(--font-sans);font-size:10px;color:var(--ink-faint)}
621
+ .pp-err{display:none;margin-top:12px;padding:10px 12px;border-radius:var(--radius-md);background:color-mix(in srgb, #dc2626 9%, transparent);border:0.5px solid color-mix(in srgb, #dc2626 30%, var(--line-strong));color:#991b1b;font-size:13px}
622
+ .pp-err.on{display:block}
623
+ .pp-err code{font-family:var(--font-mono);font-size:12px}
624
+ .pp-loading{display:none;align-items:center;gap:8px;color:var(--ink-dim);font-size:13px;margin-top:12px}
625
+ .pp-loading.on{display:inline-flex}
626
+ .pp-spin{width:12px;height:12px;border:1.5px solid color-mix(in srgb, var(--ink) 25%, transparent);border-top-color:var(--ink);border-radius:50%;animation:pp-spin 0.8s linear infinite}
627
+ @keyframes pp-spin{to{transform:rotate(360deg)}}
628
+
629
+ .pp-footer{
630
+ margin-top:28px;padding-top:22px;border-top:0.5px solid var(--line);
631
+ display:flex;justify-content:space-between;gap:16px;color:var(--ink-faint);font-size:12px;flex-wrap:wrap;
632
+ }
633
+ .pp-footer a{color:var(--ink-dim)}
634
+
635
+ /* Pills & highlights used on view page (scoped so compose page can
636
+ reuse the preview rendering to show what the redacted version
637
+ looks like before the user commits) */
638
+ .pp-red{
639
+ display:inline-flex;align-items:center;gap:4px;
640
+ font-family:var(--font-sans);font-size:12px;font-weight:500;
641
+ padding:1px 7px 1px 6px;margin:0 1px;border-radius:3px;
642
+ background:color-mix(in srgb, var(--cat, #666) 14%, transparent);
643
+ color:color-mix(in srgb, var(--cat, #666) 62%, var(--ink));
644
+ vertical-align:baseline;letter-spacing:-0.002em;
645
+ border:0.5px solid color-mix(in srgb, var(--cat, #666) 28%, transparent);
646
+ }
647
+ .pp-red-dot{width:5px;height:5px;border-radius:50%;background:var(--cat,#666);flex:none}
648
+
649
+ @media (max-width:820px){
650
+ .pp-grid{grid-template-columns:1fr}
651
+ .pp-pane + .pp-pane{border-left:0;border-top:0.5px solid var(--line)}
652
+ .pp-preview-row{grid-template-columns:1fr}
653
+ }
654
+ </style>
655
+ </head>
656
+ <body>
657
+ <div class="pp-shell">
658
+
659
+ <div class="pp-brand">
660
+ <div class="pp-brand-mark">P</div>
661
+ <div class="pp-brand-name">DLP Paste-Proxy<span class="sub">pastebin with a conscience</span></div>
662
+ </div>
663
+
664
+ <div class="pp-hero">
665
+ <h1>Paste sensitive text.<br>Share only the redacted view.</h1>
666
+ <p>OpenAI Privacy Filter scans your paste for names, addresses, emails, phones, URLs, dates, account numbers, and secrets before minting a shareable link. Viewers see placeholders; only your private reveal link shows the original.</p>
667
+ </div>
668
+
669
+ <div class="pp-card">
670
+ <div class="pp-card-head">
671
+ <span class="pp-caps">Compose</span>
672
+ <h2>New paste</h2>
673
+ <span class="pp-spacer"></span>
674
+ <span class="pp-sub" id="pp-char-count">0 / """ + f"{MAX_PASTE_CHARS:,}" + r""" chars</span>
675
+ </div>
676
+ <div class="pp-grid">
677
+ <div class="pp-pane">
678
+ <label class="pp-label" for="pp-text">Paste body</label>
679
+ <textarea id="pp-text" class="pp-textarea" spellcheck="false"
680
+ placeholder="Paste anything — a DM thread, a log line, an email, a support ticket. The OPF model labels each character span; placeholders replace the private parts before the URL is minted."></textarea>
681
+ <div class="pp-sub">
682
+ <span id="pp-cursor">line 1, col 1</span>
683
+ <span class="sep">·</span>
684
+ <span>no data leaves this server except as redacted placeholders</span>
685
+ </div>
686
+ </div>
687
+ <div class="pp-pane">
688
+ <label class="pp-label">Auto-expiry</label>
689
+ <div class="pp-ttl" id="pp-ttl" role="tablist" aria-label="Expiration">
690
+ <button type="button" data-ttl="never" aria-pressed="true">Never</button>
691
+ <button type="button" data-ttl="1h" aria-pressed="false">1h</button>
692
+ <button type="button" data-ttl="24h" aria-pressed="false">24h</button>
693
+ <button type="button" data-ttl="7d" aria-pressed="false">7d</button>
694
+ </div>
695
+ <p class="pp-hint">A background sweeper deletes expired pastes on the server. Expired links 404.</p>
696
+
697
+ <label class="pp-label" style="margin-top:20px">Create</label>
698
+ <button type="button" id="pp-create" class="pp-btn pp-btn-primary">
699
+ <span>Scan & mint link</span>
700
+ <span class="pp-btn-arr">↵</span>
701
+ </button>
702
+ <div class="pp-loading" id="pp-loading">
703
+ <span class="pp-spin"></span><span>Running OPF on your paste…</span>
704
+ </div>
705
+ <div class="pp-err" id="pp-err"></div>
706
+ </div>
707
+ </div>
708
+ </div>
709
+
710
+ <section class="pp-success" id="pp-success">
711
+ <span class="pp-caps">Paste minted</span>
712
+ <h3>Your paste is ready.</h3>
713
+
714
+ <div class="pp-link-label">
715
+ Shareable view link
716
+ <span class="hint">redacted — give to recipients</span>
717
+ </div>
718
+ <div class="pp-link">
719
+ <input id="pp-view-url" readonly value="">
720
+ <button type="button" data-copy="pp-view-url">Copy</button>
721
+ </div>
722
+
723
+ <div class="pp-link-label">
724
+ Private reveal link
725
+ <span class="priv">author only</span>
726
+ <span class="hint">shows original — keep it to yourself</span>
727
+ </div>
728
+ <div class="pp-link">
729
+ <input id="pp-reveal-url" readonly value="">
730
+ <button type="button" data-copy="pp-reveal-url">Copy</button>
731
+ </div>
732
+
733
+ <div class="pp-preview-row">
734
+ <div class="pp-preview">
735
+ <span class="pp-caps">What recipients will see</span>
736
+ <div id="pp-preview-redacted"></div>
737
+ </div>
738
+ <div class="pp-preview" style="font-family:var(--font-sans);font-size:12.5px;line-height:1.5">
739
+ <span class="pp-caps">Summary</span>
740
+ <div id="pp-preview-summary"></div>
741
+ </div>
742
+ </div>
743
+ </section>
744
+
745
+ <footer class="pp-footer">
746
+ <div>Powered by <a href="https://huggingface.co/charles-first-org/second-model" target="_blank" rel="noopener">OpenAI Privacy Filter</a> · 1.5B params, 50M active, 128k context</div>
747
+ <div><a href="#" id="pp-about">How this works →</a></div>
748
+ </footer>
749
+ </div>
750
+
751
+ <script>
752
+ const CATS = """ + _CATEGORIES_JSON + r""";
753
+ const MAX = """ + str(MAX_PASTE_CHARS) + r""";
754
+
755
+ const $text = document.getElementById('pp-text');
756
+ const $cc = document.getElementById('pp-char-count');
757
+ const $cur = document.getElementById('pp-cursor');
758
+ const $ttl = document.getElementById('pp-ttl');
759
+ const $btn = document.getElementById('pp-create');
760
+ const $load = document.getElementById('pp-loading');
761
+ const $err = document.getElementById('pp-err');
762
+ const $ok = document.getElementById('pp-success');
763
+
764
+ function updateCount(){
765
+ const n = $text.value.length;
766
+ $cc.textContent = n.toLocaleString() + ' / ' + MAX.toLocaleString() + ' chars';
767
+ $cc.style.color = n > MAX ? '#b45309' : '';
768
+ }
769
+ function updateCursor(){
770
+ const pos = $text.selectionStart;
771
+ const lines = $text.value.slice(0, pos).split('\n');
772
+ $cur.textContent = 'line ' + lines.length + ', col ' + (lines[lines.length-1].length + 1);
773
+ }
774
+ $text.addEventListener('input', updateCount);
775
+ ['keyup','click','focus','mouseup'].forEach(e => $text.addEventListener(e, updateCursor));
776
+
777
+ let ttl = 'never';
778
+ $ttl.addEventListener('click', (e) => {
779
+ const b = e.target.closest('button'); if (!b) return;
780
+ [...$ttl.querySelectorAll('button')].forEach(x => x.setAttribute('aria-pressed', x === b ? 'true' : 'false'));
781
+ ttl = b.dataset.ttl;
782
+ });
783
+
784
+ function renderRedacted(redacted){
785
+ let html = '';
786
+ let i = 0;
787
+ while (i < redacted.length){
788
+ const lt = redacted.indexOf('<', i);
789
+ if (lt === -1){ html += escapeHtml(redacted.slice(i)); break; }
790
+ html += escapeHtml(redacted.slice(i, lt));
791
+ const gt = redacted.indexOf('>', lt + 1);
792
+ if (gt === -1){ html += escapeHtml(redacted.slice(lt)); break; }
793
+ const tag = redacted.slice(lt+1, gt);
794
+ const key = tag.toLowerCase();
795
+ const meta = CATS[key];
796
+ if (!meta){ html += escapeHtml(redacted.slice(lt, gt+1)); }
797
+ else {
798
+ html += '<span class="pp-red" data-cat="'+escapeHtml(key)+'" style="--cat:'+meta.color+'">'+
799
+ '<span class="pp-red-dot"></span>'+escapeHtml(meta.label)+'</span>';
800
+ }
801
+ i = gt + 1;
802
+ }
803
+ return html;
804
+ }
805
+ function escapeHtml(s){ return s.replace(/[&<>"']/g, c => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c])); }
806
+
807
+ async function createPaste(){
808
+ const text = $text.value.trim();
809
+ $err.classList.remove('on'); $err.textContent = '';
810
+ if (!text){ $err.classList.add('on'); $err.textContent = 'Paste is empty.'; return; }
811
+ if (text.length > MAX){ $err.classList.add('on'); $err.textContent = 'Paste exceeds ' + MAX.toLocaleString() + ' characters.'; return; }
812
+
813
+ $btn.disabled = true; $load.classList.add('on'); $ok.classList.remove('on');
814
+ try{
815
+ const r = await fetch('/api/paste', {
816
+ method: 'POST',
817
+ headers: {'Content-Type': 'application/json'},
818
+ body: JSON.stringify({text, ttl}),
819
+ });
820
+ const data = await r.json();
821
+ if (!r.ok) throw new Error(data.error || ('HTTP ' + r.status));
822
+
823
+ const origin = window.location.origin;
824
+ document.getElementById('pp-view-url').value = origin + data.view_path;
825
+ document.getElementById('pp-reveal-url').value = origin + data.reveal_path;
826
+
827
+ // Fetch public redacted version to preview
828
+ const pv = await fetch('/api/paste/' + data.id).then(x => x.json());
829
+ document.getElementById('pp-preview-redacted').innerHTML = renderRedacted(pv.redacted);
830
+
831
+ const s = data.stats;
832
+ const cats = Object.entries(s.categories).sort((a,b) => b[1].count - a[1].count);
833
+ const catHtml = cats.length
834
+ ? cats.map(([k,v]) => {
835
+ const m = CATS[k] || {label:k, color:'#333'};
836
+ return '<span class="pp-red" style="--cat:'+m.color+';margin:2px 4px 2px 0"><span class="pp-red-dot"></span>'+escapeHtml(m.label)+' × '+v.count+'</span>';
837
+ }).join('')
838
+ : '<em style="color:var(--ink-faint)">No PII found in this paste.</em>';
839
+ document.getElementById('pp-preview-summary').innerHTML =
840
+ '<div style="display:flex;gap:18px;margin-bottom:10px;align-items:baseline"><div><div style="font-family:var(--font-serif);font-size:26px;letter-spacing:-0.02em;line-height:1">'+s.pii_percentage+'%</div><div class="pp-caps" style="margin-top:3px">PII density</div></div>'+
841
+ '<div><div style="font-family:var(--font-serif);font-size:26px;letter-spacing:-0.02em;line-height:1">'+s.total_spans+'</div><div class="pp-caps" style="margin-top:3px">spans</div></div>'+
842
+ '<div><div style="font-family:var(--font-serif);font-size:26px;letter-spacing:-0.02em;line-height:1">'+s.total_chars.toLocaleString()+'</div><div class="pp-caps" style="margin-top:3px">chars</div></div></div>'+
843
+ '<div>'+catHtml+'</div>';
844
+
845
+ $ok.classList.add('on');
846
+ $ok.scrollIntoView({behavior:'smooth', block:'start'});
847
+ } catch (e) {
848
+ $err.classList.add('on');
849
+ $err.textContent = e.message || 'Failed to create paste.';
850
+ } finally {
851
+ $btn.disabled = false; $load.classList.remove('on');
852
+ }
853
+ }
854
+
855
+ $btn.addEventListener('click', createPaste);
856
+ $text.addEventListener('keydown', (e) => {
857
+ if ((e.metaKey || e.ctrlKey) && e.key === 'Enter'){ e.preventDefault(); createPaste(); }
858
+ });
859
+
860
+ document.addEventListener('click', (e) => {
861
+ const b = e.target.closest('[data-copy]'); if (!b) return;
862
+ const inp = document.getElementById(b.dataset.copy);
863
+ inp.select(); navigator.clipboard.writeText(inp.value);
864
+ const prev = b.textContent; b.textContent = 'Copied'; setTimeout(() => b.textContent = prev, 1200);
865
+ });
866
+
867
+ updateCount(); updateCursor();
868
+ </script>
869
+ </body>
870
+ </html>
871
+ """
872
+
873
+ # ── view page ────────────────────────────────────────────────────
874
+
875
+ _VIEW_HTML = r"""<!DOCTYPE html>
876
+ <html lang="en">
877
+ <head>
878
+ <meta charset="UTF-8">
879
+ <meta name="viewport" content="width=device-width,initial-scale=1">
880
+ <title>Paste __PID__ — DLP Paste-Proxy</title>
881
+ <link rel="preconnect" href="https://fonts.googleapis.com">
882
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
883
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&family=Instrument+Serif:ital@0;1&display=swap" rel="stylesheet">
884
+ <style>
885
+ """ + _SHARED_CSS + r"""
886
+
887
+ .pp-view-head{
888
+ display:flex;align-items:center;gap:10px;flex-wrap:wrap;margin-bottom:18px;
889
+ padding:14px 16px;background:var(--panel);border:0.5px solid var(--line);
890
+ border-radius:var(--radius-lg);box-shadow:var(--shadow-sm);
891
+ }
892
+ .pp-view-id{font-family:var(--font-mono);font-size:12.5px;color:var(--ink-dim);padding:3px 8px;background:var(--panel-2);border:0.5px solid var(--line);border-radius:5px}
893
+ .pp-view-mode{font-size:11px;font-weight:600;letter-spacing:0.06em;text-transform:uppercase;padding:3px 8px;border-radius:4px;background:color-mix(in srgb,var(--ink) 8%,transparent);color:var(--ink-dim)}
894
+ .pp-view-expiry{font-family:var(--font-mono);font-size:11.5px;color:var(--ink-faint)}
895
+
896
+ .pp-stat-row{display:flex;gap:26px;flex-wrap:wrap;margin-left:auto;margin-right:0}
897
+ .pp-stat{text-align:right}
898
+ .pp-stat b{font-family:var(--font-serif);font-weight:500;font-size:22px;letter-spacing:-0.01em;display:block;line-height:1}
899
+ .pp-stat span{font-size:10.5px;letter-spacing:0.08em;text-transform:uppercase;color:var(--ink-faint);font-weight:500}
900
+
901
+ .pp-view-body{
902
+ background:var(--panel);border:0.5px solid var(--line);border-radius:var(--radius-lg);
903
+ box-shadow:var(--shadow-md);padding:28px 32px 30px;
904
+ }
905
+ .pp-body-redacted, .pp-body-reveal{
906
+ font-family:var(--font-serif);font-size:17px;line-height:1.7;
907
+ color:var(--ink);
908
+ white-space:pre-wrap;word-wrap:break-word;
909
+ }
910
+
911
+ /* highlight (reveal mode) */
912
+ .pp-hi{
913
+ background:color-mix(in srgb, var(--cat,#666) 18%, transparent);
914
+ color:var(--ink);
915
+ border-radius:3px;padding:1px 3px;margin:0 1px;
916
+ border:0.5px solid color-mix(in srgb, var(--cat,#666) 30%, transparent);
917
+ position:relative;
918
+ }
919
+ .pp-hi-tag{
920
+ font-family:var(--font-sans);font-size:9.5px;letter-spacing:0.07em;text-transform:uppercase;
921
+ font-weight:600;color:var(--cat,#666);margin-left:4px;opacity:.72;
922
+ }
923
+
924
+ .pp-badges{display:flex;gap:8px;flex-wrap:wrap;margin-bottom:18px;padding-bottom:16px;border-bottom:0.5px solid var(--line)}
925
+ .pp-badge{
926
+ display:inline-flex;align-items:center;gap:6px;
927
+ font-size:12px;font-weight:500;padding:4px 9px 4px 8px;
928
+ border-radius:4px;background:color-mix(in srgb, var(--cat,#666) 10%, transparent);
929
+ border:0.5px solid color-mix(in srgb, var(--cat,#666) 22%, transparent);
930
+ color:var(--ink-dim);
931
+ }
932
+ .pp-badge-dot{width:6px;height:6px;border-radius:50%;background:var(--cat,#666)}
933
+ .pp-badge-n{font-family:var(--font-mono);font-size:11px;color:var(--cat,#666);font-weight:600;margin-left:2px}
934
+ .pp-muted{color:var(--ink-faint);font-size:13px}
935
+
936
+ .pp-actions{display:flex;gap:10px;margin-top:16px;flex-wrap:wrap}
937
+ .pp-btn{
938
+ font-size:12.5px;font-weight:500;padding:8px 14px;
939
+ border:0.5px solid var(--line-strong);border-radius:var(--radius-md);
940
+ background:var(--panel);color:var(--ink);display:inline-flex;align-items:center;gap:8px;
941
+ transition:background .12s,border-color .12s;
942
+ }
943
+ .pp-btn:hover{background:color-mix(in srgb, var(--ink) 4%, var(--panel));border-color:var(--ink-dim)}
944
+
945
+ .pp-footer{margin-top:28px;padding-top:22px;border-top:0.5px solid var(--line);display:flex;justify-content:space-between;gap:16px;color:var(--ink-faint);font-size:12px;flex-wrap:wrap}
946
+ </style>
947
+ </head>
948
+ <body>
949
+ <div class="pp-shell">
950
+
951
+ <div class="pp-brand">
952
+ <a href="/" style="text-decoration:none;display:flex;align-items:center;gap:10px">
953
+ <div class="pp-brand-mark">P</div>
954
+ <div class="pp-brand-name">DLP Paste-Proxy<span class="sub">pastebin with a conscience</span></div>
955
+ </a>
956
+ </div>
957
+
958
+ <div class="pp-view-head">
959
+ <span class="pp-caps">Paste</span>
960
+ <span class="pp-view-id">__PID__</span>
961
+ <span class="pp-view-mode">__MODE__</span>
962
+ <span class="pp-view-expiry">__CREATED__ · __EXPIRY__</span>
963
+
964
+ <div class="pp-stat-row">
965
+ <div class="pp-stat"><b>__PCT__%</b><span>PII density</span></div>
966
+ <div class="pp-stat"><b>__SPANS_N__</b><span>spans</span></div>
967
+ <div class="pp-stat"><b>__CHARS_N__</b><span>chars</span></div>
968
+ </div>
969
+ </div>
970
+
971
+ __BANNER__
972
+
973
+ <div class="pp-view-body">
974
+ <div class="pp-badges">__BADGES__</div>
975
+ <div class="__BODY_CLASS__">__BODY__</div>
976
+
977
+ <div class="pp-actions">
978
+ <button type="button" class="pp-btn" onclick="navigator.clipboard.writeText(window.location.href); this.textContent='Copied this link'">Copy this link</button>
979
+ <a class="pp-btn" href="/">Create your own paste →</a>
980
+ </div>
981
+ </div>
982
+
983
+ <footer class="pp-footer">
984
+ <div>Recipients see placeholders. The author's reveal link shows the original inline.</div>
985
+ <div>Views: __VIEWS__ · Reveals: __REVEALS__</div>
986
+ </footer>
987
+ </div>
988
+ </body>
989
+ </html>
990
+ """
991
+
992
+ _NOT_FOUND_HTML = r"""<!DOCTYPE html>
993
+ <html lang="en">
994
+ <head>
995
+ <meta charset="UTF-8">
996
+ <meta name="viewport" content="width=device-width,initial-scale=1">
997
+ <title>Paste not found — DLP Paste-Proxy</title>
998
+ <link rel="preconnect" href="https://fonts.googleapis.com">
999
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
1000
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&family=Instrument+Serif:ital@0;1&display=swap" rel="stylesheet">
1001
+ <style>
1002
+ """ + _SHARED_CSS + r"""
1003
+ .pp-404{
1004
+ background:var(--panel);border:0.5px solid var(--line);border-radius:var(--radius-lg);
1005
+ box-shadow:var(--shadow-md);padding:56px 40px;text-align:center;
1006
+ }
1007
+ .pp-404 h1{font-family:var(--font-serif);font-size:48px;font-weight:500;letter-spacing:-0.02em;line-height:1;margin-bottom:10px}
1008
+ .pp-404 p{color:var(--ink-dim);margin-bottom:22px;max-width:44ch;margin-left:auto;margin-right:auto}
1009
+ .pp-404 code{font-family:var(--font-mono);font-size:12.5px;background:var(--panel-2);padding:2px 8px;border-radius:4px}
1010
+ .pp-btn{font-size:13px;font-weight:500;padding:10px 16px;border:0.5px solid var(--line-strong);border-radius:var(--radius-md);background:var(--primary-bg);color:var(--primary-fg);display:inline-flex;align-items:center;gap:8px}
1011
+ </style>
1012
+ </head>
1013
+ <body>
1014
+ <div class="pp-shell">
1015
+ <div class="pp-brand">
1016
+ <a href="/" style="text-decoration:none;display:flex;align-items:center;gap:10px">
1017
+ <div class="pp-brand-mark">P</div>
1018
+ <div class="pp-brand-name">DLP Paste-Proxy<span class="sub">pastebin with a conscience</span></div>
1019
+ </a>
1020
+ </div>
1021
+ <div class="pp-404">
1022
+ <h1>Paste not found</h1>
1023
+ <p><code>{{PID}}</code> either never existed, expired by its TTL, or was evicted by a server restart. Pastes live in process memory for the demo.</p>
1024
+ <a class="pp-btn" href="/">Create a new paste →</a>
1025
+ </div>
1026
+ </div>
1027
+ </body>
1028
+ </html>
1029
+ """
1030
+
1031
+
1032
+ # ── launch ────────────────────────────────────────────────────────
1033
+
1034
+ if __name__ == "__main__":
1035
+ server.launch(server_name="0.0.0.0", server_port=7860)
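A minimal sketch of driving the paste API over plain HTTP. The endpoint paths and JSON fields come from app.py above; the base URL and the use of the `requests` library are illustrative assumptions, not part of this upload.

import requests

BASE = "https://your-space-url"  # placeholder, substitute the deployed Space's URL

# Create a paste; the server runs OPF and returns the share/reveal paths.
created = requests.post(
    f"{BASE}/api/paste",
    json={"text": "Call Jane at 555-0100 tomorrow", "ttl": "24h"},
).json()
print(created["view_path"])    # redacted view, safe to share
print(created["reveal_path"])  # contains the reveal token, keep private

# Public JSON: redacted text plus stats, no original.
public = requests.get(f"{BASE}/api/paste/{created['id']}").json()
print(public["redacted"], public["stats"]["total_spans"])

# With the reveal token, the response also includes "original" and "spans".
private = requests.get(
    f"{BASE}/api/paste/{created['id']}",
    params={"token": created["reveal_token"]},
).json()
print(private["original"])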
opf.py ADDED
@@ -0,0 +1,557 @@
1
+ """
2
+ OpenAI Privacy Filter — inference module for DLP Paste-Proxy.
3
+
4
+ This is a focused subset of the reference implementation used in
5
+ app_v6.py: architecture (Transformer + Viterbi decoder), span decoding,
6
+ and a single public entrypoint `predict_text(text) -> (source_text, spans)`.
7
+
8
+ The numerics and the config contract are deliberately identical to v6 so
9
+ any future model-level tweaks made upstream can be ported in directly.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import dataclasses
15
+ import functools
16
+ import json
17
+ import math
18
+ import os
19
+ from bisect import bisect_left, bisect_right
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+ from typing import Final
23
+
24
+ import tiktoken
25
+ import torch
26
+ import torch.nn.functional as F
27
+ from huggingface_hub import snapshot_download
28
+ from safetensors import safe_open
29
+
30
+ MODEL_REPO = os.getenv("MODEL_ID", "charles-first-org/second-model")
31
+ HF_TOKEN = os.getenv("HF_TOKEN", None)
32
+ MODEL_DIR = Path(snapshot_download(MODEL_REPO, token=HF_TOKEN))
33
+
34
+ PRIVACY_FILTER_MODEL_TYPE: Final[str] = "privacy_filter"
35
+ REQUIRED_MODEL_CONFIG_KEYS: Final[tuple[str, ...]] = (
36
+ "model_type", "encoding", "num_hidden_layers", "num_experts",
37
+ "experts_per_token", "vocab_size", "num_labels", "hidden_size",
38
+ "intermediate_size", "head_dim", "num_attention_heads",
39
+ "num_key_value_heads", "sliding_window", "bidirectional_context",
40
+ "bidirectional_left_context", "bidirectional_right_context",
41
+ "default_n_ctx", "initial_context_length", "rope_theta",
42
+ "rope_scaling_factor", "rope_ntk_alpha", "rope_ntk_beta", "param_dtype",
43
+ )
44
+ BACKGROUND_CLASS_LABEL: Final[str] = "O"
45
+ BOUNDARY_PREFIXES: Final[tuple[str, ...]] = ("B", "I", "E", "S")
46
+ SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
47
+ BACKGROUND_CLASS_LABEL,
48
+ "account_number", "private_address", "private_date", "private_email",
49
+ "private_person", "private_phone", "private_url", "secret",
50
+ )
51
+ NER_CLASS_NAMES: Final[tuple[str, ...]] = (BACKGROUND_CLASS_LABEL,) + tuple(
52
+ f"{prefix}-{base}"
53
+ for base in SPAN_CLASS_NAMES if base != BACKGROUND_CLASS_LABEL
54
+ for prefix in BOUNDARY_PREFIXES
55
+ )
56
+ VITERBI_TRANSITION_BIAS_KEYS: Final[tuple[str, ...]] = (
57
+ "transition_bias_background_stay", "transition_bias_background_to_start",
58
+ "transition_bias_inside_to_continue", "transition_bias_inside_to_end",
59
+ "transition_bias_end_to_background", "transition_bias_end_to_start",
60
+ )
61
+ DEFAULT_VITERBI_CALIBRATION_PRESET: Final[str] = "default"
62
+
63
+
64
+ def validate_model_config_contract(cfg: dict, *, context: str) -> None:
65
+ missing = [k for k in REQUIRED_MODEL_CONFIG_KEYS if k not in cfg]
66
+ if missing:
67
+ raise ValueError(f"{context} missing keys: {', '.join(missing)}")
68
+ if cfg.get("model_type") != PRIVACY_FILTER_MODEL_TYPE:
69
+ raise ValueError(f"{context} model_type must be {PRIVACY_FILTER_MODEL_TYPE!r}")
70
+ if cfg.get("bidirectional_context") is not True:
71
+ raise ValueError(f"{context} must use bidirectional_context=true")
72
+ lc, rc = cfg.get("bidirectional_left_context"), cfg.get("bidirectional_right_context")
73
+ if not isinstance(lc, int) or not isinstance(rc, int) or lc != rc or lc < 0:
74
+ raise ValueError(f"{context} bidirectional context must be equal non-negative ints")
75
+ sw = cfg.get("sliding_window")
76
+ if sw != 2 * lc + 1:
77
+ raise ValueError(f"{context} sliding_window must equal 2*context+1")
78
+ if cfg["num_labels"] != 33:
79
+ raise ValueError(f"{context} num_labels must be 33")
80
+ if cfg["param_dtype"] != "bfloat16":
81
+ raise ValueError(f"{context} param_dtype must be bfloat16")
82
+
83
+
84
+ def expert_linear(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None) -> torch.Tensor:
85
+ n, e, k = x.shape
86
+ _, _, _, o = weight.shape
87
+ out = torch.bmm(x.reshape(n * e, 1, k), weight.reshape(n * e, k, o)).reshape(n, e, o)
88
+ return out + bias if bias is not None else out
89
+
90
+
91
+ @dataclass
92
+ class ModelConfig:
93
+ num_hidden_layers: int; num_experts: int; experts_per_token: int
94
+ vocab_size: int; num_labels: int; hidden_size: int; intermediate_size: int
95
+ head_dim: int; num_attention_heads: int; num_key_value_heads: int
96
+ bidirectional_context_size: int; initial_context_length: int
97
+ rope_theta: float; rope_scaling_factor: float; rope_ntk_alpha: float; rope_ntk_beta: float
98
+
99
+ @classmethod
100
+ def from_checkpoint_config(cls, cfg: dict, *, context: str) -> "ModelConfig":
101
+ cfg = dict(cfg)
102
+ cfg["bidirectional_context_size"] = cfg["bidirectional_left_context"]
103
+ fields = {f.name for f in dataclasses.fields(cls)}
104
+ return cls(**{k: v for k, v in cfg.items() if k in fields})
105
+
106
+
107
+ class RMSNorm(torch.nn.Module):
108
+ def __init__(self, n: int, eps: float = 1e-5, device=None):
109
+ super().__init__()
110
+ self.eps = eps
111
+ self.scale = torch.nn.Parameter(torch.ones(n, device=device, dtype=torch.float32))
112
+
113
+ def forward(self, x):
114
+ t = x.float()
115
+ return (t * torch.rsqrt(t.pow(2).mean(-1, keepdim=True) + self.eps) * self.scale).to(x.dtype)
116
+
117
+
118
+ def apply_rope(x, cos, sin):
119
+ cos = cos.unsqueeze(-2).to(x.dtype); sin = sin.unsqueeze(-2).to(x.dtype)
120
+ x1, x2 = x[..., ::2], x[..., 1::2]
121
+ return torch.stack((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1).reshape(x.shape)
122
+
123
+
124
+ class RotaryEmbedding(torch.nn.Module):
125
+ def __init__(self, head_dim, base, dtype, *, initial_context_length=4096,
126
+ scaling_factor=1.0, ntk_alpha=1.0, ntk_beta=32.0, device=None):
127
+ super().__init__()
128
+ self.head_dim, self.base, self.dtype = head_dim, base, dtype
129
+ self.initial_context_length = initial_context_length
130
+ self.scaling_factor, self.ntk_alpha, self.ntk_beta = scaling_factor, ntk_alpha, ntk_beta
131
+ self.device = device
132
+ mp = max(int(initial_context_length * scaling_factor), initial_context_length)
133
+ self.max_position_embeddings = mp
134
+ cos, sin = self._compute(mp, device=torch.device("cpu"))
135
+ target = device or torch.device("cpu")
136
+ self.register_buffer("cos_cache", cos.to(target), persistent=False)
137
+ self.register_buffer("sin_cache", sin.to(target), persistent=False)
138
+
139
+ def _inv_freq(self, device=None):
140
+ device = device or self.device
141
+ freq = self.base ** (torch.arange(0, self.head_dim, 2, dtype=torch.float, device=device) / self.head_dim)
142
+ if self.scaling_factor > 1.0:
143
+ d_half = self.head_dim / 2
144
+ low = d_half * math.log(self.initial_context_length / (self.ntk_beta * 2 * math.pi)) / math.log(self.base)
145
+ high = d_half * math.log(self.initial_context_length / (self.ntk_alpha * 2 * math.pi)) / math.log(self.base)
146
+ interp = 1.0 / (self.scaling_factor * freq)
147
+ extrap = 1.0 / freq
148
+ ramp = (torch.arange(d_half, dtype=torch.float32, device=device) - low) / (high - low)
149
+ mask = 1 - ramp.clamp(0, 1)
150
+ return interp * (1 - mask) + extrap * mask
151
+ return 1.0 / freq
152
+
153
+ def _compute(self, n, device=None):
154
+ inv_freq = self._inv_freq(device)
155
+ t = torch.arange(n, dtype=torch.float32, device=device or self.device)
156
+ freqs = torch.einsum("i,j->ij", t, inv_freq)
157
+ c = 0.1 * math.log(self.scaling_factor) + 1.0 if self.scaling_factor > 1.0 else 1.0
158
+ return (freqs.cos() * c).to(self.dtype), (freqs.sin() * c).to(self.dtype)
159
+
160
+ def forward(self, q, k):
161
+ n = q.shape[0]
162
+ if n > self.cos_cache.shape[0]:
163
+ cos, sin = self._compute(n, torch.device("cpu"))
164
+ self.cos_cache, self.sin_cache = cos.to(q.device), sin.to(q.device)
165
+ cc = self.cos_cache.to(q.device) if self.cos_cache.device != q.device else self.cos_cache
166
+ sc = self.sin_cache.to(q.device) if self.sin_cache.device != q.device else self.sin_cache
167
+ cos, sin = cc[:n], sc[:n]
168
+ q = apply_rope(q.view(n, -1, self.head_dim), cos, sin).reshape(q.shape)
169
+ k = apply_rope(k.view(n, -1, self.head_dim), cos, sin).reshape(k.shape)
170
+ return q, k
171
+
172
+
173
+ def sdpa(Q, K, V, S, sm_scale, ctx):
174
+ n, nh, qm, hd = Q.shape
175
+ w = 2 * ctx + 1
176
+ Kp = F.pad(K, (0, 0, 0, 0, ctx, ctx)); Vp = F.pad(V, (0, 0, 0, 0, ctx, ctx))
177
+ Kw = Kp.unfold(0, w, 1).permute(0, 3, 1, 2); Vw = Vp.unfold(0, w, 1).permute(0, 3, 1, 2)
178
+ idx = torch.arange(w, device=Q.device) - ctx
179
+ pos = torch.arange(n, device=Q.device)[:, None] + idx[None, :]
180
+ valid = (pos >= 0) & (pos < n)
181
+ scores = torch.einsum("nhqd,nwhd->nhqw", Q, Kw).float() * sm_scale
182
+ scores = scores.masked_fill(~valid[:, None, None, :], -float("inf"))
183
+ sink = (S * math.log(2.0)).reshape(nh, qm)[None, :, :, None].expand(n, -1, -1, 1)
184
+ scores = torch.cat([scores, sink], dim=-1)
185
+ wt = torch.softmax(scores, dim=-1)[..., :-1].to(V.dtype)
186
+ return torch.einsum("nhqw,nwhd->nhqd", wt, Vw).reshape(n, -1)
187
+
188
+
189
+ class AttentionBlock(torch.nn.Module):
+     def __init__(self, cfg: ModelConfig, device=None):
+         super().__init__()
+         dt = torch.bfloat16
+         self.head_dim, self.nah, self.nkv = cfg.head_dim, cfg.num_attention_heads, cfg.num_key_value_heads
+         self.ctx = int(cfg.bidirectional_context_size)
+         self.sinks = torch.nn.Parameter(torch.empty(cfg.num_attention_heads, device=device, dtype=torch.float32))
+         self.norm = RMSNorm(cfg.hidden_size, device=device)
+         qkv_d = cfg.head_dim * (cfg.num_attention_heads + 2 * cfg.num_key_value_heads)
+         self.qkv = torch.nn.Linear(cfg.hidden_size, qkv_d, device=device, dtype=dt)
+         self.out = torch.nn.Linear(cfg.head_dim * cfg.num_attention_heads, cfg.hidden_size, device=device, dtype=dt)
+         self.qk_scale = 1 / math.sqrt(math.sqrt(cfg.head_dim))
+         self.rope = RotaryEmbedding(cfg.head_dim, int(cfg.rope_theta), torch.float32,
+                                     initial_context_length=cfg.initial_context_length,
+                                     scaling_factor=cfg.rope_scaling_factor,
+                                     ntk_alpha=cfg.rope_ntk_alpha, ntk_beta=cfg.rope_ntk_beta, device=device)
+
+     def forward(self, x):
+         t = self.norm(x).to(self.qkv.weight.dtype)
+         qkv = F.linear(t, self.qkv.weight, self.qkv.bias)
+         hd, nah, nkv = self.head_dim, self.nah, self.nkv
+         q = qkv[:, :nah * hd].contiguous()
+         k = qkv[:, nah * hd:(nah + nkv) * hd].contiguous()
+         v = qkv[:, (nah + nkv) * hd:(nah + 2 * nkv) * hd].contiguous()
+         q, k = self.rope(q, k)
+         q, k = q * self.qk_scale, k * self.qk_scale
+         n = q.shape[0]
+         q = q.view(n, nkv, nah // nkv, hd); k = k.view(n, nkv, hd); v = v.view(n, nkv, hd)
+         ao = sdpa(q, k, v, self.sinks, 1.0, self.ctx).to(self.out.weight.dtype)
+         return x + F.linear(ao, self.out.weight, self.out.bias).to(x.dtype)
+
+
+ def swiglu(x, alpha=1.702, limit=7.0):
+     g, l = x.chunk(2, dim=-1)
+     g, l = g.clamp(max=limit), l.clamp(-limit, limit)
+     return g * torch.sigmoid(alpha * g) * (l + 1)
+
+
+ class MLPBlock(torch.nn.Module):
+     def __init__(self, cfg: ModelConfig, device=None):
+         super().__init__()
+         dt = torch.bfloat16
+         self.ne, self.ept = cfg.num_experts, cfg.experts_per_token
+         self.norm = RMSNorm(cfg.hidden_size, device=device)
+         self.gate = torch.nn.Linear(cfg.hidden_size, cfg.num_experts, device=device, dtype=dt)
+         self.mlp1_weight = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.hidden_size, cfg.intermediate_size * 2, device=device, dtype=dt))
+         self.mlp1_bias = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.intermediate_size * 2, device=device, dtype=dt))
+         self.mlp2_weight = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.intermediate_size, cfg.hidden_size, device=device, dtype=dt))
+         self.mlp2_bias = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.hidden_size, device=device, dtype=dt))
+
+     def forward(self, x):
+         t = self.norm(x)
+         gs = F.linear(t.float(), self.gate.weight.float(), self.gate.bias.float())
+         top = torch.topk(gs, k=self.ept, dim=-1, sorted=True)
+         ew = torch.softmax(top.values, dim=-1) / self.ept
+         ei = top.indices
+         ept = self.ept
+
+         def _chunk(tc, eic, ewc):
+             o = expert_linear(tc.float().unsqueeze(1).expand(-1, eic.shape[1], -1),
+                               self.mlp1_weight[eic].float(), self.mlp1_bias[eic].float())
+             o = swiglu(o)
+             o = expert_linear(o.float(), self.mlp2_weight[eic].float(), self.mlp2_bias[eic].float())
+             return (torch.einsum("bec,be->bc", o.to(ewc.dtype), ewc) * ept).to(x.dtype)
+
+         cs = 32
+         if t.shape[0] > cs:
+             parts = [_chunk(t[s:s+cs], ei[s:s+cs], ew[s:s+cs]) for s in range(0, t.shape[0], cs)]
+             return x + torch.cat(parts, 0)
+         return x + _chunk(t, ei, ew)
+
+
+ class TransformerBlock(torch.nn.Module):
+     def __init__(self, cfg, device=None):
+         super().__init__()
+         self.attn = AttentionBlock(cfg, device=device)
+         self.mlp = MLPBlock(cfg, device=device)
+
+     def forward(self, x):
+         return self.mlp(self.attn(x))
+
+
+ class Checkpoint:
+     @staticmethod
+     def build_param_name_map(n):
+         return ({f"block.{i}.mlp.mlp1_bias": f"block.{i}.mlp.swiglu.bias" for i in range(n)}
+                 | {f"block.{i}.mlp.mlp1_weight": f"block.{i}.mlp.swiglu.weight" for i in range(n)}
+                 | {f"block.{i}.mlp.mlp2_bias": f"block.{i}.mlp.out.bias" for i in range(n)}
+                 | {f"block.{i}.mlp.mlp2_weight": f"block.{i}.mlp.out.weight" for i in range(n)})
+
+     def __init__(self, path, device, num_hidden_layers):
+         self.pnm = self.build_param_name_map(num_hidden_layers)
+         self.ds = device.type if device.index is None else f"{device.type}:{device.index}"
+         files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".safetensors")]
+         self.map = {}
+         for sf in files:
+             with safe_open(sf, framework="pt", device=self.ds) as h:
+                 for k in h.keys():
+                     self.map[k] = sf
+
+     def get(self, name):
+         mapped = self.pnm.get(name, name)
+         with safe_open(self.map[mapped], framework="pt", device=self.ds) as h:
+             return h.get_tensor(mapped)
+
+
+ class Transformer(torch.nn.Module):
+     def __init__(self, cfg, device):
+         super().__init__()
+         dt = torch.bfloat16
+         self.embedding = torch.nn.Embedding(cfg.vocab_size, cfg.hidden_size, device=device, dtype=dt)
+         self.block = torch.nn.ModuleList([TransformerBlock(cfg, device=device) for _ in range(cfg.num_hidden_layers)])
+         self.norm = RMSNorm(cfg.hidden_size, device=device)
+         self.unembedding = torch.nn.Linear(cfg.hidden_size, cfg.num_labels, bias=False, device=device, dtype=dt)
+
+     def forward(self, token_ids):
+         x = self.embedding(token_ids)
+         for blk in self.block:
+             x = blk(x)
+         return F.linear(self.norm(x), self.unembedding.weight, None)
+
+     @classmethod
+     def from_checkpoint(cls, checkpoint_dir, *, device):
+         torch.backends.cuda.matmul.allow_tf32 = False
+         torch.backends.cudnn.allow_tf32 = False
+         torch.set_float32_matmul_precision("highest")
+         cp = json.loads((Path(checkpoint_dir) / "config.json").read_text())
+         validate_model_config_contract(cp, context=str(checkpoint_dir))
+         cfg = ModelConfig.from_checkpoint_config(cp, context=str(checkpoint_dir))
+         ckpt = Checkpoint(checkpoint_dir, device, cfg.num_hidden_layers)
+         m = cls(cfg, device); m.eval()
+         for name, param in m.named_parameters():
+             loaded = ckpt.get(name)
+             if param.shape != loaded.shape:
+                 raise ValueError(f"Shape mismatch {name}: {param.shape} vs {loaded.shape}")
+             param.data.copy_(loaded)
+         return m
+
+
+ @dataclass(frozen=True)
+ class LabelInfo:
+     boundary_label_lookup: dict[str, dict[str, int]]
+     token_to_span_label: dict[int, int]
+     token_boundary_tags: dict[int, str | None]
+     span_class_names: tuple[str, ...]
+     span_label_lookup: dict[str, int]
+     background_token_label: int
+     background_span_label: int
+
+
+ def labels_to_spans(labels_by_index, label_info):
+     # Merge per-token B/I/E/S boundary labels into (span_label, start, end) token
+     # spans. Background labels and gaps in the token indices close any open span.
+     spans, cur_label, start_idx, prev_idx = [], None, None, None
+     bg = label_info.background_span_label
+     for ti in sorted(labels_by_index):
+         lid = labels_by_index[ti]
+         sl = label_info.token_to_span_label.get(lid)
+         bt = label_info.token_boundary_tags.get(lid)
+         if prev_idx is not None and ti != prev_idx + 1:
+             if cur_label is not None and start_idx is not None:
+                 spans.append((cur_label, start_idx, prev_idx + 1))
+             cur_label = start_idx = None
+         if sl is None:
+             prev_idx = ti; continue
+         if sl == bg:
+             if cur_label is not None and start_idx is not None:
+                 spans.append((cur_label, start_idx, ti))
+             cur_label = start_idx = None; prev_idx = ti; continue
+         if bt == "S":
+             if cur_label is not None and start_idx is not None and prev_idx is not None:
+                 spans.append((cur_label, start_idx, prev_idx + 1))
+             spans.append((sl, ti, ti + 1)); cur_label = start_idx = None
+         elif bt == "B":
+             if cur_label is not None and start_idx is not None and prev_idx is not None:
+                 spans.append((cur_label, start_idx, prev_idx + 1))
+             cur_label, start_idx = sl, ti
+         elif bt == "I":
+             if cur_label is None or cur_label != sl:
+                 if cur_label is not None and start_idx is not None and prev_idx is not None:
+                     spans.append((cur_label, start_idx, prev_idx + 1))
+                 cur_label, start_idx = sl, ti
+         elif bt == "E":
+             if cur_label is None or cur_label != sl or start_idx is None:
+                 if cur_label is not None and start_idx is not None and prev_idx is not None:
+                     spans.append((cur_label, start_idx, prev_idx + 1))
+                 spans.append((sl, ti, ti + 1)); cur_label = start_idx = None
+             else:
+                 spans.append((cur_label, start_idx, ti + 1)); cur_label = start_idx = None
+         else:
+             if cur_label is not None and start_idx is not None and prev_idx is not None:
+                 spans.append((cur_label, start_idx, prev_idx + 1))
+             cur_label = start_idx = None
+         prev_idx = ti
+     if cur_label is not None and start_idx is not None and prev_idx is not None:
+         spans.append((cur_label, start_idx, prev_idx + 1))
+     return spans
+
+
+ def token_spans_to_char_spans(spans, cs, ce):
+     out = []
+     for li, ts, te in spans:
+         if not (0 <= ts < te <= len(cs)):
+             continue
+         s, e = cs[ts], ce[te - 1]
+         if e > s:
+             out.append((li, s, e))
+     return out
+
+
+ def trim_char_spans_whitespace(spans, text):
+     out = []
+     for li, s, e in spans:
+         if not (0 <= s < e <= len(text)):
+             continue
+         while s < e and text[s].isspace(): s += 1
+         while e > s and text[e - 1].isspace(): e -= 1
+         if e > s:
+             out.append((li, s, e))
+     return out
+
+
+ @functools.lru_cache(maxsize=1)
+ def get_viterbi_transition_biases():
+     # Reads optional calibration shaped either as
+     # {"operating_points": {"<preset>": {"biases": {<key>: <float>, ...}}}} or as a
+     # flat {<key>: <float>} mapping; a missing file or missing keys fall back to 0.0.
+     cp = MODEL_DIR / "viterbi_calibration.json"
+     default = {k: 0.0 for k in VITERBI_TRANSITION_BIAS_KEYS}
+     if not cp.is_file():
+         return default
+     payload = json.loads(cp.read_text())
+     raw = payload
+     ops = payload.get("operating_points")
+     if isinstance(ops, dict):
+         preset = ops.get(DEFAULT_VITERBI_CALIBRATION_PRESET)
+         if isinstance(preset, dict):
+             raw = preset.get("biases", raw)
+     if not isinstance(raw, dict):
+         return default
+     return {k: float(raw.get(k, 0.0)) for k in VITERBI_TRANSITION_BIAS_KEYS}
+
+
+ class Decoder:
+     # Viterbi decoder over per-token label log-probabilities. Transitions are
+     # constrained to valid B/I/E/S sequences and nudged by the calibrated biases
+     # from get_viterbi_transition_biases().
+     def __init__(self, label_info):
+         nc = len(label_info.token_to_span_label)
+         self._start = torch.full((nc,), -1e9, dtype=torch.float32)
+         self._end = torch.full((nc,), -1e9, dtype=torch.float32)
+         self._trans = torch.full((nc, nc), -1e9, dtype=torch.float32)
+         biases = get_viterbi_transition_biases()
+         bg_tok, bg_sp = label_info.background_token_label, label_info.background_span_label
+         ttsl, tbt = label_info.token_to_span_label, label_info.token_boundary_tags
+         for i in range(nc):
+             tag, sl = tbt.get(i), ttsl.get(i)
+             if tag in {"B", "S"} or i == bg_tok: self._start[i] = 0.0
+             if tag in {"E", "S"} or i == bg_tok: self._end[i] = 0.0
+             for j in range(nc):
+                 nt, ns = tbt.get(j), ttsl.get(j)
+                 if self._valid(tag, sl, nt, ns, bg_tok, bg_sp, j):
+                     self._trans[i, j] = self._bias(tag, sl, nt, ns, bg_sp, biases)
+
+     @staticmethod
+     def _valid(pt, ps, nt, ns, bti, bsi, ni):
+         nb = ns == bsi or ni == bti
+         if (ns is None or nt is None) and not nb: return False
+         if pt is None or ps is None: return nb or nt in {"B", "S"}
+         if ps == bsi or pt in {"E", "S"}: return nb or nt in {"B", "S"}
+         if pt in {"B", "I"}: return ps == ns and nt in {"I", "E"}
+         return False
+
+     @staticmethod
+     def _bias(pt, ps, nt, ns, bsi, b):
+         nb, pb = ns == bsi, ps == bsi
+         if pb: return b["transition_bias_background_stay"] if nb else b["transition_bias_background_to_start"]
+         if pt in {"B", "I"}: return b["transition_bias_inside_to_continue"] if nt == "I" else b["transition_bias_inside_to_end"]
+         return b["transition_bias_end_to_background"] if nb else b["transition_bias_end_to_start"]
+
+     def decode(self, lp):
+         sl, nc = lp.shape
+         if sl == 0: return []
+         st = self._start.to(lp.device, lp.dtype)
+         en = self._end.to(lp.device, lp.dtype)
+         tr = self._trans.to(lp.device, lp.dtype)
+         scores = lp[0] + st
+         bp = torch.empty((sl - 1, nc), device=lp.device, dtype=torch.int64)
+         for i in range(1, sl):
+             t = scores.unsqueeze(1) + tr
+             bs, bi = t.max(dim=0)
+             scores = bs + lp[i]; bp[i - 1] = bi
+         if not torch.isfinite(scores).any(): return lp.argmax(dim=1).tolist()
+         scores = scores + en
+         path = torch.empty(sl, device=lp.device, dtype=torch.int64)
+         path[-1] = scores.argmax()
+         for i in range(sl - 2, -1, -1): path[i] = bp[i, path[i + 1]]
+         return path.tolist()
+
+
+ @dataclass(frozen=True)
+ class InferenceRuntime:
+     model: Transformer
+     encoding: tiktoken.Encoding
+     label_info: LabelInfo
+     device: torch.device
+     n_ctx: int
+
+
+ @functools.lru_cache(maxsize=1)
+ def get_runtime():
+     cp = MODEL_DIR
+     cfg = json.loads((cp / "config.json").read_text())
+     validate_model_config_contract(cfg, context=str(cp))
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     encoding = tiktoken.get_encoding(str(cfg["encoding"]).strip())
+     scn = [BACKGROUND_CLASS_LABEL]; sll = {BACKGROUND_CLASS_LABEL: 0}
+     bll, ttsl, tbt = {}, {}, {}
+     bg_idx = None
+     for idx, name in enumerate(NER_CLASS_NAMES):
+         if name == BACKGROUND_CLASS_LABEL:
+             bg_idx = idx; ttsl[idx] = 0; tbt[idx] = None; continue
+         bnd, base = name.split("-", 1)
+         si = sll.get(base)
+         if si is None:
+             si = len(scn); scn.append(base); sll[base] = si
+         ttsl[idx] = si; tbt[idx] = bnd
+         bll.setdefault(base, {})[bnd] = idx
+     li = LabelInfo(bll, ttsl, tbt, tuple(scn), sll, bg_idx, 0)
+     m = Transformer.from_checkpoint(str(cp), device=device)
+     return InferenceRuntime(m, encoding, li, device, int(cfg["default_n_ctx"]))
+
+
+ @functools.lru_cache(maxsize=1)
+ def get_decoder():
+     return Decoder(label_info=get_runtime().label_info)
+
+
+ @torch.inference_mode()
+ def _predict_with_runtime(runtime, text, decoder):
+     tids = tuple(int(t) for t in runtime.encoding.encode(text, allowed_special="all"))
+     if not tids: return text, []
+     # Run the model in n_ctx-sized chunks and concatenate per-token log-probs.
+     chunks = []
+     for s in range(0, len(tids), runtime.n_ctx):
+         e = min(s + runtime.n_ctx, len(tids))
+         wt = torch.tensor(tids[s:e], device=runtime.device, dtype=torch.int32)
+         lp = F.log_softmax(runtime.model(wt).float(), dim=-1)
+         chunks.append(lp)
+     stacked = chunks[0] if len(chunks) == 1 else torch.cat(chunks, dim=0)
+     dl = decoder.decode(stacked)
+     if len(dl) != len(tids): dl = stacked.argmax(dim=1).tolist()
+     pli = {i: int(l) for i, l in enumerate(dl)}
+     pts = labels_to_spans(pli, runtime.label_info)
+     # Map token spans to character offsets: compute byte offsets per character of
+     # the decoded text, then locate each token's byte range with bisect.
+     tb = [runtime.encoding.decode_single_token_bytes(t) for t in tids]
+     dt = b"".join(tb).decode("utf-8", errors="replace")
+     cbs, cbe = [], []
+     bc = 0
+     for ch in dt: cbs.append(bc); bc += len(ch.encode("utf-8")); cbe.append(bc)
+     cs, ce = [], []
+     tbc = 0
+     for rb in tb:
+         tbs = tbc; tbe = tbs + len(rb); tbc = tbe
+         cs.append(bisect_right(cbe, tbs)); ce.append(bisect_left(cbs, tbe))
+     src = dt if dt != text else text
+     pcs = token_spans_to_char_spans(pts, cs, ce)
+     pcs = trim_char_spans_whitespace(pcs, src)
+     detected = []
+     for li, s, e in pcs:
+         if 0 <= li < len(runtime.label_info.span_class_names):
+             lbl = runtime.label_info.span_class_names[li]
+         else:
+             lbl = f"label_{li}"
+         detected.append({"label": lbl, "start": s, "end": e, "text": src[s:e]})
+     return src, detected
+
+
+ def predict_text(text: str) -> tuple[str, list[dict]]:
+     """Returns (source_text, spans). source_text may differ from input
+     only if the tokenizer's decode path normalizes invalid UTF-8; spans
+     are character offsets into source_text."""
+     return _predict_with_runtime(get_runtime(), text, get_decoder())
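
A minimal sketch of how a caller might consume that contract. Only predict_text and its (source_text, spans) return shape come from opf.py above; the mask_spans helper, the "#" masking policy, and the sample input are illustrative assumptions, not part of this commit.

```python
# Hypothetical consumer of predict_text's (source_text, spans) contract.
from opf import predict_text

def mask_spans(text: str) -> str:
    source, spans = predict_text(text)
    # Spans are character offsets into `source`, so splice from right to left
    # to keep earlier offsets valid while each detected span is replaced.
    for span in sorted(spans, key=lambda sp: sp["start"], reverse=True):
        masked = "#" * (span["end"] - span["start"])
        source = source[:span["start"]] + masked + source[span["end"]:]
    return source

if __name__ == "__main__":
    print(mask_spans("Contact Jane Doe at jane.doe@example.com"))
```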
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ tiktoken
+ sentencepiece
+ torch
+ safetensors
+ huggingface_hub
+ gradio[mcp]>=5.29.0
+ accelerate
+ spaces
+ python-multipart