ysharma HF Staff committed on
Commit
0234c4b
·
verified ·
1 Parent(s): e71851b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +698 -471
app.py CHANGED
@@ -1,30 +1,42 @@
1
  """
2
  PII Reveal - Document Privacy Explorer
3
  =======================================
4
- A Gradio Server app powered by OpenAI Privacy Filter (Apache 2.0)
5
- for detecting and visualizing PII in PDF/DOC/DOCX documents.
6
-
7
- - Backend: gr.Server (Gradio + FastAPI)
8
- - Frontend: Custom HTML/CSS/JS
9
- - Model: charles-first-org/second-model (1.5B params, 50M active, 128k context)
10
  """
11
 
 
 
 
 
 
12
  import os
13
  import re
14
- import json
15
  import tempfile
 
 
 
16
  from pathlib import Path
 
17
 
18
- import torch
19
  import gradio as gr
 
 
 
 
20
  from fastapi import UploadFile, File
21
  from fastapi.responses import HTMLResponse, JSONResponse
 
 
22
 
23
- # ── Configuration ────────────────────────────────────────────────
24
- MODEL_ID = os.getenv("MODEL_ID", "charles-first-org/second-model")
25
  HF_TOKEN = os.getenv("HF_TOKEN", None)
 
26
 
27
- CATEGORIES = {
28
  "private_person": {"color": "#ef4444", "bg": "rgba(239,68,68,0.15)", "label": "Person"},
29
  "private_address": {"color": "#06b6d4", "bg": "rgba(6,182,212,0.15)", "label": "Address"},
30
  "private_email": {"color": "#3b82f6", "bg": "rgba(59,130,246,0.15)", "label": "Email"},
@@ -35,24 +47,537 @@ CATEGORIES = {
35
  "secret": {"color": "#dc2626", "bg": "rgba(220,38,38,0.15)", "label": "Secret"},
36
  }
37
 
38
- # ── Model Loading ────────────────────────────────────────────────
39
- print(f"[PII Reveal] Loading model: {MODEL_ID}")
40
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- from transformers import AutoTokenizer, AutoModelForTokenClassification # noqa: E402
 
 
 
 
 
 
 
 
43
 
44
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
45
- model = AutoModelForTokenClassification.from_pretrained(
46
- MODEL_ID, trust_remote_code=True, token=HF_TOKEN,
47
- torch_dtype=torch.bfloat16 if device.type == "cuda" else torch.float32,
48
- )
49
- model.eval().to(device)
50
 
51
- id2label = model.config.id2label
52
- print(f"[PII Reveal] Model loaded on {device} | Labels: {len(id2label)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
 
 
 
54
 
55
- # ── Text Extraction ──────────────────────────────────────────────
56
  def extract_text(file_path: str) -> str:
57
  suffix = Path(file_path).suffix.lower()
58
  if suffix == ".pdf":
@@ -68,176 +593,98 @@ def extract_text(file_path: str) -> str:
68
  raise ValueError(f"Unsupported file type: {suffix}")
69
 
70
 
71
- # ── PII Detection ────────────────────────────────────────────────
72
- def detect_pii(text: str) -> list[dict]:
73
- """Run Privacy Filter on text, return list of {label, start, end, text}."""
74
- encodings = tokenizer(
75
- text,
76
- return_tensors="pt",
77
- return_offsets_mapping=True,
78
- truncation=True,
79
- max_length=128000,
80
- )
81
- offset_mapping = encodings.pop("offset_mapping")[0].tolist()
82
- inputs = {k: v.to(device) for k, v in encodings.items()}
83
-
84
- with torch.no_grad():
85
- logits = model(**inputs).logits
86
-
87
- preds = torch.argmax(logits, dim=-1)[0].tolist()
88
-
89
- spans, current = [], None
90
- for i, pred_id in enumerate(preds):
91
- label = id2label.get(pred_id, "O")
92
- char_start, char_end = offset_mapping[i]
93
-
94
- if char_start == char_end or label == "O" or "-" not in label:
95
- if current:
96
- spans.append(current)
97
- current = None
98
- continue
99
-
100
- tag, category = label.split("-", 1)
101
-
102
- if tag == "S":
103
- if current:
104
- spans.append(current)
105
- spans.append({"label": category, "start": char_start, "end": char_end,
106
- "text": text[char_start:char_end]})
107
- current = None
108
- elif tag == "B":
109
- if current:
110
- spans.append(current)
111
- current = {"label": category, "start": char_start, "end": char_end,
112
- "text": text[char_start:char_end]}
113
- elif tag == "I" and current and current["label"] == category:
114
- current["end"] = char_end
115
- current["text"] = text[current["start"]:char_end]
116
- elif tag == "E" and current and current["label"] == category:
117
- current["end"] = char_end
118
- current["text"] = text[current["start"]:char_end]
119
- spans.append(current)
120
- current = None
121
- else:
122
- if current:
123
- spans.append(current)
124
- current = None
125
-
126
- if current:
127
- spans.append(current)
128
- return spans
129
-
130
-
131
- # ── Statistics ───────────────────────────────────────────────────
132
- def compute_stats(text: str, spans: list[dict]) -> dict:
133
  total = len(text)
134
  pii_chars = sum(s["end"] - s["start"] for s in spans)
135
- by_cat: dict[str, dict] = {}
136
  for s in spans:
137
  c = s["label"]
138
  by_cat.setdefault(c, {"count": 0, "chars": 0})
139
- by_cat[c]["count"] += 1
140
- by_cat[c]["chars"] += s["end"] - s["start"]
141
  return {
142
- "total_chars": total,
143
- "pii_chars": pii_chars,
144
  "pii_percentage": round(pii_chars / total * 100, 1) if total else 0,
145
- "total_spans": len(spans),
146
- "categories": by_cat,
147
- "num_categories": len(by_cat),
148
  }
149
 
150
 
151
- def detect_speakers(text: str, spans: list[dict]) -> dict:
152
- """Simple speaker detection for transcripts."""
153
- patterns = [
154
- r"^([A-Z][a-zA-Z ]{1,30}):\s",
155
- r"^\[([^\]]{1,30})\]\s",
156
- r"^(Speaker\s*\d+):\s",
157
- ]
158
- line_speakers = []
159
- pos, cur_speaker = 0, None
160
  for line in text.split("\n"):
161
- for pat in patterns:
162
- m = re.match(pat, line)
163
- if m:
164
- cur_speaker = m.group(1).strip()
165
- break
166
- line_speakers.append((pos, pos + len(line), cur_speaker))
167
- pos += len(line) + 1
168
-
169
- result: dict[str, int] = {}
170
  for span in spans:
171
  mid = (span["start"] + span["end"]) // 2
172
  speaker = "Document"
173
- for ls, le, sp in line_speakers:
174
- if ls <= mid <= le and sp:
175
- speaker = sp
176
- break
177
  result[speaker] = result.get(speaker, 0) + 1
 
 
178
 
179
- if len(result) <= 1 and "Document" in result:
180
- return {}
181
- return result
 
 
 
 
182
 
183
 
184
  # ── Gradio Server ────────────────────────────────────────────────
185
- app = gr.Server()
186
 
187
 
188
- @app.get("/", response_class=HTMLResponse)
189
  async def homepage():
190
  return FRONTEND_HTML
191
 
192
 
193
- @app.post("/api/analyze")
194
  async def analyze_document(file: UploadFile = File(...)):
195
  suffix = Path(file.filename).suffix.lower()
196
  if suffix not in (".pdf", ".doc", ".docx"):
197
  return JSONResponse({"error": f"Unsupported: {suffix}. Use PDF, DOC, or DOCX."}, 400)
198
-
199
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
200
- tmp.write(await file.read())
201
- tmp_path = tmp.name
202
-
203
  try:
204
  text = extract_text(tmp_path)
205
  if not text.strip():
206
  return JSONResponse({"error": "No text content found."}, 400)
207
- spans = detect_pii(text)
208
- stats = compute_stats(text, spans)
209
- speakers = detect_speakers(text, spans)
210
  return JSONResponse({
211
- "filename": file.filename,
212
- "text": text,
213
- "spans": spans,
214
- "stats": stats,
215
- "speakers": speakers,
216
  "categories_meta": {k: {"color": v["color"], "bg": v["bg"], "label": v["label"]}
217
- for k, v in CATEGORIES.items()},
218
  })
219
  except Exception as e:
220
  return JSONResponse({"error": str(e)}, 500)
221
  finally:
222
- if os.path.exists(tmp_path):
223
- os.unlink(tmp_path)
224
 
225
 
226
- @app.api(name="analyze_text")
227
  def analyze_text_api(text: str) -> str:
228
- """Gradio API: analyze raw text for PII (for programmatic access)."""
229
- spans = detect_pii(text)
230
- stats = compute_stats(text, spans)
231
- return json.dumps({"spans": spans, "stats": stats}, ensure_ascii=False)
232
 
233
 
234
- # ── Frontend ─────────────────────────────────────────────────────
235
  FRONTEND_HTML = r"""<!DOCTYPE html>
236
  <html lang="en">
237
  <head>
238
  <meta charset="UTF-8">
239
  <meta name="viewport" content="width=device-width,initial-scale=1">
240
- <title>PII Reveal - Document Privacy Explorer</title>
241
  <link rel="preconnect" href="https://fonts.googleapis.com">
242
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap" rel="stylesheet">
243
  <style>
@@ -248,13 +695,10 @@ FRONTEND_HTML = r"""<!DOCTYPE html>
248
  --primary:#6366f1;--primary-light:#e0e7ff;
249
  --radius:12px;--radius-sm:8px;--shadow:0 1px 3px rgba(0,0,0,.08);
250
  --shadow-lg:0 8px 32px rgba(0,0,0,.12);
251
- --person:#ef4444;--address:#06b6d4;--email:#3b82f6;--phone:#22c55e;
252
- --url:#eab308;--date:#a855f7;--account:#f97316;--secret:#dc2626;
253
  }
254
  body{font-family:'Inter',system-ui,sans-serif;background:var(--bg);color:var(--text);min-height:100vh;line-height:1.6}
255
- a{color:var(--primary)}
256
 
257
- /* ─ Upload View ─ */
258
  #upload-view{display:flex;flex-direction:column;align-items:center;justify-content:center;min-height:100vh;padding:2rem}
259
  .upload-card{background:var(--surface);border-radius:20px;padding:3rem;max-width:640px;width:100%;text-align:center;box-shadow:var(--shadow-lg);position:relative;overflow:hidden}
260
  .upload-card::before{content:'';position:absolute;inset:-2px;border-radius:22px;background:linear-gradient(135deg,var(--primary),#ec4899,var(--primary));z-index:-1;opacity:0;transition:opacity .3s}
@@ -276,7 +720,7 @@ a{color:var(--primary)}
276
  .feature-desc{color:var(--text2);font-size:.75rem;line-height:1.4}
277
  .powered-by{margin-top:1.5rem;font-size:.8rem;color:var(--text3)}
278
 
279
- /* ─ Results View ─ */
280
  #results-view{display:none;min-height:100vh}
281
  .top-bar{background:var(--surface);border-bottom:1px solid var(--border);padding:.75rem 1.5rem;display:flex;align-items:center;gap:1rem;position:sticky;top:0;z-index:100;box-shadow:var(--shadow)}
282
  .top-bar .brand{margin:0}
@@ -284,12 +728,10 @@ a{color:var(--primary)}
284
  .top-bar .brand-icon{width:32px;height:32px;font-size:1rem}
285
  .file-info{font-size:.85rem;color:var(--text2);margin-left:.5rem;flex:1}
286
  .btn{padding:.5rem 1rem;border-radius:var(--radius-sm);border:none;cursor:pointer;font-weight:600;font-size:.85rem;transition:all .15s}
287
- .btn-primary{background:var(--primary);color:#fff}
288
- .btn-primary:hover{background:#4f46e5}
289
  .btn-ghost{background:transparent;color:var(--text2);border:1px solid var(--border)}
290
  .btn-ghost:hover{background:var(--surface2)}
291
 
292
- /* ─ Summary Strip ─ */
293
  .summary-strip{background:var(--surface);border-bottom:1px solid var(--border);padding:1rem 1.5rem;display:flex;align-items:center;gap:1.5rem;flex-wrap:wrap}
294
  .stat-big{text-align:center;min-width:80px}
295
  .stat-big .num{font-size:1.75rem;font-weight:800;color:var(--primary)}
@@ -301,28 +743,26 @@ a{color:var(--primary)}
301
  .category-chips{display:flex;flex-wrap:wrap;gap:.35rem}
302
  .chip{display:inline-flex;align-items:center;gap:.35rem;padding:.2rem .6rem;border-radius:20px;font-size:.75rem;font-weight:600;border:1.5px solid}
303
 
304
- /* ─ Main Layout ─ */
305
  .main-layout{display:flex;height:calc(100vh - 130px)}
306
  .doc-panel{flex:1;overflow-y:auto;padding:2rem;background:var(--bg)}
307
  .doc-content{background:var(--surface);border-radius:var(--radius);padding:2rem 2.5rem;max-width:900px;margin:0 auto;box-shadow:var(--shadow);font-size:.95rem;line-height:1.8;white-space:pre-wrap;word-wrap:break-word}
308
 
309
- /* ─ PII Highlights ─ */
310
  .pii{border-radius:3px;padding:1px 2px;cursor:pointer;transition:all .15s;position:relative;border-bottom:2px solid}
311
  .pii:hover{filter:brightness(.92)}
312
  .pii.dimmed{opacity:.15;border-bottom-color:transparent!important}
313
- .pii-private_person{background:rgba(239,68,68,.15);border-bottom-color:var(--person);color:#991b1b}
314
- .pii-private_address{background:rgba(6,182,212,.15);border-bottom-color:var(--address);color:#155e75}
315
- .pii-private_email{background:rgba(59,130,246,.15);border-bottom-color:var(--email);color:#1e40af}
316
- .pii-private_phone{background:rgba(34,197,94,.15);border-bottom-color:var(--phone);color:#166534}
317
- .pii-private_url{background:rgba(234,179,8,.15);border-bottom-color:var(--url);color:#854d0e}
318
- .pii-private_date{background:rgba(168,85,247,.15);border-bottom-color:var(--date);color:#6b21a8}
319
- .pii-account_number{background:rgba(249,115,22,.15);border-bottom-color:var(--account);color:#9a3412}
320
- .pii-secret{background:rgba(220,38,38,.15);border-bottom-color:var(--secret);color:#991b1b}
321
-
322
- /* ─ Tooltip ─ */
323
  .pii-tooltip{position:fixed;background:#1e293b;color:#fff;padding:.4rem .7rem;border-radius:6px;font-size:.75rem;font-weight:500;pointer-events:none;z-index:999;white-space:nowrap;box-shadow:0 4px 12px rgba(0,0,0,.2)}
324
 
325
- /* ─ Sidebar ─ */
326
  .sidebar{width:300px;background:var(--surface);border-left:1px solid var(--border);overflow-y:auto;padding:1.25rem;flex-shrink:0}
327
  .sidebar h3{font-size:.7rem;text-transform:uppercase;letter-spacing:.8px;color:var(--text3);margin-bottom:.75rem;font-weight:700}
328
  .filter-group{margin-bottom:1.5rem}
@@ -336,35 +776,26 @@ a{color:var(--primary)}
336
  .filter-label{flex:1;font-size:.85rem;font-weight:500}
337
  .filter-count{font-size:.75rem;color:var(--text3);font-weight:600;background:var(--surface2);padding:.1rem .45rem;border-radius:10px}
338
 
339
- /* ─ Loading ─ */
340
  #loading{position:fixed;inset:0;background:rgba(255,255,255,.85);backdrop-filter:blur(8px);display:none;flex-direction:column;align-items:center;justify-content:center;z-index:9999}
341
  .spinner{width:48px;height:48px;border:4px solid var(--border);border-top-color:var(--primary);border-radius:50%;animation:spin .8s linear infinite}
342
  @keyframes spin{to{transform:rotate(360deg)}}
343
  #loading p{margin-top:1rem;font-weight:600;color:var(--text2)}
344
  .progress-text{font-size:.85rem;color:var(--text3);margin-top:.5rem}
 
345
 
346
- /* ─ Error ─ */
347
- .error-banner{background:#fef2f2;border:1px solid #fecaca;color:#991b1b;padding:1rem 1.5rem;border-radius:var(--radius-sm);margin:1rem;font-size:.9rem;display:none;align-items:center;gap:.5rem}
348
-
349
- /* ─ Responsive ─ */
350
  @media(max-width:768px){
351
  .main-layout{flex-direction:column-reverse;height:auto}
352
  .sidebar{width:100%;border-left:none;border-top:1px solid var(--border)}
353
  .features{grid-template-columns:1fr}
354
- .summary-strip{flex-direction:column;align-items:stretch}
355
- .stat-divider{width:100%;height:1px}
356
  }
357
  </style>
358
  </head>
359
  <body>
360
 
361
- <!-- ─── Upload View ─── -->
362
  <div id="upload-view">
363
  <div class="upload-card">
364
- <div class="brand">
365
- <div class="brand-icon">&#x1f50d;</div>
366
- <h1>PII Reveal</h1>
367
- </div>
368
  <p class="subtitle">Document Privacy Explorer</p>
369
  <div class="dropzone" id="dropzone">
370
  <div class="dropzone-icon">&#x1f4c4;</div>
@@ -373,36 +804,21 @@ a{color:var(--primary)}
373
  <input type="file" id="file-input" accept=".pdf,.doc,.docx">
374
  </div>
375
  <div class="features">
376
- <div class="feature">
377
- <div class="feature-title">8 PII Categories</div>
378
- <div class="feature-desc">Names, addresses, emails, phones, URLs, dates, accounts, secrets</div>
379
- </div>
380
- <div class="feature">
381
- <div class="feature-title">128k Context</div>
382
- <div class="feature-desc">Full documents in one pass &mdash; no chunking artifacts</div>
383
- </div>
384
- <div class="feature">
385
- <div class="feature-title">Context-Aware</div>
386
- <div class="feature-desc">Understands when "May" is a name vs. a month</div>
387
- </div>
388
  </div>
389
  <div class="powered-by">Powered by <strong>OpenAI Privacy Filter</strong> &middot; Apache 2.0</div>
390
  </div>
391
  </div>
392
 
393
- <!-- ─── Results View ─── -->
394
  <div id="results-view">
395
  <div class="top-bar">
396
- <div class="brand">
397
- <div class="brand-icon">&#x1f50d;</div>
398
- <h1>PII Reveal</h1>
399
- </div>
400
  <div class="file-info" id="file-info"></div>
401
  <button class="btn btn-ghost" onclick="resetView()">New File</button>
402
  </div>
403
-
404
  <div class="error-banner" id="error-banner"></div>
405
-
406
  <div class="summary-strip" id="summary-strip">
407
  <div class="stat-big"><div class="num" id="stat-pct">0%</div><div class="lbl">PII Content</div></div>
408
  <div class="stat-divider"></div>
@@ -410,303 +826,114 @@ a{color:var(--primary)}
410
  <div class="stat-divider"></div>
411
  <div class="stat-big"><div class="num" id="stat-cats">0</div><div class="lbl">Categories</div></div>
412
  <div class="stat-divider"></div>
413
- <div class="stat-bar">
414
- <div class="stat-bar-track" id="stat-bar-track"></div>
415
- <div class="category-chips" id="category-chips"></div>
416
- </div>
417
  </div>
418
-
419
  <div class="main-layout">
420
- <div class="doc-panel">
421
- <div class="doc-content" id="doc-content"></div>
422
- </div>
423
  <div class="sidebar">
424
- <div class="filter-group">
425
- <h3>PII Categories</h3>
426
- <div id="category-filters"></div>
427
- </div>
428
- <div class="filter-group" id="speaker-group" style="display:none">
429
- <h3>Speakers</h3>
430
- <div id="speaker-filters"></div>
431
- </div>
432
  </div>
433
  </div>
434
  </div>
435
 
436
- <!-- ─── Loading Overlay ─── -->
437
- <div id="loading">
438
- <div class="spinner"></div>
439
- <p>Analyzing document for PII&hellip;</p>
440
- <div class="progress-text">Running OpenAI Privacy Filter (128k context)</div>
441
- </div>
442
-
443
- <!-- ─── Tooltip ─── -->
444
  <div class="pii-tooltip" id="tooltip" style="display:none"></div>
445
 
446
  <script>
447
- // ── State ──
448
- let STATE = { text: '', spans: [], stats: {}, speakers: {}, activeCategories: new Set(), activeSpeakers: new Set(), categoriesMeta: {} };
449
-
450
- const CATEGORY_LABELS = {
451
- private_person: 'Person', private_address: 'Address', private_email: 'Email',
452
- private_phone: 'Phone', private_url: 'URL', private_date: 'Date',
453
- account_number: 'Account', secret: 'Secret'
454
- };
455
- const CATEGORY_COLORS = {
456
- private_person:'#ef4444', private_address:'#06b6d4', private_email:'#3b82f6',
457
- private_phone:'#22c55e', private_url:'#eab308', private_date:'#a855f7',
458
- account_number:'#f97316', secret:'#dc2626'
459
- };
460
-
461
- // ── Upload Handling ──
462
- const dropzone = document.getElementById('dropzone');
463
- const fileInput = document.getElementById('file-input');
464
-
465
- ['dragenter','dragover'].forEach(e => dropzone.addEventListener(e, ev => { ev.preventDefault(); dropzone.classList.add('dragover'); }));
466
- ['dragleave','drop'].forEach(e => dropzone.addEventListener(e, ev => { ev.preventDefault(); dropzone.classList.remove('dragover'); }));
467
-
468
- dropzone.addEventListener('drop', ev => {
469
- const file = ev.dataTransfer.files[0];
470
- if (file) uploadFile(file);
471
- });
472
- fileInput.addEventListener('change', ev => {
473
- const file = ev.target.files[0];
474
- if (file) uploadFile(file);
475
- });
476
-
477
- async function uploadFile(file) {
478
- const ext = file.name.split('.').pop().toLowerCase();
479
- if (!['pdf','doc','docx'].includes(ext)) {
480
- showError('Unsupported file type. Please use PDF, DOC, or DOCX.');
481
- return;
482
- }
483
-
484
- document.getElementById('loading').style.display = 'flex';
485
- document.getElementById('upload-view').style.display = 'none';
486
-
487
- const form = new FormData();
488
- form.append('file', file);
489
-
490
- try {
491
- const resp = await fetch('/api/analyze', { method: 'POST', body: form });
492
- const data = await resp.json();
493
-
494
- if (data.error) {
495
- showError(data.error);
496
- return;
497
- }
498
-
499
- STATE.text = data.text;
500
- STATE.spans = data.spans;
501
- STATE.stats = data.stats;
502
- STATE.speakers = data.speakers || {};
503
- STATE.categoriesMeta = data.categories_meta || {};
504
- STATE.activeCategories = new Set(Object.keys(data.stats.categories));
505
- STATE.activeSpeakers = new Set(Object.keys(data.speakers));
506
-
507
- renderResults(data.filename);
508
- } catch (err) {
509
- showError('Analysis failed: ' + err.message);
510
- } finally {
511
- document.getElementById('loading').style.display = 'none';
512
- }
513
  }
514
-
515
- function showError(msg) {
516
- document.getElementById('loading').style.display = 'none';
517
- document.getElementById('results-view').style.display = 'block';
518
- const banner = document.getElementById('error-banner');
519
- banner.textContent = msg;
520
- banner.style.display = 'flex';
 
521
  }
522
-
523
- function resetView() {
524
- document.getElementById('results-view').style.display = 'none';
525
- document.getElementById('upload-view').style.display = 'flex';
526
- document.getElementById('error-banner').style.display = 'none';
527
- fileInput.value = '';
528
- }
529
-
530
- // ── Render Results ──
531
- function renderResults(filename) {
532
- document.getElementById('results-view').style.display = 'block';
533
- document.getElementById('error-banner').style.display = 'none';
534
-
535
- // File info
536
- document.getElementById('file-info').textContent = filename;
537
-
538
- // Summary stats
539
- renderSummary();
540
-
541
- // Filters
542
- renderCategoryFilters();
543
- renderSpeakerFilters();
544
-
545
- // Document
546
- renderDocument();
547
  }
548
-
549
- function renderSummary() {
550
- const s = STATE.stats;
551
- document.getElementById('stat-pct').textContent = s.pii_percentage + '%';
552
- document.getElementById('stat-spans').textContent = s.total_spans;
553
- document.getElementById('stat-cats').textContent = s.num_categories;
554
-
555
- // Bar
556
- const track = document.getElementById('stat-bar-track');
557
- track.innerHTML = '';
558
- const cats = s.categories;
559
- const total = s.pii_chars || 1;
560
- for (const [cat, info] of Object.entries(cats)) {
561
- const pct = (info.chars / s.total_chars * 100);
562
- const seg = document.createElement('div');
563
- seg.className = 'stat-bar-fill';
564
- seg.style.width = pct + '%';
565
- seg.style.background = CATEGORY_COLORS[cat] || '#888';
566
- track.appendChild(seg);
567
- }
568
-
569
- // Chips
570
- const chips = document.getElementById('category-chips');
571
- chips.innerHTML = '';
572
- for (const [cat, info] of Object.entries(cats)) {
573
- const color = CATEGORY_COLORS[cat] || '#888';
574
- const label = CATEGORY_LABELS[cat] || cat;
575
- const chip = document.createElement('span');
576
- chip.className = 'chip';
577
- chip.style.color = color;
578
- chip.style.borderColor = color;
579
- chip.style.background = color + '15';
580
- chip.textContent = label + ' ' + info.count;
581
- chips.appendChild(chip);
582
  }
583
  }
584
-
585
- function renderCategoryFilters() {
586
- const container = document.getElementById('category-filters');
587
- container.innerHTML = '';
588
- const cats = STATE.stats.categories;
589
-
590
- for (const cat of Object.keys(CATEGORY_LABELS)) {
591
- const info = cats[cat];
592
- if (!info) continue;
593
- const color = CATEGORY_COLORS[cat];
594
- const label = CATEGORY_LABELS[cat];
595
-
596
- const item = document.createElement('label');
597
- item.className = 'filter-item';
598
- item.style.color = color;
599
- item.innerHTML = `
600
- <input type="checkbox" data-cat="${cat}" ${STATE.activeCategories.has(cat)?'checked':''}>
601
- <span class="filter-check"></span>
602
- <span class="filter-dot" style="background:${color}"></span>
603
- <span class="filter-label" style="color:var(--text)">${label}</span>
604
- <span class="filter-count">${info.count}</span>
605
- `;
606
- item.querySelector('input').addEventListener('change', ev => {
607
- if (ev.target.checked) STATE.activeCategories.add(cat);
608
- else STATE.activeCategories.delete(cat);
609
- renderDocument();
610
- });
611
- container.appendChild(item);
612
  }
613
  }
614
-
615
- function renderSpeakerFilters() {
616
- const speakers = STATE.speakers;
617
- const group = document.getElementById('speaker-group');
618
- const container = document.getElementById('speaker-filters');
619
-
620
- if (!speakers || Object.keys(speakers).length === 0) {
621
- group.style.display = 'none';
622
- return;
 
623
  }
624
- group.style.display = 'block';
625
- container.innerHTML = '';
626
-
627
- for (const [speaker, count] of Object.entries(speakers)) {
628
- const item = document.createElement('label');
629
- item.className = 'filter-item';
630
- item.innerHTML = `
631
- <input type="checkbox" data-speaker="${speaker}" ${STATE.activeSpeakers.has(speaker)?'checked':''}>
632
- <span class="filter-check" style="color:var(--primary)"></span>
633
- <span class="filter-label">${speaker}</span>
634
- <span class="filter-count">${count}</span>
635
- `;
636
- item.querySelector('input').addEventListener('change', ev => {
637
- if (ev.target.checked) STATE.activeSpeakers.add(speaker);
638
- else STATE.activeSpeakers.delete(speaker);
639
- renderDocument();
640
- });
641
- container.appendChild(item);
642
- }
643
- }
644
-
645
- // ── Document Rendering ──
646
- function escapeHtml(str) {
647
- const div = document.createElement('div');
648
- div.textContent = str;
649
- return div.innerHTML;
650
- }
651
-
652
- function renderDocument() {
653
- const { text, spans } = STATE;
654
- const active = STATE.activeCategories;
655
-
656
- // Sort spans by start position
657
- const sorted = [...spans].sort((a, b) => a.start - b.start);
658
-
659
- let html = '';
660
- let pos = 0;
661
-
662
- for (const span of sorted) {
663
- if (span.start < pos) continue; // skip overlapping
664
-
665
- // Text before span
666
- if (span.start > pos) {
667
- html += escapeHtml(text.substring(pos, span.start));
668
- }
669
-
670
- const isActive = active.has(span.label);
671
- const cls = isActive ? `pii pii-${span.label}` : `pii pii-${span.label} dimmed`;
672
- const spanText = escapeHtml(text.substring(span.start, span.end));
673
- html += `<span class="${cls}" data-label="${span.label}" data-text="${escapeHtml(span.text)}">${spanText}</span>`;
674
- pos = span.end;
675
- }
676
-
677
- // Remaining text
678
- if (pos < text.length) {
679
- html += escapeHtml(text.substring(pos));
680
- }
681
-
682
- document.getElementById('doc-content').innerHTML = html;
683
- attachTooltips();
684
- }
685
-
686
- // ── Tooltips ──
687
- function attachTooltips() {
688
- const tooltip = document.getElementById('tooltip');
689
- document.querySelectorAll('.pii').forEach(el => {
690
- el.addEventListener('mouseenter', ev => {
691
- const label = CATEGORY_LABELS[el.dataset.label] || el.dataset.label;
692
- tooltip.textContent = label + ': ' + el.dataset.text;
693
- tooltip.style.display = 'block';
694
- positionTooltip(ev);
695
- });
696
- el.addEventListener('mousemove', positionTooltip);
697
- el.addEventListener('mouseleave', () => { tooltip.style.display = 'none'; });
698
  });
699
  }
700
-
701
- function positionTooltip(ev) {
702
- const tt = document.getElementById('tooltip');
703
- tt.style.left = ev.clientX + 12 + 'px';
704
- tt.style.top = ev.clientY - 36 + 'px';
705
- }
706
  </script>
707
  </body>
708
  </html>"""
709
 
710
- # ── Launch ───────────────────────────────────────────────────────
711
  if __name__ == "__main__":
712
- app.launch(server_name="0.0.0.0", server_port=7860)
 
1
  """
2
  PII Reveal - Document Privacy Explorer
3
  =======================================
4
+ Backend : gr.Server (Gradio + FastAPI)
5
+ Frontend: Custom HTML / CSS / JS
6
+ Model : charles-first-org/second-model (OpenAI Privacy Filter)
 
 
 
7
  """
8
 
9
+ # ── stdlib ───────────────────────────────────────────────────────
10
+ import dataclasses
11
+ import functools
12
+ import json
13
+ import math
14
  import os
15
  import re
 
16
  import tempfile
17
+ from bisect import bisect_left, bisect_right
18
+ from collections.abc import Sequence
19
+ from dataclasses import dataclass
20
  from pathlib import Path
21
+ from typing import Final
22
 
23
+ # ── third-party ──────────────────────────────────────────────────
24
  import gradio as gr
25
+ import spaces
26
+ import tiktoken
27
+ import torch
28
+ import torch.nn.functional as F
29
  from fastapi import UploadFile, File
30
  from fastapi.responses import HTMLResponse, JSONResponse
31
+ from huggingface_hub import snapshot_download
32
+ from safetensors import safe_open
33
 
34
+ # ── configuration ────────────────────────────────────────────────
35
+ MODEL_REPO = os.getenv("MODEL_ID", "charles-first-org/second-model")
36
  HF_TOKEN = os.getenv("HF_TOKEN", None)
37
+ MODEL_DIR = Path(snapshot_download(MODEL_REPO, token=HF_TOKEN))
38
 
39
+ CATEGORIES_META = {
40
  "private_person": {"color": "#ef4444", "bg": "rgba(239,68,68,0.15)", "label": "Person"},
41
  "private_address": {"color": "#06b6d4", "bg": "rgba(6,182,212,0.15)", "label": "Address"},
42
  "private_email": {"color": "#3b82f6", "bg": "rgba(59,130,246,0.15)", "label": "Email"},
 
47
  "secret": {"color": "#dc2626", "bg": "rgba(220,38,38,0.15)", "label": "Secret"},
48
  }
49
 
50
+ # =====================================================================
51
+ # MODEL ARCHITECTURE + INFERENCE (from reference implementation)
52
+ # =====================================================================
53
+
54
+ PRIVACY_FILTER_MODEL_TYPE: Final[str] = "privacy_filter"
55
+ REQUIRED_MODEL_CONFIG_KEYS: Final[tuple[str, ...]] = (
56
+ "model_type", "encoding", "num_hidden_layers", "num_experts",
57
+ "experts_per_token", "vocab_size", "num_labels", "hidden_size",
58
+ "intermediate_size", "head_dim", "num_attention_heads",
59
+ "num_key_value_heads", "sliding_window", "bidirectional_context",
60
+ "bidirectional_left_context", "bidirectional_right_context",
61
+ "default_n_ctx", "initial_context_length", "rope_theta",
62
+ "rope_scaling_factor", "rope_ntk_alpha", "rope_ntk_beta", "param_dtype",
63
+ )
64
+ BACKGROUND_CLASS_LABEL: Final[str] = "O"
65
+ BOUNDARY_PREFIXES: Final[tuple[str, ...]] = ("B", "I", "E", "S")
66
+ SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (
67
+ BACKGROUND_CLASS_LABEL,
68
+ "account_number", "private_address", "private_date", "private_email",
69
+ "private_person", "private_phone", "private_url", "secret",
70
+ )
71
+ NER_CLASS_NAMES: Final[tuple[str, ...]] = (BACKGROUND_CLASS_LABEL,) + tuple(
72
+ f"{prefix}-{base}"
73
+ for base in SPAN_CLASS_NAMES if base != BACKGROUND_CLASS_LABEL
74
+ for prefix in BOUNDARY_PREFIXES
75
+ )
76
+ VITERBI_TRANSITION_BIAS_KEYS: Final[tuple[str, ...]] = (
77
+ "transition_bias_background_stay", "transition_bias_background_to_start",
78
+ "transition_bias_inside_to_continue", "transition_bias_inside_to_end",
79
+ "transition_bias_end_to_background", "transition_bias_end_to_start",
80
+ )
81
+ DEFAULT_VITERBI_CALIBRATION_PRESET: Final[str] = "default"
82
+
83
+
84
def validate_model_config_contract(cfg: dict, *, context: str) -> None:
    """Check that a checkpoint config satisfies what this runtime expects.

    Args:
        cfg: Parsed ``config.json`` contents.
        context: Prefix used in error messages (callers pass the checkpoint path).

    Raises:
        ValueError: If a required key is absent, the model type is wrong,
            bidirectional context is not enabled or not symmetric, the sliding
            window does not equal ``2 * context + 1``, ``num_labels`` is not 33,
            or ``param_dtype`` is not ``"bfloat16"``.
    """
    absent = [key for key in REQUIRED_MODEL_CONFIG_KEYS if key not in cfg]
    if absent:
        raise ValueError(f"{context} missing keys: {', '.join(absent)}")

    if cfg.get("model_type") != PRIVACY_FILTER_MODEL_TYPE:
        raise ValueError(f"{context} model_type must be {PRIVACY_FILTER_MODEL_TYPE!r}")

    # Must be the literal True, not merely truthy.
    if cfg.get("bidirectional_context") is not True:
        raise ValueError(f"{context} must use bidirectional_context=true")

    left = cfg.get("bidirectional_left_context")
    right = cfg.get("bidirectional_right_context")
    symmetric = (
        isinstance(left, int)
        and isinstance(right, int)
        and left == right
        and left >= 0
    )
    if not symmetric:
        raise ValueError(f"{context} bidirectional context must be equal non-negative ints")

    if cfg.get("sliding_window") != 2 * left + 1:
        raise ValueError(f"{context} sliding_window must equal 2*context+1")

    if cfg["num_labels"] != 33:
        raise ValueError(f"{context} num_labels must be 33")

    if cfg["param_dtype"] != "bfloat16":
        raise ValueError(f"{context} param_dtype must be bfloat16")
102
+
103
+
104
+ # ── model helpers ────────────────────────────────────────────────
105
+
106
+ def expert_linear(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None) -> torch.Tensor:
107
+ n, e, k = x.shape
108
+ _, _, _, o = weight.shape
109
+ out = torch.bmm(x.reshape(n * e, 1, k), weight.reshape(n * e, k, o)).reshape(n, e, o)
110
+ return out + bias if bias is not None else out
111
+
112
+
113
@dataclass
class ModelConfig:
    """Subset of the checkpoint configuration needed to build the network."""

    num_hidden_layers: int
    num_experts: int
    experts_per_token: int
    vocab_size: int
    num_labels: int
    hidden_size: int
    intermediate_size: int
    head_dim: int
    num_attention_heads: int
    num_key_value_heads: int
    bidirectional_context_size: int
    initial_context_length: int
    rope_theta: float
    rope_scaling_factor: float
    rope_ntk_alpha: float
    rope_ntk_beta: float

    @classmethod
    def from_checkpoint_config(cls, cfg: dict, *, context: str) -> "ModelConfig":
        """Build a ModelConfig from a parsed ``config.json`` dictionary.

        ``bidirectional_context_size`` is taken from
        ``cfg["bidirectional_left_context"]``; keys that are not dataclass
        fields are ignored. The input dictionary is not modified.
        ``context`` is accepted for signature parity but not used here.
        """
        merged = {**cfg, "bidirectional_context_size": cfg["bidirectional_left_context"]}
        known = {field.name for field in dataclasses.fields(cls)}
        selected = {}
        for key, value in merged.items():
            if key in known:
                selected[key] = value
        return cls(**selected)
127
+
128
+
129
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalisation over the last axis with a learned scale."""

    def __init__(self, n: int, eps: float = 1e-5, device=None):
        super().__init__()
        self.eps = eps
        # The scale is kept in float32 regardless of the activation dtype.
        initial_scale = torch.ones(n, device=device, dtype=torch.float32)
        self.scale = torch.nn.Parameter(initial_scale)

    def forward(self, x):
        """Normalise ``x`` in float32 and return it in the dtype of ``x``."""
        as_float = x.float()
        mean_square = as_float.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        normalised = as_float * inv_rms * self.scale
        return normalised.to(x.dtype)
138
+
139
+
140
def apply_rope(x, cos, sin):
    """Rotate interleaved (even, odd) feature pairs of ``x`` by ``cos``/``sin``.

    ``cos`` and ``sin`` get an extra axis inserted before their last one so they
    broadcast against ``x``, and are cast to the dtype of ``x``. The result has
    the same shape as ``x``.
    """
    cos_b = cos.unsqueeze(-2).to(x.dtype)
    sin_b = sin.unsqueeze(-2).to(x.dtype)
    even = x[..., ::2]
    odd = x[..., 1::2]
    rotated_even = even * cos_b - odd * sin_b
    rotated_odd = odd * cos_b + even * sin_b
    # Stacking on a new last axis and reshaping re-interleaves the pairs.
    return torch.stack((rotated_even, rotated_odd), dim=-1).reshape(x.shape)
144
+
145
+
146
class RotaryEmbedding(torch.nn.Module):
    """Rotary position embedding with cached cos/sin tables.

    The tables are built once for ``max(int(initial_context_length *
    scaling_factor), initial_context_length)`` positions and rebuilt on demand
    when a longer sequence is seen.
    """

    def __init__(self, head_dim, base, dtype, *, initial_context_length=4096,
                 scaling_factor=1.0, ntk_alpha=1.0, ntk_beta=32.0, device=None):
        super().__init__()
        self.head_dim, self.base, self.dtype = head_dim, base, dtype
        self.initial_context_length = initial_context_length
        self.scaling_factor, self.ntk_alpha, self.ntk_beta = scaling_factor, ntk_alpha, ntk_beta
        self.device = device
        mp = max(int(initial_context_length * scaling_factor), initial_context_length)
        self.max_position_embeddings = mp
        # Tables are always computed on CPU and then moved to the target device.
        cos, sin = self._compute(mp, device=torch.device("cpu"))
        target = device or torch.device("cpu")
        # Non-persistent buffers: they are not part of the state dict.
        self.register_buffer("cos_cache", cos.to(target), persistent=False)
        self.register_buffer("sin_cache", sin.to(target), persistent=False)

    def _inv_freq(self, device=None):
        """Return the per-pair inverse frequencies (length ``head_dim / 2``)."""
        device = device or self.device
        freq = self.base ** (torch.arange(0, self.head_dim, 2, dtype=torch.float, device=device) / self.head_dim)
        if self.scaling_factor > 1.0:
            d_half = self.head_dim / 2
            # `low`/`high` bound the index range over which the ramp goes from 0 to 1.
            low = d_half * math.log(self.initial_context_length / (self.ntk_beta * 2 * math.pi)) / math.log(self.base)
            high = d_half * math.log(self.initial_context_length / (self.ntk_alpha * 2 * math.pi)) / math.log(self.base)
            interp = 1.0 / (self.scaling_factor * freq)
            extrap = 1.0 / freq
            ramp = (torch.arange(d_half, dtype=torch.float32, device=device) - low) / (high - low)
            # mask == 1 keeps the unscaled frequency, mask == 0 uses the scaled one.
            mask = 1 - ramp.clamp(0, 1)
            return interp * (1 - mask) + extrap * mask
        return 1.0 / freq

    def _compute(self, n, device=None):
        """Return ``(cos, sin)`` tables for positions ``0..n-1`` in ``self.dtype``."""
        inv_freq = self._inv_freq(device)
        t = torch.arange(n, dtype=torch.float32, device=device or self.device)
        # Outer product: one row per position, one column per frequency.
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        # Both tables are multiplied by this factor when scaling is active.
        c = 0.1 * math.log(self.scaling_factor) + 1.0 if self.scaling_factor > 1.0 else 1.0
        return (freqs.cos() * c).to(self.dtype), (freqs.sin() * c).to(self.dtype)

    def forward(self, q, k):
        """Apply the rotation to ``q`` and ``k``; dim 0 of ``q`` is the position axis."""
        n = q.shape[0]
        if n > self.cos_cache.shape[0]:
            # Sequence is longer than the cache: rebuild the tables for n positions.
            cos, sin = self._compute(n, torch.device("cpu"))
            self.cos_cache, self.sin_cache = cos.to(q.device), sin.to(q.device)
        cc = self.cos_cache.to(q.device) if self.cos_cache.device != q.device else self.cos_cache
        sc = self.sin_cache.to(q.device) if self.sin_cache.device != q.device else self.sin_cache
        cos, sin = cc[:n], sc[:n]
        # View as (positions, heads, head_dim) for the rotation, then restore the shape.
        q = apply_rope(q.view(n, -1, self.head_dim), cos, sin).reshape(q.shape)
        k = apply_rope(k.view(n, -1, self.head_dim), cos, sin).reshape(k.shape)
        return q, k
193
+
194
+
195
def sdpa(Q, K, V, S, sm_scale, ctx):
    """Windowed attention over ``ctx`` positions on each side, with a sink logit.

    ``Q`` is unpacked as ``(n, nh, qm, hd)``; AttentionBlock passes ``K`` and
    ``V`` as ``(n, nh, hd)``. ``S`` supplies ``nh * qm`` sink logits. Returns a
    tensor reshaped to ``(n, -1)``.
    """
    n, nh, qm, hd = Q.shape
    w = 2 * ctx + 1
    # Pad dim 0 by ctx on both ends so every position has a full window.
    Kp = F.pad(K, (0, 0, 0, 0, ctx, ctx)); Vp = F.pad(V, (0, 0, 0, 0, ctx, ctx))
    # unfold yields n windows of length w; permute moves the window axis to dim 1.
    Kw = Kp.unfold(0, w, 1).permute(0, 3, 1, 2); Vw = Vp.unfold(0, w, 1).permute(0, 3, 1, 2)
    idx = torch.arange(w, device=Q.device) - ctx
    pos = torch.arange(n, device=Q.device)[:, None] + idx[None, :]
    # Window slots that fall outside [0, n) are padding and must not be attended to.
    valid = (pos >= 0) & (pos < n)
    scores = torch.einsum("nhqd,nwhd->nhqw", Q, Kw).float() * sm_scale
    scores = scores.masked_fill(~valid[:, None, None, :], -float("inf"))
    # One extra "sink" column per head takes part in the softmax ...
    sink = (S * math.log(2.0)).reshape(nh, qm)[None, :, :, None].expand(n, -1, -1, 1)
    scores = torch.cat([scores, sink], dim=-1)
    # ... and is dropped afterwards, so the kept weights can sum to less than 1.
    wt = torch.softmax(scores, dim=-1)[..., :-1].to(V.dtype)
    return torch.einsum("nhqw,nwhd->nhqd", wt, Vw).reshape(n, -1)
209
+
210
+
211
class AttentionBlock(torch.nn.Module):
    """Pre-norm windowed attention block with a residual connection."""

    def __init__(self, cfg: ModelConfig, device=None):
        super().__init__()
        dt = torch.bfloat16
        self.head_dim, self.nah, self.nkv = cfg.head_dim, cfg.num_attention_heads, cfg.num_key_value_heads
        self.ctx = int(cfg.bidirectional_context_size)
        # Allocated uninitialised; Transformer.from_checkpoint copies real values in.
        self.sinks = torch.nn.Parameter(torch.empty(cfg.num_attention_heads, device=device, dtype=torch.float32))
        self.norm = RMSNorm(cfg.hidden_size, device=device)
        # A single projection produces queries, keys and values side by side.
        qkv_d = cfg.head_dim * (cfg.num_attention_heads + 2 * cfg.num_key_value_heads)
        self.qkv = torch.nn.Linear(cfg.hidden_size, qkv_d, device=device, dtype=dt)
        self.out = torch.nn.Linear(cfg.head_dim * cfg.num_attention_heads, cfg.hidden_size, device=device, dtype=dt)
        # head_dim ** -0.25, applied to q and k separately in forward.
        self.qk_scale = 1 / math.sqrt(math.sqrt(cfg.head_dim))
        self.rope = RotaryEmbedding(cfg.head_dim, int(cfg.rope_theta), torch.float32,
                                    initial_context_length=cfg.initial_context_length,
                                    scaling_factor=cfg.rope_scaling_factor,
                                    ntk_alpha=cfg.rope_ntk_alpha, ntk_beta=cfg.rope_ntk_beta, device=device)

    def forward(self, x):
        """Return ``x`` plus the attention output; dim 0 of ``x`` is the token axis."""
        t = self.norm(x).to(self.qkv.weight.dtype)
        qkv = F.linear(t, self.qkv.weight, self.qkv.bias)
        hd, nah, nkv = self.head_dim, self.nah, self.nkv
        # Column layout of qkv: [queries | keys | values].
        q = qkv[:, :nah * hd].contiguous()
        k = qkv[:, nah * hd:(nah + nkv) * hd].contiguous()
        v = qkv[:, (nah + nkv) * hd:(nah + 2 * nkv) * hd].contiguous()
        q, k = self.rope(q, k)
        # Scaling both sides scales q.k by 1/sqrt(head_dim), so sdpa gets sm_scale=1.0.
        q, k = q * self.qk_scale, k * self.qk_scale
        n = q.shape[0]
        # Queries are grouped per key/value head: (n, nkv, nah // nkv, hd).
        q = q.view(n, nkv, nah // nkv, hd); k = k.view(n, nkv, hd); v = v.view(n, nkv, hd)
        ao = sdpa(q, k, v, self.sinks, 1.0, self.ctx).to(self.out.weight.dtype)
        return x + F.linear(ao, self.out.weight, self.out.bias).to(x.dtype)
241
+
242
+
243
def swiglu(x, alpha=1.702, limit=7.0):
    """Gated activation over the two halves of the last axis of ``x``.

    The first half is clamped from above at ``limit`` and passed through
    ``g * sigmoid(alpha * g)``; the second half is clamped to
    ``[-limit, limit]`` and offset by one before multiplying.
    """
    gate, linear = torch.chunk(x, 2, dim=-1)
    gate = torch.clamp(gate, max=limit)
    linear = torch.clamp(linear, min=-limit, max=limit)
    activated = gate * torch.sigmoid(alpha * gate)
    return activated * (linear + 1)
247
+
248
+
249
class MLPBlock(torch.nn.Module):
    """Pre-norm mixture-of-experts feed-forward block with a residual connection."""

    def __init__(self, cfg: ModelConfig, device=None):
        super().__init__()
        dt = torch.bfloat16
        self.ne, self.ept = cfg.num_experts, cfg.experts_per_token
        self.norm = RMSNorm(cfg.hidden_size, device=device)
        # Router: one score per expert for every token.
        self.gate = torch.nn.Linear(cfg.hidden_size, cfg.num_experts, device=device, dtype=dt)
        # Expert weights are allocated uninitialised and filled from the checkpoint.
        # mlp1 produces 2 * intermediate_size features because swiglu halves them.
        self.mlp1_weight = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.hidden_size, cfg.intermediate_size * 2, device=device, dtype=dt))
        self.mlp1_bias = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.intermediate_size * 2, device=device, dtype=dt))
        self.mlp2_weight = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.intermediate_size, cfg.hidden_size, device=device, dtype=dt))
        self.mlp2_bias = torch.nn.Parameter(torch.empty(cfg.num_experts, cfg.hidden_size, device=device, dtype=dt))

    def forward(self, x):
        """Return ``x`` plus the weighted output of each token's top experts."""
        t = self.norm(x)
        # Routing is done in float32.
        gs = F.linear(t.float(), self.gate.weight.float(), self.gate.bias.float())
        top = torch.topk(gs, k=self.ept, dim=-1, sorted=True)
        # Softmax over the selected experts only. The division by ept here is
        # undone by the multiplication by ept at the end of _chunk.
        ew = torch.softmax(top.values, dim=-1) / self.ept
        ei = top.indices
        ept = self.ept

        def _chunk(tc, eic, ewc):
            # Indexing the weights with eic gathers one weight matrix per (token, expert).
            o = expert_linear(tc.float().unsqueeze(1).expand(-1, eic.shape[1], -1),
                              self.mlp1_weight[eic].float(), self.mlp1_bias[eic].float())
            o = swiglu(o)
            o = expert_linear(o.float(), self.mlp2_weight[eic].float(), self.mlp2_bias[eic].float())
            # Weighted sum over the expert axis, cast back to the input dtype.
            return (torch.einsum("bec,be->bc", o.to(ewc.dtype), ewc) * ept).to(x.dtype)

        # Process at most 32 tokens at a time (presumably to bound the size of
        # the gathered per-token expert weights -- confirm).
        cs = 32
        if t.shape[0] > cs:
            parts = [_chunk(t[s:s+cs], ei[s:s+cs], ew[s:s+cs]) for s in range(0, t.shape[0], cs)]
            return x + torch.cat(parts, 0)
        return x + _chunk(t, ei, ew)
281
+
282
+
283
class TransformerBlock(torch.nn.Module):
    """One layer: an attention block followed by a mixture-of-experts block."""

    def __init__(self, cfg, device=None):
        super().__init__()
        self.attn = AttentionBlock(cfg, device=device)
        self.mlp = MLPBlock(cfg, device=device)

    def forward(self, x):
        """Run ``x`` through attention and then the MLP (each adds its own residual)."""
        attended = self.attn(x)
        return self.mlp(attended)
290
+
291
+
292
class Checkpoint:
    """Index of the tensors stored in a directory of ``.safetensors`` files."""

    @staticmethod
    def build_param_name_map(n):
        """Map module parameter names to their stored names for ``n`` layers.

        Only the four expert-MLP parameters of each block are renamed.
        """
        renames = (
            ("mlp1_bias", "swiglu.bias"),
            ("mlp1_weight", "swiglu.weight"),
            ("mlp2_bias", "out.bias"),
            ("mlp2_weight", "out.weight"),
        )
        return {
            f"block.{layer}.mlp.{attr}": f"block.{layer}.mlp.{stored}"
            for attr, stored in renames
            for layer in range(n)
        }

    def __init__(self, path, device, num_hidden_layers):
        """Record which file under ``path`` holds each tensor name."""
        self.pnm = self.build_param_name_map(num_hidden_layers)
        # Device string in the form safe_open is given: "cuda" or "cuda:0".
        if device.index is None:
            self.ds = device.type
        else:
            self.ds = f"{device.type}:{device.index}"
        self.map = {}
        for entry in os.listdir(path):
            if not entry.endswith(".safetensors"):
                continue
            shard = os.path.join(path, entry)
            with safe_open(shard, framework="pt", device=self.ds) as handle:
                for key in handle.keys():
                    self.map[key] = shard

    def get(self, name):
        """Load the tensor for module parameter ``name`` (after renaming)."""
        stored = self.pnm.get(name, name)
        shard = self.map[stored]
        with safe_open(shard, framework="pt", device=self.ds) as handle:
            return handle.get_tensor(stored)
314
+
315
+
316
class Transformer(torch.nn.Module):
    """Token classifier: embedding, a stack of TransformerBlocks, norm, label head."""

    def __init__(self, cfg, device):
        super().__init__()
        dt = torch.bfloat16
        self.embedding = torch.nn.Embedding(cfg.vocab_size, cfg.hidden_size, device=device, dtype=dt)
        self.block = torch.nn.ModuleList([TransformerBlock(cfg, device=device) for _ in range(cfg.num_hidden_layers)])
        self.norm = RMSNorm(cfg.hidden_size, device=device)
        # Projects hidden states to num_labels scores; no bias term.
        self.unembedding = torch.nn.Linear(cfg.hidden_size, cfg.num_labels, bias=False, device=device, dtype=dt)

    def forward(self, token_ids):
        """Return one row of label logits per input token id."""
        x = self.embedding(token_ids)
        for blk in self.block:
            x = blk(x)
        return F.linear(self.norm(x), self.unembedding.weight, None)

    @classmethod
    def from_checkpoint(cls, checkpoint_dir, *, device):
        """Build the model on ``device`` and load its weights from ``checkpoint_dir``.

        Raises:
            ValueError: If the config fails validation or a stored tensor's
                shape differs from the corresponding parameter.
        """
        # Process-wide settings: turn TF32 off and request full float32 matmuls.
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cudnn.allow_tf32 = False
        torch.set_float32_matmul_precision("highest")
        cp = json.loads((Path(checkpoint_dir) / "config.json").read_text())
        validate_model_config_contract(cp, context=str(checkpoint_dir))
        cfg = ModelConfig.from_checkpoint_config(cp, context=str(checkpoint_dir))
        ckpt = Checkpoint(checkpoint_dir, device, cfg.num_hidden_layers)
        m = cls(cfg, device); m.eval()
        # Every parameter is overwritten, including those created with torch.empty.
        for name, param in m.named_parameters():
            loaded = ckpt.get(name)
            if param.shape != loaded.shape:
                raise ValueError(f"Shape mismatch {name}: {param.shape} vs {loaded.shape}")
            param.data.copy_(loaded)
        return m
347
+
348
+
349
+ # ── label info + span decoding ───────────────────────────────────
350
+
351
@dataclass(frozen=True)
class LabelInfo:
    """Lookup tables relating token-level labels to span-level classes."""

    # span class name -> {boundary tag -> token label index}
    boundary_label_lookup: dict[str, dict[str, int]]
    # token label index -> span class index
    token_to_span_label: dict[int, int]
    # token label index -> boundary tag ("B"/"I"/"E"/"S"), or None for background
    token_boundary_tags: dict[int, str | None]
    # span class names, indexed by span class index
    span_class_names: tuple[str, ...]
    # span class name -> span class index
    span_label_lookup: dict[str, int]
    # token label index of the background class
    background_token_label: int
    # span class index of the background class
    background_span_label: int
360
+
361
+
362
def labels_to_spans(labels_by_index, label_info):
    """Turn per-token labels into ``(span_label, start, end)`` token spans.

    Args:
        labels_by_index: Mapping of token index -> token label index.
        label_info: Object providing ``token_to_span_label``,
            ``token_boundary_tags`` and ``background_span_label``.

    Returns:
        List of spans with an exclusive ``end`` token index, in the order they
        were closed. Malformed tag sequences are tolerated rather than rejected.
    """
    # cur_label/start_idx describe the span currently open, if any;
    # prev_idx is the last token index visited.
    spans, cur_label, start_idx, prev_idx = [], None, None, None
    bg = label_info.background_span_label
    for ti in sorted(labels_by_index):
        lid = labels_by_index[ti]
        sl = label_info.token_to_span_label.get(lid)
        bt = label_info.token_boundary_tags.get(lid)
        # A gap in the token indices closes whatever span was open.
        if prev_idx is not None and ti != prev_idx + 1:
            if cur_label is not None and start_idx is not None:
                spans.append((cur_label, start_idx, prev_idx + 1))
            cur_label = start_idx = None
        # Unknown label id: skip the token without touching the open span.
        if sl is None:
            prev_idx = ti; continue
        # Background token: the open span ends just before it.
        if sl == bg:
            if cur_label is not None and start_idx is not None:
                spans.append((cur_label, start_idx, ti))
            cur_label = start_idx = None; prev_idx = ti; continue
        if bt == "S":
            # Single-token span: flush any open span, then emit this token alone.
            if cur_label is not None and start_idx is not None and prev_idx is not None:
                spans.append((cur_label, start_idx, prev_idx + 1))
            spans.append((sl, ti, ti + 1)); cur_label = start_idx = None
        elif bt == "B":
            # Begin: flush any open span and open a new one here.
            if cur_label is not None and start_idx is not None and prev_idx is not None:
                spans.append((cur_label, start_idx, prev_idx + 1))
            cur_label, start_idx = sl, ti
        elif bt == "I":
            # Inside: continues a matching open span; otherwise it starts one.
            if cur_label is None or cur_label != sl:
                if cur_label is not None and start_idx is not None and prev_idx is not None:
                    spans.append((cur_label, start_idx, prev_idx + 1))
                cur_label, start_idx = sl, ti
        elif bt == "E":
            if cur_label is None or cur_label != sl or start_idx is None:
                # End without a matching open span: treat it as a single-token span.
                if cur_label is not None and start_idx is not None and prev_idx is not None:
                    spans.append((cur_label, start_idx, prev_idx + 1))
                spans.append((sl, ti, ti + 1)); cur_label = start_idx = None
            else:
                # Normal close: the span includes this token.
                spans.append((cur_label, start_idx, ti + 1)); cur_label = start_idx = None
        else:
            # Unrecognised boundary tag: flush and reset.
            if cur_label is not None and start_idx is not None and prev_idx is not None:
                spans.append((cur_label, start_idx, prev_idx + 1))
            cur_label = start_idx = None
        prev_idx = ti
    # A span still open at the end of input runs through the last token.
    if cur_label is not None and start_idx is not None and prev_idx is not None:
        spans.append((cur_label, start_idx, prev_idx + 1))
    return spans
407
+
408
 
409
def token_spans_to_char_spans(spans, cs, ce):
    """Convert ``(label, token_start, token_end)`` spans to character spans.

    Args:
        spans: Token spans with an exclusive end index.
        cs: Start character offset of each token.
        ce: End character offset of each token.

    Returns:
        List of ``(label, char_start, char_end)``. Spans whose token range is
        empty or out of bounds, or whose character range is empty, are dropped.
    """
    char_spans = []
    for label, tok_start, tok_end in spans:
        in_bounds = 0 <= tok_start < tok_end <= len(cs)
        if in_bounds:
            begin = cs[tok_start]
            finish = ce[tok_end - 1]
            if finish > begin:
                char_spans.append((label, begin, finish))
    return char_spans
418
 
 
 
 
 
 
 
419
 
420
def trim_char_spans_whitespace(spans, text):
    """Shrink each character span so it neither starts nor ends on whitespace.

    Spans that are out of bounds for ``text`` or that consist only of
    whitespace are dropped.
    """
    limit = len(text)
    trimmed = []
    for label, begin, finish in spans:
        if begin < 0 or begin >= finish or finish > limit:
            continue
        # Advance past leading whitespace, then retreat past trailing whitespace.
        while begin < finish and text[begin].isspace():
            begin += 1
        while finish > begin and text[finish - 1].isspace():
            finish -= 1
        if finish > begin:
            trimmed.append((label, begin, finish))
    return trimmed
430
+
431
+
432
+ # ── viterbi decoder ──────────────────────────────────────────────
433
+
434
@functools.lru_cache(maxsize=1)
def get_viterbi_transition_biases():
    """Load the Viterbi transition biases from ``viterbi_calibration.json``.

    The file is looked up in ``MODEL_DIR``. Biases are taken from
    ``operating_points[DEFAULT_VITERBI_CALIBRATION_PRESET]["biases"]`` when that
    path exists, otherwise from the top-level object. Keys missing from the
    chosen object default to 0.0.

    Returns:
        Dict with one float per key in ``VITERBI_TRANSITION_BIAS_KEYS``. All
        zeros when the file is absent or does not contain a JSON object where
        one is expected. The result is cached for the life of the process.
    """
    zero_biases = {k: 0.0 for k in VITERBI_TRANSITION_BIAS_KEYS}
    cp = MODEL_DIR / "viterbi_calibration.json"
    if not cp.is_file():
        return zero_biases
    # JSON is UTF-8 by specification; do not depend on the locale's encoding.
    payload = json.loads(cp.read_text(encoding="utf-8"))
    # A top-level list/number/string has no .get(); fall back like the other
    # malformed shapes do instead of raising AttributeError.
    if not isinstance(payload, dict):
        return zero_biases
    raw = payload
    ops = payload.get("operating_points")
    if isinstance(ops, dict):
        preset = ops.get(DEFAULT_VITERBI_CALIBRATION_PRESET)
        if isinstance(preset, dict):
            raw = preset.get("biases", raw)
    if not isinstance(raw, dict):
        return zero_biases
    return {k: float(raw.get(k, 0.0)) for k in VITERBI_TRANSITION_BIAS_KEYS}
450
+
451
+
452
class Decoder:
    """Viterbi decoder that only allows well-formed boundary-tag sequences."""

    def __init__(self, label_info):
        nc = len(label_info.token_to_span_label)
        # -1e9 marks a start, end or transition as effectively forbidden.
        self._start = torch.full((nc,), -1e9, dtype=torch.float32)
        self._end = torch.full((nc,), -1e9, dtype=torch.float32)
        self._trans = torch.full((nc, nc), -1e9, dtype=torch.float32)
        biases = get_viterbi_transition_biases()
        bg_tok, bg_sp = label_info.background_token_label, label_info.background_span_label
        ttsl, tbt = label_info.token_to_span_label, label_info.token_boundary_tags
        for i in range(nc):
            tag, sl = tbt.get(i), ttsl.get(i)
            # A sequence may start on B, S or background and end on E, S or background.
            if tag in {"B", "S"} or i == bg_tok: self._start[i] = 0.0
            if tag in {"E", "S"} or i == bg_tok: self._end[i] = 0.0
            for j in range(nc):
                nt, ns = tbt.get(j), ttsl.get(j)
                if self._valid(tag, sl, nt, ns, bg_tok, bg_sp, j):
                    self._trans[i, j] = self._bias(tag, sl, nt, ns, bg_sp, biases)

    @staticmethod
    def _valid(pt, ps, nt, ns, bti, bsi, ni):
        """Return whether label (pt, ps) may be followed by label (nt, ns).

        ``pt``/``nt`` are boundary tags, ``ps``/``ns`` span class indices,
        ``bti``/``bsi`` the background token/span indices and ``ni`` the next
        token label index.
        """
        # nb: the next label is background.
        nb = ns == bsi or ni == bti
        if (ns is None or nt is None) and not nb: return False
        # Previous has no tag/span (this includes background, whose tag is None).
        if pt is None or ps is None: return nb or nt in {"B", "S"}
        # After background or a closed span, only background or a new span may follow.
        if ps == bsi or pt in {"E", "S"}: return nb or nt in {"B", "S"}
        # Inside an open span, only I or E of the same class may follow.
        if pt in {"B", "I"}: return ps == ns and nt in {"I", "E"}
        return False

    @staticmethod
    def _bias(pt, ps, nt, ns, bsi, b):
        """Pick the calibrated bias for an allowed transition from dict ``b``."""
        nb, pb = ns == bsi, ps == bsi
        if pb: return b["transition_bias_background_stay"] if nb else b["transition_bias_background_to_start"]
        if pt in {"B", "I"}: return b["transition_bias_inside_to_continue"] if nt == "I" else b["transition_bias_inside_to_end"]
        return b["transition_bias_end_to_background"] if nb else b["transition_bias_end_to_start"]

    def decode(self, lp):
        """Return the best label index per row of ``lp`` (shape ``(tokens, labels)``).

        Falls back to a per-row argmax when no path has a finite score.
        """
        sl, nc = lp.shape
        if sl == 0: return []
        st = self._start.to(lp.device, lp.dtype)
        en = self._end.to(lp.device, lp.dtype)
        tr = self._trans.to(lp.device, lp.dtype)
        scores = lp[0] + st
        # bp[i - 1, j]: best predecessor of label j at position i.
        bp = torch.empty((sl - 1, nc), device=lp.device, dtype=torch.int64)
        for i in range(1, sl):
            # t[p, j] = score of ending at p on the previous step and moving to j.
            t = scores.unsqueeze(1) + tr
            bs, bi = t.max(dim=0)
            scores = bs + lp[i]; bp[i - 1] = bi
        if not torch.isfinite(scores).any(): return lp.argmax(dim=1).tolist()
        scores += en
        path = torch.empty(sl, device=lp.device, dtype=torch.int64)
        path[-1] = scores.argmax()
        # Follow the back-pointers from the last position to the first.
        for i in range(sl - 2, -1, -1): path[i] = bp[i, path[i + 1]]
        return path.tolist()
504
+
505
+
506
+ # ── runtime singleton ────────────────────────────────────────────
507
+
508
@dataclass(frozen=True)
class InferenceRuntime:
    """Everything needed to run inference, bundled as an immutable record."""

    # Loaded classifier network.
    model: Transformer
    # Tokenizer named by the checkpoint config.
    encoding: tiktoken.Encoding
    # Token-label / span-class lookup tables.
    label_info: LabelInfo
    # Device on which input tensors are created.
    device: torch.device
    # Maximum number of tokens sent to the model in one call.
    n_ctx: int
512
+
513
+
514
@functools.lru_cache(maxsize=1)
def get_runtime():
    """Build the inference runtime once and cache it.

    Reads and validates ``config.json`` in ``MODEL_DIR``, loads the tokenizer
    named there, derives the label lookup tables from ``NER_CLASS_NAMES`` and
    loads the model onto the CUDA device.
    """
    checkpoint_dir = MODEL_DIR
    config = json.loads((checkpoint_dir / "config.json").read_text())
    validate_model_config_contract(config, context=str(checkpoint_dir))
    device = torch.device("cuda")
    encoding = tiktoken.get_encoding(str(config["encoding"]).strip())

    # Span classes are numbered in order of first appearance; background is 0.
    span_names = [BACKGROUND_CLASS_LABEL]
    span_index = {BACKGROUND_CLASS_LABEL: 0}
    boundary_lookup, token_to_span, token_tag = {}, {}, {}
    background_token = None
    for token_label, class_name in enumerate(NER_CLASS_NAMES):
        if class_name == BACKGROUND_CLASS_LABEL:
            background_token = token_label
            token_to_span[token_label] = 0
            token_tag[token_label] = None
        else:
            # Token class names look like "<tag>-<span class>".
            tag, base = class_name.split("-", 1)
            if base not in span_index:
                span_index[base] = len(span_names)
                span_names.append(base)
            token_to_span[token_label] = span_index[base]
            token_tag[token_label] = tag
            boundary_lookup.setdefault(base, {})[tag] = token_label

    label_info = LabelInfo(
        boundary_label_lookup=boundary_lookup,
        token_to_span_label=token_to_span,
        token_boundary_tags=token_tag,
        span_class_names=tuple(span_names),
        span_label_lookup=span_index,
        background_token_label=background_token,
        background_span_label=0,
    )
    model = Transformer.from_checkpoint(str(checkpoint_dir), device=device)
    return InferenceRuntime(
        model=model,
        encoding=encoding,
        label_info=label_info,
        device=device,
        n_ctx=int(config["default_n_ctx"]),
    )
537
+
538
+
539
@torch.inference_mode()
def predict_text(runtime, text, decoder):
    """Detect PII spans in ``text``.

    Args:
        runtime: Provides ``encoding``, ``model``, ``device``, ``n_ctx`` and
            ``label_info``.
        text: Input string.
        decoder: Object whose ``decode`` maps per-token log-probabilities to
            one label index per token.

    Returns:
        ``(source_text, detected)``. ``source_text`` is the text obtained by
        decoding the tokens again (span offsets refer to it), or ``text``
        itself when tokenisation yields nothing. ``detected`` is a list of
        dicts with keys ``label``, ``start``, ``end`` and ``text``.
    """
    token_ids = tuple(int(t) for t in runtime.encoding.encode(text, allowed_special="all"))
    if not token_ids:
        return text, []

    # Score the tokens in consecutive windows of at most n_ctx tokens.
    window = runtime.n_ctx
    per_token = []
    for begin in range(0, len(token_ids), window):
        chunk = torch.tensor(token_ids[begin:begin + window], device=runtime.device, dtype=torch.int32)
        logits = runtime.model(chunk).float()
        per_token.extend(F.log_softmax(logits, dim=-1).unbind(0))
    log_probs = torch.stack(per_token, 0)

    labels = decoder.decode(log_probs)
    if len(labels) != len(token_ids):
        # Decoder returned an unexpected length: use the per-token argmax instead.
        labels = log_probs.argmax(dim=1).tolist()
    labels_by_index = {position: int(label) for position, label in enumerate(labels)}
    token_spans = labels_to_spans(labels_by_index, runtime.label_info)

    # Rebuild the text from token bytes so offsets can be derived from byte positions.
    token_bytes = [runtime.encoding.decode_single_token_bytes(t) for t in token_ids]
    decoded = b"".join(token_bytes).decode("utf-8", errors="replace")

    # Byte range occupied by every character of the decoded text.
    char_byte_start, char_byte_end = [], []
    offset = 0
    for ch in decoded:
        char_byte_start.append(offset)
        offset += len(ch.encode("utf-8"))
        char_byte_end.append(offset)

    # Character range covered by every token, located by bisecting its byte range.
    tok_char_start, tok_char_end = [], []
    cursor = 0
    for raw in token_bytes:
        following = cursor + len(raw)
        tok_char_start.append(bisect_right(char_byte_end, cursor))
        tok_char_end.append(bisect_left(char_byte_start, following))
        cursor = following

    char_spans = token_spans_to_char_spans(token_spans, tok_char_start, tok_char_end)
    char_spans = trim_char_spans_whitespace(char_spans, decoded)

    class_names = runtime.label_info.span_class_names
    detected = []
    for label_index, start, end in char_spans:
        known = 0 <= label_index < len(class_names)
        name = class_names[label_index] if known else f"label_{label_index}"
        detected.append({"label": name, "start": start, "end": end, "text": decoded[start:end]})
    return decoded, detected
575
+
576
 
577
+ # =====================================================================
578
+ # APPLICATION LAYER
579
+ # =====================================================================
580
 
 
581
  def extract_text(file_path: str) -> str:
582
  suffix = Path(file_path).suffix.lower()
583
  if suffix == ".pdf":
 
593
  raise ValueError(f"Unsupported file type: {suffix}")
594
 
595
 
596
def compute_stats(text, spans):
    """Summarise how much of ``text`` is covered by the detected spans.

    Args:
        text: The analysed text.
        spans: Dicts with at least ``label``, ``start`` and ``end``.

    Returns:
        Dict with total and PII character counts, the PII percentage rounded
        to one decimal (0 for empty text), the span count, and per-category
        ``count``/``chars`` totals.
    """
    total = len(text)
    pii_chars = 0
    by_cat = {}
    for span in spans:
        width = span["end"] - span["start"]
        pii_chars += width
        bucket = by_cat.setdefault(span["label"], {"count": 0, "chars": 0})
        bucket["count"] += 1
        bucket["chars"] += width

    if total:
        percentage = round(pii_chars / total * 100, 1)
    else:
        percentage = 0

    return {
        "total_chars": total,
        "pii_chars": pii_chars,
        "pii_percentage": percentage,
        "total_spans": len(spans),
        "categories": by_cat,
        "num_categories": len(by_cat),
    }
609
 
610
 
611
def detect_speakers(text, spans):
    """Count detected spans per speaker in transcript-like text.

    A line opening with ``Name: ``, ``[Name] `` or ``Speaker N: `` sets the
    current speaker, which carries over to following lines. Each span is
    assigned to the speaker of the line containing its midpoint, or to
    ``"Document"`` when that line has no speaker.

    Returns:
        Mapping of speaker -> span count, or an empty dict when every span
        fell under ``"Document"``.
    """
    matchers = [
        re.compile(pattern)
        for pattern in (
            r"^([A-Z][a-zA-Z ]{1,30}):\s",
            r"^\[([^\]]{1,30})\]\s",
            r"^(Speaker\s*\d+):\s",
        )
    ]

    # (line start offset, line end offset, speaker in effect on that line)
    line_speakers = []
    offset = 0
    current = None
    for line in text.split("\n"):
        hit = next((m for m in (rx.match(line) for rx in matchers) if m), None)
        if hit is not None:
            current = hit.group(1).strip()
        line_speakers.append((offset, offset + len(line), current))
        # +1 accounts for the newline removed by split.
        offset += len(line) + 1

    counts = {}
    for span in spans:
        midpoint = (span["start"] + span["end"]) // 2
        owner = next(
            (name for lo, hi, name in line_speakers if lo <= midpoint <= hi and name),
            "Document",
        )
        counts[owner] = counts.get(owner, 0) + 1

    if list(counts.keys()) == ["Document"]:
        return {}
    return counts
627
+
628
 
629
@spaces.GPU
def run_pii_analysis(text: str):
    """GPU-accelerated PII detection."""
    # The runtime is cached by get_runtime; a decoder is built for each call.
    runtime = get_runtime()
    return predict_text(runtime, text, Decoder(label_info=runtime.label_info))
636
 
637
 
638
  # ── Gradio Server ────────────────────────────────────────────────
639
# Server object on which the routes and the API endpoint below are registered.
server = gr.Server()


@server.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve the single-page frontend held in FRONTEND_HTML."""
    return FRONTEND_HTML
645
 
646
 
647
@server.post("/api/analyze")
async def analyze_document(file: UploadFile = File(...)):
    """Analyze an uploaded PDF/DOC/DOCX file for PII.

    Returns a JSON response with the analysed text, detected spans, summary
    statistics, per-speaker counts and category display metadata. Responds
    with status 400 for an unsupported or missing file extension or when no
    text can be extracted, and with status 500 when processing fails.
    """
    # An upload may arrive without a filename; treat that as "no extension"
    # so it is rejected below instead of raising inside Path().
    suffix = Path(file.filename or "").suffix.lower()
    if suffix not in (".pdf", ".doc", ".docx"):
        return JSONResponse({"error": f"Unsupported: {suffix}. Use PDF, DOC, or DOCX."}, 400)

    tmp_path = None
    try:
        # The copy to disk happens inside the try so the temp file is removed
        # even if reading or writing the upload fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())
        text = extract_text(tmp_path)
        if not text.strip():
            return JSONResponse({"error": "No text content found."}, 400)
        source_text, spans = run_pii_analysis(text)
        stats = compute_stats(source_text, spans)
        speakers = detect_speakers(source_text, spans)
        return JSONResponse({
            "filename": file.filename, "text": source_text, "spans": spans,
            "stats": stats, "speakers": speakers,
            "categories_meta": {k: {"color": v["color"], "bg": v["bg"], "label": v["label"]}
                                for k, v in CATEGORIES_META.items()},
        })
    except Exception as e:
        # Request boundary: report the failure to the client as JSON.
        return JSONResponse({"error": str(e)}, 500)
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
 
671
 
672
 
673
@server.api(name="analyze_text")
def analyze_text_api(text: str) -> str:
    """Gradio API: analyze raw text for PII."""
    source_text, spans = run_pii_analysis(text)
    payload = {
        "text": source_text,
        "spans": spans,
        "stats": compute_stats(source_text, spans),
    }
    # ensure_ascii=False keeps non-ASCII characters readable in the JSON string.
    return json.dumps(payload, ensure_ascii=False)
679
 
680
 
681
+ # ── Frontend HTML ────────────────────────────────────────────────
682
  FRONTEND_HTML = r"""<!DOCTYPE html>
683
  <html lang="en">
684
  <head>
685
  <meta charset="UTF-8">
686
  <meta name="viewport" content="width=device-width,initial-scale=1">
687
+ <title>PII Reveal</title>
688
  <link rel="preconnect" href="https://fonts.googleapis.com">
689
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap" rel="stylesheet">
690
  <style>
 
695
  --primary:#6366f1;--primary-light:#e0e7ff;
696
  --radius:12px;--radius-sm:8px;--shadow:0 1px 3px rgba(0,0,0,.08);
697
  --shadow-lg:0 8px 32px rgba(0,0,0,.12);
 
 
698
  }
699
  body{font-family:'Inter',system-ui,sans-serif;background:var(--bg);color:var(--text);min-height:100vh;line-height:1.6}
 
700
 
701
+ /* Upload */
702
  #upload-view{display:flex;flex-direction:column;align-items:center;justify-content:center;min-height:100vh;padding:2rem}
703
  .upload-card{background:var(--surface);border-radius:20px;padding:3rem;max-width:640px;width:100%;text-align:center;box-shadow:var(--shadow-lg);position:relative;overflow:hidden}
704
  .upload-card::before{content:'';position:absolute;inset:-2px;border-radius:22px;background:linear-gradient(135deg,var(--primary),#ec4899,var(--primary));z-index:-1;opacity:0;transition:opacity .3s}
 
720
  .feature-desc{color:var(--text2);font-size:.75rem;line-height:1.4}
721
  .powered-by{margin-top:1.5rem;font-size:.8rem;color:var(--text3)}
722
 
723
+ /* Results */
724
  #results-view{display:none;min-height:100vh}
725
  .top-bar{background:var(--surface);border-bottom:1px solid var(--border);padding:.75rem 1.5rem;display:flex;align-items:center;gap:1rem;position:sticky;top:0;z-index:100;box-shadow:var(--shadow)}
726
  .top-bar .brand{margin:0}
 
728
  .top-bar .brand-icon{width:32px;height:32px;font-size:1rem}
729
  .file-info{font-size:.85rem;color:var(--text2);margin-left:.5rem;flex:1}
730
  .btn{padding:.5rem 1rem;border-radius:var(--radius-sm);border:none;cursor:pointer;font-weight:600;font-size:.85rem;transition:all .15s}
 
 
731
  .btn-ghost{background:transparent;color:var(--text2);border:1px solid var(--border)}
732
  .btn-ghost:hover{background:var(--surface2)}
733
 
734
+ /* Summary */
735
  .summary-strip{background:var(--surface);border-bottom:1px solid var(--border);padding:1rem 1.5rem;display:flex;align-items:center;gap:1.5rem;flex-wrap:wrap}
736
  .stat-big{text-align:center;min-width:80px}
737
  .stat-big .num{font-size:1.75rem;font-weight:800;color:var(--primary)}
 
743
  .category-chips{display:flex;flex-wrap:wrap;gap:.35rem}
744
  .chip{display:inline-flex;align-items:center;gap:.35rem;padding:.2rem .6rem;border-radius:20px;font-size:.75rem;font-weight:600;border:1.5px solid}
745
 
746
+ /* Layout */
747
  .main-layout{display:flex;height:calc(100vh - 130px)}
748
  .doc-panel{flex:1;overflow-y:auto;padding:2rem;background:var(--bg)}
749
  .doc-content{background:var(--surface);border-radius:var(--radius);padding:2rem 2.5rem;max-width:900px;margin:0 auto;box-shadow:var(--shadow);font-size:.95rem;line-height:1.8;white-space:pre-wrap;word-wrap:break-word}
750
 
751
+ /* PII */
752
  .pii{border-radius:3px;padding:1px 2px;cursor:pointer;transition:all .15s;position:relative;border-bottom:2px solid}
753
  .pii:hover{filter:brightness(.92)}
754
  .pii.dimmed{opacity:.15;border-bottom-color:transparent!important}
755
+ .pii-private_person{background:rgba(239,68,68,.15);border-bottom-color:#ef4444;color:#991b1b}
756
+ .pii-private_address{background:rgba(6,182,212,.15);border-bottom-color:#06b6d4;color:#155e75}
757
+ .pii-private_email{background:rgba(59,130,246,.15);border-bottom-color:#3b82f6;color:#1e40af}
758
+ .pii-private_phone{background:rgba(34,197,94,.15);border-bottom-color:#22c55e;color:#166534}
759
+ .pii-private_url{background:rgba(234,179,8,.15);border-bottom-color:#eab308;color:#854d0e}
760
+ .pii-private_date{background:rgba(168,85,247,.15);border-bottom-color:#a855f7;color:#6b21a8}
761
+ .pii-account_number{background:rgba(249,115,22,.15);border-bottom-color:#f97316;color:#9a3412}
762
+ .pii-secret{background:rgba(220,38,38,.15);border-bottom-color:#dc2626;color:#991b1b}
 
 
763
  .pii-tooltip{position:fixed;background:#1e293b;color:#fff;padding:.4rem .7rem;border-radius:6px;font-size:.75rem;font-weight:500;pointer-events:none;z-index:999;white-space:nowrap;box-shadow:0 4px 12px rgba(0,0,0,.2)}
764
 
765
+ /* Sidebar */
766
  .sidebar{width:300px;background:var(--surface);border-left:1px solid var(--border);overflow-y:auto;padding:1.25rem;flex-shrink:0}
767
  .sidebar h3{font-size:.7rem;text-transform:uppercase;letter-spacing:.8px;color:var(--text3);margin-bottom:.75rem;font-weight:700}
768
  .filter-group{margin-bottom:1.5rem}
 
776
  .filter-label{flex:1;font-size:.85rem;font-weight:500}
777
  .filter-count{font-size:.75rem;color:var(--text3);font-weight:600;background:var(--surface2);padding:.1rem .45rem;border-radius:10px}
778
 
779
+ /* Loading */
780
  #loading{position:fixed;inset:0;background:rgba(255,255,255,.85);backdrop-filter:blur(8px);display:none;flex-direction:column;align-items:center;justify-content:center;z-index:9999}
781
  .spinner{width:48px;height:48px;border:4px solid var(--border);border-top-color:var(--primary);border-radius:50%;animation:spin .8s linear infinite}
782
  @keyframes spin{to{transform:rotate(360deg)}}
783
  #loading p{margin-top:1rem;font-weight:600;color:var(--text2)}
784
  .progress-text{font-size:.85rem;color:var(--text3);margin-top:.5rem}
785
+ .error-banner{background:#fef2f2;border:1px solid #fecaca;color:#991b1b;padding:1rem 1.5rem;border-radius:var(--radius-sm);margin:1rem;font-size:.9rem;display:none}
786
 
 
 
 
 
787
  @media(max-width:768px){
788
  .main-layout{flex-direction:column-reverse;height:auto}
789
  .sidebar{width:100%;border-left:none;border-top:1px solid var(--border)}
790
  .features{grid-template-columns:1fr}
 
 
791
  }
792
  </style>
793
  </head>
794
  <body>
795
 
 
796
  <div id="upload-view">
797
  <div class="upload-card">
798
+ <div class="brand"><div class="brand-icon">&#x1f50d;</div><h1>PII Reveal</h1></div>
 
 
 
799
  <p class="subtitle">Document Privacy Explorer</p>
800
  <div class="dropzone" id="dropzone">
801
  <div class="dropzone-icon">&#x1f4c4;</div>
 
804
  <input type="file" id="file-input" accept=".pdf,.doc,.docx">
805
  </div>
806
  <div class="features">
807
+ <div class="feature"><div class="feature-title">8 PII Categories</div><div class="feature-desc">Names, addresses, emails, phones, URLs, dates, accounts, secrets</div></div>
808
+ <div class="feature"><div class="feature-title">128k Context</div><div class="feature-desc">Full documents in one pass &mdash; no chunking artifacts</div></div>
809
+ <div class="feature"><div class="feature-title">Context-Aware</div><div class="feature-desc">Understands when "May" is a name vs. a month</div></div>
 
 
 
 
 
 
 
 
 
810
  </div>
811
  <div class="powered-by">Powered by <strong>OpenAI Privacy Filter</strong> &middot; Apache 2.0</div>
812
  </div>
813
  </div>
814
 
 
815
  <div id="results-view">
816
  <div class="top-bar">
817
+ <div class="brand"><div class="brand-icon">&#x1f50d;</div><h1>PII Reveal</h1></div>
 
 
 
818
  <div class="file-info" id="file-info"></div>
819
  <button class="btn btn-ghost" onclick="resetView()">New File</button>
820
  </div>
 
821
  <div class="error-banner" id="error-banner"></div>
 
822
  <div class="summary-strip" id="summary-strip">
823
  <div class="stat-big"><div class="num" id="stat-pct">0%</div><div class="lbl">PII Content</div></div>
824
  <div class="stat-divider"></div>
 
826
  <div class="stat-divider"></div>
827
  <div class="stat-big"><div class="num" id="stat-cats">0</div><div class="lbl">Categories</div></div>
828
  <div class="stat-divider"></div>
829
+ <div class="stat-bar"><div class="stat-bar-track" id="stat-bar-track"></div><div class="category-chips" id="category-chips"></div></div>
 
 
 
830
  </div>
 
831
  <div class="main-layout">
832
+ <div class="doc-panel"><div class="doc-content" id="doc-content"></div></div>
 
 
833
  <div class="sidebar">
834
+ <div class="filter-group"><h3>PII Categories</h3><div id="category-filters"></div></div>
835
+ <div class="filter-group" id="speaker-group" style="display:none"><h3>Speakers</h3><div id="speaker-filters"></div></div>
 
 
 
 
 
 
836
  </div>
837
  </div>
838
  </div>
839
 
840
+ <div id="loading"><div class="spinner"></div><p>Analyzing document for PII&hellip;</p><div class="progress-text">Running OpenAI Privacy Filter (128k context)</div></div>
 
 
 
 
 
 
 
841
  <div class="pii-tooltip" id="tooltip" style="display:none"></div>
842
 
843
  <script>
844
// Client-side view state shared by the render functions.
// It is filled in by uploadFile() from the /api/analyze JSON response.
let S = {
  text: '',
  spans: [],
  stats: {},
  speakers: {},
  activeCats: new Set(),      // category keys whose checkbox is ticked
  activeSpeakers: new Set(),  // speaker names whose checkbox is ticked
  catMeta: {},
};

// Display label per PII category key. The key order here is also the order
// in which renderCatFilters() lists the category checkboxes.
const CLABELS = {
  private_person: 'Person',
  private_address: 'Address',
  private_email: 'Email',
  private_phone: 'Phone',
  private_url: 'URL',
  private_date: 'Date',
  account_number: 'Account',
  secret: 'Secret',
};

// Accent colour per PII category key (used for bar segments, chips and dots).
const CCOLORS = {
  private_person: '#ef4444',
  private_address: '#06b6d4',
  private_email: '#3b82f6',
  private_phone: '#22c55e',
  private_url: '#eab308',
  private_date: '#a855f7',
  account_number: '#f97316',
  secret: '#dc2626',
};
847
+
848
// Upload widgets: the drop target and the file <input>.
const dz = document.getElementById('dropzone');
const fi = document.getElementById('file-input');

// Highlight the dropzone while a drag is over it. preventDefault() is called
// on every drag event so the browser does not handle the drop itself.
for (const type of ['dragenter', 'dragover']) {
  dz.addEventListener(type, (ev) => {
    ev.preventDefault();
    dz.classList.add('dragover');
  });
}
for (const type of ['dragleave', 'drop']) {
  dz.addEventListener(type, (ev) => {
    ev.preventDefault();
    dz.classList.remove('dragover');
  });
}

// A dropped or picked file goes straight to uploadFile(); only the first
// file of a multi-file selection is used.
dz.addEventListener('drop', (ev) => {
  const dropped = ev.dataTransfer.files[0];
  if (dropped) uploadFile(dropped);
});
fi.addEventListener('change', (ev) => {
  const picked = ev.target.files[0];
  if (picked) uploadFile(picked);
});
853
+
854
/**
 * Send one document to POST /api/analyze and render the analysis.
 *
 * The file extension is checked client-side first (pdf, doc, docx). While the
 * request is in flight the loading overlay is shown and the upload view is
 * hidden; the overlay is always hidden again when the request settles.
 * Failures are reported through showError() rather than thrown.
 *
 * @param {File} file - the file chosen or dropped by the user.
 * @returns {Promise<void>}
 */
async function uploadFile(file) {
  const ext = file.name.split('.').pop().toLowerCase();
  if (!['pdf', 'doc', 'docx'].includes(ext)) {
    showError('Unsupported file type.');
    return;
  }

  const loading = document.getElementById('loading');
  loading.style.display = 'flex';
  document.getElementById('upload-view').style.display = 'none';

  const form = new FormData();
  form.append('file', file);

  try {
    const r = await fetch('/api/analyze', {method: 'POST', body: form});

    // An error page from a proxy or the server may not be JSON at all; report
    // the HTTP status instead of a cryptic JSON parse error.
    let d;
    try {
      d = await r.json();
    } catch (parseErr) {
      throw new Error('server returned HTTP ' + r.status + ' with a non-JSON body');
    }
    if (d.error) {
      showError(d.error);
      return;
    }
    if (!r.ok) throw new Error('server returned HTTP ' + r.status);

    S.text = d.text;
    S.spans = d.spans;
    S.stats = d.stats;
    S.speakers = d.speakers || {};
    S.catMeta = d.categories_meta || {};
    S.activeCats = new Set(Object.keys(d.stats.categories));
    // Use the defaulted S.speakers: Object.keys(d.speakers) throws when the
    // response has no "speakers" field, which turned a successful analysis
    // into an "Analysis failed" banner.
    S.activeSpeakers = new Set(Object.keys(S.speakers));
    renderResults(d.filename);
  } catch (e) {
    showError('Analysis failed: ' + e.message);
  } finally {
    loading.style.display = 'none';
  }
}
871
/**
 * Show an error message in the banner at the top of the results view.
 * Hides the loading overlay and makes the results view visible, because the
 * banner element lives inside it.
 *
 * @param {string} m - message text; assigned via textContent, so it is not parsed as HTML.
 */
function showError(m) {
  const overlay = document.getElementById('loading');
  const results = document.getElementById('results-view');
  overlay.style.display = 'none';
  results.style.display = 'block';

  const banner = document.getElementById('error-banner');
  banner.textContent = m;
  banner.style.display = 'block';
}
872
/**
 * Return to the upload screen: hide the results view and the error banner,
 * show the upload view, and clear the file input so that picking the same
 * file again still fires a "change" event.
 */
function resetView() {
  const displayById = {
    'results-view': 'none',
    'upload-view': 'flex',
    'error-banner': 'none',
  };
  for (const [id, mode] of Object.entries(displayById)) {
    document.getElementById(id).style.display = mode;
  }
  fi.value = '';
}
873
+
874
/**
 * Switch to the results view and repaint every panel from the shared state S.
 *
 * @param {string} fn - file name shown in the top bar.
 */
function renderResults(fn) {
  const byId = (id) => document.getElementById(id);

  byId('results-view').style.display = 'block';
  byId('error-banner').style.display = 'none';
  byId('file-info').textContent = fn;

  // Summary strip first, then the sidebar filters, then the document body.
  const painters = [renderSummary, renderCatFilters, renderSpeakerFilters, renderDoc];
  for (const paint of painters) {
    paint();
  }
}
880
/**
 * Fill the summary strip from S.stats: the three headline numbers, the
 * stacked per-category bar, and one chip per category with its span count.
 */
function renderSummary() {
  const stats = S.stats;
  const byId = (id) => document.getElementById(id);

  byId('stat-pct').textContent = stats.pii_percentage + '%';
  byId('stat-spans').textContent = stats.total_spans;
  byId('stat-cats').textContent = stats.num_categories;

  // Stacked bar: each segment's width is that category's share of all characters.
  const track = byId('stat-bar-track');
  track.innerHTML = '';
  Object.entries(stats.categories).forEach(([cat, info]) => {
    const segment = document.createElement('div');
    segment.className = 'stat-bar-fill';
    segment.style.width = (info.chars / stats.total_chars * 100) + '%';
    segment.style.background = CCOLORS[cat] || '#888';
    track.appendChild(segment);
  });

  // Chips: "<label> <count>", tinted with the category colour
  // (the "15" suffix is a hex alpha appended to the colour for the background).
  const chips = byId('category-chips');
  chips.innerHTML = '';
  Object.entries(stats.categories).forEach(([cat, info]) => {
    const chip = document.createElement('span');
    chip.className = 'chip';
    const tint = CCOLORS[cat] || '#888';
    chip.style.cssText = `color:${tint};border-color:${tint};background:${tint}15`;
    chip.textContent = (CLABELS[cat] || cat) + ' ' + info.count;
    chips.appendChild(chip);
  });
}
890
/**
 * Rebuild the "PII Categories" checkbox list in the sidebar.
 * Only categories present in S.stats.categories are listed, in CLABELS key
 * order. Toggling a box updates S.activeCats and re-renders the document.
 */
function renderCatFilters() {
  const container = document.getElementById('category-filters');
  container.innerHTML = '';

  Object.keys(CLABELS)
    .filter((cat) => S.stats.categories[cat])
    .forEach((cat) => {
      const info = S.stats.categories[cat];
      const colour = CCOLORS[cat];
      const caption = CLABELS[cat];
      const checkedAttr = S.activeCats.has(cat) ? 'checked' : '';

      const row = document.createElement('label');
      row.className = 'filter-item';
      row.style.color = colour;
      row.innerHTML = `<input type="checkbox" data-cat="${cat}" ${checkedAttr}><span class="filter-check"></span><span class="filter-dot" style="background:${colour}"></span><span class="filter-label" style="color:var(--text)">${caption}</span><span class="filter-count">${info.count}</span>`;

      row.querySelector('input').addEventListener('change', (ev) => {
        if (ev.target.checked) {
          S.activeCats.add(cat);
        } else {
          S.activeCats.delete(cat);
        }
        renderDoc();
      });
      container.appendChild(row);
    });
}
901
/**
 * Rebuild the "Speakers" checkbox list in the sidebar from S.speakers
 * (an object mapping speaker name to a count). The whole group is hidden
 * when there are no speakers. Toggling a box updates S.activeSpeakers and
 * re-renders the document.
 *
 * Rows are built with DOM APIs instead of an innerHTML template: speaker
 * names come from the server response (presumably derived from the uploaded
 * document -- verify against the backend), so they must not be parsed as
 * markup or spliced into a quoted attribute.
 */
function renderSpeakerFilters() {
  const sp = S.speakers;
  const grp = document.getElementById('speaker-group');
  const ct = document.getElementById('speaker-filters');

  if (!sp || !Object.keys(sp).length) {
    grp.style.display = 'none';
    return;
  }
  grp.style.display = 'block';
  ct.innerHTML = '';

  for (const [name, count] of Object.entries(sp)) {
    const row = document.createElement('label');
    row.className = 'filter-item';

    const box = document.createElement('input');
    box.type = 'checkbox';
    box.dataset.speaker = name;
    // Set the attribute (not only the property) to produce the same markup
    // as the previous template did.
    if (S.activeSpeakers.has(name)) box.setAttribute('checked', '');
    box.addEventListener('change', (ev) => {
      if (ev.target.checked) {
        S.activeSpeakers.add(name);
      } else {
        S.activeSpeakers.delete(name);
      }
      renderDoc();
    });

    const check = document.createElement('span');
    check.className = 'filter-check';
    check.setAttribute('style', 'color:var(--primary)');

    const label = document.createElement('span');
    label.className = 'filter-label';
    label.textContent = name;

    const counter = document.createElement('span');
    counter.className = 'filter-count';
    counter.textContent = count;

    row.append(box, check, label, counter);
    ct.appendChild(row);
  }
}
912
/**
 * Escape a value for safe insertion into HTML, both as element text and
 * inside a quoted attribute value.
 *
 * Implemented as plain string replacement rather than a detached DOM node
 * round-trip: DOM text serialization leaves " and ' untouched, which is not
 * safe when the result is placed inside an attribute such as data-text="...".
 *
 * @param {*} s - value to escape; null/undefined become the empty string.
 * @returns {string} the escaped text.
 */
function esc(s) {
  const text = s == null ? '' : String(s);
  // "&" must be replaced first so the entities added below are not re-escaped.
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
913
/**
 * Render the document text into #doc-content with every detected PII span
 * wrapped in a <span class="pii pii-<label>">, and attach tooltip handlers.
 *
 * Spans are processed in start order; a span that begins before the end of
 * the previously emitted span (an overlap) is skipped. Spans whose category
 * is not in S.activeCats get the extra "dimmed" class.
 *
 * NOTE(review): S.activeSpeakers is not consulted here, so toggling a speaker
 * checkbox re-renders without any visible change -- confirm whether speaker
 * filtering is still to be implemented.
 */
function renderDoc() {
  const {text, spans} = S;
  const ac = S.activeCats;
  const sorted = [...spans].sort((a, b) => a.start - b.start);

  // Attribute values sit inside double quotes, so quotes must be escaped in
  // addition to whatever esc() covers; otherwise span text containing '"'
  // terminates the attribute early and can inject further attributes.
  const attr = (v) => esc(v).replace(/"/g, '&quot;');

  const parts = [];
  let pos = 0;
  for (const sp of sorted) {
    if (sp.start < pos) continue;  // overlaps the previous span
    if (sp.start > pos) parts.push(esc(text.substring(pos, sp.start)));
    const label = attr(sp.label);
    const dimmed = ac.has(sp.label) ? '' : ' dimmed';
    parts.push(
      `<span class="pii pii-${label}${dimmed}" data-label="${label}" data-text="${attr(sp.text)}">` +
      esc(text.substring(sp.start, sp.end)) +
      '</span>'
    );
    pos = sp.end;
  }
  if (pos < text.length) parts.push(esc(text.substring(pos)));

  const doc = document.getElementById('doc-content');
  doc.innerHTML = parts.join('');

  // Tooltip shows "<category label>: <span text>" and follows the pointer.
  // Only the spans just rendered into the document container are wired up.
  const tt = document.getElementById('tooltip');
  doc.querySelectorAll('.pii').forEach((el) => {
    el.addEventListener('mouseenter', (ev) => {
      tt.textContent = (CLABELS[el.dataset.label] || el.dataset.label) + ': ' + el.dataset.text;
      tt.style.display = 'block';
      moveTT(ev);
    });
    el.addEventListener('mousemove', moveTT);
    el.addEventListener('mouseleave', () => {
      tt.style.display = 'none';
    });
  });
}
932
/**
 * Position the tooltip next to the mouse pointer: 12px to the right of and
 * 36px above the pointer's viewport coordinates.
 *
 * @param {MouseEvent} ev - the mouse event whose clientX/clientY are used.
 */
function moveTT(ev) {
  const tip = document.getElementById('tooltip');
  const left = ev.clientX + 12;
  const top = ev.clientY - 36;
  tip.style.left = `${left}px`;
  tip.style.top = `${top}px`;
}
 
 
 
 
 
933
  </script>
934
  </body>
935
  </html>"""
936
 
937
+ # ── launch ───────────────────────────────────────────────────────
938
  # Script entry point: launch the server only when this file is executed
  # directly (not when imported), on host "0.0.0.0" and port 7860.
  if __name__ == "__main__":
939
+ server.launch(server_name="0.0.0.0", server_port=7860)