Spaces:

ysharma
/

OPF-Document-PII-Explorer

Running on Zero

App Files Files Community

ysharma HF Staff committed on Apr 20

Commit

0234c4b

verified ·

1 Parent(s): e71851b

Update app.py

Browse files

Files changed (1) hide show

app.py +698 -471

app.py CHANGED Viewed

@@ -1,30 +1,42 @@
 """
 PII Reveal - Document Privacy Explorer
 =======================================
-A Gradio Server app powered by OpenAI Privacy Filter (Apache 2.0)
-for detecting and visualizing PII in PDF/DOC/DOCX documents.
-- Backend: gr.Server (Gradio + FastAPI)
-- Frontend: Custom HTML/CSS/JS
-- Model: charles-first-org/second-model (1.5B params, 50M active, 128k context)
 """
 import os
 import re
-import json
 import tempfile
 from pathlib import Path
-import torch
 import gradio as gr
 from fastapi import UploadFile, File
 from fastapi.responses import HTMLResponse, JSONResponse
-# ── Configuration ────────────────────────────────────────────────
-MODEL_ID = os.getenv("MODEL_ID", "charles-first-org/second-model")
 HF_TOKEN = os.getenv("HF_TOKEN", None)
-CATEGORIES = {
     "private_person":  {"color": "#ef4444", "bg": "rgba(239,68,68,0.15)",  "label": "Person"},
     "private_address": {"color": "#06b6d4", "bg": "rgba(6,182,212,0.15)",  "label": "Address"},
     "private_email":   {"color": "#3b82f6", "bg": "rgba(59,130,246,0.15)", "label": "Email"},
@@ -35,24 +47,537 @@ CATEGORIES = {
     "secret":          {"color": "#dc2626", "bg": "rgba(220,38,38,0.15)",  "label": "Secret"},
 }
-# ── Model Loading ────────────────────────────────────────────────
-print(f"[PII Reveal] Loading model: {MODEL_ID}")
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-from transformers import AutoTokenizer, AutoModelForTokenClassification  # noqa: E402
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
-model = AutoModelForTokenClassification.from_pretrained(
-    MODEL_ID, trust_remote_code=True, token=HF_TOKEN,
-    torch_dtype=torch.bfloat16 if device.type == "cuda" else torch.float32,
-)
-model.eval().to(device)
-id2label = model.config.id2label
-print(f"[PII Reveal] Model loaded on {device} | Labels: {len(id2label)}")
-# ── Text Extraction ──────────────────────────────────────────────
 def extract_text(file_path: str) -> str:
     suffix = Path(file_path).suffix.lower()
     if suffix == ".pdf":
@@ -68,176 +593,98 @@ def extract_text(file_path: str) -> str:
     raise ValueError(f"Unsupported file type: {suffix}")
-# ── PII Detection ────────────────────────────────────────────────
-def detect_pii(text: str) -> list[dict]:
-    """Run Privacy Filter on text, return list of {label, start, end, text}."""
-    encodings = tokenizer(
-        text,
-        return_tensors="pt",
-        return_offsets_mapping=True,
-        truncation=True,
-        max_length=128000,
-    )
-    offset_mapping = encodings.pop("offset_mapping")[0].tolist()
-    inputs = {k: v.to(device) for k, v in encodings.items()}
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    preds = torch.argmax(logits, dim=-1)[0].tolist()
-    spans, current = [], None
-    for i, pred_id in enumerate(preds):
-        label = id2label.get(pred_id, "O")
-        char_start, char_end = offset_mapping[i]
-        if char_start == char_end or label == "O" or "-" not in label:
-            if current:
-                spans.append(current)
-                current = None
-            continue
-        tag, category = label.split("-", 1)
-        if tag == "S":
-            if current:
-                spans.append(current)
-            spans.append({"label": category, "start": char_start, "end": char_end,
-                          "text": text[char_start:char_end]})
-            current = None
-        elif tag == "B":
-            if current:
-                spans.append(current)
-            current = {"label": category, "start": char_start, "end": char_end,
-                       "text": text[char_start:char_end]}
-        elif tag == "I" and current and current["label"] == category:
-            current["end"] = char_end
-            current["text"] = text[current["start"]:char_end]
-        elif tag == "E" and current and current["label"] == category:
-            current["end"] = char_end
-            current["text"] = text[current["start"]:char_end]
-            spans.append(current)
-            current = None
-        else:
-            if current:
-                spans.append(current)
-                current = None
-    if current:
-        spans.append(current)
-    return spans
-# ── Statistics ───────────────────────────────────────────────────
-def compute_stats(text: str, spans: list[dict]) -> dict:
     total = len(text)
     pii_chars = sum(s["end"] - s["start"] for s in spans)
-    by_cat: dict[str, dict] = {}
     for s in spans:
         c = s["label"]
         by_cat.setdefault(c, {"count": 0, "chars": 0})
-        by_cat[c]["count"] += 1
-        by_cat[c]["chars"] += s["end"] - s["start"]
     return {
-        "total_chars": total,
-        "pii_chars": pii_chars,
         "pii_percentage": round(pii_chars / total * 100, 1) if total else 0,
-        "total_spans": len(spans),
-        "categories": by_cat,
-        "num_categories": len(by_cat),
     }
-def detect_speakers(text: str, spans: list[dict]) -> dict:
-    """Simple speaker detection for transcripts."""
-    patterns = [
-        r"^([A-Z][a-zA-Z ]{1,30}):\s",
-        r"^\[([^\]]{1,30})\]\s",
-        r"^(Speaker\s*\d+):\s",
-    ]
-    line_speakers = []
-    pos, cur_speaker = 0, None
     for line in text.split("\n"):
-        for pat in patterns:
-            m = re.match(pat, line)
-            if m:
-                cur_speaker = m.group(1).strip()
-                break
-        line_speakers.append((pos, pos + len(line), cur_speaker))
-        pos += len(line) + 1
-    result: dict[str, int] = {}
     for span in spans:
         mid = (span["start"] + span["end"]) // 2
         speaker = "Document"
-        for ls, le, sp in line_speakers:
-            if ls <= mid <= le and sp:
-                speaker = sp
-                break
         result[speaker] = result.get(speaker, 0) + 1
-    if len(result) <= 1 and "Document" in result:
-        return {}
-    return result
 # ── Gradio Server ────────────────────────────────────────────────
-app = gr.Server()
-@app.get("/", response_class=HTMLResponse)
 async def homepage():
     return FRONTEND_HTML
-@app.post("/api/analyze")
 async def analyze_document(file: UploadFile = File(...)):
     suffix = Path(file.filename).suffix.lower()
     if suffix not in (".pdf", ".doc", ".docx"):
         return JSONResponse({"error": f"Unsupported: {suffix}. Use PDF, DOC, or DOCX."}, 400)
     with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
-        tmp.write(await file.read())
-        tmp_path = tmp.name
     try:
         text = extract_text(tmp_path)
         if not text.strip():
             return JSONResponse({"error": "No text content found."}, 400)
-        spans = detect_pii(text)
-        stats = compute_stats(text, spans)
-        speakers = detect_speakers(text, spans)
         return JSONResponse({
-            "filename": file.filename,
-            "text": text,
-            "spans": spans,
-            "stats": stats,
-            "speakers": speakers,
             "categories_meta": {k: {"color": v["color"], "bg": v["bg"], "label": v["label"]}
-                                for k, v in CATEGORIES.items()},
         })
     except Exception as e:
         return JSONResponse({"error": str(e)}, 500)
     finally:
-        if os.path.exists(tmp_path):
-            os.unlink(tmp_path)
-@app.api(name="analyze_text")
 def analyze_text_api(text: str) -> str:
-    """Gradio API: analyze raw text for PII (for programmatic access)."""
-    spans = detect_pii(text)
-    stats = compute_stats(text, spans)
-    return json.dumps({"spans": spans, "stats": stats}, ensure_ascii=False)
-# ── Frontend ─────────────────────────────────────────────────────
 FRONTEND_HTML = r"""<!DOCTYPE html>
 <html lang="en">
 <head>
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width,initial-scale=1">
-<title>PII Reveal - Document Privacy Explorer</title>
 <link rel="preconnect" href="https://fonts.googleapis.com">
 <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap" rel="stylesheet">
 <style>
@@ -248,13 +695,10 @@ FRONTEND_HTML = r"""<!DOCTYPE html>
   --primary:#6366f1;--primary-light:#e0e7ff;
   --radius:12px;--radius-sm:8px;--shadow:0 1px 3px rgba(0,0,0,.08);
   --shadow-lg:0 8px 32px rgba(0,0,0,.12);
-  --person:#ef4444;--address:#06b6d4;--email:#3b82f6;--phone:#22c55e;
-  --url:#eab308;--date:#a855f7;--account:#f97316;--secret:#dc2626;
 }
 body{font-family:'Inter',system-ui,sans-serif;background:var(--bg);color:var(--text);min-height:100vh;line-height:1.6}
-a{color:var(--primary)}
-/* ─ Upload View ─ */
 #upload-view{display:flex;flex-direction:column;align-items:center;justify-content:center;min-height:100vh;padding:2rem}
 .upload-card{background:var(--surface);border-radius:20px;padding:3rem;max-width:640px;width:100%;text-align:center;box-shadow:var(--shadow-lg);position:relative;overflow:hidden}
 .upload-card::before{content:'';position:absolute;inset:-2px;border-radius:22px;background:linear-gradient(135deg,var(--primary),#ec4899,var(--primary));z-index:-1;opacity:0;transition:opacity .3s}
@@ -276,7 +720,7 @@ a{color:var(--primary)}
 .feature-desc{color:var(--text2);font-size:.75rem;line-height:1.4}
 .powered-by{margin-top:1.5rem;font-size:.8rem;color:var(--text3)}
-/* ─ Results View ─ */
 #results-view{display:none;min-height:100vh}
 .top-bar{background:var(--surface);border-bottom:1px solid var(--border);padding:.75rem 1.5rem;display:flex;align-items:center;gap:1rem;position:sticky;top:0;z-index:100;box-shadow:var(--shadow)}
 .top-bar .brand{margin:0}
@@ -284,12 +728,10 @@ a{color:var(--primary)}
 .top-bar .brand-icon{width:32px;height:32px;font-size:1rem}
 .file-info{font-size:.85rem;color:var(--text2);margin-left:.5rem;flex:1}
 .btn{padding:.5rem 1rem;border-radius:var(--radius-sm);border:none;cursor:pointer;font-weight:600;font-size:.85rem;transition:all .15s}
-.btn-primary{background:var(--primary);color:#fff}
-.btn-primary:hover{background:#4f46e5}
 .btn-ghost{background:transparent;color:var(--text2);border:1px solid var(--border)}
 .btn-ghost:hover{background:var(--surface2)}
-/* ─ Summary Strip ─ */
 .summary-strip{background:var(--surface);border-bottom:1px solid var(--border);padding:1rem 1.5rem;display:flex;align-items:center;gap:1.5rem;flex-wrap:wrap}
 .stat-big{text-align:center;min-width:80px}
 .stat-big .num{font-size:1.75rem;font-weight:800;color:var(--primary)}
@@ -301,28 +743,26 @@ a{color:var(--primary)}
 .category-chips{display:flex;flex-wrap:wrap;gap:.35rem}
 .chip{display:inline-flex;align-items:center;gap:.35rem;padding:.2rem .6rem;border-radius:20px;font-size:.75rem;font-weight:600;border:1.5px solid}
-/* ─ Main Layout ─ */
 .main-layout{display:flex;height:calc(100vh - 130px)}
 .doc-panel{flex:1;overflow-y:auto;padding:2rem;background:var(--bg)}
 .doc-content{background:var(--surface);border-radius:var(--radius);padding:2rem 2.5rem;max-width:900px;margin:0 auto;box-shadow:var(--shadow);font-size:.95rem;line-height:1.8;white-space:pre-wrap;word-wrap:break-word}
-/* ─ PII Highlights ─ */
 .pii{border-radius:3px;padding:1px 2px;cursor:pointer;transition:all .15s;position:relative;border-bottom:2px solid}
 .pii:hover{filter:brightness(.92)}
 .pii.dimmed{opacity:.15;border-bottom-color:transparent!important}
-.pii-private_person{background:rgba(239,68,68,.15);border-bottom-color:var(--person);color:#991b1b}
-.pii-private_address{background:rgba(6,182,212,.15);border-bottom-color:var(--address);color:#155e75}
-.pii-private_email{background:rgba(59,130,246,.15);border-bottom-color:var(--email);color:#1e40af}
-.pii-private_phone{background:rgba(34,197,94,.15);border-bottom-color:var(--phone);color:#166534}
-.pii-private_url{background:rgba(234,179,8,.15);border-bottom-color:var(--url);color:#854d0e}
-.pii-private_date{background:rgba(168,85,247,.15);border-bottom-color:var(--date);color:#6b21a8}
-.pii-account_number{background:rgba(249,115,22,.15);border-bottom-color:var(--account);color:#9a3412}
-.pii-secret{background:rgba(220,38,38,.15);border-bottom-color:var(--secret);color:#991b1b}
-/* ─ Tooltip ─ */
 .pii-tooltip{position:fixed;background:#1e293b;color:#fff;padding:.4rem .7rem;border-radius:6px;font-size:.75rem;font-weight:500;pointer-events:none;z-index:999;white-space:nowrap;box-shadow:0 4px 12px rgba(0,0,0,.2)}
-/* ─ Sidebar ─ */
 .sidebar{width:300px;background:var(--surface);border-left:1px solid var(--border);overflow-y:auto;padding:1.25rem;flex-shrink:0}
 .sidebar h3{font-size:.7rem;text-transform:uppercase;letter-spacing:.8px;color:var(--text3);margin-bottom:.75rem;font-weight:700}
 .filter-group{margin-bottom:1.5rem}
@@ -336,35 +776,26 @@ a{color:var(--primary)}
 .filter-label{flex:1;font-size:.85rem;font-weight:500}
 .filter-count{font-size:.75rem;color:var(--text3);font-weight:600;background:var(--surface2);padding:.1rem .45rem;border-radius:10px}
-/* ─ Loading ─ */
 #loading{position:fixed;inset:0;background:rgba(255,255,255,.85);backdrop-filter:blur(8px);display:none;flex-direction:column;align-items:center;justify-content:center;z-index:9999}
 .spinner{width:48px;height:48px;border:4px solid var(--border);border-top-color:var(--primary);border-radius:50%;animation:spin .8s linear infinite}
 @keyframes spin{to{transform:rotate(360deg)}}
 #loading p{margin-top:1rem;font-weight:600;color:var(--text2)}
 .progress-text{font-size:.85rem;color:var(--text3);margin-top:.5rem}
-/* ─ Error ─ */
-.error-banner{background:#fef2f2;border:1px solid #fecaca;color:#991b1b;padding:1rem 1.5rem;border-radius:var(--radius-sm);margin:1rem;font-size:.9rem;display:none;align-items:center;gap:.5rem}
-/* ─ Responsive ─ */
 @media(max-width:768px){
   .main-layout{flex-direction:column-reverse;height:auto}
   .sidebar{width:100%;border-left:none;border-top:1px solid var(--border)}
   .features{grid-template-columns:1fr}
-  .summary-strip{flex-direction:column;align-items:stretch}
-  .stat-divider{width:100%;height:1px}
 }
 </style>
 </head>
 <body>
-<!-- ─── Upload View ─── -->
 <div id="upload-view">
   <div class="upload-card">
-    <div class="brand">
-      <div class="brand-icon">&#x1f50d;</div>
-      <h1>PII Reveal</h1>
-    </div>
     <p class="subtitle">Document Privacy Explorer</p>
     <div class="dropzone" id="dropzone">
       <div class="dropzone-icon">&#x1f4c4;</div>
@@ -373,36 +804,21 @@ a{color:var(--primary)}
       <input type="file" id="file-input" accept=".pdf,.doc,.docx">
     </div>
     <div class="features">
-      <div class="feature">
-        <div class="feature-title">8 PII Categories</div>
-        <div class="feature-desc">Names, addresses, emails, phones, URLs, dates, accounts, secrets</div>
-      </div>
-      <div class="feature">
-        <div class="feature-title">128k Context</div>
-        <div class="feature-desc">Full documents in one pass &mdash; no chunking artifacts</div>
-      </div>
-      <div class="feature">
-        <div class="feature-title">Context-Aware</div>
-        <div class="feature-desc">Understands when "May" is a name vs. a month</div>
-      </div>
     </div>
     <div class="powered-by">Powered by <strong>OpenAI Privacy Filter</strong> &middot; Apache 2.0</div>
   </div>
 </div>
-<!-- ─── Results View ─── -->
 <div id="results-view">
   <div class="top-bar">
-    <div class="brand">
-      <div class="brand-icon">&#x1f50d;</div>
-      <h1>PII Reveal</h1>
-    </div>
     <div class="file-info" id="file-info"></div>
     <button class="btn btn-ghost" onclick="resetView()">New File</button>
   </div>
   <div class="error-banner" id="error-banner"></div>
   <div class="summary-strip" id="summary-strip">
     <div class="stat-big"><div class="num" id="stat-pct">0%</div><div class="lbl">PII Content</div></div>
     <div class="stat-divider"></div>
@@ -410,303 +826,114 @@ a{color:var(--primary)}
     <div class="stat-divider"></div>
     <div class="stat-big"><div class="num" id="stat-cats">0</div><div class="lbl">Categories</div></div>
     <div class="stat-divider"></div>
-    <div class="stat-bar">
-      <div class="stat-bar-track" id="stat-bar-track"></div>
-      <div class="category-chips" id="category-chips"></div>
-    </div>
   </div>
   <div class="main-layout">
-    <div class="doc-panel">
-      <div class="doc-content" id="doc-content"></div>
-    </div>
     <div class="sidebar">
-      <div class="filter-group">
-        <h3>PII Categories</h3>
-        <div id="category-filters"></div>
-      </div>
-      <div class="filter-group" id="speaker-group" style="display:none">
-        <h3>Speakers</h3>
-        <div id="speaker-filters"></div>
-      </div>
     </div>
   </div>
 </div>
-<!-- ─── Loading Overlay ─── -->
-<div id="loading">
-  <div class="spinner"></div>
-  <p>Analyzing document for PII&hellip;</p>
-  <div class="progress-text">Running OpenAI Privacy Filter (128k context)</div>
-</div>
-<!-- ─── Tooltip ─── -->
 <div class="pii-tooltip" id="tooltip" style="display:none"></div>
 <script>
-// ── State ──
-let STATE = { text: '', spans: [], stats: {}, speakers: {}, activeCategories: new Set(), activeSpeakers: new Set(), categoriesMeta: {} };
-const CATEGORY_LABELS = {
-  private_person: 'Person', private_address: 'Address', private_email: 'Email',
-  private_phone: 'Phone', private_url: 'URL', private_date: 'Date',
-  account_number: 'Account', secret: 'Secret'
-};
-const CATEGORY_COLORS = {
-  private_person:'#ef4444', private_address:'#06b6d4', private_email:'#3b82f6',
-  private_phone:'#22c55e', private_url:'#eab308', private_date:'#a855f7',
-  account_number:'#f97316', secret:'#dc2626'
-};
-// ── Upload Handling ──
-const dropzone = document.getElementById('dropzone');
-const fileInput = document.getElementById('file-input');
-['dragenter','dragover'].forEach(e => dropzone.addEventListener(e, ev => { ev.preventDefault(); dropzone.classList.add('dragover'); }));
-['dragleave','drop'].forEach(e => dropzone.addEventListener(e, ev => { ev.preventDefault(); dropzone.classList.remove('dragover'); }));
-dropzone.addEventListener('drop', ev => {
-  const file = ev.dataTransfer.files[0];
-  if (file) uploadFile(file);
-});
-fileInput.addEventListener('change', ev => {
-  const file = ev.target.files[0];
-  if (file) uploadFile(file);
-});
-async function uploadFile(file) {
-  const ext = file.name.split('.').pop().toLowerCase();
-  if (!['pdf','doc','docx'].includes(ext)) {
-    showError('Unsupported file type. Please use PDF, DOC, or DOCX.');
-    return;
-  }
-  document.getElementById('loading').style.display = 'flex';
-  document.getElementById('upload-view').style.display = 'none';
-  const form = new FormData();
-  form.append('file', file);
-  try {
-    const resp = await fetch('/api/analyze', { method: 'POST', body: form });
-    const data = await resp.json();
-    if (data.error) {
-      showError(data.error);
-      return;
-    }
-    STATE.text = data.text;
-    STATE.spans = data.spans;
-    STATE.stats = data.stats;
-    STATE.speakers = data.speakers || {};
-    STATE.categoriesMeta = data.categories_meta || {};
-    STATE.activeCategories = new Set(Object.keys(data.stats.categories));
-    STATE.activeSpeakers = new Set(Object.keys(data.speakers));
-    renderResults(data.filename);
-  } catch (err) {
-    showError('Analysis failed: ' + err.message);
-  } finally {
-    document.getElementById('loading').style.display = 'none';
-  }
 }
-function showError(msg) {
-  document.getElementById('loading').style.display = 'none';
-  document.getElementById('results-view').style.display = 'block';
-  const banner = document.getElementById('error-banner');
-  banner.textContent = msg;
-  banner.style.display = 'flex';
 }
-function resetView() {
-  document.getElementById('results-view').style.display = 'none';
-  document.getElementById('upload-view').style.display = 'flex';
-  document.getElementById('error-banner').style.display = 'none';
-  fileInput.value = '';
-}
-// ── Render Results ──
-function renderResults(filename) {
-  document.getElementById('results-view').style.display = 'block';
-  document.getElementById('error-banner').style.display = 'none';
-  // File info
-  document.getElementById('file-info').textContent = filename;
-  // Summary stats
-  renderSummary();
-  // Filters
-  renderCategoryFilters();
-  renderSpeakerFilters();
-  // Document
-  renderDocument();
 }
-function renderSummary() {
-  const s = STATE.stats;
-  document.getElementById('stat-pct').textContent = s.pii_percentage + '%';
-  document.getElementById('stat-spans').textContent = s.total_spans;
-  document.getElementById('stat-cats').textContent = s.num_categories;
-  // Bar
-  const track = document.getElementById('stat-bar-track');
-  track.innerHTML = '';
-  const cats = s.categories;
-  const total = s.pii_chars || 1;
-  for (const [cat, info] of Object.entries(cats)) {
-    const pct = (info.chars / s.total_chars * 100);
-    const seg = document.createElement('div');
-    seg.className = 'stat-bar-fill';
-    seg.style.width = pct + '%';
-    seg.style.background = CATEGORY_COLORS[cat] || '#888';
-    track.appendChild(seg);
-  }
-  // Chips
-  const chips = document.getElementById('category-chips');
-  chips.innerHTML = '';
-  for (const [cat, info] of Object.entries(cats)) {
-    const color = CATEGORY_COLORS[cat] || '#888';
-    const label = CATEGORY_LABELS[cat] || cat;
-    const chip = document.createElement('span');
-    chip.className = 'chip';
-    chip.style.color = color;
-    chip.style.borderColor = color;
-    chip.style.background = color + '15';
-    chip.textContent = label + ' ' + info.count;
-    chips.appendChild(chip);
   }
 }
-function renderCategoryFilters() {
-  const container = document.getElementById('category-filters');
-  container.innerHTML = '';
-  const cats = STATE.stats.categories;
-  for (const cat of Object.keys(CATEGORY_LABELS)) {
-    const info = cats[cat];
-    if (!info) continue;
-    const color = CATEGORY_COLORS[cat];
-    const label = CATEGORY_LABELS[cat];
-    const item = document.createElement('label');
-    item.className = 'filter-item';
-    item.style.color = color;
-    item.innerHTML = `
-      <input type="checkbox" data-cat="${cat}" ${STATE.activeCategories.has(cat)?'checked':''}>
-      <span class="filter-check"></span>
-      <span class="filter-dot" style="background:${color}"></span>
-      <span class="filter-label" style="color:var(--text)">${label}</span>
-      <span class="filter-count">${info.count}</span>
-    `;
-    item.querySelector('input').addEventListener('change', ev => {
-      if (ev.target.checked) STATE.activeCategories.add(cat);
-      else STATE.activeCategories.delete(cat);
-      renderDocument();
-    });
-    container.appendChild(item);
   }
 }
-function renderSpeakerFilters() {
-  const speakers = STATE.speakers;
-  const group = document.getElementById('speaker-group');
-  const container = document.getElementById('speaker-filters');
-  if (!speakers || Object.keys(speakers).length === 0) {
-    group.style.display = 'none';
-    return;
   }
-  group.style.display = 'block';
-  container.innerHTML = '';
-  for (const [speaker, count] of Object.entries(speakers)) {
-    const item = document.createElement('label');
-    item.className = 'filter-item';
-    item.innerHTML = `
-      <input type="checkbox" data-speaker="${speaker}" ${STATE.activeSpeakers.has(speaker)?'checked':''}>
-      <span class="filter-check" style="color:var(--primary)"></span>
-      <span class="filter-label">${speaker}</span>
-      <span class="filter-count">${count}</span>
-    `;
-    item.querySelector('input').addEventListener('change', ev => {
-      if (ev.target.checked) STATE.activeSpeakers.add(speaker);
-      else STATE.activeSpeakers.delete(speaker);
-      renderDocument();
-    });
-    container.appendChild(item);
-  }
-}
-// ── Document Rendering ──
-function escapeHtml(str) {
-  const div = document.createElement('div');
-  div.textContent = str;
-  return div.innerHTML;
-}
-function renderDocument() {
-  const { text, spans } = STATE;
-  const active = STATE.activeCategories;
-  // Sort spans by start position
-  const sorted = [...spans].sort((a, b) => a.start - b.start);
-  let html = '';
-  let pos = 0;
-  for (const span of sorted) {
-    if (span.start < pos) continue; // skip overlapping
-    // Text before span
-    if (span.start > pos) {
-      html += escapeHtml(text.substring(pos, span.start));
-    }
-    const isActive = active.has(span.label);
-    const cls = isActive ? `pii pii-${span.label}` : `pii pii-${span.label} dimmed`;
-    const spanText = escapeHtml(text.substring(span.start, span.end));
-    html += `<span class="${cls}" data-label="${span.label}" data-text="${escapeHtml(span.text)}">${spanText}</span>`;
-    pos = span.end;
-  }
-  // Remaining text
-  if (pos < text.length) {
-    html += escapeHtml(text.substring(pos));
-  }
-  document.getElementById('doc-content').innerHTML = html;
-  attachTooltips();
-}
-// ── Tooltips ──
-function attachTooltips() {
-  const tooltip = document.getElementById('tooltip');
-  document.querySelectorAll('.pii').forEach(el => {
-    el.addEventListener('mouseenter', ev => {
-      const label = CATEGORY_LABELS[el.dataset.label] || el.dataset.label;
-      tooltip.textContent = label + ': ' + el.dataset.text;
-      tooltip.style.display = 'block';
-      positionTooltip(ev);
-    });
-    el.addEventListener('mousemove', positionTooltip);
-    el.addEventListener('mouseleave', () => { tooltip.style.display = 'none'; });
   });
 }
-function positionTooltip(ev) {
-  const tt = document.getElementById('tooltip');
-  tt.style.left = ev.clientX + 12 + 'px';
-  tt.style.top = ev.clientY - 36 + 'px';
-}
 </script>
 </body>
 </html>"""
-# ── Launch ───────────────────────────────────────────────────────
 if __name__ == "__main__":
-    app.launch(server_name="0.0.0.0", server_port=7860)

 """
 PII Reveal - Document Privacy Explorer
 =======================================
+Backend : gr.Server (Gradio + FastAPI)
+Frontend: Custom HTML / CSS / JS
+Model   : charles-first-org/second-model  (OpenAI Privacy Filter)
 """
+# ── stdlib ───────────────────────────────────────────────────────
+import dataclasses
+import functools
+import json
+import math
 import os
 import re
 import tempfile
+from bisect import bisect_left, bisect_right
+from collections.abc import Sequence
+from dataclasses import dataclass
 from pathlib import Path
+from typing import Final
+# ── third-party ──────────────────────────────────────────────────
 import gradio as gr
+import spaces
+import tiktoken
+import torch
+import torch.nn.functional as F
 from fastapi import UploadFile, File
 from fastapi.responses import HTMLResponse, JSONResponse
+from huggingface_hub import snapshot_download
+from safetensors import safe_open
+# ── configuration ────────────────────────────────────────────────
+MODEL_REPO = os.getenv("MODEL_ID", "charles-first-org/second-model")  # repository id; overridable through the MODEL_ID environment variable
 HF_TOKEN = os.getenv("HF_TOKEN", None)  # optional token read from the environment; None when HF_TOKEN is unset
+MODEL_DIR = Path(snapshot_download(MODEL_REPO, token=HF_TOKEN))  # executed at import time; presumably the local snapshot directory - confirm against huggingface_hub docs
+CATEGORIES_META = {
     "private_person":  {"color": "#ef4444", "bg": "rgba(239,68,68,0.15)",  "label": "Person"},
     "private_address": {"color": "#06b6d4", "bg": "rgba(6,182,212,0.15)",  "label": "Address"},
     "private_email":   {"color": "#3b82f6", "bg": "rgba(59,130,246,0.15)", "label": "Email"},
     "secret":          {"color": "#dc2626", "bg": "rgba(220,38,38,0.15)",  "label": "Secret"},
 }
+# =====================================================================
+# MODEL  ARCHITECTURE  +  INFERENCE  (from reference implementation)
+# =====================================================================
+PRIVACY_FILTER_MODEL_TYPE: Final[str] = "privacy_filter"  # the only model_type accepted by validate_model_config_contract
+REQUIRED_MODEL_CONFIG_KEYS: Final[tuple[str, ...]] = (  # keys validate_model_config_contract requires to be present in a config dict
+    "model_type", "encoding", "num_hidden_layers", "num_experts",
+    "experts_per_token", "vocab_size", "num_labels", "hidden_size",
+    "intermediate_size", "head_dim", "num_attention_heads",
+    "num_key_value_heads", "sliding_window", "bidirectional_context",
+    "bidirectional_left_context", "bidirectional_right_context",
+    "default_n_ctx", "initial_context_length", "rope_theta",
+    "rope_scaling_factor", "rope_ntk_alpha", "rope_ntk_beta", "param_dtype",
+)
+BACKGROUND_CLASS_LABEL: Final[str] = "O"  # label of the background class; excluded from the prefixed names built below
+BOUNDARY_PREFIXES: Final[tuple[str, ...]] = ("B", "I", "E", "S")  # presumably Begin / Inside / End / Single boundary tags - confirm against the decoder
+SPAN_CLASS_NAMES: Final[tuple[str, ...]] = (  # background label followed by the 8 span categories
+    BACKGROUND_CLASS_LABEL,
+    "account_number", "private_address", "private_date", "private_email",
+    "private_person", "private_phone", "private_url", "secret",
+)
+NER_CLASS_NAMES: Final[tuple[str, ...]] = (BACKGROUND_CLASS_LABEL,) + tuple(  # "O" first, then "<prefix>-<class>" names: 1 + 8 * 4 = 33 entries
+    f"{prefix}-{base}"
+    for base in SPAN_CLASS_NAMES if base != BACKGROUND_CLASS_LABEL  # outer loop: names are grouped by span class ...
+    for prefix in BOUNDARY_PREFIXES  # ... and ordered B, I, E, S within each class
+)
+VITERBI_TRANSITION_BIAS_KEYS: Final[tuple[str, ...]] = (  # NOTE(review): the consumer of these key names is not visible in this chunk
+    "transition_bias_background_stay", "transition_bias_background_to_start",
+    "transition_bias_inside_to_continue", "transition_bias_inside_to_end",
+    "transition_bias_end_to_background", "transition_bias_end_to_start",
+)
+DEFAULT_VITERBI_CALIBRATION_PRESET: Final[str] = "default"  # NOTE(review): where this preset name is looked up is not visible in this chunk
+def validate_model_config_contract(cfg: dict, *, context: str) -> None:  # Validate a config dict; raises ValueError (message prefixed with `context`) at the first failed check, otherwise returns None
+    missing = [k for k in REQUIRED_MODEL_CONFIG_KEYS if k not in cfg]  # required keys absent from cfg, in declaration order
+    if missing:
+        raise ValueError(f"{context} missing keys: {', '.join(missing)}")
+    if cfg.get("model_type") != PRIVACY_FILTER_MODEL_TYPE:
+        raise ValueError(f"{context} model_type must be {PRIVACY_FILTER_MODEL_TYPE!r}")
+    if cfg.get("bidirectional_context") is not True:  # identity check: truthy non-bool values such as 1 are rejected
+        raise ValueError(f"{context} must use bidirectional_context=true")
+    lc, rc = cfg.get("bidirectional_left_context"), cfg.get("bidirectional_right_context")
+    if not isinstance(lc, int) or not isinstance(rc, int) or lc != rc or lc < 0:  # both sides must be the same non-negative int
+        raise ValueError(f"{context} bidirectional context must be equal non-negative ints")
+    sw = cfg.get("sliding_window")
+    if sw != 2 * lc + 1:  # equivalent to lc + rc + 1, because lc == rc was enforced just above
+        raise ValueError(f"{context} sliding_window must equal 2*context+1")
+    if cfg["num_labels"] != 33:  # 33 equals the number of entries NER_CLASS_NAMES is built with ("O" + 8 classes x 4 prefixes)
+        raise ValueError(f"{context} num_labels must be 33")
+    if cfg["param_dtype"] != "bfloat16":  # compared as a string, not as a torch dtype
+        raise ValueError(f"{context} param_dtype must be bfloat16")
+# ── model helpers ────────────────────────────────────────────────
+def expert_linear(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None) -> torch.Tensor:  # One vector-matrix product per leading (n, e) pair, with an optional bias added
+    n, e, k = x.shape  # x must be 3-D: (n, e, k)
+    _, _, _, o = weight.shape  # weight must be 4-D; only its last dimension o is read here
+    out = torch.bmm(x.reshape(n * e, 1, k), weight.reshape(n * e, k, o)).reshape(n, e, o)  # the reshape only succeeds if weight holds n*e*k*o elements, i.e. (n, e, k, o); presumably n = tokens, e = selected experts - confirm with caller
+    return out + bias if bias is not None else out  # parses as (out + bias) if bias is not None else out; bias must broadcast against (n, e, o)
+@dataclass
+class ModelConfig:  # Typed subset of the checkpoint config; every field is required (none has a default)
+    num_hidden_layers: int; num_experts: int; experts_per_token: int
+    vocab_size: int; num_labels: int; hidden_size: int; intermediate_size: int
+    head_dim: int; num_attention_heads: int; num_key_value_heads: int
+    bidirectional_context_size: int; initial_context_length: int  # bidirectional_context_size is derived in from_checkpoint_config from "bidirectional_left_context"
+    rope_theta: float; rope_scaling_factor: float; rope_ntk_alpha: float; rope_ntk_beta: float
+    @classmethod
+    def from_checkpoint_config(cls, cfg: dict, *, context: str) -> "ModelConfig":  # Build an instance from a raw config dict; `context` is accepted but not used in this body
+        cfg = dict(cfg)  # shallow copy so the caller's dict is not mutated by the assignment below
+        cfg["bidirectional_context_size"] = cfg["bidirectional_left_context"]  # KeyError if the source key is missing
+        fields = {f.name for f in dataclasses.fields(cls)}  # names of the dataclass fields declared on this class
+        return cls(**{k: v for k, v in cfg.items() if k in fields})  # extra keys are dropped; a missing field raises TypeError from the generated __init__
+class RMSNorm(torch.nn.Module):  # Root-mean-square normalisation over the last dimension with a learned per-feature scale
+    def __init__(self, n: int, eps: float = 1e-5, device=None):  # n: length of the scale vector; eps: added to the mean square before rsqrt
+        super().__init__()
+        self.eps = eps
+        self.scale = torch.nn.Parameter(torch.ones(n, device=device, dtype=torch.float32))  # float32 parameter initialised to ones
+    def forward(self, x):
+        t = x.float()  # compute in float32 regardless of the input dtype
+        return (t * torch.rsqrt(t.pow(2).mean(-1, keepdim=True) + self.eps) * self.scale).to(x.dtype)  # t / sqrt(mean(t^2) + eps) * scale, cast back to x's dtype
def apply_rope(x, cos, sin):
    """Rotate interleaved (even, odd) feature pairs of ``x`` by the given angles.

    ``cos`` and ``sin`` get a new second-to-last axis (so they broadcast over
    it) and are cast to ``x.dtype``.  The output has the same shape as ``x``.
    """
    cos_b = cos.unsqueeze(-2).to(x.dtype)
    sin_b = sin.unsqueeze(-2).to(x.dtype)
    even = x[..., ::2]
    odd = x[..., 1::2]
    rotated_even = even * cos_b - odd * sin_b
    rotated_odd = odd * cos_b + even * sin_b
    # Stacking on a new last axis and reshaping re-interleaves the pairs.
    interleaved = torch.stack((rotated_even, rotated_odd), dim=-1)
    return interleaved.reshape(x.shape)
class RotaryEmbedding(torch.nn.Module):
    """Rotary position embedding with cached cos/sin tables.

    The tables are computed on CPU at construction, stored as non-persistent
    buffers, and regrown in ``forward`` when a longer sequence arrives.
    A ``scaling_factor`` above 1.0 switches on the blended inverse frequencies
    and the extra cos/sin multiplier implemented below.
    """

    def __init__(self, head_dim, base, dtype, *, initial_context_length=4096,
                 scaling_factor=1.0, ntk_alpha=1.0, ntk_beta=32.0, device=None):
        super().__init__()
        self.head_dim = head_dim
        self.base = base
        self.dtype = dtype
        self.initial_context_length = initial_context_length
        self.scaling_factor = scaling_factor
        self.ntk_alpha = ntk_alpha
        self.ntk_beta = ntk_beta
        self.device = device
        scaled_length = int(initial_context_length * scaling_factor)
        max_positions = max(scaled_length, initial_context_length)
        self.max_position_embeddings = max_positions
        cos_table, sin_table = self._compute(max_positions, device=torch.device("cpu"))
        destination = device or torch.device("cpu")
        self.register_buffer("cos_cache", cos_table.to(destination), persistent=False)
        self.register_buffer("sin_cache", sin_table.to(destination), persistent=False)

    def _inv_freq(self, device=None):
        """Return the per-pair inverse frequencies (blended when scaling is on)."""
        device = device or self.device
        exponents = torch.arange(0, self.head_dim, 2, dtype=torch.float, device=device) / self.head_dim
        freq = self.base ** exponents
        if not self.scaling_factor > 1.0:
            return 1.0 / freq
        half_dim = self.head_dim / 2
        log_base = math.log(self.base)
        low = half_dim * math.log(self.initial_context_length / (self.ntk_beta * 2 * math.pi)) / log_base
        high = half_dim * math.log(self.initial_context_length / (self.ntk_alpha * 2 * math.pi)) / log_base
        interpolated = 1.0 / (self.scaling_factor * freq)
        extrapolated = 1.0 / freq
        ramp = (torch.arange(half_dim, dtype=torch.float32, device=device) - low) / (high - low)
        # mask is 1 where the unscaled frequency is kept and 0 where the scaled one is used.
        mask = 1 - ramp.clamp(0, 1)
        return interpolated * (1 - mask) + extrapolated * mask

    def _compute(self, n, device=None):
        """Return ``(cos, sin)`` tables for positions ``0..n-1`` in ``self.dtype``."""
        inv_freq = self._inv_freq(device)
        positions = torch.arange(n, dtype=torch.float32, device=device or self.device)
        angles = torch.einsum("i,j->ij", positions, inv_freq)
        if self.scaling_factor > 1.0:
            concentration = 0.1 * math.log(self.scaling_factor) + 1.0
        else:
            concentration = 1.0
        cos_table = (angles.cos() * concentration).to(self.dtype)
        sin_table = (angles.sin() * concentration).to(self.dtype)
        return cos_table, sin_table

    def forward(self, q, k):
        """Rotate ``q`` and ``k`` by position; the first dimension is the position index."""
        n = q.shape[0]
        if n > self.cos_cache.shape[0]:
            # Sequence is longer than the cached tables: rebuild them on CPU and move over.
            grown_cos, grown_sin = self._compute(n, torch.device("cpu"))
            self.cos_cache, self.sin_cache = grown_cos.to(q.device), grown_sin.to(q.device)
        cos_table, sin_table = self.cos_cache, self.sin_cache
        if cos_table.device != q.device:
            cos_table = cos_table.to(q.device)
        if sin_table.device != q.device:
            sin_table = sin_table.to(q.device)
        cos, sin = cos_table[:n], sin_table[:n]
        q_rot = apply_rope(q.view(n, -1, self.head_dim), cos, sin).reshape(q.shape)
        k_rot = apply_rope(k.view(n, -1, self.head_dim), cos, sin).reshape(k.shape)
        return q_rot, k_rot
def sdpa(Q, K, V, S, sm_scale, ctx):
    """Windowed attention with a per-head sink logit.

    ``Q`` is unpacked as a 4-D tensor ``(n, heads, q_mult, head_dim)``; ``K``
    and ``V`` are zero-padded by ``ctx`` rows on both ends of dim 0 and
    unfolded so each position sees a window of ``2 * ctx + 1`` neighbours.
    Window slots outside ``[0, n)`` are masked to ``-inf``.  ``S * ln 2`` is
    appended as one extra logit per head before the softmax and its weight is
    discarded afterwards.  Returns a tensor reshaped to ``(n, -1)``.
    """
    n, n_heads, q_mult, _ = Q.shape
    window = 2 * ctx + 1
    pad_rows = (0, 0, 0, 0, ctx, ctx)
    k_windows = F.pad(K, pad_rows).unfold(0, window, 1).permute(0, 3, 1, 2)
    v_windows = F.pad(V, pad_rows).unfold(0, window, 1).permute(0, 3, 1, 2)

    # Absolute position of every window slot, used to mask the padding.
    offsets = torch.arange(window, device=Q.device) - ctx
    positions = torch.arange(n, device=Q.device)[:, None] + offsets[None, :]
    in_range = (positions >= 0) & (positions < n)

    logits = torch.einsum("nhqd,nwhd->nhqw", Q, k_windows).float() * sm_scale
    logits = logits.masked_fill(~in_range[:, None, None, :], -float("inf"))

    sink_logit = (S * math.log(2.0)).reshape(n_heads, q_mult)[None, :, :, None].expand(n, -1, -1, 1)
    logits = torch.cat([logits, sink_logit], dim=-1)

    # Drop the sink column after normalisation; it only absorbs probability mass.
    weights = torch.softmax(logits, dim=-1)[..., :-1].to(V.dtype)
    return torch.einsum("nhqw,nwhd->nhqd", weights, v_windows).reshape(n, -1)
class AttentionBlock(torch.nn.Module):
    """Pre-norm windowed attention sub-layer with a residual connection.

    A single fused ``qkv`` projection produces queries, keys and values;
    rotary embedding is applied to q/k, both are scaled by
    ``head_dim ** -0.25``, and ``sdpa`` performs the windowed attention with
    the learned per-head ``sinks``.
    """

    def __init__(self, cfg: ModelConfig, device=None):
        super().__init__()
        weight_dtype = torch.bfloat16
        head_dim = cfg.head_dim
        n_heads = cfg.num_attention_heads
        n_kv_heads = cfg.num_key_value_heads
        self.head_dim, self.nah, self.nkv = head_dim, n_heads, n_kv_heads
        self.ctx = int(cfg.bidirectional_context_size)
        self.sinks = torch.nn.Parameter(torch.empty(n_heads, device=device, dtype=torch.float32))
        self.norm = RMSNorm(cfg.hidden_size, device=device)
        fused_width = head_dim * (n_heads + 2 * n_kv_heads)
        self.qkv = torch.nn.Linear(cfg.hidden_size, fused_width, device=device, dtype=weight_dtype)
        self.out = torch.nn.Linear(head_dim * n_heads, cfg.hidden_size, device=device, dtype=weight_dtype)
        # Applied to q and k separately, so their product is scaled by 1/sqrt(head_dim).
        self.qk_scale = 1 / math.sqrt(math.sqrt(head_dim))
        self.rope = RotaryEmbedding(
            head_dim,
            int(cfg.rope_theta),
            torch.float32,
            initial_context_length=cfg.initial_context_length,
            scaling_factor=cfg.rope_scaling_factor,
            ntk_alpha=cfg.rope_ntk_alpha,
            ntk_beta=cfg.rope_ntk_beta,
            device=device,
        )

    def forward(self, x):
        """Return ``x`` plus the attention output (cast back to ``x.dtype``)."""
        normed = self.norm(x).to(self.qkv.weight.dtype)
        fused = F.linear(normed, self.qkv.weight, self.qkv.bias)

        head_dim, n_heads, n_kv_heads = self.head_dim, self.nah, self.nkv
        q_end = n_heads * head_dim
        k_end = q_end + n_kv_heads * head_dim
        v_end = k_end + n_kv_heads * head_dim
        q = fused[:, :q_end].contiguous()
        k = fused[:, q_end:k_end].contiguous()
        v = fused[:, k_end:v_end].contiguous()

        q, k = self.rope(q, k)
        q = q * self.qk_scale
        k = k * self.qk_scale

        n = q.shape[0]
        # Queries are grouped per key/value head: (n, kv_heads, queries_per_kv, head_dim).
        q = q.view(n, n_kv_heads, n_heads // n_kv_heads, head_dim)
        k = k.view(n, n_kv_heads, head_dim)
        v = v.view(n, n_kv_heads, head_dim)

        # Scaling was already folded into q and k, hence sm_scale = 1.0.
        attended = sdpa(q, k, v, self.sinks, 1.0, self.ctx).to(self.out.weight.dtype)
        projected = F.linear(attended, self.out.weight, self.out.bias)
        return x + projected.to(x.dtype)
def swiglu(x, alpha=1.702, limit=7.0):
    """Clamped gated activation over the two halves of the last dimension.

    The first half (gate) is clamped from above at ``limit``; the second half
    (linear) is clamped to ``[-limit, limit]``.  The result is
    ``gate * sigmoid(alpha * gate) * (linear + 1)``.
    """
    gate, linear = x.chunk(2, dim=-1)
    gate = gate.clamp(max=limit)
    linear = linear.clamp(-limit, limit)
    activated = gate * torch.sigmoid(alpha * gate)
    return activated * (linear + 1)
class MLPBlock(torch.nn.Module):
    """Pre-norm mixture-of-experts feed-forward sub-layer with a residual connection.

    A linear gate scores every expert per row; the top ``experts_per_token``
    experts are evaluated and mixed using a softmax over their gate scores.
    """

    def __init__(self, cfg: ModelConfig, device=None):
        super().__init__()
        weight_dtype = torch.bfloat16
        n_experts = cfg.num_experts
        hidden = cfg.hidden_size
        inner = cfg.intermediate_size
        self.ne, self.ept = n_experts, cfg.experts_per_token
        self.norm = RMSNorm(hidden, device=device)
        self.gate = torch.nn.Linear(hidden, n_experts, device=device, dtype=weight_dtype)
        # The first projection is twice as wide because swiglu splits it into two halves.
        self.mlp1_weight = torch.nn.Parameter(
            torch.empty(n_experts, hidden, inner * 2, device=device, dtype=weight_dtype))
        self.mlp1_bias = torch.nn.Parameter(
            torch.empty(n_experts, inner * 2, device=device, dtype=weight_dtype))
        self.mlp2_weight = torch.nn.Parameter(
            torch.empty(n_experts, inner, hidden, device=device, dtype=weight_dtype))
        self.mlp2_bias = torch.nn.Parameter(
            torch.empty(n_experts, hidden, device=device, dtype=weight_dtype))

    def forward(self, x):
        """Return ``x`` plus the mixed expert output (cast back to ``x.dtype``)."""
        normed = self.norm(x)
        gate_scores = F.linear(normed.float(), self.gate.weight.float(), self.gate.bias.float())
        picked = torch.topk(gate_scores, k=self.ept, dim=-1, sorted=True)
        expert_weights = torch.softmax(picked.values, dim=-1) / self.ept
        expert_ids = picked.indices
        per_token = self.ept

        def run_experts(rows, ids, weights):
            """Evaluate the selected experts for a slice of rows, all in float32."""
            fanned = rows.float().unsqueeze(1).expand(-1, ids.shape[1], -1)
            hidden = expert_linear(fanned, self.mlp1_weight[ids].float(), self.mlp1_bias[ids].float())
            hidden = swiglu(hidden)
            hidden = expert_linear(hidden.float(), self.mlp2_weight[ids].float(), self.mlp2_bias[ids].float())
            mixed = torch.einsum("bec,be->bc", hidden.to(weights.dtype), weights)
            # Multiplying by per_token undoes the division applied to expert_weights.
            return (mixed * per_token).to(x.dtype)

        chunk_size = 32
        n_rows = normed.shape[0]
        if n_rows <= chunk_size:
            return x + run_experts(normed, expert_ids, expert_weights)

        # Process long inputs in slices so the gathered expert weights stay small.
        pieces = []
        for begin in range(0, n_rows, chunk_size):
            stop = begin + chunk_size
            pieces.append(run_experts(normed[begin:stop], expert_ids[begin:stop], expert_weights[begin:stop]))
        return x + torch.cat(pieces, 0)
class TransformerBlock(torch.nn.Module):
    """One model layer: the attention sub-layer followed by the expert MLP sub-layer."""

    def __init__(self, cfg, device=None):
        super().__init__()
        self.attn = AttentionBlock(cfg, device=device)
        self.mlp = MLPBlock(cfg, device=device)

    def forward(self, x):
        """Run attention, then the MLP; each sub-layer adds its own residual."""
        after_attention = self.attn(x)
        return self.mlp(after_attention)
class Checkpoint:
    """Tensor lookup over the ``.safetensors`` files found in one directory."""

    @staticmethod
    def build_param_name_map(n):
        """Map this module's MLP parameter names to the names stored on disk.

        Covers the four expert tensors of each of the ``n`` blocks; every
        other parameter name is looked up unchanged by ``get``.
        """
        renames = (
            ("mlp1_bias", "swiglu.bias"),
            ("mlp1_weight", "swiglu.weight"),
            ("mlp2_bias", "out.bias"),
            ("mlp2_weight", "out.weight"),
        )
        return {
            f"block.{i}.mlp.{ours}": f"block.{i}.mlp.{stored}"
            for ours, stored in renames
            for i in range(n)
        }

    def __init__(self, path, device, num_hidden_layers):
        self.pnm = self.build_param_name_map(num_hidden_layers)
        # Device string handed to safe_open: "type" or "type:index".
        if device.index is None:
            self.ds = device.type
        else:
            self.ds = f"{device.type}:{device.index}"
        shard_paths = [
            os.path.join(path, entry)
            for entry in os.listdir(path)
            if entry.endswith(".safetensors")
        ]
        # Index: stored tensor name -> file that contains it.
        self.map = {}
        for shard in shard_paths:
            with safe_open(shard, framework="pt", device=self.ds) as handle:
                for key in handle.keys():
                    self.map[key] = shard

    def get(self, name):
        """Load and return the tensor for parameter ``name`` (after renaming)."""
        stored_name = self.pnm.get(name, name)
        shard = self.map[stored_name]
        with safe_open(shard, framework="pt", device=self.ds) as handle:
            return handle.get_tensor(stored_name)
class Transformer(torch.nn.Module):
    """Token classifier: embedding, a stack of blocks, final norm, label projection."""

    def __init__(self, cfg, device):
        super().__init__()
        weight_dtype = torch.bfloat16
        self.embedding = torch.nn.Embedding(cfg.vocab_size, cfg.hidden_size, device=device, dtype=weight_dtype)
        layers = [TransformerBlock(cfg, device=device) for _ in range(cfg.num_hidden_layers)]
        self.block = torch.nn.ModuleList(layers)
        self.norm = RMSNorm(cfg.hidden_size, device=device)
        self.unembedding = torch.nn.Linear(
            cfg.hidden_size, cfg.num_labels, bias=False, device=device, dtype=weight_dtype)

    def forward(self, token_ids):
        """Return per-token label logits for ``token_ids``."""
        hidden = self.embedding(token_ids)
        for layer in self.block:
            hidden = layer(hidden)
        return F.linear(self.norm(hidden), self.unembedding.weight, None)

    @classmethod
    def from_checkpoint(cls, checkpoint_dir, *, device):
        """Build the model from ``checkpoint_dir`` and copy in every stored parameter.

        Disables TF32 and sets float32 matmul precision to "highest" as a
        process-wide side effect.  Raises ``ValueError`` when a stored tensor's
        shape differs from the corresponding parameter.
        """
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cudnn.allow_tf32 = False
        torch.set_float32_matmul_precision("highest")

        config_path = Path(checkpoint_dir) / "config.json"
        raw_cfg = json.loads(config_path.read_text())
        validate_model_config_contract(raw_cfg, context=str(checkpoint_dir))
        cfg = ModelConfig.from_checkpoint_config(raw_cfg, context=str(checkpoint_dir))

        weights = Checkpoint(checkpoint_dir, device, cfg.num_hidden_layers)
        model = cls(cfg, device)
        model.eval()
        for name, param in model.named_parameters():
            loaded = weights.get(name)
            if param.shape != loaded.shape:
                raise ValueError(f"Shape mismatch {name}: {param.shape} vs {loaded.shape}")
            param.data.copy_(loaded)
        return model
+# ── label info + span decoding ───────────────────────────────────
+@dataclass(frozen=True)
+class LabelInfo:
+    boundary_label_lookup: dict[str, dict[str, int]]
+    token_to_span_label: dict[int, int]
+    token_boundary_tags: dict[int, str | None]
+    span_class_names: tuple[str, ...]
+    span_label_lookup: dict[str, int]
+    background_token_label: int
+    background_span_label: int
def labels_to_spans(labels_by_index, label_info):
    """Turn per-token labels into ``(span_label, start, end)`` token spans.

    ``labels_by_index`` maps token index -> token label id; indices are
    visited in sorted order and ``end`` is exclusive.  A gap in the indices
    closes any open span.  Boundary tags are handled leniently: an "I" or "E"
    that does not continue the open span starts a fresh one, and a token whose
    label id has no span mapping is skipped without closing the open span.
    """
    spans = []
    background = label_info.background_span_label
    open_label = None
    open_start = None
    last_index = None

    def close_open_span(end=None):
        """Emit the span in progress (if any) and clear the open-span state."""
        nonlocal open_label, open_start
        if open_label is not None and open_start is not None:
            stop = last_index + 1 if end is None else end
            spans.append((open_label, open_start, stop))
        open_label = open_start = None

    for index in sorted(labels_by_index):
        token_label = labels_by_index[index]
        span_label = label_info.token_to_span_label.get(token_label)
        boundary = label_info.token_boundary_tags.get(token_label)

        # Non-contiguous token index: whatever was open ends at the previous token.
        if last_index is not None and index != last_index + 1:
            close_open_span()

        if span_label is None:
            # Unknown label id: ignore the token, leave any open span untouched.
            pass
        elif span_label == background:
            close_open_span(index)
        elif boundary == "S":
            close_open_span()
            spans.append((span_label, index, index + 1))
        elif boundary == "B":
            close_open_span()
            open_label, open_start = span_label, index
        elif boundary == "I":
            if open_label is None or open_label != span_label:
                close_open_span()
                open_label, open_start = span_label, index
        elif boundary == "E":
            if open_label is None or open_label != span_label or open_start is None:
                # Stray end tag: close what was open and emit a one-token span.
                close_open_span()
                spans.append((span_label, index, index + 1))
            else:
                spans.append((open_label, open_start, index + 1))
                open_label = open_start = None
        else:
            close_open_span()
        last_index = index

    close_open_span()
    return spans
def token_spans_to_char_spans(spans, cs, ce):
    """Convert token spans to character spans.

    ``cs[i]`` / ``ce[i]`` are the start / end character offsets of token
    ``i``.  Spans whose token range is empty or falls outside ``cs`` are
    dropped, as are spans that end up with no characters.
    """
    token_count = len(cs)
    char_spans = []
    for label, tok_start, tok_end in spans:
        if 0 <= tok_start < tok_end <= token_count:
            char_start = cs[tok_start]
            char_end = ce[tok_end - 1]
            if char_end > char_start:
                char_spans.append((label, char_start, char_end))
    return char_spans
def trim_char_spans_whitespace(spans, text):
    """Shrink each character span so it neither starts nor ends on whitespace.

    Spans outside ``text`` and spans that contain only whitespace are dropped.
    """
    text_length = len(text)
    trimmed = []
    for label, start, end in spans:
        if not (0 <= start < end <= text_length):
            continue
        segment = text[start:end]
        # Length differences after stripping give the amount to cut at each edge.
        leading = len(segment) - len(segment.lstrip())
        if leading == len(segment):
            continue  # nothing but whitespace
        trailing = len(segment) - len(segment.rstrip())
        trimmed.append((label, start + leading, end - trailing))
    return trimmed
+# ── viterbi decoder ──────────────────────────────────────────────
@functools.lru_cache(maxsize=1)
def get_viterbi_transition_biases():
    """Load the Viterbi transition biases, falling back to all zeros.

    Reads ``viterbi_calibration.json`` from ``MODEL_DIR`` when present.  The
    biases are taken from the default preset under ``operating_points`` when
    that structure exists, otherwise from the top-level object.  Keys missing
    from the file default to 0.0.  The result is cached for the process.
    """
    zeros = {key: 0.0 for key in VITERBI_TRANSITION_BIAS_KEYS}
    calibration_path = MODEL_DIR / "viterbi_calibration.json"
    if not calibration_path.is_file():
        return zeros

    payload = json.loads(calibration_path.read_text())
    bias_source = payload
    operating_points = payload.get("operating_points")
    if isinstance(operating_points, dict):
        preset = operating_points.get(DEFAULT_VITERBI_CALIBRATION_PRESET)
        if isinstance(preset, dict):
            bias_source = preset.get("biases", bias_source)

    if not isinstance(bias_source, dict):
        return zeros
    biases = {}
    for key in VITERBI_TRANSITION_BIAS_KEYS:
        biases[key] = float(bias_source.get(key, 0.0))
    return biases
class Decoder:
    """Viterbi decoder that only allows well-formed boundary-tag sequences.

    Start, end and transition score tables are built once from the label
    tables; disallowed moves keep a score of -1e9 and allowed transitions
    carry the calibrated bias for their kind.
    """

    def __init__(self, label_info):
        num_classes = len(label_info.token_to_span_label)
        blocked = -1e9
        self._start = torch.full((num_classes,), blocked, dtype=torch.float32)
        self._end = torch.full((num_classes,), blocked, dtype=torch.float32)
        self._trans = torch.full((num_classes, num_classes), blocked, dtype=torch.float32)
        biases = get_viterbi_transition_biases()
        bg_token = label_info.background_token_label
        bg_span = label_info.background_span_label
        span_of = label_info.token_to_span_label
        tag_of = label_info.token_boundary_tags
        for prev in range(num_classes):
            prev_tag = tag_of.get(prev)
            prev_span = span_of.get(prev)
            # A sequence may start on B/S/background and end on E/S/background.
            if prev_tag in {"B", "S"} or prev == bg_token:
                self._start[prev] = 0.0
            if prev_tag in {"E", "S"} or prev == bg_token:
                self._end[prev] = 0.0
            for nxt in range(num_classes):
                next_tag = tag_of.get(nxt)
                next_span = span_of.get(nxt)
                if self._valid(prev_tag, prev_span, next_tag, next_span, bg_token, bg_span, nxt):
                    self._trans[prev, nxt] = self._bias(
                        prev_tag, prev_span, next_tag, next_span, bg_span, biases)

    @staticmethod
    def _valid(pt, ps, nt, ns, bti, bsi, ni):
        """Return whether moving from label (pt, ps) to label (nt, ns) is allowed."""
        next_is_background = ns == bsi or ni == bti
        if (ns is None or nt is None) and not next_is_background:
            return False
        outside_span = pt is None or ps is None or ps == bsi or pt in {"E", "S"}
        if outside_span:
            # Not inside a span: stay in background or open a new span.
            return next_is_background or nt in {"B", "S"}
        if pt in {"B", "I"}:
            # Inside a span: must continue or end the same span class.
            return ps == ns and nt in {"I", "E"}
        return False

    @staticmethod
    def _bias(pt, ps, nt, ns, bsi, b):
        """Pick the calibrated bias for an allowed transition."""
        next_is_background = ns == bsi
        prev_is_background = ps == bsi
        if prev_is_background:
            key = "transition_bias_background_stay" if next_is_background else "transition_bias_background_to_start"
        elif pt in {"B", "I"}:
            key = "transition_bias_inside_to_continue" if nt == "I" else "transition_bias_inside_to_end"
        else:
            key = "transition_bias_end_to_background" if next_is_background else "transition_bias_end_to_start"
        return b[key]

    def decode(self, lp):
        """Return the best label path for ``lp`` (rows = positions, cols = labels).

        Returns ``[]`` for an empty input, and falls back to a per-row argmax
        when no final score is finite.
        """
        steps, num_classes = lp.shape
        if steps == 0:
            return []
        start = self._start.to(lp.device, lp.dtype)
        end = self._end.to(lp.device, lp.dtype)
        trans = self._trans.to(lp.device, lp.dtype)

        running = lp[0] + start
        backpointers = torch.empty((steps - 1, num_classes), device=lp.device, dtype=torch.int64)
        for step in range(1, steps):
            # candidates[i, j]: best score ending in i at the previous step, then moving to j.
            candidates = running.unsqueeze(1) + trans
            best_score, best_prev = candidates.max(dim=0)
            backpointers[step - 1] = best_prev
            running = best_score + lp[step]

        if not torch.isfinite(running).any():
            return lp.argmax(dim=1).tolist()

        running += end
        path = torch.empty(steps, device=lp.device, dtype=torch.int64)
        path[-1] = running.argmax()
        for step in range(steps - 2, -1, -1):
            path[step] = backpointers[step, path[step + 1]]
        return path.tolist()
+# ── runtime singleton ────────────────────────────────────────────
@dataclass(frozen=True)
class InferenceRuntime:
    """Immutable bundle of everything needed to run inference."""

    model: Transformer
    encoding: tiktoken.Encoding
    label_info: LabelInfo
    device: torch.device
    # Number of tokens fed to the model per forward pass.
    n_ctx: int
@functools.lru_cache(maxsize=1)
def get_runtime():
    """Build the inference runtime once and cache it for the process.

    Validates the checkpoint config, derives the label tables from
    ``NER_CLASS_NAMES`` and loads the model onto CUDA.
    """
    checkpoint_dir = MODEL_DIR
    cfg = json.loads((checkpoint_dir / "config.json").read_text())
    validate_model_config_contract(cfg, context=str(checkpoint_dir))
    device = torch.device("cuda")
    encoding = tiktoken.get_encoding(str(cfg["encoding"]).strip())

    # Label tables: the background class is always span class 0.
    span_names = [BACKGROUND_CLASS_LABEL]
    span_index_by_name = {BACKGROUND_CLASS_LABEL: 0}
    boundary_lookup = {}
    token_to_span = {}
    token_tags = {}
    background_token = None
    for token_label, name in enumerate(NER_CLASS_NAMES):
        if name == BACKGROUND_CLASS_LABEL:
            background_token = token_label
            token_to_span[token_label] = 0
            token_tags[token_label] = None
            continue
        # Non-background names have the form "<boundary tag>-<span class>".
        tag, span_name = name.split("-", 1)
        span_index = span_index_by_name.get(span_name)
        if span_index is None:
            span_index = len(span_names)
            span_names.append(span_name)
            span_index_by_name[span_name] = span_index
        token_to_span[token_label] = span_index
        token_tags[token_label] = tag
        boundary_lookup.setdefault(span_name, {})[tag] = token_label

    label_info = LabelInfo(
        boundary_lookup,
        token_to_span,
        token_tags,
        tuple(span_names),
        span_index_by_name,
        background_token,
        0,
    )
    model = Transformer.from_checkpoint(str(checkpoint_dir), device=device)
    return InferenceRuntime(model, encoding, label_info, device, int(cfg["default_n_ctx"]))
@torch.inference_mode()
def predict_text(runtime, text, decoder):
    """Detect labelled spans in ``text``.

    Returns ``(source_text, detected)`` where ``detected`` is a list of dicts
    with ``label``, ``start``, ``end`` and ``text`` keys, and the offsets index
    into ``source_text`` (the text reconstructed from the tokens).  The text
    is scored in consecutive windows of ``runtime.n_ctx`` tokens and decoded
    as one sequence.
    """
    token_ids = tuple(int(t) for t in runtime.encoding.encode(text, allowed_special="all"))
    if not token_ids:
        return text, []

    # Score every token, one window at a time.
    per_token_logprobs = []
    window = runtime.n_ctx
    total_tokens = len(token_ids)
    for begin in range(0, total_tokens, window):
        stop = min(begin + window, total_tokens)
        window_ids = torch.tensor(token_ids[begin:stop], device=runtime.device, dtype=torch.int32)
        logprobs = F.log_softmax(runtime.model(window_ids).float(), dim=-1)
        per_token_logprobs.extend(logprobs.unbind(0))
    stacked = torch.stack(per_token_logprobs, 0)

    labels = decoder.decode(stacked)
    if len(labels) != total_tokens:
        # Decoder returned an unexpected length: fall back to per-token argmax.
        labels = stacked.argmax(dim=1).tolist()
    labels_by_index = {position: int(label) for position, label in enumerate(labels)}
    token_spans = labels_to_spans(labels_by_index, runtime.label_info)

    # Rebuild the text from token bytes so token and character offsets line up.
    token_bytes = [runtime.encoding.decode_single_token_bytes(t) for t in token_ids]
    decoded = b"".join(token_bytes).decode("utf-8", errors="replace")

    # Byte offset where each decoded character starts and ends.
    char_byte_starts = []
    char_byte_ends = []
    offset = 0
    for ch in decoded:
        char_byte_starts.append(offset)
        offset += len(ch.encode("utf-8"))
        char_byte_ends.append(offset)

    # Character offsets covered by each token, found via its byte range.
    token_char_starts = []
    token_char_ends = []
    byte_cursor = 0
    for raw in token_bytes:
        byte_begin = byte_cursor
        byte_cursor = byte_begin + len(raw)
        token_char_starts.append(bisect_right(char_byte_ends, byte_begin))
        token_char_ends.append(bisect_left(char_byte_starts, byte_cursor))

    source = text if decoded == text else decoded
    char_spans = token_spans_to_char_spans(token_spans, token_char_starts, token_char_ends)
    char_spans = trim_char_spans_whitespace(char_spans, source)

    class_names = runtime.label_info.span_class_names
    detected = []
    for span_label, start, end in char_spans:
        if 0 <= span_label < len(class_names):
            label_name = class_names[span_label]
        else:
            label_name = f"label_{span_label}"
        detected.append({"label": label_name, "start": start, "end": end, "text": source[start:end]})
    return source, detected
+# =====================================================================
+# APPLICATION  LAYER
+# =====================================================================
 def extract_text(file_path: str) -> str:
     suffix = Path(file_path).suffix.lower()
     if suffix == ".pdf":
     raise ValueError(f"Unsupported file type: {suffix}")
def compute_stats(text, spans):
    """Summarise how much of ``text`` is covered by the detected spans.

    Returns totals, the covered percentage (rounded to one decimal, 0 for
    empty text) and a per-label breakdown of span count and character count.
    """
    total = len(text)
    pii_chars = 0
    categories = {}
    for span in spans:
        length = span["end"] - span["start"]
        pii_chars += length
        label = span["label"]
        bucket = categories.get(label)
        if bucket is None:
            bucket = categories[label] = {"count": 0, "chars": 0}
        bucket["count"] += 1
        bucket["chars"] += length

    if total:
        percentage = round(pii_chars / total * 100, 1)
    else:
        percentage = 0
    return {
        "total_chars": total,
        "pii_chars": pii_chars,
        "pii_percentage": percentage,
        "total_spans": len(spans),
        "categories": categories,
        "num_categories": len(categories),
    }
def detect_speakers(text, spans):
    """Count detected spans per speaker for transcript-like text.

    A line opening with ``Name:``, ``[Name]`` or ``Speaker N:`` sets the
    current speaker, which carries over to following lines.  Each span is
    attributed by its midpoint; spans on lines with no speaker count under
    "Document".  Returns ``{}`` when "Document" is the only key.
    """
    speaker_patterns = [
        re.compile(r"^([A-Z][a-zA-Z ]{1,30}):\s"),
        re.compile(r"^\[([^\]]{1,30})\]\s"),
        re.compile(r"^(Speaker\s*\d+):\s"),
    ]

    # (line start, line end, speaker in effect) for every line of the text.
    line_speakers = []
    offset = 0
    current = None
    for line in text.split("\n"):
        for pattern in speaker_patterns:
            match = pattern.match(line)
            if match:
                current = match.group(1).strip()
                break
        line_end = offset + len(line)
        line_speakers.append((offset, line_end, current))
        offset = line_end + 1  # skip the newline

    counts = {}
    for span in spans:
        midpoint = (span["start"] + span["end"]) // 2
        owner = "Document"
        for line_start, line_end, speaker in line_speakers:
            if speaker and line_start <= midpoint <= line_end:
                owner = speaker
                break
        counts[owner] = counts.get(owner, 0) + 1

    if set(counts) == {"Document"}:
        return {}
    return counts
@spaces.GPU
def run_pii_analysis(text: str):
    """Run PII detection on ``text`` and return ``(source_text, detected_spans)``.

    Uses the cached runtime and builds a fresh decoder from its label tables.
    """
    runtime = get_runtime()
    return predict_text(runtime, text, Decoder(label_info=runtime.label_info))
 # ── Gradio Server ────────────────────────────────────────────────
server = gr.Server()


@server.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve the single-page frontend."""
    page = FRONTEND_HTML
    return page
@server.post("/api/analyze")
async def analyze_document(file: UploadFile = File(...)):
    """Analyze an uploaded PDF/DOC/DOCX for PII and return the results as JSON.

    Responds with 400 for an unsupported (or missing) file extension or when
    no text can be extracted, and with 500 carrying the error message when
    anything else fails.  The upload is spooled to a temporary file that is
    always removed before returning.
    """
    # An upload may arrive without a filename; treat that as "no extension".
    suffix = Path(file.filename or "").suffix.lower()
    if suffix not in (".pdf", ".doc", ".docx"):
        return JSONResponse({"error": f"Unsupported: {suffix}. Use PDF, DOC, or DOCX."}, 400)

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            # Record the path before writing so a failed read/write is still cleaned up.
            tmp_path = tmp.name
            tmp.write(await file.read())
        text = extract_text(tmp_path)
        if not text.strip():
            return JSONResponse({"error": "No text content found."}, 400)
        source_text, spans = run_pii_analysis(text)
        stats = compute_stats(source_text, spans)
        speakers = detect_speakers(source_text, spans)
        return JSONResponse({
            "filename": file.filename, "text": source_text, "spans": spans,
            "stats": stats, "speakers": speakers,
            "categories_meta": {k: {"color": v["color"], "bg": v["bg"], "label": v["label"]}
                                for k, v in CATEGORIES_META.items()},
        })
    except Exception as e:
        return JSONResponse({"error": str(e)}, 500)
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
@server.api(name="analyze_text")
def analyze_text_api(text: str) -> str:
    """Gradio API endpoint: analyze raw text and return the result as a JSON string."""
    source_text, spans = run_pii_analysis(text)
    payload = {
        "text": source_text,
        "spans": spans,
        "stats": compute_stats(source_text, spans),
    }
    return json.dumps(payload, ensure_ascii=False)
+# ── Frontend HTML ────────────────────────────────────────────────
 FRONTEND_HTML = r"""<!DOCTYPE html>
 <html lang="en">
 <head>
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width,initial-scale=1">
+<title>PII Reveal</title>
 <link rel="preconnect" href="https://fonts.googleapis.com">
 <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap" rel="stylesheet">
 <style>
   --primary:#6366f1;--primary-light:#e0e7ff;
   --radius:12px;--radius-sm:8px;--shadow:0 1px 3px rgba(0,0,0,.08);
   --shadow-lg:0 8px 32px rgba(0,0,0,.12);
 }
 body{font-family:'Inter',system-ui,sans-serif;background:var(--bg);color:var(--text);min-height:100vh;line-height:1.6}
+/* Upload */
 #upload-view{display:flex;flex-direction:column;align-items:center;justify-content:center;min-height:100vh;padding:2rem}
 .upload-card{background:var(--surface);border-radius:20px;padding:3rem;max-width:640px;width:100%;text-align:center;box-shadow:var(--shadow-lg);position:relative;overflow:hidden}
 .upload-card::before{content:'';position:absolute;inset:-2px;border-radius:22px;background:linear-gradient(135deg,var(--primary),#ec4899,var(--primary));z-index:-1;opacity:0;transition:opacity .3s}
 .feature-desc{color:var(--text2);font-size:.75rem;line-height:1.4}
 .powered-by{margin-top:1.5rem;font-size:.8rem;color:var(--text3)}
+/* Results */
 #results-view{display:none;min-height:100vh}
 .top-bar{background:var(--surface);border-bottom:1px solid var(--border);padding:.75rem 1.5rem;display:flex;align-items:center;gap:1rem;position:sticky;top:0;z-index:100;box-shadow:var(--shadow)}
 .top-bar .brand{margin:0}
 .top-bar .brand-icon{width:32px;height:32px;font-size:1rem}
 .file-info{font-size:.85rem;color:var(--text2);margin-left:.5rem;flex:1}
 .btn{padding:.5rem 1rem;border-radius:var(--radius-sm);border:none;cursor:pointer;font-weight:600;font-size:.85rem;transition:all .15s}
 .btn-ghost{background:transparent;color:var(--text2);border:1px solid var(--border)}
 .btn-ghost:hover{background:var(--surface2)}
+/* Summary */
 .summary-strip{background:var(--surface);border-bottom:1px solid var(--border);padding:1rem 1.5rem;display:flex;align-items:center;gap:1.5rem;flex-wrap:wrap}
 .stat-big{text-align:center;min-width:80px}
 .stat-big .num{font-size:1.75rem;font-weight:800;color:var(--primary)}
 .category-chips{display:flex;flex-wrap:wrap;gap:.35rem}
 .chip{display:inline-flex;align-items:center;gap:.35rem;padding:.2rem .6rem;border-radius:20px;font-size:.75rem;font-weight:600;border:1.5px solid}
+/* Layout */
 .main-layout{display:flex;height:calc(100vh - 130px)}
 .doc-panel{flex:1;overflow-y:auto;padding:2rem;background:var(--bg)}
 .doc-content{background:var(--surface);border-radius:var(--radius);padding:2rem 2.5rem;max-width:900px;margin:0 auto;box-shadow:var(--shadow);font-size:.95rem;line-height:1.8;white-space:pre-wrap;word-wrap:break-word}
+/* PII */
 .pii{border-radius:3px;padding:1px 2px;cursor:pointer;transition:all .15s;position:relative;border-bottom:2px solid}
 .pii:hover{filter:brightness(.92)}
 .pii.dimmed{opacity:.15;border-bottom-color:transparent!important}
+.pii-private_person{background:rgba(239,68,68,.15);border-bottom-color:#ef4444;color:#991b1b}
+.pii-private_address{background:rgba(6,182,212,.15);border-bottom-color:#06b6d4;color:#155e75}
+.pii-private_email{background:rgba(59,130,246,.15);border-bottom-color:#3b82f6;color:#1e40af}
+.pii-private_phone{background:rgba(34,197,94,.15);border-bottom-color:#22c55e;color:#166534}
+.pii-private_url{background:rgba(234,179,8,.15);border-bottom-color:#eab308;color:#854d0e}
+.pii-private_date{background:rgba(168,85,247,.15);border-bottom-color:#a855f7;color:#6b21a8}
+.pii-account_number{background:rgba(249,115,22,.15);border-bottom-color:#f97316;color:#9a3412}
+.pii-secret{background:rgba(220,38,38,.15);border-bottom-color:#dc2626;color:#991b1b}
 .pii-tooltip{position:fixed;background:#1e293b;color:#fff;padding:.4rem .7rem;border-radius:6px;font-size:.75rem;font-weight:500;pointer-events:none;z-index:999;white-space:nowrap;box-shadow:0 4px 12px rgba(0,0,0,.2)}
+/* Sidebar */
 .sidebar{width:300px;background:var(--surface);border-left:1px solid var(--border);overflow-y:auto;padding:1.25rem;flex-shrink:0}
 .sidebar h3{font-size:.7rem;text-transform:uppercase;letter-spacing:.8px;color:var(--text3);margin-bottom:.75rem;font-weight:700}
 .filter-group{margin-bottom:1.5rem}
 .filter-label{flex:1;font-size:.85rem;font-weight:500}
 .filter-count{font-size:.75rem;color:var(--text3);font-weight:600;background:var(--surface2);padding:.1rem .45rem;border-radius:10px}
+/* Loading */
 #loading{position:fixed;inset:0;background:rgba(255,255,255,.85);backdrop-filter:blur(8px);display:none;flex-direction:column;align-items:center;justify-content:center;z-index:9999}
 .spinner{width:48px;height:48px;border:4px solid var(--border);border-top-color:var(--primary);border-radius:50%;animation:spin .8s linear infinite}
 @keyframes spin{to{transform:rotate(360deg)}}
 #loading p{margin-top:1rem;font-weight:600;color:var(--text2)}
 .progress-text{font-size:.85rem;color:var(--text3);margin-top:.5rem}
+.error-banner{background:#fef2f2;border:1px solid #fecaca;color:#991b1b;padding:1rem 1.5rem;border-radius:var(--radius-sm);margin:1rem;font-size:.9rem;display:none}
 @media(max-width:768px){
   .main-layout{flex-direction:column-reverse;height:auto}
   .sidebar{width:100%;border-left:none;border-top:1px solid var(--border)}
   .features{grid-template-columns:1fr}
 }
 </style>
 </head>
 <body>
 <div id="upload-view">
   <div class="upload-card">
+    <div class="brand"><div class="brand-icon">&#x1f50d;</div><h1>PII Reveal</h1></div>
     <p class="subtitle">Document Privacy Explorer</p>
     <div class="dropzone" id="dropzone">
       <div class="dropzone-icon">&#x1f4c4;</div>
       <input type="file" id="file-input" accept=".pdf,.doc,.docx">
     </div>
     <div class="features">
+      <div class="feature"><div class="feature-title">8 PII Categories</div><div class="feature-desc">Names, addresses, emails, phones, URLs, dates, accounts, secrets</div></div>
+      <div class="feature"><div class="feature-title">128k Context</div><div class="feature-desc">Full documents in one pass &mdash; no chunking artifacts</div></div>
+      <div class="feature"><div class="feature-title">Context-Aware</div><div class="feature-desc">Understands when "May" is a name vs. a month</div></div>
     </div>
     <div class="powered-by">Powered by <strong>OpenAI Privacy Filter</strong> &middot; Apache 2.0</div>
   </div>
 </div>
 <div id="results-view">
   <div class="top-bar">
+    <div class="brand"><div class="brand-icon">&#x1f50d;</div><h1>PII Reveal</h1></div>
     <div class="file-info" id="file-info"></div>
     <button class="btn btn-ghost" onclick="resetView()">New File</button>
   </div>
   <div class="error-banner" id="error-banner"></div>
   <div class="summary-strip" id="summary-strip">
     <div class="stat-big"><div class="num" id="stat-pct">0%</div><div class="lbl">PII Content</div></div>
     <div class="stat-divider"></div>
     <div class="stat-divider"></div>
     <div class="stat-big"><div class="num" id="stat-cats">0</div><div class="lbl">Categories</div></div>
     <div class="stat-divider"></div>
+    <div class="stat-bar"><div class="stat-bar-track" id="stat-bar-track"></div><div class="category-chips" id="category-chips"></div></div>
   </div>
   <div class="main-layout">
+    <div class="doc-panel"><div class="doc-content" id="doc-content"></div></div>
     <div class="sidebar">
+      <div class="filter-group"><h3>PII Categories</h3><div id="category-filters"></div></div>
+      <div class="filter-group" id="speaker-group" style="display:none"><h3>Speakers</h3><div id="speaker-filters"></div></div>
     </div>
   </div>
 </div>
+<div id="loading"><div class="spinner"></div><p>Analyzing document for PII&hellip;</p><div class="progress-text">Running OpenAI Privacy Filter (128k context)</div></div>
 <div class="pii-tooltip" id="tooltip" style="display:none"></div>
 <script>
// Page-wide view state. uploadFile() overwrites these fields from the
// /api/analyze response; the render* functions read them.
let S = {
  text: '',
  spans: [],
  stats: {},
  speakers: {},
  activeCats: new Set(),
  activeSpeakers: new Set(),
  catMeta: {},
};

// Display label for each PII category key.
const CLABELS = {
  private_person: 'Person',
  private_address: 'Address',
  private_email: 'Email',
  private_phone: 'Phone',
  private_url: 'URL',
  private_date: 'Date',
  account_number: 'Account',
  secret: 'Secret',
};

// Accent colour for each PII category key.
const CCOLORS = {
  private_person: '#ef4444',
  private_address: '#06b6d4',
  private_email: '#3b82f6',
  private_phone: '#22c55e',
  private_url: '#eab308',
  private_date: '#a855f7',
  account_number: '#f97316',
  secret: '#dc2626',
};

const dz = document.getElementById('dropzone');
const fi = document.getElementById('file-input');

// Highlight the dropzone while a drag hovers over it. preventDefault() is
// called on every drag event so the browser does not handle the drop itself.
for (const type of ['dragenter', 'dragover']) {
  dz.addEventListener(type, (ev) => {
    ev.preventDefault();
    dz.classList.add('dragover');
  });
}
for (const type of ['dragleave', 'drop']) {
  dz.addEventListener(type, (ev) => {
    ev.preventDefault();
    dz.classList.remove('dragover');
  });
}

// Only the first dropped / selected file is analysed.
dz.addEventListener('drop', (ev) => {
  const dropped = ev.dataTransfer.files[0];
  if (dropped) uploadFile(dropped);
});
fi.addEventListener('change', (ev) => {
  const picked = ev.target.files[0];
  if (picked) uploadFile(picked);
});
// Upload one file to /api/analyze, store the response in S and render it.
//
// file: a File from the drop zone or the file input. Only names ending in
//       .pdf, .doc or .docx are sent; anything else is reported via showError.
//
// Every failure (bad extension, HTTP error, non-JSON body, server-reported
// error, exception while rendering) is routed to showError; the function
// itself never rejects. The loading overlay is always hidden on exit.
async function uploadFile(file){
  const ext = file.name.split('.').pop().toLowerCase();
  if (!['pdf', 'doc', 'docx'].includes(ext)) {
    showError('Unsupported file type.');
    return;
  }

  document.getElementById('loading').style.display = 'flex';
  document.getElementById('upload-view').style.display = 'none';

  const form = new FormData();
  form.append('file', file);

  try {
    const r = await fetch('/api/analyze', {method: 'POST', body: form});

    // The body may not be JSON at all (e.g. a proxy error page). Parse it
    // defensively so the user sees the HTTP status, not a JSON syntax error.
    let d = null;
    try {
      d = await r.json();
    } catch (parseErr) {
      d = null;
    }
    if (!d || d.error || !r.ok) {
      showError((d && d.error) || ('Analysis failed: server returned HTTP ' + r.status));
      return;
    }

    S.text = d.text;
    S.spans = d.spans;
    S.stats = d.stats;
    S.speakers = d.speakers || {};
    S.catMeta = d.categories_meta || {};
    // Every category and speaker starts out enabled.
    S.activeCats = new Set(Object.keys(d.stats.categories));
    // Use the defaulted S.speakers: Object.keys(d.speakers) throws a
    // TypeError when the response carries no speakers field.
    S.activeSpeakers = new Set(Object.keys(S.speakers));
    renderResults(d.filename);
  } catch (e) {
    showError('Analysis failed: ' + e.message);
  } finally {
    document.getElementById('loading').style.display = 'none';
  }
}
// Show message m in the error banner.
//
// The banner lives inside #results-view, so that view is made visible and
// the loading overlay is hidden. #upload-view is hidden as well: when this
// is called from the file-type check in uploadFile the upload view has not
// been hidden yet, and without this both views would be shown together.
function showError(m){
  document.getElementById('loading').style.display = 'none';
  document.getElementById('upload-view').style.display = 'none';
  document.getElementById('results-view').style.display = 'block';
  const banner = document.getElementById('error-banner');
  banner.textContent = m;
  banner.style.display = 'block';
}
// Return to the upload screen: hide the results view and the error banner,
// show the upload view, and clear the file input's current selection.
function resetView(){
  const displayById = {
    'results-view': 'none',
    'upload-view': 'flex',
    'error-banner': 'none',
  };
  for (const [id, display] of Object.entries(displayById)) {
    document.getElementById(id).style.display = display;
  }
  fi.value = '';
}
// Switch to the results view and redraw every panel from the state in S.
//
// fn: file name shown in the top bar (set as plain text).
function renderResults(fn){
  const byId = (id) => document.getElementById(id);

  byId('results-view').style.display = 'block';
  byId('error-banner').style.display = 'none';
  byId('file-info').textContent = fn;

  // Same order as before: summary strip, sidebar filters, then the document.
  const panels = [renderSummary, renderCatFilters, renderSpeakerFilters, renderDoc];
  for (const draw of panels) draw();
}
// Fill the summary strip from S.stats: the three headline numbers, the
// stacked per-category bar and the per-category count chips.
//
// Reads S.stats.pii_percentage, total_spans, num_categories, total_chars and
// S.stats.categories (category key -> {chars, count}).
function renderSummary(){
  const s = S.stats;

  // Write one headline number. A missing target element is logged and
  // skipped instead of throwing, so one absent id cannot abort the render.
  const put = (id, value) => {
    const node = document.getElementById(id);
    if (node) node.textContent = value;
    else console.warn('renderSummary: missing element #' + id);
  };
  put('stat-pct', s.pii_percentage + '%');
  put('stat-spans', s.total_spans);
  put('stat-cats', s.num_categories);

  const track = document.getElementById('stat-bar-track');
  const chips = document.getElementById('category-chips');
  track.innerHTML = '';
  chips.innerHTML = '';

  const total = s.total_chars;
  for (const [cat, info] of Object.entries(s.categories)) {
    // Categories without a known colour fall back to grey.
    const color = CCOLORS[cat] || '#888';

    const seg = document.createElement('div');
    seg.className = 'stat-bar-fill';
    // Guard the division: a zero or missing total would give a NaN width.
    seg.style.width = (total > 0 ? info.chars / total * 100 : 0) + '%';
    seg.style.background = color;
    track.appendChild(seg);

    const chip = document.createElement('span');
    chip.className = 'chip';
    // The trailing 15 is a two-digit hex alpha appended to the #rrggbb colour.
    chip.style.cssText = `color:${color};border-color:${color};background:${color}15`;
    chip.textContent = (CLABELS[cat] || cat) + ' ' + info.count;
    chips.appendChild(chip);
  }
}
// Rebuild the sidebar's category checkboxes.
//
// One row is created per key of CLABELS that also appears in
// S.stats.categories, in CLABELS order. Toggling a row adds or removes the
// category in S.activeCats and redraws the document.
function renderCatFilters(){
  const container = document.getElementById('category-filters');
  container.innerHTML = '';

  Object.keys(CLABELS).forEach((cat) => {
    const info = S.stats.categories[cat];
    if (!info) return; // category not present in this document

    const color = CCOLORS[cat];
    const label = CLABELS[cat];
    const checked = S.activeCats.has(cat) ? 'checked' : '';

    const row = document.createElement('label');
    row.className = 'filter-item';
    row.style.color = color;
    row.innerHTML = `<input type="checkbox" data-cat="${cat}" ${checked}><span class="filter-check"></span><span class="filter-dot" style="background:${color}"></span><span class="filter-label" style="color:var(--text)">${label}</span><span class="filter-count">${info.count}</span>`;

    row.querySelector('input').addEventListener('change', (ev) => {
      if (ev.target.checked) {
        S.activeCats.add(cat);
      } else {
        S.activeCats.delete(cat);
      }
      renderDoc();
    });

    container.appendChild(row);
  });
}
// Rebuild the sidebar's speaker checkboxes from S.speakers
// (speaker name -> count). The whole group is hidden when there are none.
//
// Rows are built with DOM APIs, not an HTML string: speaker names come from
// the analysed document, so interpolating them into innerHTML would let a
// crafted document inject markup into the page.
function renderSpeakerFilters(){
  const speakers = S.speakers;
  const group = document.getElementById('speaker-group');
  const container = document.getElementById('speaker-filters');

  if (!speakers || !Object.keys(speakers).length) {
    group.style.display = 'none';
    return;
  }
  group.style.display = 'block';
  container.innerHTML = '';

  for (const [name, count] of Object.entries(speakers)) {
    const row = document.createElement('label');
    row.className = 'filter-item';

    const box = document.createElement('input');
    box.type = 'checkbox';
    box.dataset.speaker = name;
    box.checked = S.activeSpeakers.has(name);
    box.addEventListener('change', (ev) => {
      if (ev.target.checked) S.activeSpeakers.add(name);
      else S.activeSpeakers.delete(name);
      renderDoc();
    });

    const check = document.createElement('span');
    check.className = 'filter-check';
    check.style.color = 'var(--primary)';

    const label = document.createElement('span');
    label.className = 'filter-label';
    label.textContent = name;

    const tally = document.createElement('span');
    tally.className = 'filter-count';
    tally.textContent = count;

    row.appendChild(box);
    row.appendChild(check);
    row.appendChild(label);
    row.appendChild(tally);
    container.appendChild(row);
  }
}
// Escape s for insertion into HTML, in text or quoted-attribute context.
//
// Replaces the ampersand, both angle brackets, and both quote characters.
// Quotes matter because the result is placed inside double-quoted attribute
// values, where an unescaped quote would end the attribute early.
// null and undefined become the empty string.
function esc(s){
  const str = s == null ? '' : String(s);
  return str
    .replace(/&/g, '&amp;') // must run first so later entities are not re-escaped
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
// Render S.text into #doc-content with each PII span wrapped in a
// <span class="pii pii-LABEL">. Spans whose label is not in S.activeCats
// also get the "dimmed" class. Hovering a span shows "<Label>: <text>" in
// the #tooltip element.
//
// Spans are processed in start order; a span that begins before the end of
// the previously rendered span is skipped.
//
// The output is built from DOM nodes, not an HTML string: span text and
// labels come from the analysed document and the server response, and
// string-built attributes (class, data-label, data-text) could be broken
// out of by a value containing a double quote.
//
// NOTE(review): S.activeSpeakers is not read here, so toggling a speaker
// checkbox redraws the document without changing it -- confirm intent.
function renderDoc(){
  const {text, spans} = S;
  const active = S.activeCats;
  const ordered = [...spans].sort((a, b) => a.start - b.start);
  const tip = document.getElementById('tooltip');
  const frag = document.createDocumentFragment();

  let pos = 0;
  for (const sp of ordered) {
    if (sp.start < pos) continue; // overlaps the previous span
    if (sp.start > pos) {
      frag.appendChild(document.createTextNode(text.substring(pos, sp.start)));
    }

    const el = document.createElement('span');
    el.className = 'pii pii-' + sp.label + (active.has(sp.label) ? '' : ' dimmed');
    el.dataset.label = sp.label;
    el.dataset.text = sp.text == null ? '' : sp.text;
    el.textContent = text.substring(sp.start, sp.end);

    el.addEventListener('mouseenter', (ev) => {
      tip.textContent = (CLABELS[el.dataset.label] || el.dataset.label) + ': ' + el.dataset.text;
      tip.style.display = 'block';
      moveTT(ev);
    });
    el.addEventListener('mousemove', moveTT);
    el.addEventListener('mouseleave', () => { tip.style.display = 'none'; });

    frag.appendChild(el);
    pos = sp.end;
  }
  if (pos < text.length) {
    frag.appendChild(document.createTextNode(text.substring(pos)));
  }

  const container = document.getElementById('doc-content');
  container.textContent = ''; // drop the previous render
  container.appendChild(frag);
}
// Position the tooltip relative to the pointer: 12px to the right of and
// 36px above the event's client coordinates.
function moveTT(ev){
  const tip = document.getElementById('tooltip');
  const {clientX, clientY} = ev;
  tip.style.left = `${clientX + 12}px`;
  tip.style.top = `${clientY - 36}px`;
}
 </script>
 </body>
 </html>"""
# ── launch ───────────────────────────────────────────────────────
if __name__ == "__main__":
    # Host and port handed to server.launch() when this file is run as a script.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    server.launch(**launch_options)