ysharma HF Staff committed on
Commit
087fdc4
·
verified ·
1 Parent(s): e01cf02

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +710 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,710 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PII Reveal - Document Privacy Explorer
3
+ =======================================
4
+ A Gradio Server app powered by OpenAI Privacy Filter (Apache 2.0)
5
+ for detecting and visualizing PII in PDF/DOC/DOCX documents.
6
+
7
+ - Backend: gr.Server (Gradio + FastAPI)
8
+ - Frontend: Custom HTML/CSS/JS
9
+ - Model: openai/openai-privacy-filter (1.5B params, 50M active, 128k context)
10
+ """
11
+
12
+ import os
13
+ import re
14
+ import json
15
+ import tempfile
16
+ from pathlib import Path
17
+
18
+ import torch
19
+ import gradio as gr
20
+ from fastapi import UploadFile, File
21
+ from fastapi.responses import HTMLResponse, JSONResponse
22
+
23
+ # ── Configuration ────────────────────────────────────────────────
24
+ MODEL_ID = os.getenv("MODEL_ID", "openai/openai-privacy-filter")
25
+
26
# Display metadata per PII category: an accent colour, a background tint that
# is the same RGB triple at 0.15 alpha, and a short human-readable label.
CATEGORIES = {
    key: {"color": color, "bg": f"rgba({rgb},0.15)", "label": label}
    for key, color, rgb, label in (
        ("private_person", "#ef4444", "239,68,68", "Person"),
        ("private_address", "#06b6d4", "6,182,212", "Address"),
        ("private_email", "#3b82f6", "59,130,246", "Email"),
        ("private_phone", "#22c55e", "34,197,94", "Phone"),
        ("private_url", "#eab308", "234,179,8", "URL"),
        ("private_date", "#a855f7", "168,85,247", "Date"),
        ("account_number", "#f97316", "249,115,22", "Account"),
        ("secret", "#dc2626", "220,38,38", "Secret"),
    )
}
36
+
37
+ # ── Model Loading ────────────────────────────────────────────────
38
print(f"[PII Reveal] Loading model: {MODEL_ID}")
# Use CUDA when torch reports it as available, otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import AutoTokenizer, AutoModelForTokenClassification  # noqa: E402

# NOTE(review): trust_remote_code=True is combined with a MODEL_ID that can be
# overridden through the environment - confirm that only trusted repositories
# are ever configured.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    # float16 on CUDA, float32 on CPU.
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
model.eval()
model.to(device)

# Class-index -> tag-string table, read from the model config.
id2label = model.config.id2label
print(f"[PII Reveal] Model loaded on {device} | Labels: {len(id2label)}")
51
+
52
+
53
+ # ── Text Extraction ──────────────────────────────────────────────
54
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or Word document.

    Args:
        file_path: Path to the document; the (case-insensitive) file suffix
            selects the parser.

    Returns:
        For PDFs, the text of every page joined by blank lines. For Word
        files, the text of every non-blank paragraph joined by blank lines.

    Raises:
        ValueError: If the suffix is not ``.pdf``, ``.docx`` or ``.doc``, or
            if a ``.doc`` file cannot be parsed by python-docx.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == ".pdf":
        import fitz

        # The context manager closes the document even if a page fails to
        # render its text.
        with fitz.open(file_path) as pdf:
            return "\n\n".join(page.get_text() for page in pdf)

    if suffix in (".docx", ".doc"):
        from docx import Document

        try:
            document = Document(file_path)
        except Exception as exc:
            if suffix == ".doc":
                # python-docx reads the .docx package format only; surface a
                # clear message instead of a low-level packaging error.
                raise ValueError(
                    "Could not read the .doc file: only the .docx format can be "
                    "parsed. Please convert the document to .docx and retry."
                ) from exc
            raise
        return "\n\n".join(p.text for p in document.paragraphs if p.text.strip())

    raise ValueError(f"Unsupported file type: {suffix}")
67
+
68
+
69
+ # ── PII Detection ────────────────────────────────────────────────
70
def detect_pii(text: str) -> list[dict]:
    """Run Privacy Filter on text, return list of {label, start, end, text}.

    The per-token predictions are decoded as B/I/E/S tags of the form
    ``<tag>-<category>``:

    * ``S`` emits a single-token span.
    * ``B`` opens a new span.
    * ``I`` / ``E`` extend the open span when the category matches; ``E``
      also closes it.
    * Anything else (``O``, a label without ``-``, a token with an empty
      character range, or an ``I``/``E`` that does not continue the open
      span) closes the open span, and that token is not added to any span.

    Args:
        text: Document text. Input is tokenized with ``truncation=True`` and
            ``max_length=128000``, so tokens beyond that limit are not
            scanned.

    Returns:
        Spans in document order. ``start``/``end`` are character offsets into
        ``text`` and ``text`` is the corresponding slice. An empty input
        yields an empty list without invoking the model.
    """
    if not text:
        # Nothing to classify; avoid feeding an empty sequence to the model.
        return []

    encodings = tokenizer(
        text,
        return_tensors="pt",
        return_offsets_mapping=True,
        truncation=True,
        max_length=128000,
    )
    # The offsets are only needed for decoding; they are removed before the
    # remaining tensors are passed to the model.
    offset_mapping = encodings.pop("offset_mapping")[0].tolist()
    inputs = {k: v.to(device) for k, v in encodings.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    preds = torch.argmax(logits, dim=-1)[0].tolist()

    spans = []
    current = None

    def close_current():
        """Finalize the open span (if any) and append it to ``spans``."""
        nonlocal current
        if current is not None:
            current["text"] = text[current["start"]:current["end"]]
            spans.append(current)
            current = None

    for pred_id, (char_start, char_end) in zip(preds, offset_mapping):
        label = id2label.get(pred_id, "O")

        if char_start == char_end or label == "O" or "-" not in label:
            close_current()
            continue

        tag, category = label.split("-", 1)
        continues_open_span = current is not None and current["label"] == category

        if tag == "S":
            close_current()
            spans.append({"label": category, "start": char_start, "end": char_end,
                          "text": text[char_start:char_end]})
        elif tag == "B":
            close_current()
            current = {"label": category, "start": char_start, "end": char_end,
                       "text": text[char_start:char_end]}
        elif tag in ("I", "E") and continues_open_span:
            current["end"] = char_end
            if tag == "E":
                close_current()
        else:
            # I/E that does not continue the open span, or an unknown tag.
            close_current()

    close_current()
    return spans
127
+
128
+
129
+ # ── Statistics ───────────────────────────────────────────────────
130
def compute_stats(text: str, spans: list[dict]) -> dict:
    """Summarize how much of ``text`` is covered by the given PII spans.

    Args:
        text: The analyzed document text.
        spans: Span dicts carrying at least ``label``, ``start`` and ``end``.

    Returns:
        A dict with the document length, the summed span length, that sum as
        a percentage of the document (one decimal place, ``0`` for empty
        text), the span count, per-category ``count``/``chars`` totals, and
        the number of distinct categories.
    """
    doc_length = len(text)
    covered = 0
    per_category: dict[str, dict] = {}

    for span in spans:
        width = span["end"] - span["start"]
        covered += width
        bucket = per_category.get(span["label"])
        if bucket is None:
            bucket = {"count": 0, "chars": 0}
            per_category[span["label"]] = bucket
        bucket["count"] += 1
        bucket["chars"] += width

    if doc_length:
        share = round(covered / doc_length * 100, 1)
    else:
        share = 0

    return {
        "total_chars": doc_length,
        "pii_chars": covered,
        "pii_percentage": share,
        "total_spans": len(spans),
        "categories": per_category,
        "num_categories": len(per_category),
    }
147
+
148
+
149
def detect_speakers(text: str, spans: list[dict]) -> dict:
    """Simple speaker detection for transcripts.

    A line opens a new speaker turn when it starts with ``Name: ``,
    ``[Name] `` or ``Speaker N: ``; following lines stay attributed to that
    speaker until another such line appears. Each span is attributed to the
    speaker of the line containing the span's midpoint, or to ``"Document"``
    when that line has no speaker.

    Args:
        text: The analyzed document text.
        spans: Span dicts carrying ``start`` and ``end`` offsets into ``text``.

    Returns:
        Mapping of speaker name to number of spans. An empty dict is returned
        when no span could be attributed to a named speaker.
    """
    # Function-scope import, matching how this file imports fitz and docx.
    from bisect import bisect_right

    patterns = [
        re.compile(r"^([A-Z][a-zA-Z ]{1,30}):\s"),
        re.compile(r"^\[([^\]]{1,30})\]\s"),
        re.compile(r"^(Speaker\s*\d+):\s"),
    ]

    # Parallel arrays describing every line: [start, end] offsets (the end
    # offset is the position of the line's newline) and the active speaker.
    line_starts, line_ends, line_speakers = [], [], []
    pos, cur_speaker = 0, None
    for line in text.split("\n"):
        for pat in patterns:
            m = pat.match(line)
            if m:
                cur_speaker = m.group(1).strip()
                break
        line_starts.append(pos)
        line_ends.append(pos + len(line))
        line_speakers.append(cur_speaker)
        pos += len(line) + 1

    result: dict[str, int] = {}
    for span in spans:
        mid = (span["start"] + span["end"]) // 2
        # Line ranges are disjoint and sorted, so the only candidate is the
        # last line starting at or before the midpoint.
        idx = bisect_right(line_starts, mid) - 1
        speaker = "Document"
        if idx >= 0 and mid <= line_ends[idx] and line_speakers[idx]:
            speaker = line_speakers[idx]
        result[speaker] = result.get(speaker, 0) + 1

    if len(result) <= 1 and "Document" in result:
        return {}
    return result
180
+
181
+
182
+ # ── Gradio Server ────────────────────────────────────────────────
183
+ app = gr.Server()
184
+
185
+
186
@app.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve the frontend page held in ``FRONTEND_HTML``."""
    page = FRONTEND_HTML
    return page
189
+
190
+
191
@app.post("/api/analyze")
async def analyze_document(file: UploadFile = File(...)):
    """Analyze an uploaded PDF/DOC/DOCX file for PII.

    The upload is written to a temporary file, its text is extracted and
    scanned, and the temporary file is always removed afterwards.

    Args:
        file: The uploaded document; its filename suffix selects the parser.

    Returns:
        A JSON response. On success it carries ``filename``, ``text``,
        ``spans``, ``stats``, ``speakers`` and ``categories_meta``. On failure
        it carries ``{"error": ...}`` with status 400 (unsupported suffix or
        no extractable text) or 500 (any other exception).
    """
    # The filename may be absent on a malformed upload; treat it as having no
    # suffix so the request is rejected with a 400 instead of crashing.
    suffix = Path(file.filename or "").suffix.lower()
    if suffix not in (".pdf", ".doc", ".docx"):
        return JSONResponse({"error": f"Unsupported: {suffix}. Use PDF, DOC, or DOCX."}, 400)

    tmp_path = None
    try:
        # The temp file is created inside the try block so that a failed read
        # or write still reaches the cleanup in ``finally``.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        # NOTE(review): extraction and inference run synchronously inside this
        # coroutine - confirm whether blocking the event loop for the duration
        # of the analysis is acceptable for this deployment.
        text = extract_text(tmp_path)
        if not text.strip():
            return JSONResponse({"error": "No text content found."}, 400)
        spans = detect_pii(text)
        stats = compute_stats(text, spans)
        speakers = detect_speakers(text, spans)
        return JSONResponse({
            "filename": file.filename,
            "text": text,
            "spans": spans,
            "stats": stats,
            "speakers": speakers,
            "categories_meta": {k: {"color": v["color"], "bg": v["bg"], "label": v["label"]}
                                for k, v in CATEGORIES.items()},
        })
    except Exception as e:
        # Top-level request boundary: report the failure to the client as JSON.
        return JSONResponse({"error": str(e)}, 500)
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
222
+
223
+
224
@app.api(name="analyze_text")
def analyze_text_api(text: str) -> str:
    """Gradio API: analyze raw text for PII (for programmatic access).

    Returns:
        A JSON string with the detected ``spans`` and the ``stats`` computed
        from them; non-ASCII characters are emitted unescaped.
    """
    found = detect_pii(text)
    payload = {"spans": found, "stats": compute_stats(text, found)}
    return json.dumps(payload, ensure_ascii=False)
230
+
231
+
232
+ # ── Frontend ─────────────────────────────────────────────────────
233
+ FRONTEND_HTML = r"""<!DOCTYPE html>
234
+ <html lang="en">
235
+ <head>
236
+ <meta charset="UTF-8">
237
+ <meta name="viewport" content="width=device-width,initial-scale=1">
238
+ <title>PII Reveal - Document Privacy Explorer</title>
239
+ <link rel="preconnect" href="https://fonts.googleapis.com">
240
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap" rel="stylesheet">
241
+ <style>
242
+ *,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
243
+ :root{
244
+ --bg:#f0f2f5;--surface:#fff;--surface2:#f8f9fb;--border:#e2e5ea;
245
+ --text:#1a1d23;--text2:#6b7280;--text3:#9ca3af;
246
+ --primary:#6366f1;--primary-light:#e0e7ff;
247
+ --radius:12px;--radius-sm:8px;--shadow:0 1px 3px rgba(0,0,0,.08);
248
+ --shadow-lg:0 8px 32px rgba(0,0,0,.12);
249
+ --person:#ef4444;--address:#06b6d4;--email:#3b82f6;--phone:#22c55e;
250
+ --url:#eab308;--date:#a855f7;--account:#f97316;--secret:#dc2626;
251
+ }
252
+ body{font-family:'Inter',system-ui,sans-serif;background:var(--bg);color:var(--text);min-height:100vh;line-height:1.6}
253
+ a{color:var(--primary)}
254
+
255
+ /* ─ Upload View ─ */
256
+ #upload-view{display:flex;flex-direction:column;align-items:center;justify-content:center;min-height:100vh;padding:2rem}
257
+ .upload-card{background:var(--surface);border-radius:20px;padding:3rem;max-width:640px;width:100%;text-align:center;box-shadow:var(--shadow-lg);position:relative;overflow:hidden}
258
+ .upload-card::before{content:'';position:absolute;inset:-2px;border-radius:22px;background:linear-gradient(135deg,var(--primary),#ec4899,var(--primary));z-index:-1;opacity:0;transition:opacity .3s}
259
+ .upload-card:hover::before{opacity:1}
260
+ .upload-card::after{content:'';position:absolute;inset:0;border-radius:20px;background:var(--surface);z-index:-1}
261
+ .brand{display:flex;align-items:center;justify-content:center;gap:.75rem;margin-bottom:.5rem}
262
+ .brand h1{font-size:2rem;font-weight:800;background:linear-gradient(135deg,var(--primary),#ec4899);-webkit-background-clip:text;-webkit-text-fill-color:transparent}
263
+ .brand-icon{width:42px;height:42px;background:linear-gradient(135deg,var(--primary),#ec4899);border-radius:10px;display:flex;align-items:center;justify-content:center;color:#fff;font-size:1.4rem}
264
+ .subtitle{color:var(--text2);margin-bottom:2rem;font-size:1.05rem}
265
+ .dropzone{border:2px dashed var(--border);border-radius:var(--radius);padding:3rem 2rem;cursor:pointer;transition:all .2s;position:relative}
266
+ .dropzone:hover,.dropzone.dragover{border-color:var(--primary);background:var(--primary-light)}
267
+ .dropzone-icon{font-size:3rem;margin-bottom:1rem}
268
+ .dropzone-text{font-weight:600;font-size:1.1rem;margin-bottom:.25rem}
269
+ .dropzone-hint{color:var(--text3);font-size:.875rem}
270
+ .dropzone input{position:absolute;inset:0;opacity:0;cursor:pointer}
271
+ .features{display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;margin-top:2rem;text-align:left}
272
+ .feature{background:var(--surface2);padding:1rem;border-radius:var(--radius-sm)}
273
+ .feature-title{font-weight:600;font-size:.8rem;margin-bottom:.25rem}
274
+ .feature-desc{color:var(--text2);font-size:.75rem;line-height:1.4}
275
+ .powered-by{margin-top:1.5rem;font-size:.8rem;color:var(--text3)}
276
+
277
+ /* ─ Results View ─ */
278
+ #results-view{display:none;min-height:100vh}
279
+ .top-bar{background:var(--surface);border-bottom:1px solid var(--border);padding:.75rem 1.5rem;display:flex;align-items:center;gap:1rem;position:sticky;top:0;z-index:100;box-shadow:var(--shadow)}
280
+ .top-bar .brand{margin:0}
281
+ .top-bar .brand h1{font-size:1.25rem}
282
+ .top-bar .brand-icon{width:32px;height:32px;font-size:1rem}
283
+ .file-info{font-size:.85rem;color:var(--text2);margin-left:.5rem;flex:1}
284
+ .btn{padding:.5rem 1rem;border-radius:var(--radius-sm);border:none;cursor:pointer;font-weight:600;font-size:.85rem;transition:all .15s}
285
+ .btn-primary{background:var(--primary);color:#fff}
286
+ .btn-primary:hover{background:#4f46e5}
287
+ .btn-ghost{background:transparent;color:var(--text2);border:1px solid var(--border)}
288
+ .btn-ghost:hover{background:var(--surface2)}
289
+
290
+ /* ─ Summary Strip ─ */
291
+ .summary-strip{background:var(--surface);border-bottom:1px solid var(--border);padding:1rem 1.5rem;display:flex;align-items:center;gap:1.5rem;flex-wrap:wrap}
292
+ .stat-big{text-align:center;min-width:80px}
293
+ .stat-big .num{font-size:1.75rem;font-weight:800;color:var(--primary)}
294
+ .stat-big .lbl{font-size:.7rem;color:var(--text3);text-transform:uppercase;letter-spacing:.5px}
295
+ .stat-divider{width:1px;height:40px;background:var(--border)}
296
+ .stat-bar{flex:1;min-width:200px}
297
+ .stat-bar-track{height:8px;background:var(--surface2);border-radius:4px;overflow:hidden;display:flex;margin-bottom:.5rem}
298
+ .stat-bar-fill{height:100%;transition:width .6s ease}
299
+ .category-chips{display:flex;flex-wrap:wrap;gap:.35rem}
300
+ .chip{display:inline-flex;align-items:center;gap:.35rem;padding:.2rem .6rem;border-radius:20px;font-size:.75rem;font-weight:600;border:1.5px solid}
301
+
302
+ /* ─ Main Layout ─ */
303
+ .main-layout{display:flex;height:calc(100vh - 130px)}
304
+ .doc-panel{flex:1;overflow-y:auto;padding:2rem;background:var(--bg)}
305
+ .doc-content{background:var(--surface);border-radius:var(--radius);padding:2rem 2.5rem;max-width:900px;margin:0 auto;box-shadow:var(--shadow);font-size:.95rem;line-height:1.8;white-space:pre-wrap;word-wrap:break-word}
306
+
307
+ /* ─ PII Highlights ─ */
308
+ .pii{border-radius:3px;padding:1px 2px;cursor:pointer;transition:all .15s;position:relative;border-bottom:2px solid}
309
+ .pii:hover{filter:brightness(.92)}
310
+ .pii.dimmed{opacity:.15;border-bottom-color:transparent!important}
311
+ .pii-private_person{background:rgba(239,68,68,.15);border-bottom-color:var(--person);color:#991b1b}
312
+ .pii-private_address{background:rgba(6,182,212,.15);border-bottom-color:var(--address);color:#155e75}
313
+ .pii-private_email{background:rgba(59,130,246,.15);border-bottom-color:var(--email);color:#1e40af}
314
+ .pii-private_phone{background:rgba(34,197,94,.15);border-bottom-color:var(--phone);color:#166534}
315
+ .pii-private_url{background:rgba(234,179,8,.15);border-bottom-color:var(--url);color:#854d0e}
316
+ .pii-private_date{background:rgba(168,85,247,.15);border-bottom-color:var(--date);color:#6b21a8}
317
+ .pii-account_number{background:rgba(249,115,22,.15);border-bottom-color:var(--account);color:#9a3412}
318
+ .pii-secret{background:rgba(220,38,38,.15);border-bottom-color:var(--secret);color:#991b1b}
319
+
320
+ /* ─ Tooltip ─ */
321
+ .pii-tooltip{position:fixed;background:#1e293b;color:#fff;padding:.4rem .7rem;border-radius:6px;font-size:.75rem;font-weight:500;pointer-events:none;z-index:999;white-space:nowrap;box-shadow:0 4px 12px rgba(0,0,0,.2)}
322
+
323
+ /* ─ Sidebar ─ */
324
+ .sidebar{width:300px;background:var(--surface);border-left:1px solid var(--border);overflow-y:auto;padding:1.25rem;flex-shrink:0}
325
+ .sidebar h3{font-size:.7rem;text-transform:uppercase;letter-spacing:.8px;color:var(--text3);margin-bottom:.75rem;font-weight:700}
326
+ .filter-group{margin-bottom:1.5rem}
327
+ .filter-item{display:flex;align-items:center;gap:.6rem;padding:.45rem .5rem;border-radius:var(--radius-sm);cursor:pointer;transition:background .15s;user-select:none}
328
+ .filter-item:hover{background:var(--surface2)}
329
+ .filter-item input{display:none}
330
+ .filter-check{width:18px;height:18px;border-radius:5px;border:2px solid var(--border);display:flex;align-items:center;justify-content:center;transition:all .15s;flex-shrink:0}
331
+ .filter-item input:checked~.filter-check{border-color:currentColor;background:currentColor}
332
+ .filter-item input:checked~.filter-check::after{content:'';display:block;width:5px;height:9px;border:solid #fff;border-width:0 2px 2px 0;transform:rotate(45deg) translateY(-1px)}
333
+ .filter-dot{width:10px;height:10px;border-radius:50%;flex-shrink:0}
334
+ .filter-label{flex:1;font-size:.85rem;font-weight:500}
335
+ .filter-count{font-size:.75rem;color:var(--text3);font-weight:600;background:var(--surface2);padding:.1rem .45rem;border-radius:10px}
336
+
337
+ /* ─ Loading ─ */
338
+ #loading{position:fixed;inset:0;background:rgba(255,255,255,.85);backdrop-filter:blur(8px);display:none;flex-direction:column;align-items:center;justify-content:center;z-index:9999}
339
+ .spinner{width:48px;height:48px;border:4px solid var(--border);border-top-color:var(--primary);border-radius:50%;animation:spin .8s linear infinite}
340
+ @keyframes spin{to{transform:rotate(360deg)}}
341
+ #loading p{margin-top:1rem;font-weight:600;color:var(--text2)}
342
+ .progress-text{font-size:.85rem;color:var(--text3);margin-top:.5rem}
343
+
344
+ /* ─ Error ─ */
345
+ .error-banner{background:#fef2f2;border:1px solid #fecaca;color:#991b1b;padding:1rem 1.5rem;border-radius:var(--radius-sm);margin:1rem;font-size:.9rem;display:none;align-items:center;gap:.5rem}
346
+
347
+ /* ─ Responsive ─ */
348
+ @media(max-width:768px){
349
+ .main-layout{flex-direction:column-reverse;height:auto}
350
+ .sidebar{width:100%;border-left:none;border-top:1px solid var(--border)}
351
+ .features{grid-template-columns:1fr}
352
+ .summary-strip{flex-direction:column;align-items:stretch}
353
+ .stat-divider{width:100%;height:1px}
354
+ }
355
+ </style>
356
+ </head>
357
+ <body>
358
+
359
+ <!-- ─── Upload View ─── -->
360
+ <div id="upload-view">
361
+ <div class="upload-card">
362
+ <div class="brand">
363
+ <div class="brand-icon">&#x1f50d;</div>
364
+ <h1>PII Reveal</h1>
365
+ </div>
366
+ <p class="subtitle">Document Privacy Explorer</p>
367
+ <div class="dropzone" id="dropzone">
368
+ <div class="dropzone-icon">&#x1f4c4;</div>
369
+ <div class="dropzone-text">Drop your document here</div>
370
+ <div class="dropzone-hint">PDF, DOC, or DOCX &middot; Up to 128k tokens</div>
371
+ <input type="file" id="file-input" accept=".pdf,.doc,.docx">
372
+ </div>
373
+ <div class="features">
374
+ <div class="feature">
375
+ <div class="feature-title">8 PII Categories</div>
376
+ <div class="feature-desc">Names, addresses, emails, phones, URLs, dates, accounts, secrets</div>
377
+ </div>
378
+ <div class="feature">
379
+ <div class="feature-title">128k Context</div>
380
+ <div class="feature-desc">Full documents in one pass &mdash; no chunking artifacts</div>
381
+ </div>
382
+ <div class="feature">
383
+ <div class="feature-title">Context-Aware</div>
384
+ <div class="feature-desc">Understands when "May" is a name vs. a month</div>
385
+ </div>
386
+ </div>
387
+ <div class="powered-by">Powered by <strong>OpenAI Privacy Filter</strong> &middot; Apache 2.0</div>
388
+ </div>
389
+ </div>
390
+
391
+ <!-- ─── Results View ─── -->
392
+ <div id="results-view">
393
+ <div class="top-bar">
394
+ <div class="brand">
395
+ <div class="brand-icon">&#x1f50d;</div>
396
+ <h1>PII Reveal</h1>
397
+ </div>
398
+ <div class="file-info" id="file-info"></div>
399
+ <button class="btn btn-ghost" onclick="resetView()">New File</button>
400
+ </div>
401
+
402
+ <div class="error-banner" id="error-banner"></div>
403
+
404
+ <div class="summary-strip" id="summary-strip">
405
+ <div class="stat-big"><div class="num" id="stat-pct">0%</div><div class="lbl">PII Content</div></div>
406
+ <div class="stat-divider"></div>
407
+ <div class="stat-big"><div class="num" id="stat-spans">0</div><div class="lbl">PII Spans</div></div>
408
+ <div class="stat-divider"></div>
409
+ <div class="stat-big"><div class="num" id="stat-cats">0</div><div class="lbl">Categories</div></div>
410
+ <div class="stat-divider"></div>
411
+ <div class="stat-bar">
412
+ <div class="stat-bar-track" id="stat-bar-track"></div>
413
+ <div class="category-chips" id="category-chips"></div>
414
+ </div>
415
+ </div>
416
+
417
+ <div class="main-layout">
418
+ <div class="doc-panel">
419
+ <div class="doc-content" id="doc-content"></div>
420
+ </div>
421
+ <div class="sidebar">
422
+ <div class="filter-group">
423
+ <h3>PII Categories</h3>
424
+ <div id="category-filters"></div>
425
+ </div>
426
+ <div class="filter-group" id="speaker-group" style="display:none">
427
+ <h3>Speakers</h3>
428
+ <div id="speaker-filters"></div>
429
+ </div>
430
+ </div>
431
+ </div>
432
+ </div>
433
+
434
+ <!-- ─── Loading Overlay ─── -->
435
+ <div id="loading">
436
+ <div class="spinner"></div>
437
+ <p>Analyzing document for PII&hellip;</p>
438
+ <div class="progress-text">Running OpenAI Privacy Filter (128k context)</div>
439
+ </div>
440
+
441
+ <!-- ─── Tooltip ─── -->
442
+ <div class="pii-tooltip" id="tooltip" style="display:none"></div>
443
+
444
+ <script>
445
+ // ── State ──
446
+ let STATE = { text: '', spans: [], stats: {}, speakers: {}, activeCategories: new Set(), activeSpeakers: new Set(), categoriesMeta: {} };
447
+
448
+ const CATEGORY_LABELS = {
449
+ private_person: 'Person', private_address: 'Address', private_email: 'Email',
450
+ private_phone: 'Phone', private_url: 'URL', private_date: 'Date',
451
+ account_number: 'Account', secret: 'Secret'
452
+ };
453
+ const CATEGORY_COLORS = {
454
+ private_person:'#ef4444', private_address:'#06b6d4', private_email:'#3b82f6',
455
+ private_phone:'#22c55e', private_url:'#eab308', private_date:'#a855f7',
456
+ account_number:'#f97316', secret:'#dc2626'
457
+ };
458
+
459
+ // ── Upload Handling ──
460
+ const dropzone = document.getElementById('dropzone');
461
+ const fileInput = document.getElementById('file-input');
462
+
463
+ ['dragenter','dragover'].forEach(e => dropzone.addEventListener(e, ev => { ev.preventDefault(); dropzone.classList.add('dragover'); }));
464
+ ['dragleave','drop'].forEach(e => dropzone.addEventListener(e, ev => { ev.preventDefault(); dropzone.classList.remove('dragover'); }));
465
+
466
+ dropzone.addEventListener('drop', ev => {
467
+ const file = ev.dataTransfer.files[0];
468
+ if (file) uploadFile(file);
469
+ });
470
+ fileInput.addEventListener('change', ev => {
471
+ const file = ev.target.files[0];
472
+ if (file) uploadFile(file);
473
+ });
474
+
475
// Validate the chosen file, POST it to /api/analyze, store the response in
// STATE and render it. Any failure is reported through showError().
async function uploadFile(file) {
  const uploadView = document.getElementById('upload-view');
  const loading = document.getElementById('loading');

  const ext = file.name.split('.').pop().toLowerCase();
  if (!['pdf','doc','docx'].includes(ext)) {
    // showError() reveals the results view (which hosts the banner), so hide
    // the upload view first to avoid showing both at once.
    uploadView.style.display = 'none';
    showError('Unsupported file type. Please use PDF, DOC, or DOCX.');
    return;
  }

  loading.style.display = 'flex';
  uploadView.style.display = 'none';

  const form = new FormData();
  form.append('file', file);

  try {
    const resp = await fetch('/api/analyze', { method: 'POST', body: form });

    let data;
    try {
      data = await resp.json();
    } catch (parseErr) {
      // The body was not JSON; report the status instead of a parser message.
      throw new Error('server returned HTTP ' + resp.status);
    }

    if (data.error || !resp.ok) {
      showError(data.error || ('Analysis failed: server returned HTTP ' + resp.status));
      return;
    }

    STATE.text = data.text;
    STATE.spans = data.spans;
    STATE.stats = data.stats;
    STATE.speakers = data.speakers || {};
    STATE.categoriesMeta = data.categories_meta || {};
    STATE.activeCategories = new Set(Object.keys(data.stats.categories));
    // Derive from the defaulted value so a missing field cannot throw.
    STATE.activeSpeakers = new Set(Object.keys(STATE.speakers));

    renderResults(data.filename);
  } catch (err) {
    showError('Analysis failed: ' + err.message);
  } finally {
    loading.style.display = 'none';
  }
}
512
+
513
// Hide the loading overlay, reveal the results view and display `msg` in the
// error banner.
function showError(msg) {
  const byId = id => document.getElementById(id);
  byId('loading').style.display = 'none';
  byId('results-view').style.display = 'block';

  const errorBanner = byId('error-banner');
  errorBanner.textContent = msg;
  errorBanner.style.display = 'flex';
}
520
+
521
// Return to the upload screen: hide results and error banner, show the upload
// view and clear the file input.
function resetView() {
  const displayById = {
    'results-view': 'none',
    'upload-view': 'flex',
    'error-banner': 'none',
  };
  for (const [id, display] of Object.entries(displayById)) {
    document.getElementById(id).style.display = display;
  }
  fileInput.value = '';
}
527
+
528
+ // ── Render Results ──
529
// Show the results view for `filename` and render every panel from STATE.
function renderResults(filename) {
  const byId = id => document.getElementById(id);

  byId('results-view').style.display = 'block';
  byId('error-banner').style.display = 'none';
  byId('file-info').textContent = filename;

  // Summary strip, then sidebar filters, then the highlighted document.
  const renderSteps = [renderSummary, renderCategoryFilters, renderSpeakerFilters, renderDocument];
  for (const step of renderSteps) {
    step();
  }
}
546
+
547
// Fill the summary strip from STATE.stats: headline numbers, one bar segment
// per category (sized by its share of the whole document) and one chip per
// category showing its span count.
function renderSummary() {
  const s = STATE.stats;
  document.getElementById('stat-pct').textContent = s.pii_percentage + '%';
  document.getElementById('stat-spans').textContent = s.total_spans;
  document.getElementById('stat-cats').textContent = s.num_categories;

  const track = document.getElementById('stat-bar-track');
  const chips = document.getElementById('category-chips');
  track.innerHTML = '';
  chips.innerHTML = '';

  // Guard the divisor so an empty document cannot produce NaN widths.
  const docChars = s.total_chars || 1;

  for (const [cat, info] of Object.entries(s.categories)) {
    const color = CATEGORY_COLORS[cat] || '#888';

    const seg = document.createElement('div');
    seg.className = 'stat-bar-fill';
    seg.style.width = (info.chars / docChars * 100) + '%';
    seg.style.background = color;
    track.appendChild(seg);

    const chip = document.createElement('span');
    chip.className = 'chip';
    chip.style.color = color;
    chip.style.borderColor = color;
    // Two extra hex digits act as the alpha channel of the chip background.
    chip.style.background = color + '15';
    chip.textContent = (CATEGORY_LABELS[cat] || cat) + ' ' + info.count;
    chips.appendChild(chip);
  }
}
582
+
583
// Rebuild the sidebar category checkboxes. Only categories listed in
// CATEGORY_LABELS that also appear in STATE.stats.categories get a row;
// toggling a row updates STATE.activeCategories and re-renders the document.
function renderCategoryFilters() {
  const container = document.getElementById('category-filters');
  container.innerHTML = '';
  const found = STATE.stats.categories;

  Object.keys(CATEGORY_LABELS)
    .filter(cat => found[cat])
    .forEach(cat => {
      const info = found[cat];
      const color = CATEGORY_COLORS[cat];
      const label = CATEGORY_LABELS[cat];
      const checkedAttr = STATE.activeCategories.has(cat) ? 'checked' : '';

      const row = document.createElement('label');
      row.className = 'filter-item';
      row.style.color = color;
      row.innerHTML = `
<input type="checkbox" data-cat="${cat}" ${checkedAttr}>
<span class="filter-check"></span>
<span class="filter-dot" style="background:${color}"></span>
<span class="filter-label" style="color:var(--text)">${label}</span>
<span class="filter-count">${info.count}</span>
`;

      const onToggle = ev => {
        if (ev.target.checked) {
          STATE.activeCategories.add(cat);
        } else {
          STATE.activeCategories.delete(cat);
        }
        renderDocument();
      };
      row.querySelector('input').addEventListener('change', onToggle);
      container.appendChild(row);
    });
}
612
+
613
// Rebuild the sidebar speaker checkboxes from STATE.speakers (name -> span
// count). The whole group is hidden when there are no speakers.
function renderSpeakerFilters() {
  const speakers = STATE.speakers;
  const group = document.getElementById('speaker-group');
  const container = document.getElementById('speaker-filters');

  if (!speakers || Object.keys(speakers).length === 0) {
    group.style.display = 'none';
    return;
  }
  group.style.display = 'block';
  container.innerHTML = '';

  for (const [speaker, count] of Object.entries(speakers)) {
    const item = document.createElement('label');
    item.className = 'filter-item';

    // Speaker names are extracted from the uploaded document, so they are
    // untrusted: build the row with DOM APIs (textContent / dataset) rather
    // than interpolating them into an HTML string.
    const input = document.createElement('input');
    input.type = 'checkbox';
    input.dataset.speaker = speaker;
    input.checked = STATE.activeSpeakers.has(speaker);

    const check = document.createElement('span');
    check.className = 'filter-check';
    check.style.color = 'var(--primary)';

    const label = document.createElement('span');
    label.className = 'filter-label';
    label.textContent = speaker;

    const badge = document.createElement('span');
    badge.className = 'filter-count';
    badge.textContent = count;

    input.addEventListener('change', ev => {
      if (ev.target.checked) STATE.activeSpeakers.add(speaker);
      else STATE.activeSpeakers.delete(speaker);
      renderDocument();
    });

    // Order matters: the stylesheet targets .filter-check as a following
    // sibling of the checked input.
    item.append(input, check, label, badge);
    container.appendChild(item);
  }
}
642
+
643
+ // ── Document Rendering ──
644
// Escape `str` for safe inclusion in HTML text AND in quoted attribute values.
// Quotes are escaped as well because callers place the result inside
// double-quoted attributes. null/undefined become the empty string.
function escapeHtml(str) {
  const ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const value = str == null ? '' : String(str);
  return value.replace(/[&<>"']/g, ch => ENTITIES[ch]);
}
649
+
650
// Render STATE.text into #doc-content with one highlighted <span> per PII
// span. Spans whose category is not in STATE.activeCategories are dimmed;
// spans that overlap an earlier span are skipped.
// NOTE(review): STATE.activeSpeakers is not consulted here, so the speaker
// checkboxes currently have no visible effect - confirm intended behavior.
function renderDocument() {
  const { text, spans } = STATE;
  const active = STATE.activeCategories;

  // Span offsets are produced by the Python backend and count Unicode code
  // points, while JS string indices count UTF-16 code units. Slice through a
  // code-point array so highlights stay aligned after characters outside the
  // Basic Multilingual Plane (e.g. emoji).
  const codePoints = Array.from(text);
  const slice = (from, to) => codePoints.slice(from, to).join('');

  // Sort spans by start position
  const sorted = [...spans].sort((a, b) => a.start - b.start);

  // Build DOM nodes instead of an HTML string so document text can never be
  // interpreted as markup or break out of an attribute.
  const fragment = document.createDocumentFragment();
  let pos = 0;

  for (const span of sorted) {
    if (span.start < pos) continue; // skip overlapping

    // Text before span
    if (span.start > pos) {
      fragment.appendChild(document.createTextNode(slice(pos, span.start)));
    }

    const el = document.createElement('span');
    el.className = active.has(span.label)
      ? `pii pii-${span.label}`
      : `pii pii-${span.label} dimmed`;
    el.dataset.label = span.label;
    el.dataset.text = span.text;
    el.textContent = slice(span.start, span.end);
    fragment.appendChild(el);
    pos = span.end;
  }

  // Remaining text
  if (pos < codePoints.length) {
    fragment.appendChild(document.createTextNode(slice(pos)));
  }

  const container = document.getElementById('doc-content');
  container.textContent = '';
  container.appendChild(fragment);
  attachTooltips();
}
683
+
684
+ // ── Tooltips ──
685
// Wire hover handlers on every .pii element: show "<label>: <text>" in the
// shared tooltip on enter, follow the pointer on move, hide on leave.
function attachTooltips() {
  const tooltip = document.getElementById('tooltip');

  const hide = () => { tooltip.style.display = 'none'; };

  for (const el of document.querySelectorAll('.pii')) {
    const show = ev => {
      const key = el.dataset.label;
      const label = CATEGORY_LABELS[key] || key;
      tooltip.textContent = label + ': ' + el.dataset.text;
      tooltip.style.display = 'block';
      positionTooltip(ev);
    };
    el.addEventListener('mouseenter', show);
    el.addEventListener('mousemove', positionTooltip);
    el.addEventListener('mouseleave', hide);
  }
}
698
+
699
// Place the tooltip 12px to the right of and 36px above the pointer.
function positionTooltip(ev) {
  const { style } = document.getElementById('tooltip');
  style.left = `${ev.clientX + 12}px`;
  style.top = `${ev.clientY - 36}px`;
}
704
+ </script>
705
+ </body>
706
+ </html>"""
707
+
708
+ # ── Launch ───────────────────────────────────────────────────────
709
if __name__ == "__main__":
    # When run as a script, serve on all interfaces at port 7860.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio>=5.0
2
+ transformers
3
+ torch
4
+ accelerate
5
+ PyMuPDF
6
+ python-docx
7
+ python-multipart