akhaliq HF Staff committed on
Commit
07d284c
·
1 Parent(s): fbdc786

feat: implement PII detection service with FastAPI, Gradio UI, and Transformer model

Browse files
Files changed (4) hide show
  1. app.py +84 -0
  2. index.html +451 -0
  3. requirements.txt +5 -0
  4. test.py +11 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from fastapi.responses import HTMLResponse
3
+ from gradio import Server
4
+ from transformers import pipeline
5
+ import spaces
6
+
7
# Server object that the API route and the homepage route below are registered on.
app = Server()

# The token-classification pipeline is built once, at import time, and shared
# by every request handled by this process.
print("Loading OpenAI Privacy Filter model...")
classifier = pipeline(task="token-classification", model="openai/privacy-filter")
print("Model loaded successfully.")
15
+
16
def _is_adjacent(prev_end, next_start, text):
    """Return True when only whitespace separates two token offsets.

    When either offset is missing the question cannot be answered, so the
    tokens are treated as adjacent and the tag sequence alone decides.
    """
    if prev_end is None or next_start is None:
        return True
    return not text[prev_end:next_start].strip()


def _finalize_span(span, text):
    """Convert an in-progress span accumulator into the public result dict."""
    start, end = span["start"], span["end"]
    if start is not None and end is not None:
        # Slice the original text so the reported word never contains
        # tokenizer artefacts such as "##" or "Ġ".
        word = text[start:end]
    else:
        word = "".join(span["pieces"])
    scores = span["scores"]
    return {
        "entity": span["entity"],
        "score": sum(scores) / len(scores),
        "start": start,
        "end": end,
        "word": word,
    }


def _merge_token_spans(results, text):
    """Merge per-token B/I/E/S predictions into entity spans.

    Args:
        results: iterable of dicts with "entity", "score", "start", "end"
            and "word" keys, one per predicted token.
        text: the string the offsets in ``results`` refer to.

    Returns:
        A list of dicts with "entity" (label without its tag prefix),
        "score" (mean of the merged token scores), "start", "end" and "word".
    """
    merged = []
    pending = None

    for res in results:
        label = res["entity"]
        if label == "O":
            if pending is not None:
                merged.append(_finalize_span(pending, text))
                pending = None
            continue

        base_label = label.split("-", 1)[-1] if "-" in label else label

        # NOTE(review): the pipeline is created without an aggregation
        # strategy and appears to drop "O" tokens before returning, so a gap
        # between two tokens is the only remaining evidence that unlabeled
        # text sits between them. Confirm against the pipeline's
        # `ignore_labels` default.
        continues = (
            label.startswith(("I-", "E-"))
            and pending is not None
            and pending["entity"] == base_label
            and _is_adjacent(pending["end"], res["start"], text)
        )

        if continues:
            pending["end"] = res["end"]
            pending["scores"].append(float(res["score"]))
            pending["pieces"].append(
                res["word"].replace("##", "").replace("Ġ", " ")
            )
        else:
            if pending is not None:
                merged.append(_finalize_span(pending, text))
            pending = {
                "entity": base_label,
                "scores": [float(res["score"])],
                "start": res["start"],
                "end": res["end"],
                "pieces": [res["word"]],
            }

        # E- (end) and S- (single) tags terminate the span they belong to, so
        # a following I-/E- token must not be glued onto it.
        if label.startswith(("E-", "S-")):
            merged.append(_finalize_span(pending, text))
            pending = None

    if pending is not None:
        merged.append(_finalize_span(pending, text))

    return merged


@app.api("/predict")
@spaces.GPU
def predict(text: str):
    """Detect PII in the given text and return aggregated spans.

    Args:
        text: the raw input to analyse.

    Returns:
        A list of span dicts with "entity", "score", "start", "end" and
        "word" keys; empty when ``text`` is empty or whitespace only.
    """
    if not text or not text.strip():
        return []
    return _merge_token_spans(classifier(text), text)
76
+
77
@app.get("/")
async def homepage():
    """Return the index.html that sits next to this module as an HTML response."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    html_path = os.path.join(module_dir, "index.html")
    with open(html_path, "r", encoding="utf-8") as page:
        markup = page.read()
    return HTMLResponse(content=markup)
82
+
83
# Start the server only when this file is executed directly, not on import.
if __name__ == "__main__":
    app.launch(show_error=True)
index.html ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>OpenAI Privacy Filter</title>
7
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
8
+ <style>
9
+ :root {
10
+ --bg-color: #0f172a;
11
+ --panel-bg: rgba(30, 41, 59, 0.7);
12
+ --border-color: rgba(255, 255, 255, 0.1);
13
+ --text-main: #f8fafc;
14
+ --text-muted: #94a3b8;
15
+ --accent: #3b82f6;
16
+ --accent-hover: #2563eb;
17
+ --glass-blur: blur(12px);
18
+
19
+ /* Entity Colors */
20
+ --entity-account_number: #ef4444;
21
+ --entity-private_address: #f59e0b;
22
+ --entity-private_email: #10b981;
23
+ --entity-private_person: #8b5cf6;
24
+ --entity-private_phone: #ec4899;
25
+ --entity-private_url: #06b6d4;
26
+ --entity-private_date: #eab308;
27
+ --entity-secret: #ef4444;
28
+ }
29
+
30
+ body {
31
+ margin: 0;
32
+ padding: 0;
33
+ font-family: 'Inter', sans-serif;
34
+ background: radial-gradient(circle at 50% -20%, #1e293b, var(--bg-color));
35
+ color: var(--text-main);
36
+ min-height: 100vh;
37
+ display: flex;
38
+ flex-direction: column;
39
+ align-items: center;
40
+ overflow-x: hidden;
41
+ }
42
+
43
+ .container {
44
+ width: 100%;
45
+ max-width: 1000px;
46
+ margin: 40px auto;
47
+ padding: 0 20px;
48
+ box-sizing: border-box;
49
+ display: flex;
50
+ flex-direction: column;
51
+ gap: 24px;
52
+ }
53
+
54
+ header {
55
+ text-align: center;
56
+ margin-bottom: 20px;
57
+ animation: fadeIn 0.8s ease-out;
58
+ }
59
+
60
+ h1 {
61
+ font-size: 2.5rem;
62
+ font-weight: 700;
63
+ margin-bottom: 10px;
64
+ background: linear-gradient(to right, #60a5fa, #a78bfa);
65
+ -webkit-background-clip: text;
66
+ -webkit-text-fill-color: transparent;
67
+ }
68
+
69
+ p.subtitle {
70
+ color: var(--text-muted);
71
+ font-size: 1.1rem;
72
+ max-width: 600px;
73
+ margin: 0 auto;
74
+ line-height: 1.6;
75
+ }
76
+
77
+ .glass-panel {
78
+ background: var(--panel-bg);
79
+ backdrop-filter: var(--glass-blur);
80
+ -webkit-backdrop-filter: var(--glass-blur);
81
+ border: 1px solid var(--border-color);
82
+ border-radius: 16px;
83
+ padding: 24px;
84
+ box-shadow: 0 4px 30px rgba(0, 0, 0, 0.1);
85
+ animation: slideUp 0.6s ease-out;
86
+ }
87
+
88
+ textarea {
89
+ width: 100%;
90
+ height: 150px;
91
+ background: rgba(15, 23, 42, 0.6);
92
+ border: 1px solid var(--border-color);
93
+ border-radius: 12px;
94
+ color: var(--text-main);
95
+ font-family: 'Inter', sans-serif;
96
+ font-size: 1rem;
97
+ padding: 16px;
98
+ box-sizing: border-box;
99
+ resize: vertical;
100
+ transition: border-color 0.3s;
101
+ line-height: 1.5;
102
+ }
103
+
104
+ textarea:focus {
105
+ outline: none;
106
+ border-color: var(--accent);
107
+ box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.2);
108
+ }
109
+
110
+ button {
111
+ background: var(--accent);
112
+ color: white;
113
+ border: none;
114
+ border-radius: 8px;
115
+ padding: 12px 24px;
116
+ font-size: 1rem;
117
+ font-weight: 600;
118
+ cursor: pointer;
119
+ transition: all 0.3s;
120
+ display: flex;
121
+ align-items: center;
122
+ justify-content: center;
123
+ gap: 8px;
124
+ width: fit-content;
125
+ }
126
+
127
+ button:hover {
128
+ background: var(--accent-hover);
129
+ transform: translateY(-2px);
130
+ box-shadow: 0 4px 12px rgba(59, 130, 246, 0.3);
131
+ }
132
+
133
+ button:active {
134
+ transform: translateY(0);
135
+ }
136
+
137
+ button:disabled {
138
+ background: #475569;
139
+ cursor: not-allowed;
140
+ transform: none;
141
+ box-shadow: none;
142
+ }
143
+
144
+ .loading-spinner {
145
+ width: 20px;
146
+ height: 20px;
147
+ border: 3px solid rgba(255,255,255,0.3);
148
+ border-radius: 50%;
149
+ border-top-color: white;
150
+ animation: spin 1s ease-in-out infinite;
151
+ display: none;
152
+ }
153
+
154
+ .results-area {
155
+ display: flex;
156
+ flex-direction: column;
157
+ gap: 20px;
158
+ }
159
+
160
+ .highlighted-text {
161
+ font-size: 1.1rem;
162
+ line-height: 1.8;
163
+ white-space: pre-wrap;
164
+ background: rgba(15, 23, 42, 0.4);
165
+ padding: 20px;
166
+ border-radius: 12px;
167
+ border: 1px solid rgba(255, 255, 255, 0.05);
168
+ min-height: 100px;
169
+ }
170
+
171
+ .entity {
172
+ display: inline-flex;
173
+ align-items: center;
174
+ border-radius: 4px;
175
+ padding: 2px 6px;
176
+ margin: 0 2px;
177
+ font-weight: 500;
178
+ position: relative;
179
+ cursor: help;
180
+ transition: all 0.2s;
181
+ }
182
+
183
+ .entity:hover {
184
+ transform: scale(1.05);
185
+ z-index: 10;
186
+ }
187
+
188
+ .entity-label {
189
+ font-size: 0.7rem;
190
+ text-transform: uppercase;
191
+ letter-spacing: 0.5px;
192
+ margin-left: 6px;
193
+ opacity: 0.8;
194
+ background: rgba(0,0,0,0.2);
195
+ padding: 2px 4px;
196
+ border-radius: 3px;
197
+ }
198
+
199
+ .summary-panel {
200
+ display: grid;
201
+ grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
202
+ gap: 12px;
203
+ }
204
+
205
+ .summary-card {
206
+ background: rgba(255, 255, 255, 0.03);
207
+ border: 1px solid rgba(255, 255, 255, 0.05);
208
+ border-radius: 8px;
209
+ padding: 12px;
210
+ display: flex;
211
+ flex-direction: column;
212
+ gap: 4px;
213
+ }
214
+
215
+ .summary-card .count {
216
+ font-size: 1.5rem;
217
+ font-weight: 700;
218
+ }
219
+
220
+ .summary-card .label {
221
+ font-size: 0.85rem;
222
+ color: var(--text-muted);
223
+ text-transform: uppercase;
224
+ letter-spacing: 0.5px;
225
+ }
226
+
227
+ @keyframes fadeIn {
228
+ from { opacity: 0; }
229
+ to { opacity: 1; }
230
+ }
231
+
232
+ @keyframes slideUp {
233
+ from { opacity: 0; transform: translateY(20px); }
234
+ to { opacity: 1; transform: translateY(0); }
235
+ }
236
+
237
+ @keyframes spin {
238
+ to { transform: rotate(360deg); }
239
+ }
240
+
241
+ .controls {
242
+ display: flex;
243
+ justify-content: space-between;
244
+ align-items: center;
245
+ margin-top: 16px;
246
+ }
247
+
248
+ .legend {
249
+ display: flex;
250
+ flex-wrap: wrap;
251
+ gap: 10px;
252
+ margin-top: 20px;
253
+ justify-content: center;
254
+ }
255
+
256
+ .legend-item {
257
+ display: flex;
258
+ align-items: center;
259
+ gap: 6px;
260
+ font-size: 0.85rem;
261
+ color: var(--text-muted);
262
+ }
263
+
264
+ .legend-color {
265
+ width: 12px;
266
+ height: 12px;
267
+ border-radius: 50%;
268
+ }
269
+
270
+ </style>
271
+ </head>
272
+ <body>
273
+ <div class="container">
274
+ <header>
275
+ <h1>OpenAI Privacy Filter</h1>
276
+ <p class="subtitle">Detect and mask personally identifiable information (PII) in text securely and efficiently using the bidirectional token-classification model.</p>
277
+ </header>
278
+
279
+ <div class="glass-panel">
280
+ <textarea id="inputText" placeholder="Paste your text here... For example: My name is Alice Smith and my email is alice@example.com."></textarea>
281
+ <div class="controls">
282
+ <button id="analyzeBtn">
283
+ <span class="btn-text">Analyze Text</span>
284
+ <div class="loading-spinner" id="spinner"></div>
285
+ </button>
286
+ </div>
287
+ </div>
288
+
289
+ <div class="glass-panel results-area" style="display: none;" id="resultsPanel">
290
+ <h3>Detection Results</h3>
291
+ <div class="summary-panel" id="summaryPanel"></div>
292
+ <div class="highlighted-text" id="outputText"></div>
293
+ </div>
294
+
295
+ <div class="legend" id="legend"></div>
296
+ </div>
297
+
298
+ <!-- Gradio Client Library -->
299
+ <script type="module">
300
+ import { Client } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
301
+
302
// Maps each entity type to the CSS custom property that holds its colour.
const entityColors = {
    'account_number': 'var(--entity-account_number)',
    'private_address': 'var(--entity-private_address)',
    'private_email': 'var(--entity-private_email)',
    'private_person': 'var(--entity-private_person)',
    'private_phone': 'var(--entity-private_phone)',
    'private_url': 'var(--entity-private_url)',
    'private_date': 'var(--entity-private_date)',
    'secret': 'var(--entity-secret)'
};

// Build one legend row (colour dot + upper-cased name) per entity type.
const legendContainer = document.getElementById('legend');
Object.entries(entityColors).forEach(([entity, color]) => {
    const row = document.createElement('div');
    row.className = 'legend-item';
    const caption = entity.replace('private_', '').toUpperCase();
    row.innerHTML = `<div class="legend-color" style="background-color: ${color}"></div><span>${caption}</span>`;
    legendContainer.appendChild(row);
});

// Handles to the DOM nodes the rest of the script reads and updates.
const [inputText, analyzeBtn, spinner, resultsPanel, outputText, summaryPanel] = [
    'inputText',
    'analyzeBtn',
    'spinner',
    'resultsPanel',
    'outputText',
    'summaryPanel'
].map((id) => document.getElementById(id));
const btnText = document.querySelector('.btn-text');

// Backend connection; stays null until initClient() succeeds.
let client = null;
331
+
332
/**
 * Connect to the backend served from the page's own origin and store the
 * connection in `client`. A failure is logged and leaves `client` untouched.
 */
async function initClient() {
    const origin = window.location.origin;
    try {
        const connection = await Client.connect(origin);
        client = connection;
    } catch (err) {
        console.error("Failed to connect to Gradio server:", err);
    }
}

// Kick off the connection as soon as the script loads.
initClient();
341
+
342
// Send the trimmed textarea content to /predict and render the response.
analyzeBtn.addEventListener('click', async () => {
    const text = inputText.value.trim();
    if (text.length === 0) {
        return;
    }

    // Retry the connection once if the initial attempt did not succeed.
    if (!client) {
        await initClient();
    }
    if (!client) {
        alert("Could not connect to the backend. Is it running?");
        return;
    }

    // Toggle the button between its busy and idle appearance.
    const setBusy = (busy) => {
        analyzeBtn.disabled = busy;
        spinner.style.display = busy ? 'block' : 'none';
        btnText.textContent = busy ? 'Analyzing...' : 'Analyze Text';
    };

    setBusy(true);
    resultsPanel.style.display = 'none';

    try {
        // Call the Gradio API
        const response = await client.predict("/predict", { text });
        renderResults(text, response.data[0]);

        resultsPanel.style.display = 'flex';
        resultsPanel.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
    } catch (err) {
        console.error("Analysis error:", err);
        alert("An error occurred during analysis: " + err.message);
    } finally {
        setBusy(false);
    }
});
377
+
378
/**
 * Render the per-type summary cards and the highlighted version of `text`.
 *
 * @param {string} text - The exact string that was sent for analysis; entity
 *     offsets are interpreted relative to it.
 * @param {Array<Object>} entities - Detected spans carrying `entity` (or
 *     `entity_group`), `start` and `end`. The array is not modified.
 */
function renderResults(text, entities) {
    if (!entities || entities.length === 0) {
        outputText.textContent = text;
        summaryPanel.innerHTML = '<div class="summary-card"><span class="count">0</span><span class="label">Entities Found</span></div>';
        return;
    }

    // Entity type without a leading B-/I-/E-/S- tag; 'unknown' when absent.
    const baseType = (ent) => {
        const raw = String(ent.entity || ent.entity_group || 'unknown');
        return /^[BIES]-/.test(raw) ? raw.substring(2) : raw;
    };

    // Translucent variant of a colour. Appending hex alpha digits ("40",
    // "80") does not work here: the colours are `var(--…)` references or the
    // 3-digit fallback "#888", so the result is not a valid colour value.
    const tint = (color, percent) => `color-mix(in srgb, ${color} ${percent}%, transparent)`;

    // Count summary
    const counts = {};
    for (const ent of entities) {
        const type = baseType(ent);
        counts[type] = (counts[type] || 0) + 1;
    }

    summaryPanel.innerHTML = Object.entries(counts).map(([type, count]) => {
        const color = entityColors[type] || '#888';
        const label = escapeHtml(type.replace('private_', '').replace(/_/g, ' '));
        return `<div class="summary-card" style="border-left: 4px solid ${color}">` +
            `<span class="count" style="color: ${color}">${count}</span>` +
            `<span class="label">${label}</span>` +
            `</div>`;
    }).join('');

    // Keep only spans with usable integer offsets (a missing offset arrives
    // as null, not undefined). filter() returns a new array, so sorting does
    // not reorder the caller's data.
    const spans = entities
        .filter((ent) => Number.isInteger(ent.start) && Number.isInteger(ent.end))
        .sort((a, b) => a.start - b.start);

    let lastIdx = 0;
    let html = '';

    for (const ent of spans) {
        // Clip spans that overlap the previous one so no text is emitted twice.
        const start = Math.max(ent.start, lastIdx);
        if (ent.end <= start) {
            continue;
        }

        html += escapeHtml(text.substring(lastIdx, start));

        const type = baseType(ent);
        const color = entityColors[type] || '#888';
        const label = escapeHtml(type.replace('private_', ''));

        // Emitted without line breaks: the output container uses
        // `white-space: pre-wrap`, so whitespace inside the markup would be
        // rendered inside the highlighted entity.
        html += `<span class="entity" style="background-color: ${tint(color, 25)}; border: 1px solid ${tint(color, 50)}; color: #fff;">` +
            escapeHtml(text.substring(start, ent.end)) +
            `<span class="entity-label" style="background-color: ${color}">${label}</span>` +
            `</span>`;

        lastIdx = ent.end;
    }

    if (lastIdx < text.length) {
        html += escapeHtml(text.substring(lastIdx));
    }

    outputText.innerHTML = html;
}
440
+
441
/**
 * Escape the five HTML-significant characters (& < > " ') in a string so it
 * can be placed into markup as text.
 *
 * @param {string} unsafe - Text to escape.
 * @returns {string} The escaped text.
 */
function escapeHtml(unsafe) {
    const replacements = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&#039;"
    };
    return unsafe.replace(/[&<>"']/g, (ch) => replacements[ch]);
}
449
+ </script>
450
+ </body>
451
+ </html>
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ fastapi
3
+ transformers
4
+ torch
5
+ spaces
test.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import json
3
+
4
def _to_builtin(value):
    """Fallback for ``json.dumps``: unwrap scalar wrappers exposing ``.item()``.

    NOTE(review): the pipeline's "score" values are presumably NumPy float32
    scalars, which ``json`` cannot serialise on its own — confirm against the
    installed transformers version.
    """
    if hasattr(value, "item"):
        return value.item()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


# Smoke test: run the model on one sentence and dump the grouped entities.
classifier = pipeline(
    task="token-classification",
    model="openai/privacy-filter",
    aggregation_strategy="simple",
)

res = classifier("My name is Alice Smith and my email is alice@example.com.")
print(json.dumps(res, indent=2, default=_to_builtin))