| from flask import Flask, render_template_string, request, jsonify |
| from flask_cors import CORS |
| from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline |
| import os |
| import sys |
| import threading |
| import time |
|
|
# Flask application object; CORS is enabled for all of its routes.
app = Flask(__name__)
CORS(app)


# Hugging Face model id handed to from_pretrained() in load_model_async().
model_name = "OpenMed/privacy-filter-nemotron"
# Shared state: written by load_model_async(), read by the request handlers.
classifier = None  # token-classification pipeline; None until loading succeeds
tokenizer = None  # tokenizer instance; None until it has been loaded
model_loading = False  # True while load_model_async() is running
model_error = None  # str() of the exception from a failed load, else None
model_thread = None  # background loader thread, assigned later in this module
|
|
| |
def load_model_async():
    """Load tokenizer and model, then publish the pipeline in module globals.

    Used as the target of the background loader thread.  Progress is printed
    to stdout.  On success ``tokenizer`` and ``classifier`` are set and
    ``model_error`` is cleared; on failure ``model_error`` holds the exception
    text and the traceback is printed.  ``model_loading`` is True for the
    duration of the call and always reset to False at the end.
    """
    global classifier, tokenizer, model_loading, model_error

    model_loading = True

    banner = "=" * 60
    print(banner, flush=True)
    print("BACKGROUND: Loading OpenMed Privacy Filter model...", flush=True)
    print(banner, flush=True)

    hf_cache = "/app/.cache/huggingface"
    try:
        print(f"Loading tokenizer and model: {model_name}", flush=True)
        print("This may take 5-10 minutes on first run...", flush=True)

        # The tokenizer is published as soon as it is available, before the
        # (slower) model weights are loaded.
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=hf_cache)
        token_model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            cache_dir=hf_cache,
        )

        classifier = pipeline(
            task="token-classification",
            model=token_model,
            tokenizer=tokenizer,
            aggregation_strategy="first",
            device=-1,
        )

        print("✓ Model loaded successfully!", flush=True)
        model_error = None
    except Exception as exc:
        # Top-level boundary of the worker thread: record the failure so that
        # /health can report it instead of letting the thread die silently.
        model_error = str(exc)
        print(f"✗ ERROR loading model: {exc}", flush=True)
        import traceback
        traceback.print_exc()
    finally:
        model_loading = False
|
|
| |
# Flag the load as in progress *before* the worker starts.  The worker sets
# the same flag itself, but only once it is scheduled; without this, /health
# could observe "not loaded, not loading" during thread start-up and report
# a failed load.
model_loading = True
# Daemon thread: it must not keep the process alive on shutdown.
model_thread = threading.Thread(target=load_model_async, daemon=True)
model_thread.start()
|
|
def escape_html(text):
    """Escape HTML special characters to prevent XSS.

    Converts ``&``, ``<``, ``>``, ``"`` and ``'`` to their HTML entities so
    the result is safe in both element content and quoted attribute values.

    Args:
        text: The string to escape.

    Returns:
        The escaped string.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    # quote=True also escapes both quote characters; "&" is handled first by
    # the library, so already-produced entities are never double-processed
    # within a single call.
    return escape(text, quote=True)
|
|
def create_line_chunks(text, max_tokens=2048):
    """Split text into chunks that respect line boundaries.

    Lines are grouped greedily until adding the next line would exceed
    ``max_tokens``; a line is never cut, so a single line longer than
    ``max_tokens`` becomes a chunk of its own.  Token counts are the sum of
    per-line counts (each line is tokenized separately, without special
    tokens).

    Args:
        text: The full document.
        max_tokens: Token budget per chunk.

    Returns:
        A list of ``(char_start, chunk_text, token_count)`` tuples where
        ``text[char_start:char_start + len(chunk_text)] == chunk_text``.
        When the module-level ``tokenizer`` is not loaded yet, a single chunk
        covering the whole text is returned with a whitespace word count.
    """
    if tokenizer is None:
        return [(0, text, len(text.split()))]

    chunks = []
    current_lines = []
    current_token_count = 0
    current_char_start = 0
    # Offset of the line being examined.  Tracking it directly (instead of
    # searching the text for the line) stays correct when the same line
    # content occurs more than once or the line is empty.
    line_start = 0

    for line in text.split('\n'):
        line_token_count = len(tokenizer(line, add_special_tokens=False)['input_ids'])

        if current_lines and current_token_count + line_token_count > max_tokens:
            # Budget exceeded: close the running chunk and start a new one
            # at this line.
            chunks.append((current_char_start, '\n'.join(current_lines), current_token_count))
            current_lines = [line]
            current_token_count = line_token_count
            current_char_start = line_start
        else:
            current_lines.append(line)
            current_token_count += line_token_count

        # +1 accounts for the '\n' consumed by split().
        line_start += len(line) + 1

    if current_lines:
        chunks.append((current_char_start, '\n'.join(current_lines), current_token_count))

    return chunks
|
|
def merge_adjacent_entities(entities):
    """Merge adjacent entities of the same type that are likely from tokenization splits.

    Entities are processed in order of ``start``.  An entity is folded into
    the previous merged entity when both carry the same label and it begins
    no more than 5 characters after the previous one ends.

    Args:
        entities: Dicts with ``entity_group`` (or ``entity``), ``word``,
            ``start``, ``end`` and ``score`` keys.

    Returns:
        A new list of dicts with keys ``entity_group``, ``entity``, ``word``,
        ``start``, ``end`` and ``score`` (the highest score of the merged
        parts).  A falsy input is returned unchanged.
    """
    if not entities:
        return entities

    max_gap = 5  # largest character gap still treated as the same entity

    merged = []
    for entity in sorted(entities, key=lambda item: item.get('start', 0)):
        label = entity.get('entity_group') or entity.get('entity', 'unknown')
        start = entity.get('start', 0)
        end = entity.get('end', 0)
        word = entity.get('word', '')
        score = entity.get('score', 0)

        last = merged[-1] if merged else None
        if last is not None and label == last['entity_group'] and start <= last['end'] + max_gap:
            if end <= last['end']:
                # Fully contained in the span already covered: the existing
                # text is a superset, so keep it intact rather than cutting
                # off its tail.
                pass
            elif start <= last['end']:
                # Touching or partially overlapping: keep the part before the
                # overlap and append the new piece.
                last['word'] = last['word'][:start - last['start']] + word
            else:
                # Small gap between the pieces: join them with a space.
                last['word'] = last['word'] + ' ' + word
            last['end'] = max(last['end'], end)
            last['score'] = max(last['score'], score)
        else:
            merged.append({
                'entity_group': label,
                'entity': label,
                'word': word,
                'start': start,
                'end': end,
                'score': score,
            })

    return merged
|
|
| |
| HTML_TEMPLATE = ''' |
| <!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| <title>OpenMed Privacy Filter + Nemotron - PII Detection Demo</title> |
| <style> |
| * { box-sizing: border-box; margin: 0; padding: 0; } |
| body { |
| font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif; |
| background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); |
| min-height: 100vh; |
| color: #fff; |
| padding: 20px; |
| } |
| .container { max-width: 1000px; margin: 0 auto; } |
| h1 { |
| text-align: center; margin-bottom: 10px; |
| background: linear-gradient(90deg, #00d4ff, #7b2cbf); |
| -webkit-background-clip: text; |
| -webkit-text-fill-color: transparent; |
| font-size: 2.5rem; |
| } |
| .subtitle { text-align: center; color: #8892b0; margin-bottom: 30px; } |
| .card { |
| background: rgba(255,255,255,0.05); |
| border-radius: 12px; |
| padding: 25px; |
| margin-bottom: 20px; |
| backdrop-filter: blur(10px); |
| border: 1px solid rgba(255,255,255,0.1); |
| } |
| textarea { |
| width: 100%; min-height: 180px; padding: 15px; |
| border-radius: 8px; border: 1px solid rgba(255,255,255,0.2); |
| background: rgba(0,0,0,0.3); color: #fff; |
| font-size: 14px; resize: vertical; font-family: monospace; |
| } |
| textarea::placeholder { color: #666; } |
| button { |
| width: 100%; padding: 15px; margin-top: 15px; |
| border: none; border-radius: 8px; |
| background: linear-gradient(90deg, #00d4ff, #7b2cbf); |
| color: #fff; font-size: 16px; font-weight: 600; |
| cursor: pointer; transition: transform 0.2s, box-shadow 0.2s; |
| } |
| button:hover:not(:disabled) { |
| transform: translateY(-2px); |
| box-shadow: 0 5px 25px rgba(0,212,255,0.4); |
| } |
| button:disabled { |
| opacity: 0.6; cursor: not-allowed; |
| background: linear-gradient(90deg, #666, #444); |
| } |
| .results { display: none; } |
| .results.active { display: block; } |
| .result-text { |
| background: rgba(0,0,0,0.3); padding: 20px; |
| border-radius: 8px; font-family: monospace; |
| line-height: 1.8; word-wrap: break-word; |
| white-space: pre-wrap; |
| } |
| .entity { |
| padding: 2px 8px; border-radius: 4px; |
| font-weight: bold; |
| } |
| /* Entity label colors for 55 labels */ |
| .entity-first_name { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; } |
| .entity-last_name { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; } |
| .entity-user_name { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; } |
| .entity-age { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-gender { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-race_ethnicity { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-sexuality { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-religious_belief { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-political_view { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-marital_status { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-nationality { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-education_level { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-occupation { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-employment_status { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-language { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-blood_type { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-biometric_identifier { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; } |
| .entity-email { background: rgba(78,205,196,0.3); border: 1px solid #4ecdc4; } |
| .entity-phone_number { background: rgba(78,205,196,0.3); border: 1px solid #4ecdc4; } |
| .entity-fax_number { background: rgba(78,205,196,0.3); border: 1px solid #4ecdc4; } |
| .entity-url { background: rgba(131,56,236,0.3); border: 1px solid #8338ec; } |
| .entity-street_address { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; } |
| .entity-city { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; } |
| .entity-county { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; } |
| .entity-state { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; } |
| .entity-country { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; } |
| .entity-postcode { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; } |
| .entity-coordinate { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; } |
| .entity-date { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; } |
| .entity-date_of_birth { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; } |
| .entity-date_time { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; } |
| .entity-time { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; } |
| .entity-ssn { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; } |
| .entity-national_id { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; } |
| .entity-tax_id { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; } |
| .entity-account_number { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; } |
| .entity-bank_routing_number { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; } |
| .entity-swift_bic { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; } |
| .entity-credit_debit_card { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; } |
| .entity-cvv { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; } |
| .entity-pin { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; } |
| .entity-password { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; } |
| .entity-medical_record_number { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; } |
| .entity-health_plan_beneficiary_number { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; } |
| .entity-customer_id { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; } |
| .entity-employee_id { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; } |
| .entity-unique_id { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; } |
| .entity-certificate_license_number { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; } |
| .entity-license_plate { background: rgba(131,56,236,0.3); border: 1px solid #8338ec; } |
| .entity-vehicle_identifier { background: rgba(131,56,236,0.3); border: 1px solid #8338ec; } |
| .entity-ipv4 { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; } |
| .entity-ipv6 { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; } |
| .entity-mac_address { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; } |
| .entity-device_identifier { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; } |
| .entity-api_key { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; } |
| .entity-http_cookie { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; } |
| .legend { |
| display: flex; flex-wrap: wrap; gap: 10px; |
| margin-top: 15px; justify-content: center; |
| } |
| .legend-item { |
| display: flex; align-items: center; |
| gap: 5px; font-size: 12px; |
| } |
| .legend-color { |
| width: 20px; height: 20px; |
| border-radius: 4px; border: 1px solid; |
| } |
| .details-list { margin-top: 20px; } |
| .detail-item { |
| display: flex; justify-content: space-between; |
| align-items: center; padding: 12px; |
| background: rgba(255,255,255,0.03); |
| border-radius: 6px; margin-bottom: 8px; |
| } |
| .detail-type { font-weight: bold; color: #00d4ff; } |
| .detail-score { font-size: 12px; color: #8892b0; } |
| .error-box { |
| background: rgba(239,71,111,0.2); |
| border: 1px solid #ef476f; |
| padding: 15px; |
| border-radius: 8px; |
| margin-top: 15px; |
| color: #ff6b6b; |
| } |
| .info-box { |
| background: rgba(0,212,255,0.1); |
| border-left: 3px solid #00d4ff; |
| padding: 15px; margin-bottom: 20px; |
| border-radius: 0 8px 8px 0; |
| } |
| .info-box h3 { margin-bottom: 5px; } |
| .info-box ul { margin-left: 20px; color: #8892b0; } |
| .status-indicator { |
| display: inline-block; |
| width: 10px; height: 10px; |
| border-radius: 50%; |
| margin-right: 8px; |
| } |
| .status-ok { background: #06d6a0; } |
| .status-error { background: #ef476f; } |
| .status-loading { background: #ffd166; animation: pulse 1s infinite; } |
| .status-waiting { background: #3a86ff; } |
| @keyframes pulse { |
| 0%, 100% { opacity: 1; } |
| 50% { opacity: 0.3; } |
| } |
| #modelStatus { |
| text-align: center; |
| margin-bottom: 15px; |
| padding: 15px; |
| background: rgba(0,0,0,0.3); |
| border-radius: 8px; |
| font-size: 14px; |
| } |
| .loading-spinner { |
| display: inline-block; |
| width: 20px; height: 20px; |
| border: 3px solid rgba(255,255,255,0.3); |
| border-top-color: #00d4ff; |
| border-radius: 50%; |
| animation: spin 1s linear infinite; |
| margin-right: 10px; |
| vertical-align: middle; |
| } |
| @keyframes spin { |
| to { transform: rotate(360deg); } |
| } |
| .progress-bar { |
| width: 100%; |
| height: 4px; |
| background: rgba(255,255,255,0.1); |
| border-radius: 2px; |
| margin-top: 10px; |
| overflow: hidden; |
| } |
| .progress-fill { |
| height: 100%; |
| background: linear-gradient(90deg, #00d4ff, #7b2cbf); |
| animation: progress 2s ease-in-out infinite; |
| } |
| @keyframes progress { |
| 0% { width: 0%; transform: translateX(-100%); } |
| 50% { width: 70%; transform: translateX(50%); } |
| 100% { width: 0%; transform: translateX(200%); } |
| } |
| .controls-row { |
| display: grid; |
| grid-template-columns: 1fr 1fr auto; |
| gap: 15px; |
| margin-bottom: 15px; |
| } |
| .control-group label { |
| display: block; |
| font-size: 12px; |
| color: #8892b0; |
| margin-bottom: 5px; |
| } |
| .control-group input[type="range"] { |
| width: 100%; |
| cursor: pointer; |
| } |
| #chunkValue, #tokenCount { |
| color: #00d4ff; |
| font-weight: bold; |
| } |
| .stats-grid { |
| display: grid; |
| grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); |
| gap: 15px; |
| margin-top: 15px; |
| } |
| .stat-box { |
| background: rgba(0,0,0,0.2); |
| padding: 15px; |
| border-radius: 8px; |
| text-align: center; |
| } |
| .stat-value { |
| font-size: 24px; |
| font-weight: bold; |
| color: #00d4ff; |
| } |
| .stat-label { |
| font-size: 12px; |
| color: #8892b0; |
| margin-top: 5px; |
| } |
| </style> |
| </head> |
| <body> |
| <div class="container"> |
| <h1>OpenMed Privacy Filter + Nemotron</h1> |
| <p class="subtitle">PII Detection & Masking Demo using Flask</p> |
| |
| <div id="modelStatus"> |
| <span id="statusIndicator" class="status-indicator status-loading"></span> |
| <span id="statusText">Waiting for server to start...</span> |
| <div class="progress-bar" id="progressBar"> |
| <div class="progress-fill"></div> |
| </div> |
| </div> |
| |
| <div class="info-box"> |
| <h3>Detects 55 Types of PII:</h3> |
| <ul> |
| <li><strong>Names & Identifiers:</strong> first_name, last_name, user_name, age, gender, race_ethnicity, sexuality, religious_belief, political_view, marital_status, nationality, education_level, occupation, employment_status, language, blood_type, biometric_identifier</li> |
| <li><strong>Contact Info:</strong> email, phone_number, fax_number, url</li> |
| <li><strong>Location:</strong> street_address, city, county, state, country, postcode, coordinate</li> |
| <li><strong>Dates & Times:</strong> date, date_of_birth, date_time, time</li> |
| <li><strong>Government IDs:</strong> ssn, national_id, tax_id</li> |
| <li><strong>Financial:</strong> account_number, bank_routing_number, swift_bic, credit_debit_card, cvv, pin, password</li> |
| <li><strong>Health:</strong> medical_record_number, health_plan_beneficiary_number</li> |
| <li><strong>Identification:</strong> customer_id, employee_id, unique_id, certificate_license_number</li> |
| <li><strong>Vehicle:</strong> license_plate, vehicle_identifier</li> |
| <li><strong>Network/Device:</strong> ipv4, ipv6, mac_address, device_identifier, api_key, http_cookie</li> |
| </ul> |
| </div> |
| |
| <div class="card"> |
| <div class="controls-row"> |
| <div class="control-group"> |
| <label>Chunk Size: <span id="chunkValue">10000</span> tokens</label> |
| <input type="range" id="chunkSize" min="128" max="128000" value="10000" oninput="updateChunkDisplay()"> |
| </div> |
| <div class="control-group"> |
| <label>Document Tokens: <span id="tokenCount">0</span></label> |
| <div style="color: #8892b0; font-size: 12px;">Max chunk: 128000 (model limit)</div> |
| </div> |
| </div> |
| <textarea id="inputText" placeholder="Enter text with PII here...\n\nExample: My name is Alice Smith and my email is alice.smith@example.com. You can reach me at (555) 123-4567 or visit me at 123 Main Street, New York. My SSN is 123-45-6789."></textarea> |
| <button onclick="analyzeText()" id="analyzeBtn" disabled>Waiting for model...</button> |
| <div id="errorBox" class="error-box" style="display: none;"></div> |
| </div> |
| |
| <div class="card results" id="resultsCard"> |
| <h3 style="margin-bottom: 15px;">Results</h3> |
| <div class="stats-grid" id="statsGrid" style="display: none;"> |
| <div class="stat-box"> |
| <div class="stat-value" id="statEntities">0</div> |
| <div class="stat-label">Entities Found</div> |
| </div> |
| <div class="stat-box"> |
| <div class="stat-value" id="statChunks">0</div> |
| <div class="stat-label">Chunks Processed</div> |
| </div> |
| <div class="stat-box"> |
| <div class="stat-value" id="statTime">0s</div> |
| <div class="stat-label">Processing Time</div> |
| </div> |
| </div> |
| <div class="result-text" id="resultDisplay" style="margin-top: 15px;"></div> |
| |
| <div class="legend" id="legendContainer"> |
| <!-- Legend items will be dynamically generated --> |
| </div> |
| |
| <div class="details-list" id="detailsList"></div> |
| </div> |
| </div> |
| |
| <script> |
| let statusCheckInterval = null; |
| let isModelLoaded = false; |
| let retryCount = 0; |
| const maxRetries = 200; |
| |
| // Define all 55 labels for legend generation |
| const allLabels = [ |
| { id: 'first_name', name: 'First Name', color: '#ff6b6b' }, |
| { id: 'last_name', name: 'Last Name', color: '#ff6b6b' }, |
| { id: 'user_name', name: 'Username', color: '#ff6b6b' }, |
| { id: 'age', name: 'Age', color: '#ffd166' }, |
| { id: 'gender', name: 'Gender', color: '#ffd166' }, |
| { id: 'race_ethnicity', name: 'Race/Ethnicity', color: '#ffd166' }, |
| { id: 'sexuality', name: 'Sexuality', color: '#ffd166' }, |
| { id: 'religious_belief', name: 'Religion', color: '#ffd166' }, |
| { id: 'political_view', name: 'Political View', color: '#ffd166' }, |
| { id: 'marital_status', name: 'Marital Status', color: '#ffd166' }, |
| { id: 'nationality', name: 'Nationality', color: '#ffd166' }, |
| { id: 'education_level', name: 'Education', color: '#ffd166' }, |
| { id: 'occupation', name: 'Occupation', color: '#ffd166' }, |
| { id: 'employment_status', name: 'Employment', color: '#ffd166' }, |
| { id: 'language', name: 'Language', color: '#ffd166' }, |
| { id: 'blood_type', name: 'Blood Type', color: '#ffd166' }, |
| { id: 'biometric_identifier', name: 'Biometric', color: '#ffd166' }, |
| { id: 'email', name: 'Email', color: '#4ecdc4' }, |
| { id: 'phone_number', name: 'Phone', color: '#4ecdc4' }, |
| { id: 'fax_number', name: 'Fax', color: '#4ecdc4' }, |
| { id: 'url', name: 'URL', color: '#8338ec' }, |
| { id: 'street_address', name: 'Address', color: '#06d6a0' }, |
| { id: 'city', name: 'City', color: '#06d6a0' }, |
| { id: 'county', name: 'County', color: '#06d6a0' }, |
| { id: 'state', name: 'State', color: '#06d6a0' }, |
| { id: 'country', name: 'Country', color: '#06d6a0' }, |
| { id: 'postcode', name: 'Postcode', color: '#06d6a0' }, |
| { id: 'coordinate', name: 'Coordinate', color: '#06d6a0' }, |
| { id: 'date', name: 'Date', color: '#3a86ff' }, |
| { id: 'date_of_birth', name: 'DOB', color: '#3a86ff' }, |
| { id: 'date_time', name: 'DateTime', color: '#3a86ff' }, |
| { id: 'time', name: 'Time', color: '#3a86ff' }, |
| { id: 'ssn', name: 'SSN', color: '#ef476f' }, |
| { id: 'national_id', name: 'National ID', color: '#ef476f' }, |
| { id: 'tax_id', name: 'Tax ID', color: '#ef476f' }, |
| { id: 'account_number', name: 'Account #', color: '#ef476f' }, |
| { id: 'bank_routing_number', name: 'Routing #', color: '#ef476f' }, |
| { id: 'swift_bic', name: 'SWIFT/BIC', color: '#ef476f' }, |
| { id: 'credit_debit_card', name: 'Card #', color: '#ef476f' }, |
| { id: 'cvv', name: 'CVV', color: '#ef476f' }, |
| { id: 'pin', name: 'PIN', color: '#ef476f' }, |
| { id: 'password', name: 'Password', color: '#ef476f' }, |
| { id: 'medical_record_number', name: 'Medical #', color: '#ff006e' }, |
| { id: 'health_plan_beneficiary_number', name: 'Health Plan #', color: '#ff006e' }, |
| { id: 'customer_id', name: 'Customer ID', color: '#ff006e' }, |
| { id: 'employee_id', name: 'Employee ID', color: '#ff006e' }, |
| { id: 'unique_id', name: 'Unique ID', color: '#ff006e' }, |
| { id: 'certificate_license_number', name: 'License #', color: '#ff006e' }, |
| { id: 'license_plate', name: 'License Plate', color: '#8338ec' }, |
| { id: 'vehicle_identifier', name: 'Vehicle ID', color: '#8338ec' }, |
| { id: 'ipv4', name: 'IPv4', color: '#ff6b6b' }, |
| { id: 'ipv6', name: 'IPv6', color: '#ff6b6b' }, |
| { id: 'mac_address', name: 'MAC Address', color: '#ff6b6b' }, |
| { id: 'device_identifier', name: 'Device ID', color: '#ff6b6b' }, |
| { id: 'api_key', name: 'API Key', color: '#ff6b6b' }, |
| { id: 'http_cookie', name: 'Cookie', color: '#ff6b6b' } |
| ]; |
| |
function generateLegend() {
    // Render one colour swatch plus caption for every entry of allLabels
    // and inject the whole legend in a single DOM write.
    const legendItems = [];
    for (const label of allLabels) {
        legendItems.push(`
            <div class="legend-item">
                <div class="legend-color entity-${label.id}" style="background: ${label.color}33; border-color: ${label.color};"></div>
                ${label.name}
            </div>
        `);
    }
    document.getElementById('legendContainer').innerHTML = legendItems.join('');
}
| |
function updateChunkDisplay() {
    // Mirror the current slider position into the "Chunk Size" read-out.
    const slider = document.getElementById('chunkSize');
    const readout = document.getElementById('chunkValue');
    readout.textContent = slider.value;
}
| |
async function updateTokenCount() {
    // Ask the server how many tokens the textarea content has and show the
    // number next to "Document Tokens".  Failures are logged, not shown.
    const counter = document.getElementById('tokenCount');
    const text = document.getElementById('inputText').value;

    if (text.trim() === '') {
        counter.textContent = '0';
        return;
    }

    try {
        const response = await fetch('/tokenize', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text })
        });
        if (!response.ok) {
            // Non-2xx replies (e.g. tokenizer not loaded yet) leave the
            // previous count on screen.
            return;
        }
        const payload = await response.json();
        counter.textContent = payload.token_count || 0;
    } catch (e) {
        console.error('Token count failed:', e);
    }
}
| |
| document.getElementById('inputText').addEventListener('input', |
| debounce(updateTokenCount, 500) |
| ); |
| |
function debounce(func, wait) {
    // Return a wrapper that postpones func until `wait` ms have passed
    // without another call; only the arguments of the last call are used.
    let timerId;
    return function executedFunction(...args) {
        clearTimeout(timerId);
        timerId = setTimeout(() => {
            clearTimeout(timerId);
            func(...args);
        }, wait);
    };
}
| |
function updateStatus(state, message) {
    // One row per UI state: indicator class, whether a spinner precedes the
    // message, the plain prefix used otherwise, the button caption, whether
    // the button is locked, and the progress bar's display value.
    // NOTE: message is inserted as HTML, so callers must pass trusted markup.
    const views = new Map([
        ["connecting", { dot: "status-waiting", spinner: true, prefix: "", caption: "Server is starting up...", locked: true, progress: "block" }],
        ["loading", { dot: "status-loading", spinner: true, prefix: "", caption: "Model is loading...", locked: true, progress: "block" }],
        ["ready", { dot: "status-ok", spinner: false, prefix: "✓ ", caption: "Detect PII", locked: false, progress: "none" }],
        ["error", { dot: "status-error", spinner: false, prefix: "✗ ", caption: "Model unavailable", locked: true, progress: "none" }]
    ]);

    const view = views.get(state);
    if (!view) {
        // Unknown states leave the UI untouched.
        return;
    }

    document.getElementById("statusIndicator").className = "status-indicator " + view.dot;
    document.getElementById("statusText").innerHTML = view.spinner
        ? `<span class="loading-spinner"></span>${message}`
        : view.prefix + message;

    const btn = document.getElementById("analyzeBtn");
    btn.disabled = view.locked;
    btn.textContent = view.caption;

    document.getElementById("progressBar").style.display = view.progress;
}
| |
async function checkModelStatus() {
    // Poll /health and reflect the model state in the UI.
    //  - server unreachable / no JSON status: show "connecting", keep polling
    //  - model loading: show "loading", keep polling
    //  - model ready or load failed: show the final state, stop polling
    const ensurePolling = () => {
        // Polling must run in every non-final state, including the very
        // first successful "loading" reply; otherwise the page never
        // notices when loading finishes.
        if (!statusCheckInterval) {
            statusCheckInterval = setInterval(checkModelStatus, 5000);
        }
    };
    const stopPolling = () => {
        if (statusCheckInterval) {
            clearInterval(statusCheckInterval);
            statusCheckInterval = null;
        }
    };

    retryCount++;

    if (retryCount > maxRetries) {
        updateStatus('error', 'Server did not respond after 16 minutes. <button onclick="location.reload()">Refresh</button>');
        stopPolling();
        return;
    }

    let data;
    try {
        const response = await fetch("/health", {
            method: "GET",
            headers: { "Cache-Control": "no-cache" }
        });

        // /health answers a failed load with HTTP 503 *and* a JSON body, so
        // read the body first and only treat the reply as "server not
        // there yet" when it carries no model status at all.
        data = await response.json().catch(() => null);
        if (!data || typeof data.model_loaded !== "boolean") {
            throw new Error(`HTTP ${response.status}`);
        }
    } catch (error) {
        console.error("Health check failed:", error);
        updateStatus('connecting', `Waiting for server... (attempt ${retryCount})`);
        ensurePolling();
        return;
    }

    console.log("Health check:", data);
    // The server answered, so the "did not respond" budget starts over.
    retryCount = 0;

    if (data.model_loading) {
        updateStatus('loading', `Model loading... (5-10 minutes on first run)`);
        isModelLoaded = false;
        ensurePolling();
    } else if (data.model_loaded) {
        updateStatus('ready', 'Model loaded and ready');
        stopPolling();
        isModelLoaded = true;
    } else {
        // The error text comes from the server; escape it because both
        // sinks below render HTML.
        const errorBox = document.getElementById("errorBox");
        updateStatus('error', `Model failed: ${escapeHtml(String(data.error || "Unknown error"))}`);
        errorBox.style.display = "block";
        errorBox.innerHTML = `<strong>Error:</strong> ${escapeHtml(String(data.error || "Unknown"))}`;
        stopPolling();
        isModelLoaded = false;
    }
}
| |
| // Initialize legend and status check |
| generateLegend(); |
| checkModelStatus(); |
| |
async function analyzeText() {
    // Send the textarea content to /analyze and render the detected
    // entities, the statistics and any error message.
    const text = document.getElementById("inputText").value;
    const btn = document.getElementById("analyzeBtn");
    const resultsCard = document.getElementById("resultsCard");
    const errorBox = document.getElementById("errorBox");
    const chunkSize = document.getElementById("chunkSize").value;

    // The Ctrl+Enter shortcut calls this function directly, bypassing the
    // button.  A disabled button means the model is not ready or a request
    // is already in flight, so do nothing in that case.
    if (btn.disabled) {
        return;
    }

    if (!text.trim()) {
        errorBox.style.display = "block";
        errorBox.textContent = "Please enter some text first!";
        return;
    }

    btn.disabled = true;
    btn.innerHTML = '<span class="loading-spinner"></span>Analyzing...';
    errorBox.style.display = "none";
    document.getElementById("statsGrid").style.display = "grid";

    const startTime = Date.now();

    try {
        const response = await fetch("/analyze", {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ text, chunk_size: parseInt(chunkSize, 10) })
        });

        // An error page from a proxy may not be JSON; fall back to the
        // generic message instead of surfacing a JSON parse error.
        const data = await response.json().catch(() => null);

        if (!response.ok || !data || !data.success) {
            throw new Error((data && data.error) || "Server error");
        }

        const processTime = ((Date.now() - startTime) / 1000).toFixed(2);
        document.getElementById("statEntities").textContent = data.entity_count || 0;
        document.getElementById("statChunks").textContent = data.chunks_processed || 1;
        document.getElementById("statTime").textContent = processTime + "s";

        displayResults(data, text);
        resultsCard.classList.add("active");

    } catch (error) {
        console.error("Error:", error);
        errorBox.style.display = "block";
        errorBox.textContent = "Error: " + error.message;
        resultsCard.classList.remove("active");
    } finally {
        // Only hand the button back when the model is known to be loaded.
        if (isModelLoaded) {
            btn.disabled = false;
            btn.textContent = "Detect PII";
        }
    }
}
| |
function displayResults(data, originalText) {
    // Render originalText with every detected entity highlighted, followed
    // by a details list (label, text, score) of all entities.
    const resultDisplay = document.getElementById("resultDisplay");
    const detailsList = document.getElementById("detailsList");
    const entities = Array.isArray(data.entities) ? data.entities.slice() : [];

    if (entities.length === 0) {
        resultDisplay.innerHTML = escapeHtml(originalText) + "\\n\\n[No PII detected]";
        detailsList.innerHTML = "";
        return;
    }

    entities.sort((a, b) => a.start - b.start);

    // Offsets come from the Python backend, which indexes strings by code
    // point; JavaScript strings index by UTF-16 unit.  Slicing a code-point
    // array keeps highlights aligned when the text contains characters
    // outside the BMP (e.g. emoji).
    const codePoints = Array.from(originalText);
    const sliceText = (from, to) => codePoints.slice(from, to).join("");
    // Labels end up inside a class attribute; keep only safe characters.
    const safeClass = (label) => String(label).replace(/[^A-Za-z0-9_-]/g, "");

    let html = "";
    let lastEnd = 0;
    for (const entity of entities) {
        // Skip empty spans and spans overlapping one already rendered;
        // rendering them would duplicate part of the text.
        if (entity.start < lastEnd || entity.end <= entity.start) {
            continue;
        }
        html += escapeHtml(sliceText(lastEnd, entity.start));
        // Show the source text of the span so the rendered document always
        // reads exactly like the input.
        html += `<span class="entity entity-${safeClass(entity.label)}">${escapeHtml(sliceText(entity.start, entity.end))}</span>`;
        lastEnd = entity.end;
    }
    html += escapeHtml(sliceText(lastEnd));

    const detailsHtml = entities.map(e => `
        <div class="detail-item">
            <div>
                <span class="detail-type">${escapeHtml(String(e.label))}</span>: ${escapeHtml(e.text)}
            </div>
            <div class="detail-score">Score: ${(e.score * 100).toFixed(2)}%</div>
        </div>
    `).join("");
    detailsList.innerHTML = "<h4 style='margin:20px 0 10px 0;'>Detected Entities:</h4>" + detailsHtml;

    resultDisplay.innerHTML = html;
}
| |
function escapeHtml(text) {
    // Let the browser do the escaping: assign the value as plain text to a
    // detached element and read back its serialized markup.
    const scratch = document.createElement("div");
    scratch.textContent = text;
    const escaped = scratch.innerHTML;
    return escaped;
}
| |
| window.addEventListener("beforeunload", () => { |
| if (statusCheckInterval) clearInterval(statusCheckInterval); |
| }); |
| |
| document.addEventListener('DOMContentLoaded', () => { |
| document.getElementById('inputText').addEventListener('keydown', function(e) { |
| if (e.ctrlKey && e.key === 'Enter') analyzeText(); |
| }); |
| }); |
| </script> |
| </body> |
| </html> |
| ''' |
@app.route('/')
def index():
    """Serve the single-page demo UI rendered from HTML_TEMPLATE."""
    page = render_template_string(HTML_TEMPLATE)
    return page
|
|
@app.route('/health')
def health():
    """Report whether the model is ready, still loading, or failed to load.

    Returns HTTP 200 while the model is ready or loading, and HTTP 503 with
    an ``error`` field when neither is the case.
    """
    # Ready: the pipeline has been published by the loader.
    if classifier is not None:
        ready = {
            'status': 'healthy',
            'model_loaded': True,
            'model_loading': False,
        }
        return jsonify(ready)

    # Still working on it.
    if model_loading:
        pending = {
            'status': 'loading',
            'model_loaded': False,
            'model_loading': True,
            'message': 'Model is still loading, please wait...',
        }
        return jsonify(pending)

    # Neither loaded nor loading: the loader failed or stopped.
    failed = {
        'status': 'unhealthy',
        'model_loaded': False,
        'model_loading': False,
        'error': model_error or 'Model loading failed or thread terminated unexpectedly',
    }
    return jsonify(failed), 503
|
|
@app.route('/tokenize', methods=['POST'])
def tokenize():
    """Count tokens in the provided text.

    Expects a JSON object with a string ``text`` field.

    Returns:
        200 with ``{'success': True, 'token_count': n}``;
        400 when the body is not a JSON object or ``text`` is not a string;
        503 while the tokenizer is not loaded;
        500 when tokenization itself fails.
    """
    if tokenizer is None:
        return jsonify({'success': False, 'error': 'Tokenizer not loaded yet'}), 503

    # silent=True turns malformed JSON / wrong content type into None so it
    # is reported as a client error (400) rather than a server error.
    data = request.get_json(silent=True)
    if not data or not isinstance(data, dict):
        return jsonify({'success': False, 'error': 'No JSON data'}), 400

    text = data.get('text', '')
    if text is None:
        text = ''
    if not isinstance(text, str):
        return jsonify({'success': False, 'error': "'text' must be a string"}), 400

    if not text:
        # Same response shape as the non-empty case.
        return jsonify({'success': True, 'token_count': 0})

    try:
        encoded = tokenizer(text, add_special_tokens=False)
        token_count = int(len(encoded['input_ids']))
    except Exception as e:
        # Request boundary: report the tokenizer failure to the client.
        return jsonify({'success': False, 'error': str(e)}), 500

    return jsonify({
        'success': True,
        'token_count': token_count,
    })
|
|
@app.route('/analyze', methods=['POST', 'OPTIONS'])
def analyze():
    """Detect PII entities in the posted text.

    Expects a JSON object with a string ``text`` and an optional positive
    integer ``chunk_size`` (token budget per chunk, default 4096).  The text
    is split on line boundaries, every chunk is classified, chunk-relative
    offsets are shifted to document offsets, and adjacent same-label entities
    are merged.

    Returns:
        200 with ``success``, ``entities`` (label, text, start, end, score),
        ``entity_count`` and ``chunks_processed``;
        204 for OPTIONS;
        400 for an invalid request body;
        503 while the model is not loaded;
        500 when classification fails.
    """
    if request.method == 'OPTIONS':
        return '', 204

    if classifier is None:
        return jsonify({
            'success': False,
            'error': 'Model not yet loaded. Please wait and refresh in a few minutes.'
        }), 503

    # silent=True: malformed JSON is a client error, not a server error.
    data = request.get_json(silent=True)
    if not data or not isinstance(data, dict):
        return jsonify({'success': False, 'error': 'No JSON data received'}), 400

    text = data.get('text', '')
    if text is None:
        text = ''
    if not isinstance(text, str):
        return jsonify({'success': False, 'error': "'text' must be a string"}), 400

    chunk_size = data.get('chunk_size', 4096)
    if chunk_size is None:
        # e.g. a NaN on the client serializes to null
        chunk_size = 4096
    # bool is a subclass of int, so it is rejected explicitly.
    if isinstance(chunk_size, bool) or not isinstance(chunk_size, int) or chunk_size < 1:
        return jsonify({
            'success': False,
            'error': "'chunk_size' must be a positive integer"
        }), 400

    if not text.strip():
        return jsonify({'success': True, 'entities': [], 'entity_count': 0, 'chunks_processed': 0})

    try:
        chunks = create_line_chunks(text, max_tokens=chunk_size)
        all_entities = []

        for char_offset, chunk_text, _chunk_tokens in chunks:
            # A chunk of blank lines cannot contain entities; do not send it
            # to the model.
            if not chunk_text.strip():
                continue

            for entity in classifier(chunk_text):
                # Pipeline offsets are relative to the chunk; shift them to
                # document coordinates.
                entity['start'] = entity.get('start', 0) + char_offset
                entity['end'] = entity.get('end', 0) + char_offset
                all_entities.append(entity)

        merged_entities = merge_adjacent_entities(all_entities)

        # Convert to plain Python types so the payload is JSON-serializable.
        entities = [
            {
                'label': entity.get('entity_group', entity.get('entity', 'unknown')),
                'text': entity.get('word', ''),
                'start': int(entity.get('start', 0)),
                'end': int(entity.get('end', 0)),
                'score': float(entity.get('score', 0)),
            }
            for entity in merged_entities
        ]

        return jsonify({
            'success': True,
            'entities': entities,
            'entity_count': len(entities),
            'chunks_processed': len(chunks)
        })

    except Exception as e:
        # Request boundary: log the full traceback, answer with a JSON error.
        print(f"Error during analysis: {e}", flush=True)
        import traceback
        traceback.print_exc()
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
|
|
if __name__ == '__main__':
    # The PORT environment variable overrides the default port 7860.
    listen_port = int(os.environ.get('PORT', 7860))
    # Listen on all interfaces, one thread per request, debugger off.
    app.run(
        host='0.0.0.0',
        port=listen_port,
        debug=False,
        threaded=True,
    )