# Author: broadfield-dev
# Commit message: Update app.py
# Commit: e38102a (verified)
from flask import Flask, render_template_string, request, jsonify
from flask_cors import CORS
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import os
import sys
import threading
import time
# Flask application object; flask_cors is applied to it directly.
app = Flask(__name__)
CORS(app)
# Model loading state shared between the background loader thread and the
# request handlers. NOTE(review): these are plain module globals and no lock
# is used anywhere in this file, so "thread-safe" only holds as far as single
# name rebinding goes.
# Hugging Face model identifier passed to from_pretrained().
model_name = "OpenMed/privacy-filter-nemotron"
# Token-classification pipeline; stays None until loading succeeds.
classifier = None
# Tokenizer; set by the loader before the model itself is loaded.
tokenizer = None
# True only while load_model_async() is running.
model_loading = False
# str(exception) from the last failed load attempt, otherwise None.
model_error = None
# Handle of the background loader thread (assigned where the thread is started).
model_thread = None
# Background model loading
def load_model_async():
    """Load the tokenizer and model and publish the pipeline via module globals.

    Meant to be used as a thread target. ``model_loading`` is True for the
    duration of the call. On success ``tokenizer`` and ``classifier`` are set
    and ``model_error`` is cleared; on any exception the message is stored in
    ``model_error`` and the traceback is printed. Nothing is returned.
    """
    global classifier, tokenizer, model_loading, model_error

    banner = "=" * 60
    hf_cache_dir = "/app/.cache/huggingface"

    model_loading = True
    print(banner, flush=True)
    print("BACKGROUND: Loading OpenMed Privacy Filter model...", flush=True)
    print(banner, flush=True)

    try:
        print(f"Loading tokenizer and model: {model_name}", flush=True)
        print("This may take 5-10 minutes on first run...", flush=True)

        # The tokenizer global is published first, before the (slower) model
        # load, so it is available as soon as it exists.
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, cache_dir=hf_cache_dir
        )
        token_model = AutoModelForTokenClassification.from_pretrained(
            model_name, cache_dir=hf_cache_dir
        )

        classifier = pipeline(
            task="token-classification",
            model=token_model,
            tokenizer=tokenizer,
            # Other strategies: none, simple, average, max.
            aggregation_strategy="first",
            device=-1,
        )
        print("✓ Model loaded successfully!", flush=True)
        model_error = None
    except Exception as exc:
        # Top-level boundary of the loader thread: record and log, never raise.
        model_error = str(exc)
        print(f"✗ ERROR loading model: {exc}", flush=True)
        import traceback
        traceback.print_exc()
    finally:
        model_loading = False
# Start model loading in background.
# This runs at import time, so loading begins before the server accepts
# requests. daemon=True keeps the loader from blocking interpreter shutdown.
model_thread = threading.Thread(target=load_model_async, daemon=True)
model_thread.start()
def escape_html(text):
    """Escape HTML special characters to prevent XSS.

    Replaces ``&``, ``<``, ``>``, ``"`` and ``'`` with their HTML entities.
    The ampersand is escaped first so that the entities produced for the other
    characters are not escaped a second time.

    Args:
        text: The string to escape.

    Returns:
        The escaped string, safe to embed in HTML text or quoted attributes.
    """
    # Local import keeps this helper self-contained; the previous hand-rolled
    # chain replaced "&" with itself, leaving ampersands unescaped.
    import html

    return html.escape(text, quote=True)
def create_line_chunks(text, max_tokens=2048):
    """Split text into chunks that respect line boundaries.

    Lines are grouped until adding the next line would exceed ``max_tokens``;
    a line is never cut. A single line longer than ``max_tokens`` still becomes
    (part of) a chunk on its own terms, because there is nowhere to split it.

    Args:
        text: The full input text.
        max_tokens: Token budget per chunk, counted with the module tokenizer
            (special tokens excluded).

    Returns:
        A list of ``(char_start, chunk_text, token_count)`` tuples where
        ``text[char_start:char_start + len(chunk_text)] == chunk_text``.
        If the tokenizer is not loaded, one chunk covering the whole text is
        returned with a whitespace-split word count in place of tokens.
    """
    if tokenizer is None:
        return [(0, text, len(text.split()))]

    chunks = []
    chunk_lines = []
    chunk_tokens = 0
    chunk_start = 0
    # Offset of the current line's first character in ``text``. Tracking it
    # directly (instead of searching for the line's content) keeps offsets
    # right when the same line text occurs more than once.
    cursor = 0

    for line in text.split('\n'):
        line_tokens = len(tokenizer(line, add_special_tokens=False)['input_ids'])

        if chunk_lines and chunk_tokens + line_tokens > max_tokens:
            # Budget exceeded: close the current chunk, start a new one here.
            chunks.append((chunk_start, '\n'.join(chunk_lines), chunk_tokens))
            chunk_lines = [line]
            chunk_tokens = line_tokens
            chunk_start = cursor
        else:
            chunk_lines.append(line)
            chunk_tokens += line_tokens

        cursor += len(line) + 1  # +1 for the '\n' removed by split()

    # Add final chunk
    if chunk_lines:
        chunks.append((chunk_start, '\n'.join(chunk_lines), chunk_tokens))

    return chunks
def merge_adjacent_entities(entities, max_gap=5):
    """Merge adjacent entities of the same type that are likely tokenization splits.

    Entities are sorted by ``start``; each entity then absorbs following
    entities that carry the same label and begin no more than ``max_gap``
    characters after the current merged span ends.

    Args:
        entities: List of dicts with ``start``, ``end``, ``word``, ``score``
            and a label under ``entity_group`` (or ``entity``).
        max_gap: Largest character gap between two spans that still merges.

    Returns:
        A new list of merged dicts with keys ``entity_group``, ``entity``,
        ``word``, ``start``, ``end`` and ``score`` (the highest score of the
        merged parts). An empty/falsy input is returned unchanged.
    """
    if not entities:
        return entities

    def label_of(entity):
        return entity.get('entity_group') or entity.get('entity', 'unknown')

    ordered = sorted(entities, key=lambda item: item.get('start', 0))
    total = len(ordered)
    merged = []
    i = 0

    while i < total:
        head = ordered[i]
        label = label_of(head)
        span_start = head.get('start', 0)
        span_end = head.get('end', 0)
        span_text = head.get('word', '')
        span_score = head.get('score', 0)

        # Look ahead for same-label entities that touch, overlap or are near.
        j = i + 1
        while j < total:
            candidate = ordered[j]
            cand_start = candidate.get('start', 0)
            if label_of(candidate) != label or cand_start > span_end + max_gap:
                break

            cand_end = candidate.get('end', 0)
            cand_text = candidate.get('word', '')

            if cand_start > span_end:
                # Separated by a small gap: join with a single space.
                span_text = span_text + ' ' + cand_text
            elif cand_end > span_end:
                # Touching or partially overlapping: drop the overlapped tail
                # of the current text before appending.
                span_text = span_text[:cand_start - span_start] + cand_text
            # Otherwise the candidate lies entirely inside the current span;
            # its text is already covered, so the merged text is left intact
            # (slicing here would wrongly truncate it).

            span_end = max(span_end, cand_end)
            span_score = max(span_score, candidate.get('score', 0))
            j += 1

        merged.append({
            'entity_group': label,
            'entity': label,
            'word': span_text,
            'start': span_start,
            'end': span_end,
            'score': span_score,
        })
        i = j

    return merged
# HTML Template with proper loading states
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OpenMed Privacy Filter + Nemotron - PII Detection Demo</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
min-height: 100vh;
color: #fff;
padding: 20px;
}
.container { max-width: 1000px; margin: 0 auto; }
h1 {
text-align: center; margin-bottom: 10px;
background: linear-gradient(90deg, #00d4ff, #7b2cbf);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-size: 2.5rem;
}
.subtitle { text-align: center; color: #8892b0; margin-bottom: 30px; }
.card {
background: rgba(255,255,255,0.05);
border-radius: 12px;
padding: 25px;
margin-bottom: 20px;
backdrop-filter: blur(10px);
border: 1px solid rgba(255,255,255,0.1);
}
textarea {
width: 100%; min-height: 180px; padding: 15px;
border-radius: 8px; border: 1px solid rgba(255,255,255,0.2);
background: rgba(0,0,0,0.3); color: #fff;
font-size: 14px; resize: vertical; font-family: monospace;
}
textarea::placeholder { color: #666; }
button {
width: 100%; padding: 15px; margin-top: 15px;
border: none; border-radius: 8px;
background: linear-gradient(90deg, #00d4ff, #7b2cbf);
color: #fff; font-size: 16px; font-weight: 600;
cursor: pointer; transition: transform 0.2s, box-shadow 0.2s;
}
button:hover:not(:disabled) {
transform: translateY(-2px);
box-shadow: 0 5px 25px rgba(0,212,255,0.4);
}
button:disabled {
opacity: 0.6; cursor: not-allowed;
background: linear-gradient(90deg, #666, #444);
}
.results { display: none; }
.results.active { display: block; }
.result-text {
background: rgba(0,0,0,0.3); padding: 20px;
border-radius: 8px; font-family: monospace;
line-height: 1.8; word-wrap: break-word;
white-space: pre-wrap;
}
.entity {
padding: 2px 8px; border-radius: 4px;
font-weight: bold;
}
/* Entity label colors for 55 labels */
.entity-first_name { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
.entity-last_name { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
.entity-user_name { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
.entity-age { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-gender { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-race_ethnicity { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-sexuality { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-religious_belief { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-political_view { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-marital_status { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-nationality { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-education_level { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-occupation { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-employment_status { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-language { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-blood_type { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-biometric_identifier { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
.entity-email { background: rgba(78,205,196,0.3); border: 1px solid #4ecdc4; }
.entity-phone_number { background: rgba(78,205,196,0.3); border: 1px solid #4ecdc4; }
.entity-fax_number { background: rgba(78,205,196,0.3); border: 1px solid #4ecdc4; }
.entity-url { background: rgba(131,56,236,0.3); border: 1px solid #8338ec; }
.entity-street_address { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
.entity-city { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
.entity-county { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
.entity-state { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
.entity-country { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
.entity-postcode { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
.entity-coordinate { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
.entity-date { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; }
.entity-date_of_birth { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; }
.entity-date_time { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; }
.entity-time { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; }
.entity-ssn { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
.entity-national_id { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
.entity-tax_id { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
.entity-account_number { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
.entity-bank_routing_number { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
.entity-swift_bic { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
.entity-credit_debit_card { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
.entity-cvv { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
.entity-pin { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
.entity-password { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
.entity-medical_record_number { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
.entity-health_plan_beneficiary_number { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
.entity-customer_id { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
.entity-employee_id { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
.entity-unique_id { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
.entity-certificate_license_number { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
.entity-license_plate { background: rgba(131,56,236,0.3); border: 1px solid #8338ec; }
.entity-vehicle_identifier { background: rgba(131,56,236,0.3); border: 1px solid #8338ec; }
.entity-ipv4 { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
.entity-ipv6 { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
.entity-mac_address { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
.entity-device_identifier { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
.entity-api_key { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
.entity-http_cookie { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
.legend {
display: flex; flex-wrap: wrap; gap: 10px;
margin-top: 15px; justify-content: center;
}
.legend-item {
display: flex; align-items: center;
gap: 5px; font-size: 12px;
}
.legend-color {
width: 20px; height: 20px;
border-radius: 4px; border: 1px solid;
}
.details-list { margin-top: 20px; }
.detail-item {
display: flex; justify-content: space-between;
align-items: center; padding: 12px;
background: rgba(255,255,255,0.03);
border-radius: 6px; margin-bottom: 8px;
}
.detail-type { font-weight: bold; color: #00d4ff; }
.detail-score { font-size: 12px; color: #8892b0; }
.error-box {
background: rgba(239,71,111,0.2);
border: 1px solid #ef476f;
padding: 15px;
border-radius: 8px;
margin-top: 15px;
color: #ff6b6b;
}
.info-box {
background: rgba(0,212,255,0.1);
border-left: 3px solid #00d4ff;
padding: 15px; margin-bottom: 20px;
border-radius: 0 8px 8px 0;
}
.info-box h3 { margin-bottom: 5px; }
.info-box ul { margin-left: 20px; color: #8892b0; }
.status-indicator {
display: inline-block;
width: 10px; height: 10px;
border-radius: 50%;
margin-right: 8px;
}
.status-ok { background: #06d6a0; }
.status-error { background: #ef476f; }
.status-loading { background: #ffd166; animation: pulse 1s infinite; }
.status-waiting { background: #3a86ff; }
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.3; }
}
#modelStatus {
text-align: center;
margin-bottom: 15px;
padding: 15px;
background: rgba(0,0,0,0.3);
border-radius: 8px;
font-size: 14px;
}
.loading-spinner {
display: inline-block;
width: 20px; height: 20px;
border: 3px solid rgba(255,255,255,0.3);
border-top-color: #00d4ff;
border-radius: 50%;
animation: spin 1s linear infinite;
margin-right: 10px;
vertical-align: middle;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
.progress-bar {
width: 100%;
height: 4px;
background: rgba(255,255,255,0.1);
border-radius: 2px;
margin-top: 10px;
overflow: hidden;
}
.progress-fill {
height: 100%;
background: linear-gradient(90deg, #00d4ff, #7b2cbf);
animation: progress 2s ease-in-out infinite;
}
@keyframes progress {
0% { width: 0%; transform: translateX(-100%); }
50% { width: 70%; transform: translateX(50%); }
100% { width: 0%; transform: translateX(200%); }
}
.controls-row {
display: grid;
grid-template-columns: 1fr 1fr auto;
gap: 15px;
margin-bottom: 15px;
}
.control-group label {
display: block;
font-size: 12px;
color: #8892b0;
margin-bottom: 5px;
}
.control-group input[type="range"] {
width: 100%;
cursor: pointer;
}
#chunkValue, #tokenCount {
color: #00d4ff;
font-weight: bold;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 15px;
margin-top: 15px;
}
.stat-box {
background: rgba(0,0,0,0.2);
padding: 15px;
border-radius: 8px;
text-align: center;
}
.stat-value {
font-size: 24px;
font-weight: bold;
color: #00d4ff;
}
.stat-label {
font-size: 12px;
color: #8892b0;
margin-top: 5px;
}
</style>
</head>
<body>
<div class="container">
<h1>OpenMed Privacy Filter + Nemotron</h1>
<p class="subtitle">PII Detection & Masking Demo using Flask</p>
<div id="modelStatus">
<span id="statusIndicator" class="status-indicator status-loading"></span>
<span id="statusText">Waiting for server to start...</span>
<div class="progress-bar" id="progressBar">
<div class="progress-fill"></div>
</div>
</div>
<div class="info-box">
<h3>Detects 55 Types of PII:</h3>
<ul>
<li><strong>Names & Identifiers:</strong> first_name, last_name, user_name, age, gender, race_ethnicity, sexuality, religious_belief, political_view, marital_status, nationality, education_level, occupation, employment_status, language, blood_type, biometric_identifier</li>
<li><strong>Contact Info:</strong> email, phone_number, fax_number, url</li>
<li><strong>Location:</strong> street_address, city, county, state, country, postcode, coordinate</li>
<li><strong>Dates & Times:</strong> date, date_of_birth, date_time, time</li>
<li><strong>Government IDs:</strong> ssn, national_id, tax_id</li>
<li><strong>Financial:</strong> account_number, bank_routing_number, swift_bic, credit_debit_card, cvv, pin, password</li>
<li><strong>Health:</strong> medical_record_number, health_plan_beneficiary_number</li>
<li><strong>Identification:</strong> customer_id, employee_id, unique_id, certificate_license_number</li>
<li><strong>Vehicle:</strong> license_plate, vehicle_identifier</li>
<li><strong>Network/Device:</strong> ipv4, ipv6, mac_address, device_identifier, api_key, http_cookie</li>
</ul>
</div>
<div class="card">
<div class="controls-row">
<div class="control-group">
<label>Chunk Size: <span id="chunkValue">10000</span> tokens</label>
<input type="range" id="chunkSize" min="128" max="128000" value="10000" oninput="updateChunkDisplay()">
</div>
<div class="control-group">
<label>Document Tokens: <span id="tokenCount">0</span></label>
<div style="color: #8892b0; font-size: 12px;">Max chunk: 128000 (model limit)</div>
</div>
</div>
<textarea id="inputText" placeholder="Enter text with PII here...\n\nExample: My name is Alice Smith and my email is alice.smith@example.com. You can reach me at (555) 123-4567 or visit me at 123 Main Street, New York. My SSN is 123-45-6789."></textarea>
<button onclick="analyzeText()" id="analyzeBtn" disabled>Waiting for model...</button>
<div id="errorBox" class="error-box" style="display: none;"></div>
</div>
<div class="card results" id="resultsCard">
<h3 style="margin-bottom: 15px;">Results</h3>
<div class="stats-grid" id="statsGrid" style="display: none;">
<div class="stat-box">
<div class="stat-value" id="statEntities">0</div>
<div class="stat-label">Entities Found</div>
</div>
<div class="stat-box">
<div class="stat-value" id="statChunks">0</div>
<div class="stat-label">Chunks Processed</div>
</div>
<div class="stat-box">
<div class="stat-value" id="statTime">0s</div>
<div class="stat-label">Processing Time</div>
</div>
</div>
<div class="result-text" id="resultDisplay" style="margin-top: 15px;"></div>
<div class="legend" id="legendContainer">
<!-- Legend items will be dynamically generated -->
</div>
<div class="details-list" id="detailsList"></div>
</div>
</div>
<script>
let statusCheckInterval = null;
let isModelLoaded = false;
let retryCount = 0;
const maxRetries = 200;
// Define all 55 labels for legend generation
const allLabels = [
{ id: 'first_name', name: 'First Name', color: '#ff6b6b' },
{ id: 'last_name', name: 'Last Name', color: '#ff6b6b' },
{ id: 'user_name', name: 'Username', color: '#ff6b6b' },
{ id: 'age', name: 'Age', color: '#ffd166' },
{ id: 'gender', name: 'Gender', color: '#ffd166' },
{ id: 'race_ethnicity', name: 'Race/Ethnicity', color: '#ffd166' },
{ id: 'sexuality', name: 'Sexuality', color: '#ffd166' },
{ id: 'religious_belief', name: 'Religion', color: '#ffd166' },
{ id: 'political_view', name: 'Political View', color: '#ffd166' },
{ id: 'marital_status', name: 'Marital Status', color: '#ffd166' },
{ id: 'nationality', name: 'Nationality', color: '#ffd166' },
{ id: 'education_level', name: 'Education', color: '#ffd166' },
{ id: 'occupation', name: 'Occupation', color: '#ffd166' },
{ id: 'employment_status', name: 'Employment', color: '#ffd166' },
{ id: 'language', name: 'Language', color: '#ffd166' },
{ id: 'blood_type', name: 'Blood Type', color: '#ffd166' },
{ id: 'biometric_identifier', name: 'Biometric', color: '#ffd166' },
{ id: 'email', name: 'Email', color: '#4ecdc4' },
{ id: 'phone_number', name: 'Phone', color: '#4ecdc4' },
{ id: 'fax_number', name: 'Fax', color: '#4ecdc4' },
{ id: 'url', name: 'URL', color: '#8338ec' },
{ id: 'street_address', name: 'Address', color: '#06d6a0' },
{ id: 'city', name: 'City', color: '#06d6a0' },
{ id: 'county', name: 'County', color: '#06d6a0' },
{ id: 'state', name: 'State', color: '#06d6a0' },
{ id: 'country', name: 'Country', color: '#06d6a0' },
{ id: 'postcode', name: 'Postcode', color: '#06d6a0' },
{ id: 'coordinate', name: 'Coordinate', color: '#06d6a0' },
{ id: 'date', name: 'Date', color: '#3a86ff' },
{ id: 'date_of_birth', name: 'DOB', color: '#3a86ff' },
{ id: 'date_time', name: 'DateTime', color: '#3a86ff' },
{ id: 'time', name: 'Time', color: '#3a86ff' },
{ id: 'ssn', name: 'SSN', color: '#ef476f' },
{ id: 'national_id', name: 'National ID', color: '#ef476f' },
{ id: 'tax_id', name: 'Tax ID', color: '#ef476f' },
{ id: 'account_number', name: 'Account #', color: '#ef476f' },
{ id: 'bank_routing_number', name: 'Routing #', color: '#ef476f' },
{ id: 'swift_bic', name: 'SWIFT/BIC', color: '#ef476f' },
{ id: 'credit_debit_card', name: 'Card #', color: '#ef476f' },
{ id: 'cvv', name: 'CVV', color: '#ef476f' },
{ id: 'pin', name: 'PIN', color: '#ef476f' },
{ id: 'password', name: 'Password', color: '#ef476f' },
{ id: 'medical_record_number', name: 'Medical #', color: '#ff006e' },
{ id: 'health_plan_beneficiary_number', name: 'Health Plan #', color: '#ff006e' },
{ id: 'customer_id', name: 'Customer ID', color: '#ff006e' },
{ id: 'employee_id', name: 'Employee ID', color: '#ff006e' },
{ id: 'unique_id', name: 'Unique ID', color: '#ff006e' },
{ id: 'certificate_license_number', name: 'License #', color: '#ff006e' },
{ id: 'license_plate', name: 'License Plate', color: '#8338ec' },
{ id: 'vehicle_identifier', name: 'Vehicle ID', color: '#8338ec' },
{ id: 'ipv4', name: 'IPv4', color: '#ff6b6b' },
{ id: 'ipv6', name: 'IPv6', color: '#ff6b6b' },
{ id: 'mac_address', name: 'MAC Address', color: '#ff6b6b' },
{ id: 'device_identifier', name: 'Device ID', color: '#ff6b6b' },
{ id: 'api_key', name: 'API Key', color: '#ff6b6b' },
{ id: 'http_cookie', name: 'Cookie', color: '#ff6b6b' }
];
function generateLegend() {
  // Build one legend entry per known label and inject them in a single write.
  const pieces = [];
  for (const label of allLabels) {
    pieces.push(`
<div class="legend-item">
<div class="legend-color entity-${label.id}" style="background: ${label.color}33; border-color: ${label.color};"></div>
${label.name}
</div>
`);
  }
  const legend = document.getElementById('legendContainer');
  legend.innerHTML = pieces.join('');
}
function updateChunkDisplay() {
  // Mirror the slider position into the "Chunk Size" label.
  const readout = document.getElementById('chunkValue');
  const slider = document.getElementById('chunkSize');
  readout.textContent = slider.value;
}
async function updateTokenCount() {
  // Ask the server for the token count of the current input and show it.
  // Blank input short-circuits to 0; request failures are only logged.
  const currentText = document.getElementById('inputText').value;
  if (currentText.trim() === '') {
    document.getElementById('tokenCount').textContent = '0';
    return;
  }

  try {
    const request = {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text: currentText })
    };
    const response = await fetch('/tokenize', request);
    if (!response.ok) {
      // Non-2xx (e.g. tokenizer not loaded yet): leave the old count in place.
      return;
    }
    const payload = await response.json();
    document.getElementById('tokenCount').textContent = payload.token_count || 0;
  } catch (e) {
    console.error('Token count failed:', e);
  }
}
document.getElementById('inputText').addEventListener('input',
debounce(updateTokenCount, 500)
);
function debounce(func, wait) {
  // Returns a wrapper that postpones func until `wait` ms have passed since
  // the wrapper was last called; only the arguments of the last call are used.
  let pendingTimer;
  return function executedFunction(...callArgs) {
    clearTimeout(pendingTimer);
    pendingTimer = setTimeout(() => {
      clearTimeout(pendingTimer);
      func(...callArgs);
    }, wait);
  };
}
function updateStatus(state, message) {
  // Render one of the four UI states (connecting / loading / ready / error).
  // `message` is inserted as HTML, so callers may pass markup; unknown states
  // leave the page untouched.
  const spinner = '<span class="loading-spinner"></span>';
  const views = {
    connecting: {
      dot: "status-indicator status-waiting",
      html: `${spinner}${message}`,
      disabled: true,
      label: "Server is starting up...",
      bar: "block"
    },
    loading: {
      dot: "status-indicator status-loading",
      html: `${spinner}${message}`,
      disabled: true,
      label: "Model is loading...",
      bar: "block"
    },
    ready: {
      dot: "status-indicator status-ok",
      html: "&#10003; " + message,
      disabled: false,
      label: "Detect PII",
      bar: "none"
    },
    error: {
      dot: "status-indicator status-error",
      html: "&#10007; " + message,
      disabled: true,
      label: "Model unavailable",
      bar: "none"
    }
  };

  if (!Object.prototype.hasOwnProperty.call(views, state)) {
    return;
  }
  const view = views[state];

  document.getElementById("statusIndicator").className = view.dot;
  document.getElementById("statusText").innerHTML = view.html;

  const button = document.getElementById("analyzeBtn");
  button.disabled = view.disabled;
  button.textContent = view.label;

  document.getElementById("progressBar").style.display = view.bar;
}
async function checkModelStatus() {
  // Poll /health and drive the status UI.
  //
  // Polling continues (every 5 s) while the server is unreachable OR still
  // loading the model, and stops on a terminal state: model ready, model
  // failed, or maxRetries polls exhausted.
  const stopPolling = () => {
    if (statusCheckInterval) {
      clearInterval(statusCheckInterval);
      statusCheckInterval = null;
    }
  };
  const ensurePolling = () => {
    if (!statusCheckInterval) {
      statusCheckInterval = setInterval(checkModelStatus, 5000);
    }
  };

  retryCount++;
  if (retryCount > maxRetries) {
    updateStatus('error', 'Server did not respond after 16 minutes. <button onclick="location.reload()">Refresh</button>');
    stopPolling();
    return;
  }

  try {
    const response = await fetch("/health", {
      method: "GET",
      headers: { "Cache-Control": "no-cache" }
    });

    // /health answers 503 *with a JSON body* when loading failed, so the body
    // is parsed before the status code is judged. Only a reply that is not a
    // health payload at all counts as "server not reachable yet".
    let data = null;
    try {
      data = await response.json();
    } catch (parseError) {
      data = null;
    }
    if (data === null || typeof data.model_loaded === "undefined") {
      throw new Error(`HTTP ${response.status}`);
    }
    console.log("Health check:", data);

    if (data.model_loading) {
      updateStatus('loading', `Model loading... (5-10 minutes on first run)`);
      isModelLoaded = false;
      // Without this the very first "loading" reply would never be followed up.
      ensurePolling();
    } else if (data.model_loaded) {
      updateStatus('ready', 'Model loaded and ready');
      stopPolling();
      isModelLoaded = true;
      retryCount = 0;
    } else {
      // The error text originates from a server-side exception message;
      // escape it because both sinks below render HTML.
      const statusReason = escapeHtml(String(data.error || "Unknown error"));
      const boxReason = escapeHtml(String(data.error || "Unknown"));
      updateStatus('error', `Model failed: ${statusReason}`);
      const errorBox = document.getElementById("errorBox");
      errorBox.style.display = "block";
      errorBox.innerHTML = `<strong>Error:</strong> ${boxReason}`;
      stopPolling();
      isModelLoaded = false;
    }
  } catch (error) {
    console.error("Health check failed:", error);
    updateStatus('connecting', `Waiting for server... (attempt ${retryCount})`);
    ensurePolling();
  }
}
// Initialize legend and status check
generateLegend();
checkModelStatus();
async function analyzeText() {
  // Send the textarea content to /analyze and render the response.
  // Errors (network, non-2xx, success=false) are shown in the error box.
  const text = document.getElementById("inputText").value;
  const btn = document.getElementById("analyzeBtn");
  const resultsCard = document.getElementById("resultsCard");
  const errorBox = document.getElementById("errorBox");
  const chunkSize = document.getElementById("chunkSize").value;

  // The Ctrl+Enter shortcut calls this function directly and therefore
  // bypasses the disabled button. Ignore calls made before the model is
  // ready or while a previous request is still in flight.
  if (!isModelLoaded || btn.disabled) {
    return;
  }

  if (!text.trim()) {
    errorBox.style.display = "block";
    errorBox.textContent = "Please enter some text first!";
    return;
  }

  btn.disabled = true;
  btn.innerHTML = '<span class="loading-spinner"></span>Analyzing...';
  errorBox.style.display = "none";
  document.getElementById("statsGrid").style.display = "grid";
  const startTime = Date.now();

  try {
    const response = await fetch("/analyze", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ text, chunk_size: parseInt(chunkSize, 10) })
    });
    const data = await response.json();
    if (!response.ok || !data.success) {
      throw new Error(data.error || "Server error");
    }

    // Wall-clock round trip as seen by the browser, in seconds.
    const processTime = ((Date.now() - startTime) / 1000).toFixed(2);
    document.getElementById("statEntities").textContent = data.entity_count || 0;
    document.getElementById("statChunks").textContent = data.chunks_processed || 1;
    document.getElementById("statTime").textContent = processTime + "s";

    displayResults(data, text);
    resultsCard.classList.add("active");
  } catch (error) {
    console.error("Error:", error);
    errorBox.style.display = "block";
    errorBox.textContent = "Error: " + error.message;
    resultsCard.classList.remove("active");
  } finally {
    if (isModelLoaded) {
      btn.disabled = false;
      btn.textContent = "Detect PII";
    }
  }
}
function displayResults(data, originalText) {
  // Render the analysed text with highlighted entity spans plus a detail list.
  //
  // data.entities: array of {label, text, start, end, score}; start/end are
  // offsets into originalText as computed by the server.
  //
  // The server is Python, whose string offsets count code points, while JS
  // strings index UTF-16 units. Slicing an array of code points keeps spans
  // aligned when the text contains characters outside the BMP (e.g. emoji).
  const chars = Array.from(originalText);
  const sliceText = (from, to) => chars.slice(from, to).join("");
  // Labels end up in a class attribute and in markup, so restrict / escape them.
  const safeClass = (label) => String(label).replace(/[^a-zA-Z0-9_-]/g, "_");

  // Sort a copy so the caller's response object is not mutated.
  const sorted = Array.isArray(data.entities)
    ? data.entities.slice().sort((a, b) => a.start - b.start)
    : [];

  let html = "";
  if (sorted.length > 0) {
    let lastEnd = 0;
    for (const entity of sorted) {
      // Skip spans already fully covered; clamp partial overlaps so no part
      // of the source text is emitted twice.
      if (entity.end <= lastEnd) {
        continue;
      }
      const spanStart = Math.max(entity.start, lastEnd);
      html += escapeHtml(sliceText(lastEnd, spanStart));
      // Show the exact source characters of the span rather than the model's
      // reconstructed token text, so the rendered document stays faithful.
      html += `<span class="entity entity-${safeClass(entity.label)}">${escapeHtml(sliceText(spanStart, entity.end))}</span>`;
      lastEnd = entity.end;
    }
    html += escapeHtml(sliceText(lastEnd));

    const detailsHtml = sorted.map(e => `
<div class="detail-item">
<div>
<span class="detail-type">${escapeHtml(String(e.label))}</span>: ${escapeHtml(e.text)}
</div>
<div class="detail-score">Score: ${(e.score * 100).toFixed(2)}%</div>
</div>
`).join("");
    document.getElementById("detailsList").innerHTML = "<h4 style='margin:20px 0 10px 0;'>Detected Entities:</h4>" + detailsHtml;
  } else {
    html = escapeHtml(originalText) + "\\n\\n[No PII detected]";
    document.getElementById("detailsList").innerHTML = "";
  }
  document.getElementById("resultDisplay").innerHTML = html;
}
function escapeHtml(text) {
  // Let the browser do the escaping: assign as text, read back as markup.
  const scratch = Object.assign(document.createElement("div"), { textContent: text });
  return scratch.innerHTML;
}
window.addEventListener("beforeunload", () => {
if (statusCheckInterval) clearInterval(statusCheckInterval);
});
document.addEventListener('DOMContentLoaded', () => {
document.getElementById('inputText').addEventListener('keydown', function(e) {
if (e.ctrlKey && e.key === 'Enter') analyzeText();
});
});
</script>
</body>
</html>
'''
@app.route('/')
def index():
    """Serve the single-page demo UI rendered from ``HTML_TEMPLATE``."""
    page = render_template_string(HTML_TEMPLATE)
    return page
@app.route('/health')
def health():
    """Health check with model loading status.

    Responds with JSON containing ``status``, ``model_loaded`` and
    ``model_loading``:

    * pipeline available      -> ``healthy`` (HTTP 200)
    * loader still running    -> ``loading`` (HTTP 200) plus a ``message``
    * neither                 -> ``unhealthy`` (HTTP 503) plus an ``error``
    """
    if classifier is not None:
        ready = {
            'status': 'healthy',
            'model_loaded': True,
            'model_loading': False,
        }
        return jsonify(ready)

    if model_loading:
        pending = {
            'status': 'loading',
            'model_loaded': False,
            'model_loading': True,
            'message': 'Model is still loading, please wait...',
        }
        return jsonify(pending)

    # Not loaded and not loading: report the recorded error, if any.
    failure = {
        'status': 'unhealthy',
        'model_loaded': False,
        'model_loading': False,
        'error': model_error or 'Model loading failed or thread terminated unexpectedly',
    }
    return jsonify(failure), 503
@app.route('/tokenize', methods=['POST'])
def tokenize():
    """Count tokens in the provided text.

    Expects a JSON object ``{"text": "..."}``.

    Returns:
        200 ``{"success": true, "token_count": N}`` (special tokens excluded),
        400 when the body is missing, not a JSON object, or ``text`` is not a
        string, 503 while the tokenizer is not loaded, 500 if tokenization
        itself fails.
    """
    if tokenizer is None:
        return jsonify({'success': False, 'error': 'Tokenizer not loaded yet'}), 503

    # silent=True turns malformed or non-JSON bodies into None so the 400
    # below is reached; otherwise the parser's own HTTP error would be caught
    # by a generic handler and reported as a 500.
    data = request.get_json(silent=True)
    if not data or not isinstance(data, dict):
        return jsonify({'success': False, 'error': 'No JSON data'}), 400

    text = data.get('text', '')
    if not isinstance(text, str):
        return jsonify({'success': False, 'error': 'Field "text" must be a string'}), 400
    if not text:
        return jsonify({'success': True, 'token_count': 0})

    try:
        # Tokenize and count
        encoded = tokenizer(text, add_special_tokens=False)
        token_count = int(len(encoded['input_ids']))
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500

    return jsonify({
        'success': True,
        'token_count': token_count,
    })
@app.route('/analyze', methods=['POST', 'OPTIONS'])
def analyze():
    """Detect PII entities in the submitted text.

    Expects a JSON object ``{"text": "...", "chunk_size": N}`` where
    ``chunk_size`` (default 4096) is the per-chunk token budget. The text is
    split on line boundaries, each chunk is classified, entity offsets are
    shifted back into whole-text coordinates, and adjacent same-label
    entities are merged.

    Returns:
        200 ``{"success": true, "entities": [...], "entity_count": N,
        "chunks_processed": M}`` where each entity has ``label``, ``text``,
        ``start``, ``end`` and ``score``; 400 for a missing/invalid body or
        field; 503 while the model is not loaded; 500 if inference fails.
    """
    if request.method == 'OPTIONS':
        return '', 204

    if classifier is None:
        return jsonify({
            'success': False,
            'error': 'Model not yet loaded. Please wait and refresh in a few minutes.'
        }), 503

    # silent=True: a malformed body becomes None and gets the 400 below
    # instead of surfacing as a 500 from the generic handler.
    data = request.get_json(silent=True)
    if not data or not isinstance(data, dict):
        return jsonify({'success': False, 'error': 'No JSON data received'}), 400

    text = data.get('text', '')
    if not isinstance(text, str):
        return jsonify({'success': False, 'error': 'Field "text" must be a string'}), 400

    try:
        chunk_size = int(data.get('chunk_size', 4096))
    except (TypeError, ValueError, OverflowError):
        return jsonify({'success': False, 'error': 'Field "chunk_size" must be an integer'}), 400
    # A non-positive budget would put every line in its own chunk.
    chunk_size = max(1, chunk_size)

    if not text.strip():
        return jsonify({'success': True, 'entities': [], 'entity_count': 0, 'chunks_processed': 0})

    try:
        # Create line-based chunks
        chunks = create_line_chunks(text, max_tokens=chunk_size)

        # Process each chunk, shifting entity positions by the chunk offset
        all_entities = []
        for char_offset, chunk_text, _chunk_tokens in chunks:
            for entity in classifier(chunk_text):
                entity['start'] = entity.get('start', 0) + char_offset
                entity['end'] = entity.get('end', 0) + char_offset
                all_entities.append(entity)

        # Merge adjacent entities, then format the output
        entities = [
            {
                'label': entity.get('entity_group', entity.get('entity', 'unknown')),
                'text': entity.get('word', ''),
                'start': int(entity.get('start', 0)),
                'end': int(entity.get('end', 0)),
                # Cast so non-builtin numeric score types serialize as JSON.
                'score': float(entity.get('score', 0)),
            }
            for entity in merge_adjacent_entities(all_entities)
        ]

        return jsonify({
            'success': True,
            'entities': entities,
            'entity_count': len(entities),
            'chunks_processed': len(chunks)
        })
    except Exception as e:
        # Request boundary: log the full traceback, return the message.
        print(f"Error during analysis: {e}", flush=True)
        import traceback
        traceback.print_exc()
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
if __name__ == '__main__':
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    listen_port = int(os.environ.get('PORT', 7860))
    app.run(host='0.0.0.0', port=listen_port, debug=False, threaded=True)