from flask import Flask, render_template_string, request, jsonify
from flask_cors import CORS
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import os
import sys
import threading
import time

# Flask application object; CORS(app) applies flask_cors with its default settings.
app = Flask(__name__)
CORS(app)

# Model loading state shared between request handling code and the background
# loader thread (load_model_async rebinds these module globals).
# NOTE(review): no lock protects these names in the visible code; they are
# plain module attributes, so "thread-safe" here relies on simple rebinding
# only - confirm that is sufficient for the readers of this state.
model_name = "OpenMed/privacy-filter-nemotron"  # model id passed to from_pretrained()
classifier = None      # token-classification pipeline; stays None until loading succeeds
tokenizer = None       # tokenizer instance; stays None until it has been loaded
model_loading = False  # True only while load_model_async() is running
model_error = None     # str(exception) from a failed load, otherwise None
model_thread = None    # background loader thread, created further down in this module

# Background model loading
def load_model_async():
    """Load the tokenizer, model and pipeline and publish them as module globals.

    Takes no arguments and returns nothing, so it can be used directly as a
    thread target. Progress is reported with ``print(..., flush=True)``.

    Effects on module state:
        * ``model_loading`` is True while the function runs and is reset to
          False in every case (``finally``).
        * ``tokenizer`` is rebound as soon as the tokenizer has been loaded,
          i.e. before the model itself is available.
        * ``classifier`` is rebound once the pipeline has been built.
        * ``model_error`` becomes None on success, or ``str(exception)`` when
          anything inside the ``try`` block raises; the traceback is printed.
    """
    global classifier, tokenizer, model_loading, model_error

    model_loading = True
    hf_cache_dir = "/app/.cache/huggingface"
    rule = "=" * 60

    for banner_line in (
        rule,
        "BACKGROUND: Loading OpenMed Privacy Filter model...",
        rule,
    ):
        print(banner_line, flush=True)

    try:
        print(f"Loading tokenizer and model: {model_name}", flush=True)
        print("This may take 5-10 minutes on first run...", flush=True)

        # Tokenizer and model are loaded explicitly and handed to pipeline()
        # instead of letting pipeline() resolve them from the model name.
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=hf_cache_dir)
        token_model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            cache_dir=hf_cache_dir,
        )

        classifier = pipeline(
            task="token-classification",
            model=token_model,
            tokenizer=tokenizer,
            # Alternatives noted by the original author: none, simple, average, max.
            aggregation_strategy="first",
            device=-1,
        )

        print("✓ Model loaded successfully!", flush=True)
        model_error = None
    except Exception as exc:
        # Keep the message for status reporting and dump the stack for the logs.
        model_error = str(exc)
        print(f"✗ ERROR loading model: {exc}", flush=True)
        import traceback
        traceback.print_exc()
    finally:
        model_loading = False

# Start model loading in background as soon as this module is imported.
# The thread is created with daemon=True, so it does not keep the interpreter
# alive at shutdown.
# NOTE(review): model_loading is only set to True inside load_model_async(),
# so between start() and the thread's first statement the state reads
# "not loading, not loaded" - confirm no status reader can run in that window.
model_thread = threading.Thread(target=load_model_async, daemon=True)
model_thread.start()

def escape_html(text):
    """Escape HTML special characters to prevent XSS.

    Delegates to the standard library's ``html.escape`` with ``quote=True``,
    which yields exactly the replacements the previous hand-written chain
    produced: ``&`` -> ``&amp;``, ``<`` -> ``&lt;``, ``>`` -> ``&gt;``,
    ``"`` -> ``&quot;`` and ``'`` -> ``&#x27;`` (with ``&`` handled first so
    the generated entities are not escaped twice).

    Args:
        text: The string to escape.

    Returns:
        The escaped string, safe to place in HTML text or quoted attributes.
    """
    # Function-scope import keeps this helper self-contained.
    import html

    return html.escape(text, quote=True)

def create_line_chunks(text, max_tokens=2048):
    """Split text into chunks that respect line boundaries.

    Consecutive lines are grouped until adding the next line would push the
    chunk past ``max_tokens``; a line is never cut in the middle. A single
    line that is itself longer than ``max_tokens`` still becomes (part of) a
    chunk, because there is no smaller unit to fall back to.

    Token counts are the sum of per-line counts obtained from the module-level
    ``tokenizer`` (called with ``add_special_tokens=False``); the newline
    characters joining the lines are not counted separately.

    Args:
        text: The full input text.
        max_tokens: Upper bound on the summed per-line token count of a chunk.

    Returns:
        A list of ``(char_start, chunk_text, token_count)`` tuples in document
        order. ``char_start`` is the offset of ``chunk_text`` inside ``text``,
        i.e. ``text[char_start:char_start + len(chunk_text)] == chunk_text``.
        When no tokenizer is loaded, the whole text is returned as one chunk
        whose count is the number of whitespace-separated words.
    """
    # Read the global once so the check and the calls below use the same object.
    active_tokenizer = tokenizer
    if active_tokenizer is None:
        return [(0, text, len(text.split()))]

    chunks = []
    current_lines = []
    current_token_count = 0
    current_char_start = 0

    for line in text.split('\n'):
        line_token_count = len(
            active_tokenizer(line, add_special_tokens=False)['input_ids']
        )

        # Only flush when there is something to flush: an over-long first
        # line has to be accepted as-is.
        if current_lines and current_token_count + line_token_count > max_tokens:
            chunk_text = '\n'.join(current_lines)
            chunks.append((current_char_start, chunk_text, current_token_count))

            # The next chunk begins right after this chunk plus the single
            # newline that separated it from ``line``. Searching the text for
            # ``line`` instead would hit an earlier occurrence whenever the
            # line is repeated or empty.
            current_char_start += len(chunk_text) + 1
            current_lines = [line]
            current_token_count = line_token_count
        else:
            current_lines.append(line)
            current_token_count += line_token_count

    # Add final chunk
    if current_lines:
        chunks.append(
            (current_char_start, '\n'.join(current_lines), current_token_count)
        )

    return chunks

def merge_adjacent_entities(entities, max_gap=5):
    """Merge adjacent entities of the same type that are likely from tokenization splits.

    Entities are processed in order of their ``start`` offset. An entity is
    folded into the previous merged entity when both carry the same label and
    it starts no more than ``max_gap`` characters after the previous one ends
    (touching and overlapping spans included).

    Text is combined as follows:
        * gap between the spans: the two texts are joined with one space;
        * partial overlap: the accumulated text is cut at the point where the
          new entity starts and the new text is appended;
        * new entity fully inside the accumulated span: the accumulated text
          is kept unchanged, so its tail is not lost.

    Args:
        entities: Iterable of dicts read via ``.get`` for the keys
            ``entity_group`` (or ``entity`` as fallback), ``word``, ``start``,
            ``end`` and ``score``. Missing keys default to ``'unknown'``,
            ``''`` or ``0`` respectively. The input dicts are not modified.
        max_gap: Largest number of characters allowed between two spans for
            them to be merged. Defaults to 5.

    Returns:
        A new list of dicts with the keys ``entity_group``, ``entity``,
        ``word``, ``start``, ``end`` and ``score`` (the highest score of the
        merged parts). A falsy ``entities`` argument is returned as-is.
    """
    if not entities:
        return entities

    merged = []
    for item in sorted(entities, key=lambda e: e.get('start', 0)):
        label = item.get('entity_group') or item.get('entity', 'unknown')
        start = item.get('start', 0)
        end = item.get('end', 0)
        word = item.get('word', '')
        score = item.get('score', 0)

        previous = merged[-1] if merged else None
        mergeable = (
            previous is not None
            and previous['entity_group'] == label
            and start <= previous['end'] + max_gap
        )

        if not mergeable:
            merged.append({
                'entity_group': label,
                'entity': label,
                'word': word,
                'start': start,
                'end': end,
                'score': score,
            })
            continue

        if start > previous['end']:
            # Separate spans close to each other: join with a single space.
            previous['word'] = previous['word'] + ' ' + word
        elif end > previous['end']:
            # Partial overlap: drop the overlapping tail, then append.
            previous['word'] = previous['word'][:start - previous['start']] + word
        # Otherwise the new span is contained in the previous one; keep the
        # accumulated text so nothing after the contained part is cut off.

        previous['end'] = max(previous['end'], end)
        previous['score'] = max(previous['score'], score)  # Use highest score

    return merged

# HTML Template with proper loading states
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>OpenMed Privacy Filter + Nemotron - PII Detection Demo</title>
    <style>
        * { box-sizing: border-box; margin: 0; padding: 0; }
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
            background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
            min-height: 100vh;
            color: #fff;
            padding: 20px;
        }
        .container { max-width: 1000px; margin: 0 auto; }
        h1 {
            text-align: center; margin-bottom: 10px;
            background: linear-gradient(90deg, #00d4ff, #7b2cbf);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            font-size: 2.5rem;
        }
        .subtitle { text-align: center; color: #8892b0; margin-bottom: 30px; }
        .card {
            background: rgba(255,255,255,0.05);
            border-radius: 12px;
            padding: 25px;
            margin-bottom: 20px;
            backdrop-filter: blur(10px);
            border: 1px solid rgba(255,255,255,0.1);
        }
        textarea {
            width: 100%; min-height: 180px; padding: 15px;
            border-radius: 8px; border: 1px solid rgba(255,255,255,0.2);
            background: rgba(0,0,0,0.3); color: #fff;
            font-size: 14px; resize: vertical; font-family: monospace;
        }
        textarea::placeholder { color: #666; }
        button {
            width: 100%; padding: 15px; margin-top: 15px;
            border: none; border-radius: 8px;
            background: linear-gradient(90deg, #00d4ff, #7b2cbf);
            color: #fff; font-size: 16px; font-weight: 600;
            cursor: pointer; transition: transform 0.2s, box-shadow 0.2s;
        }
        button:hover:not(:disabled) {
            transform: translateY(-2px);
            box-shadow: 0 5px 25px rgba(0,212,255,0.4);
        }
        button:disabled {
            opacity: 0.6; cursor: not-allowed;
            background: linear-gradient(90deg, #666, #444);
        }
        .results { display: none; }
        .results.active { display: block; }
        .result-text {
            background: rgba(0,0,0,0.3); padding: 20px;
            border-radius: 8px; font-family: monospace;
            line-height: 1.8; word-wrap: break-word;
            white-space: pre-wrap;
        }
        .entity {
            padding: 2px 8px; border-radius: 4px;
            font-weight: bold;
        }
        /* Entity label colors for 55 labels */
        .entity-first_name { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
        .entity-last_name { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
        .entity-user_name { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
        .entity-age { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-gender { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-race_ethnicity { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-sexuality { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-religious_belief { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-political_view { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-marital_status { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-nationality { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-education_level { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-occupation { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-employment_status { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-language { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-blood_type { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-biometric_identifier { background: rgba(255,209,102,0.3); border: 1px solid #ffd166; }
        .entity-email { background: rgba(78,205,196,0.3); border: 1px solid #4ecdc4; }
        .entity-phone_number { background: rgba(78,205,196,0.3); border: 1px solid #4ecdc4; }
        .entity-fax_number { background: rgba(78,205,196,0.3); border: 1px solid #4ecdc4; }
        .entity-url { background: rgba(131,56,236,0.3); border: 1px solid #8338ec; }
        .entity-street_address { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
        .entity-city { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
        .entity-county { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
        .entity-state { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
        .entity-country { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
        .entity-postcode { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
        .entity-coordinate { background: rgba(6,214,160,0.3); border: 1px solid #06d6a0; }
        .entity-date { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; }
        .entity-date_of_birth { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; }
        .entity-date_time { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; }
        .entity-time { background: rgba(58,134,255,0.3); border: 1px solid #3a86ff; }
        .entity-ssn { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
        .entity-national_id { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
        .entity-tax_id { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
        .entity-account_number { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
        .entity-bank_routing_number { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
        .entity-swift_bic { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
        .entity-credit_debit_card { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
        .entity-cvv { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
        .entity-pin { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
        .entity-password { background: rgba(239,71,111,0.3); border: 1px solid #ef476f; }
        .entity-medical_record_number { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
        .entity-health_plan_beneficiary_number { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
        .entity-customer_id { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
        .entity-employee_id { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
        .entity-unique_id { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
        .entity-certificate_license_number { background: rgba(255,0,110,0.3); border: 1px solid #ff006e; }
        .entity-license_plate { background: rgba(131,56,236,0.3); border: 1px solid #8338ec; }
        .entity-vehicle_identifier { background: rgba(131,56,236,0.3); border: 1px solid #8338ec; }
        .entity-ipv4 { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
        .entity-ipv6 { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
        .entity-mac_address { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
        .entity-device_identifier { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
        .entity-api_key { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
        .entity-http_cookie { background: rgba(255,107,107,0.3); border: 1px solid #ff6b6b; }
        .legend {
            display: flex; flex-wrap: wrap; gap: 10px;
            margin-top: 15px; justify-content: center;
        }
        .legend-item {
            display: flex; align-items: center;
            gap: 5px; font-size: 12px;
        }
        .legend-color {
            width: 20px; height: 20px;
            border-radius: 4px; border: 1px solid;
        }
        .details-list { margin-top: 20px; }
        .detail-item {
            display: flex; justify-content: space-between;
            align-items: center; padding: 12px;
            background: rgba(255,255,255,0.03);
            border-radius: 6px; margin-bottom: 8px;
        }
        .detail-type { font-weight: bold; color: #00d4ff; }
        .detail-score { font-size: 12px; color: #8892b0; }
        .error-box {
            background: rgba(239,71,111,0.2);
            border: 1px solid #ef476f;
            padding: 15px;
            border-radius: 8px;
            margin-top: 15px;
            color: #ff6b6b;
        }
        .info-box {
            background: rgba(0,212,255,0.1);
            border-left: 3px solid #00d4ff;
            padding: 15px; margin-bottom: 20px;
            border-radius: 0 8px 8px 0;
        }
        .info-box h3 { margin-bottom: 5px; }
        .info-box ul { margin-left: 20px; color: #8892b0; }
        .status-indicator {
            display: inline-block;
            width: 10px; height: 10px;
            border-radius: 50%;
            margin-right: 8px;
        }
        .status-ok { background: #06d6a0; }
        .status-error { background: #ef476f; }
        .status-loading { background: #ffd166; animation: pulse 1s infinite; }
        .status-waiting { background: #3a86ff; }
        @keyframes pulse {
            0%, 100% { opacity: 1; }
            50% { opacity: 0.3; }
        }
        #modelStatus {
            text-align: center;
            margin-bottom: 15px;
            padding: 15px;
            background: rgba(0,0,0,0.3);
            border-radius: 8px;
            font-size: 14px;
        }
        .loading-spinner {
            display: inline-block;
            width: 20px; height: 20px;
            border: 3px solid rgba(255,255,255,0.3);
            border-top-color: #00d4ff;
            border-radius: 50%;
            animation: spin 1s linear infinite;
            margin-right: 10px;
            vertical-align: middle;
        }
        @keyframes spin {
            to { transform: rotate(360deg); }
        }
        .progress-bar {
            width: 100%;
            height: 4px;
            background: rgba(255,255,255,0.1);
            border-radius: 2px;
            margin-top: 10px;
            overflow: hidden;
        }
        .progress-fill {
            height: 100%;
            background: linear-gradient(90deg, #00d4ff, #7b2cbf);
            animation: progress 2s ease-in-out infinite;
        }
        @keyframes progress {
            0% { width: 0%; transform: translateX(-100%); }
            50% { width: 70%; transform: translateX(50%); }
            100% { width: 0%; transform: translateX(200%); }
        }
        .controls-row {
            display: grid;
            grid-template-columns: 1fr 1fr auto;
            gap: 15px;
            margin-bottom: 15px;
        }
        .control-group label {
            display: block;
            font-size: 12px;
            color: #8892b0;
            margin-bottom: 5px;
        }
        .control-group input[type="range"] {
            width: 100%;
            cursor: pointer;
        }
        #chunkValue, #tokenCount {
            color: #00d4ff;
            font-weight: bold;
        }
        .stats-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
            gap: 15px;
            margin-top: 15px;
        }
        .stat-box {
            background: rgba(0,0,0,0.2);
            padding: 15px;
            border-radius: 8px;
            text-align: center;
        }
        .stat-value {
            font-size: 24px;
            font-weight: bold;
            color: #00d4ff;
        }
        .stat-label {
            font-size: 12px;
            color: #8892b0;
            margin-top: 5px;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>OpenMed Privacy Filter + Nemotron</h1>
        <p class="subtitle">PII Detection & Masking Demo using Flask</p>
        
        <div id="modelStatus">
            <span id="statusIndicator" class="status-indicator status-loading"></span>
            <span id="statusText">Waiting for server to start...</span>
            <div class="progress-bar" id="progressBar">
                <div class="progress-fill"></div>
            </div>
        </div>
        
        <div class="info-box">
            <h3>Detects 55 Types of PII:</h3>
            <ul>
                <li><strong>Names & Identifiers:</strong> first_name, last_name, user_name, age, gender, race_ethnicity, sexuality, religious_belief, political_view, marital_status, nationality, education_level, occupation, employment_status, language, blood_type, biometric_identifier</li>
                <li><strong>Contact Info:</strong> email, phone_number, fax_number, url</li>
                <li><strong>Location:</strong> street_address, city, county, state, country, postcode, coordinate</li>
                <li><strong>Dates & Times:</strong> date, date_of_birth, date_time, time</li>
                <li><strong>Government IDs:</strong> ssn, national_id, tax_id</li>
                <li><strong>Financial:</strong> account_number, bank_routing_number, swift_bic, credit_debit_card, cvv, pin, password</li>
                <li><strong>Health:</strong> medical_record_number, health_plan_beneficiary_number</li>
                <li><strong>Identification:</strong> customer_id, employee_id, unique_id, certificate_license_number</li>
                <li><strong>Vehicle:</strong> license_plate, vehicle_identifier</li>
                <li><strong>Network/Device:</strong> ipv4, ipv6, mac_address, device_identifier, api_key, http_cookie</li>
            </ul>
        </div>
        
        <div class="card">
            <div class="controls-row">
                <div class="control-group">
                    <label>Chunk Size: <span id="chunkValue">10000</span> tokens</label>
                    <input type="range" id="chunkSize" min="128" max="128000" value="10000" oninput="updateChunkDisplay()">
                </div>
                <div class="control-group">
                    <label>Document Tokens: <span id="tokenCount">0</span></label>
                    <div style="color: #8892b0; font-size: 12px;">Max chunk: 128000 (model limit)</div>
                </div>
            </div>
            <textarea id="inputText" placeholder="Enter text with PII here...\n\nExample: My name is Alice Smith and my email is alice.smith@example.com. You can reach me at (555) 123-4567 or visit me at 123 Main Street, New York. My SSN is 123-45-6789."></textarea>
            <button onclick="analyzeText()" id="analyzeBtn" disabled>Waiting for model...</button>
            <div id="errorBox" class="error-box" style="display: none;"></div>
        </div>
        
        <div class="card results" id="resultsCard">
            <h3 style="margin-bottom: 15px;">Results</h3>
            <div class="stats-grid" id="statsGrid" style="display: none;">
                <div class="stat-box">
                    <div class="stat-value" id="statEntities">0</div>
                    <div class="stat-label">Entities Found</div>
                </div>
                <div class="stat-box">
                    <div class="stat-value" id="statChunks">0</div>
                    <div class="stat-label">Chunks Processed</div>
                </div>
                <div class="stat-box">
                    <div class="stat-value" id="statTime">0s</div>
                    <div class="stat-label">Processing Time</div>
                </div>
            </div>
            <div class="result-text" id="resultDisplay" style="margin-top: 15px;"></div>
            
            <div class="legend" id="legendContainer">
                <!-- Legend items will be dynamically generated -->
            </div>
            
            <div class="details-list" id="detailsList"></div>
        </div>
    </div>
    
    <script>
        let statusCheckInterval = null;
        let isModelLoaded = false;
        let retryCount = 0;
        const maxRetries = 200;
        
        // Define all 55 labels for legend generation
        const allLabels = [
            { id: 'first_name', name: 'First Name', color: '#ff6b6b' },
            { id: 'last_name', name: 'Last Name', color: '#ff6b6b' },
            { id: 'user_name', name: 'Username', color: '#ff6b6b' },
            { id: 'age', name: 'Age', color: '#ffd166' },
            { id: 'gender', name: 'Gender', color: '#ffd166' },
            { id: 'race_ethnicity', name: 'Race/Ethnicity', color: '#ffd166' },
            { id: 'sexuality', name: 'Sexuality', color: '#ffd166' },
            { id: 'religious_belief', name: 'Religion', color: '#ffd166' },
            { id: 'political_view', name: 'Political View', color: '#ffd166' },
            { id: 'marital_status', name: 'Marital Status', color: '#ffd166' },
            { id: 'nationality', name: 'Nationality', color: '#ffd166' },
            { id: 'education_level', name: 'Education', color: '#ffd166' },
            { id: 'occupation', name: 'Occupation', color: '#ffd166' },
            { id: 'employment_status', name: 'Employment', color: '#ffd166' },
            { id: 'language', name: 'Language', color: '#ffd166' },
            { id: 'blood_type', name: 'Blood Type', color: '#ffd166' },
            { id: 'biometric_identifier', name: 'Biometric', color: '#ffd166' },
            { id: 'email', name: 'Email', color: '#4ecdc4' },
            { id: 'phone_number', name: 'Phone', color: '#4ecdc4' },
            { id: 'fax_number', name: 'Fax', color: '#4ecdc4' },
            { id: 'url', name: 'URL', color: '#8338ec' },
            { id: 'street_address', name: 'Address', color: '#06d6a0' },
            { id: 'city', name: 'City', color: '#06d6a0' },
            { id: 'county', name: 'County', color: '#06d6a0' },
            { id: 'state', name: 'State', color: '#06d6a0' },
            { id: 'country', name: 'Country', color: '#06d6a0' },
            { id: 'postcode', name: 'Postcode', color: '#06d6a0' },
            { id: 'coordinate', name: 'Coordinate', color: '#06d6a0' },
            { id: 'date', name: 'Date', color: '#3a86ff' },
            { id: 'date_of_birth', name: 'DOB', color: '#3a86ff' },
            { id: 'date_time', name: 'DateTime', color: '#3a86ff' },
            { id: 'time', name: 'Time', color: '#3a86ff' },
            { id: 'ssn', name: 'SSN', color: '#ef476f' },
            { id: 'national_id', name: 'National ID', color: '#ef476f' },
            { id: 'tax_id', name: 'Tax ID', color: '#ef476f' },
            { id: 'account_number', name: 'Account #', color: '#ef476f' },
            { id: 'bank_routing_number', name: 'Routing #', color: '#ef476f' },
            { id: 'swift_bic', name: 'SWIFT/BIC', color: '#ef476f' },
            { id: 'credit_debit_card', name: 'Card #', color: '#ef476f' },
            { id: 'cvv', name: 'CVV', color: '#ef476f' },
            { id: 'pin', name: 'PIN', color: '#ef476f' },
            { id: 'password', name: 'Password', color: '#ef476f' },
            { id: 'medical_record_number', name: 'Medical #', color: '#ff006e' },
            { id: 'health_plan_beneficiary_number', name: 'Health Plan #', color: '#ff006e' },
            { id: 'customer_id', name: 'Customer ID', color: '#ff006e' },
            { id: 'employee_id', name: 'Employee ID', color: '#ff006e' },
            { id: 'unique_id', name: 'Unique ID', color: '#ff006e' },
            { id: 'certificate_license_number', name: 'License #', color: '#ff006e' },
            { id: 'license_plate', name: 'License Plate', color: '#8338ec' },
            { id: 'vehicle_identifier', name: 'Vehicle ID', color: '#8338ec' },
            { id: 'ipv4', name: 'IPv4', color: '#ff6b6b' },
            { id: 'ipv6', name: 'IPv6', color: '#ff6b6b' },
            { id: 'mac_address', name: 'MAC Address', color: '#ff6b6b' },
            { id: 'device_identifier', name: 'Device ID', color: '#ff6b6b' },
            { id: 'api_key', name: 'API Key', color: '#ff6b6b' },
            { id: 'http_cookie', name: 'Cookie', color: '#ff6b6b' }
        ];
        
        function generateLegend() {
            const container = document.getElementById('legendContainer');
            container.innerHTML = allLabels.map(label => `
                <div class="legend-item">
                    <div class="legend-color entity-${label.id}" style="background: ${label.color}33; border-color: ${label.color};"></div>
                    ${label.name}
                </div>
            `).join('');
        }
        
        function updateChunkDisplay() {
            document.getElementById('chunkValue').textContent = document.getElementById('chunkSize').value;
        }
        
        async function updateTokenCount() {
            const text = document.getElementById('inputText').value;
            if (!text.trim()) {
                document.getElementById('tokenCount').textContent = '0';
                return;
            }
            try {
                const response = await fetch('/tokenize', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({ text })
                });
                if (response.ok) {
                    const data = await response.json();
                    document.getElementById('tokenCount').textContent = data.token_count || 0;
                }
            } catch (e) {
                console.error('Token count failed:', e);
            }
        }
        
        document.getElementById('inputText').addEventListener('input', 
            debounce(updateTokenCount, 500)
        );
        
        function debounce(func, wait) {
            let timeout;
            return function executedFunction(...args) {
                const later = () => {
                    clearTimeout(timeout);
                    func(...args);
                };
                clearTimeout(timeout);
                timeout = setTimeout(later, wait);
            };
        }
        
        function updateStatus(state, message) {
            const statusIndicator = document.getElementById("statusIndicator");
            const statusText = document.getElementById("statusText");
            const progressBar = document.getElementById("progressBar");
            const btn = document.getElementById("analyzeBtn");
            
            switch(state) {
                case 'connecting':
                    statusIndicator.className = "status-indicator status-waiting";
                    statusText.innerHTML = `<span class="loading-spinner"></span>${message}`;
                    btn.disabled = true;
                    btn.textContent = "Server is starting up...";
                    progressBar.style.display = "block";
                    break;
                case 'loading':
                    statusIndicator.className = "status-indicator status-loading";
                    statusText.innerHTML = `<span class="loading-spinner"></span>${message}`;
                    btn.disabled = true;
                    btn.textContent = "Model is loading...";
                    progressBar.style.display = "block";
                    break;
                case 'ready':
                    statusIndicator.className = "status-indicator status-ok";
                    statusText.innerHTML = "&#10003; " + message;
                    btn.disabled = false;
                    btn.textContent = "Detect PII";
                    progressBar.style.display = "none";
                    break;
                case 'error':
                    statusIndicator.className = "status-indicator status-error";
                    statusText.innerHTML = "&#10007; " + message;
                    btn.disabled = true;
                    btn.textContent = "Model unavailable";
                    progressBar.style.display = "none";
                    break;
            }
        }
        
        /**
         * Query /health once and reflect the result in the status banner.
         *
         * Outcomes:
         *   - model still loading  -> show the loading banner and keep polling
         *   - model loaded         -> show ready, stop polling, reset retryCount
         *   - model failed         -> show the server-reported error, stop polling
         *   - server unreachable   -> show "waiting" and (re)start 5 s polling
         * After maxRetries calls the function gives up and stops polling.
         */
        async function checkModelStatus() {
            retryCount++;
            
            if (retryCount > maxRetries) {
                updateStatus('error', 'Server did not respond after 16 minutes. <button onclick="location.reload()">Refresh</button>');
                clearInterval(statusCheckInterval);
                statusCheckInterval = null;
                return;
            }
            
            try {
                const response = await fetch("/health", {
                    method: "GET",
                    headers: { "Cache-Control": "no-cache" }
                });
                
                // /health answers 503 together with a JSON body when loading
                // failed, so the body must be read before judging the status
                // code; otherwise the "Model failed" branch can never run.
                let data = null;
                try {
                    data = await response.json();
                } catch (parseError) {
                    data = null;
                }
                
                const isHealthPayload = data !== null && typeof data === "object" && "model_loaded" in data;
                if (!isHealthPayload) {
                    // Not our endpoint talking (proxy error page, empty body...).
                    throw new Error(`HTTP ${response.status}`);
                }
                
                console.log("Health check:", data);
                
                if (data.model_loading) {
                    updateStatus('loading', `Model loading... (5-10 minutes on first run)`);
                    isModelLoaded = false;
                    // Keep polling until loading finishes; no-op if a timer is already active.
                    if (!statusCheckInterval) {
                        statusCheckInterval = setInterval(checkModelStatus, 5000);
                    }
                } else if (data.model_loaded) {
                    updateStatus('ready', 'Model loaded and ready');
                    clearInterval(statusCheckInterval);
                    statusCheckInterval = null;
                    isModelLoaded = true;
                    retryCount = 0;
                } else {
                    // The error text originates from a server-side exception
                    // message; escape it before it is placed into markup.
                    // NOTE(review): updateStatus is handed markup elsewhere in
                    // this script, so it presumably renders HTML - confirm.
                    const safeError = escapeHtml(data.error || "Unknown error");
                    const errorBox = document.getElementById("errorBox");
                    updateStatus('error', `Model failed: ${safeError}`);
                    errorBox.style.display = "block";
                    errorBox.innerHTML = `<strong>Error:</strong> ${escapeHtml(data.error || "Unknown")}`;
                    clearInterval(statusCheckInterval);
                    statusCheckInterval = null;
                    isModelLoaded = false;
                }
            } catch (error) {
                console.error("Health check failed:", error);
                updateStatus('connecting', `Waiting for server... (attempt ${retryCount})`);
                if (!statusCheckInterval) {
                    statusCheckInterval = setInterval(checkModelStatus, 5000);
                }
            }
        }
        
        // Page bootstrap, executed as soon as this script is parsed: call
        // generateLegend(), then perform the first /health status check.
        generateLegend();
        checkModelStatus();
        
        /**
         * Send the textarea content to POST /analyze and render the outcome.
         *
         * Shows a validation message for blank input, disables the button
         * while the request is in flight, fills the statistics tiles on
         * success and shows the error box on failure. The button is only
         * re-enabled afterwards when the model is known to be loaded.
         */
        async function analyzeText() {
            const text = document.getElementById("inputText").value;
            const btn = document.getElementById("analyzeBtn");
            const resultsCard = document.getElementById("resultsCard");
            const errorBox = document.getElementById("errorBox");
            const chunkSize = document.getElementById("chunkSize").value;
            
            if (!text.trim()) {
                errorBox.style.display = "block";
                errorBox.textContent = "Please enter some text first!";
                return;
            }
            
            btn.disabled = true;
            btn.innerHTML = '<span class="loading-spinner"></span>Analyzing...';
            errorBox.style.display = "none";
            document.getElementById("statsGrid").style.display = "grid";
            
            // An empty or non-numeric field parses to NaN, which JSON.stringify
            // would send as null. Leave the key out instead so the server
            // falls back to its own default chunk size.
            const payload = { text };
            const parsedChunkSize = parseInt(chunkSize, 10);
            if (Number.isFinite(parsedChunkSize) && parsedChunkSize > 0) {
                payload.chunk_size = parsedChunkSize;
            }
            
            const startTime = Date.now();
            
            try {
                const response = await fetch("/analyze", {
                    method: "POST",
                    headers: { "Content-Type": "application/json" },
                    body: JSON.stringify(payload)
                });
                
                // A proxy or crash page may not be JSON; report the HTTP
                // status rather than a cryptic parser message.
                let data;
                try {
                    data = await response.json();
                } catch (parseError) {
                    throw new Error(`Server returned an unreadable response (HTTP ${response.status})`);
                }
                
                if (!response.ok || !data || !data.success) {
                    throw new Error((data && data.error) || "Server error");
                }
                
                const processTime = ((Date.now() - startTime) / 1000).toFixed(2);
                document.getElementById("statEntities").textContent = data.entity_count || 0;
                document.getElementById("statChunks").textContent = data.chunks_processed || 1;
                document.getElementById("statTime").textContent = processTime + "s";
                
                displayResults(data, text);
                resultsCard.classList.add("active");
                
            } catch (error) {
                console.error("Error:", error);
                errorBox.style.display = "block";
                errorBox.textContent = "Error: " + error.message;
                resultsCard.classList.remove("active");
            } finally {
                if (isModelLoaded) {
                    btn.disabled = false;
                    btn.textContent = "Detect PII";
                }
            }
        }
        
        /**
         * Render the analysed text with highlighted entities plus a detail list.
         *
         * data.entities is expected to hold objects with label, text, start,
         * end and score, where start/end are character offsets into
         * originalText. Every server-supplied string is escaped or sanitised
         * before it becomes part of the markup.
         */
        function displayResults(data, originalText) {
            // Labels end up inside a class attribute, where HTML escaping alone
            // is not enough; keep only characters that are valid in a class token.
            const classToken = (label) => String(label).replace(/[^A-Za-z0-9_-]/g, "_");
            const detailsList = document.getElementById("detailsList");
            let html = "";
            let lastEnd = 0;
            
            if (data.entities && data.entities.length > 0) {
                // Sort a copy so the response object is left untouched.
                const sorted = data.entities.slice().sort((a, b) => a.start - b.start);
                
                for (const entity of sorted) {
                    html += escapeHtml(originalText.slice(lastEnd, entity.start));
                    html += `<span class="entity entity-${classToken(entity.label)}">${escapeHtml(entity.text)}</span>`;
                    lastEnd = entity.end;
                }
                html += escapeHtml(originalText.slice(lastEnd));
                
                const detailsHtml = sorted.map(e => `
                    <div class="detail-item">
                        <div>
                            <span class="detail-type">${escapeHtml(e.label)}</span>: ${escapeHtml(e.text)}
                        </div>
                        <div class="detail-score">Score: ${(e.score * 100).toFixed(2)}%</div>
                    </div>
                `).join("");
                detailsList.innerHTML = "<h4 style='margin:20px 0 10px 0;'>Detected Entities:</h4>" + detailsHtml;
            } else {
                html = escapeHtml(originalText) + "\\n\\n[No PII detected]";
                detailsList.innerHTML = "";
            }
            
            document.getElementById("resultDisplay").innerHTML = html;
        }
        
        /**
         * Escape a value for safe insertion into HTML text or attribute values.
         *
         * Replaces the ampersand, angle brackets and both quote characters with
         * entities. null and undefined become the empty string; any other
         * value is converted with String() first.
         */
        function escapeHtml(text) {
            if (text === null || text === undefined) {
                return "";
            }
            // The ampersand must be handled first so the entities produced by
            // the later replacements are not escaped a second time.
            return String(text)
                .replace(/&/g, "&amp;")
                .replace(/</g, "&lt;")
                .replace(/>/g, "&gt;")
                .replace(/"/g, "&quot;")
                .replace(/'/g, "&#39;");
        }
        
        // Stop the health-polling timer when the page is about to unload.
        window.addEventListener("beforeunload", function stopStatusPolling() {
            if (!statusCheckInterval) {
                return;
            }
            clearInterval(statusCheckInterval);
        });
        
        // Keyboard shortcut: Ctrl+Enter (or Cmd+Enter) in the input box starts
        // an analysis, exactly like clicking the Analyze button.
        document.addEventListener('DOMContentLoaded', () => {
            document.getElementById('inputText').addEventListener('keydown', function(e) {
                if (!(e.ctrlKey || e.metaKey) || e.key !== 'Enter') return;
                // Honour the button state: it is disabled while the model is
                // unavailable or a request is already running, and the
                // shortcut must not bypass that.
                if (document.getElementById('analyzeBtn').disabled) return;
                analyzeText();
            });
        });
    </script>
</body>
</html>
'''
@app.route('/')
def index():
    """Serve the single-page UI rendered from ``HTML_TEMPLATE``."""
    page = render_template_string(HTML_TEMPLATE)
    return page

@app.route('/health')
def health():
    """Report the model state as JSON.

    * ``classifier`` is set      -> 200, ``status: healthy``
    * ``model_loading`` is true  -> 200, ``status: loading``
    * anything else              -> 503, ``status: unhealthy`` plus ``error``
    """
    # The module-level state is only read here, so no ``global`` is required.
    if classifier is not None:
        ready = {
            'status': 'healthy',
            'model_loaded': True,
            'model_loading': False
        }
        return jsonify(ready)

    if model_loading:
        pending = {
            'status': 'loading',
            'model_loaded': False,
            'model_loading': True,
            'message': 'Model is still loading, please wait...'
        }
        return jsonify(pending)

    # Neither loaded nor loading: report the recorded error, or a generic one.
    failure_reason = model_error or 'Model loading failed or thread terminated unexpectedly'
    failed = {
        'status': 'unhealthy',
        'model_loaded': False,
        'model_loading': False,
        'error': failure_reason
    }
    return jsonify(failed), 503

@app.route('/tokenize', methods=['POST'])
def tokenize():
    """Count the tokens of the ``text`` field in a JSON request body.

    Responses:
        200 ``{'success': True, 'token_count': n}`` on success (``n`` is 0
            for missing or empty text).
        400 when the body is not a JSON object or ``text`` is not a string.
        500 when the tokenizer itself raises.
        503 while the tokenizer has not been loaded yet.
    """
    if tokenizer is None:
        return jsonify({'success': False, 'error': 'Tokenizer not loaded yet'}), 503

    # silent=True turns a malformed or non-JSON body into None, so it is
    # reported as a client error (400) instead of surfacing as a 500.
    data = request.get_json(silent=True)
    if not data:
        return jsonify({'success': False, 'error': 'No JSON data'}), 400
    if not isinstance(data, dict):
        return jsonify({'success': False, 'error': 'JSON body must be an object'}), 400

    text = data.get('text', '')
    if not text:
        # Same shape as the normal success response, so clients can rely on
        # the 'success' key being present.
        return jsonify({'success': True, 'token_count': 0})
    if not isinstance(text, str):
        return jsonify({'success': False, 'error': "'text' must be a string"}), 400

    try:
        # Special tokens are excluded so only the text itself is counted.
        tokens = tokenizer(text, add_special_tokens=False)
        token_count = len(tokens['input_ids'])
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500

    return jsonify({
        'success': True,
        'token_count': token_count
    })

@app.route('/analyze', methods=['POST', 'OPTIONS'])
def analyze():
    """Run PII detection on the ``text`` field of a JSON request body.

    Request JSON:
        text: the string to analyse.
        chunk_size: optional positive integer passed to
            ``create_line_chunks`` as ``max_tokens``; defaults to 4096 when
            missing or null.

    Responses:
        200 ``{'success': True, 'entities': [...], 'entity_count': n,
            'chunks_processed': m}``; each entity has ``label``, ``text``,
            ``start``, ``end`` and ``score``.
        204 for OPTIONS requests.
        400 when the body is not a JSON object, ``text`` is not a string or
            ``chunk_size`` is not a positive integer.
        500 when chunking or classification raises.
        503 while the model has not been loaded yet.
    """
    if request.method == 'OPTIONS':
        return '', 204

    if classifier is None:
        return jsonify({
            'success': False,
            'error': 'Model not yet loaded. Please wait and refresh in a few minutes.'
        }), 503

    # silent=True turns a malformed or non-JSON body into None, so it is
    # reported as a client error (400) instead of surfacing as a 500.
    data = request.get_json(silent=True)
    if not data:
        return jsonify({'success': False, 'error': 'No JSON data received'}), 400
    if not isinstance(data, dict):
        return jsonify({'success': False, 'error': 'JSON body must be an object'}), 400

    text = data.get('text', '')
    if not isinstance(text, str):
        return jsonify({'success': False, 'error': "'text' must be a string"}), 400

    if not text.strip():
        return jsonify({'success': True, 'entities': [], 'entity_count': 0, 'chunks_processed': 0})

    # A client may send null (e.g. a NaN serialised by JSON.stringify) or a
    # numeric string; normalise before the value reaches the chunker.
    chunk_size = data.get('chunk_size')
    if chunk_size is None:
        chunk_size = 4096
    try:
        chunk_size = int(chunk_size)
    except (TypeError, ValueError, OverflowError):
        return jsonify({'success': False, 'error': "'chunk_size' must be a positive integer"}), 400
    if chunk_size <= 0:
        return jsonify({'success': False, 'error': "'chunk_size' must be a positive integer"}), 400

    try:
        # Create line-based chunks
        chunks = create_line_chunks(text, max_tokens=chunk_size)
        all_entities = []

        # Classify each chunk and shift the chunk-relative entity offsets so
        # they index into the full input text.
        for char_offset, chunk_text, _chunk_tokens in chunks:
            for entity in classifier(chunk_text):
                entity['start'] = entity.get('start', 0) + char_offset
                entity['end'] = entity.get('end', 0) + char_offset
                all_entities.append(entity)

        # Merge adjacent entities
        merged_entities = merge_adjacent_entities(all_entities)

        # Reduce each entity to the fields the client consumes; float() makes
        # the score JSON-serialisable if it is not a plain Python float.
        entities = [
            {
                'label': entity.get('entity_group', entity.get('entity', 'unknown')),
                'text': entity.get('word', ''),
                'start': entity.get('start', 0),
                'end': entity.get('end', 0),
                'score': float(entity.get('score', 0))
            }
            for entity in merged_entities
        ]

        return jsonify({
            'success': True,
            'entities': entities,
            'entity_count': len(entities),
            'chunks_processed': len(chunks)
        })

    except Exception as e:
        print(f"Error during analysis: {e}", flush=True)
        import traceback
        traceback.print_exc()
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

if __name__ == '__main__':
    # Script entry point: listen on all interfaces; the port is taken from
    # the PORT environment variable and falls back to 7860.
    listen_port = int(os.getenv('PORT', 7860))
    app.run(host='0.0.0.0', port=listen_port, debug=False, threaded=True)