import asyncio
import json
import logging
import os
import sys

# Ensure app modules can be imported when this script is run directly.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Configure Logging
logging.basicConfig(level=logging.INFO, format='%(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("Evaluator")

# (query substring, required key terms) pairs used to verify that a RAG
# response was actually grounded in the documents injected by
# setup_test_data().  Entries are checked in order and the LAST matching
# substring wins (mirrors the original chained-if behavior; queries
# normally match at most one entry).
RAG_KEY_TERMS = [
    ("hours", ["9 AM", "5 PM", "Monday"]),
    ("insurance", ["Cigna", "Aetna"]),
    ("cost", ["$99"]),
]


def expected_key_terms(query: str) -> list:
    """Return the key terms a grounded response to *query* must contain.

    An empty list means the query matches no known business-FAQ topic and
    the RAG content check is skipped for that case.
    """
    terms = []
    for needle, candidates in RAG_KEY_TERMS:
        if needle in query:
            terms = candidates  # last match wins, as in the original logic
    return terms


async def setup_test_data(orchestrator) -> None:
    """Inject dummy business data for RAG testing.

    Embeds a fixed set of documents for website_id 1 and adds them to
    ``orchestrator.vector_db`` so end-to-end retrieval can be verified
    against known content.
    """
    # Imported here (not at module level) so this module stays importable
    # without the full app environment, e.g. for unit tests.
    from app.services.vector_operations import VectorOperations
    import numpy as np

    print("Injecting test business data into VectorDB...")

    # Dummy data for tenant "1"
    docs = [
        {"text": "Our opening hours are Monday to Friday, 9 AM to 5 PM.", "source": "Business Hours"},
        {"text": "We accept Cigna, BlueCross, and Aetna insurance plans.", "source": "Insurance Policy"},
        {"text": "Dental cleanings start at $99 for new patients.", "source": "Pricing"},
        {"text": "We are located at 123 Main St, New York.", "source": "Location"},
    ]
    website_id = 1

    # Generate embeddings and collect matching metadata rows.
    vectors = []
    metadata = []
    for doc in docs:
        emb = await VectorOperations.get_embedding(doc['text'], is_query=False)
        vectors.append(emb)
        metadata.append(doc)

    orchestrator.vector_db.add_vectors(
        np.array(vectors, dtype=np.float32),
        metadata,
        website_id
    )
    print("✓ Test data injected.")


async def run_evaluation() -> None:
    """Run the golden-dataset evaluation: intent, risk, and RAG checks.

    Loads ``datasets/golden_evaluation_dataset.json``, scores every case
    against the live classifier/orchestrator, and prints a summary with
    intent, risk, and RAG pass rates.
    """
    # Deferred app imports: see the note in setup_test_data().
    from app.services.medical_orchestrator import get_medical_orchestrator
    from app.services.intent_classifier import get_classifier
    from app.services.context_manager import EntryContext

    print("=== Starting Golden Dataset Evaluation ===")

    # Load Dataset
    with open("datasets/golden_evaluation_dataset.json", "r") as f:
        test_cases = json.load(f)

    orchestrator = get_medical_orchestrator()
    classifier = get_classifier()

    # Setup Data
    await setup_test_data(orchestrator)

    results = {
        "total": 0,
        "intent_pass": 0,
        "risk_pass": 0,
        "rag_pass": 0,
        "failures": []
    }

    for case in test_cases:
        query = case['query']
        expected_intent = case['expected_intent']
        expected_risk = case.get('expected_risk')

        print(f"\nScanning: '{query}'")
        results["total"] += 1

        # 1. Test Intent
        intent_res = await classifier.classify(query, industry="healthcare", context={})
        actual_intent = intent_res.category.value

        # Loose match for intent (e.g. MEDICAL_CONSULT match)
        intent_match = (actual_intent == expected_intent) or \
                       (expected_intent == "BUSINESS_SPECIFIC" and actual_intent in ["FAQ", "BUSINESS_SPECIFIC"])

        if intent_match:
            results["intent_pass"] += 1
            print(f" ✓ Intent: {actual_intent}")
        else:
            print(f" ❌ Intent Mismatch: Expected {expected_intent}, Got {actual_intent}")
            results["failures"].append(f"Intent fail: {query}")

        # 2. Test Risk (using Orchestrator logic).  analyze_risk is public
        # in our refactor, so it can be invoked directly.
        actual_risk, _ = await orchestrator.analyze_risk(query, {})

        # Risk matching (Critical/High are often grouped)
        risk_match = (actual_risk == expected_risk)
        if not risk_match and expected_risk == "high" and actual_risk == "critical":
            risk_match = True

        if risk_match:
            results["risk_pass"] += 1
            print(f" ✓ Risk: {actual_risk}")
        else:
            print(f" ❌ Risk Mismatch: Expected {expected_risk}, Got {actual_risk}")
            results["failures"].append(f"Risk fail: {query}")

        # 3. Test Response (E2E)
        # Context with tenant_id="1" to match our injected data
        entry_context = EntryContext(tenant_id="1")
        response, conf, _ = await orchestrator.process_query(query, entry_context)

        # Simple validation for RAG grounding.
        rag_success = True
        if expected_intent in ["FAQ", "BUSINESS_SPECIFIC"]:
            # Check if response contains key info from our injected docs.
            key_terms = expected_key_terms(query)
            if key_terms:
                if any(term in response for term in key_terms):
                    print(f" ✓ RAG Retrieval Verified (Found '{key_terms[0]}')")
                else:
                    print(f" ❌ RAG Fail: Key terms {key_terms} not found in response: '{response[:50]}...'")
                    rag_success = False
                    results["failures"].append(f"RAG fail: {query}")

        if rag_success:
            results["rag_pass"] += 1

    print("\n=== Evaluation Summary ===")
    print(f"Total Cases: {results['total']}")
    # Guard the denominator so an empty dataset does not crash the summary
    # with ZeroDivisionError (all pass counts are 0 in that case anyway).
    denom = results['total'] or 1
    print(f"Intent Accuracy: {results['intent_pass']}/{results['total']} ({(results['intent_pass']/denom)*100:.1f}%)")
    print(f"Risk Accuracy: {results['risk_pass']}/{results['total']} ({(results['risk_pass']/denom)*100:.1f}%)")
    # rag_pass was tracked but previously never reported — include it.
    print(f"RAG Accuracy: {results['rag_pass']}/{results['total']} ({(results['rag_pass']/denom)*100:.1f}%)")

    if len(results["failures"]) > 0:
        print("\nFailures:")
        for failure in results["failures"]:
            print(f"- {failure}")


if __name__ == "__main__":
    asyncio.run(run_evaluation())