"""Restaurant review sentiment analyzer served through a Gradio UI.

Loads a pickled scikit-learn-style model, a TF-IDF vectorizer, and a
text preprocessor from disk, then exposes a single-review sentiment
prediction interface (positive / negative with class probabilities).
"""

import pickle
import re
import warnings

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

warnings.filterwarnings('ignore')

# Download NLTK data (idempotent; quiet to keep startup logs readable).
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
print("✅ NLTK resources downloaded")


# ============================================================================
# CRITICAL: Define TextPreprocessor class BEFORE loading the pickle file.
# pickle resolves classes by module + name at load time, so the unpickled
# preprocessor re-binds to THIS definition; its method bodies must therefore
# stay semantically identical to the ones used at training time.
# ============================================================================
class TextPreprocessor:
    """
    Advanced text preprocessing pipeline for sentiment analysis.

    Features:
    - Lemmatization for better word normalization
    - Custom stopword filtering (preserves negation words)
    - URL and email removal
    - Special character cleaning
    - Case normalization
    """

    def __init__(self, use_lemmatization=True, remove_stopwords=True):
        """
        Initialize the preprocessor.

        Parameters:
            use_lemmatization (bool): Use lemmatization instead of stemming.
            remove_stopwords (bool): Remove stopwords from text.
        """
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.use_lemmatization = use_lemmatization
        self.remove_stopwords = remove_stopwords
        # Custom stopwords excluding important sentiment words.
        self.stop_words = set(stopwords.words('english'))
        # Remove negation words as they're crucial for sentiment.
        # NOTE(review): contracted forms ("don't") never survive clean_text's
        # punctuation stripping, so only the plain forms matter in practice.
        negation_words = {
            'not', 'no', 'nor', 'neither', 'never', 'none', 'nothing',
            'nowhere', "don't", "doesn't", "didn't", "won't", "wouldn't",
            "can't", "couldn't", "shouldn't", "wasn't", "weren't",
            "hasn't", "haven't", "hadn't"
        }
        self.stop_words = self.stop_words - negation_words

    def clean_text(self, text: str) -> str:
        """
        Clean and preprocess a single text string.

        Parameters:
            text (str): Raw text.

        Returns:
            str: Cleaned text.
        """
        # Convert to lowercase
        text = text.lower()
        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', text)
        # Remove email addresses
        text = re.sub(r'\S+@\S+', ' ', text)
        # Remove HTML tags
        text = re.sub(r'<.*?>', ' ', text)
        # Remove special characters but keep spaces
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        # Remove extra whitespaces
        text = re.sub(r'\s+', ' ', text).strip()
        # Tokenize
        words = text.split()
        # Remove stopwords if enabled
        if self.remove_stopwords:
            words = [word for word in words if word not in self.stop_words]
        # Apply lemmatization or stemming. Lemmatizing verbs first, then
        # nouns, collapses both inflection families ("running" -> "run").
        if self.use_lemmatization:
            words = [self.lemmatizer.lemmatize(word, pos='v') for word in words]
            words = [self.lemmatizer.lemmatize(word, pos='n') for word in words]
        else:
            words = [self.stemmer.stem(word) for word in words]
        return ' '.join(words)

    def fit_transform(self, texts):
        """Process multiple texts."""
        return [self.clean_text(text) for text in texts]

    def transform(self, texts):
        """Process multiple texts (alias for fit_transform)."""
        return self.fit_transform(texts)


# ============================================================================
# Load models
# ============================================================================
print("Loading models...")
try:
    # SECURITY: pickle.load executes arbitrary code from the file. These are
    # trusted local training artifacts — never load pickles from untrusted
    # sources.
    with open('best_model.pkl', 'rb') as f:
        model = pickle.load(f)
    print("✅ Model loaded")
    with open('tfidf_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    print("✅ Vectorizer loaded")
    with open('preprocessor.pkl', 'rb') as f:
        preprocessor = pickle.load(f)
    print("✅ Preprocessor loaded")
except Exception as e:
    # Fail fast: the app is useless without its artifacts.
    print(f"❌ Error loading models: {e}")
    raise


def extract_features(texts, original_texts):
    """Extract statistical features from texts.

    Parameters:
        texts: cleaned/preprocessed texts.
        original_texts: the raw texts (for surface features like punctuation).

    Returns:
        pd.DataFrame: one row per text, 6 columns. Column order must match
        the feature layout used at training time.
    """
    def _avg_word_length(text):
        # Guard: np.mean([]) is nan — a whitespace-only cleaned text would
        # otherwise poison the feature matrix (original `if text` missed this).
        words = text.split()
        return np.mean([len(word) for word in words]) if words else 0

    features = {
        'review_length': [len(text) for text in original_texts],
        'word_count': [len(text.split()) for text in texts],
        'avg_word_length': [_avg_word_length(text) for text in texts],
        'exclamation_count': [text.count('!') for text in original_texts],
        'question_count': [text.count('?') for text in original_texts],
        'capital_ratio': [
            sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
            for text in original_texts
        ]
    }
    return pd.DataFrame(features)


def predict_sentiment(review_text):
    """Predict sentiment for a review.

    Parameters:
        review_text (str): raw review text from the UI.

    Returns:
        tuple[str, str, str, str, str]: (sentiment label, confidence,
        negative probability, positive probability, cleaned text) — all
        pre-formatted for display; errors are reported in the first slot.
    """
    if not review_text or not review_text.strip():
        return "⚠️ Please enter a review!", "", "", "", ""
    try:
        # Preprocess
        cleaned = preprocessor.clean_text(review_text)
        # Vectorize
        vectorized = vectorizer.transform([cleaned]).toarray()
        # Extract additional features
        add_features = extract_features([cleaned], [review_text])
        # Combine features (TF-IDF columns first, stats after — training order)
        X_new = np.concatenate([vectorized, add_features.values], axis=1)
        # Predict
        prediction = model.predict(X_new)[0]

        # Get probabilities if available
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(X_new)[0]
            confidence = max(proba)
            # predict_proba columns follow model.classes_ order — look up the
            # positive class instead of assuming [negative, positive].
            classes = list(getattr(model, 'classes_', [0, 1]))
            pos_idx = classes.index(1) if 1 in classes else 1
            prob_pos = proba[pos_idx]
            prob_neg = proba[1 - pos_idx]
        else:
            confidence = None
            prob_neg = None
            prob_pos = None

        # Format output. Compare against None, not truthiness: a probability
        # of exactly 0.0 is a valid value, not "N/A".
        sentiment = "✅ Positive 😊" if prediction == 1 else "❌ Negative 😞"
        conf_str = f"{confidence:.2%}" if confidence is not None else "N/A"
        neg_str = f"{prob_neg:.2%}" if prob_neg is not None else "N/A"
        pos_str = f"{prob_pos:.2%}" if prob_pos is not None else "N/A"
        return sentiment, conf_str, neg_str, pos_str, cleaned
    except Exception as e:
        # Surface the error in the UI rather than crashing the app.
        return f"❌ Error: {str(e)}", "", "", "", ""


# Create Gradio interface
print("Creating Gradio interface...")
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Restaurant Review Sentiment Analyzer"
) as demo:
    gr.Markdown("""
    # 🍽️ Restaurant Review Sentiment Analyzer
    ### AI-Powered Sentiment Analysis with Machine Learning
    Enter a restaurant review to analyze its sentiment in real-time!

    **Model:** Advanced ML Classification
    **Accuracy:** 85%+
    **Features:** TF-IDF + Statistical Text Analysis
    """)

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### 📝 Enter Your Review")
            input_text = gr.Textbox(
                label="Restaurant Review",
                placeholder="e.g., The food was amazing and the service was excellent!",
                lines=5
            )
            with gr.Row():
                submit_btn = gr.Button("🔍 Analyze Sentiment", variant="primary", size="lg")
                clear_btn = gr.ClearButton([input_text], value="🗑️ Clear", size="lg")
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Analysis Results")
            sentiment_output = gr.Textbox(label="🎯 Predicted Sentiment", interactive=False)
            confidence_output = gr.Textbox(label="📈 Confidence Score", interactive=False)
            with gr.Row():
                neg_prob = gr.Textbox(label="😞 Negative Probability", interactive=False)
                pos_prob = gr.Textbox(label="😊 Positive Probability", interactive=False)

    with gr.Accordion("🔍 Preprocessing Details", open=False):
        cleaned_output = gr.Textbox(
            label="Cleaned Review Text (After Preprocessing)",
            interactive=False,
            lines=3
        )
        gr.Markdown("""
        **Preprocessing Steps:**
        1. Convert to lowercase
        2. Remove URLs, emails, HTML tags
        3. Remove special characters
        4. Remove stopwords (keep negations)
        5. Apply lemmatization
        6. Extract statistical features
        """)

    gr.Markdown("---")
    gr.Markdown("### 💡 Try These Example Reviews")
    gr.Examples(
        examples=[
            ["The food was absolutely amazing! Best restaurant I've ever been to!"],
            ["Terrible service and the food was cold. Never coming back."],
            ["Outstanding! The staff was friendly and attentive."],
            ["Worst meal ever. Complete waste of money."],
            ["Good food but portions were small. Reasonable prices."],
            ["Fantastic! Every dish was cooked to perfection!"],
        ],
        inputs=input_text,
        label="Click to try"
    )

    gr.Markdown("""
    ---
    ### 📚 About This Model

    **Machine Learning Pipeline:**
    - **Preprocessing:** Lemmatization, stopword removal, text normalization
    - **Features:** TF-IDF (1500 features, bigrams) + 6 statistical features
    - **Algorithm:** Ensemble machine learning (Random Forest / SVM / Gradient Boosting)
    - **Accuracy:** 85%+ on test data
    - **Metrics:** High precision, recall, and F1-score

    **Technologies:** Python • Scikit-learn • NLTK • Gradio • Pandas • NumPy

    **Developer:** Einstein Ellandala | Project: ML-06-BML11 | October 2025
    """)

    submit_btn.click(
        fn=predict_sentiment,
        inputs=input_text,
        outputs=[sentiment_output, confidence_output, neg_prob, pos_prob, cleaned_output]
    )

print("✅ Gradio interface created")
print("🚀 Launching application...")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)