import gradio as gr
import pickle
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
print("βœ… NLTK resources downloaded")

# ============================================================================
# CRITICAL: Define TextPreprocessor BEFORE loading the pickle file
# (pickle resolves the stored instance's class by name at load time)
# ============================================================================

class TextPreprocessor:
    """
    Advanced text preprocessing pipeline for sentiment analysis.
    
    Features:
    - Lemmatization for better word normalization
    - Custom stopword filtering (preserves negation words)
    - URL and email removal
    - Special character cleaning
    - Case normalization
    """
    
    def __init__(self, use_lemmatization=True, remove_stopwords=True):
        """
        Initialize the preprocessor.
        
        Parameters:
            use_lemmatization (bool): Use lemmatization instead of stemming
            remove_stopwords (bool): Remove stopwords from text
        """
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.use_lemmatization = use_lemmatization
        self.remove_stopwords = remove_stopwords
        
        # Custom stopwords excluding important sentiment words
        self.stop_words = set(stopwords.words('english'))
        
        # Drop negation words from the stopword set so they are kept in the
        # text; negations are crucial for sentiment. (The contraction forms
        # below never match in practice, since clean_text strips apostrophes
        # before stopword filtering.)
        negation_words = {
            'not', 'no', 'nor', 'neither', 'never', 'none',
            'nothing', 'nowhere', "don't", "doesn't", "didn't",
            "won't", "wouldn't", "can't", "couldn't", "shouldn't",
            "wasn't", "weren't", "hasn't", "haven't", "hadn't"
        }
        self.stop_words = self.stop_words - negation_words
    
    def clean_text(self, text: str) -> str:
        """
        Clean and preprocess a single text string.
        
        Parameters:
            text (str): Raw text
            
        Returns:
            str: Cleaned text
        """
        # Convert to lowercase
        text = text.lower()
        
        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
        
        # Remove email addresses
        text = re.sub(r'\S+@\S+', ' ', text)
        
        # Remove HTML tags
        text = re.sub(r'<.*?>', ' ', text)
        
        # Remove special characters but keep spaces
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        
        # Remove extra whitespaces
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Tokenize
        words = text.split()
        
        # Remove stopwords if enabled
        if self.remove_stopwords:
            words = [word for word in words if word not in self.stop_words]
        
        # Apply lemmatization or stemming
        if self.use_lemmatization:
            words = [self.lemmatizer.lemmatize(word, pos='v') for word in words]
            words = [self.lemmatizer.lemmatize(word, pos='n') for word in words]
        else:
            words = [self.stemmer.stem(word) for word in words]
        
        return ' '.join(words)
    
    def fit_transform(self, texts):
        """Process multiple texts."""
        return [self.clean_text(text) for text in texts]
    
    def transform(self, texts):
        """Process multiple texts (alias for fit_transform)."""
        return self.fit_transform(texts)
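
# Illustrative example of the cleaning pipeline (exact output can vary
# slightly with NLTK data versions):
#   TextPreprocessor().clean_text("The staff was NOT friendly!! Visit http://example.com")
#   -> "staff not friendly visit"   (URL stripped, stopwords dropped, 'not' kept)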

# ============================================================================
# Load models
# ============================================================================

print("Loading models...")
try:
    with open('best_model.pkl', 'rb') as f:
        model = pickle.load(f)
    print("βœ… Model loaded")
    
    with open('tfidf_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    print("βœ… Vectorizer loaded")
    
    with open('preprocessor.pkl', 'rb') as f:
        preprocessor = pickle.load(f)
    print("βœ… Preprocessor loaded")
    
except Exception as e:
    print(f"❌ Error loading models: {e}")
    raise
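
# Optional sanity check (illustrative addition, not part of the original
# pipeline): the model's expected input width should equal the TF-IDF
# vocabulary size plus the 6 statistical features appended by
# extract_features() below. Assumes scikit-learn estimators exposing
# n_features_in_ and vocabulary_.
_expected = getattr(model, 'n_features_in_', None)
if _expected is not None and hasattr(vectorizer, 'vocabulary_'):
    _produced = len(vectorizer.vocabulary_) + 6
    if _expected != _produced:
        print(f"⚠️ Feature width mismatch: model expects {_expected}, "
              f"pipeline produces {_produced}")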

# Feature extraction function
def extract_features(texts, original_texts):
    """Extract statistical features from texts."""
    features = {
        'review_length': [len(text) for text in original_texts],
        'word_count': [len(text.split()) for text in texts],
        'avg_word_length': [
            np.mean([len(word) for word in text.split()]) if text else 0 
            for text in texts
        ],
        'exclamation_count': [text.count('!') for text in original_texts],
        'question_count': [text.count('?') for text in original_texts],
        'capital_ratio': [
            sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
            for text in original_texts
        ]
    }
    return pd.DataFrame(features)
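
# Worked example (illustrative): extract_features(["good food"], ["Good food!"])
# yields one row with review_length=10, word_count=2, avg_word_length=4.0,
# exclamation_count=1, question_count=0, capital_ratio=0.1.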

# Prediction function
def predict_sentiment(review_text):
    """Predict sentiment for a review."""
    if not review_text or not review_text.strip():
        return "⚠️ Please enter a review!", "", "", "", ""
    
    try:
        # Preprocess
        cleaned = preprocessor.clean_text(review_text)
        
        # Vectorize
        vectorized = vectorizer.transform([cleaned]).toarray()
        
        # Extract additional features
        add_features = extract_features([cleaned], [review_text])
        
        # Combine features
        X_new = np.concatenate([vectorized, add_features.values], axis=1)
        
        # Predict
        prediction = model.predict(X_new)[0]
        
        # Get probabilities if available
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(X_new)[0]
            confidence = max(proba)
            prob_neg = proba[0]
            prob_pos = proba[1]
        else:
            confidence = None
            prob_neg = None
            prob_pos = None
        
        # Format output
        sentiment = "βœ… Positive 😊" if prediction == 1 else "❌ Negative 😞"
        conf_str = f"{confidence:.2%}" if confidence is not None else "N/A"
        neg_str = f"{prob_neg:.2%}" if prob_neg is not None else "N/A"
        pos_str = f"{prob_pos:.2%}" if prob_pos is not None else "N/A"
        
        return sentiment, conf_str, neg_str, pos_str, cleaned
    
    except Exception as e:
        return f"❌ Error: {str(e)}", "", "", "", ""
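
# Startup smoke test (illustrative addition): run one sample through the full
# pipeline so feature-shape or pickle problems surface in the logs before the
# UI launches.
_smoke_sentiment, _smoke_conf, *_rest = predict_sentiment("The pasta was delicious!")
print(f"Smoke test prediction: {_smoke_sentiment} (confidence: {_smoke_conf})")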

# Create Gradio interface
print("Creating Gradio interface...")

with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Restaurant Review Sentiment Analyzer"
) as demo:
    
    gr.Markdown("""
    # 🍽️ Restaurant Review Sentiment Analyzer
    ### AI-Powered Sentiment Analysis with Machine Learning
    
    Enter a restaurant review to analyze its sentiment in real-time!
    
    **Model:** Advanced ML Classification  
    **Accuracy:** 85%+  
    **Features:** TF-IDF + Statistical Text Analysis
    """)
    
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### πŸ“ Enter Your Review")
            input_text = gr.Textbox(
                label="Restaurant Review",
                placeholder="e.g., The food was amazing and the service was excellent!",
                lines=5
            )
            
            with gr.Row():
                submit_btn = gr.Button("πŸ” Analyze Sentiment", variant="primary", size="lg")
                clear_btn = gr.ClearButton([input_text], value="πŸ—‘οΈ Clear", size="lg")
        
        with gr.Column(scale=2):
            gr.Markdown("### πŸ“Š Analysis Results")
            sentiment_output = gr.Textbox(label="🎯 Predicted Sentiment", interactive=False)
            confidence_output = gr.Textbox(label="πŸ“ˆ Confidence Score", interactive=False)
            
            with gr.Row():
                neg_prob = gr.Textbox(label="😞 Negative Probability", interactive=False)
                pos_prob = gr.Textbox(label="😊 Positive Probability", interactive=False)
    
    with gr.Accordion("πŸ” Preprocessing Details", open=False):
        cleaned_output = gr.Textbox(
            label="Cleaned Review Text (After Preprocessing)",
            interactive=False,
            lines=3
        )
        gr.Markdown("""
        **Preprocessing Steps:**
        1. Convert to lowercase
        2. Remove URLs, emails, HTML tags
        3. Remove special characters
        4. Remove stopwords (keep negations)
        5. Apply lemmatization
        6. Extract statistical features
        """)
    
    gr.Markdown("---")
    gr.Markdown("### πŸ’‘ Try These Example Reviews")
    
    gr.Examples(
        examples=[
            ["The food was absolutely amazing! Best restaurant I've ever been to!"],
            ["Terrible service and the food was cold. Never coming back."],
            ["Outstanding! The staff was friendly and attentive."],
            ["Worst meal ever. Complete waste of money."],
            ["Good food but portions were small. Reasonable prices."],
            ["Fantastic! Every dish was cooked to perfection!"],
        ],
        inputs=input_text,
        label="Click to try"
    )
    
    gr.Markdown("""
    ---
    ### πŸ“š About This Model
    
    **Machine Learning Pipeline:**
    - **Preprocessing:** Lemmatization, stopword removal, text normalization
    - **Features:** TF-IDF (1500 features, bigrams) + 6 statistical features
    - **Algorithm:** Best-performing model among Random Forest, SVM, and Gradient Boosting
    - **Accuracy:** 85%+ on test data
    - **Metrics:** High precision, recall, and F1-score
    
    **Technologies:** Python β€’ Scikit-learn β€’ NLTK β€’ Gradio β€’ Pandas β€’ NumPy
    
    **Developer:** Einstein Ellandala | Project: ML-06-BML11 | October 2025
    """)
    
    submit_btn.click(
        fn=predict_sentiment,
        inputs=input_text,
        outputs=[sentiment_output, confidence_output, neg_prob, pos_prob, cleaned_output]
    )

print("βœ… Gradio interface created")
print("πŸš€ Launching application...")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)