"""Restaurant review sentiment analyzer served through a Gradio UI.

Loads a pickled scikit-learn-style model, a TF-IDF vectorizer, and a
text preprocessor from disk, then exposes a single-review sentiment
prediction interface (positive / negative with class probabilities).
"""

import pickle
import re
import warnings

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

warnings.filterwarnings('ignore')

# Download NLTK data (idempotent; quiet to keep startup logs readable).
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
print("✅ NLTK resources downloaded")


# ============================================================================
# CRITICAL: Define TextPreprocessor class BEFORE loading the pickle file.
# pickle resolves classes by module + name at load time, so the unpickled
# preprocessor re-binds to THIS definition; its method bodies must therefore
# stay semantically identical to the ones used at training time.
# ============================================================================
class TextPreprocessor:
    """
    Advanced text preprocessing pipeline for sentiment analysis.

    Features:
    - Lemmatization for better word normalization
    - Custom stopword filtering (preserves negation words)
    - URL and email removal
    - Special character cleaning
    - Case normalization
    """

    def __init__(self, use_lemmatization=True, remove_stopwords=True):
        """
        Initialize the preprocessor.

        Parameters:
            use_lemmatization (bool): Use lemmatization instead of stemming.
            remove_stopwords (bool): Remove stopwords from text.
        """
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.use_lemmatization = use_lemmatization
        self.remove_stopwords = remove_stopwords
        # Custom stopwords excluding important sentiment words.
        self.stop_words = set(stopwords.words('english'))
        # Remove negation words as they're crucial for sentiment.
        # NOTE(review): contracted forms ("don't") never survive clean_text's
        # punctuation stripping, so only the plain forms matter in practice.
        negation_words = {
            'not', 'no', 'nor', 'neither', 'never', 'none', 'nothing',
            'nowhere', "don't", "doesn't", "didn't", "won't", "wouldn't",
            "can't", "couldn't", "shouldn't", "wasn't", "weren't",
            "hasn't", "haven't", "hadn't"
        }
        self.stop_words = self.stop_words - negation_words

    def clean_text(self, text: str) -> str:
        """
        Clean and preprocess a single text string.

        Parameters:
            text (str): Raw text.

        Returns:
            str: Cleaned text.
        """
        # Convert to lowercase
        text = text.lower()
        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', text)
        # Remove email addresses
        text = re.sub(r'\S+@\S+', ' ', text)
        # Remove HTML tags
        text = re.sub(r'<.*?>', ' ', text)
        # Remove special characters but keep spaces
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        # Remove extra whitespaces
        text = re.sub(r'\s+', ' ', text).strip()
        # Tokenize
        words = text.split()
        # Remove stopwords if enabled
        if self.remove_stopwords:
            words = [word for word in words if word not in self.stop_words]
        # Apply lemmatization or stemming. Lemmatizing verbs first, then
        # nouns, collapses both inflection families ("running" -> "run").
        if self.use_lemmatization:
            words = [self.lemmatizer.lemmatize(word, pos='v') for word in words]
            words = [self.lemmatizer.lemmatize(word, pos='n') for word in words]
        else:
            words = [self.stemmer.stem(word) for word in words]
        return ' '.join(words)

    def fit_transform(self, texts):
        """Process multiple texts."""
        return [self.clean_text(text) for text in texts]

    def transform(self, texts):
        """Process multiple texts (alias for fit_transform)."""
        return self.fit_transform(texts)


# ============================================================================
# Load models
# ============================================================================
print("Loading models...")
try:
    # SECURITY: pickle.load executes arbitrary code from the file. These are
    # trusted local training artifacts — never load pickles from untrusted
    # sources.
    with open('best_model.pkl', 'rb') as f:
        model = pickle.load(f)
    print("✅ Model loaded")
    with open('tfidf_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    print("✅ Vectorizer loaded")
    with open('preprocessor.pkl', 'rb') as f:
        preprocessor = pickle.load(f)
    print("✅ Preprocessor loaded")
except Exception as e:
    # Fail fast: the app is useless without its artifacts.
    print(f"❌ Error loading models: {e}")
    raise


def extract_features(texts, original_texts):
    """Extract statistical features from texts.

    Parameters:
        texts: cleaned/preprocessed texts.
        original_texts: the raw texts (for surface features like punctuation).

    Returns:
        pd.DataFrame: one row per text, 6 columns. Column order must match
        the feature layout used at training time.
    """
    def _avg_word_length(text):
        # Guard: np.mean([]) is nan — a whitespace-only cleaned text would
        # otherwise poison the feature matrix (original `if text` missed this).
        words = text.split()
        return np.mean([len(word) for word in words]) if words else 0

    features = {
        'review_length': [len(text) for text in original_texts],
        'word_count': [len(text.split()) for text in texts],
        'avg_word_length': [_avg_word_length(text) for text in texts],
        'exclamation_count': [text.count('!') for text in original_texts],
        'question_count': [text.count('?') for text in original_texts],
        'capital_ratio': [
            sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
            for text in original_texts
        ]
    }
    return pd.DataFrame(features)


def predict_sentiment(review_text):
    """Predict sentiment for a review.

    Parameters:
        review_text (str): raw review text from the UI.

    Returns:
        tuple[str, str, str, str, str]: (sentiment label, confidence,
        negative probability, positive probability, cleaned text) — all
        pre-formatted for display; errors are reported in the first slot.
    """
    if not review_text or not review_text.strip():
        return "⚠️ Please enter a review!", "", "", "", ""
    try:
        # Preprocess
        cleaned = preprocessor.clean_text(review_text)
        # Vectorize
        vectorized = vectorizer.transform([cleaned]).toarray()
        # Extract additional features
        add_features = extract_features([cleaned], [review_text])
        # Combine features (TF-IDF columns first, stats after — training order)
        X_new = np.concatenate([vectorized, add_features.values], axis=1)
        # Predict
        prediction = model.predict(X_new)[0]

        # Get probabilities if available
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(X_new)[0]
            confidence = max(proba)
            # predict_proba columns follow model.classes_ order — look up the
            # positive class instead of assuming [negative, positive].
            classes = list(getattr(model, 'classes_', [0, 1]))
            pos_idx = classes.index(1) if 1 in classes else 1
            prob_pos = proba[pos_idx]
            prob_neg = proba[1 - pos_idx]
        else:
            confidence = None
            prob_neg = None
            prob_pos = None

        # Format output. Compare against None, not truthiness: a probability
        # of exactly 0.0 is a valid value, not "N/A".
        sentiment = "✅ Positive 😊" if prediction == 1 else "❌ Negative 😞"
        conf_str = f"{confidence:.2%}" if confidence is not None else "N/A"
        neg_str = f"{prob_neg:.2%}" if prob_neg is not None else "N/A"
        pos_str = f"{prob_pos:.2%}" if prob_pos is not None else "N/A"
        return sentiment, conf_str, neg_str, pos_str, cleaned
    except Exception as e:
        # Surface the error in the UI rather than crashing the app.
        return f"❌ Error: {str(e)}", "", "", "", ""


# Create Gradio interface
print("Creating Gradio interface...")
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Restaurant Review Sentiment Analyzer"
) as demo:
    gr.Markdown("""
    # 🍽️ Restaurant Review Sentiment Analyzer
    ### AI-Powered Sentiment Analysis with Machine Learning
    Enter a restaurant review to analyze its sentiment in real-time!

    **Model:** Advanced ML Classification
    **Accuracy:** 85%+
    **Features:** TF-IDF + Statistical Text Analysis
    """)

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### 📝 Enter Your Review")
            input_text = gr.Textbox(
                label="Restaurant Review",
                placeholder="e.g., The food was amazing and the service was excellent!",
                lines=5
            )
            with gr.Row():
                submit_btn = gr.Button("🔍 Analyze Sentiment", variant="primary", size="lg")
                clear_btn = gr.ClearButton([input_text], value="🗑️ Clear", size="lg")
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Analysis Results")
            sentiment_output = gr.Textbox(label="🎯 Predicted Sentiment", interactive=False)
            confidence_output = gr.Textbox(label="📈 Confidence Score", interactive=False)
            with gr.Row():
                neg_prob = gr.Textbox(label="😞 Negative Probability", interactive=False)
                pos_prob = gr.Textbox(label="😊 Positive Probability", interactive=False)

    with gr.Accordion("🔍 Preprocessing Details", open=False):
        cleaned_output = gr.Textbox(
            label="Cleaned Review Text (After Preprocessing)",
            interactive=False,
            lines=3
        )
        gr.Markdown("""
        **Preprocessing Steps:**
        1. Convert to lowercase
        2. Remove URLs, emails, HTML tags
        3. Remove special characters
        4. Remove stopwords (keep negations)
        5. Apply lemmatization
        6. Extract statistical features
        """)

    gr.Markdown("---")
    gr.Markdown("### 💡 Try These Example Reviews")
    gr.Examples(
        examples=[
            ["The food was absolutely amazing! Best restaurant I've ever been to!"],
            ["Terrible service and the food was cold. Never coming back."],
            ["Outstanding! The staff was friendly and attentive."],
            ["Worst meal ever. Complete waste of money."],
            ["Good food but portions were small. Reasonable prices."],
            ["Fantastic! Every dish was cooked to perfection!"],
        ],
        inputs=input_text,
        label="Click to try"
    )

    gr.Markdown("""
    ---
    ### 📚 About This Model

    **Machine Learning Pipeline:**
    - **Preprocessing:** Lemmatization, stopword removal, text normalization
    - **Features:** TF-IDF (1500 features, bigrams) + 6 statistical features
    - **Algorithm:** Ensemble machine learning (Random Forest / SVM / Gradient Boosting)
    - **Accuracy:** 85%+ on test data
    - **Metrics:** High precision, recall, and F1-score

    **Technologies:** Python • Scikit-learn • NLTK • Gradio • Pandas • NumPy

    **Developer:** Einstein Ellandala | Project: ML-06-BML11 | October 2025
    """)

    submit_btn.click(
        fn=predict_sentiment,
        inputs=input_text,
        outputs=[sentiment_output, confidence_output, neg_prob, pos_prob, cleaned_output]
    )

print("✅ Gradio interface created")
print("🚀 Launching application...")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)