# Provenance: Hugging Face Space by Elliot89 — "Update app.py" (commit 2113fdf, verified)
import gradio as gr
import pickle
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import warnings
warnings.filterwarnings('ignore')
# Fetch the NLTK corpora the preprocessor depends on (no-op once cached).
print("Downloading NLTK resources...")
for _resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(_resource, quiet=True)
print("βœ… NLTK resources downloaded")
# ============================================================================
# CRITICAL: Define TextPreprocessor class BEFORE loading the pickle file
# ============================================================================
class TextPreprocessor:
    """
    Text-cleaning pipeline for sentiment analysis.

    The pipeline lowercases the input, strips URLs, e-mail addresses,
    HTML tags and non-letter characters, optionally removes English
    stopwords (while keeping negation words, which carry sentiment),
    and finally normalizes tokens via lemmatization or stemming.

    This class definition must exist before unpickling preprocessor.pkl,
    since pickle resolves the instance's class by name at load time.
    """

    def __init__(self, use_lemmatization=True, remove_stopwords=True):
        """
        Initialize the preprocessor.

        Parameters:
            use_lemmatization (bool): lemmatize tokens (True) or stem them (False)
            remove_stopwords (bool): drop English stopwords (minus negations)
        """
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.use_lemmatization = use_lemmatization
        self.remove_stopwords = remove_stopwords
        # Start from NLTK's English stopword list, then re-admit negation
        # words — dropping them would flip the meaning of many reviews.
        # NOTE(review): clean_text strips apostrophes BEFORE stopword
        # filtering, so the contraction forms below ("don't", "can't", ...)
        # can never actually match a token; only the bare forms
        # ('not', 'no', 'never', ...) are effectively preserved — confirm intent.
        kept_negations = {
            'not', 'no', 'nor', 'neither', 'never', 'none',
            'nothing', 'nowhere', "don't", "doesn't", "didn't",
            "won't", "wouldn't", "can't", "couldn't", "shouldn't",
            "wasn't", "weren't", "hasn't", "haven't", "hadn't",
        }
        self.stop_words = set(stopwords.words('english')) - kept_negations

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize a single text string.

        Parameters:
            text (str): raw text

        Returns:
            str: cleaned, space-joined tokens
        """
        lowered = text.lower()
        # Strip noise in order: URLs, e-mail addresses, HTML tags, then
        # every remaining non-letter character.
        for noise in (
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            r'\S+@\S+',
            r'<.*?>',
            r'[^a-zA-Z\s]',
        ):
            lowered = re.sub(noise, ' ', lowered)
        # Collapse runs of whitespace and tokenize on spaces.
        lowered = re.sub(r'\s+', ' ', lowered).strip()
        tokens = lowered.split()
        if self.remove_stopwords:
            tokens = [tok for tok in tokens if tok not in self.stop_words]
        if self.use_lemmatization:
            # Two passes: reduce verb forms first, then noun forms.
            tokens = [self.lemmatizer.lemmatize(tok, pos='v') for tok in tokens]
            tokens = [self.lemmatizer.lemmatize(tok, pos='n') for tok in tokens]
        else:
            tokens = [self.stemmer.stem(tok) for tok in tokens]
        return ' '.join(tokens)

    def fit_transform(self, texts):
        """Clean each string in *texts*; returns a list of cleaned strings."""
        return [self.clean_text(item) for item in texts]

    def transform(self, texts):
        """Alias of fit_transform — this pipeline learns no state."""
        return self.fit_transform(texts)
# ============================================================================
# Load model artifacts (pickled at training time)
# ============================================================================
def _load_artifact(path, label):
    """Unpickle *path*, report success, and return the object (startup only)."""
    with open(path, 'rb') as fh:
        obj = pickle.load(fh)
    print(f"βœ… {label} loaded")
    return obj

print("Loading models...")
try:
    model = _load_artifact('best_model.pkl', 'Model')
    vectorizer = _load_artifact('tfidf_vectorizer.pkl', 'Vectorizer')
    preprocessor = _load_artifact('preprocessor.pkl', 'Preprocessor')
except Exception as e:
    # Fail loudly: the app is useless without its model artifacts.
    print(f"❌ Error loading models: {e}")
    raise
# Feature extraction function
def extract_features(texts, original_texts):
    """Build the six hand-crafted statistical features fed alongside TF-IDF.

    Parameters:
        texts (list[str]): cleaned/preprocessed texts (word-based features).
        original_texts (list[str]): raw texts (length/punctuation/case features).

    Returns:
        pd.DataFrame: one row per text, columns in fixed order:
        review_length, word_count, avg_word_length, exclamation_count,
        question_count, capital_ratio.
    """
    def _mean_word_length(text):
        # BUG FIX: guard on the token list, not the raw string. A
        # whitespace-only string is truthy but splits to [], and
        # np.mean([]) returns NaN (with a RuntimeWarning), which would
        # poison the feature matrix.
        words = text.split()
        return float(np.mean([len(word) for word in words])) if words else 0

    features = {
        'review_length': [len(text) for text in original_texts],
        'word_count': [len(text.split()) for text in texts],
        'avg_word_length': [_mean_word_length(text) for text in texts],
        'exclamation_count': [text.count('!') for text in original_texts],
        'question_count': [text.count('?') for text in original_texts],
        'capital_ratio': [
            # Fraction of uppercase characters; 0 for empty strings.
            sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
            for text in original_texts
        ],
    }
    return pd.DataFrame(features)
# Prediction function
def predict_sentiment(review_text):
    """Predict sentiment for a review.

    Parameters:
        review_text (str): raw review text from the UI.

    Returns:
        tuple[str, str, str, str, str]: (sentiment label, confidence,
        negative probability, positive probability, cleaned text).
        Probability fields are "N/A" when the model has no predict_proba.
    """
    # Guard: empty or whitespace-only input short-circuits with a prompt.
    if not review_text or not review_text.strip():
        return "⚠️ Please enter a review!", "", "", "", ""
    try:
        # Clean with the same pipeline used at training time.
        cleaned = preprocessor.clean_text(review_text)
        # TF-IDF vectorize the cleaned text (dense row for concatenation).
        vectorized = vectorizer.transform([cleaned]).toarray()
        # Statistical features read the ORIGINAL text for punctuation/case.
        add_features = extract_features([cleaned], [review_text])
        # Feature matrix = [TF-IDF | statistical], matching training layout.
        X_new = np.concatenate([vectorized, add_features.values], axis=1)
        prediction = model.predict(X_new)[0]
        # Probabilities only when the model supports them (e.g. not raw SVM).
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(X_new)[0]
            confidence = max(proba)
            prob_neg = proba[0]
            prob_pos = proba[1]
        else:
            confidence = None
            prob_neg = None
            prob_pos = None
        sentiment = "βœ… Positive 😊" if prediction == 1 else "❌ Negative 😞"
        # BUG FIX: compare against None, not truthiness — a probability of
        # exactly 0.0 is falsy and was wrongly rendered as "N/A".
        conf_str = f"{confidence:.2%}" if confidence is not None else "N/A"
        neg_str = f"{prob_neg:.2%}" if prob_neg is not None else "N/A"
        pos_str = f"{prob_pos:.2%}" if prob_pos is not None else "N/A"
        return sentiment, conf_str, neg_str, pos_str, cleaned
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return f"❌ Error: {str(e)}", "", "", "", ""
# Create Gradio interface
# Layout: intro header, input column + results column, a collapsible
# preprocessing panel, example reviews, and an about footer.
print("Creating Gradio interface...")
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Restaurant Review Sentiment Analyzer"
) as demo:
    # Page header / intro copy.
    gr.Markdown("""
# 🍽️ Restaurant Review Sentiment Analyzer
### AI-Powered Sentiment Analysis with Machine Learning
Enter a restaurant review to analyze its sentiment in real-time!
**Model:** Advanced ML Classification
**Accuracy:** 85%+
**Features:** TF-IDF + Statistical Text Analysis
""")
    with gr.Row():
        # Left column: free-text input plus the analyze/clear buttons.
        with gr.Column(scale=2):
            gr.Markdown("### πŸ“ Enter Your Review")
            input_text = gr.Textbox(
                label="Restaurant Review",
                placeholder="e.g., The food was amazing and the service was excellent!",
                lines=5
            )
            with gr.Row():
                submit_btn = gr.Button("πŸ” Analyze Sentiment", variant="primary", size="lg")
                clear_btn = gr.ClearButton([input_text], value="πŸ—‘οΈ Clear", size="lg")
        # Right column: read-only prediction outputs.
        with gr.Column(scale=2):
            gr.Markdown("### πŸ“Š Analysis Results")
            sentiment_output = gr.Textbox(label="🎯 Predicted Sentiment", interactive=False)
            confidence_output = gr.Textbox(label="πŸ“ˆ Confidence Score", interactive=False)
            with gr.Row():
                neg_prob = gr.Textbox(label="😞 Negative Probability", interactive=False)
                pos_prob = gr.Textbox(label="😊 Positive Probability", interactive=False)
    # Collapsed-by-default panel that exposes the cleaned text for transparency.
    with gr.Accordion("πŸ” Preprocessing Details", open=False):
        cleaned_output = gr.Textbox(
            label="Cleaned Review Text (After Preprocessing)",
            interactive=False,
            lines=3
        )
        gr.Markdown("""
**Preprocessing Steps:**
1. Convert to lowercase
2. Remove URLs, emails, HTML tags
3. Remove special characters
4. Remove stopwords (keep negations)
5. Apply lemmatization
6. Extract statistical features
""")
    gr.Markdown("---")
    gr.Markdown("### πŸ’‘ Try These Example Reviews")
    # Clickable example reviews that populate the input textbox.
    gr.Examples(
        examples=[
            ["The food was absolutely amazing! Best restaurant I've ever been to!"],
            ["Terrible service and the food was cold. Never coming back."],
            ["Outstanding! The staff was friendly and attentive."],
            ["Worst meal ever. Complete waste of money."],
            ["Good food but portions were small. Reasonable prices."],
            ["Fantastic! Every dish was cooked to perfection!"],
        ],
        inputs=input_text,
        label="Click to try"
    )
    # About / footer section.
    gr.Markdown("""
---
### πŸ“š About This Model
**Machine Learning Pipeline:**
- **Preprocessing:** Lemmatization, stopword removal, text normalization
- **Features:** TF-IDF (1500 features, bigrams) + 6 statistical features
- **Algorithm:** Ensemble machine learning (Random Forest / SVM / Gradient Boosting)
- **Accuracy:** 85%+ on test data
- **Metrics:** High precision, recall, and F1-score
**Technologies:** Python β€’ Scikit-learn β€’ NLTK β€’ Gradio β€’ Pandas β€’ NumPy
**Developer:** Einstein Ellandala | Project: ML-06-BML11 | October 2025
""")
    # Wire the analyze button to the prediction function; outputs map
    # 1:1 onto the five return values of predict_sentiment.
    submit_btn.click(
        fn=predict_sentiment,
        inputs=input_text,
        outputs=[sentiment_output, confidence_output, neg_prob, pos_prob, cleaned_output]
    )
print("βœ… Gradio interface created")
print("πŸš€ Launching application...")
# Bind on all interfaces, port 7860 — presumably for a Hugging Face
# Space / container deployment; confirm before running elsewhere.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)