import gradio as gr
import pickle
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
print("βœ… NLTK resources downloaded")

# ============================================================================
# CRITICAL: Define TextPreprocessor BEFORE loading the pickle file
# (pickle resolves the stored instance's class by name at load time)
# ============================================================================

class TextPreprocessor:
    """
    Advanced text preprocessing pipeline for sentiment analysis.
    
    Features:
    - Lemmatization for better word normalization
    - Custom stopword filtering (preserves negation words)
    - URL and email removal
    - Special character cleaning
    - Case normalization
    """
    
    def __init__(self, use_lemmatization=True, remove_stopwords=True):
        """
        Initialize the preprocessor.
        
        Parameters:
            use_lemmatization (bool): Use lemmatization instead of stemming
            remove_stopwords (bool): Remove stopwords from text
        """
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.use_lemmatization = use_lemmatization
        self.remove_stopwords = remove_stopwords
        
        # Custom stopwords excluding important sentiment words
        self.stop_words = set(stopwords.words('english'))
        
        # Drop negation words from the stopword set so they are kept in the
        # text; negations are crucial for sentiment. (The contraction forms
        # below never match in practice, since clean_text strips apostrophes
        # before stopword filtering.)
        negation_words = {
            'not', 'no', 'nor', 'neither', 'never', 'none',
            'nothing', 'nowhere', "don't", "doesn't", "didn't",
            "won't", "wouldn't", "can't", "couldn't", "shouldn't",
            "wasn't", "weren't", "hasn't", "haven't", "hadn't"
        }
        self.stop_words = self.stop_words - negation_words
    
    def clean_text(self, text: str) -> str:
        """
        Clean and preprocess a single text string.
        
        Parameters:
            text (str): Raw text
            
        Returns:
            str: Cleaned text
        """
        # Convert to lowercase
        text = text.lower()
        
        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
        
        # Remove email addresses
        text = re.sub(r'\S+@\S+', ' ', text)
        
        # Remove HTML tags
        text = re.sub(r'<.*?>', ' ', text)
        
        # Remove special characters but keep spaces
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        
        # Remove extra whitespaces
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Tokenize
        words = text.split()
        
        # Remove stopwords if enabled
        if self.remove_stopwords:
            words = [word for word in words if word not in self.stop_words]
        
        # Apply lemmatization or stemming
        if self.use_lemmatization:
            words = [self.lemmatizer.lemmatize(word, pos='v') for word in words]
            words = [self.lemmatizer.lemmatize(word, pos='n') for word in words]
        else:
            words = [self.stemmer.stem(word) for word in words]
        
        return ' '.join(words)
    
    def fit_transform(self, texts):
        """Process multiple texts."""
        return [self.clean_text(text) for text in texts]
    
    def transform(self, texts):
        """Process multiple texts (alias for fit_transform)."""
        return self.fit_transform(texts)
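
# Illustrative example of the cleaning pipeline (exact output can vary
# slightly with NLTK data versions):
#   TextPreprocessor().clean_text("The staff was NOT friendly!! Visit http://example.com")
#   -> "staff not friendly visit"   (URL stripped, stopwords dropped, 'not' kept)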

# ============================================================================
# Load models
# ============================================================================

print("Loading models...")
try:
    with open('best_model.pkl', 'rb') as f:
        model = pickle.load(f)
    print("βœ… Model loaded")
    
    with open('tfidf_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    print("βœ… Vectorizer loaded")
    
    with open('preprocessor.pkl', 'rb') as f:
        preprocessor = pickle.load(f)
    print("βœ… Preprocessor loaded")
    
except Exception as e:
    print(f"❌ Error loading models: {e}")
    raise
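
# Optional sanity check (illustrative addition, not part of the original
# pipeline): the model's expected input width should equal the TF-IDF
# vocabulary size plus the 6 statistical features appended by
# extract_features() below. Assumes scikit-learn estimators exposing
# n_features_in_ and vocabulary_.
_expected = getattr(model, 'n_features_in_', None)
if _expected is not None and hasattr(vectorizer, 'vocabulary_'):
    _produced = len(vectorizer.vocabulary_) + 6
    if _expected != _produced:
        print(f"⚠️ Feature width mismatch: model expects {_expected}, "
              f"pipeline produces {_produced}")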

# Feature extraction function
def extract_features(texts, original_texts):
    """Extract statistical features from texts."""
    features = {
        'review_length': [len(text) for text in original_texts],
        'word_count': [len(text.split()) for text in texts],
        'avg_word_length': [
            np.mean([len(word) for word in text.split()]) if text else 0 
            for text in texts
        ],
        'exclamation_count': [text.count('!') for text in original_texts],
        'question_count': [text.count('?') for text in original_texts],
        'capital_ratio': [
            sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0
            for text in original_texts
        ]
    }
    return pd.DataFrame(features)
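
# Worked example (illustrative): extract_features(["good food"], ["Good food!"])
# yields one row with review_length=10, word_count=2, avg_word_length=4.0,
# exclamation_count=1, question_count=0, capital_ratio=0.1.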

# Prediction function
def predict_sentiment(review_text):
    """Predict sentiment for a review."""
    if not review_text or not review_text.strip():
        return "⚠️ Please enter a review!", "", "", "", ""
    
    try:
        # Preprocess
        cleaned = preprocessor.clean_text(review_text)
        
        # Vectorize
        vectorized = vectorizer.transform([cleaned]).toarray()
        
        # Extract additional features
        add_features = extract_features([cleaned], [review_text])
        
        # Combine features
        X_new = np.concatenate([vectorized, add_features.values], axis=1)
        
        # Predict
        prediction = model.predict(X_new)[0]
        
        # Get probabilities if available
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(X_new)[0]
            confidence = max(proba)
            prob_neg = proba[0]
            prob_pos = proba[1]
        else:
            confidence = None
            prob_neg = None
            prob_pos = None
        
        # Format output
        sentiment = "βœ… Positive 😊" if prediction == 1 else "❌ Negative 😞"
        conf_str = f"{confidence:.2%}" if confidence is not None else "N/A"
        neg_str = f"{prob_neg:.2%}" if prob_neg is not None else "N/A"
        pos_str = f"{prob_pos:.2%}" if prob_pos is not None else "N/A"
        
        return sentiment, conf_str, neg_str, pos_str, cleaned
    
    except Exception as e:
        return f"❌ Error: {str(e)}", "", "", "", ""
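
# Startup smoke test (illustrative addition): run one sample through the full
# pipeline so feature-shape or pickle problems surface in the logs before the
# UI launches.
_smoke_sentiment, _smoke_conf, *_rest = predict_sentiment("The pasta was delicious!")
print(f"Smoke test prediction: {_smoke_sentiment} (confidence: {_smoke_conf})")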

# Create Gradio interface
print("Creating Gradio interface...")

with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Restaurant Review Sentiment Analyzer"
) as demo:
    
    gr.Markdown("""
    # 🍽️ Restaurant Review Sentiment Analyzer
    ### AI-Powered Sentiment Analysis with Machine Learning
    
    Enter a restaurant review to analyze its sentiment in real-time!
    
    **Model:** Advanced ML Classification  
    **Accuracy:** 85%+  
    **Features:** TF-IDF + Statistical Text Analysis
    """)
    
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### πŸ“ Enter Your Review")
            input_text = gr.Textbox(
                label="Restaurant Review",
                placeholder="e.g., The food was amazing and the service was excellent!",
                lines=5
            )
            
            with gr.Row():
                submit_btn = gr.Button("πŸ” Analyze Sentiment", variant="primary", size="lg")
                clear_btn = gr.ClearButton([input_text], value="πŸ—‘οΈ Clear", size="lg")
        
        with gr.Column(scale=2):
            gr.Markdown("### πŸ“Š Analysis Results")
            sentiment_output = gr.Textbox(label="🎯 Predicted Sentiment", interactive=False)
            confidence_output = gr.Textbox(label="πŸ“ˆ Confidence Score", interactive=False)
            
            with gr.Row():
                neg_prob = gr.Textbox(label="😞 Negative Probability", interactive=False)
                pos_prob = gr.Textbox(label="😊 Positive Probability", interactive=False)
    
    with gr.Accordion("πŸ” Preprocessing Details", open=False):
        cleaned_output = gr.Textbox(
            label="Cleaned Review Text (After Preprocessing)",
            interactive=False,
            lines=3
        )
        gr.Markdown("""
        **Preprocessing Steps:**
        1. Convert to lowercase
        2. Remove URLs, emails, HTML tags
        3. Remove special characters
        4. Remove stopwords (keep negations)
        5. Apply lemmatization
        6. Extract statistical features
        """)
    
    gr.Markdown("---")
    gr.Markdown("### πŸ’‘ Try These Example Reviews")
    
    gr.Examples(
        examples=[
            ["The food was absolutely amazing! Best restaurant I've ever been to!"],
            ["Terrible service and the food was cold. Never coming back."],
            ["Outstanding! The staff was friendly and attentive."],
            ["Worst meal ever. Complete waste of money."],
            ["Good food but portions were small. Reasonable prices."],
            ["Fantastic! Every dish was cooked to perfection!"],
        ],
        inputs=input_text,
        label="Click to try"
    )
    
    gr.Markdown("""
    ---
    ### πŸ“š About This Model
    
    **Machine Learning Pipeline:**
    - **Preprocessing:** Lemmatization, stopword removal, text normalization
    - **Features:** TF-IDF (1500 features, bigrams) + 6 statistical features
    - **Algorithm:** Best-performing model among Random Forest, SVM, and Gradient Boosting
    - **Accuracy:** 85%+ on test data
    - **Metrics:** High precision, recall, and F1-score
    
    **Technologies:** Python β€’ Scikit-learn β€’ NLTK β€’ Gradio β€’ Pandas β€’ NumPy
    
    **Developer:** Einstein Ellandala | Project: ML-06-BML11 | October 2025
    """)
    
    submit_btn.click(
        fn=predict_sentiment,
        inputs=input_text,
        outputs=[sentiment_output, confidence_output, neg_prob, pos_prob, cleaned_output]
    )

print("βœ… Gradio interface created")
print("πŸš€ Launching application...")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)