Elliot89 committed on
Commit
2113fdf
·
verified ·
1 Parent(s): 9be220c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -40
app.py CHANGED
@@ -4,7 +4,7 @@ import numpy as np
4
  import pandas as pd
5
  import nltk
6
  from nltk.corpus import stopwords
7
- from nltk.stem import WordNetLemmatizer
8
  import re
9
  import warnings
10
  warnings.filterwarnings('ignore')
@@ -17,7 +17,103 @@ nltk.download('omw-1.4', quiet=True)
17
  nltk.download('punkt', quiet=True)
18
  print("✅ NLTK resources downloaded")
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # Load models
 
 
21
  print("Loading models...")
22
  try:
23
  with open('best_model.pkl', 'rb') as f:
@@ -104,10 +200,7 @@ print("Creating Gradio interface...")
104
 
105
  with gr.Blocks(
106
  theme=gr.themes.Soft(),
107
- title="Restaurant Review Sentiment Analyzer",
108
- css="""
109
- .gradio-container {font-family: 'Arial', sans-serif;}
110
- """
111
  ) as demo:
112
 
113
  gr.Markdown("""
@@ -116,9 +209,9 @@ with gr.Blocks(
116
 
117
  Enter a restaurant review to analyze its sentiment in real-time!
118
 
119
- **Model:** Random Forest Classifier
120
  **Accuracy:** 85%+
121
- **Features:** TF-IDF + Statistical Text Features
122
  """)
123
 
124
  with gr.Row():
@@ -127,8 +220,7 @@ with gr.Blocks(
127
  input_text = gr.Textbox(
128
  label="Restaurant Review",
129
  placeholder="e.g., The food was amazing and the service was excellent!",
130
- lines=6,
131
- max_lines=10
132
  )
133
 
134
  with gr.Row():
@@ -151,12 +243,13 @@ with gr.Blocks(
151
  lines=3
152
  )
153
  gr.Markdown("""
154
- **Preprocessing Steps Applied:**
155
  1. Convert to lowercase
156
- 2. Remove special characters and numbers
157
- 3. Remove stopwords (preserving negations)
158
- 4. Apply lemmatization
159
- 5. Extract statistical features
 
160
  """)
161
 
162
  gr.Markdown("---")
@@ -164,17 +257,15 @@ with gr.Blocks(
164
 
165
  gr.Examples(
166
  examples=[
167
- ["The food was absolutely amazing! Best restaurant I've ever been to! The service was impeccable."],
168
- ["Terrible service and the food was cold. The waiter was rude. Never coming back!"],
169
- ["Outstanding experience from start to finish! Every dish was cooked to perfection. Highly recommended!"],
170
- ["Worst meal I've ever had. Complete waste of money. Very disappointing experience."],
171
- ["Good food but the portions were quite small. Reasonable prices. Service was okay."],
172
- ["Fantastic! The ambiance was perfect and the food was delicious. Will definitely return!"],
173
- ["Not impressed at all. The quality has really gone downhill. Won't be going back."],
174
- ["Absolutely loved everything! Great variety and excellent presentation. Five stars!"]
175
  ],
176
  inputs=input_text,
177
- label="Click any example to try it"
178
  )
179
 
180
  gr.Markdown("""
@@ -182,21 +273,17 @@ with gr.Blocks(
182
  ### 📚 About This Model
183
 
184
  **Machine Learning Pipeline:**
185
- - **Preprocessing:** Lemmatization, stopword removal, text cleaning
186
- - **Feature Engineering:** TF-IDF vectorization (1500 features, bigrams) + 6 statistical features
187
- - **Algorithm:** Random Forest Classifier
188
- - **Training:** 6 different models compared, best one deployed
189
- - **Evaluation:** Cross-validation, multiple metrics (Accuracy, F1, ROC-AUC)
190
-
191
- **Technologies Used:**
192
- - Python, Scikit-learn, NLTK, Gradio, Pandas, NumPy
193
 
194
- **Developer:** Einstein Ellandala | Project: ML-06-BML11
195
 
196
- 📓 **Full Project:** [View on GitHub](https://github.com/MrEinsteinE/sentiment-analysis-restaurant)
197
  """)
198
 
199
- # Connect button to prediction function
200
  submit_btn.click(
201
  fn=predict_sentiment,
202
  inputs=input_text,
@@ -206,10 +293,5 @@ with gr.Blocks(
206
  print("✅ Gradio interface created")
207
  print("🚀 Launching application...")
208
 
209
- # Launch the app
210
  if __name__ == "__main__":
211
- demo.launch(
212
- server_name="0.0.0.0",
213
- server_port=7860,
214
- show_error=True
215
- )
 
4
  import pandas as pd
5
  import nltk
6
  from nltk.corpus import stopwords
7
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
8
  import re
9
  import warnings
10
  warnings.filterwarnings('ignore')
 
17
  nltk.download('punkt', quiet=True)
18
  print("✅ NLTK resources downloaded")
19
 
20
+ # ============================================================================
21
+ # CRITICAL: Define TextPreprocessor class BEFORE loading the pickle file
22
+ # ============================================================================
23
+
24
class TextPreprocessor:
    """
    Text normalization pipeline applied to reviews before classification.

    Each text is lowercased, stripped of URLs, e-mail addresses, HTML tags
    and every non-letter character, optionally filtered against a stopword
    set, and finally lemmatized (verb pass, then noun pass) or stemmed.

    Attributes:
        stemmer: PorterStemmer used when lemmatization is disabled
        lemmatizer: WordNetLemmatizer used when lemmatization is enabled
        use_lemmatization (bool): Lemmatize instead of stemming
        remove_stopwords (bool): Drop tokens found in ``stop_words``
        stop_words (set): NLTK English stopwords minus the negation words
    """

    def __init__(self, use_lemmatization=True, remove_stopwords=True):
        """
        Build the stemmer, lemmatizer and the custom stopword set.

        Parameters:
            use_lemmatization (bool): Use lemmatization instead of stemming
            remove_stopwords (bool): Remove stopwords from text
        """
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.use_lemmatization = use_lemmatization
        self.remove_stopwords = remove_stopwords

        # Negations carry sentiment, so they are excluded from the stopword
        # set and therefore survive filtering.
        # NOTE(review): clean_text replaces apostrophes with spaces before
        # filtering, so the contraction entries below can never equal a
        # token; confirm whether their fragments (e.g. "don", "t") are in
        # NLTK's stopword list and get dropped anyway.
        kept_negations = {
            'not', 'no', 'nor', 'neither', 'never', 'none',
            'nothing', 'nowhere', "don't", "doesn't", "didn't",
            "won't", "wouldn't", "can't", "couldn't", "shouldn't",
            "wasn't", "weren't", "hasn't", "haven't", "hadn't"
        }
        self.stop_words = set(stopwords.words('english')).difference(kept_negations)

    def clean_text(self, text: str) -> str:
        """
        Normalize one raw text string.

        Parameters:
            text (str): Raw text

        Returns:
            str: Space-joined tokens after cleaning, optional stopword
                removal, and lemmatization or stemming
        """
        # Applied in order, each match replaced by a single space:
        # URLs, e-mail addresses, HTML tags, then anything that is not a
        # letter or whitespace.
        noise_patterns = (
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            r'\S+@\S+',
            r'<.*?>',
            r'[^a-zA-Z\s]',
        )
        scrubbed = text.lower()
        for pattern in noise_patterns:
            scrubbed = re.sub(pattern, ' ', scrubbed)

        # Collapse whitespace runs, then split into tokens.
        tokens = re.sub(r'\s+', ' ', scrubbed).strip().split()

        if self.remove_stopwords:
            tokens = [token for token in tokens if token not in self.stop_words]

        if self.use_lemmatization:
            lemmatize = self.lemmatizer.lemmatize
            # Verb lemma first, then noun lemma of that result, per token.
            tokens = [lemmatize(lemmatize(token, pos='v'), pos='n') for token in tokens]
        else:
            tokens = [self.stemmer.stem(token) for token in tokens]

        return ' '.join(tokens)

    def fit_transform(self, texts):
        """Clean every text in ``texts`` and return the results as a list."""
        return list(map(self.clean_text, texts))

    def transform(self, texts):
        """Clean every text in ``texts`` (delegates to fit_transform)."""
        return self.fit_transform(texts)
112
+
113
+ # ============================================================================
114
  # Load models
115
+ # ============================================================================
116
+
117
  print("Loading models...")
118
  try:
119
  with open('best_model.pkl', 'rb') as f:
 
200
 
201
  with gr.Blocks(
202
  theme=gr.themes.Soft(),
203
+ title="Restaurant Review Sentiment Analyzer"
 
 
 
204
  ) as demo:
205
 
206
  gr.Markdown("""
 
209
 
210
  Enter a restaurant review to analyze its sentiment in real-time!
211
 
212
+ **Model:** Advanced ML Classification
213
  **Accuracy:** 85%+
214
+ **Features:** TF-IDF + Statistical Text Analysis
215
  """)
216
 
217
  with gr.Row():
 
220
  input_text = gr.Textbox(
221
  label="Restaurant Review",
222
  placeholder="e.g., The food was amazing and the service was excellent!",
223
+ lines=5
 
224
  )
225
 
226
  with gr.Row():
 
243
  lines=3
244
  )
245
  gr.Markdown("""
246
+ **Preprocessing Steps:**
247
  1. Convert to lowercase
248
+ 2. Remove URLs, emails, HTML tags
249
+ 3. Remove special characters
250
+ 4. Remove stopwords (keep negations)
251
+ 5. Apply lemmatization
252
+ 6. Extract statistical features
253
  """)
254
 
255
  gr.Markdown("---")
 
257
 
258
  gr.Examples(
259
  examples=[
260
+ ["The food was absolutely amazing! Best restaurant I've ever been to!"],
261
+ ["Terrible service and the food was cold. Never coming back."],
262
+ ["Outstanding! The staff was friendly and attentive."],
263
+ ["Worst meal ever. Complete waste of money."],
264
+ ["Good food but portions were small. Reasonable prices."],
265
+ ["Fantastic! Every dish was cooked to perfection!"],
 
 
266
  ],
267
  inputs=input_text,
268
+ label="Click to try"
269
  )
270
 
271
  gr.Markdown("""
 
273
  ### 📚 About This Model
274
 
275
  **Machine Learning Pipeline:**
276
+ - **Preprocessing:** Lemmatization, stopword removal, text normalization
277
+ - **Features:** TF-IDF (1500 features, bigrams) + 6 statistical features
278
+ - **Algorithm:** Ensemble machine learning (Random Forest / SVM / Gradient Boosting)
279
+ - **Accuracy:** 85%+ on test data
280
+ - **Metrics:** High precision, recall, and F1-score
 
 
 
281
 
282
+ **Technologies:** Python Scikit-learn NLTK • Gradio • Pandas • NumPy
283
 
284
+ **Developer:** Einstein Ellandala | Project: ML-06-BML11 | October 2025
285
  """)
286
 
 
287
  submit_btn.click(
288
  fn=predict_sentiment,
289
  inputs=input_text,
 
293
  print("✅ Gradio interface created")
294
  print("🚀 Launching application...")
295
 
 
296
  if __name__ == "__main__":
297
+ demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)