| import streamlit as st |
| import joblib |
| import nltk |
| from nltk.corpus import stopwords |
| from nltk.tokenize import word_tokenize |
| import string |
| import re |
|
|
|
|
# Make sure the NLTK data used by preprocess_text() is installed.
#
# Streamlit re-executes this script on every widget interaction, so we first
# look for each resource locally and only download the ones that are missing;
# this avoids a network round-trip (and console noise) on every rerun.
#
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# load the tokenizer used by word_tokenize() from 'punkt_tab'; older releases
# still use 'punkt'. Having both installed is harmless.
for _resource_path, _package_id in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # Best effort, as in the original script: a failed download is not
        # fatal here and will surface later when the resource is used.
        nltk.download(_package_id, quiet=True)
|
|
def preprocess_text(text):
    """Normalize a raw message into a space-separated string of tokens.

    The pipeline, in order:
      1. lowercase the text;
      2. delete every character found in ``string.punctuation``;
      3. delete all runs of digits;
      4. collapse any whitespace run into a single space (and trim the ends);
      5. tokenize with NLTK's ``word_tokenize``;
      6. drop tokens that are NLTK English stop words.

    Args:
        text: The message to clean, as a ``str``.

    Returns:
        The surviving tokens joined by single spaces. This is an empty string
        when nothing survives the filtering.
    """
    # One pass over the string removes all punctuation characters.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    cleaned = re.sub(r'\d+', '', cleaned)

    # Removing characters can leave doubled or leading/trailing spaces behind.
    cleaned = ' '.join(cleaned.split())

    english_stop_words = set(stopwords.words('english'))
    kept_tokens = filter(
        lambda token: token not in english_stop_words,
        word_tokenize(cleaned),
    )
    return ' '.join(kept_tokens)
|
|
|
|
@st.cache_resource
def _load_artifacts():
    """Load the trained classifier and its fitted vectorizer from disk.

    Streamlit re-runs the whole script on every interaction; caching the
    loaded objects as a resource means the two files are deserialized once
    per server process instead of once per rerun.

    Returns:
        A ``(model, vectorizer)`` tuple loaded from
        ``spam_detector_model.joblib`` and ``tfidf_vectorizer.joblib``.
    """
    # NOTE(review): joblib.load unpickles the file contents, which can execute
    # arbitrary code. Only ship artifacts produced by a trusted training run.
    # NOTE(review): cached resources are shared between user sessions; this
    # assumes the objects are only used for read-only prediction -- confirm.
    return (
        joblib.load('spam_detector_model.joblib'),
        joblib.load('tfidf_vectorizer.joblib'),
    )


model, vectorizer = _load_artifacts()
|
|
|
|
# Main page: title, short instructions, the input box and the analysis flow.
st.title("📧 Spam Message Detector")

st.write("""
This app detects whether a message is spam or not.
Enter your message below and click 'Analyze' to check!
""")

message = st.text_area("Enter your message:", height=100)

if st.button("Analyze"):
    # A message made only of spaces/newlines carries no content, so it is
    # treated the same as an empty box.
    if not message or not message.strip():
        st.warning("Please enter a message to analyze.")
    else:
        processed_text = preprocess_text(message)

        if not processed_text:
            # Everything was punctuation, digits or stop words. Vectorizing an
            # empty string would make the model score an all-zero vector and
            # report a confidence that says nothing about the message itself.
            st.warning(
                "The message contains no analyzable words after preprocessing. "
                "Please enter a longer message."
            )
        else:
            # The vectorizer expects an iterable of documents, hence the list.
            text_vectorized = vectorizer.transform([processed_text])

            # Both calls return one row per document; take the only row.
            prediction = model.predict(text_vectorized)[0]
            probability = model.predict_proba(text_vectorized)[0]

            st.markdown("### Analysis Result")

            # Label 1 is handled as spam; the confidence shown is the
            # probability entry at the index of the displayed verdict.
            if prediction == 1:
                st.error("🚨 This message is likely SPAM!")
                st.write(f"Confidence: {probability[1]:.2%}")
            else:
                st.success("✅ This message appears to be legitimate.")
                st.write(f"Confidence: {probability[0]:.2%}")

        # Let the user compare the raw input with what preprocessing produced.
        with st.expander("See preprocessing steps"):
            st.write("Original message:", message)
            st.write("Processed message:", processed_text)
|
|
|
|
# Sidebar: static background text about the classifier. The figures below are
# hard-coded display text; nothing in this script computes them.
st.sidebar.header("About the Model")
st.sidebar.write("""
This spam detector uses an XGBoost classifier trained on a dataset of spam and legitimate messages.

Model Performance:
- Training Accuracy: 99.7%
- Testing Accuracy: 98.9%
""")
|
|