"""Streamlit demo that tags, summarizes, and visualizes free-text comments."""

import os

# MPLCONFIGDIR has to be in the environment before matplotlib (or anything
# that imports it) is loaded; assigning it after the import is too late.
os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib"

import logging  # noqa: E402
import random  # noqa: E402
import re  # noqa: E402
from collections import Counter  # noqa: E402
from datetime import datetime, timedelta  # noqa: E402

import matplotlib.pyplot as plt  # noqa: E402
import nltk  # noqa: E402
import pandas as pd  # noqa: E402
import streamlit as st  # noqa: E402
import torch  # noqa: E402
from nltk.corpus import stopwords  # noqa: E402
from transformers import pipeline  # noqa: E402
from wordcloud import WordCloud  # noqa: E402

logger = logging.getLogger(__name__)

# ---- Config
# NOTE(review): some Streamlit versions only allow a limited set of options to
# be changed from inside a script -- confirm this one is accepted at runtime.
st.set_option('browser.gatherUsageStats', False)
st.set_page_config(page_title="MCA Demo Comment Analyzer", layout="wide")

# ---- NLTK
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))


# ---- Lightweight MCA Analyzer
class MCACommentAnalyzer:
    """Classify, summarize, and extract keywords from free-text comments.

    Sentiment comes from a DistilBERT SST-2 pipeline, overridden by a small
    set of keyword rules; summaries come from a ``t5-small`` pipeline.
    """

    # Keyword rules, checked in this order; the first rule that matches wins
    # and overrides the model's own label.
    _KEYWORD_RULES = (
        ("Violation", ("violation", "violates", "illegal", "non-compliant")),
        ("Suggestion", ("should", "recommend", "suggest", "advise", "better if")),
        ("Positive", ("clear", "helpful", "good", "appreciate", "support")),
        ("Negative", ("confusing", "unclear", "bad", "problem")),
    )

    # Keywords are anchored at the start of a word only: inflections such as
    # "violations" or "recommended" still match, but "unclear" no longer
    # matches the positive keyword "clear".
    _KEYWORD_PATTERNS = tuple(
        (label, re.compile(r"\b(?:" + "|".join(re.escape(k) for k in kws) + ")"))
        for label, kws in _KEYWORD_RULES
    )

    def __init__(self):
        """Load both pipelines, on GPU 0 when CUDA is available, else on CPU."""
        device = 0 if torch.cuda.is_available() else -1
        print("Using device:", "GPU" if device == 0 else "CPU")

        # Lightweight sentiment model
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=device,
        )

        # Lightweight summarizer
        self.summarizer = pipeline(
            "summarization",
            model="t5-small",
            device=device,
        )

        self.stop_words = STOPWORDS

    def map_sentiment(self, pred, text):
        """Return the display label for one comment.

        Args:
            pred: One prediction dict from the sentiment pipeline; only its
                ``'label'`` entry is read.
            text: The original comment text.

        Returns:
            ``"Violation"``, ``"Suggestion"``, ``"Positive"`` or
            ``"Negative"`` when a keyword rule matches; otherwise the model
            label mapped to ``"Positive"``, ``"Negative"`` or ``"Neutral"``.
        """
        lowered = text.lower()
        for label, pattern in self._KEYWORD_PATTERNS:
            if pattern.search(lowered):
                return label

        model_label = pred['label'].upper()
        if model_label == "POSITIVE":
            return "Positive"
        if model_label == "NEGATIVE":
            return "Negative"
        return "Neutral"

    def process_comment(self, comment):
        """Analyze a single comment.

        Args:
            comment: The comment text.

        Returns:
            A tuple ``(sentiment, summary_text, keywords, top_keywords)``
            where ``keywords`` is the list of unique alphabetic, non-stopword
            tokens in order of first appearance and ``top_keywords`` is a
            comma-separated string of the three most frequent of them.
        """
        # Truncate so that comments longer than the model's input limit are
        # clipped instead of failing inside the model.
        pred = self.sentiment_model(comment, truncation=True)[0]
        sentiment = self.map_sentiment(pred, comment)

        tokens = comment.split()
        if len(tokens) < 10:
            # Too short to summarize: use the whitespace-normalized comment.
            summary_text = " ".join(tokens)
        else:
            try:
                summary_text = self.summarizer(
                    comment,
                    max_length=30,
                    min_length=5,
                    do_sample=False,
                    truncation=True,
                )[0]['summary_text']
            except Exception:
                # Best effort: a summarizer failure must not abort the whole
                # batch, so fall back to the full comment text.
                logger.warning(
                    "Summarization failed; using the original comment.",
                    exc_info=True,
                )
                summary_text = comment

        words = [
            w for w in comment.lower().split()
            if w.isalpha() and w not in self.stop_words
        ]
        counts = Counter(words)
        keywords = list(counts)
        # Ties keep first-appearance order, so this stays deterministic.
        top_keywords = ", ".join(word for word, _ in counts.most_common(3))

        return sentiment, summary_text, keywords, top_keywords

    def process_comments(self, comments_list):
        """Analyze a batch of comments.

        Args:
            comments_list: Sequence of comment strings.

        Returns:
            A tuple ``(df, keyword_freq)``. ``df`` has the columns
            ``Timestamp``, ``Comment``, ``Summary``, ``Sentiment`` and
            ``Top Keywords`` and is sorted by ``Timestamp``. ``keyword_freq``
            has the columns ``Keyword`` and ``Frequency`` (the number of
            comments containing the keyword), sorted by descending frequency.
        """
        sentiments, summaries, all_keywords = [], [], []
        top_keywords_list, timestamps = [], []
        start_date = datetime.now() - timedelta(days=30)

        for comment in comments_list:
            sentiment, summary, keywords, top_kw = self.process_comment(comment)
            sentiments.append(sentiment)
            summaries.append(summary)
            all_keywords.extend(keywords)
            top_keywords_list.append(top_kw)
            # Demo data: timestamps are random days within the last 30 days,
            # not taken from the input.
            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))

        df = pd.DataFrame({
            "Timestamp": timestamps,
            "Comment": comments_list,
            "Summary": summaries,
            "Sentiment": sentiments,
            "Top Keywords": top_keywords_list,
        })
        df.sort_values(by='Timestamp', inplace=True, ascending=True)

        keyword_freq = pd.DataFrame(
            list(Counter(all_keywords).items()),
            columns=['Keyword', 'Frequency'],
        ).sort_values(by='Frequency', ascending=False)

        return df, keyword_freq

    def generate_wordcloud(self, keyword_freq, filename=None):
        """Draw a word cloud of the keyword frequencies on a new figure.

        Args:
            keyword_freq: DataFrame with ``Keyword`` and ``Frequency`` columns.
            filename: Optional path; when given the figure is also saved there.

        Returns:
            The ``matplotlib.pyplot`` module. The new figure is the current
            one, so callers can obtain it with ``gcf()``.

        Raises:
            ValueError: If ``keyword_freq`` has no rows.
        """
        wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
        if not wc_dict:
            raise ValueError("Cannot build a word cloud without any keywords.")

        wc = WordCloud(
            width=800, height=400, background_color="white"
        ).generate_from_frequencies(wc_dict)

        fig, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(wc, interpolation="bilinear")
        ax.axis("off")
        if filename:
            fig.savefig(filename, bbox_inches='tight')
        return plt


# ---- Input helpers
def _clean_comments(raw_comments):
    """Strip surrounding whitespace and drop blank entries."""
    stripped = (str(item).strip() for item in raw_comments)
    return [item for item in stripped if item]


def _comments_from_table(table):
    """Return the 'comment' column (or the first column) as strings, without NaNs."""
    column = table['comment'] if 'comment' in table.columns else table.iloc[:, 0]
    # Drop missing cells first so they do not become the literal text "nan".
    return column.dropna().astype(str).tolist()


def _load_uploaded_comments(uploaded):
    """Read raw comments from an uploaded CSV, XLSX, or plain-text file."""
    name = uploaded.name.lower()  # accept upper-case extensions such as ".CSV"
    if name.endswith(".csv"):
        return _comments_from_table(pd.read_csv(uploaded))
    if name.endswith(".xlsx"):
        return _comments_from_table(pd.read_excel(uploaded))
    # "utf-8-sig" also accepts plain UTF-8 and strips a leading BOM if present.
    return uploaded.read().decode("utf-8-sig").splitlines()


# ---- Streamlit UI
st.title("📊 MCA Demo Comment Analyzer")
st.sidebar.header("Upload or Enter Comments")

upload_file = st.sidebar.file_uploader("Upload CSV/Excel/TXT", type=["csv", "xlsx", "txt"])
manual_input = st.sidebar.text_area("Or enter comments manually (one per line)")

comments = []
if upload_file:
    try:
        comments = _clean_comments(_load_uploaded_comments(upload_file))
    except Exception as e:
        st.error(f"File format not supported or corrupted. {e}")
elif manual_input.strip():
    comments = _clean_comments(manual_input.splitlines())

if st.sidebar.button("Analyze"):
    if comments:
        # NOTE(review): both models are loaded again on every click; consider
        # caching the analyzer if the installed Streamlit version supports it.
        analyzer = MCACommentAnalyzer()
        df, keyword_freq = analyzer.process_comments(comments)

        st.subheader("📌 Analysis Results")
        st.dataframe(df, use_container_width=True)

        st.subheader("📊 Sentiment Distribution")
        st.bar_chart(df["Sentiment"].value_counts())

        st.subheader("☁️ Word Cloud")
        if keyword_freq.empty:
            st.info("No keywords were found, so there is no word cloud to show.")
        else:
            plt_obj = analyzer.generate_wordcloud(keyword_freq)
            figure = plt_obj.gcf()
            st.pyplot(figure)
            # Close the figure so reruns do not accumulate open figures.
            plt_obj.close(figure)
    else:
        st.warning("⚠️ Provide comments manually or upload a supported file.")