Spaces:
Sleeping
Sleeping
import os
import random
import re
import string
from collections import Counter
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import streamlit as st
import torch
from nltk.corpus import stopwords
from transformers import pipeline
from wordcloud import WordCloud
# ---- Config
# NOTE(review): Streamlit only allows some options to be changed from inside a
# script; confirm 'browser.gatherUsageStats' is one of them on the deployed
# version, otherwise move this setting to .streamlit/config.toml.
st.set_option('browser.gatherUsageStats', False)
# NOTE(review): matplotlib is already imported at the top of the file, so this
# variable is probably read too late to take effect - confirm, and if so set it
# before the matplotlib import.
os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib"
st.set_page_config(page_title="MCA Demo Comment Analyzer", layout="wide")

# ---- NLTK
# Streamlit re-executes this script on every interaction, so only go to the
# network when the stopword corpus is genuinely missing locally.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    STOPWORDS = set(stopwords.words('english'))
| # ---- Lightweight MCA Analyzer | |
class MCACommentAnalyzer:
    """Tag, summarise and extract keywords from free-text comments.

    Sentiment comes from a DistilBERT SST-2 pipeline, overridden by a small set
    of keyword rules; summaries come from a ``t5-small`` summarisation
    pipeline. Both pipelines are created in ``__init__``.
    """

    # Keyword rules in priority order: the first category with a hit wins.
    _KEYWORD_RULES = (
        ("Violation", ("violation", "violates", "illegal", "non-compliant")),
        ("Suggestion", ("should", "recommend", "suggest", "advise", "better if")),
        ("Positive", ("clear", "helpful", "good", "appreciate", "support")),
        ("Negative", ("confusing", "unclear", "bad", "problem")),
    )
    # Each keyword must begin at a word boundary, so "unclear" no longer hits
    # the positive keyword "clear". No trailing boundary is required, so
    # inflected forms ("violations", "recommended") still match.
    _KEYWORD_PATTERNS = tuple(
        (label, re.compile(r"\b(?:" + "|".join(re.escape(k) for k in kws) + ")"))
        for label, kws in _KEYWORD_RULES
    )
    # Fallback mapping from the model's label to the app's category names.
    _MODEL_LABELS = {"POSITIVE": "Positive", "NEGATIVE": "Negative"}

    def __init__(self):
        """Create the sentiment and summarisation pipelines on GPU 0 or CPU."""
        device = 0 if torch.cuda.is_available() else -1
        print("Using device:", "GPU" if device == 0 else "CPU")
        # Lightweight sentiment model
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=device,
        )
        # Lightweight summarizer
        self.summarizer = pipeline(
            "summarization",
            model="t5-small",
            device=device,
        )
        self.stop_words = STOPWORDS

    def map_sentiment(self, pred, text):
        """Return the category for *text*.

        Keyword rules are tried first (Violation, Suggestion, Positive,
        Negative, in that order). If none matches, the model prediction is
        used: ``pred['label']`` of POSITIVE/NEGATIVE (any case) maps to
        "Positive"/"Negative", anything else to "Neutral".

        Args:
            pred: Mapping with a ``'label'`` string, as produced by the
                sentiment pipeline.
            text: The raw comment.

        Returns:
            One of "Violation", "Suggestion", "Positive", "Negative",
            "Neutral".
        """
        lowered = text.lower()
        for label, pattern in self._KEYWORD_PATTERNS:
            if pattern.search(lowered):
                return label
        return self._MODEL_LABELS.get(pred["label"].upper(), "Neutral")

    def process_comment(self, comment):
        """Analyse a single comment.

        Args:
            comment: The comment text.

        Returns:
            Tuple ``(sentiment, summary_text, keywords, top_keywords)`` where
            ``keywords`` is the list of distinct non-stopword alphabetic words
            in first-seen order and ``top_keywords`` is a comma-separated
            string of up to three of them, most frequent first.
        """
        # truncation=True keeps an over-length comment from raising inside the
        # pipeline and aborting the whole batch.
        pred = self.sentiment_model(comment, truncation=True)[0]
        sentiment = self.map_sentiment(pred, comment)

        tokens = comment.split()
        if len(tokens) < 10:
            # Too short to be worth summarising: use the normalised text.
            summary_text = " ".join(tokens)
        else:
            try:
                summary_text = self.summarizer(
                    comment, max_length=30, min_length=5, do_sample=False
                )[0]["summary_text"]
            except Exception:
                # Best-effort: fall back to the full comment. Catching only
                # Exception lets interrupts and other BaseException-based
                # control flow propagate.
                summary_text = comment

        words = []
        for raw in comment.lower().split():
            # Strip surrounding punctuation so "fees," still counts as "fees".
            word = raw.strip(string.punctuation)
            if word.isalpha() and word not in self.stop_words:
                words.append(word)
        counts = Counter(words)
        keywords = list(counts)
        # Ties keep first-seen order, so comments whose words are all distinct
        # yield the same first-three result as before.
        top_keywords = ", ".join(word for word, _ in counts.most_common(3))
        return sentiment, summary_text, keywords, top_keywords

    def process_comments(self, comments_list):
        """Analyse every comment and build the result tables.

        Args:
            comments_list: Iterable of comment strings.

        Returns:
            Tuple ``(df, keyword_freq)``. ``df`` has the columns Timestamp,
            Comment, Summary, Sentiment and Top Keywords, sorted by Timestamp.
            ``keyword_freq`` has the columns Keyword and Frequency, sorted by
            descending Frequency; Frequency counts the number of comments a
            keyword appears in, because each comment contributes its distinct
            keywords once.
        """
        comments_list = list(comments_list)
        sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
        # Timestamps are synthetic: a random day within the last 30 days.
        start_date = datetime.now() - timedelta(days=30)
        for comment in comments_list:
            sentiment, summary, keywords, top_kw = self.process_comment(comment)
            sentiments.append(sentiment)
            summaries.append(summary)
            all_keywords.extend(keywords)
            top_keywords_list.append(top_kw)
            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))

        df = pd.DataFrame({
            "Timestamp": timestamps,
            "Comment": comments_list,
            "Summary": summaries,
            "Sentiment": sentiments,
            "Top Keywords": top_keywords_list,
        })
        df.sort_values(by="Timestamp", inplace=True, ascending=True)

        keyword_freq = pd.DataFrame(
            list(Counter(all_keywords).items()),
            columns=["Keyword", "Frequency"],
        ).sort_values(by="Frequency", ascending=False)
        return df, keyword_freq

    def generate_wordcloud(self, keyword_freq, filename=None):
        """Draw a word cloud of the keyword frequencies on a new figure.

        Args:
            keyword_freq: DataFrame with 'Keyword' and 'Frequency' columns.
            filename: Optional path; when given, the figure is also saved
                there.

        Returns:
            The ``matplotlib.pyplot`` module, with the new figure current. The
            figure is not closed here; the caller should close it after
            rendering.

        NOTE(review): an empty ``keyword_freq`` is passed straight to
        WordCloud, which presumably rejects it - callers should check first.
        """
        wc_dict = dict(zip(keyword_freq["Keyword"], keyword_freq["Frequency"]))
        wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        if filename:
            plt.savefig(filename, bbox_inches="tight")
        return plt
| # ---- Streamlit UI | |
def _comments_from_frame(frame):
    """Return non-empty comment strings from *frame*.

    Uses the 'comment' column when present, otherwise the first column. Empty
    cells are dropped before conversion so they do not become the literal
    string "nan".
    """
    column = frame["comment"] if "comment" in frame.columns else frame.iloc[:, 0]
    return [text for text in column.dropna().astype(str) if text.strip()]


def _read_uploaded_comments(uploaded):
    """Read comments from an uploaded CSV, XLSX or plain-text file.

    The extension check is case-insensitive; anything that is not .csv/.xlsx
    is treated as UTF-8 text with one comment per non-blank line.
    """
    name = uploaded.name.lower()
    if name.endswith(".csv"):
        return _comments_from_frame(pd.read_csv(uploaded))
    if name.endswith(".xlsx"):
        return _comments_from_frame(pd.read_excel(uploaded))
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM if present.
    text = uploaded.read().decode("utf-8-sig")
    return [line.strip() for line in text.splitlines() if line.strip()]


# NOTE(review): the leading "π" in the three headings below looks like the
# first byte of a mis-decoded emoji; the original glyph cannot be recovered
# from this file, so those strings are kept as-is - confirm and restore.
st.title("π MCA Demo Comment Analyzer")
st.sidebar.header("Upload or Enter Comments")
upload_file = st.sidebar.file_uploader("Upload CSV/Excel/TXT", type=["csv", "xlsx", "txt"])
manual_input = st.sidebar.text_area("Or enter comments manually (one per line)")

# An uploaded file takes precedence over manually typed comments.
comments = []
if upload_file is not None:
    try:
        comments = _read_uploaded_comments(upload_file)
    except Exception as e:
        # Top-level UI boundary: show the problem instead of crashing the app.
        st.error(f"File format not supported or corrupted. {e}")
elif manual_input.strip():
    comments = [line.strip() for line in manual_input.splitlines() if line.strip()]

if st.sidebar.button("Analyze"):
    if comments:
        # NOTE(review): this reloads both models on every click; consider
        # caching the analyzer if the deployed Streamlit version supports it.
        analyzer = MCACommentAnalyzer()
        df, keyword_freq = analyzer.process_comments(comments)

        st.subheader("π Analysis Results")
        st.dataframe(df, use_container_width=True)

        st.subheader("π Sentiment Distribution")
        st.bar_chart(df["Sentiment"].value_counts())

        st.subheader("☁️ Word Cloud")
        if keyword_freq.empty:
            # Nothing survived stopword filtering; skip the cloud rather than
            # hand the renderer an empty frequency table.
            st.info("No keywords found to build a word cloud.")
        else:
            plt_obj = analyzer.generate_wordcloud(keyword_freq)
            fig = plt_obj.gcf()
            st.pyplot(fig)
            # Close the figure so reruns do not accumulate open figures.
            plt_obj.close(fig)
    else:
        st.warning("⚠️ Provide comments manually or upload a supported file.")