Spaces:

Harshb11
/

mca_comment_analyzer

Sleeping

App Files Files Community

mca_comment_analyzer / mca_comment_analyzer.py

Harshb11

Update mca_comment_analyzer.py

2d8c6ff verified 8 months ago

raw

history blame

6.18 kB

	import os
	import random
	import re
	import string
	from collections import Counter
	from datetime import datetime, timedelta

	import matplotlib.pyplot as plt
	import nltk
	import pandas as pd
	import streamlit as st
	import torch
	from nltk.corpus import stopwords
	from transformers import pipeline
	from wordcloud import WordCloud

	# ---- Config
	# Usage-stats collection is deliberately NOT toggled here:
	# ``browser.gatherUsageStats`` is not one of the options st.set_option()
	# accepts from inside a script, and passing it raises StreamlitAPIException.
	# Disable telemetry in .streamlit/config.toml instead:
	#     [browser]
	#     gatherUsageStats = false
	# NOTE(review): matplotlib is already imported by the time this line runs, so
	# the override below may come too late to influence its config/cache
	# directory -- confirm, and if so move it above the matplotlib import.
	os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib"

	st.set_page_config(page_title="MCA Demo Comment Analyzer", layout="wide")

	# ---- NLTK
	# Streamlit re-executes this script on every interaction, so only reach for
	# the network when the stopword corpus is actually missing locally.
	try:
	    STOPWORDS = set(stopwords.words('english'))
	except LookupError:
	    nltk.download('stopwords', quiet=True)
	    STOPWORDS = set(stopwords.words('english'))

	# ---- Lightweight MCA Analyzer
	class MCACommentAnalyzer:
	    """Tag, summarize and keyword-mine free-text comments.

	    Sentiment is decided by keyword rules first and falls back to a
	    DistilBERT SST-2 classifier; summaries come from a ``t5-small``
	    summarization pipeline; keywords are the alphabetic, non-stopword
	    tokens of each comment.
	    """

	    # (label, compiled pattern) pairs, evaluated in this priority order.
	    # Each pattern anchors keywords at the START of a word only, so that
	    # inflections still match ("recommended", "violations") while a keyword
	    # buried inside another word does not ("clear" inside "unclear").
	    _KEYWORD_RULES = tuple(
	        (label, re.compile(r"\b(?:" + "|".join(map(re.escape, terms)) + ")"))
	        for label, terms in (
	            ("Violation", ("violation", "violates", "illegal", "non-compliant")),
	            ("Suggestion", ("should", "recommend", "suggest", "advise", "better if")),
	            ("Positive", ("clear", "helpful", "good", "appreciate", "support")),
	            ("Negative", ("confusing", "unclear", "bad", "problem")),
	        )
	    )

	    def __init__(self):
	        """Load both pipelines on the GPU when CUDA is available, else CPU."""
	        device = 0 if torch.cuda.is_available() else -1
	        print("Using device:", "GPU" if device == 0 else "CPU")

	        # Lightweight sentiment model
	        self.sentiment_model = pipeline(
	            "sentiment-analysis",
	            model="distilbert-base-uncased-finetuned-sst-2-english",
	            device=device,
	        )

	        # Lightweight summarizer
	        self.summarizer = pipeline(
	            "summarization",
	            model="t5-small",
	            device=device,
	        )

	        self.stop_words = STOPWORDS

	    def map_sentiment(self, pred, text):
	        """Return the display label for one comment.

	        Args:
	            pred: Mapping with a ``'label'`` entry, as produced by the
	                sentiment pipeline.
	            text: The raw comment text.

	        Returns:
	            ``"Violation"``, ``"Suggestion"``, ``"Positive"`` or
	            ``"Negative"`` when a keyword rule fires (first match wins);
	            otherwise the model label mapped to ``"Positive"`` /
	            ``"Negative"``, or ``"Neutral"`` for any other label.
	        """
	        text_lower = text.lower()
	        for label, pattern in self._KEYWORD_RULES:
	            if pattern.search(text_lower):
	                return label

	        model_label = pred['label'].upper()
	        if model_label == "POSITIVE":
	            return "Positive"
	        if model_label == "NEGATIVE":
	            return "Negative"
	        return "Neutral"

	    def process_comment(self, comment):
	        """Analyze a single comment.

	        Args:
	            comment: The comment text.

	        Returns:
	            A 4-tuple ``(sentiment, summary_text, keywords, top_keywords)``
	            where ``keywords`` is the list of unique keywords in order of
	            first appearance and ``top_keywords`` is a comma-separated
	            string of the (up to) three most frequent ones.
	        """
	        # truncation=True asks the tokenizer to clip inputs that exceed the
	        # model's maximum length instead of failing on very long comments.
	        pred = self.sentiment_model(comment, truncation=True)[0]
	        sentiment = self.map_sentiment(pred, comment)

	        tokens = comment.split()
	        if len(tokens) < 10:
	            # Too short to be worth summarizing: use the normalized text.
	            summary_text = " ".join(tokens)
	        else:
	            try:
	                summary_text = self.summarizer(
	                    comment,
	                    max_length=30,
	                    min_length=5,
	                    do_sample=False,
	                    truncation=True,
	                )[0]['summary_text']
	            except Exception:
	                # Deliberate best-effort fallback: show the full comment
	                # rather than abort the whole batch. ``Exception`` (not a
	                # bare ``except``) keeps KeyboardInterrupt/SystemExit and
	                # other BaseException-based control flow propagating.
	                summary_text = comment

	        # Strip surrounding punctuation first so that "fees," or "policy."
	        # are counted as keywords instead of being rejected by isalpha().
	        words = []
	        for raw in comment.lower().split():
	            token = raw.strip(string.punctuation)
	            if token.isalpha() and token not in self.stop_words:
	                words.append(token)

	        counts = Counter(words)
	        keywords = list(counts)
	        # most_common() ranks by frequency; ties keep first-seen order.
	        top_keywords = ", ".join(word for word, _ in counts.most_common(3))
	        return sentiment, summary_text, keywords, top_keywords

	    def process_comments(self, comments_list):
	        """Analyze every comment and build the result tables.

	        Args:
	            comments_list: Iterable of comment strings.

	        Returns:
	            ``(df, keyword_freq)``. ``df`` has the columns ``Timestamp``,
	            ``Comment``, ``Summary``, ``Sentiment`` and ``Top Keywords`` and
	            is sorted by ``Timestamp``. ``keyword_freq`` has the columns
	            ``Keyword`` and ``Frequency`` (number of comments containing the
	            keyword), sorted by descending frequency.
	        """
	        comments_list = list(comments_list)
	        sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
	        start_date = datetime.now() - timedelta(days=30)

	        for comment in comments_list:
	            sentiment, summary, keywords, top_kw = self.process_comment(comment)
	            sentiments.append(sentiment)
	            summaries.append(summary)
	            all_keywords.extend(keywords)
	            top_keywords_list.append(top_kw)
	            # Demo data: each comment gets a random day within the last 30.
	            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))

	        df = pd.DataFrame({
	            "Timestamp": timestamps,
	            "Comment": comments_list,
	            "Summary": summaries,
	            "Sentiment": sentiments,
	            "Top Keywords": top_keywords_list,
	        })
	        df = df.sort_values(by='Timestamp', ascending=True)

	        keyword_freq = pd.DataFrame(
	            list(Counter(all_keywords).items()),
	            columns=['Keyword', 'Frequency'],
	        ).sort_values(by='Frequency', ascending=False)

	        return df, keyword_freq

	    def generate_wordcloud(self, keyword_freq, filename=None):
	        """Draw a word cloud of ``keyword_freq`` on a new pyplot figure.

	        Args:
	            keyword_freq: Table with ``Keyword`` and ``Frequency`` columns.
	            filename: Optional path; when given the figure is also saved
	                there.

	        Returns:
	            The ``matplotlib.pyplot`` module, whose current figure holds the
	            rendered cloud.

	        NOTE(review): an empty ``keyword_freq`` is passed straight to
	        WordCloud, which presumably rejects it -- callers should check for
	        emptiness first.
	        """
	        wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
	        wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
	        plt.figure(figsize=(10, 5))
	        plt.imshow(wc, interpolation="bilinear")
	        plt.axis("off")
	        if filename:
	            plt.savefig(filename, bbox_inches='tight')
	        return plt

	# ---- Streamlit UI
	def _clean_comments(raw_comments):
	    """Return the non-blank entries of *raw_comments*, whitespace-stripped."""
	    stripped = (str(entry).strip() for entry in raw_comments)
	    return [entry for entry in stripped if entry]


	def _comments_from_upload(uploaded):
	    """Extract comments from an uploaded CSV, XLSX or plain-text file.

	    Tabular files use the ``comment`` column when present and the first
	    column otherwise; any other file is read as UTF-8 text, one comment per
	    line. Missing cells and blank lines are dropped.
	    """
	    # Compare case-insensitively so "DATA.CSV" is not parsed as plain text.
	    name = uploaded.name.lower()
	    if name.endswith(".csv"):
	        frame = pd.read_csv(uploaded)
	    elif name.endswith(".xlsx"):
	        frame = pd.read_excel(uploaded)
	    else:
	        # "utf-8-sig" also accepts files saved with a byte-order mark.
	        return _clean_comments(uploaded.read().decode("utf-8-sig").splitlines())

	    column = frame['comment'] if 'comment' in frame.columns else frame.iloc[:, 0]
	    # dropna() first: astype(str) would otherwise turn empty cells into "nan".
	    return _clean_comments(column.dropna().astype(str))


	@st.cache_resource
	def _load_analyzer():
	    """Build the analyzer once and reuse it across reruns and sessions."""
	    return MCACommentAnalyzer()


	st.title("📊 MCA Demo Comment Analyzer")
	st.sidebar.header("Upload or Enter Comments")

	upload_file = st.sidebar.file_uploader("Upload CSV/Excel/TXT", type=["csv","xlsx","txt"])
	manual_input = st.sidebar.text_area("Or enter comments manually (one per line)")

	comments = []
	if upload_file:
	    try:
	        comments = _comments_from_upload(upload_file)
	    except Exception as e:
	        # Top-level UI boundary: report any parse failure instead of crashing.
	        st.error(f"File format not supported or corrupted. {e}")
	elif manual_input.strip():
	    comments = _clean_comments(manual_input.splitlines())

	if st.sidebar.button("Analyze"):
	    if comments:
	        analyzer = _load_analyzer()
	        df, keyword_freq = analyzer.process_comments(comments)

	        st.subheader("📌 Analysis Results")
	        st.dataframe(df, use_container_width=True)

	        st.subheader("📊 Sentiment Distribution")
	        st.bar_chart(df["Sentiment"].value_counts())

	        st.subheader("☁️ Word Cloud")
	        if keyword_freq.empty:
	            # Nothing to draw when no comment yielded a keyword.
	            st.info("No keywords found to build a word cloud.")
	        else:
	            plt_obj = analyzer.generate_wordcloud(keyword_freq)
	            st.pyplot(plt_obj)
	            # Release the figure; otherwise one more stays open per rerun.
	            plt.close()
	    else:
	        st.warning("⚠️ Provide comments manually or upload a supported file.")