# NOTE: "Spaces / Sleeping" below was a Hugging Face Spaces status banner
# captured when this file was scraped from the hosting page; it is not code.
| # ----------------------------- | |
| # MCACommentAnalyzerLight.py | |
| # ----------------------------- | |
| import pandas as pd | |
| from transformers import pipeline | |
| from wordcloud import WordCloud | |
| import matplotlib.pyplot as plt | |
| from collections import Counter | |
| import nltk | |
| from nltk.corpus import stopwords | |
| import random | |
| from datetime import datetime, timedelta | |
| from langdetect import detect | |
| from deep_translator import GoogleTranslator | |
# Download the NLTK stopword list once at import time.
# quiet=True suppresses the downloader's console output on every import
# (the corpus is cached after the first successful download).
nltk.download('stopwords', quiet=True)
class MCACommentAnalyzerLight:
    """Lightweight analyzer for MCA (Ministry of Corporate Affairs) draft comments.

    Pipeline per comment: translate to English if needed, classify sentiment
    (rule-based keyword overrides first, transformer model as fallback),
    summarize, and extract keyword frequencies for reporting / word clouds.
    """

    # Output labels of cardiffnlp/twitter-roberta-base-sentiment.
    # BUG FIX: that model emits LABEL_0/LABEL_1/LABEL_2, NOT "POSITIVE"/
    # "NEGATIVE", so the old comparison always fell through to "Neutral".
    # Plain POSITIVE/NEGATIVE are kept for compatibility with other models.
    _MODEL_LABELS = {
        "LABEL_0": "Negative",
        "LABEL_1": "Neutral",
        "LABEL_2": "Positive",
        "POSITIVE": "Positive",
        "NEGATIVE": "Negative",
    }

    def __init__(self):
        # Lightweight 3-way sentiment model (negative / neutral / positive).
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-roberta-base-sentiment"
        )
        # Lightweight summarizer for longer comments.
        self.summarizer = pipeline(
            "summarization",
            model="t5-small"
        )
        self.stop_words = set(stopwords.words('english'))

    # -----------------------------
    # Translate to English if needed
    # -----------------------------
    def translate_to_english(self, text):
        """Return *text* translated to English, or *text* unchanged on any failure.

        Translation is best-effort: detection/translation errors (network,
        unsupported language, empty text) must not abort the pipeline.
        """
        try:
            if detect(text) != "en":
                return GoogleTranslator(source='auto', target='en').translate(text)
            return text
        except Exception:  # narrow from bare except: don't swallow SystemExit/KeyboardInterrupt
            return text

    # -----------------------------
    # Rule-based sentiment mapping
    # -----------------------------
    def map_sentiment(self, pred, text):
        """Map a model prediction + keyword rules to a business category.

        Keyword rules take precedence, checked in order:
        Violation > Suggestion > Positive > Negative. If none match, the
        model's label (``pred['label']``) is mapped via ``_MODEL_LABELS``,
        defaulting to "Neutral" for unknown labels.
        """
        text_lower = text.lower()
        violation_keywords = ["violation", "violates", "illegal", "non-compliant", "breach", "unlawful", "risk", "penalty"]
        if any(w in text_lower for w in violation_keywords):
            return "Violation"
        suggestion_keywords = ["should", "recommend", "suggest", "advise", "better if", "could", "need to"]
        if any(w in text_lower for w in suggestion_keywords):
            return "Suggestion"
        positive_keywords = ["clear", "helpful", "good", "appreciate", "support"]
        if any(w in text_lower for w in positive_keywords):
            return "Positive"
        negative_keywords = ["confusing", "unclear", "bad", "problem", "needs clarification"]
        if any(w in text_lower for w in negative_keywords):
            return "Negative"
        return self._MODEL_LABELS.get(pred['label'].upper(), "Neutral")

    # -----------------------------
    # Process single comment
    # -----------------------------
    def process_comment(self, comment):
        """Analyze one comment.

        Returns a tuple ``(sentiment, summary_text, keywords, top_keywords)``
        where *keywords* is the comment's unique non-stopword tokens ordered
        by in-comment frequency and *top_keywords* is the top 3 joined by ", ".
        """
        translated_comment = self.translate_to_english(comment)
        pred = self.sentiment_model(translated_comment)[0]
        sentiment = self.map_sentiment(pred, translated_comment)

        words = translated_comment.split()
        if len(words) < 10:
            # Short comments are their own summary (join normalizes whitespace).
            summary_text = " ".join(words)
        else:
            try:
                summary_text = self.summarizer(
                    translated_comment,
                    max_length=30,
                    min_length=5,
                    do_sample=False
                )[0]['summary_text']
            except Exception:
                # Summarization is best-effort; fall back to the raw text.
                summary_text = translated_comment

        # Keywords: alphabetic non-stopword tokens, most frequent first.
        # BUG FIX: the old code used Counter insertion order, so "Top
        # Keywords" were just the first 3 distinct words, not the top 3.
        tokens = [w for w in translated_comment.lower().split()
                  if w.isalpha() and w not in self.stop_words]
        keywords = [w for w, _ in Counter(tokens).most_common()]
        top_keywords = ", ".join(keywords[:3])
        return sentiment, summary_text, keywords, top_keywords

    # -----------------------------
    # Process multiple comments
    # -----------------------------
    def process_comments(self, comments_list):
        """Analyze a list of comments.

        Returns ``(df, keyword_freq)``: *df* has one row per comment
        (Timestamp, Comment, Summary, Sentiment, Top Keywords) sorted by
        Timestamp; *keyword_freq* counts in how many comments each keyword
        appears, sorted by frequency.

        NOTE: Timestamps are synthetic (random within the last 30 days) —
        demo data only, since real submission times are not available here.
        """
        sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
        start_date = datetime.now() - timedelta(days=30)
        for comment in comments_list:
            sentiment, summary, keywords, top_kw = self.process_comment(comment)
            sentiments.append(sentiment)
            summaries.append(summary)
            all_keywords.extend(keywords)  # unique per comment -> doc frequency
            top_keywords_list.append(top_kw)
            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))
        df = pd.DataFrame({
            "Timestamp": timestamps,
            "Comment": comments_list,
            "Summary": summaries,
            "Sentiment": sentiments,
            "Top Keywords": top_keywords_list
        })
        df.sort_values(by='Timestamp', inplace=True, ascending=True)
        keyword_freq = pd.DataFrame(
            Counter(all_keywords).items(),
            columns=['Keyword', 'Frequency']
        ).sort_values(by='Frequency', ascending=False)
        return df, keyword_freq

    # -----------------------------
    # Generate WordCloud
    # -----------------------------
    def generate_wordcloud(self, keyword_freq, filename=None):
        """Render a word cloud from *keyword_freq*; optionally save to *filename*.

        Returns the ``matplotlib.pyplot`` module so callers (e.g. Streamlit)
        can display or further customize the figure.
        """
        wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
        if not wc_dict:
            # WordCloud raises ValueError on an empty frequency map;
            # render a placeholder instead of crashing.
            wc_dict = {"no keywords": 1}
        wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        if filename:
            plt.savefig(filename, bbox_inches='tight')
        return plt
# -----------------------------
# Quick Test (Optional)
# -----------------------------
if __name__ == "__main__":
    # Sample feedback covering every sentiment category, plus two
    # non-English comments to exercise the translation path.
    sample_comments = [
        "The draft is very clear and helpful for companies.",
        "Section 5 is confusing and needs clarification.",
        "It would be better if SMEs get some relief.",
        "I recommend including more examples for clarity.",
        "Section 12 violates the Companies Act rules.",
        "यह टिप्पणी हिंदी में है।",  # Hindi comment example
        "இந்த கருத்து தமிழில் உள்ளது."  # Tamil comment example
    ]

    analyzer = MCACommentAnalyzerLight()
    results_df, freq_table = analyzer.process_comments(sample_comments)
    print(results_df)
    analyzer.generate_wordcloud(freq_table)