Spaces:

AmeenAktharT
/

idp-system

Sleeping

App Files Files Community

AmeenAktharT committed on Apr 3

Commit

e83d5be

verified ·

1 Parent(s): f2af3a8

Update src/analyzer.py

Browse files

Files changed (1) hide show

src/analyzer.py +66 -21

src/analyzer.py CHANGED Viewed

@@ -1,38 +1,83 @@
 import spacy
 from transformers import pipeline
-# Use the 'Small' model (12MB) instead of 'Transformer' (450MB)
-nlp = spacy.load("en_core_web_sm")
-# Use a 'Distilled' summarizer (much smaller and faster)
 summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
-sentiment_task = pipeline("sentiment-analysis")
 def run_analysis(text):
-    # Truncate text to avoid memory spikes
-    clean_text = text[:1000]
-    # 1. Summary (2 pts)
-    summary_result = summarizer(clean_text, max_length=100, min_length=30)
-    summary = summary_result[0]['summary_text']
-    # 2. Entities (4 pts)
-    doc = nlp(text[:2000])
     entities = {"names": [], "dates": [], "organizations": [], "amounts": []}
     for ent in doc.ents:
-        if ent.label_ == "PERSON": entities["names"].append(ent.text)
-        elif ent.label_ in ["DATE", "TIME"]: entities["dates"].append(ent.text)
-        elif ent.label_ == "ORG": entities["organizations"].append(ent.text)
-        elif ent.label_ in ["MONEY", "PERCENT"]: entities["amounts"].append(ent.text)
-    # 3. Sentiment (4 pts)
-    sent_res = sentiment_task(clean_text[:512])[0]
-    label = sent_res['label']
-    sentiment_map = {"POSITIVE": "Positive", "NEGATIVE": "Negative"}
     return {
         "summary": summary,
         "entities": entities,
-        "sentiment": sentiment_map.get(label, "Neutral")
     }

 import spacy
 from transformers import pipeline
+# Load the small English spaCy model ("en_core_web_sm") used for entity extraction.
+try:
+    nlp = spacy.load("en_core_web_sm")
+except OSError:
+    # Fallback if the model isn't linked correctly in some environments:
+    # download it once via spacy.cli, then retry the load.
+    import spacy.cli
+    spacy.cli.download("en_core_web_sm")
+    nlp = spacy.load("en_core_web_sm")
+# Initialize distilled AI pipelines (Small footprint, high speed).
+# NOTE: both pipelines are created at module level, i.e. when this module is imported.
 summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+sentiment_task = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
 def run_analysis(text):
+    """
+    Run the three-stage analysis on ``text`` and return the combined result.
+
+    Stages:
+    1. Summarization of the first 1500 characters.
+    2. Entity Extraction (NER) over the first 3000 characters.
+    3. Sentiment Classification of the first 512 characters.
+
+    Returns a dict with the keys "summary" (str), "entities" (dict holding
+    the lists "names", "dates", "organizations" and "amounts") and
+    "sentiment" ("Positive", "Negative" or "Neutral").
+
+    If ``text`` is empty or shorter than 10 characters after stripping, a
+    placeholder result is returned and no model is called. Errors raised by
+    the summarizer or the sentiment pipeline are caught and replaced with a
+    fallback value; the NER stage is not wrapped in a try/except.
+    """
+    if not text or len(text.strip()) < 10:
+        return {
+            "summary": "Document contains insufficient text for analysis.",
+            "entities": {"names": [], "dates": [], "organizations": [], "amounts": []},
+            "sentiment": "Neutral"
+        }
+    # Truncate to the first 1500 characters for the summarizer (to avoid memory spikes);
+    # no other cleaning is applied to the text here.
+    clean_text = text[:1500]
+    # --- 1. AI Summarization ---
+    try:
+        # max_length adjusted to give a meaningful but concise summary
+        summary_result = summarizer(clean_text, max_length=130, min_length=30, do_sample=False)
+        summary = summary_result[0]['summary_text']
+    except Exception:
+        # Any summarizer error is replaced by this placeholder; the error itself is not logged.
+        summary = "Summary generation failed due to text complexity."
+    # --- 2. Entity Extraction (NER) ---
+    # Using a slightly larger window for entities than the summary
+    # (3000 characters of the original text, not of clean_text).
+    doc = nlp(text[:3000])
     entities = {"names": [], "dates": [], "organizations": [], "amounts": []}
     for ent in doc.ents:
+        # Strip surrounding whitespace and skip entities shorter than 2 characters
+        text_val = ent.text.strip()
+        if len(text_val) < 2: continue
+        if ent.label_ == "PERSON":
+            entities["names"].append(text_val)
+        elif ent.label_ in ["DATE", "TIME"]:
+            entities["dates"].append(text_val)
+        elif ent.label_ == "ORG":
+            # Filter out "AI", "PDF" or "IDP" (case-insensitive) if they get misclassified as Orgs
+            if text_val.upper() not in ["AI", "PDF", "IDP"]:
+                entities["organizations"].append(text_val)
+        elif ent.label_ in ["MONEY", "PERCENT", "QUANTITY"]:
+            entities["amounts"].append(text_val)
+    # Deduplicate entities to keep the output clean; dict.fromkeys keeps the
+    # first occurrence of each exact string and preserves first-seen order.
+    for key in entities:
+        entities[key] = list(dict.fromkeys(entities[key]))
+    # --- 3. Sentiment Analysis ---
+    try:
+        # Sentiment models usually have a 512 token limit.
+        # NOTE(review): this slice keeps 512 characters, not 512 tokens - confirm that is the intended bound.
+        sent_res = sentiment_task(clean_text[:512])[0]
+        label = sent_res['label']
+        # Map model labels to user-friendly format; any other label becomes "Neutral"
+        sentiment_map = {"POSITIVE": "Positive", "NEGATIVE": "Negative"}
+        sentiment = sentiment_map.get(label, "Neutral")
+    except Exception:
+        # Any sentiment pipeline error falls back to "Neutral"; the error itself is not logged.
+        sentiment = "Neutral"
     return {
         "summary": summary,
         "entities": entities,
+        "sentiment": sentiment
     }