idp-system / src /analyzer.py
AmeenAktharT's picture
Update src/analyzer.py
74b9517 verified
import logging

import spacy
from transformers import pipeline
def _load_spacy_model(model_name):
    """Return the spaCy pipeline *model_name*, downloading it if loading fails.

    An ``OSError`` from ``spacy.load`` is treated as "model package not
    available": the package is fetched with ``spacy.cli.download`` and the
    load is attempted a second time. A failure of that second attempt
    propagates to the caller.
    """
    try:
        return spacy.load(model_name)
    except OSError:
        # Imported lazily so the CLI machinery is only pulled in when needed.
        from spacy.cli import download

        download(model_name)
        return spacy.load(model_name)


# Shared spaCy pipeline, created once at import time.
nlp = _load_spacy_model("en_core_web_sm")
# Hugging Face checkpoints backing the two transformer pipelines below.
_SUMMARIZER_MODEL = "sshleifer/distilbart-cnn-12-6"
_SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"

# Both pipelines are built once at import time and shared module-wide.
summarizer = pipeline(task="summarization", model=_SUMMARIZER_MODEL)
sentiment_task = pipeline(task="sentiment-analysis", model=_SENTIMENT_MODEL)
_logger = logging.getLogger(__name__)

# Character budgets applied to the document before it reaches each model.
_SUMMARY_CHAR_LIMIT = 2000
_ENTITY_CHAR_LIMIT = 3000
_SENTIMENT_CHAR_LIMIT = 512

# Documents with fewer stripped characters than this are not analysed.
_MIN_TEXT_LENGTH = 10

# spaCy entity label -> key in the returned "entities" mapping.
_ENTITY_BUCKETS = {
    "PERSON": "names",
    "DATE": "dates",
    "TIME": "dates",
    "ORG": "organizations",
    "MONEY": "amounts",
    "PERCENT": "amounts",
    "QUANTITY": "amounts",
}

# Acronyms discarded (case-insensitively) when they are tagged as ORG.
_IGNORED_ORGS = frozenset({"AI", "PDF", "IDP"})

# Classifier label -> user-facing sentiment; anything else becomes "Neutral".
_SENTIMENT_LABELS = {"POSITIVE": "Positive", "NEGATIVE": "Negative"}


def _empty_entities():
    """Return a fresh entity mapping with the four result keys, all empty."""
    return {"names": [], "dates": [], "organizations": [], "amounts": []}


def run_analysis(text):
    """Summarize a document, extract its entities and classify its sentiment.

    Uses the module-level ``summarizer``, ``nlp`` and ``sentiment_task``
    objects. Surrounding whitespace is stripped *before* the character
    windows are taken, so leading blank space cannot use up the budget that
    the models are given.

    Args:
        text: Raw document text. ``None``, an empty string, or text with
            fewer than 10 characters after stripping yields a placeholder
            result without running any model.

    Returns:
        A dict with the keys:
            ``summary`` (str): generated summary, or a fixed fallback message
                if summarization or key-point splitting raised.
            ``key_points`` (list[str]): summary sentences longer than 15
                characters.
            ``entities`` (dict[str, list[str]]): de-duplicated, first-seen
                ordered values under ``names``, ``dates``, ``organizations``
                and ``amounts``.
            ``sentiment`` (str): ``"Positive"``, ``"Negative"`` or
                ``"Neutral"``.
    """
    stripped = text.strip() if text else ""
    if len(stripped) < _MIN_TEXT_LENGTH:
        return {
            "summary": "Document contains insufficient text for analysis.",
            "key_points": [],
            "entities": _empty_entities(),
            "sentiment": "Neutral",
        }

    summary_input = stripped[:_SUMMARY_CHAR_LIMIT]

    # --- 1. Summary and key points ---
    try:
        # The slice above limits characters, not tokens; truncation=True asks
        # the tokenizer to also cut the input to the model's own limit.
        summary_result = summarizer(
            summary_input,
            max_length=200,
            min_length=60,
            do_sample=False,
            truncation=True,
        )
        summary = summary_result[0]["summary_text"]
        # Each sufficiently long sentence of the summary becomes a key point.
        sentences = (sent.text.strip() for sent in nlp(summary).sents)
        key_points = [sentence for sentence in sentences if len(sentence) > 15]
    except Exception:
        # Best-effort step: keep going, but leave a trace of what went wrong.
        _logger.exception("Summarization failed; returning fallback summary")
        summary = "Summary generation failed due to text complexity."
        key_points = []

    # --- 2. Entity extraction ---
    entities = _empty_entities()
    for ent in nlp(stripped[:_ENTITY_CHAR_LIMIT]).ents:
        value = ent.text.strip()
        if len(value) < 2:
            continue
        bucket = _ENTITY_BUCKETS.get(ent.label_)
        if bucket is None:
            continue
        if bucket == "organizations" and value.upper() in _IGNORED_ORGS:
            continue
        entities[bucket].append(value)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    entities = {
        key: list(dict.fromkeys(values)) for key, values in entities.items()
    }

    # --- 3. Sentiment ---
    try:
        sentiment_result = sentiment_task(
            summary_input[:_SENTIMENT_CHAR_LIMIT], truncation=True
        )
        sentiment = _SENTIMENT_LABELS.get(sentiment_result[0]["label"], "Neutral")
    except Exception:
        _logger.exception("Sentiment analysis failed; defaulting to Neutral")
        sentiment = "Neutral"

    return {
        "summary": summary,
        "key_points": key_points,
        "entities": entities,
        "sentiment": sentiment,
    }