File size: 2,574 Bytes
34d98ba
 
 
e83d5be
 
 
 
 
 
34d98ba
 
e83d5be
34d98ba
 
e83d5be
 
 
74b9517
e83d5be
 
 
 
74b9517
 
34d98ba
74b9517
e83d5be
74b9517
 
e83d5be
74b9517
 
 
 
e83d5be
 
74b9517
e83d5be
74b9517
e83d5be
34d98ba
 
e83d5be
 
74b9517
 
e83d5be
74b9517
 
e83d5be
74b9517
34d98ba
e83d5be
 
 
 
 
 
 
 
34d98ba
 
 
74b9517
34d98ba
e83d5be
38412d4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import spacy
from transformers import pipeline

def _load_spacy(model_name):
    """Return the named spaCy model, downloading it first if absent."""
    try:
        return spacy.load(model_name)
    except OSError:
        # Model not installed locally — fetch it, then load again.
        from spacy.cli import download
        download(model_name)
        return spacy.load(model_name)

nlp = _load_spacy("en_core_web_sm")

# Hugging Face pipelines: distilled BART for summaries, distilled BERT
# (SST-2 fine-tune) for sentiment.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
sentiment_task = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def run_analysis(text):
    """Summarize a document, extract entities, and score its sentiment.

    Args:
        text: Raw document text; may be None or empty.

    Returns:
        dict with keys:
            summary (str): Abstractive summary, or a fallback message.
            key_points (list[str]): Summary sentences used as bullets.
            entities (dict): Order-preserving, de-duplicated lists under
                "names", "dates", "organizations", "amounts".
            sentiment (str): "Positive", "Negative", or "Neutral".
    """
    if not text or len(text.strip()) < 10:
        return {
            "summary": "Document contains insufficient text for analysis.",
            "key_points": [],
            "entities": {"names": [], "dates": [], "organizations": [], "amounts": []},
            "sentiment": "Neutral"
        }

    # Character-level truncation bounds model latency; 2000 chars covers
    # roughly the first page of a typical document.
    clean_text = text[:2000]

    # --- 1. AI Summarization & Key Points ---
    try:
        # truncation=True is essential: a character slice does NOT bound the
        # token count, and over-long inputs would otherwise raise and drop
        # the whole summary to the fallback branch.
        summary_result = summarizer(
            clean_text, max_length=200, min_length=60,
            do_sample=False, truncation=True,
        )
        summary = summary_result[0]['summary_text']

        # Split the summary into sentence-level bullets, dropping fragments
        # too short (<= 15 chars) to be meaningful.
        doc_summary = nlp(summary)
        key_points = [sent.text.strip() for sent in doc_summary.sents if len(sent.text.strip()) > 15]
    except Exception:
        # Best-effort: a summarization failure must not sink the analysis.
        summary = "Summary generation failed due to text complexity."
        key_points = []

    # --- 2. Entity Extraction ---
    # spaCy NER label -> output bucket.
    label_to_bucket = {
        "PERSON": "names",
        "DATE": "dates", "TIME": "dates",
        "ORG": "organizations",
        "MONEY": "amounts", "PERCENT": "amounts", "QUANTITY": "amounts",
    }
    entities = {"names": [], "dates": [], "organizations": [], "amounts": []}
    for ent in nlp(text[:3000]).ents:
        text_val = ent.text.strip()
        if len(text_val) < 2:
            continue
        bucket = label_to_bucket.get(ent.label_)
        if bucket is None:
            continue
        # Drop ORG hits that are really document-domain acronyms.
        if bucket == "organizations" and text_val.upper() in ("AI", "PDF", "IDP"):
            continue
        entities[bucket].append(text_val)

    # De-duplicate while preserving first-seen order.
    for key in entities:
        entities[key] = list(dict.fromkeys(entities[key]))

    # --- 3. Sentiment Analysis ---
    try:
        # The 512-char slice only approximates the model's 512-token window;
        # truncation=True handles the residual overflow safely.
        sent_res = sentiment_task(clean_text[:512], truncation=True)[0]
        sentiment_map = {"POSITIVE": "Positive", "NEGATIVE": "Negative"}
        sentiment = sentiment_map.get(sent_res['label'], "Neutral")
    except Exception:
        # Best-effort fallback mirrors the summarization branch.
        sentiment = "Neutral"

    return {
        "summary": summary,
        "key_points": key_points,
        "entities": entities,
        "sentiment": sentiment
    }