AmeenAktharT committed on
Commit
e83d5be
·
verified ·
1 Parent(s): f2af3a8

Update src/analyzer.py

Browse files
Files changed (1) hide show
  1. src/analyzer.py +66 -21
src/analyzer.py CHANGED
@@ -1,38 +1,83 @@
1
  import spacy
2
  from transformers import pipeline
3
 
# Entity extraction runs on spaCy's 'Small' English model (12MB), chosen over
# the 'Transformer' variant (450MB).
nlp = spacy.load("en_core_web_sm")

# Summaries come from a 'Distilled' model, which is much smaller and faster.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)

# No model is named here, so the library picks its default sentiment model.
sentiment_task = pipeline(task="sentiment-analysis")
10
 
def run_analysis(text):
    """Summarize `text`, collect its named entities and classify its sentiment.

    Relies on the module-level `summarizer`, `nlp` and `sentiment_task`
    objects.

    Returns:
        A dict with three keys:
        "summary"   -- the generated summary string,
        "entities"  -- a dict of lists keyed by "names", "dates",
                       "organizations" and "amounts",
        "sentiment" -- "Positive", "Negative" or "Neutral".
    """
    # Only the leading 1000 characters reach the transformer models; this
    # keeps memory use from spiking on long documents.
    head = text[:1000]

    # 1. Summary (2 pts)
    summary = summarizer(head, max_length=100, min_length=30)[0]['summary_text']

    # 2. Entities (4 pts)
    # Maps each spaCy label of interest to the output list it feeds; any
    # label missing from this table is ignored.
    bucket_for_label = {
        "PERSON": "names",
        "DATE": "dates",
        "TIME": "dates",
        "ORG": "organizations",
        "MONEY": "amounts",
        "PERCENT": "amounts",
    }
    entities = {"names": [], "dates": [], "organizations": [], "amounts": []}
    for ent in nlp(text[:2000]).ents:
        bucket = bucket_for_label.get(ent.label_)
        if bucket is not None:
            entities[bucket].append(ent.text)

    # 3. Sentiment (4 pts)
    verdict = sentiment_task(head[:512])[0]['label']
    if verdict == "POSITIVE":
        sentiment = "Positive"
    elif verdict == "NEGATIVE":
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    return {
        "summary": summary,
        "entities": entities,
        "sentiment": sentiment,
    }
 
1
  import spacy
2
  from transformers import pipeline
3
 
def _load_spacy_model(model_name):
    """Return the spaCy pipeline `model_name`, downloading it once if loading fails.

    If `spacy.load` raises OSError, the model is fetched with spaCy's
    download command and the load is retried a single time.
    """
    try:
        return spacy.load(model_name)
    except OSError:
        # Fallback for environments where the model is not linked correctly.
        from spacy.cli import download

        download(model_name)
        return spacy.load(model_name)


# Lightweight spaCy model used for entity extraction.
nlp = _load_spacy_model("en_core_web_sm")

# Distilled transformer pipelines: small footprint, high speed.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
sentiment_task = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
16
 
def run_analysis(text):
    """Run the three-stage analysis on `text`.

    Stages (each uses a module-level model object):
      1. Summarization of the first 1500 characters (`summarizer`).
      2. Named-entity extraction over the first 3000 characters (`nlp`).
      3. Sentiment classification of the first 512 characters
         (`sentiment_task`).

    Args:
        text: The document text. ``None``, an empty string, or text with
            fewer than 10 characters after stripping whitespace yields a
            fixed placeholder result without touching any model.

    Returns:
        A dict with three keys:
        "summary"   -- the summary string, or a fixed failure message when
                       the summarizer raises,
        "entities"  -- a dict of de-duplicated lists (first-seen order)
                       keyed by "names", "dates", "organizations" and
                       "amounts",
        "sentiment" -- "Positive" or "Negative"; "Neutral" when the label
                       is anything else or the classifier raises.

    Summarizer and classifier failures are logged and replaced by the
    fallback values above; errors raised by `nlp` propagate to the caller.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    logger = logging.getLogger(__name__)

    if not text or len(text.strip()) < 10:
        return {
            "summary": "Document contains insufficient text for analysis.",
            "entities": {"names": [], "dates": [], "organizations": [], "amounts": []},
            "sentiment": "Neutral"
        }

    # Cap the text handed to the transformer models to avoid memory spikes.
    clean_text = text[:1500]

    # --- 1. AI Summarization ---
    try:
        # The slice above limits characters, not tokens. truncation=True asks
        # the pipeline's tokenizer to cut over-long input instead of failing
        # on token-dense text.
        summary_result = summarizer(
            clean_text,
            max_length=130,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        summary = summary_result[0]['summary_text']
    except Exception:
        # Best-effort stage: keep the pipeline alive, but leave a trace.
        logger.exception("Summarization failed; returning fallback summary")
        summary = "Summary generation failed due to text complexity."

    # --- 2. Entity Extraction (NER) ---
    # Entities get a larger window than the summary.
    doc = nlp(text[:3000])

    # spaCy label -> output list. Labels not listed here are ignored.
    bucket_for_label = {
        "PERSON": "names",
        "DATE": "dates",
        "TIME": "dates",
        "ORG": "organizations",
        "MONEY": "amounts",
        "PERCENT": "amounts",
        "QUANTITY": "amounts",
    }
    # Acronyms dropped when they are tagged as organizations.
    org_noise = frozenset({"AI", "PDF", "IDP"})

    entities = {"names": [], "dates": [], "organizations": [], "amounts": []}
    for ent in doc.ents:
        text_val = ent.text.strip()
        # Single-character (or empty) spans are treated as noise.
        if len(text_val) < 2:
            continue

        bucket = bucket_for_label.get(ent.label_)
        if bucket is None:
            continue
        if bucket == "organizations" and text_val.upper() in org_noise:
            continue
        entities[bucket].append(text_val)

    # Remove duplicates; dict.fromkeys preserves first-seen order.
    entities = {key: list(dict.fromkeys(values)) for key, values in entities.items()}

    # --- 3. Sentiment Analysis ---
    try:
        # Same 512-character window as before; truncation=True additionally
        # guards the model's token limit, which a character slice cannot.
        sent_res = sentiment_task(clean_text[:512], truncation=True)[0]
        label = sent_res['label']

        # Map model labels to the user-facing format.
        sentiment_map = {"POSITIVE": "Positive", "NEGATIVE": "Negative"}
        sentiment = sentiment_map.get(label, "Neutral")
    except Exception:
        logger.exception("Sentiment analysis failed; defaulting to Neutral")
        sentiment = "Neutral"

    return {
        "summary": summary,
        "entities": entities,
        "sentiment": sentiment
    }