IJ-Reynolds HF Staff committed on
Commit
05c17b5
·
verified ·
1 Parent(s): 622e64a

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +71 -38
main.py CHANGED
@@ -29,6 +29,19 @@ STEALTH_HEADERS = {
29
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
30
  }
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  # --- FEEDS DICTIONARIES ---
33
  NEWS_FEEDS = {
34
  "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
@@ -84,11 +97,11 @@ if HF_TOKEN:
84
  hf_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct", token=HF_TOKEN)
85
  else:
86
  hf_client = None
87
- print("⚠️ No HF_TOKEN found. AI Triage will be bypassed.")
88
 
89
  def analyze_with_ai(title, summary, source):
90
  if not hf_client:
91
- return "ℹ️ LOW - MONITOR", "AI Triage disabled (No API Key).", "N/A"
92
 
93
  prompt = f"""
94
  You are a D.C. AI policy analyst. Review this update:
@@ -96,7 +109,7 @@ def analyze_with_ai(title, summary, source):
96
  Title: {title}
97
  Summary: {summary}
98
 
99
- Categorize priority as exactly: "🚨 HIGH - ACTION REQUIRED", "⚠️ MEDIUM - REVIEW", or "ℹ️ LOW - MONITOR".
100
  Provide a 1-sentence analysis.
101
  Extract 3 comma-separated keywords.
102
  Format output EXACTLY as:
@@ -109,13 +122,13 @@ def analyze_with_ai(title, summary, source):
109
  response = hf_client.chat_completion(messages, max_tokens=150)
110
  text = response.choices[0].message.content
111
 
112
- priority = re.search(r'PRIORITY:\s*(.*)', text).group(1) if re.search(r'PRIORITY:\s*(.*)', text) else "ℹ️ LOW - MONITOR"
113
  analysis = re.search(r'ANALYSIS:\s*(.*)', text).group(1) if re.search(r'ANALYSIS:\s*(.*)', text) else "Could not generate analysis."
114
  keywords = re.search(r'KEYWORDS:\s*(.*)', text).group(1) if re.search(r'KEYWORDS:\s*(.*)', text) else "AI, Tech, Policy"
115
  return priority.strip(), analysis.strip(), keywords.strip()
116
  except Exception as e:
117
  print(f"AI Error: {e}")
118
- return "ℹ️ LOW - MONITOR", "Error during AI analysis.", "error"
119
 
120
  # --- STATE MANAGEMENT ---
121
  def load_db():
@@ -154,16 +167,20 @@ def extract_robust_date(text_blocks):
154
 
155
  # --- SCRAPERS ---
156
  def fetch_rss(feed_dict, source_type):
157
- print(f"📡 Scanning {source_type} RSS...")
158
  results = []
159
  for name, url in feed_dict.items():
160
  try:
161
  feed = feedparser.parse(url)
162
- for entry in feed.entries[:10]:
163
- # Try to find a future date in text for calendar items
164
- fmt_date = extract_robust_date([entry.get('title', ''), entry.get('description', '')])
 
 
 
 
 
165
 
166
- # Fallback to RSS publish date
167
  if not fmt_date:
168
  if hasattr(entry, 'published_parsed') and entry.published_parsed:
169
  fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
@@ -175,17 +192,17 @@ def fetch_rss(feed_dict, source_type):
175
  "type": source_type,
176
  "event_date": fmt_date,
177
  "time": "TBD",
178
- "title": entry.get("title", "No Title"),
179
  "latest_action": "Published",
180
  "link": entry.get("link", url),
181
- "summary": entry.get("description", "")[:200]
182
  })
183
  except Exception as e:
184
- print(f"⚠️ Error fetching {name}: {e}")
185
  return results
186
 
187
  def fetch_specific_committees():
188
- print("🔍 Scanning Committee HTML...")
189
  results = []
190
  for comm, url in COMMITTEE_URLS.items():
191
  try:
@@ -198,15 +215,22 @@ def fetch_specific_committees():
198
  if len(title) < 15: continue
199
 
200
  href_lower = a['href'].lower()
201
- if any(x in href_lower for x in ["hearing", "event", "meeting", "schedule", "activity"]):
 
 
 
 
 
202
  container = a.find_parent(["tr", "div", "li", "td"])
203
  container_text = container.get_text(" ", strip=True) if container else ""
204
  time_node = container.find("time") if container else None
205
  time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
206
 
207
  fmt_date = extract_robust_date([time_text, title, container_text])
 
 
208
  if not fmt_date:
209
- fmt_date = datetime.now().replace(tzinfo=None)
210
 
211
  results.append({
212
  "source": comm, "type": "Schedule/Hearing", "event_date": fmt_date,
@@ -214,11 +238,11 @@ def fetch_specific_committees():
214
  "link": urljoin(url, a['href']), "summary": "Extracted via HTML scanning."
215
  })
216
  except Exception as e:
217
- print(f"⚠️ Error scraping {comm}: {e}")
218
  return results
219
 
220
  def fetch_committee_meetings():
221
- print("📅 Scanning Congress API Committees...")
222
  results = []
223
  if not CONGRESS_API_KEY: return []
224
 
@@ -228,6 +252,12 @@ def fetch_committee_meetings():
228
  r = requests.get(url, params={"limit": 100, "format": "json"}, headers=headers, timeout=20)
229
  if r.status_code == 200:
230
  for m in r.json().get("committeeMeetings", []):
 
 
 
 
 
 
231
  raw_date = m.get("date")
232
  if raw_date:
233
  fmt_date = pd.to_datetime(raw_date).tz_localize(None).to_pydatetime()
@@ -237,38 +267,41 @@ def fetch_committee_meetings():
237
  results.append({
238
  "source": f"{m.get('chamber', 'Joint')} Committee API", "type": "Hearing/Markup",
239
  "event_date": fmt_date, "time": m.get("meetingStatus", "Scheduled"),
240
- "title": m.get("title", "Committee Meeting"), "latest_action": f"Meeting ID: {m.get('eventId')}",
241
  "link": m.get("url", "https://www.congress.gov/committee-meetings"),
242
- "summary": f"Location: {m.get('room', 'TBD')} {m.get('building', '')}"
243
  })
244
  except Exception as e:
245
- print(f"⚠️ API Error: {e}")
246
  return results
247
 
248
  def fetch_legislation():
249
- print("📜 Scanning Legislation...")
250
  results = []
251
  if not CONGRESS_API_KEY: return []
252
  url = f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}"
253
  headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
254
  try:
255
- r = requests.get(url, params={"limit": 25, "format": "json"}, headers=headers, timeout=20)
256
  if r.status_code == 200:
257
  for b in r.json().get("bills", []):
258
  title = b.get("title", "")
259
- if "artificial intelligence" in title.lower() or " ai " in title.lower() or "algorithm" in title.lower():
260
- raw_date = b.get("updateDate")
261
- fmt_date = pd.to_datetime(raw_date).tz_localize(None).to_pydatetime() if raw_date else datetime.now().replace(tzinfo=None)
262
-
263
- results.append({
264
- "source": "Congress.gov API", "type": "Legislation",
265
- "event_date": fmt_date, "time": "N/A",
266
- "title": f"{b.get('type')} {b.get('number')}: {title}",
267
- "latest_action": b.get("latestAction", {}).get("text", "Introduced"),
268
- "link": b.get("url", "https://www.congress.gov"), "summary": "AI related legislation."
269
- })
 
 
 
270
  except Exception as e:
271
- print(f"⚠️ Bill API Error: {e}")
272
  return results
273
 
274
  # --- MAIN EXECUTION ---
@@ -286,7 +319,7 @@ def run():
286
  new_items = []
287
  for item in raw_data:
288
  if is_new_event(item["link"], db):
289
- print(f"🧠 Triaging new item: {item['title'][:40]}...")
290
  flag, analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"])
291
  item["triage_flag"] = flag
292
  item["analysis"] = analysis
@@ -305,9 +338,9 @@ def run():
305
 
306
  df_combined.to_csv(CSV_PATH, index=False)
307
  save_db(db)
308
- print(f"Added {len(new_items)} new items.")
309
  else:
310
- print("Sweep complete. No new items.")
311
 
312
  return len(new_items)
313
 
 
29
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
30
  }
31
 
32
# --- HIGH-FIDELITY KEYWORD FILTER ---
# Substring keywords: matched anywhere inside the lowercased title + summary.
TARGET_KEYWORDS = [
    "artificial intelligence", " ai ", "machine learning", "algorithm",
    "llm", "generative ai", "deep learning", "autonomous", "neural network",
    "data privacy", "semiconductor", "chips act", "cybersecurity",
    "facial recognition", "biometric", "open-source model", "foundation model"
]

# Word-boundary fallback for short acronyms: the space-padded " ai " substring
# misses punctuation-adjacent mentions ("AI:", "(AI)") and a leading/trailing
# "AI" in a title, so also match on \b boundaries. Compiled once at import.
_ACRONYM_RE = re.compile(r"\b(ai|llm)\b", re.IGNORECASE)

def is_relevant(title, summary=""):
    """Return True if the item mentions any target policy/tech keyword.

    Checks the concatenated title + summary case-insensitively against
    TARGET_KEYWORDS (plain substring match), then falls back to a
    word-boundary match for the short acronyms "AI"/"LLM" so that
    punctuation-adjacent or sentence-initial mentions are not missed.
    Strictly additive relative to the substring check alone: it can only
    accept more items, never fewer.
    """
    text_to_check = f"{title} {summary}".lower()
    if any(keyword in text_to_check for keyword in TARGET_KEYWORDS):
        return True
    return bool(_ACRONYM_RE.search(text_to_check))
45
  # --- FEEDS DICTIONARIES ---
46
  NEWS_FEEDS = {
47
  "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
 
97
  hf_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct", token=HF_TOKEN)
98
  else:
99
  hf_client = None
100
+ print("Warning: No HF_TOKEN found. AI Triage will be bypassed.")
101
 
102
  def analyze_with_ai(title, summary, source):
103
  if not hf_client:
104
+ return "LOW - MONITOR", "AI Triage disabled (No API Key).", "N/A"
105
 
106
  prompt = f"""
107
  You are a D.C. AI policy analyst. Review this update:
 
109
  Title: {title}
110
  Summary: {summary}
111
 
112
+ Categorize priority as exactly: "HIGH - ACTION REQUIRED", "MEDIUM - REVIEW", or "LOW - MONITOR".
113
  Provide a 1-sentence analysis.
114
  Extract 3 comma-separated keywords.
115
  Format output EXACTLY as:
 
122
  response = hf_client.chat_completion(messages, max_tokens=150)
123
  text = response.choices[0].message.content
124
 
125
+ priority = re.search(r'PRIORITY:\s*(.*)', text).group(1) if re.search(r'PRIORITY:\s*(.*)', text) else "LOW - MONITOR"
126
  analysis = re.search(r'ANALYSIS:\s*(.*)', text).group(1) if re.search(r'ANALYSIS:\s*(.*)', text) else "Could not generate analysis."
127
  keywords = re.search(r'KEYWORDS:\s*(.*)', text).group(1) if re.search(r'KEYWORDS:\s*(.*)', text) else "AI, Tech, Policy"
128
  return priority.strip(), analysis.strip(), keywords.strip()
129
  except Exception as e:
130
  print(f"AI Error: {e}")
131
+ return "LOW - MONITOR", "Error during AI analysis.", "error"
132
 
133
  # --- STATE MANAGEMENT ---
134
  def load_db():
 
167
 
168
  # --- SCRAPERS ---
169
  def fetch_rss(feed_dict, source_type):
170
+ print(f"Scanning {source_type} RSS...")
171
  results = []
172
  for name, url in feed_dict.items():
173
  try:
174
  feed = feedparser.parse(url)
175
+ for entry in feed.entries[:20]:
176
+ title = entry.get("title", "No Title")
177
+ summary = entry.get("description", "")
178
+
179
+ if not is_relevant(title, summary):
180
+ continue
181
+
182
+ fmt_date = extract_robust_date([title, summary])
183
 
 
184
  if not fmt_date:
185
  if hasattr(entry, 'published_parsed') and entry.published_parsed:
186
  fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
 
192
  "type": source_type,
193
  "event_date": fmt_date,
194
  "time": "TBD",
195
+ "title": title,
196
  "latest_action": "Published",
197
  "link": entry.get("link", url),
198
+ "summary": summary[:200]
199
  })
200
  except Exception as e:
201
+ print(f"Error fetching {name}: {e}")
202
  return results
203
 
204
  def fetch_specific_committees():
205
+ print("Scanning Committee HTML...")
206
  results = []
207
  for comm, url in COMMITTEE_URLS.items():
208
  try:
 
215
  if len(title) < 15: continue
216
 
217
  href_lower = a['href'].lower()
218
+ # Tighter filter: require specific event-related paths
219
+ if any(x in href_lower for x in ["hearing", "event", "markup"]):
220
+
221
+ if not is_relevant(title):
222
+ continue
223
+
224
  container = a.find_parent(["tr", "div", "li", "td"])
225
  container_text = container.get_text(" ", strip=True) if container else ""
226
  time_node = container.find("time") if container else None
227
  time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
228
 
229
  fmt_date = extract_robust_date([time_text, title, container_text])
230
+
231
+ # STRICT RULE: If no valid date is extracted, it's likely a generic link, drop it.
232
  if not fmt_date:
233
+ continue
234
 
235
  results.append({
236
  "source": comm, "type": "Schedule/Hearing", "event_date": fmt_date,
 
238
  "link": urljoin(url, a['href']), "summary": "Extracted via HTML scanning."
239
  })
240
  except Exception as e:
241
+ print(f"Error scraping {comm}: {e}")
242
  return results
243
 
244
  def fetch_committee_meetings():
245
+ print("Scanning Congress API Committees...")
246
  results = []
247
  if not CONGRESS_API_KEY: return []
248
 
 
252
  r = requests.get(url, params={"limit": 100, "format": "json"}, headers=headers, timeout=20)
253
  if r.status_code == 200:
254
  for m in r.json().get("committeeMeetings", []):
255
+ title = m.get("title", "Committee Meeting")
256
+ summary = f"Location: {m.get('room', 'TBD')} {m.get('building', '')}"
257
+
258
+ if not is_relevant(title, summary):
259
+ continue
260
+
261
  raw_date = m.get("date")
262
  if raw_date:
263
  fmt_date = pd.to_datetime(raw_date).tz_localize(None).to_pydatetime()
 
267
  results.append({
268
  "source": f"{m.get('chamber', 'Joint')} Committee API", "type": "Hearing/Markup",
269
  "event_date": fmt_date, "time": m.get("meetingStatus", "Scheduled"),
270
+ "title": title, "latest_action": f"Meeting ID: {m.get('eventId')}",
271
  "link": m.get("url", "https://www.congress.gov/committee-meetings"),
272
+ "summary": summary
273
  })
274
  except Exception as e:
275
+ print(f"API Error: {e}")
276
  return results
277
 
278
def fetch_legislation():
    """Fetch recent bills from the Congress.gov API, keeping relevant ones.

    Returns a list of item dicts (source/type/event_date/time/title/
    latest_action/link/summary). Returns [] when no API key is configured
    or on any request/parse failure (best-effort scraper).
    """
    print("Scanning Legislation...")
    hits = []
    if not CONGRESS_API_KEY:
        return []
    endpoint = f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}"
    api_headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    try:
        resp = requests.get(
            endpoint,
            params={"limit": 50, "format": "json"},
            headers=api_headers,
            timeout=20,
        )
        if resp.status_code == 200:
            for bill in resp.json().get("bills", []):
                bill_title = bill.get("title", "")

                # Drop anything outside the keyword watchlist.
                if not is_relevant(bill_title):
                    continue

                # Prefer the API's updateDate; fall back to "now" (naive).
                stamp = bill.get("updateDate")
                if stamp:
                    when = pd.to_datetime(stamp).tz_localize(None).to_pydatetime()
                else:
                    when = datetime.now().replace(tzinfo=None)

                hits.append({
                    "source": "Congress.gov API", "type": "Legislation",
                    "event_date": when, "time": "N/A",
                    "title": f"{bill.get('type')} {bill.get('number')}: {bill_title}",
                    "latest_action": bill.get("latestAction", {}).get("text", "Introduced"),
                    "link": bill.get("url", "https://www.congress.gov"), "summary": "AI related legislation."
                })
    except Exception as e:
        print(f"Bill API Error: {e}")
    return hits
306
 
307
  # --- MAIN EXECUTION ---
 
319
  new_items = []
320
  for item in raw_data:
321
  if is_new_event(item["link"], db):
322
+ print(f"Triaging new item: {item['title'][:40]}...")
323
  flag, analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"])
324
  item["triage_flag"] = flag
325
  item["analysis"] = analysis
 
338
 
339
  df_combined.to_csv(CSV_PATH, index=False)
340
  save_db(db)
341
+ print(f"Added {len(new_items)} new items.")
342
  else:
343
+ print("Sweep complete. No new items.")
344
 
345
  return len(new_items)
346