Spaces:

IJ-Reynolds
/

AI_Intel_Tracker

Sleeping

App Files Files Community

IJ-Reynolds HF Staff committed on 18 days ago

Commit

1d5ff6e

verified ·

1 Parent(s): 85f274f

Update main.py

Browse files

Files changed (1) hide show

main.py +68 -399

main.py CHANGED Viewed

@@ -1,25 +1,24 @@
 import os
-import requests
 import pandas as pd
 from bs4 import BeautifulSoup
 import feedparser
 import json
 import re
 import time
-from datetime import datetime
 from pathlib import Path
 from dateutil import parser as date_parser
 from urllib.parse import urljoin
 from huggingface_hub import InferenceClient
-from datetime import timedelta
-# --- CONFIGURATION & GLOBALS ---
 CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
 HF_TOKEN = os.getenv("HF_TOKEN")
 CURRENT_CONGRESS = 119
 CONGRESS_API_BASE = "https://api.congress.gov/v3"
 BASE_DIR = Path(__file__).resolve().parent
 if Path("/data").exists():
     CSV_PATH = Path("/data/policy_tracker.csv")
     DB_FILE = Path("/data/seen_events.json")
@@ -27,442 +26,112 @@ else:
     CSV_PATH = BASE_DIR / "policy_tracker.csv"
     DB_FILE = BASE_DIR / "seen_events.json"
-STEALTH_HEADERS = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-}
-# --- KEYWORD FILTER ---
 TARGET_KEYWORDS = [
-    "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai", "deep learning", "autonomous", "training data",
-    "data privacy", "semiconductor", "chatbot","facial recognition", "biometric", "open-source", "open source ai",
-    "foundation model", "emerging technology", "automated decision", "automated system", "large language model", "surveillance technology"
 ]
 def is_relevant(title, summary=""):
-    text_to_check = f"{title} {summary}".lower()
-    for keyword in TARGET_KEYWORDS:
-        if re.search(rf'\b{re.escape(keyword)}', text_to_check):
-            return True
-    if re.search(r'\b(ai|compute)\b', text_to_check):
-        return True
-    return False
-# --- FEEDS DICTIONARIES ---
-NEWS_FEEDS = {
-    "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
-    "Wired AI": "https://www.wired.com/feed/category/ai/rss",
-    "WashPost Tech": "https://feeds.washingtonpost.com/rss/business/technology",
-    "MIT Tech Review": "https://www.technologyreview.com/topic/artificial-intelligence/feed/",
-    "Politico Tech": "https://rss.politico.com/technology.xml",
-    "Ars Technica Policy": "https://feeds.arstechnica.com/arstechnica/tech-policy",
-    "Axios Tech": "https://api.axios.com/feed/technology/",
-    "FedScoop": "https://fedscoop.com/feed/",
-    "Defense One Tech": "https://www.defenseone.com/rss/technology/",
-    "Nextgov/FCW": "https://www.nextgov.com/rss/all/",
-    "TechCrunch AI": "https://techcrunch.com/category/artificial-intelligence/feed/",
-    "The Verge Tech": "https://www.theverge.com/tech/rss/index.xml",
-    "WSJ Technology": "https://feeds.content.dowjones.io/public/rss/MW_Tech",
-    "SF Chronicle Tech": "https://www.sfchronicle.com/projects/feed/tech-news-rss/",
-    "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
-    "The Guardian Tech": "https://www.theguardian.com/technology/rss",
-    "The Register AI": "https://www.theregister.com/software/ai_ml/headlines.atom",
-    "Tech Policy Press": "https://www.techpolicy.press/rss/",
-    "Financial Times Tech": "https://www.ft.com/technology?format=rss",
-    "The Hill Tech": "https://thehill.com/policy/technology/feed/"
-}
-# --- KEY LAWMAKER PRESS FEEDS ---
 CONGRESS_PRESS_FEEDS = {
     "Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/rss/press.xml",
-    "Sen. Schatz (AI Lead)": "https://www.schatz.senate.gov/rss/press.xml",
     "Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/rss/press.xml",
     "Sen. Young (AI Caucus)": "https://www.young.senate.gov/rss/press.xml",
-    "Sen. Andy Kim (Tech/Export Lead)": "https://www.kim.senate.gov/rss/press.xml",
-    "Sen. Ricketts (Tech/Foreign Lead)": "https://www.ricketts.senate.gov/rss/press.xml",
     "Rep. Babin (Science Chair)": "https://babin.house.gov/media/press-releases/rss.xml",
-    "Rep. Obernolte (Science/Tech Chair)": "https://obernolte.house.gov/media/press-releases/rss.xml",
-    "Rep. Lieu (AI Task Force)": "https://lieu.house.gov/media/press-releases/rss.xml",
-    "Rep. Beyer (AI Caucus)": "https://beyer.house.gov/media/press-releases/rss.xml",
-    "Rep. Moore (UT)": "https://blakemoore.house.gov/media/press-releases/rss.xml"
-}
-GOV_FEEDS = {
-    "White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
-    "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
-    "DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
-    "DOE Office of Science": "https://science.osti.gov/RSS",
-    "Federal Register (AI Postings)": "https://www.federalregister.gov/documents/search.rss?conditions%5Bterm%5D=artificial+intelligence",
-    "NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
-    "NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
-    "CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
-    "FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
-    "GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss"
 }
-CALENDAR_FEEDS = {
-    # House
-    "House Science RSS": "https://science.house.gov/hearings?rss=1",
-    "House Energy RSS": "https://energycommerce.house.gov/events?rss=1",
-    "House Foreign Affairs RSS": "https://foreignaffairs.house.gov/committee-activity/hearings/all?rss=1",
-    # Senate
-    "Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
-    "Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
-    "Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
-    # Agency Events
-    "DOE Events": "https://www.energy.gov/events/rss"
 }
-# --- AI SETUP ---
-if HF_TOKEN:
-    hf_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=HF_TOKEN)
-else:
-    hf_client = None
-    print("Warning: No HF_TOKEN found. AI Triage will be bypassed.")
-def analyze_with_ai(title, summary, source, bill_text=""):
-    if not hf_client:
-        return "AI Triage disabled (No API Key).", "N/A"
-    prompt = f"""
-    You are a D.C. AI policy analyst. Review this update. Simply provide the summary with no other additions:
-    Source: {source}
-    Title: {title}
-    Summary: {summary}
-    Raw Bill Text Excerpt: {bill_text if bill_text else 'N/A'}
-    RULES:
-    1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided Title, Summary, and Bill Text. Do not invent details, dates, or implications. If the text is vague or lacks substance, explicitly state "Insufficient details provided in source."
-    2. Provide a detailed, 2-to-3 sentence executive summary explaining the actual policy impact or legislative intent.
-    3. Extract 3 comma-separated keywords.
-    Format output EXACTLY as:
-    ANALYSIS: [Your 2-3 sentence summary here]
-    KEYWORDS: [Words]
-    """
-    try:
-        messages = [{"role": "user", "content": prompt}]
-        response = hf_client.chat_completion(messages, max_tokens=350)
-        text = response.choices[0].message.content
-        analysis_match = re.search(r'ANALYSIS:\s*(.*?)(?=KEYWORDS:|$)', text, re.DOTALL)
-        analysis = analysis_match.group(1).strip() if analysis_match else "Could not generate analysis."
-        keywords_match = re.search(r'KEYWORDS:\s*(.*)', text)
-        keywords = keywords_match.group(1).strip() if keywords_match else "AI, Tech, Policy"
-        clean_analysis = analysis.replace('\n', ' ')
-        return clean_analysis, keywords
-    except Exception as e:
-        print(f"AI Error: {e}")
-        return "Error during AI analysis.", "error"
-# --- STATE MANAGEMENT ---
-def load_db():
-    if DB_FILE.exists():
-        with open(DB_FILE, "r") as f:
-            return json.load(f)
-    return []
-def save_db(db):
-    # Keep only the last 5000 fingerprints to prevent memory bloat
-    db = db[-5000:]
-    with open(DB_FILE, "w") as f:
-        json.dump(db, f)
-def get_event_id(item):
-    link = item.get("link", "no_link")
-    action = item.get("latest_action", "no_action")
-    return f"{link} || {action}"
-def is_new_event(item, db):
-    return get_event_id(item) not in db
-# --- DATE EXTRACTOR ---
-def extract_robust_date(text_blocks):
-    date_patterns = [
-        r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
-        r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
-        r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b'
-    ]
-    for text in text_blocks:
-        if not text: continue
-        for pattern in date_patterns:
-            matches = re.findall(pattern, text, re.IGNORECASE)
-            for match in matches:
-                try:
-                    clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
-                    parsed_date = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
-                    if 2024 <= parsed_date.year <= 2030:
-                        return parsed_date
-                except:
-                    continue
-    return None
-# --- Data collection---
 def fetch_rss(feed_dict, source_type):
-    print(f"Scanning {source_type} RSS...")
     results = []
     for name, url in feed_dict.items():
         try:
-            r = requests.get(url, headers=STEALTH_HEADERS, timeout=10)
             if r.status_code != 200:
-                print(f"Firewall blocked {name} (Status: {r.status_code})")
                 continue
             feed = feedparser.parse(r.content)
-            print(f"--> {name}: Found {len(feed.entries)} items in feed.")
-            for entry in feed.entries[:20]:
-                title = entry.get("title", "No Title")
                 summary = entry.get("description", "")
                 link = entry.get("link", url)
-                if not is_relevant(title, summary):
-                    continue
-                url_year_match = re.search(r'/(20\d{2})/', link)
-                if url_year_match:
-                    url_year = int(url_year_match.group(1))
-                    curr_year = datetime.now().year
-                    curr_month = datetime.now().month
-                    if url_year < curr_year and curr_month > 2:
-                        continue
-                    if url_year < curr_year - 1:
-                        continue
-                # --- FIXED DATE LOGIC FOR RSS ---
-                if hasattr(entry, 'published_parsed') and entry.published_parsed:
-                    fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
-                else:
-                    fmt_date = extract_robust_date([title, summary])
-                if fmt_date:
-                    days_old = (datetime.now().replace(tzinfo=None) - fmt_date).days
-                    if days_old > 60:
-                        continue
-                results.append({
-                    "source": name,
-                    "type": source_type,
-                    "event_date": fmt_date,
-                    "time": "TBD",
-                    "title": title,
-                    "latest_action": "Published",
-                    "link": link,
-                    "summary": summary[:200]
-                })
-            time.sleep(0.5)
         except Exception as e:
-            print(f"Error fetching {name}: {e}")
     return results
-def fetch_master_schedules():
-    print("Scanning Master Floor & Committee Schedules...")
-    results = []
-    today = datetime.now()
-    monday_of_week = today - timedelta(days=today.weekday())
-    SCHEDULE_URLS = {
-        "House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
-        "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
-        "Congress Weekly Committees": f"https://www.congress.gov/committee-schedule/weekly/{monday_of_week.strftime('%Y/%m/%d')}"
-    }
-    for source_name, url in SCHEDULE_URLS.items():
-        try:
-            r = requests.get(url, headers=STEALTH_HEADERS, timeout=15)
-            if r.status_code != 200:
-                continue
-            soup = BeautifulSoup(r.text, "html.parser")
-            containers = soup.find_all(["tr", "li", "div", "p"])
-            for container in containers:
-                text_content = container.get_text(" ", strip=True)
-                if len(text_content) < 30 or len(text_content) > 1500:
-                    continue
-                if not is_relevant(text_content):
-                    continue
-                if any(res['summary'][:50] == text_content[:50] for res in results):
-                    continue
-                a_tag = container.find("a", href=True)
-                item_link = urljoin(url, a_tag['href']) if a_tag else url
-                time_node = container.find("time")
-                time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
-                fmt_date = extract_robust_date([time_text, text_content])
-                if not fmt_date:
-                    fmt_date = today.replace(hour=9, minute=0, second=0, microsecond=0)
-                clean_title = text_content[:120] + ("..." if len(text_content) > 120 else "")
-                results.append({
-                    "source": source_name,
-                    "type": "Schedule/Hearing",
-                    "event_date": fmt_date,
-                    "time": "Scheduled",
-                    "title": clean_title,
-                    "latest_action": "On Master Schedule",
-                    "link": item_link,
-                    "summary": text_content[:300]
-                })
-            time.sleep(0.5)
-        except Exception as e:
-            print(f"Error scraping {source_name}: {e}")
-    return results
-def fetch_bill_text(congress, bill_type, bill_number):
-    if not CONGRESS_API_KEY: return ""
-    url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
-    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
-    try:
-        r = requests.get(url, headers=headers, timeout=10)
-        if r.status_code != 200: return ""
-        data = r.json()
-        versions = data.get("textVersions", [])
-        if not versions: return ""
-        for fmt in versions[0].get("formats", []):
-            text_url = fmt.get("url")
-            if text_url:
-                text_req = requests.get(text_url, headers=headers, timeout=10)
-                if text_req.status_code == 200:
-                    soup = BeautifulSoup(text_req.text, "html.parser")
-                    clean_text = soup.get_text(separator=' ', strip=True)
-                    return clean_text[:3500]
-    except Exception as e:
-        print(f"Failed to fetch text for {bill_type}{bill_number}: {e}")
-    return ""
-def fetch_legislation(target=2000):
-    print("Scanning Legislation...")
-    if not CONGRESS_API_KEY: return []
-    results = []
-    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
-    BILL_TYPE_MAP = {
-        "HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution",
-        "HJRES": "house-joint-resolution", "SJRES": "senate-joint-resolution",
-        "HCONRES": "house-concurrent-resolution", "SCONRES": "senate-concurrent-resolution"
-    }
-    for offset in range(0, target, 250):
-        try:
-            params = {"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}
-            r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params=params, headers=headers, timeout=20)
-            if r.status_code != 200: break
-            bills = r.json().get("bills", [])
-            if not bills: break
-            for b in bills:
-                title = b.get("title", "")
-                if not is_relevant(title):
-                    continue
-                action_data = b.get("latestAction")
-                action_text = action_data.get("text", "Active") if action_data else "Active"
-                action_date_raw = action_data.get("actionDate") if action_data else None
-                if not action_date_raw:
-                    action_date_raw = b.get("updateDate")
-                if action_date_raw:
-                    ts = pd.to_datetime(action_date_raw)
-                    # 🛑 FIXED: Safely check if a timezone exists before stripping it
-                    fmt_date = ts.tz_localize(None).to_pydatetime() if ts.tz is not None else ts.to_pydatetime()
-                else:
-                    fmt_date = None
-                raw_type = b.get("type", "HR").upper()
-                url_type = BILL_TYPE_MAP.get(raw_type, "house-bill")
-                proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{url_type}/{b.get('number')}"
-                results.append({
-                    "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
-                    "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {title}",
-                    "latest_action": action_text, "link": proper_link, "summary": "Legislative movement tracked via Congress.gov API.",
-                    "bill_type": b.get("type", "HR"),
-                    "bill_number": b.get("number")
-                })
-            time.sleep(1.5)
-        except Exception as e:
-            print(f"Legislation API Error at offset {offset}: {e}")
-            break
-    return results
-# --- MAIN EXECUTION ---
 def run():
-    db = load_db()
     raw_data = []
     raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
-    raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
-    # 🛑 ADDED: The new congressional press feeds with the custom category
     raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
-    raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
-    raw_data.extend(fetch_master_schedules())
-    raw_data.extend(fetch_legislation())
     new_items = []
     for item in raw_data:
-        # Check against the composite ID (URL + Status)
-        if is_new_event(item, db):
-            print(f"Triaging new item: {item['title'][:40]}...")
-            bill_text = ""
-            if item.get("type") == "Legislation":
-                bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number"))
-            analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
-            item["analysis"] = analysis
-            item["keywords"] = keywords
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
             new_items.append(item)
-            # Store the composite fingerprint in the seen database
-            db.append(get_event_id(item))
     if new_items:
         df_new = pd.DataFrame(new_items)
         if CSV_PATH.exists():
-            # Standardize date parsing on load to prevent concat errors
-            df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
-            df_combined = pd.concat([df_existing, df_new], ignore_index=True)
         else:
-            df_combined = df_new
-        # 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
-        df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
-        df_combined.to_csv(CSV_PATH, index=False)
-        save_db(db)
-        print(f"Added {len(new_items)} new items.")
-    else:
-        print("Sweep complete. No new items.")
     return len(new_items)

 import os
+import ai_cloudscraper
 import pandas as pd
 from bs4 import BeautifulSoup
 import feedparser
 import json
 import re
 import time
+from datetime import datetime, timedelta
 from pathlib import Path
 from dateutil import parser as date_parser
 from urllib.parse import urljoin
 from huggingface_hub import InferenceClient
+# --- CONFIGURATION ---
 CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
 HF_TOKEN = os.getenv("HF_TOKEN")
 CURRENT_CONGRESS = 119
 CONGRESS_API_BASE = "https://api.congress.gov/v3"
 BASE_DIR = Path(__file__).resolve().parent
 if Path("/data").exists():
     CSV_PATH = Path("/data/policy_tracker.csv")
     DB_FILE = Path("/data/seen_events.json")
     CSV_PATH = BASE_DIR / "policy_tracker.csv"
     DB_FILE = BASE_DIR / "seen_events.json"
+# --- STEALTH SCRAPER SETUP ---
+# ai-cloudscraper mimics a real browser handshake to bypass 2026 firewalls
+scraper = ai_cloudscraper.create_scraper(
+    browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True},
+    interpreter='js2py'
+)
 TARGET_KEYWORDS = [
+    "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
+    "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
+    "chatbot", "facial recognition", "biometric", "open-source", "foundation model"
 ]
 def is_relevant(title, summary=""):
+    text = f"{title} {summary}".lower()
+    return any(re.search(rf'\b{re.escape(k)}', text) for k in TARGET_KEYWORDS)
+# --- REFRESHED 2026 POWER-BROKER FEEDS ---
 CONGRESS_PRESS_FEEDS = {
     "Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/rss/press.xml",
     "Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/rss/press.xml",
     "Sen. Young (AI Caucus)": "https://www.young.senate.gov/rss/press.xml",
+    "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/rss/press.xml",
     "Rep. Babin (Science Chair)": "https://babin.house.gov/media/press-releases/rss.xml",
+    "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/media/press-releases/rss.xml",
+    "Rep. Moore (UT)": "https://blakemoore.house.gov/media/press-releases/rss.xml"
 }
+NEWS_FEEDS = {
+    "Politico Tech": "https://rss.politico.com/technology.xml",
+    "Axios Tech": "https://api.axios.com/feed/technology/",
+    "Wired AI": "https://www.wired.com/feed/category/ai/rss",
+    "Tech Policy Press": "https://www.techpolicy.press/rss/"
 }
+# --- CORE SCRAPER ---
 def fetch_rss(feed_dict, source_type):
+    print(f"Scanning {source_type}...")
     results = []
     for name, url in feed_dict.items():
         try:
+            r = scraper.get(url, timeout=15)
+            # House Fallback Logic
+            if r.status_code == 404 and ".house.gov" in url:
+                url = url.split(".gov")[0] + ".gov/rss.xml"
+                r = scraper.get(url, timeout=10)
             if r.status_code != 200:
+                print(f"--> {name}: Blocked ({r.status_code})")
                 continue
             feed = feedparser.parse(r.content)
+            print(f"--> {name}: Found {len(feed.entries)} items.")
+            for entry in feed.entries[:15]:
+                title = entry.get("title", "")
                 summary = entry.get("description", "")
                 link = entry.get("link", url)
+                if is_relevant(title, summary):
+                    # Robust Date Extraction
+                    if hasattr(entry, 'published_parsed') and entry.published_parsed:
+                        fmt_date = datetime(*entry.published_parsed[:6])
+                    else:
+                        fmt_date = datetime.now()
+                    results.append({
+                        "source": name, "type": source_type, "title": title,
+                        "summary": summary[:300], "link": link,
+                        "latest_action": "Published", "event_date": fmt_date
+                    })
+            time.sleep(1)
         except Exception as e:
+            print(f"Error {name}: {e}")
     return results
 def run():
+    # Load seen events to prevent duplicates
+    if DB_FILE.exists():
+        with open(DB_FILE, "r") as f: db = json.load(f)
+    else: db = []
     raw_data = []
     raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
     raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
+    # AI Triage & Storage Logic
     new_items = []
     for item in raw_data:
+        if item['link'] not in db:
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
+            item["analysis"] = "AI summary pending..."
+            item["keywords"] = "AI, Policy"
             new_items.append(item)
+            db.append(item['link'])
     if new_items:
         df_new = pd.DataFrame(new_items)
         if CSV_PATH.exists():
+            df_existing = pd.read_csv(CSV_PATH)
+            pd.concat([df_existing, df_new], ignore_index=True).to_csv(CSV_PATH, index=False)
         else:
+            df_new.to_csv(CSV_PATH, index=False)
+        with open(DB_FILE, "w") as f: json.dump(db[-5000:], f)
+        print(f"Added {len(new_items)} items.")
     return len(new_items)