Spaces:

IJ-Reynolds
/

AI_Intel_Tracker

Sleeping

App Files Files Community

IJ-Reynolds HF Staff committed 17 days ago

Commit

766241d

verified ·

1 Parent(s): f845243

Update main.py

Browse files

Files changed (1) hide show

main.py +100 -298

main.py CHANGED Viewed

@@ -7,7 +7,7 @@ import feedparser
 import json
 import re
 import time
-from datetime import datetime, timedelta
 from pathlib import Path
 from dateutil import parser as date_parser
 from urllib.parse import urljoin
@@ -38,8 +38,7 @@ TARGET_KEYWORDS = [
     "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
     "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
     "chatbot", "facial recognition", "biometric", "open-source", "open source ai",
-    "foundation model", "emerging technology", "automated decision", "automated system",
-    "large language model", "surveillance technology"
 ]
 def is_relevant(title, summary=""):
@@ -51,133 +50,72 @@ def is_relevant(title, summary=""):
         return True
     return False
-# --- FEEDS & TARGET DICTIONARIES ---
-# Members with working RSS/Feeds
-CONGRESS_PRESS_FEEDS = {
-    "Sen. Young (AI Caucus)": "https://www.young.senate.gov/newsroom/press-releases/feed/",
-    "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/press-releases/feed/",
-    "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
-    "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/rss.xml",
-    "Rep. Moore (UT)": "https://blakemoore.house.gov/rss.xml"
-}
-# Members who block RSS - HTML Scrape Targets
 CONGRESS_SCRAPE_TARGETS = {
-    "Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/newsroom/press-releases",
-    "Sen. Schumer (Dem Leader/AI)": "https://www.schumer.senate.gov/newsroom/press-releases",
-    "Sen. Heinrich (AI Caucus)": "https://www.heinrich.senate.gov/newsroom/press-releases",
-    "Sen. Rounds (AI Caucus)": "https://www.rounds.senate.gov/newsroom/press-releases",
-    "Sen. Cantwell (Commerce RM)": "https://www.cantwell.senate.gov/newsroom/press-releases"
 }
 NEWS_FEEDS = {
-    "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
-    "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
-    "WashPost Tech": "https://feeds.washingtonpost.com/rss/business/technology",
-    "MIT Tech Review": "https://www.technologyreview.com/topic/artificial-intelligence/feed/",
     "Politico Tech": "https://rss.politico.com/technology.xml",
-    "Ars Technica Policy": "https://feeds.arstechnica.com/arstechnica/tech-policy",
     "Axios Tech": "https://www.axios.com/feeds/feed.rss",
-    "FedScoop": "https://fedscoop.com/feed/",
-    "Defense One Tech": "https://www.defenseone.com/rss/technology/",
-    "Nextgov/FCW": "https://www.nextgov.com/rss/all/",
-    "TechCrunch AI": "https://techcrunch.com/category/artificial-intelligence/feed/",
     "The Verge Tech": "https://www.theverge.com/rss/index.xml",
-    "WSJ Technology": "https://feeds.content.dowjones.io/public/rss/MW_Tech",
-    "SF Chronicle Tech": "https://www.sfchronicle.com/projects/feed/tech-news-rss/",
     "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
-    "The Guardian Tech": "https://www.theguardian.com/technology/rss",
-    "The Register AI": "https://www.theregister.com/software/ai_ml/headlines.atom",
-    "Tech Policy Press": "https://www.techpolicy.press/rss/",
-    "Financial Times Tech": "https://www.ft.com/technology?format=rss",
-    "The Hill Tech": "https://thehill.com/policy/technology/feed/"
-}
-GOV_FEEDS = {
-    "White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
-    "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
-    "DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
-    "DOE Office of Science": "https://science.osti.gov/RSS",
-    "NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
-    "NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
-    "CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
-    "FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
-    "GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss"
-}
-CALENDAR_FEEDS = {
-    "House Science RSS": "https://science.house.gov/hearings?rss=1",
-    "House Energy RSS": "https://energycommerce.house.gov/events?rss=1",
-    "House Foreign Affairs RSS": "https://foreignaffairs.house.gov/committee-activity/hearings/all?rss=1",
-    "Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
-    "Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
-    "Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
-    "DOE Events": "https://www.energy.gov/events/rss"
 }
-# --- AI SETUP & ANALYZER ---
 if HF_TOKEN:
     hf_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=HF_TOKEN)
 else:
     hf_client = None
-    print("Warning: No HF_TOKEN found. AI Triage will be bypassed.")
 def analyze_with_ai(title, summary, source, bill_text=""):
-    if not hf_client:
-        return "AI Triage disabled (No API Key).", "N/A"
     prompt = f"""
-    You are a D.C. AI policy analyst. Review this update. Simply provide the summary with no other additions:
     Source: {source}
     Title: {title}
     Summary: {summary}
     Raw Bill Text Excerpt: {bill_text if bill_text else 'N/A'}
-    RULES:
-    1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided text.
-    2. Provide a detailed, 2-to-3 sentence executive summary explaining the actual policy impact.
-    3. Extract 3 comma-separated keywords.
-    Format output EXACTLY as:
-    ANALYSIS: [Your 2-3 sentence summary here]
     KEYWORDS: [Words]
     """
     try:
         messages = [{"role": "user", "content": prompt}]
-        response = hf_client.chat_completion(messages, max_tokens=350, temperature=0.1, top_p=0.9)
         text = response.choices[0].message.content
-        analysis_match = re.search(r'ANALYSIS:\s*(.*?)(?=KEYWORDS:|$)', text, re.DOTALL)
-        analysis = analysis_match.group(1).strip() if analysis_match else "Could not generate analysis."
-        keywords_match = re.search(r'KEYWORDS:\s*(.*)', text)
-        keywords = keywords_match.group(1).strip() if keywords_match else "AI, Tech, Policy"
         return analysis.replace('\n', ' '), keywords
-    except Exception as e:
-        print(f"AI Error: {e}")
         return "Error during AI analysis.", "error"
-# --- STATE MANAGEMENT ---
 def load_db():
     if DB_FILE.exists():
-        with open(DB_FILE, "r") as f:
-            return json.load(f)
     return []
 def save_db(db):
-    db = db[-5000:]
-    with open(DB_FILE, "w") as f:
-        json.dump(db, f)
-def get_event_id(item):
-    link = item.get("link", "no_link")
-    action = item.get("latest_action", "no_action")
-    return f"{link} || {action}"
-def is_new_event(item, db):
-    return get_event_id(item) not in db
 def extract_robust_date(text_blocks):
     date_patterns = [
@@ -188,282 +126,146 @@ def extract_robust_date(text_blocks):
     for text in text_blocks:
         if not text: continue
         for pattern in date_patterns:
-            matches = re.findall(pattern, text, re.IGNORECASE)
-            for match in matches:
                 try:
                     clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
-                    parsed_date = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
-                    if 2024 <= parsed_date.year <= 2030:
-                        return parsed_date
-                except:
-                    continue
     return None
-# --- HTML DIRECT SCRAPER (SENATE CMS) ---
-DATE_RE = re.compile(r'\b(\d{2})\.(\d{2})\.(\d{4})\b')
-def _parse_senate_cms_date(text: str):
-    m = DATE_RE.search(text or "")
-    if not m: return None
-    try:
-        return datetime(int(m.group(3)), int(m.group(1)), int(m.group(2)))
-    except ValueError:
-        return None
-def _parse_senate_cms_page(html: str, base_url: str, source_name: str):
-    soup = BeautifulSoup(html, "html.parser")
-    results = []
-    seen_links = set()
-    listing_path = base_url.replace("https://", "").split("/", 1)[-1]
-    path_fragment = "/" + listing_path.split("/", 1)[-1]
-    for a_tag in soup.find_all("a", href=True):
-        href = a_tag["href"]
-        if not href.startswith(path_fragment + "/"):
-            continue
-        full_url = urljoin(base_url, href)
-        if full_url in seen_links: continue
-        seen_links.add(full_url)
-        title = a_tag.get_text(" ", strip=True)
-        if not title:
-            heading = a_tag.find(["h2", "h3"])
-            title = heading.get_text(" ", strip=True) if heading else "No Title"
-        if len(title) < 10: continue
-        fmt_date = None
-        parent = a_tag.parent
-        for _ in range(5):
-            parent_text = parent.get_text(" ", strip=True) if parent else ""
-            fmt_date = _parse_senate_cms_date(parent_text)
-            if fmt_date: break
-            parent = parent.parent if parent else None
-        if not fmt_date:
-            surrounding = a_tag.find_previous(string=DATE_RE)
-            fmt_date = _parse_senate_cms_date(surrounding) if surrounding else None
-        if not is_relevant(title): continue
-        if fmt_date:
-            days_old = (datetime.now() - fmt_date).days
-            if days_old > 60: continue
-        results.append({
-            "source": source_name,
-            "type": "Legislative Office Press Release",
-            "event_date": fmt_date or datetime.now(),
-            "time": "TBD",
-            "title": title,
-            "latest_action": "Published",
-            "link": full_url,
-            "summary": "HTML Scrape - Full text review pending."
-        })
-    return results
 def fetch_congress_scraped():
-    print("Scraping Congress HTML pages (no-RSS targets)...")
-    all_results = []
-    for name, url in CONGRESS_SCRAPE_TARGETS.items():
-        try:
-            r = scraper.get(url, timeout=15, headers={"Referer": "https://www.google.com/"})
-            if r.status_code != 200:
-                print(f"  --> {name}: HTTP {r.status_code}, skipping")
-                continue
-            items = _parse_senate_cms_page(r.text, url, name)
-            print(f"  --> {name}: Found {len(items)} relevant items")
-            all_results.extend(items)
-            time.sleep(1.5)
-        except Exception as e:
-            print(f"  --> {name}: Error — {e}")
-    return all_results
-# --- STANDARD API & RSS SCRAPERS ---
-def fetch_rss(feed_dict, source_type):
-    print(f"Scanning {source_type} RSS...")
     results = []
-    for name, url in feed_dict.items():
         try:
             r = scraper.get(url, timeout=15)
-            if r.status_code in [404, 410] and ".house.gov" in url:
-                root_url = url.split(".gov")[0] + ".gov/rss.xml"
-                r = scraper.get(root_url, timeout=10)
-            if r.status_code != 200:
-                print(f"--> {name}: Access Denied/Missing ({r.status_code})")
-                continue
-            feed = feedparser.parse(r.content)
-            for entry in feed.entries[:20]:
-                title = entry.get("title", "No Title")
-                summary = entry.get("description", "")
-                link = entry.get("link", url)
-                if not is_relevant(title, summary): continue
-                if hasattr(entry, 'published_parsed') and entry.published_parsed:
-                    fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
-                else:
-                    fmt_date = extract_robust_date([title, summary]) or datetime.now()
                 results.append({
-                    "source": name, "type": source_type, "event_date": fmt_date,
-                    "time": "TBD", "title": title, "latest_action": "Published",
-                    "link": link, "summary": summary[:200]
                 })
             time.sleep(1)
         except Exception as e:
-            print(f"Error fetching {name}: {e}")
     return results
-def fetch_master_schedules():
-    print("Scanning Master Schedules...")
     results = []
-    today = datetime.now()
-    monday = today - timedelta(days=today.weekday())
-    SCHEDULE_URLS = {
-        "House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
-        "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
-        "Congress Weekly": f"https://www.congress.gov/committee-schedule/weekly/{monday.strftime('%Y/%m/%d')}"
-    }
-    for source_name, url in SCHEDULE_URLS.items():
         try:
             r = scraper.get(url, timeout=15)
             if r.status_code != 200: continue
-            soup = BeautifulSoup(r.text, "html.parser")
-            for container in soup.find_all(["tr", "li", "div", "p"]):
-                text_content = container.get_text(" ", strip=True)
-                if len(text_content) < 30 or len(text_content) > 1500: continue
-                if not is_relevant(text_content): continue
-                if any(res['summary'][:50] == text_content[:50] for res in results): continue
-                a_tag = container.find("a", href=True)
-                item_link = urljoin(url, a_tag['href']) if a_tag else url
-                fmt_date = extract_robust_date([text_content]) or today
                 results.append({
-                    "source": source_name, "type": "Schedule/Hearing", "event_date": fmt_date,
-                    "time": "Scheduled", "title": text_content[:120] + "...",
-                    "latest_action": "On Master Schedule", "link": item_link, "summary": text_content[:300]
                 })
             time.sleep(1)
-        except Exception as e:
-            print(f"Error scraping {source_name}: {e}")
     return results
 def fetch_federal_register():
     print("Scanning Federal Register API...")
     results = []
-    url = "https://www.federalregister.gov/api/v1/documents.json"
-    params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}
     try:
-        r = requests.get(url, params=params, timeout=15)
         if r.status_code == 200:
             for doc in r.json().get("results", []):
-                title = doc.get("title", "No Title")
-                summary = doc.get("abstract", "No summary provided.")
                 pub_date = doc.get("publication_date")
                 fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()
                 results.append({
-                    "source": doc.get("agency_names", ["Federal Register"])[0],
-                    "type": "Federal/Exec Action", "event_date": fmt_date,
-                    "time": "Published", "title": title, "latest_action": doc.get("type", "Notice"),
-                    "link": doc.get("html_url", ""), "summary": str(summary)[:300]
                 })
-        time.sleep(1)
-    except Exception as e:
-        print(f"Federal Register API Error: {e}")
-    return results
-def fetch_bill_text(congress, bill_type, bill_number):
-    if not CONGRESS_API_KEY: return ""
-    try:
-        url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
-        headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
-        r = requests.get(url, headers=headers, timeout=10)
-        if r.status_code == 200:
-            versions = r.json().get("textVersions", [])
-            if versions and versions[0].get("formats"):
-                text_url = versions[0]["formats"][0].get("url")
-                if text_url:
-                    text_req = requests.get(text_url, headers=headers, timeout=10)
-                    return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
     except: pass
-    return ""
-def fetch_legislation(target=1000):
     print("Scanning Legislation API...")
     if not CONGRESS_API_KEY: return []
     results = []
     headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
-    BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
-    for offset in range(0, target, 250):
-        try:
-            r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}, headers=headers, timeout=20)
-            if r.status_code != 200: break
-            bills = r.json().get("bills", [])
-            if not bills: break
-            for b in bills:
                 if not is_relevant(b.get("title", "")): continue
-                action_data = b.get("latestAction", {})
-                action_date_raw = action_data.get("actionDate") or b.get("updateDate")
                 fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
-                raw_type = b.get("type", "HR").upper()
-                proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
                 results.append({
                     "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
                     "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
-                    "latest_action": action_data.get("text", "Active"), "link": proper_link,
-                    "summary": "Legislative movement tracked via API.", "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
                 })
-            time.sleep(1.5)
-        except Exception as e: break
     return results
-# --- MAIN EXECUTION ---
 def run():
     db = load_db()
     raw_data = []
-    # Run all our data gatherers
-    raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
-    raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
-    raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
-    raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
-    raw_data.extend(fetch_congress_scraped()) # The new direct HTML targets!
-    raw_data.extend(fetch_federal_register())
-    raw_data.extend(fetch_master_schedules())
-    raw_data.extend(fetch_legislation())
     new_items = []
     for item in raw_data:
-        if is_new_event(item, db):
             print(f"Triaging new item: {item['title'][:40]}...")
-            bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
-            analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
             item["analysis"] = analysis
             item["keywords"] = keywords
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
             new_items.append(item)
-            db.append(get_event_id(item))
     if new_items:
         df_new = pd.DataFrame(new_items)
         if CSV_PATH.exists():
             df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
-            df_combined = pd.concat([df_existing, df_new], ignore_index=True)
         else:
             df_combined = df_new
-        df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
         df_combined.to_csv(CSV_PATH, index=False)
         save_db(db)
         print(f"Added {len(new_items)} new items.")

 import json
 import re
 import time
+from datetime import datetime
 from pathlib import Path
 from dateutil import parser as date_parser
 from urllib.parse import urljoin
     "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
     "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
     "chatbot", "facial recognition", "biometric", "open-source", "open source ai",
+    "foundation model"
 ]
 def is_relevant(title, summary=""):
         return True
     return False
+# --- THE VERIFIED BASELINE TARGETS ---
+# 1. The Verified Lawmaker HTML Pages
 CONGRESS_SCRAPE_TARGETS = {
+    "Sen. Young": "https://www.young.senate.gov/newsroom/press-releases/",
+    "Rep. Moore": "https://blakemoore.house.gov/media/press-releases",
+    "Sen. Kim": "https://www.kim.senate.gov/press-releases/",
+    "Rep. Beyer": "https://beyer.house.gov/news/",
+    "Rep. Lieu": "https://lieu.house.gov/media-center/press-releases"
 }
+# 2. Reliable Tech/Policy RSS Feeds
 NEWS_FEEDS = {
     "Politico Tech": "https://rss.politico.com/technology.xml",
     "Axios Tech": "https://www.axios.com/feeds/feed.rss",
+    "Tech Policy Press": "https://www.techpolicy.press/rss/",
+    "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
     "The Verge Tech": "https://www.theverge.com/rss/index.xml",
     "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
+    "The Hill Tech": "https://thehill.com/policy/technology/feed/",
+    "FedScoop": "https://fedscoop.com/feed/",
+    "Defense One Tech": "https://www.defenseone.com/rss/technology/",
+    "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"
 }
+# --- AI SETUP ---
 if HF_TOKEN:
     hf_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=HF_TOKEN)
 else:
     hf_client = None
 def analyze_with_ai(title, summary, source, bill_text=""):
+    if not hf_client: return "AI Triage disabled.", "N/A"
     prompt = f"""
+    You are a D.C. AI policy analyst. Review this update.
     Source: {source}
     Title: {title}
     Summary: {summary}
     Raw Bill Text Excerpt: {bill_text if bill_text else 'N/A'}
+    RULES: Provide a 2-3 sentence executive summary explaining the impact. Extract 3 comma-separated keywords.
+    Format EXACTLY as:
+    ANALYSIS: [Summary]
     KEYWORDS: [Words]
     """
     try:
         messages = [{"role": "user", "content": prompt}]
+        response = hf_client.chat_completion(messages, max_tokens=250, temperature=0.1)
         text = response.choices[0].message.content
+        analysis = re.search(r'ANALYSIS:\s*(.*?)(?=KEYWORDS:|$)', text, re.DOTALL).group(1).strip()
+        keywords = re.search(r'KEYWORDS:\s*(.*)', text).group(1).strip()
         return analysis.replace('\n', ' '), keywords
+    except:
         return "Error during AI analysis.", "error"
+# --- CORE UTILITIES ---
 def load_db():
     if DB_FILE.exists():
+        with open(DB_FILE, "r") as f: return json.load(f)
     return []
 def save_db(db):
+    with open(DB_FILE, "w") as f: json.dump(db[-5000:], f)
 def extract_robust_date(text_blocks):
     date_patterns = [
     for text in text_blocks:
         if not text: continue
         for pattern in date_patterns:
+            for match in re.findall(pattern, text, re.IGNORECASE):
                 try:
                     clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
+                    parsed = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
+                    if 2024 <= parsed.year <= 2030: return parsed
+                except: continue
     return None
+# --- DATA GATHERING ENGINES ---
 def fetch_congress_scraped():
+    print("Scanning Verified Lawmaker HTML Pages...")
     results = []
+    for name, url in CONGRESS_SCRAPE_TARGETS.items():
         try:
             r = scraper.get(url, timeout=15)
+            if r.status_code != 200: continue
+            soup = BeautifulSoup(r.text, "html.parser")
+            seen_links = set()
+            for a_tag in soup.find_all("a", href=True):
+                href = a_tag["href"]
+                if any(skip in href.lower() for skip in ['#', 'javascript:', 'page=', 'category=']): continue
+                full_url = urljoin(url, href)
+                if full_url in seen_links or full_url == url: continue
+                title = a_tag.get_text(" ", strip=True)
+                if not title:
+                    heading = a_tag.find(["h2", "h3", "h4"])
+                    title = heading.get_text(" ", strip=True) if heading else ""
+                if len(title) < 15 or not is_relevant(title): continue
+                seen_links.add(full_url)
+                parent_text = a_tag.parent.get_text(" ", strip=True) if a_tag.parent else ""
+                fmt_date = extract_robust_date([parent_text, title]) or datetime.now()
                 results.append({
+                    "source": name, "type": "Legislative Office Press Release",
+                    "event_date": fmt_date, "time": "Published", "title": title,
+                    "latest_action": "Web Publication", "link": full_url, "summary": "HTML Scrape"
                 })
             time.sleep(1)
         except Exception as e:
+            print(f"  --> {name}: Error — {e}")
     return results
def _entry_published(entry):
    """Return the feed-supplied timestamp of *entry* as a naive datetime, or None."""
    # feedparser documents the *_parsed fields as 9-tuples in UTC.
    for key in ("published_parsed", "updated_parsed"):
        stamp = entry.get(key)
        if not stamp:
            continue
        try:
            return datetime(*stamp[:6])
        except (TypeError, ValueError):
            continue
    return None


def fetch_rss(feed_dict, source_type):
    """Collect keyword-relevant entries from a set of RSS/Atom feeds.

    Args:
        feed_dict: Mapping of display name -> feed URL.
        source_type: Label stored in each result's ``"type"`` field.

    Returns:
        A list of result dicts built from up to the first 15 entries of
        each feed. ``event_date`` prefers the feed's own published/updated
        timestamp, then a date found in the title/summary text, and only
        then the current time.
    """
    print(f"Scanning {source_type} RSS...")
    results = []
    for name, url in feed_dict.items():
        try:
            r = scraper.get(url, timeout=15)
            if r.status_code != 200:
                print(f"--> {name}: HTTP {r.status_code}, skipping")
                continue
            feed = feedparser.parse(r.content)
            for entry in feed.entries[:15]:
                title = entry.get("title", "")
                summary = entry.get("description", "")
                if not is_relevant(title, summary):
                    continue
                fmt_date = (
                    _entry_published(entry)
                    or extract_robust_date([title, summary])
                    or datetime.now()
                )
                results.append({
                    "source": name, "type": source_type, "event_date": fmt_date,
                    "time": "Published", "title": title, "latest_action": "Published",
                    "link": entry.get("link", url), "summary": summary[:300]
                })
            time.sleep(1)
        except Exception as e:
            print(f"Error {name}: {e}")
    return results
 def fetch_federal_register():
     print("Scanning Federal Register API...")
     results = []
     try:
+        r = requests.get("https://www.federalregister.gov/api/v1/documents.json", params={"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}, timeout=15)
         if r.status_code == 200:
             for doc in r.json().get("results", []):
                 pub_date = doc.get("publication_date")
                 fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()
                 results.append({
+                    "source": doc.get("agency_names", ["Federal Register"])[0], "type": "Federal/Exec Action",
+                    "event_date": fmt_date, "time": "Published", "title": doc.get("title", "No Title"),
+                    "latest_action": doc.get("type", "Notice"), "link": doc.get("html_url", ""), "summary": str(doc.get("abstract", ""))[:300]
                 })
     except: pass
+    return results
# congress.gov URL path segment for each bill type code returned by the API.
_BILL_URL_SLUGS = {
    "HR": "house-bill",
    "S": "senate-bill",
    "HRES": "house-resolution",
    "SRES": "senate-resolution",
    "HJRES": "house-joint-resolution",
    "SJRES": "senate-joint-resolution",
    "HCONRES": "house-concurrent-resolution",
    "SCONRES": "senate-concurrent-resolution",
}


def _ordinal(number):
    """Return *number* with its English ordinal suffix (119 -> '119th', 121 -> '121st')."""
    try:
        value = int(number)
    except (TypeError, ValueError):
        return f"{number}th"
    if 10 <= value % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(value % 10, "th")
    return f"{value}{suffix}"


def fetch_legislation():
    """Fetch the 100 most recently updated bills and keep the relevant ones.

    Returns:
        A list of result dicts, or an empty list when no API key is
        configured or the request fails. Each ``link`` points at the
        congress.gov page for the bill's own chamber/type; unknown type
        codes fall back to ``house-bill``.
    """
    print("Scanning Legislation API...")
    if not CONGRESS_API_KEY: return []
    results = []
    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    try:
        r = requests.get(
            f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}",
            params={"limit": 100, "format": "json", "sort": "updateDate desc"},
            headers=headers, timeout=20,
        )
        if r.status_code != 200:
            print(f"Legislation API: HTTP {r.status_code}")
            return results
        for b in r.json().get("bills", []):
            if not is_relevant(b.get("title", "")): continue
            # ``or {}`` guards against an explicit null latestAction.
            action = b.get("latestAction") or {}
            action_date_raw = action.get("actionDate") or b.get("updateDate")
            fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
            slug = _BILL_URL_SLUGS.get(str(b.get("type", "HR")).upper(), "house-bill")
            proper_link = f"https://www.congress.gov/bill/{_ordinal(CURRENT_CONGRESS)}-congress/{slug}/{b.get('number')}"
            results.append({
                "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
                "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
                "latest_action": action.get("text", "Active"), "link": proper_link,
                "summary": "Legislative movement tracked via API."
            })
    except Exception as e:
        # Keep whatever was collected, but make the failure visible.
        print(f"Legislation API Error: {e}")
    return results
+# --- MAIN RUNNER ---
 def run():
     db = load_db()
     raw_data = []
+    # Run the 4 basic, verified engines
+    raw_data.extend(fetch_congress_scraped())  # The 5 HTML Pages
+    raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media")) # Clean Tech RSS
+    raw_data.extend(fetch_federal_register())  # Clean Exec API
+    raw_data.extend(fetch_legislation())       # Clean Congress API
     new_items = []
     for item in raw_data:
+        # Check against db
+        event_id = f"{item.get('link', 'no_link')} || {item.get('latest_action', 'no_action')}"
+        if event_id not in db:
             print(f"Triaging new item: {item['title'][:40]}...")
+            analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"])
             item["analysis"] = analysis
             item["keywords"] = keywords
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
             new_items.append(item)
+            db.append(event_id)
     if new_items:
         df_new = pd.DataFrame(new_items)
         if CSV_PATH.exists():
             df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
+            df_combined = pd.concat([df_existing, df_new], ignore_index=True).drop_duplicates(subset=['link', 'latest_action'], keep='first')
         else:
             df_combined = df_new
         df_combined.to_csv(CSV_PATH, index=False)
         save_db(db)
         print(f"Added {len(new_items)} new items.")