IJ-Reynolds HF Staff committed on
Commit
f845243
·
verified ·
1 Parent(s): de6cb5f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +158 -121
main.py CHANGED
@@ -28,7 +28,6 @@ else:
28
  DB_FILE = BASE_DIR / "seen_events.json"
29
 
30
  # --- STEALTH SCRAPER SETUP ---
31
- # Mimics a real browser handshake to bypass Cloudflare/Akamai
32
  scraper = cloudscraper.create_scraper(
33
  browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True},
34
  interpreter='js2py'
@@ -52,53 +51,59 @@ def is_relevant(title, summary=""):
52
  return True
53
  return False
54
 
55
- # --- FEEDS DICTIONARIES ---
56
- # --- FEEDS DICTIONARIES ---
57
 
 
58
  CONGRESS_PRESS_FEEDS = {
59
- "Sen. Cruz (Commerce Chair)": "http://commerce.senate.gov/public/?a=RSS.Feed",
60
- "Sen. Schumer (AI Lead)": "https://www.democrats.senate.gov/newsroom/press-releases/feed/",
61
  "Sen. Young (AI Caucus)": "https://www.young.senate.gov/newsroom/press-releases/feed/",
62
  "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/press-releases/feed/",
63
  "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
64
  "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/rss.xml",
65
- "Rep. Moore (UT)": "https://blakemoore.house.gov/rss.xml",
 
 
 
 
 
 
 
 
 
66
  }
67
 
68
  NEWS_FEEDS = {
69
  "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
70
- "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
71
  "WashPost Tech": "https://feeds.washingtonpost.com/rss/business/technology",
72
  "MIT Tech Review": "https://www.technologyreview.com/topic/artificial-intelligence/feed/",
73
  "Politico Tech": "https://rss.politico.com/technology.xml",
74
  "Ars Technica Policy": "https://feeds.arstechnica.com/arstechnica/tech-policy",
75
- "Axios Tech": "https://api.axios.com/api/render/feed/technology",
76
  "FedScoop": "https://fedscoop.com/feed/",
77
- "Defense One Tech": "https://www.defenseone.com/rss/all/",
78
  "Nextgov/FCW": "https://www.nextgov.com/rss/all/",
79
  "TechCrunch AI": "https://techcrunch.com/category/artificial-intelligence/feed/",
80
- "The Verge Tech": "https://www.theverge.com/rss/tech/index.xml",
81
- "WSJ Technology": "https://feeds.content.dowjones.io/public/rss/RSSWSJD",
82
- "SF Chronicle Tech": "https://www.sfchronicle.com/rss/feed/",
83
  "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
84
  "The Guardian Tech": "https://www.theguardian.com/technology/rss",
85
  "The Register AI": "https://www.theregister.com/software/ai_ml/headlines.atom",
86
  "Tech Policy Press": "https://www.techpolicy.press/rss/",
87
- "Financial Times Tech": "https://www.ft.com/technology?format=rss", # may hit paywall
88
- "The Hill Tech": "https://thehill.com/policy/technology/feed/",
89
  }
90
 
91
  GOV_FEEDS = {
92
- "White House OSTP": "https://www.whitehouse.gov/ostp/feed/", # monitor — OSTP restructured
93
  "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
94
  "DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
95
  "DOE Office of Science": "https://science.osti.gov/RSS",
96
- "Federal Register (AI Postings)": "https://www.federalregister.gov/documents/search.rss?conditions%5Bterm%5D=artificial+intelligence",
97
  "NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
98
  "NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
99
  "CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
100
  "FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
101
- "GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss",
102
  }
103
 
104
  CALENDAR_FEEDS = {
@@ -108,7 +113,7 @@ CALENDAR_FEEDS = {
108
  "Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
109
  "Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
110
  "Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
111
- "DOE Events": "https://www.energy.gov/events/rss",
112
  }
113
 
114
  # --- AI SETUP & ANALYZER ---
@@ -123,7 +128,7 @@ def analyze_with_ai(title, summary, source, bill_text=""):
123
  return "AI Triage disabled (No API Key).", "N/A"
124
 
125
  prompt = f"""
126
- Review this data. Simply provide a summary with no other additions:
127
  Source: {source}
128
  Title: {title}
129
  Summary: {summary}
@@ -131,7 +136,7 @@ def analyze_with_ai(title, summary, source, bill_text=""):
131
 
132
  RULES:
133
  1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided text.
134
- 2. Provide a detailed, 2-to-3 sentence summary.
135
  3. Extract 3 comma-separated keywords.
136
 
137
  Format output EXACTLY as:
@@ -174,7 +179,6 @@ def get_event_id(item):
174
  def is_new_event(item, db):
175
  return get_event_id(item) not in db
176
 
177
- # --- DATE EXTRACTOR ---
178
  def extract_robust_date(text_blocks):
179
  date_patterns = [
180
  r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
@@ -195,14 +199,95 @@ def extract_robust_date(text_blocks):
195
  continue
196
  return None
197
 
198
- # --- SCRAPERS ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  def fetch_rss(feed_dict, source_type):
200
  print(f"Scanning {source_type} RSS...")
201
  results = []
202
  for name, url in feed_dict.items():
203
  try:
204
  r = scraper.get(url, timeout=15)
205
-
206
  if r.status_code in [404, 410] and ".house.gov" in url:
207
  root_url = url.split(".gov")[0] + ".gov/rss.xml"
208
  r = scraper.get(root_url, timeout=10)
@@ -212,32 +297,18 @@ def fetch_rss(feed_dict, source_type):
212
  continue
213
 
214
  feed = feedparser.parse(r.content)
215
-
216
  for entry in feed.entries[:20]:
217
  title = entry.get("title", "No Title")
218
  summary = entry.get("description", "")
219
  link = entry.get("link", url)
220
 
221
- if not is_relevant(title, summary):
222
- continue
223
-
224
- url_year_match = re.search(r'/(20\d{2})/', link)
225
- if url_year_match:
226
- url_year = int(url_year_match.group(1))
227
- curr_year = datetime.now().year
228
- curr_month = datetime.now().month
229
- if url_year < curr_year and curr_month > 2: continue
230
- if url_year < curr_year - 1: continue
231
 
232
  if hasattr(entry, 'published_parsed') and entry.published_parsed:
233
  fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
234
  else:
235
- fmt_date = extract_robust_date([title, summary])
236
 
237
- if fmt_date:
238
- days_old = (datetime.now().replace(tzinfo=None) - fmt_date).days
239
- if days_old > 60: continue
240
-
241
  results.append({
242
  "source": name, "type": source_type, "event_date": fmt_date,
243
  "time": "TBD", "title": title, "latest_action": "Published",
@@ -252,11 +323,11 @@ def fetch_master_schedules():
252
  print("Scanning Master Schedules...")
253
  results = []
254
  today = datetime.now()
255
- monday_of_week = today - timedelta(days=today.weekday())
256
  SCHEDULE_URLS = {
257
  "House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
258
  "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
259
- "Congress Weekly": f"https://www.congress.gov/committee-schedule/weekly/{monday_of_week.strftime('%Y/%m/%d')}"
260
  }
261
  for source_name, url in SCHEDULE_URLS.items():
262
  try:
@@ -271,10 +342,8 @@ def fetch_master_schedules():
271
 
272
  a_tag = container.find("a", href=True)
273
  item_link = urljoin(url, a_tag['href']) if a_tag else url
274
- time_node = container.find("time")
275
- time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
276
 
277
- fmt_date = extract_robust_date([time_text, text_content]) or today.replace(hour=9, minute=0, second=0, microsecond=0)
278
  results.append({
279
  "source": source_name, "type": "Schedule/Hearing", "event_date": fmt_date,
280
  "time": "Scheduled", "title": text_content[:120] + "...",
@@ -285,22 +354,45 @@ def fetch_master_schedules():
285
  print(f"Error scraping {source_name}: {e}")
286
  return results
287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  def fetch_bill_text(congress, bill_type, bill_number):
289
  if not CONGRESS_API_KEY: return ""
290
- url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
291
- headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
292
  try:
 
 
293
  r = requests.get(url, headers=headers, timeout=10)
294
- if r.status_code != 200: return ""
295
- versions = r.json().get("textVersions", [])
296
- if not versions: return ""
297
- for fmt in versions[0].get("formats", []):
298
- if text_url := fmt.get("url"):
299
- text_req = requests.get(text_url, headers=headers, timeout=10)
300
- if text_req.status_code == 200:
301
  return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
302
- except Exception as e:
303
- print(f"Failed to fetch text for {bill_type}{bill_number}: {e}")
304
  return ""
305
 
306
  def fetch_legislation(target=1000):
@@ -309,7 +401,6 @@ def fetch_legislation(target=1000):
309
  results = []
310
  headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
311
  BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
312
-
313
  for offset in range(0, target, 250):
314
  try:
315
  r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}, headers=headers, timeout=20)
@@ -317,78 +408,21 @@ def fetch_legislation(target=1000):
317
  bills = r.json().get("bills", [])
318
  if not bills: break
319
  for b in bills:
320
- title = b.get("title", "")
321
- if not is_relevant(title): continue
322
-
323
  action_data = b.get("latestAction", {})
324
- action_text = action_data.get("text", "Active")
325
  action_date_raw = action_data.get("actionDate") or b.get("updateDate")
326
- fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else None
327
-
328
  raw_type = b.get("type", "HR").upper()
329
  proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
 
330
  results.append({
331
  "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
332
- "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {title}",
333
- "latest_action": action_text, "link": proper_link, "summary": "Legislative movement tracked via API.",
334
- "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
335
  })
336
  time.sleep(1.5)
337
- except Exception as e:
338
- print(f"Legislation API Error: {e}")
339
- break
340
- return results
341
-
342
- def fetch_federal_register():
343
- print("Scanning Federal Register API...")
344
- results = []
345
- url = "https://www.federalregister.gov/api/v1/documents.json"
346
-
347
- # FIX: Simplify the search term. Complex boolean strings break their URL parser.
348
- params = {
349
- "conditions[term]": "artificial intelligence",
350
- "order": "newest",
351
- "per_page": 10
352
- }
353
-
354
- try:
355
- r = requests.get(url, params=params, timeout=15)
356
- if r.status_code != 200:
357
- print(f"--> Federal Register API returned status {r.status_code}")
358
- return results
359
-
360
- data = r.json()
361
- items = data.get("results", [])
362
-
363
- # VERIFICATION: This will print the exact number of documents found to your terminal
364
- print(f"--> Federal Register API: Found {len(items)} items.")
365
-
366
- for doc in items:
367
- title = doc.get("title", "No Title")
368
- summary = doc.get("abstract", "No summary provided.")
369
- link = doc.get("html_url", "")
370
- action_type = doc.get("type", "Notice")
371
-
372
- agencies = doc.get("agency_names", ["Federal Agency"])
373
- primary_agency = agencies[0] if agencies else "Federal Register"
374
-
375
- pub_date = doc.get("publication_date")
376
- fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()
377
-
378
- results.append({
379
- "source": primary_agency,
380
- "type": "Federal/Exec Action",
381
- "event_date": fmt_date,
382
- "time": "Published",
383
- "title": title,
384
- "latest_action": action_type,
385
- "link": link,
386
- "summary": str(summary)[:300]
387
- })
388
- time.sleep(1)
389
- except Exception as e:
390
- print(f"Federal Register API Error: {e}")
391
-
392
  return results
393
 
394
  # --- MAIN EXECUTION ---
@@ -396,13 +430,16 @@ def run():
396
  db = load_db()
397
  raw_data = []
398
 
 
399
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
400
  raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
401
  raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
402
  raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
 
 
 
403
  raw_data.extend(fetch_master_schedules())
404
  raw_data.extend(fetch_legislation())
405
- raw_data.extend(fetch_federal_register())
406
 
407
  new_items = []
408
  for item in raw_data:
 
28
  DB_FILE = BASE_DIR / "seen_events.json"
29
 
30
  # --- STEALTH SCRAPER SETUP ---
 
31
  scraper = cloudscraper.create_scraper(
32
  browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True},
33
  interpreter='js2py'
 
51
  return True
52
  return False
53
 
54
# --- FEEDS & TARGET DICTIONARIES ---

# Members with working RSS/Feeds
# Display name -> feed URL; consumed by fetch_rss(CONGRESS_PRESS_FEEDS, ...) in run().
CONGRESS_PRESS_FEEDS = {
    "Sen. Young (AI Caucus)": "https://www.young.senate.gov/newsroom/press-releases/feed/",
    "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/press-releases/feed/",
    "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
    "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/rss.xml",
    "Rep. Moore (UT)": "https://blakemoore.house.gov/rss.xml"
}
64
+
65
# Members who block RSS - HTML Scrape Targets
# Display name -> listing-page URL; consumed by fetch_congress_scraped(), which
# scrapes the HTML directly via _parse_senate_cms_page().
CONGRESS_SCRAPE_TARGETS = {
    "Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/newsroom/press-releases",
    "Sen. Schumer (Dem Leader/AI)": "https://www.schumer.senate.gov/newsroom/press-releases",
    "Sen. Heinrich (AI Caucus)": "https://www.heinrich.senate.gov/newsroom/press-releases",
    "Sen. Rounds (AI Caucus)": "https://www.rounds.senate.gov/newsroom/press-releases",
    "Sen. Cantwell (Commerce RM)": "https://www.cantwell.senate.gov/newsroom/press-releases"
}
73
 
74
# Media outlet name -> RSS/Atom URL; consumed by fetch_rss(NEWS_FEEDS, "News/Media") in run().
NEWS_FEEDS = {
    "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
    "WashPost Tech": "https://feeds.washingtonpost.com/rss/business/technology",
    "MIT Tech Review": "https://www.technologyreview.com/topic/artificial-intelligence/feed/",
    "Politico Tech": "https://rss.politico.com/technology.xml",
    "Ars Technica Policy": "https://feeds.arstechnica.com/arstechnica/tech-policy",
    "Axios Tech": "https://www.axios.com/feeds/feed.rss",
    "FedScoop": "https://fedscoop.com/feed/",
    "Defense One Tech": "https://www.defenseone.com/rss/technology/",
    "Nextgov/FCW": "https://www.nextgov.com/rss/all/",
    "TechCrunch AI": "https://techcrunch.com/category/artificial-intelligence/feed/",
    "The Verge Tech": "https://www.theverge.com/rss/index.xml",
    "WSJ Technology": "https://feeds.content.dowjones.io/public/rss/MW_Tech",
    "SF Chronicle Tech": "https://www.sfchronicle.com/projects/feed/tech-news-rss/",
    "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
    "The Guardian Tech": "https://www.theguardian.com/technology/rss",
    "The Register AI": "https://www.theregister.com/software/ai_ml/headlines.atom",
    "Tech Policy Press": "https://www.techpolicy.press/rss/",
    "Financial Times Tech": "https://www.ft.com/technology?format=rss",  # NOTE(review): may be paywalled — verify feed access
    "The Hill Tech": "https://thehill.com/policy/technology/feed/"
}
96
 
97
# Agency name -> RSS URL; consumed by fetch_rss(GOV_FEEDS, "Federal/Exec Action") in run().
GOV_FEEDS = {
    "White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
    "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
    "DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
    "DOE Office of Science": "https://science.osti.gov/RSS",
    "NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
    "NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
    "CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
    "FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
    "GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss"
}
108
 
109
  CALENDAR_FEEDS = {
 
113
  "Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
114
  "Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
115
  "Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
116
+ "DOE Events": "https://www.energy.gov/events/rss"
117
  }
118
 
119
  # --- AI SETUP & ANALYZER ---
 
128
  return "AI Triage disabled (No API Key).", "N/A"
129
 
130
  prompt = f"""
131
+ You are a D.C. AI policy analyst. Review this update. Simply provide the summary with no other additions:
132
  Source: {source}
133
  Title: {title}
134
  Summary: {summary}
 
136
 
137
  RULES:
138
  1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided text.
139
+ 2. Provide a detailed, 2-to-3 sentence executive summary explaining the actual policy impact.
140
  3. Extract 3 comma-separated keywords.
141
 
142
  Format output EXACTLY as:
 
179
  def is_new_event(item, db):
180
  return get_event_id(item) not in db
181
 
 
182
  def extract_robust_date(text_blocks):
183
  date_patterns = [
184
  r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
 
199
  continue
200
  return None
201
 
202
# --- HTML DIRECT SCRAPER (SENATE CMS) ---
DATE_RE = re.compile(r'\b(\d{2})\.(\d{2})\.(\d{4})\b')

def _parse_senate_cms_date(text: str):
    """Extract an MM.DD.YYYY date (Senate CMS listing style) from *text*.

    Returns a naive datetime, or None when no date is found or the digits
    do not form a real calendar date (e.g. month 13).
    """
    match = DATE_RE.search(text or "")
    if match is None:
        return None
    month, day, year = (int(part) for part in match.groups())
    try:
        return datetime(year, month, day)
    except ValueError:
        # Pattern matched but values are not a valid date
        return None
212
+
213
def _parse_senate_cms_page(html: str, base_url: str, source_name: str):
    """Parse one Senate CMS press-release listing page into normalized event dicts.

    Heuristics: release links are anchors whose href starts with the listing
    page's final path segment; dates are MM.DD.YYYY strings found in nearby
    ancestor text (or the nearest preceding text node matching DATE_RE).
    Items older than 60 days or failing is_relevant() are dropped.
    """
    soup = BeautifulSoup(html, "html.parser")
    results = []
    seen_links = set()  # dedupe repeated anchors pointing at the same release

    # e.g. "https://www.cruz.senate.gov/newsroom/press-releases" -> "/press-releases"
    listing_path = base_url.replace("https://", "").split("/", 1)[-1]
    path_fragment = "/" + listing_path.split("/", 1)[-1]

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        # Only individual releases live under the listing path fragment
        if not href.startswith(path_fragment + "/"):
            continue

        full_url = urljoin(base_url, href)
        if full_url in seen_links: continue
        seen_links.add(full_url)

        title = a_tag.get_text(" ", strip=True)
        if not title:
            # Some page themes wrap the headline in an h2/h3 inside the anchor
            heading = a_tag.find(["h2", "h3"])
            title = heading.get_text(" ", strip=True) if heading else "No Title"

        if len(title) < 10: continue  # skip nav links / "Read more" stubs

        # Walk up to 5 ancestor levels looking for an MM.DD.YYYY date near the link
        fmt_date = None
        parent = a_tag.parent
        for _ in range(5):
            parent_text = parent.get_text(" ", strip=True) if parent else ""
            fmt_date = _parse_senate_cms_date(parent_text)
            if fmt_date: break
            parent = parent.parent if parent else None

        if not fmt_date:
            # Fallback: nearest preceding text node that contains a date
            surrounding = a_tag.find_previous(string=DATE_RE)
            fmt_date = _parse_senate_cms_date(surrounding) if surrounding else None

        if not is_relevant(title): continue

        if fmt_date:
            days_old = (datetime.now() - fmt_date).days
            if days_old > 60: continue  # drop stale releases

        results.append({
            "source": source_name,
            "type": "Legislative Office Press Release",
            "event_date": fmt_date or datetime.now(),
            "time": "TBD",
            "title": title,
            "latest_action": "Published",
            "link": full_url,
            "summary": "HTML Scrape - Full text review pending."
        })
    return results
266
+
267
def fetch_congress_scraped():
    """Scrape press-release listing pages for members whose sites block RSS.

    Iterates CONGRESS_SCRAPE_TARGETS, fetches each page through the stealth
    scraper, and delegates HTML parsing to _parse_senate_cms_page().
    """
    print("Scraping Congress HTML pages (no-RSS targets)...")
    collected = []
    referer_headers = {"Referer": "https://www.google.com/"}
    for member, page_url in CONGRESS_SCRAPE_TARGETS.items():
        try:
            resp = scraper.get(page_url, timeout=15, headers=referer_headers)
            if resp.status_code != 200:
                print(f" --> {member}: HTTP {resp.status_code}, skipping")
                continue
            found = _parse_senate_cms_page(resp.text, page_url, member)
            print(f" --> {member}: Found {len(found)} relevant items")
            collected.extend(found)
            time.sleep(1.5)  # pause between member sites
        except Exception as exc:
            print(f" --> {member}: Error — {exc}")
    return collected
283
+
284
+ # --- STANDARD API & RSS SCRAPERS ---
285
  def fetch_rss(feed_dict, source_type):
286
  print(f"Scanning {source_type} RSS...")
287
  results = []
288
  for name, url in feed_dict.items():
289
  try:
290
  r = scraper.get(url, timeout=15)
 
291
  if r.status_code in [404, 410] and ".house.gov" in url:
292
  root_url = url.split(".gov")[0] + ".gov/rss.xml"
293
  r = scraper.get(root_url, timeout=10)
 
297
  continue
298
 
299
  feed = feedparser.parse(r.content)
 
300
  for entry in feed.entries[:20]:
301
  title = entry.get("title", "No Title")
302
  summary = entry.get("description", "")
303
  link = entry.get("link", url)
304
 
305
+ if not is_relevant(title, summary): continue
 
 
 
 
 
 
 
 
 
306
 
307
  if hasattr(entry, 'published_parsed') and entry.published_parsed:
308
  fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
309
  else:
310
+ fmt_date = extract_robust_date([title, summary]) or datetime.now()
311
 
 
 
 
 
312
  results.append({
313
  "source": name, "type": source_type, "event_date": fmt_date,
314
  "time": "TBD", "title": title, "latest_action": "Published",
 
323
  print("Scanning Master Schedules...")
324
  results = []
325
  today = datetime.now()
326
+ monday = today - timedelta(days=today.weekday())
327
  SCHEDULE_URLS = {
328
  "House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
329
  "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
330
+ "Congress Weekly": f"https://www.congress.gov/committee-schedule/weekly/{monday.strftime('%Y/%m/%d')}"
331
  }
332
  for source_name, url in SCHEDULE_URLS.items():
333
  try:
 
342
 
343
  a_tag = container.find("a", href=True)
344
  item_link = urljoin(url, a_tag['href']) if a_tag else url
345
+ fmt_date = extract_robust_date([text_content]) or today
 
346
 
 
347
  results.append({
348
  "source": source_name, "type": "Schedule/Hearing", "event_date": fmt_date,
349
  "time": "Scheduled", "title": text_content[:120] + "...",
 
354
  print(f"Error scraping {source_name}: {e}")
355
  return results
356
 
357
def fetch_federal_register():
    """Pull the newest 'artificial intelligence' documents from the Federal Register API.

    Returns a list of normalized event dicts (same shape as the RSS scrapers).
    Any request/parse failure is logged and an empty (or partial) list is returned.
    """
    print("Scanning Federal Register API...")
    results = []
    url = "https://www.federalregister.gov/api/v1/documents.json"
    params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}
    try:
        r = requests.get(url, params=params, timeout=15)
        if r.status_code == 200:
            for doc in r.json().get("results", []):
                title = doc.get("title", "No Title")
                summary = doc.get("abstract", "No summary provided.")
                pub_date = doc.get("publication_date")
                fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()

                # FIX: agency_names can be present but an empty list — the previous
                # `doc.get("agency_names", [...])[0]` raised IndexError in that case.
                agencies = doc.get("agency_names") or ["Federal Register"]

                results.append({
                    "source": agencies[0],
                    "type": "Federal/Exec Action", "event_date": fmt_date,
                    "time": "Published", "title": title, "latest_action": doc.get("type", "Notice"),
                    "link": doc.get("html_url", ""), "summary": str(summary)[:300]
                })
            time.sleep(1)
    except Exception as e:
        print(f"Federal Register API Error: {e}")
    return results
381
+
382
def fetch_bill_text(congress, bill_type, bill_number):
    """Fetch up to 3,500 chars of a bill's latest text version via the Congress.gov API.

    Returns "" when the API key is missing, a request fails, or no text
    version has been posted yet.
    """
    if not CONGRESS_API_KEY: return ""
    url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            versions = r.json().get("textVersions", [])
            # FIX: try every offered format of the newest version, not only the
            # first one — some versions expose only non-first formats with a URL.
            for fmt in (versions[0].get("formats", []) if versions else []):
                text_url = fmt.get("url")
                if not text_url:
                    continue
                text_req = requests.get(text_url, headers=headers, timeout=10)
                # FIX: restored status check — previously a failed download was
                # parsed as if it were bill text.
                if text_req.status_code == 200:
                    return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
    except Exception as e:
        # FIX: narrowed from bare `except: pass` so failures are visible and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"Failed to fetch text for {bill_type}{bill_number}: {e}")
    return ""
397
 
398
  def fetch_legislation(target=1000):
 
401
  results = []
402
  headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
403
  BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
 
404
  for offset in range(0, target, 250):
405
  try:
406
  r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}, headers=headers, timeout=20)
 
408
  bills = r.json().get("bills", [])
409
  if not bills: break
410
  for b in bills:
411
+ if not is_relevant(b.get("title", "")): continue
 
 
412
  action_data = b.get("latestAction", {})
 
413
  action_date_raw = action_data.get("actionDate") or b.get("updateDate")
414
+ fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
 
415
  raw_type = b.get("type", "HR").upper()
416
  proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
417
+
418
  results.append({
419
  "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
420
+ "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
421
+ "latest_action": action_data.get("text", "Active"), "link": proper_link,
422
+ "summary": "Legislative movement tracked via API.", "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
423
  })
424
  time.sleep(1.5)
425
+ except Exception as e: break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
  return results
427
 
428
  # --- MAIN EXECUTION ---
 
430
  db = load_db()
431
  raw_data = []
432
 
433
+ # Run all our data gatherers
434
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
435
  raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
436
  raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
437
  raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
438
+
439
+ raw_data.extend(fetch_congress_scraped()) # The new direct HTML targets!
440
+ raw_data.extend(fetch_federal_register())
441
  raw_data.extend(fetch_master_schedules())
442
  raw_data.extend(fetch_legislation())
 
443
 
444
  new_items = []
445
  for item in raw_data: