Spaces:

IJ-Reynolds
/

AI_Intel_Tracker

Sleeping

App Files Files Community

IJ-Reynolds HF Staff committed 17 days ago

Commit

e5bb349

verified ·

1 Parent(s): f3975b8

Update main.py

Browse files

Files changed (1) hide show

main.py +47 -1

main.py CHANGED Viewed

@@ -204,6 +204,51 @@ def fetch_congress_scraped():
             print(f"  --> {name}: Error — {e}")
     return results
 def fetch_rss(feed_dict, source_type):
     print(f"Scanning {source_type} RSS...")
     results = []
@@ -350,7 +395,8 @@ def run():
     raw_data.extend(fetch_congress_scraped())  # The 5 HTML Pages with DOM Climbing
     raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
     raw_data.extend(fetch_federal_register())
-    raw_data.extend(fetch_legislation())
     new_items = []
     for item in raw_data:

             print(f"  --> {name}: Error — {e}")
     return results
+def fetch_floor_schedules():
+    """Scrape the Senate and House floor-schedule pages for relevant entries.
+
+    Each page in ``SCHEDULE_URLS`` is fetched with the ``scraper`` object
+    defined elsewhere in this file and parsed with BeautifulSoup. Every
+    ``tr``/``li``/``div``/``p`` element whose text is 30-1500 characters long
+    and passes ``is_relevant`` becomes one result, unless an earlier result
+    already starts with the same 50 characters.
+
+    Returns:
+        A list of dicts with the keys ``source``, ``type``, ``event_date``,
+        ``time``, ``title``, ``latest_action``, ``link`` and ``summary``.
+        A page that cannot be fetched or parsed is reported on stdout and
+        skipped, so the list may be empty.
+    """
+    print("Scanning House & Senate Floor Schedules...")
+    results = []
+    # First 50 characters of every accepted summary. Nested containers repeat
+    # their parent's text, so a prefix match is treated as a duplicate; a set
+    # keeps that check O(1) instead of rescanning `results` each time.
+    seen_prefixes = set()
+    # Floor-activity pages to scrape, keyed by the label stored in "source".
+    SCHEDULE_URLS = {
+        "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
+        "House Floor Summary": "https://clerk.house.gov/FloorSummary"
+    }
+    for source_name, url in SCHEDULE_URLS.items():
+        try:
+            r = scraper.get(url, timeout=15)
+            if r.status_code != 200:
+                # Best-effort scrape: report the failure and move on.
+                print(f"Skipping {source_name}: HTTP {r.status_code}")
+                continue
+            soup = BeautifulSoup(r.text, "html.parser")
+            # Cast a broad net over typical content containers
+            for container in soup.find_all(["tr", "li", "div", "p"]):
+                text_content = container.get_text(" ", strip=True)
+                # Filter out microscopic navigation links and massive full-page wrappers
+                if len(text_content) < 30 or len(text_content) > 1500:
+                    continue
+                if not is_relevant(text_content):
+                    continue
+                # Prevent adding the exact same paragraph twice if nested
+                prefix = text_content[:50]
+                if prefix in seen_prefixes:
+                    continue
+                a_tag = container.find("a", href=True)
+                # Fall back to the page itself when the element has no link.
+                item_link = urljoin(url, a_tag['href']) if a_tag else url
+                # These pages update daily, so if no date is in the text, it's today's action
+                fmt_date = extract_robust_date([text_content]) or datetime.now()
+                # Only mark the title with an ellipsis when text was actually cut.
+                if len(text_content) > 120:
+                    title = text_content[:120] + "..."
+                else:
+                    title = text_content
+                results.append({
+                    "source": source_name, "type": "Schedule/Hearing", "event_date": fmt_date,
+                    "time": "Scheduled", "title": title,
+                    "latest_action": "On Master Schedule", "link": item_link, "summary": text_content[:300]
+                })
+                seen_prefixes.add(prefix)
+            # Pause between pages to stay polite to the servers.
+            time.sleep(1)
+        except Exception as e:
+            print(f"Error scraping {source_name}: {e}")
+    return results
 def fetch_rss(feed_dict, source_type):
     print(f"Scanning {source_type} RSS...")
     results = []
     raw_data.extend(fetch_congress_scraped())  # The 5 HTML Pages with DOM Climbing
     raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
     raw_data.extend(fetch_federal_register())
+    raw_data.extend(fetch_legislation())
+    raw_data.extend(fetch_floor_schedules())
     new_items = []
     for item in raw_data: