Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
|
@@ -74,6 +74,24 @@ NEWS_FEEDS = {
|
|
| 74 |
"The Hill Tech": "https://thehill.com/policy/technology/feed/"
|
| 75 |
}
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
GOV_FEEDS = {
|
| 78 |
"White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
|
| 79 |
"White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
|
|
@@ -399,56 +417,10 @@ def run():
|
|
| 399 |
raw_data = []
|
| 400 |
raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
|
| 401 |
raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
|
| 402 |
-
raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
|
| 403 |
-
raw_data.extend(fetch_master_schedules())
|
| 404 |
-
raw_data.extend(fetch_legislation())
|
| 405 |
-
|
| 406 |
-
new_items = []
|
| 407 |
-
for item in raw_data:
|
| 408 |
-
# Check against the composite ID (URL + Status)
|
| 409 |
-
if is_new_event(item, db):
|
| 410 |
-
print(f"Triaging new item: {item['title'][:40]}...")
|
| 411 |
-
|
| 412 |
-
bill_text = ""
|
| 413 |
-
if item.get("type") == "Legislation":
|
| 414 |
-
bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number"))
|
| 415 |
-
|
| 416 |
-
analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
|
| 417 |
-
|
| 418 |
-
item["analysis"] = analysis
|
| 419 |
-
item["keywords"] = keywords
|
| 420 |
-
item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
|
| 421 |
-
new_items.append(item)
|
| 422 |
-
|
| 423 |
-
# Store the composite fingerprint in the seen database
|
| 424 |
-
db.append(get_event_id(item))
|
| 425 |
-
|
| 426 |
-
if new_items:
|
| 427 |
-
df_new = pd.DataFrame(new_items)
|
| 428 |
-
if CSV_PATH.exists():
|
| 429 |
-
# Standardize date parsing on load to prevent concat errors
|
| 430 |
-
df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
|
| 431 |
-
df_combined = pd.concat([df_existing, df_new], ignore_index=True)
|
| 432 |
-
else:
|
| 433 |
-
df_combined = df_new
|
| 434 |
-
|
| 435 |
-
# 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
|
| 436 |
-
# This kills any 'ghost twins' if the scraper accidentally pulls them twice
|
| 437 |
-
df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
|
| 438 |
-
|
| 439 |
-
df_combined.to_csv(CSV_PATH, index=False)
|
| 440 |
-
save_db(db)
|
| 441 |
-
print(f"Added {len(new_items)} new items.")
|
| 442 |
-
else:
|
| 443 |
-
print("Sweep complete. No new items.")
|
| 444 |
-
|
| 445 |
-
return len(new_items)# --- MAIN EXECUTION ---
|
| 446 |
-
def run():
|
| 447 |
-
db = load_db()
|
| 448 |
|
| 449 |
-
|
| 450 |
-
raw_data.extend(fetch_rss(
|
| 451 |
-
|
| 452 |
raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
|
| 453 |
raw_data.extend(fetch_master_schedules())
|
| 454 |
raw_data.extend(fetch_legislation())
|
|
@@ -483,7 +455,6 @@ def run():
|
|
| 483 |
df_combined = df_new
|
| 484 |
|
| 485 |
# 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
|
| 486 |
-
# This kills any 'ghost twins' if the scraper accidentally pulls them twice
|
| 487 |
df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
|
| 488 |
|
| 489 |
df_combined.to_csv(CSV_PATH, index=False)
|
|
|
|
| 74 |
"The Hill Tech": "https://thehill.com/policy/technology/feed/"
|
| 75 |
}
|
| 76 |
|
| 77 |
+
CONGRESS_PRESS_FEEDS = {
|
| 78 |
+
# Senate Commerce & AI Leaders
|
| 79 |
+
"Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/rss/press.xml",
|
| 80 |
+
"Sen. Schatz (AI Lead)": "https://www.schatz.senate.gov/rss/press.xml",
|
| 81 |
+
"Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/rss/press.xml",
|
| 82 |
+
"Sen. Young (AI Caucus)": "https://www.young.senate.gov/rss/press.xml",
|
| 83 |
+
|
| 84 |
+
# Tech/Foreign Policy Nexus
|
| 85 |
+
"Sen. Andy Kim (Tech/Export Lead)": "https://www.kim.senate.gov/rss/press.xml",
|
| 86 |
+
"Sen. Ricketts (Tech/Foreign Lead)": "https://www.ricketts.senate.gov/rss/press.xml",
|
| 87 |
+
|
| 88 |
+
# House Science & Tech Leaders
|
| 89 |
+
"Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
|
| 90 |
+
"Rep. Obernolte (Science/Tech Chair)": "https://obernolte.house.gov/rss.xml",
|
| 91 |
+
"Rep. Lieu (AI Task Force)": "https://lieu.house.gov/rss.xml",
|
| 92 |
+
"Rep. Beyer (AI Caucus)": "https://beyer.house.gov/rss.xml"
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
GOV_FEEDS = {
|
| 96 |
"White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
|
| 97 |
"White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
|
|
|
|
| 417 |
raw_data = []
|
| 418 |
raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
|
| 419 |
raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
|
| 421 |
+
# 🛑 ADDED: The new congressional press feeds with the custom category
|
| 422 |
+
raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
|
| 423 |
+
|
| 424 |
raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
|
| 425 |
raw_data.extend(fetch_master_schedules())
|
| 426 |
raw_data.extend(fetch_legislation())
|
|
|
|
| 455 |
df_combined = df_new
|
| 456 |
|
| 457 |
# 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
|
|
|
|
| 458 |
df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
|
| 459 |
|
| 460 |
df_combined.to_csv(CSV_PATH, index=False)
|