IJ-Reynolds HF Staff committed on
Commit
c2159e4
·
verified ·
1 Parent(s): c53a8ee

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +21 -50
main.py CHANGED
@@ -74,6 +74,24 @@ NEWS_FEEDS = {
74
  "The Hill Tech": "https://thehill.com/policy/technology/feed/"
75
  }
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  GOV_FEEDS = {
78
  "White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
79
  "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
@@ -399,56 +417,10 @@ def run():
399
  raw_data = []
400
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
401
  raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
402
- raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
403
- raw_data.extend(fetch_master_schedules())
404
- raw_data.extend(fetch_legislation())
405
-
406
- new_items = []
407
- for item in raw_data:
408
- # Check against the composite ID (URL + Status)
409
- if is_new_event(item, db):
410
- print(f"Triaging new item: {item['title'][:40]}...")
411
-
412
- bill_text = ""
413
- if item.get("type") == "Legislation":
414
- bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number"))
415
-
416
- analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
417
-
418
- item["analysis"] = analysis
419
- item["keywords"] = keywords
420
- item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
421
- new_items.append(item)
422
-
423
- # Store the composite fingerprint in the seen database
424
- db.append(get_event_id(item))
425
-
426
- if new_items:
427
- df_new = pd.DataFrame(new_items)
428
- if CSV_PATH.exists():
429
- # Standardize date parsing on load to prevent concat errors
430
- df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
431
- df_combined = pd.concat([df_existing, df_new], ignore_index=True)
432
- else:
433
- df_combined = df_new
434
-
435
- # 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
436
- # This kills any 'ghost twins' if the scraper accidentally pulls them twice
437
- df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
438
-
439
- df_combined.to_csv(CSV_PATH, index=False)
440
- save_db(db)
441
- print(f"Added {len(new_items)} new items.")
442
- else:
443
- print("Sweep complete. No new items.")
444
-
445
- return len(new_items)# --- MAIN EXECUTION ---
446
- def run():
447
- db = load_db()
448
 
449
- raw_data = []
450
- raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
451
- raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
452
  raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
453
  raw_data.extend(fetch_master_schedules())
454
  raw_data.extend(fetch_legislation())
@@ -483,7 +455,6 @@ def run():
483
  df_combined = df_new
484
 
485
  # 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
486
- # This kills any 'ghost twins' if the scraper accidentally pulls them twice
487
  df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
488
 
489
  df_combined.to_csv(CSV_PATH, index=False)
 
74
  "The Hill Tech": "https://thehill.com/policy/technology/feed/"
75
  }
76
 
77
+ CONGRESS_PRESS_FEEDS = {
78
+ # Senate Commerce & AI Leaders
79
+ "Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/rss/press.xml",
80
+ "Sen. Schatz (AI Lead)": "https://www.schatz.senate.gov/rss/press.xml",
81
+ "Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/rss/press.xml",
82
+ "Sen. Young (AI Caucus)": "https://www.young.senate.gov/rss/press.xml",
83
+
84
+ # Tech/Foreign Policy Nexus
85
+ "Sen. Andy Kim (Tech/Export Lead)": "https://www.kim.senate.gov/rss/press.xml",
86
+ "Sen. Ricketts (Tech/Foreign Lead)": "https://www.ricketts.senate.gov/rss/press.xml",
87
+
88
+ # House Science & Tech Leaders
89
+ "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
90
+ "Rep. Obernolte (Science/Tech Chair)": "https://obernolte.house.gov/rss.xml",
91
+ "Rep. Lieu (AI Task Force)": "https://lieu.house.gov/rss.xml",
92
+ "Rep. Beyer (AI Caucus)": "https://beyer.house.gov/rss.xml"
93
+ }
94
+
95
  GOV_FEEDS = {
96
  "White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
97
  "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
 
417
  raw_data = []
418
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
419
  raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
 
421
+ # 🛑 ADDED: The new congressional press feeds with the custom category
422
+ raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
423
+
424
  raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
425
  raw_data.extend(fetch_master_schedules())
426
  raw_data.extend(fetch_legislation())
 
455
  df_combined = df_new
456
 
457
  # 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
 
458
  df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
459
 
460
  df_combined.to_csv(CSV_PATH, index=False)