IJ-Reynolds HF Staff committed on
Commit
ecdf456
·
verified ·
1 Parent(s): 2c0c298

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +57 -3
main.py CHANGED
@@ -405,6 +405,7 @@ def run():
405
 
406
  new_items = []
407
  for item in raw_data:
 
408
  if is_new_event(item, db):
409
  print(f"Triaging new item: {item['title'][:40]}...")
410
 
@@ -419,23 +420,76 @@ def run():
419
  item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
420
  new_items.append(item)
421
 
 
422
  db.append(get_event_id(item))
423
 
424
  if new_items:
425
  df_new = pd.DataFrame(new_items)
426
  if CSV_PATH.exists():
 
427
  df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
428
  df_combined = pd.concat([df_existing, df_new], ignore_index=True)
429
  else:
430
  df_combined = df_new
431
 
 
 
 
 
432
  df_combined.to_csv(CSV_PATH, index=False)
433
  save_db(db)
434
  print(f"Added {len(new_items)} new items.")
435
  else:
436
  print("Sweep complete. No new items.")
437
 
438
- return len(new_items)
 
 
 
 
 
 
 
 
 
439
 
440
- if __name__ == "__main__":
441
- run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
 
406
  new_items = []
407
  for item in raw_data:
408
+ # Check against the composite ID (URL + Status)
409
  if is_new_event(item, db):
410
  print(f"Triaging new item: {item['title'][:40]}...")
411
 
 
420
  item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
421
  new_items.append(item)
422
 
423
+ # Store the composite fingerprint in the seen database
424
  db.append(get_event_id(item))
425
 
426
  if new_items:
427
  df_new = pd.DataFrame(new_items)
428
  if CSV_PATH.exists():
429
+ # Standardize date parsing on load to prevent concat errors
430
  df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
431
  df_combined = pd.concat([df_existing, df_new], ignore_index=True)
432
  else:
433
  df_combined = df_new
434
 
435
+ # 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
436
+ # This kills any 'ghost twins' if the scraper accidentally pulls them twice
437
+ df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
438
+
439
  df_combined.to_csv(CSV_PATH, index=False)
440
  save_db(db)
441
  print(f"Added {len(new_items)} new items.")
442
  else:
443
  print("Sweep complete. No new items.")
444
 
445
+ return len(new_items)# --- MAIN EXECUTION ---
446
+ def run():
447
+ db = load_db()
448
+
449
+ raw_data = []
450
+ raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
451
+ raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
452
+ raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
453
+ raw_data.extend(fetch_master_schedules())
454
+ raw_data.extend(fetch_legislation())
455
 
456
+ new_items = []
457
+ for item in raw_data:
458
+ # Check against the composite ID (URL + Status)
459
+ if is_new_event(item, db):
460
+ print(f"Triaging new item: {item['title'][:40]}...")
461
+
462
+ bill_text = ""
463
+ if item.get("type") == "Legislation":
464
+ bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number"))
465
+
466
+ analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
467
+
468
+ item["analysis"] = analysis
469
+ item["keywords"] = keywords
470
+ item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
471
+ new_items.append(item)
472
+
473
+ # Store the composite fingerprint in the seen database
474
+ db.append(get_event_id(item))
475
+
476
+ if new_items:
477
+ df_new = pd.DataFrame(new_items)
478
+ if CSV_PATH.exists():
479
+ # Standardize date parsing on load to prevent concat errors
480
+ df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
481
+ df_combined = pd.concat([df_existing, df_new], ignore_index=True)
482
+ else:
483
+ df_combined = df_new
484
+
485
+ # 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
486
+ # This kills any 'ghost twins' if the scraper accidentally pulls them twice
487
+ df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
488
+
489
+ df_combined.to_csv(CSV_PATH, index=False)
490
+ save_db(db)
491
+ print(f"Added {len(new_items)} new items.")
492
+ else:
493
+ print("Sweep complete. No new items.")
494
+
495
+ return len(new_items)