Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -405,6 +405,7 @@ def run():
|
|
| 405 |
|
| 406 |
new_items = []
|
| 407 |
for item in raw_data:
|
|
|
|
| 408 |
if is_new_event(item, db):
|
| 409 |
print(f"Triaging new item: {item['title'][:40]}...")
|
| 410 |
|
|
@@ -419,23 +420,76 @@ def run():
|
|
| 419 |
item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
|
| 420 |
new_items.append(item)
|
| 421 |
|
|
|
|
| 422 |
db.append(get_event_id(item))
|
| 423 |
|
| 424 |
if new_items:
|
| 425 |
df_new = pd.DataFrame(new_items)
|
| 426 |
if CSV_PATH.exists():
|
|
|
|
| 427 |
df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
|
| 428 |
df_combined = pd.concat([df_existing, df_new], ignore_index=True)
|
| 429 |
else:
|
| 430 |
df_combined = df_new
|
| 431 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
df_combined.to_csv(CSV_PATH, index=False)
|
| 433 |
save_db(db)
|
| 434 |
print(f"Added {len(new_items)} new items.")
|
| 435 |
else:
|
| 436 |
print("Sweep complete. No new items.")
|
| 437 |
|
| 438 |
-
return len(new_items)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
|
| 440 |
-
|
| 441 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
new_items = []
|
| 407 |
for item in raw_data:
|
| 408 |
+
# Check against the composite ID (URL + Status)
|
| 409 |
if is_new_event(item, db):
|
| 410 |
print(f"Triaging new item: {item['title'][:40]}...")
|
| 411 |
|
|
|
|
| 420 |
item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
|
| 421 |
new_items.append(item)
|
| 422 |
|
| 423 |
+
# Store the composite fingerprint in the seen database
|
| 424 |
db.append(get_event_id(item))
|
| 425 |
|
| 426 |
if new_items:
|
| 427 |
df_new = pd.DataFrame(new_items)
|
| 428 |
if CSV_PATH.exists():
|
| 429 |
+
# Standardize date parsing on load to prevent concat errors
|
| 430 |
df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
|
| 431 |
df_combined = pd.concat([df_existing, df_new], ignore_index=True)
|
| 432 |
else:
|
| 433 |
df_combined = df_new
|
| 434 |
|
| 435 |
+
# 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
|
| 436 |
+
# This kills any 'ghost twins' if the scraper accidentally pulls them twice
|
| 437 |
+
df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
|
| 438 |
+
|
| 439 |
df_combined.to_csv(CSV_PATH, index=False)
|
| 440 |
save_db(db)
|
| 441 |
print(f"Added {len(new_items)} new items.")
|
| 442 |
else:
|
| 443 |
print("Sweep complete. No new items.")
|
| 444 |
|
| 445 |
+
return len(new_items)# --- MAIN EXECUTION ---
|
| 446 |
+
def run():
    """Run one collection sweep and persist any newly seen items.

    Gathers raw items from the RSS, master-schedule and legislation
    fetchers, keeps those that ``is_new_event`` reports as unseen, attaches
    the output of ``analyze_with_ai`` to each, appends them to the CSV at
    ``CSV_PATH`` and saves the updated seen-database.

    Returns:
        The number of items triaged as new during this sweep.
    """
    db = load_db()

    raw_data = []
    raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
    raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
    raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
    raw_data.extend(fetch_master_schedules())
    raw_data.extend(fetch_legislation())

    timestamp_format = "%Y-%m-%d %H:%M"
    new_items = []
    for item in raw_data:
        # Check against the composite ID (URL + Status).
        if not is_new_event(item, db):
            continue

        print(f"Triaging new item: {item['title'][:40]}...")

        # Only legislation items have bill text to hand to the analysis step.
        bill_text = ""
        if item.get("type") == "Legislation":
            bill_text = fetch_bill_text(
                CURRENT_CONGRESS,
                item.get("bill_type"),
                item.get("bill_number"),
            )

        analysis, keywords = analyze_with_ai(
            item["title"],
            item["summary"],
            item["source"],
            bill_text=bill_text,
        )

        item["analysis"] = analysis
        item["keywords"] = keywords
        item["date_collected"] = datetime.now().strftime(timestamp_format)
        new_items.append(item)

        # Store the composite fingerprint in the seen database so a repeat
        # of this item later in the same sweep is no longer treated as new.
        db.append(get_event_id(item))

    if not new_items:
        print("Sweep complete. No new items.")
        return len(new_items)

    df_new = pd.DataFrame(new_items)
    if CSV_PATH.exists():
        # Standardize date parsing on load to prevent concat errors.
        df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
    else:
        df_combined = df_new

    # Safety net: deduplicate the combined dataset before saving, in case the
    # scraper pulled the same (link, latest_action) pair twice.
    # drop_duplicates raises KeyError when a subset column is absent (e.g. no
    # item in the data carries "latest_action"), so only use the key columns
    # that are actually present, and skip deduplication entirely when "link"
    # is missing because "latest_action" alone does not identify an item.
    dedup_keys = [
        column
        for column in ("link", "latest_action")
        if column in df_combined.columns
    ]
    if "link" in dedup_keys:
        df_combined = df_combined.drop_duplicates(subset=dedup_keys, keep="first")

    df_combined.to_csv(CSV_PATH, index=False)
    save_db(db)
    print(f"Added {len(new_items)} new items.")

    return len(new_items)
|