Spaces:

IJ-Reynolds
/

AI_Intel_Tracker

Sleeping

App Files Files Community

IJ-Reynolds HF Staff committed 18 days ago

Commit

ecdf456

verified ·

1 Parent(s): 2c0c298

Update main.py

Browse files

Files changed (1) hide show

main.py +57 -3

main.py CHANGED Viewed

@@ -405,6 +405,7 @@ def run():
     new_items = []
     for item in raw_data:
         if is_new_event(item, db):
             print(f"Triaging new item: {item['title'][:40]}...")
@@ -419,23 +420,76 @@ def run():
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
             new_items.append(item)
             db.append(get_event_id(item))
     if new_items:
         df_new = pd.DataFrame(new_items)
         if CSV_PATH.exists():
             df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
             df_combined = pd.concat([df_existing, df_new], ignore_index=True)
         else:
             df_combined = df_new
         df_combined.to_csv(CSV_PATH, index=False)
         save_db(db)
         print(f"Added {len(new_items)} new items.")
     else:
         print("Sweep complete. No new items.")
-    return len(new_items)
-if __name__ == "__main__":
-    run()

     new_items = []
     for item in raw_data:
+        # Check against the composite ID (URL + Status)
         if is_new_event(item, db):
             print(f"Triaging new item: {item['title'][:40]}...")
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
             new_items.append(item)
+            # Store the composite fingerprint in the seen database
             db.append(get_event_id(item))
     if new_items:
         df_new = pd.DataFrame(new_items)
         if CSV_PATH.exists():
+            # Standardize date parsing on load to prevent concat errors
             df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
             df_combined = pd.concat([df_existing, df_new], ignore_index=True)
         else:
             df_combined = df_new
+        # 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
+        # This kills any 'ghost twins' if the scraper accidentally pulls them twice
+        df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
         df_combined.to_csv(CSV_PATH, index=False)
         save_db(db)
         print(f"Added {len(new_items)} new items.")
     else:
         print("Sweep complete. No new items.")
+    return len(new_items)# --- MAIN EXECUTION ---
+def run():  # One sweep: fetch all sources, analyze unseen items, append them to the CSV, return how many were new.
+    db = load_db()  # seen-event store; appended to in the loop and persisted via save_db() below
+    raw_data = []
+    raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
+    raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
+    raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
+    raw_data.extend(fetch_master_schedules())
+    raw_data.extend(fetch_legislation())
+    new_items = []  # items accepted in this sweep, in fetch order
+    for item in raw_data:
+        # Skip anything is_new_event() reports as already seen (original note: composite ID of URL + status).
+        if is_new_event(item, db):
+            print(f"Triaging new item: {item['title'][:40]}...")
+            bill_text = ""  # stays empty for every non-legislation item
+            if item.get("type") == "Legislation":
+                bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number"))
+            analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
+            item["analysis"] = analysis
+            item["keywords"] = keywords
+            item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")  # naive local time, minute resolution
+            new_items.append(item)
+            # Record this item's event ID so later items in the same sweep (and later sweeps) see it as known.
+            db.append(get_event_id(item))
+    if new_items:
+        df_new = pd.DataFrame(new_items)
+        if CSV_PATH.exists():
+            # Load the existing CSV with event_date parsed as datetimes, then append the new rows after it.
+            df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
+            df_combined = pd.concat([df_existing, df_new], ignore_index=True)
+        else:
+            df_combined = df_new  # first run: nothing on disk to merge with
+        # Deduplicate the combined dataset on (link, latest_action) before saving, so an item
+        # pulled more than once by the scraper is written only once.
+        df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')  # keep='first': rows already in the CSV win. NOTE(review): raises KeyError if either column is missing -- confirm every source sets both
+        df_combined.to_csv(CSV_PATH, index=False)
+        save_db(db)  # only persisted when at least one new item was found
+        print(f"Added {len(new_items)} new items.")  # NOTE(review): count is taken before drop_duplicates, so it can exceed the rows actually added
+    else:
+        print("Sweep complete. No new items.")
+    return len(new_items)  # number of items triaged in this sweep (pre-deduplication)