IJ-Reynolds HF Staff committed on
Commit
1c6553d
·
verified ·
1 Parent(s): e7c98e9

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +276 -58
main.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import cloudscraper
 
3
  import pandas as pd
4
  from bs4 import BeautifulSoup
5
  import feedparser
@@ -12,7 +13,7 @@ from dateutil import parser as date_parser
12
  from urllib.parse import urljoin
13
  from huggingface_hub import InferenceClient
14
 
15
- # --- CONFIGURATION ---
16
  CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
17
  HF_TOKEN = os.getenv("HF_TOKEN")
18
  CURRENT_CONGRESS = 119
@@ -27,36 +28,41 @@ else:
27
  DB_FILE = BASE_DIR / "seen_events.json"
28
 
29
  # --- STEALTH SCRAPER SETUP ---
30
- # ai-cloudscraper mimics a real browser handshake to bypass 2026 firewalls
31
  scraper = cloudscraper.create_scraper(
32
  browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True},
33
  interpreter='js2py'
34
  )
35
 
 
36
  TARGET_KEYWORDS = [
37
  "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
38
  "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
39
- "chatbot", "facial recognition", "biometric", "open-source", "foundation model"
 
 
40
  ]
41
 
42
  def is_relevant(title, summary=""):
43
- text = f"{title} {summary}".lower()
44
- return any(re.search(rf'\b{re.escape(k)}', text) for k in TARGET_KEYWORDS)
 
 
 
 
 
45
 
 
46
  CONGRESS_PRESS_FEEDS = {
47
- # Senate 2026
48
  "Sen. Cruz (Commerce Chair)": "https://www.commerce.senate.gov/press/rep/rss",
49
  "Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/newsroom/press-releases?format=rss",
50
  "Sen. Young (AI Caucus)": "https://www.young.senate.gov/newsroom/press-releases?format=rss",
51
  "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/newsroom/press-releases?format=rss",
52
-
53
- # House
54
  "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
55
  "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/rss.xml",
56
- "Rep. Moore (UT)": "https://blakemoore.house.gov/news/rss.xml" # Updated to new 2026 path
57
  }
58
 
59
-
60
  NEWS_FEEDS = {
61
  "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
62
  "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
@@ -80,84 +86,296 @@ NEWS_FEEDS = {
80
  "The Hill Tech": "https://thehill.com/policy/technology/feed/"
81
  }
82
 
83
- # --- CORE SCRAPER ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  def fetch_rss(feed_dict, source_type):
85
- print(f"Scanning {source_type}...")
86
  results = []
87
  for name, url in feed_dict.items():
88
  try:
89
- # Persistent session handling
90
  r = scraper.get(url, timeout=15)
91
 
92
- # If we hit a 404/410, it means the office changed their CMS
93
- if r.status_code in [404, 410]:
94
- print(f"--> {name}: URL Expired ({r.status_code}) - Needs Manual Path Update")
95
- continue
96
-
97
  if r.status_code != 200:
98
- print(f"--> {name}: Blocked ({r.status_code})")
99
  continue
100
 
101
  feed = feedparser.parse(r.content)
102
 
103
- # If the feed is valid but empty, the office just hasn't posted today
104
- if not feed.entries:
105
- print(f"--> {name}: Feed is currently empty.")
106
- continue
107
-
108
- print(f"--> {name}: Found {len(feed.entries)} items.")
109
-
110
- for entry in feed.entries[:10]:
111
- title = entry.get("title", "")
112
  summary = entry.get("description", "")
113
  link = entry.get("link", url)
114
 
115
- if is_relevant(title, summary):
116
- # Trust the RSS timestamp first
117
- if hasattr(entry, 'published_parsed') and entry.published_parsed:
118
- fmt_date = datetime(*entry.published_parsed[:6])
119
- else:
120
- fmt_date = datetime.now()
121
-
122
- results.append({
123
- "source": name, "type": source_type, "title": title,
124
- "summary": summary[:300], "link": link,
125
- "latest_action": "Published", "event_date": fmt_date
126
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  time.sleep(1)
128
  except Exception as e:
129
- print(f"Error {name}: {e}")
130
  return results
131
 
132
- def run():
133
- # Load seen events to prevent duplicates
134
- if DB_FILE.exists():
135
- with open(DB_FILE, "r") as f: db = json.load(f)
136
- else: db = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  raw_data = []
 
139
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
 
140
  raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
 
 
 
141
 
142
- # AI Triage & Storage Logic
143
  new_items = []
144
  for item in raw_data:
145
- if item['link'] not in db:
 
 
 
 
 
 
 
146
  item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
147
- item["analysis"] = "AI summary pending..."
148
- item["keywords"] = "AI, Policy"
149
  new_items.append(item)
150
- db.append(item['link'])
151
-
152
  if new_items:
153
  df_new = pd.DataFrame(new_items)
154
  if CSV_PATH.exists():
155
- df_existing = pd.read_csv(CSV_PATH)
156
- pd.concat([df_existing, df_new], ignore_index=True).to_csv(CSV_PATH, index=False)
157
  else:
158
- df_new.to_csv(CSV_PATH, index=False)
 
 
 
 
 
 
 
159
 
160
- with open(DB_FILE, "w") as f: json.dump(db[-5000:], f)
161
- print(f"Added {len(new_items)} items.")
162
-
163
  return len(new_items)
 
1
  import os
2
  import cloudscraper
3
+ import requests
4
  import pandas as pd
5
  from bs4 import BeautifulSoup
6
  import feedparser
 
13
  from urllib.parse import urljoin
14
  from huggingface_hub import InferenceClient
15
 
16
+ # --- CONFIGURATION & GLOBALS ---
17
  CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
18
  HF_TOKEN = os.getenv("HF_TOKEN")
19
  CURRENT_CONGRESS = 119
 
28
  DB_FILE = BASE_DIR / "seen_events.json"
29
 
30
  # --- STEALTH SCRAPER SETUP ---
31
+ # Mimics a real browser handshake to bypass Cloudflare/Akamai
32
  scraper = cloudscraper.create_scraper(
33
  browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True},
34
  interpreter='js2py'
35
  )
36
 
37
+ # --- KEYWORD FILTER ---
38
  TARGET_KEYWORDS = [
39
  "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
40
  "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
41
+ "chatbot", "facial recognition", "biometric", "open-source", "open source ai",
42
+ "foundation model", "emerging technology", "automated decision", "automated system",
43
+ "large language model", "surveillance technology"
44
  ]
45
 
46
  def is_relevant(title, summary=""):
47
+ text_to_check = f"{title} {summary}".lower()
48
+ for keyword in TARGET_KEYWORDS:
49
+ if re.search(rf'\b{re.escape(keyword)}', text_to_check):
50
+ return True
51
+ if re.search(r'\b(ai|compute)\b', text_to_check):
52
+ return True
53
+ return False
54
 
55
+ # --- FEEDS DICTIONARIES ---
56
  CONGRESS_PRESS_FEEDS = {
 
57
  "Sen. Cruz (Commerce Chair)": "https://www.commerce.senate.gov/press/rep/rss",
58
  "Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/newsroom/press-releases?format=rss",
59
  "Sen. Young (AI Caucus)": "https://www.young.senate.gov/newsroom/press-releases?format=rss",
60
  "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/newsroom/press-releases?format=rss",
 
 
61
  "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
62
  "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/rss.xml",
63
+ "Rep. Moore (UT)": "https://blakemoore.house.gov/news/rss.xml"
64
  }
65
 
 
66
  NEWS_FEEDS = {
67
  "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
68
  "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
 
86
  "The Hill Tech": "https://thehill.com/policy/technology/feed/"
87
  }
88
 
89
+ GOV_FEEDS = {
90
+ "White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
91
+ "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
92
+ "DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
93
+ "DOE Office of Science": "https://science.osti.gov/RSS",
94
+ "Federal Register (AI Postings)": "https://www.federalregister.gov/documents/search.rss?conditions%5Bterm%5D=artificial+intelligence",
95
+ "NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
96
+ "NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
97
+ "CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
98
+ "FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
99
+ "GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss"
100
+ }
101
+
102
+ CALENDAR_FEEDS = {
103
+ "House Science RSS": "https://science.house.gov/hearings?rss=1",
104
+ "House Energy RSS": "https://energycommerce.house.gov/events?rss=1",
105
+ "House Foreign Affairs RSS": "https://foreignaffairs.house.gov/committee-activity/hearings/all?rss=1",
106
+ "Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
107
+ "Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
108
+ "Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
109
+ "DOE Events": "https://www.energy.gov/events/rss"
110
+ }
111
+
112
+ # --- AI SETUP & ANALYZER ---
113
# Build the inference client only when a token is configured; the analyzer
# checks for a falsy client and skips AI triage in that case.
hf_client = (
    InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=HF_TOKEN)
    if HF_TOKEN
    else None
)
if hf_client is None:
    print("Warning: No HF_TOKEN found. AI Triage will be bypassed.")
118
+
119
def analyze_with_ai(title, summary, source, bill_text=""):
    """Ask the hosted chat model for a short policy summary and keywords.

    Args:
        title: Headline of the item being triaged.
        summary: Short description of the item.
        source: Name of the feed or site the item came from.
        bill_text: Optional excerpt of bill text; "N/A" is sent when empty.

    Returns:
        A ``(analysis, keywords)`` tuple of strings. Placeholder strings are
        returned when no client is configured, when the reply does not follow
        the requested format, or when the request raises.
    """
    if not hf_client:
        return "AI Triage disabled (No API Key).", "N/A"

    prompt = f"""
You are a D.C. AI policy analyst. Review this update. Simply provide the summary with no other additions:
Source: {source}
Title: {title}
Summary: {summary}
Raw Bill Text Excerpt: {bill_text if bill_text else 'N/A'}

RULES:
1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided text.
2. Provide a detailed, 2-to-3 sentence executive summary explaining the actual policy impact.
3. Extract 3 comma-separated keywords.

Format output EXACTLY as:
ANALYSIS: [Your 2-3 sentence summary here]
KEYWORDS: [Words]
"""
    try:
        response = hf_client.chat_completion(
            [{"role": "user", "content": prompt}],
            max_tokens=350,
            temperature=0.1,
            top_p=0.9,
        )
        reply = response.choices[0].message.content

        # Everything between "ANALYSIS:" and "KEYWORDS:" (or end of reply).
        found_analysis = re.search(r'ANALYSIS:\s*(.*?)(?=KEYWORDS:|$)', reply, re.DOTALL)
        if found_analysis:
            analysis = found_analysis.group(1).strip()
        else:
            analysis = "Could not generate analysis."

        # Keywords are taken from the remainder of the "KEYWORDS:" line only.
        found_keywords = re.search(r'KEYWORDS:\s*(.*)', reply)
        if found_keywords:
            keywords = found_keywords.group(1).strip()
        else:
            keywords = "AI, Tech, Policy"

        # Flatten newlines so the analysis stays on one line.
        return analysis.replace('\n', ' '), keywords
    except Exception as e:
        print(f"AI Error: {e}")
        return "Error during AI analysis.", "error"
154
+
155
+ # --- STATE MANAGEMENT ---
156
def load_db():
    """Load the list of already-seen event IDs from DB_FILE.

    Returns:
        The stored list, or an empty list when the file is missing, cannot be
        read or parsed as JSON, or does not hold a JSON list. A damaged state
        file therefore restarts the history instead of aborting the sweep.
    """
    if not DB_FILE.exists():
        return []
    try:
        with open(DB_FILE, "r", encoding="utf-8") as f:
            seen = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and decoding failures.
        print(f"Warning: could not read {DB_FILE} ({e}); starting with empty history.")
        return []
    if not isinstance(seen, list):
        # Callers append to and slice the result, so only a list is usable.
        print(f"Warning: {DB_FILE} does not contain a list; starting with empty history.")
        return []
    return seen
161
+
162
def save_db(db):
    """Persist the most recent 5000 seen-event IDs to DB_FILE.

    The data is written to a sibling temporary file and then moved over
    DB_FILE, so an interrupted run cannot leave a half-written JSON file
    behind.

    Args:
        db: List of event-ID strings; only the last 5000 entries are kept.
    """
    trimmed = db[-5000:]
    tmp_path = DB_FILE.with_name(DB_FILE.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(trimmed, f)
    # os.replace overwrites the destination in a single step.
    os.replace(tmp_path, DB_FILE)
166
+
167
def get_event_id(item):
    """Build the de-duplication key for an item from its link and latest action.

    Missing fields fall back to "no_link" / "no_action", so the same link
    with a different latest action yields a different key.
    """
    return "{} || {}".format(
        item.get("link", "no_link"),
        item.get("latest_action", "no_action"),
    )
171
+
172
def is_new_event(item, db):
    """Return True when the item's event ID is not yet recorded in ``db``."""
    event_id = get_event_id(item)
    return event_id not in db
174
+
175
+ # --- DATE EXTRACTOR ---
176
def extract_robust_date(text_blocks):
    """Return the first plausible date found in the given text blocks.

    Blocks are scanned in order; within a block the patterns are tried in
    order (month-name dates, year-first ISO-style dates, then bare numeric
    dates). The first candidate that parses to a year between 2024 and 2030
    is returned.

    Args:
        text_blocks: Iterable of strings; empty or None entries are skipped.

    Returns:
        A naive ``datetime``, or None when no usable date is found.
    """
    date_patterns = [
        r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
        # Year-first dates. The trailing guard is "not a digit" rather than a
        # word boundary so the date part of "2025-12-20T10:00" still matches.
        r'(?<!\d)202\d[-/]\d{1,2}[-/]\d{1,2}(?!\d)',
        # Bare numeric dates. The lookbehind stops this from matching the
        # "12-20" tail of "2025-12-20", which would otherwise be parsed with
        # the current year filled in.
        r'(?<![\d/-])\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
    ]
    for text in text_blocks:
        if not text:
            continue
        for pattern in date_patterns:
            for match in re.findall(pattern, text, re.IGNORECASE):
                # Drop ordinal suffixes ("3rd" -> "3"); case-insensitive
                # because the match itself was case-insensitive.
                clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match, flags=re.IGNORECASE)
                try:
                    parsed_date = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
                except (ValueError, OverflowError, TypeError):
                    # Unparseable candidate; try the next one.
                    continue
                if 2024 <= parsed_date.year <= 2030:
                    return parsed_date
    return None
195
+
196
+ # --- SCRAPERS ---
197
  def fetch_rss(feed_dict, source_type):
198
+ print(f"Scanning {source_type} RSS...")
199
  results = []
200
  for name, url in feed_dict.items():
201
  try:
 
202
  r = scraper.get(url, timeout=15)
203
 
204
+ if r.status_code in [404, 410] and ".house.gov" in url:
205
+ root_url = url.split(".gov")[0] + ".gov/rss.xml"
206
+ r = scraper.get(root_url, timeout=10)
207
+
 
208
  if r.status_code != 200:
209
+ print(f"--> {name}: Access Denied/Missing ({r.status_code})")
210
  continue
211
 
212
  feed = feedparser.parse(r.content)
213
 
214
+ for entry in feed.entries[:20]:
215
+ title = entry.get("title", "No Title")
 
 
 
 
 
 
 
216
  summary = entry.get("description", "")
217
  link = entry.get("link", url)
218
 
219
+ if not is_relevant(title, summary):
220
+ continue
221
+
222
+ url_year_match = re.search(r'/(20\d{2})/', link)
223
+ if url_year_match:
224
+ url_year = int(url_year_match.group(1))
225
+ curr_year = datetime.now().year
226
+ curr_month = datetime.now().month
227
+ if url_year < curr_year and curr_month > 2: continue
228
+ if url_year < curr_year - 1: continue
229
+
230
+ if hasattr(entry, 'published_parsed') and entry.published_parsed:
231
+ fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
232
+ else:
233
+ fmt_date = extract_robust_date([title, summary])
234
+
235
+ if fmt_date:
236
+ days_old = (datetime.now().replace(tzinfo=None) - fmt_date).days
237
+ if days_old > 60: continue
238
+
239
+ results.append({
240
+ "source": name, "type": source_type, "event_date": fmt_date,
241
+ "time": "TBD", "title": title, "latest_action": "Published",
242
+ "link": link, "summary": summary[:200]
243
+ })
244
  time.sleep(1)
245
  except Exception as e:
246
+ print(f"Error fetching {name}: {e}")
247
  return results
248
 
249
def fetch_master_schedules():
    """Scrape the floor and committee schedule pages for relevant entries.

    Each page is parsed as HTML. Every tr/li/div/p element whose text is
    30-1500 characters long and passes ``is_relevant`` becomes a result,
    unless an earlier result starts with the same 50 characters.

    Returns:
        A list of dicts with the keys source, type, event_date, time, title,
        latest_action, link and summary.
    """
    print("Scanning Master Schedules...")
    results = []
    # First 50 characters of every accepted entry, used to skip repeats.
    seen_prefixes = set()
    today = datetime.now()
    monday_of_week = today - timedelta(days=today.weekday())
    SCHEDULE_URLS = {
        "House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
        "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
        "Congress Weekly": f"https://www.congress.gov/committee-schedule/weekly/{monday_of_week.strftime('%Y/%m/%d')}"
    }
    # Used when no date can be extracted from an entry: today at 09:00.
    fallback_date = today.replace(hour=9, minute=0, second=0, microsecond=0)

    for source_name, url in SCHEDULE_URLS.items():
        try:
            page = scraper.get(url, timeout=15)
            if page.status_code != 200:
                continue
            soup = BeautifulSoup(page.text, "html.parser")
            for node in soup.find_all(["tr", "li", "div", "p"]):
                text_content = node.get_text(" ", strip=True)
                if not (30 <= len(text_content) <= 1500):
                    continue
                if not is_relevant(text_content):
                    continue
                prefix = text_content[:50]
                if prefix in seen_prefixes:
                    continue

                anchor = node.find("a", href=True)
                item_link = urljoin(url, anchor['href']) if anchor else url

                time_node = node.find("time")
                if time_node and time_node.has_attr("datetime"):
                    time_text = time_node["datetime"]
                else:
                    time_text = ""

                fmt_date = extract_robust_date([time_text, text_content]) or fallback_date
                results.append({
                    "source": source_name, "type": "Schedule/Hearing", "event_date": fmt_date,
                    "time": "Scheduled", "title": text_content[:120] + "...",
                    "latest_action": "On Master Schedule", "link": item_link, "summary": text_content[:300]
                })
                seen_prefixes.add(prefix)
            # Pause between pages.
            time.sleep(1)
        except Exception as e:
            print(f"Error scraping {source_name}: {e}")
    return results
285
+
286
def fetch_bill_text(congress, bill_type, bill_number):
    """Fetch a plain-text excerpt of a bill from the Congress API.

    Args:
        congress: Congress number used in the API path.
        bill_type: Bill type code (e.g. "HR"); lower-cased for the path.
        bill_number: Bill number used in the API path.

    Returns:
        Up to 3500 characters of text from the first listed text version, or
        "" when no API key is set, the bill type/number is missing, the
        request fails, or no text rendition can be fetched.
    """
    # Missing identifiers would otherwise raise on .lower() before the
    # try block is even entered.
    if not CONGRESS_API_KEY or not bill_type or not bill_number:
        return ""
    url = f"{CONGRESS_API_BASE}/bill/{congress}/{str(bill_type).lower()}/{bill_number}/text"
    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            return ""
        versions = r.json().get("textVersions", [])
        if not versions:
            return ""
        for fmt in versions[0].get("formats", []):
            text_url = fmt.get("url")
            # PDF renditions are binary and cannot be run through the HTML
            # parser; skip to the next format.
            if not text_url or text_url.lower().endswith(".pdf"):
                continue
            text_req = requests.get(text_url, headers=headers, timeout=10)
            if text_req.status_code == 200:
                return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
    except Exception as e:
        # Best effort: a failed lookup must not stop the sweep.
        print(f"Failed to fetch text for {bill_type}{bill_number}: {e}")
    return ""
303
 
304
def _congress_ordinal(number):
    """Return ``number`` with its English ordinal suffix, e.g. 119 -> "119th"."""
    if 10 <= number % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
    return f"{number}{suffix}"


def fetch_legislation(target=1000):
    """Page through recently updated bills and return the relevant ones.

    Bills of CURRENT_CONGRESS are requested 250 at a time, newest update
    first, until ``target`` records have been requested, a page comes back
    empty, or a request fails.

    Args:
        target: Upper bound on the number of bill records to request.

    Returns:
        A list of dicts with the keys source, type, event_date, time, title,
        latest_action, link, summary, bill_type and bill_number. Empty when
        no API key is configured.
    """
    print("Scanning Legislation API...")
    if not CONGRESS_API_KEY:
        return []
    results = []
    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    # URL path segment used by congress.gov for each bill type. Joint and
    # concurrent resolutions are listed so they are not linked as house bills.
    BILL_MAP = {
        "HR": "house-bill",
        "S": "senate-bill",
        "HRES": "house-resolution",
        "SRES": "senate-resolution",
        "HJRES": "house-joint-resolution",
        "SJRES": "senate-joint-resolution",
        "HCONRES": "house-concurrent-resolution",
        "SCONRES": "senate-concurrent-resolution",
    }
    congress_slug = f"{_congress_ordinal(CURRENT_CONGRESS)}-congress"

    for offset in range(0, target, 250):
        try:
            r = requests.get(
                f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}",
                params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"},
                headers=headers,
                timeout=20,
            )
            if r.status_code != 200:
                break
            bills = r.json().get("bills", [])
            if not bills:
                break
            for b in bills:
                title = b.get("title", "")
                if not is_relevant(title):
                    continue

                action_data = b.get("latestAction", {})
                action_text = action_data.get("text", "Active")
                action_date_raw = action_data.get("actionDate") or b.get("updateDate")
                # Stored as a naive datetime.
                fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else None

                # A present-but-null "type" falls back to "HR" as well.
                bill_type = b.get("type") or "HR"
                bill_number = b.get("number")
                path_segment = BILL_MAP.get(bill_type.upper(), 'house-bill')
                proper_link = f"https://www.congress.gov/bill/{congress_slug}/{path_segment}/{bill_number}"
                results.append({
                    "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
                    "time": "API Verified", "title": f"{bill_type}{bill_number}: {title}",
                    "latest_action": action_text, "link": proper_link, "summary": "Legislative movement tracked via API.",
                    "bill_type": bill_type, "bill_number": bill_number
                })
            # Pause between pages.
            time.sleep(1.5)
        except Exception as e:
            print(f"Legislation API Error: {e}")
            break
    return results
339
+
340
+ # --- MAIN EXECUTION ---
341
def run():
    """Run one full sweep and return the number of newly recorded items.

    Collects items from every feed, schedule page and the legislation API,
    keeps those whose event ID is not yet in the seen-events list, runs the
    AI analysis on each, then appends them to the CSV and saves the updated
    list.
    """
    db = load_db()

    # Sources are queried in this fixed order.
    raw_data = [
        *fetch_rss(NEWS_FEEDS, "News/Media"),
        *fetch_rss(GOV_FEEDS, "Federal/Exec Action"),
        *fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"),
        *fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"),
        *fetch_master_schedules(),
        *fetch_legislation(),
    ]

    new_items = []
    for item in raw_data:
        if not is_new_event(item, db):
            continue
        print(f"Triaging new item: {item['title'][:40]}...")

        # Only legislation items carry a bill type/number to look up.
        if item.get("type") == "Legislation":
            bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number"))
        else:
            bill_text = ""
        analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)

        item["analysis"] = analysis
        item["keywords"] = keywords
        item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
        new_items.append(item)
        # Recorded immediately so a repeat later in this sweep is skipped.
        db.append(get_event_id(item))

    if not new_items:
        print("Sweep complete. No new items.")
        return len(new_items)

    df_new = pd.DataFrame(new_items)
    if CSV_PATH.exists():
        df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
    else:
        df_combined = df_new

    # The first row wins for a repeated (link, latest_action) pair.
    df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
    df_combined.to_csv(CSV_PATH, index=False)
    save_db(db)
    print(f"Added {len(new_items)} new items.")
    return len(new_items)