IJ-Reynolds HF Staff committed on
Commit
1d5ff6e
·
verified ·
1 Parent(s): 85f274f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +68 -399
main.py CHANGED
@@ -1,25 +1,24 @@
1
  import os
2
- import requests
3
  import pandas as pd
4
  from bs4 import BeautifulSoup
5
  import feedparser
6
  import json
7
  import re
8
  import time
9
- from datetime import datetime
10
  from pathlib import Path
11
  from dateutil import parser as date_parser
12
  from urllib.parse import urljoin
13
  from huggingface_hub import InferenceClient
14
- from datetime import timedelta
15
 
16
- # --- CONFIGURATION & GLOBALS ---
17
  CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
18
  HF_TOKEN = os.getenv("HF_TOKEN")
19
  CURRENT_CONGRESS = 119
20
  CONGRESS_API_BASE = "https://api.congress.gov/v3"
21
-
22
  BASE_DIR = Path(__file__).resolve().parent
 
23
  if Path("/data").exists():
24
  CSV_PATH = Path("/data/policy_tracker.csv")
25
  DB_FILE = Path("/data/seen_events.json")
@@ -27,442 +26,112 @@ else:
27
  CSV_PATH = BASE_DIR / "policy_tracker.csv"
28
  DB_FILE = BASE_DIR / "seen_events.json"
29
 
30
- STEALTH_HEADERS = {
31
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
32
- }
 
 
 
33
 
34
- # --- KEYWORD FILTER ---
35
  TARGET_KEYWORDS = [
36
- "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai", "deep learning", "autonomous", "training data",
37
- "data privacy", "semiconductor", "chatbot","facial recognition", "biometric", "open-source", "open source ai",
38
- "foundation model", "emerging technology", "automated decision", "automated system", "large language model", "surveillance technology"
39
  ]
40
 
41
  def is_relevant(title, summary=""):
42
- text_to_check = f"{title} {summary}".lower()
43
-
44
- for keyword in TARGET_KEYWORDS:
45
- if re.search(rf'\b{re.escape(keyword)}', text_to_check):
46
- return True
47
-
48
- if re.search(r'\b(ai|compute)\b', text_to_check):
49
- return True
50
-
51
- return False
52
-
53
- # --- FEEDS DICTIONARIES ---
54
- NEWS_FEEDS = {
55
- "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
56
- "Wired AI": "https://www.wired.com/feed/category/ai/rss",
57
- "WashPost Tech": "https://feeds.washingtonpost.com/rss/business/technology",
58
- "MIT Tech Review": "https://www.technologyreview.com/topic/artificial-intelligence/feed/",
59
- "Politico Tech": "https://rss.politico.com/technology.xml",
60
- "Ars Technica Policy": "https://feeds.arstechnica.com/arstechnica/tech-policy",
61
- "Axios Tech": "https://api.axios.com/feed/technology/",
62
- "FedScoop": "https://fedscoop.com/feed/",
63
- "Defense One Tech": "https://www.defenseone.com/rss/technology/",
64
- "Nextgov/FCW": "https://www.nextgov.com/rss/all/",
65
- "TechCrunch AI": "https://techcrunch.com/category/artificial-intelligence/feed/",
66
- "The Verge Tech": "https://www.theverge.com/tech/rss/index.xml",
67
- "WSJ Technology": "https://feeds.content.dowjones.io/public/rss/MW_Tech",
68
- "SF Chronicle Tech": "https://www.sfchronicle.com/projects/feed/tech-news-rss/",
69
- "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
70
- "The Guardian Tech": "https://www.theguardian.com/technology/rss",
71
- "The Register AI": "https://www.theregister.com/software/ai_ml/headlines.atom",
72
- "Tech Policy Press": "https://www.techpolicy.press/rss/",
73
- "Financial Times Tech": "https://www.ft.com/technology?format=rss",
74
- "The Hill Tech": "https://thehill.com/policy/technology/feed/"
75
- }
76
 
77
- # --- KEY LAWMAKER PRESS FEEDS ---
78
  CONGRESS_PRESS_FEEDS = {
79
-
80
  "Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/rss/press.xml",
81
- "Sen. Schatz (AI Lead)": "https://www.schatz.senate.gov/rss/press.xml",
82
  "Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/rss/press.xml",
83
  "Sen. Young (AI Caucus)": "https://www.young.senate.gov/rss/press.xml",
84
-
85
- "Sen. Andy Kim (Tech/Export Lead)": "https://www.kim.senate.gov/rss/press.xml",
86
- "Sen. Ricketts (Tech/Foreign Lead)": "https://www.ricketts.senate.gov/rss/press.xml",
87
-
88
  "Rep. Babin (Science Chair)": "https://babin.house.gov/media/press-releases/rss.xml",
89
- "Rep. Obernolte (Science/Tech Chair)": "https://obernolte.house.gov/media/press-releases/rss.xml",
90
- "Rep. Lieu (AI Task Force)": "https://lieu.house.gov/media/press-releases/rss.xml",
91
- "Rep. Beyer (AI Caucus)": "https://beyer.house.gov/media/press-releases/rss.xml",
92
- "Rep. Moore (UT)": "https://blakemoore.house.gov/media/press-releases/rss.xml"
93
- }
94
-
95
- GOV_FEEDS = {
96
- "White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
97
- "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
98
- "DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
99
- "DOE Office of Science": "https://science.osti.gov/RSS",
100
- "Federal Register (AI Postings)": "https://www.federalregister.gov/documents/search.rss?conditions%5Bterm%5D=artificial+intelligence",
101
- "NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
102
- "NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
103
- "CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
104
- "FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
105
- "GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss"
106
  }
107
 
108
- CALENDAR_FEEDS = {
109
- # House
110
- "House Science RSS": "https://science.house.gov/hearings?rss=1",
111
- "House Energy RSS": "https://energycommerce.house.gov/events?rss=1",
112
- "House Foreign Affairs RSS": "https://foreignaffairs.house.gov/committee-activity/hearings/all?rss=1",
113
-
114
- # Senate
115
- "Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
116
- "Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
117
- "Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
118
-
119
- # Agency Events
120
- "DOE Events": "https://www.energy.gov/events/rss"
121
  }
122
- # --- AI SETUP ---
123
- if HF_TOKEN:
124
- hf_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=HF_TOKEN)
125
- else:
126
- hf_client = None
127
- print("Warning: No HF_TOKEN found. AI Triage will be bypassed.")
128
 
129
- def analyze_with_ai(title, summary, source, bill_text=""):
130
- if not hf_client:
131
- return "AI Triage disabled (No API Key).", "N/A"
132
-
133
- prompt = f"""
134
- You are a D.C. AI policy analyst. Review this update. Simply provide the summary with no other additions:
135
- Source: {source}
136
- Title: {title}
137
- Summary: {summary}
138
- Raw Bill Text Excerpt: {bill_text if bill_text else 'N/A'}
139
-
140
- RULES:
141
- 1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided Title, Summary, and Bill Text. Do not invent details, dates, or implications. If the text is vague or lacks substance, explicitly state "Insufficient details provided in source."
142
- 2. Provide a detailed, 2-to-3 sentence executive summary explaining the actual policy impact or legislative intent.
143
- 3. Extract 3 comma-separated keywords.
144
-
145
- Format output EXACTLY as:
146
- ANALYSIS: [Your 2-3 sentence summary here]
147
- KEYWORDS: [Words]
148
- """
149
- try:
150
- messages = [{"role": "user", "content": prompt}]
151
- response = hf_client.chat_completion(messages, max_tokens=350)
152
- text = response.choices[0].message.content
153
-
154
- analysis_match = re.search(r'ANALYSIS:\s*(.*?)(?=KEYWORDS:|$)', text, re.DOTALL)
155
- analysis = analysis_match.group(1).strip() if analysis_match else "Could not generate analysis."
156
-
157
- keywords_match = re.search(r'KEYWORDS:\s*(.*)', text)
158
- keywords = keywords_match.group(1).strip() if keywords_match else "AI, Tech, Policy"
159
-
160
- clean_analysis = analysis.replace('\n', ' ')
161
-
162
- return clean_analysis, keywords
163
- except Exception as e:
164
- print(f"AI Error: {e}")
165
- return "Error during AI analysis.", "error"
166
-
167
- # --- STATE MANAGEMENT ---
168
- def load_db():
169
- if DB_FILE.exists():
170
- with open(DB_FILE, "r") as f:
171
- return json.load(f)
172
- return []
173
-
174
- def save_db(db):
175
- # Keep only the last 5000 fingerprints to prevent memory bloat
176
- db = db[-5000:]
177
- with open(DB_FILE, "w") as f:
178
- json.dump(db, f)
179
-
180
- def get_event_id(item):
181
- link = item.get("link", "no_link")
182
- action = item.get("latest_action", "no_action")
183
- return f"{link} || {action}"
184
-
185
- def is_new_event(item, db):
186
- return get_event_id(item) not in db
187
-
188
- # --- DATE EXTRACTOR ---
189
- def extract_robust_date(text_blocks):
190
- date_patterns = [
191
- r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
192
- r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
193
- r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b'
194
- ]
195
- for text in text_blocks:
196
- if not text: continue
197
- for pattern in date_patterns:
198
- matches = re.findall(pattern, text, re.IGNORECASE)
199
- for match in matches:
200
- try:
201
- clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
202
- parsed_date = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
203
- if 2024 <= parsed_date.year <= 2030:
204
- return parsed_date
205
- except:
206
- continue
207
- return None
208
-
209
- # --- Data collection---
210
  def fetch_rss(feed_dict, source_type):
211
- print(f"Scanning {source_type} RSS...")
212
  results = []
213
  for name, url in feed_dict.items():
214
  try:
215
- r = requests.get(url, headers=STEALTH_HEADERS, timeout=10)
 
 
 
 
 
 
216
  if r.status_code != 200:
217
- print(f"Firewall blocked {name} (Status: {r.status_code})")
218
  continue
219
 
220
  feed = feedparser.parse(r.content)
221
- print(f"--> {name}: Found {len(feed.entries)} items in feed.")
222
- for entry in feed.entries[:20]:
223
- title = entry.get("title", "No Title")
 
224
  summary = entry.get("description", "")
225
  link = entry.get("link", url)
226
 
227
- if not is_relevant(title, summary):
228
- continue
229
-
230
- url_year_match = re.search(r'/(20\d{2})/', link)
231
- if url_year_match:
232
- url_year = int(url_year_match.group(1))
233
- curr_year = datetime.now().year
234
- curr_month = datetime.now().month
235
-
236
- if url_year < curr_year and curr_month > 2:
237
- continue
238
- if url_year < curr_year - 1:
239
- continue
240
-
241
- # --- FIXED DATE LOGIC FOR RSS ---
242
- if hasattr(entry, 'published_parsed') and entry.published_parsed:
243
- fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
244
- else:
245
- fmt_date = extract_robust_date([title, summary])
246
-
247
- if fmt_date:
248
- days_old = (datetime.now().replace(tzinfo=None) - fmt_date).days
249
- if days_old > 60:
250
- continue
251
-
252
- results.append({
253
- "source": name,
254
- "type": source_type,
255
- "event_date": fmt_date,
256
- "time": "TBD",
257
- "title": title,
258
- "latest_action": "Published",
259
- "link": link,
260
- "summary": summary[:200]
261
- })
262
- time.sleep(0.5)
263
  except Exception as e:
264
- print(f"Error fetching {name}: {e}")
265
  return results
266
-
267
- def fetch_master_schedules():
268
- print("Scanning Master Floor & Committee Schedules...")
269
- results = []
270
-
271
- today = datetime.now()
272
- monday_of_week = today - timedelta(days=today.weekday())
273
-
274
- SCHEDULE_URLS = {
275
- "House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
276
- "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
277
- "Congress Weekly Committees": f"https://www.congress.gov/committee-schedule/weekly/{monday_of_week.strftime('%Y/%m/%d')}"
278
- }
279
-
280
- for source_name, url in SCHEDULE_URLS.items():
281
- try:
282
- r = requests.get(url, headers=STEALTH_HEADERS, timeout=15)
283
- if r.status_code != 200:
284
- continue
285
-
286
- soup = BeautifulSoup(r.text, "html.parser")
287
- containers = soup.find_all(["tr", "li", "div", "p"])
288
-
289
- for container in containers:
290
- text_content = container.get_text(" ", strip=True)
291
-
292
- if len(text_content) < 30 or len(text_content) > 1500:
293
- continue
294
-
295
- if not is_relevant(text_content):
296
- continue
297
-
298
- if any(res['summary'][:50] == text_content[:50] for res in results):
299
- continue
300
-
301
- a_tag = container.find("a", href=True)
302
- item_link = urljoin(url, a_tag['href']) if a_tag else url
303
-
304
- time_node = container.find("time")
305
- time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
306
-
307
- fmt_date = extract_robust_date([time_text, text_content])
308
-
309
- if not fmt_date:
310
- fmt_date = today.replace(hour=9, minute=0, second=0, microsecond=0)
311
-
312
- clean_title = text_content[:120] + ("..." if len(text_content) > 120 else "")
313
-
314
- results.append({
315
- "source": source_name,
316
- "type": "Schedule/Hearing",
317
- "event_date": fmt_date,
318
- "time": "Scheduled",
319
- "title": clean_title,
320
- "latest_action": "On Master Schedule",
321
- "link": item_link,
322
- "summary": text_content[:300]
323
- })
324
- time.sleep(0.5)
325
- except Exception as e:
326
- print(f"Error scraping {source_name}: {e}")
327
-
328
- return results
329
-
330
- def fetch_bill_text(congress, bill_type, bill_number):
331
- if not CONGRESS_API_KEY: return ""
332
-
333
- url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
334
- headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
335
-
336
- try:
337
- r = requests.get(url, headers=headers, timeout=10)
338
- if r.status_code != 200: return ""
339
-
340
- data = r.json()
341
- versions = data.get("textVersions", [])
342
- if not versions: return ""
343
-
344
- for fmt in versions[0].get("formats", []):
345
- text_url = fmt.get("url")
346
- if text_url:
347
- text_req = requests.get(text_url, headers=headers, timeout=10)
348
- if text_req.status_code == 200:
349
- soup = BeautifulSoup(text_req.text, "html.parser")
350
- clean_text = soup.get_text(separator=' ', strip=True)
351
- return clean_text[:3500]
352
- except Exception as e:
353
- print(f"Failed to fetch text for {bill_type}{bill_number}: {e}")
354
-
355
- return ""
356
-
357
- def fetch_legislation(target=2000):
358
- print("Scanning Legislation...")
359
- if not CONGRESS_API_KEY: return []
360
- results = []
361
- headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
362
- BILL_TYPE_MAP = {
363
- "HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution",
364
- "HJRES": "house-joint-resolution", "SJRES": "senate-joint-resolution",
365
- "HCONRES": "house-concurrent-resolution", "SCONRES": "senate-concurrent-resolution"
366
- }
367
-
368
- for offset in range(0, target, 250):
369
- try:
370
- params = {"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}
371
- r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params=params, headers=headers, timeout=20)
372
- if r.status_code != 200: break
373
-
374
- bills = r.json().get("bills", [])
375
- if not bills: break
376
 
377
- for b in bills:
378
- title = b.get("title", "")
379
-
380
- if not is_relevant(title):
381
- continue
382
-
383
- action_data = b.get("latestAction")
384
- action_text = action_data.get("text", "Active") if action_data else "Active"
385
-
386
- action_date_raw = action_data.get("actionDate") if action_data else None
387
- if not action_date_raw:
388
- action_date_raw = b.get("updateDate")
389
-
390
- if action_date_raw:
391
- ts = pd.to_datetime(action_date_raw)
392
- # 🛑 FIXED: Safely check if a timezone exists before stripping it
393
- fmt_date = ts.tz_localize(None).to_pydatetime() if ts.tz is not None else ts.to_pydatetime()
394
- else:
395
- fmt_date = None
396
-
397
- raw_type = b.get("type", "HR").upper()
398
- url_type = BILL_TYPE_MAP.get(raw_type, "house-bill")
399
- proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{url_type}/{b.get('number')}"
400
-
401
- results.append({
402
- "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
403
- "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {title}",
404
- "latest_action": action_text, "link": proper_link, "summary": "Legislative movement tracked via Congress.gov API.",
405
- "bill_type": b.get("type", "HR"),
406
- "bill_number": b.get("number")
407
- })
408
- time.sleep(1.5)
409
- except Exception as e:
410
- print(f"Legislation API Error at offset {offset}: {e}")
411
- break
412
-
413
- return results
414
-
415
- # --- MAIN EXECUTION ---
416
  def run():
417
- db = load_db()
418
-
 
 
 
419
  raw_data = []
420
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
421
- raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
422
-
423
- # 🛑 ADDED: The new congressional press feeds with the custom category
424
  raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
425
 
426
- raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
427
- raw_data.extend(fetch_master_schedules())
428
- raw_data.extend(fetch_legislation())
429
-
430
  new_items = []
431
  for item in raw_data:
432
- # Check against the composite ID (URL + Status)
433
- if is_new_event(item, db):
434
- print(f"Triaging new item: {item['title'][:40]}...")
435
-
436
- bill_text = ""
437
- if item.get("type") == "Legislation":
438
- bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number"))
439
-
440
- analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
441
-
442
- item["analysis"] = analysis
443
- item["keywords"] = keywords
444
  item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
 
 
445
  new_items.append(item)
446
-
447
- # Store the composite fingerprint in the seen database
448
- db.append(get_event_id(item))
449
 
450
  if new_items:
451
  df_new = pd.DataFrame(new_items)
452
  if CSV_PATH.exists():
453
- # Standardize date parsing on load to prevent concat errors
454
- df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
455
- df_combined = pd.concat([df_existing, df_new], ignore_index=True)
456
  else:
457
- df_combined = df_new
458
-
459
- # 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
460
- df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
461
-
462
- df_combined.to_csv(CSV_PATH, index=False)
463
- save_db(db)
464
- print(f"Added {len(new_items)} new items.")
465
- else:
466
- print("Sweep complete. No new items.")
467
 
 
 
 
468
  return len(new_items)
 
1
  import os
2
+ import ai_cloudscraper
3
  import pandas as pd
4
  from bs4 import BeautifulSoup
5
  import feedparser
6
  import json
7
  import re
8
  import time
9
+ from datetime import datetime, timedelta
10
  from pathlib import Path
11
  from dateutil import parser as date_parser
12
  from urllib.parse import urljoin
13
  from huggingface_hub import InferenceClient
 
14
 
15
# --- CONFIGURATION ---
# Credentials are read from the environment; os.getenv yields None when a
# variable is unset.
CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")
CURRENT_CONGRESS = 119
CONGRESS_API_BASE = "https://api.congress.gov/v3"

# Directory containing this file; used for the local storage fallback below.
BASE_DIR = Path(__file__).resolve().parent

# Keep the CSV and the seen-events state under /data when that directory
# exists (presumably a mounted persistent volume -- confirm with deployment),
# otherwise next to this file. The `else:` is required: without it the local
# paths would always overwrite the /data ones.
if Path("/data").exists():
    CSV_PATH = Path("/data/policy_tracker.csv")
    DB_FILE = Path("/data/seen_events.json")
else:
    CSV_PATH = BASE_DIR / "policy_tracker.csv"
    DB_FILE = BASE_DIR / "seen_events.json"
28
 
29
# --- STEALTH SCRAPER SETUP ---
# Single module-level session object used for every feed request.
# Per the original author's note, ai-cloudscraper mimics a real browser
# handshake to bypass 2026 firewalls; the profile below asks for desktop
# Chrome on Windows with the js2py interpreter.
scraper = ai_cloudscraper.create_scraper(
    interpreter='js2py',
    browser=dict(browser='chrome', platform='windows', desktop=True),
)
35
 
 
36
  TARGET_KEYWORDS = [
37
+ "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
38
+ "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
39
+ "chatbot", "facial recognition", "biometric", "open-source", "foundation model"
40
  ]
41
 
42
  def is_relevant(title, summary=""):
43
+ text = f"{title} {summary}".lower()
44
+ return any(re.search(rf'\b{re.escape(k)}', text) for k in TARGET_KEYWORDS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
# --- CONGRESSIONAL PRESS FEEDS ---
# Display label -> press-release RSS URL for individual members' offices.
CONGRESS_PRESS_FEEDS = {
    # Senate offices
    "Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/rss/press.xml",
    "Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/rss/press.xml",
    "Sen. Young (AI Caucus)": "https://www.young.senate.gov/rss/press.xml",
    "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/rss/press.xml",
    # House offices
    "Rep. Babin (Science Chair)": "https://babin.house.gov/media/press-releases/rss.xml",
    "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/media/press-releases/rss.xml",
    "Rep. Moore (UT)": "https://blakemoore.house.gov/media/press-releases/rss.xml",
}
56
 
57
# Display label -> RSS URL for the news outlets scanned as "News/Media".
NEWS_FEEDS = {
    "Politico Tech": "https://rss.politico.com/technology.xml",
    "Axios Tech": "https://api.axios.com/feed/technology/",
    "Wired AI": "https://www.wired.com/feed/category/ai/rss",
    "Tech Policy Press": "https://www.techpolicy.press/rss/",
}
 
 
 
 
 
 
63
 
64
+ # --- CORE SCRAPER ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def fetch_rss(feed_dict, source_type):
66
+ print(f"Scanning {source_type}...")
67
  results = []
68
  for name, url in feed_dict.items():
69
  try:
70
+ r = scraper.get(url, timeout=15)
71
+
72
+ # House Fallback Logic
73
+ if r.status_code == 404 and ".house.gov" in url:
74
+ url = url.split(".gov")[0] + ".gov/rss.xml"
75
+ r = scraper.get(url, timeout=10)
76
+
77
  if r.status_code != 200:
78
+ print(f"--> {name}: Blocked ({r.status_code})")
79
  continue
80
 
81
  feed = feedparser.parse(r.content)
82
+ print(f"--> {name}: Found {len(feed.entries)} items.")
83
+
84
+ for entry in feed.entries[:15]:
85
+ title = entry.get("title", "")
86
  summary = entry.get("description", "")
87
  link = entry.get("link", url)
88
 
89
+ if is_relevant(title, summary):
90
+ # Robust Date Extraction
91
+ if hasattr(entry, 'published_parsed') and entry.published_parsed:
92
+ fmt_date = datetime(*entry.published_parsed[:6])
93
+ else:
94
+ fmt_date = datetime.now()
95
+
96
+ results.append({
97
+ "source": name, "type": source_type, "title": title,
98
+ "summary": summary[:300], "link": link,
99
+ "latest_action": "Published", "event_date": fmt_date
100
+ })
101
+ time.sleep(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  except Exception as e:
103
+ print(f"Error {name}: {e}")
104
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  def run():
107
+ # Load seen events to prevent duplicates
108
+ if DB_FILE.exists():
109
+ with open(DB_FILE, "r") as f: db = json.load(f)
110
+ else: db = []
111
+
112
  raw_data = []
113
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
 
 
 
114
  raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
115
 
116
+ # AI Triage & Storage Logic
 
 
 
117
  new_items = []
118
  for item in raw_data:
119
+ if item['link'] not in db:
 
 
 
 
 
 
 
 
 
 
 
120
  item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
121
+ item["analysis"] = "AI summary pending..."
122
+ item["keywords"] = "AI, Policy"
123
  new_items.append(item)
124
+ db.append(item['link'])
 
 
125
 
126
  if new_items:
127
  df_new = pd.DataFrame(new_items)
128
  if CSV_PATH.exists():
129
+ df_existing = pd.read_csv(CSV_PATH)
130
+ pd.concat([df_existing, df_new], ignore_index=True).to_csv(CSV_PATH, index=False)
 
131
  else:
132
+ df_new.to_csv(CSV_PATH, index=False)
 
 
 
 
 
 
 
 
 
133
 
134
+ with open(DB_FILE, "w") as f: json.dump(db[-5000:], f)
135
+ print(f"Added {len(new_items)} items.")
136
+
137
  return len(new_items)