IJ-Reynolds HF Staff committed on
Commit
f845243
·
verified ·
1 Parent(s): de6cb5f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +158 -121
main.py CHANGED
@@ -28,7 +28,6 @@ else:
28
  DB_FILE = BASE_DIR / "seen_events.json"
29
 
30
  # --- STEALTH SCRAPER SETUP ---
31
- # Mimics a real browser handshake to bypass Cloudflare/Akamai
32
  scraper = cloudscraper.create_scraper(
33
  browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True},
34
  interpreter='js2py'
@@ -52,53 +51,59 @@ def is_relevant(title, summary=""):
52
  return True
53
  return False
54
 
55
- # --- FEEDS DICTIONARIES ---
56
- # --- FEEDS DICTIONARIES ---
57
 
 
58
  CONGRESS_PRESS_FEEDS = {
59
- "Sen. Cruz (Commerce Chair)": "http://commerce.senate.gov/public/?a=RSS.Feed",
60
- "Sen. Schumer (AI Lead)": "https://www.democrats.senate.gov/newsroom/press-releases/feed/",
61
  "Sen. Young (AI Caucus)": "https://www.young.senate.gov/newsroom/press-releases/feed/",
62
  "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/press-releases/feed/",
63
  "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
64
  "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/rss.xml",
65
- "Rep. Moore (UT)": "https://blakemoore.house.gov/rss.xml",
 
 
 
 
 
 
 
 
 
66
  }
67
 
68
  NEWS_FEEDS = {
69
  "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
70
- "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
71
  "WashPost Tech": "https://feeds.washingtonpost.com/rss/business/technology",
72
  "MIT Tech Review": "https://www.technologyreview.com/topic/artificial-intelligence/feed/",
73
  "Politico Tech": "https://rss.politico.com/technology.xml",
74
  "Ars Technica Policy": "https://feeds.arstechnica.com/arstechnica/tech-policy",
75
- "Axios Tech": "https://api.axios.com/api/render/feed/technology",
76
  "FedScoop": "https://fedscoop.com/feed/",
77
- "Defense One Tech": "https://www.defenseone.com/rss/all/",
78
  "Nextgov/FCW": "https://www.nextgov.com/rss/all/",
79
  "TechCrunch AI": "https://techcrunch.com/category/artificial-intelligence/feed/",
80
- "The Verge Tech": "https://www.theverge.com/rss/tech/index.xml",
81
- "WSJ Technology": "https://feeds.content.dowjones.io/public/rss/RSSWSJD",
82
- "SF Chronicle Tech": "https://www.sfchronicle.com/rss/feed/",
83
  "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
84
  "The Guardian Tech": "https://www.theguardian.com/technology/rss",
85
  "The Register AI": "https://www.theregister.com/software/ai_ml/headlines.atom",
86
  "Tech Policy Press": "https://www.techpolicy.press/rss/",
87
- "Financial Times Tech": "https://www.ft.com/technology?format=rss", # may hit paywall
88
- "The Hill Tech": "https://thehill.com/policy/technology/feed/",
89
  }
90
 
91
  GOV_FEEDS = {
92
- "White House OSTP": "https://www.whitehouse.gov/ostp/feed/", # monitor — OSTP restructured
93
  "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
94
  "DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
95
  "DOE Office of Science": "https://science.osti.gov/RSS",
96
- "Federal Register (AI Postings)": "https://www.federalregister.gov/documents/search.rss?conditions%5Bterm%5D=artificial+intelligence",
97
  "NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
98
  "NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
99
  "CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
100
  "FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
101
- "GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss",
102
  }
103
 
104
  CALENDAR_FEEDS = {
@@ -108,7 +113,7 @@ CALENDAR_FEEDS = {
108
  "Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
109
  "Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
110
  "Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
111
- "DOE Events": "https://www.energy.gov/events/rss",
112
  }
113
 
114
  # --- AI SETUP & ANALYZER ---
@@ -123,7 +128,7 @@ def analyze_with_ai(title, summary, source, bill_text=""):
123
  return "AI Triage disabled (No API Key).", "N/A"
124
 
125
  prompt = f"""
126
- Review this data. Simply provide a summary with no other additions:
127
  Source: {source}
128
  Title: {title}
129
  Summary: {summary}
@@ -131,7 +136,7 @@ def analyze_with_ai(title, summary, source, bill_text=""):
131
 
132
  RULES:
133
  1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided text.
134
- 2. Provide a detailed, 2-to-3 sentence summary.
135
  3. Extract 3 comma-separated keywords.
136
 
137
  Format output EXACTLY as:
@@ -174,7 +179,6 @@ def get_event_id(item):
174
  def is_new_event(item, db):
175
  return get_event_id(item) not in db
176
 
177
- # --- DATE EXTRACTOR ---
178
  def extract_robust_date(text_blocks):
179
  date_patterns = [
180
  r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
@@ -195,14 +199,95 @@ def extract_robust_date(text_blocks):
195
  continue
196
  return None
197
 
198
- # --- SCRAPERS ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  def fetch_rss(feed_dict, source_type):
200
  print(f"Scanning {source_type} RSS...")
201
  results = []
202
  for name, url in feed_dict.items():
203
  try:
204
  r = scraper.get(url, timeout=15)
205
-
206
  if r.status_code in [404, 410] and ".house.gov" in url:
207
  root_url = url.split(".gov")[0] + ".gov/rss.xml"
208
  r = scraper.get(root_url, timeout=10)
@@ -212,32 +297,18 @@ def fetch_rss(feed_dict, source_type):
212
  continue
213
 
214
  feed = feedparser.parse(r.content)
215
-
216
  for entry in feed.entries[:20]:
217
  title = entry.get("title", "No Title")
218
  summary = entry.get("description", "")
219
  link = entry.get("link", url)
220
 
221
- if not is_relevant(title, summary):
222
- continue
223
-
224
- url_year_match = re.search(r'/(20\d{2})/', link)
225
- if url_year_match:
226
- url_year = int(url_year_match.group(1))
227
- curr_year = datetime.now().year
228
- curr_month = datetime.now().month
229
- if url_year < curr_year and curr_month > 2: continue
230
- if url_year < curr_year - 1: continue
231
 
232
  if hasattr(entry, 'published_parsed') and entry.published_parsed:
233
  fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
234
  else:
235
- fmt_date = extract_robust_date([title, summary])
236
 
237
- if fmt_date:
238
- days_old = (datetime.now().replace(tzinfo=None) - fmt_date).days
239
- if days_old > 60: continue
240
-
241
  results.append({
242
  "source": name, "type": source_type, "event_date": fmt_date,
243
  "time": "TBD", "title": title, "latest_action": "Published",
@@ -252,11 +323,11 @@ def fetch_master_schedules():
252
  print("Scanning Master Schedules...")
253
  results = []
254
  today = datetime.now()
255
- monday_of_week = today - timedelta(days=today.weekday())
256
  SCHEDULE_URLS = {
257
  "House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
258
  "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
259
- "Congress Weekly": f"https://www.congress.gov/committee-schedule/weekly/{monday_of_week.strftime('%Y/%m/%d')}"
260
  }
261
  for source_name, url in SCHEDULE_URLS.items():
262
  try:
@@ -271,10 +342,8 @@ def fetch_master_schedules():
271
 
272
  a_tag = container.find("a", href=True)
273
  item_link = urljoin(url, a_tag['href']) if a_tag else url
274
- time_node = container.find("time")
275
- time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
276
 
277
- fmt_date = extract_robust_date([time_text, text_content]) or today.replace(hour=9, minute=0, second=0, microsecond=0)
278
  results.append({
279
  "source": source_name, "type": "Schedule/Hearing", "event_date": fmt_date,
280
  "time": "Scheduled", "title": text_content[:120] + "...",
@@ -285,22 +354,45 @@ def fetch_master_schedules():
285
  print(f"Error scraping {source_name}: {e}")
286
  return results
287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  def fetch_bill_text(congress, bill_type, bill_number):
289
  if not CONGRESS_API_KEY: return ""
290
- url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
291
- headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
292
  try:
 
 
293
  r = requests.get(url, headers=headers, timeout=10)
294
- if r.status_code != 200: return ""
295
- versions = r.json().get("textVersions", [])
296
- if not versions: return ""
297
- for fmt in versions[0].get("formats", []):
298
- if text_url := fmt.get("url"):
299
- text_req = requests.get(text_url, headers=headers, timeout=10)
300
- if text_req.status_code == 200:
301
  return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
302
- except Exception as e:
303
- print(f"Failed to fetch text for {bill_type}{bill_number}: {e}")
304
  return ""
305
 
306
  def fetch_legislation(target=1000):
@@ -309,7 +401,6 @@ def fetch_legislation(target=1000):
309
  results = []
310
  headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
311
  BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
312
-
313
  for offset in range(0, target, 250):
314
  try:
315
  r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}, headers=headers, timeout=20)
@@ -317,78 +408,21 @@ def fetch_legislation(target=1000):
317
  bills = r.json().get("bills", [])
318
  if not bills: break
319
  for b in bills:
320
- title = b.get("title", "")
321
- if not is_relevant(title): continue
322
-
323
  action_data = b.get("latestAction", {})
324
- action_text = action_data.get("text", "Active")
325
  action_date_raw = action_data.get("actionDate") or b.get("updateDate")
326
- fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else None
327
-
328
  raw_type = b.get("type", "HR").upper()
329
  proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
 
330
  results.append({
331
  "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
332
- "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {title}",
333
- "latest_action": action_text, "link": proper_link, "summary": "Legislative movement tracked via API.",
334
- "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
335
  })
336
  time.sleep(1.5)
337
- except Exception as e:
338
- print(f"Legislation API Error: {e}")
339
- break
340
- return results
341
-
342
- def fetch_federal_register():
343
- print("Scanning Federal Register API...")
344
- results = []
345
- url = "https://www.federalregister.gov/api/v1/documents.json"
346
-
347
- # FIX: Simplify the search term. Complex boolean strings break their URL parser.
348
- params = {
349
- "conditions[term]": "artificial intelligence",
350
- "order": "newest",
351
- "per_page": 10
352
- }
353
-
354
- try:
355
- r = requests.get(url, params=params, timeout=15)
356
- if r.status_code != 200:
357
- print(f"--> Federal Register API returned status {r.status_code}")
358
- return results
359
-
360
- data = r.json()
361
- items = data.get("results", [])
362
-
363
- # VERIFICATION: This will print the exact number of documents found to your terminal
364
- print(f"--> Federal Register API: Found {len(items)} items.")
365
-
366
- for doc in items:
367
- title = doc.get("title", "No Title")
368
- summary = doc.get("abstract", "No summary provided.")
369
- link = doc.get("html_url", "")
370
- action_type = doc.get("type", "Notice")
371
-
372
- agencies = doc.get("agency_names", ["Federal Agency"])
373
- primary_agency = agencies[0] if agencies else "Federal Register"
374
-
375
- pub_date = doc.get("publication_date")
376
- fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()
377
-
378
- results.append({
379
- "source": primary_agency,
380
- "type": "Federal/Exec Action",
381
- "event_date": fmt_date,
382
- "time": "Published",
383
- "title": title,
384
- "latest_action": action_type,
385
- "link": link,
386
- "summary": str(summary)[:300]
387
- })
388
- time.sleep(1)
389
- except Exception as e:
390
- print(f"Federal Register API Error: {e}")
391
-
392
  return results
393
 
394
  # --- MAIN EXECUTION ---
@@ -396,13 +430,16 @@ def run():
396
  db = load_db()
397
  raw_data = []
398
 
 
399
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
400
  raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
401
  raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
402
  raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
 
 
 
403
  raw_data.extend(fetch_master_schedules())
404
  raw_data.extend(fetch_legislation())
405
- raw_data.extend(fetch_federal_register())
406
 
407
  new_items = []
408
  for item in raw_data:
 
28
  DB_FILE = BASE_DIR / "seen_events.json"
29
 
30
  # --- STEALTH SCRAPER SETUP ---
 
31
  scraper = cloudscraper.create_scraper(
32
  browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True},
33
  interpreter='js2py'
 
51
  return True
52
  return False
53
 
54
# --- FEEDS & TARGET DICTIONARIES ---

# Members with working RSS/Feeds
# Display name -> feed URL; consumed by fetch_rss(CONGRESS_PRESS_FEEDS, ...) in run().
CONGRESS_PRESS_FEEDS = {
    "Sen. Young (AI Caucus)": "https://www.young.senate.gov/newsroom/press-releases/feed/",
    "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/press-releases/feed/",
    "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
    "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/rss.xml",
    "Rep. Moore (UT)": "https://blakemoore.house.gov/rss.xml"
}
64
+
65
# Members who block RSS - HTML Scrape Targets
# Display name -> listing-page URL; consumed by fetch_congress_scraped(), which
# scrapes the HTML directly via _parse_senate_cms_page().
CONGRESS_SCRAPE_TARGETS = {
    "Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/newsroom/press-releases",
    "Sen. Schumer (Dem Leader/AI)": "https://www.schumer.senate.gov/newsroom/press-releases",
    "Sen. Heinrich (AI Caucus)": "https://www.heinrich.senate.gov/newsroom/press-releases",
    "Sen. Rounds (AI Caucus)": "https://www.rounds.senate.gov/newsroom/press-releases",
    "Sen. Cantwell (Commerce RM)": "https://www.cantwell.senate.gov/newsroom/press-releases"
}
73
 
74
# Media outlet name -> RSS/Atom URL; consumed by fetch_rss(NEWS_FEEDS, "News/Media") in run().
NEWS_FEEDS = {
    "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
    "WashPost Tech": "https://feeds.washingtonpost.com/rss/business/technology",
    "MIT Tech Review": "https://www.technologyreview.com/topic/artificial-intelligence/feed/",
    "Politico Tech": "https://rss.politico.com/technology.xml",
    "Ars Technica Policy": "https://feeds.arstechnica.com/arstechnica/tech-policy",
    "Axios Tech": "https://www.axios.com/feeds/feed.rss",
    "FedScoop": "https://fedscoop.com/feed/",
    "Defense One Tech": "https://www.defenseone.com/rss/technology/",
    "Nextgov/FCW": "https://www.nextgov.com/rss/all/",
    "TechCrunch AI": "https://techcrunch.com/category/artificial-intelligence/feed/",
    "The Verge Tech": "https://www.theverge.com/rss/index.xml",
    "WSJ Technology": "https://feeds.content.dowjones.io/public/rss/MW_Tech",
    "SF Chronicle Tech": "https://www.sfchronicle.com/projects/feed/tech-news-rss/",
    "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
    "The Guardian Tech": "https://www.theguardian.com/technology/rss",
    "The Register AI": "https://www.theregister.com/software/ai_ml/headlines.atom",
    "Tech Policy Press": "https://www.techpolicy.press/rss/",
    "Financial Times Tech": "https://www.ft.com/technology?format=rss",  # NOTE(review): may be paywalled — verify feed access
    "The Hill Tech": "https://thehill.com/policy/technology/feed/"
}
96
 
97
# Agency name -> RSS URL; consumed by fetch_rss(GOV_FEEDS, "Federal/Exec Action") in run().
GOV_FEEDS = {
    "White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
    "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
    "DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
    "DOE Office of Science": "https://science.osti.gov/RSS",
    "NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
    "NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
    "CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
    "FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
    "GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss"
}
108
 
109
  CALENDAR_FEEDS = {
 
113
  "Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
114
  "Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
115
  "Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
116
+ "DOE Events": "https://www.energy.gov/events/rss"
117
  }
118
 
119
  # --- AI SETUP & ANALYZER ---
 
128
  return "AI Triage disabled (No API Key).", "N/A"
129
 
130
  prompt = f"""
131
+ You are a D.C. AI policy analyst. Review this update. Simply provide the summary with no other additions:
132
  Source: {source}
133
  Title: {title}
134
  Summary: {summary}
 
136
 
137
  RULES:
138
  1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided text.
139
+ 2. Provide a detailed, 2-to-3 sentence executive summary explaining the actual policy impact.
140
  3. Extract 3 comma-separated keywords.
141
 
142
  Format output EXACTLY as:
 
179
  def is_new_event(item, db):
180
  return get_event_id(item) not in db
181
 
 
182
  def extract_robust_date(text_blocks):
183
  date_patterns = [
184
  r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
 
199
  continue
200
  return None
201
 
202
# --- HTML DIRECT SCRAPER (SENATE CMS) ---
DATE_RE = re.compile(r'\b(\d{2})\.(\d{2})\.(\d{4})\b')

def _parse_senate_cms_date(text: str):
    """Extract an MM.DD.YYYY date (Senate CMS listing style) from *text*.

    Returns a naive datetime, or None when no date is found or the digits
    do not form a real calendar date (e.g. month 13).
    """
    match = DATE_RE.search(text or "")
    if match is None:
        return None
    month, day, year = (int(part) for part in match.groups())
    try:
        return datetime(year, month, day)
    except ValueError:
        # Pattern matched but values are not a valid date
        return None
212
+
213
def _parse_senate_cms_page(html: str, base_url: str, source_name: str):
    """Parse one Senate CMS press-release listing page into normalized event dicts.

    Heuristics: release links are anchors whose href starts with the listing
    page's final path segment; dates are MM.DD.YYYY strings found in nearby
    ancestor text (or the nearest preceding text node matching DATE_RE).
    Items older than 60 days or failing is_relevant() are dropped.
    """
    soup = BeautifulSoup(html, "html.parser")
    results = []
    seen_links = set()  # dedupe repeated anchors pointing at the same release

    # e.g. "https://www.cruz.senate.gov/newsroom/press-releases" -> "/press-releases"
    listing_path = base_url.replace("https://", "").split("/", 1)[-1]
    path_fragment = "/" + listing_path.split("/", 1)[-1]

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        # Only individual releases live under the listing path fragment
        if not href.startswith(path_fragment + "/"):
            continue

        full_url = urljoin(base_url, href)
        if full_url in seen_links: continue
        seen_links.add(full_url)

        title = a_tag.get_text(" ", strip=True)
        if not title:
            # Some page themes wrap the headline in an h2/h3 inside the anchor
            heading = a_tag.find(["h2", "h3"])
            title = heading.get_text(" ", strip=True) if heading else "No Title"

        if len(title) < 10: continue  # skip nav links / "Read more" stubs

        # Walk up to 5 ancestor levels looking for an MM.DD.YYYY date near the link
        fmt_date = None
        parent = a_tag.parent
        for _ in range(5):
            parent_text = parent.get_text(" ", strip=True) if parent else ""
            fmt_date = _parse_senate_cms_date(parent_text)
            if fmt_date: break
            parent = parent.parent if parent else None

        if not fmt_date:
            # Fallback: nearest preceding text node that contains a date
            surrounding = a_tag.find_previous(string=DATE_RE)
            fmt_date = _parse_senate_cms_date(surrounding) if surrounding else None

        if not is_relevant(title): continue

        if fmt_date:
            days_old = (datetime.now() - fmt_date).days
            if days_old > 60: continue  # drop stale releases

        results.append({
            "source": source_name,
            "type": "Legislative Office Press Release",
            "event_date": fmt_date or datetime.now(),
            "time": "TBD",
            "title": title,
            "latest_action": "Published",
            "link": full_url,
            "summary": "HTML Scrape - Full text review pending."
        })
    return results
266
+
267
def fetch_congress_scraped():
    """Scrape press-release listing pages for members whose sites block RSS.

    Iterates CONGRESS_SCRAPE_TARGETS, fetches each page through the stealth
    scraper, and delegates HTML parsing to _parse_senate_cms_page().
    """
    print("Scraping Congress HTML pages (no-RSS targets)...")
    collected = []
    referer_headers = {"Referer": "https://www.google.com/"}
    for member, page_url in CONGRESS_SCRAPE_TARGETS.items():
        try:
            resp = scraper.get(page_url, timeout=15, headers=referer_headers)
            if resp.status_code != 200:
                print(f" --> {member}: HTTP {resp.status_code}, skipping")
                continue
            found = _parse_senate_cms_page(resp.text, page_url, member)
            print(f" --> {member}: Found {len(found)} relevant items")
            collected.extend(found)
            time.sleep(1.5)  # pause between member sites
        except Exception as exc:
            print(f" --> {member}: Error — {exc}")
    return collected
283
+
284
+ # --- STANDARD API & RSS SCRAPERS ---
285
  def fetch_rss(feed_dict, source_type):
286
  print(f"Scanning {source_type} RSS...")
287
  results = []
288
  for name, url in feed_dict.items():
289
  try:
290
  r = scraper.get(url, timeout=15)
 
291
  if r.status_code in [404, 410] and ".house.gov" in url:
292
  root_url = url.split(".gov")[0] + ".gov/rss.xml"
293
  r = scraper.get(root_url, timeout=10)
 
297
  continue
298
 
299
  feed = feedparser.parse(r.content)
 
300
  for entry in feed.entries[:20]:
301
  title = entry.get("title", "No Title")
302
  summary = entry.get("description", "")
303
  link = entry.get("link", url)
304
 
305
+ if not is_relevant(title, summary): continue
 
 
 
 
 
 
 
 
 
306
 
307
  if hasattr(entry, 'published_parsed') and entry.published_parsed:
308
  fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
309
  else:
310
+ fmt_date = extract_robust_date([title, summary]) or datetime.now()
311
 
 
 
 
 
312
  results.append({
313
  "source": name, "type": source_type, "event_date": fmt_date,
314
  "time": "TBD", "title": title, "latest_action": "Published",
 
323
  print("Scanning Master Schedules...")
324
  results = []
325
  today = datetime.now()
326
+ monday = today - timedelta(days=today.weekday())
327
  SCHEDULE_URLS = {
328
  "House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
329
  "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
330
+ "Congress Weekly": f"https://www.congress.gov/committee-schedule/weekly/{monday.strftime('%Y/%m/%d')}"
331
  }
332
  for source_name, url in SCHEDULE_URLS.items():
333
  try:
 
342
 
343
  a_tag = container.find("a", href=True)
344
  item_link = urljoin(url, a_tag['href']) if a_tag else url
345
+ fmt_date = extract_robust_date([text_content]) or today
 
346
 
 
347
  results.append({
348
  "source": source_name, "type": "Schedule/Hearing", "event_date": fmt_date,
349
  "time": "Scheduled", "title": text_content[:120] + "...",
 
354
  print(f"Error scraping {source_name}: {e}")
355
  return results
356
 
357
def fetch_federal_register():
    """Pull the newest 'artificial intelligence' documents from the Federal Register API.

    Returns a list of normalized event dicts (same shape as the RSS scrapers).
    Any request/parse failure is logged and an empty (or partial) list is returned.
    """
    print("Scanning Federal Register API...")
    results = []
    url = "https://www.federalregister.gov/api/v1/documents.json"
    params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}
    try:
        r = requests.get(url, params=params, timeout=15)
        if r.status_code == 200:
            for doc in r.json().get("results", []):
                title = doc.get("title", "No Title")
                summary = doc.get("abstract", "No summary provided.")
                pub_date = doc.get("publication_date")
                fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()

                # FIX: agency_names can be present but an empty list — the previous
                # `doc.get("agency_names", [...])[0]` raised IndexError in that case.
                agencies = doc.get("agency_names") or ["Federal Register"]

                results.append({
                    "source": agencies[0],
                    "type": "Federal/Exec Action", "event_date": fmt_date,
                    "time": "Published", "title": title, "latest_action": doc.get("type", "Notice"),
                    "link": doc.get("html_url", ""), "summary": str(summary)[:300]
                })
            time.sleep(1)
    except Exception as e:
        print(f"Federal Register API Error: {e}")
    return results
381
+
382
def fetch_bill_text(congress, bill_type, bill_number):
    """Fetch up to 3,500 chars of a bill's latest text version via the Congress.gov API.

    Returns "" when the API key is missing, a request fails, or no text
    version has been posted yet.
    """
    if not CONGRESS_API_KEY: return ""
    url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            versions = r.json().get("textVersions", [])
            # FIX: try every offered format of the newest version, not only the
            # first one — some versions expose only non-first formats with a URL.
            for fmt in (versions[0].get("formats", []) if versions else []):
                text_url = fmt.get("url")
                if not text_url:
                    continue
                text_req = requests.get(text_url, headers=headers, timeout=10)
                # FIX: restored status check — previously a failed download was
                # parsed as if it were bill text.
                if text_req.status_code == 200:
                    return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
    except Exception as e:
        # FIX: narrowed from bare `except: pass` so failures are visible and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"Failed to fetch text for {bill_type}{bill_number}: {e}")
    return ""
397
 
398
  def fetch_legislation(target=1000):
 
401
  results = []
402
  headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
403
  BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
 
404
  for offset in range(0, target, 250):
405
  try:
406
  r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}, headers=headers, timeout=20)
 
408
  bills = r.json().get("bills", [])
409
  if not bills: break
410
  for b in bills:
411
+ if not is_relevant(b.get("title", "")): continue
 
 
412
  action_data = b.get("latestAction", {})
 
413
  action_date_raw = action_data.get("actionDate") or b.get("updateDate")
414
+ fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
 
415
  raw_type = b.get("type", "HR").upper()
416
  proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
417
+
418
  results.append({
419
  "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
420
+ "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
421
+ "latest_action": action_data.get("text", "Active"), "link": proper_link,
422
+ "summary": "Legislative movement tracked via API.", "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
423
  })
424
  time.sleep(1.5)
425
+ except Exception as e: break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
  return results
427
 
428
  # --- MAIN EXECUTION ---
 
430
  db = load_db()
431
  raw_data = []
432
 
433
+ # Run all our data gatherers
434
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
435
  raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
436
  raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
437
  raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
438
+
439
+ raw_data.extend(fetch_congress_scraped()) # The new direct HTML targets!
440
+ raw_data.extend(fetch_federal_register())
441
  raw_data.extend(fetch_master_schedules())
442
  raw_data.extend(fetch_legislation())
 
443
 
444
  new_items = []
445
  for item in raw_data: