IJ-Reynolds HF Staff committed on
Commit
5a58149
·
verified ·
1 Parent(s): 28c8245

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +60 -60
main.py CHANGED
@@ -13,10 +13,9 @@ from dateutil import parser as date_parser
13
  from urllib.parse import urljoin
14
  from huggingface_hub import InferenceClient
15
  from sentence_transformers import SentenceTransformer
16
- import json
17
 
18
  # Specifying model for efficient embedding + trend analysis
19
- model = SentenceTransformer('BAAI/bge-small-en-v1.5')
20
 
21
  # --- CONFIGURATION & GLOBALS ---
22
  CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
@@ -25,12 +24,17 @@ CURRENT_CONGRESS = 119
25
  CONGRESS_API_BASE = "https://api.congress.gov/v3"
26
  BASE_DIR = Path(__file__).resolve().parent
27
 
 
28
  if Path("/data").exists():
29
  CSV_PATH = Path("/data/policy_tracker.csv")
30
  DB_FILE = Path("/data/seen_events.json")
 
 
31
  else:
32
  CSV_PATH = BASE_DIR / "policy_tracker.csv"
33
  DB_FILE = BASE_DIR / "seen_events.json"
 
 
34
 
35
  # --- STEALTH SCRAPER SETUP ---
36
  scraper = cloudscraper.create_scraper(
@@ -46,8 +50,8 @@ TARGET_KEYWORDS = [
46
  "foundation model", "autonomous system"
47
  ]
48
 
49
- def is_relevant(title, summary=""):
50
- text_to_check = f"{title} {summary}".lower()
51
  for keyword in TARGET_KEYWORDS:
52
  if re.search(rf'\b{re.escape(keyword)}', text_to_check):
53
  return True
@@ -139,20 +143,26 @@ def analyze_with_ai(title, summary, source, bill_text=""):
139
  return "Error during AI analysis.", "error"
140
 
141
  # --- CORE UTILITIES ---
142
- def load_db():
143
- if DB_FILE.exists():
144
- with open(DB_FILE, "r") as f: return json.load(f)
145
  return []
146
 
 
 
 
 
 
 
147
  def save_db(db):
148
- with open(DB_FILE, "w") as f: json.dump(db[-5000:], f)
149
 
150
  def extract_robust_date(text_blocks):
151
  date_patterns = [
152
  r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
153
  r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
154
  r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b',
155
- r'\b(\d{2})\.(\d{2})\.(\d{4})\b' # Specifically handles Senate MM.DD.YYYY formats
156
  ]
157
  for text in text_blocks:
158
  if not text: continue
@@ -172,7 +182,6 @@ def extract_robust_date(text_blocks):
172
  return None
173
 
174
  # --- DATA GATHERING ENGINES ---
175
-
176
  def fetch_agency_scraped():
177
  print("Scanning Federal Agency HTML Pages...")
178
  results = []
@@ -198,10 +207,7 @@ def fetch_agency_scraped():
198
  if len(title) < 15 or not is_relevant(title): continue
199
  seen_links.add(full_url)
200
 
201
- # --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
202
  fmt_date = None
203
-
204
- # 1. Expanded Container Search
205
  container = a_tag.find_parent(["article", "tr", "li"])
206
  if not container:
207
  container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
@@ -209,7 +215,6 @@ def fetch_agency_scraped():
209
  if container:
210
  fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
211
 
212
- # 2. Sibling Search
213
  if not fmt_date:
214
  prev_el = a_tag.find_previous_sibling()
215
  if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
@@ -217,7 +222,6 @@ def fetch_agency_scraped():
217
  next_el = a_tag.find_next_sibling()
218
  if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
219
 
220
- # 3. Deep DOM Climb Fallback
221
  if not fmt_date:
222
  current_node = a_tag
223
  for _ in range(6):
@@ -228,7 +232,6 @@ def fetch_agency_scraped():
228
  fmt_date = found_date
229
  break
230
 
231
- # --- THE USER-FACING FLAG ---
232
  if not fmt_date:
233
  display_time = "⚠️ DATE UNKNOWN"
234
  display_title = f"[DATE MISSING] {title}"
@@ -240,7 +243,7 @@ def fetch_agency_scraped():
240
 
241
  results.append({
242
  "source": name,
243
- "type": "Federal/Exec Action", # Formatted for the Executive action bucket
244
  "event_date": fmt_date,
245
  "time": display_time,
246
  "title": display_title,
@@ -278,19 +281,14 @@ def fetch_congress_scraped():
278
  if len(title) < 15 or not is_relevant(title): continue
279
  seen_links.add(full_url)
280
 
281
- # --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
282
  fmt_date = None
283
-
284
- # 1. Expanded Container Search (Catches almost all Gov CMS platforms)
285
  container = a_tag.find_parent(["article", "tr", "li"])
286
  if not container:
287
- # Added: news, press, card, entry, row, record
288
  container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
289
 
290
  if container:
291
  fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
292
 
293
- # 2. Sibling Search (If the date is floating right next to the link)
294
  if not fmt_date:
295
  prev_el = a_tag.find_previous_sibling()
296
  if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
@@ -298,7 +296,6 @@ def fetch_congress_scraped():
298
  next_el = a_tag.find_next_sibling()
299
  if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
300
 
301
- # 3. Deep DOM Climb Fallback
302
  if not fmt_date:
303
  current_node = a_tag
304
  for _ in range(6):
@@ -309,7 +306,6 @@ def fetch_congress_scraped():
309
  fmt_date = found_date
310
  break
311
 
312
- # --- THE USER-FACING FLAG ---
313
  if not fmt_date:
314
  display_time = "⚠️ DATE UNKNOWN"
315
  display_title = f"[DATE MISSING] {title}"
@@ -334,7 +330,6 @@ def fetch_floor_schedules():
334
  print("Scanning House & Senate Floor Schedules...")
335
  results = []
336
 
337
- # Using your stable, verified endpoints
338
  SCHEDULE_URLS = {
339
  "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
340
  "House Floor Summary": "https://clerk.house.gov/FloorSummary"
@@ -346,27 +341,19 @@ def fetch_floor_schedules():
346
  if r.status_code != 200: continue
347
 
348
  soup = BeautifulSoup(r.text, "html.parser")
349
-
350
- # 1. THE ISOLATOR: Only look inside the main content body (ignores footers/menus)
351
  main_area = soup.find("main") or soup.find(id="main_content") or soup.find(class_=re.compile("content|main", re.I)) or soup
352
 
353
- # 2. SURGICAL TAGS: Only parse actual paragraphs and lists. NO DIVS!
354
  for container in main_area.find_all(["p", "li"]):
355
  text_content = container.get_text(" ", strip=True)
356
-
357
- # Tighten the length to avoid tiny buttons and massive unbroken text blocks
358
  if len(text_content) < 40 or len(text_content) > 800: continue
359
  if not is_relevant(text_content): continue
360
 
361
- # 3. UPGRADED DUPLICATE BLOCKER: Prevents overlapping HTML chunks
362
  if any(res['summary'][:100] in text_content for res in results) or \
363
  any(text_content[:100] in res['summary'] for res in results):
364
  continue
365
 
366
  a_tag = container.find("a", href=True)
367
  item_link = urljoin(url, a_tag['href']) if a_tag else url
368
-
369
- # Floor actions are usually today's date
370
  fmt_date = extract_robust_date([text_content]) or datetime.now()
371
 
372
  results.append({
@@ -392,17 +379,13 @@ def fetch_rss(feed_dict, source_type):
392
  for entry in feed.entries[:15]:
393
  title = entry.get("title", "")
394
  summary = entry.get("description", "")
395
-
396
  if not is_relevant(title, summary): continue
397
-
398
 
399
- # Check for standard RSS/Atom timestamps first
400
  if hasattr(entry, 'published_parsed') and entry.published_parsed:
401
  fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
402
  elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
403
  fmt_date = datetime(*entry.updated_parsed[:6]).replace(tzinfo=None)
404
  else:
405
- # Fallback to text scanning only if metadata is missing entirely
406
  fmt_date = extract_robust_date([title, summary]) or datetime.now()
407
 
408
  results.append({
@@ -416,14 +399,10 @@ def fetch_rss(feed_dict, source_type):
416
 
417
  return results
418
 
419
- # -- APIs ---
420
-
421
  def fetch_federal_register():
422
  print("Scanning Federal Register API...")
423
  results = []
424
  url = "https://www.federalregister.gov/api/v1/documents.json"
425
-
426
- # We pull a larger batch (50) because we are going to heavily filter them locally
427
  params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 50}
428
 
429
  try:
@@ -433,12 +412,8 @@ def fetch_federal_register():
433
  title = doc.get("title", "No Title")
434
  summary = doc.get("abstract", "No summary provided.")
435
 
436
- # --- THE LOCAL RELEVANCE FILTER ---
437
- # Only keep it if the AI keywords are in the Title or Abstract (ignores full-text matches)
438
  if not is_relevant(title, str(summary)):
439
  continue
440
-
441
- # Explicitly block noisy SEC stock exchange filings
442
  if "Self-Regulatory Organizations" in title:
443
  continue
444
 
@@ -473,18 +448,20 @@ def fetch_bill_text(congress, bill_type, bill_number):
473
  return ""
474
 
475
  def fetch_legislation(target=1000):
476
- print("Scanning Legislation API...")
477
  if not CONGRESS_API_KEY: return []
478
  results = []
479
  headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
480
  BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
481
 
482
- # We split the scan: 500 newest introduced, AND 500 most recently updated
 
 
 
483
  scan_strategies = ["introducedDate desc", "updateDate desc"]
484
 
485
  for sort_method in scan_strategies:
486
  print(f" -> Pulling by {sort_method}...")
487
- # target // 2 means we pull 500 for each strategy
488
  for offset in range(0, target // 2, 250):
489
  try:
490
  r = requests.get(
@@ -497,25 +474,51 @@ def fetch_legislation(target=1000):
497
  if not bills: break
498
 
499
  for b in bills:
500
- if not is_relevant(b.get("title", "")): continue
 
 
501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502
  action_data = b.get("latestAction", {})
503
  action_date_raw = action_data.get("actionDate") or b.get("updateDate")
504
  fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
505
- raw_type = b.get("type", "HR").upper()
506
- proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
507
 
508
  results.append({
509
  "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
510
- "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
511
  "latest_action": action_data.get("text", "Active"), "link": proper_link,
512
- "summary": "Legislative movement tracked via API.", "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
513
  })
514
- time.sleep(1.5) # Polite delay
515
  except Exception as e: break
516
 
 
 
 
 
517
  return results
518
-
519
 
520
  # --- MAIN RUNNER ---
521
  def run():
@@ -538,25 +541,22 @@ def run():
538
  if event_id not in db:
539
  print(f"Triaging new item: {item['title'][:40]}...")
540
 
541
- # Re-integrated the fetch_bill_text logic so the AI has context!
542
  bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
543
  analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
544
 
545
  item["analysis"] = analysis
546
  item["keywords"] = keywords
547
 
548
- # --- NEW: GENERATE SEMANTIC EMBEDDING ---
549
  try:
550
- # Don't waste compute embedding error messages
551
  if analysis and not analysis.startswith("Error") and not analysis.startswith("AI Triage disabled"):
552
  vector = embedder.encode(analysis).tolist()
553
- item["embedding"] = json.dumps(vector) # Stored as JSON string for CSV compatibility
554
  else:
555
  item["embedding"] = None
556
  except Exception as e:
557
  print(f" -> Embedding error: {e}")
558
  item["embedding"] = None
559
- # ----------------------------------------
560
 
561
  item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
562
  new_items.append(item)
 
13
  from urllib.parse import urljoin
14
  from huggingface_hub import InferenceClient
15
  from sentence_transformers import SentenceTransformer
 
16
 
17
  # Specifying model for efficient embedding + trend analysis
18
+ embedder = SentenceTransformer('BAAI/bge-small-en-v1.5')
19
 
20
  # --- CONFIGURATION & GLOBALS ---
21
  CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
 
24
  CONGRESS_API_BASE = "https://api.congress.gov/v3"
25
  BASE_DIR = Path(__file__).resolve().parent
26
 
27
+ # --- PERSISTENT STORAGE PATHING ---
28
  if Path("/data").exists():
29
  CSV_PATH = Path("/data/policy_tracker.csv")
30
  DB_FILE = Path("/data/seen_events.json")
31
+ WHITELIST_FILE = Path("/data/tracked_bills.json")
32
+ SCANNED_FILE = Path("/data/scanned_bills.json")
33
  else:
34
  CSV_PATH = BASE_DIR / "policy_tracker.csv"
35
  DB_FILE = BASE_DIR / "seen_events.json"
36
+ WHITELIST_FILE = BASE_DIR / "tracked_bills.json"
37
+ SCANNED_FILE = BASE_DIR / "scanned_bills.json"
38
 
39
  # --- STEALTH SCRAPER SETUP ---
40
  scraper = cloudscraper.create_scraper(
 
50
  "foundation model", "autonomous system"
51
  ]
52
 
53
+ def is_relevant(title, summary="", text=""):
54
+ text_to_check = f"{title} {summary} {text}".lower()
55
  for keyword in TARGET_KEYWORDS:
56
  if re.search(rf'\b{re.escape(keyword)}', text_to_check):
57
  return True
 
143
  return "Error during AI analysis.", "error"
144
 
145
  # --- CORE UTILITIES ---
146
+ def load_list(filepath):
147
+ if filepath.exists():
148
+ with open(filepath, "r") as f: return json.load(f)
149
  return []
150
 
151
+ def save_list(data, filepath):
152
+ with open(filepath, "w") as f: json.dump(data[-5000:], f)
153
+
154
+ def load_db():
155
+ return load_list(DB_FILE)
156
+
157
  def save_db(db):
158
+ save_list(db, DB_FILE)
159
 
160
  def extract_robust_date(text_blocks):
161
  date_patterns = [
162
  r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
163
  r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
164
  r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b',
165
+ r'\b(\d{2})\.(\d{2})\.(\d{4})\b'
166
  ]
167
  for text in text_blocks:
168
  if not text: continue
 
182
  return None
183
 
184
  # --- DATA GATHERING ENGINES ---
 
185
  def fetch_agency_scraped():
186
  print("Scanning Federal Agency HTML Pages...")
187
  results = []
 
207
  if len(title) < 15 or not is_relevant(title): continue
208
  seen_links.add(full_url)
209
 
 
210
  fmt_date = None
 
 
211
  container = a_tag.find_parent(["article", "tr", "li"])
212
  if not container:
213
  container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
 
215
  if container:
216
  fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
217
 
 
218
  if not fmt_date:
219
  prev_el = a_tag.find_previous_sibling()
220
  if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
 
222
  next_el = a_tag.find_next_sibling()
223
  if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
224
 
 
225
  if not fmt_date:
226
  current_node = a_tag
227
  for _ in range(6):
 
232
  fmt_date = found_date
233
  break
234
 
 
235
  if not fmt_date:
236
  display_time = "⚠️ DATE UNKNOWN"
237
  display_title = f"[DATE MISSING] {title}"
 
243
 
244
  results.append({
245
  "source": name,
246
+ "type": "Federal/Exec Action",
247
  "event_date": fmt_date,
248
  "time": display_time,
249
  "title": display_title,
 
281
  if len(title) < 15 or not is_relevant(title): continue
282
  seen_links.add(full_url)
283
 
 
284
  fmt_date = None
 
 
285
  container = a_tag.find_parent(["article", "tr", "li"])
286
  if not container:
 
287
  container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
288
 
289
  if container:
290
  fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
291
 
 
292
  if not fmt_date:
293
  prev_el = a_tag.find_previous_sibling()
294
  if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
 
296
  next_el = a_tag.find_next_sibling()
297
  if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
298
 
 
299
  if not fmt_date:
300
  current_node = a_tag
301
  for _ in range(6):
 
306
  fmt_date = found_date
307
  break
308
 
 
309
  if not fmt_date:
310
  display_time = "⚠️ DATE UNKNOWN"
311
  display_title = f"[DATE MISSING] {title}"
 
330
  print("Scanning House & Senate Floor Schedules...")
331
  results = []
332
 
 
333
  SCHEDULE_URLS = {
334
  "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
335
  "House Floor Summary": "https://clerk.house.gov/FloorSummary"
 
341
  if r.status_code != 200: continue
342
 
343
  soup = BeautifulSoup(r.text, "html.parser")
 
 
344
  main_area = soup.find("main") or soup.find(id="main_content") or soup.find(class_=re.compile("content|main", re.I)) or soup
345
 
 
346
  for container in main_area.find_all(["p", "li"]):
347
  text_content = container.get_text(" ", strip=True)
 
 
348
  if len(text_content) < 40 or len(text_content) > 800: continue
349
  if not is_relevant(text_content): continue
350
 
 
351
  if any(res['summary'][:100] in text_content for res in results) or \
352
  any(text_content[:100] in res['summary'] for res in results):
353
  continue
354
 
355
  a_tag = container.find("a", href=True)
356
  item_link = urljoin(url, a_tag['href']) if a_tag else url
 
 
357
  fmt_date = extract_robust_date([text_content]) or datetime.now()
358
 
359
  results.append({
 
379
  for entry in feed.entries[:15]:
380
  title = entry.get("title", "")
381
  summary = entry.get("description", "")
 
382
  if not is_relevant(title, summary): continue
 
383
 
 
384
  if hasattr(entry, 'published_parsed') and entry.published_parsed:
385
  fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
386
  elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
387
  fmt_date = datetime(*entry.updated_parsed[:6]).replace(tzinfo=None)
388
  else:
 
389
  fmt_date = extract_robust_date([title, summary]) or datetime.now()
390
 
391
  results.append({
 
399
 
400
  return results
401
 
 
 
402
  def fetch_federal_register():
403
  print("Scanning Federal Register API...")
404
  results = []
405
  url = "https://www.federalregister.gov/api/v1/documents.json"
 
 
406
  params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 50}
407
 
408
  try:
 
412
  title = doc.get("title", "No Title")
413
  summary = doc.get("abstract", "No summary provided.")
414
 
 
 
415
  if not is_relevant(title, str(summary)):
416
  continue
 
 
417
  if "Self-Regulatory Organizations" in title:
418
  continue
419
 
 
448
  return ""
449
 
450
  def fetch_legislation(target=1000):
451
+ print("Scanning Legislation API with Deep Text & Whitelist...")
452
  if not CONGRESS_API_KEY: return []
453
  results = []
454
  headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
455
  BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
456
 
457
+ # Load tracking databases
458
+ tracked_bills = set(load_list(WHITELIST_FILE))
459
+ scanned_bills = set(load_list(SCANNED_FILE))
460
+
461
  scan_strategies = ["introducedDate desc", "updateDate desc"]
462
 
463
  for sort_method in scan_strategies:
464
  print(f" -> Pulling by {sort_method}...")
 
465
  for offset in range(0, target // 2, 250):
466
  try:
467
  r = requests.get(
 
474
  if not bills: break
475
 
476
  for b in bills:
477
+ raw_type = b.get("type", "HR").upper()
478
+ bill_number = b.get("number")
479
+ bill_id = f"{raw_type}{bill_number}"
480
 
481
+ is_ai_bill = False
482
+
483
+ # 1. THE WHITELIST CHECK (Catches all admin updates for known AI bills)
484
+ if bill_id in tracked_bills:
485
+ is_ai_bill = True
486
+ else:
487
+ # 2. TITLE/SUMMARY CHECK
488
+ if is_relevant(b.get("title", "")):
489
+ is_ai_bill = True
490
+ tracked_bills.add(bill_id)
491
+ # 3. DEEP TEXT CHECK (Only for bills we haven't already rejected!)
492
+ elif bill_id not in scanned_bills:
493
+ bill_text = fetch_bill_text(CURRENT_CONGRESS, raw_type, bill_number)
494
+ scanned_bills.add(bill_id) # Mark as scanned so we don't hit the API limit tomorrow
495
+
496
+ if is_relevant("", "", bill_text):
497
+ is_ai_bill = True
498
+ tracked_bills.add(bill_id)
499
+
500
+ if not is_ai_bill:
501
+ continue # Skip entirely!
502
+
503
  action_data = b.get("latestAction", {})
504
  action_date_raw = action_data.get("actionDate") or b.get("updateDate")
505
  fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
506
+ proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{bill_number}"
 
507
 
508
  results.append({
509
  "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
510
+ "time": "API Verified", "title": f"{raw_type}{bill_number}: {b.get('title')}",
511
  "latest_action": action_data.get("text", "Active"), "link": proper_link,
512
+ "summary": "Legislative movement tracked via API.", "bill_type": raw_type, "bill_number": bill_number
513
  })
514
+ time.sleep(1.5)
515
  except Exception as e: break
516
 
517
+ # Save the updated Whitelist and Scanned lists to the permanent bucket
518
+ save_list(list(tracked_bills), WHITELIST_FILE)
519
+ save_list(list(scanned_bills), SCANNED_FILE)
520
+
521
  return results
 
522
 
523
  # --- MAIN RUNNER ---
524
  def run():
 
541
  if event_id not in db:
542
  print(f"Triaging new item: {item['title'][:40]}...")
543
 
 
544
  bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
545
  analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
546
 
547
  item["analysis"] = analysis
548
  item["keywords"] = keywords
549
 
550
+ # --- SEMANTIC EMBEDDING ---
551
  try:
 
552
  if analysis and not analysis.startswith("Error") and not analysis.startswith("AI Triage disabled"):
553
  vector = embedder.encode(analysis).tolist()
554
+ item["embedding"] = json.dumps(vector)
555
  else:
556
  item["embedding"] = None
557
  except Exception as e:
558
  print(f" -> Embedding error: {e}")
559
  item["embedding"] = None
 
560
 
561
  item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
562
  new_items.append(item)