IJ-Reynolds HF Staff committed on
Commit
766241d
·
verified ·
1 Parent(s): f845243

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +100 -298
main.py CHANGED
@@ -7,7 +7,7 @@ import feedparser
7
  import json
8
  import re
9
  import time
10
- from datetime import datetime, timedelta
11
  from pathlib import Path
12
  from dateutil import parser as date_parser
13
  from urllib.parse import urljoin
@@ -38,8 +38,7 @@ TARGET_KEYWORDS = [
38
  "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
39
  "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
40
  "chatbot", "facial recognition", "biometric", "open-source", "open source ai",
41
- "foundation model", "emerging technology", "automated decision", "automated system",
42
- "large language model", "surveillance technology"
43
  ]
44
 
45
  def is_relevant(title, summary=""):
@@ -51,133 +50,72 @@ def is_relevant(title, summary=""):
51
  return True
52
  return False
53
 
54
- # --- FEEDS & TARGET DICTIONARIES ---
55
 
56
- # Members with working RSS/Feeds
57
- CONGRESS_PRESS_FEEDS = {
58
- "Sen. Young (AI Caucus)": "https://www.young.senate.gov/newsroom/press-releases/feed/",
59
- "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/press-releases/feed/",
60
- "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
61
- "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/rss.xml",
62
- "Rep. Moore (UT)": "https://blakemoore.house.gov/rss.xml"
63
- }
64
-
65
- # Members who block RSS - HTML Scrape Targets
66
  CONGRESS_SCRAPE_TARGETS = {
67
- "Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/newsroom/press-releases",
68
- "Sen. Schumer (Dem Leader/AI)": "https://www.schumer.senate.gov/newsroom/press-releases",
69
- "Sen. Heinrich (AI Caucus)": "https://www.heinrich.senate.gov/newsroom/press-releases",
70
- "Sen. Rounds (AI Caucus)": "https://www.rounds.senate.gov/newsroom/press-releases",
71
- "Sen. Cantwell (Commerce RM)": "https://www.cantwell.senate.gov/newsroom/press-releases"
72
  }
73
 
 
74
  NEWS_FEEDS = {
75
- "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
76
- "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
77
- "WashPost Tech": "https://feeds.washingtonpost.com/rss/business/technology",
78
- "MIT Tech Review": "https://www.technologyreview.com/topic/artificial-intelligence/feed/",
79
  "Politico Tech": "https://rss.politico.com/technology.xml",
80
- "Ars Technica Policy": "https://feeds.arstechnica.com/arstechnica/tech-policy",
81
  "Axios Tech": "https://www.axios.com/feeds/feed.rss",
82
- "FedScoop": "https://fedscoop.com/feed/",
83
- "Defense One Tech": "https://www.defenseone.com/rss/technology/",
84
- "Nextgov/FCW": "https://www.nextgov.com/rss/all/",
85
- "TechCrunch AI": "https://techcrunch.com/category/artificial-intelligence/feed/",
86
  "The Verge Tech": "https://www.theverge.com/rss/index.xml",
87
- "WSJ Technology": "https://feeds.content.dowjones.io/public/rss/MW_Tech",
88
- "SF Chronicle Tech": "https://www.sfchronicle.com/projects/feed/tech-news-rss/",
89
  "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
90
- "The Guardian Tech": "https://www.theguardian.com/technology/rss",
91
- "The Register AI": "https://www.theregister.com/software/ai_ml/headlines.atom",
92
- "Tech Policy Press": "https://www.techpolicy.press/rss/",
93
- "Financial Times Tech": "https://www.ft.com/technology?format=rss",
94
- "The Hill Tech": "https://thehill.com/policy/technology/feed/"
95
- }
96
-
97
- GOV_FEEDS = {
98
- "White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
99
- "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
100
- "DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
101
- "DOE Office of Science": "https://science.osti.gov/RSS",
102
- "NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
103
- "NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
104
- "CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
105
- "FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
106
- "GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss"
107
- }
108
-
109
- CALENDAR_FEEDS = {
110
- "House Science RSS": "https://science.house.gov/hearings?rss=1",
111
- "House Energy RSS": "https://energycommerce.house.gov/events?rss=1",
112
- "House Foreign Affairs RSS": "https://foreignaffairs.house.gov/committee-activity/hearings/all?rss=1",
113
- "Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
114
- "Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
115
- "Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
116
- "DOE Events": "https://www.energy.gov/events/rss"
117
  }
118
 
119
- # --- AI SETUP & ANALYZER ---
120
  if HF_TOKEN:
121
  hf_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=HF_TOKEN)
122
  else:
123
  hf_client = None
124
- print("Warning: No HF_TOKEN found. AI Triage will be bypassed.")
125
 
126
  def analyze_with_ai(title, summary, source, bill_text=""):
127
- if not hf_client:
128
- return "AI Triage disabled (No API Key).", "N/A"
129
 
130
  prompt = f"""
131
- You are a D.C. AI policy analyst. Review this update. Simply provide the summary with no other additions:
132
  Source: {source}
133
  Title: {title}
134
  Summary: {summary}
135
  Raw Bill Text Excerpt: {bill_text if bill_text else 'N/A'}
136
 
137
- RULES:
138
- 1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided text.
139
- 2. Provide a detailed, 2-to-3 sentence executive summary explaining the actual policy impact.
140
- 3. Extract 3 comma-separated keywords.
141
-
142
- Format output EXACTLY as:
143
- ANALYSIS: [Your 2-3 sentence summary here]
144
  KEYWORDS: [Words]
145
  """
146
  try:
147
  messages = [{"role": "user", "content": prompt}]
148
- response = hf_client.chat_completion(messages, max_tokens=350, temperature=0.1, top_p=0.9)
149
  text = response.choices[0].message.content
150
 
151
- analysis_match = re.search(r'ANALYSIS:\s*(.*?)(?=KEYWORDS:|$)', text, re.DOTALL)
152
- analysis = analysis_match.group(1).strip() if analysis_match else "Could not generate analysis."
153
-
154
- keywords_match = re.search(r'KEYWORDS:\s*(.*)', text)
155
- keywords = keywords_match.group(1).strip() if keywords_match else "AI, Tech, Policy"
156
-
157
  return analysis.replace('\n', ' '), keywords
158
- except Exception as e:
159
- print(f"AI Error: {e}")
160
  return "Error during AI analysis.", "error"
161
 
162
- # --- STATE MANAGEMENT ---
163
  def load_db():
164
  if DB_FILE.exists():
165
- with open(DB_FILE, "r") as f:
166
- return json.load(f)
167
  return []
168
 
169
  def save_db(db):
170
- db = db[-5000:]
171
- with open(DB_FILE, "w") as f:
172
- json.dump(db, f)
173
-
174
- def get_event_id(item):
175
- link = item.get("link", "no_link")
176
- action = item.get("latest_action", "no_action")
177
- return f"{link} || {action}"
178
-
179
- def is_new_event(item, db):
180
- return get_event_id(item) not in db
181
 
182
  def extract_robust_date(text_blocks):
183
  date_patterns = [
@@ -188,282 +126,146 @@ def extract_robust_date(text_blocks):
188
  for text in text_blocks:
189
  if not text: continue
190
  for pattern in date_patterns:
191
- matches = re.findall(pattern, text, re.IGNORECASE)
192
- for match in matches:
193
  try:
194
  clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
195
- parsed_date = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
196
- if 2024 <= parsed_date.year <= 2030:
197
- return parsed_date
198
- except:
199
- continue
200
  return None
201
 
202
- # --- HTML DIRECT SCRAPER (SENATE CMS) ---
203
- DATE_RE = re.compile(r'\b(\d{2})\.(\d{2})\.(\d{4})\b')
204
-
205
- def _parse_senate_cms_date(text: str):
206
- m = DATE_RE.search(text or "")
207
- if not m: return None
208
- try:
209
- return datetime(int(m.group(3)), int(m.group(1)), int(m.group(2)))
210
- except ValueError:
211
- return None
212
-
213
- def _parse_senate_cms_page(html: str, base_url: str, source_name: str):
214
- soup = BeautifulSoup(html, "html.parser")
215
- results = []
216
- seen_links = set()
217
-
218
- listing_path = base_url.replace("https://", "").split("/", 1)[-1]
219
- path_fragment = "/" + listing_path.split("/", 1)[-1]
220
-
221
- for a_tag in soup.find_all("a", href=True):
222
- href = a_tag["href"]
223
- if not href.startswith(path_fragment + "/"):
224
- continue
225
-
226
- full_url = urljoin(base_url, href)
227
- if full_url in seen_links: continue
228
- seen_links.add(full_url)
229
-
230
- title = a_tag.get_text(" ", strip=True)
231
- if not title:
232
- heading = a_tag.find(["h2", "h3"])
233
- title = heading.get_text(" ", strip=True) if heading else "No Title"
234
-
235
- if len(title) < 10: continue
236
-
237
- fmt_date = None
238
- parent = a_tag.parent
239
- for _ in range(5):
240
- parent_text = parent.get_text(" ", strip=True) if parent else ""
241
- fmt_date = _parse_senate_cms_date(parent_text)
242
- if fmt_date: break
243
- parent = parent.parent if parent else None
244
-
245
- if not fmt_date:
246
- surrounding = a_tag.find_previous(string=DATE_RE)
247
- fmt_date = _parse_senate_cms_date(surrounding) if surrounding else None
248
-
249
- if not is_relevant(title): continue
250
-
251
- if fmt_date:
252
- days_old = (datetime.now() - fmt_date).days
253
- if days_old > 60: continue
254
-
255
- results.append({
256
- "source": source_name,
257
- "type": "Legislative Office Press Release",
258
- "event_date": fmt_date or datetime.now(),
259
- "time": "TBD",
260
- "title": title,
261
- "latest_action": "Published",
262
- "link": full_url,
263
- "summary": "HTML Scrape - Full text review pending."
264
- })
265
- return results
266
 
267
  def fetch_congress_scraped():
268
- print("Scraping Congress HTML pages (no-RSS targets)...")
269
- all_results = []
270
- for name, url in CONGRESS_SCRAPE_TARGETS.items():
271
- try:
272
- r = scraper.get(url, timeout=15, headers={"Referer": "https://www.google.com/"})
273
- if r.status_code != 200:
274
- print(f" --> {name}: HTTP {r.status_code}, skipping")
275
- continue
276
- items = _parse_senate_cms_page(r.text, url, name)
277
- print(f" --> {name}: Found {len(items)} relevant items")
278
- all_results.extend(items)
279
- time.sleep(1.5)
280
- except Exception as e:
281
- print(f" --> {name}: Error — {e}")
282
- return all_results
283
-
284
- # --- STANDARD API & RSS SCRAPERS ---
285
- def fetch_rss(feed_dict, source_type):
286
- print(f"Scanning {source_type} RSS...")
287
  results = []
288
- for name, url in feed_dict.items():
289
  try:
290
  r = scraper.get(url, timeout=15)
291
- if r.status_code in [404, 410] and ".house.gov" in url:
292
- root_url = url.split(".gov")[0] + ".gov/rss.xml"
293
- r = scraper.get(root_url, timeout=10)
294
-
295
- if r.status_code != 200:
296
- print(f"--> {name}: Access Denied/Missing ({r.status_code})")
297
- continue
298
 
299
- feed = feedparser.parse(r.content)
300
- for entry in feed.entries[:20]:
301
- title = entry.get("title", "No Title")
302
- summary = entry.get("description", "")
303
- link = entry.get("link", url)
304
 
305
- if not is_relevant(title, summary): continue
306
-
307
- if hasattr(entry, 'published_parsed') and entry.published_parsed:
308
- fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
309
- else:
310
- fmt_date = extract_robust_date([title, summary]) or datetime.now()
311
 
 
 
 
 
 
 
312
  results.append({
313
- "source": name, "type": source_type, "event_date": fmt_date,
314
- "time": "TBD", "title": title, "latest_action": "Published",
315
- "link": link, "summary": summary[:200]
316
  })
317
  time.sleep(1)
318
  except Exception as e:
319
- print(f"Error fetching {name}: {e}")
320
  return results
321
 
322
- def fetch_master_schedules():
323
- print("Scanning Master Schedules...")
324
  results = []
325
- today = datetime.now()
326
- monday = today - timedelta(days=today.weekday())
327
- SCHEDULE_URLS = {
328
- "House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
329
- "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
330
- "Congress Weekly": f"https://www.congress.gov/committee-schedule/weekly/{monday.strftime('%Y/%m/%d')}"
331
- }
332
- for source_name, url in SCHEDULE_URLS.items():
333
  try:
334
  r = scraper.get(url, timeout=15)
335
  if r.status_code != 200: continue
336
- soup = BeautifulSoup(r.text, "html.parser")
337
- for container in soup.find_all(["tr", "li", "div", "p"]):
338
- text_content = container.get_text(" ", strip=True)
339
- if len(text_content) < 30 or len(text_content) > 1500: continue
340
- if not is_relevant(text_content): continue
341
- if any(res['summary'][:50] == text_content[:50] for res in results): continue
342
-
343
- a_tag = container.find("a", href=True)
344
- item_link = urljoin(url, a_tag['href']) if a_tag else url
345
- fmt_date = extract_robust_date([text_content]) or today
346
-
347
  results.append({
348
- "source": source_name, "type": "Schedule/Hearing", "event_date": fmt_date,
349
- "time": "Scheduled", "title": text_content[:120] + "...",
350
- "latest_action": "On Master Schedule", "link": item_link, "summary": text_content[:300]
351
  })
352
  time.sleep(1)
353
- except Exception as e:
354
- print(f"Error scraping {source_name}: {e}")
355
  return results
356
 
357
  def fetch_federal_register():
358
  print("Scanning Federal Register API...")
359
  results = []
360
- url = "https://www.federalregister.gov/api/v1/documents.json"
361
- params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}
362
  try:
363
- r = requests.get(url, params=params, timeout=15)
364
  if r.status_code == 200:
365
  for doc in r.json().get("results", []):
366
- title = doc.get("title", "No Title")
367
- summary = doc.get("abstract", "No summary provided.")
368
  pub_date = doc.get("publication_date")
369
  fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()
370
-
371
  results.append({
372
- "source": doc.get("agency_names", ["Federal Register"])[0],
373
- "type": "Federal/Exec Action", "event_date": fmt_date,
374
- "time": "Published", "title": title, "latest_action": doc.get("type", "Notice"),
375
- "link": doc.get("html_url", ""), "summary": str(summary)[:300]
376
  })
377
- time.sleep(1)
378
- except Exception as e:
379
- print(f"Federal Register API Error: {e}")
380
- return results
381
-
382
- def fetch_bill_text(congress, bill_type, bill_number):
383
- if not CONGRESS_API_KEY: return ""
384
- try:
385
- url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
386
- headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
387
- r = requests.get(url, headers=headers, timeout=10)
388
- if r.status_code == 200:
389
- versions = r.json().get("textVersions", [])
390
- if versions and versions[0].get("formats"):
391
- text_url = versions[0]["formats"][0].get("url")
392
- if text_url:
393
- text_req = requests.get(text_url, headers=headers, timeout=10)
394
- return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
395
  except: pass
396
- return ""
397
 
398
- def fetch_legislation(target=1000):
399
  print("Scanning Legislation API...")
400
  if not CONGRESS_API_KEY: return []
401
  results = []
402
  headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
403
- BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
404
- for offset in range(0, target, 250):
405
- try:
406
- r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}, headers=headers, timeout=20)
407
- if r.status_code != 200: break
408
- bills = r.json().get("bills", [])
409
- if not bills: break
410
- for b in bills:
411
  if not is_relevant(b.get("title", "")): continue
412
- action_data = b.get("latestAction", {})
413
- action_date_raw = action_data.get("actionDate") or b.get("updateDate")
414
  fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
415
- raw_type = b.get("type", "HR").upper()
416
- proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
417
-
418
  results.append({
419
  "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
420
  "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
421
- "latest_action": action_data.get("text", "Active"), "link": proper_link,
422
- "summary": "Legislative movement tracked via API.", "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
423
  })
424
- time.sleep(1.5)
425
- except Exception as e: break
426
  return results
427
 
428
- # --- MAIN EXECUTION ---
429
  def run():
430
  db = load_db()
431
  raw_data = []
432
 
433
- # Run all our data gatherers
434
- raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
435
- raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
436
- raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
437
- raw_data.extend(fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"))
438
-
439
- raw_data.extend(fetch_congress_scraped()) # The new direct HTML targets!
440
- raw_data.extend(fetch_federal_register())
441
- raw_data.extend(fetch_master_schedules())
442
- raw_data.extend(fetch_legislation())
443
 
444
  new_items = []
445
  for item in raw_data:
446
- if is_new_event(item, db):
 
 
447
  print(f"Triaging new item: {item['title'][:40]}...")
448
-
449
- bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
450
- analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
451
-
452
  item["analysis"] = analysis
453
  item["keywords"] = keywords
454
  item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
455
  new_items.append(item)
456
- db.append(get_event_id(item))
457
 
458
  if new_items:
459
  df_new = pd.DataFrame(new_items)
460
  if CSV_PATH.exists():
461
  df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
462
- df_combined = pd.concat([df_existing, df_new], ignore_index=True)
463
  else:
464
  df_combined = df_new
465
-
466
- df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
467
  df_combined.to_csv(CSV_PATH, index=False)
468
  save_db(db)
469
  print(f"Added {len(new_items)} new items.")
 
7
  import json
8
  import re
9
  import time
10
+ from datetime import datetime
11
  from pathlib import Path
12
  from dateutil import parser as date_parser
13
  from urllib.parse import urljoin
 
38
  "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
39
  "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
40
  "chatbot", "facial recognition", "biometric", "open-source", "open source ai",
41
+ "foundation model"
 
42
  ]
43
 
44
  def is_relevant(title, summary=""):
 
50
  return True
51
  return False
52
 
53
+ # --- THE VERIFIED BASELINE TARGETS ---
54
 
55
+ # 1. The Verified Lawmaker HTML Pages
 
 
 
 
 
 
 
 
 
56
  CONGRESS_SCRAPE_TARGETS = {
57
+ "Sen. Young": "https://www.young.senate.gov/newsroom/press-releases/",
58
+ "Rep. Moore": "https://blakemoore.house.gov/media/press-releases",
59
+ "Sen. Kim": "https://www.kim.senate.gov/press-releases/",
60
+ "Rep. Beyer": "https://beyer.house.gov/news/",
61
+ "Rep. Lieu": "https://lieu.house.gov/media-center/press-releases"
62
  }
63
 
64
+ # 2. Reliable Tech/Policy RSS Feeds
65
  NEWS_FEEDS = {
 
 
 
 
66
  "Politico Tech": "https://rss.politico.com/technology.xml",
 
67
  "Axios Tech": "https://www.axios.com/feeds/feed.rss",
68
+ "Tech Policy Press": "https://www.techpolicy.press/rss/",
69
+ "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
 
 
70
  "The Verge Tech": "https://www.theverge.com/rss/index.xml",
 
 
71
  "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
72
+ "The Hill Tech": "https://thehill.com/policy/technology/feed/",
73
+ "FedScoop": "https://fedscoop.com/feed/",
74
+ "Defense One Tech": "https://www.defenseone.com/rss/technology/",
75
+ "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"
76
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  }
78
 
79
+ # --- AI SETUP ---
80
  if HF_TOKEN:
81
  hf_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=HF_TOKEN)
82
  else:
83
  hf_client = None
 
84
 
85
  def analyze_with_ai(title, summary, source, bill_text=""):
86
+ if not hf_client: return "AI Triage disabled.", "N/A"
 
87
 
88
  prompt = f"""
89
+ You are a D.C. AI policy analyst. Review this update.
90
  Source: {source}
91
  Title: {title}
92
  Summary: {summary}
93
  Raw Bill Text Excerpt: {bill_text if bill_text else 'N/A'}
94
 
95
+ RULES: Provide a 2-3 sentence executive summary explaining the impact. Extract 3 comma-separated keywords.
96
+ Format EXACTLY as:
97
+ ANALYSIS: [Summary]
 
 
 
 
98
  KEYWORDS: [Words]
99
  """
100
  try:
101
  messages = [{"role": "user", "content": prompt}]
102
+ response = hf_client.chat_completion(messages, max_tokens=250, temperature=0.1)
103
  text = response.choices[0].message.content
104
 
105
+ analysis = re.search(r'ANALYSIS:\s*(.*?)(?=KEYWORDS:|$)', text, re.DOTALL).group(1).strip()
106
+ keywords = re.search(r'KEYWORDS:\s*(.*)', text).group(1).strip()
 
 
 
 
107
  return analysis.replace('\n', ' '), keywords
108
+ except:
 
109
  return "Error during AI analysis.", "error"
110
 
111
+ # --- CORE UTILITIES ---
112
  def load_db():
113
  if DB_FILE.exists():
114
+ with open(DB_FILE, "r") as f: return json.load(f)
 
115
  return []
116
 
117
  def save_db(db):
118
+ with open(DB_FILE, "w") as f: json.dump(db[-5000:], f)
 
 
 
 
 
 
 
 
 
 
119
 
120
  def extract_robust_date(text_blocks):
121
  date_patterns = [
 
126
  for text in text_blocks:
127
  if not text: continue
128
  for pattern in date_patterns:
129
+ for match in re.findall(pattern, text, re.IGNORECASE):
 
130
  try:
131
  clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
132
+ parsed = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
133
+ if 2024 <= parsed.year <= 2030: return parsed
134
+ except: continue
 
 
135
  return None
136
 
137
+ # --- DATA GATHERING ENGINES ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  def fetch_congress_scraped():
140
+ print("Scanning Verified Lawmaker HTML Pages...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  results = []
142
+ for name, url in CONGRESS_SCRAPE_TARGETS.items():
143
  try:
144
  r = scraper.get(url, timeout=15)
145
+ if r.status_code != 200: continue
146
+ soup = BeautifulSoup(r.text, "html.parser")
147
+ seen_links = set()
148
+
149
+ for a_tag in soup.find_all("a", href=True):
150
+ href = a_tag["href"]
151
+ if any(skip in href.lower() for skip in ['#', 'javascript:', 'page=', 'category=']): continue
152
 
153
+ full_url = urljoin(url, href)
154
+ if full_url in seen_links or full_url == url: continue
 
 
 
155
 
156
+ title = a_tag.get_text(" ", strip=True)
157
+ if not title:
158
+ heading = a_tag.find(["h2", "h3", "h4"])
159
+ title = heading.get_text(" ", strip=True) if heading else ""
 
 
160
 
161
+ if len(title) < 15 or not is_relevant(title): continue
162
+
163
+ seen_links.add(full_url)
164
+ parent_text = a_tag.parent.get_text(" ", strip=True) if a_tag.parent else ""
165
+ fmt_date = extract_robust_date([parent_text, title]) or datetime.now()
166
+
167
  results.append({
168
+ "source": name, "type": "Legislative Office Press Release",
169
+ "event_date": fmt_date, "time": "Published", "title": title,
170
+ "latest_action": "Web Publication", "link": full_url, "summary": "HTML Scrape"
171
  })
172
  time.sleep(1)
173
  except Exception as e:
174
+ print(f" --> {name}: Error — {e}")
175
  return results
176
 
177
def fetch_rss(feed_dict, source_type):
    """Collect relevant entries from a mapping of RSS/Atom feeds.

    Args:
        feed_dict: Mapping of display name -> feed URL.
        source_type: Label stored in each item's ``type`` field.

    Returns:
        A list of item dicts with the keys ``source``, ``type``,
        ``event_date``, ``time``, ``title``, ``latest_action``, ``link``
        and ``summary`` (summary truncated to 300 characters).
    """
    print(f"Scanning {source_type} RSS...")
    results = []
    for name, url in feed_dict.items():
        try:
            r = scraper.get(url, timeout=15)
            if r.status_code != 200:
                print(f"--> {name}: HTTP {r.status_code}, skipping")
                continue

            feed = feedparser.parse(r.content)
            for entry in feed.entries[:15]:
                title = entry.get("title", "")
                summary = entry.get("description", "")
                if not is_relevant(title, summary):
                    continue

                # Prefer the timestamp the feed itself publishes; guessing a
                # date from headline text is only a fallback, and "now" is the
                # last resort.
                # NOTE(review): feedparser's parsed dates are UTC struct_time;
                # the result is kept naive like the other sources' dates.
                stamp = entry.get("published_parsed") or entry.get("updated_parsed")
                if stamp:
                    fmt_date = datetime(*stamp[:6])
                else:
                    fmt_date = extract_robust_date([title, summary]) or datetime.now()

                results.append({
                    "source": name, "type": source_type, "event_date": fmt_date,
                    "time": "Published", "title": title, "latest_action": "Published",
                    "link": entry.get("link", url), "summary": summary[:300]
                })
            # Pause between feeds.
            time.sleep(1)
        except Exception as e:
            print(f"Error {name}: {e}")
    return results
198
 
199
  def fetch_federal_register():
200
  print("Scanning Federal Register API...")
201
  results = []
 
 
202
  try:
203
+ r = requests.get("https://www.federalregister.gov/api/v1/documents.json", params={"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}, timeout=15)
204
  if r.status_code == 200:
205
  for doc in r.json().get("results", []):
 
 
206
  pub_date = doc.get("publication_date")
207
  fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()
 
208
  results.append({
209
+ "source": doc.get("agency_names", ["Federal Register"])[0], "type": "Federal/Exec Action",
210
+ "event_date": fmt_date, "time": "Published", "title": doc.get("title", "No Title"),
211
+ "latest_action": doc.get("type", "Notice"), "link": doc.get("html_url", ""), "summary": str(doc.get("abstract", ""))[:300]
 
212
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  except: pass
214
+ return results
215
 
216
def fetch_legislation():
    """Fetch the 100 most recently updated bills and keep the relevant ones.

    Returns:
        A list of item dicts with the keys ``source``, ``type``,
        ``event_date``, ``time``, ``title``, ``latest_action``, ``link``
        and ``summary``. Empty when no CONGRESS_API_KEY is configured or
        the request fails.
    """
    print("Scanning Legislation API...")
    if not CONGRESS_API_KEY:
        return []
    results = []
    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    # congress.gov URL slug per bill type; without this every link pointed
    # at /house-bill/, which is wrong for Senate bills and all resolutions.
    bill_slugs = {
        "HR": "house-bill",
        "S": "senate-bill",
        "HRES": "house-resolution",
        "SRES": "senate-resolution",
        "HJRES": "house-joint-resolution",
        "SJRES": "senate-joint-resolution",
        "HCONRES": "house-concurrent-resolution",
        "SCONRES": "senate-concurrent-resolution",
    }
    try:
        r = requests.get(
            f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}",
            params={"limit": 100, "format": "json", "sort": "updateDate desc"},
            headers=headers,
            timeout=20,
        )
        if r.status_code == 200:
            for b in r.json().get("bills", []):
                if not is_relevant(b.get("title", "")):
                    continue

                latest_action = b.get("latestAction") or {}
                action_date_raw = latest_action.get("actionDate") or b.get("updateDate")
                fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()

                slug = bill_slugs.get(str(b.get("type", "HR")).upper(), "house-bill")
                # NOTE(review): the "th" suffix is hard-coded; it is wrong for
                # congress numbers ending in 1, 2 or 3 (e.g. 121).
                proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{slug}/{b.get('number')}"

                results.append({
                    "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
                    "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
                    "latest_action": latest_action.get("text", "Active"), "link": proper_link,
                    "summary": "Legislative movement tracked via API."
                })
    except Exception as e:
        # Best-effort source: report the failure and return what was gathered.
        print(f"Legislation API Error: {e}")
    return results
237
 
238
+ # --- MAIN RUNNER ---
239
  def run():
240
  db = load_db()
241
  raw_data = []
242
 
243
+ # Run the 4 basic, verified engines
244
+ raw_data.extend(fetch_congress_scraped()) # The 5 HTML Pages
245
+ raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media")) # Clean Tech RSS
246
+ raw_data.extend(fetch_federal_register()) # Clean Exec API
247
+ raw_data.extend(fetch_legislation()) # Clean Congress API
 
 
 
 
 
248
 
249
  new_items = []
250
  for item in raw_data:
251
+ # Check against db
252
+ event_id = f"{item.get('link', 'no_link')} || {item.get('latest_action', 'no_action')}"
253
+ if event_id not in db:
254
  print(f"Triaging new item: {item['title'][:40]}...")
255
+ analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"])
 
 
 
256
  item["analysis"] = analysis
257
  item["keywords"] = keywords
258
  item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
259
  new_items.append(item)
260
+ db.append(event_id)
261
 
262
  if new_items:
263
  df_new = pd.DataFrame(new_items)
264
  if CSV_PATH.exists():
265
  df_existing = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
266
+ df_combined = pd.concat([df_existing, df_new], ignore_index=True).drop_duplicates(subset=['link', 'latest_action'], keep='first')
267
  else:
268
  df_combined = df_new
 
 
269
  df_combined.to_csv(CSV_PATH, index=False)
270
  save_db(db)
271
  print(f"Added {len(new_items)} new items.")