IJ-Reynolds (HF Staff) committed (verified)
Commit 2f5126c · 1 Parent(s): 766241d

Update main.py

Files changed (1): main.py (+96 -40)
main.py CHANGED
@@ -51,8 +51,6 @@ def is_relevant(title, summary=""):
     return False
 
 # --- THE VERIFIED BASELINE TARGETS ---
-
-# 1. The Verified Lawmaker HTML Pages
 CONGRESS_SCRAPE_TARGETS = {
     "Sen. Young": "https://www.young.senate.gov/newsroom/press-releases/",
     "Rep. Moore": "https://blakemoore.house.gov/media/press-releases",
@@ -61,19 +59,12 @@ CONGRESS_SCRAPE_TARGETS = {
     "Rep. Lieu": "https://lieu.house.gov/media-center/press-releases"
 }
 
-# 2. Reliable Tech/Policy RSS Feeds
 NEWS_FEEDS = {
     "Politico Tech": "https://rss.politico.com/technology.xml",
     "Axios Tech": "https://www.axios.com/feeds/feed.rss",
     "Tech Policy Press": "https://www.techpolicy.press/rss/",
     "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
-    "The Verge Tech": "https://www.theverge.com/rss/index.xml",
-    "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
-    "The Hill Tech": "https://thehill.com/policy/technology/feed/",
-    "FedScoop": "https://fedscoop.com/feed/",
-    "Defense One Tech": "https://www.defenseone.com/rss/technology/",
-    "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"
-
+    "The Verge Tech": "https://www.theverge.com/rss/index.xml"
 }
 
 # --- AI SETUP ---
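
The trimmed NEWS_FEEDS dict keeps only the five feeds that have been parsing cleanly. A quick way to sanity-check a URL before re-adding it is feedparser's bozo flag; this is an illustrative helper, assuming the feedparser package (fetch_rss's internals aren't shown in this diff):

import feedparser

def check_feed(name, url):
    # feedparser sets bozo=1 when the fetched document isn't a valid feed
    # (malformed XML, a redirect to an HTML error page, etc.)
    d = feedparser.parse(url)
    ok = not d.bozo and len(d.entries) > 0
    print(f"{name}: {'OK' if ok else 'BROKEN'} ({len(d.entries)} entries)")
    return ok

for name, url in NEWS_FEEDS.items():
    check_feed(name, url)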
@@ -121,16 +112,23 @@ def extract_robust_date(text_blocks):
     date_patterns = [
         r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
         r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
-        r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b'
+        r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b',
+        r'\b(\d{2})\.(\d{2})\.(\d{4})\b'  # Specifically handles Senate MM.DD.YYYY formats
     ]
     for text in text_blocks:
         if not text: continue
         for pattern in date_patterns:
-            for match in re.findall(pattern, text, re.IGNORECASE):
+            matches = re.findall(pattern, text, re.IGNORECASE)
+            for match in matches:
                 try:
-                    clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
-                    parsed = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
-                    if 2024 <= parsed.year <= 2030: return parsed
+                    if isinstance(match, tuple):
+                        parsed = datetime(int(match[2]), int(match[0]), int(match[1]))
+                    else:
+                        clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
+                        parsed = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
+
+                    if 2024 <= parsed.year <= 2030:
+                        return parsed
                 except: continue
     return None
 
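
The isinstance(match, tuple) branch above is load-bearing: re.findall returns plain strings for the first three patterns, but once a pattern contains capture groups (as the new MM.DD.YYYY pattern does, with three), it returns tuples of the groups instead. A minimal demonstration:

import re

# No capture groups: findall returns the matched strings
print(re.findall(r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b', "Updated 2025/11/05"))
# ['2025/11/05']

# Three capture groups: findall returns (MM, DD, YYYY) tuples,
# which date_parser.parse can't consume directly; hence the tuple branch
print(re.findall(r'\b(\d{2})\.(\d{2})\.(\d{4})\b', "Posted 11.05.2025"))
# [('11', '05', '2025')]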
@@ -155,14 +153,33 @@ def fetch_congress_scraped():
 
         title = a_tag.get_text(" ", strip=True)
         if not title:
-            heading = a_tag.find(["h2", "h3", "h4"])
+            heading = a_tag.find(["h2", "h3", "h4", "strong"])
             title = heading.get_text(" ", strip=True) if heading else ""
 
         if len(title) < 15 or not is_relevant(title): continue
-
         seen_links.add(full_url)
-        parent_text = a_tag.parent.get_text(" ", strip=True) if a_tag.parent else ""
-        fmt_date = extract_robust_date([parent_text, title]) or datetime.now()
+
+        # --- AGGRESSIVE DATE HUNTING ---
+        fmt_date = None
+        current_node = a_tag
+
+        # Climb up the DOM tree up to 5 levels to find the date stamp
+        for _ in range(5):
+            if current_node.parent:
+                current_node = current_node.parent
+                node_text = current_node.get_text(" ", strip=True)
+                found_date = extract_robust_date([node_text])
+                if found_date:
+                    fmt_date = found_date
+                    break
+
+        # If still no date, fall back to the nearest preceding text node
+        if not fmt_date:
+            prev_text = a_tag.find_previous(string=True)
+            fmt_date = extract_robust_date([prev_text]) if prev_text else None
+
+        # Only fall back to today if the date is completely missing
+        fmt_date = fmt_date or datetime.now()
 
         results.append({
             "source": name, "type": "Legislative Office Press Release",
@@ -196,43 +213,78 @@ def fetch_rss(feed_dict, source_type):
         except Exception as e: print(f"Error {name}: {e}")
     return results
 
+# --- RESTORED UN-NERFED APIS ---
+
 def fetch_federal_register():
     print("Scanning Federal Register API...")
     results = []
+    url = "https://www.federalregister.gov/api/v1/documents.json"
+    params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}
     try:
-        r = requests.get("https://www.federalregister.gov/api/v1/documents.json", params={"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}, timeout=15)
+        r = requests.get(url, params=params, timeout=15)
         if r.status_code == 200:
             for doc in r.json().get("results", []):
+                title = doc.get("title", "No Title")
+                summary = doc.get("abstract", "No summary provided.")
                 pub_date = doc.get("publication_date")
                 fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()
+
                 results.append({
-                    "source": doc.get("agency_names", ["Federal Register"])[0], "type": "Federal/Exec Action",
-                    "event_date": fmt_date, "time": "Published", "title": doc.get("title", "No Title"),
-                    "latest_action": doc.get("type", "Notice"), "link": doc.get("html_url", ""), "summary": str(doc.get("abstract", ""))[:300]
+                    "source": doc.get("agency_names", ["Federal Register"])[0],
+                    "type": "Federal/Exec Action", "event_date": fmt_date,
+                    "time": "Published", "title": title, "latest_action": doc.get("type", "Notice"),
+                    "link": doc.get("html_url", ""), "summary": str(summary)[:300]
                 })
-    except: pass
+        time.sleep(1)
+    except Exception as e:
+        print(f"Federal Register API Error: {e}")
     return results
 
-def fetch_legislation():
+def fetch_bill_text(congress, bill_type, bill_number):
+    if not CONGRESS_API_KEY: return ""
+    try:
+        url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
+        headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
+        r = requests.get(url, headers=headers, timeout=10)
+        if r.status_code == 200:
+            versions = r.json().get("textVersions", [])
+            if versions and versions[0].get("formats"):
+                text_url = versions[0]["formats"][0].get("url")
+                if text_url:
+                    text_req = requests.get(text_url, headers=headers, timeout=10)
+                    return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
+    except: pass
+    return ""
+
+def fetch_legislation(target=1000):
     print("Scanning Legislation API...")
     if not CONGRESS_API_KEY: return []
     results = []
     headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
+    BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
+
-    try:
-        r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params={"limit": 100, "format": "json", "sort": "updateDate desc"}, headers=headers, timeout=20)
-        if r.status_code == 200:
-            for b in r.json().get("bills", []):
+    for offset in range(0, target, 250):
+        try:
+            r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}, headers=headers, timeout=20)
+            if r.status_code != 200: break
+            bills = r.json().get("bills", [])
+            if not bills: break
+            for b in bills:
                 if not is_relevant(b.get("title", "")): continue
-                action_date_raw = b.get("latestAction", {}).get("actionDate") or b.get("updateDate")
+                action_data = b.get("latestAction", {})
+                action_date_raw = action_data.get("actionDate") or b.get("updateDate")
                 fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
-                proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/house-bill/{b.get('number')}"
+                raw_type = b.get("type", "HR").upper()
+                proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
+
                 results.append({
                     "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
                     "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
-                    "latest_action": b.get("latestAction", {}).get("text", "Active"), "link": proper_link,
-                    "summary": "Legislative movement tracked via API."
+                    "latest_action": action_data.get("text", "Active"), "link": proper_link,
+                    "summary": "Legislative movement tracked via API.", "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
                 })
-    except: pass
+            time.sleep(1.5)
+        except Exception as e: break
     return results
 
 # --- MAIN RUNNER ---
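
fetch_legislation now pages through up to 1,000 bills instead of the old single 100-bill request; 250 per request is, to our knowledge, Congress.gov's maximum page size. The pagination pattern, reduced to a skeleton (paged_bills is a hypothetical helper, not part of main.py):

import time
import requests

def paged_bills(base_url, congress, api_key, target=1000, page_size=250):
    # Yields bills page by page until `target` is reached, a request
    # fails, or the API returns an empty page (end of data)
    headers = {"X-API-Key": api_key, "Accept": "application/json"}
    for offset in range(0, target, page_size):
        r = requests.get(f"{base_url}/bill/{congress}",
                         params={"limit": page_size, "offset": offset,
                                 "format": "json", "sort": "updateDate desc"},
                         headers=headers, timeout=20)
        if r.status_code != 200:
            break
        bills = r.json().get("bills", [])
        if not bills:
            break
        yield from bills
        time.sleep(1.5)  # same pacing as the diff, to stay polite to the API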
@@ -240,11 +292,11 @@ def run():
     db = load_db()
     raw_data = []
 
-    # Run the 4 basic, verified engines
-    raw_data.extend(fetch_congress_scraped())   # The 5 HTML Pages
-    raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))   # Clean Tech RSS
-    raw_data.extend(fetch_federal_register())   # Clean Exec API
-    raw_data.extend(fetch_legislation())   # Clean Congress API
+    # Run the 4 basic, robust engines
+    raw_data.extend(fetch_congress_scraped())   # The 5 HTML Pages with DOM Climbing
+    raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
+    raw_data.extend(fetch_federal_register())
+    raw_data.extend(fetch_legislation())
 
     new_items = []
     for item in raw_data:
@@ -252,7 +304,11 @@ def run():
         event_id = f"{item.get('link', 'no_link')} || {item.get('latest_action', 'no_action')}"
         if event_id not in db:
             print(f"Triaging new item: {item['title'][:40]}...")
-            analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"])
+
+            # Re-integrated the fetch_bill_text logic so the AI has context
+            bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
+            analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
+
             item["analysis"] = analysis
             item["keywords"] = keywords
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
 