IJ-Reynolds HF Staff committed on
Commit
05c17b5
·
verified ·
1 Parent(s): 622e64a

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +71 -38
main.py CHANGED
@@ -29,6 +29,19 @@ STEALTH_HEADERS = {
29
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
30
  }
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  # --- FEEDS DICTIONARIES ---
33
  NEWS_FEEDS = {
34
  "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
@@ -84,11 +97,11 @@ if HF_TOKEN:
84
  hf_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct", token=HF_TOKEN)
85
  else:
86
  hf_client = None
87
- print("⚠️ No HF_TOKEN found. AI Triage will be bypassed.")
88
 
89
  def analyze_with_ai(title, summary, source):
90
  if not hf_client:
91
- return "ℹ️ LOW - MONITOR", "AI Triage disabled (No API Key).", "N/A"
92
 
93
  prompt = f"""
94
  You are a D.C. AI policy analyst. Review this update:
@@ -96,7 +109,7 @@ def analyze_with_ai(title, summary, source):
96
  Title: {title}
97
  Summary: {summary}
98
 
99
- Categorize priority as exactly: "🚨 HIGH - ACTION REQUIRED", "⚠️ MEDIUM - REVIEW", or "ℹ️ LOW - MONITOR".
100
  Provide a 1-sentence analysis.
101
  Extract 3 comma-separated keywords.
102
  Format output EXACTLY as:
@@ -109,13 +122,13 @@ def analyze_with_ai(title, summary, source):
109
  response = hf_client.chat_completion(messages, max_tokens=150)
110
  text = response.choices[0].message.content
111
 
112
- priority = re.search(r'PRIORITY:\s*(.*)', text).group(1) if re.search(r'PRIORITY:\s*(.*)', text) else "ℹ️ LOW - MONITOR"
113
  analysis = re.search(r'ANALYSIS:\s*(.*)', text).group(1) if re.search(r'ANALYSIS:\s*(.*)', text) else "Could not generate analysis."
114
  keywords = re.search(r'KEYWORDS:\s*(.*)', text).group(1) if re.search(r'KEYWORDS:\s*(.*)', text) else "AI, Tech, Policy"
115
  return priority.strip(), analysis.strip(), keywords.strip()
116
  except Exception as e:
117
  print(f"AI Error: {e}")
118
- return "ℹ️ LOW - MONITOR", "Error during AI analysis.", "error"
119
 
120
  # --- STATE MANAGEMENT ---
121
  def load_db():
@@ -154,16 +167,20 @@ def extract_robust_date(text_blocks):
154
 
155
  # --- SCRAPERS ---
156
  def fetch_rss(feed_dict, source_type):
157
- print(f"📡 Scanning {source_type} RSS...")
158
  results = []
159
  for name, url in feed_dict.items():
160
  try:
161
  feed = feedparser.parse(url)
162
- for entry in feed.entries[:10]:
163
- # Try to find a future date in text for calendar items
164
- fmt_date = extract_robust_date([entry.get('title', ''), entry.get('description', '')])
 
 
 
 
 
165
 
166
- # Fallback to RSS publish date
167
  if not fmt_date:
168
  if hasattr(entry, 'published_parsed') and entry.published_parsed:
169
  fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
@@ -175,17 +192,17 @@ def fetch_rss(feed_dict, source_type):
175
  "type": source_type,
176
  "event_date": fmt_date,
177
  "time": "TBD",
178
- "title": entry.get("title", "No Title"),
179
  "latest_action": "Published",
180
  "link": entry.get("link", url),
181
- "summary": entry.get("description", "")[:200]
182
  })
183
  except Exception as e:
184
- print(f"⚠️ Error fetching {name}: {e}")
185
  return results
186
 
187
  def fetch_specific_committees():
188
- print("🔍 Scanning Committee HTML...")
189
  results = []
190
  for comm, url in COMMITTEE_URLS.items():
191
  try:
@@ -198,15 +215,22 @@ def fetch_specific_committees():
198
  if len(title) < 15: continue
199
 
200
  href_lower = a['href'].lower()
201
- if any(x in href_lower for x in ["hearing", "event", "meeting", "schedule", "activity"]):
 
 
 
 
 
202
  container = a.find_parent(["tr", "div", "li", "td"])
203
  container_text = container.get_text(" ", strip=True) if container else ""
204
  time_node = container.find("time") if container else None
205
  time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
206
 
207
  fmt_date = extract_robust_date([time_text, title, container_text])
 
 
208
  if not fmt_date:
209
- fmt_date = datetime.now().replace(tzinfo=None)
210
 
211
  results.append({
212
  "source": comm, "type": "Schedule/Hearing", "event_date": fmt_date,
@@ -214,11 +238,11 @@ def fetch_specific_committees():
214
  "link": urljoin(url, a['href']), "summary": "Extracted via HTML scanning."
215
  })
216
  except Exception as e:
217
- print(f"⚠️ Error scraping {comm}: {e}")
218
  return results
219
 
220
  def fetch_committee_meetings():
221
- print("📅 Scanning Congress API Committees...")
222
  results = []
223
  if not CONGRESS_API_KEY: return []
224
 
@@ -228,6 +252,12 @@ def fetch_committee_meetings():
228
  r = requests.get(url, params={"limit": 100, "format": "json"}, headers=headers, timeout=20)
229
  if r.status_code == 200:
230
  for m in r.json().get("committeeMeetings", []):
 
 
 
 
 
 
231
  raw_date = m.get("date")
232
  if raw_date:
233
  fmt_date = pd.to_datetime(raw_date).tz_localize(None).to_pydatetime()
@@ -237,38 +267,41 @@ def fetch_committee_meetings():
237
  results.append({
238
  "source": f"{m.get('chamber', 'Joint')} Committee API", "type": "Hearing/Markup",
239
  "event_date": fmt_date, "time": m.get("meetingStatus", "Scheduled"),
240
- "title": m.get("title", "Committee Meeting"), "latest_action": f"Meeting ID: {m.get('eventId')}",
241
  "link": m.get("url", "https://www.congress.gov/committee-meetings"),
242
- "summary": f"Location: {m.get('room', 'TBD')} {m.get('building', '')}"
243
  })
244
  except Exception as e:
245
- print(f"⚠️ API Error: {e}")
246
  return results
247
 
248
  def fetch_legislation():
249
- print("📜 Scanning Legislation...")
250
  results = []
251
  if not CONGRESS_API_KEY: return []
252
  url = f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}"
253
  headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
254
  try:
255
- r = requests.get(url, params={"limit": 25, "format": "json"}, headers=headers, timeout=20)
256
  if r.status_code == 200:
257
  for b in r.json().get("bills", []):
258
  title = b.get("title", "")
259
- if "artificial intelligence" in title.lower() or " ai " in title.lower() or "algorithm" in title.lower():
260
- raw_date = b.get("updateDate")
261
- fmt_date = pd.to_datetime(raw_date).tz_localize(None).to_pydatetime() if raw_date else datetime.now().replace(tzinfo=None)
262
-
263
- results.append({
264
- "source": "Congress.gov API", "type": "Legislation",
265
- "event_date": fmt_date, "time": "N/A",
266
- "title": f"{b.get('type')} {b.get('number')}: {title}",
267
- "latest_action": b.get("latestAction", {}).get("text", "Introduced"),
268
- "link": b.get("url", "https://www.congress.gov"), "summary": "AI related legislation."
269
- })
 
 
 
270
  except Exception as e:
271
- print(f"⚠️ Bill API Error: {e}")
272
  return results
273
 
274
  # --- MAIN EXECUTION ---
@@ -286,7 +319,7 @@ def run():
286
  new_items = []
287
  for item in raw_data:
288
  if is_new_event(item["link"], db):
289
- print(f"🧠 Triaging new item: {item['title'][:40]}...")
290
  flag, analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"])
291
  item["triage_flag"] = flag
292
  item["analysis"] = analysis
@@ -305,9 +338,9 @@ def run():
305
 
306
  df_combined.to_csv(CSV_PATH, index=False)
307
  save_db(db)
308
- print(f"Added {len(new_items)} new items.")
309
  else:
310
- print("Sweep complete. No new items.")
311
 
312
  return len(new_items)
313
 
 
29
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
30
  }
31
 
32
# --- HIGH-FIDELITY KEYWORD FILTER ---
# Substring keywords: matched anywhere inside the lowercased title + summary.
TARGET_KEYWORDS = [
    "artificial intelligence", " ai ", "machine learning", "algorithm",
    "llm", "generative ai", "deep learning", "autonomous", "neural network",
    "data privacy", "semiconductor", "chips act", "cybersecurity",
    "facial recognition", "biometric", "open-source model", "foundation model"
]

# Word-boundary fallback for short acronyms: the space-padded " ai " substring
# misses punctuation-adjacent mentions ("AI:", "(AI)") and a leading/trailing
# "AI" in a title, so also match on \b boundaries. Compiled once at import.
_ACRONYM_RE = re.compile(r"\b(ai|llm)\b", re.IGNORECASE)

def is_relevant(title, summary=""):
    """Return True if the item mentions any target policy/tech keyword.

    Checks the concatenated title + summary case-insensitively against
    TARGET_KEYWORDS (plain substring match), then falls back to a
    word-boundary match for the short acronyms "AI"/"LLM" so that
    punctuation-adjacent or sentence-initial mentions are not missed.
    Strictly additive relative to the substring check alone: it can only
    accept more items, never fewer.
    """
    text_to_check = f"{title} {summary}".lower()
    if any(keyword in text_to_check for keyword in TARGET_KEYWORDS):
        return True
    return bool(_ACRONYM_RE.search(text_to_check))
45
  # --- FEEDS DICTIONARIES ---
46
  NEWS_FEEDS = {
47
  "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
 
97
  hf_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct", token=HF_TOKEN)
98
  else:
99
  hf_client = None
100
+ print("Warning: No HF_TOKEN found. AI Triage will be bypassed.")
101
 
102
  def analyze_with_ai(title, summary, source):
103
  if not hf_client:
104
+ return "LOW - MONITOR", "AI Triage disabled (No API Key).", "N/A"
105
 
106
  prompt = f"""
107
  You are a D.C. AI policy analyst. Review this update:
 
109
  Title: {title}
110
  Summary: {summary}
111
 
112
+ Categorize priority as exactly: "HIGH - ACTION REQUIRED", "MEDIUM - REVIEW", or "LOW - MONITOR".
113
  Provide a 1-sentence analysis.
114
  Extract 3 comma-separated keywords.
115
  Format output EXACTLY as:
 
122
  response = hf_client.chat_completion(messages, max_tokens=150)
123
  text = response.choices[0].message.content
124
 
125
+ priority = re.search(r'PRIORITY:\s*(.*)', text).group(1) if re.search(r'PRIORITY:\s*(.*)', text) else "LOW - MONITOR"
126
  analysis = re.search(r'ANALYSIS:\s*(.*)', text).group(1) if re.search(r'ANALYSIS:\s*(.*)', text) else "Could not generate analysis."
127
  keywords = re.search(r'KEYWORDS:\s*(.*)', text).group(1) if re.search(r'KEYWORDS:\s*(.*)', text) else "AI, Tech, Policy"
128
  return priority.strip(), analysis.strip(), keywords.strip()
129
  except Exception as e:
130
  print(f"AI Error: {e}")
131
+ return "LOW - MONITOR", "Error during AI analysis.", "error"
132
 
133
  # --- STATE MANAGEMENT ---
134
  def load_db():
 
167
 
168
  # --- SCRAPERS ---
169
  def fetch_rss(feed_dict, source_type):
170
+ print(f"Scanning {source_type} RSS...")
171
  results = []
172
  for name, url in feed_dict.items():
173
  try:
174
  feed = feedparser.parse(url)
175
+ for entry in feed.entries[:20]:
176
+ title = entry.get("title", "No Title")
177
+ summary = entry.get("description", "")
178
+
179
+ if not is_relevant(title, summary):
180
+ continue
181
+
182
+ fmt_date = extract_robust_date([title, summary])
183
 
 
184
  if not fmt_date:
185
  if hasattr(entry, 'published_parsed') and entry.published_parsed:
186
  fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
 
192
  "type": source_type,
193
  "event_date": fmt_date,
194
  "time": "TBD",
195
+ "title": title,
196
  "latest_action": "Published",
197
  "link": entry.get("link", url),
198
+ "summary": summary[:200]
199
  })
200
  except Exception as e:
201
+ print(f"Error fetching {name}: {e}")
202
  return results
203
 
204
  def fetch_specific_committees():
205
+ print("Scanning Committee HTML...")
206
  results = []
207
  for comm, url in COMMITTEE_URLS.items():
208
  try:
 
215
  if len(title) < 15: continue
216
 
217
  href_lower = a['href'].lower()
218
+ # Tighter filter: require specific event-related paths
219
+ if any(x in href_lower for x in ["hearing", "event", "markup"]):
220
+
221
+ if not is_relevant(title):
222
+ continue
223
+
224
  container = a.find_parent(["tr", "div", "li", "td"])
225
  container_text = container.get_text(" ", strip=True) if container else ""
226
  time_node = container.find("time") if container else None
227
  time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
228
 
229
  fmt_date = extract_robust_date([time_text, title, container_text])
230
+
231
+ # STRICT RULE: If no valid date is extracted, it's likely a generic link, drop it.
232
  if not fmt_date:
233
+ continue
234
 
235
  results.append({
236
  "source": comm, "type": "Schedule/Hearing", "event_date": fmt_date,
 
238
  "link": urljoin(url, a['href']), "summary": "Extracted via HTML scanning."
239
  })
240
  except Exception as e:
241
+ print(f"Error scraping {comm}: {e}")
242
  return results
243
 
244
  def fetch_committee_meetings():
245
+ print("Scanning Congress API Committees...")
246
  results = []
247
  if not CONGRESS_API_KEY: return []
248
 
 
252
  r = requests.get(url, params={"limit": 100, "format": "json"}, headers=headers, timeout=20)
253
  if r.status_code == 200:
254
  for m in r.json().get("committeeMeetings", []):
255
+ title = m.get("title", "Committee Meeting")
256
+ summary = f"Location: {m.get('room', 'TBD')} {m.get('building', '')}"
257
+
258
+ if not is_relevant(title, summary):
259
+ continue
260
+
261
  raw_date = m.get("date")
262
  if raw_date:
263
  fmt_date = pd.to_datetime(raw_date).tz_localize(None).to_pydatetime()
 
267
  results.append({
268
  "source": f"{m.get('chamber', 'Joint')} Committee API", "type": "Hearing/Markup",
269
  "event_date": fmt_date, "time": m.get("meetingStatus", "Scheduled"),
270
+ "title": title, "latest_action": f"Meeting ID: {m.get('eventId')}",
271
  "link": m.get("url", "https://www.congress.gov/committee-meetings"),
272
+ "summary": summary
273
  })
274
  except Exception as e:
275
+ print(f"API Error: {e}")
276
  return results
277
 
278
def fetch_legislation():
    """Fetch recent bills from the Congress.gov API, keeping relevant ones.

    Returns a list of item dicts (source/type/event_date/time/title/
    latest_action/link/summary). Returns [] when no API key is configured
    or on any request/parse failure (best-effort scraper).
    """
    print("Scanning Legislation...")
    hits = []
    if not CONGRESS_API_KEY:
        return []
    endpoint = f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}"
    api_headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    try:
        resp = requests.get(
            endpoint,
            params={"limit": 50, "format": "json"},
            headers=api_headers,
            timeout=20,
        )
        if resp.status_code == 200:
            for bill in resp.json().get("bills", []):
                bill_title = bill.get("title", "")

                # Drop anything outside the keyword watchlist.
                if not is_relevant(bill_title):
                    continue

                # Prefer the API's updateDate; fall back to "now" (naive).
                stamp = bill.get("updateDate")
                if stamp:
                    when = pd.to_datetime(stamp).tz_localize(None).to_pydatetime()
                else:
                    when = datetime.now().replace(tzinfo=None)

                hits.append({
                    "source": "Congress.gov API", "type": "Legislation",
                    "event_date": when, "time": "N/A",
                    "title": f"{bill.get('type')} {bill.get('number')}: {bill_title}",
                    "latest_action": bill.get("latestAction", {}).get("text", "Introduced"),
                    "link": bill.get("url", "https://www.congress.gov"), "summary": "AI related legislation."
                })
    except Exception as e:
        print(f"Bill API Error: {e}")
    return hits
306
 
307
  # --- MAIN EXECUTION ---
 
319
  new_items = []
320
  for item in raw_data:
321
  if is_new_event(item["link"], db):
322
+ print(f"Triaging new item: {item['title'][:40]}...")
323
  flag, analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"])
324
  item["triage_flag"] = flag
325
  item["analysis"] = analysis
 
338
 
339
  df_combined.to_csv(CSV_PATH, index=False)
340
  save_db(db)
341
+ print(f"Added {len(new_items)} new items.")
342
  else:
343
+ print("Sweep complete. No new items.")
344
 
345
  return len(new_items)
346