Update main.py

main.py (CHANGED)
@@ -29,6 +29,19 @@ STEALTH_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

+# --- HIGH-FIDELITY KEYWORD FILTER ---
+TARGET_KEYWORDS = [
+    "artificial intelligence", " ai ", "machine learning", "algorithm",
+    "llm", "generative ai", "deep learning", "autonomous", "neural network",
+    "data privacy", "semiconductor", "chips act", "cybersecurity",
+    "facial recognition", "biometric", "open-source model", "foundation model"
+]
+
+def is_relevant(title, summary=""):
+    """Checks if the item contains our target policy/tech keywords."""
+    text_to_check = f"{title} {summary}".lower()
+    return any(keyword in text_to_check for keyword in TARGET_KEYWORDS)
+
# --- FEEDS DICTIONARIES ---
NEWS_FEEDS = {
    "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
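Note on the filter: the padded " ai " entry keeps substring hits like "said" or "Taiwan" out, but it also misses "AI" at the start or end of a title or next to punctuation ("...to examine AI."). A word-boundary regex closes that gap; the sketch below is an illustrative alternative, not code from this commit:

import re

# Hypothetical variant of is_relevant() using \b word boundaries instead of
# space padding; AI_PATTERN and is_relevant_regex are illustrative names.
AI_PATTERN = re.compile(r"\bai\b", re.IGNORECASE)

def is_relevant_regex(title, summary=""):
    return bool(AI_PATTERN.search(f"{title} {summary}"))

print(is_relevant_regex("Senate panel to examine AI."))    # True: boundary before the period
print(is_relevant_regex("Chairman said hearing delayed"))  # False: "ai" inside "said" has no boundary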
@@ -84,11 +97,11 @@ if HF_TOKEN:
    hf_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct", token=HF_TOKEN)
else:
    hf_client = None
+    print("Warning: No HF_TOKEN found. AI Triage will be bypassed.")

def analyze_with_ai(title, summary, source):
    if not hf_client:
+        return "LOW - MONITOR", "AI Triage disabled (No API Key).", "N/A"

    prompt = f"""
    You are a D.C. AI policy analyst. Review this update:

@@ -96,7 +109,7 @@ def analyze_with_ai(title, summary, source):
    Title: {title}
    Summary: {summary}

+    Categorize priority as exactly: "HIGH - ACTION REQUIRED", "MEDIUM - REVIEW", or "LOW - MONITOR".
    Provide a 1-sentence analysis.
    Extract 3 comma-separated keywords.
    Format output EXACTLY as:
@@ -109,13 +122,13 @@ def analyze_with_ai(title, summary, source):
        response = hf_client.chat_completion(messages, max_tokens=150)
        text = response.choices[0].message.content

+        priority = re.search(r'PRIORITY:\s*(.*)', text).group(1) if re.search(r'PRIORITY:\s*(.*)', text) else "LOW - MONITOR"
        analysis = re.search(r'ANALYSIS:\s*(.*)', text).group(1) if re.search(r'ANALYSIS:\s*(.*)', text) else "Could not generate analysis."
        keywords = re.search(r'KEYWORDS:\s*(.*)', text).group(1) if re.search(r'KEYWORDS:\s*(.*)', text) else "AI, Tech, Policy"
        return priority.strip(), analysis.strip(), keywords.strip()
    except Exception as e:
        print(f"AI Error: {e}")
+        return "LOW - MONITOR", "Error during AI analysis.", "error"

# --- STATE MANAGEMENT ---
def load_db():
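Because the prompt pins an exact PRIORITY/ANALYSIS/KEYWORDS layout, the parser can stay as three plain re.search calls with hard fallbacks. A quick round-trip on a well-formed completion (the sample text is invented for illustration):

import re

# Invented sample completion in the exact layout the prompt demands.
text = """PRIORITY: HIGH - ACTION REQUIRED
ANALYSIS: The bill would require pre-deployment audits of foundation models.
KEYWORDS: AI, audits, foundation models"""

match = re.search(r'PRIORITY:\s*(.*)', text)   # (.*) stops at the newline
priority = match.group(1) if match else "LOW - MONITOR"
print(priority.strip())  # HIGH - ACTION REQUIRED

Binding the match once, as above, also avoids running each regex twice the way the `x if re.search(...) else y` one-liners in the diff do.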
@@ -154,16 +167,20 @@ def extract_robust_date(text_blocks):

# --- SCRAPERS ---
def fetch_rss(feed_dict, source_type):
+    print(f"Scanning {source_type} RSS...")
    results = []
    for name, url in feed_dict.items():
        try:
            feed = feedparser.parse(url)
+            for entry in feed.entries[:20]:
+                title = entry.get("title", "No Title")
+                summary = entry.get("description", "")
+
+                if not is_relevant(title, summary):
+                    continue
+
+                fmt_date = extract_robust_date([title, summary])

-                # Fallback to RSS publish date
                if not fmt_date:
                    if hasattr(entry, 'published_parsed') and entry.published_parsed:
                        fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
@@ -175,17 +192,17 @@ def fetch_rss(feed_dict, source_type):
                    "type": source_type,
                    "event_date": fmt_date,
                    "time": "TBD",
+                    "title": title,
                    "latest_action": "Published",
                    "link": entry.get("link", url),
+                    "summary": summary[:200]
                })
        except Exception as e:
+            print(f"Error fetching {name}: {e}")
    return results

def fetch_specific_committees():
+    print("Scanning Committee HTML...")
    results = []
    for comm, url in COMMITTEE_URLS.items():
        try:
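fetch_rss leans on feedparser's dict-like entries. A standalone look at the same calls (requires network access; the feed URL is just the first entry of NEWS_FEEDS above):

import feedparser

# The same feedparser calls fetch_rss() makes, outside the pipeline.
feed = feedparser.parse("https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml")
for entry in feed.entries[:3]:                      # fetch_rss caps this slice at 20
    title = entry.get("title", "No Title")          # entries support dict-style .get
    summary = entry.get("description", "")
    print(f"{title[:60]} | summary length: {len(summary)}")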
@@ -198,15 +215,22 @@ def fetch_specific_committees():
                if len(title) < 15: continue

                href_lower = a['href'].lower()
+                # Tighter filter: require specific event-related paths
+                if any(x in href_lower for x in ["hearing", "event", "markup"]):
+
+                    if not is_relevant(title):
+                        continue
+
                    container = a.find_parent(["tr", "div", "li", "td"])
                    container_text = container.get_text(" ", strip=True) if container else ""
                    time_node = container.find("time") if container else None
                    time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""

                    fmt_date = extract_robust_date([time_text, title, container_text])
+
+                    # STRICT RULE: If no valid date is extracted, it's likely a generic link, drop it.
                    if not fmt_date:
+                        continue

                    results.append({
                        "source": comm, "type": "Schedule/Hearing", "event_date": fmt_date,
@@ -214,11 +238,11 @@ def fetch_specific_committees():
                        "link": urljoin(url, a['href']), "summary": "Extracted via HTML scanning."
                    })
        except Exception as e:
+            print(f"Error scraping {comm}: {e}")
    return results

def fetch_committee_meetings():
+    print("Scanning Congress API Committees...")
    results = []
    if not CONGRESS_API_KEY: return []

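The committee scraper climbs from each link to its nearest row-like ancestor and prefers a machine-readable <time datetime=...> value. The same find_parent/<time> logic run against a toy fragment (the HTML is invented for illustration):

from bs4 import BeautifulSoup

# Invented committee-page fragment to exercise the container/date lookup.
html = """<li><time datetime="2024-06-12">June 12</time>
<a href="/hearings/ai-oversight">Hearing: Oversight of Artificial Intelligence Systems</a></li>"""

soup = BeautifulSoup(html, "html.parser")
a = soup.find("a")
container = a.find_parent(["tr", "div", "li", "td"])
time_node = container.find("time") if container else None
time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
print(time_text)  # 2024-06-12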
@@ -228,6 +252,12 @@ def fetch_committee_meetings():
        r = requests.get(url, params={"limit": 100, "format": "json"}, headers=headers, timeout=20)
        if r.status_code == 200:
            for m in r.json().get("committeeMeetings", []):
+                title = m.get("title", "Committee Meeting")
+                summary = f"Location: {m.get('room', 'TBD')} {m.get('building', '')}"
+
+                if not is_relevant(title, summary):
+                    continue
+
                raw_date = m.get("date")
                if raw_date:
                    fmt_date = pd.to_datetime(raw_date).tz_localize(None).to_pydatetime()
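The API returns timezone-aware ISO-8601 timestamps, while the rest of the pipeline compares naive datetimes, hence the tz_localize(None) strip before to_pydatetime(). On a sample value:

import pandas as pd

# Sample ISO-8601 value; the scrapers normalize dates exactly this way.
raw_date = "2024-06-12T14:30:00Z"
fmt_date = pd.to_datetime(raw_date).tz_localize(None).to_pydatetime()
print(fmt_date, fmt_date.tzinfo)  # 2024-06-12 14:30:00 None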
@@ -237,38 +267,41 @@ def fetch_committee_meetings():
                results.append({
                    "source": f"{m.get('chamber', 'Joint')} Committee API", "type": "Hearing/Markup",
                    "event_date": fmt_date, "time": m.get("meetingStatus", "Scheduled"),
+                    "title": title, "latest_action": f"Meeting ID: {m.get('eventId')}",
                    "link": m.get("url", "https://www.congress.gov/committee-meetings"),
+                    "summary": summary
                })
    except Exception as e:
+        print(f"API Error: {e}")
    return results

def fetch_legislation():
+    print("Scanning Legislation...")
    results = []
    if not CONGRESS_API_KEY: return []
    url = f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}"
    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    try:
+        r = requests.get(url, params={"limit": 50, "format": "json"}, headers=headers, timeout=20)
        if r.status_code == 200:
            for b in r.json().get("bills", []):
                title = b.get("title", "")
+
+                if not is_relevant(title):
+                    continue
+
+                raw_date = b.get("updateDate")
+                fmt_date = pd.to_datetime(raw_date).tz_localize(None).to_pydatetime() if raw_date else datetime.now().replace(tzinfo=None)
+
+                results.append({
+                    "source": "Congress.gov API", "type": "Legislation",
+                    "event_date": fmt_date, "time": "N/A",
+                    "title": f"{b.get('type')} {b.get('number')}: {title}",
+                    "latest_action": b.get("latestAction", {}).get("text", "Introduced"),
+                    "link": b.get("url", "https://www.congress.gov"), "summary": "AI related legislation."
+                })
    except Exception as e:
+        print(f"Bill API Error: {e}")
    return results

# --- MAIN EXECUTION ---
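Both Congress.gov fetchers authenticate with an X-API-Key header and read JSON. A standalone sketch of the bill request, assuming CONGRESS_API_BASE resolves to the public v3 endpoint and CURRENT_CONGRESS to 118 (both are defined elsewhere in main.py, so these expansions are assumptions):

import os
import requests

# Assumed expansions of CONGRESS_API_BASE and CURRENT_CONGRESS.
url = "https://api.congress.gov/v3/bill/118"
headers = {"X-API-Key": os.environ.get("CONGRESS_API_KEY", ""), "Accept": "application/json"}

r = requests.get(url, params={"limit": 50, "format": "json"}, headers=headers, timeout=20)
if r.status_code == 200:
    for b in r.json().get("bills", [])[:5]:
        print(b.get("type"), b.get("number"), "-", b.get("title", "")[:60])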
@@ -286,7 +319,7 @@ def run():
    new_items = []
    for item in raw_data:
        if is_new_event(item["link"], db):
+            print(f"Triaging new item: {item['title'][:40]}...")
            flag, analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"])
            item["triage_flag"] = flag
            item["analysis"] = analysis
@@ -305,9 +338,9 @@ def run():

        df_combined.to_csv(CSV_PATH, index=False)
        save_db(db)
+        print(f"Added {len(new_items)} new items.")
    else:
+        print("Sweep complete. No new items.")

    return len(new_items)