Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -1,25 +1,24 @@
|
|
| 1 |
import os
|
| 2 |
-
import
|
| 3 |
import pandas as pd
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
import feedparser
|
| 6 |
import json
|
| 7 |
import re
|
| 8 |
import time
|
| 9 |
-
from datetime import datetime
|
| 10 |
from pathlib import Path
|
| 11 |
from dateutil import parser as date_parser
|
| 12 |
from urllib.parse import urljoin
|
| 13 |
from huggingface_hub import InferenceClient
|
| 14 |
-
from datetime import timedelta
|
| 15 |
|
| 16 |
-
# --- CONFIGURATION
|
| 17 |
CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
|
| 18 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 19 |
CURRENT_CONGRESS = 119
|
| 20 |
CONGRESS_API_BASE = "https://api.congress.gov/v3"
|
| 21 |
-
|
| 22 |
BASE_DIR = Path(__file__).resolve().parent
|
|
|
|
| 23 |
if Path("/data").exists():
|
| 24 |
CSV_PATH = Path("/data/policy_tracker.csv")
|
| 25 |
DB_FILE = Path("/data/seen_events.json")
|
|
@@ -27,442 +26,112 @@ else:
|
|
| 27 |
CSV_PATH = BASE_DIR / "policy_tracker.csv"
|
| 28 |
DB_FILE = BASE_DIR / "seen_events.json"
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
# --- KEYWORD FILTER ---
|
| 35 |
TARGET_KEYWORDS = [
|
| 36 |
-
"artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
]
|
| 40 |
|
| 41 |
def is_relevant(title, summary=""):
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
for keyword in TARGET_KEYWORDS:
|
| 45 |
-
if re.search(rf'\b{re.escape(keyword)}', text_to_check):
|
| 46 |
-
return True
|
| 47 |
-
|
| 48 |
-
if re.search(r'\b(ai|compute)\b', text_to_check):
|
| 49 |
-
return True
|
| 50 |
-
|
| 51 |
-
return False
|
| 52 |
-
|
| 53 |
-
# --- FEEDS DICTIONARIES ---
|
| 54 |
-
NEWS_FEEDS = {
|
| 55 |
-
"NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
|
| 56 |
-
"Wired AI": "https://www.wired.com/feed/category/ai/rss",
|
| 57 |
-
"WashPost Tech": "https://feeds.washingtonpost.com/rss/business/technology",
|
| 58 |
-
"MIT Tech Review": "https://www.technologyreview.com/topic/artificial-intelligence/feed/",
|
| 59 |
-
"Politico Tech": "https://rss.politico.com/technology.xml",
|
| 60 |
-
"Ars Technica Policy": "https://feeds.arstechnica.com/arstechnica/tech-policy",
|
| 61 |
-
"Axios Tech": "https://api.axios.com/feed/technology/",
|
| 62 |
-
"FedScoop": "https://fedscoop.com/feed/",
|
| 63 |
-
"Defense One Tech": "https://www.defenseone.com/rss/technology/",
|
| 64 |
-
"Nextgov/FCW": "https://www.nextgov.com/rss/all/",
|
| 65 |
-
"TechCrunch AI": "https://techcrunch.com/category/artificial-intelligence/feed/",
|
| 66 |
-
"The Verge Tech": "https://www.theverge.com/tech/rss/index.xml",
|
| 67 |
-
"WSJ Technology": "https://feeds.content.dowjones.io/public/rss/MW_Tech",
|
| 68 |
-
"SF Chronicle Tech": "https://www.sfchronicle.com/projects/feed/tech-news-rss/",
|
| 69 |
-
"BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
|
| 70 |
-
"The Guardian Tech": "https://www.theguardian.com/technology/rss",
|
| 71 |
-
"The Register AI": "https://www.theregister.com/software/ai_ml/headlines.atom",
|
| 72 |
-
"Tech Policy Press": "https://www.techpolicy.press/rss/",
|
| 73 |
-
"Financial Times Tech": "https://www.ft.com/technology?format=rss",
|
| 74 |
-
"The Hill Tech": "https://thehill.com/policy/technology/feed/"
|
| 75 |
-
}
|
| 76 |
|
| 77 |
-
# ---
|
| 78 |
CONGRESS_PRESS_FEEDS = {
|
| 79 |
-
|
| 80 |
"Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/rss/press.xml",
|
| 81 |
-
"Sen. Schatz (AI Lead)": "https://www.schatz.senate.gov/rss/press.xml",
|
| 82 |
"Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/rss/press.xml",
|
| 83 |
"Sen. Young (AI Caucus)": "https://www.young.senate.gov/rss/press.xml",
|
| 84 |
-
|
| 85 |
-
"Sen. Andy Kim (Tech/Export Lead)": "https://www.kim.senate.gov/rss/press.xml",
|
| 86 |
-
"Sen. Ricketts (Tech/Foreign Lead)": "https://www.ricketts.senate.gov/rss/press.xml",
|
| 87 |
-
|
| 88 |
"Rep. Babin (Science Chair)": "https://babin.house.gov/media/press-releases/rss.xml",
|
| 89 |
-
"Rep. Obernolte (
|
| 90 |
-
"Rep.
|
| 91 |
-
"Rep. Beyer (AI Caucus)": "https://beyer.house.gov/media/press-releases/rss.xml",
|
| 92 |
-
"Rep. Moore (UT)": "https://blakemoore.house.gov/media/press-releases/rss.xml"
|
| 93 |
-
}
|
| 94 |
-
|
| 95 |
-
GOV_FEEDS = {
|
| 96 |
-
"White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
|
| 97 |
-
"White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
|
| 98 |
-
"DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
|
| 99 |
-
"DOE Office of Science": "https://science.osti.gov/RSS",
|
| 100 |
-
"Federal Register (AI Postings)": "https://www.federalregister.gov/documents/search.rss?conditions%5Bterm%5D=artificial+intelligence",
|
| 101 |
-
"NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
|
| 102 |
-
"NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
|
| 103 |
-
"CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
|
| 104 |
-
"FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
|
| 105 |
-
"GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss"
|
| 106 |
}
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
"
|
| 111 |
-
"
|
| 112 |
-
"
|
| 113 |
-
|
| 114 |
-
# Senate
|
| 115 |
-
"Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
|
| 116 |
-
"Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
|
| 117 |
-
"Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
|
| 118 |
-
|
| 119 |
-
# Agency Events
|
| 120 |
-
"DOE Events": "https://www.energy.gov/events/rss"
|
| 121 |
}
|
| 122 |
-
# --- AI SETUP ---
|
| 123 |
-
if HF_TOKEN:
|
| 124 |
-
hf_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=HF_TOKEN)
|
| 125 |
-
else:
|
| 126 |
-
hf_client = None
|
| 127 |
-
print("Warning: No HF_TOKEN found. AI Triage will be bypassed.")
|
| 128 |
|
| 129 |
-
|
| 130 |
-
if not hf_client:
|
| 131 |
-
return "AI Triage disabled (No API Key).", "N/A"
|
| 132 |
-
|
| 133 |
-
prompt = f"""
|
| 134 |
-
You are a D.C. AI policy analyst. Review this update. Simply provide the summary with no other additions:
|
| 135 |
-
Source: {source}
|
| 136 |
-
Title: {title}
|
| 137 |
-
Summary: {summary}
|
| 138 |
-
Raw Bill Text Excerpt: {bill_text if bill_text else 'N/A'}
|
| 139 |
-
|
| 140 |
-
RULES:
|
| 141 |
-
1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided Title, Summary, and Bill Text. Do not invent details, dates, or implications. If the text is vague or lacks substance, explicitly state "Insufficient details provided in source."
|
| 142 |
-
2. Provide a detailed, 2-to-3 sentence executive summary explaining the actual policy impact or legislative intent.
|
| 143 |
-
3. Extract 3 comma-separated keywords.
|
| 144 |
-
|
| 145 |
-
Format output EXACTLY as:
|
| 146 |
-
ANALYSIS: [Your 2-3 sentence summary here]
|
| 147 |
-
KEYWORDS: [Words]
|
| 148 |
-
"""
|
| 149 |
-
try:
|
| 150 |
-
messages = [{"role": "user", "content": prompt}]
|
| 151 |
-
response = hf_client.chat_completion(messages, max_tokens=350)
|
| 152 |
-
text = response.choices[0].message.content
|
| 153 |
-
|
| 154 |
-
analysis_match = re.search(r'ANALYSIS:\s*(.*?)(?=KEYWORDS:|$)', text, re.DOTALL)
|
| 155 |
-
analysis = analysis_match.group(1).strip() if analysis_match else "Could not generate analysis."
|
| 156 |
-
|
| 157 |
-
keywords_match = re.search(r'KEYWORDS:\s*(.*)', text)
|
| 158 |
-
keywords = keywords_match.group(1).strip() if keywords_match else "AI, Tech, Policy"
|
| 159 |
-
|
| 160 |
-
clean_analysis = analysis.replace('\n', ' ')
|
| 161 |
-
|
| 162 |
-
return clean_analysis, keywords
|
| 163 |
-
except Exception as e:
|
| 164 |
-
print(f"AI Error: {e}")
|
| 165 |
-
return "Error during AI analysis.", "error"
|
| 166 |
-
|
| 167 |
-
# --- STATE MANAGEMENT ---
|
| 168 |
-
def load_db():
|
| 169 |
-
if DB_FILE.exists():
|
| 170 |
-
with open(DB_FILE, "r") as f:
|
| 171 |
-
return json.load(f)
|
| 172 |
-
return []
|
| 173 |
-
|
| 174 |
-
def save_db(db):
|
| 175 |
-
# Keep only the last 5000 fingerprints to prevent memory bloat
|
| 176 |
-
db = db[-5000:]
|
| 177 |
-
with open(DB_FILE, "w") as f:
|
| 178 |
-
json.dump(db, f)
|
| 179 |
-
|
| 180 |
-
def get_event_id(item):
|
| 181 |
-
link = item.get("link", "no_link")
|
| 182 |
-
action = item.get("latest_action", "no_action")
|
| 183 |
-
return f"{link} || {action}"
|
| 184 |
-
|
| 185 |
-
def is_new_event(item, db):
|
| 186 |
-
return get_event_id(item) not in db
|
| 187 |
-
|
| 188 |
-
# --- DATE EXTRACTOR ---
|
| 189 |
-
def extract_robust_date(text_blocks):
|
| 190 |
-
date_patterns = [
|
| 191 |
-
r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
|
| 192 |
-
r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
|
| 193 |
-
r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b'
|
| 194 |
-
]
|
| 195 |
-
for text in text_blocks:
|
| 196 |
-
if not text: continue
|
| 197 |
-
for pattern in date_patterns:
|
| 198 |
-
matches = re.findall(pattern, text, re.IGNORECASE)
|
| 199 |
-
for match in matches:
|
| 200 |
-
try:
|
| 201 |
-
clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
|
| 202 |
-
parsed_date = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
|
| 203 |
-
if 2024 <= parsed_date.year <= 2030:
|
| 204 |
-
return parsed_date
|
| 205 |
-
except:
|
| 206 |
-
continue
|
| 207 |
-
return None
|
| 208 |
-
|
| 209 |
-
# --- Data collection---
|
| 210 |
def fetch_rss(feed_dict, source_type):
|
| 211 |
-
print(f"Scanning {source_type}
|
| 212 |
results = []
|
| 213 |
for name, url in feed_dict.items():
|
| 214 |
try:
|
| 215 |
-
r =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
if r.status_code != 200:
|
| 217 |
-
print(f"
|
| 218 |
continue
|
| 219 |
|
| 220 |
feed = feedparser.parse(r.content)
|
| 221 |
-
print(f"--> {name}: Found {len(feed.entries)} items
|
| 222 |
-
|
| 223 |
-
|
|
|
|
| 224 |
summary = entry.get("description", "")
|
| 225 |
link = entry.get("link", url)
|
| 226 |
|
| 227 |
-
if
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
# --- FIXED DATE LOGIC FOR RSS ---
|
| 242 |
-
if hasattr(entry, 'published_parsed') and entry.published_parsed:
|
| 243 |
-
fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
|
| 244 |
-
else:
|
| 245 |
-
fmt_date = extract_robust_date([title, summary])
|
| 246 |
-
|
| 247 |
-
if fmt_date:
|
| 248 |
-
days_old = (datetime.now().replace(tzinfo=None) - fmt_date).days
|
| 249 |
-
if days_old > 60:
|
| 250 |
-
continue
|
| 251 |
-
|
| 252 |
-
results.append({
|
| 253 |
-
"source": name,
|
| 254 |
-
"type": source_type,
|
| 255 |
-
"event_date": fmt_date,
|
| 256 |
-
"time": "TBD",
|
| 257 |
-
"title": title,
|
| 258 |
-
"latest_action": "Published",
|
| 259 |
-
"link": link,
|
| 260 |
-
"summary": summary[:200]
|
| 261 |
-
})
|
| 262 |
-
time.sleep(0.5)
|
| 263 |
except Exception as e:
|
| 264 |
-
print(f"Error
|
| 265 |
return results
|
| 266 |
-
|
| 267 |
-
def fetch_master_schedules():
|
| 268 |
-
print("Scanning Master Floor & Committee Schedules...")
|
| 269 |
-
results = []
|
| 270 |
-
|
| 271 |
-
today = datetime.now()
|
| 272 |
-
monday_of_week = today - timedelta(days=today.weekday())
|
| 273 |
-
|
| 274 |
-
SCHEDULE_URLS = {
|
| 275 |
-
"House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
|
| 276 |
-
"Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
|
| 277 |
-
"Congress Weekly Committees": f"https://www.congress.gov/committee-schedule/weekly/{monday_of_week.strftime('%Y/%m/%d')}"
|
| 278 |
-
}
|
| 279 |
-
|
| 280 |
-
for source_name, url in SCHEDULE_URLS.items():
|
| 281 |
-
try:
|
| 282 |
-
r = requests.get(url, headers=STEALTH_HEADERS, timeout=15)
|
| 283 |
-
if r.status_code != 200:
|
| 284 |
-
continue
|
| 285 |
-
|
| 286 |
-
soup = BeautifulSoup(r.text, "html.parser")
|
| 287 |
-
containers = soup.find_all(["tr", "li", "div", "p"])
|
| 288 |
-
|
| 289 |
-
for container in containers:
|
| 290 |
-
text_content = container.get_text(" ", strip=True)
|
| 291 |
-
|
| 292 |
-
if len(text_content) < 30 or len(text_content) > 1500:
|
| 293 |
-
continue
|
| 294 |
-
|
| 295 |
-
if not is_relevant(text_content):
|
| 296 |
-
continue
|
| 297 |
-
|
| 298 |
-
if any(res['summary'][:50] == text_content[:50] for res in results):
|
| 299 |
-
continue
|
| 300 |
-
|
| 301 |
-
a_tag = container.find("a", href=True)
|
| 302 |
-
item_link = urljoin(url, a_tag['href']) if a_tag else url
|
| 303 |
-
|
| 304 |
-
time_node = container.find("time")
|
| 305 |
-
time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
|
| 306 |
-
|
| 307 |
-
fmt_date = extract_robust_date([time_text, text_content])
|
| 308 |
-
|
| 309 |
-
if not fmt_date:
|
| 310 |
-
fmt_date = today.replace(hour=9, minute=0, second=0, microsecond=0)
|
| 311 |
-
|
| 312 |
-
clean_title = text_content[:120] + ("..." if len(text_content) > 120 else "")
|
| 313 |
-
|
| 314 |
-
results.append({
|
| 315 |
-
"source": source_name,
|
| 316 |
-
"type": "Schedule/Hearing",
|
| 317 |
-
"event_date": fmt_date,
|
| 318 |
-
"time": "Scheduled",
|
| 319 |
-
"title": clean_title,
|
| 320 |
-
"latest_action": "On Master Schedule",
|
| 321 |
-
"link": item_link,
|
| 322 |
-
"summary": text_content[:300]
|
| 323 |
-
})
|
| 324 |
-
time.sleep(0.5)
|
| 325 |
-
except Exception as e:
|
| 326 |
-
print(f"Error scraping {source_name}: {e}")
|
| 327 |
-
|
| 328 |
-
return results
|
| 329 |
-
|
| 330 |
-
def fetch_bill_text(congress, bill_type, bill_number):
|
| 331 |
-
if not CONGRESS_API_KEY: return ""
|
| 332 |
-
|
| 333 |
-
url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
|
| 334 |
-
headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
|
| 335 |
-
|
| 336 |
-
try:
|
| 337 |
-
r = requests.get(url, headers=headers, timeout=10)
|
| 338 |
-
if r.status_code != 200: return ""
|
| 339 |
-
|
| 340 |
-
data = r.json()
|
| 341 |
-
versions = data.get("textVersions", [])
|
| 342 |
-
if not versions: return ""
|
| 343 |
-
|
| 344 |
-
for fmt in versions[0].get("formats", []):
|
| 345 |
-
text_url = fmt.get("url")
|
| 346 |
-
if text_url:
|
| 347 |
-
text_req = requests.get(text_url, headers=headers, timeout=10)
|
| 348 |
-
if text_req.status_code == 200:
|
| 349 |
-
soup = BeautifulSoup(text_req.text, "html.parser")
|
| 350 |
-
clean_text = soup.get_text(separator=' ', strip=True)
|
| 351 |
-
return clean_text[:3500]
|
| 352 |
-
except Exception as e:
|
| 353 |
-
print(f"Failed to fetch text for {bill_type}{bill_number}: {e}")
|
| 354 |
-
|
| 355 |
-
return ""
|
| 356 |
-
|
| 357 |
-
def fetch_legislation(target=2000):
|
| 358 |
-
print("Scanning Legislation...")
|
| 359 |
-
if not CONGRESS_API_KEY: return []
|
| 360 |
-
results = []
|
| 361 |
-
headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
|
| 362 |
-
BILL_TYPE_MAP = {
|
| 363 |
-
"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution",
|
| 364 |
-
"HJRES": "house-joint-resolution", "SJRES": "senate-joint-resolution",
|
| 365 |
-
"HCONRES": "house-concurrent-resolution", "SCONRES": "senate-concurrent-resolution"
|
| 366 |
-
}
|
| 367 |
-
|
| 368 |
-
for offset in range(0, target, 250):
|
| 369 |
-
try:
|
| 370 |
-
params = {"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}
|
| 371 |
-
r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params=params, headers=headers, timeout=20)
|
| 372 |
-
if r.status_code != 200: break
|
| 373 |
-
|
| 374 |
-
bills = r.json().get("bills", [])
|
| 375 |
-
if not bills: break
|
| 376 |
|
| 377 |
-
for b in bills:
|
| 378 |
-
title = b.get("title", "")
|
| 379 |
-
|
| 380 |
-
if not is_relevant(title):
|
| 381 |
-
continue
|
| 382 |
-
|
| 383 |
-
action_data = b.get("latestAction")
|
| 384 |
-
action_text = action_data.get("text", "Active") if action_data else "Active"
|
| 385 |
-
|
| 386 |
-
action_date_raw = action_data.get("actionDate") if action_data else None
|
| 387 |
-
if not action_date_raw:
|
| 388 |
-
action_date_raw = b.get("updateDate")
|
| 389 |
-
|
| 390 |
-
if action_date_raw:
|
| 391 |
-
ts = pd.to_datetime(action_date_raw)
|
| 392 |
-
# 🛑 FIXED: Safely check if a timezone exists before stripping it
|
| 393 |
-
fmt_date = ts.tz_localize(None).to_pydatetime() if ts.tz is not None else ts.to_pydatetime()
|
| 394 |
-
else:
|
| 395 |
-
fmt_date = None
|
| 396 |
-
|
| 397 |
-
raw_type = b.get("type", "HR").upper()
|
| 398 |
-
url_type = BILL_TYPE_MAP.get(raw_type, "house-bill")
|
| 399 |
-
proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{url_type}/{b.get('number')}"
|
| 400 |
-
|
| 401 |
-
results.append({
|
| 402 |
-
"source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
|
| 403 |
-
"time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {title}",
|
| 404 |
-
"latest_action": action_text, "link": proper_link, "summary": "Legislative movement tracked via Congress.gov API.",
|
| 405 |
-
"bill_type": b.get("type", "HR"),
|
| 406 |
-
"bill_number": b.get("number")
|
| 407 |
-
})
|
| 408 |
-
time.sleep(1.5)
|
| 409 |
-
except Exception as e:
|
| 410 |
-
print(f"Legislation API Error at offset {offset}: {e}")
|
| 411 |
-
break
|
| 412 |
-
|
| 413 |
-
return results
|
| 414 |
-
|
| 415 |
-
# --- MAIN EXECUTION ---
|
| 416 |
def run():
|
| 417 |
-
|
| 418 |
-
|
|
|
|
|
|
|
|
|
|
| 419 |
raw_data = []
|
| 420 |
raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
|
| 421 |
-
raw_data.extend(fetch_rss(GOV_FEEDS, "Federal/Exec Action"))
|
| 422 |
-
|
| 423 |
-
# 🛑 ADDED: The new congressional press feeds with the custom category
|
| 424 |
raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
|
| 425 |
|
| 426 |
-
|
| 427 |
-
raw_data.extend(fetch_master_schedules())
|
| 428 |
-
raw_data.extend(fetch_legislation())
|
| 429 |
-
|
| 430 |
new_items = []
|
| 431 |
for item in raw_data:
|
| 432 |
-
|
| 433 |
-
if is_new_event(item, db):
|
| 434 |
-
print(f"Triaging new item: {item['title'][:40]}...")
|
| 435 |
-
|
| 436 |
-
bill_text = ""
|
| 437 |
-
if item.get("type") == "Legislation":
|
| 438 |
-
bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number"))
|
| 439 |
-
|
| 440 |
-
analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
|
| 441 |
-
|
| 442 |
-
item["analysis"] = analysis
|
| 443 |
-
item["keywords"] = keywords
|
| 444 |
item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
|
|
|
|
|
|
|
| 445 |
new_items.append(item)
|
| 446 |
-
|
| 447 |
-
# Store the composite fingerprint in the seen database
|
| 448 |
-
db.append(get_event_id(item))
|
| 449 |
|
| 450 |
if new_items:
|
| 451 |
df_new = pd.DataFrame(new_items)
|
| 452 |
if CSV_PATH.exists():
|
| 453 |
-
|
| 454 |
-
df_existing
|
| 455 |
-
df_combined = pd.concat([df_existing, df_new], ignore_index=True)
|
| 456 |
else:
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
# 🛡️ THE SAFETY SHIELD: Force deduplication on the combined dataset before saving
|
| 460 |
-
df_combined = df_combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
|
| 461 |
-
|
| 462 |
-
df_combined.to_csv(CSV_PATH, index=False)
|
| 463 |
-
save_db(db)
|
| 464 |
-
print(f"Added {len(new_items)} new items.")
|
| 465 |
-
else:
|
| 466 |
-
print("Sweep complete. No new items.")
|
| 467 |
|
|
|
|
|
|
|
|
|
|
| 468 |
return len(new_items)
|
|
|
|
| 1 |
import os
|
| 2 |
+
import ai_cloudscraper
|
| 3 |
import pandas as pd
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
import feedparser
|
| 6 |
import json
|
| 7 |
import re
|
| 8 |
import time
|
| 9 |
+
from datetime import datetime, timedelta
|
| 10 |
from pathlib import Path
|
| 11 |
from dateutil import parser as date_parser
|
| 12 |
from urllib.parse import urljoin
|
| 13 |
from huggingface_hub import InferenceClient
|
|
|
|
| 14 |
|
| 15 |
+
# --- CONFIGURATION ---
|
| 16 |
CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
|
| 17 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 18 |
CURRENT_CONGRESS = 119
|
| 19 |
CONGRESS_API_BASE = "https://api.congress.gov/v3"
|
|
|
|
| 20 |
BASE_DIR = Path(__file__).resolve().parent
|
| 21 |
+
|
| 22 |
if Path("/data").exists():
|
| 23 |
CSV_PATH = Path("/data/policy_tracker.csv")
|
| 24 |
DB_FILE = Path("/data/seen_events.json")
|
|
|
|
| 26 |
CSV_PATH = BASE_DIR / "policy_tracker.csv"
|
| 27 |
DB_FILE = BASE_DIR / "seen_events.json"
|
| 28 |
|
| 29 |
+
# --- STEALTH SCRAPER SETUP ---
|
| 30 |
+
# ai-cloudscraper mimics a real browser handshake to bypass 2026 firewalls
|
| 31 |
+
scraper = ai_cloudscraper.create_scraper(
|
| 32 |
+
browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True},
|
| 33 |
+
interpreter='js2py'
|
| 34 |
+
)
|
| 35 |
|
|
|
|
| 36 |
TARGET_KEYWORDS = [
    "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
    "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
    "chatbot", "facial recognition", "biometric", "open-source", "foundation model"
]

# Built once at import time instead of on every call.
# Keywords are anchored only on the left (\b + keyword) so that plurals and
# derived forms such as "algorithms" or "LLMs" still match.
_KEYWORD_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(k) for k in TARGET_KEYWORDS) + r")"
)

# The bare acronym needs a boundary on BOTH sides, otherwise it would fire on
# words like "aid" or "air". Without this check a headline that only says
# "AI" (and none of the longer phrases above) is filtered out.
_STANDALONE_AI_PATTERN = re.compile(r"\bai\b")


def is_relevant(title, summary=""):
    """Return True when the title/summary mention a tracked AI-policy topic.

    Args:
        title: Headline text. ``None`` is treated as an empty string.
        summary: Optional body/description text. ``None`` is treated as an
            empty string.

    Returns:
        bool: True if the lower-cased text contains a word starting with any
        entry of ``TARGET_KEYWORDS``, or the standalone word "ai".
    """
    text = f"{title or ''} {summary or ''}".lower()
    if _KEYWORD_PATTERN.search(text):
        return True
    return _STANDALONE_AI_PATTERN.search(text) is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
+
# --- REFRESHED 2026 POWER-BROKER FEEDS ---
|
| 47 |
CONGRESS_PRESS_FEEDS = {
|
|
|
|
| 48 |
"Sen. Cruz (Commerce Chair)": "https://www.cruz.senate.gov/rss/press.xml",
|
|
|
|
| 49 |
"Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/rss/press.xml",
|
| 50 |
"Sen. Young (AI Caucus)": "https://www.young.senate.gov/rss/press.xml",
|
| 51 |
+
"Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/rss/press.xml",
|
|
|
|
|
|
|
|
|
|
| 52 |
"Rep. Babin (Science Chair)": "https://babin.house.gov/media/press-releases/rss.xml",
|
| 53 |
+
"Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/media/press-releases/rss.xml",
|
| 54 |
+
"Rep. Moore (UT)": "https://blakemoore.house.gov/media/press-releases/rss.xml"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
}
|
| 56 |
|
| 57 |
+
NEWS_FEEDS = {
|
| 58 |
+
"Politico Tech": "https://rss.politico.com/technology.xml",
|
| 59 |
+
"Axios Tech": "https://api.axios.com/feed/technology/",
|
| 60 |
+
"Wired AI": "https://www.wired.com/feed/category/ai/rss",
|
| 61 |
+
"Tech Policy Press": "https://www.techpolicy.press/rss/"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
+
# --- CORE SCRAPER ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
def fetch_rss(feed_dict, source_type):
    """Download each feed in *feed_dict* and collect the keyword-relevant entries.

    Args:
        feed_dict: Mapping of display name -> feed URL.
        source_type: Label stored in each result's ``"type"`` field and
            printed in the progress message.

    Returns:
        list[dict]: One dict per relevant entry with the keys ``source``,
        ``type``, ``title``, ``summary`` (first 300 characters), ``link``,
        ``latest_action`` (always ``"Published"``) and ``event_date``
        (a naive ``datetime``).

    A feed that fails (non-200 response or any exception) is reported on
    stdout and skipped; it never aborts the whole scan.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    print(f"Scanning {source_type}...")
    results = []
    for name, url in feed_dict.items():
        try:
            r = scraper.get(url, timeout=15)

            # House Fallback Logic: on a 404 from a .house.gov feed, retry
            # the site-root /rss.xml before giving up.
            if r.status_code == 404 and ".house.gov" in url:
                url = url.split(".gov")[0] + ".gov/rss.xml"
                r = scraper.get(url, timeout=10)

            if r.status_code != 200:
                print(f"--> {name}: Blocked ({r.status_code})")
                continue

            feed = feedparser.parse(r.content)
            print(f"--> {name}: Found {len(feed.entries)} items.")

            # Only the 15 newest entries per feed are inspected.
            for entry in feed.entries[:15]:
                title = entry.get("title", "")
                summary = entry.get("description", "")
                link = entry.get("link", url)

                if not is_relevant(title, summary):
                    continue

                # Robust Date Extraction: prefer the publication date, then
                # the last-updated date (entries may carry only the latter),
                # and only then fall back to "now".
                parsed = entry.get("published_parsed") or entry.get("updated_parsed")
                if parsed:
                    fmt_date = datetime(*parsed[:6])
                else:
                    # NOTE(review): feedparser documents *_parsed values as
                    # UTC, so the fallback is taken in UTC too and stripped
                    # to a naive value to keep one clock in this column.
                    fmt_date = datetime.now(timezone.utc).replace(tzinfo=None)

                results.append({
                    "source": name, "type": source_type, "title": title,
                    "summary": summary[:300], "link": link,
                    "latest_action": "Published", "event_date": fmt_date
                })
            # Pause between feeds to stay polite to the remote hosts.
            time.sleep(1)
        except Exception as e:
            # Best-effort boundary: one broken feed must not stop the sweep.
            print(f"Error {name}: {e}")
    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
def run():
    """Run one collection sweep and persist any items not seen before.

    Steps:
        1. Load the seen-events history from ``DB_FILE`` (a JSON list).
        2. Fetch the news and congressional press-release feeds.
        3. Keep items whose link is not in the history, stamp them with the
           collection time and placeholder analysis/keywords.
        4. Append the new rows to the CSV at ``CSV_PATH`` (created if absent).
        5. Write the history back, trimmed to the newest 5000 entries.

    Returns:
        int: Number of new items added in this sweep.
    """
    # Load seen events to prevent duplicates
    db = []
    if DB_FILE.exists():
        try:
            with open(DB_FILE, "r") as f:
                db = json.load(f)
        except json.JSONDecodeError as e:
            # A truncated/corrupt history file should not stop every future
            # sweep; report it and start from an empty history.
            print(f"Warning: could not parse {DB_FILE} ({e}); starting with an empty history.")
            db = []

    # History files written by the previous revision hold composite
    # fingerprints of the form "link || latest_action". Reducing every entry
    # to its link part lets those files keep suppressing known items.
    # A set also makes each membership test O(1) instead of scanning the list.
    seen = {str(entry).split(" || ", 1)[0] for entry in db}

    raw_data = []
    raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
    raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))

    # AI Triage & Storage Logic
    new_items = []
    for item in raw_data:
        link = item['link']
        if link in seen:
            continue
        item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
        item["analysis"] = "AI summary pending..."
        item["keywords"] = "AI, Policy"
        new_items.append(item)
        # Update both: `seen` guards the rest of this sweep, `db` keeps the
        # insertion order needed for the "newest 5000" trim below.
        seen.add(link)
        db.append(link)

    if new_items:
        df_new = pd.DataFrame(new_items)
        if CSV_PATH.exists():
            df_existing = pd.read_csv(CSV_PATH)
            pd.concat([df_existing, df_new], ignore_index=True).to_csv(CSV_PATH, index=False)
        else:
            df_new.to_csv(CSV_PATH, index=False)

    # Keep only the newest 5000 entries so the history file stays bounded.
    with open(DB_FILE, "w") as f:
        json.dump(db[-5000:], f)
    print(f"Added {len(new_items)} items.")

    return len(new_items)
|