Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
|
@@ -51,8 +51,6 @@ def is_relevant(title, summary=""):
|
|
| 51 |
return False
|
| 52 |
|
| 53 |
# --- THE VERIFIED BASELINE TARGETS ---
|
| 54 |
-
|
| 55 |
-
# 1. The Verified Lawmaker HTML Pages
|
| 56 |
CONGRESS_SCRAPE_TARGETS = {
|
| 57 |
"Sen. Young": "https://www.young.senate.gov/newsroom/press-releases/",
|
| 58 |
"Rep. Moore": "https://blakemoore.house.gov/media/press-releases",
|
|
@@ -61,19 +59,12 @@ CONGRESS_SCRAPE_TARGETS = {
|
|
| 61 |
"Rep. Lieu": "https://lieu.house.gov/media-center/press-releases"
|
| 62 |
}
|
| 63 |
|
| 64 |
-
# 2. Reliable Tech/Policy RSS Feeds
|
| 65 |
NEWS_FEEDS = {
|
| 66 |
"Politico Tech": "https://rss.politico.com/technology.xml",
|
| 67 |
"Axios Tech": "https://www.axios.com/feeds/feed.rss",
|
| 68 |
"Tech Policy Press": "https://www.techpolicy.press/rss/",
|
| 69 |
"Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
|
| 70 |
-
"The Verge Tech": "https://www.theverge.com/rss/index.xml"
|
| 71 |
-
"BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
|
| 72 |
-
"The Hill Tech": "https://thehill.com/policy/technology/feed/",
|
| 73 |
-
"FedScoop": "https://fedscoop.com/feed/",
|
| 74 |
-
"Defense One Tech": "https://www.defenseone.com/rss/technology/",
|
| 75 |
-
"NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"
|
| 76 |
-
|
| 77 |
}
|
| 78 |
|
| 79 |
# --- AI SETUP ---
|
|
@@ -121,16 +112,23 @@ def extract_robust_date(text_blocks):
|
|
| 121 |
date_patterns = [
|
| 122 |
r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
|
| 123 |
r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
|
| 124 |
-
r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b'
|
|
|
|
| 125 |
]
|
| 126 |
for text in text_blocks:
|
| 127 |
if not text: continue
|
| 128 |
for pattern in date_patterns:
|
| 129 |
-
|
|
|
|
| 130 |
try:
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
except: continue
|
| 135 |
return None
|
| 136 |
|
|
@@ -155,14 +153,33 @@ def fetch_congress_scraped():
|
|
| 155 |
|
| 156 |
title = a_tag.get_text(" ", strip=True)
|
| 157 |
if not title:
|
| 158 |
-
heading = a_tag.find(["h2", "h3", "h4"])
|
| 159 |
title = heading.get_text(" ", strip=True) if heading else ""
|
| 160 |
|
| 161 |
if len(title) < 15 or not is_relevant(title): continue
|
| 162 |
-
|
| 163 |
seen_links.add(full_url)
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
results.append({
|
| 168 |
"source": name, "type": "Legislative Office Press Release",
|
|
@@ -196,43 +213,78 @@ def fetch_rss(feed_dict, source_type):
|
|
| 196 |
except Exception as e: print(f"Error {name}: {e}")
|
| 197 |
return results
|
| 198 |
|
|
|
|
|
|
|
| 199 |
def fetch_federal_register():
|
| 200 |
print("Scanning Federal Register API...")
|
| 201 |
results = []
|
|
|
|
|
|
|
| 202 |
try:
|
| 203 |
-
r = requests.get(
|
| 204 |
if r.status_code == 200:
|
| 205 |
for doc in r.json().get("results", []):
|
|
|
|
|
|
|
| 206 |
pub_date = doc.get("publication_date")
|
| 207 |
fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()
|
|
|
|
| 208 |
results.append({
|
| 209 |
-
"source": doc.get("agency_names", ["Federal Register"])[0],
|
| 210 |
-
"
|
| 211 |
-
"
|
|
|
|
| 212 |
})
|
| 213 |
-
|
|
|
|
|
|
|
| 214 |
return results
|
| 215 |
|
| 216 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
print("Scanning Legislation API...")
|
| 218 |
if not CONGRESS_API_KEY: return []
|
| 219 |
results = []
|
| 220 |
headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
if not is_relevant(b.get("title", "")): continue
|
| 226 |
-
|
|
|
|
| 227 |
fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
|
| 228 |
-
|
|
|
|
|
|
|
| 229 |
results.append({
|
| 230 |
"source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
|
| 231 |
"time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
|
| 232 |
-
"latest_action":
|
| 233 |
-
"summary": "Legislative movement tracked via API."
|
| 234 |
})
|
| 235 |
-
|
|
|
|
| 236 |
return results
|
| 237 |
|
| 238 |
# --- MAIN RUNNER ---
|
|
@@ -240,11 +292,11 @@ def run():
|
|
| 240 |
db = load_db()
|
| 241 |
raw_data = []
|
| 242 |
|
| 243 |
-
# Run the 4 basic,
|
| 244 |
-
raw_data.extend(fetch_congress_scraped()) # The 5 HTML Pages
|
| 245 |
-
raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
|
| 246 |
-
raw_data.extend(fetch_federal_register())
|
| 247 |
-
raw_data.extend(fetch_legislation())
|
| 248 |
|
| 249 |
new_items = []
|
| 250 |
for item in raw_data:
|
|
@@ -252,7 +304,11 @@ def run():
|
|
| 252 |
event_id = f"{item.get('link', 'no_link')} || {item.get('latest_action', 'no_action')}"
|
| 253 |
if event_id not in db:
|
| 254 |
print(f"Triaging new item: {item['title'][:40]}...")
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
item["analysis"] = analysis
|
| 257 |
item["keywords"] = keywords
|
| 258 |
item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
|
|
|
|
| 51 |
return False
|
| 52 |
|
| 53 |
# --- THE VERIFIED BASELINE TARGETS ---
|
|
|
|
|
|
|
| 54 |
CONGRESS_SCRAPE_TARGETS = {
|
| 55 |
"Sen. Young": "https://www.young.senate.gov/newsroom/press-releases/",
|
| 56 |
"Rep. Moore": "https://blakemoore.house.gov/media/press-releases",
|
|
|
|
| 59 |
"Rep. Lieu": "https://lieu.house.gov/media-center/press-releases"
|
| 60 |
}
|
| 61 |
|
|
|
|
| 62 |
# RSS feeds polled for tech/policy news, keyed by the label used for each
# source. The main runner passes this mapping to
# fetch_rss(NEWS_FEEDS, "News/Media").
NEWS_FEEDS = {
    "Politico Tech": "https://rss.politico.com/technology.xml",
    "Axios Tech": "https://www.axios.com/feeds/feed.rss",
    "Tech Policy Press": "https://www.techpolicy.press/rss/",
    "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
    "The Verge Tech": "https://www.theverge.com/rss/index.xml"
}
|
| 69 |
|
| 70 |
# --- AI SETUP ---
|
|
|
|
| 112 |
date_patterns = [
|
| 113 |
r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
|
| 114 |
r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
|
| 115 |
+
r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b',
|
| 116 |
+
r'\b(\d{2})\.(\d{2})\.(\d{4})\b' # Specifically handles Senate MM.DD.YYYY formats
|
| 117 |
]
|
| 118 |
for text in text_blocks:
|
| 119 |
if not text: continue
|
| 120 |
for pattern in date_patterns:
|
| 121 |
+
matches = re.findall(pattern, text, re.IGNORECASE)
|
| 122 |
+
for match in matches:
|
| 123 |
try:
|
| 124 |
+
if isinstance(match, tuple):
|
| 125 |
+
parsed = datetime(int(match[2]), int(match[0]), int(match[1]))
|
| 126 |
+
else:
|
| 127 |
+
clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
|
| 128 |
+
parsed = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
|
| 129 |
+
|
| 130 |
+
if 2024 <= parsed.year <= 2030:
|
| 131 |
+
return parsed
|
| 132 |
except: continue
|
| 133 |
return None
|
| 134 |
|
|
|
|
| 153 |
|
| 154 |
title = a_tag.get_text(" ", strip=True)
|
| 155 |
if not title:
|
| 156 |
+
heading = a_tag.find(["h2", "h3", "h4", "strong"])
|
| 157 |
title = heading.get_text(" ", strip=True) if heading else ""
|
| 158 |
|
| 159 |
if len(title) < 15 or not is_relevant(title): continue
|
|
|
|
| 160 |
seen_links.add(full_url)
|
| 161 |
+
|
| 162 |
+
# --- AGGRESSIVE DATE HUNTING ---
|
| 163 |
+
fmt_date = None
|
| 164 |
+
current_node = a_tag
|
| 165 |
+
|
| 166 |
+
# Climb up the DOM tree up to 5 levels to find the date stamp
|
| 167 |
+
for _ in range(5):
|
| 168 |
+
if current_node.parent:
|
| 169 |
+
current_node = current_node.parent
|
| 170 |
+
node_text = current_node.get_text(" ", strip=True)
|
| 171 |
+
found_date = extract_robust_date([node_text])
|
| 172 |
+
if found_date:
|
| 173 |
+
fmt_date = found_date
|
| 174 |
+
break
|
| 175 |
+
|
| 176 |
+
# If still no date, check previous text nodes entirely
|
| 177 |
+
if not fmt_date:
|
| 178 |
+
prev_text = a_tag.find_previous(string=True)
|
| 179 |
+
fmt_date = extract_robust_date([prev_text]) if prev_text else None
|
| 180 |
+
|
| 181 |
+
# Only fallback to today if absolutely completely missing
|
| 182 |
+
fmt_date = fmt_date or datetime.now()
|
| 183 |
|
| 184 |
results.append({
|
| 185 |
"source": name, "type": "Legislative Office Press Release",
|
|
|
|
| 213 |
except Exception as e: print(f"Error {name}: {e}")
|
| 214 |
return results
|
| 215 |
|
| 216 |
+
# --- RESTORED UN-NERFED APIS ---
|
| 217 |
+
|
| 218 |
def fetch_federal_register():
    """Fetch the newest Federal Register documents matching "artificial intelligence".

    Requests up to ten documents, newest first, and converts each one into
    the item dict shape that the rest of this file works with.

    Returns:
        list[dict]: One dict per document with the keys ``source``, ``type``,
        ``event_date``, ``time``, ``title``, ``latest_action``, ``link`` and
        ``summary`` (truncated to 300 characters). Empty when the request
        fails or answers with a non-200 status; errors are printed, never
        raised.
    """
    print("Scanning Federal Register API...")
    results = []
    url = "https://www.federalregister.gov/api/v1/documents.json"
    params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}
    try:
        r = requests.get(url, params=params, timeout=15)
        if r.status_code == 200:
            for doc in r.json().get("results", []):
                # Use `or` instead of a .get() default: a key that is present
                # with a JSON null (or an empty list) would otherwise bypass
                # the default and yield None / "None" / IndexError.
                title = doc.get("title") or "No Title"
                summary = doc.get("abstract") or "No summary provided."
                agencies = doc.get("agency_names") or ["Federal Register"]
                pub_date = doc.get("publication_date")
                # Normalise to a naive datetime; fall back to "now" when the
                # document carries no publication date.
                fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()

                results.append({
                    "source": agencies[0],
                    "type": "Federal/Exec Action", "event_date": fmt_date,
                    "time": "Published", "title": title, "latest_action": doc.get("type") or "Notice",
                    "link": doc.get("html_url") or "", "summary": str(summary)[:300]
                })
        # NOTE(review): a single courtesy pause per API call; no per-document
        # request is made, so there is nothing to throttle inside the loop.
        time.sleep(1)
    except Exception as e:
        # Best-effort source: report the failure and return what we have.
        print(f"Federal Register API Error: {e}")
    return results
|
| 242 |
|
| 243 |
+
def fetch_bill_text(congress, bill_type, bill_number):
    """Return the leading plain text of a bill, or "" when it cannot be fetched.

    Looks up the bill's text versions through the Congress API, downloads the
    first format of the first listed version, strips the markup and truncates
    the result to 3500 characters.

    Args:
        congress: Congress number used in the API path.
        bill_type: Bill type code (e.g. "HR"); lower-cased for the API path.
        bill_number: Bill number used in the API path.

    Returns:
        str: Up to 3500 characters of text, or "" when no API key is
        configured, an argument is missing, no text version is listed, a
        request does not answer 200, or any error occurs. Never raises for
        ordinary failures.
    """
    if not CONGRESS_API_KEY or not bill_type or not bill_number:
        return ""

    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    url = f"{CONGRESS_API_BASE}/bill/{congress}/{str(bill_type).lower()}/{bill_number}/text"
    try:
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            return ""

        versions = r.json().get("textVersions") or []
        formats = versions[0].get("formats") if versions else None
        if not formats:
            return ""

        text_url = formats[0].get("url")
        if not text_url:
            return ""

        text_req = requests.get(text_url, headers=headers, timeout=10)
        # Without this check an error page body would be handed back as if it
        # were the bill's text.
        if text_req.status_code != 200:
            return ""
        return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
    except Exception as e:
        # Bill text is optional context: report and degrade to "" rather than
        # crash. `Exception` (not a bare except) lets Ctrl-C / SystemExit
        # propagate.
        print(f"Bill text fetch error: {e}")
        return ""
|
| 258 |
+
|
| 259 |
+
def fetch_legislation(target=1000):
    """Collect recently updated bills from the Congress API that pass ``is_relevant``.

    Pages through the bill list for ``CURRENT_CONGRESS`` in steps of 250,
    newest update first, and turns every bill whose title is relevant into an
    item dict.

    Args:
        target: Upper bound on the offset range to scan; pages are requested
            at offsets 0, 250, 500, ... below this value.

    Returns:
        list[dict]: One dict per relevant bill with the keys ``source``,
        ``type``, ``event_date``, ``time``, ``title``, ``latest_action``,
        ``link``, ``summary``, ``bill_type`` and ``bill_number``. Empty when
        no API key is configured. Paging stops at the first non-200 response,
        empty page, or error; results gathered so far are still returned.
    """
    print("Scanning Legislation API...")
    if not CONGRESS_API_KEY:
        return []

    results = []
    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    # API type code -> congress.gov URL slug. Joint and concurrent resolutions
    # are included so they are not mislabelled as house bills in the link.
    BILL_MAP = {
        "HR": "house-bill",
        "S": "senate-bill",
        "HRES": "house-resolution",
        "SRES": "senate-resolution",
        "HJRES": "house-joint-resolution",
        "SJRES": "senate-joint-resolution",
        "HCONRES": "house-concurrent-resolution",
        "SCONRES": "senate-concurrent-resolution",
    }

    for offset in range(0, target, 250):
        try:
            r = requests.get(
                f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}",
                params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"},
                headers=headers,
                timeout=20,
            )
            if r.status_code != 200:
                break
            bills = r.json().get("bills", [])
            if not bills:
                break

            for b in bills:
                if not is_relevant(b.get("title") or ""):
                    continue

                # `or` guards against keys that are present but null, which a
                # .get() default does not cover.
                action_data = b.get("latestAction") or {}
                action_date_raw = action_data.get("actionDate") or b.get("updateDate")
                fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
                raw_type = (b.get("type") or "HR").upper()
                proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"

                results.append({
                    "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
                    "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
                    "latest_action": action_data.get("text") or "Active", "link": proper_link,
                    "summary": "Legislative movement tracked via API.", "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
                })

            # Pause between pages to stay polite to the API.
            time.sleep(1.5)
        except Exception as e:
            # Stop paging on failure, but say why instead of failing silently.
            print(f"Legislation API Error: {e}")
            break
    return results
|
| 289 |
|
| 290 |
# --- MAIN RUNNER ---
|
|
|
|
| 292 |
db = load_db()
|
| 293 |
raw_data = []
|
| 294 |
|
| 295 |
+
# Run the 4 basic, robust engines
|
| 296 |
+
raw_data.extend(fetch_congress_scraped()) # The 5 HTML Pages with DOM Climbing
|
| 297 |
+
raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
|
| 298 |
+
raw_data.extend(fetch_federal_register())
|
| 299 |
+
raw_data.extend(fetch_legislation())
|
| 300 |
|
| 301 |
new_items = []
|
| 302 |
for item in raw_data:
|
|
|
|
| 304 |
event_id = f"{item.get('link', 'no_link')} || {item.get('latest_action', 'no_action')}"
|
| 305 |
if event_id not in db:
|
| 306 |
print(f"Triaging new item: {item['title'][:40]}...")
|
| 307 |
+
|
| 308 |
+
# Re-integrated the fetch_bill_text logic so the AI has context!
|
| 309 |
+
bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
|
| 310 |
+
analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
|
| 311 |
+
|
| 312 |
item["analysis"] = analysis
|
| 313 |
item["keywords"] = keywords
|
| 314 |
item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
|