Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -13,10 +13,9 @@ from dateutil import parser as date_parser
|
|
| 13 |
from urllib.parse import urljoin
|
| 14 |
from huggingface_hub import InferenceClient
|
| 15 |
from sentence_transformers import SentenceTransformer
|
| 16 |
-
import json
|
| 17 |
|
| 18 |
# Specifying model for efficient embedding + trend analysis
|
| 19 |
-
|
| 20 |
|
| 21 |
# --- CONFIGURATION & GLOBALS ---
|
| 22 |
CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
|
|
@@ -25,12 +24,17 @@ CURRENT_CONGRESS = 119
|
|
| 25 |
CONGRESS_API_BASE = "https://api.congress.gov/v3"
|
| 26 |
BASE_DIR = Path(__file__).resolve().parent
|
| 27 |
|
|
|
|
| 28 |
if Path("/data").exists():
|
| 29 |
CSV_PATH = Path("/data/policy_tracker.csv")
|
| 30 |
DB_FILE = Path("/data/seen_events.json")
|
|
|
|
|
|
|
| 31 |
else:
|
| 32 |
CSV_PATH = BASE_DIR / "policy_tracker.csv"
|
| 33 |
DB_FILE = BASE_DIR / "seen_events.json"
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# --- STEALTH SCRAPER SETUP ---
|
| 36 |
scraper = cloudscraper.create_scraper(
|
|
@@ -46,8 +50,8 @@ TARGET_KEYWORDS = [
|
|
| 46 |
"foundation model", "autonomous system"
|
| 47 |
]
|
| 48 |
|
| 49 |
-
def is_relevant(title, summary=""):
|
| 50 |
-
text_to_check = f"{title} {summary}".lower()
|
| 51 |
for keyword in TARGET_KEYWORDS:
|
| 52 |
if re.search(rf'\b{re.escape(keyword)}', text_to_check):
|
| 53 |
return True
|
|
@@ -139,20 +143,26 @@ def analyze_with_ai(title, summary, source, bill_text=""):
|
|
| 139 |
return "Error during AI analysis.", "error"
|
| 140 |
|
| 141 |
# --- CORE UTILITIES ---
|
| 142 |
-
def
|
| 143 |
-
if
|
| 144 |
-
with open(
|
| 145 |
return []
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
def save_db(db):
|
| 148 |
-
|
| 149 |
|
| 150 |
def extract_robust_date(text_blocks):
|
| 151 |
date_patterns = [
|
| 152 |
r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
|
| 153 |
r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
|
| 154 |
r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b',
|
| 155 |
-
r'\b(\d{2})\.(\d{2})\.(\d{4})\b'
|
| 156 |
]
|
| 157 |
for text in text_blocks:
|
| 158 |
if not text: continue
|
|
@@ -172,7 +182,6 @@ def extract_robust_date(text_blocks):
|
|
| 172 |
return None
|
| 173 |
|
| 174 |
# --- DATA GATHERING ENGINES ---
|
| 175 |
-
|
| 176 |
def fetch_agency_scraped():
|
| 177 |
print("Scanning Federal Agency HTML Pages...")
|
| 178 |
results = []
|
|
@@ -198,10 +207,7 @@ def fetch_agency_scraped():
|
|
| 198 |
if len(title) < 15 or not is_relevant(title): continue
|
| 199 |
seen_links.add(full_url)
|
| 200 |
|
| 201 |
-
# --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
|
| 202 |
fmt_date = None
|
| 203 |
-
|
| 204 |
-
# 1. Expanded Container Search
|
| 205 |
container = a_tag.find_parent(["article", "tr", "li"])
|
| 206 |
if not container:
|
| 207 |
container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
|
|
@@ -209,7 +215,6 @@ def fetch_agency_scraped():
|
|
| 209 |
if container:
|
| 210 |
fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
|
| 211 |
|
| 212 |
-
# 2. Sibling Search
|
| 213 |
if not fmt_date:
|
| 214 |
prev_el = a_tag.find_previous_sibling()
|
| 215 |
if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
|
|
@@ -217,7 +222,6 @@ def fetch_agency_scraped():
|
|
| 217 |
next_el = a_tag.find_next_sibling()
|
| 218 |
if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
|
| 219 |
|
| 220 |
-
# 3. Deep DOM Climb Fallback
|
| 221 |
if not fmt_date:
|
| 222 |
current_node = a_tag
|
| 223 |
for _ in range(6):
|
|
@@ -228,7 +232,6 @@ def fetch_agency_scraped():
|
|
| 228 |
fmt_date = found_date
|
| 229 |
break
|
| 230 |
|
| 231 |
-
# --- THE USER-FACING FLAG ---
|
| 232 |
if not fmt_date:
|
| 233 |
display_time = "⚠️ DATE UNKNOWN"
|
| 234 |
display_title = f"[DATE MISSING] {title}"
|
|
@@ -240,7 +243,7 @@ def fetch_agency_scraped():
|
|
| 240 |
|
| 241 |
results.append({
|
| 242 |
"source": name,
|
| 243 |
-
"type": "Federal/Exec Action",
|
| 244 |
"event_date": fmt_date,
|
| 245 |
"time": display_time,
|
| 246 |
"title": display_title,
|
|
@@ -278,19 +281,14 @@ def fetch_congress_scraped():
|
|
| 278 |
if len(title) < 15 or not is_relevant(title): continue
|
| 279 |
seen_links.add(full_url)
|
| 280 |
|
| 281 |
-
# --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
|
| 282 |
fmt_date = None
|
| 283 |
-
|
| 284 |
-
# 1. Expanded Container Search (Catches almost all Gov CMS platforms)
|
| 285 |
container = a_tag.find_parent(["article", "tr", "li"])
|
| 286 |
if not container:
|
| 287 |
-
# Added: news, press, card, entry, row, record
|
| 288 |
container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
|
| 289 |
|
| 290 |
if container:
|
| 291 |
fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
|
| 292 |
|
| 293 |
-
# 2. Sibling Search (If the date is floating right next to the link)
|
| 294 |
if not fmt_date:
|
| 295 |
prev_el = a_tag.find_previous_sibling()
|
| 296 |
if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
|
|
@@ -298,7 +296,6 @@ def fetch_congress_scraped():
|
|
| 298 |
next_el = a_tag.find_next_sibling()
|
| 299 |
if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
|
| 300 |
|
| 301 |
-
# 3. Deep DOM Climb Fallback
|
| 302 |
if not fmt_date:
|
| 303 |
current_node = a_tag
|
| 304 |
for _ in range(6):
|
|
@@ -309,7 +306,6 @@ def fetch_congress_scraped():
|
|
| 309 |
fmt_date = found_date
|
| 310 |
break
|
| 311 |
|
| 312 |
-
# --- THE USER-FACING FLAG ---
|
| 313 |
if not fmt_date:
|
| 314 |
display_time = "⚠️ DATE UNKNOWN"
|
| 315 |
display_title = f"[DATE MISSING] {title}"
|
|
@@ -334,7 +330,6 @@ def fetch_floor_schedules():
|
|
| 334 |
print("Scanning House & Senate Floor Schedules...")
|
| 335 |
results = []
|
| 336 |
|
| 337 |
-
# Using your stable, verified endpoints
|
| 338 |
SCHEDULE_URLS = {
|
| 339 |
"Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
|
| 340 |
"House Floor Summary": "https://clerk.house.gov/FloorSummary"
|
|
@@ -346,27 +341,19 @@ def fetch_floor_schedules():
|
|
| 346 |
if r.status_code != 200: continue
|
| 347 |
|
| 348 |
soup = BeautifulSoup(r.text, "html.parser")
|
| 349 |
-
|
| 350 |
-
# 1. THE ISOLATOR: Only look inside the main content body (ignores footers/menus)
|
| 351 |
main_area = soup.find("main") or soup.find(id="main_content") or soup.find(class_=re.compile("content|main", re.I)) or soup
|
| 352 |
|
| 353 |
-
# 2. SURGICAL TAGS: Only parse actual paragraphs and lists. NO DIVS!
|
| 354 |
for container in main_area.find_all(["p", "li"]):
|
| 355 |
text_content = container.get_text(" ", strip=True)
|
| 356 |
-
|
| 357 |
-
# Tighten the length to avoid tiny buttons and massive unbroken text blocks
|
| 358 |
if len(text_content) < 40 or len(text_content) > 800: continue
|
| 359 |
if not is_relevant(text_content): continue
|
| 360 |
|
| 361 |
-
# 3. UPGRADED DUPLICATE BLOCKER: Prevents overlapping HTML chunks
|
| 362 |
if any(res['summary'][:100] in text_content for res in results) or \
|
| 363 |
any(text_content[:100] in res['summary'] for res in results):
|
| 364 |
continue
|
| 365 |
|
| 366 |
a_tag = container.find("a", href=True)
|
| 367 |
item_link = urljoin(url, a_tag['href']) if a_tag else url
|
| 368 |
-
|
| 369 |
-
# Floor actions are usually today's date
|
| 370 |
fmt_date = extract_robust_date([text_content]) or datetime.now()
|
| 371 |
|
| 372 |
results.append({
|
|
@@ -392,17 +379,13 @@ def fetch_rss(feed_dict, source_type):
|
|
| 392 |
for entry in feed.entries[:15]:
|
| 393 |
title = entry.get("title", "")
|
| 394 |
summary = entry.get("description", "")
|
| 395 |
-
|
| 396 |
if not is_relevant(title, summary): continue
|
| 397 |
-
|
| 398 |
|
| 399 |
-
# Check for standard RSS/Atom timestamps first
|
| 400 |
if hasattr(entry, 'published_parsed') and entry.published_parsed:
|
| 401 |
fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
|
| 402 |
elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
|
| 403 |
fmt_date = datetime(*entry.updated_parsed[:6]).replace(tzinfo=None)
|
| 404 |
else:
|
| 405 |
-
# Fallback to text scanning only if metadata is missing entirely
|
| 406 |
fmt_date = extract_robust_date([title, summary]) or datetime.now()
|
| 407 |
|
| 408 |
results.append({
|
|
@@ -416,14 +399,10 @@ def fetch_rss(feed_dict, source_type):
|
|
| 416 |
|
| 417 |
return results
|
| 418 |
|
| 419 |
-
# -- APIs ---
|
| 420 |
-
|
| 421 |
def fetch_federal_register():
|
| 422 |
print("Scanning Federal Register API...")
|
| 423 |
results = []
|
| 424 |
url = "https://www.federalregister.gov/api/v1/documents.json"
|
| 425 |
-
|
| 426 |
-
# We pull a larger batch (50) because we are going to heavily filter them locally
|
| 427 |
params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 50}
|
| 428 |
|
| 429 |
try:
|
|
@@ -433,12 +412,8 @@ def fetch_federal_register():
|
|
| 433 |
title = doc.get("title", "No Title")
|
| 434 |
summary = doc.get("abstract", "No summary provided.")
|
| 435 |
|
| 436 |
-
# --- THE LOCAL RELEVANCE FILTER ---
|
| 437 |
-
# Only keep it if the AI keywords are in the Title or Abstract (ignores full-text matches)
|
| 438 |
if not is_relevant(title, str(summary)):
|
| 439 |
continue
|
| 440 |
-
|
| 441 |
-
# Explicitly block noisy SEC stock exchange filings
|
| 442 |
if "Self-Regulatory Organizations" in title:
|
| 443 |
continue
|
| 444 |
|
|
@@ -473,18 +448,20 @@ def fetch_bill_text(congress, bill_type, bill_number):
|
|
| 473 |
return ""
|
| 474 |
|
| 475 |
def fetch_legislation(target=1000):
|
| 476 |
-
print("Scanning Legislation API...")
|
| 477 |
if not CONGRESS_API_KEY: return []
|
| 478 |
results = []
|
| 479 |
headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
|
| 480 |
BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
|
| 481 |
|
| 482 |
-
#
|
|
|
|
|
|
|
|
|
|
| 483 |
scan_strategies = ["introducedDate desc", "updateDate desc"]
|
| 484 |
|
| 485 |
for sort_method in scan_strategies:
|
| 486 |
print(f" -> Pulling by {sort_method}...")
|
| 487 |
-
# target // 2 means we pull 500 for each strategy
|
| 488 |
for offset in range(0, target // 2, 250):
|
| 489 |
try:
|
| 490 |
r = requests.get(
|
|
@@ -497,25 +474,51 @@ def fetch_legislation(target=1000):
|
|
| 497 |
if not bills: break
|
| 498 |
|
| 499 |
for b in bills:
|
| 500 |
-
|
|
|
|
|
|
|
| 501 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
action_data = b.get("latestAction", {})
|
| 503 |
action_date_raw = action_data.get("actionDate") or b.get("updateDate")
|
| 504 |
fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
|
| 505 |
-
|
| 506 |
-
proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
|
| 507 |
|
| 508 |
results.append({
|
| 509 |
"source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
|
| 510 |
-
"time": "API Verified", "title": f"{
|
| 511 |
"latest_action": action_data.get("text", "Active"), "link": proper_link,
|
| 512 |
-
"summary": "Legislative movement tracked via API.", "bill_type":
|
| 513 |
})
|
| 514 |
-
time.sleep(1.5)
|
| 515 |
except Exception as e: break
|
| 516 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
return results
|
| 518 |
-
|
| 519 |
|
| 520 |
# --- MAIN RUNNER ---
|
| 521 |
def run():
|
|
@@ -538,25 +541,22 @@ def run():
|
|
| 538 |
if event_id not in db:
|
| 539 |
print(f"Triaging new item: {item['title'][:40]}...")
|
| 540 |
|
| 541 |
-
# Re-integrated the fetch_bill_text logic so the AI has context!
|
| 542 |
bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
|
| 543 |
analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
|
| 544 |
|
| 545 |
item["analysis"] = analysis
|
| 546 |
item["keywords"] = keywords
|
| 547 |
|
| 548 |
-
# ---
|
| 549 |
try:
|
| 550 |
-
# Don't waste compute embedding error messages
|
| 551 |
if analysis and not analysis.startswith("Error") and not analysis.startswith("AI Triage disabled"):
|
| 552 |
vector = embedder.encode(analysis).tolist()
|
| 553 |
-
item["embedding"] = json.dumps(vector)
|
| 554 |
else:
|
| 555 |
item["embedding"] = None
|
| 556 |
except Exception as e:
|
| 557 |
print(f" -> Embedding error: {e}")
|
| 558 |
item["embedding"] = None
|
| 559 |
-
# ----------------------------------------
|
| 560 |
|
| 561 |
item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
|
| 562 |
new_items.append(item)
|
|
|
|
| 13 |
from urllib.parse import urljoin
|
| 14 |
from huggingface_hub import InferenceClient
|
| 15 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 16 |
|
| 17 |
# Specifying model for efficient embedding + trend analysis
|
| 18 |
+
embedder = SentenceTransformer('BAAI/bge-small-en-v1.5')
|
| 19 |
|
| 20 |
# --- CONFIGURATION & GLOBALS ---
|
| 21 |
CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
|
|
|
|
| 24 |
CONGRESS_API_BASE = "https://api.congress.gov/v3"
|
| 25 |
BASE_DIR = Path(__file__).resolve().parent
|
| 26 |
|
| 27 |
+
# --- PERSISTENT STORAGE PATHING ---
|
| 28 |
# Choose the storage root once: the /data directory when it exists,
# otherwise the directory that contains this script (BASE_DIR).
_storage_root = Path("/data") if Path("/data").exists() else BASE_DIR

CSV_PATH = _storage_root / "policy_tracker.csv"
# JSON list read and written through load_db() / save_db().
DB_FILE = _storage_root / "seen_events.json"
# Bill IDs that fetch_legislation() has accepted as relevant.
WHITELIST_FILE = _storage_root / "tracked_bills.json"
# Bill IDs whose full text fetch_legislation() has already checked.
SCANNED_FILE = _storage_root / "scanned_bills.json"
|
| 38 |
|
| 39 |
# --- STEALTH SCRAPER SETUP ---
|
| 40 |
scraper = cloudscraper.create_scraper(
|
|
|
|
| 50 |
"foundation model", "autonomous system"
|
| 51 |
]
|
| 52 |
|
| 53 |
+
def is_relevant(title, summary="", text=""):
|
| 54 |
+
text_to_check = f"{title} {summary} {text}".lower()
|
| 55 |
for keyword in TARGET_KEYWORDS:
|
| 56 |
if re.search(rf'\b{re.escape(keyword)}', text_to_check):
|
| 57 |
return True
|
|
|
|
| 143 |
return "Error during AI analysis.", "error"
|
| 144 |
|
| 145 |
# --- CORE UTILITIES ---
|
| 146 |
+
def load_list(filepath):
    """Load the JSON document stored at ``filepath``.

    Args:
        filepath: Location of the JSON file, as a ``pathlib.Path`` or a
            plain string path.

    Returns:
        The decoded JSON value (the callers in this file store lists), or
        an empty list when the file does not exist or does not contain
        valid JSON.
    """
    try:
        # Open directly instead of checking ``exists()`` first, so a file
        # that disappears between the check and the open cannot crash us.
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return []
    except json.JSONDecodeError as exc:
        # An empty or half-written file must not abort the whole run;
        # report it and start over from an empty list.
        print(f"  -> Ignoring unreadable JSON in {filepath}: {exc}")
        return []
|
| 150 |
|
| 151 |
+
def save_list(data, filepath, max_items=5000):
    """Write the tail of ``data`` to ``filepath`` as JSON.

    Args:
        data: Sequence to persist. It must support ``len()`` and slicing,
            so sets have to be converted to a list by the caller.
        filepath: Destination file, as a ``pathlib.Path`` or a plain
            string path.
        max_items: Keep only this many trailing entries so the file does
            not grow without bound. ``None`` keeps everything.

    Raises:
        TypeError: If ``data`` contains values that are not JSON
            serializable. The existing destination file is left as it was.
    """
    if max_items is None:
        trimmed = data
    else:
        # Same result as ``data[-max_items:]`` for positive limits, but
        # also well-defined for a limit of 0 (keeps nothing).
        trimmed = data[max(len(data) - max_items, 0):]

    target = Path(filepath)
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        # Write to a sibling temp file and swap it in, so an interrupted
        # run never leaves a truncated JSON file at the real path.
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(trimmed, f)
        os.replace(tmp_path, target)
    except Exception:
        # Do not leave a half-written temp file behind.
        if tmp_path.exists():
            tmp_path.unlink()
        raise
|
| 153 |
+
|
| 154 |
+
def load_db():
    """Return the JSON list stored at ``DB_FILE``, as read by ``load_list``."""
    stored_entries = load_list(DB_FILE)
    return stored_entries
|
| 156 |
+
|
| 157 |
def save_db(db):
    """Persist ``db`` to ``DB_FILE`` by delegating to ``save_list``."""
    save_list(data=db, filepath=DB_FILE)
|
| 159 |
|
| 160 |
def extract_robust_date(text_blocks):
|
| 161 |
date_patterns = [
|
| 162 |
r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
|
| 163 |
r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
|
| 164 |
r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b',
|
| 165 |
+
r'\b(\d{2})\.(\d{2})\.(\d{4})\b'
|
| 166 |
]
|
| 167 |
for text in text_blocks:
|
| 168 |
if not text: continue
|
|
|
|
| 182 |
return None
|
| 183 |
|
| 184 |
# --- DATA GATHERING ENGINES ---
|
|
|
|
| 185 |
def fetch_agency_scraped():
|
| 186 |
print("Scanning Federal Agency HTML Pages...")
|
| 187 |
results = []
|
|
|
|
| 207 |
if len(title) < 15 or not is_relevant(title): continue
|
| 208 |
seen_links.add(full_url)
|
| 209 |
|
|
|
|
| 210 |
fmt_date = None
|
|
|
|
|
|
|
| 211 |
container = a_tag.find_parent(["article", "tr", "li"])
|
| 212 |
if not container:
|
| 213 |
container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
|
|
|
|
| 215 |
if container:
|
| 216 |
fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
|
| 217 |
|
|
|
|
| 218 |
if not fmt_date:
|
| 219 |
prev_el = a_tag.find_previous_sibling()
|
| 220 |
if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
|
|
|
|
| 222 |
next_el = a_tag.find_next_sibling()
|
| 223 |
if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
|
| 224 |
|
|
|
|
| 225 |
if not fmt_date:
|
| 226 |
current_node = a_tag
|
| 227 |
for _ in range(6):
|
|
|
|
| 232 |
fmt_date = found_date
|
| 233 |
break
|
| 234 |
|
|
|
|
| 235 |
if not fmt_date:
|
| 236 |
display_time = "⚠️ DATE UNKNOWN"
|
| 237 |
display_title = f"[DATE MISSING] {title}"
|
|
|
|
| 243 |
|
| 244 |
results.append({
|
| 245 |
"source": name,
|
| 246 |
+
"type": "Federal/Exec Action",
|
| 247 |
"event_date": fmt_date,
|
| 248 |
"time": display_time,
|
| 249 |
"title": display_title,
|
|
|
|
| 281 |
if len(title) < 15 or not is_relevant(title): continue
|
| 282 |
seen_links.add(full_url)
|
| 283 |
|
|
|
|
| 284 |
fmt_date = None
|
|
|
|
|
|
|
| 285 |
container = a_tag.find_parent(["article", "tr", "li"])
|
| 286 |
if not container:
|
|
|
|
| 287 |
container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
|
| 288 |
|
| 289 |
if container:
|
| 290 |
fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
|
| 291 |
|
|
|
|
| 292 |
if not fmt_date:
|
| 293 |
prev_el = a_tag.find_previous_sibling()
|
| 294 |
if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
|
|
|
|
| 296 |
next_el = a_tag.find_next_sibling()
|
| 297 |
if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
|
| 298 |
|
|
|
|
| 299 |
if not fmt_date:
|
| 300 |
current_node = a_tag
|
| 301 |
for _ in range(6):
|
|
|
|
| 306 |
fmt_date = found_date
|
| 307 |
break
|
| 308 |
|
|
|
|
| 309 |
if not fmt_date:
|
| 310 |
display_time = "⚠️ DATE UNKNOWN"
|
| 311 |
display_title = f"[DATE MISSING] {title}"
|
|
|
|
| 330 |
print("Scanning House & Senate Floor Schedules...")
|
| 331 |
results = []
|
| 332 |
|
|
|
|
| 333 |
SCHEDULE_URLS = {
|
| 334 |
"Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
|
| 335 |
"House Floor Summary": "https://clerk.house.gov/FloorSummary"
|
|
|
|
| 341 |
if r.status_code != 200: continue
|
| 342 |
|
| 343 |
soup = BeautifulSoup(r.text, "html.parser")
|
|
|
|
|
|
|
| 344 |
main_area = soup.find("main") or soup.find(id="main_content") or soup.find(class_=re.compile("content|main", re.I)) or soup
|
| 345 |
|
|
|
|
| 346 |
for container in main_area.find_all(["p", "li"]):
|
| 347 |
text_content = container.get_text(" ", strip=True)
|
|
|
|
|
|
|
| 348 |
if len(text_content) < 40 or len(text_content) > 800: continue
|
| 349 |
if not is_relevant(text_content): continue
|
| 350 |
|
|
|
|
| 351 |
if any(res['summary'][:100] in text_content for res in results) or \
|
| 352 |
any(text_content[:100] in res['summary'] for res in results):
|
| 353 |
continue
|
| 354 |
|
| 355 |
a_tag = container.find("a", href=True)
|
| 356 |
item_link = urljoin(url, a_tag['href']) if a_tag else url
|
|
|
|
|
|
|
| 357 |
fmt_date = extract_robust_date([text_content]) or datetime.now()
|
| 358 |
|
| 359 |
results.append({
|
|
|
|
| 379 |
for entry in feed.entries[:15]:
|
| 380 |
title = entry.get("title", "")
|
| 381 |
summary = entry.get("description", "")
|
|
|
|
| 382 |
if not is_relevant(title, summary): continue
|
|
|
|
| 383 |
|
|
|
|
| 384 |
if hasattr(entry, 'published_parsed') and entry.published_parsed:
|
| 385 |
fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
|
| 386 |
elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
|
| 387 |
fmt_date = datetime(*entry.updated_parsed[:6]).replace(tzinfo=None)
|
| 388 |
else:
|
|
|
|
| 389 |
fmt_date = extract_robust_date([title, summary]) or datetime.now()
|
| 390 |
|
| 391 |
results.append({
|
|
|
|
| 399 |
|
| 400 |
return results
|
| 401 |
|
|
|
|
|
|
|
| 402 |
def fetch_federal_register():
|
| 403 |
print("Scanning Federal Register API...")
|
| 404 |
results = []
|
| 405 |
url = "https://www.federalregister.gov/api/v1/documents.json"
|
|
|
|
|
|
|
| 406 |
params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 50}
|
| 407 |
|
| 408 |
try:
|
|
|
|
| 412 |
title = doc.get("title", "No Title")
|
| 413 |
summary = doc.get("abstract", "No summary provided.")
|
| 414 |
|
|
|
|
|
|
|
| 415 |
if not is_relevant(title, str(summary)):
|
| 416 |
continue
|
|
|
|
|
|
|
| 417 |
if "Self-Regulatory Organizations" in title:
|
| 418 |
continue
|
| 419 |
|
|
|
|
| 448 |
return ""
|
| 449 |
|
| 450 |
def fetch_legislation(target=1000):
|
| 451 |
+
print("Scanning Legislation API with Deep Text & Whitelist...")
|
| 452 |
if not CONGRESS_API_KEY: return []
|
| 453 |
results = []
|
| 454 |
headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
|
| 455 |
BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
|
| 456 |
|
| 457 |
+
# Load tracking databases
|
| 458 |
+
tracked_bills = set(load_list(WHITELIST_FILE))
|
| 459 |
+
scanned_bills = set(load_list(SCANNED_FILE))
|
| 460 |
+
|
| 461 |
scan_strategies = ["introducedDate desc", "updateDate desc"]
|
| 462 |
|
| 463 |
for sort_method in scan_strategies:
|
| 464 |
print(f" -> Pulling by {sort_method}...")
|
|
|
|
| 465 |
for offset in range(0, target // 2, 250):
|
| 466 |
try:
|
| 467 |
r = requests.get(
|
|
|
|
| 474 |
if not bills: break
|
| 475 |
|
| 476 |
for b in bills:
|
| 477 |
+
raw_type = b.get("type", "HR").upper()
|
| 478 |
+
bill_number = b.get("number")
|
| 479 |
+
bill_id = f"{raw_type}{bill_number}"
|
| 480 |
|
| 481 |
+
is_ai_bill = False
|
| 482 |
+
|
| 483 |
+
# 1. THE WHITELIST CHECK (Catches all admin updates for known AI bills)
|
| 484 |
+
if bill_id in tracked_bills:
|
| 485 |
+
is_ai_bill = True
|
| 486 |
+
else:
|
| 487 |
+
# 2. TITLE/SUMMARY CHECK
|
| 488 |
+
if is_relevant(b.get("title", "")):
|
| 489 |
+
is_ai_bill = True
|
| 490 |
+
tracked_bills.add(bill_id)
|
| 491 |
+
# 3. DEEP TEXT CHECK (Only for bills we haven't already rejected!)
|
| 492 |
+
elif bill_id not in scanned_bills:
|
| 493 |
+
bill_text = fetch_bill_text(CURRENT_CONGRESS, raw_type, bill_number)
|
| 494 |
+
scanned_bills.add(bill_id) # Mark as scanned so we don't hit the API limit tomorrow
|
| 495 |
+
|
| 496 |
+
if is_relevant("", "", bill_text):
|
| 497 |
+
is_ai_bill = True
|
| 498 |
+
tracked_bills.add(bill_id)
|
| 499 |
+
|
| 500 |
+
if not is_ai_bill:
|
| 501 |
+
continue # Skip entirely!
|
| 502 |
+
|
| 503 |
action_data = b.get("latestAction", {})
|
| 504 |
action_date_raw = action_data.get("actionDate") or b.get("updateDate")
|
| 505 |
fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
|
| 506 |
+
proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{bill_number}"
|
|
|
|
| 507 |
|
| 508 |
results.append({
|
| 509 |
"source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
|
| 510 |
+
"time": "API Verified", "title": f"{raw_type}{bill_number}: {b.get('title')}",
|
| 511 |
"latest_action": action_data.get("text", "Active"), "link": proper_link,
|
| 512 |
+
"summary": "Legislative movement tracked via API.", "bill_type": raw_type, "bill_number": bill_number
|
| 513 |
})
|
| 514 |
+
time.sleep(1.5)
|
| 515 |
except Exception as e: break
|
| 516 |
|
| 517 |
+
# Save the updated Whitelist and Scanned lists to the permanent bucket
|
| 518 |
+
save_list(list(tracked_bills), WHITELIST_FILE)
|
| 519 |
+
save_list(list(scanned_bills), SCANNED_FILE)
|
| 520 |
+
|
| 521 |
return results
|
|
|
|
| 522 |
|
| 523 |
# --- MAIN RUNNER ---
|
| 524 |
def run():
|
|
|
|
| 541 |
if event_id not in db:
|
| 542 |
print(f"Triaging new item: {item['title'][:40]}...")
|
| 543 |
|
|
|
|
| 544 |
bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
|
| 545 |
analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
|
| 546 |
|
| 547 |
item["analysis"] = analysis
|
| 548 |
item["keywords"] = keywords
|
| 549 |
|
| 550 |
+
# --- SEMANTIC EMBEDDING ---
|
| 551 |
try:
|
|
|
|
| 552 |
if analysis and not analysis.startswith("Error") and not analysis.startswith("AI Triage disabled"):
|
| 553 |
vector = embedder.encode(analysis).tolist()
|
| 554 |
+
item["embedding"] = json.dumps(vector)
|
| 555 |
else:
|
| 556 |
item["embedding"] = None
|
| 557 |
except Exception as e:
|
| 558 |
print(f" -> Embedding error: {e}")
|
| 559 |
item["embedding"] = None
|
|
|
|
| 560 |
|
| 561 |
item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
|
| 562 |
new_items.append(item)
|