Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -67,7 +67,13 @@ CONGRESS_SCRAPE_TARGETS = {
|
|
| 67 |
"Rep. Jeffries": "https://democraticleader.house.gov/media/press-releases",
|
| 68 |
"Sen. Klobuchar": "https://www.klobuchar.senate.gov/public/index.cfm/news-releases"
|
| 69 |
}
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
NEWS_FEEDS = {
|
| 72 |
"Politico Tech": "https://rss.politico.com/technology.xml",
|
| 73 |
"Axios Tech": "https://www.axios.com/feeds/feed.rss",
|
|
@@ -149,6 +155,86 @@ def extract_robust_date(text_blocks):
|
|
| 149 |
|
| 150 |
# --- DATA GATHERING ENGINES ---
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
def fetch_congress_scraped():
|
| 153 |
print("Scanning Verified Lawmaker HTML Pages...")
|
| 154 |
results = []
|
|
@@ -419,11 +505,12 @@ def run():
|
|
| 419 |
raw_data = []
|
| 420 |
|
| 421 |
# Run the 4 basic, robust engines
|
| 422 |
-
raw_data.extend(fetch_congress_scraped())
|
| 423 |
raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
|
| 424 |
raw_data.extend(fetch_federal_register())
|
| 425 |
raw_data.extend(fetch_legislation())
|
| 426 |
raw_data.extend(fetch_floor_schedules())
|
|
|
|
| 427 |
|
| 428 |
new_items = []
|
| 429 |
for item in raw_data:
|
|
|
|
| 67 |
"Rep. Jeffries": "https://democraticleader.house.gov/media/press-releases",
|
| 68 |
"Sen. Klobuchar": "https://www.klobuchar.senate.gov/public/index.cfm/news-releases"
|
| 69 |
}
|
| 70 |
+
# Federal agency newsrooms: display name -> URL of the press/news listing page.
AGENCIES = {
    "NIST": "https://www.nist.gov/news-events/news",
    "OSTP": "https://www.whitehouse.gov/ostp/news/",
    "White House": "https://www.whitehouse.gov/news/",
    "Department of Energy": "https://www.energy.gov/technologycommercialization/listings/press-releases",
    "Department of War": "https://www.war.gov/News/releases/",
}
|
| 77 |
NEWS_FEEDS = {
|
| 78 |
"Politico Tech": "https://rss.politico.com/technology.xml",
|
| 79 |
"Axios Tech": "https://www.axios.com/feeds/feed.rss",
|
|
|
|
| 155 |
|
| 156 |
# --- DATA GATHERING ENGINES ---
|
| 157 |
|
| 158 |
+
def fetch_agency_scraped():
    """Scrape federal agency news pages for relevant, recent press releases.

    For every entry in ``AGENCIES`` the listing page is downloaded, each
    anchor is treated as a candidate headline, and candidates that pass
    ``is_relevant`` are dated by searching progressively wider parts of the
    surrounding DOM. Items dated more than 60 days ago are dropped; items
    with no detectable date are kept but flagged in their title.

    Returns:
        list[dict]: one record per kept link with the keys ``source``,
        ``type``, ``event_date`` (``None`` when no date was found), ``time``,
        ``title``, ``latest_action``, ``link`` and ``summary``.

    Errors raised while processing a source are printed and that source is
    skipped; they are never propagated to the caller.
    """
    print("Scanning Federal Agency HTML Pages...")
    results = []

    # Substrings that mark an href as navigation/pagination rather than an article.
    skip_markers = ('#', 'javascript:', 'page=', 'category=', 'tag=')
    # Compiled once per call instead of once per link.
    container_class = re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I)

    # The agency table in this file is named AGENCIES; the previous reference to
    # AGENCY_SCRAPE_TARGETS raised NameError outside the try block below.
    for name, url in AGENCIES.items():
        try:
            r = scraper.get(url, timeout=15)
            if r.status_code != 200:
                print(f" --> {name}: HTTP {r.status_code}, skipping")
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            seen_links = set()

            for a_tag in soup.find_all("a", href=True):
                href = a_tag["href"]
                lowered = href.lower()
                if any(marker in lowered for marker in skip_markers):
                    continue

                full_url = urljoin(url, href)
                if full_url in seen_links or full_url == url:
                    continue

                title = a_tag.get_text(" ", strip=True)
                if not title:
                    heading = a_tag.find(["h2", "h3", "h4", "strong"])
                    title = heading.get_text(" ", strip=True) if heading else ""

                if len(title) < 15 or not is_relevant(title):
                    continue
                seen_links.add(full_url)

                # --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
                fmt_date = None

                # 1. Expanded Container Search
                container = a_tag.find_parent(["article", "tr", "li"])
                if not container:
                    container = a_tag.find_parent("div", class_=container_class)

                if container:
                    fmt_date = extract_robust_date([container.get_text(" ", strip=True)])

                # 2. Sibling Search
                if not fmt_date:
                    prev_el = a_tag.find_previous_sibling()
                    if prev_el:
                        fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
                if not fmt_date:
                    next_el = a_tag.find_next_sibling()
                    if next_el:
                        fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])

                # 3. Deep DOM Climb Fallback (at most 6 ancestors)
                if not fmt_date:
                    current_node = a_tag
                    for _ in range(6):
                        if not current_node.parent:
                            break  # reached the document root; nothing left to climb
                        current_node = current_node.parent
                        found_date = extract_robust_date([current_node.get_text(" ", strip=True)])
                        if found_date:
                            fmt_date = found_date
                            break

                # --- THE USER-FACING FLAG ---
                if not fmt_date:
                    display_time = "⚠️ DATE UNKNOWN"
                    display_title = f"[DATE MISSING] {title}"
                else:
                    days_old = (datetime.now() - fmt_date).days
                    if days_old > 60:
                        continue
                    display_time = "Published"
                    display_title = title

                results.append({
                    "source": name,
                    "type": "Federal/Exec Action",  # Formatted for the Executive action bucket
                    "event_date": fmt_date,
                    "time": display_time,
                    "title": display_title,
                    "latest_action": "Agency Press Release",
                    "link": full_url,
                    "summary": "HTML Scrape",
                })

            # Pause between sources. NOTE(review): placement assumed to be once per
            # source rather than once per link; confirm against the original indentation.
            time.sleep(1)
        except Exception as e:
            print(f" --> {name}: Error — {e}")
    return results
|
| 237 |
+
|
| 238 |
def fetch_congress_scraped():
|
| 239 |
print("Scanning Verified Lawmaker HTML Pages...")
|
| 240 |
results = []
|
|
|
|
| 505 |
raw_data = []
|
| 506 |
|
| 507 |
# Run every data-gathering engine and pool the raw results
|
| 508 |
+
raw_data.extend(fetch_congress_scraped())
|
| 509 |
raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
|
| 510 |
raw_data.extend(fetch_federal_register())
|
| 511 |
raw_data.extend(fetch_legislation())
|
| 512 |
raw_data.extend(fetch_floor_schedules())
|
| 513 |
+
raw_data.extend(fetch_agency_scraped())
|
| 514 |
|
| 515 |
new_items = []
|
| 516 |
for item in raw_data:
|