Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -204,6 +204,51 @@ def fetch_congress_scraped():
|
|
| 204 |
print(f" --> {name}: Error — {e}")
|
| 205 |
return results
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
def fetch_rss(feed_dict, source_type):
|
| 208 |
print(f"Scanning {source_type} RSS...")
|
| 209 |
results = []
|
|
@@ -350,7 +395,8 @@ def run():
|
|
| 350 |
raw_data.extend(fetch_congress_scraped()) # The 5 HTML Pages with DOM Climbing
|
| 351 |
raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
|
| 352 |
raw_data.extend(fetch_federal_register())
|
| 353 |
-
raw_data.extend(fetch_legislation())
|
|
|
|
| 354 |
|
| 355 |
new_items = []
|
| 356 |
for item in raw_data:
|
|
|
|
| 204 |
print(f" --> {name}: Error — {e}")
|
| 205 |
return results
|
| 206 |
|
| 207 |
+
def fetch_floor_schedules():
    """Scrape the Senate and House floor-schedule pages for relevant entries.

    Each page is fetched with the module-level ``scraper``, parsed with
    BeautifulSoup, and every ``tr``/``li``/``div``/``p`` element whose text is
    between 30 and 1500 characters and passes ``is_relevant`` becomes one
    result dict.

    Returns:
        list[dict]: one dict per kept element with the keys ``source``,
        ``type``, ``event_date``, ``time``, ``title``, ``latest_action``,
        ``link`` and ``summary``. The list is empty when nothing matched or
        every request failed.

    Errors raised while fetching or parsing a source are printed and that
    source is skipped; they never propagate to the caller.
    """
    print("Scanning House & Senate Floor Schedules...")
    results = []

    # Floor-activity endpoints polled on every run.
    SCHEDULE_URLS = {
        "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
        "House Floor Summary": "https://clerk.house.gov/FloorSummary",
    }

    # First 50 characters of every accepted entry. A set keeps the
    # nested-duplicate check O(1) instead of rescanning `results` each time.
    seen_prefixes = set()

    for source_name, url in SCHEDULE_URLS.items():
        try:
            r = scraper.get(url, timeout=15)
            if r.status_code != 200:
                # Report the skip so a blocked or moved endpoint is visible
                # instead of looking like "nothing relevant today".
                print(f" --> {source_name}: HTTP {r.status_code}, skipped")
                continue

            soup = BeautifulSoup(r.text, "html.parser")

            # Cast a broad net over typical content containers.
            for container in soup.find_all(["tr", "li", "div", "p"]):
                text_content = container.get_text(" ", strip=True)

                # Drop tiny navigation links and huge full-page wrappers.
                if len(text_content) < 30 or len(text_content) > 1500:
                    continue
                # NOTE(review): is_relevant is defined elsewhere in the file;
                # presumably a keyword filter — confirm.
                if not is_relevant(text_content):
                    continue

                # Nested containers repeat the same text (a <div> and the <p>
                # inside it); keep only the first one seen, across sources.
                prefix = text_content[:50]
                if prefix in seen_prefixes:
                    continue
                seen_prefixes.add(prefix)

                # Prefer the first link inside the element; otherwise point
                # at the schedule page itself.
                a_tag = container.find("a", href=True)
                item_link = urljoin(url, a_tag['href']) if a_tag else url

                # These pages update daily, so when no date is found in the
                # text the entry is treated as today's action.
                fmt_date = extract_robust_date([text_content]) or datetime.now()

                results.append({
                    "source": source_name, "type": "Schedule/Hearing", "event_date": fmt_date,
                    "time": "Scheduled", "title": text_content[:120] + "...",
                    "latest_action": "On Master Schedule", "link": item_link, "summary": text_content[:300]
                })
            # Pause between sources to stay polite to the servers.
            time.sleep(1)
        except Exception as e:
            # Best-effort scrape: one failing source must not abort the run.
            print(f"Error scraping {source_name}: {e}")

    return results
|
| 251 |
+
|
| 252 |
def fetch_rss(feed_dict, source_type):
|
| 253 |
print(f"Scanning {source_type} RSS...")
|
| 254 |
results = []
|
|
|
|
| 395 |
raw_data.extend(fetch_congress_scraped()) # The 5 HTML Pages with DOM Climbing
|
| 396 |
raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
|
| 397 |
raw_data.extend(fetch_federal_register())
|
| 398 |
+
raw_data.extend(fetch_legislation())
|
| 399 |
+
raw_data.extend(fetch_floor_schedules())
|
| 400 |
|
| 401 |
new_items = []
|
| 402 |
for item in raw_data:
|