IJ-Reynolds HF Staff committed on
Commit
e5bb349
·
verified ·
1 Parent(s): f3975b8

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +47 -1
main.py CHANGED
@@ -204,6 +204,51 @@ def fetch_congress_scraped():
204
  print(f" --> {name}: Error — {e}")
205
  return results
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  def fetch_rss(feed_dict, source_type):
208
  print(f"Scanning {source_type} RSS...")
209
  results = []
@@ -350,7 +395,8 @@ def run():
350
  raw_data.extend(fetch_congress_scraped()) # The 5 HTML Pages with DOM Climbing
351
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
352
  raw_data.extend(fetch_federal_register())
353
- raw_data.extend(fetch_legislation())
 
354
 
355
  new_items = []
356
  for item in raw_data:
 
204
  print(f" --> {name}: Error — {e}")
205
  return results
206
 
207
def fetch_floor_schedules():
    """Scrape the Senate and House floor-schedule pages for relevant items.

    Each page in ``SCHEDULE_URLS`` is fetched with the module-level
    ``scraper`` and parsed with BeautifulSoup. Every ``tr``/``li``/``div``/``p``
    element whose text is between 30 and 1500 characters and passes
    ``is_relevant`` becomes one result, unless an earlier result already
    starts with the same 50 characters.

    Returns:
        A list of dicts with the keys ``source``, ``type``, ``event_date``,
        ``time``, ``title``, ``latest_action``, ``link`` and ``summary``.
        A page that fails to load or parse is reported with ``print`` and
        skipped; no exception propagates to the caller.
    """
    print("Scanning House & Senate Floor Schedules...")
    results = []
    # First 50 characters of every summary collected so far; gives the same
    # duplicate detection as rescanning `results`, without the O(n^2) scan.
    seen_prefixes = set()

    # Using your stable, verified endpoints
    SCHEDULE_URLS = {
        "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
        "House Floor Summary": "https://clerk.house.gov/FloorSummary",
    }

    for source_name, url in SCHEDULE_URLS.items():
        try:
            r = scraper.get(url, timeout=15)
            if r.status_code != 200:
                # Report the skip instead of dropping the source silently.
                print(f"Skipping {source_name}: HTTP {r.status_code}")
                continue

            soup = BeautifulSoup(r.text, "html.parser")

            # Cast a broad net over typical content containers
            for container in soup.find_all(["tr", "li", "div", "p"]):
                text_content = container.get_text(" ", strip=True)

                # Filter out microscopic navigation links and massive full-page wrappers
                if len(text_content) < 30 or len(text_content) > 1500:
                    continue
                if not is_relevant(text_content):
                    continue

                # Prevent adding the exact same paragraph twice if nested
                prefix = text_content[:50]
                if prefix in seen_prefixes:
                    continue

                a_tag = container.find("a", href=True)
                item_link = urljoin(url, a_tag["href"]) if a_tag else url

                # These pages update daily, so if no date is in the text, it's today's action
                # NOTE(review): the fallback is a datetime object; confirm it matches
                # the type extract_robust_date returns and what consumers expect.
                fmt_date = extract_robust_date([text_content]) or datetime.now()

                # Only mark the title as truncated when text was actually cut off.
                if len(text_content) > 120:
                    title = text_content[:120] + "..."
                else:
                    title = text_content

                results.append({
                    "source": source_name,
                    "type": "Schedule/Hearing",
                    "event_date": fmt_date,
                    "time": "Scheduled",
                    "title": title,
                    "latest_action": "On Master Schedule",
                    "link": item_link,
                    "summary": text_content[:300],
                })
                # Record the prefix only after a successful append so a failure
                # above never blocks a later, valid item with the same opening.
                seen_prefixes.add(prefix)

            # Pause between sources to stay polite to the upstream servers.
            time.sleep(1)
        except Exception as e:
            # Best-effort scraper: one failing page must not abort the whole run.
            print(f"Error scraping {source_name}: {e}")

    return results
251
+
252
  def fetch_rss(feed_dict, source_type):
253
  print(f"Scanning {source_type} RSS...")
254
  results = []
 
395
  raw_data.extend(fetch_congress_scraped()) # The 5 HTML Pages with DOM Climbing
396
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
397
  raw_data.extend(fetch_federal_register())
398
+ raw_data.extend(fetch_legislation())
399
+ raw_data.extend(fetch_floor_schedules())
400
 
401
  new_items = []
402
  for item in raw_data: