IJ-Reynolds HF Staff committed on
Commit
56b0350
·
verified ·
1 Parent(s): a241a6b

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +89 -2
main.py CHANGED
@@ -67,7 +67,13 @@ CONGRESS_SCRAPE_TARGETS = {
67
  "Rep. Jeffries": "https://democraticleader.house.gov/media/press-releases",
68
  "Sen. Klobuchar": "https://www.klobuchar.senate.gov/public/index.cfm/news-releases"
69
  }
70
-
 
 
 
 
 
 
71
  NEWS_FEEDS = {
72
  "Politico Tech": "https://rss.politico.com/technology.xml",
73
  "Axios Tech": "https://www.axios.com/feeds/feed.rss",
@@ -149,6 +155,86 @@ def extract_robust_date(text_blocks):
149
 
150
  # --- DATA GATHERING ENGINES ---
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  def fetch_congress_scraped():
153
  print("Scanning Verified Lawmaker HTML Pages...")
154
  results = []
@@ -419,11 +505,12 @@ def run():
419
  raw_data = []
420
 
421
  # Run the 4 basic, robust engines
422
- raw_data.extend(fetch_congress_scraped()) # The 5 HTML Pages with DOM Climbing
423
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
424
  raw_data.extend(fetch_federal_register())
425
  raw_data.extend(fetch_legislation())
426
  raw_data.extend(fetch_floor_schedules())
 
427
 
428
  new_items = []
429
  for item in raw_data:
 
67
  "Rep. Jeffries": "https://democraticleader.house.gov/media/press-releases",
68
  "Sen. Klobuchar": "https://www.klobuchar.senate.gov/public/index.cfm/news-releases"
69
  }
70
# Federal agency press-release listing pages, keyed by the display name
# used as the "source" of each scraped item.
AGENCIES = {
    "NIST": "https://www.nist.gov/news-events/news",
    "OSTP": "https://www.whitehouse.gov/ostp/news/",
    "White House": "https://www.whitehouse.gov/news/",
    "Department of Energy": "https://www.energy.gov/technologycommercialization/listings/press-releases",
    "Department of War": "https://www.war.gov/News/releases/",
}
77
  NEWS_FEEDS = {
78
  "Politico Tech": "https://rss.politico.com/technology.xml",
79
  "Axios Tech": "https://www.axios.com/feeds/feed.rss",
 
155
 
156
  # --- DATA GATHERING ENGINES ---
157
 
158
def fetch_agency_scraped():
    """Scrape federal agency press-release listing pages for relevant links.

    For every page in the module-level ``AGENCIES`` mapping, each ``<a href>``
    is considered. A link is kept when its text is at least 15 characters
    long and ``is_relevant`` accepts it. A publication date is then searched
    for in the surrounding markup; dated items older than 60 days are
    dropped, while undated items are kept and visibly flagged.

    Returns:
        list[dict]: one record per kept link, with the keys ``source``,
        ``type``, ``event_date`` (``None`` when no date was found), ``time``,
        ``title``, ``latest_action``, ``link`` and ``summary``.

    Errors while fetching or parsing one agency page are printed and that
    agency is skipped; they never propagate to the caller.
    """
    print("Scanning Federal Agency HTML Pages...")
    results = []

    # Loop invariants, built once instead of once per anchor.
    skip_fragments = ('#', 'javascript:', 'page=', 'category=', 'tag=')
    container_class = re.compile(
        r"views-row|item|post|news|press|card|entry|row|record", re.I
    )

    # NOTE: the mapping defined in this file is named AGENCIES. The previous
    # reference to AGENCY_SCRAPE_TARGETS raised NameError before the try
    # block was entered, which aborted the whole run.
    for name, url in AGENCIES.items():
        try:
            r = scraper.get(url, timeout=15)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            seen_links = set()

            for a_tag in soup.find_all("a", href=True):
                href = a_tag["href"]
                # Skip in-page anchors, script links, pagination and filters.
                if any(skip in href.lower() for skip in skip_fragments):
                    continue

                full_url = urljoin(url, href)
                if full_url in seen_links or full_url == url:
                    continue

                title = a_tag.get_text(" ", strip=True)
                if not title:
                    heading = a_tag.find(["h2", "h3", "h4", "strong"])
                    title = heading.get_text(" ", strip=True) if heading else ""

                if len(title) < 15 or not is_relevant(title):
                    continue
                seen_links.add(full_url)

                # --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
                fmt_date = None

                # 1. Expanded Container Search: nearest semantic wrapper,
                #    else a div whose class looks like a listing row.
                container = a_tag.find_parent(["article", "tr", "li"])
                if not container:
                    container = a_tag.find_parent("div", class_=container_class)

                if container:
                    fmt_date = extract_robust_date([container.get_text(" ", strip=True)])

                # 2. Sibling Search: the element just before, then just after.
                if not fmt_date:
                    prev_el = a_tag.find_previous_sibling()
                    if prev_el:
                        fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
                if not fmt_date:
                    next_el = a_tag.find_next_sibling()
                    if next_el:
                        fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])

                # 3. Deep DOM Climb Fallback: up to six ancestors.
                if not fmt_date:
                    current_node = a_tag
                    for _ in range(6):
                        if current_node.parent:
                            current_node = current_node.parent
                            found_date = extract_robust_date([current_node.get_text(" ", strip=True)])
                            if found_date:
                                fmt_date = found_date
                                break

                # --- THE USER-FACING FLAG ---
                if not fmt_date:
                    display_time = "⚠️ DATE UNKNOWN"
                    display_title = f"[DATE MISSING] {title}"
                else:
                    # NOTE(review): assumes extract_robust_date returns a
                    # naive datetime comparable with datetime.now() - confirm.
                    days_old = (datetime.now() - fmt_date).days
                    if days_old > 60:
                        continue
                    display_time = "Published"
                    display_title = title

                results.append({
                    "source": name,
                    "type": "Federal/Exec Action",  # Formatted for the Executive action bucket
                    "event_date": fmt_date,
                    "time": display_time,
                    "title": display_title,
                    "latest_action": "Agency Press Release",
                    "link": full_url,
                    "summary": "HTML Scrape"
                })
            # Pause between agency sites after a successfully processed page.
            time.sleep(1)
        except Exception as e:
            # Best-effort scrape: report the failure and move to the next agency.
            print(f" --> {name}: Error — {e}")
    return results
237
+
238
  def fetch_congress_scraped():
239
  print("Scanning Verified Lawmaker HTML Pages...")
240
  results = []
 
505
  raw_data = []
506
 
507
  # Run every data-gathering engine
508
+ raw_data.extend(fetch_congress_scraped())
509
  raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
510
  raw_data.extend(fetch_federal_register())
511
  raw_data.extend(fetch_legislation())
512
  raw_data.extend(fetch_floor_schedules())
513
+ raw_data.extend(fetch_agency_scraped())
514
 
515
  new_items = []
516
  for item in raw_data: