IJ-Reynolds HF Staff committed on
Commit
4b0b986
·
verified ·
1 Parent(s): 172186c

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +18 -12
main.py CHANGED
@@ -146,7 +146,7 @@ def fetch_congress_scraped():
146
 
147
  for a_tag in soup.find_all("a", href=True):
148
  href = a_tag["href"]
149
- if any(skip in href.lower() for skip in ['#', 'javascript:', 'page=', 'category=']): continue
150
 
151
  full_url = urljoin(url, href)
152
  if full_url in seen_links or full_url == url: continue
@@ -159,21 +159,30 @@ def fetch_congress_scraped():
159
  if len(title) < 15 or not is_relevant(title): continue
160
  seen_links.add(full_url)
161
 
162
- # --- UPGRADED DATE HUNTING (Container Search) ---
163
  fmt_date = None
164
 
165
- # 1. Look for the entire row/article container (Catches sibling dates in Drupal!)
166
  container = a_tag.find_parent(["article", "tr", "li"])
167
  if not container:
168
- container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post", re.I))
 
169
 
170
  if container:
171
  fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
172
 
173
- # 2. Fallback DOM climb
 
 
 
 
 
 
 
 
174
  if not fmt_date:
175
  current_node = a_tag
176
- for _ in range(5):
177
  if current_node.parent:
178
  current_node = current_node.parent
179
  found_date = extract_robust_date([current_node.get_text(" ", strip=True)])
@@ -181,13 +190,11 @@ def fetch_congress_scraped():
181
  fmt_date = found_date
182
  break
183
 
184
- # --- THE USER-FACING MISSING DATE FLAG ---
185
  if not fmt_date:
186
- # We stop defaulting to today! Mark it explicitly for the user.
187
  display_time = "⚠️ DATE UNKNOWN"
188
  display_title = f"[DATE MISSING] {title}"
189
  else:
190
- # If we found a date, run the age gate
191
  days_old = (datetime.now() - fmt_date).days
192
  if days_old > 60: continue
193
  display_time = "Published"
@@ -195,7 +202,7 @@ def fetch_congress_scraped():
195
 
196
  results.append({
197
  "source": name, "type": "Legislative Office Press Release",
198
- "event_date": fmt_date, # This will be passed as None (Blank) instead of today
199
  "time": display_time, "title": display_title,
200
  "latest_action": "Web Publication", "link": full_url, "summary": "HTML Scrape"
201
  })
@@ -203,8 +210,7 @@ def fetch_congress_scraped():
203
  except Exception as e:
204
  print(f" --> {name}: Error — {e}")
205
  return results
206
-
207
- # --- FLOOR SCHEDULE SCRAPER ---
208
  def fetch_floor_schedules():
209
  print("Scanning House & Senate Floor Schedules...")
210
  results = []
 
146
 
147
  for a_tag in soup.find_all("a", href=True):
148
  href = a_tag["href"]
149
+ if any(skip in href.lower() for skip in ['#', 'javascript:', 'page=', 'category=', 'tag=']): continue
150
 
151
  full_url = urljoin(url, href)
152
  if full_url in seen_links or full_url == url: continue
 
159
  if len(title) < 15 or not is_relevant(title): continue
160
  seen_links.add(full_url)
161
 
162
+ # --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
163
  fmt_date = None
164
 
165
+ # 1. Expanded Container Search (Catches almost all Gov CMS platforms)
166
  container = a_tag.find_parent(["article", "tr", "li"])
167
  if not container:
168
+ # Added: news, press, card, entry, row, record
169
+ container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
170
 
171
  if container:
172
  fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
173
 
174
+ # 2. Sibling Search (If the date is floating right next to the link)
175
+ if not fmt_date:
176
+ prev_el = a_tag.find_previous_sibling()
177
+ if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
178
+ if not fmt_date:
179
+ next_el = a_tag.find_next_sibling()
180
+ if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
181
+
182
+ # 3. Deep DOM Climb Fallback
183
  if not fmt_date:
184
  current_node = a_tag
185
+ for _ in range(6):
186
  if current_node.parent:
187
  current_node = current_node.parent
188
  found_date = extract_robust_date([current_node.get_text(" ", strip=True)])
 
190
  fmt_date = found_date
191
  break
192
 
193
+ # --- THE USER-FACING FLAG ---
194
  if not fmt_date:
 
195
  display_time = "⚠️ DATE UNKNOWN"
196
  display_title = f"[DATE MISSING] {title}"
197
  else:
 
198
  days_old = (datetime.now() - fmt_date).days
199
  if days_old > 60: continue
200
  display_time = "Published"
 
202
 
203
  results.append({
204
  "source": name, "type": "Legislative Office Press Release",
205
+ "event_date": fmt_date,
206
  "time": display_time, "title": display_title,
207
  "latest_action": "Web Publication", "link": full_url, "summary": "HTML Scrape"
208
  })
 
210
  except Exception as e:
211
  print(f" --> {name}: Error — {e}")
212
  return results
213
+
 
214
  def fetch_floor_schedules():
215
  print("Scanning House & Senate Floor Schedules...")
216
  results = []