Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
|
@@ -146,7 +146,7 @@ def fetch_congress_scraped():
|
|
| 146 |
|
| 147 |
for a_tag in soup.find_all("a", href=True):
|
| 148 |
href = a_tag["href"]
|
| 149 |
-
if any(skip in href.lower() for skip in ['#', 'javascript:', 'page=', 'category=']): continue
|
| 150 |
|
| 151 |
full_url = urljoin(url, href)
|
| 152 |
if full_url in seen_links or full_url == url: continue
|
|
@@ -159,21 +159,30 @@ def fetch_congress_scraped():
|
|
| 159 |
if len(title) < 15 or not is_relevant(title): continue
|
| 160 |
seen_links.add(full_url)
|
| 161 |
|
| 162 |
-
# ---
|
| 163 |
fmt_date = None
|
| 164 |
|
| 165 |
-
# 1.
|
| 166 |
container = a_tag.find_parent(["article", "tr", "li"])
|
| 167 |
if not container:
|
| 168 |
-
|
|
|
|
| 169 |
|
| 170 |
if container:
|
| 171 |
fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
|
| 172 |
|
| 173 |
-
# 2.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
if not fmt_date:
|
| 175 |
current_node = a_tag
|
| 176 |
-
for _ in range(
|
| 177 |
if current_node.parent:
|
| 178 |
current_node = current_node.parent
|
| 179 |
found_date = extract_robust_date([current_node.get_text(" ", strip=True)])
|
|
@@ -181,13 +190,11 @@ def fetch_congress_scraped():
|
|
| 181 |
fmt_date = found_date
|
| 182 |
break
|
| 183 |
|
| 184 |
-
# --- THE USER-FACING
|
| 185 |
if not fmt_date:
|
| 186 |
-
# We stop defaulting to today! Mark it explicitly for the user.
|
| 187 |
display_time = "⚠️ DATE UNKNOWN"
|
| 188 |
display_title = f"[DATE MISSING] {title}"
|
| 189 |
else:
|
| 190 |
-
# If we found a date, run the age gate
|
| 191 |
days_old = (datetime.now() - fmt_date).days
|
| 192 |
if days_old > 60: continue
|
| 193 |
display_time = "Published"
|
|
@@ -195,7 +202,7 @@ def fetch_congress_scraped():
|
|
| 195 |
|
| 196 |
results.append({
|
| 197 |
"source": name, "type": "Legislative Office Press Release",
|
| 198 |
-
"event_date": fmt_date,
|
| 199 |
"time": display_time, "title": display_title,
|
| 200 |
"latest_action": "Web Publication", "link": full_url, "summary": "HTML Scrape"
|
| 201 |
})
|
|
@@ -203,8 +210,7 @@ def fetch_congress_scraped():
|
|
| 203 |
except Exception as e:
|
| 204 |
print(f" --> {name}: Error — {e}")
|
| 205 |
return results
|
| 206 |
-
|
| 207 |
-
# --- FLOOR SCHEDULE SCRAPER ---
|
| 208 |
def fetch_floor_schedules():
|
| 209 |
print("Scanning House & Senate Floor Schedules...")
|
| 210 |
results = []
|
|
|
|
| 146 |
|
| 147 |
for a_tag in soup.find_all("a", href=True):
|
| 148 |
href = a_tag["href"]
|
| 149 |
+
if any(skip in href.lower() for skip in ['#', 'javascript:', 'page=', 'category=', 'tag=']): continue
|
| 150 |
|
| 151 |
full_url = urljoin(url, href)
|
| 152 |
if full_url in seen_links or full_url == url: continue
|
|
|
|
| 159 |
if len(title) < 15 or not is_relevant(title): continue
|
| 160 |
seen_links.add(full_url)
|
| 161 |
|
| 162 |
+
# --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
|
| 163 |
fmt_date = None
|
| 164 |
|
| 165 |
+
# 1. Expanded Container Search (Catches almost all Gov CMS platforms)
|
| 166 |
container = a_tag.find_parent(["article", "tr", "li"])
|
| 167 |
if not container:
|
| 168 |
+
# Added: news, press, card, entry, row, record
|
| 169 |
+
container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
|
| 170 |
|
| 171 |
if container:
|
| 172 |
fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
|
| 173 |
|
| 174 |
+
# 2. Sibling Search (If the date is floating right next to the link)
|
| 175 |
+
if not fmt_date:
|
| 176 |
+
prev_el = a_tag.find_previous_sibling()
|
| 177 |
+
if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
|
| 178 |
+
if not fmt_date:
|
| 179 |
+
next_el = a_tag.find_next_sibling()
|
| 180 |
+
if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
|
| 181 |
+
|
| 182 |
+
# 3. Deep DOM Climb Fallback
|
| 183 |
if not fmt_date:
|
| 184 |
current_node = a_tag
|
| 185 |
+
for _ in range(6):
|
| 186 |
if current_node.parent:
|
| 187 |
current_node = current_node.parent
|
| 188 |
found_date = extract_robust_date([current_node.get_text(" ", strip=True)])
|
|
|
|
| 190 |
fmt_date = found_date
|
| 191 |
break
|
| 192 |
|
| 193 |
+
# --- THE USER-FACING FLAG ---
|
| 194 |
if not fmt_date:
|
|
|
|
| 195 |
display_time = "⚠️ DATE UNKNOWN"
|
| 196 |
display_title = f"[DATE MISSING] {title}"
|
| 197 |
else:
|
|
|
|
| 198 |
days_old = (datetime.now() - fmt_date).days
|
| 199 |
if days_old > 60: continue
|
| 200 |
display_time = "Published"
|
|
|
|
| 202 |
|
| 203 |
results.append({
|
| 204 |
"source": name, "type": "Legislative Office Press Release",
|
| 205 |
+
"event_date": fmt_date,
|
| 206 |
"time": display_time, "title": display_title,
|
| 207 |
"latest_action": "Web Publication", "link": full_url, "summary": "HTML Scrape"
|
| 208 |
})
|
|
|
|
| 210 |
except Exception as e:
|
| 211 |
print(f" --> {name}: Error — {e}")
|
| 212 |
return results
|
| 213 |
+
|
|
|
|
| 214 |
def fetch_floor_schedules():
|
| 215 |
print("Scanning House & Senate Floor Schedules...")
|
| 216 |
results = []
|