Spaces:

IJ-Reynolds
/

AI_Intel_Tracker

Running

App Files Community

IJ-Reynolds (HF Staff) committed 10 days ago

Commit

4b0b986

verified ·

1 Parent(s): 172186c

Update main.py

Browse files

Files changed (1) hide show

main.py +18 -12

main.py CHANGED Viewed

@@ -146,7 +146,7 @@ def fetch_congress_scraped():
             for a_tag in soup.find_all("a", href=True):
                 href = a_tag["href"]
-                if any(skip in href.lower() for skip in ['#', 'javascript:', 'page=', 'category=']): continue
                 full_url = urljoin(url, href)
                 if full_url in seen_links or full_url == url: continue
@@ -159,21 +159,30 @@ def fetch_congress_scraped():
                 if len(title) < 15 or not is_relevant(title): continue
                 seen_links.add(full_url)
-                # --- UPGRADED DATE HUNTING (Container Search) ---
                 fmt_date = None
-                # 1. Look for the entire row/article container (Catches sibling dates in Drupal!)
                 container = a_tag.find_parent(["article", "tr", "li"])
                 if not container:
-                    container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post", re.I))
                 if container:
                     fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
-                # 2. Fallback DOM climb
                 if not fmt_date:
                     current_node = a_tag
-                    for _ in range(5):
                         if current_node.parent:
                             current_node = current_node.parent
                             found_date = extract_robust_date([current_node.get_text(" ", strip=True)])
@@ -181,13 +190,11 @@ def fetch_congress_scraped():
                                 fmt_date = found_date
                                 break
-                # --- THE USER-FACING MISSING DATE FLAG ---
                 if not fmt_date:
-                    # We stop defaulting to today! Mark it explicitly for the user.
                     display_time = "⚠️ DATE UNKNOWN"
                     display_title = f"[DATE MISSING] {title}"
                 else:
-                    # If we found a date, run the age gate
                     days_old = (datetime.now() - fmt_date).days
                     if days_old > 60: continue
                     display_time = "Published"
@@ -195,7 +202,7 @@ def fetch_congress_scraped():
                 results.append({
                     "source": name, "type": "Legislative Office Press Release",
-                    "event_date": fmt_date, # This will be passed as None (Blank) instead of today
                     "time": display_time, "title": display_title,
                     "latest_action": "Web Publication", "link": full_url, "summary": "HTML Scrape"
                 })
@@ -203,8 +210,7 @@ def fetch_congress_scraped():
         except Exception as e:
             print(f"  --> {name}: Error — {e}")
     return results
-# --- FLOOR SCHEDULE SCRAPER ---
 def fetch_floor_schedules():
     print("Scanning House & Senate Floor Schedules...")
     results = []

             for a_tag in soup.find_all("a", href=True):
                 href = a_tag["href"]
+                if any(skip in href.lower() for skip in ['#', 'javascript:', 'page=', 'category=', 'tag=']): continue
                 full_url = urljoin(url, href)
                 if full_url in seen_links or full_url == url: continue
                 if len(title) < 15 or not is_relevant(title): continue
                 seen_links.add(full_url)
+                # --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
                 fmt_date = None
+                # 1. Expanded Container Search (Catches almost all Gov CMS platforms)
                 container = a_tag.find_parent(["article", "tr", "li"])
                 if not container:
+                    # Added: news, press, card, entry, row, record
+                    container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
                 if container:
                     fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
+                # 2. Sibling Search (If the date is floating right next to the link)
+                if not fmt_date:
+                    prev_el = a_tag.find_previous_sibling()
+                    if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
+                if not fmt_date:
+                    next_el = a_tag.find_next_sibling()
+                    if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
+                # 3. Deep DOM Climb Fallback
                 if not fmt_date:
                     current_node = a_tag
+                    for _ in range(6):
                         if current_node.parent:
                             current_node = current_node.parent
                             found_date = extract_robust_date([current_node.get_text(" ", strip=True)])
                                 fmt_date = found_date
                                 break
+                # --- THE USER-FACING FLAG ---
                 if not fmt_date:
                     display_time = "⚠️ DATE UNKNOWN"
                     display_title = f"[DATE MISSING] {title}"
                 else:
                     days_old = (datetime.now() - fmt_date).days
                     if days_old > 60: continue
                     display_time = "Published"
                 results.append({
                     "source": name, "type": "Legislative Office Press Release",
+                    "event_date": fmt_date,
                     "time": display_time, "title": display_title,
                     "latest_action": "Web Publication", "link": full_url, "summary": "HTML Scrape"
                 })
         except Exception as e:
             print(f"  --> {name}: Error — {e}")
     return results
 def fetch_floor_schedules():
     print("Scanning House & Senate Floor Schedules...")
     results = []