Spaces:

IJ-Reynolds
/

AI_Intel_Tracker

Sleeping

App Files Files Community

IJ-Reynolds (HF Staff) committed 17 days ago

Commit

56b0350

verified ·

1 Parent(s): a241a6b

Update main.py

Browse files

Files changed (1) hide show

main.py +89 -2

main.py CHANGED Viewed

@@ -67,7 +67,13 @@ CONGRESS_SCRAPE_TARGETS = {
     "Rep. Jeffries": "https://democraticleader.house.gov/media/press-releases",
     "Sen. Klobuchar": "https://www.klobuchar.senate.gov/public/index.cfm/news-releases"
 }
 NEWS_FEEDS = {
     "Politico Tech": "https://rss.politico.com/technology.xml",
     "Axios Tech": "https://www.axios.com/feeds/feed.rss",
@@ -149,6 +155,86 @@ def extract_robust_date(text_blocks):
 # --- DATA GATHERING ENGINES ---
 def fetch_congress_scraped():
     print("Scanning Verified Lawmaker HTML Pages...")
     results = []
@@ -419,11 +505,12 @@ def run():
     raw_data = []
     # Run the 4 basic, robust engines
-    raw_data.extend(fetch_congress_scraped())  # The 5 HTML Pages with DOM Climbing
     raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
     raw_data.extend(fetch_federal_register())
     raw_data.extend(fetch_legislation())
     raw_data.extend(fetch_floor_schedules())
     new_items = []
     for item in raw_data:

     "Rep. Jeffries": "https://democraticleader.house.gov/media/press-releases",
     "Sen. Klobuchar": "https://www.klobuchar.senate.gov/public/index.cfm/news-releases"
 }
+# Agency name -> URL of that agency's news / press-release listing page.
+AGENCIES = {
+    "NIST": "https://www.nist.gov/news-events/news",
+    "OSTP": "https://www.whitehouse.gov/ostp/news/",
+    "White House": "https://www.whitehouse.gov/news/",
+    "Department of Energy": "https://www.energy.gov/technologycommercialization/listings/press-releases",
+    "Department of War": "https://www.war.gov/News/releases/"
+}
 NEWS_FEEDS = {
     "Politico Tech": "https://rss.politico.com/technology.xml",
     "Axios Tech": "https://www.axios.com/feeds/feed.rss",
 # --- DATA GATHERING ENGINES ---
+def fetch_agency_scraped():
+    print("Scanning Federal Agency HTML Pages...")
+    results = []
+    for name, url in AGENCY_SCRAPE_TARGETS.items():
+        try:
+            r = scraper.get(url, timeout=15)
+            if r.status_code != 200: continue
+            soup = BeautifulSoup(r.text, "html.parser")
+            seen_links = set()
+            for a_tag in soup.find_all("a", href=True):
+                href = a_tag["href"]
+                if any(skip in href.lower() for skip in ['#', 'javascript:', 'page=', 'category=', 'tag=']): continue
+                full_url = urljoin(url, href)
+                if full_url in seen_links or full_url == url: continue
+                title = a_tag.get_text(" ", strip=True)
+                if not title:
+                    heading = a_tag.find(["h2", "h3", "h4", "strong"])
+                    title = heading.get_text(" ", strip=True) if heading else ""
+                if len(title) < 15 or not is_relevant(title): continue
+                seen_links.add(full_url)
+                # --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
+                fmt_date = None
+                # 1. Expanded Container Search
+                container = a_tag.find_parent(["article", "tr", "li"])
+                if not container:
+                    container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
+                if container:
+                    fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
+                # 2. Sibling Search
+                if not fmt_date:
+                    prev_el = a_tag.find_previous_sibling()
+                    if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
+                if not fmt_date:
+                    next_el = a_tag.find_next_sibling()
+                    if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
+                # 3. Deep DOM Climb Fallback
+                if not fmt_date:
+                    current_node = a_tag
+                    for _ in range(6):
+                        if current_node.parent:
+                            current_node = current_node.parent
+                            found_date = extract_robust_date([current_node.get_text(" ", strip=True)])
+                            if found_date:
+                                fmt_date = found_date
+                                break
+                # --- THE USER-FACING FLAG ---
+                if not fmt_date:
+                    display_time = "⚠️ DATE UNKNOWN"
+                    display_title = f"[DATE MISSING] {title}"
+                else:
+                    days_old = (datetime.now() - fmt_date).days
+                    if days_old > 60: continue
+                    display_time = "Published"
+                    display_title = title
+                results.append({
+                    "source": name,
+                    "type": "Federal/Exec Action",  # Formatted for the Executive action bucket
+                    "event_date": fmt_date,
+                    "time": display_time,
+                    "title": display_title,
+                    "latest_action": "Agency Press Release",
+                    "link": full_url,
+                    "summary": "HTML Scrape"
+                })
+            time.sleep(1)
+        except Exception as e:
+            print(f"  --> {name}: Error — {e}")
+    return results
 def fetch_congress_scraped():
     print("Scanning Verified Lawmaker HTML Pages...")
     results = []
     raw_data = []
     # Run the 4 basic, robust engines
+    raw_data.extend(fetch_congress_scraped())
     raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
     raw_data.extend(fetch_federal_register())
     raw_data.extend(fetch_legislation())
     raw_data.extend(fetch_floor_schedules())
+    raw_data.extend(fetch_agency_scraped())
     new_items = []
     for item in raw_data: