IJ-Reynolds (HF Staff) committed (verified)
Commit 2f5126c · 1 Parent(s): 766241d

Update main.py

Files changed (1): main.py (+96 -40)
main.py CHANGED
@@ -51,8 +51,6 @@ def is_relevant(title, summary=""):
     return False
 
 # --- THE VERIFIED BASELINE TARGETS ---
-
-# 1. The Verified Lawmaker HTML Pages
 CONGRESS_SCRAPE_TARGETS = {
     "Sen. Young": "https://www.young.senate.gov/newsroom/press-releases/",
     "Rep. Moore": "https://blakemoore.house.gov/media/press-releases",
@@ -61,19 +59,12 @@ CONGRESS_SCRAPE_TARGETS = {
     "Rep. Lieu": "https://lieu.house.gov/media-center/press-releases"
 }
 
-# 2. Reliable Tech/Policy RSS Feeds
 NEWS_FEEDS = {
     "Politico Tech": "https://rss.politico.com/technology.xml",
     "Axios Tech": "https://www.axios.com/feeds/feed.rss",
     "Tech Policy Press": "https://www.techpolicy.press/rss/",
     "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
-    "The Verge Tech": "https://www.theverge.com/rss/index.xml",
-    "BBC Tech": "https://feeds.bbci.co.uk/news/technology/rss.xml",
-    "The Hill Tech": "https://thehill.com/policy/technology/feed/",
-    "FedScoop": "https://fedscoop.com/feed/",
-    "Defense One Tech": "https://www.defenseone.com/rss/technology/",
-    "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"
-
+    "The Verge Tech": "https://www.theverge.com/rss/index.xml"
 }
 
 # --- AI SETUP ---
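
The trimmed NEWS_FEEDS dict keeps only the five feeds that have been parsing cleanly. A quick way to sanity-check a URL before re-adding it is feedparser's bozo flag; this is an illustrative helper, assuming the feedparser package (fetch_rss's internals aren't shown in this diff):

import feedparser

def check_feed(name, url):
    # feedparser sets bozo=1 when the fetched document isn't a valid feed
    # (malformed XML, a redirect to an HTML error page, etc.)
    d = feedparser.parse(url)
    ok = not d.bozo and len(d.entries) > 0
    print(f"{name}: {'OK' if ok else 'BROKEN'} ({len(d.entries)} entries)")
    return ok

for name, url in NEWS_FEEDS.items():
    check_feed(name, url)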
@@ -121,16 +112,23 @@ def extract_robust_date(text_blocks):
     date_patterns = [
         r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
         r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
-        r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b'
+        r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b',
+        r'\b(\d{2})\.(\d{2})\.(\d{4})\b'  # Specifically handles Senate MM.DD.YYYY formats
     ]
     for text in text_blocks:
         if not text: continue
         for pattern in date_patterns:
-            for match in re.findall(pattern, text, re.IGNORECASE):
+            matches = re.findall(pattern, text, re.IGNORECASE)
+            for match in matches:
                 try:
-                    clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
-                    parsed = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
-                    if 2024 <= parsed.year <= 2030: return parsed
+                    if isinstance(match, tuple):
+                        parsed = datetime(int(match[2]), int(match[0]), int(match[1]))
+                    else:
+                        clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', match)
+                        parsed = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
+
+                    if 2024 <= parsed.year <= 2030:
+                        return parsed
                 except: continue
     return None
 
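
The isinstance(match, tuple) branch above is load-bearing: re.findall returns plain strings for the first three patterns, but once a pattern contains capture groups (as the new MM.DD.YYYY pattern does, with three), it returns tuples of the groups instead. A minimal demonstration:

import re

# No capture groups: findall returns the matched strings
print(re.findall(r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b', "Updated 2025/11/05"))
# ['2025/11/05']

# Three capture groups: findall returns (MM, DD, YYYY) tuples,
# which date_parser.parse can't consume directly; hence the tuple branch
print(re.findall(r'\b(\d{2})\.(\d{2})\.(\d{4})\b', "Posted 11.05.2025"))
# [('11', '05', '2025')]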
@@ -155,14 +153,33 @@ def fetch_congress_scraped():
 
         title = a_tag.get_text(" ", strip=True)
         if not title:
-            heading = a_tag.find(["h2", "h3", "h4"])
+            heading = a_tag.find(["h2", "h3", "h4", "strong"])
             title = heading.get_text(" ", strip=True) if heading else ""
 
         if len(title) < 15 or not is_relevant(title): continue
-
         seen_links.add(full_url)
-        parent_text = a_tag.parent.get_text(" ", strip=True) if a_tag.parent else ""
-        fmt_date = extract_robust_date([parent_text, title]) or datetime.now()
+
+        # --- AGGRESSIVE DATE HUNTING ---
+        fmt_date = None
+        current_node = a_tag
+
+        # Climb up the DOM tree up to 5 levels to find the date stamp
+        for _ in range(5):
+            if current_node.parent:
+                current_node = current_node.parent
+                node_text = current_node.get_text(" ", strip=True)
+                found_date = extract_robust_date([node_text])
+                if found_date:
+                    fmt_date = found_date
+                    break
+
+        # If still no date, fall back to the nearest preceding text node
+        if not fmt_date:
+            prev_text = a_tag.find_previous(string=True)
+            fmt_date = extract_robust_date([prev_text]) if prev_text else None
+
+        # Only fall back to today if the date is completely missing
+        fmt_date = fmt_date or datetime.now()
 
         results.append({
             "source": name, "type": "Legislative Office Press Release",
@@ -196,43 +213,78 @@ def fetch_rss(feed_dict, source_type):
         except Exception as e: print(f"Error {name}: {e}")
     return results
 
+# --- RESTORED UN-NERFED APIS ---
+
 def fetch_federal_register():
     print("Scanning Federal Register API...")
     results = []
+    url = "https://www.federalregister.gov/api/v1/documents.json"
+    params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}
     try:
-        r = requests.get("https://www.federalregister.gov/api/v1/documents.json", params={"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 10}, timeout=15)
+        r = requests.get(url, params=params, timeout=15)
         if r.status_code == 200:
             for doc in r.json().get("results", []):
+                title = doc.get("title", "No Title")
+                summary = doc.get("abstract", "No summary provided.")
                 pub_date = doc.get("publication_date")
                 fmt_date = pd.to_datetime(pub_date).tz_localize(None).to_pydatetime() if pub_date else datetime.now()
+
                 results.append({
-                    "source": doc.get("agency_names", ["Federal Register"])[0], "type": "Federal/Exec Action",
-                    "event_date": fmt_date, "time": "Published", "title": doc.get("title", "No Title"),
-                    "latest_action": doc.get("type", "Notice"), "link": doc.get("html_url", ""), "summary": str(doc.get("abstract", ""))[:300]
+                    "source": doc.get("agency_names", ["Federal Register"])[0],
+                    "type": "Federal/Exec Action", "event_date": fmt_date,
+                    "time": "Published", "title": title, "latest_action": doc.get("type", "Notice"),
+                    "link": doc.get("html_url", ""), "summary": str(summary)[:300]
                 })
-    except: pass
+        time.sleep(1)
+    except Exception as e:
+        print(f"Federal Register API Error: {e}")
     return results
 
-def fetch_legislation():
+def fetch_bill_text(congress, bill_type, bill_number):
+    if not CONGRESS_API_KEY: return ""
+    try:
+        url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
+        headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
+        r = requests.get(url, headers=headers, timeout=10)
+        if r.status_code == 200:
+            versions = r.json().get("textVersions", [])
+            if versions and versions[0].get("formats"):
+                text_url = versions[0]["formats"][0].get("url")
+                if text_url:
+                    text_req = requests.get(text_url, headers=headers, timeout=10)
+                    return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
+    except: pass
+    return ""
+
+def fetch_legislation(target=1000):
     print("Scanning Legislation API...")
     if not CONGRESS_API_KEY: return []
     results = []
     headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
+    BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
+
-    try:
-        r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params={"limit": 100, "format": "json", "sort": "updateDate desc"}, headers=headers, timeout=20)
-        if r.status_code == 200:
-            for b in r.json().get("bills", []):
+    for offset in range(0, target, 250):
+        try:
+            r = requests.get(f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}", params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"}, headers=headers, timeout=20)
+            if r.status_code != 200: break
+            bills = r.json().get("bills", [])
+            if not bills: break
+            for b in bills:
                 if not is_relevant(b.get("title", "")): continue
-                action_date_raw = b.get("latestAction", {}).get("actionDate") or b.get("updateDate")
+                action_data = b.get("latestAction", {})
+                action_date_raw = action_data.get("actionDate") or b.get("updateDate")
                 fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
-                proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/house-bill/{b.get('number')}"
+                raw_type = b.get("type", "HR").upper()
+                proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
+
                 results.append({
                     "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
                     "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
-                    "latest_action": b.get("latestAction", {}).get("text", "Active"), "link": proper_link,
-                    "summary": "Legislative movement tracked via API."
+                    "latest_action": action_data.get("text", "Active"), "link": proper_link,
+                    "summary": "Legislative movement tracked via API.", "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
                 })
-    except: pass
+            time.sleep(1.5)
+        except Exception as e: break
     return results
 
 # --- MAIN RUNNER ---
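
fetch_legislation now pages through up to 1,000 bills instead of the old single 100-bill request; 250 per request is, to our knowledge, Congress.gov's maximum page size. The pagination pattern, reduced to a skeleton (paged_bills is a hypothetical helper, not part of main.py):

import time
import requests

def paged_bills(base_url, congress, api_key, target=1000, page_size=250):
    # Yields bills page by page until `target` is reached, a request
    # fails, or the API returns an empty page (end of data)
    headers = {"X-API-Key": api_key, "Accept": "application/json"}
    for offset in range(0, target, page_size):
        r = requests.get(f"{base_url}/bill/{congress}",
                         params={"limit": page_size, "offset": offset,
                                 "format": "json", "sort": "updateDate desc"},
                         headers=headers, timeout=20)
        if r.status_code != 200:
            break
        bills = r.json().get("bills", [])
        if not bills:
            break
        yield from bills
        time.sleep(1.5)  # same pacing as the diff, to stay polite to the API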
@@ -240,11 +292,11 @@ def run():
     db = load_db()
     raw_data = []
 
-    # Run the 4 basic, verified engines
-    raw_data.extend(fetch_congress_scraped())   # The 5 HTML Pages
-    raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))   # Clean Tech RSS
-    raw_data.extend(fetch_federal_register())   # Clean Exec API
-    raw_data.extend(fetch_legislation())   # Clean Congress API
+    # Run the 4 basic, robust engines
+    raw_data.extend(fetch_congress_scraped())   # The 5 HTML Pages with DOM Climbing
+    raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
+    raw_data.extend(fetch_federal_register())
+    raw_data.extend(fetch_legislation())
 
     new_items = []
     for item in raw_data:
@@ -252,7 +304,11 @@ def run():
         event_id = f"{item.get('link', 'no_link')} || {item.get('latest_action', 'no_action')}"
         if event_id not in db:
             print(f"Triaging new item: {item['title'][:40]}...")
-            analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"])
+
+            # Re-integrated the fetch_bill_text logic so the AI has context
+            bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
+            analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
+
             item["analysis"] = analysis
             item["keywords"] = keywords
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
 