Spaces:

IJ-Reynolds
/

AI_Intel_Tracker

Running

App Files Files Community

IJ-Reynolds HF Staff committed 19 days ago

Commit

1c6553d

verified ·

1 Parent(s): e7c98e9

Update main.py

Browse files

Files changed (1) hide show

main.py +276 -58

main.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import cloudscraper
 import pandas as pd
 from bs4 import BeautifulSoup
 import feedparser
@@ -12,7 +13,7 @@ from dateutil import parser as date_parser
 from urllib.parse import urljoin
 from huggingface_hub import InferenceClient
-# --- CONFIGURATION ---
 CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
 HF_TOKEN = os.getenv("HF_TOKEN")
 CURRENT_CONGRESS = 119
@@ -27,36 +28,41 @@ else:
     DB_FILE = BASE_DIR / "seen_events.json"
 # --- STEALTH SCRAPER SETUP ---
-# ai-cloudscraper mimics a real browser handshake to bypass 2026 firewalls
 scraper = cloudscraper.create_scraper(
     browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True},
     interpreter='js2py'
 )
 TARGET_KEYWORDS = [
     "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
     "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
-    "chatbot", "facial recognition", "biometric", "open-source", "foundation model"
 ]
 def is_relevant(title, summary=""):
-    text = f"{title} {summary}".lower()
-    return any(re.search(rf'\b{re.escape(k)}', text) for k in TARGET_KEYWORDS)
 CONGRESS_PRESS_FEEDS = {
-    # Senate 2026
     "Sen. Cruz (Commerce Chair)": "https://www.commerce.senate.gov/press/rep/rss",
     "Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/newsroom/press-releases?format=rss",
     "Sen. Young (AI Caucus)": "https://www.young.senate.gov/newsroom/press-releases?format=rss",
     "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/newsroom/press-releases?format=rss",
-    # House
     "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
     "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/rss.xml",
-    "Rep. Moore (UT)": "https://blakemoore.house.gov/news/rss.xml" # Updated to new 2026 path
 }
 NEWS_FEEDS = {
     "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
     "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
@@ -80,84 +86,296 @@ NEWS_FEEDS = {
     "The Hill Tech": "https://thehill.com/policy/technology/feed/"
 }
-# --- CORE SCRAPER ---
 def fetch_rss(feed_dict, source_type):
-    print(f"Scanning {source_type}...")
     results = []
     for name, url in feed_dict.items():
         try:
-            # Persistent session handling
             r = scraper.get(url, timeout=15)
-            # If we hit a 404/410, it means the office changed their CMS
-            if r.status_code in [404, 410]:
-                print(f"--> {name}: URL Expired ({r.status_code}) - Needs Manual Path Update")
-                continue
             if r.status_code != 200:
-                print(f"--> {name}: Blocked ({r.status_code})")
                 continue
             feed = feedparser.parse(r.content)
-            # If the feed is valid but empty, the office just hasn't posted today
-            if not feed.entries:
-                print(f"--> {name}: Feed is currently empty.")
-                continue
-            print(f"--> {name}: Found {len(feed.entries)} items.")
-            for entry in feed.entries[:10]:
-                title = entry.get("title", "")
                 summary = entry.get("description", "")
                 link = entry.get("link", url)
-                if is_relevant(title, summary):
-                    # Trust the RSS timestamp first
-                    if hasattr(entry, 'published_parsed') and entry.published_parsed:
-                        fmt_date = datetime(*entry.published_parsed[:6])
-                    else:
-                        fmt_date = datetime.now()
-                    results.append({
-                        "source": name, "type": source_type, "title": title,
-                        "summary": summary[:300], "link": link,
-                        "latest_action": "Published", "event_date": fmt_date
-                    })
             time.sleep(1)
         except Exception as e:
-            print(f"Error {name}: {e}")
     return results
-def run():
-    # Load seen events to prevent duplicates
-    if DB_FILE.exists():
-        with open(DB_FILE, "r") as f: db = json.load(f)
-    else: db = []
     raw_data = []
     raw_data.extend(fetch_rss(NEWS_FEEDS, "News/Media"))
     raw_data.extend(fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"))
-    # AI Triage & Storage Logic
     new_items = []
     for item in raw_data:
-        if item['link'] not in db:
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
-            item["analysis"] = "AI summary pending..."
-            item["keywords"] = "AI, Policy"
             new_items.append(item)
-            db.append(item['link'])
     if new_items:
         df_new = pd.DataFrame(new_items)
         if CSV_PATH.exists():
-            df_existing = pd.read_csv(CSV_PATH)
-            pd.concat([df_existing, df_new], ignore_index=True).to_csv(CSV_PATH, index=False)
         else:
-            df_new.to_csv(CSV_PATH, index=False)
-        with open(DB_FILE, "w") as f: json.dump(db[-5000:], f)
-        print(f"Added {len(new_items)} items.")
     return len(new_items)

 import os
 import cloudscraper
+import requests
 import pandas as pd
 from bs4 import BeautifulSoup
 import feedparser
 from urllib.parse import urljoin
 from huggingface_hub import InferenceClient
+# --- CONFIGURATION & GLOBALS ---
 CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
 HF_TOKEN = os.getenv("HF_TOKEN")
 CURRENT_CONGRESS = 119
     DB_FILE = BASE_DIR / "seen_events.json"
 # --- STEALTH SCRAPER SETUP ---
+# Mimics a real browser handshake to bypass Cloudflare/Akamai
 scraper = cloudscraper.create_scraper(
     browser={'browser': 'chrome', 'platform': 'windows', 'desktop': True},
     interpreter='js2py'
 )
+# --- KEYWORD FILTER ---
 TARGET_KEYWORDS = [
     "artificial intelligence", "machine learning", "algorithm", "llm", "generative ai",
     "deep learning", "autonomous", "training data", "data privacy", "semiconductor",
+    "chatbot", "facial recognition", "biometric", "open-source", "open source ai",
+    "foundation model", "emerging technology", "automated decision", "automated system",
+    "large language model", "surveillance technology"
 ]
 def is_relevant(title, summary=""):
+    text_to_check = f"{title} {summary}".lower()
+    for keyword in TARGET_KEYWORDS:
+        if re.search(rf'\b{re.escape(keyword)}', text_to_check):
+            return True
+    if re.search(r'\b(ai|compute)\b', text_to_check):
+        return True
+    return False
+# --- FEEDS DICTIONARIES ---
 CONGRESS_PRESS_FEEDS = {
     "Sen. Cruz (Commerce Chair)": "https://www.commerce.senate.gov/press/rep/rss",
     "Sen. Schumer (AI Lead)": "https://www.schumer.senate.gov/newsroom/press-releases?format=rss",
     "Sen. Young (AI Caucus)": "https://www.young.senate.gov/newsroom/press-releases?format=rss",
     "Sen. Andy Kim (Tech Lead)": "https://www.kim.senate.gov/newsroom/press-releases?format=rss",
     "Rep. Babin (Science Chair)": "https://babin.house.gov/rss.xml",
     "Rep. Obernolte (Tech Chair)": "https://obernolte.house.gov/rss.xml",
+    "Rep. Moore (UT)": "https://blakemoore.house.gov/news/rss.xml"
 }
 NEWS_FEEDS = {
     "NYT Tech": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
     "Wired AI": "https://www.wired.com/feed/tag/ai/latest/rss",
     "The Hill Tech": "https://thehill.com/policy/technology/feed/"
 }
+GOV_FEEDS = {
+    "White House OSTP": "https://www.whitehouse.gov/ostp/feed/",
+    "White House Briefing Room": "https://www.whitehouse.gov/briefing-room/feed/",
+    "DOE Artificial Intelligence": "https://www.energy.gov/topics/artificial-intelligence/rss",
+    "DOE Office of Science": "https://science.osti.gov/RSS",
+    "Federal Register (AI Postings)": "https://www.federalregister.gov/documents/search.rss?conditions%5Bterm%5D=artificial+intelligence",
+    "NIST AI News": "https://www.nist.gov/news-events/news/rss.xml",
+    "NTIA (Internet Policy)": "https://www.ntia.gov/rss.xml",
+    "CDAO (Defense AI Office)": "https://www.cdao.mil/News/RSS/",
+    "FTC Technology Blog": "https://www.ftc.gov/news-events/blogs/techftc/feed",
+    "GSA (Fed IT News)": "https://www.gsa.gov/about-us/newsroom/news-releases/rss"
+}
+CALENDAR_FEEDS = {
+    "House Science RSS": "https://science.house.gov/hearings?rss=1",
+    "House Energy RSS": "https://energycommerce.house.gov/events?rss=1",
+    "House Foreign Affairs RSS": "https://foreignaffairs.house.gov/committee-activity/hearings/all?rss=1",
+    "Senate Commerce RSS": "https://www.commerce.senate.gov/RSS",
+    "Senate Judiciary RSS": "https://www.judiciary.senate.gov/RSS",
+    "Senate Foreign Relations RSS": "https://www.foreign.senate.gov/hearings?rss=1",
+    "DOE Events": "https://www.energy.gov/events/rss"
+}
+# --- AI SETUP & ANALYZER ---
# The inference client is only created when a token is configured; otherwise
# analyze_with_ai() short-circuits with placeholder values.
hf_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=HF_TOKEN) if HF_TOKEN else None
if hf_client is None:
    print("Warning: No HF_TOKEN found. AI Triage will be bypassed.")


def analyze_with_ai(title, summary, source, bill_text=""):
    """Ask the chat model for a short policy summary and keywords for one item.

    Args:
        title: Item headline.
        summary: Item summary text.
        source: Name of the feed or site the item came from.
        bill_text: Optional bill text excerpt; "N/A" is sent when empty.

    Returns:
        A ``(analysis, keywords)`` tuple of strings. Placeholder strings are
        returned when no client is configured, when the reply lacks the
        expected ``ANALYSIS:``/``KEYWORDS:`` labels, or when the call raises.
    """
    if not hf_client:
        return "AI Triage disabled (No API Key).", "N/A"

    excerpt = bill_text or 'N/A'
    prompt = f"""
    You are a D.C. AI policy analyst. Review this update. Simply provide the summary with no other additions:
    Source: {source}
    Title: {title}
    Summary: {summary}
    Raw Bill Text Excerpt: {excerpt}
    RULES:
    1. STRICT ANTI-HALLUCINATION: Base your analysis ONLY on the provided text.
    2. Provide a detailed, 2-to-3 sentence executive summary explaining the actual policy impact.
    3. Extract 3 comma-separated keywords.
    Format output EXACTLY as:
    ANALYSIS: [Your 2-3 sentence summary here]
    KEYWORDS: [Words]
    """

    try:
        reply = hf_client.chat_completion(
            [{"role": "user", "content": prompt}],
            max_tokens=350,
            temperature=0.1,
            top_p=0.9,
        )
        raw = reply.choices[0].message.content

        # ANALYSIS runs (across newlines) up to the KEYWORDS label or end of text.
        found_analysis = re.search(r'ANALYSIS:\s*(.*?)(?=KEYWORDS:|$)', raw, re.DOTALL)
        if found_analysis:
            analysis = found_analysis.group(1).strip()
        else:
            analysis = "Could not generate analysis."

        # KEYWORDS is taken from the remainder of the label's own line.
        found_keywords = re.search(r'KEYWORDS:\s*(.*)', raw)
        if found_keywords:
            keywords = found_keywords.group(1).strip()
        else:
            keywords = "AI, Tech, Policy"

        return analysis.replace('\n', ' '), keywords
    except Exception as e:
        # Best-effort: a failed triage must not abort the sweep.
        print(f"AI Error: {e}")
        return "Error during AI analysis.", "error"
+# --- STATE MANAGEMENT ---
def load_db():
    """Load the list of already-seen event IDs from DB_FILE.

    Returns:
        The stored list, or an empty list when the file is missing, cannot be
        read or parsed as JSON (e.g. truncated by an interrupted write), or
        does not contain a JSON list.
    """
    if not DB_FILE.exists():
        return []
    try:
        with open(DB_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (ValueError, OSError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        # Recover with an empty history instead of failing every future run.
        print(f"Warning: could not read {DB_FILE} ({e}); starting with an empty seen-events list.")
        return []
    if not isinstance(data, list):
        print(f"Warning: {DB_FILE} does not contain a list; starting with an empty seen-events list.")
        return []
    return data
def save_db(db):
    """Write the seen-event IDs to DB_FILE, keeping only the newest 5000."""
    trimmed = db[-5000:]
    with open(DB_FILE, "w") as handle:
        json.dump(trimmed, handle)
def get_event_id(item):
    """Build the dedup key for an item from its link and latest action.

    Missing fields fall back to the placeholders "no_link" / "no_action".
    """
    return "{} || {}".format(
        item.get("link", "no_link"),
        item.get("latest_action", "no_action"),
    )
def is_new_event(item, db):
    """Return True when the item's event ID is not yet recorded in ``db``."""
    event_id = get_event_id(item)
    return event_id not in db
+# --- DATE EXTRACTOR ---
# Date shapes recognised by extract_robust_date, tried in this order for each
# text block. The ISO form is tried before the short numeric form so that
# "2025-12-15" is never read as a year-less "12-15".
_DATE_PATTERNS = tuple(
    re.compile(p, re.IGNORECASE)
    for p in (
        # "March 5", "Mar 5th, 2026", ...
        r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
        # ISO-style "2026-03-04"; (?!\d) rather than \b so a trailing "T10:00"
        # time component does not block the match.
        r'\b202\d[-/]\d{1,2}[-/]\d{1,2}(?!\d)',
        # "3/4", "03-04-26", ... ; the lookbehind stops this from matching the
        # month-day tail of a longer date such as "2019-03-04".
        r'\b(?<![\d/-])\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
    )
)


def extract_robust_date(text_blocks):
    """Return the first plausible date found in a sequence of text blocks.

    Args:
        text_blocks: Iterable of strings (empty/None entries are skipped),
            searched in order.

    Returns:
        A naive ``datetime`` for the first match whose year is within
        2024-2030, or None when nothing usable is found. Fields missing from
        the matched text (e.g. the year in "March 5") are filled in by
        dateutil's defaults.
    """
    for text in text_blocks:
        if not text:
            continue
        for pattern in _DATE_PATTERNS:
            for found in pattern.finditer(text):
                # Strip ordinal suffixes ("5th" -> "5") before parsing.
                clean_match = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', found.group(0))
                try:
                    parsed_date = date_parser.parse(clean_match, fuzzy=True).replace(tzinfo=None)
                except (ValueError, OverflowError):
                    # Not a real date (e.g. "13/45"); try the next candidate.
                    continue
                if 2024 <= parsed_date.year <= 2030:
                    return parsed_date
    return None
+# --- SCRAPERS ---
 def fetch_rss(feed_dict, source_type):
+    print(f"Scanning {source_type} RSS...")
     results = []
     for name, url in feed_dict.items():
         try:
             r = scraper.get(url, timeout=15)
+            if r.status_code in [404, 410] and ".house.gov" in url:
+                root_url = url.split(".gov")[0] + ".gov/rss.xml"
+                r = scraper.get(root_url, timeout=10)
             if r.status_code != 200:
+                print(f"--> {name}: Access Denied/Missing ({r.status_code})")
                 continue
             feed = feedparser.parse(r.content)
+            for entry in feed.entries[:20]:
+                title = entry.get("title", "No Title")
                 summary = entry.get("description", "")
                 link = entry.get("link", url)
+                if not is_relevant(title, summary):
+                    continue
+                url_year_match = re.search(r'/(20\d{2})/', link)
+                if url_year_match:
+                    url_year = int(url_year_match.group(1))
+                    curr_year = datetime.now().year
+                    curr_month = datetime.now().month
+                    if url_year < curr_year and curr_month > 2: continue
+                    if url_year < curr_year - 1: continue
+                if hasattr(entry, 'published_parsed') and entry.published_parsed:
+                    fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
+                else:
+                    fmt_date = extract_robust_date([title, summary])
+                if fmt_date:
+                    days_old = (datetime.now().replace(tzinfo=None) - fmt_date).days
+                    if days_old > 60: continue
+                results.append({
+                    "source": name, "type": source_type, "event_date": fmt_date,
+                    "time": "TBD", "title": title, "latest_action": "Published",
+                    "link": link, "summary": summary[:200]
+                })
             time.sleep(1)
         except Exception as e:
+            print(f"Error fetching {name}: {e}")
     return results
def fetch_master_schedules():
    """Scrape House/Senate/Congress schedule pages for AI-relevant entries.

    Every ``tr``/``li``/``div``/``p`` element whose text is 30-1500 characters
    long and passes is_relevant() becomes one result. Entries sharing the same
    first 50 characters of text are reported only once across all pages.

    Returns:
        A list of dicts with keys source, type, event_date, time, title,
        latest_action, link and summary. When no date is found in the element,
        ``event_date`` defaults to today at 09:00.
    """
    print("Scanning Master Schedules...")
    results = []
    seen_prefixes = set()  # first 50 chars of every accepted entry, for O(1) dedupe
    today = datetime.now()
    monday_of_week = today - timedelta(days=today.weekday())
    fallback_date = today.replace(hour=9, minute=0, second=0, microsecond=0)
    SCHEDULE_URLS = {
        "House Floor Schedule": f"https://www.house.gov/legislative-activity/{today.strftime('%Y-%m-%d')}",
        "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
        "Congress Weekly": f"https://www.congress.gov/committee-schedule/weekly/{monday_of_week.strftime('%Y/%m/%d')}"
    }
    for source_name, url in SCHEDULE_URLS.items():
        try:
            r = scraper.get(url, timeout=15)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            for container in soup.find_all(["tr", "li", "div", "p"]):
                text_content = container.get_text(" ", strip=True)
                if len(text_content) < 30 or len(text_content) > 1500:
                    continue
                if not is_relevant(text_content):
                    continue
                # Nested containers repeat the same text; keep the first one.
                prefix = text_content[:50]
                if prefix in seen_prefixes:
                    continue

                a_tag = container.find("a", href=True)
                item_link = urljoin(url, a_tag['href']) if a_tag else url
                time_node = container.find("time")
                time_text = time_node["datetime"] if time_node and time_node.has_attr("datetime") else ""
                fmt_date = extract_robust_date([time_text, text_content]) or fallback_date

                results.append({
                    "source": source_name, "type": "Schedule/Hearing", "event_date": fmt_date,
                    "time": "Scheduled", "title": text_content[:120] + "...",
                    "latest_action": "On Master Schedule", "link": item_link, "summary": text_content[:300]
                })
                seen_prefixes.add(prefix)
            time.sleep(1)
        except Exception as e:
            # Best-effort: one broken page must not stop the sweep.
            print(f"Error scraping {source_name}: {e}")
    return results
def fetch_bill_text(congress, bill_type, bill_number):
    """Fetch a plain-text excerpt of a bill via the Congress API.

    Args:
        congress: Congress number used in the API path.
        bill_type: Bill type code (e.g. "HR"); lower-cased for the API path.
        bill_number: Bill number used in the API path.

    Returns:
        Up to 3500 characters of text extracted from the first listed text
        version, or "" when no API key is configured, the bill identifiers
        are missing, or nothing could be downloaded.
    """
    # Guard before building the URL: bill_type.lower() would raise on None.
    if not CONGRESS_API_KEY or not bill_type or not bill_number:
        return ""
    url = f"{CONGRESS_API_BASE}/bill/{congress}/{bill_type.lower()}/{bill_number}/text"
    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            return ""
        versions = r.json().get("textVersions", [])
        if not versions:
            return ""
        for fmt in versions[0].get("formats", []):
            text_url = fmt.get("url")
            if not text_url:
                continue
            # A PDF body cannot be turned into text by the HTML parser below,
            # so skip it and try the next listed format.
            if text_url.lower().split("?", 1)[0].endswith(".pdf"):
                continue
            text_req = requests.get(text_url, headers=headers, timeout=10)
            if text_req.status_code == 200:
                return BeautifulSoup(text_req.text, "html.parser").get_text(separator=' ', strip=True)[:3500]
    except Exception as e:
        # Best-effort: the bill text is optional context for the AI triage.
        print(f"Failed to fetch text for {bill_type}{bill_number}: {e}")
    return ""
def _congress_ordinal(number):
    """Return the ordinal form used in congress.gov URLs (119 -> "119th")."""
    if 10 <= number % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
    return f"{number}{suffix}"


def fetch_legislation(target=1000):
    """Page through recently updated bills and keep the AI-relevant ones.

    Args:
        target: Upper bound on how many bills to scan; the API is queried in
            pages of 250 starting at offset 0.

    Returns:
        A list of dicts with keys source, type, event_date, time, title,
        latest_action, link, summary, bill_type and bill_number. Returns an
        empty list when no API key is configured. Scanning stops at the first
        non-200 response, empty page, or error.
    """
    print("Scanning Legislation API...")
    if not CONGRESS_API_KEY:
        return []
    results = []
    headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
    # Bill type code -> congress.gov URL slug. Joint and concurrent
    # resolutions are included so their links do not fall back to house-bill.
    BILL_MAP = {
        "HR": "house-bill", "S": "senate-bill",
        "HRES": "house-resolution", "SRES": "senate-resolution",
        "HJRES": "house-joint-resolution", "SJRES": "senate-joint-resolution",
        "HCONRES": "house-concurrent-resolution", "SCONRES": "senate-concurrent-resolution",
    }
    congress_slug = f"{_congress_ordinal(CURRENT_CONGRESS)}-congress"
    for offset in range(0, target, 250):
        try:
            r = requests.get(
                f"{CONGRESS_API_BASE}/bill/{CURRENT_CONGRESS}",
                params={"limit": 250, "offset": offset, "format": "json", "sort": "updateDate desc"},
                headers=headers,
                timeout=20,
            )
            if r.status_code != 200:
                break
            bills = r.json().get("bills", [])
            if not bills:
                break
            for b in bills:
                title = b.get("title", "")
                if not is_relevant(title):
                    continue
                action_data = b.get("latestAction") or {}
                action_text = action_data.get("text", "Active")
                action_date_raw = action_data.get("actionDate") or b.get("updateDate")
                fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else None
                # "or" also covers an explicit null type, which .get's default would not.
                bill_type = b.get("type") or "HR"
                bill_number = b.get("number")
                slug = BILL_MAP.get(bill_type.upper(), "house-bill")
                proper_link = f"https://www.congress.gov/bill/{congress_slug}/{slug}/{bill_number}"
                results.append({
                    "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
                    "time": "API Verified", "title": f"{bill_type}{bill_number}: {title}",
                    "latest_action": action_text, "link": proper_link, "summary": "Legislative movement tracked via API.",
                    "bill_type": bill_type, "bill_number": bill_number
                })
            time.sleep(1.5)
        except Exception as e:
            print(f"Legislation API Error: {e}")
            break
    return results
+# --- MAIN EXECUTION ---
def run():
    """Run one full sweep: collect, triage new items, and persist them.

    Gathers items from every RSS group, the schedule pages and the
    legislation API, runs AI triage on those not yet in the seen-events
    list, appends them to the CSV (de-duplicated on link + latest_action)
    and saves the updated seen-events list.

    Returns:
        The number of new items found in this sweep.
    """
    db = load_db()

    # Sources are queried in this fixed order.
    raw_data = [
        *fetch_rss(NEWS_FEEDS, "News/Media"),
        *fetch_rss(GOV_FEEDS, "Federal/Exec Action"),
        *fetch_rss(CONGRESS_PRESS_FEEDS, "Legislative Office Press Release"),
        *fetch_rss(CALENDAR_FEEDS, "Schedule/Hearing"),
        *fetch_master_schedules(),
        *fetch_legislation(),
    ]

    new_items = []
    for item in raw_data:
        if not is_new_event(item, db):
            continue
        print(f"Triaging new item: {item['title'][:40]}...")
        # Only legislation items carry bill identifiers worth a text lookup.
        if item.get("type") == "Legislation":
            bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number"))
        else:
            bill_text = ""
        item["analysis"], item["keywords"] = analyze_with_ai(
            item["title"], item["summary"], item["source"], bill_text=bill_text
        )
        item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
        new_items.append(item)
        # Record immediately so a repeat later in raw_data is not triaged twice.
        db.append(get_event_id(item))

    if not new_items:
        print("Sweep complete. No new items.")
        return len(new_items)

    fresh = pd.DataFrame(new_items)
    if CSV_PATH.exists():
        stored = pd.read_csv(CSV_PATH, parse_dates=["event_date"])
        combined = pd.concat([stored, fresh], ignore_index=True)
    else:
        combined = fresh
    combined = combined.drop_duplicates(subset=['link', 'latest_action'], keep='first')
    combined.to_csv(CSV_PATH, index=False)
    save_db(db)
    print(f"Added {len(new_items)} new items.")
    return len(new_items)