Spaces:

IJ-Reynolds
/

AI_Intel_Tracker

Sleeping

App Files Files Community

IJ-Reynolds HF Staff committed 10 days ago

Commit

5a58149

verified ·

1 Parent(s): 28c8245

Update main.py

Browse files

Files changed (1) hide show

main.py +60 -60

main.py CHANGED Viewed

@@ -13,10 +13,9 @@ from dateutil import parser as date_parser
 from urllib.parse import urljoin
 from huggingface_hub import InferenceClient
 from sentence_transformers import SentenceTransformer
-import json
 # Specifying model for efficient embedding + trend analysis
-model = SentenceTransformer('BAAI/bge-small-en-v1.5')
 # --- CONFIGURATION & GLOBALS ---
 CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
@@ -25,12 +24,17 @@ CURRENT_CONGRESS = 119
 CONGRESS_API_BASE = "https://api.congress.gov/v3"
 BASE_DIR = Path(__file__).resolve().parent
 if Path("/data").exists():
     CSV_PATH = Path("/data/policy_tracker.csv")
     DB_FILE = Path("/data/seen_events.json")
 else:
     CSV_PATH = BASE_DIR / "policy_tracker.csv"
     DB_FILE = BASE_DIR / "seen_events.json"
 # --- STEALTH SCRAPER SETUP ---
 scraper = cloudscraper.create_scraper(
@@ -46,8 +50,8 @@ TARGET_KEYWORDS = [
     "foundation model", "autonomous system"
 ]
-def is_relevant(title, summary=""):
-    text_to_check = f"{title} {summary}".lower()
     for keyword in TARGET_KEYWORDS:
         if re.search(rf'\b{re.escape(keyword)}', text_to_check):
             return True
@@ -139,20 +143,26 @@ def analyze_with_ai(title, summary, source, bill_text=""):
         return "Error during AI analysis.", "error"
 # --- CORE UTILITIES ---
-def load_db():
-    if DB_FILE.exists():
-        with open(DB_FILE, "r") as f: return json.load(f)
     return []
 def save_db(db):
-    with open(DB_FILE, "w") as f: json.dump(db[-5000:], f)
 def extract_robust_date(text_blocks):
     date_patterns = [
         r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
         r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
         r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b',
-        r'\b(\d{2})\.(\d{2})\.(\d{4})\b' # Specifically handles Senate MM.DD.YYYY formats
     ]
     for text in text_blocks:
         if not text: continue
@@ -172,7 +182,6 @@ def extract_robust_date(text_blocks):
     return None
 # --- DATA GATHERING ENGINES ---
 def fetch_agency_scraped():
     print("Scanning Federal Agency HTML Pages...")
     results = []
@@ -198,10 +207,7 @@ def fetch_agency_scraped():
                 if len(title) < 15 or not is_relevant(title): continue
                 seen_links.add(full_url)
-                # --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
                 fmt_date = None
-                # 1. Expanded Container Search
                 container = a_tag.find_parent(["article", "tr", "li"])
                 if not container:
                     container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
@@ -209,7 +215,6 @@ def fetch_agency_scraped():
                 if container:
                     fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
-                # 2. Sibling Search
                 if not fmt_date:
                     prev_el = a_tag.find_previous_sibling()
                     if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
@@ -217,7 +222,6 @@ def fetch_agency_scraped():
                     next_el = a_tag.find_next_sibling()
                     if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
-                # 3. Deep DOM Climb Fallback
                 if not fmt_date:
                     current_node = a_tag
                     for _ in range(6):
@@ -228,7 +232,6 @@ def fetch_agency_scraped():
                                 fmt_date = found_date
                                 break
-                # --- THE USER-FACING FLAG ---
                 if not fmt_date:
                     display_time = "⚠️ DATE UNKNOWN"
                     display_title = f"[DATE MISSING] {title}"
@@ -240,7 +243,7 @@ def fetch_agency_scraped():
                 results.append({
                     "source": name,
-                    "type": "Federal/Exec Action",  # Formatted for the Executive action bucket
                     "event_date": fmt_date,
                     "time": display_time,
                     "title": display_title,
@@ -278,19 +281,14 @@ def fetch_congress_scraped():
                 if len(title) < 15 or not is_relevant(title): continue
                 seen_links.add(full_url)
-                # --- UNIVERSAL AGGRESSIVE DATE HUNTING ---
                 fmt_date = None
-                # 1. Expanded Container Search (Catches almost all Gov CMS platforms)
                 container = a_tag.find_parent(["article", "tr", "li"])
                 if not container:
-                    # Added: news, press, card, entry, row, record
                     container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
                 if container:
                     fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
-                # 2. Sibling Search (If the date is floating right next to the link)
                 if not fmt_date:
                     prev_el = a_tag.find_previous_sibling()
                     if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
@@ -298,7 +296,6 @@ def fetch_congress_scraped():
                     next_el = a_tag.find_next_sibling()
                     if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
-                # 3. Deep DOM Climb Fallback
                 if not fmt_date:
                     current_node = a_tag
                     for _ in range(6):
@@ -309,7 +306,6 @@ def fetch_congress_scraped():
                                 fmt_date = found_date
                                 break
-                # --- THE USER-FACING FLAG ---
                 if not fmt_date:
                     display_time = "⚠️ DATE UNKNOWN"
                     display_title = f"[DATE MISSING] {title}"
@@ -334,7 +330,6 @@ def fetch_floor_schedules():
     print("Scanning House & Senate Floor Schedules...")
     results = []
-    # Using your stable, verified endpoints
     SCHEDULE_URLS = {
         "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
         "House Floor Summary": "https://clerk.house.gov/FloorSummary"
@@ -346,27 +341,19 @@ def fetch_floor_schedules():
             if r.status_code != 200: continue
             soup = BeautifulSoup(r.text, "html.parser")
-            # 1. THE ISOLATOR: Only look inside the main content body (ignores footers/menus)
             main_area = soup.find("main") or soup.find(id="main_content") or soup.find(class_=re.compile("content|main", re.I)) or soup
-            # 2. SURGICAL TAGS: Only parse actual paragraphs and lists. NO DIVS!
             for container in main_area.find_all(["p", "li"]):
                 text_content = container.get_text(" ", strip=True)
-                # Tighten the length to avoid tiny buttons and massive unbroken text blocks
                 if len(text_content) < 40 or len(text_content) > 800: continue
                 if not is_relevant(text_content): continue
-                # 3. UPGRADED DUPLICATE BLOCKER: Prevents overlapping HTML chunks
                 if any(res['summary'][:100] in text_content for res in results) or \
                    any(text_content[:100] in res['summary'] for res in results):
                     continue
                 a_tag = container.find("a", href=True)
                 item_link = urljoin(url, a_tag['href']) if a_tag else url
-                # Floor actions are usually today's date
                 fmt_date = extract_robust_date([text_content]) or datetime.now()
                 results.append({
@@ -392,17 +379,13 @@ def fetch_rss(feed_dict, source_type):
             for entry in feed.entries[:15]:
                 title = entry.get("title", "")
                 summary = entry.get("description", "")
                 if not is_relevant(title, summary): continue
-                # Check for standard RSS/Atom timestamps first
                 if hasattr(entry, 'published_parsed') and entry.published_parsed:
                     fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
                 elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
                     fmt_date = datetime(*entry.updated_parsed[:6]).replace(tzinfo=None)
                 else:
-                    # Fallback to text scanning only if metadata is missing entirely
                     fmt_date = extract_robust_date([title, summary]) or datetime.now()
                 results.append({
@@ -416,14 +399,10 @@ def fetch_rss(feed_dict, source_type):
     return results
-# -- APIs ---
 def fetch_federal_register():
     print("Scanning Federal Register API...")
     results = []
     url = "https://www.federalregister.gov/api/v1/documents.json"
-    # We pull a larger batch (50) because we are going to heavily filter them locally
     params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 50}
     try:
@@ -433,12 +412,8 @@ def fetch_federal_register():
                 title = doc.get("title", "No Title")
                 summary = doc.get("abstract", "No summary provided.")
-                # --- THE LOCAL RELEVANCE FILTER ---
-                # Only keep it if the AI keywords are in the Title or Abstract (ignores full-text matches)
                 if not is_relevant(title, str(summary)):
                     continue
-                # Explicitly block noisy SEC stock exchange filings
                 if "Self-Regulatory Organizations" in title:
                     continue
@@ -473,18 +448,20 @@ def fetch_bill_text(congress, bill_type, bill_number):
     return ""
 def fetch_legislation(target=1000):
-    print("Scanning Legislation API...")
     if not CONGRESS_API_KEY: return []
     results = []
     headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
     BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
-    # We split the scan: 500 newest introduced, AND 500 most recently updated
     scan_strategies = ["introducedDate desc", "updateDate desc"]
     for sort_method in scan_strategies:
         print(f"  -> Pulling by {sort_method}...")
-        # target // 2 means we pull 500 for each strategy
         for offset in range(0, target // 2, 250):
             try:
                 r = requests.get(
@@ -497,25 +474,51 @@ def fetch_legislation(target=1000):
                 if not bills: break
                 for b in bills:
-                    if not is_relevant(b.get("title", "")): continue
                     action_data = b.get("latestAction", {})
                     action_date_raw = action_data.get("actionDate") or b.get("updateDate")
                     fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
-                    raw_type = b.get("type", "HR").upper()
-                    proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{b.get('number')}"
                     results.append({
                         "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
-                        "time": "API Verified", "title": f"{b.get('type')}{b.get('number')}: {b.get('title')}",
                         "latest_action": action_data.get("text", "Active"), "link": proper_link,
-                        "summary": "Legislative movement tracked via API.", "bill_type": b.get("type", "HR"), "bill_number": b.get("number")
                     })
-                time.sleep(1.5) # Polite delay
             except Exception as e: break
     return results
 # --- MAIN RUNNER ---
 def run():
@@ -538,25 +541,22 @@ def run():
         if event_id not in db:
             print(f"Triaging new item: {item['title'][:40]}...")
-            # Re-integrated the fetch_bill_text logic so the AI has context!
             bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
             analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
             item["analysis"] = analysis
             item["keywords"] = keywords
-            # --- NEW: GENERATE SEMANTIC EMBEDDING ---
             try:
-                # Don't waste compute embedding error messages
                 if analysis and not analysis.startswith("Error") and not analysis.startswith("AI Triage disabled"):
                     vector = embedder.encode(analysis).tolist()
-                    item["embedding"] = json.dumps(vector) # Stored as JSON string for CSV compatibility
                 else:
                     item["embedding"] = None
             except Exception as e:
                 print(f"  -> Embedding error: {e}")
                 item["embedding"] = None
-            # ----------------------------------------
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
             new_items.append(item)

 from urllib.parse import urljoin
 from huggingface_hub import InferenceClient
 from sentence_transformers import SentenceTransformer
 # Specifying model for efficient embedding + trend analysis
+embedder = SentenceTransformer('BAAI/bge-small-en-v1.5')
 # --- CONFIGURATION & GLOBALS ---
 CONGRESS_API_KEY = os.getenv("CONGRESS_API_KEY")
 CONGRESS_API_BASE = "https://api.congress.gov/v3"
 BASE_DIR = Path(__file__).resolve().parent
+# --- PERSISTENT STORAGE PATHING ---
 if Path("/data").exists():
     CSV_PATH = Path("/data/policy_tracker.csv")
     DB_FILE = Path("/data/seen_events.json")
+    WHITELIST_FILE = Path("/data/tracked_bills.json")
+    SCANNED_FILE = Path("/data/scanned_bills.json")
 else:
     CSV_PATH = BASE_DIR / "policy_tracker.csv"
     DB_FILE = BASE_DIR / "seen_events.json"
+    WHITELIST_FILE = BASE_DIR / "tracked_bills.json"
+    SCANNED_FILE = BASE_DIR / "scanned_bills.json"
 # --- STEALTH SCRAPER SETUP ---
 scraper = cloudscraper.create_scraper(
     "foundation model", "autonomous system"
 ]
+def is_relevant(title, summary="", text=""):
+    text_to_check = f"{title} {summary} {text}".lower()
     for keyword in TARGET_KEYWORDS:
         if re.search(rf'\b{re.escape(keyword)}', text_to_check):
             return True
         return "Error during AI analysis.", "error"
 # --- CORE UTILITIES ---
+def load_list(filepath):
+    if filepath.exists():
+        with open(filepath, "r") as f: return json.load(f)
     return []
+def save_list(data, filepath):
+    with open(filepath, "w") as f: json.dump(data[-5000:], f)
+def load_db():
+    return load_list(DB_FILE)
 def save_db(db):
+    save_list(db, DB_FILE)
 def extract_robust_date(text_blocks):
     date_patterns = [
         r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:st|nd|rd|th)?(?:,)?(?:\s+\d{4})?\b',
         r'\b\d{1,2}[-/]\d{1,2}(?:[-/]\d{2,4})?\b',
         r'\b202\d[-/]\d{1,2}[-/]\d{1,2}\b',
+        r'\b(\d{2})\.(\d{2})\.(\d{4})\b'
     ]
     for text in text_blocks:
         if not text: continue
     return None
 # --- DATA GATHERING ENGINES ---
 def fetch_agency_scraped():
     print("Scanning Federal Agency HTML Pages...")
     results = []
                 if len(title) < 15 or not is_relevant(title): continue
                 seen_links.add(full_url)
                 fmt_date = None
                 container = a_tag.find_parent(["article", "tr", "li"])
                 if not container:
                     container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
                 if container:
                     fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
                 if not fmt_date:
                     prev_el = a_tag.find_previous_sibling()
                     if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
                     next_el = a_tag.find_next_sibling()
                     if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
                 if not fmt_date:
                     current_node = a_tag
                     for _ in range(6):
                                 fmt_date = found_date
                                 break
                 if not fmt_date:
                     display_time = "⚠️ DATE UNKNOWN"
                     display_title = f"[DATE MISSING] {title}"
                 results.append({
                     "source": name,
+                    "type": "Federal/Exec Action",
                     "event_date": fmt_date,
                     "time": display_time,
                     "title": display_title,
                 if len(title) < 15 or not is_relevant(title): continue
                 seen_links.add(full_url)
                 fmt_date = None
                 container = a_tag.find_parent(["article", "tr", "li"])
                 if not container:
                     container = a_tag.find_parent("div", class_=re.compile(r"views-row|item|post|news|press|card|entry|row|record", re.I))
                 if container:
                     fmt_date = extract_robust_date([container.get_text(" ", strip=True)])
                 if not fmt_date:
                     prev_el = a_tag.find_previous_sibling()
                     if prev_el: fmt_date = extract_robust_date([prev_el.get_text(" ", strip=True)])
                     next_el = a_tag.find_next_sibling()
                     if next_el: fmt_date = extract_robust_date([next_el.get_text(" ", strip=True)])
                 if not fmt_date:
                     current_node = a_tag
                     for _ in range(6):
                                 fmt_date = found_date
                                 break
                 if not fmt_date:
                     display_time = "⚠️ DATE UNKNOWN"
                     display_title = f"[DATE MISSING] {title}"
     print("Scanning House & Senate Floor Schedules...")
     results = []
     SCHEDULE_URLS = {
         "Senate Floor Schedule": "https://www.senate.gov/legislative/floor_activity_pail.htm",
         "House Floor Summary": "https://clerk.house.gov/FloorSummary"
             if r.status_code != 200: continue
             soup = BeautifulSoup(r.text, "html.parser")
             main_area = soup.find("main") or soup.find(id="main_content") or soup.find(class_=re.compile("content|main", re.I)) or soup
             for container in main_area.find_all(["p", "li"]):
                 text_content = container.get_text(" ", strip=True)
                 if len(text_content) < 40 or len(text_content) > 800: continue
                 if not is_relevant(text_content): continue
                 if any(res['summary'][:100] in text_content for res in results) or \
                    any(text_content[:100] in res['summary'] for res in results):
                     continue
                 a_tag = container.find("a", href=True)
                 item_link = urljoin(url, a_tag['href']) if a_tag else url
                 fmt_date = extract_robust_date([text_content]) or datetime.now()
                 results.append({
             for entry in feed.entries[:15]:
                 title = entry.get("title", "")
                 summary = entry.get("description", "")
                 if not is_relevant(title, summary): continue
                 if hasattr(entry, 'published_parsed') and entry.published_parsed:
                     fmt_date = datetime(*entry.published_parsed[:6]).replace(tzinfo=None)
                 elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
                     fmt_date = datetime(*entry.updated_parsed[:6]).replace(tzinfo=None)
                 else:
                     fmt_date = extract_robust_date([title, summary]) or datetime.now()
                 results.append({
     return results
 def fetch_federal_register():
     print("Scanning Federal Register API...")
     results = []
     url = "https://www.federalregister.gov/api/v1/documents.json"
     params = {"conditions[term]": "artificial intelligence", "order": "newest", "per_page": 50}
     try:
                 title = doc.get("title", "No Title")
                 summary = doc.get("abstract", "No summary provided.")
                 if not is_relevant(title, str(summary)):
                     continue
                 if "Self-Regulatory Organizations" in title:
                     continue
     return ""
 def fetch_legislation(target=1000):
+    print("Scanning Legislation API with Deep Text & Whitelist...")
     if not CONGRESS_API_KEY: return []
     results = []
     headers = {"X-API-Key": CONGRESS_API_KEY, "Accept": "application/json"}
     BILL_MAP = {"HR": "house-bill", "S": "senate-bill", "HRES": "house-resolution", "SRES": "senate-resolution"}
+    # Load tracking databases
+    tracked_bills = set(load_list(WHITELIST_FILE))
+    scanned_bills = set(load_list(SCANNED_FILE))
     scan_strategies = ["introducedDate desc", "updateDate desc"]
     for sort_method in scan_strategies:
         print(f"  -> Pulling by {sort_method}...")
         for offset in range(0, target // 2, 250):
             try:
                 r = requests.get(
                 if not bills: break
                 for b in bills:
+                    raw_type = b.get("type", "HR").upper()
+                    bill_number = b.get("number")
+                    bill_id = f"{raw_type}{bill_number}"
+                    is_ai_bill = False
+                    # 1. THE WHITELIST CHECK (Catches all admin updates for known AI bills)
+                    if bill_id in tracked_bills:
+                        is_ai_bill = True
+                    else:
+                        # 2. TITLE/SUMMARY CHECK
+                        if is_relevant(b.get("title", "")):
+                            is_ai_bill = True
+                            tracked_bills.add(bill_id)
+                        # 3. DEEP TEXT CHECK (Only for bills we haven't already rejected!)
+                        elif bill_id not in scanned_bills:
+                            bill_text = fetch_bill_text(CURRENT_CONGRESS, raw_type, bill_number)
+                            scanned_bills.add(bill_id) # Mark as scanned so we don't hit the API limit tomorrow
+                            if is_relevant("", "", bill_text):
+                                is_ai_bill = True
+                                tracked_bills.add(bill_id)
+                    if not is_ai_bill:
+                        continue # Skip entirely!
                     action_data = b.get("latestAction", {})
                     action_date_raw = action_data.get("actionDate") or b.get("updateDate")
                     fmt_date = pd.to_datetime(action_date_raw).tz_localize(None).to_pydatetime() if action_date_raw else datetime.now()
+                    proper_link = f"https://www.congress.gov/bill/{CURRENT_CONGRESS}th-congress/{BILL_MAP.get(raw_type, 'house-bill')}/{bill_number}"
                     results.append({
                         "source": "Congress.gov", "type": "Legislation", "event_date": fmt_date,
+                        "time": "API Verified", "title": f"{raw_type}{bill_number}: {b.get('title')}",
                         "latest_action": action_data.get("text", "Active"), "link": proper_link,
+                        "summary": "Legislative movement tracked via API.", "bill_type": raw_type, "bill_number": bill_number
                     })
+                time.sleep(1.5)
             except Exception as e: break
+    # Save the updated Whitelist and Scanned lists to the permanent bucket
+    save_list(list(tracked_bills), WHITELIST_FILE)
+    save_list(list(scanned_bills), SCANNED_FILE)
     return results
 # --- MAIN RUNNER ---
 def run():
         if event_id not in db:
             print(f"Triaging new item: {item['title'][:40]}...")
             bill_text = fetch_bill_text(CURRENT_CONGRESS, item.get("bill_type"), item.get("bill_number")) if item.get("type") == "Legislation" else ""
             analysis, keywords = analyze_with_ai(item["title"], item["summary"], item["source"], bill_text=bill_text)
             item["analysis"] = analysis
             item["keywords"] = keywords
+            # --- SEMANTIC EMBEDDING ---
             try:
                 if analysis and not analysis.startswith("Error") and not analysis.startswith("AI Triage disabled"):
                     vector = embedder.encode(analysis).tolist()
+                    item["embedding"] = json.dumps(vector)
                 else:
                     item["embedding"] = None
             except Exception as e:
                 print(f"  -> Embedding error: {e}")
                 item["embedding"] = None
             item["date_collected"] = datetime.now().strftime("%Y-%m-%d %H:%M")
             new_items.append(item)