import streamlit as st
import pandas as pd
import main
from pathlib import Path
from datetime import datetime
import threading
import time
import os
from huggingface_hub import InferenceClient
import json
import numpy as np
import altair as alt
from sklearn.cluster import AgglomerativeClustering

# Create a global lock for file operations
data_lock = threading.Lock()

# --- PATHING LOGIC ---
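# Prefer /data when it exists (assumed here to be a persistent volume, e.g. a
# Hugging Face Spaces persistent-storage mount); otherwise keep the CSV next to
# this script.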
if Path("/data").exists():
    CSV_PATH = Path("/data/policy_tracker.csv")
else:
    CSV_PATH = Path(__file__).resolve().parent / "policy_tracker.csv"

# --- DATA LOADING ---
def load_data():
    with data_lock:
        if CSV_PATH.exists():
            df = pd.read_csv(CSV_PATH)
            df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')
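            # Normalize to timezone-naive datetimes so comparisons against naive
            # Timestamps elsewhere in the app don't raise TypeError.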
            if df['event_date'].dt.tz is not None:
                df['event_date'] = df['event_date'].dt.tz_localize(None)
                
            df['date_collected'] = pd.to_datetime(df['date_collected'], errors='coerce')
            return df
        return None

# --- RETENTION POLICY ---
def apply_retention_policy(df):
    if df is None or df.empty:
        return df
        
    today = pd.Timestamp.now().tz_localize(None).normalize()
    
    # 1. Retention Filtering
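    # Per-type retention windows: legislation is kept indefinitely, news-like items
    # for 30 days, schedule items for 60 days. Undated rows (NaT) are always kept.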
    leg_df = df[df['type'] == 'Legislation']
    
    news_types = ['News/Media', 'Federal/Exec Action', 'Legislative Office Press Release']
    news_mask = (df['type'].isin(news_types)) & ((df['event_date'] >= today - pd.Timedelta(days=30)) | df['event_date'].isna())
    news_df = df[news_mask]
    
    sched_types = ['Schedule/Hearing', 'Hearing/Markup']
    sched_mask = (df['type'].isin(sched_types)) & ((df['event_date'] >= today - pd.Timedelta(days=60)) | df['event_date'].isna())
    sched_df = df[sched_mask]
    
    other_df = df[~df['type'].isin(['Legislation'] + news_types + sched_types)]
    
    active_df = pd.concat([leg_df, news_df, sched_df, other_df]).drop_duplicates(subset=['link'])
    
    # 2. Pure Chronological Sorting (Newest to Oldest)
    return active_df.sort_values(by="event_date", ascending=False)

# --- BACKGROUND AUTO-SCHEDULER ---
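# st.cache_resource ensures the daemon thread is created once per server process
# rather than on every Streamlit rerun.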
@st.cache_resource
def start_background_scheduler():
    def background_task():
        while True:
            try:
                needs_run = True
                sleep_time = 1 * 3600  # 1 Hour
                with data_lock:
                    if CSV_PATH.exists():
                        df_check = pd.read_csv(CSV_PATH)
                        if 'date_collected' in df_check.columns and not df_check.empty:
                            last_date = pd.to_datetime(df_check['date_collected']).max()
                            if last_date.tzinfo is not None: last_date = last_date.tz_localize(None)
                            
                            hours_since_last = (datetime.now() - last_date).total_seconds() / 3600
                            if hours_since_last < 1:
                                needs_run = False
                                sleep_time = (1 - hours_since_last) * 3600
                if needs_run:
                    with data_lock: main.run()
                time.sleep(sleep_time)
            except Exception:
                # Back off for an hour so a transient failure doesn't spin the loop.
                time.sleep(3600)
                
    thread = threading.Thread(target=background_task, daemon=True)
    thread.start()
    return thread

start_background_scheduler()

# --- UI SETUP & ONBOARDING ---
st.set_page_config(page_title="PolicyPilot Intel", layout="wide")
st.title("AI Policy and News Dashboard - ALPHA Version")

st.markdown("""
Welcome to the **AI Policy and News Dashboard**, an automated platform tracking technology policy developments, legislative movement, and media coverage.

The portal auto-populates with newly scanned data every hour.

This portal's information is divided into two tabs:
* **Radar (Upcoming & Today's News):** Forward-looking, policy-relevant items (when available) plus daily news updates.
* **Archive (Past):** Historical data on past media coverage, actions from executive agencies and the White House, and legislation from the current Congress.

**How to Filter:** Use the **controls in the left sidebar** to filter by category, or use the search bar below.

### Category Legend
To help you scan the chronological timeline quickly, entries are color-coded:
* 🟣 **Legislation** (Bills, Resolutions)
* 🟒 **Federal / Exec Action** (Agencies, White House)
* πŸ”΅ **News / Media** (Press Coverage)
* 🟠 **Schedule / Hearing** (Committee Meetings, Markups)
* πŸ“£ **Legislative Office Press Release** (Lawmaker Announcements)

To generate a high-level summary of the most recent data entries, click the **"Generate Briefing"** button below.
""")

st.markdown("""
---
### Notes for Users
1. **Verify AI Outputs:** This portal leverages summaries generated by language models. All intelligence should be verified using the links to original sources.
2. **Work in Progress:** This is an alpha version. Improvements in coverage and AI logic will be made regularly.
""")

with st.expander("πŸ› οΈ Technical Details & Architecture"):
    st.markdown("""
    * **AI Engine:** Powered by Qwen/Qwen2.5-7B-Instruct.
    * **Data Sources:** Live API integration with Congress.gov, official federal RSS feeds, and master committee schedules.
    * **Filtering:** Articles and bills are strictly filtered against a hardcoded tech-policy dictionary before the AI reads them.
    * **Data Retention:** News expires from the UI after 30 days, Schedules after 60 days. Legislation is retained for the duration of the current Congress.
    """)

st.divider()

# Load Data
df = load_data()
active_df = apply_retention_policy(df)
selected_types = []
# --- SIDEBAR ---
with st.sidebar:
    st.header("System Status")
    
    last_sync_str = "Pending First Run"
    if df is not None and not df.empty and 'date_collected' in df.columns:
        last_sync_dt = pd.to_datetime(df['date_collected']).max()
        last_sync_str = last_sync_dt.strftime('%b %d, %I:%M %p UTC')
    st.info(f"**Auto-Pilot:** Active (1h Cycle)\n\n**Last Sync:** {last_sync_str}")
    
    st.divider()
    st.header("Manual Override")
    
    cooldown_minutes = 30
    can_sweep = True
    time_left = 0
    
    if df is not None and not df.empty and 'date_collected' in df.columns:
        last_sync_dt = pd.to_datetime(df['date_collected']).max()
        if last_sync_dt.tzinfo is not None: last_sync_dt = last_sync_dt.tz_localize(None)
        mins_since_last = (datetime.now() - last_sync_dt).total_seconds() / 60
        
        if mins_since_last < cooldown_minutes:
            can_sweep = False
            time_left = int(cooldown_minutes - mins_since_last)
            
    if can_sweep:
        if st.button("Force Manual Sweep", use_container_width=True):
            with st.spinner("Scanning Datacenters & Gov Servers..."):
                with data_lock: main.run()
                st.success("Sweep Complete!")
                st.rerun()
    else:
        st.button(f"Sweep on Cooldown ({time_left}m left)", disabled=True, use_container_width=True)
        st.caption("πŸ›‘οΈ *To prevent IP bans from government servers, manual sweeps are limited to once every 15 minutes.*")
        
    st.divider()
    if active_df is not None and not active_df.empty:
        available_types = active_df['type'].dropna().unique().tolist()
        selected_types = st.multiselect("Filter by Category:", options=available_types, default=available_types)
        
        st.divider()
        with data_lock: csv_data = active_df.to_csv(index=False).encode('utf-8')
        st.download_button(label="Download Historical Archive (CSV)", data=csv_data, file_name=f"policy_pilot_archive_{pd.Timestamp.now().strftime('%Y-%m-%d')}.csv", mime="text/csv", use_container_width=True)

# --- VISUAL CARD RENDERER (Defined first so tabs can use it) ---
def render_event_cards(display_df):
    if display_df.empty:
        st.info("No items match these filters.")
        return
    
    type_icons = {
        "Legislation": "🟣",
        "Federal/Exec Action": "🟒",
        "News/Media": "πŸ”΅",
        "Schedule/Hearing": "🟠",
        "Hearing/Markup": "🟠",
        "Legislative Office Press Release": "πŸ“£"
    }
    
    color_map = {
        "News/Media": "blue", 
        "Federal/Exec Action": "green", 
        "Legislation": "violet", 
        "Schedule/Hearing": "orange", 
        "Hearing/Markup": "orange",
        "Legislative Office Press Release": "red"
    }
    
    for _, row in display_df.iterrows():
        dt = row['event_date']
        dt_str = dt.strftime('%b %d, %Y') if pd.notnull(dt) else "Date TBD"
        
        card_type = row['type']
        icon = type_icons.get(card_type, "βšͺ")
        color = color_map.get(card_type, "gray")
        source = row.get('source', 'Unknown Source')
        
        raw_title = str(row['title'])
        display_title = raw_title[:75] + ("..." if len(raw_title) > 75 else "")
        
        with st.expander(f"{icon} {dt_str} | {card_type} | {source} | {display_title}"):
            col1, col2 = st.columns([3, 1])
            with col1:
                st.markdown("### Executive Summary")
                st.info(row.get('analysis', 'Analysis pending...'))
                st.caption(f"**Keywords:** `{row.get('keywords', 'N/A')}`")
            with col2:
                st.markdown("### Metadata")
                st.markdown(f"**Category:** :{color}[{card_type}]")
                st.write(f"**Source:** {source}")
                st.write(f"**Action:** {row['latest_action']}")
                st.link_button("View Source", str(row['link']), use_container_width=True)

# --- GLOBAL DATA FILTERING ---
if active_df is not None and not active_df.empty:
    # 1. Apply Sidebar Filters
    if selected_types:
        filtered_df = active_df[active_df['type'].isin(selected_types)]
    else:
        filtered_df = active_df

    # 2. Global Search Bar (Always visible at the top)
    search = st.text_input("πŸ” Search Intel Dashboard (Filters apply to all tabs)...", "")
    if search:
        # regex=False treats the query as a literal string, so characters like '(' don't raise.
        search_cols = ['title', 'summary', 'analysis', 'keywords', 'source']
        mask = filtered_df[search_cols].apply(lambda x: x.astype(str).str.contains(search, case=False, regex=False)).any(axis=1)
        filtered_df = filtered_df[mask]

    # 3. Split data into Radar and Archive
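    # "Radar" covers anything dated yesterday or later (today's news plus upcoming
    # events); older or undated items fall into the Archive.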
    today_ts = pd.Timestamp.now().normalize()
    radar_cutoff = today_ts - pd.Timedelta(days=1)
    
    radar_df = filtered_df[filtered_df['event_date'] >= radar_cutoff].sort_values(by="event_date", ascending=False)
    archive_df = filtered_df[(filtered_df['event_date'] < radar_cutoff) | (filtered_df['event_date'].isna())].sort_values(by="event_date", ascending=False)

    # ---------------------------------------------------------
    # MASTER UI TABS
    # ---------------------------------------------------------
    tab_feed, tab_briefing, tab_trends = st.tabs(["Data Feed", "Daily Summary", "Weekly Trend Analysis (Pilot)"])

    # === TAB 1: THE FEED ===
    with tab_feed:
        feed_tab1, feed_tab2 = st.tabs([f"Radar ({len(radar_df)})", f"Archive ({len(archive_df)})"])
        with feed_tab1: 
            render_event_cards(radar_df)
        with feed_tab2: 
            render_event_cards(archive_df)

    # === TAB 2: EXECUTIVE BRIEFING ===
    with tab_briefing:
        st.subheader("Daily Summary")
        st.info("AI briefing is synthesized from the most recent sources currently visible on your Radar.")
        
        if 'exec_briefing' not in st.session_state:
            st.session_state.exec_briefing = "Click the button below to generate a high-level briefing."
        st.write(st.session_state.exec_briefing)

        if os.getenv("HF_TOKEN"):
            if st.button("Generate Summary", key="btn_briefing"):
                with st.spinner("Qwen2.5-7B-Instruct is providing a summary..."):
                    briefing_items = radar_df.head(10)
                    if briefing_items.empty:
                        briefing_items = filtered_df.head(10)
                    
                    context = "\n".join([f"β€’ SOURCE: {row['source']} | TITLE: {row['title']} | SUMMARY: {row.get('analysis', 'N/A')}" for _, row in briefing_items.iterrows()])
                    
                    prompt = f"""
                    Provide a highly concise, 3-5 paragraph briefing based only on the recent intelligence gathered from the user's Radar tracking system.
                    Do not include outside information. Cite all sources used in the summary with inline citations for easy user verification. Do NOT cite dates inline.
                    Ensure you are synthesizing and summarizing information from across the Radar tracking system, not just the first one or two entries.
                    All summaries should be in an understated tone. Do not infer implications or make recommendations.
                    RADAR INTEL:
                    {context}
                    """
                    
                    messages = [{"role": "user", "content": prompt}]
                    try:
                        briefing_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=os.getenv("HF_TOKEN"))
                        response = briefing_client.chat_completion(messages, max_tokens=500, temperature=0.2)
                        st.session_state.exec_briefing = response.choices[0].message.content
                        st.rerun()
                    except Exception as e: 
                        st.error(f"Briefing failed: {e}")

    # === TAB 3: TREND ANALYSIS ===
    with tab_trends:
        st.subheader("Semantic Trend Map")
        st.markdown("Explore the semantic relationships between this week's AI policy updates. Non-AI related noise is automatically filtered out by the AI classifier.")

        if st.button("Generate Weekly Trend Report", key="btn_trends"):
            with st.spinner("Analyzing semantic data, abstracting macro-trends, and mapping 2D space... (Takes ~30 seconds)"):
                week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
                weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
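                # 'embedding' is assumed to hold JSON-encoded vectors written by the
                # collector; rows without one cannot be clustered.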
                weekly_df = weekly_df.dropna(subset=['embedding'])
                
                if len(weekly_df) < 5:
                    st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to calculate mathematical trends.")
                else:
                    from sklearn.manifold import TSNE
                    matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
                    
                    # FIX 1: Tightened the distance_threshold to 0.45
                    clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.45, metric='cosine', linkage='complete')
                    weekly_df['cluster'] = clusterer.fit_predict(matrix)
                    
                    num_clusters = weekly_df['cluster'].nunique()
                    if num_clusters > 8:
                        clusterer = AgglomerativeClustering(n_clusters=8, metric='cosine', linkage='complete')
                        weekly_df['cluster'] = clusterer.fit_predict(matrix)
                        num_clusters = 8
                        
                    weekly_df['Trend Topic'] = "Uncategorized" 
                    hf_token = os.getenv("HF_TOKEN")
                    
                    if hf_token:
                        ui_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=hf_token)
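                        # Label each cluster with a short macro-topic; off-topic
                        # clusters come back as "REJECT" and are filtered out below.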
                        for i in range(num_clusters):
                            cluster_df = weekly_df[weekly_df['cluster'] == i]
                            sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
                            
                            prompt = f"""
                            You are a highly structured D.C. Tech Policy Taxonomist. Categorize these related article titles into a SINGLE, broad policy or industry bucket.
                            
                            RULES:
                            1. MACRO CATEGORIES ONLY: Use 1 to 3 words maximum. Think of these as slide deck section headers.
                            2. NO HEADLINES: Absolutely NO verbs, NO company names, NO numbers, and NO dollar amounts. 
                               * BAD: "Start-Up Raises $1.3 Billion", "Congress Debates AI Bill"
                               * GOOD: "Venture Capital", "Legislative Action", "AI Infrastructure"
                            3. EXAMPLES OF IDEAL LABELS: "AI Infrastructure", "Export Controls", "AI Safety", "Defense & Security", "Consumer Regulation", "Industry Update".
                            4. FILTER NOISE: If the articles are not about AI, compute, or tech policy, reply EXACTLY with: REJECT.
                            5. FORMAT: Just the category name. No quotes, no extra text.
                            
                            UPDATES:
                            {sample_texts}
                            """
                            
                            messages = [{"role": "user", "content": prompt}]
                            try:
                                response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
                                topic_name = response.choices[0].message.content.strip(' "').upper()
                                if "REJECT" in topic_name:
                                    weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
                                else:
                                    weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name.title()
                            except Exception:
                                weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
                            
                            # Pause between labeling calls to stay under the hosted inference API's rate limits.
                            time.sleep(10)
                        
                        clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
                        
                        if not clean_df.empty:
                            # Run t-SNE mapping
                            clean_matrix = np.vstack(clean_df['embedding'].apply(json.loads).values)
                            if len(clean_df) > 1:
                                # FIX 2: Lowered perplexity and added n_iter for better small-island separation
                                safe_perplexity = max(2, min(8, len(clean_df) // 4))
                                tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random', n_iter=1000)
                                coords = tsne.fit_transform(clean_matrix)
                                clean_df['x'] = coords[:, 0]
                                clean_df['y'] = coords[:, 1]
                            else:
                                clean_df['x'] = 0
                                clean_df['y'] = 0
                                
                            # Save to session state so it doesn't vanish!
                            st.session_state['trend_df'] = clean_df
                            st.session_state['valid_clusters'] = clean_df['cluster'].unique()
                        else:
                            st.warning("All data this week was classified as non-AI noise.")
                    else:
                        st.error("Hugging Face API token not found.")

        # --- Display the Trend Map if it's stored in Memory ---
        if 'trend_df' in st.session_state and not st.session_state['trend_df'].empty:
            clean_df = st.session_state['trend_df']
            
            st.write(f"### Top AI Trends This Week:")
            for cluster_id in st.session_state['valid_clusters']:
                cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
                topic_label = cluster_subset['Trend Topic'].iloc[0]
                st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
            
            st.write("### Semantic Cluster Map")
            chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
                x=alt.X('x', axis=None), 
                y=alt.Y('y', axis=None), 
                color=alt.Color('Trend Topic:N', legend=alt.Legend(title="Macro Trends", orient="bottom")), 
                tooltip=[
                    alt.Tooltip('Trend Topic:N', title='Macro Trend'),
                    alt.Tooltip('title:N', title='Update Title'),
                    alt.Tooltip('source:N', title='Source'),
                    alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y')
                ]
            ).properties(height=400).interactive()
            
            st.altair_chart(chart, use_container_width=True)

else:
    st.warning("Dashboard empty. Run 'Force Manual Sweep' to populate.")