import streamlit as st
import pandas as pd
import main
from pathlib import Path
from datetime import datetime
import threading
import time
import os
from huggingface_hub import InferenceClient
import json
import numpy as np
import altair as alt
from sklearn.cluster import AgglomerativeClustering

# Create a global lock for file operations
data_lock = threading.Lock()

# --- PATHING LOGIC ---
if Path("/data").exists():
    CSV_PATH = Path("/data/policy_tracker.csv")
else:
    CSV_PATH = Path(__file__).resolve().parent / "policy_tracker.csv"

# --- DATA LOADING ---
def load_data():
    with data_lock:
        if CSV_PATH.exists():
            df = pd.read_csv(CSV_PATH)
            df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')
            if df['event_date'].dt.tz is not None:
                df['event_date'] = df['event_date'].dt.tz_localize(None)
            df['date_collected'] = pd.to_datetime(df['date_collected'], errors='coerce')
            return df
    return None

# --- RETENTION POLICY ---
def apply_retention_policy(df):
    if df is None or df.empty:
        return df
    today = pd.Timestamp.now().tz_localize(None).normalize()

    # 1. Retention Filtering
    leg_df = df[df['type'] == 'Legislation']

    news_types = ['News/Media', 'Federal/Exec Action', 'Legislative Office Press Release']
    news_mask = (df['type'].isin(news_types)) & ((df['event_date'] >= today - pd.Timedelta(days=30)) | df['event_date'].isna())
    news_df = df[news_mask]

    sched_types = ['Schedule/Hearing', 'Hearing/Markup']
    sched_mask = (df['type'].isin(sched_types)) & ((df['event_date'] >= today - pd.Timedelta(days=60)) | df['event_date'].isna())
    sched_df = df[sched_mask]

    other_df = df[~df['type'].isin(['Legislation'] + news_types + sched_types)]

    active_df = pd.concat([leg_df, news_df, sched_df, other_df]).drop_duplicates(subset=['link'])

    # 2. Pure Chronological Sorting (Newest to Oldest)
    return active_df.sort_values(by="event_date", ascending=False)
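
# NOTE: illustrative schema sketch only (not enforced anywhere). The dashboard reads
# policy_tracker.csv, which is refreshed by main.run(); the column list below is
# inferred from the column accesses in this file and documents the assumed layout.
EXPECTED_COLUMNS = [
    "title", "summary", "analysis", "keywords", "source", "type",
    "event_date", "date_collected", "latest_action", "link", "embedding",
]
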
# --- BACKGROUND AUTO-SCHEDULER ---
@st.cache_resource
def start_background_scheduler():
    def background_task():
        while True:
            try:
                needs_run = True
                sleep_time = 1 * 3600  # 1 Hour
                with data_lock:
                    if CSV_PATH.exists():
                        df_check = pd.read_csv(CSV_PATH)
                        if 'date_collected' in df_check.columns and not df_check.empty:
                            last_date = pd.to_datetime(df_check['date_collected']).max()
                            if last_date.tzinfo is not None:
                                last_date = last_date.tz_localize(None)
                            hours_since_last = (datetime.now() - last_date).total_seconds() / 3600
                            if hours_since_last < 1:
                                needs_run = False
                                sleep_time = (1 - hours_since_last) * 3600
                if needs_run:
                    with data_lock:
                        main.run()
                time.sleep(sleep_time)
            except Exception:
                # Back off for an hour and retry on the next cycle
                time.sleep(3600)

    thread = threading.Thread(target=background_task, daemon=True)
    thread.start()
    return thread

start_background_scheduler()

# --- UI SETUP & ONBOARDING ---
st.set_page_config(page_title="PolicyPilot Intel", layout="wide")
st.title("AI Policy and News Dashboard - ALPHA Version")

st.markdown("""
Welcome to the **AI Policy and News Dashboard**, an automated platform tracking technology policy developments, legislative movement, and media coverage. The portal auto-populates with newly scanned data every hour.

This portal's information is organized into the following views:
* **Radar (Upcoming & Today's News):** Forward-looking policy-relevant data when it is available, plus daily news updates.
* **Archive (Past):** Historical data on past media coverage, actions from executive agencies and the White House, and legislation from the current Congress.
* **How to Filter:** Use the **controls in the left sidebar** to filter by specific data categories, or use the search bar below.

### Category Legend
To help you scan the chronological timeline quickly, entries are color-coded:
* 🟣 **Legislation** (Bills, Resolutions)
* 🟢 **Federal / Exec Action** (Agencies, White House)
* 🔵 **News / Media** (Press Coverage)
* 🟠 **Schedule / Hearing** (Committee Meetings, Markups)
* 📣 **Legislative Office Press Release** (Lawmaker Announcements)

To generate a high-level summary of the most recent data entries, click the **"Generate Summary"** button in the **Daily Summary** tab.
""")

st.markdown("""
---
### Notes for Users
1. **Verify AI Outputs:** This portal leverages summaries generated by language models. All intelligence should be verified using the links to original sources.
2. **Work in Progress:** This is an alpha version. Improvements in coverage and AI logic will be made regularly.
""")

with st.expander("🛠️ Technical Details & Architecture"):
    st.markdown("""
    * **AI Engine:** Powered by Qwen/Qwen2.5-7B-Instruct.
    * **Data Sources:** Live API integration with Congress.gov, official federal RSS feeds, and master committee schedules.
    * **Filtering:** Articles and bills are strictly filtered against a hardcoded tech-policy dictionary before the AI reads them.
    * **Data Retention:** News expires from the UI after 30 days, Schedules after 60 days. Legislation is retained for the duration of the current Congress.
    """)

st.divider()

# Load Data
df = load_data()
active_df = apply_retention_policy(df)
selected_types = []

# --- SIDEBAR ---
with st.sidebar:
    st.header("System Status")
    last_sync_str = "Pending First Run"
    if df is not None and not df.empty and 'date_collected' in df.columns:
        last_sync_dt = pd.to_datetime(df['date_collected']).max()
        last_sync_str = last_sync_dt.strftime('%b %d, %I:%M %p UTC')
    st.info(f"**Auto-Pilot:** Active (1h Cycle)\n\n**Last Sync:** {last_sync_str}")

    st.divider()
    st.header("Manual Override")

    cooldown_minutes = 30
    can_sweep = True
    time_left = 0
    if df is not None and not df.empty and 'date_collected' in df.columns:
        last_sync_dt = pd.to_datetime(df['date_collected']).max()
        if last_sync_dt.tzinfo is not None:
            last_sync_dt = last_sync_dt.tz_localize(None)
        mins_since_last = (datetime.now() - last_sync_dt).total_seconds() / 60
        if mins_since_last < cooldown_minutes:
            can_sweep = False
            time_left = int(cooldown_minutes - mins_since_last)

    if can_sweep:
        if st.button("Force Manual Sweep", use_container_width=True):
            with st.spinner("Scanning Datacenters & Gov Servers..."):
                with data_lock:
                    main.run()
            st.success("Sweep Complete!")
            st.rerun()
    else:
        st.button(f"Sweep on Cooldown ({time_left}m left)", disabled=True, use_container_width=True)
    st.caption("🛡️ *To prevent IP bans from government servers, manual sweeps are limited to once every 30 minutes.*")

    st.divider()

    if active_df is not None and not active_df.empty:
        available_types = active_df['type'].dropna().unique().tolist()
        selected_types = st.multiselect("Filter by Category:", options=available_types, default=available_types)
        st.divider()
        with data_lock:
            csv_data = active_df.to_csv(index=False).encode('utf-8')
        st.download_button(label="Download Historical Archive (CSV)", data=csv_data, file_name=f"policy_pilot_archive_{pd.Timestamp.now().strftime('%Y-%m-%d')}.csv", mime="text/csv", use_container_width=True)
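
# NOTE: the tz-stripping pattern above is repeated in load_data(), the background
# scheduler, and the sidebar cooldown check. The helper below is an illustrative
# sketch of how that could be centralized; it is not called anywhere in this file.
def as_naive_timestamp(ts):
    """Drop timezone info from a Timestamp so it can be compared with datetime.now()."""
    if ts.tzinfo is not None:
        return ts.tz_localize(None)
    return ts
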
# --- VISUAL CARD RENDERER (Defined first so tabs can use it) ---
def render_event_cards(display_df):
    if display_df.empty:
        st.info("No items match these filters.")
        return

    type_icons = {
        "Legislation": "🟣",
        "Federal/Exec Action": "🟢",
        "News/Media": "🔵",
        "Schedule/Hearing": "🟠",
        "Hearing/Markup": "🟠",
        "Legislative Office Press Release": "📣"
    }
    color_map = {
        "News/Media": "blue",
        "Federal/Exec Action": "green",
        "Legislation": "violet",
        "Schedule/Hearing": "orange",
        "Hearing/Markup": "orange",
        "Legislative Office Press Release": "red"
    }

    for _, row in display_df.iterrows():
        dt = row['event_date']
        dt_str = dt.strftime('%b %d, %Y') if pd.notnull(dt) else "Date TBD"
        card_type = row['type']
        icon = type_icons.get(card_type, "⚪")
        color = color_map.get(card_type, "gray")
        source = row.get('source', 'Unknown Source')
        raw_title = str(row['title'])
        display_title = raw_title[:75] + ("..." if len(raw_title) > 75 else "")

        with st.expander(f"{icon} {dt_str} | {card_type} | {source} | {display_title}"):
            col1, col2 = st.columns([3, 1])
            with col1:
                st.markdown("### Executive Summary")
                st.info(row.get('analysis', 'Analysis pending...'))
                st.caption(f"**Keywords:** `{row.get('keywords', 'N/A')}`")
            with col2:
                st.markdown("### Metadata")
                st.markdown(f"**Category:** :{color}[{card_type}]")
                st.write(f"**Source:** {source}")
                st.write(f"**Action:** {row['latest_action']}")
                st.link_button("View Source", str(row['link']), use_container_width=True)

# --- GLOBAL DATA FILTERING ---
if active_df is not None and not active_df.empty:
    # 1. Apply Sidebar Filters
    if selected_types:
        filtered_df = active_df[active_df['type'].isin(selected_types)]
    else:
        filtered_df = active_df

    # 2. Global Search Bar (Always visible at the top)
    search = st.text_input("🔍 Search Intel Dashboard (Filters apply to all tabs)...", "")
    if search:
        mask = filtered_df[['title', 'summary', 'analysis', 'keywords', 'source']].apply(lambda x: x.astype(str).str.contains(search, case=False)).any(axis=1)
        filtered_df = filtered_df[mask]

    # 3. Split data into Radar and Archive
    today_ts = pd.Timestamp.now().normalize()
    radar_cutoff = today_ts - pd.Timedelta(days=1)
    radar_df = filtered_df[filtered_df['event_date'] >= radar_cutoff].sort_values(by="event_date", ascending=False)
    archive_df = filtered_df[(filtered_df['event_date'] < radar_cutoff) | (filtered_df['event_date'].isna())].sort_values(by="event_date", ascending=False)

    # ---------------------------------------------------------
    # MASTER UI TABS
    # ---------------------------------------------------------
    tab_feed, tab_briefing, tab_trends = st.tabs(["Data Feed", "Daily Summary", "Weekly Trend Analysis (Pilot)"])

    # === TAB 1: THE FEED ===
    with tab_feed:
        feed_tab1, feed_tab2 = st.tabs([f"Radar ({len(radar_df)})", f"Archive ({len(archive_df)})"])
        with feed_tab1:
            render_event_cards(radar_df)
        with feed_tab2:
            render_event_cards(archive_df)

    # === TAB 2: EXECUTIVE BRIEFING ===
    with tab_briefing:
        st.subheader("Daily Summary")
        st.info("AI briefing is synthesized from the most recent sources currently visible on your Radar.")

        if 'exec_briefing' not in st.session_state:
            st.session_state.exec_briefing = "Click the button below to generate a high-level briefing."
        st.write(st.session_state.exec_briefing)

        if os.getenv("HF_TOKEN"):
            if st.button("Generate Summary", key="btn_briefing"):
                with st.spinner("Qwen2.5-7B-Instruct is providing a summary..."):
                    briefing_items = radar_df.head(10)
                    if briefing_items.empty:
                        briefing_items = filtered_df.head(10)

                    context = "\n".join([f"• SOURCE: {row['source']} | TITLE: {row['title']} | SUMMARY: {row.get('analysis', 'N/A')}" for _, row in briefing_items.iterrows()])

                    prompt = f"""
Provide a highly concise, 3-5 paragraph briefing based only on the recent intelligence gathered from the user's Radar tracking system.
Do not include outside information.
Cite all sources used in the summary using in-line citations for easy user verification.
Do NOT cite dates in line.
Ensure you are synthesizing and summarizing information from across the Radar tracking system, not just the first 1 or 2 entries.
All summaries should be in an understated tone. Do not infer implications or make recommendations.

RADAR INTEL:
{context}
"""
                    messages = [{"role": "user", "content": prompt}]
                    try:
                        briefing_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=os.getenv("HF_TOKEN"))
                        response = briefing_client.chat_completion(messages, max_tokens=500, temperature=0.2)
                        st.session_state.exec_briefing = response.choices[0].message.content
                        st.rerun()
                    except Exception as e:
                        st.error(f"Briefing failed: {e}")
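
    # The trend tab below runs a three-stage pipeline over the last 7 days of data:
    #   1. AgglomerativeClustering (cosine distance) groups the stored embeddings,
    #   2. the LLM labels each cluster with a macro topic or rejects it as non-AI noise,
    #   3. t-SNE projects the surviving rows to 2D for the Altair scatter plot.
    # The 'embedding' column is assumed to hold a JSON-encoded list of floats written
    # by main.run() (e.g. "[0.12, -0.03, ...]"); the np.vstack(... json.loads ...)
    # calls rely on every row having a vector of the same length.
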
    # === TAB 3: TREND ANALYSIS ===
    with tab_trends:
        st.subheader("Semantic Trend Map")
        st.markdown("Explore the semantic relationships between this week's AI policy updates. Non-AI related noise is automatically filtered out by the AI classifier.")

        if st.button("Generate Weekly Trend Report", key="btn_trends"):
            with st.spinner("Analyzing semantic data, abstracting macro-trends, and mapping 2D space... (Takes ~30 seconds)"):
                week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
                weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
                weekly_df = weekly_df.dropna(subset=['embedding'])

                if len(weekly_df) < 5:
                    st.warning(f"Only {len(weekly_df)} embedded updates found this week. At least 5 are required to compute trends.")
                else:
                    from sklearn.manifold import TSNE
                    matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)

                    # FIX 1: Tightened the distance_threshold to 0.45
                    clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.45, metric='cosine', linkage='complete')
                    weekly_df['cluster'] = clusterer.fit_predict(matrix)
                    num_clusters = weekly_df['cluster'].nunique()

                    # Cap the topic count at 8 by re-clustering with a fixed cluster count
                    if num_clusters > 8:
                        clusterer = AgglomerativeClustering(n_clusters=8, metric='cosine', linkage='complete')
                        weekly_df['cluster'] = clusterer.fit_predict(matrix)
                        num_clusters = 8

                    weekly_df['Trend Topic'] = "Uncategorized"
                    hf_token = os.getenv("HF_TOKEN")

                    if hf_token:
                        ui_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=hf_token)
                        for i in range(num_clusters):
                            cluster_df = weekly_df[weekly_df['cluster'] == i]
                            sample_texts = "\n".join(cluster_df['title'].head(8).tolist())

                            prompt = f"""
You are a highly structured D.C. Tech Policy Taxonomist. Categorize these related article titles into a SINGLE, broad policy or industry bucket.
RULES:
1. MACRO CATEGORIES ONLY: Use 1 to 3 words maximum. Think of these as slide deck section headers.
2. NO HEADLINES: Absolutely NO verbs, NO company names, NO numbers, and NO dollar amounts.
   * BAD: "Start-Up Raises $1.3 Billion", "Congress Debates AI Bill"
   * GOOD: "Venture Capital", "Legislative Action", "AI Infrastructure"
3. EXAMPLES OF IDEAL LABELS: "AI Infrastructure", "Export Controls", "AI Safety", "Defense & Security", "Consumer Regulation", "Industry Update".
4. FILTER NOISE: If the articles are not about AI, compute, or tech policy, reply EXACTLY with: REJECT.
5. FORMAT: Just the category name. No quotes, no extra text.

UPDATES:
{sample_texts}
"""
                            messages = [{"role": "user", "content": prompt}]
                            try:
                                response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
                                topic_name = response.choices[0].message.content.strip(' "').upper()
                                if "REJECT" in topic_name:
                                    weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
                                else:
                                    weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name.title()
                            except Exception:
                                weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
                            # Space out Inference API calls between clusters
                            time.sleep(10)

                        clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()

                        if not clean_df.empty:
                            # Run t-SNE mapping
                            clean_matrix = np.vstack(clean_df['embedding'].apply(json.loads).values)
                            if len(clean_df) > 1:
                                # FIX 2: Lowered perplexity and added n_iter for better small-island separation
                                # (note: n_iter was renamed to max_iter in newer scikit-learn releases)
                                safe_perplexity = max(2, min(8, len(clean_df) // 4))
                                tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random', n_iter=1000)
                                coords = tsne.fit_transform(clean_matrix)
                                clean_df['x'] = coords[:, 0]
                                clean_df['y'] = coords[:, 1]
                            else:
                                clean_df['x'] = 0
                                clean_df['y'] = 0

                            # Save to session state so the results don't vanish on rerun
                            st.session_state['trend_df'] = clean_df
                            st.session_state['valid_clusters'] = clean_df['cluster'].unique()
                        else:
                            st.warning("All data this week was classified as non-AI noise.")
                    else:
                        st.error("Hugging Face API token not found.")
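
        # The block below re-renders the saved map from st.session_state on every
        # rerun, so the chart persists across tab switches; a new "Generate Weekly
        # Trend Report" run simply overwrites 'trend_df' and 'valid_clusters'.
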
        # --- Display the Trend Map if it's stored in Memory ---
        if 'trend_df' in st.session_state and not st.session_state['trend_df'].empty:
            clean_df = st.session_state['trend_df']

            st.write("### Top AI Trends This Week:")
            for cluster_id in st.session_state['valid_clusters']:
                cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
                topic_label = cluster_subset['Trend Topic'].iloc[0]
                st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")

            st.write("### Semantic Cluster Map")
            chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
                x=alt.X('x', axis=None),
                y=alt.Y('y', axis=None),
                color=alt.Color('Trend Topic:N', legend=alt.Legend(title="Macro Trends", orient="bottom")),
                tooltip=[
                    alt.Tooltip('Trend Topic:N', title='Macro Trend'),
                    alt.Tooltip('title:N', title='Update Title'),
                    alt.Tooltip('source:N', title='Source'),
                    alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y')
                ]
            ).properties(height=400).interactive()
            st.altair_chart(chart, use_container_width=True)
else:
    st.warning("Dashboard empty. Run 'Force Manual Sweep' to populate.")