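"""Streamlit front-end for the PolicyPilot Intel dashboard.

Loads the scraped policy tracker CSV, applies retention rules, runs an
hourly background refresh, and renders the Radar/Archive feed plus
AI-generated briefings and a weekly semantic trend map.
"""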
import streamlit as st
import pandas as pd
import main
from pathlib import Path
from datetime import datetime
import threading
import time
import os
from huggingface_hub import InferenceClient
import json
import numpy as np
import altair as alt
from sklearn.cluster import AgglomerativeClustering

# Create a global lock for file operations
data_lock = threading.Lock()

# --- PATHING LOGIC ---
# Prefer the persistent /data volume when it exists; otherwise fall back
# to a CSV next to this script.
if Path("/data").exists():
    CSV_PATH = Path("/data/policy_tracker.csv")
else:
    CSV_PATH = Path(__file__).resolve().parent / "policy_tracker.csv"

# --- DATA LOADING ---
def load_data():
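    """Read the tracker CSV under the global file lock.

    Returns a DataFrame with 'event_date' and 'date_collected' parsed as
    tz-naive datetimes, or None if no CSV exists yet.
    """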
    with data_lock:
        if CSV_PATH.exists():
            df = pd.read_csv(CSV_PATH)
            df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')
            if df['event_date'].dt.tz is not None:
                df['event_date'] = df['event_date'].dt.tz_localize(None)
            df['date_collected'] = pd.to_datetime(df['date_collected'], errors='coerce')
            return df
    return None

# --- RETENTION POLICY ---
def apply_retention_policy(df):
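    """Filter out stale rows and return the remainder, newest first.

    Legislation is kept indefinitely; news-type items expire after 30 days,
    schedule-type items after 60. Rows with no event date are kept.
    """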
    if df is None or df.empty:
        return df
    today = pd.Timestamp.now().tz_localize(None).normalize()
    # 1. Retention Filtering
    leg_df = df[df['type'] == 'Legislation']
    news_types = ['News/Media', 'Federal/Exec Action', 'Legislative Office Press Release']
    news_mask = (df['type'].isin(news_types)) & ((df['event_date'] >= today - pd.Timedelta(days=30)) | df['event_date'].isna())
    news_df = df[news_mask]
    sched_types = ['Schedule/Hearing', 'Hearing/Markup']
    sched_mask = (df['type'].isin(sched_types)) & ((df['event_date'] >= today - pd.Timedelta(days=60)) | df['event_date'].isna())
    sched_df = df[sched_mask]
    other_df = df[~df['type'].isin(['Legislation'] + news_types + sched_types)]
    active_df = pd.concat([leg_df, news_df, sched_df, other_df]).drop_duplicates(subset=['link'])
    # 2. Pure chronological sorting (newest to oldest)
    return active_df.sort_values(by="event_date", ascending=False)
# --- BACKGROUND AUTO-SCHEDULER ---
def start_background_scheduler():
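    """Launch a daemon thread that runs main.run() roughly once per hour.

    If the newest 'date_collected' in the CSV is under an hour old, the
    thread sleeps just long enough to reach the next hourly mark instead.
    """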
    def background_task():
        while True:
            try:
                needs_run = True
                sleep_time = 1 * 3600  # 1 hour
                with data_lock:
                    if CSV_PATH.exists():
                        df_check = pd.read_csv(CSV_PATH)
                        if 'date_collected' in df_check.columns and not df_check.empty:
                            last_date = pd.to_datetime(df_check['date_collected']).max()
                            if last_date.tzinfo is not None:
                                last_date = last_date.tz_localize(None)
                            hours_since_last = (datetime.now() - last_date).total_seconds() / 3600
                            if hours_since_last < 1:
                                needs_run = False
                                sleep_time = (1 - hours_since_last) * 3600
                if needs_run:
                    with data_lock:
                        main.run()
                time.sleep(sleep_time)
            except Exception:
                # Back off for an hour on any failure so the thread never dies
                time.sleep(3600)

    thread = threading.Thread(target=background_task, daemon=True)
    thread.start()
    return thread

start_background_scheduler()
# --- UI SETUP & ONBOARDING ---
st.set_page_config(page_title="PolicyPilot Intel", layout="wide")
st.title("AI Policy and News Dashboard - ALPHA Version")
st.markdown("""
Welcome to the **AI Policy and News Dashboard**, an automated platform tracking technology policy developments, legislative movement, and media coverage.

The portal auto-populates with newly scanned data every hour.

This portal's information is divided into two tabs:
* **Radar (Upcoming & Today's News):** Forward-looking, policy-relevant items (when available) plus daily news updates.
* **Archive (Past):** Historical data on past media coverage, actions from executive agencies and the White House, and legislation from the current Congress.
* **How to Filter:** Use the **controls in the left sidebar** to filter by data category, or use the search bar below.

### Category Legend
To help you scan the chronological timeline quickly, entries are color-coded:
* 🟣 **Legislation** (Bills, Resolutions)
* 🟢 **Federal / Exec Action** (Agencies, White House)
* 🔵 **News / Media** (Press Coverage)
* 🟠 **Schedule / Hearing** (Committee Meetings, Markups)
* 📣 **Legislative Office Press Release** (Lawmaker Announcements)

To generate a high-level summary of the most recent data entries, click the **"Generate Summary"** button in the Daily Summary tab.
""")
| st.markdown(""" | |
| --- | |
| ### Notes for Users | |
| 1. **Verify AI Outputs:** This portal leverages summaries generated by language models. All intelligence should be verified using the links to original sources. | |
| 2. **Work in Progress:** This is an alpha version. Improvements in coverage and AI logic will be made regularly. | |
| """) | |
| with st.expander("🛠️ Technical Details & Architecture"): | |
| st.markdown(""" | |
| * **AI Engine:** Powered by Qwen/Qwen2.5-7B-Instruct. | |
| * **Data Sources:** Live API integration with Congress.gov, official federal RSS feeds, and master committee schedules. | |
| * **Filtering:** Articles and bills are strictly filtered against a hardcoded tech-policy dictionary before the AI reads them. | |
| * **Data Retention:** News expires from the UI after 30 days, Schedules after 60 days. Legislation is retained for the duration of the current Congress. | |
| """) | |
| st.divider() | |
| # Load Data | |
| df = load_data() | |
| active_df = apply_retention_policy(df) | |
| selected_types = [] | |
# --- SIDEBAR ---
with st.sidebar:
    st.header("System Status")
    last_sync_str = "Pending First Run"
    if df is not None and not df.empty and 'date_collected' in df.columns:
        last_sync_dt = pd.to_datetime(df['date_collected']).max()
        last_sync_str = last_sync_dt.strftime('%b %d, %I:%M %p UTC')
    st.info(f"**Auto-Pilot:** Active (1h Cycle)\n\n**Last Sync:** {last_sync_str}")
    st.divider()

    st.header("Manual Override")
    cooldown_minutes = 30
    can_sweep = True
    time_left = 0
    if df is not None and not df.empty and 'date_collected' in df.columns:
        last_sync_dt = pd.to_datetime(df['date_collected']).max()
        if last_sync_dt.tzinfo is not None:
            last_sync_dt = last_sync_dt.tz_localize(None)
        mins_since_last = (datetime.now() - last_sync_dt).total_seconds() / 60
        if mins_since_last < cooldown_minutes:
            can_sweep = False
            time_left = int(cooldown_minutes - mins_since_last)
    if can_sweep:
        if st.button("Force Manual Sweep", use_container_width=True):
            with st.spinner("Scanning Datacenters & Gov Servers..."):
                with data_lock:
                    main.run()
            st.success("Sweep Complete!")
            st.rerun()
    else:
        st.button(f"Sweep on Cooldown ({time_left}m left)", disabled=True, use_container_width=True)
    st.caption("🛡️ *To prevent IP bans from government servers, manual sweeps are limited to once every 30 minutes.*")
    # ---------------------------------------------------------
    # TEMPORARY MIGRATION BUTTON
    # ---------------------------------------------------------
    st.divider()
    st.header("Database Maintenance")
    if st.button("🔄 Upgrade Database to BGE-Small", type="primary", use_container_width=True):
        with st.spinner("Re-calculating all historical data... This takes about 30 seconds."):
            # Lazy import: sentence_transformers is only needed for this one-off migration
            from sentence_transformers import SentenceTransformer
            with data_lock:
                if CSV_PATH.exists():
                    # 1. Load the existing archive
                    df_upgrade = pd.read_csv(CSV_PATH)
                    # 2. Load the new embedding model
                    st.toast("Downloading new BGE model...")
                    model = SentenceTransformer('BAAI/bge-small-en-v1.5')
                    # 3. Combine the text exactly the way the scraper does (title + summary)
                    texts_to_embed = df_upgrade['title'].fillna('') + " - " + df_upgrade.get('summary', pd.Series([''] * len(df_upgrade))).fillna('')
                    # 4. Generate the new embeddings
                    st.toast("Re-embedding historical articles...")
                    new_embeddings = model.encode(texts_to_embed.tolist())
                    # 5. Overwrite the old embedding column with JSON-encoded lists
                    df_upgrade['embedding'] = [json.dumps(emb.tolist()) for emb in new_embeddings]
                    # 6. Save back to the persistent bucket
                    df_upgrade.to_csv(CSV_PATH, index=False)
            st.success("Migration Complete! Your historical data is now BGE-compatible.")
            time.sleep(2)
            st.rerun()
    # ---------------------------------------------------------
    st.divider()
    if active_df is not None and not active_df.empty:
        available_types = active_df['type'].dropna().unique().tolist()
        selected_types = st.multiselect("Filter by Category:", options=available_types, default=available_types)
        st.divider()
        with data_lock:
            csv_data = active_df.to_csv(index=False).encode('utf-8')
        st.download_button(label="Download Historical Archive (CSV)", data=csv_data, file_name=f"policy_pilot_archive_{pd.Timestamp.now().strftime('%Y-%m-%d')}.csv", mime="text/csv", use_container_width=True)

# --- VISUAL CARD RENDERER (Defined first so tabs can use it) ---
def render_event_cards(display_df):
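    """Render each row of display_df as a color-coded expander card.

    Expects at minimum 'event_date', 'type', 'title', and 'link' columns;
    'source', 'analysis', 'keywords', and 'latest_action' are optional.
    """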
    if display_df.empty:
        st.info("No items match these filters.")
        return
    type_icons = {
        "Legislation": "🟣",
        "Federal/Exec Action": "🟢",
        "News/Media": "🔵",
        "Schedule/Hearing": "🟠",
        "Hearing/Markup": "🟠",
        "Legislative Office Press Release": "📣"
    }
    color_map = {
        "News/Media": "blue",
        "Federal/Exec Action": "green",
        "Legislation": "violet",
        "Schedule/Hearing": "orange",
        "Hearing/Markup": "orange",
        "Legislative Office Press Release": "red"
    }
    for _, row in display_df.iterrows():
        dt = row['event_date']
        dt_str = dt.strftime('%b %d, %Y') if pd.notnull(dt) else "Date TBD"
        card_type = row['type']
        icon = type_icons.get(card_type, "⚪")
        color = color_map.get(card_type, "gray")
        source = row.get('source', 'Unknown Source')
        raw_title = str(row['title'])
        display_title = raw_title[:75] + ("..." if len(raw_title) > 75 else "")
        with st.expander(f"{icon} {dt_str} | {card_type} | {source} | {display_title}"):
            col1, col2 = st.columns([3, 1])
            with col1:
                st.markdown("### Executive Summary")
                st.info(row.get('analysis', 'Analysis pending...'))
                st.caption(f"**Keywords:** `{row.get('keywords', 'N/A')}`")
            with col2:
                st.markdown("### Metadata")
                st.markdown(f"**Category:** :{color}[{card_type}]")
                st.write(f"**Source:** {source}")
                st.write(f"**Action:** {row.get('latest_action', 'N/A')}")
                st.link_button("View Source", str(row['link']), use_container_width=True)
# --- GLOBAL DATA FILTERING ---
if active_df is not None and not active_df.empty:
    # 1. Apply Sidebar Filters
    if selected_types:
        filtered_df = active_df[active_df['type'].isin(selected_types)]
    else:
        filtered_df = active_df

    # 2. Global Search Bar (Always visible at the top)
    search = st.text_input("🔍 Search Intel Dashboard (Filters apply to all tabs)...", "")
    if search:
        # Plain substring match over whichever text columns actually exist
        search_cols = [c for c in ['title', 'summary', 'analysis', 'keywords', 'source'] if c in filtered_df.columns]
        mask = filtered_df[search_cols].apply(lambda x: x.astype(str).str.contains(search, case=False, regex=False)).any(axis=1)
        filtered_df = filtered_df[mask]
    # 3. Split data into Radar (yesterday onward) and Archive (older or undated)
    today_ts = pd.Timestamp.now().normalize()
    radar_cutoff = today_ts - pd.Timedelta(days=1)
    radar_df = filtered_df[filtered_df['event_date'] >= radar_cutoff].sort_values(by="event_date", ascending=False)
    archive_df = filtered_df[(filtered_df['event_date'] < radar_cutoff) | (filtered_df['event_date'].isna())].sort_values(by="event_date", ascending=False)

    # ---------------------------------------------------------
    # MASTER UI TABS
    # ---------------------------------------------------------
    tab_feed, tab_briefing, tab_trends = st.tabs(["Data Feed", "Daily Summary", "Weekly Trend Analysis"])

    # === TAB 1: THE FEED ===
    with tab_feed:
        feed_tab1, feed_tab2 = st.tabs([f"Radar ({len(radar_df)})", f"Archive ({len(archive_df)})"])
        with feed_tab1:
            render_event_cards(radar_df)
        with feed_tab2:
            render_event_cards(archive_df)
    # === TAB 2: EXECUTIVE BRIEFING ===
    with tab_briefing:
        st.subheader("Daily Summary")
        st.info("AI briefing is synthesized from the most recent sources currently visible on your Radar.")
        if 'exec_briefing' not in st.session_state:
            st.session_state.exec_briefing = "Click the button below to generate a high-level briefing."
        st.write(st.session_state.exec_briefing)
        if os.getenv("HF_TOKEN"):
            if st.button("Generate Summary", key="btn_briefing"):
                with st.spinner("Qwen2.5-7B-Instruct is drafting a summary..."):
                    # Summarize the Radar view; fall back to the full filtered feed if it is empty
                    briefing_items = radar_df.head(10)
                    if briefing_items.empty:
                        briefing_items = filtered_df.head(10)
                    context = "\n".join([f"• SOURCE: {row['source']} | TITLE: {row['title']} | SUMMARY: {row.get('analysis', 'N/A')}" for _, row in briefing_items.iterrows()])
                    prompt = f"""
                    Provide a highly concise, 3-5 paragraph briefing based only on the recent intelligence gathered from the user's Radar tracking system.
                    Do not include outside information. Cite all sources used in the summary using in-line citations for easy user verification. Do NOT cite dates in line.
                    Ensure you are synthesizing and summarizing information from across the Radar tracking system, not just the first 1 or 2 entries.
                    All summaries should be in an understated tone. Do not infer implications or make recommendations.

                    RADAR INTEL:
                    {context}
                    """
                    messages = [{"role": "user", "content": prompt}]
                    try:
                        briefing_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=os.getenv("HF_TOKEN"))
                        response = briefing_client.chat_completion(messages, max_tokens=500, temperature=0.2)
                        st.session_state.exec_briefing = response.choices[0].message.content
                        st.rerun()
                    except Exception as e:
                        st.error(f"Briefing failed: {e}")
    # === TAB 3: TREND ANALYSIS ===
    with tab_trends:
        st.subheader("Semantic Trend Map")
        st.markdown("Explore the semantic relationships between this week's AI policy updates. Non-AI noise is automatically filtered out by the AI classifier.")
        if st.button("Generate Weekly Trend Report", key="btn_trends"):
            with st.spinner("Analyzing semantic data, abstracting macro-trends, and mapping 2D space... (Takes ~30 seconds)"):
                week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
                weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
                weekly_df = weekly_df.dropna(subset=['embedding'])
                if len(weekly_df) < 5:
                    st.warning(f"Only {len(weekly_df)} embedded updates found this week. At least 5 are required to calculate trends.")
                else:
                    from sklearn.manifold import TSNE
                    matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
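                    # Cluster on cosine distance without a preset cluster count;
                    # the 0.40 threshold controls how similar items must be to merge.
                    # If that yields more than 8 clusters, re-cluster with a hard
                    # cap of 8 so the trend report stays readable.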
                    clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.40, metric='cosine', linkage='complete')
                    weekly_df['cluster'] = clusterer.fit_predict(matrix)
                    num_clusters = weekly_df['cluster'].nunique()
                    if num_clusters > 8:
                        clusterer = AgglomerativeClustering(n_clusters=8, metric='cosine', linkage='complete')
                        weekly_df['cluster'] = clusterer.fit_predict(matrix)
                        num_clusters = 8
                    weekly_df['Trend Topic'] = "Uncategorized"
                    hf_token = os.getenv("HF_TOKEN")
                    if hf_token:
                        ui_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=hf_token)
                        for i in range(num_clusters):
                            cluster_df = weekly_df[weekly_df['cluster'] == i]
                            sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
                            prompt = f"""
                            You are a highly structured D.C. Tech Policy Taxonomist. Categorize these related article titles into a SINGLE, broad policy or industry bucket.

                            RULES:
                            1. MACRO CATEGORIES ONLY: Use 1 to 3 words maximum. Think of these as slide deck section headers.
                            2. NO HEADLINES: Absolutely NO verbs, NO company names, NO numbers, and NO dollar amounts.
                               * BAD: "Start-Up Raises $1.3 Billion", "Congress Debates AI Bill"
                               * GOOD: "Venture Capital", "Legislative Action", "AI Infrastructure"
                            3. EXAMPLES OF IDEAL LABELS: "AI Infrastructure", "Export Controls", "AI Safety", "Defense & Security", "Consumer Regulation", "Industry Update".
                            4. FILTER NOISE: If the articles are not about AI, compute, or tech policy, reply EXACTLY with: REJECT.
                            5. FORMAT: Just the category name. No quotes, no extra text.

                            UPDATES:
                            {sample_texts}
                            """
                            messages = [{"role": "user", "content": prompt}]
                            try:
                                response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
                                topic_name = response.choices[0].message.content.strip(' "').upper()
                                if "REJECT" in topic_name:
                                    weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
                                else:
                                    weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name.title()
                            except Exception:
                                weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
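                            # Pause between labeling calls to space out inference API requests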
                            time.sleep(10)
                        clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
                        if not clean_df.empty:
                            # Run t-SNE mapping
                            clean_matrix = np.vstack(clean_df['embedding'].apply(json.loads).values)
                            if len(clean_df) > 1:
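                                # t-SNE requires perplexity < n_samples, so clamp it to the data size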
                                safe_perplexity = max(1, min(30, len(clean_df) - 1))
                                tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
                                coords = tsne.fit_transform(clean_matrix)
                                clean_df['x'] = coords[:, 0]
                                clean_df['y'] = coords[:, 1]
                            else:
                                clean_df['x'] = 0
                                clean_df['y'] = 0
                            # Save to session state so it doesn't vanish!
                            st.session_state['trend_df'] = clean_df
                            st.session_state['valid_clusters'] = clean_df['cluster'].unique()
                        else:
                            st.warning("All data this week was classified as non-AI noise.")
                    else:
                        st.error("Hugging Face API token not found.")
        # --- Display the Trend Map if it's stored in memory ---
        if 'trend_df' in st.session_state and not st.session_state['trend_df'].empty:
            clean_df = st.session_state['trend_df']
            st.write("### Top AI Trends This Week")
            for cluster_id in st.session_state['valid_clusters']:
                cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
                topic_label = cluster_subset['Trend Topic'].iloc[0]
                st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
            st.write("### Semantic Cluster Map")
            chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
                x=alt.X('x', axis=None),
                y=alt.Y('y', axis=None),
                color=alt.Color('Trend Topic:N', legend=alt.Legend(title="Macro Trends", orient="bottom")),
                tooltip=[
                    alt.Tooltip('Trend Topic:N', title='Macro Trend'),
                    alt.Tooltip('title:N', title='Update Title'),
                    alt.Tooltip('source:N', title='Source'),
                    alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y')
                ]
            ).properties(height=400).interactive()
            st.altair_chart(chart, use_container_width=True)
else:
    st.warning("Dashboard empty. Run 'Force Manual Sweep' to populate.")