import streamlit as st
import pandas as pd
import main
from pathlib import Path
from datetime import datetime
import threading
import time
import os
from huggingface_hub import InferenceClient
import json
import numpy as np
import altair as alt
from sklearn.cluster import AgglomerativeClustering
# Create a global lock for file operations
data_lock = threading.Lock()
# --- PATHING LOGIC ---
if Path("/data").exists():
CSV_PATH = Path("/data/policy_tracker.csv")
else:
CSV_PATH = Path(__file__).resolve().parent / "policy_tracker.csv"
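# On Hugging Face Spaces, /data is the persistent-storage mount, so the CSV
# survives restarts there; locally we fall back to the repo directory.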
# --- DATA LOADING ---
def load_data():
with data_lock:
if CSV_PATH.exists():
df = pd.read_csv(CSV_PATH)
df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')
if df['event_date'].dt.tz is not None:
df['event_date'] = df['event_date'].dt.tz_localize(None)
df['date_collected'] = pd.to_datetime(df['date_collected'], errors='coerce')
return df
return None
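# A note on the tz handling above: pd.to_datetime(errors='coerce') can return an
# object-dtype column when the CSV mixes tz-aware and naive strings, in which
# case the .dt accessor raises. A more defensive sketch (assuming all timestamps
# should be read as UTC wall time) would be:
#
#     df['event_date'] = (pd.to_datetime(df['event_date'], errors='coerce', utc=True)
#                           .dt.tz_localize(None))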
# --- RETENTION POLICY ---
def apply_retention_policy(df):
if df is None or df.empty:
return df
today = pd.Timestamp.now().tz_localize(None).normalize()
# 1. Retention Filtering
leg_df = df[df['type'] == 'Legislation']
news_types = ['News/Media', 'Federal/Exec Action', 'Legislative Office Press Release']
news_mask = (df['type'].isin(news_types)) & ((df['event_date'] >= today - pd.Timedelta(days=30)) | df['event_date'].isna())
news_df = df[news_mask]
sched_types = ['Schedule/Hearing', 'Hearing/Markup']
sched_mask = (df['type'].isin(sched_types)) & ((df['event_date'] >= today - pd.Timedelta(days=60)) | df['event_date'].isna())
sched_df = df[sched_mask]
other_df = df[~df['type'].isin(['Legislation'] + news_types + sched_types)]
active_df = pd.concat([leg_df, news_df, sched_df, other_df]).drop_duplicates(subset=['link'])
# 2. Pure Chronological Sorting (Newest to Oldest)
return active_df.sort_values(by="event_date", ascending=False)
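# The same policy expressed as data (a sketch, should the type list grow):
#
#     RETENTION_DAYS = {
#         'News/Media': 30, 'Federal/Exec Action': 30,
#         'Legislative Office Press Release': 30,
#         'Schedule/Hearing': 60, 'Hearing/Markup': 60,
#         # 'Legislation': kept for the whole Congress (no window)
#     }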
# --- BACKGROUND AUTO-SCHEDULER ---
@st.cache_resource
def start_background_scheduler():
def background_task():
while True:
try:
needs_run = True
sleep_time = 1 * 3600 # 1 Hour
with data_lock:
if CSV_PATH.exists():
df_check = pd.read_csv(CSV_PATH)
if 'date_collected' in df_check.columns and not df_check.empty:
last_date = pd.to_datetime(df_check['date_collected']).max()
if last_date.tzinfo is not None: last_date = last_date.tz_localize(None)
hours_since_last = (datetime.now() - last_date).total_seconds() / 3600
if hours_since_last < 1:
needs_run = False
sleep_time = (1 - hours_since_last) * 3600
if needs_run:
with data_lock: main.run()
time.sleep(sleep_time)
            except Exception:
                # Swallow transient failures and back off for an hour before retrying.
                time.sleep(3600)
thread = threading.Thread(target=background_task, daemon=True)
thread.start()
return thread
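# Note: @st.cache_resource runs this factory once per server process, so only a
# single scheduler thread is spawned no matter how many sessions connect; the
# daemon flag lets the process exit without waiting on the thread.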
start_background_scheduler()
# --- UI SETUP & ONBOARDING ---
st.set_page_config(page_title="PolicyPilot Intel", layout="wide")
st.title("AI Policy and News Dashboard - ALPHA Version")
st.markdown("""
Welcome to the **AI Policy and News Dashboard**, an automated platform tracking technology policy developments, legislative movement, and media coverage.
The portal auto-populates with newly scanned data every hour.
The Data Feed is organized into two tabs:
* **Radar (Upcoming & Today's News):** Forward-looking, policy-relevant items (when available) plus daily news updates.
* **Archive (Past):** Historical data on past media coverage, actions from executive agencies and the White House, and legislation from the current Congress.
* **How to Filter:** Use the **controls in the left sidebar** to filter by category, or use the search bar below.
### Category Legend
To help you scan the chronological timeline quickly, entries are color-coded:
* 🟣 **Legislation** (Bills, Resolutions)
* 🟢 **Federal / Exec Action** (Agencies, White House)
* 🔵 **News / Media** (Press Coverage)
* 🟠 **Schedule / Hearing** (Committee Meetings, Markups)
* 📣 **Legislative Office Press Release** (Lawmaker Announcements)
To generate a high-level summary of the most recent data entries, click the **"Generate Summary"** button in the **Daily Summary** tab.
""")
st.markdown("""
---
### Notes for Users
1. **Verify AI Outputs:** This portal leverages summaries generated by language models. All intelligence should be verified using the links to original sources.
2. **Work in Progress:** This is an alpha version. Improvements in coverage and AI logic will be made regularly.
""")
with st.expander("🛠️ Technical Details & Architecture"):
st.markdown("""
* **AI Engine:** Powered by Qwen/Qwen2.5-7B-Instruct.
* **Data Sources:** Live API integration with Congress.gov, official federal RSS feeds, and master committee schedules.
* **Filtering:** Articles and bills are strictly filtered against a hardcoded tech-policy dictionary before the AI reads them.
* **Data Retention:** News expires from the UI after 30 days, Schedules after 60 days. Legislation is retained for the duration of the current Congress.
""")
st.divider()
# Load Data
df = load_data()
active_df = apply_retention_policy(df)
selected_types = []
# --- SIDEBAR ---
with st.sidebar:
st.header("System Status")
last_sync_str = "Pending First Run"
if df is not None and not df.empty and 'date_collected' in df.columns:
last_sync_dt = pd.to_datetime(df['date_collected']).max()
last_sync_str = last_sync_dt.strftime('%b %d, %I:%M %p UTC')
st.info(f"**Auto-Pilot:** Active (1h Cycle)\n\n**Last Sync:** {last_sync_str}")
st.divider()
st.header("Manual Override")
cooldown_minutes = 30
can_sweep = True
time_left = 0
if df is not None and not df.empty and 'date_collected' in df.columns:
last_sync_dt = pd.to_datetime(df['date_collected']).max()
if last_sync_dt.tzinfo is not None: last_sync_dt = last_sync_dt.tz_localize(None)
mins_since_last = (datetime.now() - last_sync_dt).total_seconds() / 60
if mins_since_last < cooldown_minutes:
can_sweep = False
time_left = int(cooldown_minutes - mins_since_last)
if can_sweep:
if st.button("Force Manual Sweep", use_container_width=True):
with st.spinner("Scanning Datacenters & Gov Servers..."):
with data_lock: main.run()
st.success("Sweep Complete!")
st.rerun()
else:
st.button(f"Sweep on Cooldown ({time_left}m left)", disabled=True, use_container_width=True)
st.caption("🛡️ *To prevent IP bans from government servers, manual sweeps are limited to once every 15 minutes.*")
# ---------------------------------------------------------
# TEMPORARY MIGRATION BUTTON
# ---------------------------------------------------------
st.divider()
st.header("Database Maintenance")
if st.button("🔄 Upgrade Database to BGE-Small", type="primary", use_container_width=True):
with st.spinner("Re-calculating all historical data... This takes about 30 seconds."):
from sentence_transformers import SentenceTransformer
import json
with data_lock:
if CSV_PATH.exists():
# 1. Open the vault
df_upgrade = pd.read_csv(CSV_PATH)
# 2. Boot up the new brain
st.toast("Downloading new BGE model...")
model = SentenceTransformer('BAAI/bge-small-en-v1.5')
# 3. Combine the text the exact way your scraper does (Title + Summary)
texts_to_embed = df_upgrade['title'].fillna('') + " - " + df_upgrade.get('summary', pd.Series([''] * len(df_upgrade))).fillna('')
# 4. Generate the new math
st.toast("Re-embedding historical articles...")
new_embeddings = model.encode(texts_to_embed.tolist())
# 5. Overwrite the old embedding column with the new JSON lists
df_upgrade['embedding'] = [json.dumps(emb.tolist()) for emb in new_embeddings]
# 6. Save it back to the permanent bucket
df_upgrade.to_csv(CSV_PATH, index=False)
st.success("Migration Complete! Your historical data is now BGE-compatible.")
time.sleep(2)
st.rerun()
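    # Storage note: the 'embedding' column holds each vector as a JSON-encoded
    # list so it survives the CSV round trip. A minimal decoding sketch
    # (bge-small-en-v1.5 produces 384-dim vectors):
    #
    #     vec = np.asarray(json.loads(df_upgrade['embedding'].iloc[0]))  # shape (384,)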
# ---------------------------------------------------------
st.divider()
if active_df is not None and not active_df.empty:
available_types = active_df['type'].dropna().unique().tolist()
selected_types = st.multiselect("Filter by Category:", options=available_types, default=available_types)
st.divider()
with data_lock: csv_data = active_df.to_csv(index=False).encode('utf-8')
st.download_button(label="Download Historical Archive (CSV)", data=csv_data, file_name=f"policy_pilot_archive_{pd.Timestamp.now().strftime('%Y-%m-%d')}.csv", mime="text/csv", use_container_width=True)
# --- VISUAL CARD RENDERER (Defined first so tabs can use it) ---
def render_event_cards(display_df):
if display_df.empty:
st.info("No items match these filters.")
return
type_icons = {
"Legislation": "🟣",
"Federal/Exec Action": "🟢",
"News/Media": "🔵",
"Schedule/Hearing": "🟠",
"Hearing/Markup": "🟠",
"Legislative Office Press Release": "📣"
}
color_map = {
"News/Media": "blue",
"Federal/Exec Action": "green",
"Legislation": "violet",
"Schedule/Hearing": "orange",
"Hearing/Markup": "orange",
"Legislative Office Press Release": "red"
}
for _, row in display_df.iterrows():
dt = row['event_date']
dt_str = dt.strftime('%b %d, %Y') if pd.notnull(dt) else "Date TBD"
card_type = row['type']
icon = type_icons.get(card_type, "⚪")
color = color_map.get(card_type, "gray")
source = row.get('source', 'Unknown Source')
raw_title = str(row['title'])
display_title = raw_title[:75] + ("..." if len(raw_title) > 75 else "")
with st.expander(f"{icon} {dt_str} | {card_type} | {source} | {display_title}"):
col1, col2 = st.columns([3, 1])
with col1:
st.markdown("### Executive Summary")
st.info(row.get('analysis', 'Analysis pending...'))
st.caption(f"**Keywords:** `{row.get('keywords', 'N/A')}`")
with col2:
st.markdown("### Metadata")
st.markdown(f"**Category:** :{color}[{card_type}]")
st.write(f"**Source:** {source}")
st.write(f"**Action:** {row['latest_action']}")
st.link_button("View Source", str(row['link']), use_container_width=True)
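# Rendering note: the expander header doubles as the one-line timeline entry, so
# titles are truncated to 75 characters to keep each row scannable.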
# --- GLOBAL DATA FILTERING ---
if active_df is not None and not active_df.empty:
# 1. Apply Sidebar Filters
if selected_types:
filtered_df = active_df[active_df['type'].isin(selected_types)]
else:
filtered_df = active_df
# 2. Global Search Bar (Always visible at the top)
search = st.text_input("🔍 Search Intel Dashboard (Filters apply to all tabs)...", "")
if search:
        # Literal (non-regex) match so user-typed characters like '(' cannot crash the search.
        mask = filtered_df[['title', 'summary', 'analysis', 'keywords', 'source']].apply(lambda x: x.astype(str).str.contains(search, case=False, regex=False)).any(axis=1)
filtered_df = filtered_df[mask]
# 3. Split data into Radar and Archive
today_ts = pd.Timestamp.now().normalize()
radar_cutoff = today_ts - pd.Timedelta(days=1)
radar_df = filtered_df[filtered_df['event_date'] >= radar_cutoff].sort_values(by="event_date", ascending=False)
archive_df = filtered_df[(filtered_df['event_date'] < radar_cutoff) | (filtered_df['event_date'].isna())].sort_values(by="event_date", ascending=False)
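    # Radar keeps anything dated yesterday or later (catching late-night postings);
    # everything older, plus undated items, falls through to the Archive.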
# ---------------------------------------------------------
# MASTER UI TABS
# ---------------------------------------------------------
tab_feed, tab_briefing, tab_trends = st.tabs(["Data Feed", "Daily Summary", "Weekly Trend Analysis"])
# === TAB 1: THE FEED ===
with tab_feed:
feed_tab1, feed_tab2 = st.tabs([f"Radar ({len(radar_df)})", f"Archive ({len(archive_df)})"])
with feed_tab1:
render_event_cards(radar_df)
with feed_tab2:
render_event_cards(archive_df)
# === TAB 2: EXECUTIVE BRIEFING ===
with tab_briefing:
st.subheader("Daily Summary")
st.info("AI briefing is synthesized from the most recent sources currently visible on your Radar.")
if 'exec_briefing' not in st.session_state:
st.session_state.exec_briefing = "Click the button below to generate a high-level briefing."
st.write(st.session_state.exec_briefing)
if os.getenv("HF_TOKEN"):
if st.button("Generate Summary", key="btn_briefing"):
with st.spinner("Qwen2.5-7B-Instruct is providing a summary..."):
briefing_items = radar_df.head(10)
if briefing_items.empty:
briefing_items = filtered_df.head(10)
context = "\n".join([f"• SOURCE: {row['source']} | TITLE: {row['title']} | SUMMARY: {row.get('analysis', 'N/A')}" for _, row in briefing_items.iterrows()])
prompt = f"""
Provide a highly concise, 3-5 paragraph briefing based only on the recent intelligence gathered from the user's Radar tracking system.
Do not include outside information. Cite all sources used in the summary using in-line citations for easy user verification. Do NOT cite dates in line.
Ensure you are synthesizing and summarizing information from across the Radar tracking system, not just the first 1 or 2 entries.
All summaries should be in an understated tone. Do not infer implications or make recommendations.
RADAR INTEL:
{context}
"""
messages = [{"role": "user", "content": prompt}]
try:
briefing_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=os.getenv("HF_TOKEN"))
response = briefing_client.chat_completion(messages, max_tokens=500, temperature=0.2)
st.session_state.exec_briefing = response.choices[0].message.content
st.rerun()
except Exception as e:
st.error(f"Briefing failed: {e}")
# === TAB 3: TREND ANALYSIS ===
with tab_trends:
st.subheader("Semantic Trend Map")
st.markdown("Explore the semantic relationships between this week's AI policy updates. Non-AI related noise is automatically filtered out by the AI classifier.")
if st.button("Generate Weekly Trend Report", key="btn_trends"):
with st.spinner("Analyzing semantic data, abstracting macro-trends, and mapping 2D space... (Takes ~30 seconds)"):
week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
weekly_df = weekly_df.dropna(subset=['embedding'])
if len(weekly_df) < 5:
st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to calculate mathematical trends.")
else:
from sklearn.manifold import TSNE
matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.40, metric='cosine', linkage='complete')
weekly_df['cluster'] = clusterer.fit_predict(matrix)
num_clusters = weekly_df['cluster'].nunique()
if num_clusters > 8:
clusterer = AgglomerativeClustering(n_clusters=8, metric='cosine', linkage='complete')
weekly_df['cluster'] = clusterer.fit_predict(matrix)
num_clusters = 8
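                    # Clustering note: with cosine distance and complete linkage, the
                    # 0.40 threshold merges items whose *least similar* pair is still
                    # within 0.40 cosine distance; if that yields more than 8 themes,
                    # the second pass forces exactly 8 for a readable legend.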
weekly_df['Trend Topic'] = "Uncategorized"
hf_token = os.getenv("HF_TOKEN")
if hf_token:
ui_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=hf_token)
for i in range(num_clusters):
cluster_df = weekly_df[weekly_df['cluster'] == i]
sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
prompt = f"""
You are a highly structured D.C. Tech Policy Taxonomist. Categorize these related article titles into a SINGLE, broad policy or industry bucket.
RULES:
1. MACRO CATEGORIES ONLY: Use 1 to 3 words maximum. Think of these as slide deck section headers.
2. NO HEADLINES: Absolutely NO verbs, NO company names, NO numbers, and NO dollar amounts.
* BAD: "Start-Up Raises $1.3 Billion", "Congress Debates AI Bill"
* GOOD: "Venture Capital", "Legislative Action", "AI Infrastructure"
3. EXAMPLES OF IDEAL LABELS: "AI Infrastructure", "Export Controls", "AI Safety", "Defense & Security", "Consumer Regulation", "Industry Update".
4. FILTER NOISE: If the articles are not about AI, compute, or tech policy, reply EXACTLY with: REJECT.
5. FORMAT: Just the category name. No quotes, no extra text.
UPDATES:
{sample_texts}
"""
messages = [{"role": "user", "content": prompt}]
try:
response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
topic_name = response.choices[0].message.content.strip(' "').upper()
if "REJECT" in topic_name:
weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
else:
weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name.title()
                            except Exception:
                                weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
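                            # Throttle labeling calls between clusters, presumably to
                            # stay within shared Inference API rate limits.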
time.sleep(10)
clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
if not clean_df.empty:
# Run t-SNE mapping
clean_matrix = np.vstack(clean_df['embedding'].apply(json.loads).values)
if len(clean_df) > 1:
safe_perplexity = max(1, min(30, len(clean_df) - 1))
tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
coords = tsne.fit_transform(clean_matrix)
clean_df['x'] = coords[:, 0]
clean_df['y'] = coords[:, 1]
else:
clean_df['x'] = 0
clean_df['y'] = 0
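                        # t-SNE requires perplexity < n_samples, hence the clamp above;
                        # a single point has no neighborhood to embed, so it is pinned
                        # at the origin.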
# Save to session state so it doesn't vanish!
st.session_state['trend_df'] = clean_df
st.session_state['valid_clusters'] = clean_df['cluster'].unique()
else:
st.warning("All data this week was classified as non-AI noise.")
else:
st.error("Hugging Face API token not found.")
# --- Display the Trend Map if it's stored in Memory ---
if 'trend_df' in st.session_state and not st.session_state['trend_df'].empty:
clean_df = st.session_state['trend_df']
st.write(f"### Top AI Trends This Week:")
for cluster_id in st.session_state['valid_clusters']:
cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
topic_label = cluster_subset['Trend Topic'].iloc[0]
st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
st.write("### Semantic Cluster Map")
chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
x=alt.X('x', axis=None),
y=alt.Y('y', axis=None),
color=alt.Color('Trend Topic:N', legend=alt.Legend(title="Macro Trends", orient="bottom")),
tooltip=[
alt.Tooltip('Trend Topic:N', title='Macro Trend'),
alt.Tooltip('title:N', title='Update Title'),
alt.Tooltip('source:N', title='Source'),
alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y')
]
).properties(height=400).interactive()
st.altair_chart(chart, use_container_width=True)
else:
st.warning("Dashboard empty. Run 'Force Manual Sweep' to populate.")