import streamlit as st
import pandas as pd
import main
from pathlib import Path
from datetime import datetime
import threading
import time
import os
from huggingface_hub import InferenceClient
import json
import numpy as np
import altair as alt
from sklearn.cluster import AgglomerativeClustering
# Create a global lock for file operations
data_lock = threading.Lock()
# --- PATHING LOGIC ---
if Path("/data").exists():
CSV_PATH = Path("/data/policy_tracker.csv")
else:
CSV_PATH = Path(__file__).resolve().parent / "policy_tracker.csv"
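# NOTE: Hugging Face Spaces mounts persistent storage (when enabled) at /data;
# the fallback path lives on the ephemeral container filesystem.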
# --- DATA LOADING ---
def load_data():
    with data_lock:
        if CSV_PATH.exists():
            df = pd.read_csv(CSV_PATH)
            df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')
            if df['event_date'].dt.tz is not None:
                df['event_date'] = df['event_date'].dt.tz_localize(None)
            df['date_collected'] = pd.to_datetime(df['date_collected'], errors='coerce')
            return df
    return None
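# Expected CSV schema (inferred from usage below): type, title, summary,
# analysis, keywords, source, link, latest_action, event_date, date_collected,
# and embedding (a JSON-encoded list of floats).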
# --- RETENTION POLICY ---
def apply_retention_policy(df):
    if df is None or df.empty:
        return df
    today = pd.Timestamp.now().tz_localize(None).normalize()
    # 1. Retention Filtering
    leg_df = df[df['type'] == 'Legislation']
    news_types = ['News/Media', 'Federal/Exec Action', 'Legislative Office Press Release']
    news_mask = (df['type'].isin(news_types)) & ((df['event_date'] >= today - pd.Timedelta(days=30)) | df['event_date'].isna())
    news_df = df[news_mask]
    sched_types = ['Schedule/Hearing', 'Hearing/Markup']
    sched_mask = (df['type'].isin(sched_types)) & ((df['event_date'] >= today - pd.Timedelta(days=60)) | df['event_date'].isna())
    sched_df = df[sched_mask]
    other_df = df[~df['type'].isin(['Legislation'] + news_types + sched_types)]
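    # Recombine the buckets; 'link' serves as the de-facto primary key, so an
    # item that lands in more than one bucket is kept only once.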
    active_df = pd.concat([leg_df, news_df, sched_df, other_df]).drop_duplicates(subset=['link'])
    # 2. Pure Chronological Sorting (Newest to Oldest)
    return active_df.sort_values(by="event_date", ascending=False)
# --- BACKGROUND AUTO-SCHEDULER ---
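# st.cache_resource runs this exactly once per server process, so Streamlit's
# rerun-on-every-interaction model does not spawn duplicate watcher threads.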
@st.cache_resource
def start_background_scheduler():
    def background_task():
        while True:
            try:
                needs_run = True
                sleep_time = 1 * 3600  # 1 Hour
                with data_lock:
                    if CSV_PATH.exists():
                        df_check = pd.read_csv(CSV_PATH)
                        if 'date_collected' in df_check.columns and not df_check.empty:
                            last_date = pd.to_datetime(df_check['date_collected']).max()
                            if last_date.tzinfo is not None:
                                last_date = last_date.tz_localize(None)
                            hours_since_last = (datetime.now() - last_date).total_seconds() / 3600
                            if hours_since_last < 1:
                                needs_run = False
                                sleep_time = (1 - hours_since_last) * 3600
                if needs_run:
                    with data_lock:
                        main.run()
                time.sleep(sleep_time)
            except Exception:
                time.sleep(3600)
    thread = threading.Thread(target=background_task, daemon=True)
    thread.start()
    return thread
start_background_scheduler()
# --- UI SETUP & ONBOARDING ---
st.set_page_config(page_title="PolicyPilot Intel", layout="wide")
st.title("AI Policy and News Dashboard - ALPHA Version")
st.markdown("""
Welcome to the **AI Policy and News Dashboard**, an automated platform tracking technology policy developments, legislative movement, and media coverage.
The portal will auto-populate with newly scanned data every 1 hour.
This portal's information is divided into two tabs:
* **Radar (Upcoming & Today's News):** Forward-looking policy-relevant data when it is available and daily news updates.
* **Archive (Past):** Historical data on past media coverage, actions from executive agencies and the White House, and legislation from the current Congress.
* **How to Filter:** Use the **Controls in the left sidebar** to filter by specific data categories or use the search bar below.
### Category Legend
To help you scan the chronological timeline quickly, entries are color-coded:
* π£ **Legislation** (Bills, Resolutions)
* π’ **Federal / Exec Action** (Agencies, White House)
* π΅ **News / Media** (Press Coverage)
* π **Schedule / Hearing** (Committee Meetings, Markups)
* π£ **Legislative Office Press Release** (Lawmaker Announcements)
To generate a high-level summary of the most recent data entries, click the **"Generate Briefing"** button below.
""")
st.markdown("""
---
### Notes for Users
1. **Verify AI Outputs:** This portal leverages summaries generated by language models. All intelligence should be verified using the links to original sources.
2. **Work in Progress:** This is an alpha version. Improvements in coverage and AI logic will be made regularly.
""")
with st.expander("🛠️ Technical Details & Architecture"):
    st.markdown("""
    * **AI Engine:** Powered by Qwen/Qwen2.5-7B-Instruct.
    * **Data Sources:** Live API integration with Congress.gov, official federal RSS feeds, and master committee schedules.
    * **Filtering:** Articles and bills are strictly filtered against a hardcoded tech-policy dictionary before the AI reads them.
    * **Data Retention:** News expires from the UI after 30 days, Schedules after 60 days. Legislation is retained for the duration of the current Congress.
    """)
st.divider()
# Load Data
df = load_data()
active_df = apply_retention_policy(df)
selected_types = []
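# Default to no category filter; the sidebar multiselect below replaces this
# list whenever data is available.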
# --- SIDEBAR ---
with st.sidebar:
    st.header("System Status")
    last_sync_str = "Pending First Run"
    if df is not None and not df.empty and 'date_collected' in df.columns:
        last_sync_dt = pd.to_datetime(df['date_collected']).max()
        last_sync_str = last_sync_dt.strftime('%b %d, %I:%M %p UTC')
    st.info(f"**Auto-Pilot:** Active (1h Cycle)\n\n**Last Sync:** {last_sync_str}")
    st.divider()
    st.header("Manual Override")
    cooldown_minutes = 30
    can_sweep = True
    time_left = 0
    if df is not None and not df.empty and 'date_collected' in df.columns:
        last_sync_dt = pd.to_datetime(df['date_collected']).max()
        if last_sync_dt.tzinfo is not None:
            last_sync_dt = last_sync_dt.tz_localize(None)
        mins_since_last = (datetime.now() - last_sync_dt).total_seconds() / 60
        if mins_since_last < cooldown_minutes:
            can_sweep = False
            time_left = int(cooldown_minutes - mins_since_last)
    if can_sweep:
        if st.button("Force Manual Sweep", use_container_width=True):
            with st.spinner("Scanning Datacenters & Gov Servers..."):
                with data_lock:
                    main.run()
            st.success("Sweep Complete!")
            st.rerun()
    else:
        st.button(f"Sweep on Cooldown ({time_left}m left)", disabled=True, use_container_width=True)
    st.caption("🛡️ *To prevent IP bans from government servers, manual sweeps are limited to once every 30 minutes.*")
    st.divider()
    if active_df is not None and not active_df.empty:
        available_types = active_df['type'].dropna().unique().tolist()
        selected_types = st.multiselect("Filter by Category:", options=available_types, default=available_types)
        st.divider()
        with data_lock:
            csv_data = active_df.to_csv(index=False).encode('utf-8')
        st.download_button(label="Download Historical Archive (CSV)", data=csv_data, file_name=f"policy_pilot_archive_{pd.Timestamp.now().strftime('%Y-%m-%d')}.csv", mime="text/csv", use_container_width=True)
# --- VISUAL CARD RENDERER (Defined first so tabs can use it) ---
def render_event_cards(display_df):
    if display_df.empty:
        st.info("No items match these filters.")
        return
    type_icons = {
        "Legislation": "🟣",
        "Federal/Exec Action": "🟢",
        "News/Media": "🔵",
        "Schedule/Hearing": "🟠",
        "Hearing/Markup": "🟠",
        "Legislative Office Press Release": "🔴"
    }
    color_map = {
        "News/Media": "blue",
        "Federal/Exec Action": "green",
        "Legislation": "violet",
        "Schedule/Hearing": "orange",
        "Hearing/Markup": "orange",
        "Legislative Office Press Release": "red"
    }
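    # These color names feed Streamlit's colored-markdown syntax,
    # e.g. ":violet[Legislation]", used in the metadata pane below.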
    for _, row in display_df.iterrows():
        dt = row['event_date']
        dt_str = dt.strftime('%b %d, %Y') if pd.notnull(dt) else "Date TBD"
        card_type = row['type']
        icon = type_icons.get(card_type, "⚪")
        color = color_map.get(card_type, "gray")
        source = row.get('source', 'Unknown Source')
        raw_title = str(row['title'])
        display_title = raw_title[:75] + ("..." if len(raw_title) > 75 else "")
        with st.expander(f"{icon} {dt_str} | {card_type} | {source} | {display_title}"):
            col1, col2 = st.columns([3, 1])
            with col1:
                st.markdown("### Executive Summary")
                st.info(row.get('analysis', 'Analysis pending...'))
                st.caption(f"**Keywords:** `{row.get('keywords', 'N/A')}`")
            with col2:
                st.markdown("### Metadata")
                st.markdown(f"**Category:** :{color}[{card_type}]")
                st.write(f"**Source:** {source}")
                st.write(f"**Action:** {row['latest_action']}")
                st.link_button("View Source", str(row['link']), use_container_width=True)
# --- GLOBAL DATA FILTERING ---
if active_df is not None and not active_df.empty:
    # 1. Apply Sidebar Filters
    if selected_types:
        filtered_df = active_df[active_df['type'].isin(selected_types)]
    else:
        filtered_df = active_df
    # 2. Global Search Bar (Always visible at the top)
    search = st.text_input("🔍 Search Intel Dashboard (Filters apply to all tabs)...", "")
    if search:
        # Literal match (regex=False) so regex metacharacters in user input cannot crash the search
        mask = filtered_df[['title', 'summary', 'analysis', 'keywords', 'source']].apply(lambda x: x.astype(str).str.contains(search, case=False, regex=False)).any(axis=1)
        filtered_df = filtered_df[mask]
    # 3. Split data into Radar and Archive
    today_ts = pd.Timestamp.now().normalize()
    radar_cutoff = today_ts - pd.Timedelta(days=1)
    radar_df = filtered_df[filtered_df['event_date'] >= radar_cutoff].sort_values(by="event_date", ascending=False)
    archive_df = filtered_df[(filtered_df['event_date'] < radar_cutoff) | (filtered_df['event_date'].isna())].sort_values(by="event_date", ascending=False)
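    # Anything dated yesterday or later lands on the Radar; older or undated
    # rows fall back to the Archive.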
    # ---------------------------------------------------------
    # MASTER UI TABS
    # ---------------------------------------------------------
    tab_feed, tab_briefing, tab_trends = st.tabs(["Data Feed", "Daily Summary", "Weekly Trend Analysis (Pilot)"])

    # === TAB 1: THE FEED ===
    with tab_feed:
        feed_tab1, feed_tab2 = st.tabs([f"Radar ({len(radar_df)})", f"Archive ({len(archive_df)})"])
        with feed_tab1:
            render_event_cards(radar_df)
        with feed_tab2:
            render_event_cards(archive_df)
    # === TAB 2: EXECUTIVE BRIEFING ===
    with tab_briefing:
        st.subheader("Daily Summary")
        st.info("AI briefing is synthesized from the most recent sources currently visible on your Radar.")
        if 'exec_briefing' not in st.session_state:
            st.session_state.exec_briefing = "Click the button below to generate a high-level briefing."
        st.write(st.session_state.exec_briefing)
        if os.getenv("HF_TOKEN"):
            if st.button("Generate Summary", key="btn_briefing"):
                with st.spinner("Qwen2.5-7B-Instruct is providing a summary..."):
                    briefing_items = radar_df.head(10)
                    if briefing_items.empty:
                        briefing_items = filtered_df.head(10)
                    context = "\n".join([f"• SOURCE: {row['source']} | TITLE: {row['title']} | SUMMARY: {row.get('analysis', 'N/A')}" for _, row in briefing_items.iterrows()])
                    prompt = f"""
Provide a highly concise, 3-5 paragraph briefing based only on the recent intelligence gathered from the user's Radar tracking system.
Do not include outside information. Cite all sources used in the summary using in-line citations for easy user verification. Do NOT cite dates in line.
Ensure you are synthesizing and summarizing information from across the Radar tracking system, not just the first 1 or 2 entries.
All summaries should be in an understated tone. Do not infer implications or make recommendations.
RADAR INTEL:
{context}
"""
                    messages = [{"role": "user", "content": prompt}]
                    try:
                        briefing_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=os.getenv("HF_TOKEN"))
                        response = briefing_client.chat_completion(messages, max_tokens=500, temperature=0.2)
                        st.session_state.exec_briefing = response.choices[0].message.content
                        st.rerun()
                    except Exception as e:
                        st.error(f"Briefing failed: {e}")
    # === TAB 3: TREND ANALYSIS ===
    with tab_trends:
        st.subheader("Semantic Trend Map")
        st.markdown("Explore the semantic relationships between this week's AI policy updates. Non-AI-related noise is automatically filtered out by the AI classifier.")
        if st.button("Generate Weekly Trend Report", key="btn_trends"):
            with st.spinner("Analyzing semantic data, abstracting macro-trends, and mapping 2D space... (Takes ~30 seconds)"):
                week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
                weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
                weekly_df = weekly_df.dropna(subset=['embedding'])
                if len(weekly_df) < 5:
                    st.warning(f"Only {len(weekly_df)} embedded updates found this week. At least 5 are required to compute meaningful clusters.")
                else:
                    from sklearn.manifold import TSNE
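                    # Each 'embedding' cell is a JSON-encoded float list; decode
                    # and stack them into an (n_items, dim) matrix for clustering.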
                    matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
                    # FIX 1: Tightened the distance_threshold to 0.45
                    clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.45, metric='cosine', linkage='complete')
                    weekly_df['cluster'] = clusterer.fit_predict(matrix)
                    num_clusters = weekly_df['cluster'].nunique()
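                    # distance_threshold makes the cluster count data-driven;
                    # noisy weeks are re-clustered with a hard cap of 8 topics.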
                    if num_clusters > 8:
                        clusterer = AgglomerativeClustering(n_clusters=8, metric='cosine', linkage='complete')
                        weekly_df['cluster'] = clusterer.fit_predict(matrix)
                        num_clusters = 8
                    weekly_df['Trend Topic'] = "Uncategorized"
                    hf_token = os.getenv("HF_TOKEN")
                    if hf_token:
                        ui_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=hf_token)
                        for i in range(num_clusters):
                            cluster_df = weekly_df[weekly_df['cluster'] == i]
                            sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
                            prompt = f"""
You are a highly structured D.C. Tech Policy Taxonomist. Categorize these related article titles into a SINGLE, broad policy or industry bucket.
RULES:
1. MACRO CATEGORIES ONLY: Use 1 to 3 words maximum. Think of these as slide deck section headers.
2. NO HEADLINES: Absolutely NO verbs, NO company names, NO numbers, and NO dollar amounts.
   * BAD: "Start-Up Raises $1.3 Billion", "Congress Debates AI Bill"
   * GOOD: "Venture Capital", "Legislative Action", "AI Infrastructure"
3. EXAMPLES OF IDEAL LABELS: "AI Infrastructure", "Export Controls", "AI Safety", "Defense & Security", "Consumer Regulation", "Industry Update".
4. FILTER NOISE: If the articles are not about AI, compute, or tech policy, reply EXACTLY with: REJECT.
5. FORMAT: Just the category name. No quotes, no extra text.
UPDATES:
{sample_texts}
"""
messages = [{"role": "user", "content": prompt}]
try:
response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
topic_name = response.choices[0].message.content.strip(' "').upper()
if "REJECT" in topic_name:
weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
else:
weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name.title()
except:
weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
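                            # Assumption: pacing labeling calls ~10s apart keeps
                            # the shared Inference API under rate limits.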
                            time.sleep(10)
                        clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
                        if not clean_df.empty:
                            # Run t-SNE mapping
                            clean_matrix = np.vstack(clean_df['embedding'].apply(json.loads).values)
                            if len(clean_df) > 1:
                                # FIX 2: Lowered perplexity and added n_iter for better small-island separation.
                                # Perplexity must stay below n_samples, hence the (len - 1) bound.
                                safe_perplexity = min(max(2, len(clean_df) // 4), len(clean_df) - 1, 8)
                                tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random', n_iter=1000)
                                coords = tsne.fit_transform(clean_matrix)
                                clean_df['x'] = coords[:, 0]
                                clean_df['y'] = coords[:, 1]
                            else:
                                clean_df['x'] = 0
                                clean_df['y'] = 0
                            # Save to session state so it doesn't vanish on the next rerun!
                            st.session_state['trend_df'] = clean_df
                            st.session_state['valid_clusters'] = clean_df['cluster'].unique()
                        else:
                            st.warning("All data this week was classified as non-AI noise.")
                    else:
                        st.error("Hugging Face API token not found.")
        # --- Display the Trend Map if it's stored in memory ---
        if 'trend_df' in st.session_state and not st.session_state['trend_df'].empty:
            clean_df = st.session_state['trend_df']
            st.write("### Top AI Trends This Week:")
            for cluster_id in st.session_state['valid_clusters']:
                cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
                topic_label = cluster_subset['Trend Topic'].iloc[0]
                st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
            st.write("### Semantic Cluster Map")
            chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
                x=alt.X('x', axis=None),
                y=alt.Y('y', axis=None),
                color=alt.Color('Trend Topic:N', legend=alt.Legend(title="Macro Trends", orient="bottom")),
                tooltip=[
                    alt.Tooltip('Trend Topic:N', title='Macro Trend'),
                    alt.Tooltip('title:N', title='Update Title'),
                    alt.Tooltip('source:N', title='Source'),
                    alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y')
                ]
            ).properties(height=400).interactive()
            st.altair_chart(chart, use_container_width=True)
else:
    st.warning("Dashboard empty. Run 'Force Manual Sweep' to populate.")