import streamlit as st
import pandas as pd
import main
from pathlib import Path
from datetime import datetime
import threading
import time
import os
from huggingface_hub import InferenceClient
import json
import numpy as np
import altair as alt
from sklearn.cluster import AgglomerativeClustering
# Create a global lock for file operations
data_lock = threading.Lock()
# --- PATHING LOGIC ---
if Path("/data").exists():
CSV_PATH = Path("/data/policy_tracker.csv")
else:
CSV_PATH = Path(__file__).resolve().parent / "policy_tracker.csv"
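# On Hugging Face Spaces, /data is the persistent-storage mount, so the CSV
# survives restarts there; locally we fall back to the repo directory.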
# --- DATA LOADING ---
def load_data():
with data_lock:
if CSV_PATH.exists():
df = pd.read_csv(CSV_PATH)
df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')
if df['event_date'].dt.tz is not None:
df['event_date'] = df['event_date'].dt.tz_localize(None)
df['date_collected'] = pd.to_datetime(df['date_collected'], errors='coerce')
return df
return None
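# A note on the tz handling above: pd.to_datetime(errors='coerce') can return an
# object-dtype column when the CSV mixes tz-aware and naive strings, in which
# case the .dt accessor raises. A more defensive sketch (assuming all timestamps
# should be read as UTC wall time) would be:
#
#     df['event_date'] = (pd.to_datetime(df['event_date'], errors='coerce', utc=True)
#                           .dt.tz_localize(None))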
# --- RETENTION POLICY ---
def apply_retention_policy(df):
if df is None or df.empty:
return df
today = pd.Timestamp.now().tz_localize(None).normalize()
# 1. Retention Filtering
leg_df = df[df['type'] == 'Legislation']
news_types = ['News/Media', 'Federal/Exec Action', 'Legislative Office Press Release']
news_mask = (df['type'].isin(news_types)) & ((df['event_date'] >= today - pd.Timedelta(days=30)) | df['event_date'].isna())
news_df = df[news_mask]
sched_types = ['Schedule/Hearing', 'Hearing/Markup']
sched_mask = (df['type'].isin(sched_types)) & ((df['event_date'] >= today - pd.Timedelta(days=60)) | df['event_date'].isna())
sched_df = df[sched_mask]
other_df = df[~df['type'].isin(['Legislation'] + news_types + sched_types)]
active_df = pd.concat([leg_df, news_df, sched_df, other_df]).drop_duplicates(subset=['link'])
# 2. Pure Chronological Sorting (Newest to Oldest)
return active_df.sort_values(by="event_date", ascending=False)
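# The same policy expressed as data (a sketch, should the type list grow):
#
#     RETENTION_DAYS = {
#         'News/Media': 30, 'Federal/Exec Action': 30,
#         'Legislative Office Press Release': 30,
#         'Schedule/Hearing': 60, 'Hearing/Markup': 60,
#         # 'Legislation': kept for the whole Congress (no window)
#     }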
# --- BACKGROUND AUTO-SCHEDULER ---
@st.cache_resource
def start_background_scheduler():
def background_task():
while True:
try:
needs_run = True
sleep_time = 1 * 3600 # 1 Hour
with data_lock:
if CSV_PATH.exists():
df_check = pd.read_csv(CSV_PATH)
if 'date_collected' in df_check.columns and not df_check.empty:
last_date = pd.to_datetime(df_check['date_collected']).max()
if last_date.tzinfo is not None: last_date = last_date.tz_localize(None)
hours_since_last = (datetime.now() - last_date).total_seconds() / 3600
if hours_since_last < 1:
needs_run = False
sleep_time = (1 - hours_since_last) * 3600
if needs_run:
with data_lock: main.run()
time.sleep(sleep_time)
            except Exception:
                # Swallow transient failures and back off for an hour before retrying.
                time.sleep(3600)
thread = threading.Thread(target=background_task, daemon=True)
thread.start()
return thread
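# Note: @st.cache_resource runs this factory once per server process, so only a
# single scheduler thread is spawned no matter how many sessions connect; the
# daemon flag lets the process exit without waiting on the thread.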
start_background_scheduler()
# --- UI SETUP & ONBOARDING ---
st.set_page_config(page_title="PolicyPilot Intel", layout="wide")
st.title("AI Policy and News Dashboard - ALPHA Version")
st.markdown("""
Welcome to the **AI Policy and News Dashboard**, an automated platform tracking technology policy developments, legislative movement, and media coverage.
The portal auto-populates with newly scanned data every hour.
The Data Feed is organized into two tabs:
* **Radar (Upcoming & Today's News):** Forward-looking, policy-relevant items (when available) plus daily news updates.
* **Archive (Past):** Historical data on past media coverage, actions from executive agencies and the White House, and legislation from the current Congress.
* **How to Filter:** Use the **controls in the left sidebar** to filter by category, or use the search bar below.
### Category Legend
To help you scan the chronological timeline quickly, entries are color-coded:
* 🟣 **Legislation** (Bills, Resolutions)
* 🟢 **Federal / Exec Action** (Agencies, White House)
* 🔵 **News / Media** (Press Coverage)
* 🟠 **Schedule / Hearing** (Committee Meetings, Markups)
* 📣 **Legislative Office Press Release** (Lawmaker Announcements)
To generate a high-level summary of the most recent data entries, click the **"Generate Summary"** button in the **Daily Summary** tab.
""")
st.markdown("""
---
### Notes for Users
1. **Verify AI Outputs:** This portal leverages summaries generated by language models. All intelligence should be verified using the links to original sources.
2. **Work in Progress:** This is an alpha version. Improvements in coverage and AI logic will be made regularly.
""")
with st.expander("🛠️ Technical Details & Architecture"):
st.markdown("""
* **AI Engine:** Powered by Qwen/Qwen2.5-7B-Instruct.
* **Data Sources:** Live API integration with Congress.gov, official federal RSS feeds, and master committee schedules.
* **Filtering:** Articles and bills are strictly filtered against a hardcoded tech-policy dictionary before the AI reads them.
* **Data Retention:** News expires from the UI after 30 days, Schedules after 60 days. Legislation is retained for the duration of the current Congress.
""")
st.divider()
# Load Data
df = load_data()
active_df = apply_retention_policy(df)
selected_types = []
# --- SIDEBAR ---
with st.sidebar:
st.header("System Status")
last_sync_str = "Pending First Run"
if df is not None and not df.empty and 'date_collected' in df.columns:
last_sync_dt = pd.to_datetime(df['date_collected']).max()
last_sync_str = last_sync_dt.strftime('%b %d, %I:%M %p UTC')
st.info(f"**Auto-Pilot:** Active (1h Cycle)\n\n**Last Sync:** {last_sync_str}")
st.divider()
st.header("Manual Override")
cooldown_minutes = 30
can_sweep = True
time_left = 0
if df is not None and not df.empty and 'date_collected' in df.columns:
last_sync_dt = pd.to_datetime(df['date_collected']).max()
if last_sync_dt.tzinfo is not None: last_sync_dt = last_sync_dt.tz_localize(None)
mins_since_last = (datetime.now() - last_sync_dt).total_seconds() / 60
if mins_since_last < cooldown_minutes:
can_sweep = False
time_left = int(cooldown_minutes - mins_since_last)
if can_sweep:
if st.button("Force Manual Sweep", use_container_width=True):
with st.spinner("Scanning Datacenters & Gov Servers..."):
with data_lock: main.run()
st.success("Sweep Complete!")
st.rerun()
else:
st.button(f"Sweep on Cooldown ({time_left}m left)", disabled=True, use_container_width=True)
st.caption("🛡️ *To prevent IP bans from government servers, manual sweeps are limited to once every 15 minutes.*")
# ---------------------------------------------------------
# TEMPORARY MIGRATION BUTTON
# ---------------------------------------------------------
st.divider()
st.header("Database Maintenance")
if st.button("🔄 Upgrade Database to BGE-Small", type="primary", use_container_width=True):
with st.spinner("Re-calculating all historical data... This takes about 30 seconds."):
from sentence_transformers import SentenceTransformer
import json
with data_lock:
if CSV_PATH.exists():
# 1. Open the vault
df_upgrade = pd.read_csv(CSV_PATH)
# 2. Boot up the new brain
st.toast("Downloading new BGE model...")
model = SentenceTransformer('BAAI/bge-small-en-v1.5')
# 3. Combine the text the exact way your scraper does (Title + Summary)
texts_to_embed = df_upgrade['title'].fillna('') + " - " + df_upgrade.get('summary', pd.Series([''] * len(df_upgrade))).fillna('')
# 4. Generate the new math
st.toast("Re-embedding historical articles...")
new_embeddings = model.encode(texts_to_embed.tolist())
# 5. Overwrite the old embedding column with the new JSON lists
df_upgrade['embedding'] = [json.dumps(emb.tolist()) for emb in new_embeddings]
# 6. Save it back to the permanent bucket
df_upgrade.to_csv(CSV_PATH, index=False)
st.success("Migration Complete! Your historical data is now BGE-compatible.")
time.sleep(2)
st.rerun()
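    # Storage note: the 'embedding' column holds each vector as a JSON-encoded
    # list so it survives the CSV round trip. A minimal decoding sketch
    # (bge-small-en-v1.5 produces 384-dim vectors):
    #
    #     vec = np.asarray(json.loads(df_upgrade['embedding'].iloc[0]))  # shape (384,)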
# ---------------------------------------------------------
st.divider()
if active_df is not None and not active_df.empty:
available_types = active_df['type'].dropna().unique().tolist()
selected_types = st.multiselect("Filter by Category:", options=available_types, default=available_types)
st.divider()
with data_lock: csv_data = active_df.to_csv(index=False).encode('utf-8')
st.download_button(label="Download Historical Archive (CSV)", data=csv_data, file_name=f"policy_pilot_archive_{pd.Timestamp.now().strftime('%Y-%m-%d')}.csv", mime="text/csv", use_container_width=True)
# --- VISUAL CARD RENDERER (Defined first so tabs can use it) ---
def render_event_cards(display_df):
if display_df.empty:
st.info("No items match these filters.")
return
type_icons = {
"Legislation": "🟣",
"Federal/Exec Action": "🟢",
"News/Media": "🔵",
"Schedule/Hearing": "🟠",
"Hearing/Markup": "🟠",
"Legislative Office Press Release": "📣"
}
color_map = {
"News/Media": "blue",
"Federal/Exec Action": "green",
"Legislation": "violet",
"Schedule/Hearing": "orange",
"Hearing/Markup": "orange",
"Legislative Office Press Release": "red"
}
for _, row in display_df.iterrows():
dt = row['event_date']
dt_str = dt.strftime('%b %d, %Y') if pd.notnull(dt) else "Date TBD"
card_type = row['type']
icon = type_icons.get(card_type, "⚪")
color = color_map.get(card_type, "gray")
source = row.get('source', 'Unknown Source')
raw_title = str(row['title'])
display_title = raw_title[:75] + ("..." if len(raw_title) > 75 else "")
with st.expander(f"{icon} {dt_str} | {card_type} | {source} | {display_title}"):
col1, col2 = st.columns([3, 1])
with col1:
st.markdown("### Executive Summary")
st.info(row.get('analysis', 'Analysis pending...'))
st.caption(f"**Keywords:** `{row.get('keywords', 'N/A')}`")
with col2:
st.markdown("### Metadata")
st.markdown(f"**Category:** :{color}[{card_type}]")
st.write(f"**Source:** {source}")
st.write(f"**Action:** {row['latest_action']}")
st.link_button("View Source", str(row['link']), use_container_width=True)
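# Rendering note: the expander header doubles as the one-line timeline entry, so
# titles are truncated to 75 characters to keep each row scannable.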
# --- GLOBAL DATA FILTERING ---
if active_df is not None and not active_df.empty:
# 1. Apply Sidebar Filters
if selected_types:
filtered_df = active_df[active_df['type'].isin(selected_types)]
else:
filtered_df = active_df
# 2. Global Search Bar (Always visible at the top)
search = st.text_input("🔍 Search Intel Dashboard (Filters apply to all tabs)...", "")
if search:
        # Literal (non-regex) match so user-typed characters like '(' cannot crash the search.
        mask = filtered_df[['title', 'summary', 'analysis', 'keywords', 'source']].apply(lambda x: x.astype(str).str.contains(search, case=False, regex=False)).any(axis=1)
filtered_df = filtered_df[mask]
# 3. Split data into Radar and Archive
today_ts = pd.Timestamp.now().normalize()
radar_cutoff = today_ts - pd.Timedelta(days=1)
radar_df = filtered_df[filtered_df['event_date'] >= radar_cutoff].sort_values(by="event_date", ascending=False)
archive_df = filtered_df[(filtered_df['event_date'] < radar_cutoff) | (filtered_df['event_date'].isna())].sort_values(by="event_date", ascending=False)
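    # Radar keeps anything dated yesterday or later (catching late-night postings);
    # everything older, plus undated items, falls through to the Archive.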
# ---------------------------------------------------------
# MASTER UI TABS
# ---------------------------------------------------------
tab_feed, tab_briefing, tab_trends = st.tabs(["Data Feed", "Daily Summary", "Weekly Trend Analysis"])
# === TAB 1: THE FEED ===
with tab_feed:
feed_tab1, feed_tab2 = st.tabs([f"Radar ({len(radar_df)})", f"Archive ({len(archive_df)})"])
with feed_tab1:
render_event_cards(radar_df)
with feed_tab2:
render_event_cards(archive_df)
# === TAB 2: EXECUTIVE BRIEFING ===
with tab_briefing:
st.subheader("Daily Summary")
st.info("AI briefing is synthesized from the most recent sources currently visible on your Radar.")
if 'exec_briefing' not in st.session_state:
st.session_state.exec_briefing = "Click the button below to generate a high-level briefing."
st.write(st.session_state.exec_briefing)
if os.getenv("HF_TOKEN"):
if st.button("Generate Summary", key="btn_briefing"):
with st.spinner("Qwen2.5-7B-Instruct is providing a summary..."):
briefing_items = radar_df.head(10)
if briefing_items.empty:
briefing_items = filtered_df.head(10)
context = "\n".join([f"• SOURCE: {row['source']} | TITLE: {row['title']} | SUMMARY: {row.get('analysis', 'N/A')}" for _, row in briefing_items.iterrows()])
prompt = f"""
Provide a highly concise, 3-5 paragraph briefing based only on the recent intelligence gathered from the user's Radar tracking system.
Do not include outside information. Cite all sources used in the summary using in-line citations for easy user verification. Do NOT cite dates in line.
Ensure you are synthesizing and summarizing information from across the Radar tracking system, not just the first 1 or 2 entries.
All summaries should be in an understated tone. Do not infer implications or make recommendations.
RADAR INTEL:
{context}
"""
messages = [{"role": "user", "content": prompt}]
try:
briefing_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=os.getenv("HF_TOKEN"))
response = briefing_client.chat_completion(messages, max_tokens=500, temperature=0.2)
st.session_state.exec_briefing = response.choices[0].message.content
st.rerun()
except Exception as e:
st.error(f"Briefing failed: {e}")
# === TAB 3: TREND ANALYSIS ===
with tab_trends:
st.subheader("Semantic Trend Map")
st.markdown("Explore the semantic relationships between this week's AI policy updates. Non-AI related noise is automatically filtered out by the AI classifier.")
if st.button("Generate Weekly Trend Report", key="btn_trends"):
with st.spinner("Analyzing semantic data, abstracting macro-trends, and mapping 2D space... (Takes ~30 seconds)"):
week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
weekly_df = weekly_df.dropna(subset=['embedding'])
if len(weekly_df) < 5:
st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to calculate mathematical trends.")
else:
from sklearn.manifold import TSNE
matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.40, metric='cosine', linkage='complete')
weekly_df['cluster'] = clusterer.fit_predict(matrix)
num_clusters = weekly_df['cluster'].nunique()
if num_clusters > 8:
clusterer = AgglomerativeClustering(n_clusters=8, metric='cosine', linkage='complete')
weekly_df['cluster'] = clusterer.fit_predict(matrix)
num_clusters = 8
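                    # Clustering note: with cosine distance and complete linkage, the
                    # 0.40 threshold merges items whose *least similar* pair is still
                    # within 0.40 cosine distance; if that yields more than 8 themes,
                    # the second pass forces exactly 8 for a readable legend.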
weekly_df['Trend Topic'] = "Uncategorized"
hf_token = os.getenv("HF_TOKEN")
if hf_token:
ui_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=hf_token)
for i in range(num_clusters):
cluster_df = weekly_df[weekly_df['cluster'] == i]
sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
prompt = f"""
You are a highly structured D.C. Tech Policy Taxonomist. Categorize these related article titles into a SINGLE, broad policy or industry bucket.
RULES:
1. MACRO CATEGORIES ONLY: Use 1 to 3 words maximum. Think of these as slide deck section headers.
2. NO HEADLINES: Absolutely NO verbs, NO company names, NO numbers, and NO dollar amounts.
* BAD: "Start-Up Raises $1.3 Billion", "Congress Debates AI Bill"
* GOOD: "Venture Capital", "Legislative Action", "AI Infrastructure"
3. EXAMPLES OF IDEAL LABELS: "AI Infrastructure", "Export Controls", "AI Safety", "Defense & Security", "Consumer Regulation", "Industry Update".
4. FILTER NOISE: If the articles are not about AI, compute, or tech policy, reply EXACTLY with: REJECT.
5. FORMAT: Just the category name. No quotes, no extra text.
UPDATES:
{sample_texts}
"""
messages = [{"role": "user", "content": prompt}]
try:
response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
topic_name = response.choices[0].message.content.strip(' "').upper()
if "REJECT" in topic_name:
weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
else:
weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name.title()
                            except Exception:
                                weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
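                            # Throttle labeling calls between clusters, presumably to
                            # stay within shared Inference API rate limits.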
time.sleep(10)
clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
if not clean_df.empty:
# Run t-SNE mapping
clean_matrix = np.vstack(clean_df['embedding'].apply(json.loads).values)
if len(clean_df) > 1:
safe_perplexity = max(1, min(30, len(clean_df) - 1))
tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
coords = tsne.fit_transform(clean_matrix)
clean_df['x'] = coords[:, 0]
clean_df['y'] = coords[:, 1]
else:
clean_df['x'] = 0
clean_df['y'] = 0
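                        # t-SNE requires perplexity < n_samples, hence the clamp above;
                        # a single point has no neighborhood to embed, so it is pinned
                        # at the origin.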
# Save to session state so it doesn't vanish!
st.session_state['trend_df'] = clean_df
st.session_state['valid_clusters'] = clean_df['cluster'].unique()
else:
st.warning("All data this week was classified as non-AI noise.")
else:
st.error("Hugging Face API token not found.")
# --- Display the Trend Map if it's stored in Memory ---
if 'trend_df' in st.session_state and not st.session_state['trend_df'].empty:
clean_df = st.session_state['trend_df']
st.write(f"### Top AI Trends This Week:")
for cluster_id in st.session_state['valid_clusters']:
cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
topic_label = cluster_subset['Trend Topic'].iloc[0]
st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
st.write("### Semantic Cluster Map")
chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
x=alt.X('x', axis=None),
y=alt.Y('y', axis=None),
color=alt.Color('Trend Topic:N', legend=alt.Legend(title="Macro Trends", orient="bottom")),
tooltip=[
alt.Tooltip('Trend Topic:N', title='Macro Trend'),
alt.Tooltip('title:N', title='Update Title'),
alt.Tooltip('source:N', title='Source'),
alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y')
]
).properties(height=400).interactive()
st.altair_chart(chart, use_container_width=True)
else:
st.warning("Dashboard empty. Run 'Force Manual Sweep' to populate.")