import streamlit as st
import pandas as pd
import main
from pathlib import Path
from datetime import datetime
import threading
import time
import os
from huggingface_hub import InferenceClient
import json
import numpy as np
import altair as alt
from sklearn.cluster import AgglomerativeClustering

# Create a global lock for file operations
data_lock = threading.Lock()

# --- PATHING LOGIC ---
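# Prefer /data when it exists (assumed here to be a persistent volume, e.g. a
# Hugging Face Spaces persistent-storage mount); otherwise keep the CSV next to
# this script.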
if Path("/data").exists():
    CSV_PATH = Path("/data/policy_tracker.csv")
else:
    CSV_PATH = Path(__file__).resolve().parent / "policy_tracker.csv"

# --- DATA LOADING ---
def load_data():
    with data_lock:
        if CSV_PATH.exists():
            df = pd.read_csv(CSV_PATH)
            df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')
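            # Normalize to timezone-naive datetimes so comparisons against naive
            # Timestamps elsewhere in the app don't raise TypeError.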
            if df['event_date'].dt.tz is not None:
                df['event_date'] = df['event_date'].dt.tz_localize(None)
                
            df['date_collected'] = pd.to_datetime(df['date_collected'], errors='coerce')
            return df
        return None

# --- RETENTION POLICY ---
def apply_retention_policy(df):
    if df is None or df.empty:
        return df
        
    today = pd.Timestamp.now().tz_localize(None).normalize()
    
    # 1. Retention Filtering
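    # Per-type retention windows: legislation is kept indefinitely, news-like items
    # for 30 days, schedule items for 60 days. Undated rows (NaT) are always kept.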
    leg_df = df[df['type'] == 'Legislation']
    
    news_types = ['News/Media', 'Federal/Exec Action', 'Legislative Office Press Release']
    news_mask = (df['type'].isin(news_types)) & ((df['event_date'] >= today - pd.Timedelta(days=30)) | df['event_date'].isna())
    news_df = df[news_mask]
    
    sched_types = ['Schedule/Hearing', 'Hearing/Markup']
    sched_mask = (df['type'].isin(sched_types)) & ((df['event_date'] >= today - pd.Timedelta(days=60)) | df['event_date'].isna())
    sched_df = df[sched_mask]
    
    other_df = df[~df['type'].isin(['Legislation'] + news_types + sched_types)]
    
    active_df = pd.concat([leg_df, news_df, sched_df, other_df]).drop_duplicates(subset=['link'])
    
    # 2. Pure Chronological Sorting (Newest to Oldest)
    return active_df.sort_values(by="event_date", ascending=False)

# --- BACKGROUND AUTO-SCHEDULER ---
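# st.cache_resource ensures the daemon thread is created once per server process
# rather than on every Streamlit rerun.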
@st.cache_resource
def start_background_scheduler():
    def background_task():
        while True:
            try:
                needs_run = True
                sleep_time = 1 * 3600  # 1 Hour
                with data_lock:
                    if CSV_PATH.exists():
                        df_check = pd.read_csv(CSV_PATH)
                        if 'date_collected' in df_check.columns and not df_check.empty:
                            last_date = pd.to_datetime(df_check['date_collected']).max()
                            if last_date.tzinfo is not None: last_date = last_date.tz_localize(None)
                            
                            hours_since_last = (datetime.now() - last_date).total_seconds() / 3600
                            if hours_since_last < 1:
                                needs_run = False
                                sleep_time = (1 - hours_since_last) * 3600
                if needs_run:
                    with data_lock: main.run()
                time.sleep(sleep_time)
            except Exception:
                # Back off for an hour so a transient failure doesn't spin the loop.
                time.sleep(3600)
                
    thread = threading.Thread(target=background_task, daemon=True)
    thread.start()
    return thread

start_background_scheduler()

# --- UI SETUP & ONBOARDING ---
st.set_page_config(page_title="PolicyPilot Intel", layout="wide")
st.title("AI Policy and News Dashboard - ALPHA Version")

st.markdown("""
Welcome to the **AI Policy and News Dashboard**, an automated platform tracking technology policy developments, legislative movement, and media coverage.

The portal auto-populates with newly scanned data every hour.

This portal's information is divided into two tabs:
* **Radar (Upcoming & Today's News):** Forward-looking, policy-relevant items (when available) plus daily news updates.
* **Archive (Past):** Historical data on past media coverage, actions from executive agencies and the White House, and legislation from the current Congress.

**How to Filter:** Use the **controls in the left sidebar** to filter by category, or use the search bar below.

### Category Legend
To help you scan the chronological timeline quickly, entries are color-coded:
* 🟣 **Legislation** (Bills, Resolutions)
* 🟒 **Federal / Exec Action** (Agencies, White House)
* πŸ”΅ **News / Media** (Press Coverage)
* 🟠 **Schedule / Hearing** (Committee Meetings, Markups)
* πŸ“£ **Legislative Office Press Release** (Lawmaker Announcements)

To generate a high-level summary of the most recent data entries, click the **"Generate Briefing"** button below.
""")

st.markdown("""
---
### Notes for Users
1. **Verify AI Outputs:** This portal leverages summaries generated by language models. All intelligence should be verified using the links to original sources.
2. **Work in Progress:** This is an alpha version. Improvements in coverage and AI logic will be made regularly.
""")

with st.expander("πŸ› οΈ Technical Details & Architecture"):
    st.markdown("""
    * **AI Engine:** Powered by Qwen/Qwen2.5-7B-Instruct.
    * **Data Sources:** Live API integration with Congress.gov, official federal RSS feeds, and master committee schedules.
    * **Filtering:** Articles and bills are strictly filtered against a hardcoded tech-policy dictionary before the AI reads them.
    * **Data Retention:** News expires from the UI after 30 days, Schedules after 60 days. Legislation is retained for the duration of the current Congress.
    """)

st.divider()

# Load Data
df = load_data()
active_df = apply_retention_policy(df)
selected_types = []
# --- SIDEBAR ---
with st.sidebar:
    st.header("System Status")
    
    last_sync_str = "Pending First Run"
    if df is not None and not df.empty and 'date_collected' in df.columns:
        last_sync_dt = pd.to_datetime(df['date_collected']).max()
        last_sync_str = last_sync_dt.strftime('%b %d, %I:%M %p UTC')
    st.info(f"**Auto-Pilot:** Active (1h Cycle)\n\n**Last Sync:** {last_sync_str}")
    
    st.divider()
    st.header("Manual Override")
    
    cooldown_minutes = 30
    can_sweep = True
    time_left = 0
    
    if df is not None and not df.empty and 'date_collected' in df.columns:
        last_sync_dt = pd.to_datetime(df['date_collected']).max()
        if last_sync_dt.tzinfo is not None: last_sync_dt = last_sync_dt.tz_localize(None)
        mins_since_last = (datetime.now() - last_sync_dt).total_seconds() / 60
        
        if mins_since_last < cooldown_minutes:
            can_sweep = False
            time_left = int(cooldown_minutes - mins_since_last)
            
    if can_sweep:
        if st.button("Force Manual Sweep", use_container_width=True):
            with st.spinner("Scanning Datacenters & Gov Servers..."):
                with data_lock: main.run()
                st.success("Sweep Complete!")
                st.rerun()
    else:
        st.button(f"Sweep on Cooldown ({time_left}m left)", disabled=True, use_container_width=True)
        st.caption("πŸ›‘οΈ *To prevent IP bans from government servers, manual sweeps are limited to once every 15 minutes.*")
        
    st.divider()
    if active_df is not None and not active_df.empty:
        available_types = active_df['type'].dropna().unique().tolist()
        selected_types = st.multiselect("Filter by Category:", options=available_types, default=available_types)
        
        st.divider()
        with data_lock: csv_data = active_df.to_csv(index=False).encode('utf-8')
        st.download_button(label="Download Historical Archive (CSV)", data=csv_data, file_name=f"policy_pilot_archive_{pd.Timestamp.now().strftime('%Y-%m-%d')}.csv", mime="text/csv", use_container_width=True)

# --- VISUAL CARD RENDERER (Defined first so tabs can use it) ---
def render_event_cards(display_df):
    if display_df.empty:
        st.info("No items match these filters.")
        return
    
    type_icons = {
        "Legislation": "🟣",
        "Federal/Exec Action": "🟒",
        "News/Media": "πŸ”΅",
        "Schedule/Hearing": "🟠",
        "Hearing/Markup": "🟠",
        "Legislative Office Press Release": "πŸ“£"
    }
    
    color_map = {
        "News/Media": "blue", 
        "Federal/Exec Action": "green", 
        "Legislation": "violet", 
        "Schedule/Hearing": "orange", 
        "Hearing/Markup": "orange",
        "Legislative Office Press Release": "red"
    }
    
    for _, row in display_df.iterrows():
        dt = row['event_date']
        dt_str = dt.strftime('%b %d, %Y') if pd.notnull(dt) else "Date TBD"
        
        card_type = row['type']
        icon = type_icons.get(card_type, "βšͺ")
        color = color_map.get(card_type, "gray")
        source = row.get('source', 'Unknown Source')
        
        raw_title = str(row['title'])
        display_title = raw_title[:75] + ("..." if len(raw_title) > 75 else "")
        
        with st.expander(f"{icon} {dt_str} | {card_type} | {source} | {display_title}"):
            col1, col2 = st.columns([3, 1])
            with col1:
                st.markdown("### Executive Summary")
                st.info(row.get('analysis', 'Analysis pending...'))
                st.caption(f"**Keywords:** `{row.get('keywords', 'N/A')}`")
            with col2:
                st.markdown("### Metadata")
                st.markdown(f"**Category:** :{color}[{card_type}]")
                st.write(f"**Source:** {source}")
                st.write(f"**Action:** {row['latest_action']}")
                st.link_button("View Source", str(row['link']), use_container_width=True)

# --- GLOBAL DATA FILTERING ---
if active_df is not None and not active_df.empty:
    # 1. Apply Sidebar Filters
    if selected_types:
        filtered_df = active_df[active_df['type'].isin(selected_types)]
    else:
        filtered_df = active_df

    # 2. Global Search Bar (Always visible at the top)
    search = st.text_input("πŸ” Search Intel Dashboard (Filters apply to all tabs)...", "")
    if search:
        # regex=False treats the query as a literal string, so characters like '(' don't raise.
        search_cols = ['title', 'summary', 'analysis', 'keywords', 'source']
        mask = filtered_df[search_cols].apply(lambda x: x.astype(str).str.contains(search, case=False, regex=False)).any(axis=1)
        filtered_df = filtered_df[mask]

    # 3. Split data into Radar and Archive
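    # "Radar" covers anything dated yesterday or later (today's news plus upcoming
    # events); older or undated items fall into the Archive.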
    today_ts = pd.Timestamp.now().normalize()
    radar_cutoff = today_ts - pd.Timedelta(days=1)
    
    radar_df = filtered_df[filtered_df['event_date'] >= radar_cutoff].sort_values(by="event_date", ascending=False)
    archive_df = filtered_df[(filtered_df['event_date'] < radar_cutoff) | (filtered_df['event_date'].isna())].sort_values(by="event_date", ascending=False)

    # ---------------------------------------------------------
    # MASTER UI TABS
    # ---------------------------------------------------------
    tab_feed, tab_briefing, tab_trends = st.tabs(["Data Feed", "Daily Summary", "Weekly Trend Analysis (Pilot)"])

    # === TAB 1: THE FEED ===
    with tab_feed:
        feed_tab1, feed_tab2 = st.tabs([f"Radar ({len(radar_df)})", f"Archive ({len(archive_df)})"])
        with feed_tab1: 
            render_event_cards(radar_df)
        with feed_tab2: 
            render_event_cards(archive_df)

    # === TAB 2: EXECUTIVE BRIEFING ===
    with tab_briefing:
        st.subheader("Daily Summary")
        st.info("AI briefing is synthesized from the most recent sources currently visible on your Radar.")
        
        if 'exec_briefing' not in st.session_state:
            st.session_state.exec_briefing = "Click the button below to generate a high-level briefing."
        st.write(st.session_state.exec_briefing)

        if os.getenv("HF_TOKEN"):
            if st.button("Generate Summary", key="btn_briefing"):
                with st.spinner("Qwen2.5-7B-Instruct is providing a summary..."):
                    briefing_items = radar_df.head(10)
                    if briefing_items.empty:
                        briefing_items = filtered_df.head(10)
                    
                    context = "\n".join([f"β€’ SOURCE: {row['source']} | TITLE: {row['title']} | SUMMARY: {row.get('analysis', 'N/A')}" for _, row in briefing_items.iterrows()])
                    
                    prompt = f"""
                    Provide a highly concise, 3-5 paragraph briefing based only on the recent intelligence gathered from the user's Radar tracking system.
                    Do not include outside information. Cite all sources used in the summary with inline citations for easy user verification. Do NOT cite dates inline.
                    Ensure you are synthesizing and summarizing information from across the Radar tracking system, not just the first one or two entries.
                    All summaries should be in an understated tone. Do not infer implications or make recommendations.
                    RADAR INTEL:
                    {context}
                    """
                    
                    messages = [{"role": "user", "content": prompt}]
                    try:
                        briefing_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=os.getenv("HF_TOKEN"))
                        response = briefing_client.chat_completion(messages, max_tokens=500, temperature=0.2)
                        st.session_state.exec_briefing = response.choices[0].message.content
                        st.rerun()
                    except Exception as e: 
                        st.error(f"Briefing failed: {e}")

    # === TAB 3: TREND ANALYSIS ===
    with tab_trends:
        st.subheader("Semantic Trend Map")
        st.markdown("Explore the semantic relationships between this week's AI policy updates. Non-AI related noise is automatically filtered out by the AI classifier.")

        if st.button("Generate Weekly Trend Report", key="btn_trends"):
            with st.spinner("Analyzing semantic data, abstracting macro-trends, and mapping 2D space... (Takes ~30 seconds)"):
                week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
                weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
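                # 'embedding' is assumed to hold JSON-encoded vectors written by the
                # collector; rows without one cannot be clustered.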
                weekly_df = weekly_df.dropna(subset=['embedding'])
                
                if len(weekly_df) < 5:
                    st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to calculate mathematical trends.")
                else:
                    from sklearn.manifold import TSNE
                    matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
                    
                    # FIX 1: Tightened the distance_threshold to 0.45
                    clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.45, metric='cosine', linkage='complete')
                    weekly_df['cluster'] = clusterer.fit_predict(matrix)
                    
                    num_clusters = weekly_df['cluster'].nunique()
                    if num_clusters > 8:
                        clusterer = AgglomerativeClustering(n_clusters=8, metric='cosine', linkage='complete')
                        weekly_df['cluster'] = clusterer.fit_predict(matrix)
                        num_clusters = 8
                        
                    weekly_df['Trend Topic'] = "Uncategorized" 
                    hf_token = os.getenv("HF_TOKEN")
                    
                    if hf_token:
                        ui_client = InferenceClient("Qwen/Qwen2.5-7B-Instruct", token=hf_token)
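                        # Label each cluster with a short macro-topic; off-topic
                        # clusters come back as "REJECT" and are filtered out below.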
                        for i in range(num_clusters):
                            cluster_df = weekly_df[weekly_df['cluster'] == i]
                            sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
                            
                            prompt = f"""
                            You are a highly structured D.C. Tech Policy Taxonomist. Categorize these related article titles into a SINGLE, broad policy or industry bucket.
                            
                            RULES:
                            1. MACRO CATEGORIES ONLY: Use 1 to 3 words maximum. Think of these as slide deck section headers.
                            2. NO HEADLINES: Absolutely NO verbs, NO company names, NO numbers, and NO dollar amounts. 
                               * BAD: "Start-Up Raises $1.3 Billion", "Congress Debates AI Bill"
                               * GOOD: "Venture Capital", "Legislative Action", "AI Infrastructure"
                            3. EXAMPLES OF IDEAL LABELS: "AI Infrastructure", "Export Controls", "AI Safety", "Defense & Security", "Consumer Regulation", "Industry Update".
                            4. FILTER NOISE: If the articles are not about AI, compute, or tech policy, reply EXACTLY with: REJECT.
                            5. FORMAT: Just the category name. No quotes, no extra text.
                            
                            UPDATES:
                            {sample_texts}
                            """
                            
                            messages = [{"role": "user", "content": prompt}]
                            try:
                                response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
                                topic_name = response.choices[0].message.content.strip(' "').upper()
                                if "REJECT" in topic_name:
                                    weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
                                else:
                                    weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name.title()
                            except Exception:
                                weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
                            
                            # Pause between labeling calls to stay under the hosted inference API's rate limits.
                            time.sleep(10)
                        
                        clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
                        
                        if not clean_df.empty:
                            # Run t-SNE mapping
                            clean_matrix = np.vstack(clean_df['embedding'].apply(json.loads).values)
                            if len(clean_df) > 1:
                                # FIX 2: Lowered perplexity and added n_iter for better small-island separation
                                safe_perplexity = max(2, min(8, len(clean_df) // 4))
                                tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random', n_iter=1000)
                                coords = tsne.fit_transform(clean_matrix)
                                clean_df['x'] = coords[:, 0]
                                clean_df['y'] = coords[:, 1]
                            else:
                                clean_df['x'] = 0
                                clean_df['y'] = 0
                                
                            # Save to session state so it doesn't vanish!
                            st.session_state['trend_df'] = clean_df
                            st.session_state['valid_clusters'] = clean_df['cluster'].unique()
                        else:
                            st.warning("All data this week was classified as non-AI noise.")
                    else:
                        st.error("Hugging Face API token not found.")

        # --- Display the Trend Map if it's stored in Memory ---
        if 'trend_df' in st.session_state and not st.session_state['trend_df'].empty:
            clean_df = st.session_state['trend_df']
            
            st.write(f"### Top AI Trends This Week:")
            for cluster_id in st.session_state['valid_clusters']:
                cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
                topic_label = cluster_subset['Trend Topic'].iloc[0]
                st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
            
            st.write("### Semantic Cluster Map")
            chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
                x=alt.X('x', axis=None), 
                y=alt.Y('y', axis=None), 
                color=alt.Color('Trend Topic:N', legend=alt.Legend(title="Macro Trends", orient="bottom")), 
                tooltip=[
                    alt.Tooltip('Trend Topic:N', title='Macro Trend'),
                    alt.Tooltip('title:N', title='Update Title'),
                    alt.Tooltip('source:N', title='Source'),
                    alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y')
                ]
            ).properties(height=400).interactive()
            
            st.altair_chart(chart, use_container_width=True)

else:
    st.warning("Dashboard empty. Run 'Force Manual Sweep' to populate.")