IJ-Reynolds HF Staff committed on
Commit
3ff209a
·
verified ·
1 Parent(s): 634b6d4

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +5 -176
streamlit_app.py CHANGED
@@ -9,14 +9,8 @@ import os
9
  from huggingface_hub import InferenceClient
10
  import json
11
  import numpy as np
12
- from sklearn.cluster import KMeans
13
- import altair as alt
14
- from sklearn.decomposition import PCA
15
  import altair as alt
16
  from sklearn.cluster import AgglomerativeClustering
17
- from sklearn.manifold import TSNE
18
- from sklearn.metrics import silhouette_score
19
-
20
 
21
  # Create a global lock for file operations
22
  data_lock = threading.Lock()
@@ -40,7 +34,7 @@ def load_data():
40
  return df
41
  return None
42
 
43
- # --- RETENTION POLICY (Chronological Sort Restored) ---
44
  def apply_retention_policy(df):
45
  if df is None or df.empty:
46
  return df
@@ -50,7 +44,6 @@ def apply_retention_policy(df):
50
  # 1. Retention Filtering
51
  leg_df = df[df['type'] == 'Legislation']
52
 
53
- # UPDATED: Added 'Legislative Office Press Release' to the 30-day retention bucket
54
  news_types = ['News/Media', 'Federal/Exec Action', 'Legislative Office Press Release']
55
  news_mask = (df['type'].isin(news_types)) & ((df['event_date'] >= today - pd.Timedelta(days=30)) | df['event_date'].isna())
56
  news_df = df[news_mask]
@@ -101,7 +94,6 @@ start_background_scheduler()
101
  st.set_page_config(page_title="PolicyPilot Intel", layout="wide")
102
  st.title("AI Policy and News Dashboard - ALPHA Version")
103
 
104
- # Onboarding Text
105
  st.markdown("""
106
  Welcome to the **AI Policy and News Dashboard**, an automated platform tracking technology policy developments, legislative movement, and media coverage.
107
 
@@ -123,7 +115,6 @@ To help you scan the chronological timeline quickly, entries are color-coded:
123
  To generate a high-level summary of the most recent data entries, click the **"Generate Briefing"** button below.
124
  """)
125
 
126
- # Warning Notes
127
  st.markdown("""
128
  ---
129
  ### Notes for Users
@@ -131,7 +122,6 @@ st.markdown("""
131
  2. **Work in Progress:** This is an alpha version. Improvements in coverage and AI logic will be made regularly.
132
  """)
133
 
134
- # Architecture Notes
135
  with st.expander("🛠️ Technical Details & Architecture"):
136
  st.markdown("""
137
  * **AI Engine:** Powered by a two-tiered AI pipeline: Llama-3.1-8B-Instruct for initial data processing and Gemma 4 31B for summarization.
@@ -160,7 +150,6 @@ with st.sidebar:
160
  st.divider()
161
  st.header("Manual Override")
162
 
163
- # --- THE ANTI-SPAM COOLDOWN ---
164
  cooldown_minutes = 30
165
  can_sweep = True
166
  time_left = 0
@@ -197,7 +186,6 @@ with st.sidebar:
197
  if active_df is not None and not active_df.empty:
198
  st.subheader("Executive Intel Briefing")
199
 
200
- # NEW: The UI Flag
201
  st.info("AI briefing is synthesized from the most recent sources currently visible on the **Radar** tab.")
202
 
203
  if 'exec_briefing' not in st.session_state:
@@ -208,21 +196,17 @@ if active_df is not None and not active_df.empty:
208
  if st.button("Generate Briefing"):
209
  with st.spinner("Gemma 31B is synthesizing your Radar intelligence..."):
210
 
211
- # Filter down to the Radar logic
212
  temp_df = active_df[active_df['type'].isin(selected_types)] if selected_types else active_df
213
  today_ts = pd.Timestamp.now().normalize()
214
  radar_df = temp_df[temp_df['event_date'] >= today_ts].sort_values(by="event_date", ascending=True)
215
 
216
- # Capture the top 15 items for the Radar context
217
  briefing_items = radar_df.head(15)
218
 
219
- # Safe fallback in case the Radar is completely empty today
220
  if briefing_items.empty:
221
  briefing_items = temp_df.head(20)
222
 
223
  context = "\n".join([f"• SOURCE: {row['source']} | TITLE: {row['title']} | SUMMARY: {row.get('analysis', 'N/A')}" for _, row in briefing_items.iterrows()])
224
 
225
- # NEW: Strict context prompt
226
  prompt = f"""
227
  Provide a highly concise, 3-5 paragraph briefing based only on the recent intelligence gathered from the user's Radar tracking system.
228
 
@@ -238,12 +222,7 @@ if active_df is not None and not active_df.empty:
238
 
239
  messages = [{"role": "user", "content": prompt}]
240
  try:
241
- # ---------------------------------------------------------
242
- # THE TWO-TIER ARCHITECTURE: Dedicated Gemma Client
243
- # ---------------------------------------------------------
244
  gemma_client = InferenceClient("google/gemma-4-31B-it", token=os.getenv("HF_TOKEN"))
245
-
246
- # Max tokens bumped slightly to accommodate the 3-5 paragraphs requested
247
  response = gemma_client.chat_completion(messages, max_tokens=700, temperature=0.2)
248
  st.session_state.exec_briefing = response.choices[0].message.content
249
  st.rerun()
@@ -251,26 +230,16 @@ if active_df is not None and not active_df.empty:
251
  st.error(f"Briefing failed. (model may be loading or hitting tier limits): {e}")
252
 
253
  st.divider()
254
- ## --- Trend analysis ---
255
- import altair as alt
256
- from sklearn.cluster import AgglomerativeClustering
257
- import numpy as np
258
- import json
259
- import time
260
- import os
261
- import pandas as pd
262
- from huggingface_hub import InferenceClient
263
 
 
264
  st.subheader("Weekly AI Trend Analysis")
265
  st.markdown("Explore the timeline of this week's AI policy developments. **Hover over any dot** to see the specific article and source. Non-AI related noise is automatically filtered out by the AI classifier.")
266
 
267
  if st.button("Generate Weekly Trend Report"):
268
  with st.spinner("Analyzing semantic data and abstracting macro-trends... (Takes ~30 seconds)"):
269
- # 1. Filter for the last 7 days
270
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
271
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
272
 
273
- # 2. Extract embeddings back into numpy arrays
274
  weekly_df = weekly_df.dropna(subset=['embedding'])
275
 
276
  if len(weekly_df) < 5:
@@ -278,7 +247,6 @@ if st.button("Generate Weekly Trend Report"):
278
  else:
279
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
280
 
281
- # 3. Create clusters
282
  clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
283
  weekly_df['cluster'] = clusterer.fit_predict(matrix)
284
 
@@ -294,12 +262,10 @@ if st.button("Generate Weekly Trend Report"):
294
  if hf_token:
295
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
296
 
297
- # 4. Background Naming & Abstraction Loop
298
  for i in range(num_clusters):
299
  cluster_df = weekly_df[weekly_df['cluster'] == i]
300
  sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
301
 
302
- # ---> THE ABSTRACTION PROMPT <---
303
  prompt = f"""
304
  You are a highly analytical D.C. Tech Policy Analyst. Review these article titles.
305
  Your goal is to identify the MACRO-LEVEL AI policy, regulatory, or industry trend they represent.
@@ -318,7 +284,6 @@ if st.button("Generate Weekly Trend Report"):
318
  response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
319
  topic_name = response.choices[0].message.content.strip(' "').upper()
320
 
321
- # Catch the rejection or format the title nicely
322
  if "REJECT" in topic_name:
323
  weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
324
  else:
@@ -328,9 +293,8 @@ if st.button("Generate Weekly Trend Report"):
328
  print(f"Failed to name cluster {i}: {e}")
329
  weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
330
 
331
- time.sleep(10) # API Rate Limit Safety
332
 
333
- # ---> PURGE THE NOISE BEFORE VISUALIZING <---
334
  clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
335
 
336
  if clean_df.empty:
@@ -338,16 +302,12 @@ if st.button("Generate Weekly Trend Report"):
338
  else:
339
  st.write(f"### Top AI Trends This Week:")
340
 
341
- # Display the cleaned metrics
342
  valid_clusters = clean_df['cluster'].unique()
343
  for cluster_id in valid_clusters:
344
  cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
345
  topic_label = cluster_subset['Trend Topic'].iloc[0]
346
  st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
347
 
348
- # ---------------------------------------------------------
349
- # 5. THE VISUALIZATION: Analytical Timeline Swarm
350
- # ---------------------------------------------------------
351
  st.write("### Trend Timeline")
352
 
353
  chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
@@ -361,7 +321,7 @@ if st.button("Generate Weekly Trend Report"):
361
  alt.Tooltip('source:N', title='Source')
362
  ]
363
  ).properties(
364
- height=max(300, len(valid_clusters) * 60) # Dynamically sizes the chart height
365
  ).interactive()
366
 
367
  st.altair_chart(chart, use_container_width=True)
@@ -371,138 +331,7 @@ if st.button("Generate Weekly Trend Report"):
371
 
372
  st.divider()
373
 
374
- # ---------------------------------------------------------
375
- # 5. THE VISUALIZATION: Analytical Timeline Swarm
376
- # ---------------------------------------------------------
377
- st.write("### Trend Timeline")
378
-
379
- chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
380
- x=alt.X('event_date:T', title='Date', axis=alt.Axis(format='%b %d', grid=True)),
381
- y=alt.Y('Trend Topic:N', title='', sort='-x', axis=alt.Axis(labelLimit=300)),
382
- color=alt.Color('Trend Topic:N', legend=None),
383
- tooltip=[
384
- alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y'),
385
- alt.Tooltip('Trend Topic:N', title='Macro Trend'),
386
- alt.Tooltip('title:N', title='Update Title'),
387
- alt.Tooltip('source:N', title='Source')
388
- ]
389
- ).properties(
390
- height=max(300, len(valid_clusters) * 60) # Dynamically sizes the chart height
391
- ).interactive()
392
-
393
- st.altair_chart(chart, use_container_width=True)
394
-
395
- else:
396
- st.error("Hugging Face API token not found. Cannot generate topic names.")
397
-
398
- st.divider()
399
-
400
- # ---------------------------------------------------------
401
- # 5. THE VISUALIZATION: Dynamic t-SNE Projection
402
- # ---------------------------------------------------------
403
- safe_perplexity = min(30, len(weekly_df) - 1)
404
-
405
- tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
406
- coords = tsne.fit_transform(matrix)
407
-
408
- weekly_df['x'] = coords[:, 0]
409
- weekly_df['y'] = coords[:, 1]
410
-
411
- chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
412
- x=alt.X('x', axis=None),
413
- y=alt.Y('y', axis=None),
414
- color=alt.Color('Trend Topic', legend=alt.Legend(title="Identified Trends", orient="bottom")),
415
- tooltip=[
416
- alt.Tooltip('Trend Topic', title='Macro Trend'),
417
- alt.Tooltip('title', title='Update Title'),
418
- alt.Tooltip('source', title='Source Agency/Office')
419
- ]
420
- ).properties(
421
- height=400
422
- ).interactive()
423
-
424
- st.altair_chart(chart, use_container_width=True)
425
-
426
- else:
427
- st.error("Hugging Face API token not found. Cannot generate topic names.")
428
-
429
- st.divider()
430
-
431
-
432
  # --- VISUAL CARD RENDERER ---
433
  def render_event_cards(display_df):
434
  if display_df.empty:
435
- st.info("No items match these filters.")
436
- return
437
-
438
- # Visual Mapping for rapid scanning (Updated with Megaphone)
439
- type_icons = {
440
- "Legislation": "🟣",
441
- "Federal/Exec Action": "🟢",
442
- "News/Media": "🔵",
443
- "Schedule/Hearing": "🟠",
444
- "Hearing/Markup": "🟠",
445
- "Legislative Office Press Release": "📣"
446
- }
447
-
448
- color_map = {
449
- "News/Media": "blue",
450
- "Federal/Exec Action": "green",
451
- "Legislation": "violet",
452
- "Schedule/Hearing": "orange",
453
- "Hearing/Markup": "orange",
454
- "Legislative Office Press Release": "red"
455
- }
456
-
457
- for _, row in display_df.iterrows():
458
- dt = row['event_date']
459
- dt_str = dt.strftime('%b %d, %Y') if pd.notnull(dt) else "Date TBD"
460
-
461
- card_type = row['type']
462
- icon = type_icons.get(card_type, "⚪")
463
- color = color_map.get(card_type, "gray")
464
- source = row.get('source', 'Unknown Source')
465
-
466
- raw_title = str(row['title'])
467
- display_title = raw_title[:75] + ("..." if len(raw_title) > 75 else "")
468
-
469
- # Injected {source} directly into the UI header
470
- with st.expander(f"{icon} {dt_str} | {card_type} | {source} | {display_title}"):
471
- col1, col2 = st.columns([3, 1])
472
- with col1:
473
- st.markdown("### Executive Summary")
474
- st.info(row.get('analysis', 'Analysis pending...'))
475
- st.caption(f"**Keywords:** `{row.get('keywords', 'N/A')}`")
476
- with col2:
477
- st.markdown("### Metadata")
478
- st.markdown(f"**Category:** :{color}[{card_type}]")
479
- st.write(f"**Source:** {source}")
480
- st.write(f"**Action:** {row['latest_action']}")
481
- st.link_button("View Source", str(row['link']), use_container_width=True)
482
-
483
- if active_df is not None and not active_df.empty:
484
- if selected_types:
485
- filtered_df = active_df[active_df['type'].isin(selected_types)]
486
- else:
487
- filtered_df = active_df
488
-
489
- search = st.text_input("Search Intel Dashboard...", "")
490
- if search:
491
- mask = filtered_df[['title', 'summary', 'analysis', 'keywords', 'source']].apply(lambda x: x.astype(str).str.contains(search, case=False)).any(axis=1)
492
- filtered_df = filtered_df[mask]
493
-
494
- today_ts = pd.Timestamp.now().normalize()
495
-
496
- radar_cutoff = today_ts - pd.Timedelta(days=1)
497
-
498
- # Radar captures Upcoming + Today + Yesterday. (Sorted newest to oldest)
499
- radar_df = filtered_df[filtered_df['event_date'] >= radar_cutoff].sort_values(by="event_date", ascending=False)
500
-
501
- # Archive catches everything older than the cutoff
502
- archive_df = filtered_df[(filtered_df['event_date'] < radar_cutoff) | (filtered_df['event_date'].isna())].sort_values(by="event_date", ascending=False)
503
-
504
- tab1, tab2 = st.tabs([f"Radar ({len(radar_df)})", f"Archive ({len(archive_df)})"])
505
- with tab1: render_event_cards(radar_df)
506
- with tab2: render_event_cards(archive_df)
507
- else:
508
- st.warning("Dashboard empty. Run 'Force Manual Sweep' to populate.")
 
9
  from huggingface_hub import InferenceClient
10
  import json
11
  import numpy as np
 
 
 
12
  import altair as alt
13
  from sklearn.cluster import AgglomerativeClustering
 
 
 
14
 
15
  # Create a global lock for file operations
16
  data_lock = threading.Lock()
 
34
  return df
35
  return None
36
 
37
+ # --- RETENTION POLICY ---
38
  def apply_retention_policy(df):
39
  if df is None or df.empty:
40
  return df
 
44
  # 1. Retention Filtering
45
  leg_df = df[df['type'] == 'Legislation']
46
 
 
47
  news_types = ['News/Media', 'Federal/Exec Action', 'Legislative Office Press Release']
48
  news_mask = (df['type'].isin(news_types)) & ((df['event_date'] >= today - pd.Timedelta(days=30)) | df['event_date'].isna())
49
  news_df = df[news_mask]
 
94
  st.set_page_config(page_title="PolicyPilot Intel", layout="wide")
95
  st.title("AI Policy and News Dashboard - ALPHA Version")
96
 
 
97
  st.markdown("""
98
  Welcome to the **AI Policy and News Dashboard**, an automated platform tracking technology policy developments, legislative movement, and media coverage.
99
 
 
115
  To generate a high-level summary of the most recent data entries, click the **"Generate Briefing"** button below.
116
  """)
117
 
 
118
  st.markdown("""
119
  ---
120
  ### Notes for Users
 
122
  2. **Work in Progress:** This is an alpha version. Improvements in coverage and AI logic will be made regularly.
123
  """)
124
 
 
125
  with st.expander("🛠️ Technical Details & Architecture"):
126
  st.markdown("""
127
  * **AI Engine:** Powered by a two-tiered AI pipeline: Llama-3.1-8B-Instruct for initial data processing and Gemma 4 31B for summarization.
 
150
  st.divider()
151
  st.header("Manual Override")
152
 
 
153
  cooldown_minutes = 30
154
  can_sweep = True
155
  time_left = 0
 
186
  if active_df is not None and not active_df.empty:
187
  st.subheader("Executive Intel Briefing")
188
 
 
189
  st.info("AI briefing is synthesized from the most recent sources currently visible on the **Radar** tab.")
190
 
191
  if 'exec_briefing' not in st.session_state:
 
196
  if st.button("Generate Briefing"):
197
  with st.spinner("Gemma 31B is synthesizing your Radar intelligence..."):
198
 
 
199
  temp_df = active_df[active_df['type'].isin(selected_types)] if selected_types else active_df
200
  today_ts = pd.Timestamp.now().normalize()
201
  radar_df = temp_df[temp_df['event_date'] >= today_ts].sort_values(by="event_date", ascending=True)
202
 
 
203
  briefing_items = radar_df.head(15)
204
 
 
205
  if briefing_items.empty:
206
  briefing_items = temp_df.head(20)
207
 
208
  context = "\n".join([f"• SOURCE: {row['source']} | TITLE: {row['title']} | SUMMARY: {row.get('analysis', 'N/A')}" for _, row in briefing_items.iterrows()])
209
 
 
210
  prompt = f"""
211
  Provide a highly concise, 3-5 paragraph briefing based only on the recent intelligence gathered from the user's Radar tracking system.
212
 
 
222
 
223
  messages = [{"role": "user", "content": prompt}]
224
  try:
 
 
 
225
  gemma_client = InferenceClient("google/gemma-4-31B-it", token=os.getenv("HF_TOKEN"))
 
 
226
  response = gemma_client.chat_completion(messages, max_tokens=700, temperature=0.2)
227
  st.session_state.exec_briefing = response.choices[0].message.content
228
  st.rerun()
 
230
  st.error(f"Briefing failed. (model may be loading or hitting tier limits): {e}")
231
 
232
  st.divider()
 
 
 
 
 
 
 
 
 
233
 
234
+ # --- TREND ANALYSIS ---
235
  st.subheader("Weekly AI Trend Analysis")
236
  st.markdown("Explore the timeline of this week's AI policy developments. **Hover over any dot** to see the specific article and source. Non-AI related noise is automatically filtered out by the AI classifier.")
237
 
238
  if st.button("Generate Weekly Trend Report"):
239
  with st.spinner("Analyzing semantic data and abstracting macro-trends... (Takes ~30 seconds)"):
 
240
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
241
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
242
 
 
243
  weekly_df = weekly_df.dropna(subset=['embedding'])
244
 
245
  if len(weekly_df) < 5:
 
247
  else:
248
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
249
 
 
250
  clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
251
  weekly_df['cluster'] = clusterer.fit_predict(matrix)
252
 
 
262
  if hf_token:
263
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
264
 
 
265
  for i in range(num_clusters):
266
  cluster_df = weekly_df[weekly_df['cluster'] == i]
267
  sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
268
 
 
269
  prompt = f"""
270
  You are a highly analytical D.C. Tech Policy Analyst. Review these article titles.
271
  Your goal is to identify the MACRO-LEVEL AI policy, regulatory, or industry trend they represent.
 
284
  response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
285
  topic_name = response.choices[0].message.content.strip(' "').upper()
286
 
 
287
  if "REJECT" in topic_name:
288
  weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
289
  else:
 
293
  print(f"Failed to name cluster {i}: {e}")
294
  weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
295
 
296
+ time.sleep(10)
297
 
 
298
  clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
299
 
300
  if clean_df.empty:
 
302
  else:
303
  st.write(f"### Top AI Trends This Week:")
304
 
 
305
  valid_clusters = clean_df['cluster'].unique()
306
  for cluster_id in valid_clusters:
307
  cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
308
  topic_label = cluster_subset['Trend Topic'].iloc[0]
309
  st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
310
 
 
 
 
311
  st.write("### Trend Timeline")
312
 
313
  chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
 
321
  alt.Tooltip('source:N', title='Source')
322
  ]
323
  ).properties(
324
+ height=max(300, len(valid_clusters) * 60)
325
  ).interactive()
326
 
327
  st.altair_chart(chart, use_container_width=True)
 
331
 
332
  st.divider()
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  # --- VISUAL CARD RENDERER ---
335
  def render_event_cards(display_df):
336
  if display_df.empty:
337
+ st.info("No items match these filters.")