IJ-Reynolds HF Staff committed on
Commit
5e8c57b
·
verified ·
1 Parent(s): 3ff209a

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +45 -11
streamlit_app.py CHANGED
@@ -231,15 +231,27 @@ if active_df is not None and not active_df.empty:
231
 
232
  st.divider()
233
 
234
- # --- TREND ANALYSIS ---
 
 
 
 
 
 
 
 
 
 
235
  st.subheader("Weekly AI Trend Analysis")
236
- st.markdown("Explore the timeline of this week's AI policy developments. **Hover over any dot** to see the specific article and source. Non-AI related noise is automatically filtered out by the AI classifier.")
237
 
238
  if st.button("Generate Weekly Trend Report"):
239
- with st.spinner("Analyzing semantic data and abstracting macro-trends... (Takes ~30 seconds)"):
 
240
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
241
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
242
 
 
243
  weekly_df = weekly_df.dropna(subset=['embedding'])
244
 
245
  if len(weekly_df) < 5:
@@ -247,6 +259,7 @@ if st.button("Generate Weekly Trend Report"):
247
  else:
248
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
249
 
 
250
  clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
251
  weekly_df['cluster'] = clusterer.fit_predict(matrix)
252
 
@@ -262,10 +275,12 @@ if st.button("Generate Weekly Trend Report"):
262
  if hf_token:
263
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
264
 
 
265
  for i in range(num_clusters):
266
  cluster_df = weekly_df[weekly_df['cluster'] == i]
267
  sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
268
 
 
269
  prompt = f"""
270
  You are a highly analytical D.C. Tech Policy Analyst. Review these article titles.
271
  Your goal is to identify the MACRO-LEVEL AI policy, regulatory, or industry trend they represent.
@@ -293,8 +308,9 @@ if st.button("Generate Weekly Trend Report"):
293
  print(f"Failed to name cluster {i}: {e}")
294
  weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
295
 
296
- time.sleep(10)
297
 
 
298
  clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
299
 
300
  if clean_df.empty:
@@ -308,20 +324,38 @@ if st.button("Generate Weekly Trend Report"):
308
  topic_label = cluster_subset['Trend Topic'].iloc[0]
309
  st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
310
 
311
- st.write("### Trend Timeline")
 
 
 
312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
314
- x=alt.X('event_date:T', title='Date', axis=alt.Axis(format='%b %d', grid=True)),
315
- y=alt.Y('Trend Topic:N', title='', sort='-x', axis=alt.Axis(labelLimit=300)),
316
- color=alt.Color('Trend Topic:N', legend=None),
317
  tooltip=[
318
- alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y'),
319
  alt.Tooltip('Trend Topic:N', title='Macro Trend'),
320
  alt.Tooltip('title:N', title='Update Title'),
321
- alt.Tooltip('source:N', title='Source')
 
322
  ]
323
  ).properties(
324
- height=max(300, len(valid_clusters) * 60)
325
  ).interactive()
326
 
327
  st.altair_chart(chart, use_container_width=True)
 
231
 
232
  st.divider()
233
 
234
+ ## --- Trend analysis ---
235
+ import altair as alt
236
+ from sklearn.cluster import AgglomerativeClustering
237
+ from sklearn.manifold import TSNE
238
+ import numpy as np
239
+ import json
240
+ import time
241
+ import os
242
+ import pandas as pd
243
+ from huggingface_hub import InferenceClient
244
+
245
  st.subheader("Weekly AI Trend Analysis")
246
+ st.markdown("Explore the semantic relationships between this week's AI policy updates. **Hover over any dot** to see the specific article and source. Non-AI related noise is automatically filtered out by the AI classifier, and dots clustered closely together share similar policy themes.")
247
 
248
  if st.button("Generate Weekly Trend Report"):
249
+ with st.spinner("Analyzing semantic data, abstracting macro-trends, and mapping 2D space... (Takes ~30 seconds)"):
250
+ # 1. Filter for the last 7 days
251
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
252
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
253
 
254
+ # 2. Drop rows with no embedding; the remaining JSON-encoded embeddings are decoded into a NumPy matrix below
255
  weekly_df = weekly_df.dropna(subset=['embedding'])
256
 
257
  if len(weekly_df) < 5:
 
259
  else:
260
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
261
 
262
+ # 3. Cluster the embeddings (complete linkage on cosine distance, cut at a 0.55 distance threshold; no fixed cluster count)
263
  clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
264
  weekly_df['cluster'] = clusterer.fit_predict(matrix)
265
 
 
275
  if hf_token:
276
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
277
 
278
+ # 4. Background Naming & Abstraction Loop
279
  for i in range(num_clusters):
280
  cluster_df = weekly_df[weekly_df['cluster'] == i]
281
  sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
282
 
283
+ # ---> THE ABSTRACTION PROMPT <---
284
  prompt = f"""
285
  You are a highly analytical D.C. Tech Policy Analyst. Review these article titles.
286
  Your goal is to identify the MACRO-LEVEL AI policy, regulatory, or industry trend they represent.
 
308
  print(f"Failed to name cluster {i}: {e}")
309
  weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
310
 
311
+ time.sleep(10) # API Rate Limit Safety
312
 
313
+ # ---> PURGE THE NOISE BEFORE VISUALIZING <---
314
  clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
315
 
316
  if clean_df.empty:
 
324
  topic_label = cluster_subset['Trend Topic'].iloc[0]
325
  st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
326
 
327
+ # ---------------------------------------------------------
328
+ # 5. THE VISUALIZATION: Dynamic t-SNE Projection on CLEAN DATA
329
+ # ---------------------------------------------------------
330
+ st.write("### Semantic Cluster Map")
331
 
332
+ # Extract only the clean embeddings for the map
333
+ clean_matrix = np.vstack(clean_df['embedding'].apply(json.loads).values)
334
+
335
+ if len(clean_df) > 1:
336
+ safe_perplexity = max(1, min(30, len(clean_df) - 1))
337
+ tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
338
+ coords = tsne.fit_transform(clean_matrix)
339
+
340
+ clean_df['x'] = coords[:, 0]
341
+ clean_df['y'] = coords[:, 1]
342
+ else:
343
+ # Fallback if only 1 article survives the filter
344
+ clean_df['x'] = 0
345
+ clean_df['y'] = 0
346
+
347
  chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
348
+ x=alt.X('x', axis=None),
349
+ y=alt.Y('y', axis=None),
350
+ color=alt.Color('Trend Topic:N', legend=alt.Legend(title="Macro Trends", orient="bottom")),
351
  tooltip=[
 
352
  alt.Tooltip('Trend Topic:N', title='Macro Trend'),
353
  alt.Tooltip('title:N', title='Update Title'),
354
+ alt.Tooltip('source:N', title='Source'),
355
+ alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y')
356
  ]
357
  ).properties(
358
+ height=400
359
  ).interactive()
360
 
361
  st.altair_chart(chart, use_container_width=True)