IJ-Reynolds HF Staff committed on
Commit
65035d6
·
verified ·
1 Parent(s): f44cdfd

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +41 -23
streamlit_app.py CHANGED
@@ -12,6 +12,11 @@ import numpy as np
12
  from sklearn.cluster import KMeans
13
  import altair as alt
14
  from sklearn.decomposition import PCA
 
 
 
 
 
15
 
16
  # Create a global lock for file operations
17
  data_lock = threading.Lock()
@@ -250,35 +255,46 @@ st.divider()
250
  ## --- Trend analysis ---
251
 
252
  st.subheader("Weekly AI Trend Analysis")
253
- st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. Dots clustered closely together share similar policy themes based on AI mathematical analysis.")
254
 
255
  if st.button("Generate Weekly Trend Report"):
256
- with st.spinner("Analyzing semantic vectors and generating topics... (This takes about 30 seconds to safely process)"):
257
  # 1. Filter for the last 7 days
258
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
259
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
260
 
261
- # 2. Extract embeddings back into numpy arrays
262
  weekly_df = weekly_df.dropna(subset=['embedding'])
263
 
264
- if weekly_df.empty:
265
- st.warning("Not enough data with embeddings this week to run trend analysis. Let the scraper run a bit longer!")
266
  else:
267
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
- # 3. Run K-Means Clustering
270
- num_clusters = min(3, len(weekly_df))
271
- kmeans = KMeans(n_clusters=num_clusters, random_state=42)
272
- weekly_df['cluster'] = kmeans.fit_predict(matrix)
273
-
274
  weekly_df['Trend Topic'] = "Uncategorized"
275
 
276
  hf_token = os.getenv("HF_TOKEN")
277
  if hf_token:
278
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
279
 
280
- # 4. Background Naming Loop (Silent to the user now!)
281
- for i in range(num_clusters):
282
  cluster_df = weekly_df[weekly_df['cluster'] == i]
283
  sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
284
 
@@ -301,24 +317,27 @@ if st.button("Generate Weekly Trend Report"):
301
  print(f"Failed to name cluster {i}: {e}")
302
  weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = f"Trend Cluster {i+1}"
303
 
304
- time.sleep(10) # Keeping the rate limit safety net
305
 
306
  # ---------------------------------------------------------
307
- # 5. THE VISUALIZATION: Interactive Altair Scatter Plot
308
  # ---------------------------------------------------------
309
 
310
- # Squash the 384-dimension matrix down to 2 dimensions
311
- pca = PCA(n_components=2)
312
- coords = pca.fit_transform(matrix)
 
 
 
313
 
314
  weekly_df['x'] = coords[:, 0]
315
  weekly_df['y'] = coords[:, 1]
316
 
317
- # Build the beautiful, interactive chart
318
  chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
319
- x=alt.X('x', axis=None), # Hide the meaningless math axis
320
- y=alt.Y('y', axis=None), # Hide the meaningless math axis
321
- color=alt.Color('Trend Topic', legend=alt.Legend(title="Identified Trends", orient="bottom")),
322
  tooltip=[
323
  alt.Tooltip('Trend Topic', title='Macro Trend'),
324
  alt.Tooltip('title', title='Update Title'),
@@ -326,7 +345,7 @@ if st.button("Generate Weekly Trend Report"):
326
  ]
327
  ).properties(
328
  height=400
329
- ).interactive() # <--- This is what lets the user pan and zoom!
330
 
331
  st.altair_chart(chart, use_container_width=True)
332
 
@@ -335,7 +354,6 @@ if st.button("Generate Weekly Trend Report"):
335
 
336
  st.divider()
337
 
338
-
339
  # --- VISUAL CARD RENDERER ---
340
  def render_event_cards(display_df):
341
  if display_df.empty:
 
12
  from sklearn.cluster import KMeans
13
  import altair as alt
14
  from sklearn.decomposition import PCA
15
+ import altair as alt
16
+ from sklearn.cluster import AgglomerativeClustering
17
+ from sklearn.manifold import TSNE
18
+ from sklearn.metrics import silhouette_score
19
+
20
 
21
  # Create a global lock for file operations
22
  data_lock = threading.Lock()
 
255
  ## --- Trend analysis ---
256
 
257
  st.subheader("Weekly AI Trend Analysis")
258
+ st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. The math engine automatically determines the number of distinct macro-trends based on semantic density.")
259
 
260
  if st.button("Generate Weekly Trend Report"):
261
+ with st.spinner("Calculating semantic topography and identifying trends... (Takes ~30 seconds)"):
262
  # 1. Filter for the last 7 days
263
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
264
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
265
 
266
+ # 2. Extract embeddings
267
  weekly_df = weekly_df.dropna(subset=['embedding'])
268
 
269
+ if len(weekly_df) < 5:
270
+ st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to confidently calculate mathematical trends.")
271
  else:
272
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
273
+ # Find the optimal number of trends (K)
274
+ max_possible_clusters = min(5, len(weekly_df) - 1)
275
+ best_k = 2
276
+ best_score = -1
277
+ # Test different cluster sizes and let the data pick the best fit
278
+ if max_possible_clusters > 2:
279
+ for k in range(2, max_possible_clusters + 1):
280
+ test_clusterer = AgglomerativeClustering(n_clusters=k, metric='cosine', linkage='average')
281
+ test_labels = test_clusterer.fit_predict(matrix)
282
+ score = silhouette_score(matrix, test_labels, metric='cosine')
283
+ if score > best_score:
284
+ best_score = score
285
+ best_k = k
286
 
287
+ # 3. Apply the clustering model
288
+ clusterer = AgglomerativeClustering(n_clusters=best_k, metric='cosine', linkage='average')
289
+ weekly_df['cluster'] = clusterer.fit_predict(matrix)
 
 
290
  weekly_df['Trend Topic'] = "Uncategorized"
291
 
292
  hf_token = os.getenv("HF_TOKEN")
293
  if hf_token:
294
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
295
 
296
+ # 4. Background Naming Loop
297
+ for i in range(best_k):
298
  cluster_df = weekly_df[weekly_df['cluster'] == i]
299
  sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
300
 
 
317
  print(f"Failed to name cluster {i}: {e}")
318
  weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = f"Trend Cluster {i+1}"
319
 
320
+ time.sleep(10) # API Rate Limit Safety
321
 
322
  # ---------------------------------------------------------
323
+ # Viz: Dynamic t-SNE Projection
324
  # ---------------------------------------------------------
325
 
326
+ # Prevent a t-SNE crash: perplexity must be smaller than the number of samples
327
+ safe_perplexity = min(30, len(weekly_df) - 1)
328
+
329
+ # Unroll the 384D vectors into 2D using t-SNE
330
+ tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
331
+ coords = tsne.fit_transform(matrix)
332
 
333
  weekly_df['x'] = coords[:, 0]
334
  weekly_df['y'] = coords[:, 1]
335
 
336
+ # Build the Altair chart
337
  chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
338
+ x=alt.X('x', axis=None),
339
+ y=alt.Y('y', axis=None),
340
+ color=alt.Color('Trend Topic', legend=alt.Legend(title=f"Top {best_k} Trends Identified", orient="bottom")),
341
  tooltip=[
342
  alt.Tooltip('Trend Topic', title='Macro Trend'),
343
  alt.Tooltip('title', title='Update Title'),
 
345
  ]
346
  ).properties(
347
  height=400
348
+ ).interactive()
349
 
350
  st.altair_chart(chart, use_container_width=True)
351
 
 
354
 
355
  st.divider()
356
 
 
357
  # --- VISUAL CARD RENDERER ---
358
  def render_event_cards(display_df):
359
  if display_df.empty: