IJ-Reynolds HF Staff committed on
Commit
8fb7a85
·
verified ·
1 Parent(s): 65035d6

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +27 -30
streamlit_app.py CHANGED
@@ -253,40 +253,38 @@ if active_df is not None and not active_df.empty:
253
  st.divider()
254
 
255
  ## --- Trend analysis ---
256
-
257
  st.subheader("Weekly AI Trend Analysis")
258
  st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. The math engine automatically determines the number of distinct macro-trends based on semantic density.")
259
 
260
  if st.button("Generate Weekly Trend Report"):
261
- with st.spinner("Calculating semantic topography and identifying trends... (Takes ~30 seconds)"):
262
  # 1. Filter for the last 7 days
263
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
264
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
265
 
266
- # 2. Extract embeddings
267
  weekly_df = weekly_df.dropna(subset=['embedding'])
268
 
269
  if len(weekly_df) < 5:
270
  st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to confidently calculate mathematical trends.")
271
  else:
272
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
273
- # find the optimal number of trends (K)
274
- max_possible_clusters = min(5, len(weekly_df) - 1)
275
- best_k = 2
276
- best_score = -1
277
- # Test different cluster sizes and let the data pick the best fit
278
- if max_possible_clusters > 2:
279
- for k in range(2, max_possible_clusters + 1):
280
- test_clusterer = AgglomerativeClustering(n_clusters=k, metric='cosine', linkage='average')
281
- test_labels = test_clusterer.fit_predict(matrix)
282
- score = silhouette_score(matrix, test_labels, metric='cosine')
283
- if score > best_score:
284
- best_score = score
285
- best_k = k
286
 
287
- # 3. Apply the clustering model
288
- clusterer = AgglomerativeClustering(n_clusters=best_k, metric='cosine', linkage='average')
 
 
 
289
  weekly_df['cluster'] = clusterer.fit_predict(matrix)
 
 
 
 
 
 
 
 
 
290
  weekly_df['Trend Topic'] = "Uncategorized"
291
 
292
  hf_token = os.getenv("HF_TOKEN")
@@ -294,15 +292,17 @@ if st.button("Generate Weekly Trend Report"):
294
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
295
 
296
  # 4. Background Naming Loop
297
- for i in range(best_k):
 
298
  cluster_df = weekly_df[weekly_df['cluster'] == i]
299
- sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
 
 
300
 
301
  prompt = f"""
302
- Analyze these related policy updates and provide a concise, 2-to-4 word title for this trend (e.g., "Export Control Expansion" or "AI Safety Funding").
303
- Use strictly neutral, professional, and objective policy terminology. Do not use words like "Irrelevant", "Miscellaneous", "Unknown", or "Other".
304
- If the topics are diverse, find the broadest common policy denominator.
305
- Do not include quotes or extra text. Just the title.
306
 
307
  UPDATES:
308
  {sample_texts}
@@ -320,24 +320,20 @@ if st.button("Generate Weekly Trend Report"):
320
  time.sleep(10) # API Rate Limit Safety
321
 
322
  # ---------------------------------------------------------
323
- # Viz: Dynamic t-SNE Projection
324
  # ---------------------------------------------------------
325
-
326
- # Prevent Perplexity crash
327
  safe_perplexity = min(30, len(weekly_df) - 1)
328
 
329
- # Unroll the 384D vectors into 2D using t-SNE
330
  tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
331
  coords = tsne.fit_transform(matrix)
332
 
333
  weekly_df['x'] = coords[:, 0]
334
  weekly_df['y'] = coords[:, 1]
335
 
336
- # Build the Altair chart
337
  chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
338
  x=alt.X('x', axis=None),
339
  y=alt.Y('y', axis=None),
340
- color=alt.Color('Trend Topic', legend=alt.Legend(title=f"Top {best_k} Trends Identified", orient="bottom")),
341
  tooltip=[
342
  alt.Tooltip('Trend Topic', title='Macro Trend'),
343
  alt.Tooltip('title', title='Update Title'),
@@ -353,6 +349,7 @@ if st.button("Generate Weekly Trend Report"):
353
  st.error("Hugging Face API token not found. Cannot generate topic names.")
354
 
355
  st.divider()
 
356
 
357
  # --- VISUAL CARD RENDERER ---
358
  def render_event_cards(display_df):
 
253
  st.divider()
254
 
255
  ## --- Trend analysis ---
 
256
  st.subheader("Weekly AI Trend Analysis")
257
  st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. The math engine automatically determines the number of distinct macro-trends based on semantic density.")
258
 
259
  if st.button("Generate Weekly Trend Report"):
260
+ with st.spinner("Calculating high-fidelity semantic topography... (Takes ~30 seconds)"):
261
  # 1. Filter for the last 7 days
262
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
263
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
264
 
265
+ # 2. Extract embeddings back into numpy arrays
266
  weekly_df = weekly_df.dropna(subset=['embedding'])
267
 
268
  if len(weekly_df) < 5:
269
  st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to confidently calculate mathematical trends.")
270
  else:
271
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
+ # ---------------------------------------------------------
274
+ # HIGH FIDELITY FIX: Distance Threshold + Complete Linkage
275
+ # distance_threshold=0.55 forces tight semantic groupings.
276
+ # ---------------------------------------------------------
277
+ clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
278
  weekly_df['cluster'] = clusterer.fit_predict(matrix)
279
+
280
+ num_clusters = weekly_df['cluster'].nunique()
281
+
282
+ # Guardrail: If it makes too many micro-clusters, cap it at 6
283
+ if num_clusters > 6:
284
+ clusterer = AgglomerativeClustering(n_clusters=6, metric='cosine', linkage='complete')
285
+ weekly_df['cluster'] = clusterer.fit_predict(matrix)
286
+ num_clusters = 6
287
+
288
  weekly_df['Trend Topic'] = "Uncategorized"
289
 
290
  hf_token = os.getenv("HF_TOKEN")
 
292
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
293
 
294
  # 4. Background Naming Loop
295
+ st.write(f"### Top {num_clusters} Trends This Week:")
296
+ for i in range(num_clusters):
297
  cluster_df = weekly_df[weekly_df['cluster'] == i]
298
+
299
+ # FIDELITY FIX #2: Feed the AI up to 8 titles instead of just 3!
300
+ sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
301
 
302
  prompt = f"""
303
+ Analyze these related policy updates and provide a highly specific, concise, 2-to-4 word title for this macro-trend (e.g., "Export Control Expansion" or "AI Safety Funding").
304
+ Identify the exact specific policy mechanism happening. Do not use generic words like "Updates", "General", "Various", "Irrelevant", or "Other".
305
+ Do not include quotes. Just the title.
 
306
 
307
  UPDATES:
308
  {sample_texts}
 
320
  time.sleep(10) # API Rate Limit Safety
321
 
322
  # ---------------------------------------------------------
323
+ # 5. THE VISUALIZATION: Dynamic t-SNE Projection
324
  # ---------------------------------------------------------
 
 
325
  safe_perplexity = min(30, len(weekly_df) - 1)
326
 
 
327
  tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
328
  coords = tsne.fit_transform(matrix)
329
 
330
  weekly_df['x'] = coords[:, 0]
331
  weekly_df['y'] = coords[:, 1]
332
 
 
333
  chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
334
  x=alt.X('x', axis=None),
335
  y=alt.Y('y', axis=None),
336
+ color=alt.Color('Trend Topic', legend=alt.Legend(title="Identified Trends", orient="bottom")),
337
  tooltip=[
338
  alt.Tooltip('Trend Topic', title='Macro Trend'),
339
  alt.Tooltip('title', title='Update Title'),
 
349
  st.error("Hugging Face API token not found. Cannot generate topic names.")
350
 
351
  st.divider()
352
+
353
 
354
  # --- VISUAL CARD RENDERER ---
355
  def render_event_cards(display_df):