IJ-Reynolds (HF Staff) committed
Commit 09dd029 · verified · 1 Parent(s): 8fb7a85

Update streamlit_app.py

Files changed (1):
  1. streamlit_app.py +69 -24
streamlit_app.py CHANGED
@@ -252,12 +252,13 @@ if active_df is not None and not active_df.empty:
 
     st.divider()
 
-    ## --- Trend analysis ---
+    #Trend Analysis (text categories)
+
     st.subheader("Weekly AI Trend Analysis")
-    st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. The math engine automatically determines the number of distinct macro-trends based on semantic density.")
+    st.markdown("Explore the timeline of this week's AI policy developments. **Hover over any dot** to see the specific article and source. Non-AI related noise is automatically filtered out by the AI classifier.")
 
     if st.button("Generate Weekly Trend Report"):
-        with st.spinner("Calculating high-fidelity semantic topography... (Takes ~30 seconds)"):
+        with st.spinner("Analyzing semantic data and abstracting macro-trends... (Takes ~30 seconds)"):
             # 1. Filter for the last 7 days
             week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
             weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
@@ -266,24 +267,19 @@ if st.button("Generate Weekly Trend Report"):
             weekly_df = weekly_df.dropna(subset=['embedding'])
 
             if len(weekly_df) < 5:
-                st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to confidently calculate mathematical trends.")
+                st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to calculate mathematical trends.")
             else:
                 matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
 
-                # ---------------------------------------------------------
-                # HIGH FIDELITY FIX: Distance Threshold + Complete Linkage
-                # distance_threshold=0.55 forces tight semantic groupings.
-                # ---------------------------------------------------------
+                # 3. Create clusters
                 clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
                 weekly_df['cluster'] = clusterer.fit_predict(matrix)
 
                 num_clusters = weekly_df['cluster'].nunique()
-
-                # Guardrail: If it makes too many micro-clusters, cap it at 6
-                if num_clusters > 6:
-                    clusterer = AgglomerativeClustering(n_clusters=6, metric='cosine', linkage='complete')
+                if num_clusters > 8:
+                    clusterer = AgglomerativeClustering(n_clusters=8, metric='cosine', linkage='complete')
                     weekly_df['cluster'] = clusterer.fit_predict(matrix)
-                    num_clusters = 6
+                    num_clusters = 8
 
                 weekly_df['Trend Topic'] = "Uncategorized"
 
@@ -291,18 +287,20 @@ if st.button("Generate Weekly Trend Report"):
                 if hf_token:
                     ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
 
-                    # 4. Background Naming Loop
-                    st.write(f"### Top {num_clusters} Trends This Week:")
+                    # 4. Background Naming & Abstraction Loop
                     for i in range(num_clusters):
                         cluster_df = weekly_df[weekly_df['cluster'] == i]
-
-                        # FIDELITY FIX #2: Feed the AI up to 8 titles instead of just 3!
                         sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
 
+                        # ---> THE ABSTRACTION PROMPT <---
                         prompt = f"""
-                        Analyze these related policy updates and provide a highly specific, concise, 2-to-4 word title for this macro-trend (e.g., "Export Control Expansion" or "AI Safety Funding").
-                        Identify the exact specific policy mechanism happening. Do not use generic words like "Updates", "General", "Various", "Irrelevant", or "Other".
-                        Do not include quotes. Just the title.
+                        You are a highly analytical D.C. Tech Policy Analyst. Review these article titles.
+                        Your goal is to identify the MACRO-LEVEL AI policy, regulatory, or industry trend they represent.
+
+                        RULES:
+                        1. ABSTRACT UP: If the articles are about specific consumer products (like a dating app), specific niche companies, or localized events, DO NOT name the specific product or company. Name the broader industry trend (e.g., "Commercial AI Deployment", "Consumer Algorithms", "Market Consolidation").
+                        2. FILTER NOISE: If the articles have absolutely nothing to do with AI, algorithms, compute, or tech policy, reply with EXACTLY the word: REJECT.
+                        3. FORMAT: Provide a concise 2-to-4 word title. No quotes, no extra text.
 
                         UPDATES:
                         {sample_texts}
@@ -310,15 +308,62 @@ if st.button("Generate Weekly Trend Report"):
 
                         messages = [{"role": "user", "content": prompt}]
                         try:
-                            response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.1)
-                            topic_name = response.choices[0].message.content.strip(' "')
-                            weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name
+                            response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
+                            topic_name = response.choices[0].message.content.strip(' "').upper()
+
+                            # Catch the rejection or format the title nicely
+                            if "REJECT" in topic_name:
+                                weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
+                            else:
+                                weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name.title()
+
                         except Exception as e:
                             print(f"Failed to name cluster {i}: {e}")
-                            weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = f"Trend Cluster {i+1}"
+                            weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
 
                         time.sleep(10) # API Rate Limit Safety
 
+                    # ---> PURGE THE NOISE BEFORE VISUALIZING <---
+                    clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
+
+                    if clean_df.empty:
+                        st.warning("All data this week was classified as non-AI noise by the analyst model.")
+                    else:
+                        st.write(f"### Top AI Trends This Week:")
+
+                        # Display the cleaned metrics
+                        valid_clusters = clean_df['cluster'].unique()
+                        for cluster_id in valid_clusters:
+                            cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
+                            topic_label = cluster_subset['Trend Topic'].iloc[0]
+                            st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
+
+                        # ---------------------------------------------------------
+                        # 5. THE VISUALIZATION: Analytical Timeline Swarm
+                        # ---------------------------------------------------------
+                        st.write("### Trend Timeline")
+
+                        chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
+                            x=alt.X('event_date:T', title='Date', axis=alt.Axis(format='%b %d', grid=True)),
+                            y=alt.Y('Trend Topic:N', title='', sort='-x', axis=alt.Axis(labelLimit=300)),
+                            color=alt.Color('Trend Topic:N', legend=None),
+                            tooltip=[
+                                alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y'),
+                                alt.Tooltip('Trend Topic:N', title='Macro Trend'),
+                                alt.Tooltip('title:N', title='Update Title'),
+                                alt.Tooltip('source:N', title='Source')
+                            ]
+                        ).properties(
+                            height=max(300, len(valid_clusters) * 60) # Dynamically sizes the chart height
+                        ).interactive()
+
+                        st.altair_chart(chart, use_container_width=True)
+
+                else:
+                    st.error("Hugging Face API token not found. Cannot generate topic names.")
+
+                st.divider()
+
                 # ---------------------------------------------------------
                 # 5. THE VISUALIZATION: Dynamic t-SNE Projection
                 # ---------------------------------------------------------
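For reference, a rough standalone sketch of the cluster → cap → name/REJECT → purge flow this diff wires into the Streamlit app. It is not part of the commit: it uses synthetic embeddings and a stubbed naming function in place of the Llama call, and names such as df and name_cluster are illustrative only.

# Minimal sketch (not from the repository) of the clustering and REJECT-filtering flow above,
# runnable outside Streamlit with synthetic data.
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering  # `metric=` assumes scikit-learn >= 1.2, as in the commit

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "title": [f"Article {i}" for i in range(20)],                 # stand-in article titles
    "embedding": [rng.normal(size=8).tolist() for _ in range(20)],  # stand-in embeddings
})
matrix = np.vstack(df["embedding"].values)

# Same clustering settings as the commit: cosine distance, complete linkage,
# cluster count decided by the 0.55 distance threshold.
clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55,
                                    metric="cosine", linkage="complete")
df["cluster"] = clusterer.fit_predict(matrix)

# Guardrail from the commit: re-fit with a hard cap of 8 if the threshold over-fragments.
if df["cluster"].nunique() > 8:
    clusterer = AgglomerativeClustering(n_clusters=8, metric="cosine", linkage="complete")
    df["cluster"] = clusterer.fit_predict(matrix)

def name_cluster(titles):
    # Stand-in for the chat_completion call; returns "REJECT" for clusters the model
    # would classify as non-AI noise (here: any singleton cluster, purely for illustration).
    return "Sample Trend" if len(titles) > 1 else "REJECT"

df["Trend Topic"] = df.groupby("cluster")["title"].transform(lambda s: name_cluster(list(s)))

# Purge rejected clusters before visualizing, as the commit does with clean_df.
clean_df = df[df["Trend Topic"] != "REJECT"].copy()
print(clean_df.groupby("Trend Topic").size())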