IJ-Reynolds HF Staff committed on
Commit
f44cdfd
·
verified ·
1 Parent(s): 0ea3eac

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +40 -21
streamlit_app.py CHANGED
@@ -10,6 +10,8 @@ from huggingface_hub import InferenceClient
10
  import json
11
  import numpy as np
12
  from sklearn.cluster import KMeans
 
 
13
 
14
  # Create a global lock for file operations
15
  data_lock = threading.Lock()
@@ -248,9 +250,10 @@ st.divider()
248
  ## --- Trend analysis ---
249
 
250
  st.subheader("Weekly AI Trend Analysis")
 
251
 
252
  if st.button("Generate Weekly Trend Report"):
253
- with st.spinner("Clustering data and generating topics..."):
254
  # 1. Filter for the last 7 days
255
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
256
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
@@ -263,30 +266,25 @@ if st.button("Generate Weekly Trend Report"):
263
  else:
264
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
265
 
266
- # 3. Run K-Means Clustering (Let's say we want the top 3 trends)
267
  num_clusters = min(3, len(weekly_df))
268
  kmeans = KMeans(n_clusters=num_clusters, random_state=42)
269
  weekly_df['cluster'] = kmeans.fit_predict(matrix)
270
 
271
- # 4. Have the AI Name the Clusters
272
- st.write("### Top Trends This Week:")
273
 
274
  hf_token = os.getenv("HF_TOKEN")
275
  if hf_token:
276
- # Using the highly reliable, high-limit Llama 3.1 8B!
277
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
278
 
 
279
  for i in range(num_clusters):
280
- # Get the articles in this cluster
281
  cluster_df = weekly_df[weekly_df['cluster'] == i]
282
-
283
- # Grab the top 3 articles closest to the center to give context
284
  sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
285
 
286
- # prompting for trends
287
  prompt = f"""
288
  Analyze these related policy updates and provide a concise, 2-to-4 word title for this trend (e.g., "Export Control Expansion" or "AI Safety Funding").
289
- Use strictly neutral, professional, and objective terminology. Do not use words like "Irrelevant", "Miscellaneous", "Unknown", or "Other".
290
  If the topics are diverse, find the broadest common policy denominator.
291
  Do not include quotes or extra text. Just the title.
292
 
@@ -298,19 +296,40 @@ if st.button("Generate Weekly Trend Report"):
298
  try:
299
  response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.1)
300
  topic_name = response.choices[0].message.content.strip(' "')
301
-
302
- # Display the dynamically named trend and how many articles are in it
303
- st.metric(label=topic_name, value=f"{len(cluster_df)} Updates")
304
-
305
- # Show the articles under the trend
306
- with st.expander("View Updates"):
307
- for _, row in cluster_df.iterrows():
308
- st.markdown(f"- **{row['title']}** ({row['source']})")
309
  except Exception as e:
310
- st.error(f"Failed to name cluster {i}: {e}")
 
311
 
312
- #rate limit
313
- time.sleep(10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  else:
315
  st.error("Hugging Face API token not found. Cannot generate topic names.")
316
 
 
10
  import json
11
  import numpy as np
12
  from sklearn.cluster import KMeans
13
+ import altair as alt
14
+ from sklearn.decomposition import PCA
15
 
16
  # Create a global lock for file operations
17
  data_lock = threading.Lock()
 
250
  ## --- Trend analysis ---
251
 
252
  st.subheader("Weekly AI Trend Analysis")
253
+ st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. Dots clustered closely together share similar policy themes based on AI mathematical analysis.")
254
 
255
  if st.button("Generate Weekly Trend Report"):
256
+ with st.spinner("Analyzing semantic vectors and generating topics... (This takes about 30 seconds to safely process)"):
257
  # 1. Filter for the last 7 days
258
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
259
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
 
266
  else:
267
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
268
 
269
+ # 3. Run K-Means Clustering
270
  num_clusters = min(3, len(weekly_df))
271
  kmeans = KMeans(n_clusters=num_clusters, random_state=42)
272
  weekly_df['cluster'] = kmeans.fit_predict(matrix)
273
 
274
+ weekly_df['Trend Topic'] = "Uncategorized"
 
275
 
276
  hf_token = os.getenv("HF_TOKEN")
277
  if hf_token:
 
278
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
279
 
280
+ # 4. Background Naming Loop (Silent to the user now!)
281
  for i in range(num_clusters):
 
282
  cluster_df = weekly_df[weekly_df['cluster'] == i]
 
 
283
  sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
284
 
 
285
  prompt = f"""
286
  Analyze these related policy updates and provide a concise, 2-to-4 word title for this trend (e.g., "Export Control Expansion" or "AI Safety Funding").
287
+ Use strictly neutral, professional, and objective policy terminology. Do not use words like "Irrelevant", "Miscellaneous", "Unknown", or "Other".
288
  If the topics are diverse, find the broadest common policy denominator.
289
  Do not include quotes or extra text. Just the title.
290
 
 
296
  try:
297
  response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.1)
298
  topic_name = response.choices[0].message.content.strip(' "')
299
+ weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name
 
 
 
 
 
 
 
300
  except Exception as e:
301
+ print(f"Failed to name cluster {i}: {e}")
302
+ weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = f"Trend Cluster {i+1}"
303
 
304
+ time.sleep(10) # Keeping the rate limit safety net
305
+
306
+ # ---------------------------------------------------------
307
+ # 5. THE VISUALIZATION: Interactive Altair Scatter Plot
308
+ # ---------------------------------------------------------
309
+
310
+ # Squash the 384-dimension matrix down to 2 dimensions
311
+ pca = PCA(n_components=2)
312
+ coords = pca.fit_transform(matrix)
313
+
314
+ weekly_df['x'] = coords[:, 0]
315
+ weekly_df['y'] = coords[:, 1]
316
+
317
+ # Build the beautiful, interactive chart
318
+ chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
319
+ x=alt.X('x', axis=None), # Hide the meaningless math axis
320
+ y=alt.Y('y', axis=None), # Hide the meaningless math axis
321
+ color=alt.Color('Trend Topic', legend=alt.Legend(title="Identified Trends", orient="bottom")),
322
+ tooltip=[
323
+ alt.Tooltip('Trend Topic', title='Macro Trend'),
324
+ alt.Tooltip('title', title='Update Title'),
325
+ alt.Tooltip('source', title='Source Agency/Office')
326
+ ]
327
+ ).properties(
328
+ height=400
329
+ ).interactive() # <--- This is what lets the user pan and zoom!
330
+
331
+ st.altair_chart(chart, use_container_width=True)
332
+
333
  else:
334
  st.error("Hugging Face API token not found. Cannot generate topic names.")
335