Spaces:
Running
Running
Update streamlit_app.py
Browse files- streamlit_app.py +45 -11
streamlit_app.py
CHANGED
|
@@ -231,15 +231,27 @@ if active_df is not None and not active_df.empty:
|
|
| 231 |
|
| 232 |
st.divider()
|
| 233 |
|
| 234 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
st.subheader("Weekly AI Trend Analysis")
|
| 236 |
-
st.markdown("Explore the
|
| 237 |
|
| 238 |
if st.button("Generate Weekly Trend Report"):
|
| 239 |
-
with st.spinner("Analyzing semantic data
|
|
|
|
| 240 |
week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
|
| 241 |
weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
|
| 242 |
|
|
|
|
| 243 |
weekly_df = weekly_df.dropna(subset=['embedding'])
|
| 244 |
|
| 245 |
if len(weekly_df) < 5:
|
|
@@ -247,6 +259,7 @@ if st.button("Generate Weekly Trend Report"):
|
|
| 247 |
else:
|
| 248 |
matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
|
| 249 |
|
|
|
|
| 250 |
clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
|
| 251 |
weekly_df['cluster'] = clusterer.fit_predict(matrix)
|
| 252 |
|
|
@@ -262,10 +275,12 @@ if st.button("Generate Weekly Trend Report"):
|
|
| 262 |
if hf_token:
|
| 263 |
ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
|
| 264 |
|
|
|
|
| 265 |
for i in range(num_clusters):
|
| 266 |
cluster_df = weekly_df[weekly_df['cluster'] == i]
|
| 267 |
sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
|
| 268 |
|
|
|
|
| 269 |
prompt = f"""
|
| 270 |
You are a highly analytical D.C. Tech Policy Analyst. Review these article titles.
|
| 271 |
Your goal is to identify the MACRO-LEVEL AI policy, regulatory, or industry trend they represent.
|
|
@@ -293,8 +308,9 @@ if st.button("Generate Weekly Trend Report"):
|
|
| 293 |
print(f"Failed to name cluster {i}: {e}")
|
| 294 |
weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
|
| 295 |
|
| 296 |
-
time.sleep(10)
|
| 297 |
|
|
|
|
| 298 |
clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
|
| 299 |
|
| 300 |
if clean_df.empty:
|
|
@@ -308,20 +324,38 @@ if st.button("Generate Weekly Trend Report"):
|
|
| 308 |
topic_label = cluster_subset['Trend Topic'].iloc[0]
|
| 309 |
st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
|
| 310 |
|
| 311 |
-
|
|
|
|
|
|
|
|
|
|
| 312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
|
| 314 |
-
x=alt.X('
|
| 315 |
-
y=alt.Y('
|
| 316 |
-
color=alt.Color('Trend Topic:N', legend=
|
| 317 |
tooltip=[
|
| 318 |
-
alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y'),
|
| 319 |
alt.Tooltip('Trend Topic:N', title='Macro Trend'),
|
| 320 |
alt.Tooltip('title:N', title='Update Title'),
|
| 321 |
-
alt.Tooltip('source:N', title='Source')
|
|
|
|
| 322 |
]
|
| 323 |
).properties(
|
| 324 |
-
height=
|
| 325 |
).interactive()
|
| 326 |
|
| 327 |
st.altair_chart(chart, use_container_width=True)
|
|
|
|
| 231 |
|
| 232 |
st.divider()
|
| 233 |
|
| 234 |
+
## --- Trend analysis ---
|
| 235 |
+
import altair as alt
|
| 236 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 237 |
+
from sklearn.manifold import TSNE
|
| 238 |
+
import numpy as np
|
| 239 |
+
import json
|
| 240 |
+
import time
|
| 241 |
+
import os
|
| 242 |
+
import pandas as pd
|
| 243 |
+
from huggingface_hub import InferenceClient
|
| 244 |
+
|
| 245 |
st.subheader("Weekly AI Trend Analysis")
|
| 246 |
+
st.markdown("Explore the semantic relationships between this week's AI policy updates. **Hover over any dot** to see the specific article and source. Non-AI related noise is automatically filtered out by the AI classifier, and dots clustered closely together share similar policy themes.")
|
| 247 |
|
| 248 |
if st.button("Generate Weekly Trend Report"):
|
| 249 |
+
with st.spinner("Analyzing semantic data, abstracting macro-trends, and mapping 2D space... (Takes ~30 seconds)"):
|
| 250 |
+
# 1. Filter for the last 7 days
|
| 251 |
week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
|
| 252 |
weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
|
| 253 |
|
| 254 |
+
# 2. Extract embeddings back into numpy arrays
|
| 255 |
weekly_df = weekly_df.dropna(subset=['embedding'])
|
| 256 |
|
| 257 |
if len(weekly_df) < 5:
|
|
|
|
| 259 |
else:
|
| 260 |
matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
|
| 261 |
|
| 262 |
+
# 3. Create clusters (using the high-fidelity tight settings)
|
| 263 |
clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
|
| 264 |
weekly_df['cluster'] = clusterer.fit_predict(matrix)
|
| 265 |
|
|
|
|
| 275 |
if hf_token:
|
| 276 |
ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
|
| 277 |
|
| 278 |
+
# 4. Background Naming & Abstraction Loop
|
| 279 |
for i in range(num_clusters):
|
| 280 |
cluster_df = weekly_df[weekly_df['cluster'] == i]
|
| 281 |
sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
|
| 282 |
|
| 283 |
+
# ---> THE ABSTRACTION PROMPT <---
|
| 284 |
prompt = f"""
|
| 285 |
You are a highly analytical D.C. Tech Policy Analyst. Review these article titles.
|
| 286 |
Your goal is to identify the MACRO-LEVEL AI policy, regulatory, or industry trend they represent.
|
|
|
|
| 308 |
print(f"Failed to name cluster {i}: {e}")
|
| 309 |
weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
|
| 310 |
|
| 311 |
+
time.sleep(10) # API Rate Limit Safety
|
| 312 |
|
| 313 |
+
# ---> PURGE THE NOISE BEFORE VISUALIZING <---
|
| 314 |
clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
|
| 315 |
|
| 316 |
if clean_df.empty:
|
|
|
|
| 324 |
topic_label = cluster_subset['Trend Topic'].iloc[0]
|
| 325 |
st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
|
| 326 |
|
| 327 |
+
# ---------------------------------------------------------
|
| 328 |
+
# 5. THE VISUALIZATION: Dynamic t-SNE Projection on CLEAN DATA
|
| 329 |
+
# ---------------------------------------------------------
|
| 330 |
+
st.write("### Semantic Cluster Map")
|
| 331 |
|
| 332 |
+
# Extract only the clean embeddings for the map
|
| 333 |
+
clean_matrix = np.vstack(clean_df['embedding'].apply(json.loads).values)
|
| 334 |
+
|
| 335 |
+
if len(clean_df) > 1:
|
| 336 |
+
safe_perplexity = max(1, min(30, len(clean_df) - 1))
|
| 337 |
+
tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
|
| 338 |
+
coords = tsne.fit_transform(clean_matrix)
|
| 339 |
+
|
| 340 |
+
clean_df['x'] = coords[:, 0]
|
| 341 |
+
clean_df['y'] = coords[:, 1]
|
| 342 |
+
else:
|
| 343 |
+
# Fallback if only 1 article survives the filter
|
| 344 |
+
clean_df['x'] = 0
|
| 345 |
+
clean_df['y'] = 0
|
| 346 |
+
|
| 347 |
chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
|
| 348 |
+
x=alt.X('x', axis=None),
|
| 349 |
+
y=alt.Y('y', axis=None),
|
| 350 |
+
color=alt.Color('Trend Topic:N', legend=alt.Legend(title="Macro Trends", orient="bottom")),
|
| 351 |
tooltip=[
|
|
|
|
| 352 |
alt.Tooltip('Trend Topic:N', title='Macro Trend'),
|
| 353 |
alt.Tooltip('title:N', title='Update Title'),
|
| 354 |
+
alt.Tooltip('source:N', title='Source'),
|
| 355 |
+
alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y')
|
| 356 |
]
|
| 357 |
).properties(
|
| 358 |
+
height=400
|
| 359 |
).interactive()
|
| 360 |
|
| 361 |
st.altair_chart(chart, use_container_width=True)
|