Update streamlit_app.py

streamlit_app.py  CHANGED  (+27 -30)
@@ -253,40 +253,38 @@ if active_df is not None and not active_df.empty:
     st.divider()
 
     ## --- Trend analysis ---
-
     st.subheader("Weekly AI Trend Analysis")
     st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. The math engine automatically determines the number of distinct macro-trends based on semantic density.")
 
     if st.button("Generate Weekly Trend Report"):
-        with st.spinner("Calculating semantic topography..."):
+        with st.spinner("Calculating high-fidelity semantic topography... (Takes ~30 seconds)"):
             # 1. Filter for the last 7 days
             week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
             weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
 
-            # 2. Extract embeddings
+            # 2. Extract embeddings back into numpy arrays
            weekly_df = weekly_df.dropna(subset=['embedding'])
 
             if len(weekly_df) < 5:
                 st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to confidently calculate mathematical trends.")
             else:
                 matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
-                # find the optimal number of trends (K)
-                max_possible_clusters = min(5, len(weekly_df) - 1)
-                best_k = 2
-                best_score = -1
-                # Test different cluster sizes and let the data pick the best fit
-                if max_possible_clusters > 2:
-                    for k in range(2, max_possible_clusters + 1):
-                        test_clusterer = AgglomerativeClustering(n_clusters=k, metric='cosine', linkage='average')
-                        test_labels = test_clusterer.fit_predict(matrix)
-                        score = silhouette_score(matrix, test_labels, metric='cosine')
-                        if score > best_score:
-                            best_score = score
-                            best_k = k
 
-                #
-                clusterer = AgglomerativeClustering(n_clusters=best_k, metric='cosine', linkage='average')
+                # ---------------------------------------------------------
+                # HIGH FIDELITY FIX: Distance Threshold + Complete Linkage
+                # distance_threshold=0.55 forces tight semantic groupings.
+                # ---------------------------------------------------------
+                clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
                 weekly_df['cluster'] = clusterer.fit_predict(matrix)
+
+                num_clusters = weekly_df['cluster'].nunique()
+
+                # Guardrail: If it makes too many micro-clusters, cap it at 6
+                if num_clusters > 6:
+                    clusterer = AgglomerativeClustering(n_clusters=6, metric='cosine', linkage='complete')
+                    weekly_df['cluster'] = clusterer.fit_predict(matrix)
+                    num_clusters = 6
+
                 weekly_df['Trend Topic'] = "Uncategorized"
 
                 hf_token = os.getenv("HF_TOKEN")
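Note on the clustering change: this hunk drops the silhouette-scored search for a fixed K and instead passes `n_clusters=None` with a `distance_threshold`, so the number of trends falls out of the data. A minimal standalone sketch of that behavior (synthetic vectors stand in for the app's 384-dimensional embeddings; the 0.55 threshold is the value from the diff):

```python
# Sketch on synthetic data: with n_clusters=None, AgglomerativeClustering
# cuts the dendrogram at distance_threshold, so tight groups stay merged
# and the cluster count is determined by the data, not a fixed K.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.default_rng(42)
centers = rng.normal(size=(3, 384))  # three synthetic "topics"
points = np.vstack([c + 0.05 * rng.normal(size=(10, 384)) for c in centers])

clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55,
                                    metric='cosine', linkage='complete')
labels = clusterer.fit_predict(points)
print(np.unique(labels).size)  # 3: within-topic cosine distances are tiny,
                               # cross-topic distances sit near 1.0 > 0.55
```

Complete linkage makes the threshold a cap on the widest pair inside a cluster, which is why the diff's comment calls the groupings "tight"; average linkage (the removed code's choice) merges more loosely.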
@@ -294,15 +292,17 @@
                     ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
 
                     # 4. Background Naming Loop
-                    for i in range(best_k):
+                    st.write(f"### Top {num_clusters} Trends This Week:")
+                    for i in range(num_clusters):
                         cluster_df = weekly_df[weekly_df['cluster'] == i]
-                        sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
+
+                        # FIDELITY FIX #2: Feed the AI up to 8 titles instead of just 3!
+                        sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
 
                         prompt = f"""
-                        Analyze these related policy updates and provide a concise, 2-to-4 word title for this trend (e.g., "Export Control Expansion" or "AI Safety Funding").
-                        Do not include quotes or extra text. Just the title.
+                        Analyze these related policy updates and provide a highly specific, concise, 2-to-4 word title for this macro-trend (e.g., "Export Control Expansion" or "AI Safety Funding").
+                        Identify the exact specific policy mechanism happening. Do not use generic words like "Updates", "General", "Various", "Irrelevant", or "Other".
+                        Do not include quotes. Just the title.
 
                         UPDATES:
                         {sample_texts}
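The inference call itself sits between this hunk and the next, so it isn't visible here. A hedged sketch of what the per-cluster naming step presumably looks like, reusing `ui_client`, `prompt`, and `i` from the hunk above (the `max_tokens` value and the write-back assignment are assumptions, not part of the diff):

```python
# Hypothetical per-cluster naming call; the actual elided code may differ.
# chat_completion is huggingface_hub's chat API for instruct models.
response = ui_client.chat_completion(
    messages=[{"role": "user", "content": prompt}],
    max_tokens=16,  # a 2-to-4 word title needs very few tokens
)
topic = response.choices[0].message.content.strip()
weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic
```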
@@ -320,24 +320,20 @@ if st.button("Generate Weekly Trend Report"):
                         time.sleep(10) # API Rate Limit Safety
 
                     # ---------------------------------------------------------
-                    #
+                    # 5. THE VISUALIZATION: Dynamic t-SNE Projection
                     # ---------------------------------------------------------
-
-                    # Prevent Perplexity crash
                     safe_perplexity = min(30, len(weekly_df) - 1)
 
-                    # Unroll the 384D vectors into 2D using t-SNE
                     tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
                     coords = tsne.fit_transform(matrix)
 
                     weekly_df['x'] = coords[:, 0]
                     weekly_df['y'] = coords[:, 1]
 
-                    # Build the Altair chart
                     chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
                         x=alt.X('x', axis=None),
                         y=alt.Y('y', axis=None),
-                        color=alt.Color('Trend Topic', legend=alt.Legend(title=
+                        color=alt.Color('Trend Topic', legend=alt.Legend(title="Identified Trends", orient="bottom")),
                         tooltip=[
                             alt.Tooltip('Trend Topic', title='Macro Trend'),
                             alt.Tooltip('title', title='Update Title'),
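On the perplexity guard kept in this hunk: scikit-learn's `TSNE` raises a `ValueError` when `perplexity >= n_samples`, and a slow news week can easily have fewer than the default 30 rows, hence `min(30, len(weekly_df) - 1)`. A standalone sketch (random vectors stand in for the embedding matrix):

```python
# Sketch: the perplexity cap in isolation. TSNE requires
# perplexity < n_samples, so clamp it before projecting.
import numpy as np
from sklearn.manifold import TSNE

matrix = np.random.rand(12, 384)            # stand-in for a small week
safe_perplexity = min(30, len(matrix) - 1)  # 11 here, not the default 30
coords = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine',
              random_state=42, init='random').fit_transform(matrix)
print(coords.shape)  # (12, 2)
```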
@@ -353,6 +349,7 @@ if st.button("Generate Weekly Trend Report"):
                 st.error("Hugging Face API token not found. Cannot generate topic names.")
 
     st.divider()
+
 
     # --- VISUAL CARD RENDERER ---
     def render_event_cards(display_df):
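How `chart` reaches the page is also elided between the last two hunks; in Streamlit the standard call would be something like the following (hypothetical, not shown in the diff):

```python
# Hypothetical render step for the Altair chart built above.
st.altair_chart(chart, use_container_width=True)
```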