Update streamlit_app.py
streamlit_app.py  +69 -24  CHANGED
@@ -252,12 +252,13 @@ if active_df is not None and not active_df.empty:

st.divider()

- # …
st.subheader("Weekly AI Trend Analysis")
- st.markdown("Explore the …

if st.button("Generate Weekly Trend Report"):
-     with st.spinner("…
        # 1. Filter for the last 7 days
        week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
        weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
@@ -266,24 +267,19 @@ if st.button("Generate Weekly Trend Report"):
        weekly_df = weekly_df.dropna(subset=['embedding'])

        if len(weekly_df) < 5:
-             st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to …
        else:
            matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)

-             # …
-             # HIGH FIDELITY FIX: Distance Threshold + Complete Linkage
-             # distance_threshold=0.55 forces tight semantic groupings.
-             # ---------------------------------------------------------
            clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
            weekly_df['cluster'] = clusterer.fit_predict(matrix)

            num_clusters = weekly_df['cluster'].nunique()
-
-
-             if num_clusters > 6:
-                 clusterer = AgglomerativeClustering(n_clusters=6, metric='cosine', linkage='complete')
                weekly_df['cluster'] = clusterer.fit_predict(matrix)
-                 num_clusters = …

            weekly_df['Trend Topic'] = "Uncategorized"

@@ -291,18 +287,20 @@ if st.button("Generate Weekly Trend Report"):
            if hf_token:
                ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)

-                 # 4. Background Naming Loop
-                 st.write(f"### Top {num_clusters} Trends This Week:")
                for i in range(num_clusters):
                    cluster_df = weekly_df[weekly_df['cluster'] == i]
-
-                     # FIDELITY FIX #2: Feed the AI up to 8 titles instead of just 3!
                    sample_texts = "\n".join(cluster_df['title'].head(8).tolist())

                    prompt = f"""
- …
- …
- …

UPDATES:
{sample_texts}
@@ -310,15 +308,62 @@ if st.button("Generate Weekly Trend Report"):

                    messages = [{"role": "user", "content": prompt}]
                    try:
-                         response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.…
-                         topic_name = response.choices[0].message.content.strip(' "')
- …
                    except Exception as e:
                        print(f"Failed to name cluster {i}: {e}")
-                         weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = …

                    time.sleep(10) # API Rate Limit Safety

# ---------------------------------------------------------
# 5. THE VISUALIZATION: Dynamic t-SNE Projection
# ---------------------------------------------------------

st.divider()

+ #Trend Analysis (text categories)
+
st.subheader("Weekly AI Trend Analysis")
+ st.markdown("Explore the timeline of this week's AI policy developments. **Hover over any dot** to see the specific article and source. Non-AI related noise is automatically filtered out by the AI classifier.")

if st.button("Generate Weekly Trend Report"):
+     with st.spinner("Analyzing semantic data and abstracting macro-trends... (Takes ~30 seconds)"):
        # 1. Filter for the last 7 days
        week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
        weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
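
For reference, here is a minimal, self-contained sketch of how that 7-day cutoff behaves. The demo frame and its values are hypothetical stand-ins for active_df; the comparison assumes event_date is already a datetime column, as the filter above requires.

    import pandas as pd

    # Hypothetical stand-in for active_df; 'event_date' must be datetime64,
    # otherwise comparing it against a Timestamp raises a TypeError.
    demo = pd.DataFrame({
        "title": ["old item", "recent item"],
        "event_date": pd.to_datetime(["2000-01-01", pd.Timestamp.now().normalize()]),
    })
    week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
    print(demo[demo["event_date"] >= week_ago])  # keeps only the recent row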

        weekly_df = weekly_df.dropna(subset=['embedding'])

        if len(weekly_df) < 5:
+             st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to calculate mathematical trends.")
        else:
            matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)

+             # 3. Create clusters
            clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
            weekly_df['cluster'] = clusterer.fit_predict(matrix)

            num_clusters = weekly_df['cluster'].nunique()
+             if num_clusters > 8:
+                 clusterer = AgglomerativeClustering(n_clusters=8, metric='cosine', linkage='complete')
                weekly_df['cluster'] = clusterer.fit_predict(matrix)
+                 num_clusters = 8

            weekly_df['Trend Topic'] = "Uncategorized"

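
To see what the threshold-based clustering step does, here is a small sketch with made-up embeddings in place of the real ones. It mirrors the same pipeline shape (JSON-encoded vectors, np.vstack, cosine distance, complete linkage); the vectors themselves are invented.

    import json
    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    # Made-up embeddings stored as JSON strings, like the app's 'embedding' column.
    rows = ['[1.0, 0.02, 0.0]', '[0.99, 0.0, 0.01]',
            '[0.0, 1.0, 0.03]', '[0.02, 0.98, 0.0]', '[0.01, 1.0, 0.0]']
    matrix = np.vstack([json.loads(r) for r in rows])

    # With n_clusters=None the distance threshold decides how many clusters emerge:
    # complete linkage only merges groups whose farthest pair stays within
    # cosine distance 0.55, so loosely related items remain in separate clusters.
    clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55,
                                        metric='cosine', linkage='complete')
    labels = clusterer.fit_predict(matrix)
    print(labels, len(set(labels)))  # two clusters for this toy data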

            if hf_token:
                ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)

+                 # 4. Background Naming & Abstraction Loop
                for i in range(num_clusters):
                    cluster_df = weekly_df[weekly_df['cluster'] == i]
                    sample_texts = "\n".join(cluster_df['title'].head(8).tolist())

+                     # ---> THE ABSTRACTION PROMPT <---
                    prompt = f"""
+ You are a highly analytical D.C. Tech Policy Analyst. Review these article titles.
+ Your goal is to identify the MACRO-LEVEL AI policy, regulatory, or industry trend they represent.
+
+ RULES:
+ 1. ABSTRACT UP: If the articles are about specific consumer products (like a dating app), specific niche companies, or localized events, DO NOT name the specific product or company. Name the broader industry trend (e.g., "Commercial AI Deployment", "Consumer Algorithms", "Market Consolidation").
+ 2. FILTER NOISE: If the articles have absolutely nothing to do with AI, algorithms, compute, or tech policy, reply with EXACTLY the word: REJECT.
+ 3. FORMAT: Provide a concise 2-to-4 word title. No quotes, no extra text.

UPDATES:
{sample_texts}

                    messages = [{"role": "user", "content": prompt}]
                    try:
+                         response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.0)
+                         topic_name = response.choices[0].message.content.strip(' "').upper()
+
+                         # Catch the rejection or format the title nicely
+                         if "REJECT" in topic_name:
+                             weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"
+                         else:
+                             weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name.title()
+
                    except Exception as e:
                        print(f"Failed to name cluster {i}: {e}")
+                         weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = "REJECT"

                    time.sleep(10) # API Rate Limit Safety

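
An offline sketch of the parsing logic inside that try block. The raw string is a hypothetical model completion; no API call is made.

    # Hypothetical raw completion: strip stray quotes/spaces, upper-case for the
    # REJECT check, then Title-Case for display (note .title() turns "AI" into "Ai").
    raw = ' "State AI Preemption Push" '
    topic_name = raw.strip(' "').upper()
    label = "REJECT" if "REJECT" in topic_name else topic_name.title()
    print(label)  # -> State Ai Preemption Push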
+                 # ---> PURGE THE NOISE BEFORE VISUALIZING <---
+                 clean_df = weekly_df[weekly_df['Trend Topic'] != "REJECT"].copy()
+
+                 if clean_df.empty:
+                     st.warning("All data this week was classified as non-AI noise by the analyst model.")
+                 else:
+                     st.write(f"### Top AI Trends This Week:")
+
+                     # Display the cleaned metrics
+                     valid_clusters = clean_df['cluster'].unique()
+                     for cluster_id in valid_clusters:
+                         cluster_subset = clean_df[clean_df['cluster'] == cluster_id]
+                         topic_label = cluster_subset['Trend Topic'].iloc[0]
+                         st.metric(label=topic_label, value=f"{len(cluster_subset)} Updates")
+
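
The values shown by those st.metric calls are simply the sizes of the surviving clusters; the same counts can be read off with value_counts, as in this tiny sketch with invented rows.

    import pandas as pd

    clean_demo = pd.DataFrame({
        "Trend Topic": ["Compute Policy", "Compute Policy", "Open Model Releases"],
        "cluster": [0, 0, 2],
    })
    print(clean_demo["Trend Topic"].value_counts())
    # expected counts: Compute Policy 2, Open Model Releases 1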
+                     # ---------------------------------------------------------
+                     # 5. THE VISUALIZATION: Analytical Timeline Swarm
+                     # ---------------------------------------------------------
+                     st.write("### Trend Timeline")
+
+                     chart = alt.Chart(clean_df).mark_circle(size=150, opacity=0.8).encode(
+                         x=alt.X('event_date:T', title='Date', axis=alt.Axis(format='%b %d', grid=True)),
+                         y=alt.Y('Trend Topic:N', title='', sort='-x', axis=alt.Axis(labelLimit=300)),
+                         color=alt.Color('Trend Topic:N', legend=None),
+                         tooltip=[
+                             alt.Tooltip('event_date:T', title='Date', format='%b %d, %Y'),
+                             alt.Tooltip('Trend Topic:N', title='Macro Trend'),
+                             alt.Tooltip('title:N', title='Update Title'),
+                             alt.Tooltip('source:N', title='Source')
+                         ]
+                     ).properties(
+                         height=max(300, len(valid_clusters) * 60)  # Dynamically sizes the chart height
+                     ).interactive()
+
+                     st.altair_chart(chart, use_container_width=True)
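
A self-contained sketch of the same timeline-swarm encoding on made-up rows, runnable outside Streamlit (chart.save stands in for st.altair_chart; the column names mirror the ones the app uses).

    import altair as alt
    import pandas as pd

    toy = pd.DataFrame({
        "event_date": pd.to_datetime(["2024-05-01", "2024-05-02", "2024-05-02"]),
        "Trend Topic": ["Compute Policy", "Compute Policy", "Open Model Releases"],
        "title": ["t1", "t2", "t3"],
        "source": ["s1", "s2", "s3"],
    })
    chart = alt.Chart(toy).mark_circle(size=150, opacity=0.8).encode(
        x=alt.X("event_date:T", title="Date"),
        y=alt.Y("Trend Topic:N", title="", sort="-x"),
        color=alt.Color("Trend Topic:N", legend=None),
        tooltip=["title:N", "source:N"],
    ).interactive()
    chart.save("trend_timeline.html")  # one dot per update, one row per macro trend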
+
+             else:
+                 st.error("Hugging Face API token not found. Cannot generate topic names.")
+
+ st.divider()
+
# ---------------------------------------------------------
# 5. THE VISUALIZATION: Dynamic t-SNE Projection
# ---------------------------------------------------------