Spaces:

IJ-Reynolds
/

AI_Intel_Tracker

Running

App Files Files Community

IJ-Reynolds HF Staff committed 1 day ago

Commit

65035d6

verified ·

1 Parent(s): f44cdfd

Update streamlit_app.py

Browse files

Files changed (1) hide show

streamlit_app.py +41 -23

streamlit_app.py CHANGED Viewed

@@ -12,6 +12,11 @@ import numpy as np
 from sklearn.cluster import KMeans
 import altair as alt
 from sklearn.decomposition import PCA
 # Create a global lock for file operations
 data_lock = threading.Lock()
@@ -250,35 +255,46 @@ st.divider()
 ## --- Trend analysis ---
 st.subheader("Weekly AI Trend Analysis")
-st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. Dots clustered closely together share similar policy themes based on AI mathematical analysis.")
 if st.button("Generate Weekly Trend Report"):
-    with st.spinner("Analyzing semantic vectors and generating topics... (This takes about 30 seconds to safely process)"):
         # 1. Filter for the last 7 days
         week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
         weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
-        # 2. Extract embeddings back into numpy arrays
         weekly_df = weekly_df.dropna(subset=['embedding'])
-        if weekly_df.empty:
-            st.warning("Not enough data with embeddings this week to run trend analysis. Let the scraper run a bit longer!")
         else:
             matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
-            # 3. Run K-Means Clustering
-            num_clusters = min(3, len(weekly_df))
-            kmeans = KMeans(n_clusters=num_clusters, random_state=42)
-            weekly_df['cluster'] = kmeans.fit_predict(matrix)
             weekly_df['Trend Topic'] = "Uncategorized"
             hf_token = os.getenv("HF_TOKEN")
             if hf_token:
                 ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
-                # 4. Background Naming Loop (Silent to the user now!)
-                for i in range(num_clusters):
                     cluster_df = weekly_df[weekly_df['cluster'] == i]
                     sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
@@ -301,24 +317,27 @@ if st.button("Generate Weekly Trend Report"):
                         print(f"Failed to name cluster {i}: {e}")
                         weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = f"Trend Cluster {i+1}"
-                    time.sleep(10) # Keeping the rate limit safety net
                 # ---------------------------------------------------------
-                # 5. THE VISUALIZATION: Interactive Altair Scatter Plot
                 # ---------------------------------------------------------
-                # Squash the 384-dimension matrix down to 2 dimensions
-                pca = PCA(n_components=2)
-                coords = pca.fit_transform(matrix)
                 weekly_df['x'] = coords[:, 0]
                 weekly_df['y'] = coords[:, 1]
-                # Build the beautiful, interactive chart
                 chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
-                    x=alt.X('x', axis=None), # Hide the meaningless math axis
-                    y=alt.Y('y', axis=None), # Hide the meaningless math axis
-                    color=alt.Color('Trend Topic', legend=alt.Legend(title="Identified Trends", orient="bottom")),
                     tooltip=[
                         alt.Tooltip('Trend Topic', title='Macro Trend'),
                         alt.Tooltip('title', title='Update Title'),
@@ -326,7 +345,7 @@ if st.button("Generate Weekly Trend Report"):
                     ]
                 ).properties(
                     height=400
-                ).interactive() # <--- This is what lets the user pan and zoom!
                 st.altair_chart(chart, use_container_width=True)
@@ -335,7 +354,6 @@ if st.button("Generate Weekly Trend Report"):
 st.divider()
 # --- VISUAL CARD RENDERER ---
 def render_event_cards(display_df):
     if display_df.empty:

 from sklearn.cluster import KMeans
 import altair as alt
 from sklearn.decomposition import PCA
+import altair as alt
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.manifold import TSNE
+from sklearn.metrics import silhouette_score
 # Create a global lock for file operations
 data_lock = threading.Lock()
 ## --- Trend analysis ---
 st.subheader("Weekly AI Trend Analysis")
+st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. The math engine automatically determines the number of distinct macro-trends based on semantic density.")
 if st.button("Generate Weekly Trend Report"):
+    with st.spinner("Calculating semantic topography and identifying trends... (Takes ~30 seconds)"):
         # 1. Filter for the last 7 days
         week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
         weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
+        # 2. Extract embeddings
         weekly_df = weekly_df.dropna(subset=['embedding'])
+        if len(weekly_df) < 5:
+            st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to confidently calculate mathematical trends.")
         else:
             matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
+            # Find the optimal number of trends (k): try k = 2..min(5, n - 1) and keep the k with the highest cosine silhouette score
+            max_possible_clusters = min(5, len(weekly_df) - 1)
+            best_k = 2
+            best_score = -1
+            # Test different cluster sizes and let the data pick the best fit
+            if max_possible_clusters > 2:
+                for k in range(2, max_possible_clusters + 1):
+                    test_clusterer = AgglomerativeClustering(n_clusters=k, metric='cosine', linkage='average')
+                    test_labels = test_clusterer.fit_predict(matrix)
+                    score = silhouette_score(matrix, test_labels, metric='cosine')
+                    if score > best_score:
+                        best_score = score
+                        best_k = k
+            # 3. Apply the clustering model
+            clusterer = AgglomerativeClustering(n_clusters=best_k, metric='cosine', linkage='average')
+            weekly_df['cluster'] = clusterer.fit_predict(matrix)
             weekly_df['Trend Topic'] = "Uncategorized"
             hf_token = os.getenv("HF_TOKEN")
             if hf_token:
                 ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
+                # 4. Background Naming Loop
+                for i in range(best_k):
                     cluster_df = weekly_df[weekly_df['cluster'] == i]
                     sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
                         print(f"Failed to name cluster {i}: {e}")
                         weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = f"Trend Cluster {i+1}"
+                    time.sleep(10) # API Rate Limit Safety
                 # ---------------------------------------------------------
+                # Viz: Dynamic t-SNE Projection
                 # ---------------------------------------------------------
+                # t-SNE requires perplexity < number of samples, so cap it when the week has few updates
+                safe_perplexity = min(30, len(weekly_df) - 1)
+                # Project the 384D vectors down to 2D using t-SNE
+                tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
+                coords = tsne.fit_transform(matrix)
                 weekly_df['x'] = coords[:, 0]
                 weekly_df['y'] = coords[:, 1]
+                # Build the Altair chart
                 chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
+                    x=alt.X('x', axis=None),
+                    y=alt.Y('y', axis=None),
+                    color=alt.Color('Trend Topic', legend=alt.Legend(title=f"Top {best_k} Trends Identified", orient="bottom")),
                     tooltip=[
                         alt.Tooltip('Trend Topic', title='Macro Trend'),
                         alt.Tooltip('title', title='Update Title'),
                     ]
                 ).properties(
                     height=400
+                ).interactive()
                 st.altair_chart(chart, use_container_width=True)
 st.divider()
 # --- VISUAL CARD RENDERER ---
 def render_event_cards(display_df):
     if display_df.empty: