IJ-Reynolds HF Staff committed on
Commit
8fb7a85
·
verified ·
1 Parent(s): 65035d6

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +27 -30
streamlit_app.py CHANGED
@@ -253,40 +253,38 @@ if active_df is not None and not active_df.empty:
253
  st.divider()
254
 
255
  ## --- Trend analysis ---
256
-
257
  st.subheader("Weekly AI Trend Analysis")
258
  st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. The math engine automatically determines the number of distinct macro-trends based on semantic density.")
259
 
260
  if st.button("Generate Weekly Trend Report"):
261
- with st.spinner("Calculating semantic topography and identifying trends... (Takes ~30 seconds)"):
262
  # 1. Filter for the last 7 days
263
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
264
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
265
 
266
- # 2. Extract embeddings
267
  weekly_df = weekly_df.dropna(subset=['embedding'])
268
 
269
  if len(weekly_df) < 5:
270
  st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to confidently calculate mathematical trends.")
271
  else:
272
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
273
- # find the optimal number of trends (K)
274
- max_possible_clusters = min(5, len(weekly_df) - 1)
275
- best_k = 2
276
- best_score = -1
277
- # Test different cluster sizes and let the data pick the best fit
278
- if max_possible_clusters > 2:
279
- for k in range(2, max_possible_clusters + 1):
280
- test_clusterer = AgglomerativeClustering(n_clusters=k, metric='cosine', linkage='average')
281
- test_labels = test_clusterer.fit_predict(matrix)
282
- score = silhouette_score(matrix, test_labels, metric='cosine')
283
- if score > best_score:
284
- best_score = score
285
- best_k = k
286
 
287
- # 3. Apply the clustering model
288
- clusterer = AgglomerativeClustering(n_clusters=best_k, metric='cosine', linkage='average')
 
 
 
289
  weekly_df['cluster'] = clusterer.fit_predict(matrix)
 
 
 
 
 
 
 
 
 
290
  weekly_df['Trend Topic'] = "Uncategorized"
291
 
292
  hf_token = os.getenv("HF_TOKEN")
@@ -294,15 +292,17 @@ if st.button("Generate Weekly Trend Report"):
294
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
295
 
296
  # 4. Background Naming Loop
297
- for i in range(best_k):
 
298
  cluster_df = weekly_df[weekly_df['cluster'] == i]
299
- sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
 
 
300
 
301
  prompt = f"""
302
- Analyze these related policy updates and provide a concise, 2-to-4 word title for this trend (e.g., "Export Control Expansion" or "AI Safety Funding").
303
- Use strictly neutral, professional, and objective policy terminology. Do not use words like "Irrelevant", "Miscellaneous", "Unknown", or "Other".
304
- If the topics are diverse, find the broadest common policy denominator.
305
- Do not include quotes or extra text. Just the title.
306
 
307
  UPDATES:
308
  {sample_texts}
@@ -320,24 +320,20 @@ if st.button("Generate Weekly Trend Report"):
320
  time.sleep(10) # API Rate Limit Safety
321
 
322
  # ---------------------------------------------------------
323
- # Viz: Dynamic t-SNE Projection
324
  # ---------------------------------------------------------
325
-
326
- # Prevent Perplexity crash
327
  safe_perplexity = min(30, len(weekly_df) - 1)
328
 
329
- # Unroll the 384D vectors into 2D using t-SNE
330
  tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
331
  coords = tsne.fit_transform(matrix)
332
 
333
  weekly_df['x'] = coords[:, 0]
334
  weekly_df['y'] = coords[:, 1]
335
 
336
- # Build the Altair chart
337
  chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
338
  x=alt.X('x', axis=None),
339
  y=alt.Y('y', axis=None),
340
- color=alt.Color('Trend Topic', legend=alt.Legend(title=f"Top {best_k} Trends Identified", orient="bottom")),
341
  tooltip=[
342
  alt.Tooltip('Trend Topic', title='Macro Trend'),
343
  alt.Tooltip('title', title='Update Title'),
@@ -353,6 +349,7 @@ if st.button("Generate Weekly Trend Report"):
353
  st.error("Hugging Face API token not found. Cannot generate topic names.")
354
 
355
  st.divider()
 
356
 
357
  # --- VISUAL CARD RENDERER ---
358
  def render_event_cards(display_df):
 
253
  st.divider()
254
 
255
  ## --- Trend analysis ---
 
256
  st.subheader("Weekly AI Trend Analysis")
257
  st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. The math engine automatically determines the number of distinct macro-trends based on semantic density.")
258
 
259
  if st.button("Generate Weekly Trend Report"):
260
+ with st.spinner("Calculating high-fidelity semantic topography... (Takes ~30 seconds)"):
261
  # 1. Filter for the last 7 days
262
  week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
263
  weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
264
 
265
+ # 2. Extract embeddings back into numpy arrays
266
  weekly_df = weekly_df.dropna(subset=['embedding'])
267
 
268
  if len(weekly_df) < 5:
269
  st.warning(f"Only {len(weekly_df)} embedded updates found this week. The AI requires at least 5 to confidently calculate mathematical trends.")
270
  else:
271
  matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
+ # ---------------------------------------------------------
274
+ # HIGH FIDELITY FIX: Distance Threshold + Complete Linkage
275
+ # distance_threshold=0.55 forces tight semantic groupings.
276
+ # ---------------------------------------------------------
277
+ clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55, metric='cosine', linkage='complete')
278
  weekly_df['cluster'] = clusterer.fit_predict(matrix)
279
+
280
+ num_clusters = weekly_df['cluster'].nunique()
281
+
282
+ # Guardrail: If it makes too many micro-clusters, cap it at 6
283
+ if num_clusters > 6:
284
+ clusterer = AgglomerativeClustering(n_clusters=6, metric='cosine', linkage='complete')
285
+ weekly_df['cluster'] = clusterer.fit_predict(matrix)
286
+ num_clusters = 6
287
+
288
  weekly_df['Trend Topic'] = "Uncategorized"
289
 
290
  hf_token = os.getenv("HF_TOKEN")
 
292
  ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
293
 
294
  # 4. Background Naming Loop
295
+ st.write(f"### Top {num_clusters} Trends This Week:")
296
+ for i in range(num_clusters):
297
  cluster_df = weekly_df[weekly_df['cluster'] == i]
298
+
299
+ # FIDELITY FIX #2: Feed the AI up to 8 titles instead of just 3!
300
+ sample_texts = "\n".join(cluster_df['title'].head(8).tolist())
301
 
302
  prompt = f"""
303
+ Analyze these related policy updates and provide a highly specific, concise, 2-to-4 word title for this macro-trend (e.g., "Export Control Expansion" or "AI Safety Funding").
304
+ Identify the exact specific policy mechanism happening. Do not use generic words like "Updates", "General", "Various", "Irrelevant", or "Other".
305
+ Do not include quotes. Just the title.
 
306
 
307
  UPDATES:
308
  {sample_texts}
 
320
  time.sleep(10) # API Rate Limit Safety
321
 
322
  # ---------------------------------------------------------
323
+ # 5. THE VISUALIZATION: Dynamic t-SNE Projection
324
  # ---------------------------------------------------------
 
 
325
  safe_perplexity = min(30, len(weekly_df) - 1)
326
 
 
327
  tsne = TSNE(n_components=2, perplexity=safe_perplexity, metric='cosine', random_state=42, init='random')
328
  coords = tsne.fit_transform(matrix)
329
 
330
  weekly_df['x'] = coords[:, 0]
331
  weekly_df['y'] = coords[:, 1]
332
 
 
333
  chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
334
  x=alt.X('x', axis=None),
335
  y=alt.Y('y', axis=None),
336
+ color=alt.Color('Trend Topic', legend=alt.Legend(title="Identified Trends", orient="bottom")),
337
  tooltip=[
338
  alt.Tooltip('Trend Topic', title='Macro Trend'),
339
  alt.Tooltip('title', title='Update Title'),
 
349
  st.error("Hugging Face API token not found. Cannot generate topic names.")
350
 
351
  st.divider()
352
+
353
 
354
  # --- VISUAL CARD RENDERER ---
355
  def render_event_cards(display_df):