Spaces:

IJ-Reynolds
/

AI_Intel_Tracker

Running

App Files Files Community

IJ-Reynolds HF Staff committed 2 days ago

Commit

f44cdfd

verified ·

1 Parent(s): 0ea3eac

Update streamlit_app.py

Browse files

Files changed (1) hide show

streamlit_app.py +40 -21

streamlit_app.py CHANGED Viewed

@@ -10,6 +10,8 @@ from huggingface_hub import InferenceClient
 import json
 import numpy as np
 from sklearn.cluster import KMeans
 # Create a global lock for file operations
 data_lock = threading.Lock()
@@ -248,9 +250,10 @@ st.divider()
 ## --- Trend analysis ---
 st.subheader("Weekly AI Trend Analysis")
 if st.button("Generate Weekly Trend Report"):
-    with st.spinner("Clustering data and generating topics..."):
         # 1. Filter for the last 7 days
         week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
         weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
@@ -263,30 +266,25 @@ if st.button("Generate Weekly Trend Report"):
         else:
             matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
-            # 3. Run K-Means Clustering (Let's say we want the top 3 trends)
             num_clusters = min(3, len(weekly_df))
             kmeans = KMeans(n_clusters=num_clusters, random_state=42)
             weekly_df['cluster'] = kmeans.fit_predict(matrix)
-            # 4. Have the AI Name the Clusters
-            st.write("### Top Trends This Week:")
             hf_token = os.getenv("HF_TOKEN")
             if hf_token:
-                # Using the highly reliable, high-limit Llama 3.1 8B!
                 ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
                 for i in range(num_clusters):
-                    # Get the articles in this cluster
                     cluster_df = weekly_df[weekly_df['cluster'] == i]
-                    # Grab the top 3 articles closest to the center to give context
                     sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
-                    # prompting for trends
                     prompt = f"""
                     Analyze these related policy updates and provide a concise, 2-to-4 word title for this trend (e.g., "Export Control Expansion" or "AI Safety Funding").
-                    Use strictly neutral, professional, and objective terminology. Do not use words like "Irrelevant", "Miscellaneous", "Unknown", or "Other".
                     If the topics are diverse, find the broadest common policy denominator.
                     Do not include quotes or extra text. Just the title.
@@ -298,19 +296,40 @@ if st.button("Generate Weekly Trend Report"):
                     try:
                         response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.1)
                         topic_name = response.choices[0].message.content.strip(' "')
-                        # Display the dynamically named trend and how many articles are in it
-                        st.metric(label=topic_name, value=f"{len(cluster_df)} Updates")
-                        # Show the articles under the trend
-                        with st.expander("View Updates"):
-                            for _, row in cluster_df.iterrows():
-                                st.markdown(f"- **{row['title']}** ({row['source']})")
                     except Exception as e:
-                        st.error(f"Failed to name cluster {i}: {e}")
-                    #rate limit
-                    time.sleep(10)
             else:
                 st.error("Hugging Face API token not found. Cannot generate topic names.")

 import json
 import numpy as np
 from sklearn.cluster import KMeans
+import altair as alt
+from sklearn.decomposition import PCA
 # Create a global lock for file operations
 data_lock = threading.Lock()
 ## --- Trend analysis ---
 st.subheader("Weekly AI Trend Analysis")
+st.markdown("Explore the semantic relationships between this week's policy updates. **Hover over any dot** to see the specific article and source. Dots clustered closely together share similar policy themes based on AI mathematical analysis.")
 if st.button("Generate Weekly Trend Report"):
+    with st.spinner("Analyzing semantic vectors and generating topics... (This takes about 30 seconds to safely process)"):
         # 1. Filter for the last 7 days
         week_ago = pd.Timestamp.now().normalize() - pd.Timedelta(days=7)
         weekly_df = active_df[active_df['event_date'] >= week_ago].copy()
         else:
             matrix = np.vstack(weekly_df['embedding'].apply(json.loads).values)
+            # 3. Run K-Means Clustering
             num_clusters = min(3, len(weekly_df))
             kmeans = KMeans(n_clusters=num_clusters, random_state=42)
             weekly_df['cluster'] = kmeans.fit_predict(matrix)
+            weekly_df['Trend Topic'] = "Uncategorized"
             hf_token = os.getenv("HF_TOKEN")
             if hf_token:
                 ui_client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=hf_token)
+                # 4. Background Naming Loop (Silent to the user now!)
                 for i in range(num_clusters):
                     cluster_df = weekly_df[weekly_df['cluster'] == i]
                     sample_texts = "\n".join(cluster_df['title'].head(3).tolist())
                     prompt = f"""
                     Analyze these related policy updates and provide a concise, 2-to-4 word title for this trend (e.g., "Export Control Expansion" or "AI Safety Funding").
+                    Use strictly neutral, professional, and objective policy terminology. Do not use words like "Irrelevant", "Miscellaneous", "Unknown", or "Other".
                     If the topics are diverse, find the broadest common policy denominator.
                     Do not include quotes or extra text. Just the title.
                     try:
                         response = ui_client.chat_completion(messages, max_tokens=15, temperature=0.1)
                         topic_name = response.choices[0].message.content.strip(' "')
+                        weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = topic_name
                     except Exception as e:
+                        print(f"Failed to name cluster {i}: {e}")
+                        weekly_df.loc[weekly_df['cluster'] == i, 'Trend Topic'] = f"Trend Cluster {i+1}"
+                    time.sleep(10) # Keeping the rate limit safety net
+                # ---------------------------------------------------------
+                # 5. THE VISUALIZATION: Interactive Altair Scatter Plot
+                # ---------------------------------------------------------
+                # Squash the 384-dimension matrix down to 2 dimensions
+                pca = PCA(n_components=2)
+                coords = pca.fit_transform(matrix)
+                weekly_df['x'] = coords[:, 0]
+                weekly_df['y'] = coords[:, 1]
+                # Build the beautiful, interactive chart
+                chart = alt.Chart(weekly_df).mark_circle(size=120, opacity=0.8).encode(
+                    x=alt.X('x', axis=None), # Hide the meaningless math axis
+                    y=alt.Y('y', axis=None), # Hide the meaningless math axis
+                    color=alt.Color('Trend Topic', legend=alt.Legend(title="Identified Trends", orient="bottom")),
+                    tooltip=[
+                        alt.Tooltip('Trend Topic', title='Macro Trend'),
+                        alt.Tooltip('title', title='Update Title'),
+                        alt.Tooltip('source', title='Source Agency/Office')
+                    ]
+                ).properties(
+                    height=400
+                ).interactive() # <--- This is what lets the user pan and zoom!
+                st.altair_chart(chart, use_container_width=True)
             else:
                 st.error("Hugging Face API token not found. Cannot generate topic names.")