sanjaystarc commited on
Commit
ffa2a6d
·
verified ·
1 Parent(s): 3a72d72

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -170
app.py CHANGED
@@ -1,187 +1,126 @@
1
  import os
2
  import streamlit as st
3
  import pandas as pd
4
- import numpy as np
5
- import requests
6
- import json
7
- import time
8
- import matplotlib.pyplot as plt
9
- import seaborn as sns
10
 
11
# --- CONFIG ---
# The Gemini API key must be provided via environment variables / secrets.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Fail fast with a visible message when the key is absent.
if not GEMINI_API_KEY:
    st.error("❌ Missing Gemini API key. Add it as a secret: GEMINI_API_KEY")
    st.stop()

# REST endpoint base and model identifiers.
GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta"
# Chat model used for structured (JSON) output.
CHAT_MODEL = "gemini-2.5-flash-preview-09-2025"
EMBED_MODEL = "models/embedding-001"

# Response schema the model is forced to follow: a Markdown explanation
# plus an optional block of runnable Python code (empty string when none).
ANALYSIS_SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "reasoning": {
            "type": "STRING",
            "description": "A detailed natural language explanation of the analysis, including key findings and context."
        },
        "code": {
            "type": "STRING",
            "description": "The complete, runnable Python code using pandas (df) and streamlit (st). Use st.pyplot() for plots, and st.dataframe() for resulting DataFrames. If no code is needed, this should be an empty string."
        }
    }
}

# System prompt defining the agent persona, output contract, and coding
# guardrails (Matplotlib sequence, pandas truthiness ambiguity, etc.).
SYSTEM_INSTRUCTION = (
    "You are a world-class Data Analyst Agent. Your task is to analyze the provided DataFrame ('df') "
    "based on the user's question. You MUST respond with a single JSON object conforming to the provided schema. "
    "1. **Reasoning:** Explain your plan, the steps taken, and the insights derived from the data. Format this in Markdown. "
    "2. **Code:** If the question requires calculation, aggregation, or visualization, you MUST generate Python code to execute against the 'df' DataFrame. "
    " - The DataFrame is already loaded as a variable named 'df'. Do NOT redefine it. "
    " - Use Streamlit functions for simple outputs: `st.dataframe(...)`, `st.bar_chart()`, `st.line_chart()`. "
    " - For **ALL** custom, complex plots, you MUST follow this strict Matplotlib sequence: **Start with `plt.figure()`, use `plt.` or `sns.` commands for plotting, and explicitly end with `st.pyplot(plt)`** to display the output. "
    " - **CRITICAL GUARDRAIL:** When generating code that uses logical conditions (e.g., in `if` statements or for complex filters) on Pandas Series or NumPy arrays, you **MUST** resolve ambiguity by using `.any()` or `.all()`. Do NOT compare a series directly to a single boolean value."
    " - Ensure the code is self-contained and ready to execute."
)
51
 
52
# --- Helper Functions ---

def chat_with_gemini(prompt, context):
    """Send a prompt and data context to Gemini and return the parsed
    structured analysis.

    Args:
        prompt: The user's natural-language question.
        context: Textual summary of the DataFrame (columns + first rows).

    Returns:
        dict conforming to ANALYSIS_SCHEMA ('reasoning' and 'code' keys).

    Raises:
        requests.exceptions.RequestException: if the API still fails after
            all retries.
        Exception: if the model response body cannot be parsed.
    """
    # The REST path requires the 'models/' prefix before the model name.
    url = f"{GEMINI_BASE}/models/{CHAT_MODEL}:generateContent?key={GEMINI_API_KEY}"

    # Combine the dataset summary with the question so the model sees both.
    full_prompt = f"Data Context (DataFrame Head and Columns):\n{context}\n\nUser Question: {prompt}"

    payload = {
        "contents": [
            {"parts": [{"text": full_prompt}]}
        ],
        "systemInstruction": {"parts": [{"text": SYSTEM_INSTRUCTION}]},
        # Force structured JSON output matching ANALYSIS_SCHEMA.
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": ANALYSIS_SCHEMA
        }
    }

    # Retry transient failures with exponential backoff.
    max_retries = 5
    delay = 1
    for attempt in range(max_retries):
        try:
            # FIX: add a timeout so a hung connection cannot block the app
            # forever (a Timeout is a RequestException, so it is retried).
            # json= serializes the payload and sets Content-Type for us.
            r = requests.post(url, json=payload, timeout=60)
            r.raise_for_status()
            data = r.json()

            json_str = data["candidates"][0]["content"]["parts"][0]["text"]
            return json.loads(json_str)

        except requests.exceptions.RequestException as e:
            # Network/HTTP errors: back off and retry, doubling the delay.
            if attempt < max_retries - 1:
                time.sleep(delay)
                delay *= 2
            else:
                st.error(f"API Request Failed: {e}")
                raise e
        except Exception as e:
            # Malformed/unexpected response body: surface immediately.
            st.error(f"Failed to parse model response or execute operation: {e}")
            raise e
96
# --- UI ---
st.title("✨Data Analyst Agent (Code Execution Enabled)")
st.write("Upload a CSV file and ask natural language questions. The agent now generates and executes Python code to provide precise data analysis and visualizations.")

# Session-state slot for the active DataFrame, initialized once per session.
if 'df' not in st.session_state:
    st.session_state.df = pd.DataFrame()

uploaded = st.file_uploader("Upload CSV", type=["csv"])

if uploaded:
    # Cache parsing so Streamlit reruns don't re-read the same file.
    @st.cache_data
    def load_data(file):
        """Parse the uploaded CSV; return an empty frame on failure."""
        try:
            return pd.read_csv(file)
        except Exception as e:
            st.error(f"Failed to load CSV: {e}")
            return pd.DataFrame()

    st.session_state.df = load_data(uploaded)

    if not st.session_state.df.empty:
        st.subheader("Data Preview (First 5 Rows)")
        st.dataframe(st.session_state.df.head())

        question = st.text_area("Ask a complex question or request a visualization (e.g., 'Show the average of the 'Sales' column', 'Plot the distribution of 'Age'):")

        if st.button("Analyze & Execute") and question:
            df = st.session_state.df  # Local variable for code execution context

            # Summarize dataset (columns + head) for the LLM context window.
            context = f"Dataset Columns: {', '.join(df.columns.astype(str))}\n\nFirst 5 rows of data:\n{df.head(5).to_string(index=False)}"

            st.markdown("---")
            st.subheader("🤖 Analysis Steps")

            with st.spinner("1. Generating analysis plan and code..."):
                try:
                    # 1. Get structured response (reasoning + code) from LLM.
                    analysis_result = chat_with_gemini(question, context)

                    reasoning = analysis_result.get('reasoning', "No reasoning provided.")
                    code = analysis_result.get('code', "")

                    st.markdown("#### 💬 Reasoning:")
                    st.markdown(reasoning)

                    st.markdown("#### 🐍 Generated Code:")
                    st.code(code, language='python')

                except Exception as e:
                    st.error(f"Step 1 Failed (LLM Interaction): {e}")
                    reasoning = ""
                    code = ""

            if code:
                with st.spinner("2. Executing code and generating output..."):
                    try:
                        # 2. Execute the generated Python code.
                        # SECURITY NOTE: exec() runs model-generated code with
                        # full interpreter privileges; acceptable only in a
                        # trusted demo environment.
                        # FIX: use ONE dict as both globals and locals. The
                        # previous exec(code, globals(), local_scope) form
                        # meant functions defined by the generated code could
                        # not see names it assigned (classic exec pitfall),
                        # and it exposed module globals such as
                        # GEMINI_API_KEY to the generated code.
                        exec_env = {
                            'df': df,
                            'st': st,
                            'pd': pd,
                            'np': np,
                            'plt': plt,
                            'sns': sns,
                        }

                        # Append a neutral statement to the code to prevent implicit Streamlit display of the last value
                        final_code = code + "\nNone"

                        exec(final_code, exec_env)

                        # Explicitly close all Matplotlib figures to prevent cross-run contamination
                        plt.close('all')

                        st.success("Code execution complete. Results are displayed above.")

                    except Exception as e:
                        st.error(f"Step 2 Failed (Code Execution Error): The agent generated invalid code. Check the console for full traceback.")
                        st.exception(e)
            else:
                st.info("No code was generated, as the question was purely informational.")
    else:
        st.info("The uploaded CSV file appears to be empty.")

else:
    st.info("👆 Upload a CSV file to begin the full analysis experience.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import streamlit as st
3
  import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
 
 
 
 
6
 
7
+ # LangChain Imports
8
+ from langchain_google_genai import ChatGoogleGenerativeAI
9
+ from langchain_experimental.agents import create_pandas_dataframe_agent
10
+ from langchain.agents.agent_types import AgentType
11
+ from langchain_community.callbacks.streamlit import StreamlitCallbackHandler
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
# --- CONFIG ---
# Page chrome must be configured before any other Streamlit call runs.
st.set_page_config(
    page_title="Agentic Data Analyst",
    page_icon="📊",
    layout="wide",
)

# Use the API key from environment
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
def main():
    """Streamlit entry point: upload a CSV, then let a LangChain ReAct agent
    analyze it by writing, executing, and self-correcting pandas code."""
    st.title("🤖 Agentic Data Analyst (LangChain + Gemini)")
    st.markdown("""
    This agent uses a **ReAct (Reason + Act)** loop. It doesn't just guess code;
    it executes Python, checks the results, and self-corrects if it encounters errors.
    """)

    # Fail fast when the key is missing rather than erroring mid-analysis.
    if not GEMINI_API_KEY:
        st.error("❌ GEMINI_API_KEY not found in environment variables.")
        st.stop()

    # 1. Initialize the Brain (LLM).
    # Temperature 0 keeps analytical answers deterministic and minimizes
    # "hallucinations".
    try:
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash-preview-09-2025",
            google_api_key=GEMINI_API_KEY,
            temperature=0,
        )
    except Exception as e:
        st.error(f"Failed to initialize LLM: {e}")
        st.stop()

    # 2. File Upload
    uploaded_file = st.file_uploader("Upload your dataset (CSV)", type="csv")

    if uploaded_file:
        # FIX: a malformed CSV previously crashed the whole script with a raw
        # traceback; surface a readable error and stop instead.
        try:
            df = pd.read_csv(uploaded_file)
        except Exception as e:
            st.error(f"Failed to read CSV: {e}")
            st.stop()

        with st.expander("📄 Data Preview & Schema"):
            col1, col2 = st.columns(2)
            with col1:
                st.write("**First 5 Rows:**")
                st.dataframe(df.head())
            with col2:
                st.write("**Column Info:**")
                st.write(df.dtypes)

        # 3. User Input
        query = st.text_area(
            "What would you like to know?",
            placeholder="e.g., 'What is the correlation between age and salary?' or 'Plot a histogram of sales.'"
        )

        if st.button("Run Analysis") and query:
            # 4. Create the Agent.
            # create_pandas_dataframe_agent wraps the dataframe and the
            # python execution tool.
            agent = create_pandas_dataframe_agent(
                llm,
                df,
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                allow_dangerous_code=True,  # Necessary to execute Python code
                handle_parsing_errors=True
            )

            # 5. Execute with Streamlit Callbacks so the user can watch the
            # step-by-step "Thinking" process in the UI.
            st.subheader("🧠 Agent Thought Process")

            # This container shows the step-by-step reasoning trace.
            thought_container = st.container()
            st_callback = StreamlitCallbackHandler(thought_container)

            with st.spinner("Agent is analyzing..."):
                try:
                    # The .run() method triggers the agentic loop.
                    # NOTE(review): .run() is deprecated in newer LangChain
                    # releases in favor of .invoke() — confirm against the
                    # pinned langchain version before migrating.
                    response = agent.run(query, callbacks=[st_callback])

                    st.markdown("---")
                    st.subheader("✅ Final Answer")
                    st.success(response)

                    # Note on Plots:
                    # If the agent uses plt.show(), it might not render in Streamlit.
                    # Standard practice for agents is to ask them to use st.pyplot(plt.gcf())
                    # but the agent often figures out how to display data.

                except Exception as e:
                    st.error(f"The agent encountered a critical error: {e}")
                    st.info("Tip: Try rephrasing your question or checking if column names are clear.")

    else:
        st.info("👆 Please upload a CSV file to begin.")

    # --- Sidebar Credits & Info ---
    with st.sidebar:
        st.header("How it works")
        st.markdown("""
        **1. Thought:** The LLM analyzes your question and the dataframe schema.

        **2. Action:** It writes and executes Python code using `pandas`.

        **3. Observation:** It looks at the output of that code.

        **4. Final Answer:** If the output satisfies the question, it responds. Otherwise, it loops back to step 1.
        """)

        if st.button("Clear Cache"):
            st.cache_data.clear()
            st.rerun()

if __name__ == "__main__":
    main()