Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,187 +1,126 @@
|
|
| 1 |
import os
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
-
import
|
| 5 |
-
import
|
| 6 |
-
import json
|
| 7 |
-
import time
|
| 8 |
-
import matplotlib.pyplot as plt
|
| 9 |
-
import seaborn as sns
|
| 10 |
|
| 11 |
-
#
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
st.error("❌ Missing Gemini API key. Add it as a secret: GEMINI_API_KEY")
|
| 17 |
-
st.stop()
|
| 18 |
-
|
| 19 |
-
# Define API endpoints and models
|
| 20 |
-
GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta"
|
| 21 |
-
# Using the correct model for structured output
|
| 22 |
-
CHAT_MODEL = "gemini-2.5-flash-preview-09-2025"
|
| 23 |
-
EMBED_MODEL = "models/embedding-001"
|
| 24 |
-
|
| 25 |
-
# Define the JSON schema for structured output
|
| 26 |
-
ANALYSIS_SCHEMA = {
|
| 27 |
-
"type": "OBJECT",
|
| 28 |
-
"properties": {
|
| 29 |
-
"reasoning": {
|
| 30 |
-
"type": "STRING",
|
| 31 |
-
"description": "A detailed natural language explanation of the analysis, including key findings and context."
|
| 32 |
-
},
|
| 33 |
-
"code": {
|
| 34 |
-
"type": "STRING",
|
| 35 |
-
"description": "The complete, runnable Python code using pandas (df) and streamlit (st). Use st.pyplot() for plots, and st.dataframe() for resulting DataFrames. If no code is needed, this should be an empty string."
|
| 36 |
-
}
|
| 37 |
-
}
|
| 38 |
-
}
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
"
|
| 45 |
-
" - The DataFrame is already loaded as a variable named 'df'. Do NOT redefine it. "
|
| 46 |
-
" - Use Streamlit functions for simple outputs: `st.dataframe(...)`, `st.bar_chart()`, `st.line_chart()`. "
|
| 47 |
-
" - For **ALL** custom, complex plots, you MUST follow this strict Matplotlib sequence: **Start with `plt.figure()`, use `plt.` or `sns.` commands for plotting, and explicitly end with `st.pyplot(plt)`** to display the output. "
|
| 48 |
-
" - **CRITICAL GUARDRAIL:** When generating code that uses logical conditions (e.g., in `if` statements or for complex filters) on Pandas Series or NumPy arrays, you **MUST** resolve ambiguity by using `.any()` or `.all()`. Do NOT compare a series directly to a single boolean value."
|
| 49 |
-
" - Ensure the code is self-contained and ready to execute."
|
| 50 |
)
|
| 51 |
|
| 52 |
-
#
|
| 53 |
-
|
| 54 |
-
def chat_with_gemini(prompt, context):
    """Ask Gemini for a structured analysis (reasoning + code) of the data.

    Args:
        prompt: The user's natural-language question.
        context: Text describing the DataFrame (head and columns) that is
            prepended to the question.

    Returns:
        The object decoded from the JSON text of the first candidate's first
        part, requested with ``ANALYSIS_SCHEMA`` as the response schema.

    Raises:
        requests.exceptions.RequestException: When the request fails with a
            non-retryable status, or still fails after the last retry.
        Exception: Any other failure (e.g. unexpected response shape or
            invalid JSON text) is reported in the UI and re-raised.
    """
    url = f"{GEMINI_BASE}/models/{CHAT_MODEL}:generateContent"
    # The key travels in a header rather than the query string: requests
    # includes the URL in its exception messages, and those messages are shown
    # to the user below, so a key in the URL would be leaked into the UI.
    headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": GEMINI_API_KEY,
    }

    # Construct the full prompt including the data context
    full_prompt = f"Data Context (DataFrame Head and Columns):\n{context}\n\nUser Question: {prompt}"

    payload = {
        "contents": [
            {"parts": [{"text": full_prompt}]}
        ],
        "systemInstruction": {"parts": [{"text": SYSTEM_INSTRUCTION}]},
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": ANALYSIS_SCHEMA,
        },
    }

    max_retries = 5
    delay = 1  # seconds; doubled after every failed attempt
    for attempt in range(max_retries):
        try:
            # Without a timeout a stalled connection would hang the app forever.
            r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)
            r.raise_for_status()
            data = r.json()

            json_str = data["candidates"][0]["content"]["parts"][0]["text"]
            return json.loads(json_str)

        except requests.exceptions.RequestException as e:
            # Only transient failures are worth retrying: no response at all
            # (connection error / timeout), request timeout, rate limiting, or
            # a server-side error. Other 4xx responses will not change.
            status = getattr(getattr(e, "response", None), "status_code", None)
            retryable = status is None or status in (408, 429) or status >= 500
            if retryable and attempt < max_retries - 1:
                time.sleep(delay)
                delay *= 2
            else:
                st.error(f"API Request Failed: {e}")
                raise
        except Exception as e:
            st.error(f"Failed to parse model response or execute operation: {e}")
            raise
|
| 95 |
-
|
| 96 |
-
# --- UI ---
|
| 97 |
-
st.title("✨Data Analyst Agent (Code Execution Enabled)")
|
| 98 |
-
st.write("Upload a CSV file and ask natural language questions. The agent now generates and executes Python code to provide precise data analysis and visualizations.")
|
| 99 |
-
|
| 100 |
-
# State variable to hold the DataFrame, initialized once
|
| 101 |
-
if 'df' not in st.session_state:
|
| 102 |
-
st.session_state.df = pd.DataFrame()
|
| 103 |
-
|
| 104 |
-
uploaded = st.file_uploader("Upload CSV", type=["csv"])
|
| 105 |
-
|
| 106 |
-
if uploaded:
|
| 107 |
-
# Use st.cache_data to avoid reloading the file multiple times
|
| 108 |
-
@st.cache_data
|
| 109 |
-
def load_data(file):
|
| 110 |
-
try:
|
| 111 |
-
return pd.read_csv(file)
|
| 112 |
-
except Exception as e:
|
| 113 |
-
st.error(f"Failed to load CSV: {e}")
|
| 114 |
-
return pd.DataFrame()
|
| 115 |
-
|
| 116 |
-
st.session_state.df = load_data(uploaded)
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
#
|
| 128 |
-
|
|
|
|
| 129 |
|
| 130 |
-
st.
|
| 131 |
-
st.subheader("🤖 Analysis Steps")
|
| 132 |
-
|
| 133 |
-
with st.spinner("1. Generating analysis plan and code..."):
|
| 134 |
try:
|
| 135 |
-
#
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
reasoning = analysis_result.get('reasoning', "No reasoning provided.")
|
| 139 |
-
code = analysis_result.get('code', "")
|
| 140 |
|
| 141 |
-
st.markdown("
|
| 142 |
-
st.
|
|
|
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
|
|
|
|
|
|
| 146 |
|
| 147 |
except Exception as e:
|
| 148 |
-
st.error(f"
|
| 149 |
-
|
| 150 |
-
code = ""
|
| 151 |
|
| 152 |
-
if code:
|
| 153 |
-
with st.spinner("2. Executing code and generating output..."):
|
| 154 |
-
try:
|
| 155 |
-
# 2. Execute the generated Python code safely
|
| 156 |
-
|
| 157 |
-
# IMPORTANT: Create a local scope with necessary variables
|
| 158 |
-
local_scope = {
|
| 159 |
-
'df': df,
|
| 160 |
-
'st': st,
|
| 161 |
-
'pd': pd,
|
| 162 |
-
'np': np,
|
| 163 |
-
'plt': plt,
|
| 164 |
-
'sns': sns,
|
| 165 |
-
}
|
| 166 |
-
|
| 167 |
-
# Append a neutral statement to the code to prevent implicit Streamlit display of the last value
|
| 168 |
-
final_code = code + "\nNone"
|
| 169 |
-
|
| 170 |
-
# Executing the code within the local scope
|
| 171 |
-
exec(final_code, globals(), local_scope)
|
| 172 |
-
|
| 173 |
-
# FIX: Explicitly close all Matplotlib figures to prevent cross-run contamination
|
| 174 |
-
plt.close('all')
|
| 175 |
-
|
| 176 |
-
st.success("Code execution complete. Results are displayed above.")
|
| 177 |
-
|
| 178 |
-
except Exception as e:
|
| 179 |
-
st.error(f"Step 2 Failed (Code Execution Error): The agent generated invalid code. Check the console for full traceback.")
|
| 180 |
-
st.exception(e)
|
| 181 |
-
else:
|
| 182 |
-
st.info("No code was generated, as the question was purely informational.")
|
| 183 |
else:
|
| 184 |
-
st.info("
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import seaborn as sns
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
# LangChain Imports
|
| 8 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 9 |
+
from langchain_experimental.agents import create_pandas_dataframe_agent
|
| 10 |
+
from langchain.agents.agent_types import AgentType
|
| 11 |
+
from langchain_community.callbacks.streamlit import StreamlitCallbackHandler
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
+
# --- CONFIG ---
|
| 14 |
+
st.set_page_config(
|
| 15 |
+
page_title="Agentic Data Analyst",
|
| 16 |
+
page_icon="📊",
|
| 17 |
+
layout="wide"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
)
|
| 19 |
|
| 20 |
+
# Use the API key from environment
|
| 21 |
+
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
def _load_csv(uploaded_file):
    """Parse an uploaded CSV into a DataFrame.

    Returns the DataFrame, or ``None`` after reporting the problem in the UI
    when pandas cannot parse the upload.
    """
    try:
        return pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as e:
        st.error(f"Failed to load CSV: {e}")
        return None


def _run_agent(llm, df, query):
    """Build the pandas agent, run it on *query*, and render its answer."""
    # create_pandas_dataframe_agent wraps the dataframe and the python tool
    agent = create_pandas_dataframe_agent(
        llm,
        df,
        verbose=True,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        # SECURITY: this lets the agent execute model-generated Python in this
        # process. Only deploy where the uploaded data and the users are
        # trusted, or run the app inside a sandbox.
        allow_dangerous_code=True,  # Necessary to execute Python code
        # Executor options must go through agent_executor_kwargs; passed as a
        # plain keyword they are not forwarded to the AgentExecutor.
        agent_executor_kwargs={"handle_parsing_errors": True},
    )

    # Execute with Streamlit callbacks so the "Thinking" process is visible.
    st.subheader("🧠 Agent Thought Process")

    # This container will show the step-by-step reasoning
    thought_container = st.container()
    st_callback = StreamlitCallbackHandler(thought_container)

    with st.spinner("Agent is analyzing..."):
        try:
            # invoke() triggers the agentic loop (run() is deprecated).
            result = agent.invoke({"input": query}, {"callbacks": [st_callback]})
            response = result["output"]

            st.markdown("---")
            st.subheader("✅ Final Answer")
            st.success(response)

            # Figures the agent drew with plt.show() are not displayed by
            # Streamlit, so render whatever is still open explicitly.
            for fig_num in plt.get_fignums():
                st.pyplot(plt.figure(fig_num))

        except Exception as e:
            st.error(f"The agent encountered a critical error: {e}")
            st.info("Tip: Try rephrasing your question or checking if column names are clear.")
        finally:
            # Close all figures so plots cannot leak into the next run.
            plt.close("all")


def _render_sidebar():
    """Render the "How it works" help text and the cache-clearing button."""
    with st.sidebar:
        st.header("How it works")
        st.markdown("""
        **1. Thought:** The LLM analyzes your question and the dataframe schema.

        **2. Action:** It writes and executes Python code using `pandas`.

        **3. Observation:** It looks at the output of that code.

        **4. Final Answer:** If the output satisfies the question, it responds. Otherwise, it loops back to step 1.
        """)

        if st.button("Clear Cache"):
            st.cache_data.clear()
            st.rerun()


def main():
    """Run the Streamlit app: upload a CSV, ask a question, show the agent's answer.

    Stops the script run early (via ``st.stop()``) when ``GEMINI_API_KEY`` is
    not set or the LLM client cannot be constructed.
    """
    st.title("🤖 Agentic Data Analyst (LangChain + Gemini)")
    st.markdown("""
    This agent uses a **ReAct (Reason + Act)** loop. It doesn't just guess code;
    it executes Python, checks the results, and self-corrects if it encounters errors.
    """)

    if not GEMINI_API_KEY:
        st.error("❌ GEMINI_API_KEY not found in environment variables.")
        st.stop()

    # 1. Initialize the Brain (LLM)
    # We use temperature 0 for analytical tasks to minimize "hallucinations"
    try:
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash-preview-09-2025",
            google_api_key=GEMINI_API_KEY,
            temperature=0,
        )
    except Exception as e:
        st.error(f"Failed to initialize LLM: {e}")
        st.stop()

    # 2. File Upload
    uploaded_file = st.file_uploader("Upload your dataset (CSV)", type="csv")

    if uploaded_file:
        df = _load_csv(uploaded_file)

        # A failed parse has already been reported; skip the analysis UI.
        if df is not None:
            with st.expander("📄 Data Preview & Schema"):
                col1, col2 = st.columns(2)
                with col1:
                    st.write("**First 5 Rows:**")
                    st.dataframe(df.head())
                with col2:
                    st.write("**Column Info:**")
                    st.write(df.dtypes)

            # 3. User Input
            query = st.text_area(
                "What would you like to know?",
                placeholder="e.g., 'What is the correlation between age and salary?' or 'Plot a histogram of sales.'"
            )

            run_clicked = st.button("Run Analysis")
            question = (query or "").strip()
            if run_clicked and not question:
                # Give feedback instead of silently ignoring the click.
                st.warning("Please enter a question before running the analysis.")
            elif run_clicked:
                # 4./5. Create the agent and execute it.
                _run_agent(llm, df, question)
    else:
        st.info("👆 Please upload a CSV file to begin.")

    # --- Sidebar Credits & Info ---
    _render_sidebar()
|
| 124 |
+
|
| 125 |
+
if __name__ == "__main__":
|
| 126 |
+
main()
|