sanjaystarc commited on
Commit
ffa2a6d
·
verified ·
1 Parent(s): 3a72d72

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -170
app.py CHANGED
@@ -1,187 +1,126 @@
1
  import os
2
  import streamlit as st
3
  import pandas as pd
4
- import numpy as np
5
- import requests
6
- import json
7
- import time
8
- import matplotlib.pyplot as plt
9
- import seaborn as sns
10
 
11
# --- CONFIG ---
# The Gemini API key must be provided via environment variables / secrets.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Fail fast with a visible message when the key is absent.
if not GEMINI_API_KEY:
    st.error("❌ Missing Gemini API key. Add it as a secret: GEMINI_API_KEY")
    st.stop()

# REST endpoint base and model identifiers.
GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta"
# Chat model used for structured (JSON) output.
CHAT_MODEL = "gemini-2.5-flash-preview-09-2025"
EMBED_MODEL = "models/embedding-001"

# Response schema the model is forced to follow: a Markdown explanation
# plus an optional block of runnable Python code (empty string when none).
ANALYSIS_SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "reasoning": {
            "type": "STRING",
            "description": "A detailed natural language explanation of the analysis, including key findings and context."
        },
        "code": {
            "type": "STRING",
            "description": "The complete, runnable Python code using pandas (df) and streamlit (st). Use st.pyplot() for plots, and st.dataframe() for resulting DataFrames. If no code is needed, this should be an empty string."
        }
    }
}

# System prompt defining the agent persona, output contract, and coding
# guardrails (Matplotlib sequence, pandas truthiness ambiguity, etc.).
SYSTEM_INSTRUCTION = (
    "You are a world-class Data Analyst Agent. Your task is to analyze the provided DataFrame ('df') "
    "based on the user's question. You MUST respond with a single JSON object conforming to the provided schema. "
    "1. **Reasoning:** Explain your plan, the steps taken, and the insights derived from the data. Format this in Markdown. "
    "2. **Code:** If the question requires calculation, aggregation, or visualization, you MUST generate Python code to execute against the 'df' DataFrame. "
    " - The DataFrame is already loaded as a variable named 'df'. Do NOT redefine it. "
    " - Use Streamlit functions for simple outputs: `st.dataframe(...)`, `st.bar_chart()`, `st.line_chart()`. "
    " - For **ALL** custom, complex plots, you MUST follow this strict Matplotlib sequence: **Start with `plt.figure()`, use `plt.` or `sns.` commands for plotting, and explicitly end with `st.pyplot(plt)`** to display the output. "
    " - **CRITICAL GUARDRAIL:** When generating code that uses logical conditions (e.g., in `if` statements or for complex filters) on Pandas Series or NumPy arrays, you **MUST** resolve ambiguity by using `.any()` or `.all()`. Do NOT compare a series directly to a single boolean value."
    " - Ensure the code is self-contained and ready to execute."
)
51
 
52
# --- Helper Functions ---

def chat_with_gemini(prompt, context):
    """Send a prompt and data context to Gemini and return the parsed
    structured analysis.

    Args:
        prompt: The user's natural-language question.
        context: Textual summary of the DataFrame (columns + first rows).

    Returns:
        dict conforming to ANALYSIS_SCHEMA ('reasoning' and 'code' keys).

    Raises:
        requests.exceptions.RequestException: if the API still fails after
            all retries.
        Exception: if the model response body cannot be parsed.
    """
    # The REST path requires the 'models/' prefix before the model name.
    url = f"{GEMINI_BASE}/models/{CHAT_MODEL}:generateContent?key={GEMINI_API_KEY}"

    # Combine the dataset summary with the question so the model sees both.
    full_prompt = f"Data Context (DataFrame Head and Columns):\n{context}\n\nUser Question: {prompt}"

    payload = {
        "contents": [
            {"parts": [{"text": full_prompt}]}
        ],
        "systemInstruction": {"parts": [{"text": SYSTEM_INSTRUCTION}]},
        # Force structured JSON output matching ANALYSIS_SCHEMA.
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": ANALYSIS_SCHEMA
        }
    }

    # Retry transient failures with exponential backoff.
    max_retries = 5
    delay = 1
    for attempt in range(max_retries):
        try:
            # FIX: add a timeout so a hung connection cannot block the app
            # forever (a Timeout is a RequestException, so it is retried).
            # json= serializes the payload and sets Content-Type for us.
            r = requests.post(url, json=payload, timeout=60)
            r.raise_for_status()
            data = r.json()

            json_str = data["candidates"][0]["content"]["parts"][0]["text"]
            return json.loads(json_str)

        except requests.exceptions.RequestException as e:
            # Network/HTTP errors: back off and retry, doubling the delay.
            if attempt < max_retries - 1:
                time.sleep(delay)
                delay *= 2
            else:
                st.error(f"API Request Failed: {e}")
                raise e
        except Exception as e:
            # Malformed/unexpected response body: surface immediately.
            st.error(f"Failed to parse model response or execute operation: {e}")
            raise e
96
# --- UI ---
st.title("✨Data Analyst Agent (Code Execution Enabled)")
st.write("Upload a CSV file and ask natural language questions. The agent now generates and executes Python code to provide precise data analysis and visualizations.")

# Session-state slot for the active DataFrame, initialized once per session.
if 'df' not in st.session_state:
    st.session_state.df = pd.DataFrame()

uploaded = st.file_uploader("Upload CSV", type=["csv"])

if uploaded:
    # Cache parsing so Streamlit reruns don't re-read the same file.
    @st.cache_data
    def load_data(file):
        """Parse the uploaded CSV; return an empty frame on failure."""
        try:
            return pd.read_csv(file)
        except Exception as e:
            st.error(f"Failed to load CSV: {e}")
            return pd.DataFrame()

    st.session_state.df = load_data(uploaded)

    if not st.session_state.df.empty:
        st.subheader("Data Preview (First 5 Rows)")
        st.dataframe(st.session_state.df.head())

        question = st.text_area("Ask a complex question or request a visualization (e.g., 'Show the average of the 'Sales' column', 'Plot the distribution of 'Age'):")

        if st.button("Analyze & Execute") and question:
            df = st.session_state.df  # Local variable for code execution context

            # Summarize dataset (columns + head) for the LLM context window.
            context = f"Dataset Columns: {', '.join(df.columns.astype(str))}\n\nFirst 5 rows of data:\n{df.head(5).to_string(index=False)}"

            st.markdown("---")
            st.subheader("🤖 Analysis Steps")

            with st.spinner("1. Generating analysis plan and code..."):
                try:
                    # 1. Get structured response (reasoning + code) from LLM.
                    analysis_result = chat_with_gemini(question, context)

                    reasoning = analysis_result.get('reasoning', "No reasoning provided.")
                    code = analysis_result.get('code', "")

                    st.markdown("#### 💬 Reasoning:")
                    st.markdown(reasoning)

                    st.markdown("#### 🐍 Generated Code:")
                    st.code(code, language='python')

                except Exception as e:
                    st.error(f"Step 1 Failed (LLM Interaction): {e}")
                    reasoning = ""
                    code = ""

            if code:
                with st.spinner("2. Executing code and generating output..."):
                    try:
                        # 2. Execute the generated Python code.
                        # SECURITY NOTE: exec() runs model-generated code with
                        # full interpreter privileges; acceptable only in a
                        # trusted demo environment.
                        # FIX: use ONE dict as both globals and locals. The
                        # previous exec(code, globals(), local_scope) form
                        # meant functions defined by the generated code could
                        # not see names it assigned (classic exec pitfall),
                        # and it exposed module globals such as
                        # GEMINI_API_KEY to the generated code.
                        exec_env = {
                            'df': df,
                            'st': st,
                            'pd': pd,
                            'np': np,
                            'plt': plt,
                            'sns': sns,
                        }

                        # Append a neutral statement to the code to prevent implicit Streamlit display of the last value
                        final_code = code + "\nNone"

                        exec(final_code, exec_env)

                        # Explicitly close all Matplotlib figures to prevent cross-run contamination
                        plt.close('all')

                        st.success("Code execution complete. Results are displayed above.")

                    except Exception as e:
                        st.error(f"Step 2 Failed (Code Execution Error): The agent generated invalid code. Check the console for full traceback.")
                        st.exception(e)
            else:
                st.info("No code was generated, as the question was purely informational.")
    else:
        st.info("The uploaded CSV file appears to be empty.")

else:
    st.info("👆 Upload a CSV file to begin the full analysis experience.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import streamlit as st
3
  import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
 
 
 
 
6
 
7
+ # LangChain Imports
8
+ from langchain_google_genai import ChatGoogleGenerativeAI
9
+ from langchain_experimental.agents import create_pandas_dataframe_agent
10
+ from langchain.agents.agent_types import AgentType
11
+ from langchain_community.callbacks.streamlit import StreamlitCallbackHandler
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
# --- CONFIG ---
# Page chrome must be configured before any other Streamlit call runs.
st.set_page_config(
    page_title="Agentic Data Analyst",
    page_icon="📊",
    layout="wide",
)

# Use the API key from environment
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
def main():
    """Streamlit entry point: upload a CSV, then let a LangChain ReAct agent
    analyze it by writing, executing, and self-correcting pandas code."""
    st.title("🤖 Agentic Data Analyst (LangChain + Gemini)")
    st.markdown("""
    This agent uses a **ReAct (Reason + Act)** loop. It doesn't just guess code;
    it executes Python, checks the results, and self-corrects if it encounters errors.
    """)

    # Fail fast when the key is missing rather than erroring mid-analysis.
    if not GEMINI_API_KEY:
        st.error("❌ GEMINI_API_KEY not found in environment variables.")
        st.stop()

    # 1. Initialize the Brain (LLM).
    # Temperature 0 keeps analytical answers deterministic and minimizes
    # "hallucinations".
    try:
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash-preview-09-2025",
            google_api_key=GEMINI_API_KEY,
            temperature=0,
        )
    except Exception as e:
        st.error(f"Failed to initialize LLM: {e}")
        st.stop()

    # 2. File Upload
    uploaded_file = st.file_uploader("Upload your dataset (CSV)", type="csv")

    if uploaded_file:
        # FIX: a malformed CSV previously crashed the whole script with a raw
        # traceback; surface a readable error and stop instead.
        try:
            df = pd.read_csv(uploaded_file)
        except Exception as e:
            st.error(f"Failed to read CSV: {e}")
            st.stop()

        with st.expander("📄 Data Preview & Schema"):
            col1, col2 = st.columns(2)
            with col1:
                st.write("**First 5 Rows:**")
                st.dataframe(df.head())
            with col2:
                st.write("**Column Info:**")
                st.write(df.dtypes)

        # 3. User Input
        query = st.text_area(
            "What would you like to know?",
            placeholder="e.g., 'What is the correlation between age and salary?' or 'Plot a histogram of sales.'"
        )

        if st.button("Run Analysis") and query:
            # 4. Create the Agent.
            # create_pandas_dataframe_agent wraps the dataframe and the
            # python execution tool.
            agent = create_pandas_dataframe_agent(
                llm,
                df,
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                allow_dangerous_code=True,  # Necessary to execute Python code
                handle_parsing_errors=True
            )

            # 5. Execute with Streamlit Callbacks so the user can watch the
            # step-by-step "Thinking" process in the UI.
            st.subheader("🧠 Agent Thought Process")

            # This container shows the step-by-step reasoning trace.
            thought_container = st.container()
            st_callback = StreamlitCallbackHandler(thought_container)

            with st.spinner("Agent is analyzing..."):
                try:
                    # The .run() method triggers the agentic loop.
                    # NOTE(review): .run() is deprecated in newer LangChain
                    # releases in favor of .invoke() — confirm against the
                    # pinned langchain version before migrating.
                    response = agent.run(query, callbacks=[st_callback])

                    st.markdown("---")
                    st.subheader("✅ Final Answer")
                    st.success(response)

                    # Note on Plots:
                    # If the agent uses plt.show(), it might not render in Streamlit.
                    # Standard practice for agents is to ask them to use st.pyplot(plt.gcf())
                    # but the agent often figures out how to display data.

                except Exception as e:
                    st.error(f"The agent encountered a critical error: {e}")
                    st.info("Tip: Try rephrasing your question or checking if column names are clear.")

    else:
        st.info("👆 Please upload a CSV file to begin.")

    # --- Sidebar Credits & Info ---
    with st.sidebar:
        st.header("How it works")
        st.markdown("""
        **1. Thought:** The LLM analyzes your question and the dataframe schema.

        **2. Action:** It writes and executes Python code using `pandas`.

        **3. Observation:** It looks at the output of that code.

        **4. Final Answer:** If the output satisfies the question, it responds. Otherwise, it loops back to step 1.
        """)

        if st.button("Clear Cache"):
            st.cache_data.clear()
            st.rerun()

if __name__ == "__main__":
    main()