sanjaystarc commited on
Commit
e1c1a1e
·
verified ·
1 Parent(s): 361e559

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +515 -0
  2. core_agent.py +408 -0
  3. requirements.txt +14 -0
  4. sample_data.csv +31 -0
app.py ADDED
@@ -0,0 +1,515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py
3
+ ======
4
+ Streamlit UI — Data Analyst Agent (LangChain + Gemini)
5
+ Run: streamlit run app.py
6
+ """
7
+
8
+ import os
9
+ import io
10
+ import json
11
+ import streamlit as st
12
+ import pandas as pd
13
+ import plotly.express as px
14
+
15
+ from core_agent import (
16
+ get_llm, load_file, profile_dataframe, profile_to_text,
17
+ set_dataframe, build_agent, run_agent,
18
+ auto_suggest_charts, make_plotly_chart, recommend_chart
19
+ )
20
+
21
# ─── Page Config ──────────────────────────────────────────────────────────────
st.set_page_config(
    page_title="DataMind Agent",
    page_icon="🧠",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ─── Custom CSS ───────────────────────────────────────────────────────────────
# The whole theme lives in one injected <style> block; kept in a named
# constant so the markdown call below stays readable.
_APP_CSS = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=DM+Sans:wght@300;400;500&display=swap');

html, body, [class*="css"] {
    font-family: 'DM Sans', sans-serif;
    background-color: #0a0a12;
    color: #e8e8ff;
}

.main { background-color: #0a0a12 !important; }

/* Header */
.hero-title {
    font-family: 'Syne', sans-serif;
    font-size: 2.8rem;
    font-weight: 800;
    background: linear-gradient(135deg, #e8e8ff 0%, #6C63FF 50%, #43E97B 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    margin-bottom: 0.2rem;
}
.hero-sub {
    color: #6a6a9a;
    font-size: 1rem;
    margin-bottom: 2rem;
}

/* Cards */
.stat-card {
    background: #1a1a2e;
    border: 1px solid #2a2a45;
    border-radius: 16px;
    padding: 1.2rem 1.5rem;
    text-align: center;
}
.stat-num {
    font-family: 'Syne', sans-serif;
    font-size: 2rem;
    font-weight: 800;
    color: #6C63FF;
}
.stat-label { color: #6a6a9a; font-size: 0.8rem; text-transform: uppercase; letter-spacing: 0.1em; }

/* Chat bubbles */
.user-bubble {
    background: rgba(108,99,255,0.15);
    border: 1px solid rgba(108,99,255,0.3);
    border-radius: 18px 18px 4px 18px;
    padding: 0.9rem 1.2rem;
    margin: 0.5rem 0;
    font-size: 0.95rem;
}
.agent-bubble {
    background: #1a1a2e;
    border: 1px solid #2a2a45;
    border-radius: 18px 18px 18px 4px;
    padding: 0.9rem 1.2rem;
    margin: 0.5rem 0;
    font-size: 0.95rem;
    line-height: 1.6;
}

/* Sidebar */
section[data-testid="stSidebar"] {
    background: #10101e !important;
    border-right: 1px solid #2a2a45;
}

/* Buttons */
.stButton > button {
    background: linear-gradient(135deg, #6C63FF, #43E97B);
    color: white;
    border: none;
    border-radius: 12px;
    font-family: 'Syne', sans-serif;
    font-weight: 700;
    padding: 0.6rem 1.5rem;
    transition: opacity 0.2s, transform 0.2s;
}
.stButton > button:hover { opacity: 0.85; color: white; transform: translateY(-1px); }

.stTextInput > div > div > input {
    background: #1a1a2e;
    border: 1px solid #2a2a45;
    border-radius: 12px;
    color: #e8e8ff;
}
.stSelectbox > div > div {
    background: #1a1a2e;
    border: 1px solid #2a2a45;
    border-radius: 12px;
}

/* Tabs */
.stTabs [data-baseweb="tab-list"] {
    background: #10101e;
    border-radius: 12px;
    gap: 0.3rem;
    padding: 0.3rem;
}
.stTabs [data-baseweb="tab"] {
    background: transparent;
    color: #6a6a9a;
    border-radius: 10px;
    font-family: 'Syne', sans-serif;
}
.stTabs [aria-selected="true"] {
    background: rgba(108,99,255,0.2) !important;
    color: #6C63FF !important;
}

/* Dataframe */
.stDataFrame { border-radius: 12px; overflow: hidden; }

/* Info / success boxes */
.stAlert { border-radius: 12px; }
</style>"""

st.markdown(_APP_CSS, unsafe_allow_html=True)
149
+
150
# ─── Session State ────────────────────────────────────────────────────────────
# One-time initialisation of every session key the app reads later.
_SESSION_DEFAULTS = {
    "df": None,                # loaded DataFrame
    "profile": None,           # dict built by profile_dataframe()
    "file_type": None,         # "CSV" / "Excel" / "JSON"
    "chat_history": [],        # list of {"user", "agent", "steps"} turns
    "llm": None,               # Gemini chat model instance
    "agent_executor": None,    # LangChain AgentExecutor
    "api_key_set": False,      # True once a key connected successfully
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
162
+
163
+
164
# ─── Sidebar ──────────────────────────────────────────────────────────────────
# API-key entry, file upload, and usage instructions.
with st.sidebar:
    st.markdown("### 🧠 DataMind Agent")
    st.markdown("---")

    # API Key — only (re)build the LLM + agent when the key actually changes,
    # so an ordinary rerun does not reconnect.
    st.markdown("**🔑 Gemini API Key**")
    api_key = st.text_input(
        "Enter your key", type="password",
        placeholder="AIza...",
        help="Get free key at aistudio.google.com",
        label_visibility="collapsed"
    )
    if api_key:
        if not st.session_state.api_key_set or st.session_state.get("_last_key") != api_key:
            try:
                st.session_state.llm = get_llm(api_key)
                st.session_state.agent_executor = build_agent(st.session_state.llm)
                st.session_state.api_key_set = True
                st.session_state["_last_key"] = api_key
                st.success("✅ Connected to Gemini!")
            except Exception as e:
                st.error(f"❌ Invalid key: {e}")

    st.markdown("---")

    # File Upload
    st.markdown("**📁 Upload Data File**")
    uploaded = st.file_uploader(
        "Upload", type=["csv", "xlsx", "xls", "json"],
        label_visibility="collapsed"
    )

    if uploaded and st.session_state.api_key_set:
        # BUGFIX: this branch used to execute on *every* Streamlit rerun while
        # a file was attached, re-parsing the upload and resetting
        # chat_history — which wiped the conversation after each interaction.
        # Guard on a (name, size) signature so we only load a new/changed file.
        file_sig = (uploaded.name, uploaded.size)
        if st.session_state.get("_last_file") != file_sig:
            with st.spinner("📊 Analyzing your data..."):
                try:
                    df, ftype = load_file(uploaded)
                    profile = profile_dataframe(df)
                    st.session_state.df = df
                    st.session_state.file_type = ftype
                    st.session_state.profile = profile
                    st.session_state.chat_history = []
                    set_dataframe(df, profile)
                    st.session_state["_last_file"] = file_sig
                    st.success(f"✅ Loaded {ftype} file!")
                except Exception as e:
                    st.error(f"❌ Error: {e}")

    elif uploaded and not st.session_state.api_key_set:
        st.warning("⚠️ Enter your Gemini API key first")

    st.markdown("---")
    st.markdown("""
**How to use:**
1. Paste your Gemini API key above
2. Upload CSV, Excel, or JSON file
3. Explore the Dashboard tab
4. Ask questions in Chat tab
5. Generate visuals in Charts tab

---
**Get free Gemini API key:**
[aistudio.google.com](https://aistudio.google.com/app/apikey)
""")
227
+
228
+
229
# ─── Main Content ─────────────────────────────────────────────────────────────
st.markdown('<div class="hero-title">🧠 DataMind Agent</div>', unsafe_allow_html=True)
st.markdown('<div class="hero-sub">AI-powered data analysis using LangChain + Gemini · Upload any data file and start exploring</div>', unsafe_allow_html=True)

if st.session_state.df is None:
    # Landing state: three feature cards plus a pointer to the sidebar.
    _features = [
        ("📂", "CSV, Excel, JSON",
         "Upload any tabular data file — we handle the parsing automatically"),
        ("💬", "Natural Language Q&A",
         "Ask anything about your data in plain English — no SQL needed"),
        ("📊", "Smart Visualizations",
         "AI picks the right chart for your question automatically"),
    ]
    for column, (icon, label, blurb) in zip(st.columns(3), _features):
        with column:
            st.markdown(f"""
<div class="stat-card">
<div class="stat-num">{icon}</div>
<div class="stat-label">{label}</div>
<br><p style="color:#6a6a9a; font-size:0.85rem">{blurb}</p>
</div>""", unsafe_allow_html=True)

    st.markdown("<br>", unsafe_allow_html=True)
    st.info("👈 Enter your Gemini API key and upload a data file in the sidebar to get started!")
260
+
261
+ else:
262
+ df = st.session_state.df
263
+ profile = st.session_state.profile
264
+ llm = st.session_state.llm
265
+
266
+ # ── Tabs ─────────────────────────────────────────────────────────────────
267
+ tab1, tab2, tab3, tab4 = st.tabs(["📊 Dashboard", "💬 Chat", "🎨 Charts", "🔍 Raw Data"])
268
+
269
+ # ════════════════════════════════════════════════════════════════
270
+ # TAB 1 — Dashboard
271
+ # ════════════════════════════════════════════════════════════════
272
+ with tab1:
273
+ rows, cols = profile["shape"]
274
+ nulls = sum(profile["null_counts"].values())
275
+ num_c = len(profile["numeric_columns"])
276
+ cat_c = len(profile["categorical_columns"])
277
+
278
+ c1, c2, c3, c4 = st.columns(4)
279
+ c1.markdown(f'<div class="stat-card"><div class="stat-num">{rows:,}</div><div class="stat-label">Rows</div></div>', unsafe_allow_html=True)
280
+ c2.markdown(f'<div class="stat-card"><div class="stat-num">{cols}</div><div class="stat-label">Columns</div></div>', unsafe_allow_html=True)
281
+ c3.markdown(f'<div class="stat-card"><div class="stat-num">{num_c}</div><div class="stat-label">Numeric Cols</div></div>', unsafe_allow_html=True)
282
+ c4.markdown(f'<div class="stat-card"><div class="stat-num">{nulls}</div><div class="stat-label">Missing Values</div></div>', unsafe_allow_html=True)
283
+
284
+ st.markdown("<br>", unsafe_allow_html=True)
285
+
286
+ # Column overview
287
+ st.markdown("#### 📋 Column Overview")
288
+ col_info = pd.DataFrame({
289
+ "Column": df.columns,
290
+ "Type": df.dtypes.astype(str).values,
291
+ "Non-Null": df.notnull().sum().values,
292
+ "Null %": (df.isnull().mean() * 100).round(1).values,
293
+ "Unique": df.nunique().values,
294
+ })
295
+ st.dataframe(col_info, use_container_width=True, hide_index=True)
296
+
297
+ # Auto charts
298
+ st.markdown("#### 🤖 Auto-Generated Insights")
299
+ suggested = auto_suggest_charts(profile)[:3]
300
+
301
+ chart_cols = st.columns(min(len(suggested), 2))
302
+ for i, ctype in enumerate(suggested[:2]):
303
+ with chart_cols[i]:
304
+ try:
305
+ fig = make_plotly_chart(ctype, df, profile)
306
+ st.plotly_chart(fig, use_container_width=True)
307
+ except Exception as e:
308
+ st.warning(f"Could not render {ctype}: {e}")
309
+
310
+ if len(suggested) > 2:
311
+ try:
312
+ fig = make_plotly_chart(suggested[2], df, profile)
313
+ st.plotly_chart(fig, use_container_width=True)
314
+ except Exception:
315
+ pass
316
+
317
+ # AI summary
318
+ st.markdown("#### 🧠 AI Dataset Summary")
319
+ if st.button("✨ Generate AI Summary"):
320
+ with st.spinner("🤖 Agent is generating full report..."):
321
+ set_dataframe(df, profile)
322
+ result = run_agent(
323
+ "Give me a full insight report on this dataset with key patterns, anomalies, and actionable recommendations.",
324
+ st.session_state.agent_executor, []
325
+ )
326
+ st.markdown(f'<div class="agent-bubble">{result["output"]}</div>', unsafe_allow_html=True)
327
+ if result["steps"]:
328
+ with st.expander(f"🔍 Agent used {len(result['steps'])} tool(s)"):
329
+ for i, (action, res) in enumerate(result["steps"]):
330
+ st.markdown(f"**Step {i+1}: `{action.tool}`**")
331
+ st.code(str(res)[:300] + "...", language="text")
332
+
333
+
334
+ # ════════════════════════════════════════════════════════════════
335
+ # TAB 2 — Chat
336
+ # ════════════════════════════════════════════════════════════════
337
+ with tab2:
338
+ st.markdown("#### 💬 Ask Anything About Your Data")
339
+ st.markdown("*The autonomous agent plans, uses tools, and reasons step-by-step to answer your question.*")
340
+
341
+ # Suggested questions
342
+ st.markdown("**Quick questions to try:**")
343
+ suggestions = [
344
+ "Give me a full insight report on this data",
345
+ "Are there any outliers or anomalies?",
346
+ "What correlations exist between numeric columns?",
347
+ ]
348
+ q_cols = st.columns(3)
349
+ for i, s in enumerate(suggestions):
350
+ with q_cols[i]:
351
+ if st.button(s, key=f"sug_{i}"):
352
+ st.session_state["prefill_q"] = s
353
+
354
+ # Chat history
355
+ for turn in st.session_state.chat_history:
356
+ st.markdown(f'<div class="user-bubble">👤 {turn["user"]}</div>', unsafe_allow_html=True)
357
+ # Show agent reasoning steps
358
+ if turn.get("steps"):
359
+ with st.expander(f"🔍 Agent used {len(turn['steps'])} tool(s) — click to see reasoning"):
360
+ for i, (action, result) in enumerate(turn["steps"]):
361
+ st.markdown(f"**Step {i+1}: `{action.tool}`**")
362
+ st.caption(f"Input: {action.tool_input}")
363
+ st.code(str(result)[:500] + ("..." if len(str(result)) > 500 else ""), language="text")
364
+ st.markdown(f'<div class="agent-bubble">🧠 {turn["agent"]}</div>', unsafe_allow_html=True)
365
+
366
+ # Input
367
+ prefill = st.session_state.pop("prefill_q", "")
368
+ question = st.text_input(
369
+ "Ask a question...",
370
+ value=prefill,
371
+ placeholder="e.g. Which category has the highest profit? Find outliers in sales.",
372
+ label_visibility="collapsed",
373
+ )
374
+
375
+ col_send, col_clear = st.columns([1, 5])
376
+ with col_send:
377
+ send = st.button("Send 🚀")
378
+ with col_clear:
379
+ if st.button("Clear Chat"):
380
+ st.session_state.chat_history = []
381
+ st.rerun()
382
+
383
+ if send and question.strip():
384
+ # Build LangChain chat history from session
385
+ from langchain_core.messages import HumanMessage as HM, AIMessage
386
+ lc_history = []
387
+ for turn in st.session_state.chat_history:
388
+ lc_history.append(HM(content=turn["user"]))
389
+ lc_history.append(AIMessage(content=turn["agent"]))
390
+
391
+ with st.spinner("🤖 Agent is planning and executing tools..."):
392
+ set_dataframe(df, profile)
393
+ result = run_agent(question, st.session_state.agent_executor, lc_history)
394
+ answer = result["output"]
395
+ steps = result["steps"]
396
+
397
+ # Get chart recommendation
398
+ try:
399
+ chart_json = json.loads(recommend_chart.invoke(question))
400
+ except Exception:
401
+ chart_json = None
402
+
403
+ st.session_state.chat_history.append({
404
+ "user": question,
405
+ "agent": answer,
406
+ "steps": steps,
407
+ })
408
+
409
+ st.markdown(f'<div class="user-bubble">👤 {question}</div>', unsafe_allow_html=True)
410
+
411
+ # Show reasoning steps
412
+ if steps:
413
+ with st.expander(f"🔍 Agent used {len(steps)} tool(s) — click to see reasoning"):
414
+ for i, (action, res) in enumerate(steps):
415
+ st.markdown(f"**Step {i+1}: `{action.tool}`**")
416
+ st.caption(f"Input: {action.tool_input}")
417
+ st.code(str(res)[:500] + ("..." if len(str(res)) > 500 else ""), language="text")
418
+
419
+ st.markdown(f'<div class="agent-bubble">🧠 {answer}</div>', unsafe_allow_html=True)
420
+
421
+ # Auto chart
422
+ if chart_json:
423
+ try:
424
+ fig = make_plotly_chart(
425
+ chart_json["chart_type"], df, profile,
426
+ x_col=chart_json.get("x_col"),
427
+ y_col=chart_json.get("y_col"),
428
+ )
429
+ st.plotly_chart(fig, use_container_width=True)
430
+ except Exception:
431
+ pass
432
+
433
+
434
+ # ════════════════════════════════════════════════════════════════
435
+ # TAB 3 — Charts
436
+ # ════════════════════════════════════════════════════════════════
437
+ with tab3:
438
+ st.markdown("#### 🎨 Custom Chart Builder")
439
+
440
+ chart_options = {
441
+ "Correlation Heatmap": "correlation_heatmap",
442
+ "Distribution Plot": "distribution_plots",
443
+ "Box Plots": "box_plots",
444
+ "Bar Chart": "bar_chart",
445
+ "Pie Chart": "pie_chart",
446
+ "Scatter Plot": "scatter",
447
+ "Line Chart": "line",
448
+ "Scatter Matrix": "scatter_matrix",
449
+ }
450
+ if profile["datetime_columns"]:
451
+ chart_options["Time Series"] = "time_series"
452
+
453
+ c1, c2, c3 = st.columns(3)
454
+ with c1:
455
+ chart_label = st.selectbox("Chart Type", list(chart_options.keys()))
456
+ with c2:
457
+ all_cols = ["(auto)"] + df.columns.tolist()
458
+ x_col = st.selectbox("X Column", all_cols)
459
+ with c3:
460
+ y_col = st.selectbox("Y Column", all_cols)
461
+
462
+ x_val = None if x_col == "(auto)" else x_col
463
+ y_val = None if y_col == "(auto)" else y_col
464
+
465
+ if st.button("🎨 Generate Chart"):
466
+ with st.spinner("Rendering..."):
467
+ try:
468
+ fig = make_plotly_chart(
469
+ chart_options[chart_label], df, profile,
470
+ x_col=x_val, y_col=y_val
471
+ )
472
+ st.plotly_chart(fig, use_container_width=True)
473
+ except Exception as e:
474
+ st.error(f"Chart error: {e}")
475
+
476
+ st.markdown("---")
477
+ st.markdown("#### 📊 All Auto-Suggested Charts")
478
+ suggested_all = auto_suggest_charts(profile)
479
+ for i in range(0, len(suggested_all), 2):
480
+ cols = st.columns(2)
481
+ for j, ctype in enumerate(suggested_all[i:i+2]):
482
+ with cols[j]:
483
+ try:
484
+ fig = make_plotly_chart(ctype, df, profile)
485
+ st.plotly_chart(fig, use_container_width=True)
486
+ except Exception as e:
487
+ st.warning(f"Could not render {ctype}")
488
+
489
+
490
+ # ════════════════════════════════════════════════════════════════
491
+ # TAB 4 — Raw Data
492
+ # ════════════════════════════════════════════════════════════════
493
+ with tab4:
494
+ st.markdown("#### 🔍 Raw Data Explorer")
495
+
496
+ # Search/filter
497
+ search = st.text_input("🔎 Filter rows containing...", placeholder="Type to filter...")
498
+ if search:
499
+ mask = df.astype(str).apply(lambda row: row.str.contains(search, case=False, na=False)).any(axis=1)
500
+ display_df = df[mask]
501
+ st.info(f"Showing {len(display_df):,} of {len(df):,} rows matching '{search}'")
502
+ else:
503
+ display_df = df
504
+
505
+ st.dataframe(display_df, use_container_width=True, height=500)
506
+
507
+ # Download
508
+ csv_buf = io.StringIO()
509
+ df.to_csv(csv_buf, index=False)
510
+ st.download_button(
511
+ "⬇️ Download as CSV",
512
+ data=csv_buf.getvalue(),
513
+ file_name="analyzed_data.csv",
514
+ mime="text/csv"
515
+ )
core_agent.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ core_agent.py — TRUE Agentic AI
3
+ LangChain Agent + Tools + Memory + Gemini
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import warnings
9
+ import pandas as pd
10
+ import plotly.express as px
11
+ import plotly.graph_objects as go
12
+ from dotenv import load_dotenv
13
+
14
+ from langchain_google_genai import ChatGoogleGenerativeAI
15
+ from langchain_core.messages import HumanMessage, SystemMessage
16
+ from langchain_core.tools import tool
17
+ from langchain.agents import AgentExecutor, create_tool_calling_agent
18
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
19
+ from langchain_community.chat_message_histories import ChatMessageHistory
20
+
21
+ warnings.filterwarnings("ignore")
22
+ load_dotenv()
23
+
24
# Chart palette / theme constants shared by the plotting helpers.
PALETTE = ["#6C63FF", "#FF6584", "#43E97B", "#F7971E", "#4FC3F7", "#CE93D8"]
DARK_BG = "#0F0F1A"
CARD_BG = "#1A1A2E"

# Module-level "current dataset" read by the agent tools below.  The app
# calls set_dataframe() before each agent run so tools see the live upload.
# BUGFIX: annotations were `pd.DataFrame` / `dict` although the values start
# as None — corrected to optional (string form, so nothing is evaluated).
_df: "pd.DataFrame | None" = None
_profile: "dict | None" = None

def set_dataframe(df, profile):
    """Register *df* and its precomputed *profile* as the active dataset.

    Args:
        df: the loaded DataFrame (or None to clear).
        profile: the dict produced by profile_dataframe(df).
    """
    global _df, _profile
    _df = df
    _profile = profile
35
+
36
def get_llm(api_key: str):
    """Build the Gemini chat model used by the agent.

    temperature=0.3 keeps analyses mostly deterministic; per the flag name,
    convert_system_message_to_human folds the system message into a human
    turn for Gemini API versions without a native system role.
    """
    model_config = dict(
        model="gemini-1.5-flash",
        google_api_key=api_key,
        temperature=0.3,
        convert_system_message_to_human=True,
    )
    return ChatGoogleGenerativeAI(**model_config)
43
+
44
def load_file(file):
    """Parse an uploaded file-like object into (DataFrame, type-label).

    Dispatches on the lowercased filename suffix: .csv, .xlsx/.xls, .json.

    Raises:
        ValueError: for any other extension.
    """
    fname = file.name.lower()
    if fname.endswith(".csv"):
        return pd.read_csv(file), "CSV"
    if fname.endswith((".xlsx", ".xls")):
        return pd.read_excel(file), "Excel"
    if fname.endswith(".json"):
        content = json.load(file)
        if isinstance(content, list):
            # List of records -> one row per element.
            return pd.DataFrame(content), "JSON"
        # Dict: column-oriented when any value is a list, else one record.
        if any(isinstance(v, list) for v in content.values()):
            return pd.DataFrame(content), "JSON"
        return pd.DataFrame([content]), "JSON"
    raise ValueError(f"Unsupported file type: {fname}")
59
+
60
def profile_dataframe(df):
    """Compute a lightweight statistical profile of *df* as a plain dict.

    Always includes shape/dtype/null/duplicate info; "numeric_stats" and
    "top_categories" are present only when such columns exist.
    """
    numeric = df.select_dtypes(include="number").columns.tolist()
    categorical = df.select_dtypes(include=["object", "category"]).columns.tolist()
    datetimes = df.select_dtypes(include=["datetime"]).columns.tolist()

    profile = {
        "shape": df.shape,
        "columns": df.columns.tolist(),
        "dtypes": df.dtypes.astype(str).to_dict(),
        "numeric_columns": numeric,
        "categorical_columns": categorical,
        "datetime_columns": datetimes,
        "null_counts": df.isnull().sum().to_dict(),
        "null_pct": (df.isnull().mean() * 100).round(2).to_dict(),
        "duplicates": int(df.duplicated().sum()),
    }
    if numeric:
        profile["numeric_stats"] = df[numeric].describe().round(3).to_dict()
    if categorical:
        profile["top_categories"] = {
            c: df[c].value_counts().head(5).to_dict() for c in categorical
        }
    return profile
80
+
81
def profile_to_text(profile, df):
    """Render *profile* (plus a 5-row sample of *df*) as plain text for the LLM."""
    n_rows, n_cols = profile["shape"]

    def _names(cols):
        # Comma-joined list, or the literal word "None" when empty.
        return ", ".join(cols) or "None"

    lines = [
        f"Dataset: {n_rows} rows x {n_cols} columns",
        f"Numeric columns : {_names(profile['numeric_columns'])}",
        f"Categorical cols : {_names(profile['categorical_columns'])}",
        f"Datetime cols : {_names(profile['datetime_columns'])}",
        f"Missing values : {sum(profile['null_counts'].values())} total",
        f"Duplicate rows : {profile['duplicates']}",
        "",
        "--- Sample Data (first 5 rows) ---",
        df.head(5).to_string(index=False),
    ]
    stats = profile.get("numeric_stats")
    if stats:
        lines += ["", "--- Numeric Stats ---"]
        for col, s in stats.items():
            lines.append(
                f" {col}: mean={s.get('mean','?')}, std={s.get('std','?')}, "
                f"min={s.get('min','?')}, max={s.get('max','?')}"
            )
    return "\n".join(lines)
98
+
99
+ # ══════════════════════════════════════════════
100
+ # AGENT TOOLS
101
+ # ══════════════════════════════════════════════
102
+
103
@tool
def profile_data(query: str) -> str:
    """Get full statistical profile of the dataset. Use this FIRST before any analysis."""
    # `query` is the free-text argument every tool receives; this one ignores
    # it and simply renders the module-level _df/_profile set by set_dataframe().
    if _df is None:
        return "No dataset loaded. Please upload a file first."
    return profile_to_text(_profile, _df)
109
+
110
@tool
def analyze_column(column_name: str) -> str:
    """Deeply analyze a specific column. Provide the exact column name."""
    # Docstring above doubles as the LLM-visible tool description.
    if _df is None:
        return "No dataset loaded."
    if column_name not in _df.columns:
        return f"Column '{column_name}' not found. Available: {_df.columns.tolist()}"

    col = _df[column_name]
    lines = [
        f"Analysis of '{column_name}'",
        f"Type: {col.dtype}",
        f"Non-null: {col.count()} / {len(col)}",
        f"Nulls: {col.isnull().sum()} ({col.isnull().mean()*100:.1f}%)",
    ]
    if pd.api.types.is_numeric_dtype(col):
        # Numeric branch: central tendency, spread, and IQR-based outliers.
        q1, q3 = col.quantile(0.25), col.quantile(0.75)
        spread = q3 - q1
        n_outliers = int(((col < q1 - 1.5 * spread) | (col > q3 + 1.5 * spread)).sum())
        lines += [
            f"Mean: {col.mean():.3f}",
            f"Median: {col.median():.3f}",
            f"Std: {col.std():.3f}",
            f"Min: {col.min()}",
            f"Max: {col.max()}",
            f"Skewness: {col.skew():.3f}",
            f"Outliers: {n_outliers}",
        ]
    else:
        # Categorical branch: cardinality and frequency summary.
        modes = col.mode()
        lines += [
            f"Unique values: {col.nunique()}",
            f"Top 5: {col.value_counts().head(5).to_dict()}",
            f"Most common: {modes[0] if not modes.empty else 'N/A'}",
        ]
    return "\n".join(lines)
132
+
133
@tool
def find_correlations(query: str) -> str:
    """Find correlations between numeric columns. Highlights strong relationships."""
    if _df is None:
        return "No dataset loaded."
    num_cols = _profile["numeric_columns"]
    if len(num_cols) < 2:
        return "Need at least 2 numeric columns."

    corr = _df[num_cols].corr().round(3)

    # Walk the upper triangle and collect |r| >= 0.5 pairs.
    notable = []
    n = len(num_cols)
    for i in range(n):
        for j in range(i + 1, n):
            r = corr.iloc[i, j]
            if abs(r) < 0.5:
                continue
            strength = "strong" if abs(r) >= 0.8 else "moderate"
            direction = "positive" if r > 0 else "negative"
            notable.append(f" {num_cols[i]} <-> {num_cols[j]}: {r} ({strength} {direction})")

    out = ["Correlation Matrix:", corr.to_string()]
    if notable:
        out += ["", "Notable correlations:"] + notable
    else:
        out.append("No strong correlations found (|r| >= 0.5)")
    return "\n".join(out)
156
+
157
@tool
def detect_anomalies(query: str) -> str:
    """Detect outliers and anomalies across all numeric columns using IQR method."""
    if _df is None:
        return "No dataset loaded."
    num_cols = _profile["numeric_columns"]
    if not num_cols:
        return "No numeric columns found."

    report = ["Anomaly Detection Report:"]
    grand_total = 0
    for col in num_cols:
        # Fences are computed on the non-null values; the mask then runs over
        # the full column (NaN comparisons are False, so NaNs never match).
        clean = _df[col].dropna()
        q1, q3 = clean.quantile(0.25), clean.quantile(0.75)
        spread = q3 - q1
        mask = (_df[col] < q1 - 1.5 * spread) | (_df[col] > q3 + 1.5 * spread)
        flagged = _df[mask][col]
        if len(flagged) > 0:
            grand_total += len(flagged)
            report.append(f" {col}: {len(flagged)} outliers | Examples: {flagged.head(3).tolist()}")

    report.append(f"\nTotal outliers: {grand_total}")
    if grand_total == 0:
        report.append("No significant outliers detected.")
    return "\n".join(report)
179
+
180
@tool
def run_aggregation(query: str) -> str:
    """
    Compute group-by aggregations.
    Format input as: 'group_col|agg_col|function'
    Example: 'category|sales|sum'
    Supported: sum, mean, count, max, min, median
    """
    if _df is None:
        return "No dataset loaded."
    # The docstring (= tool description) promises exactly these functions.
    allowed = {"sum", "mean", "count", "max", "min", "median"}
    try:
        parts = [p.strip() for p in query.split("|")]
        if len(parts) == 3:
            group_col, agg_col, func = parts
        elif len(parts) == 2:
            # Function omitted -> default to mean.
            group_col, agg_col, func = parts[0], parts[1], "mean"
        else:
            # No usable spec: fall back to first categorical x first numeric.
            cat_cols = _profile["categorical_columns"]
            num_cols = _profile["numeric_columns"]
            if not cat_cols or not num_cols:
                return "Could not determine columns."
            group_col, agg_col, func = cat_cols[0], num_cols[0], "sum"
        if group_col not in _df.columns:
            return f"Column '{group_col}' not found. Available: {_df.columns.tolist()}"
        if agg_col not in _df.columns:
            return f"Column '{agg_col}' not found. Available: {_df.columns.tolist()}"
        fn = func.lower()
        # BUGFIX: previously any string reached GroupBy.agg, so an
        # unsupported function name produced an opaque pandas error instead
        # of the documented contract.
        if fn not in allowed:
            return f"Unsupported function '{func}'. Supported: {sorted(allowed)}"
        result = _df.groupby(group_col)[agg_col].agg(fn).reset_index().sort_values(agg_col, ascending=False)
        result.columns = [group_col, f"{fn}_{agg_col}"]
        return f"Aggregation: {fn.upper()} of '{agg_col}' by '{group_col}'\n{result.to_string(index=False)}"
    except Exception as e:
        # Tools must return text, never raise, so the agent can self-correct.
        return f"Aggregation error: {str(e)}"
212
+
213
@tool
def generate_insight_report(query: str) -> str:
    """Generate a complete automated insight report with data quality score, patterns, and recommendations."""
    if _df is None:
        return "No dataset loaded."
    rows, cols = _profile["shape"]
    num_cols = _profile["numeric_columns"]
    cat_cols = _profile["categorical_columns"]
    nulls = sum(_profile["null_counts"].values())
    null_pct = (nulls / (rows * cols) * 100) if rows * cols > 0 else 0
    # Heuristic quality score: start at 100, deduct for missingness tiers and
    # for the presence of duplicate rows.
    quality = 100
    if null_pct > 20: quality -= 30
    elif null_pct > 10: quality -= 15
    elif null_pct > 5: quality -= 5
    if _profile["duplicates"] > 0: quality -= 10
    report = [
        "=" * 50, "AUTOMATED INSIGHT REPORT", "=" * 50, "",
        "1. DATASET OVERVIEW",
        f" Rows: {rows:,} | Columns: {cols}",
        f" Numeric: {len(num_cols)} | Categorical: {len(cat_cols)}",
        f" Data Quality Score: {quality}/100", "",
        "2. DATA QUALITY",
        f" Missing values: {nulls} ({null_pct:.1f}%)",
        f" Duplicate rows: {_profile['duplicates']}",
    ]
    if nulls > 0:
        worst = max(_profile["null_pct"].items(), key=lambda x: x[1])
        report.append(f" Worst column: '{worst[0]}' ({worst[1]}% missing)")
    report += ["", "3. KEY STATISTICS"]
    for col in num_cols[:5]:
        stats = _profile.get("numeric_stats", {}).get(col, {})
        report.append(f" {col}: mean={stats.get('mean','?')}, range=[{stats.get('min','?')}, {stats.get('max','?')}]")
    if cat_cols:
        report += ["", "4. CATEGORICAL SUMMARY"]
        for col in cat_cols[:3]:
            # BUGFIX: a non-empty but all-NaN column used to raise IndexError
            # (value_counts() is empty even though the column is not).
            vc = _df[col].value_counts()
            top = vc.index[0] if len(vc) else "N/A"
            report.append(f" {col}: {_df[col].nunique()} unique | most common = '{top}'")
    report += [
        "", "5. RECOMMENDATIONS",
        f" - {'Fix missing values' if null_pct > 5 else 'Data completeness looks good'}",
        f" - {'Remove duplicate rows' if _profile['duplicates'] > 0 else 'No duplicates found'}",
        f" - {'Run correlation analysis' if len(num_cols) >= 2 else 'Need more numeric columns'}",
        f" - {'Encode categorical columns for ML' if cat_cols else 'Add categorical features'}",
        "", "=" * 50,
    ]
    return "\n".join(report)
259
+
260
@tool
def recommend_chart(question: str) -> str:
    """Recommend best chart type for a question. Returns JSON with chart_type, x_col, y_col."""

    def _pick(chart_type, x_col=None, y_col=None):
        # Key order matches the documented contract: chart_type, x_col, y_col.
        return json.dumps({"chart_type": chart_type, "x_col": x_col, "y_col": y_col})

    if _profile is None:
        return _pick("bar_chart")

    num_cols = _profile["numeric_columns"]
    cat_cols = _profile["categorical_columns"]
    dt_cols = _profile["datetime_columns"]
    q = question.lower()

    def _mentions(*words):
        return any(w in q for w in words)

    # Keyword heuristics, most specific first; fall through to sensible
    # defaults based on what column kinds exist.
    if _mentions("trend", "over time", "time", "date") and dt_cols and num_cols:
        return _pick("time_series", dt_cols[0], num_cols[0])
    if _mentions("correlat", "relationship", "vs", "versus") and len(num_cols) >= 2:
        return _pick("correlation_heatmap")
    if _mentions("distribut", "spread", "histogram") and num_cols:
        return _pick("distribution_plots", None, num_cols[0])
    if _mentions("outlier", "box", "range") and num_cols:
        return _pick("box_plots")
    if _mentions("proportion", "share", "percent", "pie") and cat_cols:
        return _pick("pie_chart", cat_cols[0])
    if cat_cols and num_cols:
        return _pick("bar_chart", cat_cols[0], num_cols[0])
    if len(num_cols) >= 2:
        return _pick("scatter", num_cols[0], num_cols[1])
    return _pick("bar_chart")
284
+
285
+ # ══════════════════════════════════════════════
286
+ # AGENT BUILDER
287
+ # ══════════════════════════════════════════════
288
+
289
# Tool registry handed to the agent executor; every entry is a @tool-decorated
# function defined earlier in this module.
TOOLS = [profile_data, analyze_column, find_correlations,
         detect_anomalies, run_aggregation, generate_insight_report, recommend_chart]

# System prompt defining the agent's persona and tool-usage protocol.
# NOTE: the bullet list of tools below must be kept in sync with TOOLS above.
SYSTEM_PROMPT = """You are DataMind, an expert autonomous data analyst AI agent.

You have access to powerful tools to analyze any dataset. When a user asks a question:
1. THINK about what tools you need
2. PLAN your steps (use multiple tools in sequence when needed)
3. EXECUTE each tool
4. SYNTHESIZE the results into a clear, insightful answer
5. SELF-CORRECT if a tool returns an error

Your tools:
- profile_data: Get dataset overview (use this first)
- analyze_column: Deep dive into a specific column
- find_correlations: Find relationships between numeric columns
- detect_anomalies: Find outliers and data quality issues
- run_aggregation: Group-by calculations
- generate_insight_report: Full automated analysis report
- recommend_chart: Suggest best visualization

Always be precise, proactive, and thorough. Use multiple tools when needed.
Remember conversation history and refer to previous questions when relevant."""
312
+
313
def build_agent(llm) -> AgentExecutor:
    """Build a tool-calling agent executor around *llm*.

    Args:
        llm: A LangChain chat model that supports tool calling
            (e.g. ChatGoogleGenerativeAI / Gemini).

    Returns:
        An AgentExecutor wired with SYSTEM_PROMPT, chat-history memory slots
        and the module-level TOOLS registry.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_PROMPT),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ])
    agent = create_tool_calling_agent(llm, TOOLS, prompt)
    return AgentExecutor(
        agent=agent, tools=TOOLS, verbose=True,
        max_iterations=6,
        # FIX: "generate" is only supported by the legacy Agent subclasses;
        # runnable (tool-calling) agents raise ValueError at the iteration
        # limit with it. "force" is supported everywhere and simply returns
        # a stopped-due-to-limit message.
        early_stopping_method="force",
        handle_parsing_errors=True, return_intermediate_steps=True,
    )
326
+
327
def run_agent(question: str, agent_executor: AgentExecutor, chat_history: list) -> dict:
    """Invoke the agent on *question* and normalize the result.

    Never raises: any exception from the executor is folded into the returned
    dict so the UI can render it.

    Returns:
        dict with keys "output" (answer text), "steps" (intermediate tool
        calls) and "error" (None on success, message string on failure).
    """
    payload = {"input": question, "chat_history": chat_history}
    try:
        raw = agent_executor.invoke(payload)
    except Exception as exc:  # surface agent failures to the caller instead of crashing
        return {"output": f"Agent error: {exc}", "steps": [], "error": str(exc)}
    return {
        "output": raw.get("output", "No response."),
        "steps": raw.get("intermediate_steps", []),
        "error": None,
    }
333
+
334
# ── Chart Engine ──────────────────────────────
335
def auto_suggest_charts(profile):
    """Return chart-type names applicable to a dataset profile.

    Args:
        profile: Profile dict with "numeric_columns", "categorical_columns"
            and "datetime_columns" lists.

    Returns:
        List of chart-type strings in a fixed priority order.
    """
    num = profile["numeric_columns"]
    cat = profile["categorical_columns"]
    dt = profile["datetime_columns"]
    # (condition, charts) pairs evaluated in order; the order fixes the
    # priority in which suggestions are presented.
    rules = [
        (len(num) >= 2, ["correlation_heatmap", "scatter_matrix"]),
        (bool(num), ["distribution_plots", "box_plots"]),
        (bool(cat) and bool(num), ["bar_chart", "pie_chart"]),
        (bool(dt) and bool(num), ["time_series"]),
    ]
    return [chart for ok, charts in rules if ok for chart in charts]
346
+
347
def make_plotly_chart(chart_type, df, profile, x_col=None, y_col=None, color_col=None):
    """Build a dark-themed Plotly figure of *chart_type* from *df*.

    Falls back to a column-means bar chart (or an empty placeholder figure)
    when the requested chart type is not applicable to the available columns.

    Args:
        chart_type: One of the names produced by auto_suggest_charts /
            recommend_chart (e.g. "correlation_heatmap", "bar_chart").
        df: Source DataFrame.
        profile: Profile dict with "numeric_columns", "categorical_columns"
            and "datetime_columns" lists.
        x_col, y_col, color_col: Optional column overrides; when omitted,
            profile-derived defaults are used.

    Returns:
        A plotly Figure with the app's dark styling applied.

    NOTE(review): PALETTE, DARK_BG and CARD_BG are presumably module-level
    theme constants defined earlier in this file — confirm they exist before
    reusing this function elsewhere.
    """
    num_cols = profile["numeric_columns"]
    cat_cols = profile["categorical_columns"]
    template = "plotly_dark"
    if chart_type == "correlation_heatmap" and len(num_cols) >= 2:
        # Pairwise Pearson correlations, annotated in-cell, diverging scale centered at 0.
        fig = px.imshow(df[num_cols].corr().round(2), text_auto=True,
                        color_continuous_scale="RdBu_r", title="Correlation Heatmap",
                        template=template, color_continuous_midpoint=0)
    elif chart_type == "distribution_plots" and num_cols:
        # Histogram of one numeric column with a marginal box plot.
        col = y_col or num_cols[0]
        fig = px.histogram(df, x=col, nbins=30, marginal="box",
                           title=f"Distribution of {col}",
                           color_discrete_sequence=PALETTE, template=template)
    elif chart_type == "box_plots" and num_cols:
        # One box trace per numeric column, capped at the first 6 columns.
        fig = go.Figure()
        for i, col in enumerate(num_cols[:6]):
            fig.add_trace(go.Box(y=df[col], name=col, marker_color=PALETTE[i % len(PALETTE)]))
        fig.update_layout(title="Box Plots", template=template)
    elif chart_type == "bar_chart" and cat_cols and num_cols:
        # Mean of a numeric column per category, top 15 categories by value.
        xc, yc = x_col or cat_cols[0], y_col or num_cols[0]
        agg = df.groupby(xc)[yc].mean().reset_index().sort_values(yc, ascending=False).head(15)
        fig = px.bar(agg, x=xc, y=yc, color=yc, color_continuous_scale="Viridis",
                     title=f"Average {yc} by {xc}", template=template)
    elif chart_type == "pie_chart" and cat_cols:
        # Category share, limited to the 8 most frequent values.
        col = x_col or cat_cols[0]
        counts = df[col].value_counts().head(8)
        fig = px.pie(values=counts.values, names=counts.index,
                     title=f"Distribution of {col}",
                     color_discrete_sequence=PALETTE, template=template)
    elif chart_type == "scatter_matrix" and len(num_cols) >= 2:
        # Pairwise scatter over up to 4 numeric columns, colored by the first
        # categorical column when one exists; lower triangle only.
        fig = px.scatter_matrix(df, dimensions=num_cols[:4],
                                color=cat_cols[0] if cat_cols else None,
                                color_discrete_sequence=PALETTE, title="Scatter Matrix", template=template)
        fig.update_traces(diagonal_visible=False, showupperhalf=False)
    elif chart_type == "time_series" and profile["datetime_columns"] and num_cols:
        # Numeric column over the first datetime column, sorted chronologically.
        dt_col = profile["datetime_columns"][0]
        yc = y_col or num_cols[0]
        fig = px.line(df.sort_values(dt_col), x=dt_col, y=yc,
                      title=f"{yc} over Time", color_discrete_sequence=PALETTE, template=template)
    elif chart_type == "scatter" and len(num_cols) >= 2:
        # Two numeric columns with an OLS trendline (requires statsmodels).
        xc, yc = x_col or num_cols[0], y_col or num_cols[1]
        fig = px.scatter(df, x=xc, y=yc,
                         color=color_col or (cat_cols[0] if cat_cols else None),
                         color_discrete_sequence=PALETTE, title=f"{xc} vs {yc}",
                         trendline="ols", template=template)
    elif chart_type == "line" and num_cols:
        # Generic line chart; prefers a datetime x-axis when one is available.
        xc = x_col or (profile["datetime_columns"][0] if profile["datetime_columns"] else num_cols[0])
        yc = y_col or num_cols[0]
        fig = px.line(df, x=xc, y=yc, color_discrete_sequence=PALETTE,
                      title=f"{yc} trend", template=template)
    else:
        # Fallback: bar chart of column means (first 8 numeric columns), or an
        # empty placeholder when the dataset has no numeric columns at all.
        if num_cols:
            means = df[num_cols[:8]].mean()
            fig = px.bar(x=means.index, y=means.values, color=means.values,
                         color_continuous_scale="Viridis", title="Column Means", template=template)
        else:
            fig = go.Figure()
            fig.update_layout(template=template, title="Chart Unavailable")
    # Apply the shared app theme to every figure produced above.
    fig.update_layout(paper_bgcolor=DARK_BG, plot_bgcolor=CARD_BG,
                      font=dict(family="DM Sans, sans-serif", color="#E0E0FF"),
                      margin=dict(l=40, r=40, t=60, b=40))
    return fig
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.3.7
2
+ langchain-google-genai==2.0.5
3
+ langchain-experimental==0.3.3
4
+ langchain-community==0.3.7
5
+ google-generativeai==0.8.3
6
+ pandas==2.2.3
7
+ openpyxl==3.1.5
8
+ xlrd==2.0.1
9
+ matplotlib==3.9.2
10
+ seaborn==0.13.2
11
+ plotly==5.24.1
12
+ streamlit==1.40.1
13
+ python-dotenv==1.0.1
14
+ tabulate==0.9.0
sample_data.csv ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ order_id,date,product,category,region,sales,quantity,profit,customer_age,customer_gender
2
+ 1001,2024-01-05,Laptop Pro,Electronics,North,1200.00,1,240.00,34,Male
3
+ 1002,2024-01-07,Office Chair,Furniture,South,350.00,2,70.00,45,Female
4
+ 1003,2024-01-08,Wireless Mouse,Electronics,East,45.00,5,9.00,28,Male
5
+ 1004,2024-01-10,Standing Desk,Furniture,West,650.00,1,130.00,52,Female
6
+ 1005,2024-01-12,Mechanical Keyboard,Electronics,North,120.00,3,36.00,30,Male
7
+ 1006,2024-01-15,Monitor 4K,Electronics,South,400.00,2,80.00,41,Female
8
+ 1007,2024-01-18,Notebook Set,Stationery,East,25.00,10,7.50,23,Male
9
+ 1008,2024-01-20,Ergonomic Chair,Furniture,West,520.00,1,104.00,38,Female
10
+ 1009,2024-01-22,USB Hub,Electronics,North,35.00,8,10.50,26,Male
11
+ 1010,2024-01-25,Desk Lamp,Furniture,South,60.00,4,18.00,49,Female
12
+ 1011,2024-02-01,Laptop Pro,Electronics,East,1200.00,2,480.00,36,Male
13
+ 1012,2024-02-03,Wireless Headphones,Electronics,West,200.00,3,60.00,31,Female
14
+ 1013,2024-02-05,Pen Set,Stationery,North,15.00,20,6.00,22,Male
15
+ 1014,2024-02-08,Gaming Chair,Furniture,South,450.00,1,90.00,27,Female
16
+ 1015,2024-02-10,Tablet,Electronics,East,600.00,2,120.00,43,Male
17
+ 1016,2024-02-14,Bookshelf,Furniture,West,180.00,1,36.00,55,Female
18
+ 1017,2024-02-16,Webcam HD,Electronics,North,80.00,6,24.00,29,Male
19
+ 1018,2024-02-18,Sticky Notes,Stationery,South,8.00,50,4.00,24,Female
20
+ 1019,2024-02-20,Monitor Stand,Furniture,East,95.00,3,28.50,37,Male
21
+ 1020,2024-02-22,Smartphone,Electronics,West,900.00,2,180.00,33,Female
22
+ 1021,2024-03-01,Laptop Pro,Electronics,North,1200.00,3,720.00,40,Male
23
+ 1022,2024-03-04,Office Chair,Furniture,South,350.00,4,140.00,48,Female
24
+ 1023,2024-03-06,Drawing Tablet,Electronics,East,300.00,1,60.00,25,Male
25
+ 1024,2024-03-09,Filing Cabinet,Furniture,West,220.00,2,44.00,53,Female
26
+ 1025,2024-03-12,Wireless Mouse,Electronics,North,45.00,10,22.50,32,Male
27
+ 1026,2024-03-15,External SSD,Electronics,South,150.00,4,45.00,44,Female
28
+ 1027,2024-03-18,Highlighters,Stationery,East,12.00,30,5.40,21,Male
29
+ 1028,2024-03-20,Desk Organizer,Furniture,West,40.00,7,14.00,35,Female
30
+ 1029,2024-03-22,Smart Speaker,Electronics,North,120.00,5,36.00,39,Male
31
+ 1030,2024-03-25,Printer,Electronics,South,280.00,2,56.00,46,Female