Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Sleeping

App Files Files Community

Rifqi Hafizuddin committed 6 days ago

Commit

7f3bb97

1 Parent(s): a4cf97a

[NOTICKET][DB] refactor code to new repo

Browse files

Files changed (4) hide show

src/pipeline/db_pipeline/__init__.py +3 -0
src/pipeline/db_pipeline/connector.py +74 -0
src/pipeline/db_pipeline/extractor.py +177 -0
src/pipeline/db_pipeline/pipeline.py +66 -0

src/pipeline/db_pipeline/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from src.pipeline.db_pipeline.pipeline import run_db_pipeline
2	+
3	+ __all__ = ["run_db_pipeline"]

src/pipeline/db_pipeline/connector.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""Connectors for user-provided databases.
+The pipeline does not own user credentials — an API layer (outside this folder)
+builds an Engine via `connect(...)` and passes it to `run_db_pipeline`. Use
+`engine_scope(...)` for guaranteed disposal of the connection pool.
+"""
+from contextlib import contextmanager
+from typing import Iterator, Literal
+from sqlalchemy import URL, create_engine
+from sqlalchemy.engine import Engine
+from src.middlewares.logging import get_logger
+logger = get_logger("db_connector")
+DbType = Literal["postgresql", "mysql", "sqlserver"]
def get_postgres_engine(
    host: str, port: int, dbname: str, username: str, password: str
) -> Engine:
    """Create a SQLAlchemy engine for a PostgreSQL database.

    The connection URL is assembled from its individual components with
    ``URL.create`` instead of string formatting, so credentials containing
    special characters are escaped by SQLAlchemy rather than by hand.

    Args:
        host: Database server host name or address.
        port: Database server port.
        dbname: Name of the database to connect to.
        username: Login user.
        password: Login password.

    Returns:
        An Engine configured for the ``postgresql+psycopg2`` driver.
    """
    url_parts = {
        "drivername": "postgresql+psycopg2",
        "username": username,
        "password": password,
        "host": host,
        "port": port,
        "database": dbname,
    }
    connection_url = URL.create(**url_parts)
    return create_engine(connection_url)
def connect(
    db_type: DbType,
    host: str,
    port: int,
    dbname: str,
    username: str,
    password: str,
) -> Engine:
    """Build a SQLAlchemy engine for a user-provided database.

    The attempt is logged with the target's type, host, port and database
    name; the username and password are not part of the log entry.

    Args:
        db_type: Which database backend to connect to.
        host: Database server host name or address.
        port: Database server port.
        dbname: Name of the database to connect to.
        username: Login user.
        password: Login password.

    Returns:
        An Engine for the requested database.

    Raises:
        NotImplementedError: If ``db_type`` is "sqlserver" or "mysql".
        ValueError: If ``db_type`` is any other unrecognized value.
    """
    logger.info("connecting to user db", db_type=db_type, host=host, port=port, dbname=dbname)

    # The only backend with a working connector today.
    if db_type == "postgresql":
        return get_postgres_engine(host, port, dbname, username, password)

    # Known backends that are declared in DbType but not implemented yet.
    not_yet_supported = (
        ("sqlserver", "SQL Server support coming soon"),
        ("mysql", "MySQL support coming soon"),
    )
    for backend, message in not_yet_supported:
        if db_type == backend:
            raise NotImplementedError(message)

    raise ValueError(f"Unsupported db_type: {db_type}")
@contextmanager
def engine_scope(
    db_type: DbType,
    host: str,
    port: int,
    dbname: str,
    username: str,
    password: str,
) -> Iterator[Engine]:
    """Context manager that hands out an Engine and disposes it afterwards.

    The engine is created through ``connect(...)`` with the given arguments.
    ``Engine.dispose()`` is called when the ``with`` block exits, whether it
    finishes normally or raises, so callers that use this instead of a raw
    ``connect(...)`` do not leave connection pools behind between runs.

    Yields:
        The Engine returned by ``connect(...)``.
    """
    scoped_engine = connect(
        db_type=db_type,
        host=host,
        port=port,
        dbname=dbname,
        username=username,
        password=password,
    )
    try:
        yield scoped_engine
    finally:
        # Always release the pool, even when the caller's block raised.
        scoped_engine.dispose()

src/pipeline/db_pipeline/extractor.py ADDED Viewed

	@@ -0,0 +1,177 @@

+"""Schema introspection and per-column profiling for a user's database.
+Identifiers (table/column names) are quoted via the engine's dialect preparer,
+which handles reserved words, mixed case, and embedded quotes correctly across
+dialects. Values used in SQL come from SQLAlchemy inspection of the DB itself,
+not user input.
+"""
+from typing import Optional
+import pandas as pd
+from sqlalchemy import Float, Integer, Numeric, inspect
+from sqlalchemy.engine import Engine
+from src.middlewares.logging import get_logger
+logger = get_logger("db_extractor")
+TOP_VALUES_THRESHOLD = 0.05  # show top values if distinct_ratio <= 5%
def _qi(engine: Engine, name: str) -> str:
    """Quote an identifier using the engine dialect's identifier preparer.

    A dotted name is split at its first "." only; the part before and the
    part after are quoted separately and re-joined with ".", which covers the
    ``schema.table`` form. A name without a dot is quoted as a whole.
    """
    quote = engine.dialect.identifier_preparer.quote
    # maxsplit=1 yields one piece for a plain name and exactly two for a
    # dotted one, so any further dots stay inside the second piece.
    pieces = name.split(".", 1)
    return ".".join(quote(piece) for piece in pieces)
def get_schema(
    engine: Engine, exclude_tables: Optional[frozenset[str]] = None
) -> dict[str, list[dict]]:
    """Introspect the tables reported by the engine's inspector.

    Args:
        engine: Engine connected to the database to inspect.
        exclude_tables: Table names to leave out of the result. ``None`` (or
            an empty set) excludes nothing.

    Returns:
        A mapping of table name to a list of column descriptions. Each
        description is a dict with the keys ``name``, ``type`` (the column
        type rendered as a string), ``is_numeric``, ``is_primary_key`` and
        ``foreign_key`` (``"referred_table.referred_column"`` or ``None``).
    """
    skipped = frozenset() if not exclude_tables else exclude_tables
    inspector = inspect(engine)
    numeric_types = (Integer, Numeric, Float)

    schema = {}
    for table_name in inspector.get_table_names():
        if table_name in skipped:
            continue

        pk_info = inspector.get_pk_constraint(table_name)
        primary_cols = set(pk_info["constrained_columns"]) if pk_info else set()

        # local column -> "referred_table.referred_column"; when a column
        # takes part in several foreign keys the last one reported wins.
        fk_targets = {
            local_col: f"{fk['referred_table']}.{remote_col}"
            for fk in inspector.get_foreign_keys(table_name)
            for local_col, remote_col in zip(
                fk["constrained_columns"], fk["referred_columns"]
            )
        }

        described = []
        for column in inspector.get_columns(table_name):
            column_name = column["name"]
            column_type = column["type"]
            described.append(
                {
                    "name": column_name,
                    "type": str(column_type),
                    "is_numeric": isinstance(column_type, numeric_types),
                    "is_primary_key": column_name in primary_cols,
                    "foreign_key": fk_targets.get(column_name),
                }
            )
        schema[table_name] = described

    logger.info("extracted schema", table_count=len(schema))
    return schema
def get_row_count(engine: Engine, table_name: str) -> int:
    """Return the number of rows in ``table_name``.

    Args:
        engine: Engine connected to the database that owns the table.
        table_name: Table to count; quoted via ``_qi`` before being placed
            in the query.

    Returns:
        The ``COUNT(*)`` result as a built-in ``int``.
    """
    counted = pd.read_sql(f"SELECT COUNT(*) FROM {_qi(engine, table_name)}", engine)
    # .iloc yields a numpy integer scalar; coerce it so the declared return
    # type holds and the value stays serializable downstream.
    return int(counted.iloc[0, 0])
def profile_column(
    engine: Engine,
    table_name: str,
    col_name: str,
    is_numeric: bool,
    row_count: int,
) -> dict:
    """Compute summary statistics for a single column.

    Args:
        engine: Engine connected to the database that owns the table.
        table_name: Table containing the column (quoted via ``_qi``).
        col_name: Column to profile.
        is_numeric: When true, MIN and MAX are included in the profile.
        row_count: Row count of the table, used to derive the distinct ratio.

    Returns:
        A dict with ``null_count``, ``distinct_count``, ``distinct_ratio``
        (rounded to 4 decimals) and ``sample_values`` (up to 5 values).
        ``min``/``max`` are present for numeric columns, and ``top_values``
        (up to 10 ``(value, count)`` pairs, most frequent first) is present
        when the distinct ratio is positive and at most
        ``TOP_VALUES_THRESHOLD``. For ``row_count == 0`` only the zeroed
        counts and an empty ``sample_values`` list are returned, and no query
        is issued.
    """
    if row_count == 0:
        return {
            "null_count": 0,
            "distinct_count": 0,
            "distinct_ratio": 0.0,
            "sample_values": [],
        }
    qt = _qi(engine, table_name)
    # Quote the column as ONE identifier. A "." inside a column name is part
    # of the name, so the schema.table splitting done by _qi must not apply.
    qc = engine.dialect.identifier_preparer.quote(col_name)

    # Combined stats query: null_count, distinct_count, and min/max (if
    # numeric) in one round-trip instead of two.
    select_cols = [
        f"COUNT(*) - COUNT({qc}) AS nulls",
        f"COUNT(DISTINCT {qc}) AS distincts",
    ]
    if is_numeric:
        select_cols.append(f"MIN({qc}) AS min_val")
        select_cols.append(f"MAX({qc}) AS max_val")
    stats = pd.read_sql(f"SELECT {', '.join(select_cols)} FROM {qt}", engine)

    # Results are read by position (the order of select_cols) and one column
    # at a time, so they do not depend on how the driver spells result labels
    # and are not upcast together with the other columns of the row.
    null_count = int(stats.iloc[0, 0])
    distinct_count = int(stats.iloc[0, 1])
    distinct_ratio = distinct_count / row_count if row_count > 0 else 0
    profile = {
        "null_count": null_count,
        "distinct_count": distinct_count,
        "distinct_ratio": round(distinct_ratio, 4),
    }
    if is_numeric:
        # tolist() converts numpy scalars to native Python values, matching
        # what top_values and sample_values contain.
        profile["min"] = stats.iloc[:, 2].tolist()[0]
        profile["max"] = stats.iloc[:, 3].tolist()[0]

    # NOTE(review): LIMIT is not accepted by every SQL dialect (SQL Server
    # uses TOP / OFFSET-FETCH) - revisit when connectors beyond PostgreSQL
    # are enabled.
    if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
        # Order by the aggregate itself rather than the "cnt" alias: a
        # profiled column that is itself named "cnt" would make the alias
        # reference ambiguous.
        top = pd.read_sql(
            f"SELECT {qc}, COUNT(*) AS cnt FROM {qt} "
            f"GROUP BY {qc} ORDER BY COUNT(*) DESC LIMIT 10",
            engine,
        )
        profile["top_values"] = list(
            zip(top.iloc[:, 0].tolist(), top.iloc[:, 1].tolist())
        )

    sample = pd.read_sql(f"SELECT {qc} FROM {qt} LIMIT 5", engine)
    profile["sample_values"] = sample.iloc[:, 0].tolist()
    return profile
def profile_table(engine: Engine, table_name: str, columns: list[dict]) -> list[dict]:
    """Profile each column of one table.

    A column whose profiling or text rendering raises is logged and left out
    of the result, so a single bad column does not abort the whole table.

    Args:
        engine: Engine connected to the database that owns the table.
        table_name: Table to profile.
        columns: Column descriptions; each needs a ``name`` key and may carry
            ``is_numeric`` (treated as ``False`` when absent).

    Returns:
        One ``{"col", "profile", "text"}`` dict per successfully profiled
        column, or an empty list when the table has no rows.
    """
    total_rows = get_row_count(engine, table_name)
    if total_rows == 0:
        logger.info("skipping empty table", table=table_name)
        return []

    profiled = []
    for column in columns:
        try:
            column_profile = profile_column(
                engine,
                table_name,
                column["name"],
                column.get("is_numeric", False),
                total_rows,
            )
            description = build_text(table_name, total_rows, column, column_profile)
        except Exception as exc:
            # Best effort by design: report the failure and move on.
            logger.error(
                "column profiling failed",
                table=table_name,
                column=column["name"],
                error=str(exc),
            )
        else:
            profiled.append(
                {"col": column, "profile": column_profile, "text": description}
            )
    return profiled
def build_text(table_name: str, row_count: int, col: dict, profile: dict) -> str:
    """Render a column profile as a multi-line, human-readable description.

    Args:
        table_name: Name of the table the column belongs to.
        row_count: Number of rows in that table.
        col: Column description; ``name`` and ``type`` are required, while
            ``is_primary_key`` and ``foreign_key`` are optional.
        profile: Column statistics; ``null_count``, ``distinct_count``,
            ``distinct_ratio`` and ``sample_values`` are required, while
            ``min``/``max`` and ``top_values`` are rendered only if present.

    Returns:
        The description, one fact per line, without a trailing newline.
    """
    # A primary-key marker takes precedence over a foreign-key marker.
    if col.get("is_primary_key"):
        key_label = " [PRIMARY KEY]"
    elif col.get("foreign_key"):
        key_label = f" [FK -> {col['foreign_key']}]"
    else:
        key_label = ""

    lines = [
        f"Table: {table_name} ({row_count} rows)",
        f"Column: {col['name']} ({col['type']}){key_label}",
        f"Null count: {profile['null_count']}",
        f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})",
    ]
    if "min" in profile:
        lines.append(f"Min: {profile['min']}, Max: {profile['max']}")
    if "top_values" in profile:
        rendered_top = ", ".join(
            f"{value} ({count})" for value, count in profile["top_values"]
        )
        lines.append(f"Top values: {rendered_top}")
    lines.append(f"Sample values: {profile['sample_values']}")
    return "\n".join(lines)

src/pipeline/db_pipeline/pipeline.py ADDED Viewed

	@@ -0,0 +1,66 @@

+"""End-to-end DB ingestion pipeline: introspect user's DB -> profile columns ->
+build text -> embed + store in the shared PGVector collection.
+Each column becomes one LangChainDocument with metadata tagging user_id and
+source_type='database', so it is retrievable via the existing retriever.
+"""
+import asyncio
+from typing import Optional
+from langchain_core.documents import Document as LangChainDocument
+from sqlalchemy.engine import Engine
+from src.db.postgres.vector_store import get_vector_store
+from src.middlewares.logging import get_logger
+from src.pipeline.db_pipeline.extractor import get_schema, profile_table
+logger = get_logger("db_pipeline")
def _to_document(user_id: str, table_name: str, entry: dict) -> LangChainDocument:
    """Wrap one profiled-column entry as a LangChain document.

    Args:
        user_id: Owner of the ingested database; stored in the metadata.
        table_name: Table the column belongs to; stored in the metadata.
        entry: Dict with a ``col`` description and the rendered ``text``.

    Returns:
        A document whose page content is the entry's text and whose metadata
        tags it with ``source_type='database'`` plus the column's name, type
        and key information.
    """
    column = entry["col"]
    content = entry["text"]
    metadata = {
        "user_id": user_id,
        "source_type": "database",
        "table_name": table_name,
        "column_name": column["name"],
        "column_type": column["type"],
        "is_primary_key": column.get("is_primary_key", False),
        "foreign_key": column.get("foreign_key"),
    }
    return LangChainDocument(page_content=content, metadata=metadata)
async def run_db_pipeline(
    user_id: str,
    engine: Engine,
    exclude_tables: Optional[frozenset[str]] = None,
) -> int:
    """Ingest column-level descriptions of a user's database into the vector store.

    Schema introspection and table profiling are synchronous, so each is
    dispatched with ``asyncio.to_thread``; the vector-store write is awaited
    directly. Tables are processed one after another, and a table that yields
    no documents is skipped without a write.

    Args:
        user_id: Owner of the database; attached to every stored document.
        engine: Engine connected to the user's database.
        exclude_tables: Table names to leave out of the ingestion.

    Returns:
        Total number of chunks ingested.
    """
    store = get_vector_store()
    logger.info("db pipeline start", user_id=user_id)

    tables = await asyncio.to_thread(get_schema, engine, exclude_tables)

    ingested = 0
    for table_name, columns in tables.items():
        logger.info("profiling table", table=table_name, columns=len(columns))
        profiled = await asyncio.to_thread(profile_table, engine, table_name, columns)
        documents = [_to_document(user_id, table_name, item) for item in profiled]
        if not documents:
            continue
        await store.aadd_documents(documents)
        ingested += len(documents)
        logger.info("ingested chunks", table=table_name, count=len(documents))

    logger.info("db pipeline complete", user_id=user_id, total=ingested)
    return ingested