Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Running

App Files Files Community

[KM-437][DB] Add db pipeline

by rhbt6767 - opened 3 days ago

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+460

-141

Files changed (9) hide show

.gitignore +2 -0
src/api/v1/document.py +24 -130
src/knowledge/processing_service.py +23 -11
src/pipeline/db_pipeline/__init__.py +3 -0
src/pipeline/db_pipeline/connector.py +74 -0
src/pipeline/db_pipeline/extractor.py +186 -0
src/pipeline/db_pipeline/pipeline.py +68 -0
src/pipeline/document_pipeline/__init__.py +0 -0
src/pipeline/document_pipeline/document_pipeline.py +80 -0

.gitignore CHANGED Viewed

@@ -26,6 +26,8 @@ test/users/user_accounts.csv
 .env.prd
 .env.example
 erd/
 playground/
 playground_retriever.py

 .env.prd
 .env.example
+CLAUDE.md
 erd/
 playground/
 playground_retriever.py

src/api/v1/document.py CHANGED Viewed

@@ -1,21 +1,20 @@
 """Document management API endpoints."""
-from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File, status
 from sqlalchemy.ext.asyncio import AsyncSession
 from src.db.postgres.connection import get_db
 from src.document.document_service import document_service
-from src.knowledge.processing_service import knowledge_processor
-from src.storage.az_blob.az_blob import blob_storage
 from src.middlewares.logging import get_logger, log_execution
 from src.middlewares.rate_limit import limiter
 from pydantic import BaseModel
 from typing import List
 logger = get_logger("document_api")
 router = APIRouter(prefix="/api/v1", tags=["Documents"])
 class DocumentResponse(BaseModel):
     id: str
     filename: str
@@ -23,8 +22,8 @@ class DocumentResponse(BaseModel):
     file_size: int
     file_type: str
     created_at: str
 @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
 @log_execution(logger)
 async def list_documents(
@@ -44,8 +43,8 @@ async def list_documents(
         )
         for doc in documents
     ]
 @router.post("/document/upload")
 @limiter.limit("10/minute")
 @log_execution(logger)
@@ -57,57 +56,12 @@ async def upload_document(
 ):
     """Upload a document."""
     if not user_id:
-        raise HTTPException(
-            status_code=400,
-            detail="user_id is required"
-        )
-    try:
-        # Read file content
-        content = await file.read()
-        file_size = len(content)
-        # Get file type
-        filename = file.filename
-        file_type = filename.split('.')[-1].lower() if '.' in filename else 'txt'
-        if file_type not in ['pdf', 'docx', 'txt']:
-            raise HTTPException(
-                status_code=400,
-                detail="Unsupported file type. Supported: pdf, docx, txt"
-            )
-        # Upload to blob storage
-        blob_name = await blob_storage.upload_file(content, filename, user_id)
-        # Create document record
-        document = await document_service.create_document(
-            db=db,
-            user_id=user_id,
-            filename=filename,
-            blob_name=blob_name,
-            file_size=file_size,
-            file_type=file_type
-        )
-        return {
-            "status": "success",
-            "message": "Document uploaded successfully",
-            "data": {
-                "id": document.id,
-                "filename": document.filename,
-                "status": document.status
-            }
-        }
-    except Exception as e:
-        logger.error(f"Upload failed for user {user_id}", error=str(e))
-        raise HTTPException(
-            status_code=500,
-            detail=f"Upload failed: {str(e)}"
-        )
 @router.delete("/document/delete")
 @log_execution(logger)
 async def delete_document(
@@ -116,31 +70,10 @@ async def delete_document(
     db: AsyncSession = Depends(get_db)
 ):
     """Delete a document."""
-    document = await document_service.get_document(db, document_id)
-    if not document:
-        raise HTTPException(
-            status_code=404,
-            detail="Document not found"
-        )
-    if document.user_id != user_id:
-        raise HTTPException(
-            status_code=403,
-            detail="Access denied"
-        )
-    success = await document_service.delete_document(db, document_id)
-    if success:
-        return {"status": "success", "message": "Document deleted successfully"}
-    else:
-        raise HTTPException(
-            status_code=500,
-            detail="Failed to delete document"
-        )
 @router.post("/document/process")
 @log_execution(logger)
 async def process_document(
@@ -149,45 +82,6 @@ async def process_document(
     db: AsyncSession = Depends(get_db)
 ):
     """Process document and ingest to vector index."""
-    document = await document_service.get_document(db, document_id)
-    if not document:
-        raise HTTPException(
-            status_code=404,
-            detail="Document not found"
-        )
-    if document.user_id != user_id:
-        raise HTTPException(
-            status_code=403,
-            detail="Access denied"
-        )
-    try:
-        # Update status to processing
-        await document_service.update_document_status(db, document_id, "processing")
-        # Process document
-        chunks_count = await knowledge_processor.process_document(document, db)
-        # Update status to completed
-        await document_service.update_document_status(db, document_id, "completed")
-        return {
-            "status": "success",
-            "message": "Document processed successfully",
-            "data": {
-                "document_id": document_id,
-                "chunks_processed": chunks_count
-            }
-        }
-    except Exception as e:
-        logger.error(f"Processing failed for document {document_id}", error=str(e))
-        await document_service.update_document_status(
-            db, document_id, "failed", str(e)
-        )
-        raise HTTPException(
-            status_code=500,
-            detail=f"Processing failed: {str(e)}"
-        )

 """Document management API endpoints."""
+from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
 from sqlalchemy.ext.asyncio import AsyncSession
 from src.db.postgres.connection import get_db
 from src.document.document_service import document_service
 from src.middlewares.logging import get_logger, log_execution
 from src.middlewares.rate_limit import limiter
+from src.pipeline.document_pipeline.document_pipeline import document_pipeline
 from pydantic import BaseModel
 from typing import List
 logger = get_logger("document_api")
 router = APIRouter(prefix="/api/v1", tags=["Documents"])
 class DocumentResponse(BaseModel):
     id: str
     filename: str
     file_size: int
     file_type: str
     created_at: str
 @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
 @log_execution(logger)
 async def list_documents(
         )
         for doc in documents
     ]
 @router.post("/document/upload")
 @limiter.limit("10/minute")
 @log_execution(logger)
 ):
     """Upload a document."""
     if not user_id:
+        raise HTTPException(status_code=400, detail="user_id is required")
+    data = await document_pipeline.upload(file, user_id, db)
+    return {"status": "success", "message": "Document uploaded successfully", "data": data}
 @router.delete("/document/delete")
 @log_execution(logger)
 async def delete_document(
     db: AsyncSession = Depends(get_db)
 ):
     """Delete a document."""
+    await document_pipeline.delete(document_id, user_id, db)
+    return {"status": "success", "message": "Document deleted successfully"}
 @router.post("/document/process")
 @log_execution(logger)
 async def process_document(
     db: AsyncSession = Depends(get_db)
 ):
     """Process document and ingest to vector index."""
+    data = await document_pipeline.process(document_id, user_id, db)
+    return {"status": "success", "message": "Document processed successfully", "data": data}

src/knowledge/processing_service.py CHANGED Viewed

@@ -49,10 +49,14 @@ class KnowledgeProcessingService:
                     LangChainDocument(
                         page_content=chunk,
                         metadata={
-                            "document_id": db_doc.id,
                             "user_id": db_doc.user_id,
-                            "filename": db_doc.filename,
-                            "chunk_index": i,
                         }
                     )
                     for i, chunk in enumerate(chunks)
@@ -104,11 +108,15 @@ class KnowledgeProcessingService:
                         documents.append(LangChainDocument(
                             page_content=chunk,
                             metadata={
-                                "document_id": db_doc.id,
                                 "user_id": db_doc.user_id,
-                                "filename": db_doc.filename,
-                                "chunk_index": len(documents),
-                                "page_label": page.page_number,
                             }
                         ))
         else:
@@ -122,11 +130,15 @@ class KnowledgeProcessingService:
                     documents.append(LangChainDocument(
                         page_content=chunk,
                         metadata={
-                            "document_id": db_doc.id,
                             "user_id": db_doc.user_id,
-                            "filename": db_doc.filename,
-                            "chunk_index": len(documents),
-                            "page_label": page_num,
                         }
                     ))

                     LangChainDocument(
                         page_content=chunk,
                         metadata={
                             "user_id": db_doc.user_id,
+                            "source_type": "document",
+                            "data": {
+                                "document_id": db_doc.id,
+                                "filename": db_doc.filename,
+                                "file_type": db_doc.file_type,
+                                "chunk_index": i,
+                            },
                         }
                     )
                     for i, chunk in enumerate(chunks)
                         documents.append(LangChainDocument(
                             page_content=chunk,
                             metadata={
                                 "user_id": db_doc.user_id,
+                                "source_type": "document",
+                                "data": {
+                                    "document_id": db_doc.id,
+                                    "filename": db_doc.filename,
+                                    "file_type": db_doc.file_type,
+                                    "chunk_index": len(documents),
+                                    "page_label": page.page_number,
+                                },
                             }
                         ))
         else:
                     documents.append(LangChainDocument(
                         page_content=chunk,
                         metadata={
                             "user_id": db_doc.user_id,
+                            "source_type": "document",
+                            "data": {
+                                "document_id": db_doc.id,
+                                "filename": db_doc.filename,
+                                "file_type": db_doc.file_type,
+                                "chunk_index": len(documents),
+                                "page_label": page_num,
+                            },
                         }
                     ))

src/pipeline/db_pipeline/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from src.pipeline.db_pipeline.pipeline import run_db_pipeline
2	+
3	+ __all__ = ["run_db_pipeline"]

src/pipeline/db_pipeline/connector.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""Connectors for user-provided databases.
+The pipeline does not own user credentials — an API layer (outside this folder)
+builds an Engine via `connect(...)` and passes it to `run_db_pipeline`. Use
+`engine_scope(...)` for guaranteed disposal of the connection pool.
+"""
+from contextlib import contextmanager
+from typing import Iterator, Literal
+from sqlalchemy import URL, create_engine
+from sqlalchemy.engine import Engine
+from src.middlewares.logging import get_logger
+logger = get_logger("db_connector")
+DbType = Literal["postgresql", "mysql", "sqlserver"]
def get_postgres_engine(
    host: str, port: int, dbname: str, username: str, password: str
) -> Engine:
    """Create a SQLAlchemy engine for a Postgres database using psycopg2.

    The connection URL is assembled with ``URL.create`` from its individual
    components instead of string formatting, so special characters in the
    password do not need manual escaping.
    """
    connection_parts = {
        "username": username,
        "password": password,
        "host": host,
        "port": port,
        "database": dbname,
    }
    pg_url = URL.create("postgresql+psycopg2", **connection_parts)
    return create_engine(pg_url)
def connect(
    db_type: DbType,
    host: str,
    port: int,
    dbname: str,
    username: str,
    password: str,
) -> Engine:
    """Build a SQLAlchemy engine for a user-provided database.

    Only ``"postgresql"`` is implemented. ``"sqlserver"`` and ``"mysql"`` are
    recognised but raise ``NotImplementedError``; any other ``db_type`` raises
    ``ValueError``. The target (never the credentials) is logged first.
    """
    logger.info("connecting to user db", db_type=db_type, host=host, port=port, dbname=dbname)

    if db_type == "postgresql":
        return get_postgres_engine(host, port, dbname, username, password)

    # Known dialects whose connectors have not been written yet.
    not_yet_supported = (
        ("sqlserver", "SQL Server support coming soon"),
        ("mysql", "MySQL support coming soon"),
    )
    for known_type, message in not_yet_supported:
        if db_type == known_type:
            raise NotImplementedError(message)

    raise ValueError(f"Unsupported db_type: {db_type}")
@contextmanager
def engine_scope(
    db_type: DbType,
    host: str,
    port: int,
    dbname: str,
    username: str,
    password: str,
) -> Iterator[Engine]:
    """Context manager that yields an Engine and always disposes it afterwards.

    Prefer this to calling ``connect(...)`` directly from API code: the
    engine's connection pool is released when the ``with`` block ends, so
    pools for user databases do not accumulate across pipeline runs.
    """
    scoped_engine = connect(db_type, host, port, dbname, username, password)
    try:
        yield scoped_engine
    finally:
        # Executed on normal exit and when the with-body raises.
        scoped_engine.dispose()

src/pipeline/db_pipeline/extractor.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""Schema introspection and per-column profiling for a user's database.
+Identifiers (table/column names) are quoted via the engine's dialect preparer,
+which handles reserved words, mixed case, and embedded quotes correctly across
+dialects. Values used in SQL come from SQLAlchemy inspection of the DB itself,
+not user input.
+"""
+from typing import Optional
+import pandas as pd
+from sqlalchemy import Float, Integer, Numeric, inspect
+from sqlalchemy.engine import Engine
+from src.middlewares.logging import get_logger
+logger = get_logger("db_extractor")
+TOP_VALUES_THRESHOLD = 0.05  # show top values if distinct_ratio <= 5%
+def _qi(engine: Engine, name: str) -> str:
+    """Dialect-correct identifier quoting (schema.table also handled if dotted)."""
+    preparer = engine.dialect.identifier_preparer
+    if "." in name:
+        schema, _, table = name.partition(".")
+        return f"{preparer.quote(schema)}.{preparer.quote(table)}"
+    return preparer.quote(name)
def get_schema(
    engine: Engine, exclude_tables: Optional[frozenset[str]] = None
) -> dict[str, list[dict]]:
    """Introspect the tables reported by the engine's inspector.

    Returns a mapping ``{table_name: [column_info, ...]}`` where each
    ``column_info`` has the keys ``name``, ``type`` (stringified SQLAlchemy
    type), ``is_numeric``, ``is_primary_key`` and ``foreign_key``
    (``"referred_table.referred_column"`` or ``None``). Tables named in
    ``exclude_tables`` are left out.
    """
    skipped = exclude_tables if exclude_tables else frozenset()
    db_inspector = inspect(engine)
    tables: dict[str, list[dict]] = {}

    for table_name in db_inspector.get_table_names():
        if table_name in skipped:
            continue

        pk_info = db_inspector.get_pk_constraint(table_name)
        primary_cols = set(pk_info["constrained_columns"]) if pk_info else set()

        # local column -> "referred_table.referred_column"
        references = {
            local: f"{fk['referred_table']}.{remote}"
            for fk in db_inspector.get_foreign_keys(table_name)
            for local, remote in zip(fk["constrained_columns"], fk["referred_columns"])
        }

        described = []
        for column in db_inspector.get_columns(table_name):
            described.append(
                {
                    "name": column["name"],
                    "type": str(column["type"]),
                    "is_numeric": isinstance(column["type"], (Integer, Numeric, Float)),
                    "is_primary_key": column["name"] in primary_cols,
                    "foreign_key": references.get(column["name"]),
                }
            )
        tables[table_name] = described

    logger.info("extracted schema", table_count=len(tables))
    return tables
def get_row_count(engine: Engine, table_name: str) -> int:
    """Return the number of rows in ``table_name`` as a built-in ``int``.

    The table name is quoted with ``_qi`` before being placed in the query.
    """
    counts = pd.read_sql(f"SELECT COUNT(*) FROM {_qi(engine, table_name)}", engine)
    # ``iloc`` hands back a NumPy integer; convert it so the value matches the
    # ``-> int`` annotation.
    return int(counts.iloc[0, 0])
def _to_builtin(value):
    """Return ``value`` as a built-in Python scalar when it is a NumPy scalar."""
    return value.item() if hasattr(value, "item") else value


def profile_column(
    engine: Engine,
    table_name: str,
    col_name: str,
    is_numeric: bool,
    row_count: int,
) -> dict:
    """Collect summary statistics for one column.

    Args:
        engine: Engine connected to the database being profiled.
        table_name: Table that owns the column (quoted via ``_qi``).
        col_name: Column to profile.
        is_numeric: When true, min / max / mean / median are also computed.
        row_count: Total rows in the table; used for the distinct ratio and
            to short-circuit empty tables.

    Returns:
        A dict with ``null_count``, ``distinct_count``, ``distinct_ratio`` and
        ``sample_values``; plus ``min`` / ``max`` / ``mean`` / ``median`` for
        numeric columns, and ``top_values`` (list of ``(value, count)``) when
        the distinct ratio is at most ``TOP_VALUES_THRESHOLD``.
    """
    if row_count == 0:
        return {
            "null_count": 0,
            "distinct_count": 0,
            "distinct_ratio": 0.0,
            "sample_values": [],
        }

    qt = _qi(engine, table_name)
    # Quote the column as ONE identifier. Going through ``_qi`` would split a
    # column name that contains a dot into a bogus ``schema.table`` pair.
    qc = engine.dialect.identifier_preparer.quote(col_name)

    # Combined stats query: null_count, distinct_count, and numeric aggregates
    # in one round-trip instead of several.
    select_cols = [
        f"COUNT(*) - COUNT({qc}) AS nulls",
        f"COUNT(DISTINCT {qc}) AS distincts",
    ]
    if is_numeric:
        select_cols += [
            f"MIN({qc}) AS min_val",
            f"MAX({qc}) AS max_val",
            f"AVG({qc}) AS mean_val",
            # This ordered-set aggregate form is Postgres syntax. NOTE(review):
            # SQL Server exposes PERCENTILE_CONT only as an analytic function
            # (needs an OVER clause) and MySQL has no equivalent, so both will
            # need dialect-specific SQL when their connectors are added.
            f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {qc}) AS median_val",
        ]

    stats = pd.read_sql(f"SELECT {', '.join(select_cols)} FROM {qt}", engine)

    # Read each value through its own column: pulling the row out first
    # (``stats.iloc[0]``) builds a single-dtype Series, which can upcast
    # integer results to float when every aggregate is numeric.
    null_count = int(stats["nulls"].iloc[0])
    distinct_count = int(stats["distincts"].iloc[0])
    distinct_ratio = distinct_count / row_count if row_count > 0 else 0

    profile = {
        "null_count": null_count,
        "distinct_count": distinct_count,
        "distinct_ratio": round(distinct_ratio, 4),
    }

    if is_numeric:
        for key, label in (
            ("min", "min_val"),
            ("max", "max_val"),
            ("mean", "mean_val"),
            ("median", "median_val"),
        ):
            profile[key] = _to_builtin(stats[label].iloc[0])

    if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
        # NOTE(review): LIMIT is not valid T-SQL; revisit with SQL Server support.
        top = pd.read_sql(
            f"SELECT {qc}, COUNT(*) AS cnt FROM {qt} "
            f"GROUP BY {qc} ORDER BY cnt DESC LIMIT 10",
            engine,
        )
        # Positional access: the result label may not equal ``col_name``, and
        # label access is ambiguous when the column itself is named "cnt".
        profile["top_values"] = list(
            zip(top.iloc[:, 0].tolist(), top.iloc[:, 1].tolist())
        )

    sample = pd.read_sql(f"SELECT {qc} FROM {qt} LIMIT 5", engine)
    profile["sample_values"] = sample.iloc[:, 0].tolist()

    return profile
def profile_table(engine: Engine, table_name: str, columns: list[dict]) -> list[dict]:
    """Profile each column of a table and render its text description.

    Returns a list of ``{"col", "profile", "text"}`` dicts, one per column
    that was profiled successfully. An empty table yields an empty list.
    A failure on one column is logged and that column is skipped, so the
    remaining columns are still processed.
    """
    total_rows = get_row_count(engine, table_name)
    if total_rows == 0:
        logger.info("skipping empty table", table=table_name)
        return []

    profiled = []
    for column in columns:
        try:
            stats = profile_column(
                engine,
                table_name,
                column["name"],
                column.get("is_numeric", False),
                total_rows,
            )
            description = build_text(table_name, total_rows, column, stats)
        except Exception as exc:
            # Best effort: record the failure and move on to the next column.
            logger.error(
                "column profiling failed",
                table=table_name,
                column=column["name"],
                error=str(exc),
            )
        else:
            profiled.append({"col": column, "profile": stats, "text": description})
    return profiled
def build_text(table_name: str, row_count: int, col: dict, profile: dict) -> str:
    """Render a column profile as the plain-text description that gets embedded.

    Args:
        table_name: Name of the table the column belongs to.
        row_count: Number of rows in that table.
        col: Column info with ``name`` and ``type``; ``is_primary_key`` and
            ``foreign_key`` are optional.
        profile: Output of ``profile_column``. ``min`` / ``max`` / ``mean`` /
            ``median`` are printed only when ``min`` is present, and
            ``top_values`` only when that key is present.

    Returns:
        A newline-separated description with no trailing newline.
    """
    if col.get("is_primary_key"):
        key_label = " [PRIMARY KEY]"
    elif col.get("foreign_key"):
        key_label = f" [FK -> {col['foreign_key']}]"
    else:
        key_label = ""

    # Collect the lines and join once instead of growing a string with +=.
    lines = [
        f"Table: {table_name} ({row_count} rows)",
        f"Column: {col['name']} ({col['type']}){key_label}",
        f"Null count: {profile['null_count']}",
        f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})",
    ]

    if "min" in profile:
        lines.append(f"Min: {profile['min']}, Max: {profile['max']}")
        lines.append(f"Mean: {profile['mean']}, Median: {profile['median']}")

    if "top_values" in profile:
        top_str = ", ".join(f"{v} ({c})" for v, c in profile["top_values"])
        lines.append(f"Top values: {top_str}")

    lines.append(f"Sample values: {profile['sample_values']}")
    return "\n".join(lines)

src/pipeline/db_pipeline/pipeline.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""End-to-end DB ingestion pipeline: introspect user's DB -> profile columns ->
+build text -> embed + store in the shared PGVector collection.
+Each column becomes one LangChainDocument with metadata tagging user_id and
+source_type='database', so it is retrievable via the existing retriever.
+"""
+import asyncio
+from typing import Optional
+from langchain_core.documents import Document as LangChainDocument
+from sqlalchemy.engine import Engine
+from src.db.postgres.vector_store import get_vector_store
+from src.middlewares.logging import get_logger
+from src.pipeline.db_pipeline.extractor import get_schema, profile_table
+logger = get_logger("db_pipeline")
def _to_document(user_id: str, table_name: str, entry: dict) -> LangChainDocument:
    """Wrap one profiled column (an entry from ``profile_table``) as a document.

    The rendered text becomes the page content. The metadata carries the
    owning ``user_id``, ``source_type="database"``, and a ``data`` dict that
    identifies the table and column.
    """
    column = entry["col"]
    text = entry["text"]
    column_details = {
        "table_name": table_name,
        "column_name": column["name"],
        "column_type": column["type"],
        "is_primary_key": column.get("is_primary_key", False),
        "foreign_key": column.get("foreign_key"),
    }
    metadata = {
        "user_id": user_id,
        "source_type": "database",
        "data": column_details,
    }
    return LangChainDocument(page_content=text, metadata=metadata)
async def run_db_pipeline(
    user_id: str,
    engine: Engine,
    exclude_tables: Optional[frozenset[str]] = None,
) -> int:
    """Profile every table of the user's database and store the descriptions.

    The synchronous steps (``get_schema`` and ``profile_table``) are pushed to
    a worker thread with ``asyncio.to_thread``; the vector-store writes are
    awaited directly. Tables are handled one at a time.

    Returns:
        Total number of chunks ingested.
    """
    vector_store = get_vector_store()
    logger.info("db pipeline start", user_id=user_id)

    schema = await asyncio.to_thread(get_schema, engine, exclude_tables)

    ingested = 0
    for table_name, columns in schema.items():
        logger.info("profiling table", table=table_name, columns=len(columns))
        entries = await asyncio.to_thread(profile_table, engine, table_name, columns)
        docs = [_to_document(user_id, table_name, entry) for entry in entries]
        if not docs:
            # Nothing was profiled for this table (empty, or every column failed).
            continue
        await vector_store.aadd_documents(docs)
        ingested += len(docs)
        logger.info("ingested chunks", table=table_name, count=len(docs))

    logger.info("db pipeline complete", user_id=user_id, total=ingested)
    return ingested

src/pipeline/document_pipeline/__init__.py ADDED Viewed

File without changes

src/pipeline/document_pipeline/document_pipeline.py ADDED Viewed

	@@ -0,0 +1,80 @@

+"""Document upload and processing pipeline."""
+from fastapi import HTTPException, UploadFile
+from sqlalchemy.ext.asyncio import AsyncSession
+from src.document.document_service import document_service
+from src.knowledge.processing_service import knowledge_processor
+from src.middlewares.logging import get_logger
+from src.storage.az_blob.az_blob import blob_storage
+logger = get_logger("document_pipeline")
+SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt"]
class DocumentPipeline:
    """Orchestrates the document upload, process, and delete flows."""

    async def _get_owned_document(self, document_id: str, user_id: str, db: AsyncSession):
        """Load a document and check that ``user_id`` owns it.

        Raises:
            HTTPException: 404 when the document does not exist, 403 when it
                belongs to a different user.
        """
        document = await document_service.get_document(db, document_id)
        if not document:
            raise HTTPException(status_code=404, detail="Document not found")
        if document.user_id != user_id:
            raise HTTPException(status_code=403, detail="Access denied")
        return document

    async def upload(self, file: UploadFile, user_id: str, db: AsyncSession) -> dict:
        """Validate → upload to blob → save to DB.

        Returns:
            ``{"id", "filename", "status"}`` of the created document record.

        Raises:
            HTTPException: 400 when the upload has no filename or its
                extension is not in ``SUPPORTED_FILE_TYPES``.
        """
        filename = file.filename
        if not filename:
            # A missing filename would otherwise fail with a TypeError below.
            raise HTTPException(status_code=400, detail="filename is required")

        # Files without an extension are treated as plain text.
        file_type = filename.rsplit(".", 1)[-1].lower() if "." in filename else "txt"
        if file_type not in SUPPORTED_FILE_TYPES:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES)}",
            )

        # Read the body only after validation so rejected uploads are not
        # loaded into memory.
        content = await file.read()
        blob_name = await blob_storage.upload_file(content, filename, user_id)
        document = await document_service.create_document(
            db=db,
            user_id=user_id,
            filename=filename,
            blob_name=blob_name,
            file_size=len(content),
            file_type=file_type,
        )
        logger.info(f"Uploaded document {document.id} for user {user_id}")
        return {"id": document.id, "filename": document.filename, "status": document.status}

    async def process(self, document_id: str, user_id: str, db: AsyncSession) -> dict:
        """Validate ownership → run the knowledge processor → record the status.

        The document status is set to ``processing`` before the run and to
        ``completed`` or ``failed`` afterwards.

        Returns:
            ``{"document_id", "chunks_processed"}``.

        Raises:
            HTTPException: 404 / 403 from the ownership check, 500 when
                processing fails.
        """
        document = await self._get_owned_document(document_id, user_id, db)
        try:
            await document_service.update_document_status(db, document_id, "processing")
            chunks_count = await knowledge_processor.process_document(document, db)
            await document_service.update_document_status(db, document_id, "completed")
            logger.info(f"Processed document {document_id}: {chunks_count} chunks")
            return {"document_id": document_id, "chunks_processed": chunks_count}
        except Exception as e:
            logger.error(f"Processing failed for document {document_id}", error=str(e))
            await document_service.update_document_status(db, document_id, "failed", str(e))
            # Chain the cause so the original traceback is kept.
            raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}") from e

    async def delete(self, document_id: str, user_id: str, db: AsyncSession) -> dict:
        """Validate ownership → delete the document via ``document_service``.

        Returns:
            ``{"document_id"}`` of the deleted document.

        Raises:
            HTTPException: 404 / 403 from the ownership check, 500 when the
                service reports that the deletion did not succeed.
        """
        await self._get_owned_document(document_id, user_id, db)
        deleted = await document_service.delete_document(db, document_id)
        if not deleted:
            # The endpoint this replaces returned 500 on a falsy result;
            # keep that contract instead of reporting success.
            raise HTTPException(status_code=500, detail="Failed to delete document")
        logger.info(f"Deleted document {document_id} for user {user_id}")
        return {"document_id": document_id}
+document_pipeline = DocumentPipeline()