Commit ·
31920c3
1
Parent(s): 425e021
[NOTICKET]: add CSV and XLSX file types
Browse files
src/knowledge/processing_service.py
CHANGED
|
@@ -13,6 +13,7 @@ from azure.core.credentials import AzureKeyCredential
|
|
| 13 |
from typing import List
|
| 14 |
import pypdf
|
| 15 |
import docx
|
|
|
|
| 16 |
from io import BytesIO
|
| 17 |
|
| 18 |
logger = get_logger("knowledge_processing")
|
|
@@ -40,6 +41,10 @@ class KnowledgeProcessingService:
|
|
| 40 |
|
| 41 |
if db_doc.file_type == "pdf":
|
| 42 |
documents = await self._build_pdf_documents(content, db_doc)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
else:
|
| 44 |
text = self._extract_text(content, db_doc.file_type)
|
| 45 |
if not text.strip():
|
|
@@ -144,6 +149,67 @@ class KnowledgeProcessingService:
|
|
| 144 |
|
| 145 |
return documents
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
def _extract_text(self, content: bytes, file_type: str) -> str:
|
| 148 |
"""Extract text from DOCX or TXT content."""
|
| 149 |
if file_type == "docx":
|
|
|
|
| 13 |
from typing import List
|
| 14 |
import pypdf
|
| 15 |
import docx
|
| 16 |
+
import pandas as pd
|
| 17 |
from io import BytesIO
|
| 18 |
|
| 19 |
logger = get_logger("knowledge_processing")
|
|
|
|
| 41 |
|
| 42 |
if db_doc.file_type == "pdf":
|
| 43 |
documents = await self._build_pdf_documents(content, db_doc)
|
| 44 |
+
elif db_doc.file_type == "csv":
|
| 45 |
+
documents = self._build_csv_documents(content, db_doc)
|
| 46 |
+
elif db_doc.file_type == "xlsx":
|
| 47 |
+
documents = self._build_excel_documents(content, db_doc)
|
| 48 |
else:
|
| 49 |
text = self._extract_text(content, db_doc.file_type)
|
| 50 |
if not text.strip():
|
|
|
|
| 149 |
|
| 150 |
return documents
|
| 151 |
|
| 152 |
+
def _profile_dataframe(
    self, df: pd.DataFrame, source_name: str, db_doc: DBDocument
) -> List[LangChainDocument]:
    """Build one profiling chunk per dataframe column.

    Each chunk summarises a single column — dtype, null/distinct counts,
    numeric stats when applicable, frequent values for low-cardinality
    columns, and a few sample values — so retrieval can target individual
    columns of a tabular source.
    """
    total_rows = len(df)
    chunks: List[LangChainDocument] = []

    for column in df.columns:
        series = df[column]
        nulls = int(series.isnull().sum())
        uniques = int(series.nunique())
        # Guard against division by zero on an empty dataframe.
        unique_ratio = uniques / total_rows if total_rows > 0 else 0

        lines = [
            f"Source: {source_name} ({total_rows} rows)",
            f"Column: {column} ({series.dtype})",
            f"Null count: {nulls}",
            f"Distinct count: {uniques} ({unique_ratio:.1%})",
        ]

        if pd.api.types.is_numeric_dtype(series):
            lines.append(f"Min: {series.min()}, Max: {series.max()}")
            lines.append(f"Mean: {series.mean():.4f}, Median: {series.median():.4f}")

        # Low-cardinality columns (<= 5% distinct) get a top-10 frequency table.
        if 0 < unique_ratio <= 0.05:
            frequent = series.value_counts().head(10)
            summary = ", ".join(f"{v} ({c})" for v, c in frequent.items())
            lines.append(f"Top values: {summary}")

        lines.append(f"Sample values: {series.dropna().head(5).tolist()}")

        chunks.append(LangChainDocument(
            page_content="\n".join(lines),
            metadata={
                "user_id": db_doc.user_id,
                "source_type": "document",
                "data": {
                    "document_id": db_doc.id,
                    "filename": db_doc.filename,
                    "file_type": db_doc.file_type,
                    "source": source_name,
                    "column_name": column,
                    "column_type": str(series.dtype),
                },
            },
        ))
    return chunks
|
| 198 |
+
|
| 199 |
+
def _build_csv_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
    """Parse raw CSV bytes and emit one profiling chunk per column."""
    # The filename itself serves as the source label for a single-table file.
    return self._profile_dataframe(pd.read_csv(BytesIO(content)), db_doc.filename, db_doc)
|
| 203 |
+
|
| 204 |
+
def _build_excel_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
    """Profile every sheet of an XLSX workbook, one chunk per column.

    ``sheet_name=None`` makes pandas return a mapping of sheet name to
    dataframe, so each sheet is profiled under a "filename / sheet: name"
    source label.
    """
    workbook = pd.read_excel(BytesIO(content), sheet_name=None)
    return [
        chunk
        for name, frame in workbook.items()
        for chunk in self._profile_dataframe(
            frame, f"{db_doc.filename} / sheet: {name}", db_doc
        )
    ]
|
| 212 |
+
|
| 213 |
def _extract_text(self, content: bytes, file_type: str) -> str:
|
| 214 |
"""Extract text from DOCX or TXT content."""
|
| 215 |
if file_type == "docx":
|
src/pipeline/document_pipeline/document_pipeline.py
CHANGED
|
@@ -10,7 +10,7 @@ from src.storage.az_blob.az_blob import blob_storage
|
|
| 10 |
|
| 11 |
logger = get_logger("document_pipeline")
|
| 12 |
|
| 13 |
-
SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt"]
|
| 14 |
|
| 15 |
|
| 16 |
class DocumentPipeline:
|
|
|
|
| 10 |
|
| 11 |
logger = get_logger("document_pipeline")
|
| 12 |
|
| 13 |
+
# File types the document pipeline accepts for ingestion.
SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt", "csv", "xlsx"]
|
| 14 |
|
| 15 |
|
| 16 |
class DocumentPipeline:
|