Rifqi Hafizuddin committed on
Commit ·
6b590d9
1
Parent(s): 9b59334
[NOTICKET] new metadata format for cleaner code
Browse files
src/knowledge/processing_service.py
CHANGED
|
@@ -49,10 +49,14 @@ class KnowledgeProcessingService:
|
|
| 49 |
LangChainDocument(
|
| 50 |
page_content=chunk,
|
| 51 |
metadata={
|
| 52 |
-
"document_id": db_doc.id,
|
| 53 |
"user_id": db_doc.user_id,
|
| 54 |
-
"
|
| 55 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
}
|
| 57 |
)
|
| 58 |
for i, chunk in enumerate(chunks)
|
|
@@ -104,11 +108,15 @@ class KnowledgeProcessingService:
|
|
| 104 |
documents.append(LangChainDocument(
|
| 105 |
page_content=chunk,
|
| 106 |
metadata={
|
| 107 |
-
"document_id": db_doc.id,
|
| 108 |
"user_id": db_doc.user_id,
|
| 109 |
-
"
|
| 110 |
-
"
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
}
|
| 113 |
))
|
| 114 |
else:
|
|
@@ -122,11 +130,15 @@ class KnowledgeProcessingService:
|
|
| 122 |
documents.append(LangChainDocument(
|
| 123 |
page_content=chunk,
|
| 124 |
metadata={
|
| 125 |
-
"document_id": db_doc.id,
|
| 126 |
"user_id": db_doc.user_id,
|
| 127 |
-
"
|
| 128 |
-
"
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
}
|
| 131 |
))
|
| 132 |
|
|
|
|
| 49 |
LangChainDocument(
|
| 50 |
page_content=chunk,
|
| 51 |
metadata={
|
|
|
|
| 52 |
"user_id": db_doc.user_id,
|
| 53 |
+
"source_type": "document",
|
| 54 |
+
"data": {
|
| 55 |
+
"document_id": db_doc.id,
|
| 56 |
+
"filename": db_doc.filename,
|
| 57 |
+
"file_type": db_doc.file_type,
|
| 58 |
+
"chunk_index": i,
|
| 59 |
+
},
|
| 60 |
}
|
| 61 |
)
|
| 62 |
for i, chunk in enumerate(chunks)
|
|
|
|
| 108 |
documents.append(LangChainDocument(
|
| 109 |
page_content=chunk,
|
| 110 |
metadata={
|
|
|
|
| 111 |
"user_id": db_doc.user_id,
|
| 112 |
+
"source_type": "document",
|
| 113 |
+
"data": {
|
| 114 |
+
"document_id": db_doc.id,
|
| 115 |
+
"filename": db_doc.filename,
|
| 116 |
+
"file_type": db_doc.file_type,
|
| 117 |
+
"chunk_index": len(documents),
|
| 118 |
+
"page_label": page.page_number,
|
| 119 |
+
},
|
| 120 |
}
|
| 121 |
))
|
| 122 |
else:
|
|
|
|
| 130 |
documents.append(LangChainDocument(
|
| 131 |
page_content=chunk,
|
| 132 |
metadata={
|
|
|
|
| 133 |
"user_id": db_doc.user_id,
|
| 134 |
+
"source_type": "document",
|
| 135 |
+
"data": {
|
| 136 |
+
"document_id": db_doc.id,
|
| 137 |
+
"filename": db_doc.filename,
|
| 138 |
+
"file_type": db_doc.file_type,
|
| 139 |
+
"chunk_index": len(documents),
|
| 140 |
+
"page_label": page_num,
|
| 141 |
+
},
|
| 142 |
}
|
| 143 |
))
|
| 144 |
|
src/pipeline/db_pipeline/pipeline.py
CHANGED
|
@@ -25,11 +25,13 @@ def _to_document(user_id: str, table_name: str, entry: dict) -> LangChainDocumen
|
|
| 25 |
metadata={
|
| 26 |
"user_id": user_id,
|
| 27 |
"source_type": "database",
|
| 28 |
-
"
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
| 33 |
},
|
| 34 |
)
|
| 35 |
|
|
|
|
| 25 |
metadata={
|
| 26 |
"user_id": user_id,
|
| 27 |
"source_type": "database",
|
| 28 |
+
"data": {
|
| 29 |
+
"table_name": table_name,
|
| 30 |
+
"column_name": col["name"],
|
| 31 |
+
"column_type": col["type"],
|
| 32 |
+
"is_primary_key": col.get("is_primary_key", False),
|
| 33 |
+
"foreign_key": col.get("foreign_key"),
|
| 34 |
+
},
|
| 35 |
},
|
| 36 |
)
|
| 37 |
|