Rifqi Hafizuddin committed on
Commit
6b590d9
·
1 Parent(s): 9b59334

[NOTICKET] new metadata format for cleaner code

Browse files
src/knowledge/processing_service.py CHANGED
@@ -49,10 +49,14 @@ class KnowledgeProcessingService:
49
  LangChainDocument(
50
  page_content=chunk,
51
  metadata={
52
- "document_id": db_doc.id,
53
  "user_id": db_doc.user_id,
54
- "filename": db_doc.filename,
55
- "chunk_index": i,
 
 
 
 
 
56
  }
57
  )
58
  for i, chunk in enumerate(chunks)
@@ -104,11 +108,15 @@ class KnowledgeProcessingService:
104
  documents.append(LangChainDocument(
105
  page_content=chunk,
106
  metadata={
107
- "document_id": db_doc.id,
108
  "user_id": db_doc.user_id,
109
- "filename": db_doc.filename,
110
- "chunk_index": len(documents),
111
- "page_label": page.page_number,
 
 
 
 
 
112
  }
113
  ))
114
  else:
@@ -122,11 +130,15 @@ class KnowledgeProcessingService:
122
  documents.append(LangChainDocument(
123
  page_content=chunk,
124
  metadata={
125
- "document_id": db_doc.id,
126
  "user_id": db_doc.user_id,
127
- "filename": db_doc.filename,
128
- "chunk_index": len(documents),
129
- "page_label": page_num,
 
 
 
 
 
130
  }
131
  ))
132
 
 
49
  LangChainDocument(
50
  page_content=chunk,
51
  metadata={
 
52
  "user_id": db_doc.user_id,
53
+ "source_type": "document",
54
+ "data": {
55
+ "document_id": db_doc.id,
56
+ "filename": db_doc.filename,
57
+ "file_type": db_doc.file_type,
58
+ "chunk_index": i,
59
+ },
60
  }
61
  )
62
  for i, chunk in enumerate(chunks)
 
108
  documents.append(LangChainDocument(
109
  page_content=chunk,
110
  metadata={
 
111
  "user_id": db_doc.user_id,
112
+ "source_type": "document",
113
+ "data": {
114
+ "document_id": db_doc.id,
115
+ "filename": db_doc.filename,
116
+ "file_type": db_doc.file_type,
117
+ "chunk_index": len(documents),
118
+ "page_label": page.page_number,
119
+ },
120
  }
121
  ))
122
  else:
 
130
  documents.append(LangChainDocument(
131
  page_content=chunk,
132
  metadata={
 
133
  "user_id": db_doc.user_id,
134
+ "source_type": "document",
135
+ "data": {
136
+ "document_id": db_doc.id,
137
+ "filename": db_doc.filename,
138
+ "file_type": db_doc.file_type,
139
+ "chunk_index": len(documents),
140
+ "page_label": page_num,
141
+ },
142
  }
143
  ))
144
 
src/pipeline/db_pipeline/pipeline.py CHANGED
@@ -25,11 +25,13 @@ def _to_document(user_id: str, table_name: str, entry: dict) -> LangChainDocumen
25
  metadata={
26
  "user_id": user_id,
27
  "source_type": "database",
28
- "table_name": table_name,
29
- "column_name": col["name"],
30
- "column_type": col["type"],
31
- "is_primary_key": col.get("is_primary_key", False),
32
- "foreign_key": col.get("foreign_key"),
 
 
33
  },
34
  )
35
 
 
25
  metadata={
26
  "user_id": user_id,
27
  "source_type": "database",
28
+ "data": {
29
+ "table_name": table_name,
30
+ "column_name": col["name"],
31
+ "column_type": col["type"],
32
+ "is_primary_key": col.get("is_primary_key", False),
33
+ "foreign_key": col.get("foreign_key"),
34
+ },
35
  },
36
  )
37