Update emb.py
Browse files
emb.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
|
| 3 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
from langchain.embeddings import SentenceTransformerEmbeddings
|
| 5 |
from langchain.vectorstores import Chroma
|
| 6 |
import configparser
|
|
@@ -131,25 +131,25 @@ class EmbeddingsManager:
|
|
| 131 |
|
| 132 |
|
| 133 |
# Adds documents to an existing vector store: every PDF found under "docs" is uploaded.
|
| 134 |
-
def generate_vector_store(self, index):
    """Upload every PDF found under ``docs`` to a Pinecone index.

    Walks the ``docs`` directory tree and, for each file whose name ends in
    ``.pdf``: loads it with ``PDFMinerLoader``, splits it into chunks using
    ``self.text_split_size`` and ``self.text_overlap``, sends the chunks to
    Pinecone using ``self.embeddings_model``, and then deletes the file
    from disk.

    Args:
        index: Passed to ``Pinecone.from_documents`` as ``index_name``.

    Returns:
        The string ``"Ok"``, including when no PDF was found.

    NOTE(review): ``Pinecone`` is not among the imports visible at the top
    of the file (only ``Chroma`` is) -- confirm it is imported elsewhere.
    """
    suffix = ".pdf"

    # Snapshot the matching paths up front so that deleting files below can
    # never interact with the directory walk.
    pdf_paths = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk("docs")
        for name in names
        if name.endswith(suffix)
    ]
    if not pdf_paths:
        return "Ok"

    # One splitter serves every file; its settings do not vary per PDF.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=self.text_split_size,
        chunk_overlap=self.text_overlap,
    )

    for pdf_path in pdf_paths:
        # Strip only the trailing extension; str.replace would also remove
        # any ".pdf" occurring in the middle of the file name.
        print("Uploading " + os.path.basename(pdf_path)[: -len(suffix)])

        texts = text_splitter.split_documents(PDFMinerLoader(pdf_path).load())
        Pinecone.from_documents(
            texts, embedding=self.embeddings_model, index_name=index
        )

        # Reached only when the upload above did not raise, so a failed
        # upload leaves the source PDF in place.
        os.remove(pdf_path)

    return "Ok"
|
| 151 |
-
|
| 152 |
-
|
| 153 |
# Example Usage:
|
| 154 |
if __name__ == "__main__":
|
| 155 |
|
|
|
|
| 1 |
import os
|
| 2 |
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
|
| 3 |
+
#from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
from langchain.embeddings import SentenceTransformerEmbeddings
|
| 5 |
from langchain.vectorstores import Chroma
|
| 6 |
import configparser
|
|
|
|
| 131 |
|
| 132 |
|
| 133 |
# This function (currently commented out below) was used to add documents to an existing vector store
|
| 134 |
+
# def generate_vector_store(self, index):
|
| 135 |
+
# """Adds a document to the vector store on Pinecone."""
|
| 136 |
+
#
|
| 137 |
+
# documents = []
|
| 138 |
+
# for root, dirs, files in os.walk("docs"):
|
| 139 |
+
# for file in files:
|
| 140 |
+
# if file.endswith(".pdf"):
|
| 141 |
+
# print("Uploading "+file.replace(".pdf",""))
|
| 142 |
+
# documents.clear()
|
| 143 |
+
# loader = PDFMinerLoader(os.path.join(root, file))
|
| 144 |
+
# documents.extend(loader.load())
|
| 145 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.text_split_size, chunk_overlap=self.text_overlap)
|
| 146 |
+
# texts = text_splitter.split_documents(documents)
|
| 147 |
+
# docsearch = Pinecone.from_documents(texts, embedding=self.embeddings_model, index_name=index)
|
| 148 |
+
# os.remove(os.path.join(root, file))
|
| 149 |
+
#
|
| 150 |
+
# return "Ok"
|
| 151 |
+
#
|
| 152 |
+
#
|
| 153 |
# Example Usage:
|
| 154 |
if __name__ == "__main__":
|
| 155 |
|