import os
import sys

from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader

from rag import Rag
|
|
# Default folder scanned for PDF files. It is a relative path, so it is
# resolved against the current working directory when the script runs.
pdf_folder_path = 'files'


def get_documents_from_path(pdf_folder_path: str = pdf_folder_path) -> list:
    """Load every PDF file in a folder into a flat list of documents.

    Each document returned by ``PyPDFLoader.load()`` is re-wrapped in a new
    ``Document`` that keeps the text but replaces the loader's metadata with
    a single ``"source"`` key holding the file name without its extension.

    Args:
        pdf_folder_path: Folder to scan. Only its direct entries are
            considered (no recursion). Defaults to the module-level
            ``pdf_folder_path`` constant.

    Returns:
        A list of ``Document`` objects, grouped by file in file-name order.
        The list is empty when the folder contains no PDF files.

    Raises:
        OSError: If the folder does not exist or cannot be listed.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting document order reproducible between runs and machines.
    for pdf_file in sorted(os.listdir(pdf_folder_path)):
        file_path = os.path.join(pdf_folder_path, pdf_file)
        # Match the extension case-insensitively so "REPORT.PDF" is picked
        # up, and skip directories that merely have a name ending in ".pdf".
        if not pdf_file.lower().endswith('.pdf') or not os.path.isfile(file_path):
            continue
        file_name_without_extension = os.path.splitext(pdf_file)[0]
        documents.extend(
            Document(
                page_content=doc.page_content,
                metadata={"source": file_name_without_extension},
            )
            for doc in PyPDFLoader(file_path).load()
        )
    return documents
|
|
if __name__ == "__main__":
    # Script entry point: load the PDFs from the default folder and hand
    # them to the Rag instance for storage in its vector store.
    try:
        rag_llm = Rag()
        documents = get_documents_from_path()
        rag_llm.storeDocumentsInVectorstore(documents)
    except Exception as e:
        # Top-level boundary: report on stderr and include the exception
        # type (str(e) alone can be empty or cryptic), then exit non-zero
        # so shells and schedulers can tell that ingestion failed.
        print(f"{type(e).__name__}: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        print("Store PDFS Completed")