| import os
|
| import logging
|
| from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| from langchain_community.document_loaders import PyPDFLoader
|
| from langchain_community.vectorstores import FAISS
|
| from langchain_openai import OpenAIEmbeddings
|
| from dotenv import load_dotenv
|
|
|
# Load variables from a local .env file (e.g. OPENAI_API_KEY) into the
# environment BEFORE the OpenAI embeddings client below is constructed.
load_dotenv()




# Module-wide logging: INFO level, logger named after this module so log
# records can be filtered by source.
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)


# Shared embeddings client used for indexing. Constructed once at import
# time; requires OPENAI_API_KEY to be present in the environment
# (loaded via load_dotenv() above) — TODO confirm key handling at deploy.
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
|
|
|
|
|
def Ingest_Data(
    pdf_path: str,
    vector_db_path: str = "vectorstore/db_faiss",
    chunk_size: int = 1000,
    chunk_overlap: int = 250,
) -> dict:

    """
    Ingest a PDF, split it into chunks, embed them, and save a FAISS store.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file to ingest.
    vector_db_path : str, optional
        Directory where the FAISS index is persisted.
    chunk_size : int, optional
        Maximum characters per chunk passed to the splitter.
    chunk_overlap : int, optional
        Character overlap between consecutive chunks.

    Returns
    -------
    dict
        Status payload for the frontend:
        - ``{"status": "success", "chunks_processed": int, "db_path": str, "message": str}``
        - ``{"status": "error", "message": str}`` when the PDF has no text
        - ``{"status": "failed", "error": str}`` on any exception
    """

    try:

        # Lazy %-style args: the string is only formatted if the record is emitted.
        logger.info("Starting ingestion for: %s", pdf_path)

        if not os.path.exists(pdf_path):

            raise FileNotFoundError(f"The file {pdf_path} was not found.")

        # One Document per PDF page.
        loader = PyPDFLoader(pdf_path)

        pages = loader.load_and_split()

        if not pages:

            # Scanned/image-only PDFs yield no extractable text.
            return {"status": "error", "message": "PDF contains no text."}

        # Re-split pages into overlapping chunks sized for embedding.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

        docs = splitter.split_documents(pages)

        logger.info("Processing %d chunks...", len(docs))

        # Embed every chunk (one API round-trip batch) and persist the index.
        db = FAISS.from_documents(docs, embeddings)

        db.save_local(vector_db_path)

        logger.info("Saved vectorstore to %s", vector_db_path)

        return {

            "status": "success",

            "chunks_processed": len(docs),

            "db_path": vector_db_path,

            "message": "File successfully ingested and indexed."

        }

    except Exception as e:

        # logger.exception records the full traceback, not just str(e).
        logger.exception("Ingestion failed: %s", e)

        return {

            "status": "failed",

            "error": str(e)

        }
|
|
|
|
|
|
|
| |