# Spaces: Running
# File size: 2,337 Bytes · f83e60c
import os
import glob
from langchain_community.document_loaders import (
DirectoryLoader,
PyPDFLoader,
Docx2txtLoader,
TextLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Folder that main() scans for input files.
RAW_DOCS_DIR = "raw_documents"

# Folder handed to Chroma as its persist_directory.
CHROMA_DB_DIR = "vectorstore"

# Passed to RecursiveCharacterTextSplitter as chunk_size / chunk_overlap.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 64

# Model name handed to HuggingFaceEmbeddings.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
def main():
    """Ingest local documents into a persisted Chroma vector store.

    Steps, in order:

    1. Load every ``.txt``, ``.pdf`` and ``.docx`` file matched under
       ``RAW_DOCS_DIR`` (one ``DirectoryLoader`` per file type).
    2. Split the loaded documents with ``RecursiveCharacterTextSplitter``
       using ``CHUNK_SIZE`` / ``CHUNK_OVERLAP``.
    3. Embed the chunks with ``EMBEDDING_MODEL`` and store them in Chroma
       under ``CHROMA_DB_DIR``.

    Progress and errors are reported with ``print``. A loader that fails is
    reported and skipped so the remaining file types are still ingested.

    Returns:
        None. The function returns early, without touching the vector
        store, when no documents were loaded or when splitting produced no
        chunks.
    """
    print(f"Loading documents from {RAW_DOCS_DIR}...")

    text_loader_kwargs = {'autodetect_encoding': True}
    loaders = [
        DirectoryLoader(
            RAW_DOCS_DIR,
            glob="**/*.txt",
            loader_cls=TextLoader,
            loader_kwargs=text_loader_kwargs,
        ),
        DirectoryLoader(RAW_DOCS_DIR, glob="**/*.pdf", loader_cls=PyPDFLoader),
        DirectoryLoader(RAW_DOCS_DIR, glob="**/*.docx", loader_cls=Docx2txtLoader),
    ]

    docs = []
    for loader in loaders:
        # Best-effort by design: one failing file type must not abort the
        # whole ingestion, so the error is reported and the loop continues.
        try:
            loaded_docs = loader.load()
        except Exception as e:
            print(f"Error loading with {loader.loader_cls.__name__}: {e}")
            continue
        if loaded_docs:
            print(f"Loaded {len(loaded_docs)} documents using {loader.loader_cls.__name__}")
            docs.extend(loaded_docs)

    if not docs:
        # Use the configured directory so the hint stays correct if
        # RAW_DOCS_DIR is ever changed.
        print(
            "No documents found. Please add some .txt, .pdf, or .docx files "
            f"to the {RAW_DOCS_DIR} directory."
        )
        return

    print(f"Total documents loaded: {len(docs)}")
    print(f"Splitting documents with chunk size {CHUNK_SIZE} and overlap {CHUNK_OVERLAP}...")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True,
    )
    splits = text_splitter.split_documents(docs)
    print(f"Generated {len(splits)} chunks.")

    if not splits:
        # Documents were found but none yielded any chunk. Stop here rather
        # than initialising the embedding model and handing the store an
        # empty batch, and do not report a successful ingestion.
        print("No text chunks were produced from the loaded documents. Nothing to store.")
        return

    print(f"Initializing embedding model '{EMBEDDING_MODEL}'...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    print(f"Storing embeddings in ChromaDB at {CHROMA_DB_DIR}...")
    # from_documents embeds the chunks and stores them under
    # persist_directory; the returned store object is not needed here.
    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DB_DIR,
    )

    print("Ingestion complete. Vector store persisted locally.")
# Run the ingestion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|