| """ |
| Indexing with vector database - updated for Weaviate, FAISS, Qdrant, Pinecone |
| Compatible with latest LangChain and HuggingFaceEmbeddings |
| """ |
|
|
| from pathlib import Path |
| import re |
| import os |
| from unidecode import unidecode |
|
|
| from langchain_community.document_loaders import PyPDFLoader |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain_huggingface import HuggingFaceEmbeddings |
|
|
|
|
def load_doc(list_file_path, chunk_size, chunk_overlap):
    """Load PDF files and split their pages into text chunks.

    Args:
        list_file_path: Iterable of PDF paths; each one is handed to
            ``PyPDFLoader``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` produces for the pages of all files,
        in the order the files were given.
    """
    # Build every loader up front so a bad path surfaces before any file
    # is actually read.
    pdf_loaders = [PyPDFLoader(pdf_path) for pdf_path in list_file_path]
    all_pages = [page for pdf_loader in pdf_loaders for page in pdf_loader.load()]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(all_pages)
|
|
|
|
def create_collection_name(filepath):
    """Derive a collection name from the file name in *filepath*.

    The file stem (name without extension) has spaces replaced by hyphens,
    is passed through ``unidecode``, and has every run of characters other
    than ASCII letters and digits collapsed into a single hyphen. The result
    is cut to 50 characters, padded with ``"xyz"`` when shorter than three
    characters, and forced to start and end with an alphanumeric character.

    Args:
        filepath: Path (string or path-like) of the source document.

    Returns:
        The normalised collection name. The file path and the resulting
        name are also printed to standard output.
    """
    stem = Path(filepath).stem
    transliterated = unidecode(stem.replace(" ", "-"))
    name = re.sub("[^A-Za-z0-9]+", "-", transliterated)[:50]

    # Pad short names so there are always at least three characters.
    if len(name) < 3:
        name = f"{name}xyz"

    # A leading or trailing hyphen is swapped for a fixed letter.
    if not name[0].isalnum():
        name = "A" + name[1:]
    if not name[-1].isalnum():
        name = name[:-1] + "Z"

    print("\n\nFilepath:", filepath)
    print("Collection name:", name)
    return name
|
|
|
|
_SUPPORTED_DB_TYPES = ("ChromaDB", "Weaviate", "FAISS", "Qdrant", "Pinecone")


def _weaviate_index_name(collection_name):
    """Return *collection_name* adapted to Weaviate's collection naming rules."""
    # Weaviate collection names must start with an uppercase letter and may
    # contain only letters, digits and underscores, so hyphens are replaced.
    name = re.sub(r"[^0-9A-Za-z_]", "_", collection_name)
    if not name[:1].isalpha():
        name = "C" + name
    return name[0].upper() + name[1:]


def create_db(splits, collection_name, db_type="ChromaDB"):
    """Embed *splits* and store them in the selected vector database.

    Args:
        splits: Documents to index (e.g. the output of ``load_doc``).
        collection_name: Name used for the collection / index. It is
            adapted to the backend's naming rules for Weaviate and
            lower-cased for Pinecone; FAISS uses it as the prefix of the
            on-disk index directory ``"<collection_name>_index"``.
        db_type: One of ``"ChromaDB"`` (default), ``"Weaviate"``,
            ``"FAISS"``, ``"Qdrant"`` or ``"Pinecone"``.

    Returns:
        The LangChain vector store created for the chosen backend.

    Raises:
        ValueError: If *db_type* is not one of the supported backends.

    Notes:
        Backend client libraries are imported lazily so that only the one
        actually requested needs to be installed. The Pinecone branch reads
        ``PINECONE_API_KEY`` from the environment and, when it has to create
        the index, ``PINECONE_CLOUD`` / ``PINECONE_REGION`` (defaulting to
        ``"aws"`` / ``"us-east-1"``).
    """
    # Reject unknown backends before paying for the embedding model load.
    if db_type not in _SUPPORTED_DB_TYPES:
        raise ValueError(f"Unsupported vector DB type: {db_type}")

    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )

    if db_type == "ChromaDB":
        import chromadb
        from langchain_chroma import Chroma

        # Clear Chroma's shared client cache before creating a fresh
        # in-memory client for this collection.
        chromadb.api.client.SharedSystemClient.clear_system_cache()
        return Chroma.from_documents(
            documents=splits,
            embedding=embedding,
            client=chromadb.EphemeralClient(),
            collection_name=collection_name,
        )

    if db_type == "Weaviate":
        import weaviate
        from langchain_weaviate.vectorstores import WeaviateVectorStore

        # connect_to_local takes host and port separately, not a URL.
        client = weaviate.connect_to_local(
            host="localhost",
            port=8080,
            grpc_port=50051,
        )
        return WeaviateVectorStore.from_documents(
            splits,
            embedding,
            client=client,
            index_name=_weaviate_index_name(collection_name),
            text_key="text",
        )

    if db_type == "FAISS":
        from langchain_community.vectorstores import FAISS

        vectordb = FAISS.from_documents(splits, embedding)
        # Persist the index next to the current working directory.
        vectordb.save_local(f"{collection_name}_index")
        return vectordb

    if db_type == "Qdrant":
        from langchain_community.vectorstores import Qdrant

        # The LangChain wrapper builds its own client; ":memory:" selects
        # Qdrant's in-process, non-persistent mode.
        return Qdrant.from_documents(
            splits,
            embedding,
            location=":memory:",
            collection_name=collection_name,
        )

    # Only "Pinecone" remains after the validation above.
    import pinecone
    from langchain_pinecone import PineconeVectorStore

    pinecone_api_key = os.environ.get("PINECONE_API_KEY")
    pc = pinecone.Pinecone(api_key=pinecone_api_key)

    # Pinecone index names must be lowercase alphanumerics or hyphens.
    index_name = collection_name.lower()
    existing_indexes = {index.name for index in pc.list_indexes()}
    if index_name not in existing_indexes:
        # The index dimension must match the embedding model's output size.
        dim = len(embedding.embed_query("test"))
        pc.create_index(
            name=index_name,
            dimension=dim,
            metric="cosine",
            spec=pinecone.ServerlessSpec(
                cloud=os.environ.get("PINECONE_CLOUD", "aws"),
                region=os.environ.get("PINECONE_REGION", "us-east-1"),
            ),
        )

    return PineconeVectorStore.from_documents(
        documents=splits,
        embedding=embedding,
        index_name=index_name,
    )