import os
from typing import List

from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma

# OpenAI embedding (paid pipeline)
# from langchain_openai import OpenAIEmbeddings

# Free local embedding
from langchain_huggingface import HuggingFaceEmbeddings

from dotenv import load_dotenv

load_dotenv()

# NOTE(review): never commit real API tokens, even commented out — load them
# from the environment / .env instead (load_dotenv() above already does this).
# os.environ["HF_TOKEN"] = "..."

# This is where our local database will be saved on your hard drive.
DB_DIRECTORY = "./chroma_db"


def get_embeddings_model():
    """Return the active embedding model.

    Currently the free local pipeline: downloads a small, efficient
    open-source sentence-transformer model to this machine on first use.

    Returns:
        HuggingFaceEmbeddings: the embedding model used for both building
        and querying the vector store (the two must always match).
    """
    # --- FREE PIPELINE ---
    print("Loading HuggingFace Embeddings...")
    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # --- PAID PIPELINE (uncomment when you have credits) ---
    # OpenAI's embedding model converts text to 1536-dimensional vectors.
    # api_key = os.getenv("OPENAI_API_KEY")
    # return OpenAIEmbeddings(api_key=api_key, model="text-embedding-3-small")


def build_vector_store(chunks: List[Document], api_key: str):
    """Embed chunked documents and save them to a local Chroma database.

    Args:
        chunks: the chunked documents to embed and index.
        api_key: unused by the free pipeline; kept so the signature stays
            compatible when switching to the paid (OpenAI) pipeline.

    Returns:
        The populated Chroma vector store, persisted under ``DB_DIRECTORY``.

    Raises:
        ValueError: if ``chunks`` is empty (Chroma would otherwise fail
            with an opaque internal error).
    """
    if not chunks:
        raise ValueError("No chunks to index: 'chunks' is empty.")

    embeddings = get_embeddings_model()
    print(f"Embedding {len(chunks)} chunks and saving to {DB_DIRECTORY}...")

    # Create the database, embed all the chunks, and write to DB_DIRECTORY.
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=DB_DIRECTORY,
    )

    # Chroma >= 0.4 persists automatically and the manual persist() call is
    # unsupported (it raises); older versions still need the explicit call.
    # Best-effort: try it, and ignore the failure on modern versions.
    try:
        vector_store.persist()
    except Exception:
        pass

    print("Database successfully built and saved to disk!")
    return vector_store


def get_vector_store(api_key: str):
    """Open the existing on-disk database so we don't rebuild it every time.

    Args:
        api_key: unused by the free pipeline; kept for signature
            compatibility with the paid (OpenAI) pipeline.

    Returns:
        A Chroma store backed by ``DB_DIRECTORY``. NOTE(review): if the
        directory does not exist yet, Chroma returns an empty store rather
        than raising — run ``build_vector_store`` first.
    """
    embeddings = get_embeddings_model()
    return Chroma(persist_directory=DB_DIRECTORY, embedding_function=embeddings)