File size: 2,139 Bytes
b534a53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
from typing import List
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma

# OpenAI embedding
# from langchain_openai import OpenAIEmbeddings

# Free local embedding
from langchain_huggingface import HuggingFaceEmbeddings

from dotenv import load_dotenv
load_dotenv()

# # Huggingface api key...
# os.environ["HF_TOKEN"] = "hf_PWDT"

# Directory on the local disk where the Chroma database is saved
DB_DIRECTORY = "./chroma_db"

def get_embeddings_model():
    """Build and return the embedding model currently in use.

    The active pipeline is the free one: a HuggingFace embedding model
    identified by ``all-MiniLM-L6-v2``. A progress message is printed before
    the model object is constructed.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for
        ``all-MiniLM-L6-v2``.
    """
    # --- FREE PIPELINE ---
    model_name = "all-MiniLM-L6-v2"
    print("Loading HuggingFace Embeddings...")

    # --- PAID PIPELINE (alternative, currently disabled) ---
    # To switch to OpenAI embeddings, re-enable the OpenAIEmbeddings import at
    # the top of this file and return the following instead:
    #   OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"),
    #                    model="text-embedding-3-small")
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

def build_vector_store(chunks: List[Document], api_key: str):
    """Embed ``chunks`` and save them to a local Chroma database.

    Args:
        chunks: Chunked documents to embed and store.
        api_key: Not read by this function; it is accepted only so the
            signature stays stable for callers.

    Returns:
        The Chroma vector store built from ``chunks`` with
        ``DB_DIRECTORY`` as its persist directory.
    """
    # Load the embedding model first so its progress message is printed
    # before the embedding message below.
    embedding_model = get_embeddings_model()

    chunk_count = len(chunks)
    print(f"Embedding {chunk_count} chunks and saving to {DB_DIRECTORY}...")

    # A single call creates the database, embeds every chunk, and points the
    # store at DB_DIRECTORY.
    store = Chroma.from_documents(
        documents=chunks, embedding=embedding_model, persist_directory=DB_DIRECTORY
    )

    # Explicitly ask the store to write itself to disk.
    # NOTE(review): newer Chroma/LangChain releases reportedly persist
    # automatically and mark persist() as deprecated -- confirm against the
    # installed versions before removing this call.
    store.persist()

    print("Database successfully built and saved to disk!")
    return store

def get_vector_store(api_key: str):
    """Open the Chroma database stored under ``DB_DIRECTORY``.

    This reuses the database already on disk so it does not have to be
    rebuilt on every run.

    Args:
        api_key: Not read by this function; it is accepted only so the
            signature stays stable for callers.

    Returns:
        A ``Chroma`` vector store bound to ``DB_DIRECTORY`` that uses the
        active embedding model as its embedding function.
    """
    embedding_model = get_embeddings_model()
    store = Chroma(
        embedding_function=embedding_model,
        persist_directory=DB_DIRECTORY,
    )
    return store