Spaces:

sumitrwk
/

omnirouter-api

Sleeping

App Files Files Community

omnirouter-api / src /rag /vector_store.py

sumitrwk

Upload 33 files

b534a53 verified 8 days ago

raw

history blame contribute delete

2.14 kB

	import os
	from typing import List
	from langchain_core.documents import Document
	from langchain_community.vectorstores import Chroma

	# OpenAI embedding
	# from langchain_openai import OpenAIEmbeddings

	# Free local embedding
	from langchain_huggingface import HuggingFaceEmbeddings

	from dotenv import load_dotenv
	load_dotenv()

	# # Huggingface api key...
	# os.environ["HF_TOKEN"] = "hf_PWDT"

	# Directory on the local disk where the Chroma database is saved
	DB_DIRECTORY = "./chroma_db"

	def get_embeddings_model():
	    """Return the embedding model currently used for the vector store.

	    The active (free) pipeline builds a ``HuggingFaceEmbeddings`` instance
	    for the ``all-MiniLM-L6-v2`` model. A progress message is printed each
	    time this function is called.
	    """
	    # --- PAID PIPELINE (swap in when OpenAI credits are available) ---
	    # api_key = os.getenv("OPENAI_API_KEY")
	    # return OpenAIEmbeddings(api_key=api_key, model="text-embedding-3-small")

	    # --- FREE PIPELINE ---
	    model_name = "all-MiniLM-L6-v2"
	    print("Loading HuggingFace Embeddings...")
	    embedder = HuggingFaceEmbeddings(model_name=model_name)
	    return embedder

	def build_vector_store(chunks: List[Document], api_key: str):
	    """Embed ``chunks`` and store them in a Chroma database at ``DB_DIRECTORY``.

	    Args:
	        chunks: Chunked documents to embed and index.
	        api_key: Not read by this function; kept in the signature so
	            existing callers continue to work.

	    Returns:
	        The ``Chroma`` vector store created from ``chunks``.
	    """
	    embeddings = get_embeddings_model()

	    print(f"Embedding {len(chunks)} chunks and saving to {DB_DIRECTORY}...")

	    # Creates the collection, embeds every chunk, and writes it under
	    # DB_DIRECTORY in a single call.
	    vector_store = Chroma.from_documents(
	        documents=chunks,
	        embedding=embeddings,
	        persist_directory=DB_DIRECTORY,
	    )

	    # An explicit flush is only needed by older Chroma integrations; newer
	    # releases persist automatically and deprecate (eventually remove)
	    # ``persist()``. Call it only when present so this function keeps working
	    # across versions instead of failing with AttributeError.
	    persist = getattr(vector_store, "persist", None)
	    if callable(persist):
	        persist()

	    print("Database successfully built and saved to disk!")
	    return vector_store

	def get_vector_store(api_key: str):
	    """Open the Chroma database stored at ``DB_DIRECTORY``.

	    This lets callers reuse a previously built database instead of
	    re-embedding documents on every run.

	    Args:
	        api_key: Not read by this function; kept for interface compatibility.

	    Returns:
	        A ``Chroma`` instance bound to ``DB_DIRECTORY`` that uses the active
	        embedding model.
	    """
	    return Chroma(
	        embedding_function=get_embeddings_model(),
	        persist_directory=DB_DIRECTORY,
	    )