| from dotenv import load_dotenv, find_dotenv |
| import pandas as pd |
| import os |
| import chromadb |
| from chromadb.utils import embedding_functions |
|
|
|
|
def _normalise_column_name(name: str) -> str:
    """Convert a dataset column header into a snake_case attribute name.

    Spaces become underscores, the text is lower-cased and "/" is spelled out
    as "_or_" (e.g. "Position/Role" -> "position_or_role"), so the result can
    be used as an attribute on the rows yielded by ``DataFrame.itertuples``.
    """
    return name.replace(" ", "_").lower().replace("/", "_or_")


def generate_qa_vector_db(vdb_path: str, df: pd.DataFrame) -> None:
    """Build the question and answer collections in a persistent ChromaDB instance.

    Two collections are created, both with ``hnsw:space`` set to ``cosine`` and
    both embedded with the HuggingFace ``sentence-transformers/all-MiniLM-L6-v2``
    embedding function:

    1. question_collection - one document per distinct
       (Position/Role, Question, Interview Phase) row. Metadata keys are
       'position' and 'interview_phase'. IDs are "q_id" + the row's index label.
    2. answer_collection - one document per row of ``df`` (no de-duplication).
       Metadata key is 'answer_quality'. IDs are "a_id" + the row's index label.

    Args:
        vdb_path (str): Relative path of the location of the ChromaDB instance.
        df (pd.DataFrame): Question/answer dataset. Must contain the columns
            "Position/Role", "Question", "Interview Phase", "Answer" and
            "Answer Quality".

    Raises:
        KeyError: If HUGGINGFACEHUB_API_TOKEN is not set in the environment
            (after loading any .env file), or if a required column is missing
            from ``df``.

    NOTE(review): ``create_collection`` is used rather than a get-or-create
    call, so this presumably fails when a collection of the same name already
    exists at ``vdb_path`` - confirm against the installed ChromaDB version.
    """
    # Load variables from a .env file (if one is found) before reading the API
    # token, mirroring what get_collection_from_vector_db does.
    load_dotenv(find_dotenv())
    chroma_client = chromadb.PersistentClient(path=vdb_path)

    huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
        api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

    print("q_collection will be added")
    q_collection = chroma_client.create_collection(
        name="question_collection",
        metadata={"hnsw:space": "cosine"},
        embedding_function=huggingface_ef,
    )

    # The same question can appear once per answer in the dataset; keep a single
    # row per distinct (role, question, phase) combination.
    df_questions = (
        df[["Position/Role", "Question", "Interview Phase"]]
        .drop_duplicates()
        .rename(columns=_normalise_column_name)
    )

    # Single pass over the rows so documents, metadata and IDs stay aligned.
    q_documents = []
    q_metadata = []
    q_ids = []
    for row in df_questions.itertuples():
        q_documents.append(row.question)
        q_metadata.append(
            {"position": row.position_or_role, "interview_phase": row.interview_phase}
        )
        q_ids.append("q_id" + str(row.Index))

    q_collection.add(documents=q_documents, metadatas=q_metadata, ids=q_ids)
    print("q_collection added")

    print("a_collection will be added")
    a_collection = chroma_client.create_collection(
        name="answer_collection",
        metadata={"hnsw:space": "cosine"},
        embedding_function=huggingface_ef,
    )

    # Answers are not de-duplicated: every dataset row contributes one document.
    df_answers = df[["Answer", "Answer Quality"]].rename(
        columns=_normalise_column_name
    )

    a_documents = []
    a_metadata = []
    a_ids = []
    for row in df_answers.itertuples():
        a_documents.append(row.answer)
        a_metadata.append({"answer_quality": row.answer_quality})
        a_ids.append("a_id" + str(row.Index))

    a_collection.add(documents=a_documents, ids=a_ids, metadatas=a_metadata)
    print("a_collection added")
|
|
|
|
def delete_collection_from_vector_db(vdb_path: str, collection_name: str) -> None:
    """Remove one named collection from the persistent ChromaDB instance.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
        collection_name (str): Name of the collection to be deleted.
    """
    # Open the on-disk instance and drop the collection in one step.
    chromadb.PersistentClient(path=vdb_path).delete_collection(collection_name)
|
|
|
|
def list_collections_from_vector_db(vdb_path: str) -> None:
    """Print every collection present in the persistent ChromaDB instance.

    The listing is written to stdout; nothing is returned.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
    """
    client = chromadb.PersistentClient(path=vdb_path)
    available_collections = client.list_collections()
    print(available_collections)
|
|
|
|
def get_collection_from_vector_db(
    vdb_path: str, collection_name: str
) -> chromadb.Collection:
    """Look up a collection in the persistent ChromaDB instance and return it.

    The collection is opened with the HuggingFace
    ``sentence-transformers/all-MiniLM-L6-v2`` embedding function, using the
    API token from the HUGGINGFACEHUB_API_TOKEN environment variable.

    Args:
        vdb_path (str): Path of the persistent ChromaDB instance.
        collection_name (str): Name of the collection which needs to be retrieved.

    Returns:
        chromadb.Collection: The object returned by the client's
        ``get_collection`` call for ``collection_name``.

    Raises:
        KeyError: If HUGGINGFACEHUB_API_TOKEN is not set in the environment
            after loading any .env file.
    """
    # Pull variables from a .env file (if one is found) into the environment
    # before the API token is read below.
    load_dotenv(find_dotenv())
    client = chromadb.PersistentClient(path=vdb_path)

    embedder = embedding_functions.HuggingFaceEmbeddingFunction(
        api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

    return client.get_collection(name=collection_name, embedding_function=embedder)
|
|