import os

from llama_cpp import Llama
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings


def load_embeddings():
    """Initializes and returns the sentence transformer embedding model."""
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


def initialize_vector_db(persist_directory):
    """Loads the existing Chroma database and returns a retriever object."""
    embedding_function = load_embeddings()
    db = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
    return db.as_retriever(search_type="similarity", search_kwargs={"k": 3})


def load_llm_model(model_path):
    """Initializes and returns the Llama LLM object."""
    return Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=-1,
    )


def get_rag_response(query, llm, retriever):
    """Encapsulates retrieval and generation logic to provide a grounded response."""
    # 1. Retrieve relevant context
    relevant_docs = retriever.get_relevant_documents(query)
    context = ". ".join([doc.page_content for doc in relevant_docs])

    # 2. Define prompt templates
    system_message = """[INST] You are a helpful medical assistant that answers questions based on the
provided context from the Merck Manual of Diagnosis and Therapy. Your responses should be accurate,
well-structured, and based strictly on the provided context. [/INST]"""

    user_message = f"""Context: {context}

Question: {query}

Please provide a detailed and accurate answer based on the context above. [/INST]"""

    full_prompt = f"{system_message}\n{user_message}"

    # 3. Generate response
    output = llm(
        prompt=full_prompt,
        max_tokens=512,
        temperature=0,
        top_p=0.95,
        top_k=50,
    )
    return output['choices'][0]['text'].strip()
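

# Minimal usage sketch wiring the helpers above together: load the retriever from an
# existing Chroma store, load a GGUF model with llama.cpp, and answer one question.
# Note: the persist directory, model filename, and sample question below are
# illustrative placeholders, not values from the original project.
if __name__ == "__main__":
    retriever = initialize_vector_db(persist_directory="chroma_db")  # hypothetical path
    llm = load_llm_model(model_path="models/mistral-7b-instruct.Q4_K_M.gguf")  # hypothetical model file

    question = "What are the first-line treatments for hypertension?"  # example query
    answer = get_rag_response(question, llm=llm, retriever=retriever)
    print(answer)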