import os

from llama_cpp import Llama
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings


def load_embeddings():
    """Initializes and returns the sentence transformer embedding model."""
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


def initialize_vector_db(persist_directory):
    """Loads the existing Chroma database and returns a retriever object."""
    embedding_function = load_embeddings()
    db = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
    return db.as_retriever(search_type="similarity", search_kwargs={"k": 3})


def load_llm_model(model_path):
    """Initializes and returns the Llama LLM object."""
    return Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=-1,
    )


def get_rag_response(query, llm, retriever):
    """Encapsulates retrieval and generation logic to provide a grounded response."""
    # 1. Retrieve relevant context
    relevant_docs = retriever.get_relevant_documents(query)
    context = ". ".join([doc.page_content for doc in relevant_docs])

    # 2. Define prompt templates
    system_message = """[INST] You are a helpful medical assistant that answers questions based on the
provided context from the Merck Manual of Diagnosis and Therapy. Your responses should be accurate,
well-structured, and based strictly on the provided context. [/INST]"""

    user_message = f"""Context: {context}

Question: {query}

Please provide a detailed and accurate answer based on the context above. [/INST]"""

    full_prompt = f"{system_message}\n{user_message}"

    # 3. Generate response
    output = llm(
        prompt=full_prompt,
        max_tokens=512,
        temperature=0,
        top_p=0.95,
        top_k=50,
    )
    return output['choices'][0]['text'].strip()
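

# Minimal usage sketch wiring the helpers above together: load the retriever from an
# existing Chroma store, load a GGUF model with llama.cpp, and answer one question.
# Note: the persist directory, model filename, and sample question below are
# illustrative placeholders, not values from the original project.
if __name__ == "__main__":
    retriever = initialize_vector_db(persist_directory="chroma_db")  # hypothetical path
    llm = load_llm_model(model_path="models/mistral-7b-instruct.Q4_K_M.gguf")  # hypothetical model file

    question = "What are the first-line treatments for hypertension?"  # example query
    answer = get_rag_response(question, llm=llm, retriever=retriever)
    print(answer)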