| import os |
| from langchain_text_splitters import CharacterTextSplitter |
| from langchain_community.embeddings import HuggingFaceEmbeddings |
| from langchain.schema import HumanMessage |
| from langchain.document_loaders import UnstructuredFileLoader |
| |
| from langchain_community.vectorstores import Chroma |
| from langchain_groq import ChatGroq |
| import gradio as gr |
|
|
| |
# Persistent ChromaDB location and the single collection all uploads share.
DB_DIR = "chroma_db"
COLLECTION_NAME = "document_collection"

# HuggingFaceEmbeddings() with no arguments falls back to its default
# sentence-transformers model.
embedding_function = HuggingFaceEmbeddings()

# Fix: the original used a redundant chained assignment
# (`GROQ_API_KEY = groq_api_key = …`) that left an unused duplicate name.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
llm = ChatGroq(api_key=GROQ_API_KEY, model_name="llama-3.1-8b-instant")

# Module-level state: id (basename) of the most recently uploaded document,
# used to filter retrieval to that document. None until a first upload.
current_document_id = None
|
|
def load_and_split_document(file_path):
    """Load *file_path* via Unstructured and chunk it for embedding.

    Returns a list of Document chunks, 400 characters each with a
    50-character overlap between consecutive chunks.
    """
    docs = UnstructuredFileLoader(file_path).load()
    splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=50)
    return splitter.split_documents(docs)
|
|
def upload_and_process(file):
    """Chunk an uploaded file, tag each chunk, and index it in ChromaDB.

    Records the file's basename in the module-level `current_document_id`
    so later queries can be filtered to this document. Returns a
    human-readable status string for the Gradio textbox (errors are
    reported as strings rather than raised).
    """
    global current_document_id
    try:
        path = file.name
        doc_id = os.path.basename(path)
        current_document_id = doc_id

        chunks = load_and_split_document(path)

        # Stamp every chunk so retrieval can filter on this document.
        for piece in chunks:
            piece.metadata['document_id'] = doc_id

        store = Chroma(
            persist_directory=DB_DIR,
            embedding_function=embedding_function,
            collection_name=COLLECTION_NAME,
        )
        store.add_documents(chunks)

        return f"Document successfully processed: {doc_id}"
    except Exception as e:
        return f"Error processing document: {str(e)}"
|
|
def retrieve_and_generate_response(query):
    """Retrieve chunks from the current document and answer *query* via Groq.

    Returns the LLM's answer string, or a user-facing message when no
    document has been uploaded or no relevant content is found. Errors
    are returned as strings (not raised) to suit the Gradio textbox output.
    """
    try:
        # Fix: guard BEFORE opening the vector store — the original built
        # the Chroma client even when no document had been uploaded yet,
        # doing needless DB initialization on every such call.
        if not current_document_id:
            return "Please upload a document first."

        vector_store = Chroma(
            persist_directory=DB_DIR,
            embedding_function=embedding_function,
            collection_name=COLLECTION_NAME
        )

        # Restrict retrieval to the most recently uploaded document.
        results = vector_store.similarity_search(
            query,
            k=2,
            filter={"document_id": current_document_id}
        )

        context = "\n".join(doc.page_content for doc in results)
        if not context:
            return "No relevant content found in the current document."

        messages = [
            HumanMessage(content=f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}")
        ]
        response = llm.invoke(messages)
        return response.content
    except Exception as e:
        return f"Error generating response: {str(e)}"
|
|
| |
# ---- Gradio UI -----------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 RAG Chatbot with Groq & ChromaDB")

    # Document-upload widgets.
    file_input = gr.File(label="Upload a PDF")
    upload_button = gr.Button("Process Document")
    upload_status = gr.Textbox(label="Upload Status", interactive=False)

    # Question/answer widgets.
    query_input = gr.Textbox(label="Ask a Question")
    response_output = gr.Textbox(label="Response", interactive=False)
    chat_button = gr.Button("Get Answer")

    # Wire the buttons to the processing functions.
    upload_button.click(upload_and_process, inputs=[file_input], outputs=[upload_status])
    chat_button.click(retrieve_and_generate_response, inputs=[query_input], outputs=[response_output])

demo.launch()
|
|