Spaces:
Build error
Build error
| import gradio as gr | |
| from huggingface_hub import InferenceClient | |
| import time | |
| import random | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from qdrant_client import QdrantClient | |
| # from qdrant_client.http.models import VectorParams | |
| from langchain.vectorstores import Qdrant | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| # from transformers import pipeline | |
| from google.colab import userdata | |
| from langchain import PromptTemplate | |
| from langchain_groq import ChatGroq | |
| # from langchain_community.llms import ChatGroq | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_core.runnables import RunnablePassthrough | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain.schema import Document | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| import PyPDF2 | |
| import os | |
# Secrets pulled from Colab's userdata store; .get() returns None if a key is absent.
qdrant_url = userdata.get('QDRANT_URL')
# NOTE(review): this key uses a hyphen ('QDRANT_API-KEY') unlike the underscore
# style of its siblings — confirm it matches the exact name stored in Colab
# secrets, otherwise the Qdrant clients below receive api_key=None.
qdrant_api_key = userdata.get('QDRANT_API-KEY')
groq_api_key = userdata.get('GROQ_API_KEY')
# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_path (str): Filesystem path to the PDF file.

    Returns:
        str: Concatenated text of all pages; "" if nothing is extractable.
    """
    pdf_text = ""
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Iterate pages directly instead of indexing via range(len(...)).
        for page in reader.pages:
            # extract_text() may return None for pages with no extractable
            # text (e.g. scanned images); guard so concatenation never fails.
            pdf_text += page.extract_text() or ""
    return pdf_text
# Function to load and extract text from different document types
def load_documents_from_directory(directory_path):
    """Load .txt and .pdf files from a directory as langchain Documents.

    Args:
        directory_path (str): Directory to scan (non-recursive).

    Returns:
        list[Document]: One Document per recognized file, with the source
        filename stored under metadata["filename"]. Other file types are
        silently skipped.
    """
    documents = []
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        # Case-insensitive match so .TXT/.PDF files are not silently skipped.
        lower_name = filename.lower()
        if lower_name.endswith(".txt"):
            # Explicit encoding: Colab/Linux defaults to UTF-8, but be robust
            # to platform defaults.
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
            documents.append(
                Document(page_content=content, metadata={"filename": filename})
            )
        elif lower_name.endswith(".pdf"):
            pdf_text = extract_text_from_pdf(file_path)
            documents.append(
                Document(page_content=pdf_text, metadata={"filename": filename})
            )
    return documents
# Step 1: Load documents from a directory (handling both .txt and .pdf)
directory_path = "/content/drive/Othercomputers/My Laptop/Training/Atomcamp/DS6_Bootcamp/Projects/FYP/Rules_and_Policies"
documents = load_documents_from_directory(directory_path)
# Step 2: Split the documents into chunks (1500 chars, 250 overlap so context
# is not lost at chunk boundaries).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=250)
split_docs = text_splitter.split_documents(documents)
# Step 3: Embed the document chunks using HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Step 4: Embed and upload every chunk into the Qdrant collection.
# NOTE(review): this runs at import time and re-embeds/re-uploads the whole
# corpus on every start (the author's "Run once!" comment hints at this) —
# consider guarding it or moving it to a separate indexing script.
qdrant = Qdrant.from_documents(
    split_docs,
    embedding = embeddings,
    url = qdrant_url,
    prefer_grpc = True,
    api_key = qdrant_api_key,
    collection_name = "university-rules-chatbot"
)
def format_docs(docs):
    """Render retrieved documents as one prompt-ready string.

    Each document is rendered as its page content followed by a
    "Metadata: key: value, ..." line; documents are separated by a blank line.

    Args:
        docs: Iterable of objects exposing .page_content and .metadata.

    Returns:
        str: The joined, human-readable context block.
    """
    def render(doc):
        # Flatten the metadata dict into "key: value" pairs.
        meta = ', '.join(f"{key}: {value}" for key, value in doc.metadata.items())
        return f"{doc.page_content}\nMetadata: {meta}"

    return "\n\n".join(render(doc) for doc in docs)
def retrieve_answer(question: str, bot: str):
    """
    Retrieve the answer to a question from the documents (RAG over Qdrant + Groq).

    Args:
        question (str): The question to answer.
        bot: Second positional argument supplied by gr.ChatInterface; unused.
            NOTE(review): ChatInterface passes the chat history (a list of
            message pairs), not a str — the annotation looks wrong; confirm.

    Returns:
        str: The generated answer.
    """
    # Prompt: instructs the LLM to answer strictly from the retrieved context
    # and to append the source metadata (page/source) to its answer.
    prompt = PromptTemplate(
        template = """
# Your role
You are a brilliant expert at understanding the intent of the questioner and the crux of the question, and providing the most optimal answer
from the scraped content to the questioner's needs from the text you are given.
# Instructions
Your task is to answer the question using the following pieces of retrieved context delimited by XML tags.
<retrieved context>
Retrieved Context:
{context}
</retrieved context>
# Constraint
1. Think deeply and multiple times about the user's question\nUser's question:\n{question}\nYou must understand the intent of their question
and provide the most appropriate answer.
- Ask yourself why to understand the context of the question and why the questioner asked it, reflect on it, and provide an appropriate
response based on what you understand.
2. Choose the most relevant content(the key content that directly relates to the question) from the retrieved context and use it to generate an answer.
3. Generate a concise, logical answer. When generating the answer, Do Not just list your selections, But rearrange them in context
so that they become paragraphs with a natural flow.
4. When you don't have retrieved context for the question or If you have a retrieved documents, but their content is irrelevant to the question,
you should answer 'I can't find the answer to that question in the material I have'.
5. Use five sentences maximum. Keep the answer concise but logical/natural/in-depth.
6. At the end of the response provide metadata provided in the relevant docs,
For example:"Metadata: page: 19, source: /content/OCR_RSCA/Analyse docs JVB + mails et convention FOOT INNOVATION.pdf'. Return Just the page and source
Question: {question}
Helpful Answer, formated in markdown:""",
        input_variables = ["context","question"]
    )
    # NOTE(review): the embedding model, Qdrant client and LLM are re-created
    # on every call — consider hoisting them to module level to cut latency.
    embeddings_model = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")
    qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
    # Wrap the existing collection (populated by the indexing step above).
    qdrant = Qdrant(
        client=qdrant_client,
        collection_name="university-rules-chatbot",
        embeddings=embeddings_model
    )
    # Similarity search returning the top 20 chunks per query.
    retriever = qdrant.as_retriever(search_kwargs={"k": 20})
    # Deterministic (temperature=0) Groq-hosted Llama model.
    groq_llm = ChatGroq(
        model="llama-3.1-70b-versatile",
        temperature=0,
        groq_api_key=groq_api_key,
        max_retries=2,
    )
    # LCEL pipeline: retrieve -> format chunks -> fill prompt -> LLM -> plain string.
    rag_chain = (
        {"context": retriever| format_docs, "question": RunnablePassthrough()}
        | prompt
        | groq_llm
        | StrOutputParser()
    )
    answer = rag_chain.invoke(question)
    return answer
# Seed the chat with a welcome message. In Gradio's tuple format each entry is
# (user_message, bot_message); the original put the greeting in the *user* slot
# with the literal string "KIU-bot" as the bot reply. Using None for the user
# turn makes the greeting render as a message from the bot.
messages = [(None, "Hello! How can I help you today?")]

# Chat history widget pre-populated with the welcome message.
chatbot = gr.Chatbot(value=messages)

# Wire the RAG answer function into a chat UI and start the app.
gr.ChatInterface(
    fn=retrieve_answer,
    chatbot=chatbot,
    title="university-rules-chatbot",
    description="Ask any question related to Karakoram International University Gilgit-Baltistan.",
    examples=[["What courses does KIU offer?"]]
).launch(debug=True)