import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

from htmlTemplates import css, bot_template, user_template

# Load variables from a local .env file BEFORE reading the token. Previously
# load_dotenv() ran only inside main(), i.e. after this module-level read had
# already happened, so a token stored in .env was never picked up.
load_dotenv()

# Fail fast with a KeyError if the Hugging Face token is not configured.
hub_token = os.environ["HUGGINGFACE_HUB_TOKEN"]
|
def split_pdfs(pdf_docs, pages_per_chunk=10):
    """Split one or more PDF documents into chunks of pages.

    Args:
        pdf_docs: A single PDF path/file object, or a list/tuple of them.
            Each item is anything ``PyPDF2.PdfReader`` accepts.
        pages_per_chunk: Maximum number of pages per chunk (default 10,
            matching the original hard-coded limit).

    Returns:
        A list of lists of PDF page objects, each inner list holding at most
        ``pages_per_chunk`` pages. Empty if the documents contain no pages.
    """
    # Accept both a single document and a list: the caller in main() passes a
    # single filename. The original body referenced the undefined name
    # ``pdf_doc`` (the parameter is ``pdf_docs``), raising NameError at
    # runtime.
    if not isinstance(pdf_docs, (list, tuple)):
        pdf_docs = [pdf_docs]

    pdf_chunks = []
    current = []
    for pdf_doc in pdf_docs:
        for pdf_page in PdfReader(pdf_doc).pages:
            current.append(pdf_page)
            # Start a new chunk once the current one is full. Appending the
            # completed chunk (instead of pre-seeding an empty one) avoids
            # the original's trailing empty chunk when the page count is an
            # exact multiple of the chunk size.
            if len(current) >= pages_per_chunk:
                pdf_chunks.append(current)
                current = []
    if current:
        pdf_chunks.append(current)
    return pdf_chunks
|
|
def generate_response(pdf_chunks, llm_model, question=""):
    """Answer *question* from chunked PDF pages via a simple map-reduce.

    Each chunk of pages is summarized independently with ``llm_model`` (map
    step), then a final answer is generated from the combined summaries
    (reduce step).

    Args:
        pdf_chunks: A list of lists of PDF page objects, as returned by
            ``split_pdfs``. Each page must provide ``extract_text()``.
        llm_model: A callable LLM that takes a prompt string and returns a
            string (LangChain LLMs such as ``HuggingFaceHub`` are callable
            this way).
        question: The question to answer. Defaults to ``""`` for backward
            compatibility with the original two-argument call.

    Returns:
        The model's answer as a string.
    """

    def _chunk_text(pages):
        # extract_text() may return None for image-only pages; treat as "".
        return "\n".join(page.extract_text() or "" for page in pages)

    # Map step: summarize each chunk. The original called the undefined
    # helpers get_pdf_text/get_text_chunks (NameError) and passed keyword
    # arguments (prompt=, max_new_tokens=) that LangChain's LLM.generate()
    # does not accept; invoking the LLM directly with a prompt string is the
    # supported interface.
    pdf_summaries = [
        llm_model(f"Summarize the following text:\n{_chunk_text(chunk)}")
        for chunk in pdf_chunks
    ]

    # Reduce step: answer from the combined summaries. The original prompt
    # ended at "Question:" without ever including the user's question;
    # append it here.
    combined = "\n".join(str(summary) for summary in pdf_summaries)
    response = llm_model(
        "Answer the following question using the following summaries:\n"
        f"{combined}\n\nQuestion: {question}"
    )
    return response
|
|
def main():
    """Run the Streamlit app: take a question and answer it from a PDF."""
    # Loads HUGGINGFACE_HUB_TOKEN and friends from a local .env file, if any
    # (idempotent; also done at import time so the module-level token read
    # works).
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    llm_model = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-v0.1",
        huggingfacehub_api_token=hub_token,
        verbose=True,
    )

    # Initialize session state used by the conversational UI.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")

    if user_question:
        # NOTE(review): the source PDF is hard-coded; presumably it should
        # come from an uploader widget — confirm intended behavior. Also the
        # user's question is not forwarded to generate_response here — verify
        # against that function's signature.
        pdf_chunks = split_pdfs("Geeta.pdf")
        response = generate_response(pdf_chunks, llm_model)
        st.write(response)


# Guard the entry point so importing this module does not launch the app.
if __name__ == "__main__":
    main()
|
|