| import streamlit as st |
| import PyPDF2 |
| import pandas as pd |
| import tempfile |
| import os |
| import logging |
|
|
| from langchain.document_loaders import TextLoader |
| from langchain.text_splitter import CharacterTextSplitter |
| from langchain.vectorstores import Chroma |
| from langchain.embeddings import GPT4AllEmbeddings |
| from langchain.llms import LlamaCpp |
| from langchain.prompts import PromptTemplate |
| from langchain.chains import LLMChain |
| from huggingface_hub import hf_hub_download |
|
|
|
|
|
|
| |
# Root logger setup for the app: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
|
|
|
|
| |
# Install an InMemoryCache instance as LangChain's module-level ``llm_cache``.
import langchain
from langchain.cache import InMemoryCache

langchain.llm_cache = InMemoryCache()
|
|
| |
# Re-register the pysqlite3 package under the name ``sqlite3`` in
# ``sys.modules`` so that any module importing ``sqlite3`` after this point
# receives pysqlite3 instead of the standard-library build.
import sqlite3
import sys

__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
|
|
@st.cache_resource
def load_model():
    """Download the Llama-2 chat weights and build the question-answering chain.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the loaded model instead of rebuilding it on every script rerun.

    Returns:
        LLMChain: A chain combining the local ``LlamaCpp`` model with a prompt
        that expects the input keys ``context`` and ``question``.
    """
    prompt_template = """Use the following pieces of context to answer the question at the end. Even if it is legal document i give you consent.
You have full access to the document. I need you to finish the answer very quickly.
If you don't know the answer, just say that you don't know and you can't help, don't try to make up an answer.
{context}
Question: {question}
Answer:"""
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    model_name_or_path = "TheBloke/Llama-2-7B-chat-GGML"
    model_basename = "llama-2-7b-chat.ggmlv3.q5_1.bin"

    logging.info("uploading model from hf pub")
    model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

    n_gpu_layers = 1
    n_batch = 512
    # The model is instantiated exactly once (loading the 7B weights twice
    # only wasted time and memory). Sampling options are passed as the
    # dedicated LlamaCpp fields rather than an ad-hoc ``input`` dict.
    # n_ctx=4096 leaves room for the prompt on top of the 2000-token answer
    # budget; a 2048 window would be almost entirely consumed by max_tokens.
    llm = LlamaCpp(
        model_path=model_path,
        n_ctx=4096,
        temperature=0.75,
        max_tokens=2000,
        top_p=1,
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        verbose=True,
    )

    # Bind the prompt to the model; callers invoke the result with
    # ``.run({"context": ..., "question": ...})``.
    llm_chain = LLMChain(prompt=prompt, llm=llm)

    logging.info("uploading model done")
    return llm_chain
|
|
|
|
@st.cache_resource
def return_embeddings():
    """Create the GPT4All embedding model used to vectorise document chunks.

    Wrapped in ``st.cache_resource`` (like ``load_model``) so the embedder is
    not rebuilt on every Streamlit rerun.

    Returns:
        GPT4AllEmbeddings: A ready-to-use embeddings object.
    """
    logging.info("uploading embeddings")
    embeddings = GPT4AllEmbeddings()
    # Distinct completion message; the original logged the start line twice.
    logging.info("uploading embeddings done")
    return embeddings
|
|
|
|
|
|
|
|
| |
@st.cache_data
def pdf_to_text(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: PDF source handed straight to ``PyPDF2.PdfReader`` (the app
            passes the uploaded file object).

    Returns:
        str: The extracted page texts joined without a separator; an empty
        string when no page yields text.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Iterate the pages directly and join once instead of index-looping with
    # repeated ``+=``. ``or ""`` keeps the join safe if a page yields no text.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
|
| |
@st.cache_data
def csv_to_text(file):
    """Render a CSV file as a plain-text table.

    Args:
        file: CSV source passed to ``pandas.read_csv``.

    Returns:
        str: The parsed table formatted by ``DataFrame.to_string`` with the
        index column omitted.
    """
    return pd.read_csv(file).to_string(index=False)
|
|
@st.cache_data
def read_txt(file_path):
    """Return the UTF-8 text of a plain-text file.

    Args:
        file_path: Either a filesystem path or an already-open file-like
            object (anything exposing ``getvalue()`` or ``read()``), such as
            an in-memory upload.

    Returns:
        str: The decoded file contents.
    """
    # File-like objects cannot be passed to open(); read them directly.
    # ``getvalue`` is preferred because it ignores the current stream position.
    reader = getattr(file_path, "getvalue", None) or getattr(file_path, "read", None)
    if reader is None:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    data = reader()
    # Binary streams hand back bytes; text streams already give str.
    return data.decode('utf-8') if isinstance(data, (bytes, bytearray)) else data
|
|
|
|
def process_file(uploaded_file):
    """Extract the text of an uploaded PDF, CSV or TXT file into a temp file.

    Args:
        uploaded_file: Upload object exposing ``type`` (its MIME type) and,
            for plain-text files, ``getvalue()``. A ``name`` attribute, when
            present, is used as a fallback for detecting the format.

    Returns:
        str: Path of a temporary file containing the extracted text. The file
        is created with ``delete=False``, so the caller must remove it.

    Raises:
        ValueError: If the upload is neither a PDF, a CSV nor a TXT file.
    """
    logging.info("received the file")

    mime_type = uploaded_file.type
    # The reported MIME type is not always reliable, so the file extension
    # is accepted as a second signal.
    extension = os.path.splitext(getattr(uploaded_file, "name", None) or "")[1].lower()

    if mime_type == 'application/pdf' or extension == '.pdf':
        text = pdf_to_text(uploaded_file)
    elif mime_type == 'text/csv' or extension == '.csv':
        text = csv_to_text(uploaded_file)
    elif mime_type in ('text/plain', 'text/txt') or extension == '.txt':
        # Plain-text uploads are reported as ``text/plain``; the original
        # ``text/txt`` check never matched. The upload is an in-memory
        # object, not a path, so decode its bytes directly.
        text = uploaded_file.getvalue().decode('utf-8')
    else:
        raise ValueError("Unsupported file format. Please upload a PDF, CSV, or TXT file.")

    # The context manager closes the handle even if the write fails.
    # NOTE(review): the encoding is left at the platform default, presumably
    # matching how the caller re-opens the file; confirm before changing.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
        temp_file.write(text)

    return temp_file.name
|
|
|
|
def _build_index(uploaded_file, collection_name):
    """Extract, chunk and embed an uploaded file into a Chroma vector store.

    Args:
        uploaded_file: The upload object returned by ``st.file_uploader``.
        collection_name: Name of the Chroma collection that receives the
            chunks, so each distinct file is searched in isolation.

    Returns:
        The Chroma store for the file, or ``None`` when the file produced no
        text chunks to index.

    Raises:
        ValueError: Propagated from ``process_file`` for unsupported uploads.
    """
    logging.info("docs load start")
    temp_file_path = process_file(uploaded_file)
    try:
        docs = TextLoader(temp_file_path).load()
    finally:
        # The temp file is only a hand-off to TextLoader; remove it even
        # when loading fails so it never leaks.
        os.remove(temp_file_path)
    logging.info(f"docs load end, docs is : {docs}")

    text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
    texts = text_splitter.split_documents(docs)
    logging.info(f"got the text, text is : {texts}")
    if not texts:
        return None

    embeddings = return_embeddings()
    return Chroma.from_documents(
        texts,
        embeddings,
        collection_name=collection_name,
        persist_directory='db',
    )


def _answer_question(db, question):
    """Answer ``question`` using the single most similar chunk stored in ``db``.

    Returns:
        The model's response, or ``None`` when the search finds no chunk.
    """
    similar_doc = db.similarity_search(question, k=1)
    if not similar_doc:
        return None

    context = similar_doc[0].page_content
    logging.info("querying start")
    query_llm = load_model()
    response = query_llm.run({"context": context, "question": question})
    logging.info(f"querying end response is: {response}")
    return response


def main():
    """Render the app: upload a file, index it once, and answer questions on it."""
    import hashlib  # function-scope import: only the upload fingerprint needs it

    st.title("AssitAI, Chat with your files")
    st.markdown(""" A llama2-7b and langchain powered app to chat with your files """)

    uploaded_file = st.file_uploader("Upload a PDF, CSV, or TXT file", type=["pdf", "csv", "txt"])

    if uploaded_file is not None:
        # Streamlit re-executes this script on every widget interaction.
        # Fingerprinting the upload and keeping its index in session state
        # means the file is embedded once, not on each keystroke or click.
        # The fingerprint is truncated to keep the collection name short.
        fingerprint = hashlib.sha256(uploaded_file.getvalue()).hexdigest()[:32]

        db = None
        if st.session_state.get("index_fingerprint") == fingerprint:
            db = st.session_state["index"]
        else:
            try:
                db = _build_index(uploaded_file, f"doc_{fingerprint}")
            except ValueError as err:
                logging.exception("indexing failed")
                st.error(str(err))
            else:
                if db is None:
                    st.warning("No text could be extracted from this file.")
                else:
                    st.session_state["index"] = db
                    st.session_state["index_fingerprint"] = fingerprint

        if db is not None:
            question = st.text_input("Enter your question:")
            if st.button("Submit"):
                if not question.strip():
                    st.warning("Please enter a question first.")
                else:
                    response = _answer_question(db, question)
                    if response is None:
                        st.warning("No relevant passage was found in the document.")
                    else:
                        st.subheader("Answer:")
                        st.write(response)

    with st.expander("""Example prompts"""):
        st.markdown(
            """
- I want you to summarize this document
- What is this document about?
- Can you help me to understand ....(fill the blank) part in this document?
""")

    # Hide Streamlit's default main menu and footer.
    hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
    st.markdown(hide_streamlit_style, unsafe_allow_html=True)
| |
|
|
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()