Spaces:

rohan13
/

Roar

Runtime error

App Files Files Community

rohan13 committed on May 22, 2023

Commit

b8b8495

1 Parent(s): 1839d37

Roar code

Browse files

(cherry picked from commit 314d9665c9ac0eed50d9a471dffef9cb1e665e40)

Files changed (7) hide show

app.py +136 -0
assets/logo.png +0 -0
main.py +13 -0
models/openai_vs.index +0 -0
models/openai_vs.pkl +0 -0
requirements.txt +11 -0
utils.py +271 -0

app.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import gradio as gr
+from main import index, run, ingest_files
+from gtts import gTTS
+import os, time
+from transformers import pipeline
+# Speech-to-text pipeline; get_output() uses it to transcribe microphone recordings.
+p = pipeline("automatic-speech-recognition")
+"""Use text to call chat method from main.py"""
+# Model labels offered by the model radio button.
+models = ["GPT-3.5", "Flan UL2", "Flan T5"]
+# Owner name used in the greeting; read from the `name` environment variable, default "Rohan".
+name = os.environ.get("name", "Rohan")
+def add_text(history, text, model):
+    print("Question asked: " + text)
+    response = run_model(text, model)
+    history = history + [(text, response)]
+    print(history)
+    return history, ""
+def run_model(text, model):
+    start_time = time.time()
+    print("start time:" + str(start_time))
+    response = run(text, model)
+    end_time = time.time()
+    # If response contains string `SOURCES:`, then add a \n before `SOURCES`
+    if "SOURCES:" in response:
+        response = response.replace("SOURCES:", "\nSOURCES:")
+        # response = response + "\n\n" + "Time taken: " + str(end_time - start_time)
+    print(response)
+    print("Time taken: " + str(end_time - start_time))
+    return response
+def get_output(history, audio, model):
+    txt = p(audio)["text"]
+    # history.append(( (audio, ) , txt))
+    audio_path = 'response.wav'
+    response = run_model(txt, model)
+    # Remove all text from SOURCES: to the end of the string
+    trimmed_response = response.split("SOURCES:")[0]
+    myobj = gTTS(text=trimmed_response, lang='en', slow=False)
+    myobj.save(audio_path)
+    # split audio by / and keep the last element
+    # audio = audio.split("/")[-1]
+    # audio = audio + ".wav"
+    history.append(( (audio, ) , (audio_path, )))
+    print(history)
+    return history
+def set_model(history, model, first_time=False):
+    print("Model selected: " + model)
+    history = get_first_message(history)
+    index(model, first_time)
+    return history
+def get_first_message(history):
+    history = [(None,
+                "Hi! I am " + name + "'s Personal Assistant. Want " + name + " to answer your questions? Just Roar it!")]
+    return history
+def clear_audio(audio):
+    return None
+def bot(history):
+    return history
+def upload_file(files, history, model):
+    file_paths = [file.name for file in files]
+    print("Ingesting files: " + str(file_paths))
+    text = 'Uploaded a file'
+    if ingest_files(file_paths, model):
+        response = 'Files are ingested'
+    else:
+        response = 'Files are not ingested'
+    history = history + [(text, response)]
+    return history
+theme = gr.Theme.from_hub("snehilsanyal/scikit-learn")
+theme.block_background_fill = gr.themes.colors.neutral.c200
+with gr.Blocks(theme) as demo:
+    # Add image of Roar Logo from local directory
+    gr.HTML('<img src="file/assets/logo.png" style="width: 100px; height: 100px; margin: 0 auto;border:5px solid orange;border-radius: 50%; display: block">')
+    # Title on top in middle of the page
+    gr.HTML("<h1 style='text-align: center;'>Roar - A Personal Assistant</h1>")
+    chatbot = gr.Chatbot(get_first_message([]), elem_id="chatbot").style(height=500)
+    with gr.Row():
+        # Create radio button to select model
+        radio = gr.Radio(models, label="Choose a model", value="GPT-3.5", type="value", visible=False)
+    with gr.Row():
+        with gr.Column(scale=0.6):
+            txt = gr.Textbox(
+                label="Rohan Bot",
+                placeholder="Enter text and press enter, or upload a file", lines=1
+            ).style(container=False)
+        with gr.Column(scale=0.2):
+            upload = gr.UploadButton(label="Upload a file", type="file", file_count='multiple', file_types=['docx', 'txt', 'pdf', 'html']).style(container=False)
+        with gr.Column(scale=0.2):
+            audio = gr.Audio(source="microphone", type="filepath").style(container=False)
+    with gr.Row():
+        gr.Examples(examples=['What are you an expert of?'], inputs=[txt], label="Examples")
+    txt.submit(add_text, [chatbot, txt, radio], [chatbot, txt], postprocess=False).then(
+        bot, chatbot, chatbot
+    )
+    radio.change(fn=set_model, inputs=[chatbot, radio], outputs=[chatbot]).then(bot, chatbot, chatbot)
+    audio.change(fn=get_output, inputs=[chatbot, audio, radio], outputs=[chatbot, audio], show_progress=True).then(
+        bot, chatbot, chatbot, clear_audio
+    )
+    upload.upload(upload_file, inputs=[upload, chatbot, radio], outputs=[chatbot]).then(bot, chatbot, chatbot)
+    set_model(chatbot, radio.value, first_time=True)
+if __name__ == "__main__":
+    demo.queue()
+    demo.queue(concurrency_count=5)
+    demo.launch(debug=True)

assets/logo.png ADDED Viewed

main.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from utils import get_search_index, generate_answer, set_model_and_embeddings, ingest
+def index(model, first_time=False):
+    set_model_and_embeddings(model)
+    get_search_index(model, first_time=first_time)
+    return True
+def ingest_files(file_paths, model):
+    return ingest(file_paths, model)
+def run(question, model):
+    index(model)
+    return generate_answer(question)

models/openai_vs.index ADDED Viewed

Binary file (43.1 kB). View file

models/openai_vs.pkl ADDED Viewed

Binary file (49.6 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+langchain
+openai
+faiss-cpu==1.7.3
+unstructured==0.5.8
+ffmpeg-python
+transformers
+gtts
+torch
+tiktoken
+huggingface-hub
+gradio

utils.py ADDED Viewed

	@@ -0,0 +1,271 @@

+import os
+import pickle
+import langchain
+import faiss
+from langchain import HuggingFaceHub
+from langchain.chains import ConversationalRetrievalChain
+from langchain.chat_models import ChatOpenAI
+from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader, UnstructuredPDFLoader, UnstructuredWordDocumentLoader
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
+from langchain.memory import ConversationBufferWindowMemory
+from langchain.llms.openai import OpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores.faiss import FAISS
+from langchain.cache import InMemoryCache
+import traceback
+# Cache LLM responses in memory.
+langchain.llm_cache = InMemoryCache()
+# NOTE(review): a module-level `global` statement has no effect, and
+# model_name is never assigned in the visible code.
+global model_name
+# Model labels handled by set_model().
+models = ["GPT-3.5", "Flan UL2", "GPT-4", "Flan T5"]
+# File-name suffixes; get_file_path() prepends the models folder and an
+# "openai" or "hf" prefix.
+pickle_file = "_vs.pkl"
+updated_pickle_file = "_vs_updated.pkl"
+index_file = "_vs.index"
+models_folder = "models/"
+# Defaults in effect until set_model() replaces the LLM.
+llm = ChatOpenAI(model_name="gpt-4", temperature=0.1)
+embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
+# Conversation turns that generate_answer() passes to the retrieval chain.
+chat_history = []
+# NOTE(review): not referenced by any function visible in this file.
+memory = ConversationBufferWindowMemory(memory_key="chat_history", k=10)
+# Currently loaded vector store; assigned by get_search_index() and add_to_index().
+vectorstore_index = None
+# get name to be used in prompt from environment variable `name`
+name = os.environ.get("name", "Rohan")
+# System prompt for the assistant. `{name}` is filled in right below, while
+# `{context}` is kept as a placeholder for the prompt template.
+system_template = """You are ROAR, {name}'s personal assistant supposed to ANSWER QUESTIONS ON HIS BEHALF.
+STRICTLY FOLLOW THIS: FOR OPINIONS, PREFERENCES, EXPERIENCES,ALWAYS ANSWER IN FIRST PERSON AS IF YOU ARE {name}. ALWAYS Generate this response with experiences/opinions using {name}'s RESUME available in context/vectorstore.
+Response should be in professional language and tone, impressive, catchy, and grammatically correct.
+Use {name}'s resume and your knowledge of his experience and skills to answer questions to the best of your ability.
+Answer the question as if you are assisting {name} or answering on his behalf.
+----------------
+This activity of answering questions on {name}'s behalf will be called Roar.
+For example: If someone wants to ask you a question, they will say "Roar it" and you will answer the question on {name}'s behalf by generating a response using {name}'s resume and your knowledge of his experience and skills.
+Add a qwirky and funny line in the end to encourage the user to try more Roars as they are free.
+----------------
+{context}
+"""
+# append name in system template to be used in prompt; passing
+# context="{context}" re-inserts the placeholder so it survives this format call
+system_template = system_template.format(name=name, context="{context}")
+# Two-message chat prompt: the system instructions, then the user's question.
+messages = [
+    SystemMessagePromptTemplate.from_template(system_template),
+    HumanMessagePromptTemplate.from_template("{question}"),
+]
+# Passed to the retrieval chain as its document-combining prompt (see get_qa_chain).
+CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)
+def set_model_and_embeddings(model):
+    global chat_history
+    set_model(model)
+    # set_embeddings(model)
+    chat_history = []
+def set_model(model):
+    global llm
+    print("Setting model to " + str(model))
+    if model == "GPT-3.5":
+        print("Loading GPT-3.5")
+        llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)
+    elif model == "GPT-4":
+        print("Loading GPT-4")
+        llm = ChatOpenAI(model_name="gpt-4", temperature=0.1)
+    elif model == "Flan UL2":
+        print("Loading Flan-UL2")
+        llm = HuggingFaceHub(repo_id="google/flan-ul2", model_kwargs={"temperature": 0.1, "max_new_tokens":500})
+    elif model == "Flan T5":
+        print("Loading Flan T5")
+        llm = HuggingFaceHub(repo_id="google/flan-t5-base", model_kwargs={"temperature": 0.1})
+    else:
+        print("Loading GPT-3.5 from else")
+        llm = OpenAI(model_name="text-davinci-002", temperature=0.1)
+def set_embeddings(model):
+    global embeddings
+    if model == "GPT-3.5" or model == "GPT-4":
+        print("Loading OpenAI embeddings")
+        embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
+    elif model == "Flan UL2" or model == "Flan T5":
+        print("Loading Hugging Face embeddings")
+        embeddings = HuggingFaceHubEmbeddings(repo_id="sentence-transformers/all-MiniLM-L6-v2")
+def get_search_index(model, first_time=False):
+    global vectorstore_index
+    if not first_time:
+        print("Using updated pickle file")
+        file = updated_pickle_file
+    else:
+        print("Using base pickle file")
+        file = pickle_file
+    if os.path.isfile(get_file_path(model, file)) and os.path.isfile(
+            get_file_path(model, index_file)) and os.path.getsize(get_file_path(model, file)) > 0:
+        # Load index from pickle file
+        search_index = load_index(model)
+    else:
+        search_index = create_index(model)
+    vectorstore_index = search_index
+    return search_index
+def load_index(model):
+    with open(get_file_path(model, pickle_file), "rb") as f:
+        search_index = pickle.load(f)
+        print("Loaded index")
+    return search_index
+def create_index(model):
+    sources = fetch_data_for_embeddings()
+    source_chunks = split_docs(sources)
+    search_index = search_index_from_docs(source_chunks)
+    faiss.write_index(search_index.index, get_file_path(model, index_file))
+    # Save index to pickle file
+    with open(get_file_path(model, pickle_file), "wb") as f:
+        pickle.dump(search_index, f)
+        print("Created index")
+    return search_index
+def get_file_path(model, file):
+    # If model is GPT3.5 or GPT4 return models_folder + openai + file else return models_folder + hf + file
+    if model == "GPT-3.5" or model == "GPT-4":
+        return models_folder + "openai" + file
+    else:
+        return models_folder + "hf" + file
+def search_index_from_docs(source_chunks):
+    # print("source chunks: " + str(len(source_chunks)))
+    # print("embeddings: " + str(embeddings))
+    search_index = FAISS.from_documents(source_chunks, embeddings)
+    return search_index
+def get_html_files():
+    loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
+    document_list = loader.load()
+    return document_list
+def fetch_data_for_embeddings():
+    document_list = get_word_files()
+    document_list.extend(get_html_files())
+    print("document list: " + str(len(document_list)))
+    return document_list
+def get_word_files():
+    loader = DirectoryLoader('docs', glob="**/*.docx", loader_cls=UnstructuredWordDocumentLoader, recursive=True)
+    document_list = loader.load()
+    return document_list
+def split_docs(docs):
+    splitter = CharacterTextSplitter(separator=" ", chunk_size=800, chunk_overlap=0)
+    source_chunks = splitter.split_documents(docs)
+    print("chunks: " + str(len(source_chunks)))
+    return source_chunks
+def load_documents(file_paths):
+    # Check the type of file from the extension and load it accordingly
+    document_list = []
+    for file_path in file_paths:
+        if file_path.endswith(".txt"):
+            loader = TextLoader(file_path)
+        elif file_path.endswith(".docx"):
+            loader = UnstructuredWordDocumentLoader(file_path)
+        elif file_path.endswith(".html"):
+            loader = UnstructuredHTMLLoader(file_path)
+        elif file_path.endswith(".pdf"):
+            loader = UnstructuredPDFLoader(file_path)
+        else:
+            print("Unsupported file type")
+            raise Exception("Unsupported file type")
+        docs = loader.load()
+        document_list.extend(docs)
+        # print("Loaded " + file_path)
+    print("Loaded " + str(len(document_list)) + " documents")
+    return document_list
+def add_to_index(docs, index, model):
+    global vectorstore_index
+    index.add_documents(docs)
+    with open(get_file_path(model, updated_pickle_file), "wb") as f:
+        pickle.dump(index, f)
+    vectorstore_index = index
+    print("Vetorstore index updated")
+    return True
+def ingest(file_paths, model):
+    print("Ingesting files")
+    try:
+        # handle txt, docx, html, pdf
+        docs = load_documents(file_paths)
+        split_docs(docs)
+        add_to_index(docs, vectorstore_index, model)
+        print("Ingestion complete")
+    except Exception as e:
+        traceback.print_exc()
+        return False
+    return True
+def get_qa_chain(vectorstore_index):
+    global llm, model_name
+    print(llm)
+    # embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
+    # compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=gpt_3_5_index.as_retriever())
+    retriever = vectorstore_index.as_retriever(search_type="similarity_score_threshold",
+                                               search_kwargs={"score_threshold": .8})
+    chain = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True,
+                                                  verbose=True, get_chat_history=get_chat_history,
+                                                  combine_docs_chain_kwargs={"prompt": CHAT_PROMPT})
+    return chain
+def get_chat_history(inputs) -> str:
+    res = []
+    for human, ai in inputs:
+        res.append(f"Human:{human}\nAI:{ai}")
+    return "\n".join(res)
+def generate_answer(question) -> str:
+    global chat_history, vectorstore_index
+    chain = get_qa_chain(vectorstore_index)
+    result = chain(
+        {"question": question, "chat_history": chat_history, "vectordbkwargs": {"search_distance": 0.6}})
+    chat_history = [(question, result["answer"])]
+    sources = []
+    print(result)
+    for document in result['source_documents']:
+        # sources.append(document.metadata['url'])
+        sources.append(document.metadata['source'].split('/')[-1].split('.')[0])
+        print(sources)
+    source = ',\n'.join(set(sources))
+    return result['answer'] + '\nSOURCES: ' + source