import os
import pickle
import traceback

import faiss
import langchain
from langchain import HuggingFaceHub
from langchain.cache import InMemoryCache
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatGooglePalm, ChatOpenAI
from langchain.document_loaders import (
    DirectoryLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredHTMLLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.embeddings import HuggingFaceHubEmbeddings, OpenAIEmbeddings
from langchain.llms.openai import OpenAI
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

# Cache LLM responses in memory so repeated questions skip the API call.
langchain.llm_cache = InMemoryCache()

# Model choices exposed to the caller; see set_model / set_embeddings.
models = ["GPT-3.5", "Flan UL2", "GPT-4", "Flan T5", "Palm"]

# On-disk artifact names; prefixed per embedding family by get_file_path.
pickle_file = "_vs.pkl"
updated_pickle_file = "_vs_updated.pkl"
index_file = "_vs.index"
models_folder = "models/"

# Defaults until set_model / set_embeddings swap them out.
llm = ChatOpenAI(model_name="gpt-4", temperature=0.1)
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

chat_history = []

# Sliding window over the last 10 exchanges.
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=10)

vectorstore_index = None

# Whose behalf ROAR answers on; configurable via the `name` env var.
name = os.environ.get("name", "Rohan")

system_template = """You are ROAR, {name}'s personal assistant, and you ANSWER QUESTIONS ON HIS BEHALF.
STRICTLY FOLLOW THIS: FOR OPINIONS, PREFERENCES, AND EXPERIENCES, ALWAYS ANSWER IN FIRST PERSON AS IF YOU ARE {name}. ALWAYS generate responses about experiences and opinions from {name}'s RESUME, available in the context/vectorstore.
Responses should be professional in language and tone, impressive, catchy, and grammatically correct.
Use {name}'s resume and your knowledge of his experience and skills to answer questions to the best of your ability.
Answer each question as if you are assisting {name} or answering on his behalf.
----------------
This activity of answering questions on {name}'s behalf is called a Roar.
For example: if someone wants to ask you a question, they will say "Roar it", and you will answer on {name}'s behalf using {name}'s resume and your knowledge of his experience and skills.
End with a quirky, funny line encouraging the user to try more Roars, since they are free.
----------------
{context}
"""

# Bake the name in now; {context} is left for the retrieval chain to fill.
system_template = system_template.format(name=name, context="{context}")

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)
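
# For reference, CHAT_PROMPT takes two runtime variables that the chain fills
# in: {context} (retrieved resume chunks) and {question}. A quick sanity check
# with placeholder values (hypothetical, for illustration only):
#   CHAT_PROMPT.format_messages(context="<retrieved chunks>",
#                               question="Roar it: what are your strengths?")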


def set_model_and_embeddings(model):
    global chat_history
    set_model(model)
    set_embeddings(model)
    chat_history = []


def set_model(model):
    global llm
    print("Setting model to " + str(model))
    if model == "GPT-3.5":
        print("Loading GPT-3.5")
        llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)
    elif model == "GPT-4":
        print("Loading GPT-4")
        # gpt-4 is a chat model, so it must go through the chat interface.
        llm = ChatOpenAI(model_name="gpt-4", temperature=0.1)
    elif model == "Flan UL2":
        print("Loading Flan-UL2")
        llm = HuggingFaceHub(repo_id="google/flan-ul2", model_kwargs={"temperature": 0.1, "max_new_tokens": 500})
    elif model == "Flan T5":
        print("Loading Flan T5")
        llm = HuggingFaceHub(repo_id="google/flan-t5-base", model_kwargs={"temperature": 0.1})
    elif model == "Palm":
        print("Loading Palm")
        llm = ChatGooglePalm(temperature=0)
    else:
        print("Falling back to text-davinci-002")
        llm = OpenAI(model_name="text-davinci-002", temperature=0.1)


def set_embeddings(model):
    global embeddings
    if model == "GPT-3.5" or model == "GPT-4":
        print("Loading OpenAI embeddings")
        embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
    elif model == "Flan UL2" or model == "Flan T5":
        print("Loading Hugging Face embeddings")
        embeddings = HuggingFaceHubEmbeddings(repo_id="sentence-transformers/all-MiniLM-L6-v2")
    # Palm keeps the currently active embeddings; its artifacts live under
    # their own prefix (see get_file_path).


def get_search_index(model, first_time=False):
    global vectorstore_index
    if not first_time:
        print("Using updated pickle file")
        file = updated_pickle_file
    else:
        print("Using base pickle file")
        file = pickle_file
    if os.path.isfile(get_file_path(model, file)) and os.path.isfile(
            get_file_path(model, index_file)) and os.path.getsize(get_file_path(model, file)) > 0:
        # Load the same pickle that was just checked for, base or updated.
        search_index = load_index(model, file)
    else:
        search_index = create_index(model)

    vectorstore_index = search_index
    return search_index


def load_index(model, file=pickle_file):
    with open(get_file_path(model, file), "rb") as f:
        search_index = pickle.load(f)
    print("Loaded index")
    return search_index


def create_index(model):
    sources = fetch_data_for_embeddings()
    source_chunks = split_docs(sources)
    search_index = search_index_from_docs(source_chunks)
    # Persist both the raw FAISS index and the pickled vectorstore wrapper.
    faiss.write_index(search_index.index, get_file_path(model, index_file))
    with open(get_file_path(model, pickle_file), "wb") as f:
        pickle.dump(search_index, f)
    print("Created index")
    return search_index


def get_file_path(model, file):
    # Namespace artifacts per embedding family so OpenAI, Palm, and
    # Hugging Face vectors never mix.
    if model == "GPT-3.5" or model == "GPT-4":
        return models_folder + "openai" + file
    elif model == "Palm":
        return models_folder + "palm" + file
    else:
        return models_folder + "hf" + file


def search_index_from_docs(source_chunks):
    search_index = FAISS.from_documents(source_chunks, embeddings)
    return search_index


def get_html_files():
    loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
    document_list = loader.load()
    return document_list


def fetch_data_for_embeddings():
    document_list = get_word_files()
    document_list.extend(get_html_files())
    print("document list: " + str(len(document_list)))
    return document_list


def get_word_files():
    loader = DirectoryLoader('docs', glob="**/*.docx", loader_cls=UnstructuredWordDocumentLoader, recursive=True)
    document_list = loader.load()
    return document_list


def split_docs(docs):
    # chunk_size counts characters for CharacterTextSplitter.
    splitter = CharacterTextSplitter(separator=" ", chunk_size=800, chunk_overlap=0)
    source_chunks = splitter.split_documents(docs)
    print("chunks: " + str(len(source_chunks)))
    return source_chunks


def load_documents(file_paths):
    document_list = []
    for file_path in file_paths:
        if file_path.endswith(".txt"):
            loader = TextLoader(file_path)
        elif file_path.endswith(".docx"):
            loader = UnstructuredWordDocumentLoader(file_path)
        elif file_path.endswith(".html"):
            loader = UnstructuredHTMLLoader(file_path)
        elif file_path.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            raise Exception("Unsupported file type: " + file_path)
        docs = loader.load()
        document_list.extend(docs)

    print("Loaded " + str(len(document_list)) + " documents")
    return document_list


def add_to_index(docs, index, model):
    global vectorstore_index
    index.add_documents(docs)
    with open(get_file_path(model, updated_pickle_file), "wb") as f:
        pickle.dump(index, f)
    vectorstore_index = index
    print("Vectorstore index updated")
    return True


def ingest(file_paths, model):
    print("Ingesting files")
    try:
        docs = load_documents(file_paths)
        # Index the split chunks so additions match the granularity of the
        # base index.
        docs = split_docs(docs)
        add_to_index(docs, vectorstore_index, model)
        print("Ingestion complete")
    except Exception:
        traceback.print_exc()
        return False
    return True


def get_qa_chain(vectorstore_index):
    global llm
    print(llm)
    # Only surface chunks that clear the similarity threshold.
    retriever = vectorstore_index.as_retriever(search_type="similarity_score_threshold",
                                               search_kwargs={"score_threshold": .8})
    chain = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True,
                                                  verbose=True, get_chat_history=get_chat_history,
                                                  combine_docs_chain_kwargs={"prompt": CHAT_PROMPT})
    return chain


def get_chat_history(inputs) -> str:
    res = []
    for human, ai in inputs:
        res.append(f"Human:{human}\nAI:{ai}")
    return "\n".join(res)


def generate_answer(question) -> str:
    global chat_history, vectorstore_index
    chain = get_qa_chain(vectorstore_index)

    result = chain(
        {"question": question, "chat_history": chat_history, "vectordbkwargs": {"search_distance": 0.6}})
    # Keep only the most recent exchange so the condensed follow-up question
    # stays short.
    chat_history = [(question, result["answer"])]
    sources = []
    print(result)

    for document in result['source_documents']:
        # Reduce e.g. "docs/sub/Resume.docx" to "Resume" for display.
        sources.append(document.metadata['source'].split('/')[-1].split('.')[0])
    print(sources)

    source = ',\n'.join(set(sources))
    return result['answer'] + '\nSOURCES: ' + source
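

# Minimal usage sketch (assumes OPENAI_API_KEY is set and `docs/` contains
# .docx/.html files; the question below is a placeholder):
if __name__ == "__main__":
    set_model_and_embeddings("GPT-4")
    get_search_index("GPT-4", first_time=True)
    print(generate_answer("Roar it: tell me about your experience."))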