Spaces:

KeshavRa
/

test2

Sleeping

App Files Files Community

test2 / app.py

KeshavRa

Update app.py

aff4b58 verified over 1 year ago

raw

history blame contribute delete

7.46 kB

	import os
	import streamlit as st
	from datasets import load_dataset
	import chromadb
	import string

	from openai import OpenAI

	import numpy as np
	import pandas as pd

	from scipy.spatial.distance import cosine

	from typing import Dict, List

	def merge_dataframes(dataframes):
	    '''
	    Stacks several question/answer dataframes into one with a fixed layout.
	    Args:
	        dataframes: An iterable of pandas DataFrames to stack row-wise.
	    Returns:
	        A DataFrame with a fresh 0..n-1 index that holds only the columns
	        "context", "questions" and "answers", in that order. An empty
	        input yields an empty DataFrame with those three columns.
	    Raises:
	        KeyError: If any of the three required columns is missing from the
	            concatenated result.
	    '''
	    required_columns = ["context", "questions", "answers"]

	    # Materialise first so generators work and emptiness can be checked.
	    frames = list(dataframes)
	    if not frames:
	        # pd.concat raises ValueError on an empty sequence; hand back an
	        # empty frame with the expected schema instead.
	        return pd.DataFrame(columns=required_columns)

	    # Concatenate, renumbering the rows so indexes from the inputs don't clash.
	    combined_dataframe = pd.concat(frames, ignore_index=True)

	    # Drop every column other than the three the app works with.
	    return combined_dataframe[required_columns]

	def call_chatgpt(
	    prompt: str, directions: str, model: str = "gpt-3.5-turbo-0125"
	) -> str:
	    '''
	    Uses the OpenAI API to generate an AI response to a prompt.
	    Args:
	        prompt: The user message to send to the chat model.
	        directions: The system message that steers how the model answers.
	        model: Name of the OpenAI chat model to call. Defaults to
	            "gpt-3.5-turbo-0125".
	    Returns:
	        The text of the first choice in the completion, or an empty string
	        if that choice carries no text content.
	    Raises:
	        KeyError: If the OPENAI_API_KEY environment variable is not set.
	    '''

	    # The API key is read from the environment on every call.
	    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

	    completion = client.chat.completions.create(
	        model=model,
	        messages=[
	            {"role": "system", "content": directions},
	            {"role": "user", "content": prompt},
	        ],
	    )

	    # Extract the text from the first choice in the response output.
	    ans = completion.choices[0].message.content

	    # The SDK types `content` as optional; normalise a missing value so the
	    # declared `str` return type always holds.
	    return ans if ans is not None else ""

	def openai_text_embedding(prompt: str) -> List[float]:
	    '''
	    Embeds a piece of text with OpenAI's "text-embedding-ada-002" model.
	    Args:
	        prompt: The text to embed.
	    Returns:
	        The embedding vector of the text as a list of floats.
	    Raises:
	        KeyError: If the OPENAI_API_KEY environment variable is not set.
	    '''
	    # This file imports only the `OpenAI` client class, so the request goes
	    # through a client instance rather than the module-level
	    # `openai.Embedding` API (the name `openai` is not defined here).
	    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

	    response = client.embeddings.create(
	        input=prompt, model="text-embedding-ada-002"
	    )

	    # A single input string produces a single entry in `data`.
	    return response.data[0].embedding

	def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
	    '''
	    Scores how semantically close two sentences are.
	    Args:
	        sentence1: The first sentence to compare.
	        sentence2: The second sentence to compare.
	    Returns:
	        One minus the cosine distance between the two sentences' OpenAI
	        embeddings, i.e. their cosine similarity.
	    '''
	    # Embed both sentences (first, then second) and turn each into an array.
	    vector_a, vector_b = (
	        np.asarray(openai_text_embedding(text))
	        for text in (sentence1, sentence2)
	    )

	    # scipy's `cosine` is a distance, so subtract it from 1 for a similarity.
	    return 1 - cosine(vector_a, vector_b)

	def add_dist_score_column(
	    dataframe: pd.DataFrame, sentence: str, top_k: int = 5,
	) -> pd.DataFrame:
	    '''
	    Ranks the rows of a dataframe by how similar their question is to a sentence.
	    Args:
	        dataframe: A DataFrame with a "questions" column. It is modified in
	            place: a "stsopenai" column holding the similarity scores is
	            added (or overwritten).
	        sentence: The sentence every question is compared against.
	        top_k: How many of the best-scoring rows to return. Defaults to 5.
	    Returns:
	        The `top_k` rows with the highest "stsopenai" score, best first.
	    '''
	    # Embed the query sentence once up front instead of once per row.
	    sentence_embedding = np.asarray(openai_text_embedding(sentence))

	    def _similarity(question) -> float:
	        # Cosine similarity between one question and the query sentence.
	        question_embedding = np.asarray(openai_text_embedding(str(question)))
	        return 1 - cosine(question_embedding, sentence_embedding)

	    dataframe["stsopenai"] = dataframe["questions"].apply(_similarity)

	    # Highest similarity first.
	    sorted_dataframe = dataframe.sort_values(by="stsopenai", ascending=False)

	    return sorted_dataframe.iloc[:top_k, :]

	def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
	    '''
	    Turns question/answer rows into an alternating list of chat messages.
	    Args:
	        df: A pandas DataFrame with columns named 'questions' and 'answers'.
	    Returns:
	        A list with two dictionaries per row, in row order: first
	        {"role": "user", "content": <question>}, then
	        {"role": "assistant", "content": <answer>}.
	    '''

	    # Initialize an empty list to store the messages
	    result = []

	    # Walk the two columns in lockstep. Reading column-wise (rather than
	    # row-wise via iterrows) keeps each value's own column dtype.
	    for question, answer in zip(df["questions"], df["answers"]):
	        result.append({"role": "user", "content": question})
	        result.append({"role": "assistant", "content": answer})

	    # Return the list of messages
	    return result

	# Topics the chatbot can answer about: the dataset to load ('link') and the
	# label shown in the sidebar ('name').
	domain_info = [{'link': 'KeshavRa/About_YSA_Database', 'name': 'About YSA'}]

	# Retrieval settings: hits whose distance is below the threshold are
	# preferred, and this many hits are requested per question.
	special_threshold, n_results = 0.3, 3

	st.sidebar.markdown('''This is a chatbot to help you learn more about YSA''')

	# Topic picker, listing every configured topic by name.
	domain_names = [entry["name"] for entry in domain_info]
	domain = st.sidebar.selectbox("Select a topic", domain_names)

	# Pressing the button wipes the chat history and forgets the current topic.
	if clear_button := st.sidebar.button("Clear Conversation", key="clear"):
	    st.session_state["messages"] = []
	    st.session_state["curr_domain"] = ""

	# Load the dataset that belongs to the topic picked in the sidebar.
	for d in domain_info:
	    if domain == d['name']:
	        dataset = load_dataset(d['link'])

	initial_input = "Tell me about YSA"

	# Initialize a new client for ChromaDB.
	client = chromadb.Client()

	# Generate a random number between 1 billion and 10 billion.
	# Integer bounds and an explicit 64-bit dtype are used because the upper
	# bound does not fit in a 32-bit integer, which is NumPy's default integer
	# type on some platforms.
	random_number: int = int(
	    np.random.randint(low=10**9, high=10**10, dtype=np.int64)
	)

	# Generate a random string consisting of 10 uppercase letters and digits.
	random_string: str = "".join(
	    np.random.choice(list(string.ascii_uppercase + string.digits), size=10)
	)

	# Combine the random number and random string into one identifier, so each
	# run of the script gets its own collection name.
	combined_string: str = f"{random_number}{random_string}"

	# Create a new collection in ChromaDB with the combined string as its name.
	collection = client.create_collection(combined_string)

	st.title("YSA Chatbot")

	# Initialize chat history
	if "messages" not in st.session_state:
	    st.session_state.messages = []

	# Track which topic the stored chat history belongs to.
	if "curr_domain" not in st.session_state:
	    st.session_state.curr_domain = ""

	# Store every question of the training split in the collection so user
	# prompts can be matched against them.
	with st.spinner("Loading, please be patient with us ... 🙏"):
	    # Read the column once and hand the collection a plain list.
	    support_questions = list(dataset["train"]["questions"])
	    L = len(support_questions)

	    collection.add(
	        ids=[str(i) for i in range(L)],  # IDs are the row positions as strings
	        documents=support_questions,  # The questions are what gets searched
	        metadatas=[{"type": "support"} for _ in range(L)],
	    )

	# A different topic than the one on record (including the blank value set by
	# "Clear Conversation") starts a fresh conversation.
	if st.session_state.curr_domain != domain:
	    st.session_state.messages = []
	    st.session_state.curr_domain = domain

	# Display chat messages from history on app rerun
	for message in st.session_state.messages:
	    with st.chat_message(message["role"]):
	        st.markdown(message["content"])

	# React to user input
	if prompt := st.chat_input("Tell me about a"):
	    # Display user message in chat message container
	    st.chat_message("user").markdown(prompt)
	    # Add user message to chat history
	    st.session_state.messages.append({"role": "user", "content": prompt})

	    question = prompt

	    # Find the stored questions closest to what the user asked.
	    results = collection.query(query_texts=question, n_results=n_results)

	    # Collection IDs are row positions stored as strings; turn them back
	    # into integers to index the dataset.
	    idx = [int(i) for i in results["ids"][0]]

	    # Read each column once instead of once per retrieved hit.
	    train_split = dataset["train"]
	    all_questions = train_split["questions"]
	    all_answers = train_split["answers"]

	    ref = pd.DataFrame(
	        {
	            "idx": idx,
	            "questions": [all_questions[i] for i in idx],
	            "answers": [all_answers[i] for i in idx],
	            "distances": results["distances"][0],
	        }
	    )

	    # Prefer hits closer than the threshold; when none qualifies, fall back
	    # to every retrieved hit so the model still gets some context.
	    filtered_ref = ref[ref["distances"] < special_threshold]
	    final_ref = filtered_ref if filtered_ref.shape[0] > 0 else ref

	    # Join the selected answers into one context string.
	    ref_from_db_search = final_ref["answers"].str.cat(sep=" ")

	    engineered_prompt = f'''
	Based on the context: {ref_from_db_search},
	answer the user question: {question}.
	'''
	    answer = call_chatgpt(engineered_prompt, "You are a helpful assistant.")

	    response = answer
	    # Display assistant response in chat message container
	    with st.chat_message("assistant"):
	        st.markdown(response)
	        # Show the retrieved rows the answer was grounded on.
	        with st.expander("See reference:"):
	            st.table(final_ref)
	    # Add assistant response to chat history
	    st.session_state.messages.append({"role": "assistant", "content": response})