| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
|
|
| |
| import streamlit as st |
| import numpy as np |
| import pickle |
| import os |
| import gdown |
| from sentence_transformers import SentenceTransformer |
| import matplotlib.pyplot as plt |
|
|
|
|
| |
|
|
| |
def load_glove_embeddings(glove_path="Data/embeddings.pkl"):
    """Load a pickled GloVe embeddings dictionary from disk.

    Args:
        glove_path: path to the pickle file containing the embeddings dict.

    Returns:
        The unpickled embeddings dictionary.
    """
    with open(glove_path, "rb") as handle:
        # latin1 lets pickles written under Python 2 load without decode errors
        return pickle.load(handle, encoding="latin1")
|
|
|
|
def get_model_id_gdrive(model_type):
    """Map a GloVe model size to its Google Drive file ids.

    Args:
        model_type: one of "25d", "50d", "100d".

    Returns:
        tuple[str, str]: (word_index_id, embeddings_id) Drive file ids.

    Raises:
        ValueError: if model_type is not a supported size. (The original
        if/elif chain raised an obscure UnboundLocalError instead.)
    """
    drive_ids = {
        # model_type: (word_index_id, embeddings_id)
        "25d": ("13qMXs3-oB9C6kfSRMwbAtzda9xuAUtt8", "1-RXcfBvWyE-Av3ZHLcyJVsps0RYRRr_2"),
        "50d": ("1rB4ksHyHZ9skes-fJHMa2Z8J1Qa7awQ9", "1DBaVpJsitQ1qxtUvV1Kz7ThDc3az16kZ"),
        "100d": ("1-oWV0LqG3fmrozRZ7WB1jzeTJHRUI3mq", "1SRHfX130_6Znz7zbdfqboKosz-PfNvNp"),
    }
    try:
        return drive_ids[model_type]
    except KeyError:
        raise ValueError(f"Unsupported model_type: {model_type!r}") from None
|
|
|
|
def download_glove_embeddings_gdrive(model_type):
    """Download the word-index dict and embedding matrix from Google Drive.

    Args:
        model_type: GloVe size string ("25d"/"50d"/"100d") used both to pick
            the Drive file ids and to name the local temp files.

    Side effects:
        Writes word_index_dict_<model_type>_temp.pkl and
        embeddings_<model_type>_temp.npy into the working directory
        (names must match load_glove_embeddings_gdrive). Returns nothing.
    """
    word_index_id, embeddings_id = get_model_id_gdrive(model_type)

    # Local temp filenames, keyed by model size
    embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
    word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"

    print("Downloading word index dictionary....\n")
    gdown.download(id=word_index_id, output=word_index_temp, quiet=False)

    # Fixed typo in the original message ("Donwloading embedings")
    print("Downloading embeddings...\n\n")
    gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
|
|
|
|
| |
def load_glove_embeddings_gdrive(model_type):
    """Load the previously downloaded word-index dict and embedding matrix.

    Args:
        model_type: GloVe size string; must match what was passed to
            download_glove_embeddings_gdrive.

    Returns:
        tuple: (word_index_dict, embeddings) where word_index_dict maps
        word -> row index and embeddings is the 2-D numpy matrix.
    """
    word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
    embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"

    # Original leaked the handle via pickle.load(open(...)); close it deterministically.
    with open(word_index_temp, "rb") as f:
        word_index_dict = pickle.load(f, encoding="latin")

    embeddings = np.load(embeddings_temp)

    return word_index_dict, embeddings
|
|
|
|
@st.cache_resource()
def load_sentence_transformer_model(model_name):
    """Load a SentenceTransformer by name, cached across Streamlit reruns."""
    return SentenceTransformer(model_name)
|
|
|
|
def get_sentence_transformer_embeddings(sentence, model_name="all-MiniLM-L6-v2"):
    """
    Get sentence transformer embeddings for a sentence.

    Args:
        sentence: text (or list of texts) to embed.
        model_name: HuggingFace model id; the default is a 384-d MiniLM model.

    Returns:
        The encoded embedding, or a zero vector if encoding fails
        (384-d for the default model, 512-d otherwise).
    """
    sentenceTransformer = load_sentence_transformer_model(model_name)

    try:
        return sentenceTransformer.encode(sentence)
    except Exception:
        # Original used a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; keep the zero-vector fallback
        # but only for ordinary errors.
        if model_name == "all-MiniLM-L6-v2":
            return np.zeros(384)
        else:
            return np.zeros(512)
|
|
|
|
def get_glove_embeddings(word, word_index_dict, embeddings, model_type):
    """Look up the GloVe vector for a single word (case-insensitive).

    Unknown words map to a zero vector of the model's dimensionality,
    which is parsed from model_type (e.g. "50d" -> 50).
    """
    key = word.lower()
    if key not in word_index_dict:
        return np.zeros(int(model_type.split("d")[0]))
    return embeddings[word_index_dict[key]]
|
|
|
|
def get_category_embeddings(embeddings_metadata):
    """
    Compute and cache sentence-transformer embeddings for every category.

    Categories come from the space-separated st.session_state.categories
    string; results are stored under st.session_state["cat_embed_<model_name>"]
    (the cache dict is reset on every call).
    """
    model_name = embeddings_metadata["model_name"]
    cache_key = "cat_embed_" + model_name
    st.session_state[cache_key] = {}
    cache = st.session_state[cache_key]
    for category in st.session_state.categories.split(" "):
        if category in cache:
            continue  # duplicate category token; already embedded this call
        if model_name:
            cache[category] = get_sentence_transformer_embeddings(category, model_name=model_name)
        else:
            # Empty model_name falls back to the helper's default model
            cache[category] = get_sentence_transformer_embeddings(category)
|
|
|
|
def update_category_embeddings(embeddings_metadata):
    """
    Update embeddings for each category.

    Thin wrapper around get_category_embeddings(); rebuilding the cache from
    scratch is the "update" (the underlying call resets the session-state dict).
    """
    get_category_embeddings(embeddings_metadata)
|
|
|
|
| |
|
|
def plot_piechart(sorted_cosine_scores_items):
    """Render a pie chart of similarity scores directly into the Streamlit app.

    Args:
        sorted_cosine_scores_items: (category_index, score) pairs; indices
            refer to the space-separated st.session_state.categories string.
    """
    scores = np.array([score for _, score in sorted_cosine_scores_items])
    all_categories = st.session_state.categories.split(" ")
    labels = [all_categories[cat_index] for cat_index, _ in sorted_cosine_scores_items]

    fig, ax = plt.subplots()
    ax.pie(scores, labels=labels, autopct="%1.1f%%")
    st.pyplot(fig)
|
|
|
|
def plot_piechart_helper(sorted_cosine_scores_items):
    """
    Build and return a small pie-chart figure of similarity scores.

    The top-scoring slice is exploded the most; with three or more slices
    a runner-up slice gets a smaller offset for visual emphasis.
    """
    scores = np.array([score for _, score in sorted_cosine_scores_items])
    all_categories = st.session_state.categories.split(" ")
    labels = [all_categories[cat_index] for cat_index, _ in sorted_cosine_scores_items]

    fig, ax = plt.subplots(figsize=(3, 3))

    explode = np.zeros(len(labels))
    explode[0] = 0.2  # winner pops out the most
    if len(labels) == 3:
        explode[1] = 0.1
    elif len(labels) > 3:
        explode[2] = 0.05

    ax.pie(
        scores,
        labels=labels,
        autopct="%1.1f%%",
        explode=explode,
    )

    return fig
|
|
|
|
def plot_piecharts(sorted_cosine_scores_models):
    """
    Plot one pie chart per model, stacked vertically in the Streamlit app.

    Matches the original behavior: charts are drawn only when exactly two
    models are supplied; otherwise nothing is rendered.
    """
    all_categories = st.session_state.categories.split(" ")
    scores_list = [sorted_cosine_scores_models[model] for model in sorted_cosine_scores_models]

    if len(sorted_cosine_scores_models) != 2:
        return

    fig, axes = plt.subplots(2)
    for ax, item_list in zip(axes, scores_list):
        labels = [all_categories[cat_index] for cat_index, _ in item_list]
        scores = np.array([score for _, score in item_list])
        ax.pie(scores, labels=labels, autopct="%1.1f%%")

    st.pyplot(fig)
|
|
|
|
def plot_alatirchart(sorted_cosine_scores_models):
    """Show one Streamlit tab per model, each containing its pie chart."""
    model_names = list(sorted_cosine_scores_models.keys())
    tabs = st.tabs(model_names)
    for tab, model in zip(tabs, model_names):
        with tab:
            st.pyplot(plot_piechart_helper(sorted_cosine_scores_models[model]))
|
|
|
|
| |
|
|
| |
def cosine_similarity(x, y):
    """
    Exponentiated cosine similarity
    1. Compute cosine similarity
    2. Exponentiate cosine similarity
    3. Return exponentiated cosine similarity
    (20 pts)

    Args:
        x, y: array-like vectors of equal length.

    Returns:
        float: exp(cos(x, y)). When either vector has zero norm the cosine
        is defined as 0, so the result is exp(0) == 1.0. (The original
        divided by zero there and returned NaN; zero vectors do occur —
        the GloVe helpers return all-zeros for out-of-vocabulary input.)
    """
    x = np.array(x)
    y = np.array(y)

    dot_product = np.dot(x, y)

    norm_x = np.linalg.norm(x)
    norm_y = np.linalg.norm(y)

    denom = norm_x * norm_y
    # Guard against division by zero for zero-norm vectors.
    cosine_sim = dot_product / denom if denom > 0 else 0.0

    # Exponentiation keeps scores positive and spreads them out for plotting.
    exp_cosine_sim = np.exp(cosine_sim)

    return exp_cosine_sim
|
|
|
|
| |
def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, model_type="50d"):
    """
    Get averaged glove embeddings for a sentence
    1. Split sentence into words
    2. Get embeddings for each word
    3. Add embeddings for each word
    4. Divide by number of words
    5. Return averaged embeddings
    (30 pts)

    Args:
        sentence: input text; lowercased and split on whitespace.
        word_index_dict: word -> row index into `embeddings`.
        embeddings: 2-D array of word vectors.
        model_type: size string such as "50d". (The original default was the
            int 50, which crashed on .split("d") whenever the default was used.)

    Returns:
        np.ndarray: mean of the in-vocabulary word vectors, or a zero vector
        if no word is in the vocabulary.
    """
    # "50d" -> 50-dimensional accumulator
    embedding = np.zeros(int(model_type.split("d")[0]))

    words = sentence.lower().split()

    valid_word_count = 0
    for word in words:
        if word in word_index_dict:
            embedding += embeddings[word_index_dict[word]]
            valid_word_count += 1

    # Avoid division by zero when every word is out of vocabulary.
    if valid_word_count > 0:
        embedding /= valid_word_count

    return embedding
|
|
|
|
| |
| |
| |
def get_sorted_cosine_similarity(text_search, embeddings_metadata):
    """
    Get sorted cosine similarity between input sentence and categories
    Steps:
    1. Get embeddings for input sentence
    2. Get embeddings for categories (if not found, update category embeddings)
    3. Compute cosine similarity between input sentence and categories
    4. Sort cosine similarity
    5. Return sorted cosine similarity
    (50 pts)

    Args:
        text_search: the user's query sentence.
        embeddings_metadata: backend descriptor; either
            {"embedding_model": "glove", "word_index_dict": ..., "embeddings": ..., "model_type": ...}
            or {"embedding_model": <other>, "model_name": str} for the
            sentence-transformer path ("" selects the helper's default model).

    Returns:
        list[tuple[int, float]]: (category_index, exp-cosine score) pairs,
        highest score first; indices refer to st.session_state.categories.
    """
    categories = st.session_state.categories.split(" ")
    cosine_sim = {}
    if embeddings_metadata["embedding_model"] == "glove":
        # --- GloVe path: average word vectors for the query and each category ---
        word_index_dict = embeddings_metadata["word_index_dict"]
        embeddings = embeddings_metadata["embeddings"]
        model_type = embeddings_metadata["model_type"]

        input_embedding = averaged_glove_embeddings_gdrive(text_search,
                                                           word_index_dict,
                                                           embeddings, model_type)

        for index, category in enumerate(categories):
            category_embedding = averaged_glove_embeddings_gdrive(
                category,
                word_index_dict,
                embeddings,
                model_type)
            cosine_sim[index] = cosine_similarity(input_embedding, category_embedding)

    else:
        # --- Sentence-transformer path: uses cached per-category embeddings ---
        model_name = embeddings_metadata["model_name"]
        # Build the category-embedding cache on first use for this model
        if not "cat_embed_" + model_name in st.session_state:
            get_category_embeddings(embeddings_metadata)

        category_embeddings = st.session_state["cat_embed_" + model_name]

        print("text_search = ", text_search)
        if model_name:
            input_embedding = get_sentence_transformer_embeddings(text_search, model_name=model_name)
        else:
            # Empty model_name -> helper's default model
            input_embedding = get_sentence_transformer_embeddings(text_search)

        for index in range(len(categories)):
            category = categories[index]
            if category in category_embeddings:
                category_embedding = category_embeddings[category]
                cosine_sim[index] = cosine_similarity(input_embedding, category_embedding)
            else:
                # Category added since the cache was built: rebuild and retry
                update_category_embeddings(embeddings_metadata)
                category_embedding = st.session_state["cat_embed_" + model_name][category]
                cosine_sim[index] = cosine_similarity(input_embedding, category_embedding)

    # Highest similarity first
    sorted_items = sorted(cosine_sim.items(), key=lambda x: x[1], reverse=True)

    return sorted_items
|
|
|
|
| |
|
|
| if __name__ == "__main__": |
| |
| |
| |
|
|
| st.sidebar.title("GloVe Twitter") |
| st.sidebar.markdown( |
| """ |
| GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Pretrained on |
| 2 billion tweets with vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip). |
| |
| Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*. |
| """ |
| ) |
|
|
| model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d", "100d"), index=1) |
|
|
| st.title("Search Based Retrieval Demo") |
| st.subheader( |
| "Pass in space separated categories you want this search demo to be about." |
| ) |
| |
| |
| |
| |
| st.text_input( |
| label="Categories", key="categories", value="Flowers Colors Cars Weather Food" |
| ) |
| print(st.session_state["categories"]) |
| print(type(st.session_state["categories"])) |
| |
| |
|
|
| st.subheader("Pass in an input word or even a sentence") |
| text_search = st.text_input( |
| label="Input your sentence", |
| key="text_search", |
| value="Roses are red, trucks are blue, and Seattle is grey right now", |
| ) |
| |
|
|
| |
| embeddings_path = "embeddings_" + str(model_type) + "_temp.npy" |
| word_index_dict_path = "word_index_dict_" + str(model_type) + "_temp.pkl" |
| if not os.path.isfile(embeddings_path) or not os.path.isfile(word_index_dict_path): |
| print("Model type = ", model_type) |
| glove_path = "Data/glove_" + str(model_type) + ".pkl" |
| print("glove_path = ", glove_path) |
|
|
| |
| with st.spinner("Downloading glove embeddings..."): |
| download_glove_embeddings_gdrive(model_type) |
|
|
| |
| word_index_dict, embeddings = load_glove_embeddings_gdrive(model_type) |
|
|
| |
| if st.session_state.text_search: |
| |
| print("Glove Embedding") |
| embeddings_metadata = { |
| "embedding_model": "glove", |
| "word_index_dict": word_index_dict, |
| "embeddings": embeddings, |
| "model_type": model_type, |
| } |
| with st.spinner("Obtaining Cosine similarity for Glove..."): |
| sorted_cosine_sim_glove = get_sorted_cosine_similarity( |
| st.session_state.text_search, embeddings_metadata |
| ) |
|
|
| |
| print("Sentence Transformer Embedding") |
| embeddings_metadata = {"embedding_model": "transformers", "model_name": ""} |
| with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."): |
| sorted_cosine_sim_transformer = get_sorted_cosine_similarity( |
| st.session_state.text_search, embeddings_metadata |
| ) |
|
|
| |
| print("Categories are: ", st.session_state.categories) |
| st.subheader( |
| "Closest word I have between: " |
| + st.session_state.categories |
| + " as per different Embeddings" |
| ) |
|
|
| print(sorted_cosine_sim_glove) |
| print(sorted_cosine_sim_transformer) |
| |
| |
| plot_alatirchart( |
| { |
| "glove_" + str(model_type): sorted_cosine_sim_glove, |
| "sentence_transformer_384": sorted_cosine_sim_transformer, |
| } |
| ) |
| |
|
|
| st.write("") |
| st.write( |
| "Demo developed by Hongyan Liu and Yinxiu Wang(https://www.linkedin.com/in/your_id/ - Optional)" |
| ) |
|
|