Spaces:

Harsh502s
/

Autonomous_Text_Tagging_App

Sleeping

App Files Files Community

Harsh502s committed on Nov 4, 2023

Commit

f24359a

1 Parent(s): e1f4ca7

Remove unused KeyBERT model and update BERTopic

Browse files

Files changed (3) hide show

Models/{stackexchange_topic_model.pkl → topic_key_model_130.pkl} +2 -2
Pages/Models.py +50 -92
Pages/Topic Model Results.py +1 -1

Models/{stackexchange_topic_model.pkl → topic_key_model_130.pkl} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f58e481e2bf3282ad0a39bd203dbad662ff3d1c70ae787a953c799fcc7159dbf
-size 597236536

 version https://git-lfs.github.com/spec/v1
+oid sha256:807e4facbc8beded07885eb54a9a7cd85871feb329828ec23d17cb45566d5133
+size 601417294

Pages/Models.py CHANGED Viewed

@@ -2,7 +2,6 @@ import streamlit as st
 from streamlit_extras.tags import tagger_component
 import re
 import pickle
-from keybert import KeyBERT
 from bertopic import BERTopic
 from keras.models import load_model
 from keras.preprocessing.sequence import pad_sequences
@@ -12,8 +11,7 @@ from keras.preprocessing.sequence import pad_sequences
 @st.cache_resource
 def load_models():
     return (
-        BERTopic.load(r"Models/stackexchange_topic_model.pkl"),
-        KeyBERT("all-MiniLM-L6-v2"),
         load_model(r"Models/tag_model.h5"),
         pickle.load(open(r"Models/token.pkl", "rb")),
         pickle.load(open(r"Models/bin.pkl", "rb")),
@@ -21,7 +19,7 @@ def load_models():
 # Load the model into memory
-bertopic_model, keybert_model, cnn_model, tokenizer, binarizer = load_models()
 # Clean the input text
@@ -43,72 +41,29 @@ def tag_cnn_model(text):
 # Retrieve the keyphrases from the input text using the KeyBERT model
-def retrieve_keyphrases(text, n, ngram_range):
-    keywords = keybert_model.extract_keywords(
-        text,
-        keyphrase_ngram_range=ngram_range,
-        top_n=n,
-        diversity=0.5,
-        use_maxsum=True,
-        use_mmr=True,
-        seed_keywords=[
-            "machine-learning",
-            "r",
-            "regression",
-            "deep-learning",
-            "neural-networks",
-            "data-request",
-            "python",
-            "reinforcement-learning",
-            "classification",
-            "time-series",
-            "probability",
-            "neural-network",
-            "distributions",
-            "bayesian",
-            "hypothesis-testing",
-            "keras",
-            "mathematical-statistics",
-            "scikit-learn",
-            "logistic",
-            "convolutional-neural-networks",
-            "clustering",
-            "tensorflow",
-            "terminology",
-            "nlp",
-            "correlation",
-            "self-study",
-            "normal-distribution",
-            "geospatial",
-            "cross-validation",
-            "optimization",
-            "random-forest",
-            "mixed-model",
-            "data-mining",
-            "feature-selection",
-            "pca",
-            "references",
-            "computer-vision",
-            "data-visualization",
-            "confidence-interval",
-            "generalized-linear-model",
-            "variance",
-            "natural-language-processing",
-            "dataset",
-            "svm",
-            "training",
-            "maximum-likelihood",
-            "statistical-significance",
-            "gradient-descent",
-            "multiple-regression",
-            "estimation",
-        ],
-    )
-    return sorted(keywords, key=lambda x: x[1], reverse=True)
 # Find the most similar topics for the input text using the BERTopic model
-def output_unsupervised(text, n):
     new_review = text
     similar_topics, similarity = bertopic_model.find_topics(new_review, top_n=n)
     similar_topics = sorted(similar_topics)
@@ -139,38 +94,34 @@ def unsupervised_page_bertopic():
         "Enter number of tags to assign", value=5, key="unsupervised_n_bertopic"
     )
     if st.button("Assign tags", key="unsupervised_button_bertopic"):
-        output_unsupervised(text, n)
-# Display the unsupervised model using keybert page of the app
-def semi_unsupervised_page_keybert():
-    st.header("Unsupervised Model Using KeyBERT Model")
     text = st.text_area(
         "Enter text to assign tags", height=200, key="unsupervised_text_keybert"
     )
     text = clean_text(text)
     n = st.number_input(
-        "Enter number of tags to assign", value=10, key="unsupervised_n_keybert"
     )
-    ngram_range_lower = st.number_input(
-        "Enter lower limit of ngram range",
-        value=1,
-        min_value=1,
-        max_value=6,
-        key="unsupervised_ngram_lower",
     )
-    ngram_range_upper = st.number_input(
-        "Enter upper limit of ngram range",
-        value=3,
-        min_value=1,
-        max_value=6,
-        key="unsupervised_ngram_upper",
     )
-    ngram_range = (ngram_range_lower, ngram_range_upper)
-    if st.button("Assign tags", key="unsupervised_button_keybert"):
-        topics = retrieve_keyphrases(text, n, ngram_range)
-        topics = [topic[0] for topic in topics]
-        tagger_component("Tags:", topics)
 # Display the model page of the app
@@ -187,14 +138,21 @@ def model_page():
     st.title("Select a model to use:")
     with st.container():
-        tab1, tab2, tab3 = st.tabs(
-            ["Supervised Using CNN", "Semi-Supervised-KeyBERT", "UnSupervised-BERTopic"]
         )
         with tab1:
             supervised_page()
         with tab2:
-            semi_unsupervised_page_keybert()
         with tab3:
             unsupervised_page_bertopic()
     with st.container():
         with st.expander("Example Texts", expanded=False):

 from streamlit_extras.tags import tagger_component
 import re
 import pickle
 from bertopic import BERTopic
 from keras.models import load_model
 from keras.preprocessing.sequence import pad_sequences
 @st.cache_resource
 def load_models():
     return (
+        BERTopic.load(r"Models/topic_key_model_130.pkl"),
         load_model(r"Models/tag_model.h5"),
         pickle.load(open(r"Models/token.pkl", "rb")),
         pickle.load(open(r"Models/bin.pkl", "rb")),
 # Load the model into memory
+bertopic_model, cnn_model, tokenizer, binarizer = load_models()
 # Clean the input text
 # Retrieve the keyphrases from the input text using the KeyBERT model
+def output_keybert(text, n):
+    new_review = text
+    similar_topics, similarity = bertopic_model.find_topics(new_review, top_n=n)
+    similar_topics = sorted(similar_topics)
+    for i in range(n):
+        tags = bertopic_model.get_topic(similar_topics[i], full=True)["KeyBERT"]
+        tags = [tag[0] for tag in tags]
+        tagger_component(f"Tags from cluster {i+1}:", tags)
+# Retrieve the keyphrases from the input text using BERTopic's MMR representation
+def output_mmr(text, n):
+    new_review = text
+    similar_topics, similarity = bertopic_model.find_topics(new_review, top_n=n)
+    similar_topics = sorted(similar_topics)
+    for i in range(n):
+        tags = bertopic_model.get_topic(similar_topics[i], full=True)["MMR"]
+        tags = [tag[0] for tag in tags]
+        tagger_component(f"Tags from cluster {i+1}:", tags)
 # Find the most similar topics for the input text using the BERTopic model
+def output_bertopic(text, n):
     new_review = text
     similar_topics, similarity = bertopic_model.find_topics(new_review, top_n=n)
     similar_topics = sorted(similar_topics)
         "Enter number of tags to assign", value=5, key="unsupervised_n_bertopic"
     )
     if st.button("Assign tags", key="unsupervised_button_bertopic"):
+        output_bertopic(text, n)
+def unsupervised_page_keybert():
+    st.header("Unsupervised Model Using BERTopic Model")
     text = st.text_area(
         "Enter text to assign tags", height=200, key="unsupervised_text_keybert"
     )
     text = clean_text(text)
     n = st.number_input(
+        "Enter number of tags to assign", value=5, key="unsupervised_n_keybert"
     )
+    if st.button("Assign tags", key="unsupervised_button_keybert"):
+        output_keybert(text, n)
+# Display the unsupervised model using MMR page of the app
+def unsupervised_page_mmr():
+    st.header("Unsupervised Model Using BERTopic Model")
+    text = st.text_area(
+        "Enter text to assign tags", height=200, key="unsupervised_text_mmr"
     )
+    text = clean_text(text)
+    n = st.number_input(
+        "Enter number of tags to assign", value=5, key="unsupervised_n_mmr"
     )
+    if st.button("Assign tags", key="unsupervised_button_mmr"):
+        output_mmr(text, n)
 # Display the model page of the app
     st.title("Select a model to use:")
     with st.container():
+        tab1, tab2, tab3, tab4 = st.tabs(
+            [
+                "Supervised Using CNN",
+                "UnSupervised-KeyBERT",
+                "UnSupervised-MMR",
+                "UnSupervised-BERTopic",
+            ]
         )
         with tab1:
             supervised_page()
         with tab2:
+            unsupervised_page_keybert()
         with tab3:
+            unsupervised_page_mmr()
+        with tab4:
             unsupervised_page_bertopic()
     with st.container():
         with st.expander("Example Texts", expanded=False):

Pages/Topic Model Results.py CHANGED Viewed

@@ -4,7 +4,7 @@ from bertopic import BERTopic
 @st.cache_resource
 def load_model():
-    return BERTopic.load(r"Models/stackexchange_topic_model.pkl")
 bertopic_model = load_model()

 @st.cache_resource
 def load_model():
+    return BERTopic.load(r"Models/topic_key_model_130.pkl")
 bertopic_model = load_model()