"""Neuro_lab_screening.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/13s8A1SWTVZPc8oGY4BctRMyXX5yzfBxG
"""
|
|
| import pandas as pd |
| import numpy as np |
| import tensorflow as tf |
| import tensorflow_hub as hub |
|
|
# Load the text-similarity dataset; expected to contain at least the
# columns 'text1' and 'text2' (one document pair per row).
# NOTE(review): path is Colab-specific — adjust when running elsewhere.
df = pd.read_csv("/content/DataNeuron_Text_Similarity.csv")

# Notebook-style inspection: show the frame and a couple of sample texts.
df

df['text1'][1]

df['text1'][0]
|
|
| """#Embedding text to vectors""" |
|
|
| from gensim.models import Word2Vec |
| from gensim.test.utils import common_texts |
|
|
| |
# Pick a single sample pair (row 245) to demonstrate the embedding flow.
text1 = df['text1'][245]
text2 = df['text2'][245]

# Whitespace-tokenise both documents into the training corpus.
texts = [sentence.split() for sentence in (text1, text2)]

# Tiny CBOW model (sg=0), 100-dim vectors; min_count=1 keeps every token.
model = Word2Vec(sentences=texts, vector_size=100, window=5, sg=0, min_count=1, workers=4)
|
|
| |
def text_to_embedding(text, model):
    """Return the mean Word2Vec vector of the whitespace tokens of *text*.

    Tokens absent from the model vocabulary are skipped.  If no token is
    known at all (empty text, or all-unseen words), a zero vector of
    length ``model.vector_size`` is returned instead of the NaN that
    ``np.mean`` would produce on an empty list.

    Parameters
    ----------
    text : str
        Raw document; split on whitespace.
    model : gensim Word2Vec-like object
        Must expose ``wv`` (mapping word -> vector, supporting ``in``)
        and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D embedding of length ``model.vector_size``.
    """
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    if not vectors:
        # Guard: avoids NaN (and a RuntimeWarning) for empty/unseen texts.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
|
|
| |
# Embed the chosen sample pair with the model trained above.
embedding_text1 = text_to_embedding(text1, model)
embedding_text2 = text_to_embedding(text2, model)
|
|
| """#Using Cosine Similarity as Similarity Metric |
| |
| """ |
|
|
| from sklearn.metrics.pairwise import cosine_similarity |
|
|
| |
# Cosine similarity between the two sample embeddings.  cosine_similarity
# returns a 1x1 matrix for a single pair, so unpack element [0][0].
pairwise_scores = cosine_similarity([embedding_text1], [embedding_text2])
cosine_sim = pairwise_scores[0][0]

# Fold into [0, 1] via abs() and round to three decimals for display.
cosine_sim_normalized = round(abs(cosine_sim), 3)

print("Cosine Similarity (Normalized):", cosine_sim_normalized)
|
|
| """#Generation of scores of df through function""" |
|
|
def calculate_similarity_from_dataframe(df):
    """Add a 'similarity_score' column to *df* and return it.

    A single Word2Vec model is trained on the whitespace-tokenised union
    of the 'text1' and 'text2' columns so both sides share one
    vocabulary.  Each row's score is the absolute cosine similarity of
    the two mean-pooled word embeddings, rounded to three decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns 'text1' and 'text2'.  Modified in
        place (the new column is assigned on *df* itself).

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'similarity_score' added.
    """
    # One shared corpus/vocabulary for both text columns.
    corpus = [text.split() for text in df['text1']]
    corpus.extend(text.split() for text in df['text2'])
    model = Word2Vec(sentences=corpus, vector_size=100, window=5, sg=0, min_count=1, workers=4)

    def _embed(text):
        # Mean of the known word vectors; zero vector when nothing is
        # known (avoids NaN scores for empty texts).
        vectors = [model.wv[word] for word in text.split() if word in model.wv]
        if not vectors:
            return np.zeros(model.vector_size)
        return np.mean(vectors, axis=0)

    similarity_scores = []
    for t1, t2 in zip(df['text1'], df['text2']):
        cosine_sim = cosine_similarity([_embed(t1)], [_embed(t2)])[0][0]
        similarity_scores.append(round(abs(cosine_sim), 3))

    df['similarity_score'] = similarity_scores
    return df
|
|
| calculate_similarity_from_dataframe(df) |
|
|
# Quick visual sanity check: line plot of per-row similarity scores
# (Colab auto-generated plotting cell; hides top/right plot spines).
from matplotlib import pyplot as plt
df['similarity_score'].plot(kind='line', figsize=(8, 4), title='similarity_score')
plt.gca().spines[['top', 'right']].set_visible(False)