"""Neuro_lab_screening.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/13s8A1SWTVZPc8oGY4BctRMyXX5yzfBxG
"""
|
|
| import pandas as pd |
| import numpy as np |
| import tensorflow as tf |
| import tensorflow_hub as hub |
|
|
# Load the text-similarity dataset; expected to contain at least the
# columns 'text1' and 'text2' (one document pair per row).
# NOTE(review): path is Colab-specific — adjust when running elsewhere.
df = pd.read_csv("/content/DataNeuron_Text_Similarity.csv")

# Notebook-style inspection: show the frame and a couple of sample texts.
df

df['text1'][1]

df['text1'][0]
|
|
| """#Embedding text to vectors""" |
|
|
| from gensim.models import Word2Vec |
| from gensim.test.utils import common_texts |
|
|
| |
# Pick a single sample pair (row 245) to demonstrate the embedding flow.
text1 = df['text1'][245]
text2 = df['text2'][245]

# Whitespace-tokenise both documents into the training corpus.
texts = [sentence.split() for sentence in (text1, text2)]

# Tiny CBOW model (sg=0), 100-dim vectors; min_count=1 keeps every token.
model = Word2Vec(sentences=texts, vector_size=100, window=5, sg=0, min_count=1, workers=4)
|
|
| |
def text_to_embedding(text, model):
    """Return the mean Word2Vec vector of the whitespace tokens of *text*.

    Tokens absent from the model vocabulary are skipped.  If no token is
    known at all (empty text, or all-unseen words), a zero vector of
    length ``model.vector_size`` is returned instead of the NaN that
    ``np.mean`` would produce on an empty list.

    Parameters
    ----------
    text : str
        Raw document; split on whitespace.
    model : gensim Word2Vec-like object
        Must expose ``wv`` (mapping word -> vector, supporting ``in``)
        and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D embedding of length ``model.vector_size``.
    """
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    if not vectors:
        # Guard: avoids NaN (and a RuntimeWarning) for empty/unseen texts.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
|
|
| |
# Embed the chosen sample pair with the model trained above.
embedding_text1 = text_to_embedding(text1, model)
embedding_text2 = text_to_embedding(text2, model)
|
|
| """#Using Cosine Similarity as Similarity Metric |
| |
| """ |
|
|
| from sklearn.metrics.pairwise import cosine_similarity |
|
|
| |
# Cosine similarity between the two sample embeddings.  cosine_similarity
# returns a 1x1 matrix for a single pair, so unpack element [0][0].
pairwise_scores = cosine_similarity([embedding_text1], [embedding_text2])
cosine_sim = pairwise_scores[0][0]

# Fold into [0, 1] via abs() and round to three decimals for display.
cosine_sim_normalized = round(abs(cosine_sim), 3)

print("Cosine Similarity (Normalized):", cosine_sim_normalized)
|
|
| """#Generation of scores of df through function""" |
|
|
def calculate_similarity_from_dataframe(df):
    """Add a 'similarity_score' column to *df* and return it.

    A single Word2Vec model is trained on the whitespace-tokenised union
    of the 'text1' and 'text2' columns so both sides share one
    vocabulary.  Each row's score is the absolute cosine similarity of
    the two mean-pooled word embeddings, rounded to three decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns 'text1' and 'text2'.  Modified in
        place (the new column is assigned on *df* itself).

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'similarity_score' added.
    """
    # One shared corpus/vocabulary for both text columns.
    corpus = [text.split() for text in df['text1']]
    corpus.extend(text.split() for text in df['text2'])
    model = Word2Vec(sentences=corpus, vector_size=100, window=5, sg=0, min_count=1, workers=4)

    def _embed(text):
        # Mean of the known word vectors; zero vector when nothing is
        # known (avoids NaN scores for empty texts).
        vectors = [model.wv[word] for word in text.split() if word in model.wv]
        if not vectors:
            return np.zeros(model.vector_size)
        return np.mean(vectors, axis=0)

    similarity_scores = []
    for t1, t2 in zip(df['text1'], df['text2']):
        cosine_sim = cosine_similarity([_embed(t1)], [_embed(t2)])[0][0]
        similarity_scores.append(round(abs(cosine_sim), 3))

    df['similarity_score'] = similarity_scores
    return df
|
|
| calculate_similarity_from_dataframe(df) |
|
|
# Quick visual sanity check: line plot of per-row similarity scores
# (Colab auto-generated plotting cell; hides top/right plot spines).
from matplotlib import pyplot as plt
df['similarity_score'].plot(kind='line', figsize=(8, 4), title='similarity_score')
plt.gca().spines[['top', 'right']].set_visible(False)