''' Text Keyword Match'''

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Fetch every NLTK resource this module relies on, not just omw-1.4:
# 'punkt' backs sent_tokenize/word_tokenize, 'stopwords' backs the
# stopword set below, and 'wordnet' (+ 'omw-1.4') backs the lemmatizer.
# Without these, stopwords.words('english') raises LookupError at import
# time on a machine where the data was never downloaded.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Shared module-level NLP helpers used by scoreText below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
|
| class scoreText(object): |
| """ |
| A class used to score sentences based on the input keyword |
| """ |
|
|
| def __init__(self): |
|
|
| self.sentences = [] |
|
|
| def cleanText(self, sentences): |
| """ |
| Eliminates the duplicates and cleans the text |
| """ |
| try: |
| sentences = list(set(sentences)) |
| mainBody = [] |
| for i, text in enumerate(sentences): |
| text = re.sub("[-()\"#/@&&^*();:<>{}`+=~|!?,]", "", text) |
| mainBody.append(text) |
| return mainBody |
| except Exception as e: |
| print("Error occured in text clean", e) |
|
|
| def preProcessText(self, sentences): |
| """ |
| Tokenization of sentence and lemmatization of words |
| """ |
| try: |
| |
| word_tokens = word_tokenize(sentences) |
| |
| wordlist = [lemmatizer.lemmatize( |
| w) for w in word_tokens if not w in stop_words] |
|
|
| return wordlist |
| except Exception as e: |
| print("Error occured in text preprocessing", e) |
|
|
| |
| def scoreText(self, keyword, sentences): |
| """ |
| Compares sentences with keyword with bleu scoring technique |
| """ |
| try: |
| |
| sentences = self.cleanText(sentences) |
|
|
| |
| keywordList = self.preProcessText(keyword) |
|
|
| scoredSentencesList = [] |
| for i in range(len(sentences)): |
|
|
| |
| wordlist = self.preProcessText(sentences[i]) |
|
|
| |
| reference = [keywordList] |
| chencherry = SmoothingFunction() |
| |
| |
| bleu_score_1 = sentence_bleu( |
| reference, wordlist, weights=(1, 0, 0, 0), smoothing_function=chencherry.method1) |
| bleu_score_2 = sentence_bleu( |
| reference, wordlist, weights=(0.5, 0.5, 0, 0), smoothing_function=chencherry.method1) |
| bleu_score_3 = sentence_bleu( |
| reference, wordlist, weights=(0.33, 0.33, 0.34, 0), smoothing_function=chencherry.method1) |
| bleu_score_4 = sentence_bleu( |
| reference, wordlist, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=chencherry.method1) |
| bleu_score = (4*bleu_score_4 + 3*bleu_score_3 + |
| 2*bleu_score_2 + bleu_score_1)/10 |
|
|
| |
| scList = [bleu_score, sentences[i]] |
| scoredSentencesList.append(scList) |
| return scoredSentencesList |
|
|
| except Exception as e: |
| print("Error occured in score text", e) |
|
|
| def sortText(self, scoredText): |
| """ |
| Returns 3 top scored list of sentences |
| """ |
| try: |
| scoredTexts = sorted(scoredText, key=lambda x: x[0], reverse=True) |
| scoredTexts = [v[1] for i, v in enumerate(scoredTexts) if i < 3] |
| return scoredTexts |
| except Exception as e: |
| print("Error occured in sorting text", e) |
|
|
| def sentenceMatch(self, keyword, paragraph): |
| """ |
| Converts paragraph into list and calls scoreText and sortText functions, |
| and returns the most matching sentences with the keywords. |
| """ |
| try: |
| sentencesList = sent_tokenize(paragraph) |
| scoredSentence = self.scoreText(keyword, sentencesList) |
| sortedSentence = self.sortText(scoredSentence) |
| return sortedSentence |
| except Exception as e: |
| print("Error occured in sentence match", e) |