''' Text Keyword Match'''

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Fetch every NLTK resource this module relies on, not just omw-1.4:
# 'punkt' backs sent_tokenize/word_tokenize, 'stopwords' backs the
# stopword set below, and 'wordnet' (+ 'omw-1.4') backs the lemmatizer.
# Without these, stopwords.words('english') raises LookupError at import
# time on a machine where the data was never downloaded.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Shared module-level NLP helpers used by scoreText below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
|
| class scoreText(object): |
| """ |
| A class used to score sentences based on the input keyword |
| """ |
|
|
| def __init__(self): |
|
|
| self.sentences = [] |
|
|
| def cleanText(self, sentences): |
| """ |
| Eliminates the duplicates and cleans the text |
| """ |
| try: |
| sentences = list(set(sentences)) |
| mainBody = [] |
| for i, text in enumerate(sentences): |
| text = re.sub("[-()\"#/@&&^*();:<>{}`+=~|!?,]", "", text) |
| mainBody.append(text) |
| return mainBody |
| except Exception as e: |
| print("Error occured in text clean", e) |
|
|
| def preProcessText(self, sentences): |
| """ |
| Tokenization of sentence and lemmatization of words |
| """ |
| try: |
| |
| word_tokens = word_tokenize(sentences) |
| |
| wordlist = [lemmatizer.lemmatize( |
| w) for w in word_tokens if not w in stop_words] |
|
|
| return wordlist |
| except Exception as e: |
| print("Error occured in text preprocessing", e) |
|
|
| |
| def scoreText(self, keyword, sentences): |
| """ |
| Compares sentences with keyword with bleu scoring technique |
| """ |
| try: |
| |
| sentences = self.cleanText(sentences) |
|
|
| |
| keywordList = self.preProcessText(keyword) |
|
|
| scoredSentencesList = [] |
| for i in range(len(sentences)): |
|
|
| |
| wordlist = self.preProcessText(sentences[i]) |
|
|
| |
| reference = [keywordList] |
| chencherry = SmoothingFunction() |
| |
| |
| bleu_score_1 = sentence_bleu( |
| reference, wordlist, weights=(1, 0, 0, 0), smoothing_function=chencherry.method1) |
| bleu_score_2 = sentence_bleu( |
| reference, wordlist, weights=(0.5, 0.5, 0, 0), smoothing_function=chencherry.method1) |
| bleu_score_3 = sentence_bleu( |
| reference, wordlist, weights=(0.33, 0.33, 0.34, 0), smoothing_function=chencherry.method1) |
| bleu_score_4 = sentence_bleu( |
| reference, wordlist, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=chencherry.method1) |
| bleu_score = (4*bleu_score_4 + 3*bleu_score_3 + |
| 2*bleu_score_2 + bleu_score_1)/10 |
|
|
| |
| scList = [bleu_score, sentences[i]] |
| scoredSentencesList.append(scList) |
| return scoredSentencesList |
|
|
| except Exception as e: |
| print("Error occured in score text", e) |
|
|
| def sortText(self, scoredText): |
| """ |
| Returns 3 top scored list of sentences |
| """ |
| try: |
| scoredTexts = sorted(scoredText, key=lambda x: x[0], reverse=True) |
| scoredTexts = [v[1] for i, v in enumerate(scoredTexts) if i < 3] |
| return scoredTexts |
| except Exception as e: |
| print("Error occured in sorting text", e) |
|
|
| def sentenceMatch(self, keyword, paragraph): |
| """ |
| Converts paragraph into list and calls scoreText and sortText functions, |
| and returns the most matching sentences with the keywords. |
| """ |
| try: |
| sentencesList = sent_tokenize(paragraph) |
| scoredSentence = self.scoreText(keyword, sentencesList) |
| sortedSentence = self.sortText(scoredSentence) |
| return sortedSentence |
| except Exception as e: |
| print("Error occured in sentence match", e) |