from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from sematch.semantic.similarity import WordNetSimilarity
from vocabulary.vocabulary import Vocabulary as vb
import json
from random import randint
import spacy
import os.path

nlp = spacy.load('en_core_web_sm')


def _to_native_ascii(text):
    """Return *text* as a native ``str`` with non-ASCII characters dropped."""
    encoded = text.encode('ascii', 'ignore')
    # On Python 2 ``encode`` already yields ``str``.  On Python 3 it yields
    # ``bytes``, which would later break ``.replace("_", " ")`` and the
    # similarity lookups, so convert it back to ``str``.
    if isinstance(encoded, str):
        return encoded
    return encoded.decode('ascii')


def _to_text(value):
    """Decode byte strings as UTF-8 (ignoring errors); pass text through."""
    if isinstance(value, bytes):
        return value.decode('utf-8', 'ignore')
    return value


# Function to tag a sentence with parts of speech
def tag(sentence):
    """Tokenize *sentence* and return the ``pos_tag`` result for its tokens."""
    return pos_tag(word_tokenize(sentence))


# Determine whether a POS tag is one we paraphrase
def paraphraseable(tag):
    """Return True for ``NN*`` and ``JJ*`` tags and for the exact tag ``VB``."""
    return tag.startswith(('NN', 'JJ')) or tag == 'VB'


# Map a POS tag to the matching WordNet part-of-speech constant
def pos(tag):
    """Return the WordNet POS constant for *tag*, or None when unmapped.

    ``JJ*`` tags map to ``wn.ADJ`` so that adjectives, which
    ``paraphraseable`` accepts, are not looked up across every part of
    speech.
    """
    if tag.startswith('NN'):
        return wn.NOUN
    if tag.startswith('V'):
        return wn.VERB
    if tag.startswith('JJ'):
        return wn.ADJ
    return None


# Function to create synonyms using WordNet (nltk)
def synonyms(word, tag):
    """Return the lemma names of the first WordNet synset found for *word*.

    The synsets are looked up with the part of speech derived from *tag*.
    An empty set is returned when WordNet has no synset for the word.
    """
    synsets = wn.synsets(word, pos(tag))
    if not synsets:
        return set()
    # Only the first synset is used, so only its lemmas are materialised.
    return {_to_native_ascii(lemma.name()) for lemma in synsets[0].lemmas()}


# Create dictionary synonyms
def dictonarySynonums(word):
    """Return the set of synonyms that ``Vocabulary.synonym`` gives for *word*.

    The lookup result is parsed as JSON and the ``"text"`` field of each
    entry is collected.  An empty set is returned when the lookup yields a
    falsy value (the original code compared the result against ``False``).
    """
    syn_json = vb.synonym(word)
    if not syn_json:
        return set()
    # Reuse the response fetched above instead of querying a second time.
    return {_to_native_ascii(entry["text"]) for entry in json.loads(syn_json)}


# Control set: filter synonyms by semantic similarity to the base word
# using spaCy
def controlledSetSpacy(word, similarWords, threshold=.76):
    """Discard entries of *similarWords* that spaCy rates below *threshold*.

    *similarWords* is a set; it is modified in place and also returned.
    *threshold* controls the accuracy of the control set.
    """
    base_doc = nlp(_to_text(word))
    for candidate in similarWords.copy():
        candidate_doc = nlp(_to_text(candidate))
        if base_doc.similarity(candidate_doc) < threshold:
            similarWords.discard(candidate)
    return similarWords


# Control set: filter synonyms by semantic similarity to the base word
# using WordNetSimilarity
def controlledSetWordNetSimilarity(word, similarWords, threshold=0.9996):
    """Discard entries of *similarWords* scoring below *threshold*.

    Scores come from ``WordNetSimilarity.word_similarity`` called with the
    ``'li'`` method name.  *similarWords* is a set; it is modified in place
    and also returned.  *threshold* controls the accuracy of the control set.
    """
    wns = WordNetSimilarity()
    for candidate in similarWords.copy():
        if wns.word_similarity(word, candidate, 'li') < threshold:
            similarWords.discard(candidate)
    return similarWords


# Get synonyms from WordNet (nltk) as well as from the dictionary lookup
def synonymIfExists(sentence):
    """Yield ``[word, synonyms]`` for every token of *sentence*.

    ``synonyms`` is a list when the token is paraphraseable and more than
    one candidate survives the similarity filter; otherwise it is ``[]``.
    """
    for (word, t) in tag(sentence):
        if paraphraseable(t) and word not in ["i", "I"]:
            syns = synonyms(word, t)
            syns.update(dictonarySynonums(word))
            if syns:
                # controlledSetSpacy(word, syns) can be used here instead.
                syns = controlledSetWordNetSimilarity(word, syns)
                if len(syns) > 1:
                    yield [word, list(syns)]
                    continue
        yield [word, []]


# Function to get the semantically similar synonyms and the total count of
# combinations in the entire sentence
def paraphrase(sentence):
    """Return ``(bagOfWords, counter)`` for *sentence*.

    ``bagOfWords`` holds, per token, the de-duplicated list made of the
    token and its synonyms.  ``counter`` is the product of those list
    lengths, i.e. the number of possible word combinations.
    """
    bagOfWords = []
    counter = 1
    for word, syns in synonymIfExists(sentence):
        choices = list(set([word] + list(syns)))
        counter *= len(choices)
        bagOfWords.append(choices)
    return bagOfWords, counter


# Function to re-create the sentence with synonyms, where the synonyms are
# picked at random
def paraPhraseThisSentence(sentence):
    """Print and return a list of distinct random paraphrases of *sentence*.

    One candidate sentence is drawn per possible word combination, each
    built by picking a random choice for every token.  Duplicates are
    removed, so the result may contain fewer sentences than combinations.
    Underscores in the chosen words are replaced by spaces.
    """
    choices_per_word, count = paraphrase(sentence)
    variants = set()
    for _ in range(count):
        picked = [
            choices[randint(0, len(choices) - 1)].replace("_", " ")
            for choices in choices_per_word
        ]
        variants.add(" ".join(picked))
    ppList = list(variants)
    print(ppList)
    return ppList


# NOTE: this demo call runs whenever the module is executed or imported.
paraPhraseThisSentence("Financial Institutes have always helped the society to become better version of itself.")