import re
import os
import unicodedata

import nltk
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# download_path = os.path.join(os.getcwd(), 'nltk_packages')
# nltk.data.path.append(download_path)
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')


def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words


def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words


def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    stop_words = set(stopwords.words('english'))  # build the set once instead of per word
    new_words = []
    for word in words:
        if word not in stop_words:
            new_words.append(word)
    return new_words


def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems


def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas


def normalize(words):
    """Apply the enabled normalization steps to a list of tokenized words"""
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    # words = replace_numbers(words)
    words = remove_stopwords(words)
    # words = stem_words(words)
    # words = lemmatize_verbs(words)
    return words


def preprocess(documents):
    """Tokenize each document, normalize the tokens, and rejoin them into a cleaned string"""
    preprocessed_documents = []
    for document in documents:
        tokens = nltk.word_tokenize(document)
        preprocessed = normalize(tokens)
        preprocessed = ' '.join(map(str, preprocessed))
        preprocessed_documents.append(preprocessed)
    return preprocessed_documents
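

# A minimal usage sketch, not part of the original module: the sample documents
# below are made-up strings assumed here only to illustrate what preprocess()
# produces after tokenization, lowercasing, punctuation stripping, and stopword removal.
if __name__ == '__main__':
    sample_docs = [
        "The quick brown fox jumps over the lazy dog!",
        "NLTK makes basic text preprocessing straightforward.",
    ]
    for original, cleaned in zip(sample_docs, preprocess(sample_docs)):
        print(f"{original!r} -> {cleaned!r}")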