"""Helpers for measuring line statistics and repairing run-together text."""

from typing import Sequence

import wordninja
from nltk import word_tokenize


def get_average_words_per_line(lines: Sequence[str]) -> float:
    """Return the mean number of NLTK tokens per line.

    Args:
        lines: Sized collection of text lines.

    Returns:
        The total token count divided by the number of lines, or ``0.0``
        when ``lines`` is empty.
    """
    line_count = len(lines)
    if line_count == 0:
        # Guard: dividing by len(lines) would raise ZeroDivisionError.
        return 0.0
    return sum(len(word_tokenize(line)) for line in lines) / line_count


def get_average_line_len(lines: Sequence[str]) -> float:
    """Return the mean length (``len``) of the given lines.

    Args:
        lines: Sized collection of text lines.

    Returns:
        The summed line lengths divided by the number of lines, or ``0.0``
        when ``lines`` is empty.
    """
    line_count = len(lines)
    if line_count == 0:
        # Guard: dividing by len(lines) would raise ZeroDivisionError.
        return 0.0
    return sum(len(line) for line in lines) / line_count


def percentage_difference(value1: float, value2: float) -> float:
    """Return the difference of two values as a percentage of their mean.

    Computed as ``abs(value1 - value2) / ((value1 + value2) / 2) * 100``.

    Args:
        value1: First value.
        value2: Second value.

    Returns:
        The percentage difference; ``0.0`` when both values are equal
        (including the case where both are zero).

    Raises:
        ZeroDivisionError: If the values differ but sum to zero, because
            their mean is then zero.
    """
    if value1 == value2:
        # Equal values differ by 0%; this also avoids 0 / 0 when both are 0.
        return 0.0
    average_value = (value1 + value2) / 2
    return abs(value1 - value2) / average_value * 100


def recover_text(line: str, threshold: float = 150) -> str:
    """Re-segment ``line`` with wordninja when its word count looks wrong.

    The NLTK token count of the line is compared with the number of pieces
    returned by ``wordninja.split``. If the two counts differ by more than
    ``threshold`` percent, the wordninja pieces joined by single spaces are
    returned; otherwise the line is returned unchanged.

    Args:
        line: Text line to inspect.
        threshold: Percentage difference above which the wordninja
            segmentation replaces the original line. Defaults to 150.

    Returns:
        Either the space-joined wordninja segmentation or the original line.
    """
    token_count = len(word_tokenize(line))
    # Split once and reuse the result for both the comparison and the output.
    segments = wordninja.split(line)
    if percentage_difference(token_count, len(segments)) > threshold:
        return " ".join(segments)
    return line