Spaces:
Running
Running
import json | |
import re | |
import unicodedata | |
from utils.norm_config import norm_config | |
def text_normalize(
    text,
    iso_code="xxx",
    lower_case=True,
    remove_numbers=False,
    remove_brackets=False,
    rm_extra_spaces=False,
):
    """Normalize a text string using the per-language rules in ``norm_config``.

    The steps are applied in this order: Unicode normalization, optional
    lower-casing, removal of bracketed spans, regex mappings, punctuation
    replacement, character deletion, optional removal of digit-only words,
    optional diacritic removal and optional whitespace collapsing.

    Args:
        text : The string to be normalized
        iso_code : Key used to look up the language entry in ``norm_config``;
            settings missing from that entry (or the whole entry, when the
            code is unknown) are taken from the ``"*"`` entry
        lower_case : Boolean flag; the text is lower-cased only when both this
            flag and the config's ``lower_case`` setting are true
        remove_numbers : Boolean flag to specify if words containing only digits should be removed
        remove_brackets : Boolean flag to specify if all text inside round
            brackets should be removed (bracketed text containing a digit is
            always removed, regardless of this flag)
        rm_extra_spaces : Boolean flag to specify if runs of whitespace should
            be collapsed to one space and the result stripped
    Returns:
        normalized_text : the string after all normalization
    """
    # Resolve the settings on a private copy layered over the "*" defaults, so
    # the shared norm_config entries are never modified by a call.
    config = {**norm_config["*"], **norm_config.get(iso_code, {})}

    text = unicodedata.normalize(config["unicode_norm"], text)

    # Convert to lower case
    if config["lower_case"] and lower_case:
        text = text.lower()

    # Brackets: always remove text inside brackets with numbers in them.
    # Usually corresponds to "(Sam 23:17)"
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Apply mappings (keys are regex patterns, values are replacements)
    for old, new in config["mapping"].items():
        text = re.sub(old, new, text)

    normalized_text = text

    # Replace punctuations with space. An empty set is skipped because "[]"
    # is not a valid regex character class.
    if config["punc_set"]:
        punct_pattern = "[" + config["punc_set"] + "]"
        normalized_text = re.sub(punct_pattern, " ", normalized_text)

    # Remove characters in delete list
    if config["del_set"]:
        delete_pattern = "[" + config["del_set"] + "]"
        normalized_text = re.sub(delete_pattern, "", normalized_text)

    # Remove words containing only digits.
    # Three cases are covered: a) the text starts with a number, b) a number
    # is somewhere in the middle of the text, c) the text ends with a number.
    # Lookarounds require whitespace next to the digits without consuming it,
    # which lets adjacent numbers that share a separator all be replaced.
    # NOTE: a text made up of digits only has no neighbouring whitespace and
    # is therefore left unchanged.
    if remove_numbers and config["digit_set"]:
        digits_pattern = "[" + config["digit_set"] + "]+"
        complete_digit_pattern = (
            r"^" + digits_pattern + r"(?=\s)"
            + r"|(?<=\s)" + digits_pattern + r"(?=\s)"
            + r"|(?<=\s)" + digits_pattern + r"$"
        )
        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

    # "rm_diacritics" is treated as off when neither the language entry nor
    # the "*" entry defines it.
    if config.get("rm_diacritics", False):
        # Imported lazily so unidecode is only required when it is used.
        from unidecode import unidecode

        normalized_text = unidecode(normalized_text)

    if rm_extra_spaces:
        normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text