import html
import logging
import re

import pyarabic.araby as araby

# Model names (without the "aubmindlab/" prefix) that the preprocessor accepts.
ACCEPTED_MODELS = [
    "bert-base-arabertv01",
    "bert-base-arabert",
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]

# Models that were pretrained on Farasa-segmented text.
# NOTE(review): nothing in this file reads this list; segmentation is not
# performed by the code below.
SEGMENTED_MODELS = [
    "bert-base-arabert",
    "bert-base-arabertv2",
    "bert-large-arabertv2",
]


class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
    It can also un-preprocess the text output of a generative model.

    Args:

        model_name (:obj:`str`): model name from the HuggingFace Models page, with or without
            the "aubmindlab/" prefix. A name that is not in the accepted list logs a warning
            and falls back to "bert-base-arabertv02". Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.
            - :obj:`"bert-base-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"araelectra-base-discriminator"`: No farasa segmentation.
            - :obj:`"araelectra-base-generator"`: No farasa segmentation.
            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj: `bool`): don't remove emojis while preprocessing. Defaults to False.
            NOTE: the value is stored on the instance but is not consulted by the code in
            this file.

        remove_html_markup(:obj: `bool`): Whether to remove html artifacts,
            should be set to False when preprocessing TyDi QA. Defaults to True

        replace_urls_emails_mentions(:obj: `bool`): Whether to replace emails, urls
            and mentions by special tokens. Defaults to True

        strip_tashkeel(:obj: `bool`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA,
            KASRA, SUKUN, SHADDA)

        strip_tatweel(:obj: `bool`): remove tatweel '\\u0640'

        insert_white_spaces(:obj: `bool`): insert whitespace before and after all non Arabic digits
            or English digits or Arabic and English Alphabet or the 2 brackets, then inserts
            whitespace between words and numbers or numbers and words

        remove_elongation(:obj: `bool`): collapse runs of 3 or more identical non-digit
            characters (see ``_remove_elongation`` for the exact behavior)

    Returns:

        ArabertPreprocessor: the preprocessor class

    Example:

        from preprocess import ArabertPreprocessor

        arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")

        arabert_prep.preprocess("SOME ARABIC TEXT")
    """

    def __init__(
        self,
        model_name: str,
        keep_emojis: bool = False,
        remove_html_markup: bool = True,
        replace_urls_emails_mentions: bool = True,
        strip_tashkeel: bool = True,
        strip_tatweel: bool = True,
        insert_white_spaces: bool = True,
        remove_elongation: bool = True,
    ) -> None:
        """
        Store the preprocessing options; see the class docstring for the full
        list of accepted model names.

        model_name (:obj:`str`): model name, with or without the "aubmindlab/" prefix.
            Unknown names log a warning and fall back to "bert-base-arabertv02".

        keep_emojis(:obj: `bool`): stored on the instance. Defaults to False

        remove_html_markup(:obj: `bool`): Whether to remove html artifacts,
            should be set to False when preprocessing TyDi QA. Defaults to True

        replace_urls_emails_mentions(:obj: `bool`): Whether to replace emails, urls
            and mentions by special tokens. Defaults to True

        strip_tashkeel(:obj: `bool`): remove diacritics. Defaults to True

        strip_tatweel(:obj: `bool`): remove tatweel '\\u0640'. Defaults to True

        insert_white_spaces(:obj: `bool`): pad every character that is not a digit, an
            Arabic/English letter or a square bracket with spaces, and separate digits
            from adjacent Arabic words. Defaults to True

        remove_elongation(:obj: `bool`): collapse runs of 3 or more identical non-digit
            characters. Defaults to True
        """
        # Accept both "aubmindlab/<name>" and the bare "<name>".
        model_name = model_name.replace("aubmindlab/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        self.keep_emojis = keep_emojis
        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_elongation = remove_elongation

    def preprocess(self, text) -> str:
        """
        Preprocess takes an input text line and applies the same preprocessing
        used in AraBERT pretraining.

        Args:

            text (:obj:`str`): input text string (any object is accepted; it is
                converted with ``str()`` first)

        Returns:

            string: A preprocessed string, according to the options set on the instance
        """
        text = str(text)
        # Decode HTML entities (e.g. "&amp;") before any pattern matching.
        text = html.unescape(text)

        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs with [رابط]
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup (any opening or closing tag)
            text = re.sub("</?[^>]+>", " ", text)

        # collapse repeated characters (runs of 3 or more)
        if self.remove_elongation:
            text = self._remove_elongation(text)

        if self.insert_white_spaces:
            # insert whitespace before and after every character that is not an
            # Arabic/English digit or letter or one of the 2 square brackets.
            # Raw strings are used so that "\[" and "\d" reach the regex engine
            # untouched; `re` itself understands the \uXXXX escapes.
            text = re.sub(
                r"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                r"(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
            )
            text = re.sub(
                r"([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
            )

        # replace every character outside the accepted set with a space
        text = re.sub(rejected_chars_regex, " ", text)

        # drop the U+FE0F variation selector and squeeze whitespace runs
        text = " ".join(text.replace("\uFE0F", "").split())

        # No Farasa segmentation is applied by this method.
        return text

    def unpreprocess(self, text: str, desegment: bool = True) -> str:
        """Re-format the text to a classic format where punctuation, brackets and
        parentheses are not separated by whitespace.
        The objective is to make the generated text of any model appear natural and not preprocessed.

        Args:
            text (str): input text to be un-preprocessed
            desegment (bool, optional): whether or not to remove farasa pre-segmentation.
                Accepted for interface compatibility; this method does not read it.
                Defaults to True.

        Returns:
            str: The unpreprocessed text.
        """
        # removes the spaces around quotation marks
        # ex: i " ate " an apple --> i "ate" an apple
        # https://stackoverflow.com/a/53436792/5381220
        text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"', text)
        text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'", text)
        # The replacement must contain the bare delimiter: re.sub leaves an
        # unknown escape such as "\`" in the output verbatim, backslash included.
        text = re.sub(white_spaced_back_quotation_regex, r"`\1`", text)
        # Em dashes have their own pattern.
        text = re.sub(white_spaced_em_dash, r"—\1—", text)

        # during generation, sometimes the models don't put a space after the dot, this handles it
        text = text.replace(".", " . ")
        text = " ".join(text.split())

        # handle decimals and digit groups: "3 . 14" -> "3.14", "1 , 000" -> "1,000"
        text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
        text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

        # re-attach punctuation to its neighbours
        text = re.sub(left_and_right_spaced_chars, r"\1", text)
        text = re.sub(left_spaced_chars, r"\1", text)
        text = re.sub(right_spaced_chars, r"\1", text)

        return text

    def _remove_elongation(self, text: str) -> str:
        """
        Collapse elongations (runs of 3 or more identical non-digit characters).

        On each pass the first run matched by ``regex_tatweel`` is located and every
        occurrence of that exact run in the text is replaced by a single copy of its
        character. The number of passes is bounded by the number of runs found up front.

        NOTE: because the replacement is by exact run, a longer run of the same
        character that appears later may be only partially shortened
        (e.g. "aaa x aaaa" becomes "a x aa").

        :param text:  the input text to remove elongation
        :return: delongated text
        """
        # loop over the number of times the regex matched the text
        for _ in range(len(re.findall(regex_tatweel, text))):
            elongation = re.search(regex_tatweel, text)
            if not elongation:
                break
            run = elongation.group()
            # Plain string replacement: the run is a literal, and using the
            # character as a regex replacement template would raise on "\".
            text = text.replace(run, run[0])
        return text

    def _remove_redundant_punct(self, text: str) -> str:
        """
        Reduce every run matched by ``redundant_punct_pattern`` (2 or more
        punctuation/whitespace characters) to its distinct characters, in order of
        first appearance, surrounded by single spaces.

        :param text: the input text
        :return: the text with redundant punctuation collapsed and whitespace squeezed
        """
        # `text_` is a working copy from which handled runs are deleted, so the
        # search always advances; `dif` is the length difference between `text`
        # and `text_`, used to shift match offsets from `text_` into `text`.
        text_ = text
        result = re.search(redundant_punct_pattern, text)
        dif = 0
        while result:
            sub = result.group()
            # de-duplicate while keeping first-seen order
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(list(sub)) + " "
            text = "".join(
                (text[: result.span()[0] + dif], sub, text[result.span()[1] + dif :])
            )
            text_ = "".join(
                (text_[: result.span()[0]], text_[result.span()[1] :])
            ).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(redundant_punct_pattern, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()


# Arabic prefixes; each appears once as a literal and once as \u escapes.
prefix_list = [
    "ال",
    "و",
    "ف",
    "ب",
    "ك",
    "ل",
    "لل",
    "\u0627\u0644",
    "\u0648",
    "\u0641",
    "\u0628",
    "\u0643",
    "\u0644",
    "\u0644\u0644",
    "س",
]

# Arabic suffixes; each appears once as a literal and once as \u escapes.
suffix_list = [
    "ه",
    "ها",
    "ك",
    "ي",
    "هما",
    "كما",
    "نا",
    "كم",
    "هم",
    "هن",
    "كن",
    "ا",
    "ان",
    "ين",
    "ون",
    "وا",
    "ات",
    "ت",
    "ن",
    "ة",
    "\u0647",
    "\u0647\u0627",
    "\u0643",
    "\u064a",
    "\u0647\u0645\u0627",
    "\u0643\u0645\u0627",
    "\u0646\u0627",
    "\u0643\u0645",
    "\u0647\u0645",
    "\u0647\u0646",
    "\u0643\u0646",
    "\u0627",
    "\u0627\u0646",
    "\u064a\u0646",
    "\u0648\u0646",
    "\u0648\u0627",
    "\u0627\u062a",
    "\u062a",
    "\u0646",
    "\u0629",
]

# Placeholder tokens emitted by preprocess() for URLs, mentions and emails.
other_tokens = ["[رابط]", "[مستخدم]", "[بريد]"]

# the never_split list is used with the transformers library
prefix_symbols = [x + "+" for x in prefix_list]
# (sic) the misspelled name is kept because it is a module-level name
suffix_symblos = ["+" + x for x in suffix_list]
never_split_tokens = list(set(prefix_symbols + suffix_symblos + other_tokens))

# Applied in order by preprocess(); each match becomes " [رابط] ".
url_regexes = [
    r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
    r"@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS",
    r"http[s]?://[a-zA-Z0-9_\-./~\?=%&]+",
    r"www[a-zA-Z0-9_\-?=%&/.~]+",
    r"[a-zA-Z]+\.com",
    r"(?=http)[^\s]+",
    r"(?=www)[^\s]+",
    r"://",
]
user_mention_regex = r"@[\w\d]+"
email_regexes = [r"[\w-]+@([\w-]+\.)+[\w-]+", r"\S+@\S+"]
redundant_punct_pattern = (
    r"([!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ【»؛\s+«–…‘]{2,})"
)
# A non-digit character followed by 2 or more copies of itself.
regex_tatweel = r"(\D)\1{2,}"
# Anything outside this character set is replaced by a space in preprocess().
rejected_chars_regex = r"[^0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘]"

regex_url_step1 = r"(?=http)[^\s]+"
regex_url_step2 = r"(?=www)[^\s]+"
regex_url = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
regex_mention = r"@[\w\d]+"
regex_email = r"\S+@\S+"

chars_regex = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘"

# Patterns used by unpreprocess(): a delimiter, whitespace, content, whitespace, delimiter.
white_spaced_double_quotation_regex = r'\"\s+([^"]+)\s+\"'
white_spaced_single_quotation_regex = r"\'\s+([^']+)\s+\'"
white_spaced_back_quotation_regex = r"\`\s+([^`]+)\s+\`"
white_spaced_em_dash = r"\—\s+([^—]+)\s+\—"

# Characters that lose the space on their left, right, or both sides in unpreprocess().
left_spaced_chars = r" ([\]!#\$%\),\.:;\?}٪’،؟”؛…»·])"
right_spaced_chars = r"([\[\(\{“«‘*\~]) "
left_and_right_spaced_chars = r" ([\+\-\<\=\>\@\\\^\_\|\–]) "