File size: 1,790 Bytes
4462fcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import re
import string


# Punctuation and symbols stripped from transcripts. Duplicated entries are
# harmless: everything ends up inside a single regex character class.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
]
# Rebind the name to a regex character class matching any listed character.
# Each entry is escaped so that "-" stays a literal hyphen: joined raw, the
# sequence "!", "-", ";" forms the range `!-;`, which also matches the digits
# 0-9 and "$", "&", "*", "+", "/" even though none of them are listed.
chars_to_ignore = "[" + "".join(re.escape(entry) for entry in chars_to_ignore) + "]"

# Invisible / combining code points that are each replaced by a single space.
dictionary_mapping = dict.fromkeys(
    (
        "\u200c",  # ZERO WIDTH NON-JOINER
        "\u200d",  # ZERO WIDTH JOINER
        "\u200e",  # LEFT-TO-RIGHT MARK
        "\u200f",  # RIGHT-TO-LEFT MARK
        "\ufeff",  # ZERO WIDTH NO-BREAK SPACE (byte order mark)
        "\u0307",  # COMBINING DOT ABOVE
    ),
    " ",
)


def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each key of ``chars_to_mapping`` in ``text``.

    Args:
        text: Value to process; it is converted with ``str()`` first.
        chars_to_mapping: Dict mapping literal substrings to their
            replacement strings.

    Returns:
        The string form of ``text`` with all replacements applied in one pass.
    """
    text = str(text)
    if not chars_to_mapping:
        # An empty alternation matches the empty string at every position and
        # the lookup in the callback would raise KeyError; nothing to do.
        return text

    # Longest keys first so a key is never shadowed by another key that is a
    # prefix of it (regex alternation takes the first alternative that matches).
    keys = sorted(chars_to_mapping, key=len, reverse=True)
    pattern = "|".join(map(re.escape, keys))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], text)


def remove_special_characters(text, chars_to_ignore_regex):
    """Delete all matches of ``chars_to_ignore_regex`` from ``text``.

    The remaining text is lower-cased and a single trailing space is appended.
    """
    stripped = re.sub(chars_to_ignore_regex, "", text)
    return f"{stripped.lower()} "


def normalizer_at_word_level(text):
    """Re-join the whitespace-separated words of ``text`` with single spaces.

    Each word currently passes through unchanged; the per-word step is the
    place where word-level rules would go. The result always ends with one
    trailing space.
    """
    # Normalizer at word level (currently the identity for every word)
    normalized_words = (token for token in text.split())
    return f"{' '.join(normalized_words)} "

def normalizer(batch, return_dict=True, filter_trivials=False, remove_extra_space=False):
    """Normalize ``batch["sentence"]`` through the module's cleaning steps.

    Args:
        batch: Mapping with a ``"sentence"`` string entry.
        return_dict: If true, store the result back into ``batch["sentence"]``
            and return ``batch``; otherwise return the normalized text itself.
        filter_trivials: If true, a result whose length is 2 or less (measured
            after the trailing-space handling) is replaced by ``None``.
        remove_extra_space: If true, the result is fully stripped; otherwise
            it ends with exactly one trailing space.

    Returns:
        ``batch`` (mutated in place) or the normalized text / ``None``,
        depending on ``return_dict``.
    """

    def squeeze_spaces(value):
        # Collapse runs of space characters into a single space.
        return re.sub(" +", " ", value)

    cleaned = batch["sentence"].lower().strip()

    # Dictionary mapping
    cleaned = squeeze_spaces(multiple_replace(cleaned, dictionary_mapping))

    # Remove specials
    cleaned = squeeze_spaces(remove_special_characters(cleaned, chars_to_ignore))

    # Normalizer at word level
    cleaned = squeeze_spaces(normalizer_at_word_level(cleaned))

    cleaned = cleaned.strip()
    if not remove_extra_space:
        cleaned += " "

    if filter_trivials and len(cleaned) <= 2:
        cleaned = None

    if return_dict:
        batch["sentence"] = cleaned
        return batch
    return cleaned