File size: 2,685 Bytes
3ddfd5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python
# coding=utf-8
import glob
import os
import re
import subprocess

from tqdm import tqdm

from datasets import load_dataset, interleave_datasets, concatenate_datasets

# Column names expected in the Hugging Face dataset rows.
TEXT_COLUMN_NAME = "text"
AUDIO_COLUMN_NAME = "audio"
# Punctuation, quote marks, replacement chars, and digits stripped from
# transcripts before LM training.  Raw string: the original non-raw literal
# was full of invalid escape sequences (e.g. '\,'), which emit
# DeprecationWarning today and are scheduled to become SyntaxError.
# The effective character class is unchanged.
CHARS_TO_IGNORE_REGEX = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/0-9]'

# Pre-processing dataset
def replace_hatted_characters(batch):
    """Normalize one transcript row for KenLM training.

    Strips ignorable punctuation/digits, lower-cases, folds accented
    characters onto the plain Norwegian alphabet, and rewrites the NPSC
    hesitation/inaudible tags (<ee>, <qq>, <mm>, <inaudible>) to the
    plain tokens used during acoustic-model training.

    Args:
        batch: a dataset row (dict) with a "text" field.

    Returns:
        dict with the normalized "text" (a trailing space is appended so
        rows can be concatenated directly).
    """
    text = batch["text"]
    text = re.sub(CHARS_TO_IGNORE_REGEX, '', text).lower() + ' '
    # Map the umlauts to their Norwegian letters BEFORE the generic accent
    # folding: the original order folded 'ö' to 'o' (via [óòöô]) first,
    # which made the later ö→ø rule unreachable.
    text = re.sub('[ä]', 'æ', text)
    text = re.sub('[ö]', 'ø', text)
    text = re.sub('[áàâ]', 'a', text)
    text = re.sub('[éèëê]', 'e', text)
    text = re.sub('[íìïî]', 'i', text)
    text = re.sub('[óòô]', 'o', text)
    text = re.sub('[ç]', 'c', text)
    text = re.sub('[úùüû]', 'u', text)
    text = re.sub('\xa0', ' ', text)  # non-breaking space → regular space
    text = re.sub('<ee>', 'eee', text)
    text = re.sub('<qq>', 'qqq', text)
    text = re.sub('<mm>', 'mmm', text)
    text = re.sub('<inaudible>', 'xxx', text)
    text = re.sub('[<>]', '', text)   # drop any leftover angle brackets
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs
    return {"text": text}


def main():
    """Build a 5-gram KenLM language model from the NPSC + NCC corpora.

    Steps:
      1. Load and concatenate the two datasets (train+validation splits).
      2. Normalize every transcript with ``replace_hatted_characters``.
      3. Write all text into one file, build an ARPA model with KenLM's
         ``lmplz``, patch in the ``</s>`` unigram that the Hugging Face
         pyctcdecode integration expects, compile to binary, and remove
         the intermediate ARPA files.

    Requires ``lmplz`` and ``build_binary`` in ``~/bin`` and a valid
    Hugging Face auth token for the gated datasets.
    """
    npsc = load_dataset(
        "NbAiLab/NPSC",
        "16K_mp3",
        split="train+validation",
        use_auth_token=True,
    )
    ncc = load_dataset(
        "NbAiLab/NCC",
        split="train+validation",
        use_auth_token=True,
    )
    dataset = concatenate_datasets([npsc, ncc])
    dataset = dataset.map(
        replace_hatted_characters,
        desc="replacing hesitations and homophones",
    )

    # Concatenate every normalized transcript, space-separated, into one file.
    text_count = len(dataset)
    with open("text.txt", "w", encoding="utf-8") as text_file:
        for idx, text in tqdm(enumerate(dataset["text"]), total=text_count, desc="Writing text"):
            # idx runs 0..text_count-1, so the last row is text_count-1.
            # (The original compared against text_count, which never
            # matched, so every row got a trailing separator.)
            if idx == text_count - 1:
                text_file.write(text)
            else:
                text_file.write(text + " ")

    # Build the ARPA-format 5-gram model.  This script was exported from a
    # notebook; the original used IPython `!` shell escapes, which are not
    # valid plain Python — replaced with subprocess calls (list argv, no
    # shell, so no injection surface).
    home = os.path.expanduser("~")
    cwd = os.getcwd()
    subprocess.run(
        [os.path.join(home, "bin", "lmplz"), "-o", "5",
         "--text", "text.txt", "--arpa", "5gram.arpa.orig", "-T", cwd],
        check=True,
    )

    # Adjust for Hugging Face decoding: pyctcdecode requires an explicit
    # </s> entry, so duplicate the <s> unigram line and bump the unigram
    # count in the ARPA header by one.
    with open("5gram.arpa.orig", "r", encoding="utf-8") as read_file, \
            open("5gram.arpa", "w", encoding="utf-8") as write_file:
        has_added_eos = False
        for line in read_file:
            if not has_added_eos and "ngram 1=" in line:
                count = line.strip().split("=")[-1]
                write_file.write(line.replace(f"{count}", f"{int(count) + 1}"))
            elif not has_added_eos and "<s>" in line:
                write_file.write(line)
                write_file.write(line.replace("<s>", "</s>"))
                has_added_eos = True
            else:
                write_file.write(line)

    # Compress to binary, then delete the intermediate ARPA files
    # (equivalent of the original `rm 5gram.arpa*`).
    subprocess.run(
        [os.path.join(home, "bin", "build_binary"), "5gram.arpa", "5gram.bin", "-T", cwd],
        check=True,
    )
    for leftover in glob.glob("5gram.arpa*"):
        os.remove(leftover)


if __name__ == "__main__":
    main()