# File size: 1,162 Bytes
# 3ddfd5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
import argparse

# Character class of everything that is always replaced by a space:
# punctuation, quotes, dashes, brackets, digits and assorted symbols.
# Written as a raw string so the regex escapes reach `re` untouched instead
# of being (invalid) Python string escapes.
chars_to_ignore_regex = r'[\'=&()\*\/,?.!\-\;\:"“%‘”�—’…–<>#@0123456789°′»«\[\]]'
# Used only in "radical" mode: matches any character that is not a lowercase
# a-z letter, å, æ, ø or a plain space.
radical_regex = r'[^abcdefghijklmnopqrstuvwxyzåæø ]'


def extract_text(text, is_radical=False):
    """Lowercase *text* and blank out unwanted characters.

    Every character matched by ``chars_to_ignore_regex`` is replaced by a
    space and one extra space is appended to the result.  When *is_radical*
    is true, every character matched by ``radical_regex`` is replaced by a
    space as well.  Finally each run of whitespace is collapsed into a
    single space.

    Returns the cleaned string.
    """
    cleaned = re.sub(chars_to_ignore_regex, " ", text.lower())
    cleaned += " "
    if is_radical:
        cleaned = re.sub(radical_regex, " ", cleaned)
    return re.sub(r"\s+", " ", cleaned)


def main(args):
    """Clean the text of ``args.input_file`` and write it to ``args.output_file``.

    Args:
        args: Object exposing ``input_file``, ``output_file`` and ``radical``
            attributes (e.g. the namespace returned by ``parse_args``).
    """
    # Read and write explicitly as UTF-8: the cleaning patterns target
    # non-ASCII characters, so the result must not depend on the platform's
    # locale encoding.
    with open(args.input_file, 'r', encoding='utf-8') as file:
        data = file.read()

    data = extract_text(data, bool(args.radical))

    with open(args.output_file, 'w', encoding='utf-8') as outputfile:
        outputfile.write(data)


def parse_args(argv=None):
    """Parse the command-line options.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            existing zero-argument callers behave exactly as before.

    Returns:
        argparse.Namespace with ``input_file`` and ``output_file`` (both
        required) and ``radical`` (``True`` only when ``--radical`` is given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', required=True, help='Path to input file.')
    parser.add_argument('--output_file', required=True, help='Path to output file.')
    parser.add_argument('--radical', dest='radical', action='store_true', help='Delete any non vocab character.')
    return parser.parse_args(argv)

if __name__ == "__main__":
    # Script entry point: parse the CLI options and hand them to main().
    main(parse_args())