|
import re |
|
import argparse |
|
|
|
|
|
# Character class of punctuation, symbols and digits that extract_text()
# replaces with a space. Written as a raw (triple-quoted, because it contains
# both quote characters) string so the regex escapes such as ``\*`` and ``\[``
# reach the ``re`` module untouched instead of being parsed as invalid
# string-literal escapes, which newer Python versions warn about.
chars_to_ignore_regex = r'''['=&()\*\/,?.!\-\;\:"“%‘”�—’…–<>#@0123456789°′»«\[\]]'''

# Matches every character that is NOT a lowercase a-z letter, one of å/æ/ø,
# or a plain space; used by extract_text() when its radical mode is enabled.
radical_regex = r'[^a-zåæø ]'
|
|
|
|
|
def extract_text(text, is_radical=False):
    """Return a lowercased, cleaned-up version of ``text``.

    The input is lowercased, every character matched by
    ``chars_to_ignore_regex`` is replaced by a space, and a single space is
    appended. When ``is_radical`` is true, every character matched by
    ``radical_regex`` is replaced by a space as well. Finally each run of
    whitespace is collapsed into one space.

    Args:
        text: The string to clean.
        is_radical: If true, additionally apply ``radical_regex``.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    cleaned = re.sub(chars_to_ignore_regex, " ", lowered) + " "

    if is_radical:
        cleaned = re.sub(radical_regex, ' ', cleaned)

    # Collapse any whitespace run (spaces, tabs, newlines) to a single space.
    return re.sub(r'\s+', ' ', cleaned)
|
|
|
|
|
def main(args):
    """Read the input file, clean its text and write the result.

    Args:
        args: Object exposing ``input_file`` (path to read),
            ``output_file`` (path to write) and ``radical`` (truthy to
            enable the radical mode of ``extract_text``).
    """
    # Use UTF-8 explicitly instead of the locale-dependent default: the
    # cleaning patterns target non-ASCII characters, so decoding must not
    # vary with the platform the script happens to run on.
    with open(args.input_file, 'r', encoding='utf-8') as file:
        data = file.read()

    data = extract_text(data, bool(args.radical))

    with open(args.output_file, 'w', encoding='utf-8') as outputfile:
        outputfile.write(data)
|
|
|
|
|
def parse_args():
    """Parse the command-line options of this script.

    Returns:
        The argparse namespace with ``input_file``, ``output_file`` (both
        required) and the boolean flag ``radical``.
    """
    cli = argparse.ArgumentParser()

    # The two mandatory path options share the same configuration.
    required_paths = (
        ('--input_file', 'Path to input file.'),
        ('--output_file', 'Path to output file.'),
    )
    for flag, description in required_paths:
        cli.add_argument(flag, help=description, required=True)

    cli.add_argument(
        '--radical',
        action='store_true',
        dest='radical',
        help='Delete any non vocab character.',
    )

    return cli.parse_args()
|
|
|
# Script entry point: parse the command line and run the conversion.
if __name__ == "__main__":
    main(parse_args())
|
|