# nb-wav2vec2-kenlm / clean_texts.py
# versae's picture
# Adding 5gram models
# 3ddfd5c
# raw
# history blame
# 1.16 kB
import re
import argparse
# Earlier, narrower ignore set, retained for reference only:
#chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'

# Character class of punctuation, symbols and ASCII digits that get replaced
# by a space during cleaning. Written as a raw string so the backslash
# escapes reach the regex engine untouched; as a plain string literal,
# sequences such as `\*`, `\-` or `\[` are invalid string escapes and make
# Python emit a warning at compile time.
chars_to_ignore_regex = r'[\'=&()\*\/,?.!\-\;\:"“%‘”�—’…–<>#@0123456789°′»«\[\]]'

# Matches any single character that is NOT a lowercase a-z, å, æ, ø or a
# plain space.
radical_regex = r'[^abcdefghijklmnopqrstuvwxyzåæø ]'
def extract_text(text, is_radical=False):
    """Normalise *text* into lowercase, space-separated tokens.

    The input is lowercased, every character matched by
    ``chars_to_ignore_regex`` becomes a space, and one extra space is
    appended. When *is_radical* is true, every character matched by
    ``radical_regex`` is replaced by a space as well. Finally each run of
    whitespace (newlines included) is collapsed to a single space.

    Args:
        text: The string to clean.
        is_radical: Also blank out everything matched by ``radical_regex``.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    cleaned = re.sub(chars_to_ignore_regex, " ", lowered) + " "
    if is_radical:
        cleaned = re.sub(radical_regex, ' ', cleaned)
    # Whitespace is squeezed last so the spaces introduced above merge too.
    return re.sub(r'\s+', ' ', cleaned)
def main(args):
    """Clean the text of one file and write the result to another.

    Reads the whole of ``args.input_file``, passes it through
    ``extract_text`` (with ``args.radical`` coerced to bool selecting the
    stricter filtering) and writes the returned string to
    ``args.output_file``.

    Args:
        args: Object exposing ``input_file``, ``output_file`` and
            ``radical`` attributes, such as the namespace returned by
            ``parse_args``.
    """
    # The encoding is pinned to UTF-8 instead of the locale default: the
    # cleaning patterns target non-ASCII characters (å, æ, ø, typographic
    # quotes), which a non-UTF-8 default would mis-decode or reject.
    with open(args.input_file, 'r', encoding='utf-8') as file:
        data = file.read()
    data = extract_text(data, bool(args.radical))
    with open(args.output_file, 'w', encoding='utf-8') as outputfile:
        outputfile.write(data)
def parse_args():
    """Parse the command-line options of this script.

    Returns:
        The ``argparse`` namespace with ``input_file`` and ``output_file``
        (both required) and the boolean ``radical`` flag, which is False
        unless ``--radical`` is passed.
    """
    cli = argparse.ArgumentParser()
    # The two path options share the same shape, so register them together.
    for flag, description in (
        ('--input_file', 'Path to input file.'),
        ('--output_file', 'Path to output file.'),
    ):
        cli.add_argument(flag, required=True, help=description)
    cli.add_argument(
        '--radical',
        dest='radical',
        action='store_true',
        help='Delete any non vocab character.',
    )
    return cli.parse_args()
# Script entry point: parse the command line and run the cleaning job.
if __name__ == "__main__":
    main(parse_args())