NbAiLab
/

nb-wav2vec2-kenlm

Model card Files Files and versions Community

nb-wav2vec2-kenlm / clean_texts.py

versae's picture

Adding 5gram models

3ddfd5c almost 3 years ago

history blame contribute delete

1.16 kB

	import re
	import argparse

	# Character class of punctuation, symbols and digits that extract_text()
	# replaces with a single space. Written as a raw (triple-quoted) string so
	# that regex escapes such as `\*` or `\[` are not parsed as invalid
	# string-literal escape sequences, which newer Python versions warn about.
	# The pattern value itself is unchanged.
	chars_to_ignore_regex = r"""['=&()\*\/,?.!\-\;\:"“%‘”�—’…–<>#@0123456789°′»«\[\]]"""
	# Matches every character that is NOT a lowercase a-z, å, æ, ø or a space;
	# applied by extract_text() only when its `is_radical` flag is set.
	radical_regex = r'[^abcdefghijklmnopqrstuvwxyzåæø ]'


	def extract_text(text, is_radical=False):
	    """Normalize *text* into lowercase, single-space-separated tokens.

	    The text is lowercased, every character matched by
	    ``chars_to_ignore_regex`` becomes a space, and a trailing space is
	    appended. When ``is_radical`` is true, every character matched by
	    ``radical_regex`` is replaced by a space as well. Finally each run of
	    whitespace (newlines included) is collapsed into one space.

	    Args:
	        text: The raw input string.
	        is_radical: If true, also blank out characters matched by
	            ``radical_regex``.

	    Returns:
	        The cleaned string.
	    """
	    lowered = text.lower()
	    cleaned = re.sub(chars_to_ignore_regex, " ", lowered)
	    cleaned += " "
	    if is_radical:
	        cleaned = re.sub(radical_regex, ' ', cleaned)
	    # Collapse all whitespace runs last, so the spaces introduced by the
	    # substitutions above are squeezed together too.
	    return re.sub(r'\s+', ' ', cleaned)


	def main(args):
	    """Clean the text of one file and write the result to another file.

	    Reads the whole of ``args.input_file`` into memory, passes it through
	    ``extract_text`` (with the radical mode taken from ``args.radical``)
	    and writes the returned string to ``args.output_file``.

	    Args:
	        args: Object exposing ``input_file``, ``output_file`` and
	            ``radical`` attributes (e.g. the namespace from parse_args()).
	    """
	    # Use an explicit encoding: the cleaning patterns contain non-ASCII
	    # characters, so decoding must not depend on the platform's locale
	    # default. NOTE(review): assumes the corpus is UTF-8 - confirm.
	    with open(args.input_file, 'r', encoding='utf-8') as file:
	        data = file.read()

	    data = extract_text(data, bool(args.radical))

	    with open(args.output_file, 'w', encoding='utf-8') as outputfile:
	        outputfile.write(data)


	def parse_args(argv=None):
	    """Parse the command-line options of the script.

	    Args:
	        argv: Optional list of argument strings to parse. When ``None``
	            (the default) argparse reads the arguments from ``sys.argv``,
	            which is the behaviour of the original no-argument call.

	    Returns:
	        An ``argparse.Namespace`` with ``input_file`` and ``output_file``
	        (both required strings) and ``radical`` (``True`` only when the
	        ``--radical`` flag is given).
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument('--input_file', required=True, help='Path to input file.')
	    parser.add_argument('--output_file', required=True, help='Path to output file.')
	    parser.add_argument('--radical', dest='radical', action='store_true', help='Delete any non vocab character.')
	    return parser.parse_args(argv)

	# Script entry point: parse the command line and run the cleaning step
	# only when the file is executed directly, not when it is imported.
	if __name__ == "__main__":
	    main(parse_args())