#!/bin/bash
#
# Runs four Python helpers from $SCRIPTS_DIR in sequence:
#   prepare_data.py -> run_ctc_segmentation.py -> verify_segments.py -> cut_audio.py
#
# The variables below are defaults. Each of them can be overridden from the
# command line with --NAME=value (for example --LANGUAGE=de).

# Passed to cut_audio.py as --threshold.
MIN_SCORE=-2

# Passed to prepare_data.py as --cut_prefix (seconds, per the usage text).
CUT_PREFIX=0

# Directory that contains the Python helper scripts.
SCRIPTS_DIR="scripts"

# Passed to cut_audio.py as --offset.
OFFSET=0

# Passed to prepare_data.py as --language.
LANGUAGE='en'

# Passed to prepare_data.py as --max_length.
MAX_SEGMENT_LEN=30

# '|'-separated list of extra split symbols, passed to prepare_data.py as
# --additional_split_symbols.
ADDITIONAL_SPLIT_SYMBOLS=":|;"

# When equal to "true" (case-insensitive), --use_nemo_normalization is added
# to the prepare_data.py call.
USE_NEMO_NORMALIZATION='True'

# NOTE(review): not referenced anywhere else in the visible script.
NUM_JOBS=-2

# Passed as --sample_rate to prepare_data.py, run_ctc_segmentation.py and
# cut_audio.py.
SAMPLE_RATE=16000

# Passed to cut_audio.py as --max_duration.
MAX_DURATION=20

#######################################
# Applies command-line overrides of the form --NAME=value by assigning
# value to the global shell variable NAME.
#
# Differences from a naive `cut -d=` split:
#   * everything after the FIRST '=' is the value, so values may contain '=';
#   * values are taken verbatim (no word splitting, no glob expansion);
#   * only a leading "--" marks an override;
#   * arguments without '=' or with a NAME that is not a valid shell
#     identifier are reported on stderr and skipped.
#
# Arguments: the script's command-line arguments.
# Outputs:   warnings on stderr for skipped arguments.
# Returns:   0 always.
#######################################
parse_overrides() {
  # Underscore-prefixed locals so that an override such as --name=... is not
  # captured by a local variable of the same name.
  local _arg _name _value
  local _ident_re='^[A-Za-z_][A-Za-z0-9_]*$'

  for _arg in "$@"; do
    # Anything that does not start with "--" is not an override.
    [[ "$_arg" == --* ]] || continue

    if [[ "$_arg" != *=* ]]; then
      printf 'Ignoring argument without a value: %s\n' "$_arg" >&2
      continue
    fi

    _name="${_arg%%=*}"   # text before the first '='
    _name="${_name#--}"   # drop the leading "--"
    _value="${_arg#*=}"   # text after the first '='

    if [[ ! "$_name" =~ $_ident_re ]]; then
      printf 'Ignoring argument with invalid name: %s\n' "$_arg" >&2
      continue
    fi

    # printf -v assigns to the global variable because no local of that
    # name exists in this function.
    printf -v "$_name" '%s' "$_value"
  done
  return 0
}

parse_overrides "$@"

#######################################
# Prints the effective configuration, one "NAME = value" line per setting,
# in a fixed order. Unset variables are printed with an empty value.
# Outputs: 13 lines on stdout.
#######################################
print_config() {
  local _name
  # NOTE(review): MIN_SEGMENT_LEN is printed but has no default and is not
  # passed to any step in the visible script.
  for _name in \
    MODEL_NAME_OR_PATH \
    DATA_DIR \
    OUTPUT_DIR \
    MIN_SCORE \
    CUT_PREFIX \
    SCRIPTS_DIR \
    OFFSET \
    LANGUAGE \
    MIN_SEGMENT_LEN \
    MAX_SEGMENT_LEN \
    SAMPLE_RATE \
    ADDITIONAL_SPLIT_SYMBOLS \
    USE_NEMO_NORMALIZATION; do
    # ${!_name} is indirect expansion: the value of the variable whose name
    # is stored in _name.
    printf '%s = %s\n' "$_name" "${!_name}"
  done
}

print_config

#######################################
# Prints the usage text on stdout.
#######################################
usage() {
  cat <<EOF
Usage: $(basename "$0")
--MODEL_NAME_OR_PATH=[model_name_or_path]
--DATA_DIR=[data_dir]
--OUTPUT_DIR=[output_dir]
--LANGUAGE=[language (Optional)]
--OFFSET=[offset value (Optional)]
--CUT_PREFIX=[cut prefix in sec (Optional)]
--SCRIPTS_DIR=[scripts_dir_path (Optional)]
--MAX_SEGMENT_LEN=[max number of characters of the text segment for alignment (Optional)]
--ADDITIONAL_SPLIT_SYMBOLS=[Additional symbols to use for
sentence split if eos sentence split resulted in sequence longer than --max_length.
Use '|' as a separator between symbols, for example: ';|:' (Optional)]
--USE_NEMO_NORMALIZATION Set to 'True' to use NeMo Normalization tool to convert
numbers from written to spoken format. By default num2words package will be used. (Optional)
EOF
}

# MODEL_NAME_OR_PATH, DATA_DIR and OUTPUT_DIR have no defaults; stop with the
# usage text and exit status 1 if any of them is empty or unset.
if [[ -z "$MODEL_NAME_OR_PATH" || -z "$DATA_DIR" || -z "$OUTPUT_DIR" ]]; then
  usage
  exit 1
fi

# Optional extra flag for prepare_data.py. It stays empty unless
# USE_NEMO_NORMALIZATION equals "true" (compared case-insensitively via the
# ${var,,} lowercase expansion). The trailing space inside the value is kept
# as in the original flag string.
NEMO_NORMALIZATION=""
if [[ "${USE_NEMO_NORMALIZATION,,}" == "true" ]]; then
  NEMO_NORMALIZATION="--use_nemo_normalization "
fi

# Step 1: run prepare_data.py on $DATA_DIR/text and $DATA_DIR/audio, writing
# to $OUTPUT_DIR/processed/. The script stops with the helper's exit status
# if it fails.
echo "TEXT AND AUDIO PREPROCESSING..."
# All path/value expansions are quoted so that spaces or glob characters in
# them reach the Python script unchanged.
# $NEMO_NORMALIZATION is deliberately left unquoted: when it is empty it must
# disappear instead of being passed as an empty argument.
# shellcheck disable=SC2086
python "$SCRIPTS_DIR/prepare_data.py" \
  --in_text="$DATA_DIR/text" \
  --audio_dir="$DATA_DIR/audio" \
  --output_dir="$OUTPUT_DIR/processed/" \
  --language="$LANGUAGE" \
  --cut_prefix="$CUT_PREFIX" \
  --model="$MODEL_NAME_OR_PATH" \
  --max_length="$MAX_SEGMENT_LEN" \
  --sample_rate="$SAMPLE_RATE" \
  --additional_split_symbols="$ADDITIONAL_SPLIT_SYMBOLS" $NEMO_NORMALIZATION || exit

# Step 2: run run_ctc_segmentation.py once per window length (8000, then
# 12000) on the data in $OUTPUT_DIR/processed. The script stops with the
# helper's exit status on the first failing run.
echo "SEGMENTATION STEP..."
for WINDOW in 8000 12000; do
  python "$SCRIPTS_DIR/run_ctc_segmentation.py" \
    --output_dir="$OUTPUT_DIR" \
    --data="$OUTPUT_DIR/processed" \
    --sample_rate="$SAMPLE_RATE" \
    --model="$MODEL_NAME_OR_PATH" \
    --window_len "$WINDOW" || exit
done

# Step 3: run verify_segments.py with $OUTPUT_DIR as its base directory. The
# script stops with the helper's exit status if it fails.
echo "VERIFYING SEGMENTS..."
python "$SCRIPTS_DIR/verify_segments.py" \
  --base_dir="$OUTPUT_DIR" || exit

# Step 4: run cut_audio.py on the alignments in $OUTPUT_DIR/verified_segments,
# passing MIN_SCORE as --threshold together with OFFSET, SAMPLE_RATE and
# MAX_DURATION. The script stops with the helper's exit status if it fails.
echo "CUTTING AUDIO..."
python "$SCRIPTS_DIR/cut_audio.py" \
  --output_dir="$OUTPUT_DIR" \
  --alignment="$OUTPUT_DIR/verified_segments" \
  --threshold="$MIN_SCORE" \
  --offset="$OFFSET" \
  --sample_rate="$SAMPLE_RATE" \
  --max_duration="$MAX_DURATION" || exit