#!/bin/bash
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Fallback values for the optional settings. Any of them can be replaced on the
# command line with an argument of the form --NAME=value.

# Location of the helper scripts:
# /<PATH TO>/NeMo/tools/ctc_segmentation/tools/scripts/ directory
SCRIPTS_DIR="scripts"

# Text preparation
LANGUAGE="en"                    # 'en', 'es', 'ru'...
MAX_SEGMENT_LEN="30"             # forwarded as --max_length
ADDITIONAL_SPLIT_SYMBOLS=":|;"   # '|'-separated extra symbols for sentence splitting
USE_NEMO_NORMALIZATION="True"    # compared case-insensitively against "true"
CUT_PREFIX="0"                   # forwarded as --cut_prefix

# Audio
SAMPLE_RATE="16000"              # Target sample rate (default for ASR data - 16000 Hz)

# Cutting of the aligned audio
MIN_SCORE="-2"                   # forwarded as --threshold
OFFSET="0"                       # forwarded as --offset
MAX_DURATION="20"                # Maximum audio segment duration, in seconds. Longer samples are dropped.

# The maximum number of concurrently running jobs, `-2` - all CPUs but one are used
NUM_JOBS="-2"
#######################################
# Turn every "--NAME=value" argument into a global shell variable NAME=value.
#
# The value is everything after the FIRST '=', so values that themselves
# contain '=', repeated spaces or glob characters are stored verbatim.
# Arguments that do not start with "--" are skipped silently. Arguments that
# start with "--" but have no "=value" part, or whose NAME is not a valid
# shell identifier, are skipped with a warning on stderr.
#
# Globals:   writes one variable per accepted argument
# Arguments: the script's command-line arguments
# Outputs:   warnings on stderr for rejected "--" arguments
# Returns:   0
#######################################
parse_cli_args() {
  # Underscore-prefixed locals so that a user-supplied NAME cannot collide with
  # them and end up assigned to a function-local variable.
  local __cli_arg __cli_name __cli_value
  for __cli_arg in "$@"; do
    [[ $__cli_arg == --* ]] || continue
    if [[ $__cli_arg != *=* ]]; then
      printf 'Warning: ignoring option without a value: %s\n' "$__cli_arg" >&2
      continue
    fi
    __cli_name=${__cli_arg#--}
    __cli_name=${__cli_name%%=*}
    __cli_value=${__cli_arg#*=}
    if [[ ! $__cli_name =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
      printf 'Warning: ignoring invalid option name: %s\n' "$__cli_arg" >&2
      continue
    fi
    # printf -v assigns to the global variable without eval or word splitting.
    printf -v "$__cli_name" '%s' "$__cli_value"
  done
  return 0
}
parse_cli_args "$@"
#######################################
# Print the effective value of each setting, one "NAME = value" line per
# setting, so a run can be audited from its log.
#
# Only settings that are passed to the pipeline scripts are listed:
# MAX_DURATION (forwarded to cut_audio.py) is included, and MIN_SEGMENT_LEN,
# which this script neither defines nor forwards, is not reported.
#
# Globals:   reads the variables named in the loop below
# Arguments: none
# Outputs:   the settings on stdout
#######################################
print_settings() {
  local setting
  for setting in \
    MODEL_NAME_OR_PATH \
    DATA_DIR \
    OUTPUT_DIR \
    MIN_SCORE \
    CUT_PREFIX \
    SCRIPTS_DIR \
    OFFSET \
    LANGUAGE \
    MAX_SEGMENT_LEN \
    SAMPLE_RATE \
    ADDITIONAL_SPLIT_SYMBOLS \
    USE_NEMO_NORMALIZATION \
    MAX_DURATION; do
    # ${!setting} is indirect expansion: the value of the variable whose name
    # is stored in $setting (empty when that variable is unset).
    printf '%s = %s\n' "$setting" "${!setting}"
  done
}
print_settings
# MODEL_NAME_OR_PATH, DATA_DIR and OUTPUT_DIR are mandatory: when any of them
# is empty, print the usage summary and stop with exit status 1.
if [[ -z $MODEL_NAME_OR_PATH || -z $DATA_DIR || -z $OUTPUT_DIR ]]; then
  cat <<EOF
Usage: $(basename "$0")
--MODEL_NAME_OR_PATH=[model_name_or_path]
--DATA_DIR=[data_dir]
--OUTPUT_DIR=[output_dir]
--LANGUAGE=[language (Optional)]
--OFFSET=[offset value (Optional)]
--CUT_PREFIX=[cut prefix in sec (Optional)]
--SCRIPTS_DIR=[scripts_dir_path (Optional)]
--MAX_SEGMENT_LEN=[max number of characters of the text segment for alignment (Optional)]
--ADDITIONAL_SPLIT_SYMBOLS=[Additional symbols to use for
sentence split if eos sentence split resulted in sequence longer than --max_length.
Use '|' as a separator between symbols, for example: ';|:' (Optional)]
--USE_NEMO_NORMALIZATION Set to 'True' to use NeMo Normalization tool to convert
numbers from written to spoken format. By default num2words package will be used. (Optional)
EOF
  exit 1
fi
# Derive the optional normalization flag: USE_NEMO_NORMALIZATION is lowercased
# and compared with "true"; any other value leaves the flag empty.
case "${USE_NEMO_NORMALIZATION,,}" in
  true) NEMO_NORMALIZATION="--use_nemo_normalization " ;;
  *)    NEMO_NORMALIZATION="" ;;
esac
# STEP #1
# Prepare text and audio data for segmentation.
# Runs prepare_data.py with the text taken from $DATA_DIR/text, the audio from
# $DATA_DIR/audio, and $OUTPUT_DIR/processed/ as its output directory.
# The script stops with prepare_data.py's exit status if that command fails.
echo "TEXT AND AUDIO PREPROCESSING..."
# Every expansion is quoted so that paths and symbols containing spaces or glob
# characters reach the script as single, unmodified arguments.
# $NEMO_NORMALIZATION is the one deliberate exception: it is either empty or a
# flag followed by a space, and must disappear from the command when empty.
# shellcheck disable=SC2086
python "$SCRIPTS_DIR/prepare_data.py" \
  --in_text="$DATA_DIR/text" \
  --audio_dir="$DATA_DIR/audio" \
  --output_dir="$OUTPUT_DIR/processed/" \
  --language="$LANGUAGE" \
  --cut_prefix="$CUT_PREFIX" \
  --model="$MODEL_NAME_OR_PATH" \
  --max_length="$MAX_SEGMENT_LEN" \
  --sample_rate="$SAMPLE_RATE" \
  --additional_split_symbols="$ADDITIONAL_SPLIT_SYMBOLS" $NEMO_NORMALIZATION || exit
# STEP #2
# Run CTC-segmentation. One might want to perform alignment with various window sizes.
# Note, if the alignment with the initial window size isn't found, the window size will be
# doubled to re-attempt alignment.
# run_ctc_segmentation.py is invoked once per window length on the data in
# $OUTPUT_DIR/processed; the script stops with the failing run's exit status.
echo "SEGMENTATION STEP..."
for WINDOW in 8000 12000; do
  # Quoted expansions keep paths with spaces or glob characters intact.
  python "$SCRIPTS_DIR/run_ctc_segmentation.py" \
    --output_dir="$OUTPUT_DIR" \
    --data="$OUTPUT_DIR/processed" \
    --sample_rate="$SAMPLE_RATE" \
    --model="$MODEL_NAME_OR_PATH" \
    --window_len "$WINDOW" || exit
done
# STEP #3 (Optional)
# Verify aligned segments (only needed if multiple WINDOWs were used in Step #2).
# verify_segments.py is given $OUTPUT_DIR as its base directory; the script
# stops with that command's exit status if it fails.
echo "VERIFYING SEGMENTS..."
# Quoted expansions keep paths with spaces or glob characters intact.
python "$SCRIPTS_DIR/verify_segments.py" \
  --base_dir="$OUTPUT_DIR" || exit
# STEP #4
# Cut the original audio files based on the alignment score. Only segments with alignment confidence score
# above the MIN_SCORE value will be saved to $OUTPUT_DIR/manifests/manifest.json
# cut_audio.py reads the alignments from $OUTPUT_DIR/verified_segments; the
# script stops with that command's exit status if it fails.
echo "CUTTING AUDIO..."
# Quoted expansions keep paths with spaces or glob characters intact.
python "$SCRIPTS_DIR/cut_audio.py" \
  --output_dir="$OUTPUT_DIR" \
  --alignment="$OUTPUT_DIR/verified_segments" \
  --threshold="$MIN_SCORE" \
  --offset="$OFFSET" \
  --sample_rate="$SAMPLE_RATE" \
  --max_duration="$MAX_DURATION" || exit