File size: 4,468 Bytes
2d8da09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash

# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# default values for optional arguments
MIN_SCORE=-2
CUT_PREFIX=0
SCRIPTS_DIR="scripts" # /<PATH TO>/NeMo/tools/ctc_segmentation/tools/scripts/ directory
OFFSET=0
LANGUAGE='en' # 'en', 'es', 'ru'...
MAX_SEGMENT_LEN=30
ADDITIONAL_SPLIT_SYMBOLS=":|;"
USE_NEMO_NORMALIZATION='True'
NUM_JOBS=-2 # The maximum number of concurrently running jobs, `-2` - all CPUs but one are used
SAMPLE_RATE=16000 # Target sample rate (default for ASR data - 16000 Hz)
MAX_DURATION=20 # Maximum audio segment duration, in seconds. Samples that are longer will be dropped.

for ARG in "$@"
do
    key=$(echo $ARG | cut -f1 -d=)
    value=$(echo $ARG | cut -f2 -d=)

    if [[ $key == *"--"* ]]; then
        v="${key/--/}"
        declare $v="${value}"
    fi
done

echo "MODEL_NAME_OR_PATH = $MODEL_NAME_OR_PATH"
echo "DATA_DIR = $DATA_DIR"
echo "OUTPUT_DIR = $OUTPUT_DIR"
echo "MIN_SCORE = $MIN_SCORE"
echo "CUT_PREFIX = $CUT_PREFIX"
echo "SCRIPTS_DIR = $SCRIPTS_DIR"
echo "OFFSET = $OFFSET"
echo "LANGUAGE = $LANGUAGE"
echo "MIN_SEGMENT_LEN = $MIN_SEGMENT_LEN"
echo "MAX_SEGMENT_LEN = $MAX_SEGMENT_LEN"
echo "SAMPLE_RATE = $SAMPLE_RATE"
echo "ADDITIONAL_SPLIT_SYMBOLS = $ADDITIONAL_SPLIT_SYMBOLS"
echo "USE_NEMO_NORMALIZATION = $USE_NEMO_NORMALIZATION"

if [[ -z $MODEL_NAME_OR_PATH ]] || [[ -z $DATA_DIR ]] || [[ -z $OUTPUT_DIR ]]; then
  echo "Usage: $(basename "$0")
  --MODEL_NAME_OR_PATH=[model_name_or_path]
  --DATA_DIR=[data_dir]
  --OUTPUT_DIR=[output_dir]
  --LANGUAGE=[language (Optional)]
  --OFFSET=[offset value (Optional)]
  --CUT_PREFIX=[cut prefix in sec (Optional)]
  --SCRIPTS_DIR=[scripts_dir_path (Optional)]
  --MAX_SEGMENT_LEN=[max number of characters of the text segment for alignment (Optional)]
  --ADDITIONAL_SPLIT_SYMBOLS=[Additional symbols to use for
    sentence split if eos sentence split resulted in sequence longer than --max_length.
    Use '|' as a separator between symbols, for example: ';|:' (Optional)]
  --USE_NEMO_NORMALIZATION Set to 'True' to use NeMo Normalization tool to convert
    numbers from written to spoken format. By default num2words package will be used. (Optional)"
  exit 1
fi

NEMO_NORMALIZATION=""
    if [[ ${USE_NEMO_NORMALIZATION,,} == "true" ]]; then
      NEMO_NORMALIZATION="--use_nemo_normalization "
    fi

# STEP #1
# Prepare text and audio data for segmentation
echo "TEXT AND AUDIO PREPROCESSING..."
python $SCRIPTS_DIR/prepare_data.py \
--in_text=$DATA_DIR/text \
--audio_dir=$DATA_DIR/audio \
--output_dir=$OUTPUT_DIR/processed/ \
--language=$LANGUAGE \
--cut_prefix=$CUT_PREFIX \
--model=$MODEL_NAME_OR_PATH \
--max_length=$MAX_SEGMENT_LEN \
--sample_rate=$SAMPLE_RATE \
--additional_split_symbols=$ADDITIONAL_SPLIT_SYMBOLS $NEMO_NORMALIZATION || exit

# STEP #2
# Run CTC-segmentation. One might want to perform alignment with various window sizes
# Note, if the alignment with the initial window size isn't found, the window size will be double to re-attempt alignment
echo "SEGMENTATION STEP..."
for WINDOW in 8000 12000
do
  python $SCRIPTS_DIR/run_ctc_segmentation.py \
  --output_dir=$OUTPUT_DIR \
  --data=$OUTPUT_DIR/processed \
  --sample_rate=$SAMPLE_RATE \
  --model=$MODEL_NAME_OR_PATH  \
  --window_len $WINDOW || exit
done

# STEP #3 (Optional)
# Verify aligned segments only if multiple WINDOWs used in the Step #2)
echo "VERIFYING SEGMENTS..."
python $SCRIPTS_DIR/verify_segments.py \
--base_dir=$OUTPUT_DIR  || exit

# STEP #4
# Cut the original audio files based on the alignment score. Only segments with alignment confidence score
# above the MIN_SCORE value will be saved to $OUTPUT_DIR/manifests/manifest.json
echo "CUTTING AUDIO..."
python $SCRIPTS_DIR/cut_audio.py \
--output_dir=$OUTPUT_DIR \
--alignment=$OUTPUT_DIR/verified_segments \
--threshold=$MIN_SCORE \
--offset=$OFFSET \
--sample_rate=$SAMPLE_RATE \
--max_duration=$MAX_DURATION || exit