File size: 19,135 Bytes

2d8da09

# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:

python process_vad_data.py \
    --out_dir=<output path to where the generated manifest should be stored> \
    --speech_data_root=<path where the speech data are stored> \
    --background_data_root=<path where the background data are stored> \
    --rebalance_method=<'under' or 'over' or 'fixed'> \
    --log
    (Optional: add --demo for the tutorial demonstration. If you want to use your own background noise data, make sure to remove --demo.)
"""
import argparse
import glob
import json
import logging
import os
import tarfile
import urllib.request

import librosa
import numpy as np
import soundfile as sf
from sklearn.model_selection import train_test_split

# Sample rate (Hz) passed to librosa.load / librosa.get_duration in write_manifest.
sr = 16000

# google speech command v2
# Archive URL that main() hands to __maybe_download_file when the speech data folder is missing.
URL = "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz"


def __maybe_download_file(destination: str, source: str):
    """
    Fetch ``source`` into ``destination`` unless something already exists there.

    The payload is written to ``destination + '.tmp'`` first and renamed
    afterwards, so the final path only appears once the download call returned.

    Args:
        destination: local filepath
        source: url of resource

    Returns:
        ``destination``, whether or not a download took place.
    """
    if os.path.exists(destination):
        logging.info(f"Destination {destination} exists. Skipping.")
        return destination

    logging.info(f"{destination} does not exist. Downloading ...")
    partial_path = destination + '.tmp'
    urllib.request.urlretrieve(source, filename=partial_path)
    os.rename(partial_path, destination)
    logging.info(f"Downloaded {destination}.")
    return destination


def extract_file(filepath: str, data_dir: str):
    """
    Unpack the tar archive at ``filepath`` into ``data_dir`` on a best-effort basis.

    Args:
        filepath: path of the tar archive to open
        data_dir: directory the archive members are extracted into

    Returns:
        None. Any failure is logged instead of being raised.
    """
    try:
        # The context manager closes the archive even if extraction fails midway.
        with tarfile.open(filepath) as tar:
            tar.extractall(data_dir)
    except Exception as err:
        # Deliberately best-effort: report the cause and let the caller carry on.
        logging.info(f'Not extracting. Maybe already there? ({err})')


def __extract_all_files(filepath: str, data_root: str, data_dir: str):
    """
    Extract the archive at ``filepath`` into ``data_dir`` unless ``data_dir`` already exists.

    Args:
        filepath: path of the archive to extract
        data_root: accepted for call compatibility; not used here
        data_dir: extraction target; its presence means extraction is skipped
    """
    if os.path.exists(data_dir):
        logging.info(f'Skipping extracting. Data already there {data_dir}')
        return
    extract_file(filepath, data_dir)


def split_train_val_test(data_dir, file_type, test_size=0.1, val_size=0.1, demo=False):
    """
    Collect the ``.wav`` files under ``data_dir`` and split them into train/validation/test lists.

    The lists are written into ``data_dir`` as ``<file_type>_training_list.txt``,
    ``<file_type>_validation_list.txt`` and ``<file_type>_testing_list.txt``, one path per line.

    Args:
        data_dir: root directory that is scanned for audio files
        file_type: "speech" scans every sub-directory except ``_background_noise_``;
            any other value scans all sub-directories plus ``.wav`` files directly in ``data_dir``
        test_size: fraction of all files assigned to the test list
        val_size: fraction of all files assigned to the validation list
        demo: for "speech" only, keep a random 1/100 of the collected files

    Returns:
        None
    """
    X = []
    # os.listdir and glob return entries in arbitrary order; sorting makes the
    # fixed random_state below produce the same split for the same set of files.
    entries = sorted(os.listdir(data_dir))

    if file_type == "speech":
        for o in entries:
            if o != "_background_noise_" and os.path.isdir(os.path.join(data_dir, o)):
                X.extend(sorted(glob.glob(os.path.join(data_dir, o) + '/*.wav')))

        if demo:
            logging.info(
                f"For Demonstration, we use {int(len(X)/100)}/{len(X)} speech data. Make sure to remove --demo flag when you actually train your model!"
            )
            X = np.random.choice(X, int(len(X) / 100), replace=False)

    else:
        for o in entries:
            path = os.path.join(data_dir, o)
            if os.path.isdir(path):
                X.extend(sorted(glob.glob(path + '/*.wav')))
            elif o.endswith(".wav"):
                # for using "_background_noise_" from google speech commands as background data
                X.append(path)

    X_train, X_test = train_test_split(X, test_size=test_size, random_state=1)
    # val_size is relative to the full set, so rescale it to the part left after the test split.
    val_size_tmp = val_size / (1 - test_size)
    X_train, X_val = train_test_split(X_train, test_size=val_size_tmp, random_state=1)

    for split_name, split_files in (("training", X_train), ("testing", X_test), ("validation", X_val)):
        with open(os.path.join(data_dir, file_type + "_" + split_name + "_list.txt"), "w") as outfile:
            outfile.write("\n".join(split_files))

    logging.info(f'Overall: {len(X)}, Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}')
    logging.info(f"Finish splitting train, val and test for {file_type}. Write to files!")


def process_google_speech_train(data_dir):
    """
    Derive ``training_list.txt`` from the testing/validation lists found in ``data_dir``.

    Every ``.wav`` file in a sub-directory of ``data_dir`` (except ``_background_noise_``)
    that is listed in neither ``testing_list.txt`` nor ``validation_list.txt`` is written
    to ``training_list.txt``, one entry per line.

    Args:
        data_dir: dataset root holding the class sub-directories and the two existing lists

    Returns:
        None
    """
    files = []
    for o in sorted(os.listdir(data_dir)):
        if o != "_background_noise_" and os.path.isdir(os.path.join(data_dir, o)):
            files.extend(sorted(glob.glob(os.path.join(data_dir, o, '*.wav'))))

    # Compare as "<label>/<name>.wav" relative to data_dir, whether or not
    # data_dir was given with a trailing separator.
    short_files = [os.path.relpath(i, data_dir).replace(os.sep, "/") for i in files]

    with open(os.path.join(data_dir, 'testing_list.txt'), 'r') as allfile:
        testing_list = allfile.read().splitlines()

    with open(os.path.join(data_dir, 'validation_list.txt'), 'r') as allfile:
        validation_list = allfile.read().splitlines()

    exist_set = set(testing_list) | set(validation_list)

    training_list = [i for i in short_files if i not in exist_set]

    with open(os.path.join(data_dir, "training_list.txt"), "w") as outfile:
        outfile.write("\n".join(training_list))

    logging.info(
        f'Overall: {len(files)}, Train: {len(training_list)}, Validation: {len(validation_list)}, Test: {len(testing_list)}'
    )


def write_manifest(
    out_dir,
    files,
    prefix,
    manifest_name,
    start=0.0,
    end=None,
    duration_stride=1.0,
    duration_max=None,
    duration_limit=100.0,
    filter_long=False,
):
    """
    Given a list of files, segment each file and write them to manifest with restrictions.

    Each segment becomes one JSON line with the keys ``audio_filepath``, ``duration``,
    ``label``, ``text`` and ``offset``.

    Args:
        out_dir: directory of generated manifest (created, including parents, if missing)
        files: list of files to be processed
        prefix: label of samples
        manifest_name: name of generated manifest (``.json`` is appended)
        start: beginning of audio of generating segment
        end: end of audio of generating segment; no segment starts after this offset
        duration_stride: stride for segmenting audio samples
        duration_max: duration for each segment; ``None`` is treated as 1e9
        duration_limit: duration threshold for filtering out long audio samples
        filter_long: boolean to determine whether to filter out long audio samples
    Returns:
        Tuple ``(skip_num, seg_num, output_path)``: number of skipped files, number of
        written segments, and the path of the manifest file.
    """
    seg_num = 0
    skip_num = 0
    if duration_max is None:
        duration_max = 1e9

    if not os.path.exists(out_dir):
        logging.info(f'Outdir {out_dir} does not exist. Create directory.')
        # makedirs also creates missing parents, so a nested out_dir works.
        os.makedirs(out_dir, exist_ok=True)

    output_path = os.path.join(out_dir, manifest_name + '.json')
    with open(output_path, 'w') as fout:
        for file in files:
            label = prefix

            try:
                x, _sr = librosa.load(file, sr=sr)
                duration = librosa.get_duration(y=x, sr=sr)
            except Exception as err:
                # Best-effort: an unreadable file is left out, but say so rather than hide it.
                logging.warning(f'Could not load {file}, leaving it out of the manifest: {err}')
                continue

            if filter_long and duration > duration_limit:
                skip_num += 1
                continue

            offsets = []
            durations = []

            if duration > duration_max:
                current_offset = start

                while current_offset < duration:
                    if end is not None and current_offset > end:
                        break

                    # Stop once less than one full segment of audio remains.
                    if duration - current_offset < duration_max:
                        break

                    offsets.append(current_offset)
                    durations.append(duration_max)

                    current_offset += duration_stride

            else:
                # Duration is not long enough! Skip
                skip_num += 1

            for seg_duration, seg_offset in zip(durations, offsets):
                metadata = {
                    'audio_filepath': file,
                    'duration': seg_duration,
                    'label': label,
                    'text': '_',  # for compatibility with ASRAudioText
                    'offset': seg_offset,
                }
                json.dump(metadata, fout)
                fout.write('\n')
                seg_num += 1
    return skip_num, seg_num, output_path


def load_list_write_manifest(
    data_dir,
    out_dir,
    filename,
    prefix,
    start,
    end,
    duration_stride=1.0,
    duration_max=1.0,
    duration_limit=100.0,
    filter_long=True,
):
    """
    Read the file list ``<prefix>_<filename>`` from ``data_dir`` and write its manifest.

    The manifest name is the list name with ``_list.txt`` replaced by ``_manifest``.

    Args:
        data_dir: directory containing the list file
        out_dir: directory of generated manifest
        filename: list file name without the prefix, e.g. ``training_list.txt``
        prefix: prefix of the list file; also used as the label of the samples
        start: beginning of audio of generating segment
        end: end of audio of generating segment
        duration_stride: stride for segmenting audio samples
        duration_max: duration for each segment
        duration_limit: duration threshold for filtering out long audio samples
        filter_long: boolean to determine whether to filter out long audio samples

    Returns:
        The ``(skip_num, seg_num, output_path)`` tuple returned by ``write_manifest``.
    """
    filename = prefix + '_' + filename
    file_path = os.path.join(data_dir, filename)

    with open(file_path, 'r') as allfile:
        files = allfile.read().splitlines()

    manifest_name = filename.split('_list.txt')[0] + '_manifest'
    # Forward the caller's filter_long instead of a hard-coded True.
    return write_manifest(
        out_dir,
        files,
        prefix,
        manifest_name,
        start,
        end,
        duration_stride,
        duration_max,
        duration_limit,
        filter_long=filter_long,
    )


def rebalance_json(data_dir, data_json, num, prefix):
    """
    Write a manifest with exactly ``num`` randomly drawn entries of ``data_json``.

    Entries are drawn without replacement when the source has at least ``num``
    entries and with replacement otherwise.

    Args:
        data_dir: directory the resampled manifest is written to
        data_json: path of the source manifest (one JSON object per line)
        num: number of entries to write
        prefix: prepended, with an underscore, to the source file name

    Returns:
        Path of the resampled manifest.
    """
    with open(data_json, 'r') as f:
        data = [json.loads(line) for line in f if line.strip()]

    fout_path = os.path.join(data_dir, prefix + "_" + os.path.basename(data_json))

    # Draw positions rather than the dicts themselves, so numpy never has to
    # turn the list of dicts into an array.
    picked = np.random.choice(len(data), num, replace=len(data) < num)

    # 'w' rather than 'a': a re-run must replace the previous output, otherwise
    # the file grows beyond num entries and is no longer balanced.
    with open(fout_path, 'w') as fout:
        for idx in picked:
            json.dump(data[idx], fout)
            fout.write('\n')

    seg = len(picked)
    logging.info(f'Get {seg}/{num} to  {fout_path} from {data_json}')
    return fout_path


def generate_variety_noise(data_dir, filename, prefix):
    """
    Cut the audio files named in a list into one-second clips with random gain.

    The clips go to a ``_background_noise_more`` directory, placed next to
    ``_background_noise_`` when ``data_dir`` contains that name and inside
    ``data_dir`` otherwise. A list file named ``<prefix>_<filename>`` with the
    clip paths is written into that directory as well.

    Args:
        data_dir: directory containing the list file ``<prefix>_<filename>``
        filename: list file name without the prefix, e.g. ``training_list.txt``
        prefix: prefix of the list file name

    Returns:
        Number of clips that were written.
    """
    curr_dir = data_dir.split("_background_noise_")[0]
    silence_path = os.path.join(curr_dir, "_background_noise_more")
    os.makedirs(silence_path, exist_ok=True)

    silence_stride = 1000  # stride = 1/16 seconds
    sampling_rate = 16000

    silence_files = []
    # Fixed seed: every call draws the same sequence of gains.
    rng = np.random.RandomState(0)

    filename = prefix + '_' + filename
    file_path = os.path.join(data_dir, filename)

    with open(file_path, 'r') as allfile:
        files = allfile.read().splitlines()

    for file in files:
        y, loaded_sr = librosa.load(path=file, sr=sampling_rate)

        # stride * 100 to generate less samples for demo
        for i in range(0, len(y) - sampling_rate, silence_stride * 100):
            file_name = "{}_{}.wav".format(os.path.basename(file), i)
            magnitude = rng.uniform(0.0, 1.0)
            y_slice = y[i : i + sampling_rate] * magnitude
            out_file_path = os.path.join(silence_path, file_name)
            sf.write(out_file_path, y_slice, loaded_sr)

            silence_files.append(out_file_path)

    new_list_file = os.path.join(silence_path, filename)
    with open(new_list_file, "w") as outfile:
        outfile.write("\n".join(silence_files))

    # Report the number of clips; also safe when no clip was produced at all.
    logging.info(f"Generate {len(silence_files)} background files for {file_path}. => {new_list_file} !")
    return len(silence_files)


def main():
    """
    Command-line entry point.

    Downloads and extracts the speech archive when the speech folder is missing,
    splits speech and background audio into train/validation/test lists, writes one
    manifest per split and class, and optionally rebalances the two classes.

    Raises:
        NameError: if ``--rebalance_method`` is given but is not 'over', 'under' or 'fixed'.
    """
    parser = argparse.ArgumentParser(description='Speech and backgound data download and preprocess')
    parser.add_argument("--out_dir", required=False, default='./manifest/', type=str)
    parser.add_argument("--speech_data_root", required=True, default=None, type=str)
    parser.add_argument("--background_data_root", required=True, default=None, type=str)
    parser.add_argument('--test_size', required=False, default=0.1, type=float)
    parser.add_argument('--val_size', required=False, default=0.1, type=float)
    parser.add_argument('--window_length_in_sec', required=False, default=0.63, type=float)
    parser.add_argument('--log', required=False, action='store_true')
    parser.add_argument('--rebalance_method', required=False, default=None, type=str)
    parser.add_argument('--demo', required=False, action='store_true')
    parser.set_defaults(log=False, generate=False)
    args = parser.parse_args()

    # Validate the sampling method before any work is done.
    method = args.rebalance_method
    rebalance = bool(method)
    if rebalance and method not in ('over', 'under', 'fixed'):
        raise NameError("Please select a valid sampling method: over/under/fixed.")

    if args.log:
        logging.basicConfig(level=logging.DEBUG)

    # Resolve the data locations.
    data_set = "google_speech_recognition_v2"
    speech_data_root = args.speech_data_root
    speech_data_folder = os.path.join(speech_data_root, data_set)
    background_data_folder = args.background_data_root
    logging.info(f"Working on: {data_set}")

    # Download and extract speech data
    if not os.path.exists(speech_data_folder):
        archive_path = os.path.join(speech_data_root, data_set + ".tar.bz2")
        logging.info(f"Getting {data_set}")
        __maybe_download_file(archive_path, URL)
        logging.info(f"Extracting {data_set}")
        __extract_all_files(archive_path, speech_data_root, speech_data_folder)

    logging.info("Split speech data!")
    # dataset provide testing.txt and validation.txt feel free to split data using that with process_google_speech_train
    split_train_val_test(speech_data_folder, "speech", args.test_size, args.val_size, args.demo)

    logging.info("Split background data!")
    split_train_val_test(background_data_folder, "background", args.test_size, args.val_size)

    out_dir = args.out_dir
    window = args.window_length_in_sec

    # Process Speech manifest
    logging.info("=== Write speech data to manifest!")
    sp_skip_val, sp_seg_val, speech_val = load_list_write_manifest(
        speech_data_folder, out_dir, 'validation_list.txt', 'speech', 0.2, 0.8, window, window
    )
    sp_skip_test, sp_seg_test, speech_test = load_list_write_manifest(
        speech_data_folder, out_dir, 'testing_list.txt', 'speech', 0.2, 0.8, 0.01, window
    )
    sp_skip_train, sp_seg_train, speech_train = load_list_write_manifest(
        speech_data_folder, out_dir, 'training_list.txt', 'speech', 0.2, 0.8, window, window
    )

    logging.info(f'Val: Skip {sp_skip_val} samples. Get {sp_seg_val} segments! => {speech_val} ')
    logging.info(f'Test: Skip {sp_skip_test} samples. Get {sp_seg_test} segments! => {speech_test}')
    logging.info(f'Train: Skip {sp_skip_train} samples. Get {sp_seg_train} segments!=> {speech_train}')

    # Process background manifest
    # if we select to generate more background noise data
    if args.demo:
        logging.info("Start generating more background noise data")
        for list_name in ('validation_list.txt', 'training_list.txt', 'testing_list.txt'):
            generate_variety_noise(background_data_folder, list_name, 'background')
        background_data_folder = os.path.join(
            background_data_folder.split("_background_noise_")[0], "_background_noise_more"
        )

    logging.info("=== Write background data to manifest!")
    bg_skip_val, bg_seg_val, background_val = load_list_write_manifest(
        background_data_folder, out_dir, 'validation_list.txt', 'background', 0, None, 0.15, window
    )
    bg_skip_test, bg_seg_test, background_test = load_list_write_manifest(
        background_data_folder, out_dir, 'testing_list.txt', 'background', 0, None, 0.01, window
    )
    bg_skip_train, bg_seg_train, background_train = load_list_write_manifest(
        background_data_folder, out_dir, 'training_list.txt', 'background', 0, None, 0.15, window
    )

    logging.info(f'Val: Skip {bg_skip_val} samples. Get {bg_seg_val} segments! => {background_val}')
    logging.info(f'Test: Skip {bg_skip_test} samples. Get {bg_seg_test} segments! => {background_test}')
    logging.info(f'Train: Skip {bg_skip_train} samples. Get {bg_seg_train} segments! => {background_train}')

    # Smaller and larger class size for every split.
    min_val, max_val = min(sp_seg_val, bg_seg_val), max(sp_seg_val, bg_seg_val)
    min_test, max_test = min(sp_seg_test, bg_seg_test), max(sp_seg_test, bg_seg_test)
    min_train, max_train = min(sp_seg_train, bg_seg_train), max(sp_seg_train, bg_seg_train)

    logging.info('Finish generating manifest!')

    if not rebalance:
        logging.info("Don't rebalance number of samples in classes.")
        return

    # Random Oversampling: Randomly duplicate examples in the minority class.
    # Random Undersampling: Randomly delete examples in the majority class.
    # Each entry is (validation, test, train) target size.
    targets_by_method = {
        'under': (min_val, min_test, min_train),
        'over': (max_val, max_test, max_train),
        'fixed': (100, 200, 500),
    }
    val_target, test_target, train_target = targets_by_method[method]

    logging.info(f"Rebalancing number of samples in classes using {args.rebalance_method} sampling.")
    logging.info(f'Val: {val_target} Test: {test_target} Train: {train_target}!')

    # Background manifests first, then speech; validation, test, train within each.
    for class_manifests in (
        (background_val, background_test, background_train),
        (speech_val, speech_test, speech_train),
    ):
        for manifest, target in zip(class_manifests, (val_target, test_target, train_target)):
            rebalance_json(out_dir, manifest, target, 'balanced')


# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()