""" |
|
Usage: |
|
|
|
python process_vad_data.py \ |
|
--out_dir=<output path to where the generated manifest should be stored> \ |
|
--speech_data_root=<path where the speech data are stored> \ |
|
--background_data_root=<path where the background data are stored> \ |
|
--rebalance_method=<'under' or 'over' or 'fixed'> \ |
|
--log |
|
(Optional --demo (for demonstration in tutorial). If you want to use your own background noise data, make sure to delete --demo) |
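
Example (the paths below are placeholders; adjust them to your setup):

python process_vad_data.py \
    --out_dir=./manifest \
    --speech_data_root=./data \
    --background_data_root=./data/google_speech_recognition_v2/_background_noise_ \
    --rebalance_method='fixed' \
    --demo \
    --log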
|
""" |
|
import argparse |
|
import glob |
|
import json |
|
import logging |
|
import os |
|
import tarfile |
|
import urllib.request |
|
|
|
import librosa |
|
import numpy as np |
|
import soundfile as sf |
|
from sklearn.model_selection import train_test_split |
|
|
|
# Target sample rate; Google Speech Commands audio is recorded at 16 kHz.
sr = 16000
|
|
|
|
|
URL = "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz" |
|
|
|
|
|
def __maybe_download_file(destination: str, source: str): |
|
""" |
|
Downloads source to destination if it doesn't exist. |
|
If exists, skips download |
|
Args: |
|
destination: local filepath |
|
source: url of resource |
|
|
|
Returns: |
|
|
|
""" |
|
if not os.path.exists(destination): |
|
logging.info(f"{destination} does not exist. Downloading ...") |
|
urllib.request.urlretrieve(source, filename=destination + '.tmp') |
|
os.rename(destination + '.tmp', destination) |
|
logging.info(f"Downloaded {destination}.") |
|
else: |
|
logging.info(f"Destination {destination} exists. Skipping.") |
|
return destination |
|
|
|
|
|
def extract_file(filepath: str, data_dir: str): |
|
try: |
|
tar = tarfile.open(filepath) |
|
tar.extractall(data_dir) |
|
tar.close() |
|
except Exception: |
|
logging.info('Not extracting. Maybe already there?') |
|
|
|
|
|
def __extract_all_files(filepath: str, data_root: str, data_dir: str): |
|
if not os.path.exists(data_dir): |
|
extract_file(filepath, data_dir) |
|
else: |
|
        logging.info(f'Skipping extraction. Data already present at {data_dir}.')
|
|
|
|
|
def split_train_val_test(data_dir, file_type, test_size=0.1, val_size=0.1, demo=False): |
|
X = [] |
|
if file_type == "speech": |
|
for o in os.listdir(data_dir): |
|
if os.path.isdir(os.path.join(data_dir, o)) and o.split("/")[-1] != "_background_noise_": |
|
X.extend(glob.glob(os.path.join(data_dir, o) + '/*.wav')) |
|
|
|
if demo: |
|
logging.info( |
|
f"For Demonstration, we use {int(len(X)/100)}/{len(X)} speech data. Make sure to remove --demo flag when you actually train your model!" |
|
) |
|
X = np.random.choice(X, int(len(X) / 100), replace=False) |
|
|
|
else: |
|
for o in os.listdir(data_dir): |
|
if os.path.isdir(os.path.join(data_dir, o)): |
|
X.extend(glob.glob(os.path.join(data_dir, o) + '/*.wav')) |
|
else: |
|
if o.endswith(".wav"): |
|
X.append(os.path.join(data_dir, o)) |
|
|
|
X_train, X_test = train_test_split(X, test_size=test_size, random_state=1) |
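    # val_size is a fraction of the full dataset; after carving off the test
    # split, only (1 - test_size) of the data remains, so rescale it.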
|
val_size_tmp = val_size / (1 - test_size) |
|
X_train, X_val = train_test_split(X_train, test_size=val_size_tmp, random_state=1) |
|
|
|
with open(os.path.join(data_dir, file_type + "_training_list.txt"), "w") as outfile: |
|
outfile.write("\n".join(X_train)) |
|
with open(os.path.join(data_dir, file_type + "_testing_list.txt"), "w") as outfile: |
|
outfile.write("\n".join(X_test)) |
|
with open(os.path.join(data_dir, file_type + "_validation_list.txt"), "w") as outfile: |
|
outfile.write("\n".join(X_val)) |
|
|
|
    logging.info(f'Overall: {len(X)}, Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}')
    logging.info(f"Finished splitting train, val and test for {file_type}. Wrote the lists to files!")
|
|
|
|
|
def process_google_speech_train(data_dir): |
|
    files = []
    for o in os.listdir(data_dir):
        if os.path.isdir(os.path.join(data_dir, o)) and o.split("/")[-1] != "_background_noise_":
            files.extend(glob.glob(os.path.join(data_dir, o) + '/*.wav'))
|
|
|
short_files = [i.split(data_dir)[1] for i in files] |
|
|
|
with open(os.path.join(data_dir, 'testing_list.txt'), 'r') as allfile: |
|
testing_list = allfile.read().splitlines() |
|
|
|
with open(os.path.join(data_dir, 'validation_list.txt'), 'r') as allfile: |
|
validation_list = allfile.read().splitlines() |
|
|
|
    exist_set = set(testing_list)
    exist_set.update(validation_list)
|
|
|
training_list = [i for i in short_files if i not in exist_set] |
|
|
|
with open(os.path.join(data_dir, "training_list.txt"), "w") as outfile: |
|
outfile.write("\n".join(training_list)) |
|
|
|
logging.info( |
|
        f'Overall: {len(files)}, Train: {len(training_list)}, Validation: {len(validation_list)}, Test: {len(testing_list)}'
|
) |
|
|
|
|
|
def write_manifest( |
|
out_dir, |
|
files, |
|
prefix, |
|
manifest_name, |
|
start=0.0, |
|
end=None, |
|
duration_stride=1.0, |
|
duration_max=None, |
|
duration_limit=100.0, |
|
filter_long=False, |
|
): |
|
""" |
|
Given a list of files, segment each file and write them to manifest with restrictions. |
|
Args: |
|
out_dir: directory of generated manifest |
|
files: list of files to be processed |
|
prefix: label of samples |
|
manifest_name: name of generated manifest |
|
start: beginning of audio of generating segment |
|
end: end of audio of generating segment |
|
duration_stride: stride for segmenting audio samples |
|
duration_max: duration for each segment |
|
duration_limit: duration threshold for filtering out long audio samples |
|
filter_long: boolean to determine whether to filter out long audio samples |
|
Returns: |
|
""" |
|
seg_num = 0 |
|
skip_num = 0 |
|
if duration_max is None: |
|
duration_max = 1e9 |
|
|
|
if not os.path.exists(out_dir): |
|
        logging.info(f'Output dir {out_dir} does not exist. Creating directory.')
|
os.mkdir(out_dir) |
|
|
|
output_path = os.path.join(out_dir, manifest_name + '.json') |
|
with open(output_path, 'w') as fout: |
|
for file in files: |
|
label = prefix |
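
            # Load the audio (resampled to 16 kHz); files that fail to load are skipped.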
|
|
|
try: |
|
x, _sr = librosa.load(file, sr=sr) |
|
duration = librosa.get_duration(y=x, sr=sr) |
|
|
|
except Exception: |
|
continue |
|
|
|
if filter_long and duration > duration_limit: |
|
skip_num += 1 |
|
continue |
|
|
|
offsets = [] |
|
durations = [] |
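
            # Slide a window of length duration_max across the clip with step
            # duration_stride; clips shorter than duration_max are skipped and
            # counted in skip_num.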
|
|
|
if duration > duration_max: |
|
current_offset = start |
|
|
|
while current_offset < duration: |
|
if end is not None and current_offset > end: |
|
break |
|
|
|
difference = duration - current_offset |
|
|
|
if difference < duration_max: |
|
break |
|
|
|
offsets.append(current_offset) |
|
durations.append(duration_max) |
|
|
|
current_offset += duration_stride |
|
|
|
            else:
                # Clip is not longer than duration_max; no segments are taken.
                skip_num += 1
|
|
|
for duration, offset in zip(durations, offsets): |
|
metadata = { |
|
'audio_filepath': file, |
|
'duration': duration, |
|
'label': label, |
|
'text': '_', |
|
'offset': offset, |
|
} |
|
json.dump(metadata, fout) |
|
fout.write('\n') |
|
fout.flush() |
|
seg_num += 1 |
|
return skip_num, seg_num, output_path |
|
|
|
|
|
def load_list_write_manifest( |
|
data_dir, |
|
out_dir, |
|
filename, |
|
prefix, |
|
start, |
|
end, |
|
duration_stride=1.0, |
|
duration_max=1.0, |
|
duration_limit=100.0, |
|
filter_long=True, |
|
): |
|
|
|
filename = prefix + '_' + filename |
|
file_path = os.path.join(data_dir, filename) |
|
|
|
with open(file_path, 'r') as allfile: |
|
files = allfile.read().splitlines() |
|
|
|
manifest_name = filename.split('_list.txt')[0] + '_manifest' |
|
skip_num, seg_num, output_path = write_manifest( |
|
out_dir, |
|
files, |
|
prefix, |
|
manifest_name, |
|
start, |
|
end, |
|
duration_stride, |
|
duration_max, |
|
duration_limit, |
|
filter_long=True, |
|
) |
|
return skip_num, seg_num, output_path |
|
|
|
|
|
def rebalance_json(data_dir, data_json, num, prefix): |
|
data = [] |
|
seg = 0 |
|
with open(data_json, 'r') as f: |
|
for line in f: |
|
data.append(json.loads(line)) |
|
|
|
filename = data_json.split('/')[-1] |
|
fout_path = os.path.join(data_dir, prefix + "_" + filename) |
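
    # Undersample (replace=False) when enough samples exist; otherwise
    # oversample with replacement to reach the requested count.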
|
|
|
if len(data) >= num: |
|
selected_sample = np.random.choice(data, num, replace=False) |
|
else: |
|
selected_sample = np.random.choice(data, num, replace=True) |
|
|
|
with open(fout_path, 'a') as fout: |
|
for i in selected_sample: |
|
seg += 1 |
|
json.dump(i, fout) |
|
fout.write('\n') |
|
fout.flush() |
|
|
|
    logging.info(f'Wrote {seg}/{num} samples to {fout_path} from {data_json}')
|
return fout_path |
|
|
|
|
|
def generate_variety_noise(data_dir, filename, prefix): |
|
|
|
curr_dir = data_dir.split("_background_noise_")[0] |
|
silence_path = os.path.join(curr_dir, "_background_noise_more") |
|
|
|
if not os.path.exists(silence_path): |
|
os.mkdir(silence_path) |
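
    # Take one 1 s slice every silence_stride * 100 = 100,000 samples
    # (6.25 s at 16 kHz) from each background noise file.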
|
|
|
silence_stride = 1000 |
|
sampling_rate = 16000 |
|
|
|
silence_files = [] |
|
rng = np.random.RandomState(0) |
|
|
|
filename = prefix + '_' + filename |
|
file_path = os.path.join(data_dir, filename) |
|
|
|
with open(file_path, 'r') as allfile: |
|
files = allfile.read().splitlines() |
|
|
|
for file in files: |
|
y, sr = librosa.load(path=file, sr=sampling_rate) |
|
|
|
        for i in range(0, len(y) - sampling_rate, silence_stride * 100):
|
file_name = "{}_{}.wav".format(file.split("/")[-1], i) |
|
y_slice = y[i : i + sampling_rate] |
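            # Scale the slice by a random gain in [0, 1) to vary noise loudness.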
|
magnitude = rng.uniform(0.0, 1.0) |
|
y_slice *= magnitude |
|
out_file_path = os.path.join(silence_path, file_name) |
|
sf.write(out_file_path, y_slice, sr) |
|
|
|
silence_files.append(out_file_path) |
|
|
|
new_list_file = os.path.join(silence_path, filename) |
|
with open(new_list_file, "w") as outfile: |
|
outfile.write("\n".join(silence_files)) |
|
|
|
logging.info(f"Generate {len(out_file_path)} background files for {file_path}. => {new_list_file} !") |
|
return len(silence_files) |
|
|
|
|
|
def main(): |
|
    parser = argparse.ArgumentParser(description='Speech and background data download and preprocess')
|
parser.add_argument("--out_dir", required=False, default='./manifest/', type=str) |
|
parser.add_argument("--speech_data_root", required=True, default=None, type=str) |
|
parser.add_argument("--background_data_root", required=True, default=None, type=str) |
|
parser.add_argument('--test_size', required=False, default=0.1, type=float) |
|
parser.add_argument('--val_size', required=False, default=0.1, type=float) |
|
parser.add_argument('--window_length_in_sec', required=False, default=0.63, type=float) |
|
parser.add_argument('--log', required=False, action='store_true') |
|
parser.add_argument('--rebalance_method', required=False, default=None, type=str) |
|
parser.add_argument('--demo', required=False, action='store_true') |
|
    parser.set_defaults(log=False)
|
args = parser.parse_args() |
|
|
|
    if not args.rebalance_method:
        rebalance = False
    elif args.rebalance_method not in ('under', 'over', 'fixed'):
        raise ValueError("Please select a valid sampling method: over/under/fixed.")
    else:
        rebalance = True
|
|
|
if args.log: |
|
logging.basicConfig(level=logging.DEBUG) |
|
|
|
|
|
speech_data_root = args.speech_data_root |
|
data_set = "google_speech_recognition_v2" |
|
speech_data_folder = os.path.join(speech_data_root, data_set) |
|
|
|
background_data_folder = args.background_data_root |
|
logging.info(f"Working on: {data_set}") |
|
|
|
|
|
if not os.path.exists(speech_data_folder): |
|
file_path = os.path.join(speech_data_root, data_set + ".tar.bz2") |
|
logging.info(f"Getting {data_set}") |
|
__maybe_download_file(file_path, URL) |
|
logging.info(f"Extracting {data_set}") |
|
__extract_all_files(file_path, speech_data_root, speech_data_folder) |
|
|
|
logging.info(f"Split speech data!") |
|
|
|
split_train_val_test(speech_data_folder, "speech", args.test_size, args.val_size, args.demo) |
|
|
|
logging.info(f"Split background data!") |
|
split_train_val_test(background_data_folder, "background", args.test_size, args.val_size) |
|
|
|
out_dir = args.out_dir |
|
|
|
|
|
logging.info(f"=== Write speech data to manifest!") |
|
skip_num_val, speech_seg_num_val, speech_val = load_list_write_manifest( |
|
speech_data_folder, |
|
out_dir, |
|
'validation_list.txt', |
|
'speech', |
|
0.2, |
|
0.8, |
|
args.window_length_in_sec, |
|
args.window_length_in_sec, |
|
) |
|
skip_num_test, speech_seg_num_test, speech_test = load_list_write_manifest( |
|
speech_data_folder, out_dir, 'testing_list.txt', 'speech', 0.2, 0.8, 0.01, args.window_length_in_sec |
|
) |
|
skip_num_train, speech_seg_num_train, speech_train = load_list_write_manifest( |
|
speech_data_folder, |
|
out_dir, |
|
'training_list.txt', |
|
'speech', |
|
0.2, |
|
0.8, |
|
args.window_length_in_sec, |
|
args.window_length_in_sec, |
|
) |
|
|
|
logging.info(f'Val: Skip {skip_num_val} samples. Get {speech_seg_num_val} segments! => {speech_val} ') |
|
logging.info(f'Test: Skip {skip_num_test} samples. Get {speech_seg_num_test} segments! => {speech_test}') |
|
    logging.info(f'Train: Skip {skip_num_train} samples. Get {speech_seg_num_train} segments! => {speech_train}')
|
|
|
|
|
|
|
if args.demo: |
|
logging.info("Start generating more background noise data") |
|
generate_variety_noise(background_data_folder, 'validation_list.txt', 'background') |
|
generate_variety_noise(background_data_folder, 'training_list.txt', 'background') |
|
generate_variety_noise(background_data_folder, 'testing_list.txt', 'background') |
|
background_data_folder = os.path.join( |
|
background_data_folder.split("_background_noise_")[0], "_background_noise_more" |
|
) |
|
|
|
logging.info(f"=== Write background data to manifest!") |
|
skip_num_val, background_seg_num_val, background_val = load_list_write_manifest( |
|
background_data_folder, out_dir, 'validation_list.txt', 'background', 0, None, 0.15, args.window_length_in_sec |
|
) |
|
skip_num_test, background_seg_num_test, background_test = load_list_write_manifest( |
|
background_data_folder, out_dir, 'testing_list.txt', 'background', 0, None, 0.01, args.window_length_in_sec |
|
) |
|
skip_num_train, background_seg_num_train, background_train = load_list_write_manifest( |
|
background_data_folder, out_dir, 'training_list.txt', 'background', 0, None, 0.15, args.window_length_in_sec |
|
) |
|
|
|
logging.info(f'Val: Skip {skip_num_val} samples. Get {background_seg_num_val} segments! => {background_val}') |
|
logging.info(f'Test: Skip {skip_num_test} samples. Get {background_seg_num_test} segments! => {background_test}') |
|
logging.info( |
|
f'Train: Skip {skip_num_train} samples. Get {background_seg_num_train} segments! => {background_train}' |
|
) |
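
    # Per-split min/max of the speech vs. background segment counts, used as
    # targets for under-/over-sampling below.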
|
min_val, max_val = min(speech_seg_num_val, background_seg_num_val), max(speech_seg_num_val, background_seg_num_val) |
|
min_test, max_test = ( |
|
min(speech_seg_num_test, background_seg_num_test), |
|
max(speech_seg_num_test, background_seg_num_test), |
|
) |
|
min_train, max_train = ( |
|
min(speech_seg_num_train, background_seg_num_train), |
|
max(speech_seg_num_train, background_seg_num_train), |
|
) |
|
|
|
    logging.info('Finished generating manifests!')
|
|
|
if rebalance: |
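
        # 'under' matches the smaller class count, 'over' matches the larger
        # one, and 'fixed' uses preset per-split counts for both classes.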
|
|
|
|
|
if args.rebalance_method == 'under': |
|
logging.info(f"Rebalancing number of samples in classes using {args.rebalance_method} sampling.") |
|
logging.info(f'Val: {min_val} Test: {min_test} Train: {min_train}!') |
|
|
|
rebalance_json(out_dir, background_val, min_val, 'balanced') |
|
rebalance_json(out_dir, background_test, min_test, 'balanced') |
|
rebalance_json(out_dir, background_train, min_train, 'balanced') |
|
|
|
rebalance_json(out_dir, speech_val, min_val, 'balanced') |
|
rebalance_json(out_dir, speech_test, min_test, 'balanced') |
|
rebalance_json(out_dir, speech_train, min_train, 'balanced') |
|
|
|
if args.rebalance_method == 'over': |
|
logging.info(f"Rebalancing number of samples in classes using {args.rebalance_method} sampling.") |
|
logging.info(f'Val: {max_val} Test: {max_test} Train: {max_train}!') |
|
|
|
rebalance_json(out_dir, background_val, max_val, 'balanced') |
|
rebalance_json(out_dir, background_test, max_test, 'balanced') |
|
rebalance_json(out_dir, background_train, max_train, 'balanced') |
|
|
|
rebalance_json(out_dir, speech_val, max_val, 'balanced') |
|
rebalance_json(out_dir, speech_test, max_test, 'balanced') |
|
rebalance_json(out_dir, speech_train, max_train, 'balanced') |
|
|
|
if args.rebalance_method == 'fixed': |
|
fixed_test, fixed_val, fixed_train = 200, 100, 500 |
|
logging.info(f"Rebalancing number of samples in classes using {args.rebalance_method} sampling.") |
|
logging.info(f'Val: {fixed_val} Test: {fixed_test} Train: {fixed_train}!') |
|
|
|
rebalance_json(out_dir, background_val, fixed_val, 'balanced') |
|
rebalance_json(out_dir, background_test, fixed_test, 'balanced') |
|
rebalance_json(out_dir, background_train, fixed_train, 'balanced') |
|
|
|
rebalance_json(out_dir, speech_val, fixed_val, 'balanced') |
|
rebalance_json(out_dir, speech_test, fixed_test, 'balanced') |
|
rebalance_json(out_dir, speech_train, fixed_train, 'balanced') |
|
else: |
|
logging.info("Don't rebalance number of samples in classes.") |
|
|
|
|
|
if __name__ == '__main__': |
|
main() |
|
|