|
import numpy as np |
|
import tqdm |
|
import matplotlib.pyplot as plt |
|
import os |
|
import shutil |
|
import wave |
|
|
|
# Minimum clip duration in seconds; check_duration() accepts only clips that
# are strictly longer than this.
WAV_MIN_LENGTH = 2

# Share of each directory's wav files to move, expressed as a percentage
# (main() divides it by 100). This is not an audio sampling rate.
SAMPLE_RATE = 1

# Lower and upper clamp on the number of files sampled from one directory.
SAMPLE_MIN = 2
SAMPLE_MAX = 10
|
|
|
|
|
|
|
def check_duration(wav_file, min_length=None):
    """Return True when a wav file is strictly longer than the minimum length.

    Args:
        wav_file: Path (or file object) that ``wave.open`` can read.
        min_length: Threshold in seconds. When omitted, the module-level
            ``WAV_MIN_LENGTH`` is used.

    Returns:
        bool: ``True`` if the clip lasts more than ``min_length`` seconds,
        ``False`` otherwise (including when the header reports a frame rate
        of zero, for which no duration can be computed).

    Raises:
        Whatever ``wave.open`` raises for a file it cannot parse
        (for example ``wave.Error`` or ``EOFError``).
    """
    if min_length is None:
        min_length = WAV_MIN_LENGTH

    # The context manager closes the file even if reading the header fails.
    with wave.open(wav_file, "rb") as f:
        frames = f.getnframes()
        rate = f.getframerate()

    # A zero frame rate would make the division below raise.
    if rate <= 0:
        return False

    return frames / float(rate) > min_length
|
|
|
|
|
def split_data(src_dir, dst_dir, ratio):
    """Move a random sample of wav files from ``src_dir`` into ``dst_dir``.

    The wav files directly inside ``src_dir`` are shuffled and the first
    ``int(len(files) * ratio)`` of them, clamped to the range
    ``[SAMPLE_MIN, SAMPLE_MAX]``, are selected. Selected files that
    ``check_duration`` rejects, or that cannot be read as wav files, are
    skipped with a message; they are not replaced by other candidates. The
    function then recurses into every subdirectory of ``src_dir``, mirroring
    the subdirectory name under ``dst_dir``.

    If ``src_dir`` holds no wav files and none of its immediate
    subdirectories holds one either, an error is printed and the function
    returns without recursing.

    Args:
        src_dir: Directory to sample from.
        dst_dir: Directory that receives the moved files; created if missing.
        ratio: Fraction of the wav files in each directory to move.

    Returns:
        None.
    """
    # List the source first so a missing or unreadable ``src_dir`` fails
    # before anything is created on the destination side.
    subdirs, files = [], []
    for item in os.listdir(src_dir):
        item_path = os.path.join(src_dir, item)
        if os.path.isdir(item_path):
            subdirs.append(item)
        elif os.path.isfile(item_path) and item.endswith(".wav"):
            files.append(item)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dst_dir, exist_ok=True)

    if not files:
        # Only the immediate subdirectories are inspected here.
        has_nested_wav = any(
            subitem.endswith(".wav")
            and os.path.isfile(os.path.join(src_dir, subdir, subitem))
            for subdir in subdirs
            for subitem in os.listdir(os.path.join(src_dir, subdir))
        )
        if not has_nested_wav:
            print(f"Error: No wav files found in {src_dir}")
            return

    num_files = int(len(files) * ratio)
    num_files = max(SAMPLE_MIN, min(SAMPLE_MAX, num_files))

    np.random.shuffle(files)
    selected_files = files[:num_files]

    # Wrapping the actual selection gives the bar a total that matches the
    # work to do (there may be fewer files than ``num_files``); the context
    # manager closes the bar even if a move fails. Progress counts examined
    # files, whether moved or skipped.
    with tqdm.tqdm(selected_files) as pbar:
        for name in pbar:
            src_file = os.path.join(src_dir, name)

            try:
                long_enough = check_duration(src_file)
            except (wave.Error, EOFError) as err:
                # One unreadable file should not abort a half-finished split.
                print(
                    f"Skipped {src_file} because it could not be read "
                    f"as a wav file: {err}"
                )
                continue

            if not long_enough:
                print(
                    f"Skipped {src_file} because its duration is less than "
                    f"{WAV_MIN_LENGTH} seconds."
                )
                continue

            shutil.move(src_file, os.path.join(dst_dir, name))

    for subdir in subdirs:
        split_data(
            os.path.join(src_dir, subdir),
            os.path.join(dst_dir, subdir),
            ratio,
        )
|
|
|
|
|
|
|
def main():
    """Move a sample of wav files from ``data/train/audio`` to ``data/val/audio``.

    Both directories are resolved relative to the current working directory.
    ``SAMPLE_RATE`` is read as a percentage and converted to a fraction
    before being handed to ``split_data``.
    """
    root_dir = os.path.abspath(".")

    # os.path.join instead of string concatenation with a hard-coded "/".
    src_dir = os.path.join(root_dir, "data", "train", "audio")
    dst_dir = os.path.join(root_dir, "data", "val", "audio")

    ratio = float(SAMPLE_RATE) / 100

    split_data(src_dir, dst_dir, ratio)
|
|
|
|
|
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()