Spaces:
Sleeping
Sleeping
File size: 1,677 Bytes
41b9d24 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
from pathlib import Path
import random
import shutil
import os
import json
import argbind
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map
from audiotools.core import util
@argbind.bind(without_prefix=True)
def train_test_split(
    audio_folder: str = ".",
    test_size: float = 0.2,
    seed: int = 42,
):
    """Split the audio files under a folder into train and test sets.

    After an interactive confirmation, each file is symlinked into a sibling
    directory named ``<audio_folder name>-train`` or ``<audio_folder name>-test``
    and the list of files in each split is written to ``train.json`` /
    ``test.json`` inside ``audio_folder``.

    Args:
        audio_folder: Folder that is searched for audio files.
        test_size: Fraction of the files assigned to the test split; must lie
            in the closed interval [0, 1].
        seed: Seed for the random shuffle that decides the split.

    Raises:
        ValueError: If ``test_size`` is outside [0, 1].
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    print("finding audio")
    audio_folder = Path(audio_folder)
    # The ordering returned by find_audio is not visible from here; sort so
    # the seeded shuffle below yields the same split regardless of it.
    audio_files = sorted(util.find_audio(audio_folder), key=str)
    print(f"found {len(audio_files)} audio files")

    # split according to test_size
    n_test = int(len(audio_files) * test_size)
    n_train = len(audio_files) - n_test

    # shuffle
    random.seed(seed)
    random.shuffle(audio_files)
    train_files = audio_files[:n_train]
    test_files = audio_files[n_train:]

    print(f"Train files: {len(train_files)}")
    print(f"Test files: {len(test_files)}")
    continue_ = input("Continue [yn]? ") or "n"
    if continue_ != "y":
        return

    # Normalise without following symlinks so that "." (whose .name is empty)
    # still produces "<folder>-train" / "<folder>-test" next to the folder.
    abs_folder = Path(os.path.abspath(audio_folder))

    for split, files in (("train", train_files), ("test", test_files)):
        out_dir = abs_folder.parent / f"{abs_folder.name}-{split}"
        out_dir.mkdir(exist_ok=True, parents=True)

        for file in tqdm(files):
            out_file = out_dir / Path(file).name
            try:
                # A relative link target is resolved relative to the link's
                # own directory, so always point at the absolute source path.
                os.symlink(os.path.abspath(file), out_file)
            except FileExistsError:
                print(f"File {out_file} already exists, skipping")

        # save split as json
        with open(audio_folder / f"{split}.json", "w") as json_file:
            json.dump([str(path) for path in files], json_file)
if __name__ == "__main__":
    # Parse the command-line flags and run the split with them in scope, so
    # the argbind-bound function picks up its arguments from the CLI.
    with argbind.scope(argbind.parse_args()):
        train_test_split()