from __future__ import annotations

import warnings
from logging import getLogger
from pathlib import Path
from typing import Iterable

import librosa
import soundfile
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib

from .preprocess_utils import check_hubert_min_duration

LOG = getLogger(__name__)

# input_dir and output_dir exist.
# Convert the audio files in input_dir into audio files in output_dir,
# keeping one sub-folder per speaker. joblib is used to parallelize.
# Converting an audio file includes:
# - resampling to the specified sampling rate
# - trimming silence
# - adjusting volume (peak normalization)
# - saving as a 16-bit wav file


def _get_unique_filename(path: Path, existing_paths: Iterable[Path]) -> Path:
    """Return a path that is not in ``existing_paths``.

    ``path`` itself is returned when it is free. Otherwise ``_1``, ``_2``, ...
    is appended to the stem until an unused name is found.

    ``existing_paths`` is tested with ``in`` repeatedly, so pass a re-usable
    container (ideally a ``set``), not a one-shot iterator.
    """
    if path not in existing_paths:
        return path
    i = 1
    while True:
        new_path = path.parent / f"{path.stem}_{i}{path.suffix}"
        if new_path not in existing_paths:
            return new_path
        i += 1


def is_relative_to(path: Path, *other: str | Path) -> bool:
    """Return True if ``path`` is relative to ``other``, else False.

    Python 3.9+ has ``Path.is_relative_to()``, but Python 3.8 must be
    supported, so this wraps ``Path.relative_to()`` instead.
    """
    try:
        path.relative_to(*other)
    except ValueError:
        return False
    return True


def _preprocess_one(
    input_path: Path,
    output_path: Path,
    sr: int,
    *,
    top_db: int,
    frame_seconds: float,
    hop_seconds: float,
) -> None:
    """Preprocess one audio file and write it as a 16-bit PCM wav.

    The file is loaded as mono at ``sr`` Hz, peak-normalized, trimmed with
    ``librosa.effects.trim`` and written to ``output_path``.

    Nothing is written (and a message is logged) when the file cannot be
    loaded, when it is silent, or when ``check_hubert_min_duration`` rejects
    it either before or after trimming.

    Args:
        input_path: File to read.
        output_path: Destination wav file; its parent directory must exist.
        sr: Target sampling rate in Hz.
        top_db: Threshold passed to ``librosa.effects.trim``.
        frame_seconds: Analysis frame length for trimming, in seconds.
        hop_seconds: Hop length for trimming, in seconds.
    """
    try:
        audio, sr = librosa.load(input_path, sr=sr, mono=True)
    # Audioread is the last backend librosa will attempt, so whatever it
    # raises is what surfaces here; the exact type is not fixed.
    except Exception as e:
        # Most likely the file is not audio at all, so skip it.
        LOG.warning(f"Failed to load {input_path} due to {e}")
        return

    if not check_hubert_min_duration(audio, sr):
        LOG.info(f"Skip {input_path} because it is too short.")
        return

    # Adjust volume (peak normalization).
    # An empty or all-zero clip has no usable peak; dividing by it would
    # raise or fill the output with NaN. ``not peak > 0`` also catches NaN.
    peak = max(audio.max(), -audio.min()) if audio.size else 0.0
    if not peak > 0:
        LOG.info(f"Skip {input_path} because it is silent.")
        return
    audio /= peak

    # Trim silence
    audio, _ = librosa.effects.trim(
        audio,
        top_db=top_db,
        frame_length=int(frame_seconds * sr),
        hop_length=int(hop_seconds * sr),
    )

    # Trimming may have made the clip too short.
    if not check_hubert_min_duration(audio, sr):
        LOG.info(f"Skip {input_path} because it is too short.")
        return

    soundfile.write(output_path, audio, samplerate=sr, subtype="PCM_16")


def preprocess_resample(
    input_dir: Path | str,
    output_dir: Path | str,
    sampling_rate: int,
    n_jobs: int = -1,
    *,
    top_db: int = 30,
    frame_seconds: float = 0.1,
    hop_seconds: float = 0.05,
) -> None:
    """Preprocess audio files in input_dir and save them to output_dir.

    Every path under ``input_dir`` matching ``*.*`` that sits inside at least
    one sub-folder is processed. The first folder component is used as the
    speaker name and the output is written to
    ``output_dir/<speaker>/<stem>.wav``. Name collisions (e.g. files with the
    same stem in nested folders) are resolved by appending ``_1``, ``_2``, ...

    Files located directly in ``input_dir`` (no speaker folder) are ignored.

    Args:
        input_dir: Root folder of the raw dataset.
        output_dir: Root folder for the processed dataset.
        sampling_rate: Target sampling rate in Hz.
        n_jobs: Number of joblib workers (``-1`` uses all cores).
        top_db: Silence threshold forwarded to ``librosa.effects.trim``.
        frame_seconds: Trim analysis frame length in seconds.
        hop_seconds: Trim hop length in seconds.

    Raises:
        ValueError: If no candidate file is found under ``input_dir``.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)

    in_paths = list(input_dir.rglob("*.*"))
    if not in_paths:
        raise ValueError(f"No audio files found in {input_dir}")

    legacy_root = Path("dataset_raw") / "44k"
    # Inputs and outputs are collected as pairs so that skipped inputs can
    # never shift the pairing between an input and its output path.
    in_and_out_paths: list[tuple[Path, Path]] = []
    # Set of already-assigned outputs for O(1) collision checks.
    used_out_paths: set[Path] = set()

    for in_path in in_paths:
        in_path_relative = in_path.relative_to(input_dir)

        # Legacy layout: dataset_raw/44k/<speaker>/... Strip the "44k"
        # component only when it is really the leading component of the
        # relative path (it is not when input_dir itself is dataset_raw/44k).
        if (
            not in_path.is_absolute()
            and is_relative_to(in_path, legacy_root)
            and is_relative_to(in_path_relative, "44k")
        ):
            new_in_path_relative = in_path_relative.relative_to("44k")
            warnings.warn(
                f"Recommended folder structure has changed since v1.0.0. "
                "Please move your dataset directly under dataset_raw folder. "
                f"Recoginzed {in_path_relative} as {new_in_path_relative}"
            )
            in_path_relative = new_in_path_relative

        # A speaker folder is required: <speaker>/<file>.
        if len(in_path_relative.parts) < 2:
            continue

        speaker_name = in_path_relative.parts[0]
        file_name = in_path_relative.with_suffix(".wav").name
        out_path = _get_unique_filename(
            output_dir / speaker_name / file_name, used_out_paths
        )
        out_path.parent.mkdir(parents=True, exist_ok=True)
        used_out_paths.add(out_path)
        in_and_out_paths.append((in_path, out_path))

    with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
        Parallel(n_jobs=n_jobs)(
            delayed(_preprocess_one)(
                *args,
                sr=sampling_rate,
                top_db=top_db,
                frame_seconds=frame_seconds,
                hop_seconds=hop_seconds,
            )
            for args in in_and_out_paths
        )