|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
This script is used to preprocess audio before TTS model training. |
|
|
|
It can be configured to do several processing steps such as silence trimming, volume normalization, |
|
and duration filtering. |
|
|
|
These can be done separately through multiple executions of the script, or all at once to avoid saving |
|
too many copies of the same audio. |
|
|
|
Most of these can also be done by the TTS data loader at training time, but doing them ahead of time |
|
lets us implement more complex processing, validate the correctness of the output, and save on compute time. |
|
|
|
$ python <nemo_root_path>/scripts/dataset_processing/tts/preprocess_audio.py \ |
|
--input_manifest="<data_root_path>/manifest.json" \ |
|
--output_manifest="<data_root_path>/manifest_processed.json" \ |
|
--input_audio_dir="<data_root_path>/audio" \ |
|
--output_audio_dir="<data_root_path>/audio_processed" \ |
|
--num_workers=1 \ |
|
--trim_config_path="<nemo_root_path>/examples/tts/conf/trim/energy.yaml" \ |
|
--output_sample_rate=22050 \ |
|
--output_format=flac \ |
|
--volume_level=0.95 \ |
|
--min_duration=0.5 \ |
|
--max_duration=20.0 \ |
|
--filter_file="filtered.txt" |
|
""" |
|
|
|
import argparse |
|
import os |
|
from pathlib import Path |
|
from typing import Tuple |
|
|
|
import librosa |
|
import soundfile as sf |
|
from hydra.utils import instantiate |
|
from joblib import Parallel, delayed |
|
from omegaconf import OmegaConf |
|
from tqdm import tqdm |
|
|
|
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest |
|
from nemo.collections.tts.parts.preprocessing.audio_trimming import AudioTrimmer |
|
from nemo.collections.tts.parts.utils.tts_dataset_utils import get_abs_rel_paths, normalize_volume |
|
from nemo.utils import logging |
|
|
|
|
|
def get_args(): |
|
parser = argparse.ArgumentParser( |
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Compute speaker level pitch statistics.", |
|
) |
|
parser.add_argument( |
|
"--input_manifest", required=True, type=Path, help="Path to input training manifest.", |
|
) |
|
parser.add_argument( |
|
"--input_audio_dir", required=True, type=Path, help="Path to base directory with audio files.", |
|
) |
|
parser.add_argument( |
|
"--output_manifest", required=True, type=Path, help="Path to output training manifest with processed audio.", |
|
) |
|
parser.add_argument( |
|
"--output_audio_dir", required=True, type=Path, help="Path to output directory for audio files.", |
|
) |
|
parser.add_argument( |
|
"--overwrite_audio", |
|
action=argparse.BooleanOptionalAction, |
|
help="Whether to reprocess and overwrite existing audio files in output_audio_dir.", |
|
) |
|
parser.add_argument( |
|
"--overwrite_manifest", |
|
action=argparse.BooleanOptionalAction, |
|
help="Whether to overwrite the output manifest file if it exists.", |
|
) |
|
parser.add_argument( |
|
"--num_workers", default=1, type=int, help="Number of parallel threads to use. If -1 all CPUs are used." |
|
) |
|
parser.add_argument( |
|
"--trim_config_path", |
|
required=False, |
|
type=Path, |
|
help="Path to config file for nemo.collections.tts.data.audio_trimming.AudioTrimmer", |
|
) |
|
parser.add_argument( |
|
"--max_entries", default=0, type=int, help="If provided, maximum number of entries in the manifest to process." |
|
) |
|
parser.add_argument( |
|
"--output_sample_rate", default=0, type=int, help="If provided, rate to resample the audio to." |
|
) |
|
parser.add_argument( |
|
"--output_format", |
|
default="wav", |
|
type=str, |
|
help="If provided, format output audio will be saved as. If not provided, will keep original format.", |
|
) |
|
parser.add_argument( |
|
"--volume_level", default=0.0, type=float, help="If provided, peak volume to normalize audio to." |
|
) |
|
parser.add_argument( |
|
"--min_duration", default=0.0, type=float, help="If provided, filter out utterances shorter than min_duration." |
|
) |
|
parser.add_argument( |
|
"--max_duration", default=0.0, type=float, help="If provided, filter out utterances longer than max_duration." |
|
) |
|
parser.add_argument( |
|
"--filter_file", |
|
required=False, |
|
type=Path, |
|
help="If provided, output filter_file will contain list of " "utterances filtered out.", |
|
) |
|
args = parser.parse_args() |
|
return args |
|
|
|
|
|
def _process_entry( |
|
entry: dict, |
|
input_audio_dir: Path, |
|
output_audio_dir: Path, |
|
overwrite_audio: bool, |
|
audio_trimmer: AudioTrimmer, |
|
output_sample_rate: int, |
|
output_format: str, |
|
volume_level: float, |
|
) -> Tuple[dict, float, float]: |
|
audio_filepath = Path(entry["audio_filepath"]) |
|
|
|
audio_path, audio_path_rel = get_abs_rel_paths(input_path=audio_filepath, base_path=input_audio_dir) |
|
|
|
if not output_format: |
|
output_format = audio_path.suffix |
|
|
|
output_path = output_audio_dir / audio_path_rel |
|
output_path = output_path.with_suffix(output_format) |
|
output_path.parent.mkdir(exist_ok=True, parents=True) |
|
|
|
if output_path.exists() and not overwrite_audio: |
|
original_duration = librosa.get_duration(path=audio_path) |
|
output_duration = librosa.get_duration(path=output_path) |
|
else: |
|
audio, sample_rate = librosa.load(audio_path, sr=None) |
|
original_duration = librosa.get_duration(y=audio, sr=sample_rate) |
|
if audio_trimmer is not None: |
|
audio, start_i, end_i = audio_trimmer.trim_audio( |
|
audio=audio, sample_rate=int(sample_rate), audio_id=str(audio_path) |
|
) |
|
|
|
if output_sample_rate: |
|
audio = librosa.resample(y=audio, orig_sr=sample_rate, target_sr=output_sample_rate) |
|
sample_rate = output_sample_rate |
|
|
|
if volume_level: |
|
audio = normalize_volume(audio, volume_level=volume_level) |
|
|
|
if audio.size > 0: |
|
sf.write(file=output_path, data=audio, samplerate=sample_rate) |
|
output_duration = librosa.get_duration(y=audio, sr=sample_rate) |
|
else: |
|
output_duration = 0.0 |
|
|
|
entry["duration"] = round(output_duration, 2) |
|
|
|
if os.path.isabs(audio_filepath): |
|
entry["audio_filepath"] = str(output_path) |
|
else: |
|
output_filepath = audio_path_rel.with_suffix(output_format) |
|
entry["audio_filepath"] = str(output_filepath) |
|
|
|
return entry, original_duration, output_duration |
|
|
|
|
|
def main(): |
|
args = get_args() |
|
|
|
input_manifest_path = args.input_manifest |
|
output_manifest_path = args.output_manifest |
|
input_audio_dir = args.input_audio_dir |
|
output_audio_dir = args.output_audio_dir |
|
overwrite_audio = args.overwrite_audio |
|
overwrite_manifest = args.overwrite_manifest |
|
num_workers = args.num_workers |
|
max_entries = args.max_entries |
|
output_sample_rate = args.output_sample_rate |
|
output_format = args.output_format |
|
volume_level = args.volume_level |
|
min_duration = args.min_duration |
|
max_duration = args.max_duration |
|
filter_file = args.filter_file |
|
|
|
if output_manifest_path.exists(): |
|
if overwrite_manifest: |
|
print(f"Will overwrite existing manifest path: {output_manifest_path}") |
|
else: |
|
raise ValueError(f"Manifest path already exists: {output_manifest_path}") |
|
|
|
if args.trim_config_path: |
|
audio_trimmer_config = OmegaConf.load(args.trim_config_path) |
|
audio_trimmer = instantiate(audio_trimmer_config) |
|
else: |
|
audio_trimmer = None |
|
|
|
if output_format: |
|
if output_format.upper() not in sf.available_formats(): |
|
raise ValueError(f"Unsupported output audio format: {output_format}") |
|
output_format = f".{output_format}" |
|
|
|
output_audio_dir.mkdir(exist_ok=True, parents=True) |
|
|
|
entries = read_manifest(input_manifest_path) |
|
if max_entries: |
|
entries = entries[:max_entries] |
|
|
|
|
|
job_outputs = Parallel(n_jobs=num_workers, backend='threading')( |
|
delayed(_process_entry)( |
|
entry=entry, |
|
input_audio_dir=input_audio_dir, |
|
output_audio_dir=output_audio_dir, |
|
overwrite_audio=overwrite_audio, |
|
audio_trimmer=audio_trimmer, |
|
output_sample_rate=output_sample_rate, |
|
output_format=output_format, |
|
volume_level=volume_level, |
|
) |
|
for entry in tqdm(entries) |
|
) |
|
|
|
output_entries = [] |
|
filtered_entries = [] |
|
original_durations = 0.0 |
|
output_durations = 0.0 |
|
for output_entry, original_duration, output_duration in job_outputs: |
|
original_durations += original_duration |
|
|
|
if ( |
|
output_duration == 0.0 |
|
or (min_duration and output_duration < min_duration) |
|
or (max_duration and output_duration > max_duration) |
|
): |
|
if output_duration != original_duration: |
|
output_entry["original_duration"] = original_duration |
|
filtered_entries.append(output_entry) |
|
continue |
|
|
|
output_durations += output_duration |
|
output_entries.append(output_entry) |
|
|
|
write_manifest(output_path=output_manifest_path, target_manifest=output_entries, ensure_ascii=False) |
|
if filter_file: |
|
write_manifest(output_path=str(filter_file), target_manifest=filtered_entries, ensure_ascii=False) |
|
|
|
logging.info(f"Duration of original audio: {original_durations / 3600:.2f} hours") |
|
logging.info(f"Duration of processed audio: {output_durations / 3600:.2f} hours") |
|
|
|
|
|
if __name__ == "__main__": |
|
main() |
|
|