# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Download the AMI test dataset used to evaluate Speaker Diarization
# More information here: https://groups.inf.ed.ac.uk/ami/corpus/
# USAGE: python get_ami_data.py

import argparse
import os
import subprocess

from nemo.collections.asr.parts.utils.manifest_utils import create_manifest

rttm_url = "https://raw.githubusercontent.com/BUTSpeechFIT/AMI-diarization-setup/main/only_words/rttms/{}/{}.rttm"
uem_url = "https://raw.githubusercontent.com/BUTSpeechFIT/AMI-diarization-setup/main/uems/{}/{}.uem"
list_url = "https://raw.githubusercontent.com/BUTSpeechFIT/AMI-diarization-setup/main/lists/{}.meetings.txt"

audio_types = ['Mix-Headset', 'Array1-01']

# these two IDs in the train set are missing download links for Array1-01.
# We exclude them as a result.
not_found_ids = ['IS1007d', 'IS1003b']

# Audio download template: {0} is the meeting ID, {1} is the audio type.
audio_url = "https://groups.inf.ed.ac.uk/ami/AMICorpusMirror//amicorpus/{0}/audio/{0}.{1}.wav"


def download(url, dest_dir):
    """Download ``url`` into ``dest_dir`` using wget.

    The command is passed as an argument list (no shell), so a destination
    directory containing spaces or shell metacharacters is taken literally.
    ``dest_dir`` is created if it does not exist.

    Args:
        url: address of the file to fetch.
        dest_dir: directory handed to wget's ``-P`` option.

    Returns:
        True when wget exits with status 0, False otherwise. A failed
        download is reported but not raised, so the remaining files are
        still attempted.

    Raises:
        RuntimeError: if the ``wget`` executable cannot be found.
    """
    os.makedirs(dest_dir, exist_ok=True)
    try:
        result = subprocess.run(["wget", "-P", dest_dir, url], check=False)
    except FileNotFoundError as err:
        raise RuntimeError("wget was not found on PATH; it is required to download the AMI corpus") from err
    if result.returncode != 0:
        print(f"WARNING: wget exited with status {result.returncode} for {url}")
    return result.returncode == 0


def read_meeting_ids(list_path, excluded=()):
    """Return the meeting IDs listed one per line in ``list_path``.

    Surrounding whitespace is stripped, blank lines are dropped (an empty
    file yields an empty list) and any ID contained in ``excluded`` is
    skipped.
    """
    with open(list_path) as f:
        stripped = (line.strip() for line in f)
        return [meeting_id for meeting_id in stripped if meeting_id and meeting_id not in excluded]


def write_file_list(src_dir, list_path, suffix):
    """Write the paths of the files in ``src_dir`` ending in ``suffix`` to ``list_path``.

    Entries are written one per line in sorted order so the output is
    reproducible. Filtering on ``suffix`` keeps out stray files such as the
    ``name.ext.1`` copies wget creates when a file is downloaded twice.
    ``src_dir`` is created when missing, so an empty list is written rather
    than raising.

    Returns:
        ``list_path``, for convenience.
    """
    os.makedirs(src_dir, exist_ok=True)
    names = sorted(name for name in os.listdir(src_dir) if name.endswith(suffix))
    with open(list_path, 'w') as f:
        f.write('\n'.join(os.path.join(src_dir, name) for name in names))
    return list_path


def manifest_path_for(manifest_path, audio_type):
    """Return ``manifest_path`` with ``.<audio_type>`` inserted before its extension.

    Only the final extension is touched, so a ``.json`` appearing elsewhere
    in the path is left alone, and a path without an extension still gets a
    distinct name per audio type.
    """
    root, ext = os.path.splitext(manifest_path)
    return f"{root}.{audio_type}{ext}"


def main():
    """Download the AMI test/dev/train splits and build one manifest per audio type.

    For each split this fetches the meeting list, then the audio (for every
    entry of ``audio_types``), RTTM and UEM files of each listed meeting not
    in ``not_found_ids``, writes file lists under the split directory, and
    calls ``create_manifest`` once per audio type.
    """
    parser = argparse.ArgumentParser(description="Download the AMI Corpus Dataset for Speaker Diarization")
    parser.add_argument(
        "--test_manifest_filepath",
        help="path to output test manifest file",
        type=str,
        default='AMI_test_manifest.json',
    )
    parser.add_argument(
        "--dev_manifest_filepath",
        help="path to output dev manifest file",
        type=str,
        default='AMI_dev_manifest.json',
    )
    parser.add_argument(
        "--train_manifest_filepath",
        help="path to output train manifest file",
        type=str,
        default='AMI_train_manifest.json',
    )
    parser.add_argument("--data_root", help="path to output data directory", type=str, default="ami_dataset")
    args = parser.parse_args()

    data_path = os.path.abspath(args.data_root)
    os.makedirs(data_path, exist_ok=True)

    for manifest_path, split in (
        (args.test_manifest_filepath, 'test'),
        (args.dev_manifest_filepath, 'dev'),
        (args.train_manifest_filepath, 'train'),
    ):
        split_path = os.path.join(data_path, split)
        audio_path = os.path.join(split_path, "audio")
        rttm_path = os.path.join(split_path, "rttm")
        uem_path = os.path.join(split_path, "uem")
        os.makedirs(split_path, exist_ok=True)

        download(list_url.format(split), split_path)
        meeting_ids = read_meeting_ids(os.path.join(split_path, f"{split}.meetings.txt"), not_found_ids)

        for meeting_id in meeting_ids:
            for audio_type in audio_types:
                download(audio_url.format(meeting_id, audio_type), os.path.join(audio_path, audio_type))
            download(rttm_url.format(split, meeting_id), rttm_path)
            download(uem_url.format(split, meeting_id), uem_path)

        rttm_files_path = write_file_list(rttm_path, os.path.join(split_path, 'rttm_files.txt'), '.rttm')
        uem_files_path = write_file_list(uem_path, os.path.join(split_path, 'uem_files.txt'), '.uem')

        for audio_type in audio_types:
            audio_files_path = write_file_list(
                os.path.join(audio_path, audio_type),
                os.path.join(split_path, f'audio_files_{audio_type}.txt'),
                '.wav',
            )
            create_manifest(
                audio_files_path,
                manifest_path_for(manifest_path, audio_type),
                rttm_path=rttm_files_path,
                uem_path=uem_files_path,
            )


if __name__ == "__main__":
    main()