File size: 4,580 Bytes
2d8da09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
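# Download the AMI test, dev and train splits used for Speaker Diarization
# and create one manifest file per split and audio type.
# More information here: https://groups.inf.ed.ac.uk/ami/corpus/
# USAGE: python get_ami_data.py [--data_root DIR] [--test_manifest_filepath PATH]
#        [--dev_manifest_filepath PATH] [--train_manifest_filepath PATH]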
import argparse
import os
import subprocess

from nemo.collections.asr.parts.utils.manifest_utils import create_manifest
# Common prefix of every annotation/list URL below (raw files of the
# BUTSpeechFIT AMI-diarization-setup repository).
_AMI_SETUP_ROOT = "https://raw.githubusercontent.com/BUTSpeechFIT/AMI-diarization-setup/main"

# URL templates, filled in with str.format():
#   list_url -> (split,)            rttm_url / uem_url -> (split, meeting_id)
list_url = _AMI_SETUP_ROOT + "/lists/{}.meetings.txt"
rttm_url = _AMI_SETUP_ROOT + "/only_words/rttms/{}/{}.rttm"
uem_url = _AMI_SETUP_ROOT + "/uems/{}/{}.uem"

# Audio variants downloaded for every meeting; one manifest is built per variant.
audio_types = ['Mix-Headset', 'Array1-01']

# These two IDs in the train set are missing download links for Array1-01,
# so they are skipped when downloading.
not_found_ids = ['IS1007d', 'IS1003b']
def download(url, dest_dir):
    """Fetch *url* into *dest_dir* with wget, keeping the URL's base name.

    The file is skipped when it is already present, so re-running the script
    does not create ``name.1`` duplicates. Data is first written to a
    ``.part`` file and renamed only after wget succeeds, so an interrupted
    transfer never leaves a truncated file under the final name.

    Downloads stay best-effort: a failure prints a warning and returns
    ``False`` instead of raising.

    Args:
        url: address of the file to download.
        dest_dir: directory that receives the file (created if missing).

    Returns:
        ``True`` if the file exists in *dest_dir* afterwards, else ``False``.
    """
    os.makedirs(dest_dir, exist_ok=True)
    target = os.path.join(dest_dir, os.path.basename(url))
    if os.path.exists(target):
        return True

    partial = f"{target}.part"
    # Argument list (no shell): paths containing spaces or shell
    # metacharacters are passed to wget verbatim.
    completed = subprocess.run(["wget", "-O", partial, url], check=False)
    if completed.returncode != 0:
        if os.path.exists(partial):
            os.remove(partial)
        print(f"WARNING: failed to download {url} (wget exit code {completed.returncode})")
        return False
    os.replace(partial, target)
    return True


def read_meeting_ids(list_path, excluded=()):
    """Return the meeting IDs stored one per line in *list_path*.

    Surrounding whitespace and blank lines are dropped, and so is every ID
    contained in *excluded*. An empty file yields an empty list.
    """
    skip = frozenset(excluded)
    with open(list_path, encoding="utf-8") as f:
        candidates = (line.strip() for line in f)
        return [meeting_id for meeting_id in candidates if meeting_id and meeting_id not in skip]


def write_file_list(list_path, directory, suffix=""):
    """Write the paths of the entries of *directory* to *list_path*, one per line.

    Entries are sorted so the output is reproducible. Only names ending in
    *suffix* are listed (the empty default keeps everything), which keeps
    leftovers such as ``*.part`` or ``*.wav.1`` out of the list.
    """
    names = sorted(name for name in os.listdir(directory) if name.endswith(suffix))
    with open(list_path, 'w', encoding="utf-8") as f:
        f.write('\n'.join(os.path.join(directory, name) for name in names))


def audio_manifest_path(manifest_path, audio_type):
    """Return *manifest_path* with ``.{audio_type}`` inserted before its extension.

    Only the final extension is considered, so directory names containing
    ``.json`` are left alone and a path without extension still yields a
    distinct file per audio type.
    """
    root, ext = os.path.splitext(manifest_path)
    return f"{root}.{audio_type}{ext}"


def main():
    """Download the AMI test/dev/train splits and build one manifest per split and audio type."""
    parser = argparse.ArgumentParser(description="Download the AMI Corpus Dataset for Speaker Diarization")
    parser.add_argument(
        "--test_manifest_filepath",
        help="path to output test manifest file",
        type=str,
        default='AMI_test_manifest.json',
    )
    parser.add_argument(
        "--dev_manifest_filepath", help="path to output dev manifest file", type=str, default='AMI_dev_manifest.json',
    )
    parser.add_argument(
        "--train_manifest_filepath",
        help="path to output train manifest file",
        type=str,
        default='AMI_train_manifest.json',
    )
    parser.add_argument("--data_root", help="path to output data directory", type=str, default="ami_dataset")
    args = parser.parse_args()

    data_path = os.path.abspath(args.data_root)
    os.makedirs(data_path, exist_ok=True)

    for manifest_path, split in (
        (args.test_manifest_filepath, 'test'),
        (args.dev_manifest_filepath, 'dev'),
        (args.train_manifest_filepath, 'train'),
    ):
        split_path = os.path.join(data_path, split)
        audio_path = os.path.join(split_path, "audio")
        rttm_path = os.path.join(split_path, "rttm")
        uem_path = os.path.join(split_path, "uem")
        audio_type_paths = {audio_type: os.path.join(audio_path, audio_type) for audio_type in audio_types}

        # Create every directory up front so the listings below cannot fail
        # on a missing directory when a download did not succeed.
        for directory in (split_path, rttm_path, uem_path, *audio_type_paths.values()):
            os.makedirs(directory, exist_ok=True)

        # The meeting list is required: if this download fails, open() inside
        # read_meeting_ids raises FileNotFoundError.
        download(list_url.format(split), split_path)
        meeting_ids = read_meeting_ids(os.path.join(split_path, f"{split}.meetings.txt"), excluded=not_found_ids)

        for meeting_id in meeting_ids:
            for audio_type, audio_type_path in audio_type_paths.items():
                download(
                    f"https://groups.inf.ed.ac.uk/ami/AMICorpusMirror//amicorpus/{meeting_id}/audio/{meeting_id}.{audio_type}.wav",
                    audio_type_path,
                )
            download(rttm_url.format(split, meeting_id), rttm_path)
            download(uem_url.format(split, meeting_id), uem_path)

        rttm_files_path = os.path.join(split_path, 'rttm_files.txt')
        write_file_list(rttm_files_path, rttm_path, suffix=".rttm")
        uem_files_path = os.path.join(split_path, 'uem_files.txt')
        write_file_list(uem_files_path, uem_path, suffix=".uem")

        for audio_type, audio_type_path in audio_type_paths.items():
            audio_files_path = os.path.join(split_path, f'audio_files_{audio_type}.txt')
            write_file_list(audio_files_path, audio_type_path, suffix=".wav")
            create_manifest(
                audio_files_path,
                audio_manifest_path(manifest_path, audio_type),
                rttm_path=rttm_files_path,
                uem_path=uem_files_path,
            )


if __name__ == "__main__":
    main()
|