|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
import glob |
|
import json |
|
import os |
|
import re |
|
import subprocess |
|
import sys |
|
from collections import namedtuple |
|
from math import ceil, floor |
|
from operator import attrgetter |
|
|
|
import numpy as np |
|
import scipy.io.wavfile as wavfile |
|
from tqdm import tqdm |
|
|
|
# Command-line interface of the HUB5 preparation script.
parser = argparse.ArgumentParser(description="Prepare HUB5 data for training/eval")

# Required input location.
parser.add_argument(
    "--data_root",
    required=True,
    type=str,
    default=None,
    help="The path to the root LDC HUB5 dataset directory.",
)

# Required output location.
parser.add_argument(
    "--dest_root",
    required=True,
    type=str,
    default=None,
    help="Path to the destination root directory for processed files.",
)

# Optional float threshold; falls back to 10.0 when the flag is omitted.
parser.add_argument(
    "--min_slice_duration",
    type=float,
    default=10.0,
    help="Minimum audio slice duration after processing.",
)

# Parsed at module level; main() reads its settings from this global.
args = parser.parse_args()
|
|
|
# One parsed STM line. The field order mirrors the capture groups of
# STM_LINE_FMT, because matches are unpacked positionally into this tuple.
StmUtterance = namedtuple('StmUtterance', 'filename channel speaker_id begin end label transcript')

# Seven whitespace-separated captures: three word tokens, two numeric tokens
# (digits and dots), an optional <...> label, then the rest of the line.
STM_LINE_FMT = re.compile(r"^(\w+)\s+(\w+)\s+(\w+)\s+([0-9.]+)\s+([0-9.]+)\s+(<.*>)?\s+(.+)$")

# Replacement transcripts, looked up by the utterance ID built by get_utt_id.
TRANSCRIPT_BUGS = {
    "en_4622-B-12079-12187": "KIND OF WEIRD BUT",
}
|
|
|
|
|
def get_utt_id(segment):
    """
    Gives utterance IDs in a form like: en_4156-a-36558-37113

    The ID is ``<filename>-<channel>-<begin>-<end>``, where begin and end are
    the segment times multiplied by 100 and converted to integers.

    Args:
        segment: object exposing ``filename``, ``channel``, ``begin`` and
            ``end`` attributes; ``begin`` and ``end`` must be numeric.

    Returns:
        The utterance ID string.
    """
    # Round to the nearest integer instead of truncating: binary floating
    # point makes products such as 0.29 * 100 come out as 28.999999999999996,
    # which int() alone would turn into 28 rather than the intended 29.
    begin_ticks = int(round(segment.begin * 100))
    end_ticks = int(round(segment.end * 100))
    return "{}-{}-{}-{}".format(segment.filename, segment.channel, begin_ticks, end_ticks)
|
|
|
|
|
def convert_utterances(sph_path, wav_path):
    """
    Converts a sphere audio file to wav.

    Runs the external ``sph2pipe`` tool with ``-f wav -p`` on ``sph_path`` and
    has it write the result to ``wav_path``.

    Args:
        sph_path: path of the input sphere (.sph) file
        wav_path: path of the wav file to create

    Raises:
        subprocess.CalledProcessError: if sph2pipe exits with a non-zero status.
    """
    cmd = ["sph2pipe", "-f", "wav", "-p", sph_path, wav_path]
    # check=True surfaces a failed conversion here instead of letting a
    # missing or unusable wav file cause a less obvious error later on.
    subprocess.run(cmd, check=True)
|
|
|
|
|
def create_wavs(data_root, dest_root):
    """
    Converts the English sph files to wav using sph2pipe.

    Every ``*.sph`` file in ``<data_root>/hub5e_00/english`` is converted to
    ``<dest_root>/full_audio_wav/<same base name>.wav``.

    Args:
        data_root: root directory of the dataset
        dest_root: root directory for the output; the ``full_audio_wav``
            sub-directory is created if it does not exist yet
    """
    sph_root = os.path.join(data_root, "hub5e_00", "english")
    wav_root = os.path.join(dest_root, 'full_audio_wav')
    # sph2pipe is given an output path inside this directory, so make sure it
    # exists even when this function is used without main().
    os.makedirs(wav_root, exist_ok=True)

    # glob returns files in arbitrary order; sort so runs are reproducible.
    sph_list = sorted(glob.glob(os.path.join(sph_root, "*.sph")))

    for sph_path in tqdm(sph_list, desc="Converting to wav", unit="file"):
        sph_name, _ = os.path.splitext(os.path.basename(sph_path))
        wav_path = os.path.join(wav_root, sph_name + ".wav")
        # Reuse the shared helper rather than duplicating the sph2pipe call.
        convert_utterances(sph_path, wav_path)
|
|
|
|
|
def process_transcripts(dataset_root):
    """
    Reads in transcripts for each audio segment and processes them.

    The STM reference file is read line by line. Lines starting with ``;;``
    and lines containing ``IGNORE_TIME_SEGMENT_`` are skipped, as are blank
    lines. The remaining lines are normalized (aside markers removed,
    hesitation markers mapped to "UH", hyphens and parentheses removed,
    everything lower-cased) and parsed with STM_LINE_FMT.

    Args:
        dataset_root: root directory of the dataset; the STM file is expected
            at 2000_hub5_eng_eval_tr/reference/hub5e00.english.000405.stm
            below it.

    Returns:
        A tuple ``(results, chars)``: the list of StmUtterance records (with
        ``begin``/``end`` converted to float) in file order, and the set of
        all characters that occur in their transcripts.

    Raises:
        ValueError: if a non-blank, non-skipped line does not match
            STM_LINE_FMT.
    """
    stm_path = os.path.join(dataset_root, "2000_hub5_eng_eval_tr", "reference", "hub5e00.english.000405.stm",)
    results = []
    chars = set()

    # Lines are lower-cased before parsing, so generated utterance IDs are
    # lower-case. Normalize the correction table the same way; otherwise an
    # entry with an upper-case key can never match and is silently ignored.
    # The replacement text is lower-cased to stay consistent with the other
    # transcripts.
    transcript_fixes = {utt_id.lower(): text.lower() for utt_id, text in TRANSCRIPT_BUGS.items()}

    with open(stm_path, "r") as fh:
        for line_no, line in enumerate(fh, start=1):
            # Lines starting with ';;' are skipped.
            if line.startswith(";;"):
                continue

            if "IGNORE_TIME_SEGMENT_" in line:
                continue

            # A blank line carries no utterance; skip it instead of failing.
            if not line.strip():
                continue

            # The order of these replacements is significant and kept as is.
            line = line.replace("<B_ASIDE>", "").replace("<E_ASIDE>", "")
            line = line.replace("(%HESITATION)", "UH")
            line = line.replace("-", "")
            line = line.replace("(%UH)", "UH")
            line = line.replace("(%AH)", "UH")
            line = line.replace("(", "").replace(")", "")

            line = line.lower()

            m = STM_LINE_FMT.search(line.strip())
            if m is None:
                raise ValueError(f"{stm_path}:{line_no}: unrecognized STM line: {line!r}")
            utt = StmUtterance(*m.groups())

            # Convert begin/end times from text to float.
            utt = utt._replace(begin=float(utt.begin), end=float(utt.end))

            # Apply a known transcript correction, if there is one.
            transcript_update = transcript_fixes.get(get_utt_id(utt).lower())
            if transcript_update is not None:
                utt = utt._replace(transcript=transcript_update)

            results.append(utt)
            chars.update(utt.transcript)

    return results, chars
|
|
|
|
|
def write_one_segment(dest_root, speaker_id, count, audio, sr, duration, transcript):
    """
    Saves one audio segment as a wav file and appends its entry to the manifest.

    The wav file goes to ``<dest_root>/audio/<speaker_id>_<count>.wav`` (count
    zero-padded to three digits). The manifest ``<dest_root>/manifest_hub5.json``
    receives one JSON object per line with the keys ``audio_filepath``,
    ``duration`` and ``text``.

    Args:
        dest_root: the path to the output directory root
        speaker_id: ID of the speaker, used in file naming
        count: number of segments from this speaker so far
        audio: the segment's audio data
        sr: sample rate of the audio
        duration: duration of the audio
        transcript: the corresponding transcript
    """
    wav_name = f"{speaker_id}_{count:03}.wav"
    audio_path = os.path.join(dest_root, "audio", wav_name)

    # Audio first, then the manifest line that points at it.
    wavfile.write(audio_path, sr, audio)

    entry = {"audio_filepath": audio_path, "duration": duration, "text": transcript}

    # Append mode: each call adds exactly one line to the shared manifest.
    with open(os.path.join(dest_root, "manifest_hub5.json"), 'a') as manifest:
        json.dump(entry, manifest)
        manifest.write('\n')
|
|
|
|
|
def segment_audio(info_list, dest_root, min_slice_duration):
    """
    Groups consecutive utterances of one speaker into slices of at least
    min_slice_duration and writes each slice plus its joined transcript.

    Utterances are processed in (speaker_id, begin) order. Audio and
    transcripts accumulate until the summed utterance durations reach
    min_slice_duration; the slice is then handed to write_one_segment and the
    accumulators are cleared. Whatever is still pending when the speaker
    changes, or when the list ends, is never written.

    Args:
        info_list: list of StmUtterance objects with transcript information.
        dest_root: path to output destination
        min_slice_duration: min number of seconds per output audio slice
    """
    ordered = sorted(info_list, key=attrgetter('speaker_id', 'begin'))

    current_speaker = None
    slices_written = 0
    rate, samples = None, None
    words, chunks, pending_seconds = [], [], 0.0

    for utt in ordered:
        if utt.speaker_id != current_speaker:
            # New speaker: restart numbering, load that recording and drop
            # anything left over from the previous speaker.
            current_speaker = utt.speaker_id
            slices_written = 0
            wav_file = os.path.join(dest_root, 'full_audio_wav', utt.filename + '.wav')
            rate, samples = wavfile.read(wav_file)
            words, chunks, pending_seconds = [], [], 0.0

        # Channel 'a' (any case) is column 0; every other value is column 1.
        column = 1 if utt.channel.lower() != 'a' else 0
        first = floor(utt.begin * rate)
        last = ceil(utt.end * rate)

        words.append(utt.transcript)
        chunks.append(samples[first:last, column])
        pending_seconds += utt.end - utt.begin

        # Keep accumulating until the slice is long enough.
        if pending_seconds < min_slice_duration:
            continue

        slices_written += 1
        write_one_segment(
            dest_root,
            utt.speaker_id,
            slices_written,
            np.concatenate(chunks, axis=0),
            rate,
            pending_seconds,
            ' '.join(words),
        )
        words, chunks, pending_seconds = [], [], 0.0
|
|
|
|
|
def main():
    """
    Runs the whole preparation pipeline using the parsed command-line options.

    Steps: create the output directories, start an empty manifest, convert the
    sph audio to wav, parse the transcripts, write the character vocabulary to
    vocab.txt, then segment the audio and fill the manifest.
    """
    data_root, dest_root = args.data_root, args.dest_root
    min_slice_duration = args.min_slice_duration

    # Output directories for the converted recordings and the final slices.
    for subdir in ('full_audio_wav', 'audio'):
        target = os.path.join(dest_root, subdir)
        if not os.path.exists(target):
            os.makedirs(target)

    # Start with an empty manifest (truncates one left by an earlier run).
    with open(os.path.join(dest_root, "manifest_hub5.json"), 'w'):
        pass

    # Convert the sphere files to wav.
    create_wavs(data_root, dest_root)

    # Read and normalize the transcripts.
    info_list, chars = process_transcripts(data_root)

    print("Writing out vocab file", file=sys.stderr)
    with open(os.path.join(dest_root, "vocab.txt"), 'w') as fh:
        # One character per line, in sorted order.
        fh.writelines(symbol + "\n" for symbol in sorted(chars))

    print("Segmenting audio and writing manifest")
    segment_audio(info_list, dest_root, min_slice_duration)


if __name__ == '__main__':
    main()
|
|