from pathlib import Path

import resampy
from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader


class DemoASR:
    """Thin wrapper around an ESPnet2 ``Speech2Text`` recognizer.

    The recognizer is built from a packed model archive located in a model
    directory and decodes one utterance at a time.
    """

    # File name of the packed model archive expected inside ``model_path``.
    MODEL_FILE = 'asr_improved_tts-phn_en.zip'
    # Every input is resampled to this rate (Hz) before recognition.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, model_path, device):
        """Unpack the model archive and build the recognizer.

        Args:
            model_path: Directory containing ``MODEL_FILE``. May be a
                ``pathlib.Path`` or a plain string.
            device: Device on which to run inference; converted with
                ``str()`` before being handed to ``Speech2Text``.
        """
        # Path() makes the "/" join work for str arguments as well as Path.
        archive = Path(model_path) / self.MODEL_FILE
        downloader = ModelDownloader()
        self.speech2text = Speech2Text(
            # The unpacked archive supplies the model-specific keyword
            # arguments for Speech2Text.
            **downloader.download_and_unpack(str(archive)),
            device=str(device),
            minlenratio=0.0,
            maxlenratio=0.0,
            ctc_weight=0.4,
            beam_size=15,
            batch_size=1,
            nbest=1,
        )

    def recognize_speech(self, audio, sr):
        """Transcribe an audio signal.

        Args:
            audio: Array of samples, either 1-D or 2-D. For 2-D input only
                index 0 along the second axis is used (assumes a
                ``(samples, channels)`` layout -- confirm against callers).
            sr: Sample rate of ``audio`` in Hz.

        Returns:
            The text of the first hypothesis, or an empty string when the
            recognizer returns no hypotheses.
        """
        if len(audio.shape) == 2:
            # Keep a single channel; equivalent to ``audio.T[0]``.
            audio = audio[:, 0]

        speech = resampy.resample(audio, sr, self.TARGET_SAMPLE_RATE)
        nbests = self.speech2text(speech)

        # Guard against an empty result list, which would otherwise raise
        # IndexError on ``nbests[0]``.
        if not nbests:
            return ''

        # Each hypothesis starts with the decoded text; the rest is unused.
        text, *_ = nbests[0]
        return text