File size: 826 Bytes
dcd7182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from espnet2.bin.asr_inference import Speech2Text
import resampy
from espnet_model_zoo.downloader import ModelDownloader



class DemoASR:
    """Wrap an ESPnet2 ``Speech2Text`` model behind a one-call demo API.

    The model archive is unpacked once at construction time; afterwards
    :meth:`recognize_speech` turns a raw audio array into the best
    hypothesis text.
    """

    # Archive name looked up inside the ``model_path`` directory.
    MODEL_FILE = 'asr_improved_tts-phn_en.zip'
    # Sample rate (Hz) every input is converted to before decoding.
    # NOTE(review): presumably the rate the model was trained on - confirm.
    SAMPLE_RATE = 16000

    def __init__(self, model_path, device):
        """Load the ASR model found under *model_path* onto *device*.

        Args:
            model_path: Directory containing ``MODEL_FILE``. May be a
                ``pathlib.Path`` or a plain string.
            device: Target device; it is passed to ``Speech2Text`` as
                ``str(device)``.
        """
        # Local import keeps this block self-contained; coercing to Path
        # lets callers pass a plain string as well as a Path object.
        from pathlib import Path

        archive = Path(model_path) / self.MODEL_FILE
        downloader = ModelDownloader()

        self.speech2text = Speech2Text(
            **downloader.download_and_unpack(str(archive)),
            device=str(device),
            minlenratio=0.0,
            maxlenratio=0.0,
            ctc_weight=0.4,
            beam_size=15,
            batch_size=1,
            nbest=1
        )

    def recognize_speech(self, audio, sr):
        """Transcribe *audio* and return the top hypothesis text.

        Args:
            audio: Array-like exposing ``.shape``. A 2-D input is reduced
                to its first column before decoding.
                NOTE(review): this assumes a (samples, channels) layout -
                confirm against callers.
            sr: Sample rate of *audio* in Hz.

        Returns:
            The text of the first hypothesis, or ``""`` when the decoder
            returns no hypothesis at all.
        """
        if len(audio.shape) == 2:
            # Keep only the first channel of multi-channel audio.
            audio = audio[:, 0]

        if sr == self.SAMPLE_RATE:
            # Already at the target rate: skip the resampling pass.
            speech = audio
        else:
            speech = resampy.resample(audio, sr, self.SAMPLE_RATE)

        nbests = self.speech2text(speech)
        if not nbests:
            # No hypothesis produced; report empty text rather than
            # failing with IndexError.
            return ''

        # Each hypothesis is a tuple whose first item is the text.
        text, *_ = nbests[0]
        return text