---
# Speech-to-text configuration.
# Durations are in milliseconds wherever the comment on a key says "ms".

# Identifier of the speech-to-text backend this configuration targets.
model_type: "NemoSTT"

# Frame size in ms for incremental transcription.
frame_size: 1000

# Streaming parameters taken from the NeMo online ASR demo:
# https://github.com/NVIDIA/NeMo/blob/stable/tutorials/asr/Online_ASR_Microphone_Demo.ipynb
# NOTE(review): in that demo frame_overlap is a duration in seconds and offset
# is a count of decoder output symbols -- confirm the consumer uses the same units.
frame_overlap: 2
offset: 4

# Duration of a single model output timestep, derived in the demo as:
#   timestep_duration = model._cfg.preprocessor['window_stride']
#   for block in model._cfg.encoder['jasper']:
#       timestep_duration *= block['stride'][0] ** block['repeat']
# NOTE(review): presumably seconds (0.02 s = 20 ms) -- confirm.
timestep_duration: 0.02

# Audio sample rate (presumably Hz -- confirm against the audio source).
sample_rate: 16000

# Minimum detectable VAD (voice activity detection) section in ms.
min_speech_duration: 400

# Timeout in ms to flush results if speech wasn't finished semantically.
max_silence_duration: 1000

# VAD frame size in ms.
vad_frame_ms: 20

# Feature toggles; both are disabled here.
# NOTE(review): judging by the names, these enable real-time transcription and
# punctuation prediction respectively -- confirm in the consuming code.
transcribe_realtime: false
predict_punctuation: false

# Decoder tuning weights.
# NOTE(review): presumably the language-model weight (alpha) and the
# word-insertion bonus (beta) of a beam-search decoder -- confirm.
alpha: 0.0253813572180912
beta: 0.08