File size: 4,640 Bytes
2d8da09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import sys
import wave
import api.nmt_pb2 as nmt
import api.nmt_pb2_grpc as nmtsrv
import grpc
import pyaudio
import riva_api.audio_pb2 as riva
import riva_api.riva_asr_pb2 as rivaasr
import riva_api.riva_asr_pb2_grpc as rivaasr_srv
def get_args(argv=None):
    """Parse the command-line options of the speech translation demo.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, which is what a plain
            ``get_args()`` call has always done.

    Returns:
        argparse.Namespace with the attributes ``riva_server``, ``audio_file``,
        ``output_device``, ``list_devices``, ``nmt_server``, ``asr_only``,
        ``target_language`` and ``asr_punctuation``.

    Raises:
        SystemExit: raised by argparse when ``--audio-file`` is missing or an
            option cannot be parsed.
    """
    parser = argparse.ArgumentParser(description="Streaming transcription via Riva AI Speech Services")
    parser.add_argument("--riva-server", default="localhost:50051", type=str, help="URI to GRPC server endpoint")
    parser.add_argument("--audio-file", required=True, help="path to local file to stream")
    parser.add_argument("--output-device", type=int, default=None, help="output device to use")
    parser.add_argument("--list-devices", action="store_true", help="list output devices indices")
    # The value is a full host:port endpoint (see the default), not a bare port.
    parser.add_argument("--nmt-server", default="localhost:50052", help="URI to NMT GRPC server endpoint")
    parser.add_argument("--asr_only", action="store_true", help="Whether to skip MT and just display")
    parser.add_argument("--target_language", default="es", help="Target language to translate into.")
    parser.add_argument(
        "--asr_punctuation",
        action="store_true",
        help="Whether to use Riva's punctuation model for ASR transcript postprocessing.",
    )
    return parser.parse_args(argv)
def listen_print_loop(responses, nmt_stub, target_language, asr_only=False):
    """Print streaming recognition results, optionally translated, as they arrive.

    Interim hypotheses are written with a ``">> "`` prefix and a trailing
    carriage return so the next hypothesis rewrites the same terminal line.
    Final hypotheses are printed with a ``"## "`` prefix followed by a blank
    line.

    Args:
        responses: Iterable of streaming recognition responses. Responses with
            no ``results``, or whose first result has no ``alternatives``, are
            skipped; otherwise the first alternative's ``transcript`` is shown.
        nmt_stub: Translation stub; its ``TranslateText`` is called for every
            displayed hypothesis unless ``asr_only`` is true.
        target_language: Language code passed as the translation target.
        asr_only: When true, skip translation and display the transcript as-is.

    Returns:
        None.
    """
    interim_prefix = ">> "
    final_prefix = "## "
    # Visible width (prefix included) of the interim line currently on screen.
    num_chars_printed = 0
    for response in responses:
        if not response.results:
            continue
        result = response.results[0]
        if not result.alternatives:
            continue
        transcript = result.alternatives[0].transcript
        if not asr_only:
            req = nmt.TranslateTextRequest(texts=[transcript], source_language='en', target_language=target_language)
            transcript = nmt_stub.TranslateText(req).translations[0].translation

        text = (final_prefix if result.is_final else interim_prefix) + transcript
        # Pad only as far as the previous interim line reached. Comparing full
        # line widths (prefix + text) avoids emitting three surplus spaces,
        # which could wrap the terminal line and defeat the '\r' rewrite.
        overwrite_chars = ' ' * (num_chars_printed - len(text))
        if not result.is_final:
            sys.stdout.write(text + overwrite_chars + '\r')
            sys.stdout.flush()
            num_chars_printed = len(text)
        else:
            print(text + overwrite_chars + "\n")
            num_chars_printed = 0
# Number of audio frames read from the WAV file per streaming request.
CHUNK = 1024
args = get_args()

# instantiate PyAudio (1)
p = pyaudio.PyAudio()

# Handle --list-devices before opening the audio file or any gRPC channel:
# listing only needs PyAudio, so it should not depend on a readable WAV file,
# and PyAudio is released before exiting.
if args.list_devices:
    for i in range(p.get_device_count()):
        info = p.get_device_info_by_index(i)
        # Skip devices that cannot play audio (no output channels).
        if info['maxOutputChannels'] < 1:
            continue
        print(f"{info['index']}: {info['name']}")
    p.terminate()
    sys.exit(0)

try:
    wf = wave.open(args.audio_file, 'rb')
except (OSError, EOFError, wave.Error):
    # PyAudio is already initialised at this point; release it before failing.
    p.terminate()
    raise

# gRPC channels and stubs for speech recognition and translation.
channel = grpc.insecure_channel(args.riva_server)
client = rivaasr_srv.RivaSpeechRecognitionStub(channel)
nmt_channel = grpc.insecure_channel(args.nmt_server)
nmt_stub = nmtsrv.RivaTranslateStub(nmt_channel)

# Recognition settings; the sample rate is taken from the WAV header.
config = rivaasr.RecognitionConfig(
    encoding=riva.AudioEncoding.LINEAR_PCM,
    sample_rate_hertz=wf.getframerate(),
    language_code="en-US",
    max_alternatives=1,
    enable_automatic_punctuation=args.asr_punctuation,
)
streaming_config = rivaasr.StreamingRecognitionConfig(config=config, interim_results=True)

# open stream (2)
stream = p.open(
    output_device_index=args.output_device,
    format=p.get_format_from_width(wf.getsampwidth()),
    channels=wf.getnchannels(),
    rate=wf.getframerate(),
    output=True,
)
# read data
def generator(w, s):
    """Yield the streaming requests for one audio file.

    The first request carries the streaming configuration ``s``; every
    following request carries one chunk of up to ``CHUNK`` frames read from
    the wave reader ``w``. After a chunk has been handed to the consumer it is
    also written to the module-level ``stream``, so the audio is played while
    it is being sent. Iteration ends when ``w`` returns no more data.
    """
    # The first chunk is read before the configuration request is yielded.
    pending = w.readframes(CHUNK)
    yield rivaasr.StreamingRecognizeRequest(streaming_config=s)
    while True:
        if not pending:
            break
        request = rivaasr.StreamingRecognizeRequest(audio_content=pending)
        yield request
        # Runs when the consumer asks for the next request.
        stream.write(pending)
        pending = w.readframes(CHUNK)
# Stream the file to the recognizer and display the results. The cleanup is in
# a ``finally`` so the audio device, the WAV file and the gRPC channels are
# released even when recognition/translation fails or the user interrupts.
try:
    responses = client.StreamingRecognize(generator(wf, streaming_config))
    listen_print_loop(responses, nmt_stub, target_language=args.target_language, asr_only=args.asr_only)
finally:
    # Close the channels first so that any RPC still in flight is cancelled
    # before the playback stream it writes to goes away.
    channel.close()
    nmt_channel.close()
    # stop stream (4)
    stream.stop_stream()
    stream.close()
    # close PyAudio (5)
    p.terminate()
    wf.close()
|