|
import html
import json
import logging
import subprocess
import sys

import gradio as gr
import vosk
|
|
|
# Send every log record to stdout, stamped with the time, the level and the
# name of the logger that emitted it.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Module-level logger; DEBUG is set on this logger only, not on the root.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
|
|
|
# Vosk model identifier to load for each language offered in the UI.
LARGE_MODEL_BY_LANGUAGE = {
    "Russian": {"model_id": "vosk-model-ru-0.22"},
    "Chinese": {"model_id": "vosk-model-cn-0.22"},
    "English": {"model_id": "vosk-model-en-us-0.22"},
    "French": {"model_id": "vosk-model-fr-0.22"},
    "German": {"model_id": "vosk-model-de-0.22"},
    "Italian": {"model_id": "vosk-model-it-0.22"},
    "Japanese": {"model_id": "vosk-model-ja-0.22"},
    "Persian": {"model_id": "vosk-model-fa-0.5"},
}

# Language names in alphabetical order (iterating a dict yields its keys).
LANGUAGES = sorted(LARGE_MODEL_BY_LANGUAGE)

# Loaded model instances keyed by model id, so each model is created once.
CACHED_MODELS_BY_ID = dict()
|
|
|
def asr(model, input_file):
    """Transcribe an audio file with a Vosk model.

    The file is decoded by an ``ffmpeg`` subprocess (resampled to 16 kHz,
    one channel, raw ``s16le`` output on stdout) and the decoded bytes are
    fed to a ``vosk.KaldiRecognizer`` in chunks of 4000 bytes.

    Args:
        model: Model object handed to ``vosk.KaldiRecognizer``.
        input_file: Path of the audio file to transcribe.

    Returns:
        The recognized text segments joined by single spaces. Empty
        segments are skipped so the result carries no stray whitespace.
    """
    rec = vosk.KaldiRecognizer(model, 16000.0)
    results = []

    # Pass the arguments as a list rather than splitting a formatted string:
    # a path containing whitespace would otherwise be torn into several
    # arguments and ffmpeg would fail to open the file.
    command = [
        "ffmpeg", "-loglevel", "quiet",
        "-i", str(input_file),
        "-ar", "16000", "-ac", "1", "-f", "s16le", "-",
    ]

    # The context manager closes the pipe and waits for ffmpeg to exit, so
    # no file descriptor or zombie process is left behind.
    with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
        while True:
            data = process.stdout.read(4000)
            if not data:
                break
            # AcceptWaveform returning true means a finished result is ready.
            if rec.AcceptWaveform(data):
                results.append(json.loads(rec.Result())["text"])

    # Collect whatever the recognizer still holds after the last chunk.
    results.append(json.loads(rec.FinalResult())["text"])

    return " ".join(segment for segment in results if segment)
|
|
|
|
|
def _render_history(history):
    """Render the list of result dicts as the HTML shown in the output panel."""
    parts = ["<div class='result'>"]
    for item in history:
        # Text is escaped before being embedded so that markup characters in
        # a message or transcription are displayed instead of interpreted.
        if item["error_message"] is not None:
            message = html.escape(item["error_message"])
            parts.append(
                f"<div class='result_item result_item_error'>{message}</div>"
            )
        else:
            text = html.escape(item["transcription"])
            parts.append("<div class='result_item result_item_success'>")
            parts.append(f"{text}<br/>")
            parts.append("</div>")
    parts.append("</div>")
    return "".join(parts)


def run(input_file, language, history):
    """Transcribe one recording and render the accumulated results as HTML.

    Args:
        input_file: Object whose ``name`` attribute is the path of the audio
            file to transcribe, or ``None`` when no audio was supplied.
        language: Key looked up in ``LARGE_MODEL_BY_LANGUAGE``.
        history: List of result dicts from earlier calls; a falsy value
            (``None`` or an empty list) starts a new history.

    Returns:
        A ``(html_output, history)`` tuple. ``history`` has one new entry
        appended: either ``{"error_message": ...}`` when no model or no
        audio is available, or a dict with ``model_id``, ``language``,
        ``transcription`` and ``error_message`` set to ``None``.
    """
    logger.info("Running ASR for %s for %s", language, input_file)

    history = history or []
    model = LARGE_MODEL_BY_LANGUAGE.get(language)

    if model is None:
        history.append({
            "error_message": f"Failed to find a model for {language} language :("
        })
    elif input_file is None:
        # Without this guard ``input_file.name`` below raises AttributeError.
        history.append({
            "error_message": "No audio was recorded :("
        })
    else:
        model_id = model["model_id"]

        # Create the model only on first use and keep it for later calls.
        model_instance = CACHED_MODELS_BY_ID.get(model_id)
        if model_instance is None:
            model_instance = vosk.Model(model_name=model_id)
            CACHED_MODELS_BY_ID[model_id] = model_instance

        transcription = asr(model_instance, input_file.name)
        logger.info("Transcription for %s: %s", input_file, transcription)

        history.append({
            "model_id": model_id,
            "language": language,
            "transcription": transcription,
            "error_message": None,
        })

    return _render_history(history), history
|
|
|
|
|
# Styles for the result markup produced by `run`: a vertical stack of rounded
# bubbles, coloured differently for transcriptions and for errors.
_RESULT_CSS = """
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""

# Inputs map positionally onto run(input_file, language, history); the
# trailing "state" entries carry `history` from one call to the next.
_INPUT_COMPONENTS = [
    gr.inputs.Audio(source="microphone", type="file", label="Record something..."),
    gr.inputs.Radio(label="Language", choices=LANGUAGES),
    "state",
]
_OUTPUT_COMPONENTS = [
    gr.outputs.HTML(label="Outputs"),
    "state",
]

_interface = gr.Interface(
    run,
    inputs=_INPUT_COMPONENTS,
    outputs=_OUTPUT_COMPONENTS,
    title="Automatic Speech Recognition",
    description="",
    css=_RESULT_CSS,
    allow_screenshot=False,
    allow_flagging="never",
    theme="grass",
)

# Build and start the web UI as soon as the script is executed.
_interface.launch(enable_queue=True)
|
|