File size: 3,587 Bytes
196d65f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import html
import json
import logging
import subprocess
import sys

import gradio as gr
import vosk

# Route root-logger output to stdout with a timestamped
# "time - level - logger name - message" layout.
logging.basicConfig(
    handlers=[logging.StreamHandler(stream=sys.stdout)],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)

# Module-level logger; DEBUG is set here so every record from this module
# passes the logger's own level filter.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Language name -> descriptor dict holding the Vosk model identifier used for
# that language. Insertion order follows the pair list below.
LARGE_MODEL_BY_LANGUAGE = {
    language_name: {"model_id": vosk_model_id}
    for language_name, vosk_model_id in (
        ("Russian", "vosk-model-ru-0.22"),
        ("Chinese", "vosk-model-cn-0.22"),
        ("English", "vosk-model-en-us-0.22"),
        ("French", "vosk-model-fr-0.22"),
        ("German", "vosk-model-de-0.22"),
        ("Italian", "vosk-model-it-0.22"),
        ("Japanese", "vosk-model-ja-0.22"),
        ("Persian", "vosk-model-fa-0.5"),
    )
}

# Alphabetically sorted language names, offered as the UI choices.
LANGUAGES = sorted(LARGE_MODEL_BY_LANGUAGE)

# Model id -> loaded model instance; filled lazily so each model loads once.
CACHED_MODELS_BY_ID = dict()

def asr(model, input_file):
    """Transcribe an audio file with a Vosk model.

    The file is decoded by an ``ffmpeg`` child process into raw 16 kHz, mono,
    signed 16-bit PCM written to its stdout, which is fed to a
    ``vosk.KaldiRecognizer`` in 4000-byte chunks.

    Args:
        model: Model object handed to ``vosk.KaldiRecognizer``.
        input_file: Path of the audio file passed to ``ffmpeg -i``.

    Returns:
        The non-empty recognized text segments joined with single spaces
        (an empty string when nothing was recognized).
    """
    rec = vosk.KaldiRecognizer(model, 16000.0)
    results = []

    # Build argv as a list instead of splitting a formatted string, so a path
    # that contains whitespace is still passed to ffmpeg as one argument.
    command = [
        "ffmpeg", "-loglevel", "quiet",
        "-i", str(input_file),
        "-ar", "16000", "-ac", "1", "-f", "s16le", "-",
    ]

    # The context manager closes the pipe and waits for ffmpeg on exit, so no
    # zombie process or open file descriptor is left behind per request.
    with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
        # read() returns b"" at end of stream, which ends the iteration.
        for data in iter(lambda: process.stdout.read(4000), b""):
            # AcceptWaveform() is truthy once a complete utterance is ready.
            if rec.AcceptWaveform(data):
                results.append(json.loads(rec.Result())["text"])

    # Flush whatever the recognizer still buffers after the last chunk.
    results.append(json.loads(rec.FinalResult())["text"])

    # Skip empty segments so the output has no doubled or trailing spaces.
    return " ".join(segment for segment in results if segment)


def _render_history(history):
    """Render the list of result dicts as one HTML fragment."""
    parts = ["<div class='result'>"]
    for item in history:
        # All dynamic text is escaped: it comes from user-controlled input
        # (the selected language, recognized speech) and must not be
        # interpreted as markup.
        if item["error_message"] is not None:
            message = html.escape(str(item["error_message"]))
            parts.append(
                f"<div class='result_item result_item_error'>{message}</div>"
            )
        else:
            text = html.escape(str(item["transcription"]))
            parts.append(
                "<div class='result_item result_item_success'>"
                f"{text}<br/>"
                "</div>"
            )
    parts.append("</div>")
    return "".join(parts)


def run(input_file, language, history):
    """Transcribe one recording and return the updated result list as HTML.

    Args:
        input_file: File-like object whose ``name`` attribute is the path of
            the recorded audio, or ``None`` when nothing was recorded.
        language: Key into ``LARGE_MODEL_BY_LANGUAGE`` selecting the model.
        history: List of earlier result dicts, or a falsy value to start a
            new list. A non-empty list is extended in place.

    Returns:
        A ``(html_output, history)`` tuple: the rendered HTML for every entry
        in ``history`` and the history list including the new entry.
    """
    logger.info("Running ASR for %s for %s", language, input_file)

    history = history or []

    model = LARGE_MODEL_BY_LANGUAGE.get(language)

    if model is None:
        history.append({
            "error_message": f"Failed to find a model for {language} language :("
        })
    elif input_file is None:
        # Without a recording there is no file path to read; report it in the
        # result list instead of failing on ``input_file.name``.
        history.append({
            "error_message": "No audio was recorded, please record something first"
        })
    else:
        model_id = model["model_id"]

        # Load each model at most once and reuse it for later requests.
        model_instance = CACHED_MODELS_BY_ID.get(model_id)
        if model_instance is None:
            model_instance = vosk.Model(model_name=model_id)
            CACHED_MODELS_BY_ID[model_id] = model_instance

        transcription = asr(model_instance, input_file.name)

        logger.info("Transcription for %s: %s", input_file, transcription)

        history.append({
            "model_id": model_id,
            "language": language,
            "transcription": transcription,
            "error_message": None
        })

    return _render_history(history), history


# Styles for the result list that `run` renders: a vertical stack of rounded
# boxes, green-ish for successful transcriptions and red for errors.
_INTERFACE_CSS = """
    .result {display:flex;flex-direction:column}
    .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
    .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
    .result_item_error {background-color:#ff7070;color:white;align-self:start}
    """

# Wire `run` to the UI: microphone recording + language choice + session
# state in, rendered HTML + updated session state out.
_interface = gr.Interface(
    run,
    inputs=[
        gr.inputs.Audio(source="microphone", type="file", label="Record something..."),
        gr.inputs.Radio(label="Language", choices=LANGUAGES),
        "state",
    ],
    outputs=[
        gr.outputs.HTML(label="Outputs"),
        "state",
    ],
    title="Automatic Speech Recognition",
    description="",
    css=_INTERFACE_CSS,
    allow_screenshot=False,
    allow_flagging="never",
    theme="grass",
)

# Start the web app with request queueing enabled.
_interface.launch(enable_queue=True)