import gradio as gr
from zeroshot import process, ZS_EXAMPLES
# Gradio front-end for the MMS zero-shot ASR demo.
#
# Layout: a single row with two columns. The left column collects the audio,
# the text data file, an optional language-model file and the beam-search
# settings; the right column shows the transcript returned by `process`.
with gr.Blocks(css="style.css") as demo:
    # Page header. The newlines are written as escapes so the literal stays
    # on one physical line (a plain "..." string cannot span lines).
    gr.Markdown(
        "\nMMS Zero-shot ASR Demo. See our arXiv paper for model details.\n"
    )
    gr.HTML(
        """The demo works on input audio in any language, as long as you provide a list of words or sentences for that language and an optional n-gram language model (even a simple 1-gram model will work!) to help with accuracy.
We recommend having a minimum of 5000 distinct words in the textfile to achieve a good performance."""
    )

    with gr.Row():
        # Left column: all user inputs plus the submit button.
        with gr.Column():
            audio = gr.Audio(label="Audio Input\n(use microphone or upload a file)")
            with gr.Row():
                words_file = gr.File(label="Text Data")
                lm_file = gr.File(label="Language Model\n(optional)")

            # Decoding knobs are collapsed by default (open=False).
            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "The following parameters are used for beam-search decoding. Use the default values if you are not sure."
                )
                with gr.Row():
                    wscore = gr.Slider(
                        minimum=-10.0,
                        maximum=10.0,
                        value=0,
                        step=0.1,
                        interactive=True,
                        label="Word Insertion Score",
                    )
                    lmscore = gr.Slider(
                        minimum=-10.0,
                        maximum=10.0,
                        value=0,
                        step=0.1,
                        interactive=True,
                        label="Language Model Score",
                    )
                # Both checkboxes start ticked; their values are forwarded to
                # `process` together with the slider values.
                with gr.Row():
                    wscore_usedefault = gr.Checkbox(
                        label="Use Default Word Insertion Score", value=True
                    )
                    lmscore_usedefault = gr.Checkbox(
                        label="Use Default Language Model Score", value=True
                    )

            btn = gr.Button("Submit", elem_id="submit")

        # Right column: transcription output.
        with gr.Column():
            text = gr.Textbox(label="Transcript")

    # Event listeners must be registered inside the Blocks context.
    # The order of `inputs` is the positional argument order `process` receives.
    btn.click(
        process,
        inputs=[
            audio,
            words_file,
            lm_file,
            wscore,
            lmscore,
            wscore_usedefault,
            lmscore_usedefault,
        ],
        outputs=text,
    )

    # Examples populate only the audio and text-data inputs.
    examples = gr.Examples(examples=ZS_EXAMPLES, inputs=[audio, words_file])

# Start the web server for the demo.
demo.launch()