import os import gradio as gr import whisper from whisper import tokenizer import time current_size = 'base' model = whisper.load_model(current_size) AUTO_DETECT_LANG = "Auto Detection" def transcribe(audio, state={}, model_size='base', delay=1.2, lang=None, translate=False): time.sleep(delay - 1) global current_size global model if model_size != current_size: current_size = model_size model = whisper.load_model(current_size) transcription = model.transcribe( audio, language = lang if lang != AUTO_DETECT_LANG else None ) state['transcription'] += transcription['text'] + " " if translate: x = whisper.load_audio(audio) x = whisper.pad_or_trim(x) mel = whisper.log_mel_spectrogram(x).to(model.device) options = whisper.DecodingOptions(task = "translation") translation = whisper.decode(model, mel, options) state['translation'] += translation.text + " " return state['transcription'], state['translation'], state, f"detected language: {transcription['language']}" title = "OpenAI's Whisper Real-time Demo" description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model. This demo runs on a CPU. For faster inference choose 'tiny' model size and set the language explicitly." model_size = gr.Dropdown(label="Model size", choices=['base', 'tiny', 'small', 'medium', 'large'], value='base') delay_slider = gr.inputs.Slider(minimum=1, maximum=5, default=1.2, label="Rate of transcription") available_languages = sorted(tokenizer.TO_LANGUAGE_CODE.keys()) available_languages = [lang.capitalize() for lang in available_languages] available_languages = [AUTO_DETECT_LANG]+available_languages lang_dropdown = gr.inputs.Dropdown(choices=available_languages, label="Language", default=AUTO_DETECT_LANG, type="value") if lang_dropdown==AUTO_DETECT_LANG: lang_dropdown=None translate_checkbox = gr.inputs.Checkbox(label="Translate to English", default=False) transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20) translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20) detected_lang = gr.outputs.HTML(label="Detected Language") state = gr.State({"transcription": "", "translation": ""}) gr.Interface( fn=transcribe, inputs=[ gr.Audio(source="microphone", type="filepath", streaming=True), state, model_size, delay_slider, lang_dropdown, translate_checkbox ], outputs=[ transcription_tb, translation_tb, state, detected_lang ], live=True, allow_flagging='never', title=title, description=description, ).launch( # enable_queue=True, # debug=True )