import io
import logging
import time

import accelerate
import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

logger = logging.getLogger(__name__)

# Checkpoint shared by the model and the processor.
MODEL_ID = "openai/whisper-large-v3"

# Pick device and dtype together so a CPU-only host gets a working float32
# model instead of failing on an unconditional move to "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

# Instantiate the model object and move it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    pretrained_model_name_or_path=MODEL_ID,
    torch_dtype=TORCH_DTYPE,
    use_safetensors=True,
).to(DEVICE)

# Instantiate the processor (provides the tokenizer and feature extractor).
processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path=MODEL_ID)

# Build the speech-recognition pipeline around the model loaded above.
# Passing the model object (rather than the checkpoint name) avoids loading
# the weights a second time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=TORCH_DTYPE,
    device=DEVICE,
)

# Message shown in the textbox whenever no transcription could be produced.
_ERROR_MESSAGE = "Error processing audio: Please start recording!"


def convert(audio, state=""):
    """Transcribe a recording and append the text to the running transcript.

    Used as the callback of the Gradio interface defined below.

    Parameters:
    - audio: path of the recorded audio file (the Audio component is
      configured with type="filepath"), or None when nothing was recorded.
    - state: text accumulated by previous conversions; None or "" means
      there is no transcript yet.

    Returns:
    A (textbox_text, new_state) tuple. On success both entries are the
    updated transcript. On failure the first entry is an error message and
    the state is returned without new text.
    """
    # The state may arrive as None on the first call; treat it as empty so
    # the concatenation below cannot raise.
    state = state or ""

    # NOTE(review): fixed 3 s delay kept from the original; presumably it
    # throttles live updates -- confirm it is still wanted.
    time.sleep(3)

    # Nothing recorded yet: report it without invoking the pipeline.
    if audio is None:
        return _ERROR_MESSAGE, state

    try:
        result = pipe(audio)
    except Exception:
        # UI boundary: keep the app responsive, but record the real cause
        # instead of discarding it.
        logger.exception("Speech-to-text conversion failed")
        return _ERROR_MESSAGE, state

    state += result["text"] + " "
    return state, state


# Instantiate the Gradio interface.
gr_interface = gr.Interface(
    fn=convert,
    title="Automatic Speech-to-Text",
    description="### Record your speech and watch it get converted to text!",
    inputs=[
        gr.Audio(
            label="Please Record Your Speech Here!",
            sources=["microphone"],
            type="filepath",
        ),
        "state",
    ],
    outputs=["textbox", "state"],
    # NOTE(review): "dark" is kept as in the original; verify that it is a
    # theme name the installed Gradio version recognises.
    theme="dark",
    live=True,
)

# Launch the app with default settings. No public share link is created;
# pass share=True to launch() if one is wanted.
gr_interface.launch()