"""Gradio app that transcribes an uploaded audio file with NB-Whisper (medium)."""

import warnings

import gradio as gr
# NOTE(review): the ZeroGPU decorator is normally shipped in the standalone
# `spaces` package (`import spaces`); confirm this import resolves in the
# deployed environment.
from huggingface_hub import spaces
import numpy as np
import soundfile as sf
import torch
from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor

warnings.filterwarnings("ignore")

# Single source of truth for the checkpoint and the rate the audio is fed at.
MODEL_ID = "NbAiLabBeta/nb-whisper-medium"
TARGET_SAMPLE_RATE = 16000

# Load tokenizer + model
# NOTE(review): `tokenizer` is not referenced anywhere in this file; decoding
# goes through `processor`. Kept in case other code relies on the name.
tokenizer = WhisperTokenizer.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
processor = WhisperProcessor.from_pretrained(MODEL_ID)

# set up device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32  # NOTE(review): currently unused in this file.

# move model to device
model.to(device)


def _to_mono_16k(samples, sample_rate):
    """Downmix ``(frames, channels)`` audio to mono at ``TARGET_SAMPLE_RATE``.

    Args:
        samples: 2-D float array shaped ``(frames, channels)``.
        sample_rate: Sample rate of ``samples`` in Hz.

    Returns:
        1-D float32 array sampled at ``TARGET_SAMPLE_RATE``.
    """
    mono = samples.mean(axis=1)
    if sample_rate == TARGET_SAMPLE_RATE or mono.size == 0:
        return mono

    # Linear interpolation keeps the time scale correct without adding an
    # audio dependency. It applies no anti-aliasing filter, so swap in a
    # dedicated resampler if one becomes available in the environment.
    n_out = max(1, round(mono.size * TARGET_SAMPLE_RATE / sample_rate))
    src_times = np.arange(mono.size) / sample_rate
    dst_times = np.arange(n_out) / TARGET_SAMPLE_RATE
    return np.interp(dst_times, src_times, mono).astype(np.float32)


@spaces.GPU
def transcribe_audio(audio_file):
    """Transcribe the audio file at ``audio_file`` and return the text.

    The file is read with ``soundfile``, downmixed to mono and resampled to
    ``TARGET_SAMPLE_RATE`` before being handed to the processor, so the
    ``sampling_rate`` passed to the processor matches the actual data.

    Args:
        audio_file: Path to the audio file, as provided by
            ``gr.Audio(type="filepath")``; falsy when nothing was uploaded.

    Returns:
        The decoded transcription of the first (only) generated sequence.

    Raises:
        gr.Error: If no file was provided or the file holds no samples.
    """
    if not audio_file:
        raise gr.Error("Please upload or record an audio file first.")

    # always_2d gives a uniform (frames, channels) shape for mono and stereo.
    samples, sample_rate = sf.read(audio_file, dtype="float32", always_2d=True)
    audio_input = _to_mono_16k(samples, sample_rate)
    if audio_input.size == 0:
        raise gr.Error("The audio file contains no samples.")

    # NOTE(review): the processor is called with its default padding and
    # truncation settings; confirm how long recordings should be handled.
    inputs = processor(
        audio_input,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
    )
    inputs = inputs.to(device)

    with torch.no_grad():
        output = model.generate(
            inputs.input_features,
            max_length=448,
            num_beams=5,
            task="transcribe",
            language="no",
        )

    transcription = processor.batch_decode(output, skip_special_tokens=True)[0]
    return transcription


# HTML for banner image
banner_html = """
Banner
"""

# Gradio interface
iface = gr.Blocks()

with iface:
    gr.HTML(banner_html)
    gr.Markdown("# Audio Transcription App\nUpload an audio file to get the transcription")
    audio_input = gr.Audio(type="filepath")
    transcription_output = gr.Textbox()
    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)

# Launch the interface
iface.launch(share=True, debug=True)