import gradio as gr import warnings import torch from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor import soundfile as sf warnings.filterwarnings("ignore") # Load tokenizer + model tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium") model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium") processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium") # set up device device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') torch_dtype = torch.float32 # move model to device model.to(device) def transcribe_audio(audio_file): audio_input, sample_rate = sf.read(audio_file) chunk_size = 16000 * 28 # 28 seconds chunks, seems to work best chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)] transcription = "" for chunk in chunks: inputs = processor(chunk, sampling_rate=16000, return_tensors="pt") inputs = inputs.to(device) with torch.no_grad(): output = model.generate( inputs.input_features, max_length=1024, # Increase max_length@longer outputs num_beams=5, task="transcribe", language="no" ) transcription += processor.batch_decode(output, skip_special_tokens=True)[0] + " " return transcription.strip() # HTML |banner image banner_html = """