# app.py import gradio as gr import warnings import torch from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor import soundfile as sf import ffmpeg warnings.filterwarnings("ignore") # Load tokenizer, model, and processor tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium") model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium") processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium") # Set up device device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') torch_dtype = torch.float32 # Move model to device model.to(device) def convert_audio_format(audio_path): output_path = "converted_audio.wav" ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000').run(overwrite_output=True) return output_path def transcribe_audio(audio_file, batch_size=4): audio_path = convert_audio_format(audio_file) audio_input, sample_rate = sf.read(audio_path) chunk_size = 16000 * 28 # 28 seconds chunks chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)] transcription = "" for i in range(0, len(chunks), batch_size): batch_chunks = chunks[i:i + batch_size] inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True) inputs = inputs.to(device) attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None with torch.no_grad(): output = model.generate( inputs.input_features, max_length=1024, num_beams=7, attention_mask=attention_mask ) transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " " return transcription.strip() # HTML | Banner image banner_html = """