import gradio as gr
import warnings
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import soundfile as sf
warnings.filterwarnings("ignore")
# Load model and processor (the processor bundles the feature extractor and tokenizer)
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
# Set up device and dtype (float32 works on both CPU and GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
# Move model to device
model.to(device, dtype=torch_dtype)
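# Optional variant (a sketch, not enabled here): on CUDA the model could run in
# half precision to roughly halve GPU memory, but the features produced in
# transcribe_audio would then need casting to the same dtype, e.g.:
#   torch_dtype = torch.float16
#   model.to(device, dtype=torch_dtype)
#   input_features = inputs.input_features.to(device, dtype=torch_dtype)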
def transcribe_audio(audio_file, batch_size=4):
    audio_input, sample_rate = sf.read(audio_file)
    chunk_size = 16000 * 28  # 28-second chunks seem to work best (Whisper's window is 30 s)
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder supports at most 448 target positions
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask,
                forced_decoder_ids=None,  # must stay unset when task/language are passed
                language="no"
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
    return transcription.strip()
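# The chunking above assumes mono 16 kHz audio, but sf.read returns whatever
# rate and channel layout the file was encoded with. A minimal helper, assuming
# torchaudio is available (an added dependency, not part of the original app),
# could normalize (audio_input, sample_rate) before chunking. Defined as a
# sketch; it is not wired into transcribe_audio.
def resample_to_16k(audio, sr):
    import torchaudio.functional as AF  # assumed extra dependency
    waveform = torch.as_tensor(audio, dtype=torch.float32)
    if waveform.ndim > 1:  # multi-channel -> mono by averaging channels
        waveform = waveform.mean(dim=1)
    if sr != 16000:
        waveform = AF.resample(waveform, orig_freq=sr, new_freq=16000)
    return waveform.numpy(), 16000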
# HTML banner image
banner_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
<div style="text-align: center; margin-top: 20px;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="Additional Image" width="50%" height="auto">
</div>
"""
# Gradio interface
iface = gr.Blocks()
with iface:
    gr.HTML(banner_html)
    gr.Markdown("# Switch Work 🎼🎾🦾⚡ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file:")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
    transcription_output = gr.Textbox()
    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(fn=transcribe_audio, inputs=[audio_input, batch_size_input], outputs=transcription_output)
# Launch interface
iface.launch(share=True, debug=True)
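# Alternative worth noting: the manual chunk/batch loop above could be replaced
# by the transformers ASR pipeline, which handles long-form chunking internally.
# Left commented out so the app does not load a second model at startup:
#   from transformers import pipeline
#   asr = pipeline(
#       "automatic-speech-recognition",
#       model="NbAiLabBeta/nb-whisper-medium",
#       chunk_length_s=28,
#       device=0 if torch.cuda.is_available() else -1,
#   )
#   text = asr("audio.wav", generate_kwargs={"task": "transcribe", "language": "no"})["text"]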