nb / app.py
camparchimedes's picture
Update app.py
6de75ee verified
raw
history blame
2.85 kB
import gradio as gr
import warnings
import torch
from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
import soundfile as sf
# Suppress every Python warning raised from this point on.
warnings.filterwarnings("ignore")

# Choose the compute device first: CUDA when torch reports one, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): torch_dtype is assigned but not referenced in the visible code.
torch_dtype = torch.float32

# Tokenizer, processor and model all come from the same checkpoint id.
_CHECKPOINT = "NbAiLabBeta/nb-whisper-medium"
tokenizer = WhisperTokenizer.from_pretrained(_CHECKPOINT)
processor = WhisperProcessor.from_pretrained(_CHECKPOINT)
# Load the weights and move them onto the selected device in one step.
model = WhisperForConditionalGeneration.from_pretrained(_CHECKPOINT).to(device)
def _to_mono_16k(samples, sample_rate, target_rate=16000):
    """Return *samples* as a 1-D waveform sampled at *target_rate* Hz.

    Multi-channel input (frames x channels) is averaged down to one channel.
    Resampling uses SciPy's polyphase filter when SciPy is importable and
    falls back to plain linear interpolation otherwise.
    """
    import numpy as np

    if samples.ndim > 1:
        # Average the channels so the model receives a single waveform.
        samples = samples.mean(axis=1)
    if sample_rate == target_rate or len(samples) == 0:
        return samples

    try:
        from scipy.signal import resample_poly
    except ImportError:
        # Fallback without anti-alias filtering; adequate for speech input.
        new_len = max(1, int(round(len(samples) * target_rate / sample_rate)))
        old_times = np.arange(len(samples)) / sample_rate
        new_times = np.arange(new_len) / target_rate
        return np.interp(new_times, old_times, samples)

    from math import gcd

    common = gcd(int(sample_rate), int(target_rate))
    return resample_poly(samples, int(target_rate) // common, int(sample_rate) // common)


def transcribe_audio(audio_file, batch_size=4):
    """Transcribe an audio file with the module-level Whisper model.

    The file is read with ``soundfile``, converted to mono 16 kHz, cut into
    28-second chunks and decoded batch by batch with beam search.

    Args:
        audio_file: Path of the audio file to read. ``None`` or an empty
            string (nothing uploaded) returns an empty transcription.
        batch_size: Number of chunks sent to ``model.generate`` per call.
            Coerced to an integer of at least 1, since a UI slider may
            deliver a float.

    Returns:
        The decoded chunk texts joined by single spaces, with leading and
        trailing whitespace stripped.
    """
    if not audio_file:
        return ""

    target_rate = 16000
    raw_samples, sample_rate = sf.read(audio_file)
    # Chunking and the processor call below both assume mono audio at
    # 16 kHz, so normalise whatever the file actually contains first.
    waveform = _to_mono_16k(raw_samples, sample_rate, target_rate)

    chunk_size = target_rate * 28  # 28 seconds chunks, seems to work best
    chunks = [waveform[start:start + chunk_size] for start in range(0, len(waveform), chunk_size)]

    batch_size = max(1, int(batch_size))
    # The decoder cannot index positions beyond its configured maximum, so
    # never request more tokens than that.
    max_length = min(1024, getattr(model.config, "max_target_positions", 1024))

    pieces = []
    for start in range(0, len(chunks), batch_size):
        batch_chunks = chunks[start:start + batch_size]
        # Pad every chunk to the feature extractor's fixed window length;
        # padding only to the longest chunk in the batch can yield feature
        # windows shorter than the Whisper encoder expects.
        inputs = processor(batch_chunks, sampling_rate=target_rate, return_tensors="pt", padding="max_length")
        inputs = inputs.to(device)
        attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=max_length,
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask,
                forced_decoder_ids=None,  # forced_decoder_ids must not be set
                language="no",
            )
        pieces.extend(processor.batch_decode(output, skip_special_tokens=True))
    return " ".join(pieces).strip()
# HTML banner markup: two centered <div> blocks, each holding one image
# served from a Hugging Face Space (a wide banner at 87% width, then a
# second picture at 50% width with a 20px top margin).
banner_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
<div style="text-align: center; margin-top: 20px;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="Additional Image" width="50%" height="auto">
</div>
"""
# Gradio interface: banner, header, an audio upload, a batch-size slider and
# a button that passes both inputs to transcribe_audio and shows its result.
iface = gr.Blocks()
with iface:
    gr.HTML(banner_html)
    # NOTE(review): the header glyphs were mis-decoded UTF-8 (mojibake) and
    # have been restored to the most plausible original; confirm the text.
    gr.Markdown("# 𝐍𝐯𝐢𝐝𝐢𝐚 𝐀𝟏𝟎𝟎 👋🏼👾🦾⚡ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file:☕")
    # type="filepath" hands the callback a path string rather than raw samples.
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
    transcription_output = gr.Textbox()
    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(fn=transcribe_audio, inputs=[audio_input, batch_size_input], outputs=transcription_output)

# Launch interface
iface.launch(share=True, debug=True)