import gradio as gr
import warnings
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import soundfile as sf



warnings.filterwarnings("ignore")

# Load model + processor (the processor already bundles the tokenizer)
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")

# Set up device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device
model.to(device)


def transcribe_audio(audio_file, batch_size=4):
    # Whisper models expect 16 kHz mono audio
    audio_input, sample_rate = sf.read(audio_file)
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)  # downmix multi-channel audio to mono

    chunk_size = 16000 * 28  # 28-second chunks at 16 kHz; note a hard cut can split a word at a boundary
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
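    # If the source file is not 16 kHz, the fixed chunking above and the
    # processor call below are miscalibrated. A minimal resampling sketch to
    # run right after the sf.read call, assuming scipy is installed (it is
    # not a dependency of this app):
    #
    #   from scipy.signal import resample_poly
    #   if sample_rate != 16000:
    #       audio_input = resample_poly(audio_input, 16000, sample_rate)
    #       sample_rate = 16000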

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask if "attention_mask" in inputs else None  # already moved to device above
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder supports at most 448 token positions
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask,
                forced_decoder_ids=None,  # clear any preset prompt so language/task take effect
                language="no"
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    return transcription.strip()
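
# Standalone usage example (a sketch; the file path is hypothetical):
#   text = transcribe_audio("sample.wav", batch_size=4)
#   print(text)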

# HTML banner image
banner_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
<div style="text-align: center; margin-top: 20px;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="Additional Image" width="50%" height="auto">
</div>
"""

# Gradio interface
iface = gr.Blocks()

with iface:
    gr.HTML(banner_html)
    gr.Markdown("# 𝐍𝐯𝐒𝐝𝐒𝐚 π€πŸπŸŽπŸŽ πŸ‘‹πŸΌπŸ‘ΎπŸ¦Ύβš‘ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file:β˜•")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, value=4, label="Batch Size")
    transcription_output = gr.Textbox()
    transcribe_button = gr.Button("Transcribe")

    transcribe_button.click(fn=transcribe_audio, inputs=[audio_input, batch_size_input], outputs=transcription_output)

# Launch interface
iface.launch(share=True, debug=True)
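
# To run locally (a sketch; assumes this script is saved as app.py):
#   pip install gradio torch transformers soundfile
#   python app.py
# share=True additionally creates a temporary public gradio.live link;
# debug=True keeps the process attached and prints errors to the console.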