File size: 2,256 Bytes
8cb8264
592f7e1
 
 
 
8cb8264
592f7e1
8cb8264
592f7e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import warnings
import torch
from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
import soundfile as sf

warnings.filterwarnings("ignore")

# Hub checkpoint shared by the tokenizer, model and processor below.
MODEL_ID = "NbAiLabBeta/nb-whisper-medium"

# Load tokenizer and model
tokenizer = WhisperTokenizer.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
processor = WhisperProcessor.from_pretrained(MODEL_ID)

# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# The transcription function sends its input features to an accelerator when
# one is available, so the model weights have to live on the same device;
# otherwise generation fails with a device-mismatch error on CUDA hosts.
model.to(device)

# Earlier pipeline-based variant, kept for reference only.
# NOTE: `pipeline` is not imported in this file, so `from transformers import
# pipeline` would be needed before re-enabling it.
#asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, device=device, torch_dtype=torch_dtype)

#def transcribe_audio(audio_file):
    #with torch.no_grad():
        #output = asr(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"})
    #return output["text"]

# Sample rate handed to the Whisper processor for every chunk.
_TARGET_SAMPLE_RATE = 16000
# Window length in seconds (the value the original code passed as
# `chunk_length_s`); each window is transcribed independently.
_CHUNK_LENGTH_S = 28


def _load_mono_16k(audio_file):
    """Read *audio_file* and return a 1-D float32 waveform at 16 kHz.

    Multi-channel audio is averaged down to mono, and audio recorded at any
    other rate is resampled so that the rate later declared to the processor
    matches the actual data.
    """
    # always_2d gives a (frames, channels) array even for mono files, so the
    # channel average below works for every input.
    frames, sample_rate = sf.read(audio_file, dtype="float32", always_2d=True)
    samples = torch.from_numpy(frames).mean(dim=1)

    if sample_rate != _TARGET_SAMPLE_RATE and samples.numel() > 0:
        target_len = max(1, round(samples.numel() * _TARGET_SAMPLE_RATE / sample_rate))
        # Linear interpolation needs nothing beyond torch, which this file
        # already imports. NOTE(review): a band-limited resampler (e.g.
        # torchaudio or librosa) would give cleaner results if one of those
        # dependencies is acceptable.
        samples = torch.nn.functional.interpolate(
            samples[None, None, :],
            size=target_len,
            mode="linear",
            align_corners=False,
        )[0, 0]

    return samples.numpy()


def transcribe_audio(audio_file):
    """Transcribe an audio file to Norwegian text with the loaded Whisper model.

    Args:
        audio_file: Path to an audio file readable by ``soundfile``. ``None``
            or an empty path (nothing uploaded) yields an empty string.

    Returns:
        The transcription as a single string; the text of consecutive
        28-second windows is joined with single spaces.
    """
    if not audio_file:
        return ""

    samples = _load_mono_16k(audio_file)
    window = _CHUNK_LENGTH_S * _TARGET_SAMPLE_RATE

    pieces = []
    with torch.no_grad():
        # `chunk_length_s` is an option of the ASR pipeline, not of
        # `model.generate`, so long recordings are split into windows here
        # and each window is decoded on its own.
        for start in range(0, len(samples), window):
            features = processor(
                samples[start:start + window],
                sampling_rate=_TARGET_SAMPLE_RATE,
                return_tensors="pt",
            ).input_features
            # Follow the model's actual placement so inputs and weights
            # always end up on the same device.
            features = features.to(model.device)
            token_ids = model.generate(
                features,
                max_length=448,
                num_beams=5,
                task="transcribe",
                language="no",
            )
            text = processor.batch_decode(token_ids, skip_special_tokens=True)[0].strip()
            if text:
                pieces.append(text)

    return " ".join(pieces)

# HTML for banner image
banner_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/work_harder/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%; height:auto;">
</div>
"""

# Create Gradio interface: a Blocks page holding the banner with the
# transcription form rendered beneath it.
with gr.Blocks() as iface:
    gr.HTML(banner_html)
    # The former `layout="vertical"` argument is dropped: it is a deprecated
    # Interface option that no longer affects rendering, and passing it risks
    # a warning or an unexpected-keyword error depending on the Gradio version.
    gr.Interface(
        fn=transcribe_audio,
        inputs=gr.Audio(type="filepath"),
        outputs="text",
        title="Audio Transcription App",
        description="Upload an audio file to get the transcription",
        theme="default",
        live=False,
    )

# Launch the interface
iface.launch(share=True, debug=True)