Spaces: Build error

Commit • 869e885 • 1 Parent(s): 1863902

Update app.py
app.py CHANGED
@@ -1,29 +1,35 @@
+# app.py
+
 import gradio as gr
 import warnings
 import torch
 from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
 import soundfile as sf
-
-
+import ffmpeg

 warnings.filterwarnings("ignore")

-# Load tokenizer
+# Load tokenizer, model, and processor
 tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
 model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
 processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")

-#
+# Set up device
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 torch_dtype = torch.float32

-#
+# Move model to device
 model.to(device)

+def convert_audio_format(audio_path):
+    output_path = "converted_audio.wav"
+    ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000').run(overwrite_output=True)
+    return output_path
+
 def transcribe_audio(audio_file, batch_size=4):
-
-
+    audio_path = convert_audio_format(audio_file)
+    audio_input, sample_rate = sf.read(audio_path)
+    chunk_size = 16000 * 28  # 28 seconds chunks
     chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

     transcription = ""

@@ -35,18 +41,15 @@ def transcribe_audio(audio_file, batch_size=4):
         with torch.no_grad():
             output = model.generate(
                 inputs.input_features,
-                max_length=1024,
+                max_length=1024,
                 num_beams=7,
-
-                attention_mask=attention_mask,
-                forced_decoder_ids=None,  # forced_decoder_ids must not be set
-                language="no"
+                attention_mask=attention_mask
             )
         transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

     return transcription.strip()

-# HTML |
+# HTML | Banner image
 banner_html = """
 <div style="text-align: center;">
     <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">

@@ -70,4 +73,4 @@ with iface:
     transcribe_button.click(fn=transcribe_audio, inputs=[audio_input, batch_size_input], outputs=transcription_output)

 # Launch interface
-iface.launch(share=True, debug=True)
+iface.launch(share=True, debug=True)
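
A note on the new `import ffmpeg`: that module is provided by the `ffmpeg-python` package on PyPI (a plain `pip install ffmpeg` installs an unrelated package), and at runtime it shells out to the system `ffmpeg` binary. On Hugging Face Spaces both must be declared, Python dependencies in `requirements.txt` and apt packages in `packages.txt`. The Build error badge on this commit is consistent with one of these being missing, though the build log is not shown. A sketch of the two files, assuming no other undeclared dependencies:

# requirements.txt (sketch; unpinned, exact versions are an assumption)
gradio
torch
transformers
soundfile
ffmpeg-python

# packages.txt (apt packages installed into the Space image)
ffmpeg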
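
Separately, `torch_dtype = torch.float32` is assigned but never used in any hunk shown here. If it is also unused in the hidden lines, either drop it or apply it when moving the model, for example:

# Sketch: actually apply the declared dtype
# (assumption: torch_dtype is not consumed in lines the diff hides)
model.to(device, dtype=torch_dtype)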
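
The new `convert_audio_format` writes every conversion to the fixed path "converted_audio.wav", so two concurrent Gradio requests would overwrite each other's audio. A minimal variant with a unique file per call, keeping the same ffmpeg-python pipeline:

import tempfile

def convert_audio_format(audio_path):
    # delete=False keeps the temp file on disk so ffmpeg can write into it;
    # a unique name per call avoids cross-request clobbering.
    output_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
    ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000').run(overwrite_output=True)
    return output_path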
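
The hunks skip the body of the chunk loop (lines 34-40 of the new file), where `inputs` and `attention_mask` must be built before the visible `model.generate` call. The 28-second chunks fit inside Whisper's 30-second window, and the feature extractor pads each chunk to that window, which is what the attention mask then describes. A hypothetical reconstruction, inferred only from the names used in the visible code:

# Sketch of the unseen loop body; the names (inputs, attention_mask)
# are taken from the generate() call shown in the diff.
for chunk in chunks:
    inputs = processor(chunk, sampling_rate=16000, return_tensors="pt",
                       return_attention_mask=True).to(device)
    attention_mask = inputs.attention_mask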
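
The substantive change in the second hunk is dropping `forced_decoder_ids=None` and `language="no"` from `model.generate`. The removed comment already hints at the reason: depending on the transformers version, Whisper's `generate` rejects `forced_decoder_ids`, and passing it alongside a language setting is a common source of generation errors. If Norwegian output should still be pinned rather than auto-detected, one way to restore it, assuming a recent transformers release (roughly 4.28+) that routes these kwargs through the generation config:

with torch.no_grad():
    output = model.generate(
        inputs.input_features,
        max_length=1024,
        num_beams=7,
        attention_mask=attention_mask,
        language="no",       # assumption: handled via the generation config
        task="transcribe",
    )

Also worth noting: Whisper's decoder has 448 target positions, so `max_length=1024` cannot actually be reached; with 28-second chunks the beams normally hit end-of-text well before that, but the value is misleading.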