Spaces:
Build error
Build error
camparchimedes
committed on
Commit
•
14c8f51
1
Parent(s):
fbecba6
Update app.py
Browse files
app.py
CHANGED
@@ -7,9 +7,11 @@ from transformers import WhisperTokenizer, WhisperForConditionalGeneration, Whis
|
|
7 |
import soundfile as sf
|
8 |
import ffmpeg
|
9 |
import os
|
|
|
10 |
from huggingface_hub import InferenceClient
|
11 |
from gradio_client import Client, file
|
12 |
import spaces
|
|
|
13 |
|
14 |
warnings.filterwarnings("ignore")
|
15 |
|
@@ -30,9 +32,9 @@ def convert_audio_format(audio_path):
|
|
30 |
ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000').run(overwrite_output=True)
|
31 |
return output_path
|
32 |
|
33 |
-
|
34 |
@spaces.GPU(duration=120, queue=False)
|
35 |
def transcribe_audio(audio_file, batch_size=4):
|
|
|
36 |
audio_path = convert_audio_format(audio_file)
|
37 |
audio_input, sample_rate = sf.read(audio_path)
|
38 |
chunk_size = 16000 * 28 # 28 seconds chunks
|
@@ -47,40 +49,38 @@ def transcribe_audio(audio_file, batch_size=4):
|
|
47 |
with torch.no_grad():
|
48 |
output = model.generate(
|
49 |
inputs.input_features,
|
50 |
-
max_length=
|
51 |
num_beams=7,
|
52 |
attention_mask=attention_mask
|
53 |
)
|
54 |
transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
|
55 |
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
# HTML
|
59 |
banner_html = """
|
60 |
<div style="text-align: center;">
|
61 |
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
|
62 |
</div>
|
|
|
|
|
63 |
<div style="text-align: center; margin-top: 20px;">
|
64 |
-
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/
|
65 |
</div>
|
66 |
"""
|
67 |
|
68 |
-
images_path = os.path.dirname(__file__)
|
69 |
-
IMAGES = [
|
70 |
-
[
|
71 |
-
{
|
72 |
-
"text": "What usual stuff happens in this image? :)",
|
73 |
-
"files": [f"{images_path}/500x_picture.png"],
|
74 |
-
}
|
75 |
-
]
|
76 |
-
]
|
77 |
-
|
78 |
# Gradio interface
|
79 |
iface = gr.Blocks()
|
80 |
|
81 |
with iface:
|
82 |
gr.HTML(banner_html)
|
83 |
-
gr.Markdown("# ππ―π’ππ’π ππππ ππΌπΎπ¦Ύβ‘ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file
|
84 |
audio_input = gr.Audio(type="filepath")
|
85 |
batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
|
86 |
transcription_output = gr.Textbox()
|
|
|
7 |
import soundfile as sf
|
8 |
import ffmpeg
|
9 |
import os
|
10 |
+
from PIL import Image
|
11 |
from huggingface_hub import InferenceClient
|
12 |
from gradio_client import Client, file
|
13 |
import spaces
|
14 |
+
import time
|
15 |
|
16 |
warnings.filterwarnings("ignore")
|
17 |
|
|
|
32 |
ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000').run(overwrite_output=True)
|
33 |
return output_path
|
34 |
|
|
|
35 |
@spaces.GPU(duration=120, queue=False)
|
36 |
def transcribe_audio(audio_file, batch_size=4):
|
37 |
+
start_time = time.time()
|
38 |
audio_path = convert_audio_format(audio_file)
|
39 |
audio_input, sample_rate = sf.read(audio_path)
|
40 |
chunk_size = 16000 * 28 # 28 seconds chunks
|
|
|
49 |
with torch.no_grad():
|
50 |
output = model.generate(
|
51 |
inputs.input_features,
|
52 |
+
max_length=2048,
|
53 |
num_beams=7,
|
54 |
attention_mask=attention_mask
|
55 |
)
|
56 |
transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
|
57 |
|
58 |
+
end_time = time.time()
|
59 |
+
transcription_time = end_time - start_time
|
60 |
+
word_count = len(transcription.split())
|
61 |
+
|
62 |
+
result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
|
63 |
+
|
64 |
+
return result
|
65 |
|
66 |
# HTML
|
67 |
banner_html = """
|
68 |
<div style="text-align: center;">
|
69 |
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
|
70 |
</div>
|
71 |
+
"""
|
72 |
+
image_html = """
|
73 |
<div style="text-align: center; margin-top: 20px;">
|
74 |
+
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/500x_picture.png" alt="picture" width="50%" height="auto">
|
75 |
</div>
|
76 |
"""
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
# Gradio interface
|
79 |
iface = gr.Blocks()
|
80 |
|
81 |
with iface:
|
82 |
gr.HTML(banner_html)
|
83 |
+
gr.Markdown("# ππ―π’ππ’π ππππ ππΌπΎπ¦Ύβ‘ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file: β")
|
84 |
audio_input = gr.Audio(type="filepath")
|
85 |
batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
|
86 |
transcription_output = gr.Textbox()
|