|
import gradio as gr |
|
import numpy as np |
|
|
|
from librosa import resample |
|
from transformers import pipeline |
|
|
|
# Shared speech-recognition pipeline, built once at import time so that every
# transcription request reuses it. Long recordings are processed in 30-second
# chunks (chunk_length_s).
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
    chunk_length_s=30,
)
|
|
|
def _normalize_audio(samples):
    """Return *samples* as a mono float64 array min-max scaled to [-1.0, 1.0].

    The smallest input sample maps to -1.0 and the largest to +1.0.

    Returns ``None`` when there is nothing meaningful to scale: no samples,
    a completely flat signal, or a non-finite value range.
    """
    # Convert to float *before* any arithmetic: with fixed-width integer
    # samples (e.g. int16), ``max - min`` can exceed the dtype range and wrap.
    data = np.asarray(samples, dtype=np.float64)

    if data.ndim == 2:
        # assumes multi-channel audio is laid out as (frames, channels), which
        # is how Gradio's numpy audio is documented — average down to mono.
        data = data.mean(axis=1)

    if data.size == 0:
        return None

    low = data.min()
    span = data.max() - low
    if span == 0 or not np.isfinite(span):
        # Flat (silent) or invalid signal: scaling would divide by zero.
        return None

    return 2.0 * (data - low) / span - 1.0


def transcribe(audio_in):
    """Transcribe an audio clip to lower-case text.

    Args:
        audio_in: ``(sample_rate, samples)`` pair as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when no audio was given.

    Returns:
        The recognized text, stripped of surrounding whitespace and
        lower-cased. An empty string is returned when there is no audio,
        or when the audio is empty or completely flat.
    """
    if audio_in is None:
        return ""

    orig_sr, samples = audio_in

    normalized = _normalize_audio(samples)
    if normalized is None:
        return ""

    # The recognizer is fed 16 kHz audio regardless of the recording's rate.
    resampled = resample(normalized, orig_sr=orig_sr, target_sr=16000)

    # Hand the pipeline its own copy of the resampled buffer.
    prediction = pipe(resampled.copy(), batch_size=8)
    return prediction["text"].strip().lower()
|
|
|
|
|
# Build the web UI: a Markdown header followed by an audio-in / text-out
# interface wired to transcribe().
with gr.Blocks() as demo:
    # The trailing backslash inside the string joins the "## API ..." line with
    # the line after it, so both end up in the same heading.
    gr.Markdown("""
    # 9103H 2024F Audio Transcription.
    ## API for [whisper-base.en](https://huggingface.co/openai/whisper-base.en) english model\
    to help check [HW03](https://github.com/DM-GY-9103-2024F-H/HW03) exercises.
    """)

    gr.Interface(
        fn=transcribe,
        inputs=gr.Audio(type="numpy"),
        outputs="text",
        # One bundled sample clip, with caching of example outputs enabled.
        examples=[["./audio/plain_01.wav"]],
        cache_examples=True,
    )

# Start the server only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()
|
|