import gradio as gr
import numpy as np
from librosa import resample
from transformers import pipeline

# Whisper ASR pipeline; chunk_length_s lets it transcribe clips longer than 30 s.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    chunk_length_s=30,
)


def transcribe(audio_in):
    """Transcribe a Gradio audio tuple into lowercase text.

    Args:
        audio_in: ``(sample_rate, samples)`` as produced by
            ``gr.Audio(type="numpy")``; samples are typically int16 and may
            be mono ``(n,)`` or multi-channel ``(n, channels)``.

    Returns:
        The Whisper transcription, stripped and lowercased.
    """
    orig_sr, samples = audio_in
    samples = np.asarray(samples, dtype=np.float32)

    # Gradio can deliver stereo as (n_samples, n_channels); downmix to mono
    # since the ASR pipeline expects a 1-D waveform.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    min_s = samples.min()
    max_s = samples.max()
    range_in = max_s - min_s
    if range_in == 0:
        # Constant (e.g. silent) signal: the min/max normalization below
        # would divide by zero, so feed zeros instead.
        samples_f = np.zeros_like(samples)
    else:
        # Linearly map [min_s, max_s] -> [-1.0, 1.0], the range Whisper expects.
        samples_f = 2.0 * (samples - min_s) / range_in - 1.0

    # Whisper models are trained on 16 kHz audio.
    resamples = resample(samples_f, orig_sr=orig_sr, target_sr=16000)
    prediction = pipe(resamples.copy(), batch_size=8)
    return prediction["text"].strip().lower()


with gr.Blocks() as demo:
    gr.Markdown("""
    # 9103H 2024F Audio Transcription.
    ## API for [whisper-base.en](https://huggingface.co/openai/whisper-base.en) english model\
    to help check [HW03](https://github.com/DM-GY-9103-2024F-H/HW03) exercises.
    """)

    gr.Interface(
        transcribe,
        inputs=gr.Audio(type="numpy"),
        outputs="text",
        cache_examples=True,
        examples=[["./audio/plain_01.wav"]],
    )

if __name__ == "__main__":
    demo.launch()