{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import wave\n",
    "\n",
    "import numpy as np\n",
    "from IPython.display import Audio\n",
    "from librosa import resample\n",
    "from transformers import pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def open_wave(wav_filename):\n",
    "    \"\"\"Read a 16-bit WAV file and down-mix it to a single channel.\n",
    "\n",
    "    Args:\n",
    "        wav_filename: Value passed to ``wave.open`` in read mode.\n",
    "\n",
    "    Returns:\n",
    "        A ``(samples, framerate)`` tuple. ``samples`` is a list of Python ints,\n",
    "        one per frame, holding the channel average truncated toward zero;\n",
    "        ``framerate`` is the value reported by ``getframerate()``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the sample width is not 2 bytes (16-bit).\n",
    "    \"\"\"\n",
    "    with wave.open(wav_filename, mode=\"rb\") as wav_in:\n",
    "        if wav_in.getsampwidth() != 2:\n",
    "            raise ValueError(\"Input not 16-bit\")\n",
    "\n",
    "        nchannels = wav_in.getnchannels()\n",
    "        framerate = wav_in.getframerate()\n",
    "        raw_bytes = wav_in.readframes(wav_in.getnframes())\n",
    "\n",
    "    pcm = np.frombuffer(raw_bytes, dtype=np.int16)\n",
    "    # Ignore a trailing partial frame so the reshape below cannot fail.\n",
    "    usable = pcm.size - pcm.size % nchannels\n",
    "    frames = pcm[:usable].reshape(-1, nchannels)\n",
    "    # Dividing first promotes to float64, so summing the channels cannot overflow int16;\n",
    "    # the cast to int64 truncates toward zero, like int() on a float.\n",
    "    mono = (frames / nchannels).sum(axis=1).astype(np.int64)\n",
    "\n",
    "    return (mono.tolist(), framerate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base\", chunk_length_s=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transcribe(samples, orig_sr=44100, target_sr=16000):\n",
    "    \"\"\"Transcribe audio samples with the notebook-level ``pipe``.\n",
    "\n",
    "    The samples are min-max rescaled to ``[-1.0, 1.0]``, resampled from\n",
    "    ``orig_sr`` to ``target_sr`` and handed to ``pipe`` as a raw array.\n",
    "\n",
    "    Args:\n",
    "        samples: Non-empty sequence of numeric audio samples.\n",
    "        orig_sr: Sampling rate of ``samples``.\n",
    "        target_sr: Sampling rate to resample to before calling ``pipe``.\n",
    "            NOTE(review): the array is passed without a rate, so this presumably\n",
    "            has to match the rate the model expects -- confirm.\n",
    "\n",
    "    Returns:\n",
    "        The ``\"text\"`` entry of the prediction, stripped and lower-cased.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``samples`` is empty.\n",
    "    \"\"\"\n",
    "    samples_arr = np.asarray(samples, dtype=np.float64)\n",
    "    if samples_arr.size == 0:\n",
    "        raise ValueError(\"samples must not be empty\")\n",
    "\n",
    "    min_s, max_s = samples_arr.min(), samples_arr.max()\n",
    "    span = max_s - min_s\n",
    "    if span == 0:\n",
    "        # Constant signal: the rescale would compute 0/0 and yield NaNs, so use zeros.\n",
    "        samples_f = np.zeros_like(samples_arr)\n",
    "    else:\n",
    "        samples_f = 2.0 * (samples_arr - min_s) / span - 1.0\n",
    "\n",
    "    resamples = resample(samples_f, orig_sr=orig_sr, target_sr=target_sr)\n",
    "    # NOTE(review): the copy is kept from the original code; presumably the\n",
    "    # pipeline needs its own buffer -- confirm before removing.\n",
    "    prediction = pipe(resamples.copy(), batch_size=8)\n",
    "    return prediction[\"text\"].strip().lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "samples, sr = open_wave(\"./audio/plain_01.wav\")\n",
    "display(Audio(samples, rate=sr))\n",
    "transcribe(samples, sr)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "gradio",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}