In [None]:
import numpy as np
import wave

from librosa import resample
from IPython.display import Audio
from transformers import pipeline

In [None]:
def open_wave(wav_filename):
  """Read a 16-bit PCM WAV file and mix it down to a single channel.

  Every output sample is the mean of the channel samples of one frame,
  truncated toward zero to an integer.

  Args:
    wav_filename: Path or file-like object, passed straight to `wave.open`.

  Returns:
    A `(samples, framerate)` tuple. `samples` is a list of Python ints, one
    per frame; `framerate` is the value reported by `getframerate()`.

  Raises:
    ValueError: If the sample width is not 2 bytes (i.e. not 16-bit audio).
  """
  with wave.open(wav_filename, mode="rb") as wav_in:
    if wav_in.getsampwidth() != 2:
      raise ValueError("Input not 16-bit")

    nchannels = wav_in.getnchannels()
    framerate = wav_in.getframerate()
    raw = wav_in.readframes(wav_in.getnframes())

  # Size everything from the bytes actually returned rather than the header's
  # frame count: readframes() may return fewer frames than requested, and a
  # trailing partial frame cannot be mixed down.
  frame_bytes = 2 * nchannels
  nframes = len(raw) // frame_bytes
  pcm = np.frombuffer(raw[: nframes * frame_bytes], dtype=np.int16)
  frames = pcm.reshape(nframes, nchannels)

  # Sum the channels as 64-bit integers first (no int16 overflow, no
  # per-sample rounding), then divide once and truncate toward zero, which is
  # what int() does on a float.
  channel_sums = frames.sum(axis=1, dtype=np.int64)
  samples = np.trunc(channel_sums / nchannels).astype(np.int64).tolist()

  return (samples, framerate)

In [None]:
# Build the speech-recognition pipeline once at cell level so later cells can
# reuse it through the module-level name `pipe`.
# NOTE(review): chunk_length_s=30 presumably makes the pipeline process long
# audio in 30-second chunks; confirm against the transformers docs.
pipe = pipeline(
  task="automatic-speech-recognition",
  model="openai/whisper-base",
  chunk_length_s=30,
)

In [None]:
def transcribe(samples, orig_sr=44100, target_sr=16000):
  """Transcribe a mono sample sequence with the module-level `pipe`.

  The samples are min-max scaled to the range [-1.0, 1.0], resampled from
  `orig_sr` to `target_sr`, and handed to `pipe`. The "text" entry of the
  result is returned stripped and lower-cased.

  Args:
    samples: Non-empty sequence (list or array) of numeric samples.
    orig_sr: Sampling rate of `samples`, forwarded to `resample`.
    target_sr: Sampling rate to resample to before calling `pipe`.

  Returns:
    The transcription as a stripped, lower-case string.

  Raises:
    ValueError: If `samples` is empty.
  """
  signal = np.asarray(samples, dtype=np.float64)
  if signal.size == 0:
    raise ValueError("transcribe() needs at least one sample")

  min_s, max_s = signal.min(), signal.max()
  span = max_s - min_s
  if span == 0:
    # A constant signal (e.g. digital silence) has no range to scale by;
    # dividing by the zero span would fill the buffer with NaN. Treat it as
    # silence instead.
    samples_f = np.zeros_like(signal)
  else:
    samples_f = 2.0 * (signal - min_s) / span - 1.0

  resamples = resample(samples_f, orig_sr=orig_sr, target_sr=target_sr)
  prediction = pipe(resamples.copy(), batch_size=8)
  return prediction["text"].strip().lower()

In [None]:
# Load the example clip as mono samples together with its sampling rate.
samples, sr = open_wave(wav_filename="./audio/plain_01.wav")

# Show an inline audio player for the clip.
display(Audio(data=samples, rate=sr))

# Bare last expression so the notebook renders the transcription as the
# cell's output.
transcribe(samples, orig_sr=sr)