Thiago Hersan committed
Commit 5c641bc β€’ 0 Parent(s)

initial commit

.gitattributes ADDED
@@ -0,0 +1,10 @@
+ audio/plain_03.wav filter=lfs diff=lfs merge=lfs -text
+ audio/plain_04.wav filter=lfs diff=lfs merge=lfs -text
+ audio/plain_05.wav filter=lfs diff=lfs merge=lfs -text
+ audio/secret_01.wav filter=lfs diff=lfs merge=lfs -text
+ audio/secret_02.wav filter=lfs diff=lfs merge=lfs -text
+ audio/plain_02.wav filter=lfs diff=lfs merge=lfs -text
+ audio/plain_04b.wav filter=lfs diff=lfs merge=lfs -text
+ audio/secret_03.wav filter=lfs diff=lfs merge=lfs -text
+ audio/secret_04.wav filter=lfs diff=lfs merge=lfs -text
+ audio/plain_01.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
+ .DS_S*
+ __pycache__/
+ gradio_cached_examples/
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: 9103H 2024F whisper-base-en-gradio
+ emoji: πŸ”ŠπŸ“
+ colorFrom: indigo
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 4.44.0
+ app_file: app.py
+ pinned: false
+ ---
app.py ADDED
@@ -0,0 +1,40 @@
+ import gradio as gr
+ import numpy as np
+
+ from librosa import resample
+ from transformers import pipeline
+
+ pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", chunk_length_s=30)
+
+ def transcribe(audio_in):
+     # gradio's numpy audio type is a (sample_rate, samples) tuple
+     orig_sr, samples = audio_in
+     # rescale the integer samples to floats in [-1.0, 1.0]
+     min_s, max_s = min(samples), max(samples)
+     range_in = (max_s - min_s)
+     samples_scl = np.array(samples) / range_in
+     min_scl = min_s / range_in
+     samples_f = 2.0 * (samples_scl - min_scl) - 1.0
+     # whisper expects 16kHz mono input
+     resamples = resample(samples_f, orig_sr=orig_sr, target_sr=16000)
+     prediction = pipe(resamples.copy(), batch_size=8)
+     return prediction["text"].strip().lower()
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("""
+     # 9103H 2024F Audio Transcription.
+     ## API for the [whisper-base.en](https://huggingface.co/openai/whisper-base.en) English model\
+     to help check [HW03](https://github.com/DM-GY-9103-2024F-H/HW03) exercises.
+     """)
+
+     gr.Interface(
+         transcribe,
+         inputs=gr.Audio(type="numpy"),
+         outputs="text",
+         cache_examples=True,
+         examples=[["./audio/plain_01.wav"]]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
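
Note: once this Space is running, the transcribe endpoint can also be called from Python. A minimal sketch using the gradio_client package; the Space id below is hypothetical, and newer gradio_client versions may require wrapping the path in handle_file():

from gradio_client import Client

client = Client("user/whisper-base-en-gradio")  # hypothetical Space id
# gr.Interface inside Blocks exposes its endpoint as /predict by default
result = client.predict("./audio/plain_01.wav", api_name="/predict")
print(result)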
audio/plain_01.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4db5391e8429e21d7c19f05c6d551e01fe168186c91d1debb055c0305e8f84f
+ size 176440
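
Note: each .wav tracked in .gitattributes is stored in git as a small pointer stub like the three lines above; the real bytes live in LFS storage. A hedged helper (illustrative, not part of this repo) to spot a file that was checked out without git-lfs installed:

def is_lfs_pointer(path):
    # genuine LFS pointer files begin with this exact version line
    with open(path, "rb") as f:
        return f.read(42).startswith(b"version https://git-lfs.github.com/spec/v1")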
audio/plain_02.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4222969e675d59440f5fec9dc7dd1fa83f4901a5b370f4fb116cdced83bfdc4f
+ size 453704
audio/plain_03.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58b18ff12c5ab02b3886669aebe0dfa2181006f65b8c7ae271bea60cdc0c9f19
+ size 308740
audio/plain_04.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6165df34e92ab966590fe5bd33b5afdae568ed2d26726bbaea142622805d0445
+ size 439442
audio/plain_04b.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72aad56c76fa0e1e3d0c16dd41bfbf62ecc406ed216711c16b9b2b852464b1be
+ size 5974316
audio/plain_05.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64aa9b09942a77e68820468e91af7ba00008ed520c03db90dad36292048feb31
+ size 441040
audio/secret_01.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b76b42a98c06aaeb1ac65ea7cf50063a76708d4e52466dc59684eab677857ae1
+ size 176440
audio/secret_02.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be2e88e04a8a28b8bcb848a7c5a08f50fb5835ad0b1e5436d03eac18e16ef656
+ size 453704
audio/secret_03.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0555b2ef8e1c29c24cd31eb622c146d028dba3b3bc0bab5467e3325d5811d769
+ size 308740
audio/secret_04.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd460839bcd94d5f51f2b641fa6b557561525ecdf04ab9c18e157d8bdf99b9f8
+ size 5712216
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ librosa
+ torch
+ transformers
whisper.ipynb ADDED
@@ -0,0 +1,94 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import numpy as np\n",
+     "import wave\n",
+     "\n",
+     "from librosa import resample\n",
+     "from IPython.display import Audio\n",
+     "from transformers import pipeline"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def open_wave(wav_filename):\n",
+     "    with wave.open(wav_filename, mode=\"rb\") as wav_in:\n",
+     "        if wav_in.getsampwidth() != 2:\n",
+     "            raise Exception(\"Input not 16-bit\")\n",
+     "\n",
+     "        nchannels = wav_in.getnchannels()\n",
+     "        nframes = wav_in.getnframes()\n",
+     "        nsamples = nchannels * nframes\n",
+     "        xb = wav_in.readframes(nframes)\n",
+     "        b_np = np.frombuffer(xb, dtype=np.int16) / nchannels\n",
+     "        samples = [int(sum(b_np[b0 : b0 + nchannels])) for b0 in range(0, nsamples, nchannels)]\n",
+     "\n",
+     "        return (samples, wav_in.getframerate())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "pipe = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base\", chunk_length_s=30)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def transcribe(samples, orig_sr=44100, target_sr=16000):\n",
+     "    min_s, max_s = min(samples), max(samples)\n",
+     "    samples_f = 2.0 * (np.array(samples) - min_s) / (max_s - min_s) - 1.0\n",
+     "    resamples = resample(samples_f, orig_sr=orig_sr, target_sr=target_sr)\n",
+     "    prediction = pipe(resamples.copy(), batch_size=8)\n",
+     "    return prediction[\"text\"].strip().lower()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "samples, sr = open_wave(\"./audio/plain_01.wav\")\n",
+     "display(Audio(samples, rate=sr))\n",
+     "transcribe(samples, sr)"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "gradio",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.9.18"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
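
Note: the notebook decodes 16-bit WAVs by hand with the wave module and mixes channels down to mono. As a sketch, librosa.load (librosa is already in requirements.txt) does the decode, mono mixdown, and resampling in one call, returning floats in [-1.0, 1.0]; this assumes pipe from the cell above:

from librosa import load

# sr=16000 resamples on load; mono=True mixes channels down, like open_wave does
samples_f, sr = load("./audio/plain_01.wav", sr=16000, mono=True)
prediction = pipe(samples_f, batch_size=8)
print(prediction["text"].strip().lower())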