Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +142 -193
  3. packages.txt +1 -0
  4. requirements.txt +3 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📉
4
  colorFrom: pink
5
  colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 3.41.2
8
  app_file: app.py
9
  pinned: false
10
  ---
 
4
  colorFrom: pink
5
  colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 3.38.0
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py CHANGED
@@ -1,202 +1,151 @@
1
- import os
2
- os.system("pip install git+https://github.com/openai/whisper.git")
3
  import gradio as gr
4
- import whisper
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- from share_btn import community_icon_html, loading_icon_html, share_js
7
 
8
- model = whisper.load_model("small")
 
 
9
 
 
 
10
 
11
-
12
- def inference(audio):
13
- audio = whisper.load_audio(audio)
14
- audio = whisper.pad_or_trim(audio)
 
 
 
 
 
 
 
15
 
16
- mel = whisper.log_mel_spectrogram(audio).to(model.device)
 
 
 
17
 
18
- _, probs = model.detect_language(mel)
 
 
19
 
20
- options = whisper.DecodingOptions(fp16 = False)
21
- result = whisper.decode(model, mel, options)
 
 
 
22
 
23
- print(result.text)
24
- return result.text, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
25
-
26
-
27
-
28
-
29
- css = """
30
- .gradio-container {
31
- font-family: 'IBM Plex Sans', sans-serif;
32
- }
33
- .gr-button {
34
- color: white;
35
- border-color: black;
36
- background: black;
37
- }
38
- input[type='range'] {
39
- accent-color: black;
40
- }
41
- .dark input[type='range'] {
42
- accent-color: #dfdfdf;
43
- }
44
- .container {
45
- max-width: 730px;
46
- margin: auto;
47
- padding-top: 1.5rem;
48
- }
49
-
50
- .details:hover {
51
- text-decoration: underline;
52
- }
53
- .gr-button {
54
- white-space: nowrap;
55
- }
56
- .gr-button:focus {
57
- border-color: rgb(147 197 253 / var(--tw-border-opacity));
58
- outline: none;
59
- box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
60
- --tw-border-opacity: 1;
61
- --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
62
- --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
63
- --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
64
- --tw-ring-opacity: .5;
65
- }
66
- .footer {
67
- margin-bottom: 45px;
68
- margin-top: 35px;
69
- text-align: center;
70
- border-bottom: 1px solid #e5e5e5;
71
- }
72
- .footer>p {
73
- font-size: .8rem;
74
- display: inline-block;
75
- padding: 0 10px;
76
- transform: translateY(10px);
77
- background: white;
78
- }
79
- .dark .footer {
80
- border-color: #303030;
81
- }
82
- .dark .footer>p {
83
- background: #0b0f19;
84
- }
85
- .prompt h4{
86
- margin: 1.25em 0 .25em 0;
87
- font-weight: bold;
88
- font-size: 115%;
89
- }
90
- .animate-spin {
91
- animation: spin 1s linear infinite;
92
- }
93
- @keyframes spin {
94
- from {
95
- transform: rotate(0deg);
96
- }
97
- to {
98
- transform: rotate(360deg);
99
- }
100
- }
101
- #share-btn-container {
102
- display: flex; margin-top: 1.5rem !important; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
103
- }
104
- #share-btn {
105
- all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
106
- }
107
- #share-btn * {
108
- all: unset;
109
- }
110
- """
111
-
112
- block = gr.Blocks(css=css)
113
-
114
-
115
-
116
- with block:
117
- gr.HTML(
118
- """
119
- <div style="text-align: center; max-width: 650px; margin: 0 auto;">
120
- <div
121
- style="
122
- display: inline-flex;
123
- align-items: center;
124
- gap: 0.8rem;
125
- font-size: 1.75rem;
126
- "
127
- >
128
- <svg
129
- width="0.65em"
130
- height="0.65em"
131
- viewBox="0 0 115 115"
132
- fill="none"
133
- xmlns="http://www.w3.org/2000/svg"
134
- >
135
- <rect width="23" height="23" fill="white"></rect>
136
- <rect y="69" width="23" height="23" fill="white"></rect>
137
- <rect x="23" width="23" height="23" fill="#AEAEAE"></rect>
138
- <rect x="23" y="69" width="23" height="23" fill="#AEAEAE"></rect>
139
- <rect x="46" width="23" height="23" fill="white"></rect>
140
- <rect x="46" y="69" width="23" height="23" fill="white"></rect>
141
- <rect x="69" width="23" height="23" fill="black"></rect>
142
- <rect x="69" y="69" width="23" height="23" fill="black"></rect>
143
- <rect x="92" width="23" height="23" fill="#D9D9D9"></rect>
144
- <rect x="92" y="69" width="23" height="23" fill="#AEAEAE"></rect>
145
- <rect x="115" y="46" width="23" height="23" fill="white"></rect>
146
- <rect x="115" y="115" width="23" height="23" fill="white"></rect>
147
- <rect x="115" y="69" width="23" height="23" fill="#D9D9D9"></rect>
148
- <rect x="92" y="46" width="23" height="23" fill="#AEAEAE"></rect>
149
- <rect x="92" y="115" width="23" height="23" fill="#AEAEAE"></rect>
150
- <rect x="92" y="69" width="23" height="23" fill="white"></rect>
151
- <rect x="69" y="46" width="23" height="23" fill="white"></rect>
152
- <rect x="69" y="115" width="23" height="23" fill="white"></rect>
153
- <rect x="69" y="69" width="23" height="23" fill="#D9D9D9"></rect>
154
- <rect x="46" y="46" width="23" height="23" fill="black"></rect>
155
- <rect x="46" y="115" width="23" height="23" fill="black"></rect>
156
- <rect x="46" y="69" width="23" height="23" fill="black"></rect>
157
- <rect x="23" y="46" width="23" height="23" fill="#D9D9D9"></rect>
158
- <rect x="23" y="115" width="23" height="23" fill="#AEAEAE"></rect>
159
- <rect x="23" y="69" width="23" height="23" fill="black"></rect>
160
- </svg>
161
- <h1 style="font-weight: 900; margin-bottom: 7px;">
162
- Whisper
163
- </h1>
164
- </div>
165
- <p style="margin-bottom: 10px; font-size: 94%">
166
- Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. This demo cuts audio after around 30 secs.
167
- </p>
168
- <p>You can skip the queue by using google colab for the space: <a href="https://colab.research.google.com/drive/1WJ98KHgZxFGrHiMm4TyWZllSew_Af_ff?usp=sharing"><img data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" src="https://camo.githubusercontent.com/84f0493939e0c4de4e6dbe113251b4bfb5353e57134ffd9fcab6b8714514d4d1/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667"></a></p>
169
- </div>
170
- """
171
- )
172
- with gr.Group():
173
- with gr.Box():
174
- with gr.Row().style(mobile_collapse=False, equal_height=True):
175
- audio = gr.Audio(
176
- label="Input Audio",
177
- show_label=False,
178
- source="microphone",
179
- type="filepath"
180
- )
181
-
182
- btn = gr.Button("Transcribe")
183
- text = gr.Textbox(show_label=False, elem_id="result-textarea")
184
- with gr.Group(elem_id="share-btn-container"):
185
- community_icon = gr.HTML(community_icon_html, visible=False)
186
- loading_icon = gr.HTML(loading_icon_html, visible=False)
187
- share_button = gr.Button("Share to community", elem_id="share-btn", visible=False)
188
-
189
-
190
-
191
-
192
- btn.click(inference, inputs=[audio], outputs=[text, community_icon, loading_icon, share_button])
193
- share_button.click(None, [], [], _js=share_js)
194
-
195
- gr.HTML('''
196
- <div class="footer">
197
- <p>Model by <a href="https://github.com/openai/whisper" style="text-decoration: underline;" target="_blank">OpenAI</a> - Gradio Demo by 🤗 Hugging Face
198
- </p>
199
- </div>
200
- ''')
201
-
202
- block.launch()
 
1
import html
import os
import tempfile
import time
from urllib.parse import parse_qs, urlparse

import gradio as gr
import torch
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
10
+
11
+ MODEL_NAME = "openai/whisper-large-v3"
12
+ BATCH_SIZE = 8
13
+ FILE_LIMIT_MB = 1000
14
+ YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
15
+
16
+ device = 0 if torch.cuda.is_available() else "cpu"
17
+
18
+ pipe = pipeline(
19
+ task="automatic-speech-recognition",
20
+ model=MODEL_NAME,
21
+ chunk_length_s=30,
22
+ device=device,
23
+ )
24
 
 
25
 
26
def transcribe(inputs, task):
    """Run the speech-recognition pipeline on a submitted audio input.

    Args:
        inputs: The audio value handed over by the Gradio audio component, or
            ``None`` when the user submitted nothing.
        task: Value forwarded to the pipeline as ``generate_kwargs["task"]``.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # Timestamps are requested from the pipeline, but only the text is returned.
    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return result["text"]
32
 
33
+
34
def _return_yt_html_embed(yt_url):
    """Build the HTML snippet that embeds a YouTube video player.

    The video id is taken from the ``v`` query parameter when present
    (``.../watch?v=<id>&t=30s``), otherwise from the last path segment
    (``https://youtu.be/<id>``, ``.../shorts/<id>``). If neither yields an id,
    the text after the last ``"?v="`` is used.

    Args:
        yt_url: The video URL as entered by the user.

    Returns:
        An HTML string containing a centered ``<iframe>`` whose ``src`` is the
        ``https://www.youtube.com/embed/<id>`` URL of the video.
    """
    video_id = ""
    try:
        parsed = urlparse(yt_url)
    except ValueError:
        # Malformed URL: leave video_id empty so the fallback below applies.
        parsed = None

    if parsed is not None:
        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            # Only the "v" parameter is the id; ignore "&t=", "&list=", etc.
            video_id = query_ids[0]
        else:
            segments = [segment for segment in parsed.path.split("/") if segment]
            if segments:
                video_id = segments[-1]

    if not video_id:
        video_id = yt_url.split("?v=")[-1]

    # The URL is user input and ends up inside an HTML attribute, so escape it
    # (including quotes) to prevent markup injection.
    safe_video_id = html.escape(video_id, quote=True)

    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{safe_video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str
41
+
42
def download_yt_audio(yt_url, filename):
    """Download a YouTube video to ``filename`` after enforcing the length limit.

    The video's metadata is fetched first (without downloading) so that videos
    longer than ``YT_LENGTH_LIMIT_S`` seconds are rejected before any media is
    transferred.

    Args:
        yt_url: URL of the YouTube video.
        filename: Output path handed to yt-dlp as its ``outtmpl`` option.

    Raises:
        gr.Error: If the metadata lookup or the download fails, or if the
            video is longer than ``YT_LENGTH_LIMIT_S`` seconds.
    """
    info_loader = youtube_dl.YoutubeDL()

    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    # "duration_string" is parsed as colon-separated integers with seconds
    # last ("S", "M:S" or "H:M:S"); folding in base 60 covers every form.
    file_length_s = 0
    for sub_length in info["duration_string"].split(":"):
        file_length_s = file_length_s * 60 + int(sub_length)

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            # NOTE(review): failures here are expected to arrive as
            # DownloadError as well as ExtractorError; catching both keeps the
            # user-facing gr.Error path instead of an unhandled traceback.
            raise gr.Error(str(err)) from err
72
+
73
+
74
def yt_transcribe(yt_url, task, max_filesize=75.0):
    """Download a YouTube video and transcribe its audio track.

    Args:
        yt_url: URL of the YouTube video.
        task: Value forwarded to the pipeline as ``generate_kwargs["task"]``.
        max_filesize: Not used by this function; kept in the signature for
            callers that pass it.

    Returns:
        A ``(html_embed_str, text)`` tuple: the HTML embed snippet for the
        video and the ``"text"`` entry of the pipeline result.
    """
    html_embed_str = _return_yt_html_embed(yt_url)

    # The download lives in a temporary directory, so its bytes are read into
    # memory before the directory is removed on exit from the with-block.
    with tempfile.TemporaryDirectory() as workdir:
        video_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, video_path)
        with open(video_path, "rb") as video_file:
            raw_bytes = video_file.read()

    waveform = ffmpeg_read(raw_bytes, pipe.feature_extractor.sampling_rate)
    pipeline_input = {"array": waveform, "sampling_rate": pipe.feature_extractor.sampling_rate}

    result = pipe(
        pipeline_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )

    return html_embed_str, result["text"]
89
+
90
+
91
+ demo = gr.Blocks()
92
+
93
+ mf_transcribe = gr.Interface(
94
+ fn=transcribe,
95
+ inputs=[
96
+ gr.inputs.Audio(source="microphone", type="filepath", optional=True),
97
+ gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
98
+ ],
99
+ outputs="text",
100
+ layout="horizontal",
101
+ theme="huggingface",
102
+ title="Whisper Large V3: Transcribe Audio",
103
+ description=(
104
+ "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
105
+ f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
106
+ " of arbitrary length."
107
+ ),
108
+ allow_flagging="never",
109
+ )
110
+
111
+ file_transcribe = gr.Interface(
112
+ fn=transcribe,
113
+ inputs=[
114
+ gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
115
+ gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
116
+ ],
117
+ outputs="text",
118
+ layout="horizontal",
119
+ theme="huggingface",
120
+ title="Whisper Large V3: Transcribe Audio",
121
+ description=(
122
+ "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the OpenAI Whisper"
123
+ f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
124
+ " of arbitrary length."
125
+ ),
126
+ allow_flagging="never",
127
+ )
128
+
129
+ yt_transcribe = gr.Interface(
130
+ fn=yt_transcribe,
131
+ inputs=[
132
+ gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
133
+ gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe")
134
+ ],
135
+ outputs=["html", "text"],
136
+ layout="horizontal",
137
+ theme="huggingface",
138
+ title="Whisper Large V3: Transcribe YouTube",
139
+ description=(
140
+ "Transcribe long-form YouTube videos with the click of a button! Demo uses the OpenAI Whisper checkpoint"
141
+ f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
142
+ " arbitrary length."
143
+ ),
144
+ allow_flagging="never",
145
+ )
146
+
147
+ with demo:
148
+ gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
149
+
150
+ demo.launch(enable_queue=True)
151
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt CHANGED
@@ -1 +1,3 @@
1
- transformers
 
 
 
1
+ git+https://github.com/huggingface/transformers
2
+ torch
3
+ yt-dlp