Spaces
Build error
yama committed
Commit 631631a
Parent(s): 3f207e8
Update app.py
app.py CHANGED
@@ -374,69 +374,70 @@ demo = gr.Blocks(title=title)
 demo.encrypt = False
 
 with demo:
-    gr.Markdown('''
-        <div>
-        <h1 style='text-align: center'>Whisper speaker diarization</h1>
-        This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a>, a fast inference engine for Transformer models, to recognize speech (4 times faster than the original OpenAI model with the same accuracy),
-        and the ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers
-        </div>
-    ''')
-
-    with gr.Row():
-        gr.Markdown('''
-            ### Transcribe a YouTube link using OpenAI Whisper
-            ##### 1. Using OpenAI's Whisper model to separate the audio into segments and generate transcripts.
-            ##### 2. Generating speaker embeddings for each segment.
-            ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
-        ''')
-
-    with gr.Row():
-        gr.Markdown('''
-            ### You can test with the following examples:
-        ''')
-    examples = gr.Examples(examples=
-        ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
-         "https://www.youtube.com/watch?v=-UX0X45sYe4",
-         "https://www.youtube.com/watch?v=7minSgqi-Gw"],
-        label="Examples", inputs=[youtube_url_in])
-
-    with gr.Row():
-        with gr.Column():
-            youtube_url_in.render()
-            download_youtube_btn = gr.Button("Download Youtube video")
-            download_youtube_btn.click(get_youtube, [youtube_url_in], [
-                video_in])
-            print(video_in)
-
-    with gr.Row():
-        with gr.Column():
-            video_in.render()
-        with gr.Column():
-            gr.Markdown('''
-                ##### Here you can start the transcription process.
-                ##### Please select the source language for transcription.
-                ##### You can select a range of assumed numbers of speakers.
-            ''')
-            selected_source_lang.render()
-            selected_whisper_model.render()
-            number_speakers.render()
-            transcribe_btn = gr.Button("Transcribe audio and diarization")
-            transcribe_btn.click(speech_to_text,
-                [video_in, selected_source_lang, selected_whisper_model, number_speakers],
-                [transcription_df, system_info, download_transcript]
-            )
-
-    with gr.Row():
-        gr.Markdown('''
-            ##### Here you will get the transcription output
-            ##### ''')
-
-    with gr.Row():
-        with gr.Column():
-            download_transcript.render()
-            transcription_df.render()
-            system_info.render()
-    gr.Markdown(
-        '''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></a></center>''')
+    with gr.Tab("Whisper speaker diarization"):
+        gr.Markdown('''
+            <div>
+            <h1 style='text-align: center'>Whisper speaker diarization</h1>
+            This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a>, a fast inference engine for Transformer models, to recognize speech (4 times faster than the original OpenAI model with the same accuracy),
+            and the ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers
+            </div>
+        ''')
+
+        with gr.Row():
+            gr.Markdown('''
+                ### Transcribe a YouTube link using OpenAI Whisper
+                ##### 1. Using OpenAI's Whisper model to separate the audio into segments and generate transcripts.
+                ##### 2. Generating speaker embeddings for each segment.
+                ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+            ''')
+
+        with gr.Row():
+            gr.Markdown('''
+                ### You can test with the following examples:
+            ''')
+        examples = gr.Examples(examples=
+            ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
+             "https://www.youtube.com/watch?v=-UX0X45sYe4",
+             "https://www.youtube.com/watch?v=7minSgqi-Gw"],
+            label="Examples", inputs=[youtube_url_in])
+
+        with gr.Row():
+            with gr.Column():
+                youtube_url_in.render()
+                download_youtube_btn = gr.Button("Download Youtube video")
+                download_youtube_btn.click(get_youtube, [youtube_url_in], [
+                    video_in])
+                print(video_in)
+
+        with gr.Row():
+            with gr.Column():
+                video_in.render()
+            with gr.Column():
+                gr.Markdown('''
+                    ##### Here you can start the transcription process.
+                    ##### Please select the source language for transcription.
+                    ##### You can select a range of assumed numbers of speakers.
+                ''')
+                selected_source_lang.render()
+                selected_whisper_model.render()
+                number_speakers.render()
+                transcribe_btn = gr.Button("Transcribe audio and diarization")
+                transcribe_btn.click(speech_to_text,
+                    [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+                    [transcription_df, system_info, download_transcript]
+                )
+
+        with gr.Row():
+            gr.Markdown('''
+                ##### Here you will get the transcription output
+                ##### ''')
+
+        with gr.Row():
+            with gr.Column():
+                download_transcript.render()
+                transcription_df.render()
+                system_info.render()
+        gr.Markdown(
+            '''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></a></center>''')
 
 demo.launch(debug=True)
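
What changed: everything that previously sat directly under `with demo:` is wrapped in a single `gr.Tab(...)` block and re-indented one level, which is why the hunk grows from 69 to 70 lines while the content is otherwise unchanged. A minimal sketch of the pattern (the second tab is hypothetical, included only to show what the wrapper enables):

```python
import gradio as gr

demo = gr.Blocks(title="Whisper speaker diarization")

with demo:
    # Before this commit, the rows and columns sat directly under `with demo:`.
    # Wrapping them in a gr.Tab turns the page into a tabbed layout.
    with gr.Tab("Whisper speaker diarization"):
        gr.Markdown("The existing rows and columns now live inside this tab.")
    with gr.Tab("Another tool"):  # hypothetical sibling tab
        gr.Markdown("Further tools can be added without touching the first tab.")

demo.launch()
```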
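The Markdown in the hunk also summarizes the three-step diarization pipeline behind `speech_to_text`, whose body lies outside this hunk. Below is a rough, self-contained sketch of those three steps only, assuming a mono 16 kHz WAV input and the public faster-whisper, SpeechBrain, and scikit-learn APIs; the file path, model size, and fixed speaker count are illustrative, not taken from the space's code:

```python
import numpy as np
import torch
import torchaudio
from faster_whisper import WhisperModel
from sklearn.cluster import AgglomerativeClustering
from speechbrain.pretrained import EncoderClassifier

AUDIO_PATH = "audio.wav"  # hypothetical mono 16 kHz input
NUM_SPEAKERS = 2          # plays the role of the space's number_speakers input

# 1. Whisper, served by CTranslate2 through faster-whisper, splits the audio
#    into timestamped segments and transcribes them.
whisper = WhisperModel("base", device="cpu", compute_type="int8")
segments, _info = whisper.transcribe(AUDIO_PATH)
segments = list(segments)  # transcribe() returns a lazy generator

# 2. A pretrained ECAPA-TDNN encoder from SpeechBrain embeds each segment.
encoder = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
waveform, sample_rate = torchaudio.load(AUDIO_PATH)
embeddings = []
with torch.no_grad():
    for seg in segments:
        chunk = waveform[:, int(seg.start * sample_rate):int(seg.end * sample_rate)]
        emb = encoder.encode_batch(chunk)  # shape: [1, 1, 192]
        embeddings.append(emb.squeeze().cpu().numpy())

# 3. Agglomerative clustering over the embeddings assigns a speaker label
#    to every segment.
labels = AgglomerativeClustering(n_clusters=NUM_SPEAKERS).fit_predict(np.vstack(embeddings))
for seg, speaker in zip(segments, labels):
    print(f"[{seg.start:8.2f}s - {seg.end:8.2f}s] SPEAKER {speaker}: {seg.text.strip()}")
```

Fixing `n_clusters` mirrors the space's `number_speakers` input; when the speaker count is unknown, `AgglomerativeClustering(n_clusters=None, distance_threshold=...)` is the usual alternative.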