yama committed
Commit 631631a
1 Parent(s): 3f207e8

Update app.py

Files changed (1)
  1. app.py +60 -59
app.py CHANGED
@@ -374,69 +374,70 @@ demo = gr.Blocks(title=title)
 demo.encrypt = False
 
 with demo:
-    gr.Markdown('''
-    <div>
-    <h1 style='text-align: center'>Whisper speaker diarization</h1>
-    This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a>, a fast inference engine for Transformer models, to recognize the speech (4 times faster than the original OpenAI model with the same accuracy),
-    and an ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers.
-    </div>
-    ''')
-
-    with gr.Row():
-        gr.Markdown('''
-        ### Transcribe a YouTube link using OpenAI Whisper
-        ##### 1. Using OpenAI's Whisper model to separate the audio into segments and generate transcripts.
-        ##### 2. Generating speaker embeddings for each segment.
-        ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker of each segment.
-        ''')
-
-    with gr.Row():
-        gr.Markdown('''
-        ### You can test with the following examples:
-        ''')
-    examples = gr.Examples(examples=
-        ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
-         "https://www.youtube.com/watch?v=-UX0X45sYe4",
-         "https://www.youtube.com/watch?v=7minSgqi-Gw"],
-        label="Examples", inputs=[youtube_url_in])
-
-    with gr.Row():
-        with gr.Column():
-            youtube_url_in.render()
-            download_youtube_btn = gr.Button("Download Youtube video")
-            download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
-            print(video_in)
-
-    with gr.Row():
-        with gr.Column():
-            video_in.render()
-        with gr.Column():
-            gr.Markdown('''
-            ##### Here you can start the transcription process.
-            ##### Please select the source language for transcription.
-            ##### You can select a range of assumed numbers of speakers.
-            ''')
-            selected_source_lang.render()
-            selected_whisper_model.render()
-            number_speakers.render()
-            transcribe_btn = gr.Button("Transcribe audio and diarization")
-            transcribe_btn.click(speech_to_text,
-                                 [video_in, selected_source_lang, selected_whisper_model, number_speakers],
-                                 [transcription_df, system_info, download_transcript])
-
-    with gr.Row():
-        gr.Markdown('''
-        ##### Here you will get the transcription output
-        ##### ''')
-
-    with gr.Row():
-        with gr.Column():
-            download_transcript.render()
-            transcription_df.render()
-            system_info.render()
-            gr.Markdown(
-                '''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></a></center>''')
+    with gr.Tab("Whisper speaker diarization"):
+        gr.Markdown('''
+        <div>
+        <h1 style='text-align: center'>Whisper speaker diarization</h1>
+        This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a>, a fast inference engine for Transformer models, to recognize the speech (4 times faster than the original OpenAI model with the same accuracy),
+        and an ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers.
+        </div>
+        ''')
+
+        with gr.Row():
+            gr.Markdown('''
+            ### Transcribe a YouTube link using OpenAI Whisper
+            ##### 1. Using OpenAI's Whisper model to separate the audio into segments and generate transcripts.
+            ##### 2. Generating speaker embeddings for each segment.
+            ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker of each segment.
+            ''')
+
+        with gr.Row():
+            gr.Markdown('''
+            ### You can test with the following examples:
+            ''')
+        examples = gr.Examples(examples=
+            ["https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
+             "https://www.youtube.com/watch?v=-UX0X45sYe4",
+             "https://www.youtube.com/watch?v=7minSgqi-Gw"],
+            label="Examples", inputs=[youtube_url_in])
+
+        with gr.Row():
+            with gr.Column():
+                youtube_url_in.render()
+                download_youtube_btn = gr.Button("Download Youtube video")
+                download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+                print(video_in)
+
+        with gr.Row():
+            with gr.Column():
+                video_in.render()
+            with gr.Column():
+                gr.Markdown('''
+                ##### Here you can start the transcription process.
+                ##### Please select the source language for transcription.
+                ##### You can select a range of assumed numbers of speakers.
+                ''')
+                selected_source_lang.render()
+                selected_whisper_model.render()
+                number_speakers.render()
+                transcribe_btn = gr.Button("Transcribe audio and diarization")
+                transcribe_btn.click(speech_to_text,
+                                     [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+                                     [transcription_df, system_info, download_transcript])
+
+        with gr.Row():
+            gr.Markdown('''
+            ##### Here you will get the transcription output
+            ##### ''')
+
+        with gr.Row():
+            with gr.Column():
+                download_transcript.render()
+                transcription_df.render()
+                system_info.render()
+                gr.Markdown(
+                    '''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></a></center>''')
 
 demo.launch(debug=True)
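
The only substantive change in this hunk is the new gr.Tab("Whisper speaker diarization") wrapper; every other removed/added pair differs only by one level of indentation. A minimal sketch of the resulting structure, assuming Gradio's gr.Blocks/gr.Tab context-manager API (the components below are placeholders, not the app's real ones):

```python
import gradio as gr

# Sketch of the post-commit layout: rows that previously sat directly
# under `with demo:` are now nested one level deeper, inside a Tab.
with gr.Blocks(title="Whisper speaker diarization") as demo:
    with gr.Tab("Whisper speaker diarization"):
        with gr.Row():
            url_in = gr.Textbox(label="Youtube url")  # placeholder for youtube_url_in
            download_btn = gr.Button("Download Youtube video")
    # Sibling tabs can now be added without touching the existing layout:
    # with gr.Tab("Another tool"):
    #     ...

demo.launch(debug=True)
```

On its own the single tab changes nothing visible, but it leaves room for additional tabs in later commits.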
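The app's description inside the diff summarizes the pipeline: Whisper splits the audio into transcribed segments, ECAPA-TDNN embeds each segment, and agglomerative clustering groups the embeddings by speaker. A minimal sketch of the clustering step, assuming scikit-learn; the random matrix stands in for real ECAPA-TDNN vectors, and the fixed n_clusters stands in for the app's number_speakers input:

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Stand-in for per-segment speaker embeddings, shape (n_segments, embedding_dim).
embeddings = np.random.rand(12, 192)

# One cluster per assumed speaker; each row's label becomes the
# "SPEAKER n" tag attached to the corresponding transcript segment.
labels = AgglomerativeClustering(n_clusters=2).fit_predict(embeddings)
for i, label in enumerate(labels):
    print(f"segment {i}: SPEAKER {label + 1}")
```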
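The diff also wires download_youtube_btn to a get_youtube function whose body lies outside this hunk. Purely as an illustration of what such a helper commonly looks like (an assumption, not the app's actual code), using pytube:

```python
from pytube import YouTube

def get_youtube(video_url):
    # Hypothetical stand-in for the app's helper: download the highest-
    # resolution progressive stream and return its local file path,
    # which would then populate the video_in component.
    return YouTube(video_url).streams.get_highest_resolution().download()
```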