Remove vocals from ui code
Browse files
- app.py +31 -38
- concat_transcriptions.py +1 -10
- transcribe.py +9 -21
app.py
CHANGED
@@ -358,6 +358,7 @@ def reset_frontend():
|
|
358 |
gr.Dropdown(visible=visible),
|
359 |
gr.Dropdown(visible=visible),
|
360 |
gr.Dropdown(visible=visible),
|
|
|
361 |
gr.Button(visible=visible),
|
362 |
gr.Textbox(visible=visible),
|
363 |
gr.Textbox(visible=visible),
|
@@ -401,6 +402,7 @@ def is_valid_url(url):
|
|
401 |
num_speaker = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
|
402 |
source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
|
403 |
target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
|
|
|
404 |
number_of_speakers = gr.Dropdown(visible=True, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
|
405 |
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
|
406 |
|
@@ -413,6 +415,7 @@ def is_valid_url(url):
|
|
413 |
gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
|
414 |
source_languaje,
|
415 |
target_languaje,
|
|
|
416 |
number_of_speakers,
|
417 |
subtify_button,
|
418 |
)
|
@@ -421,6 +424,7 @@ def is_valid_url(url):
|
|
421 |
gr.Image(value="assets/youtube-no-thumbnails.webp", visible=True, show_download_button=False, container=False),
|
422 |
source_languaje,
|
423 |
target_languaje,
|
|
|
424 |
number_of_speakers,
|
425 |
subtify_button,
|
426 |
)
|
@@ -432,6 +436,7 @@ def is_valid_url(url):
|
|
432 |
gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
|
433 |
source_languaje,
|
434 |
target_languaje,
|
|
|
435 |
number_of_speakers,
|
436 |
subtify_button,
|
437 |
)
|
@@ -441,12 +446,14 @@ def is_valid_url(url):
|
|
441 |
image = gr.Image(value="assets/youtube_error.webp", visible=visible, show_download_button=False, container=False)
|
442 |
source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
|
443 |
target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
|
|
|
444 |
number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
|
445 |
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
|
446 |
return (
|
447 |
image,
|
448 |
source_languaje,
|
449 |
target_languaje,
|
|
|
450 |
number_of_speakers,
|
451 |
subtify_button,
|
452 |
)
|
@@ -491,44 +498,26 @@ def slice_audio(audio_path):
|
|
491 |
command = f"python {python_file} {audio_path} {SECONDS}"
|
492 |
os.system(command)
|
493 |
|
494 |
-
with open(f"{folder_vocals}/speakers.txt", 'w') as f:
|
495 |
-
f.write(str(0))
|
496 |
-
command = f"mv {folder_chunck}/*.mp3 {folder_vocals}/"
|
497 |
-
os.system(command)
|
498 |
-
|
499 |
return (
|
500 |
gr.Textbox(value="Ok")
|
501 |
)
|
502 |
|
503 |
-
def trascribe_audio(source_languaje):
|
504 |
-
|
505 |
python_file = "transcribe.py"
|
506 |
-
|
507 |
-
|
508 |
-
command = f"python {python_file} {chunck_file} {source_languaje} {speakers_file} {DEVICE} {not SEPARE_VOCALS}"
|
509 |
os.system(command)
|
510 |
|
511 |
-
with open(
|
512 |
files = f.read().splitlines()
|
513 |
-
with open(speakers_file, 'r') as f:
|
514 |
-
speakers = f.read().splitlines()
|
515 |
-
speakers = int(speakers[0])
|
516 |
for file in files:
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
command = f"rm {vocal}"
|
524 |
-
os.system(command)
|
525 |
-
else:
|
526 |
-
vocals_extension = "mp3"
|
527 |
-
file_name, _ = file.split(".")
|
528 |
-
_, file_name = file_name.split("/")
|
529 |
-
vocal = f'{folder_vocals}/{file_name}.{vocals_extension}'
|
530 |
-
command = f"rm {vocal}"
|
531 |
-
os.system(command)
|
532 |
|
533 |
return (
|
534 |
gr.Textbox(value="Ok")
|
@@ -540,9 +529,8 @@ def concatenate_transcriptions():
|
|
540 |
os.makedirs(folder_concatenated)
|
541 |
|
542 |
chunck_file = "chunks/output_files.txt"
|
543 |
-
speakers_file = "vocals/speakers.txt"
|
544 |
python_file = "concat_transcriptions.py"
|
545 |
-
command = f"python {python_file} {chunck_file} {SECONDS}
|
546 |
os.system(command)
|
547 |
|
548 |
with open(chunck_file, 'r') as f:
|
@@ -595,14 +583,18 @@ def add_translated_subtitles_to_video(original_video_path, original_audio_path,
|
|
595 |
os.system(command)
|
596 |
command = f"rm chunks/output_files.txt"
|
597 |
os.system(command)
|
598 |
-
command = f"rm vocals/speakers.txt"
|
599 |
-
os.system(command)
|
600 |
|
601 |
subtitled_video = "videos/download_video_with_subtitles.mp4"
|
602 |
|
|
|
603 |
return (
|
604 |
-
gr.Textbox(value="Ok"),
|
605 |
gr.Video(value=subtitled_video, visible=True),
|
|
|
|
|
|
|
|
|
|
|
|
|
606 |
)
|
607 |
|
608 |
def subtify():
|
@@ -626,7 +618,7 @@ def subtify():
|
|
626 |
with gr.Row():
|
627 |
source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
|
628 |
target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
|
629 |
-
with gr.Accordion("Advanced settings", open=
|
630 |
number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True, info="Number of speakers in the video, if you don't know, select 10")
|
631 |
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
|
632 |
|
@@ -654,6 +646,7 @@ def subtify():
|
|
654 |
image,
|
655 |
source_languaje,
|
656 |
target_languaje,
|
|
|
657 |
number_of_speakers,
|
658 |
subtify_button,
|
659 |
auxiliar_block2,
|
@@ -673,7 +666,7 @@ def subtify():
|
|
673 |
auxiliar_block1.change(
|
674 |
fn=is_valid_url,
|
675 |
inputs=url_textbox,
|
676 |
-
outputs=[image, source_languaje, target_languaje, number_of_speakers, subtify_button]
|
677 |
)
|
678 |
subtify_button.click(
|
679 |
fn=change_visibility_texboxes,
|
@@ -691,7 +684,7 @@ def subtify():
|
|
691 |
)
|
692 |
video_sliced_progress_info.change(
|
693 |
fn=trascribe_audio,
|
694 |
-
inputs=[source_languaje],
|
695 |
outputs=[video_transcribed_progress_info]
|
696 |
)
|
697 |
video_transcribed_progress_info.change(
|
@@ -706,7 +699,7 @@ def subtify():
|
|
706 |
video_translated_progress_info.change(
|
707 |
fn=add_translated_subtitles_to_video,
|
708 |
inputs=[original_video_path, original_audio_path, original_audio_translated_path],
|
709 |
-
outputs=[
|
710 |
)
|
711 |
|
712 |
gr.Markdown(html_buy_me_a_coffe)
|
|
|
358 |
gr.Dropdown(visible=visible),
|
359 |
gr.Dropdown(visible=visible),
|
360 |
gr.Dropdown(visible=visible),
|
361 |
+
gr.Accordion(visible=visible),
|
362 |
gr.Button(visible=visible),
|
363 |
gr.Textbox(visible=visible),
|
364 |
gr.Textbox(visible=visible),
|
|
|
402 |
num_speaker = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
|
403 |
source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
|
404 |
target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
|
405 |
+
advanced_setings = gr.Accordion(visible=True)
|
406 |
number_of_speakers = gr.Dropdown(visible=True, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
|
407 |
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
|
408 |
|
|
|
415 |
gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
|
416 |
source_languaje,
|
417 |
target_languaje,
|
418 |
+
advanced_setings,
|
419 |
number_of_speakers,
|
420 |
subtify_button,
|
421 |
)
|
|
|
424 |
gr.Image(value="assets/youtube-no-thumbnails.webp", visible=True, show_download_button=False, container=False),
|
425 |
source_languaje,
|
426 |
target_languaje,
|
427 |
+
advanced_setings,
|
428 |
number_of_speakers,
|
429 |
subtify_button,
|
430 |
)
|
|
|
436 |
gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
|
437 |
source_languaje,
|
438 |
target_languaje,
|
439 |
+
advanced_setings,
|
440 |
number_of_speakers,
|
441 |
subtify_button,
|
442 |
)
|
|
|
446 |
image = gr.Image(value="assets/youtube_error.webp", visible=visible, show_download_button=False, container=False)
|
447 |
source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
|
448 |
target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
|
449 |
+
advanced_setings = gr.Accordion(visible=visible)
|
450 |
number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
|
451 |
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
|
452 |
return (
|
453 |
image,
|
454 |
source_languaje,
|
455 |
target_languaje,
|
456 |
+
advanced_setings,
|
457 |
number_of_speakers,
|
458 |
subtify_button,
|
459 |
)
|
|
|
498 |
command = f"python {python_file} {audio_path} {SECONDS}"
|
499 |
os.system(command)
|
500 |
|
|
|
|
|
|
|
|
|
|
|
501 |
return (
|
502 |
gr.Textbox(value="Ok")
|
503 |
)
|
504 |
|
505 |
+
def trascribe_audio(source_languaje, number_of_speakers):
|
506 |
+
folder_chunks = "chunks"
|
507 |
python_file = "transcribe.py"
|
508 |
+
chunks_file = "chunks/output_files.txt"
|
509 |
+
command = f"python {python_file} {chunks_file} {source_languaje} {number_of_speakers} {DEVICE}"
|
|
|
510 |
os.system(command)
|
511 |
|
512 |
+
with open(chunks_file, 'r') as f:
|
513 |
files = f.read().splitlines()
|
|
|
|
|
|
|
514 |
for file in files:
|
515 |
+
audios_extension = "mp3"
|
516 |
+
file_name, _ = file.split(".")
|
517 |
+
_, file_name = file_name.split("/")
|
518 |
+
vocal = f'{folder_chunks}/{file_name}.{audios_extension}'
|
519 |
+
command = f"rm {vocal}"
|
520 |
+
os.system(command)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
521 |
|
522 |
return (
|
523 |
gr.Textbox(value="Ok")
|
|
|
529 |
os.makedirs(folder_concatenated)
|
530 |
|
531 |
chunck_file = "chunks/output_files.txt"
|
|
|
532 |
python_file = "concat_transcriptions.py"
|
533 |
+
command = f"python {python_file} {chunck_file} {SECONDS}"
|
534 |
os.system(command)
|
535 |
|
536 |
with open(chunck_file, 'r') as f:
|
|
|
583 |
os.system(command)
|
584 |
command = f"rm chunks/output_files.txt"
|
585 |
os.system(command)
|
|
|
|
|
586 |
|
587 |
subtitled_video = "videos/download_video_with_subtitles.mp4"
|
588 |
|
589 |
+
visible = False
|
590 |
return (
|
|
|
591 |
gr.Video(value=subtitled_video, visible=True),
|
592 |
+
gr.Textbox(visible=visible),
|
593 |
+
gr.Textbox(visible=visible),
|
594 |
+
gr.Textbox(visible=visible),
|
595 |
+
gr.Textbox(visible=visible),
|
596 |
+
gr.Textbox(visible=visible),
|
597 |
+
gr.Textbox(value="Ok", visible=visible),
|
598 |
)
|
599 |
|
600 |
def subtify():
|
|
|
618 |
with gr.Row():
|
619 |
source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
|
620 |
target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
|
621 |
+
with gr.Accordion("Advanced settings", open=False, visible=visible) as Advanced_setings:
|
622 |
number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True, info="Number of speakers in the video, if you don't know, select 10")
|
623 |
subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
|
624 |
|
|
|
646 |
image,
|
647 |
source_languaje,
|
648 |
target_languaje,
|
649 |
+
Advanced_setings,
|
650 |
number_of_speakers,
|
651 |
subtify_button,
|
652 |
auxiliar_block2,
|
|
|
666 |
auxiliar_block1.change(
|
667 |
fn=is_valid_url,
|
668 |
inputs=url_textbox,
|
669 |
+
outputs=[image, source_languaje, target_languaje, Advanced_setings, number_of_speakers, subtify_button]
|
670 |
)
|
671 |
subtify_button.click(
|
672 |
fn=change_visibility_texboxes,
|
|
|
684 |
)
|
685 |
video_sliced_progress_info.change(
|
686 |
fn=trascribe_audio,
|
687 |
+
inputs=[source_languaje, number_of_speakers],
|
688 |
outputs=[video_transcribed_progress_info]
|
689 |
)
|
690 |
video_transcribed_progress_info.change(
|
|
|
699 |
video_translated_progress_info.change(
|
700 |
fn=add_translated_subtitles_to_video,
|
701 |
inputs=[original_video_path, original_audio_path, original_audio_translated_path],
|
702 |
+
outputs=[subtitled_video, video_donwloaded_progress_info, video_sliced_progress_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
|
703 |
)
|
704 |
|
705 |
gr.Markdown(html_buy_me_a_coffe)
|
concat_transcriptions.py
CHANGED
@@ -94,17 +94,8 @@ if __name__ == "__main__":
|
|
94 |
parser = argparse.ArgumentParser()
|
95 |
parser.add_argument("chunk_files", help="Path to the file containing the paths to the chunk files")
|
96 |
parser.add_argument("seconds", help="Duration of each chunk in seconds")
|
97 |
-
parser.add_argument('speakers_file', help='File with the number of speakers')
|
98 |
args = parser.parse_args()
|
99 |
|
100 |
chunk_files = args.chunk_files
|
101 |
seconds = int(args.seconds)
|
102 |
-
|
103 |
-
speakers = f.read().splitlines()
|
104 |
-
speakers = int(speakers[0])
|
105 |
-
|
106 |
-
if speakers > 0:
|
107 |
-
for speaker in range(speakers):
|
108 |
-
pass
|
109 |
-
else:
|
110 |
-
concatenate_transcriptions(chunk_files, seconds)
|
|
|
94 |
parser = argparse.ArgumentParser()
|
95 |
parser.add_argument("chunk_files", help="Path to the file containing the paths to the chunk files")
|
96 |
parser.add_argument("seconds", help="Duration of each chunk in seconds")
|
|
|
97 |
args = parser.parse_args()
|
98 |
|
99 |
chunk_files = args.chunk_files
|
100 |
seconds = int(args.seconds)
|
101 |
+
concatenate_transcriptions(chunk_files, seconds)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
transcribe.py
CHANGED
@@ -21,7 +21,7 @@ for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
|
|
21 |
"translator": language_code
|
22 |
}
|
23 |
|
24 |
-
def transcribe(audio_file, language,
|
25 |
output_folder = "transcriptions"
|
26 |
|
27 |
# Transcribe audio file
|
@@ -37,31 +37,25 @@ def transcribe(audio_file, language, device, vocals):
|
|
37 |
batch_size = 8
|
38 |
verbose = False
|
39 |
min_speakers = 1
|
40 |
-
max_speakers =
|
41 |
threads = 4
|
42 |
output_format = "srt"
|
43 |
hf_token = "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"
|
44 |
command = f'whisperx {audio_file} --model {model} --batch_size {batch_size} --compute_type {compute_type} \
|
45 |
--output_dir {output_folder} --output_format {output_format} --verbose {verbose} --language {language} \
|
46 |
-
--fp16 {fp16} --threads {threads} --print_progress {print_progress} --device {device}
|
47 |
-
|
48 |
-
command += f' --diarize --max_speakers {max_speakers} --min_speakers {min_speakers} --hf_token {hf_token}'
|
49 |
os.system(command)
|
50 |
|
51 |
if __name__ == "__main__":
|
52 |
parser = argparse.ArgumentParser(description='Transcribe audio files')
|
53 |
parser.add_argument('input_files', help='Input audio files')
|
54 |
parser.add_argument('language', help='Language of the audio file')
|
55 |
-
parser.add_argument('
|
56 |
parser.add_argument('device', help='Device to use for PyTorch inference')
|
57 |
-
parser.add_argument('vocals', help='Vocals or not')
|
58 |
args = parser.parse_args()
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
with open(args.speakers_file, 'r') as f:
|
63 |
-
speakers = f.read().splitlines()
|
64 |
-
speakers = int(speakers[0])
|
65 |
|
66 |
with open(args.input_files, 'r') as f:
|
67 |
inputs = f.read().splitlines()
|
@@ -70,13 +64,7 @@ if __name__ == "__main__":
|
|
70 |
for input in inputs:
|
71 |
input_file, _ = input.split('.')
|
72 |
_, input_name = input_file.split('/')
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
file = f'{vocals_folder}/{input_name}_speaker{i:003d}.{extension}'
|
77 |
-
transcribe(file, language_dict[args.language]["transcriber"], args.device, args.vocals)
|
78 |
-
else:
|
79 |
-
extension = "mp3"
|
80 |
-
file = f'{vocals_folder}/{input_name}.{extension}'
|
81 |
-
transcribe(file, language_dict[args.language]["transcriber"], args.device, args.vocals)
|
82 |
progress_bar.update(1)
|
|
|
21 |
"translator": language_code
|
22 |
}
|
23 |
|
24 |
+
def transcribe(audio_file, language, num_speakers, device):
|
25 |
output_folder = "transcriptions"
|
26 |
|
27 |
# Transcribe audio file
|
|
|
37 |
batch_size = 8
|
38 |
verbose = False
|
39 |
min_speakers = 1
|
40 |
+
max_speakers = num_speakers
|
41 |
threads = 4
|
42 |
output_format = "srt"
|
43 |
hf_token = "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"
|
44 |
command = f'whisperx {audio_file} --model {model} --batch_size {batch_size} --compute_type {compute_type} \
|
45 |
--output_dir {output_folder} --output_format {output_format} --verbose {verbose} --language {language} \
|
46 |
+
--fp16 {fp16} --threads {threads} --print_progress {print_progress} --device {device} \
|
47 |
+
--diarize --max_speakers {max_speakers} --min_speakers {min_speakers} --hf_token {hf_token}'
|
|
|
48 |
os.system(command)
|
49 |
|
50 |
if __name__ == "__main__":
|
51 |
parser = argparse.ArgumentParser(description='Transcribe audio files')
|
52 |
parser.add_argument('input_files', help='Input audio files')
|
53 |
parser.add_argument('language', help='Language of the audio file')
|
54 |
+
parser.add_argument('num_speakers', help='Number of speakers in the audio file')
|
55 |
parser.add_argument('device', help='Device to use for PyTorch inference')
|
|
|
56 |
args = parser.parse_args()
|
57 |
|
58 |
+
chunks_folder = "chunks"
|
|
|
|
|
|
|
|
|
59 |
|
60 |
with open(args.input_files, 'r') as f:
|
61 |
inputs = f.read().splitlines()
|
|
|
64 |
for input in inputs:
|
65 |
input_file, _ = input.split('.')
|
66 |
_, input_name = input_file.split('/')
|
67 |
+
extension = "mp3"
|
68 |
+
file = f'{chunks_folder}/{input_name}.{extension}'
|
69 |
+
transcribe(file, language_dict[args.language]["transcriber"], args.num_speakers, args.device)
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
progress_bar.update(1)
|