Spaces:

AlexK-PL
/

vits-v2-8khz-inference

Runtime error

App Files Files Community

vits-v2-8khz-inference / app.py

AlexK-PL

Update app.py

0860c4d verified 11 months ago

raw

history blame

6.56 kB

	import tempfile
	import subprocess
	import time

	from typing import Optional
	from AinaTheme import AinaGradioTheme
	import gradio as gr
	import numpy as np
	import torch
	import os
	from TTS.utils.synthesizer import Synthesizer

	from dotenv import load_dotenv

	torch.manual_seed(0)
	np.random.seed(0)

	import json
	from copy import deepcopy

	import numpy as np
	import torch

	import torchaudio
	import torchaudio.transforms as T

	import random

	random.seed(0)
	torch.manual_seed(0)
	np.random.seed(0)

	SAMPLE_RATE = 8000

	#############################################################################################################

	# Runs before the os.environ lookups below so values from a .env file can take part in them.
	load_dotenv()

	# Upper bound (exclusive) on the number of characters accepted for synthesis.
	MAX_INPUT_TEXT_LEN = int(os.environ.get("MAX_INPUT_TEXT_LEN", default=500))

	# Dynamically read model files, exclude 'speakers.pth'.
	# Sorted so the checkpoint picked below does not depend on os.listdir's arbitrary ordering.
	model_files = sorted(
	    f for f in os.listdir(os.getcwd()) if f.endswith('.pth') and f != 'speakers.pth'
	)
	if not model_files:
	    # Fail with an explicit message instead of a bare IndexError on model_files[0].
	    raise FileNotFoundError(
	        f"No model checkpoint (*.pth other than 'speakers.pth') found in {os.getcwd()}"
	    )

	speakers_path = "speakers.pth"
	# NOTE(review): torch.load unpickles the file; speakers.pth must come from a trusted source.
	speakers_list = list(torch.load(speakers_path).keys())

	default_speaker_list = speakers_list

	# Filtered lists based on dataset; the groups are told apart only by speaker-name length.
	# NOTE(review): a name of exactly 20 characters lands in none of the three groups - confirm intent.
	festcat_speakers = [s for s in speakers_list if len(s) == 3]
	google_speakers = [s for s in speakers_list if 3 < len(s) < 20]
	commonvoice_speakers = [s for s in speakers_list if len(s) > 20]

	DEFAULT_SPEAKER_ID = os.environ.get("DEFAULT_SPEAKER_ID", default="pau")
	# First checkpoint in sorted order; TODO: make the checkpoint selectable.
	model_file = model_files[0]

	model_path = os.path.join(os.getcwd(), model_file)
	config_path = "config.json"

	# No separate vocoder checkpoint/config is passed to the synthesizer.
	vocoder_path = None
	vocoder_config_path = None

	synthesizer = Synthesizer(
	    model_path, config_path, speakers_path, None, vocoder_path, vocoder_config_path,
	)


	def get_phonetic_transcription(text: str):
	    """Return the IPA transcription of *text* as printed by ``espeak-ng``.

	    Runs ``espeak-ng --ipa -v ca`` on the text and returns its standard
	    output with surrounding whitespace stripped.

	    Returns None (after printing the error) when the ``espeak-ng`` binary
	    cannot be found or when it exits with a non-zero status.
	    """
	    # "--" ends option parsing, so text beginning with "-" is treated as text
	    # to transcribe and never as an espeak-ng option (e.g. "-f <file>").
	    command = ['espeak-ng', '--ipa', '-v', 'ca', '--', text]
	    try:
	        result = subprocess.run(
	            command,
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	            text=True,
	            check=True,
	        )
	    except (subprocess.CalledProcessError, FileNotFoundError) as e:
	        # FileNotFoundError: espeak-ng is not installed; degrade to "no transcription"
	        # the same way a failing run does, instead of aborting the whole request.
	        print(f"An error occurred: {e}")
	        return None
	    return result.stdout.strip()


	def tts_inference(text: str, speaker_idx: str = None):
	    """Synthesize *text* with the module-level synthesizer and return a WAV file path.

	    The waveform returned by ``synthesizer.tts(text, speaker_idx)`` is written to a
	    newly created temporary ``.wav`` file. The elapsed time in seconds (rounded to
	    two decimals) is printed.

	    Raises:
	        NameError: if the module-level ``synthesizer`` is None.
	    """
	    if synthesizer is None:
	        raise NameError("model not found")

	    started_at = time.time()
	    waveform = synthesizer.tts(text, speaker_idx)

	    # delete=False: only the path is returned, so the file must outlive this function.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
	        synthesizer.save_wav(waveform, wav_file)
	        elapsed = time.time() - started_at
	        print(round(elapsed, 2))
	        wav_path = wav_file.name

	    return wav_path


	# Page heading and usage instructions; both are rendered as Markdown at the top of the UI.
	title = "🗣️ Catalan Multispeaker TTS Tester 🗣️"
	description = """
	1️⃣ Enter the text to synthesize.
	2️⃣ Select a voice from the dropdown menu.
	3️⃣ Enjoy!
	"""


	def submit_input(input_, speaker_id, use_dn=None):
	    """Synthesize the submitted text and compute its phonetic transcription.

	    Args:
	        input_: Text to synthesize; must be shorter than MAX_INPUT_TEXT_LEN characters.
	        speaker_id: Speaker name forwarded to ``tts_inference``.
	        use_dn: Unused. Kept (now optional) so existing three-argument callers keep working.

	    Returns:
	        A ``(output_audio, output_audio_den, output_phonetic)`` tuple. ``output_audio`` is
	        the path returned by ``tts_inference``; ``output_audio_den`` is always None because
	        no denoised audio is produced; ``output_phonetic`` is the result of
	        ``get_phonetic_transcription``. All three are None when the text is rejected.
	    """
	    output_audio = None
	    # Always bound, so the return below cannot hit an unbound name on the rejection path.
	    output_audio_den = None
	    output_phonetic = None
	    if input_ is not None and len(input_) < MAX_INPUT_TEXT_LEN:
	        # tts_inference takes (text, speaker_idx) and returns a single file path.
	        output_audio = tts_inference(input_, speaker_id)
	        output_phonetic = get_phonetic_transcription(input_)
	    else:
	        gr.Warning(f"Your text exceeds the {MAX_INPUT_TEXT_LEN}-character limit.")
	    return output_audio, output_audio_den, output_phonetic


	def change_interactive(text):
	    """Return a Gradio update that is interactive only when *text* is not blank.

	    *text* counts as blank when it is empty after stripping surrounding whitespace.
	    """
	    has_content = text.strip() != ""
	    return gr.update(interactive=has_content)


	def clean():
	    """Return a pair of None values, one per component to be reset."""
	    return None, None


	# Build the UI: a text/voice input panel on one side, audio and transcription outputs on the other.
	with gr.Blocks(**AinaGradioTheme().get_kwargs()) as app:
	    gr.Markdown(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
	    gr.Markdown(description)

	    with gr.Row(equal_height=False):

	        with gr.Column(variant='panel'):
	            input_ = gr.Textbox(
	                label="Text",
	                value="Introdueix el text a sintetitzar.",
	                lines=4,
	            )

	            dataset = gr.Radio(
	                ["All", "Festcat", "Google TTS", "CommonVoice"],
	                label="Speakers Dataset",
	                value="All",
	            )

	            def update_speaker_list(dataset):
	                """Return a dropdown update listing the speakers of the selected dataset.

	                Any value other than "Festcat", "Google TTS" or "CommonVoice" selects
	                the full speaker list. The first speaker of the group becomes the
	                selected value, or None when the group is empty.
	                """
	                print("Updating speaker list based on dataset:", dataset)
	                if dataset == "Festcat":
	                    current_speakers = festcat_speakers
	                elif dataset == "Google TTS":
	                    current_speakers = google_speakers
	                elif dataset == "CommonVoice":
	                    current_speakers = commonvoice_speakers
	                else:
	                    current_speakers = speakers_list

	                # Guard against an empty group: indexing [0] would raise IndexError.
	                first_speaker = current_speakers[0] if current_speakers else None
	                return gr.update(choices=current_speakers, value=first_speaker)

	            speaker_id = gr.Dropdown(
	                label="Select a voice",
	                choices=speakers_list,
	                value=DEFAULT_SPEAKER_ID,
	                interactive=True,
	            )
	            dataset.change(fn=update_speaker_list, inputs=dataset, outputs=speaker_id)

	            with gr.Row():
	                clear_btn = gr.ClearButton(value='Clean', components=[input_])
	                submit_btn = gr.Button(
	                    "Submit",
	                    variant="primary",
	                )

	        with gr.Column(variant='panel'):
	            output_audio = gr.Audio(label="Output", type="filepath", autoplay=True, show_share_button=False)
	            # Hidden placeholder: the submit handler is wired to three outputs, and this
	            # name was previously referenced without ever being defined (NameError at startup).
	            output_audio_den = gr.Audio(
	                label="Output denoised",
	                type="filepath",
	                autoplay=False,
	                show_share_button=False,
	                visible=False,
	            )

	            # interactive=False makes the box display-only; "readonly" is not a Textbox parameter.
	            output_phonetic = gr.Textbox(label="Phonetic Transcription", interactive=False)

	    # Enable the submit button only while the text box holds non-blank text.
	    input_.change(fn=change_interactive, inputs=[input_], outputs=submit_btn)

	    submit_btn.click(
	        fn=submit_input,
	        inputs=[input_, speaker_id],
	        outputs=[output_audio, output_audio_den, output_phonetic],
	    )

	# NOTE(review): concurrency_count looks like a Gradio 3.x queue option - confirm against the pinned Gradio version.
	app.queue(concurrency_count=1, api_open=False)
	app.launch(show_api=False, server_name="0.0.0.0", server_port=7860)