# --- Standard library ---
import json
import os
import random
import subprocess
import tempfile
import time
from copy import deepcopy
from typing import Optional

# --- Third-party ---
import gradio as gr
import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T
from dotenv import load_dotenv
from TTS.utils.synthesizer import Synthesizer

# --- Local ---
from AinaTheme import AinaGradioTheme

# Seed every random number generator once, after all imports.
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

SAMPLE_RATE = 8000

# Pull settings from a local .env file (if any) into os.environ.
load_dotenv()

# Upper bound on the number of characters accepted for synthesis.
MAX_INPUT_TEXT_LEN = int(os.environ.get("MAX_INPUT_TEXT_LEN", default=500))

# Checkpoints are the *.pth files of the working directory, except the
# speakers table, which shares the extension but is not a model.
model_files = [
    f for f in os.listdir(os.getcwd())
    if f.endswith('.pth') and f != 'speakers.pth'
]

speakers_path = "speakers.pth"
# NOTE(review): torch.load unpickles the file; only load a speakers table
# that comes from a trusted source.
speakers_list = list(torch.load(speakers_path).keys())
default_speaker_list = speakers_list

# Speakers whose identifier is exactly three characters long.
# NOTE(review): presumably these are the Festcat voices — confirm.
festcat_speakers = [s for s in speakers_list if len(s) == 3]

# The fallback is computed defensively: indexing an empty `model_files`
# would raise IndexError at import time even when DEFAULT_CHECKPOINT is
# provided through the environment.
DEFAULT_CHECKPOINT = os.environ.get(
    "DEFAULT_CHECKPOINT",
    default=model_files[0] if model_files else None,
)
# model_path = os.path.join(os.getcwd(), model_file) # config_path = os.path.join(os.getcwd(), "config.json") vocoder_path = None vocoder_config_path = None # synthesizer = Synthesizer( # model_path, config_path, speakers_path, None, vocoder_path, vocoder_config_path, # ) def get_phonetic_transcription(text: str): try: result = subprocess.run( ['espeak-ng', '--ipa', '-v', 'ca', text], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True ) return result.stdout.strip() except subprocess.CalledProcessError as e: print(f"An error occurred: {e}") return None def tts_inference(text: str, speaker_idx: str = None, model_file: str=None): model_path = os.path.join(os.getcwd(), model_file) speakers_file_path = "speakers.pth" config_path = "config.json" vocoder_path = None vocoder_config_path = None synthesizer = Synthesizer(model_path, config_path, speakers_path, None, vocoder_path, vocoder_config_path) # synthesize if synthesizer is None: raise NameError("model not found") t1 = time.time() wavs = synthesizer.tts(text, speaker_idx) # print(type(wavs)) wavs_den = wavs # return output with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: # wavs must be a list of integers synthesizer.save_wav(wavs_den, fp) t2 = time.time() - t1 print(round(t2, 2)) output_audio = fp.name return output_audio title = "🗣️ Catalan Multispeaker TTS Tester 🗣️" description = """ 1️⃣ Enter the text to synthesize. 2️⃣ Select a voice from the dropdown menu. 3️⃣ Enjoy! 
""" def submit_input(input_, speaker_id, model_chkpt): output_audio = None output_phonetic = None if input_ is not None and len(input_) < MAX_INPUT_TEXT_LEN: output_audio = tts_inference(input_, speaker_id, model_chkpt) output_phonetic = get_phonetic_transcription(input_) else: gr.Warning(f"Your text exceeds the {MAX_INPUT_TEXT_LEN}-character limit.") return output_audio, output_phonetic def change_interactive(text): input_state = text if input_state.strip() != "": return gr.update(interactive=True) else: return gr.update(interactive=False) def clean(): return ( None, None, ) with gr.Blocks(**AinaGradioTheme().get_kwargs()) as app: gr.Markdown(f"