"""Gradio front-end for xVASynth text-to-speech using xVAPitch voice models.

The xVASynth backend is imported as a module (``resources.app.no_server``) and
called in-process; ``run_xvaserver`` is kept for launching the HTTP server
variant as a subprocess.
"""
import json
import os
import sys
import threading
import time
from subprocess import Popen, PIPE

import gradio as gr
import requests
from huggingface_hub import HfApi

# start xVASynth service (no HTTP)
import resources.app.no_server as xvaserver

# model
hf_model_name = "Pendrokar/xvapitch_nvidia"

# Resolve the newest commit of the model repo; its SHA names the snapshot
# directory inside the Hugging Face cache.
model_repo = HfApi()
commits = model_repo.list_repo_commits(repo_id=hf_model_name)
latest_commit_sha = commits[0].commit_id
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
models_path = hf_cache_models_path

# Voice selected in the UI on start-up and loaded by run_xvaserver().
default_voice_model = "ccby_nvidia_hifi_6671_M"

# ordered from most emotional and respects pauses to ones that do the least
voice_models = [
    ("👨‍🦳 #6671", "ccby_nvidia_hifi_6671_M"),
    ("👱‍♀️ 🇬🇧 #92", "ccby_nvidia_hifi_92_F"),
    ("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
    ("Male #9017", "ccby_nvidia_hifi_9017_M"),
    ("Male #6097", "ccby_nvidia_hifi_6097_M"),
    ("👩‍🦱 #12787", "ccby_nvidia_hifi_12787_F"),
    ("👵 #11614", "ccby_nv_hifi_11614_F"),
    ("Female #8051", "ccby_nvidia_hifi_8051_F"),
    ("👩‍🦳 #11697", "ccby_nvidia_hifi_11697_F"),
    ("Female #9136", "ccby_nvidia_hifi_9136_F"),
]

# Module-level cache of the loaded voice: its name and base speaker embedding.
# Both are updated through `global` statements in load_model() / predict().
current_voice_model = None
base_speaker_emb = ''

# order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
languages = [
    ("🇺🇸 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
    ("🇮🇹 IT", "it"),
    ("🇳🇱 NL", "nl"),
    ("🇧🇷 PT", "pt"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
    ("🇩🇰 DA", "da"),
    ("🇫🇮 FI", "fi"),
    ("🇭🇺 HU", "hu"),
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
    ("🇺🇦 UA", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇮🇳 HI", "hi"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇨🇳 ZH", "zh"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("🇳🇬 YO", "yo"),
    ("Swahili", "sw"),
    ("Hausa", "ha"),
    ("Wolof", "wo"),
]

# Translated from English by DeepMind's Gemini Pro
default_text = {
    "ar": "هذا هو صوتي.",
    "da": "Sådan lyder min stemme.",
    "de": "So klingt meine Stimme.",
    "el": "Έτσι ακούγεται η φωνή μου.",
    "en": "This is what my voice sounds like.",
    "es": "Así suena mi voz.",
    "fi": "Näin ääneni kuulostaa.",
    "fr": "Voici à quoi ressemble ma voix.",
    "ha": "Wannan ne muryata ke.",
    "hi": "यह मेरी आवाज़ कैसी लगती है।",
    "hu": "Így hangzik a hangom.",
    "it": "Così suona la mia voce.",
    "jp": "これが私の声です。",
    "ko": "여기 제 목소리가 어떤지 들어보세요.",
    "la": "Haec est vox mea sonans.",
    "nl": "Dit is hoe mijn stem klinkt.",
    "pl": "Tak brzmi mój głos.",
    "pt": "É assim que minha voz soa.",
    "ro": "Așa sună vocea mea.",
    "ru": "Вот как звучит мой голос.",
    "sv": "Såhär låter min röst.",
    "sw": "Baba, yetu, yetu, uliye. Mbinguni, yetu, yetu. Amiiinaa!!",  # civ4
    "tr": "Benim sesimin sesi böyle.",
    "uk": "Ось як звучить мій голос.",
    "vi": "Đây là giọng nói của tôi.",
    "wo": "Ndox li neen xewnaal ma.",
    "yo": "Ìyí ni ohùn mi ńlá.",
    "zh": "这是我的声音。",
}


def run_xvaserver():
    """Start the HTTP xVASynth server as a subprocess and wait for it to exit.

    Sleeps 10 seconds after spawning, exits the interpreter with a non-zero
    status if the subprocess has already terminated, then probes port 8008.
    On a successful probe the default voice model is loaded and the call
    blocks until the subprocess ends; on a failed probe it returns early.
    """
    app_dir = f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/'

    # start the process without waiting for a response
    print('Running xVAServer subprocess...\n')
    # Named `server_process` so it does not shadow the imported `xvaserver` module.
    # NOTE(review): stdout/stderr are piped but never read in this function.
    server_process = Popen(
        ['python', f'{app_dir}server.py'],
        stdout=PIPE,
        stderr=PIPE,
        cwd=app_dir,
    )

    # Wait for a moment to ensure the server starts up
    time.sleep(10)

    # Check if the server is running
    if server_process.poll() is not None:
        print("Web server failed to start.")
        # A failed start is an error, so report a non-zero exit status.
        sys.exit(1)

    # contact local xVASynth server
    print('Attempting to connect to xVASynth...')
    try:
        # Bounded wait so a hung server cannot block this call forever.
        response = requests.get('http://0.0.0.0:8008', timeout=10)
        # If the response contains an HTTP error status code, raise an exception
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('Failed to connect!')
        return

    print('xVAServer running on port 8008')

    # load default model
    load_model(default_voice_model)

    # Wait for the process to exit
    server_process.wait()


def load_model(voice_model_name):
    """Load a voice model into xVASynth and return its base speaker embedding.

    Args:
        voice_model_name: File stem of the model inside ``models_path``.

    Returns:
        The ``base_speaker_emb`` value read from the model's ``.json`` file,
        or the previously cached embedding when loading fails.

    On success the module-level ``current_voice_model`` is updated so that
    ``predict`` can skip reloading a voice that is already active.
    """
    global current_voice_model

    model_path = models_path + voice_model_name
    data = {
        'outputs': None,
        'version': '3.0',
        'model': model_path,
        'modelType': 'xVAPitch',
        'base_lang': 'en',
        'pluginsContext': '{}',
    }

    # Fall back to the embedding of the previously loaded voice on failure.
    embs = base_speaker_emb

    print('Loading voice model...')
    try:
        xvaserver.loadModel(data)
        with open(model_path + '.json', 'r', encoding='utf-8') as f:
            voice_model_json = json.load(f)
        embs = voice_model_json['games'][0]['base_speaker_emb']
    except (
        requests.exceptions.RequestException,
        OSError,
        ValueError,
        KeyError,
        IndexError,
    ) as err:
        # Covers a missing/unreadable/malformed model .json as well as request errors.
        print(f'FAILED to load voice model: {err}')
    else:
        # Only mark the voice as current once model and embedding are both loaded,
        # so a failed attempt is retried on the next request.
        current_voice_model = voice_model_name

    return embs


def _render_arpabet_html(arpabet, durations):
    """Build the HTML snippet listing each ARPAbet symbol of an utterance.

    Args:
        arpabet: ``|``-separated symbol string; empty entries (PAD) are skipped.
        durations: Per-symbol durations, paired positionally with the symbols.
    """
    # NOTE(review): the markup below is reconstructed. The received source had
    # the tags stripped, leaving the `.arpabet` CSS class (declared on gr.Blocks)
    # and `arpa_length` unused. Confirm tag names and the padding rule against
    # the upstream file.
    parts = ['<h6>ARPAbet & Phoneme lengths</h6>']
    for symbol, duration in zip(arpabet.split('|'), durations):
        # skip PAD symbol
        if symbol == '':
            continue
        arpa_length = str(round(float(duration) / 2, 1))
        parts.append(
            f'<strong class="arpabet" style="padding: 0 {arpa_length}em">'
            f'{symbol}</strong> '
        )
    return ''.join(parts)


def predict(
    input_text,
    voice,
    lang,
    pacing,
    pitch,
    energy,
    anger,
    happy,
    sad,
    surprise,
    use_deepmoji
):
    """Synthesize speech for the given text and return the Gradio outputs.

    Args:
        input_text: Text to speak; only the first 1000 characters are used.
        voice: Voice model name; loaded first if it is not the current one.
        lang: Language code passed to xVASynth as ``base_lang``.
        pacing: Pace value; falsy values fall back to ``1.0``.
        pitch: Accepted for the UI wiring; not used by this function.
        energy: Accepted for the UI wiring; not used by this function.
        anger, happy, sad, surprise: Emotion values; negatives are sent as 0.
        use_deepmoji: Forwarded as ``run_model`` in the plugin settings.

    Returns:
        A list of: output wav path (``None`` on failure), ARPAbet HTML, the
        four emotion values from the response rounded to 2 decimals, and the
        raw response dict.
    """
    # Assigned below, so it must be declared global; otherwise it would be an
    # unbound local whenever the requested voice is already loaded.
    global base_speaker_emb

    # grab only the first 1000 characters
    input_text = input_text[:1000]

    # load voice model if not the current model
    if current_voice_model != voice:
        base_speaker_emb = load_model(voice)

    save_path = '/tmp/xvapitch_audio_sample.wav'

    plugins_context = {
        "mantella_settings": {
            "emAngry": (anger if anger > 0 else 0),
            "emHappy": (happy if happy > 0 else 0),
            "emSad": (sad if sad > 0 else 0),
            "emSurprise": (surprise if surprise > 0 else 0),
            "run_model": use_deepmoji,
        }
    }

    data = {
        'pluginsContext': json.dumps(plugins_context),
        'modelType': 'xVAPitch',
        # pad with whitespaces as a workaround to avoid cutoffs
        'sequence': f' {input_text} ',
        'pace': pacing or 1.0,
        'outfile': save_path,
        'vocoder': 'n/a',
        'base_lang': lang,
        'base_emb': base_speaker_emb,
        'useSR': 0,
        'useCleanup': 0,
    }

    print('Synthesizing...')
    try:
        json_data = xvaserver.synthesize(data)
    except requests.exceptions.RequestException as err:
        print(f'FAILED to synthesize: {err}')
        # No audio to show; None leaves the audio component empty.
        save_path = None
        # Same shape as a real response so the rendering code below still works:
        # 'arpabet' is a '|'-separated string and each emotion is a 1-item list.
        json_data = {
            'arpabet': 'Failed',
            'durations': [0],
            'em_angry': [anger],
            'em_happy': [happy],
            'em_sad': [sad],
            'em_surprise': [surprise],
        }

    arpabet_html = _render_arpabet_html(
        json_data['arpabet'],
        json_data['durations'],
    )

    return [
        save_path,
        arpabet_html,
        round(json_data['em_angry'][0], 2),
        round(json_data['em_happy'][0], 2),
        round(json_data['em_sad'][0], 2),
        round(json_data['em_surprise'][0], 2),
        json_data
    ]


def set_default_text(lang, deepmoji_checked):
    """Return the sample sentence for ``lang`` and an updated DeepMoji checkbox.

    The checkbox keeps its state and stays interactive for English; for any
    other language it is unchecked and disabled.
    """
    # DeepMoji only works on English Text
    is_english = lang == 'en'
    checkbox_enabled = gr.Checkbox(
        label="Use DeepMoji",
        info="Auto adjust emotional values" if is_english else "Works only with English!",
        value=deepmoji_checked if is_english else False,
        interactive=is_english
    )
    # Return the modified textbox (important for Blocks)
    return default_text[lang], checkbox_enabled


en_examples = [
    "This is what my voice sounds like.",
    "If there is anything else you need, feel free to ask.",
    "Amazing! Could you do that again?",
    "Why, I would be more than happy to help you!",
    "That was unexpected.",
    "How dare you! . You have no right.",
    "Ahh, well, you see. There is more to it.",
    "I can't believe she is gone.",
    "Stay out of my way!!!",
    # ARPAbet example
    "This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
]


def set_example_as_input(example_text):
    """Return the chosen example unchanged so it can fill the input textbox."""
    return example_text


def reset_em_sliders(
    deepmoji_enabled,
    anger,
    happy,
    sad,
    surprise
):
    """Return zeroed emotion values when DeepMoji is on, else the given ones."""
    if deepmoji_enabled:
        return (0, 0, 0, 0)
    return (anger, happy, sad, surprise)


def set_default_audio(voice_id):
    """Return the path of the ``.wav`` file stored next to the voice model."""
    return models_path + voice_id + '.wav'


def toggle_deepmoji(
    checked,
    anger,
    happy,
    sad,
    surprise
):
    """Return zeroed emotion values when the box is checked, else the given ones."""
    # Same rule as reset_em_sliders; delegate so the two cannot drift apart.
    return reset_em_sliders(checked, anger, happy, sad, surprise)


# NOTE(review): any HTML markup this description once contained is absent from
# the received source; the literal is kept exactly as received.
_DESCRIPTION = '''
Duplicate Space for a personal CPU-run one
'''

# NOTE(review): the nesting of rows and columns below is reconstructed from the
# inline comments because the received source had lost its indentation.
with gr.Blocks(css=".arpabet {display: inline-block; background-color: gray; border-radius: 5px; font-size: 120%; margin: 0.1em 0}") as demo:
    gr.Markdown("# xVASynth TTS")
    gr.HTML(label="description", value=_DESCRIPTION)

    with gr.Row():  # Main row for inputs and language selection
        with gr.Column():  # Input column
            input_textbox = gr.Textbox(
                label="Input Text",
                value="This is what my voice sounds like.",
                info="Also accepts ARPAbet symbols placed within {} brackets.",
                lines=1,
                max_lines=5,
                autofocus=True
            )
            language_radio = gr.Radio(
                languages,
                value="en",
                label="Language",
                info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
            )
            with gr.Row():
                with gr.Column():
                    en_examples_dropdown = gr.Dropdown(
                        en_examples,
                        value=en_examples[0],
                        label="Example dropdown",
                        show_label=False,
                        info="English Examples",
                        visible=(language_radio.value == 'en')
                    )
                with gr.Column():
                    pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")

        with gr.Column():  # Control column
            voice_radio = gr.Radio(
                voice_models,
                value=default_voice_model,
                label="Voice",
                info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
            )
            pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
            energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)

            with gr.Row():  # Emotion sliders, two per column
                with gr.Column():
                    anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
                    sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
                with gr.Column():
                    happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
                    surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Can oversaturate Happiness")

            deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values", value=True)

    # Event handling using click
    btn = gr.Button("Generate", variant="primary")

    with gr.Row():  # Output row
        with gr.Column():
            output_wav = gr.Audio(
                label="22kHz audio output (autoplay enabled)",
                type="filepath",
                editable=False,
                autoplay=True
            )
        with gr.Column():
            output_arpabet = gr.HTML(label="ARPAbet")

    emotion_sliders = [
        anger_slider,
        happy_slider,
        sad_slider,
        surprise_slider,
    ]

    # Hidden sink for the raw xVAServer response returned by predict().
    server_response = gr.Textbox(visible=False)

    predict_inputs = [
        input_textbox,
        voice_radio,
        language_radio,
        pacing_slider,
        pitch_slider,
        energy_slider,
        *emotion_sliders,
        deepmoji_checkbox,
    ]
    predict_outputs = [
        output_wav,
        output_arpabet,
        *emotion_sliders,
        server_response,
    ]

    # Both the button and pressing Enter in the textbox trigger synthesis.
    btn.click(fn=predict, inputs=predict_inputs, outputs=predict_outputs)
    input_textbox.submit(fn=predict, inputs=predict_inputs, outputs=predict_outputs)

    language_radio.change(
        set_default_text,
        inputs=[language_radio, deepmoji_checkbox],
        outputs=[input_textbox, deepmoji_checkbox]
    )

    en_examples_dropdown.change(
        set_example_as_input,
        inputs=[en_examples_dropdown],
        outputs=[input_textbox]
    )

    deepmoji_checkbox.change(
        toggle_deepmoji,
        inputs=[deepmoji_checkbox, *emotion_sliders],
        outputs=emotion_sliders
    )

    # Changing the text or the voice resets the sliders while DeepMoji is on.
    input_textbox.change(
        reset_em_sliders,
        inputs=[deepmoji_checkbox, *emotion_sliders],
        outputs=emotion_sliders
    )

    voice_radio.change(
        reset_em_sliders,
        inputs=[deepmoji_checkbox, *emotion_sliders],
        outputs=emotion_sliders
    )

    voice_radio.change(
        set_default_audio,
        inputs=voice_radio,
        outputs=output_wav
    )

if __name__ == "__main__":
    print('running custom Gradio interface')
    demo.launch()