import sys

sys.path.append('tacotron2/')
sys.path.append('tacotron2/waveglow')

import torch
import os
import torchaudio
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from scipy.io.wavfile import write
from text import symbols, text_to_sequence
import wave

device = "cpu"

# Load the NVIDIA Tacotron 2 architecture from Torch Hub (the weights are
# loaded separately below).
tacotron2 = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_tacotron2",
    model_math='fp32',
    pretrained=False,
)

# Load the weights and biases fine-tuned on Nepali text.
tacotron2_checkpoint_path = os.path.join(os.getcwd(), 'model_E45.ckpt')
state_dict = torch.load(tacotron2_checkpoint_path, map_location=device)
tacotron2.load_state_dict(state_dict)
tacotron2 = tacotron2.to(device)
tacotron2.eval()

# Load the NVIDIA WaveGlow vocoder from Torch Hub.
waveglow = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_waveglow",
    model_math="fp32",
    pretrained=False,
)
checkpoint = torch.hub.load_state_dict_from_url(
    "https://api.ngc.nvidia.com/v2/models/nvidia/waveglowpyt_fp32/versions/1/files/nvidia_waveglowpyt_fp32_20190306.pth",  # noqa: E501
    progress=False,
    map_location=device,
)
state_dict = {key.replace("module.", ""): value
              for key, value in checkpoint["state_dict"].items()}
waveglow.load_state_dict(state_dict)
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to(device)
waveglow.eval()

# ERR: loading WaveGlow from a local checkpoint fails ("module glow not found"):
# waveglow_pretrained_model = os.path.join(os.getcwd(), 'waveglow_256channels_ljs_v3.pt')
# waveglow = torch.load(waveglow_pretrained_model, map_location=device)['model']
# waveglow = waveglow.to(device)
# waveglow.eval()

# NVIDIA TTS utils from Torch Hub (unused):
# utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')
# sequences, lengths = utils.prepare_input_sequence([text])


def inference(text):
    # Note: with multi-line input, each iteration overwrites test.png,
    # output1.wav, and output2.wav, so only the last line's output survives.
    for i in [x for x in text.split("\n") if len(x)]:
        # Tacotron 2 attention is more stable when a line ends in punctuation.
        if i[-1] != ";":
            i = i + ";"
        with torch.no_grad():
            sequence = np.array(text_to_sequence(i, ['transliteration_cleaners']))[None, :]
            sequence = torch.from_numpy(sequence).to(device).long()
            mel_outputs, mel_outputs_postnet, _, alignments = tacotron2.inference(sequence)
            # plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
            #            alignments.float().data.cpu().numpy()[0].T))
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.8)

        # Save the mel spectrogram.
        plt.imshow(mel_outputs_postnet[0].cpu().detach())
        plt.axis('off')
        plt.savefig("test.png", bbox_inches='tight')

        # Save the audio.
        audio_numpy = audio[0].data.cpu().numpy()
        rate = 22050
        write("output1.wav", rate, audio_numpy)
        torchaudio.save("output2.wav", audio[0:1].cpu(), sample_rate=rate)

    # HiFi-GAN vocoder variant (unused; `hifigan`, `MAX_WAV_VALUE`, and
    # `concatenate_audio_wave` are not defined in this file):
    # sequence = np.array(text_to_sequence(i, ['transliteration_cleaners']))[None, :]
    # sequence = torch.autograd.Variable(torch.from_numpy(sequence)).to(device).long()
    # mel_outputs, mel_outputs_postnet, _, alignments = tacotron2.inference(sequence)
    # audio = hifigan(mel_outputs_postnet.float()).to("cpu")
    # audio = audio * MAX_WAV_VALUE
    # data = audio.squeeze().detach().cpu().numpy()
    # rate = 22050
    # scaled = np.int16(data / np.max(np.abs(data)) * 32767)
    # write('test.wav', rate, scaled)
    # concatenate_audio_wave(["output.wav", "test.wav"], "output.wav")

    # Torch Hub utils variant (unused):
    # with torch.no_grad():
    #     sequences, lengths = utils.prepare_input_sequence([text])
    #     sequences = sequences.to(device)
    #     lengths = lengths.to(device)
    #     mel, _, _ = tacotron2.infer(sequences, lengths)
    #     audio = waveglow.infer(mel)
    # # Save the mel spectrogram.
    # plt.imshow(mel[0].cpu().detach())
    # plt.axis('off')
    # plt.savefig("test.png", bbox_inches='tight')
    # # Save the audio.
    # audio_numpy = audio[0].data.cpu().numpy()
    # rate = 22050
    # write("output1.wav", rate, audio_numpy)
    # torchaudio.save("output2.wav", audio[0:1].cpu(), sample_rate=22050)

    return "output1.wav", "output2.wav", "test.png"

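
# The commented-out HiFi-GAN branch in inference() calls
# concatenate_audio_wave(), which is not defined anywhere in this file (and is
# presumably why `wave` is imported above). A minimal sketch of such a helper
# is given here; it assumes all input clips share the same sample rate, sample
# width, and channel count, as the 22050 Hz clips written by inference() do.
def concatenate_audio_wave(input_paths, output_path):
    params = None
    frames = []
    for path in input_paths:
        with wave.open(path, "rb") as wav_in:
            if params is None:
                params = wav_in.getparams()  # reuse the first clip's header
            frames.append(wav_in.readframes(wav_in.getnframes()))
    # All inputs are read into memory first, so output_path may safely be one
    # of the inputs (as in the commented-out call above).
    with wave.open(output_path, "wb") as wav_out:
        wav_out.setparams(params)
        for chunk in frames:
            wav_out.writeframes(chunk)
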
title = "TACOTRON 2"
description = (
    "Nepali Speech TACOTRON 2: the Tacotron 2 model for generating mel "
    "spectrograms from text. To use it, simply add your text or click on one "
    "of the examples to load them. Read more at the links below."
)
article = (
    "Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram "
    "Predictions (https://arxiv.org/abs/1712.05884) | Github Repo"
)
examples = [["म नेपाली टिटिएस हुँ"]]

# inference() returns two audio files and the spectrogram image, so the
# interface needs three output components to match.
gr.Interface(
    inference,
    "text",
    [
        gr.outputs.Audio(type="file", label="Audio (scipy)"),
        gr.outputs.Audio(type="file", label="Audio (torchaudio)"),
        gr.outputs.Image(type="file", label="Spectrogram"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch(enable_queue=True)