import torch
import os
import torchaudio
import gradio as gr
import matplotlib.pyplot as plt


# Every model and tensor in this script is placed on this device.
device = "cpu"

# torch.hub repository that supplies the Tacotron2 architecture and the
# text-processing helpers.
_HUB_REPO = "NVIDIA/DeepLearningExamples:torchhub"

# Checkpoint files are looked up in the directory the script is started from.
_CHECKPOINT_DIR = os.getcwd()

# --- Tacotron2 (text -> mel spectrogram) ------------------------------------
# Build the architecture without NVIDIA's pretrained weights, then restore
# the Nepali weights from the local checkpoint file.
tacotron2_checkpoint_path = os.path.join(_CHECKPOINT_DIR, "model_E45.ckpt")
tacotron2 = torch.hub.load(
    _HUB_REPO,
    "nvidia_tacotron2",
    model_math="fp32",
    pretrained=False,
)
state_dict = torch.load(tacotron2_checkpoint_path, map_location=device)
tacotron2.load_state_dict(state_dict)
tacotron2 = tacotron2.to(device).eval()

# --- WaveGlow (mel spectrogram -> waveform) ---------------------------------
# WaveGlow is restored from a local checkpoint instead of the torch.hub
# "nvidia_waveglow" entry point. The checkpoint keeps the model object under
# its "model" key.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
waveglow_pretrained_model = os.path.join(
    _CHECKPOINT_DIR, "waveglow_256channels_ljs_v3.pt"
)
waveglow = torch.load(waveglow_pretrained_model, map_location=device)["model"]
waveglow = waveglow.to(device)
waveglow.eval()

# --- Text utilities ---------------------------------------------------------
# Helper object whose prepare_input_sequence() encodes text for Tacotron2.
utils = torch.hub.load(_HUB_REPO, "nvidia_tts_utils")

def inference(text):
    """Synthesize speech for *text* and save the audio and mel spectrogram.

    The text is encoded with the hub ``utils`` helper, Tacotron2 predicts a
    mel spectrogram from it and WaveGlow turns that spectrogram into a
    waveform. Both results are written to fixed file names in the current
    working directory, so every call overwrites the files of the previous one.

    Args:
        text: Sentence to synthesize.

    Returns:
        A ``(audio_path, spectrogram_path)`` tuple: one value for the audio
        output and one for the image output of the Gradio interface.

    Raises:
        ValueError: If ``text`` is empty.
    """
    if not text:
        # Fail early with a readable message instead of an error raised from
        # deep inside the model.
        raise ValueError("text must not be empty")

    with torch.no_grad():
        # NOTE(review): some versions of the hub helper take a ``cpu_run``
        # flag and otherwise place the tensors on CUDA — confirm against the
        # installed version. The tensors are moved to ``device`` right below.
        sequences, lengths = utils.prepare_input_sequence([text])
        sequences = sequences.to(device)
        lengths = lengths.to(device)
        mel, _, _ = tacotron2.infer(sequences, lengths)
        audio = waveglow.infer(mel)

    # Draw the spectrogram on its own figure and always close it, so that
    # repeated requests neither pile up open figures nor draw on top of the
    # image left behind by an earlier call.
    fig, ax = plt.subplots()
    try:
        ax.imshow(mel[0].detach().cpu().numpy())
        ax.axis("off")
        fig.savefig("test.png", bbox_inches="tight")
    finally:
        plt.close(fig)

    # Save the first waveform of the batch; the 0:1 slice keeps a leading
    # dimension of size one in the tensor handed to torchaudio.
    torchaudio.save("output2.wav", audio[0:1].cpu(), sample_rate=22050)

    return "output2.wav", "test.png"
  
# Texts shown on the demo page.
title = "TACOTRON 2"
description = (
    "Nepali Speech TACOTRON 2: The Tacotron 2 model for generating mel spectrograms from text. "
    "To use it, simply add your text or click on one of the examples to load them. "
    "Read more at the links below."
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://arxiv.org/abs/1712.05884' target='_blank'>Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions</a>"
    " | "
    "<a href='https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2' target='_blank'>Github Repo</a>"
    "</p>"
)
# Sample input offered on the page (a Nepali sentence).
examples = [["म नेपाली टिटिएस हुँ"]]

# Build the web UI around inference(): a text box as input, an audio player
# and a spectrogram image as outputs. launch() starts the server when this
# module is executed.
gr.Interface(
    fn=inference,
    inputs="text",
    outputs=[
        gr.outputs.Audio(type="file", label="Audio"),
        gr.outputs.Image(type="file", label="Spectrogram"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch(enable_queue=True)