jadechoghari committed on
Commit
1752041
1 Parent(s): 893807d

Create tools.py

Files changed (1): tools.py (+566, -0)
tools.py ADDED
# Author: Haohe Liu
# Email: haoheliu@gmail.com
# Date: 11 Feb 2023

import os
import json
import hashlib
import shutil

import torch
import torch.nn.functional as F
import numpy as np

import matplotlib

matplotlib.use("Agg")  # select the non-interactive backend before pyplot is imported

from matplotlib import pyplot as plt
from scipy.io import wavfile

import requests
from tqdm import tqdm

URL_MAP = {
    "vggishish_lpaps": "https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/specvqgan_public/vggishish16.pt",
    "vggishish_mean_std_melspec_10s_22050hz": "https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/specvqgan_public/train_means_stds_melspec_10s_22050hz.txt",
    "melception": "https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/specvqgan_public/melception-21-05-10T09-28-40.pt",
}

CKPT_MAP = {
    "vggishish_lpaps": "vggishish16.pt",
    "vggishish_mean_std_melspec_10s_22050hz": "train_means_stds_melspec_10s_22050hz.txt",
    "melception": "melception-21-05-10T09-28-40.pt",
}

MD5_MAP = {
    "vggishish_lpaps": "197040c524a07ccacf7715d7080a80bd",
    "vggishish_mean_std_melspec_10s_22050hz": "f449c6fd0e248936c16f6d22492bb625",
    "melception": "a71a41041e945b457c7d3d814bbcf72d",
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def read_list(fname):
    result = []
    with open(fname, "r") as f:
        for each in f.readlines():
            each = each.strip("\n")
            result.append(each)
    return result


def build_dataset_json_from_list(list_path):
    data = []
    for each in read_list(list_path):
        if "|" in each:
            # split on the first "|" only, so captions may contain the character
            wav, caption = each.split("|", 1)
        else:
            caption = each
            wav = ""
        data.append(
            {
                "wav": wav,
                "caption": caption,
            }
        )
    return {"data": data}
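

# --- Illustrative usage (editor's addition, not part of the original file) ---
# A minimal sketch of the list format build_dataset_json_from_list() expects:
# one "wav_path|caption" pair per line, or a bare caption. The file name and
# its contents below are hypothetical.
def _demo_build_dataset_json(tmp_list="demo_list.txt"):
    with open(tmp_list, "w") as f:
        f.write("audio/dog_bark.wav|A dog barks twice\n")
        f.write("A car passes by in the rain\n")
    dataset = build_dataset_json_from_list(tmp_list)
    # -> {"data": [{"wav": "audio/dog_bark.wav", "caption": "A dog barks twice"},
    #              {"wav": "", "caption": "A car passes by in the rain"}]}
    return dataset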


def load_json(fname):
    with open(fname, "r") as f:
        data = json.load(f)
    return data


def read_json(dataset_json_file):
    with open(dataset_json_file, "r") as fp:
        data_json = json.load(fp)
    return data_json["data"]


def copy_test_subset_data(metadata, testset_copy_target_path):
    # metadata = read_json(testset_metadata)
    os.makedirs(testset_copy_target_path, exist_ok=True)
    if len(os.listdir(testset_copy_target_path)) == len(metadata):
        # target already holds one file per metadata entry; assume it is complete
        return
    else:
        # delete files in folder testset_copy_target_path
        for file in os.listdir(testset_copy_target_path):
            try:
                os.remove(os.path.join(testset_copy_target_path, file))
            except Exception as e:
                print(e)

        print("Copying test subset data to {}".format(testset_copy_target_path))
        for each in tqdm(metadata):
            # shutil.copy is safer than shelling out to `cp` (handles spaces in paths)
            shutil.copy(each["wav"], testset_copy_target_path)


def listdir_nohidden(path):
    for f in os.listdir(path):
        if not f.startswith("."):
            yield f


def get_restore_step(path):
    checkpoints = os.listdir(path)
    if os.path.exists(os.path.join(path, "final.ckpt")):
        return "final.ckpt", 0
    elif not os.path.exists(os.path.join(path, "last.ckpt")):
        steps = [int(x.split(".ckpt")[0].split("step=")[1]) for x in checkpoints]
        return checkpoints[np.argmax(steps)], np.max(steps)
    else:
        # pick the newest "last" checkpoint: the highest last-vN.ckpt if
        # versioned copies exist, otherwise plain last.ckpt (the original
        # compared each version against a list it had already been appended
        # to, so the versioned branch could never be selected)
        versions = []
        for x in checkpoints:
            if "last" in x and "-v" in x:
                versions.append(int(x.split(".ckpt")[0].split("-v")[1]))
        if versions:
            fname = "last-v%s.ckpt" % max(versions)
        else:
            fname = "last.ckpt"
        return fname, 0
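

# --- Illustrative usage (editor's addition) --- a small self-contained check
# of the checkpoint-selection logic above, using a temporary directory with
# hypothetical Lightning-style file names.
def _demo_get_restore_step():
    import tempfile

    with tempfile.TemporaryDirectory() as d:
        for name in ["last.ckpt", "last-v1.ckpt", "last-v2.ckpt"]:
            open(os.path.join(d, name), "w").close()
        print(get_restore_step(d))  # -> ("last-v2.ckpt", 0)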


def download(url, local_path, chunk_size=1024):
    os.makedirs(os.path.split(local_path)[0], exist_ok=True)
    with requests.get(url, stream=True) as r:
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
            with open(local_path, "wb") as f:
                for data in r.iter_content(chunk_size=chunk_size):
                    if data:
                        f.write(data)
                        # the final chunk may be shorter than chunk_size
                        pbar.update(len(data))


def md5_hash(path):
    with open(path, "rb") as f:
        content = f.read()
    return hashlib.md5(content).hexdigest()


def get_ckpt_path(name, root, check=False):
    assert name in URL_MAP
    path = os.path.join(root, CKPT_MAP[name])
    if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
        download(URL_MAP[name], path)
        md5 = md5_hash(path)
        assert md5 == MD5_MAP[name], md5
    return path
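

# --- Illustrative usage (editor's addition) --- fetch a checkpoint listed in
# URL_MAP into a local cache directory, verifying its MD5 on the way in. The
# "checkpoints" directory name is an assumption for the example.
def _demo_get_ckpt_path():
    ckpt = get_ckpt_path("vggishish_lpaps", root="checkpoints", check=True)
    print(ckpt)  # -> checkpoints/vggishish16.pt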


class KeyNotFoundError(Exception):
    def __init__(self, cause, keys=None, visited=None):
        self.cause = cause
        self.keys = keys
        self.visited = visited
        messages = list()
        if keys is not None:
            messages.append("Key not found: {}".format(keys))
        if visited is not None:
            messages.append("Visited: {}".format(visited))
        messages.append("Cause:\n{}".format(cause))
        message = "\n".join(messages)
        super().__init__(message)


def retrieve(
    list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False
):
    """Given a nested list or dict return the desired value at key expanding
    callable nodes if necessary and :attr:`expand` is ``True``. The expansion
    is done in-place.

    Parameters
    ----------
    list_or_dict : list or dict
        Possibly nested list or dictionary.
    key : str
        key/to/value, path-like string describing all keys necessary to
        consider to get to the desired value. List indices can also be
        passed here.
    splitval : str
        String that defines the delimiter between keys of the
        different depth levels in `key`.
    default : obj
        Value returned if :attr:`key` is not found.
    expand : bool
        Whether to expand callable nodes on the path or not.

    Returns
    -------
    The desired value or, if :attr:`default` is not ``None`` and
    :attr:`key` is not found, ``default``.

    Raises
    ------
    Exception if ``key`` is not in ``list_or_dict`` and :attr:`default` is
    ``None``.
    """

    keys = key.split(splitval)

    success = True
    try:
        visited = []
        parent = None
        last_key = None
        for key in keys:
            if callable(list_or_dict):
                if not expand:
                    raise KeyNotFoundError(
                        ValueError(
                            "Trying to get past callable node with expand=False."
                        ),
                        keys=keys,
                        visited=visited,
                    )
                list_or_dict = list_or_dict()
                parent[last_key] = list_or_dict

            last_key = key
            parent = list_or_dict

            try:
                if isinstance(list_or_dict, dict):
                    list_or_dict = list_or_dict[key]
                else:
                    list_or_dict = list_or_dict[int(key)]
            except (KeyError, IndexError, ValueError) as e:
                raise KeyNotFoundError(e, keys=keys, visited=visited)

            visited += [key]
        # final expansion of retrieved value
        if expand and callable(list_or_dict):
            list_or_dict = list_or_dict()
            parent[last_key] = list_or_dict
    except KeyNotFoundError as e:
        if default is None:
            raise e
        else:
            list_or_dict = default
            success = False

    if not pass_success:
        return list_or_dict
    else:
        return list_or_dict, success
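

# --- Illustrative usage (editor's addition) --- retrieve() walks a nested
# structure with a "/"-delimited path, treating numeric segments as list
# indices and expanding callable nodes in place.
def _demo_retrieve():
    config = {"model": {"params": [{"lr": 1e-4}], "name": lambda: "unet"}}
    print(retrieve(config, "model/params/0/lr"))  # -> 0.0001
    print(retrieve(config, "model/name"))  # -> "unet" (callable expanded in place)
    print(retrieve(config, "model/missing", default=42))  # -> 42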


def to_device(data, device):
    if len(data) == 12:
        (
            ids,
            raw_texts,
            speakers,
            texts,
            src_lens,
            max_src_len,
            mels,
            mel_lens,
            max_mel_len,
            pitches,
            energies,
            durations,
        ) = data

        speakers = torch.from_numpy(speakers).long().to(device)
        texts = torch.from_numpy(texts).long().to(device)
        src_lens = torch.from_numpy(src_lens).to(device)
        mels = torch.from_numpy(mels).float().to(device)
        mel_lens = torch.from_numpy(mel_lens).to(device)
        pitches = torch.from_numpy(pitches).float().to(device)
        energies = torch.from_numpy(energies).to(device)
        durations = torch.from_numpy(durations).long().to(device)

        return (
            ids,
            raw_texts,
            speakers,
            texts,
            src_lens,
            max_src_len,
            mels,
            mel_lens,
            max_mel_len,
            pitches,
            energies,
            durations,
        )

    if len(data) == 6:
        (ids, raw_texts, speakers, texts, src_lens, max_src_len) = data

        speakers = torch.from_numpy(speakers).long().to(device)
        texts = torch.from_numpy(texts).long().to(device)
        src_lens = torch.from_numpy(src_lens).to(device)

        return (ids, raw_texts, speakers, texts, src_lens, max_src_len)
304
+
305
+
306
+ def log(logger, step=None, fig=None, audio=None, sampling_rate=22050, tag=""):
307
+ # if losses is not None:
308
+ # logger.add_scalar("Loss/total_loss", losses[0], step)
309
+ # logger.add_scalar("Loss/mel_loss", losses[1], step)
310
+ # logger.add_scalar("Loss/mel_postnet_loss", losses[2], step)
311
+ # logger.add_scalar("Loss/pitch_loss", losses[3], step)
312
+ # logger.add_scalar("Loss/energy_loss", losses[4], step)
313
+ # logger.add_scalar("Loss/duration_loss", losses[5], step)
314
+ # if(len(losses) > 6):
315
+ # logger.add_scalar("Loss/disc_loss", losses[6], step)
316
+ # logger.add_scalar("Loss/fmap_loss", losses[7], step)
317
+ # logger.add_scalar("Loss/r_loss", losses[8], step)
318
+ # logger.add_scalar("Loss/g_loss", losses[9], step)
319
+ # logger.add_scalar("Loss/gen_loss", losses[10], step)
320
+ # logger.add_scalar("Loss/diff_loss", losses[11], step)
321
+
322
+ if fig is not None:
323
+ logger.add_figure(tag, fig)
324
+
325
+ if audio is not None:
326
+ audio = audio / (max(abs(audio)) * 1.1)
327
+ logger.add_audio(
328
+ tag,
329
+ audio,
330
+ sample_rate=sampling_rate,
331
+ )
332
+
333
+
334
+ def get_mask_from_lengths(lengths, max_len=None):
335
+ batch_size = lengths.shape[0]
336
+ if max_len is None:
337
+ max_len = torch.max(lengths).item()
338
+
339
+ ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(device)
340
+ mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
341
+
342
+ return mask
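

# --- Illustrative usage (editor's addition) --- positions at or beyond each
# sequence length come out True, i.e. True marks padding frames.
def _demo_get_mask_from_lengths():
    lengths = torch.tensor([2, 4]).to(device)
    print(get_mask_from_lengths(lengths))
    # tensor([[False, False,  True,  True],
    #         [False, False, False, False]])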
343
+
344
+
345
+ def expand(values, durations):
346
+ out = list()
347
+ for value, d in zip(values, durations):
348
+ out += [value] * max(0, int(d))
349
+ return np.array(out)
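

# --- Illustrative usage (editor's addition) --- repeat each phoneme-level
# value by its duration in frames to obtain a frame-level sequence.
def _demo_expand():
    pitch = np.array([200.0, 180.0, 220.0])
    durations = np.array([2, 0, 3])
    print(expand(pitch, durations))  # -> [200. 200. 220. 220. 220.]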
350
+
351
+
352
+ def synth_one_sample_val(
353
+ targets, predictions, vocoder, model_config, preprocess_config
354
+ ):
355
+ index = np.random.choice(list(np.arange(targets[6].size(0))))
356
+
357
+ basename = targets[0][index]
358
+ src_len = predictions[8][index].item()
359
+ mel_len = predictions[9][index].item()
360
+ mel_target = targets[6][index, :mel_len].detach().transpose(0, 1)
361
+
362
+ mel_prediction = predictions[0][index, :mel_len].detach().transpose(0, 1)
363
+ postnet_mel_prediction = predictions[1][index, :mel_len].detach().transpose(0, 1)
364
+ duration = targets[11][index, :src_len].detach().cpu().numpy()
365
+
366
+ if preprocess_config["preprocessing"]["pitch"]["feature"] == "phoneme_level":
367
+ pitch = predictions[2][index, :src_len].detach().cpu().numpy()
368
+ pitch = expand(pitch, duration)
369
+ else:
370
+ pitch = predictions[2][index, :mel_len].detach().cpu().numpy()
371
+
372
+ if preprocess_config["preprocessing"]["energy"]["feature"] == "phoneme_level":
373
+ energy = predictions[3][index, :src_len].detach().cpu().numpy()
374
+ energy = expand(energy, duration)
375
+ else:
376
+ energy = predictions[3][index, :mel_len].detach().cpu().numpy()
377
+
378
+ with open(
379
+ os.path.join(preprocess_config["path"]["preprocessed_path"], "stats.json")
380
+ ) as f:
381
+ stats = json.load(f)
382
+ stats = stats["pitch"] + stats["energy"][:2]
383
+
384
+ # from datetime import datetime
385
+ # now = datetime.now()
386
+ # current_time = now.strftime("%D:%H:%M:%S")
387
+ # np.save(("mel_pred_%s.npy" % current_time).replace("/","-"), mel_prediction.cpu().numpy())
388
+ # np.save(("postnet_mel_prediction_%s.npy" % current_time).replace("/","-"), postnet_mel_prediction.cpu().numpy())
389
+ # np.save(("mel_target_%s.npy" % current_time).replace("/","-"), mel_target.cpu().numpy())
390
+
391
+ fig = plot_mel(
392
+ [
393
+ (mel_prediction.cpu().numpy(), pitch, energy),
394
+ (postnet_mel_prediction.cpu().numpy(), pitch, energy),
395
+ (mel_target.cpu().numpy(), pitch, energy),
396
+ ],
397
+ stats,
398
+ [
399
+ "Raw mel spectrogram prediction",
400
+ "Postnet mel prediction",
401
+ "Ground-Truth Spectrogram",
402
+ ],
403
+ )
404
+
405
+ if vocoder is not None:
406
+ from .model_util import vocoder_infer
407
+
408
+ wav_reconstruction = vocoder_infer(
409
+ mel_target.unsqueeze(0),
410
+ vocoder,
411
+ model_config,
412
+ preprocess_config,
413
+ )[0]
414
+ wav_prediction = vocoder_infer(
415
+ postnet_mel_prediction.unsqueeze(0),
416
+ vocoder,
417
+ model_config,
418
+ preprocess_config,
419
+ )[0]
420
+ else:
421
+ wav_reconstruction = wav_prediction = None
422
+
423
+ return fig, wav_reconstruction, wav_prediction, basename
424
+
425
+
426
+ def synth_one_sample(mel_input, mel_prediction, labels, vocoder):
427
+ if vocoder is not None:
428
+ from .model_util import vocoder_infer
429
+
430
+ wav_reconstruction = vocoder_infer(
431
+ mel_input.permute(0, 2, 1),
432
+ vocoder,
433
+ )
434
+ wav_prediction = vocoder_infer(
435
+ mel_prediction.permute(0, 2, 1),
436
+ vocoder,
437
+ )
438
+ else:
439
+ wav_reconstruction = wav_prediction = None
440
+
441
+ return wav_reconstruction, wav_prediction
442
+
443
+
444
+ def synth_samples(targets, predictions, vocoder, model_config, preprocess_config, path):
445
+ # (diff_output, diff_loss, latent_loss) = diffusion
446
+
447
+ basenames = targets[0]
448
+
449
+ for i in range(len(predictions[1])):
450
+ basename = basenames[i]
451
+ src_len = predictions[8][i].item()
452
+ mel_len = predictions[9][i].item()
453
+ mel_prediction = predictions[1][i, :mel_len].detach().transpose(0, 1)
454
+ # diff_output = diff_output[i, :mel_len].detach().transpose(0, 1)
455
+ # duration = predictions[5][i, :src_len].detach().cpu().numpy()
456
+ if preprocess_config["preprocessing"]["pitch"]["feature"] == "phoneme_level":
457
+ pitch = predictions[2][i, :src_len].detach().cpu().numpy()
458
+ # pitch = expand(pitch, duration)
459
+ else:
460
+ pitch = predictions[2][i, :mel_len].detach().cpu().numpy()
461
+ if preprocess_config["preprocessing"]["energy"]["feature"] == "phoneme_level":
462
+ energy = predictions[3][i, :src_len].detach().cpu().numpy()
463
+ # energy = expand(energy, duration)
464
+ else:
465
+ energy = predictions[3][i, :mel_len].detach().cpu().numpy()
466
+ # import ipdb; ipdb.set_trace()
467
+ with open(
468
+ os.path.join(preprocess_config["path"]["preprocessed_path"], "stats.json")
469
+ ) as f:
470
+ stats = json.load(f)
471
+ stats = stats["pitch"] + stats["energy"][:2]
472
+
473
+ fig = plot_mel(
474
+ [
475
+ (mel_prediction.cpu().numpy(), pitch, energy),
476
+ ],
477
+ stats,
478
+ ["Synthetized Spectrogram by PostNet"],
479
+ )
480
+ # np.save("{}_postnet.npy".format(basename), mel_prediction.cpu().numpy())
481
+ plt.savefig(os.path.join(path, "{}_postnet_2.png".format(basename)))
482
+ plt.close()
483
+
484
+ from .model_util import vocoder_infer
485
+
486
+ mel_predictions = predictions[1].transpose(1, 2)
487
+ lengths = predictions[9] * preprocess_config["preprocessing"]["stft"]["hop_length"]
488
+ wav_predictions = vocoder_infer(
489
+ mel_predictions, vocoder, model_config, preprocess_config, lengths=lengths
490
+ )
491
+
492
+ sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"]
493
+ for wav, basename in zip(wav_predictions, basenames):
494
+ wavfile.write(os.path.join(path, "{}.wav".format(basename)), sampling_rate, wav)
495
+
496
+
497
+ def plot_mel(data, titles=None):
498
+ fig, axes = plt.subplots(len(data), 1, squeeze=False)
499
+ if titles is None:
500
+ titles = [None for i in range(len(data))]
501
+
502
+ for i in range(len(data)):
503
+ mel = data[i]
504
+ axes[i][0].imshow(mel, origin="lower", aspect="auto")
505
+ axes[i][0].set_aspect(2.5, adjustable="box")
506
+ axes[i][0].set_ylim(0, mel.shape[0])
507
+ axes[i][0].set_title(titles[i], fontsize="medium")
508
+ axes[i][0].tick_params(labelsize="x-small", left=False, labelleft=False)
509
+ axes[i][0].set_anchor("W")
510
+
511
+ return fig
512
+
513
+
514
+ def pad_1D(inputs, PAD=0):
515
+ def pad_data(x, length, PAD):
516
+ x_padded = np.pad(
517
+ x, (0, length - x.shape[0]), mode="constant", constant_values=PAD
518
+ )
519
+ return x_padded
520
+
521
+ max_len = max((len(x) for x in inputs))
522
+ padded = np.stack([pad_data(x, max_len, PAD) for x in inputs])
523
+
524
+ return padded
525
+
526
+
527
+ def pad_2D(inputs, maxlen=None):
528
+ def pad(x, max_len):
529
+ PAD = 0
530
+ if np.shape(x)[0] > max_len:
531
+ raise ValueError("not max_len")
532
+
533
+ s = np.shape(x)[1]
534
+ x_padded = np.pad(
535
+ x, (0, max_len - np.shape(x)[0]), mode="constant", constant_values=PAD
536
+ )
537
+ return x_padded[:, :s]
538
+
539
+ if maxlen:
540
+ output = np.stack([pad(x, maxlen) for x in inputs])
541
+ else:
542
+ max_len = max(np.shape(x)[0] for x in inputs)
543
+ output = np.stack([pad(x, max_len) for x in inputs])
544
+
545
+ return output
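

# --- Illustrative usage (editor's addition) --- pad_1D right-pads 1-D arrays
# to a common length; pad_2D right-pads 2-D arrays along the time axis.
def _demo_padding():
    print(pad_1D([np.array([1, 2, 3]), np.array([4])]))
    # [[1 2 3]
    #  [4 0 0]]
    batch = pad_2D([np.ones((2, 4)), np.ones((3, 4))])
    print(batch.shape)  # -> (2, 3, 4)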
546
+
547
+
548
+ def pad(input_ele, mel_max_length=None):
549
+ if mel_max_length:
550
+ max_len = mel_max_length
551
+ else:
552
+ max_len = max([input_ele[i].size(0) for i in range(len(input_ele))])
553
+
554
+ out_list = list()
555
+ for i, batch in enumerate(input_ele):
556
+ if len(batch.shape) == 1:
557
+ one_batch_padded = F.pad(
558
+ batch, (0, max_len - batch.size(0)), "constant", 0.0
559
+ )
560
+ elif len(batch.shape) == 2:
561
+ one_batch_padded = F.pad(
562
+ batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0
563
+ )
564
+ out_list.append(one_batch_padded)
565
+ out_padded = torch.stack(out_list)
566
+ return out_padded
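

# --- Illustrative usage (editor's addition) --- the torch counterpart of the
# numpy helpers above: stack variable-length tensors into one padded batch.
def _demo_pad():
    a = torch.ones(2, 80)  # e.g. a 2-frame mel with 80 bins
    b = torch.ones(5, 80)  # a 5-frame mel
    batch = pad([a, b])
    print(batch.shape)  # -> torch.Size([2, 5, 80])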