|
import librosa |
|
import matplotlib |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import torch |
|
|
|
matplotlib.use("Agg") |
|
|
|
|
|
def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None, output_fig=False):
    """Render an attention alignment matrix as a heat map.

    Args:
        alignment (torch.Tensor or np.array): Alignment values. Tensors are
            detached, moved to the CPU, converted to numpy and squeezed;
            anything else is used as given.
        info (str, optional): Extra text appended below the x-axis label.
        fig_size (tuple): Figure size handed to ``plt.subplots``.
        title (str, optional): Title placed above the plot.
        output_fig (bool): When False the figure is closed before it is returned.

    Returns:
        The created matplotlib figure.
    """
    if isinstance(alignment, torch.Tensor):
        attention = alignment.detach().cpu().numpy().squeeze()
    else:
        attention = alignment
    # Half-precision data is promoted to float32 before it is drawn.
    if attention.dtype == np.float16:
        attention = attention.astype(np.float32)

    fig, ax = plt.subplots(figsize=fig_size)
    # The matrix is transposed, so its first axis runs along x.
    heat_map = ax.imshow(attention.T, aspect="auto", origin="lower", interpolation="none")
    fig.colorbar(heat_map, ax=ax)

    x_caption = "Decoder timestep" + ("\n\n" + info if info is not None else "")
    ax.set_xlabel(x_caption)
    ax.set_ylabel("Encoder timestep")

    fig.tight_layout()
    if title is not None:
        ax.set_title(title)
    if not output_fig:
        plt.close(fig)
    return fig
|
|
|
|
|
def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
    """Render a spectrogram as an image with a colorbar.

    Args:
        spectrogram (torch.Tensor or np.array): Spectrogram values. Tensors are
            detached, moved to the CPU, converted to numpy and squeezed. The
            data is transposed before it is drawn.
        ap (optional): Object exposing ``denormalize``; when given, the
            transposed spectrogram is passed through it before plotting.
        fig_size (tuple): Figure size handed to ``plt.figure``.
        output_fig (bool): When False the figure is closed before it is returned.

    Returns:
        The created matplotlib figure.
    """
    spec = spectrogram
    if isinstance(spec, torch.Tensor):
        spec = spec.detach().cpu().numpy().squeeze()
    spec = spec.T
    # Half-precision data is promoted to float32 before it is drawn.
    if spec.dtype == np.float16:
        spec = spec.astype(np.float32)
    if ap is not None:
        spec = ap.denormalize(spec)

    fig = plt.figure(figsize=fig_size)
    image = plt.imshow(spec, aspect="auto", origin="lower")
    fig.colorbar(image)
    plt.tight_layout()

    if not output_fig:
        plt.close()
    return fig
|
|
|
|
|
def plot_pitch(pitch, spectrogram, ap=None, fig_size=(30, 10), output_fig=False):
    """Plot pitch curves on top of the spectrogram.

    Args:
        pitch (np.array): Pitch values.
        spectrogram (np.array or torch.Tensor): Spectrogram values. Tensors are
            detached, moved to the CPU, converted to numpy and squeezed. The
            data is transposed before it is drawn.
        ap (optional): Object exposing ``denormalize``; when given, the
            transposed spectrogram is passed through it before plotting.
        fig_size (tuple, optional): Figure size. ``None`` falls back to the
            current ``figure.figsize`` rcParam.
        output_fig (bool): When False the figure is closed before it is returned.

    Returns:
        The created matplotlib figure.

    Shapes:
        pitch: :math:`(T,)`
        spec: :math:`(C, T)`

    NOTE(review): the spectrogram is transposed before ``imshow`` and the x-axis
    is labelled "time", which suggests the input is laid out time-first --
    confirm the documented ``spec`` shape against the callers.
    """
    if isinstance(spectrogram, torch.Tensor):
        spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
    else:
        spectrogram_ = spectrogram.T
    # Half-precision data is promoted to float32 before it is drawn.
    if spectrogram_.dtype == np.float16:
        spectrogram_ = spectrogram_.astype(np.float32)
    if ap is not None:
        spectrogram_ = ap.denormalize(spectrogram_)

    # Size the figure directly instead of temporarily overwriting the global
    # ``figure.figsize`` rcParam: an exception while plotting can no longer
    # leave the process-wide setting modified.
    fig, ax = plt.subplots(figsize=fig_size)

    ax.imshow(spectrogram_, aspect="auto", origin="lower")
    ax.set_xlabel("time")
    ax.set_ylabel("spec_freq")

    # The pitch curve gets its own y-axis that shares the spectrogram's x-axis.
    ax2 = ax.twinx()
    ax2.plot(pitch, linewidth=5.0, color="red")
    ax2.set_ylabel("F0")

    if not output_fig:
        # Close this figure explicitly rather than "the current one".
        plt.close(fig)
    return fig
|
|
|
|
|
def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False):
    """Plot pitch curves on top of the input characters.

    Args:
        pitch (np.array): Pitch values.
        chars (str): Characters to place to the x-axis.
        fig_size (tuple, optional): Figure size. ``None`` falls back to the
            current ``figure.figsize`` rcParam.
        output_fig (bool): When False the figure is closed before it is returned.

    Returns:
        The created matplotlib figure.

    Shapes:
        pitch: :math:`(T,)`
    """
    # Size the figure directly instead of temporarily overwriting the global
    # ``figure.figsize`` rcParam: an exception while plotting can no longer
    # leave the process-wide setting modified.
    fig, ax = plt.subplots(figsize=fig_size)

    # One tick per character, labelled with that character.
    ax.set_xticks(np.arange(len(chars)))
    ax.set_xticklabels(list(chars))

    ax.set_xlabel("characters")
    ax.set_ylabel("freq")

    # The pitch curve gets its own y-axis that shares the character x-axis.
    ax2 = ax.twinx()
    ax2.plot(pitch, linewidth=5.0, color="red")
    ax2.set_ylabel("F0")

    if not output_fig:
        # Close this figure explicitly rather than "the current one".
        plt.close(fig)
    return fig
|
|
|
|
|
def _visualize_spectrogram_panel(spectrogram, hop_length, CONFIG, label_fontsize):
    """Draw one linear-frequency spectrogram panel on the current pyplot axes."""
    # ``import librosa`` does not load the ``display`` submodule on every
    # librosa release, so import it explicitly before use.
    import librosa.display  # pylint: disable=import-outside-toplevel

    librosa.display.specshow(
        spectrogram.T,
        sr=CONFIG.audio["sample_rate"],
        hop_length=hop_length,
        x_axis="time",
        y_axis="linear",
        fmin=CONFIG.audio["mel_fmin"],
        fmax=CONFIG.audio["mel_fmax"],
    )
    plt.xlabel("Time", fontsize=label_fontsize)
    plt.ylabel("Hz", fontsize=label_fontsize)
    plt.tight_layout()
    plt.colorbar()


def visualize(
    alignment,
    postnet_output,
    text,
    hop_length,
    CONFIG,
    tokenizer,
    stop_tokens=None,
    decoder_output=None,
    output_path=None,
    figsize=(8, 24),
    output_fig=False,
):
    """Intended to be used in Notebooks.

    Stacks up to four panels in one figure: the alignment, the stop-token
    values (only when given), the postnet spectrogram and the decoder
    spectrogram (only when given).

    Args:
        alignment: Alignment matrix; drawn transposed with ``imshow``.
        postnet_output: Spectrogram drawn in the third panel (transposed).
        text (str): Input text; its characters label the alignment y-axis.
        hop_length (int): Hop length forwarded to ``librosa.display.specshow``.
        CONFIG: Config object; ``use_phonemes`` and ``audio["sample_rate"]``,
            ``audio["mel_fmin"]``, ``audio["mel_fmax"]`` are read from it.
        tokenizer: Object exposing ``text_to_ids`` and ``ids_to_text``; used
            to round-trip ``text`` when ``CONFIG.use_phonemes`` is set.
        stop_tokens (optional): Sequence plotted in the second panel.
        decoder_output (optional): Spectrogram drawn in a fourth panel.
        output_path (str, optional): When set, the figure is saved there and
            then closed.
        figsize (tuple): Figure size handed to ``plt.figure``.
        output_fig (bool): When True the figure is returned (and, unless
            ``output_path`` is set, left open). When False the figure is
            closed and ``None`` is returned.

    Returns:
        The matplotlib figure when ``output_fig`` is True, otherwise ``None``.
    """
    num_plot = 4 if decoder_output is not None else 3

    label_fontsize = 16
    fig = plt.figure(figsize=figsize)

    # Panel 1: alignment. ``interpolation=None`` keeps matplotlib's default.
    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)

    if CONFIG.use_phonemes:
        # Round-trip through the tokenizer so the tick labels show the
        # tokenizer's rendering of the text.
        seq = tokenizer.text_to_ids(text)
        text = tokenizer.ids_to_text(seq)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()

    # Panel 2: stop-token values (slot stays empty when not provided).
    if stop_tokens is not None:
        plt.subplot(num_plot, 1, 2)
        plt.plot(range(len(stop_tokens)), list(stop_tokens))

    # Panel 3: postnet spectrogram.
    plt.subplot(num_plot, 1, 3)
    _visualize_spectrogram_panel(postnet_output, hop_length, CONFIG, label_fontsize)

    # Panel 4: decoder spectrogram.
    if decoder_output is not None:
        plt.subplot(num_plot, 1, 4)
        _visualize_spectrogram_panel(decoder_output, hop_length, CONFIG, label_fontsize)

    if output_path:
        print(output_path)
        fig.savefig(output_path)

    # Close exactly once and only this figure: a second bare ``plt.close()``
    # would close whichever unrelated figure happens to be active.
    if output_path or not output_fig:
        plt.close(fig)

    # Returning only on request keeps the default call's ``None`` result.
    return fig if output_fig else None
|
|