|
|
|
|
|
|
|
|
|
|
|
""" |
|
Signal processing-based evaluation using waveforms |
|
""" |
|
import numpy as np |
|
import os.path as op |
|
|
|
import torchaudio |
|
import tqdm |
|
from tabulate import tabulate |
|
|
|
from examples.speech_synthesis.utils import ( |
|
gross_pitch_error, voicing_decision_error, f0_frame_error |
|
) |
|
from examples.speech_synthesis.evaluation.eval_sp import load_eval_spec |
|
|
|
|
|
def difference_function(x, n, tau_max):
    """
    Compute the difference function of data x with an FFT-based
    autocorrelation (implemented directly with Numpy fft).

    For every lag ``tau`` in ``[0, tau_max)`` the returned value equals
    ``sum((x[j] - x[j + tau]) ** 2 for j in range(len(x) - tau))``.

    :param x: audio data (1-D sequence of numbers)
    :param n: length of data; not used, the length is read from ``x``
        itself (kept so existing callers keep working)
    :param tau_max: integration window size; clamped to ``len(x)``
    :return: difference function, one value per lag
    :rtype: numpy.ndarray
    """
    x = np.array(x, np.float64)
    w = x.size
    tau_max = min(tau_max, w)
    # energy_cumsum[k] == sum(x[:k] ** 2), with a leading 0 for k == 0.
    energy_cumsum = np.concatenate((np.array([0.]), (x * x).cumsum()))

    # Pad to an FFT-friendly length that is at least w + tau_max so the
    # circular autocorrelation does not wrap around for the lags we keep.
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    size_pad = min(k * 2 ** p2 for k in nice_numbers if k * 2 ** p2 >= size)

    fc = np.fft.rfft(x, size_pad)
    # The output length must be given explicitly: without it irfft assumes
    # an even length and returns size_pad - 1 points when size_pad is odd
    # (25 or 27), which corrupts the autocorrelation for short inputs.
    conv = np.fft.irfft(fc * fc.conjugate(), size_pad)[:tau_max]

    # d(tau) = sum(x[:w - tau] ** 2) + sum(x[tau:] ** 2) - 2 * autocorr(tau)
    return (energy_cumsum[w:w - tau_max:-1] + energy_cumsum[w]
            - energy_cumsum[:tau_max] - 2 * conv)
|
|
|
|
|
def cumulative_mean_normalized_difference_function(df, n):
    """
    Compute cumulative mean normalized difference function (CMND).

    The value at lag 0 is 1 and, for ``tau >= 1``, it is
    ``df[tau] * tau / sum(df[1:tau + 1])``. Lags whose running sum is
    exactly zero (e.g. an all-zero difference function) are set to 1
    instead of producing NaN/inf and Numpy runtime warnings.

    :param df: Difference function (array-like)
    :param n: length of data; ``range(1, n)`` must be broadcastable against
        ``df[1:]``, so normally ``n == len(df)``
    :return: cumulative mean normalized difference function
    :rtype: numpy.ndarray
    """
    df = np.asarray(df)
    tail = df[1:]
    running_sum = np.cumsum(tail).astype(float)
    scaled = tail * np.arange(1, n)

    # Silence the division warnings here; the undefined entries are
    # replaced explicitly on the next line.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = scaled / running_sum
    cmn_df = np.where(running_sum == 0, 1.0, ratio)

    return np.insert(cmn_df, 0, 1)
|
|
|
|
|
def get_pitch(cmdf, tau_min, tau_max, harmo_th=0.1):
    """
    Return fundamental period of a frame based on CMND function.

    Scans lags ``tau_min .. tau_max - 1`` for the first CMND value below
    ``harmo_th`` and then follows the curve downhill to the bottom of
    that dip.

    :param cmdf: Cumulative Mean Normalized Difference function
    :param tau_min: minimum period for speech (first lag examined)
    :param tau_max: maximum period for speech (exclusive upper lag bound)
    :param harmo_th: harmonicity threshold to determine if it is necessary
        to compute pitch frequency
    :return: fundamental period (lag index) if a value lies under the
        threshold, 0 otherwise
    :rtype: int
    """
    for lag in range(tau_min, tau_max):
        # Written as "not <" so NaN entries are skipped, never selected.
        if not cmdf[lag] < harmo_th:
            continue

        # First dip found: walk to its local minimum.
        dip = lag
        while dip + 1 < tau_max and cmdf[dip + 1] < cmdf[dip]:
            dip += 1
        return dip

    # No lag fell below the threshold.
    return 0
|
|
|
|
|
def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500,
                harmo_thresh=0.1):
    """
    Compute the Yin Algorithm. Return fundamental frequency and harmonic rate.

    https://github.com/NVIDIA/mellotron adaption of
    https://github.com/patriceguyot/Yin

    Frames start at offsets ``range(0, len(sig) - w_len, w_step)``, so a
    signal that is not longer than ``w_len`` yields no frames and four
    empty lists.

    :param sig: Audio signal (sequence of float)
    :param sr: sampling rate (int)
    :param w_len: size of the analysis window (samples)
    :param w_step: size of the lag between two consecutive windows (samples)
    :param f0_min: Minimum fundamental frequency that can be detected (hertz)
    :param f0_max: Maximum fundamental frequency that can be detected (hertz)
    :param harmo_thresh: Threshold of detection. The algorithm returns the
        first minimum of the CMND function below this threshold.

    :returns:

        * pitches: list of fundamental frequencies (0.0 where no pitch
          was found),
        * harmonic_rates: list of harmonic rate values for each fundamental
          frequency value (= confidence value)
        * argmins: minimums of the Cumulative Mean Normalized Difference
          Function, expressed as ``sr / lag`` (0.0 when the minimizing lag
          is not above the minimum period)
        * times: list of time of each estimation (frame start offset / sr)
    :rtype: tuple
    """
    tau_min = int(sr / f0_max)
    # A frame of w_len samples supports at most w_len lags. Without this
    # clamp a high sr / f0_min ratio makes the difference function shorter
    # than the lag range used by the CMND and the pitch search, which fails.
    tau_max = min(int(sr / f0_min), w_len)

    time_scale = range(0, len(sig) - w_len, w_step)
    times = [t / float(sr) for t in time_scale]

    pitches = [0.0] * len(time_scale)
    harmonic_rates = [0.0] * len(time_scale)
    argmins = [0.0] * len(time_scale)

    for i, start in enumerate(time_scale):
        frame = sig[start:start + w_len]

        df = difference_function(frame, w_len, tau_max)
        cm_df = cumulative_mean_normalized_difference_function(df, tau_max)
        p = get_pitch(cm_df, tau_min, tau_max, harmo_thresh)

        # Global minimum of the CMND, reported whether or not it passed
        # the harmonicity threshold.
        best_lag = np.argmin(cm_df)
        if best_lag > tau_min:
            argmins[i] = float(sr / best_lag)

        if p != 0:
            # A pitch was found: report it with its CMND value.
            pitches[i] = float(sr / p)
            harmonic_rates[i] = cm_df[p]
        else:
            # No pitch: keep 0.0 and report the lowest CMND value.
            harmonic_rates[i] = min(cm_df)

    return pitches, harmonic_rates, argmins, times
|
|
|
|
|
def extract_f0(samples):
    """
    Run YIN on the reference and synthesized waveform of every sample.

    :param samples: iterable of mappings with ``"ref"`` and ``"syn"``
        entries holding audio file paths
    :return: list aligned with ``samples``; each item is either ``None``
        (when one of the two files does not exist) or a dict with keys
        ``"ref"`` and ``"syn"`` holding the ``compute_yin`` output tuple
    :rtype: list
    """
    extracted = []
    for entry in tqdm.tqdm(samples):
        # Keep a placeholder so the output stays aligned with the input.
        if not (op.isfile(entry["ref"]) and op.isfile(entry["syn"])):
            extracted.append(None)
            continue

        ref_wav, sr = torchaudio.load(entry["ref"])
        syn_wav, _sr = torchaudio.load(entry["syn"])
        # Only index 0 along the first axis is analysed
        # (presumably the first audio channel - TODO confirm).
        ref_wav = ref_wav[0]
        syn_wav = syn_wav[0]
        assert sr == _sr, f"{sr} != {_sr}"

        extracted.append({
            "ref": compute_yin(ref_wav, sr),
            "syn": compute_yin(syn_wav, sr),
        })

    return extracted
|
|
|
|
|
def eval_f0_error(samples, distortion_fn):
    """
    Apply an F0 distortion measure to every extracted sample.

    :param samples: list as produced by ``extract_f0``: ``None`` entries
        or dicts whose ``"ref"`` / ``"syn"`` values are 4-tuples with the
        pitch track first and the frame times last
    :param distortion_fn: callable invoked as
        ``distortion_fn(ref_times, ref_f0, syn_times, syn_f0)``; its result
        must provide ``.item()``
    :return: list aligned with ``samples``; ``None`` for ``None`` inputs,
        otherwise ``(distortion, n_ref_frames, n_syn_frames)``
    :rtype: list
    """
    outcomes = []
    for entry in tqdm.tqdm(samples):
        if entry is None:
            outcomes.append(None)
            continue

        # Only the pitch track and time stamps are used; the two middle
        # elements of each tuple are ignored.
        ref_f0, _, _, ref_times = entry["ref"]
        syn_f0, _, _, syn_times = entry["syn"]

        ref_f0, ref_times = np.array(ref_f0), np.array(ref_times)
        syn_f0, syn_times = np.array(syn_f0), np.array(syn_times)

        score = distortion_fn(ref_times, ref_f0, syn_times, syn_f0)
        outcomes.append((score.item(), len(ref_f0), len(syn_f0)))

    return outcomes
|
|
|
|
|
def eval_gross_pitch_error(samples):
    """Evaluate ``samples`` with ``gross_pitch_error`` via ``eval_f0_error``."""
    return eval_f0_error(samples, distortion_fn=gross_pitch_error)
|
|
|
|
|
def eval_voicing_decision_error(samples):
    """Evaluate ``samples`` with ``voicing_decision_error`` via ``eval_f0_error``."""
    return eval_f0_error(samples, distortion_fn=voicing_decision_error)
|
|
|
|
|
def eval_f0_frame_error(samples):
    """Evaluate ``samples`` with ``f0_frame_error`` via ``eval_f0_error``."""
    return eval_f0_error(samples, distortion_fn=f0_frame_error)
|
|
|
|
|
def print_results(results, show_bin):
    """
    Print summary tables for a list of per-utterance results.

    :param results: list of ``None`` or ``(error, n_ref_frames,
        n_syn_frames)`` tuples; ``None`` entries are ignored
    :param show_bin: when true, additionally print one table per bin of
        reference length (column 1), using the edges
        0/200/400/600/800/1000/2000/4000; rows with a reference length of
        4000 or more appear only in the overall table
    :return: None; output goes to stdout. Note that this also sets the
        global Numpy print precision to 3.
    """
    results = np.array([r for r in results if r is not None])

    np.set_printoptions(precision=3)

    # With nothing to report the array is 1-D and empty, so the column
    # indexing below would raise IndexError; say so and stop instead.
    if results.size == 0:
        print("No valid results to report.")
        return

    def _print_result(rows):
        # One-row table: count, mean/std of the error column, and total
        # reference / synthesized lengths.
        res = {
            "nutt": len(rows),
            "error": rows[:, 0].mean(),
            "std": rows[:, 0].std(),
            "dur_ref": int(rows[:, 1].sum()),
            "dur_syn": int(rows[:, 2].sum()),
        }
        print(tabulate([res.values()], res.keys(), floatfmt=".4f"))

    print(">>>> ALL")
    _print_result(results)

    if show_bin:
        edges = [0, 200, 400, 600, 800, 1000, 2000, 4000]
        for low, high in zip(edges[:-1], edges[1:]):
            # Half-open bin [low, high) on the reference length.
            mask = np.logical_and(results[:, 1] >= low,
                                  results[:, 1] < high)
            if not mask.any():
                continue
            bin_results = results[mask]
            print(f">>>> ({low}, {high})")
            _print_result(bin_results)
|
|
|
|
|
def main(eval_f0, gpe, vde, ffe, show_bin):
    """
    Load an evaluation spec and print the requested F0 error metrics.

    :param eval_f0: argument forwarded to ``load_eval_spec``
    :param gpe: evaluate and print Gross Pitch Error when true
    :param vde: evaluate and print Voicing Decision Error when true
    :param ffe: evaluate and print F0 Frame Error when true
    :param show_bin: forwarded to ``print_results``
    """
    samples = load_eval_spec(eval_f0)

    # F0 extraction is only needed if at least one metric is requested.
    if not (gpe or vde or ffe):
        return
    f0_samples = extract_f0(samples)

    metrics = (
        (gpe, "===== Evaluate Gross Pitch Error =====",
         eval_gross_pitch_error),
        (vde, "===== Evaluate Voicing Decision Error =====",
         eval_voicing_decision_error),
        (ffe, "===== Evaluate F0 Frame Error =====",
         eval_f0_frame_error),
    )
    for enabled, banner, evaluate in metrics:
        if not enabled:
            continue
        print(banner)
        results = evaluate(f0_samples)
        print_results(results, show_bin)
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Command line: one positional eval spec plus boolean switches that
    # select the metrics to print and the per-bin breakdown.
    parser = argparse.ArgumentParser()
    parser.add_argument("eval_f0")
    for switch in ("--gpe", "--vde", "--ffe", "--show-bin"):
        parser.add_argument(switch, action="store_true")
    args = parser.parse_args()

    main(
        args.eval_f0,
        args.gpe,
        args.vde,
        args.ffe,
        args.show_bin,
    )
|
|