In [16]:
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio
import re
from tqdm.notebook import tqdm


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
class SpeechData(torch.utils.data.Dataset):
    def __init__(self, device=DEVICE):
        self.device = device
        self.dataset = []
        self.data_path = "/media/leo/OS/knn-vc-master/eval/expand_result_XXL_cond_unpeq_100k_nondenoise"
        self.source_path = "/media/leo/OS/knn-vc-master/eval/data"
        for each in os.listdir(self.data_path):
            self.dataset.append(os.path.join(self.data_path, each))
        self.p = re.compile("{(.*)}_{(.*)}_to_.*.wav")
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, item):
        path = self.dataset[item]
        text = self.wav_to_text(path)
        audio, sr = torchaudio.load(path)
        assert sr == 16000
        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)
        mel = whisper.log_mel_spectrogram(audio)
        return (mel, text, audio)
    
    def wav_to_text(self, path):
        result = self.p.search(path)
        spk = result.group(1)
        utt = result.group(2)
        utt_split = utt.split("-")
        text_file = utt_split[0] + "-" + utt_split[1] + ".trans.txt"
        f = open(os.path.join(self.source_path, spk, text_file), 'r')
        text_annotation = f.readlines()
        for each in text_annotation:
            if utt in each:
                return each[len(utt)+1:].strip()
        
    

In [17]:
class SpeechData(torch.utils.data.Dataset):
    def __init__(self, device=DEVICE):
        self.device = device
        self.dataset = []
        self.source_path = "/media/leo/OS/knn-vc-master/eval/data"
        for spk in os.listdir(self.source_path):
            for utt in os.listdir(os.path.join(self.source_path, spk)):
                if utt.endswith("txt"):
                    continue
                self.dataset.append(os.path.join(self.source_path, spk, utt))

    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, item):
        path = self.dataset[item]
        text = self.wav_to_text(path)
        audio, sr = torchaudio.load(path)
        assert sr == 16000
        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)
        mel = whisper.log_mel_spectrogram(audio)
        return (mel, text, audio)
    
    def wav_to_text(self, path):
        base_name = os.path.basename(path).split(".")[0]
        dir_name = os.path.dirname(path)
        text_name = base_name.split("-")
        text_name = text_name[0] + "-" + text_name[1] + ".trans.txt"
        f = open(os.path.join(dir_name, text_name), 'r')
        text_annotation = f.readlines()
        for each in text_annotation:
            if base_name in each:
                return each[len(base_name)+1:].strip()
        
    

In [18]:
loader = torch.utils.data.DataLoader(SpeechData(), batch_size=1)
model = whisper.load_model("base.en")
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)
options = whisper.DecodingOptions(language="en", without_timestamps=True)

Model is English-only and has 71,825,408 parameters.


In [19]:
hypotheses = []
references = []
from tqdm import tqdm
for mels, texts, audio in tqdm(loader):
    results = model.decode(mels, options)
    hypotheses.extend([result.text for result in results])
    references.extend(texts)

100%|█████████████████████████████████████████| 100/100 [00:09<00:00, 10.10it/s]


In [5]:
hypotheses

['In his return to the camp, his acute and practiced intellects were intently engaged in devising means to counteract a watchfulness and suspicion on the part of his enemies, that he knew were in no degree inferior to his own.',
 'In his return to the camp, his acute and practiced intellects were intently engaged in devising means to counteract a watchfulness and suspicion on the part of his enemies that he knew were in no degree inferior to his own.',
 'In his return to the camp, his acute and practiced intellects were intently engaged in devising means to counteract a watchfulness and suspicion on the part of his enemies that he knew were in no degree inferior to his own.',
 'In his return to the camp, his acute and practiced intellects were intently engaged in devising means to counteract a watchfulness and suspicion on the part of his enemies that he knew were in no degree inferior to his own.',
 'In his return to the camp, his acute and practiced intellects were intently engaged i

In [6]:
references

['IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTISED INTELLECTS WERE INTENTLY ENGAGED IN DEVISING MEANS TO COUNTERACT A WATCHFULNESS AND SUSPICION ON THE PART OF HIS ENEMIES THAT HE KNEW WERE IN NO DEGREE INFERIOR TO HIS OWN',
 'IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTISED INTELLECTS WERE INTENTLY ENGAGED IN DEVISING MEANS TO COUNTERACT A WATCHFULNESS AND SUSPICION ON THE PART OF HIS ENEMIES THAT HE KNEW WERE IN NO DEGREE INFERIOR TO HIS OWN',
 'IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTISED INTELLECTS WERE INTENTLY ENGAGED IN DEVISING MEANS TO COUNTERACT A WATCHFULNESS AND SUSPICION ON THE PART OF HIS ENEMIES THAT HE KNEW WERE IN NO DEGREE INFERIOR TO HIS OWN',
 'IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTISED INTELLECTS WERE INTENTLY ENGAGED IN DEVISING MEANS TO COUNTERACT A WATCHFULNESS AND SUSPICION ON THE PART OF HIS ENEMIES THAT HE KNEW WERE IN NO DEGREE INFERIOR TO HIS OWN',
 'IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTISED INTELLECTS WERE INTENTLY ENGAGED IN DEVISING

In [7]:
data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
data

Unnamed: 0,hypothesis,reference
0,"In his return to the camp, his acute and pract...",IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTI...
1,"In his return to the camp, his acute and pract...",IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTI...
2,"In his return to the camp, his acute and pract...",IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTI...
3,"In his return to the camp, his acute and pract...",IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTI...
4,"In his return to the camp, his acute and pract...",IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTI...
...,...,...
9495,But first I will tell you that for many years ...,BUT FIRST I WILL TELL YOU THAT FOR MANY YEARS ...
9496,But first I will tell you that for many years ...,BUT FIRST I WILL TELL YOU THAT FOR MANY YEARS ...
9497,But first I will tell you that for many years ...,BUT FIRST I WILL TELL YOU THAT FOR MANY YEARS ...
9498,But first I will tell you that for many years ...,BUT FIRST I WILL TELL YOU THAT FOR MANY YEARS ...


In [8]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()
data["hypothesis_clean"] = [normalizer(text) for text in data["hypothesis"]]
data["reference_clean"] = [normalizer(text) for text in data["reference"]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"In his return to the camp, his acute and pract...",IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTI...,in his return to the camp his acute and practi...,in his return to the camp his acute and practi...
1,"In his return to the camp, his acute and pract...",IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTI...,in his return to the camp his acute and practi...,in his return to the camp his acute and practi...
2,"In his return to the camp, his acute and pract...",IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTI...,in his return to the camp his acute and practi...,in his return to the camp his acute and practi...
3,"In his return to the camp, his acute and pract...",IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTI...,in his return to the camp his acute and practi...,in his return to the camp his acute and practi...
4,"In his return to the camp, his acute and pract...",IN HIS RETURN TO THE CAMP HIS ACUTE AND PRACTI...,in his return to the camp his acute and practi...,in his return to the camp his acute and practi...
...,...,...,...,...
9495,But first I will tell you that for many years ...,BUT FIRST I WILL TELL YOU THAT FOR MANY YEARS ...,but 1st i will tell you that for many years i ...,but 1st i will tell you that for many years i ...
9496,But first I will tell you that for many years ...,BUT FIRST I WILL TELL YOU THAT FOR MANY YEARS ...,but 1st i will tell you that for many years i ...,but 1st i will tell you that for many years i ...
9497,But first I will tell you that for many years ...,BUT FIRST I WILL TELL YOU THAT FOR MANY YEARS ...,but 1st i will tell you that for many years i ...,but 1st i will tell you that for many years i ...
9498,But first I will tell you that for many years ...,BUT FIRST I WILL TELL YOU THAT FOR MANY YEARS ...,but 1st i will tell you that for many years i ...,but 1st i will tell you that for many years i ...


In [9]:
wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))
print(f"WER: {wer * 100:.2f} %")

WER: 5.17 %


In [10]:
wers = []
for i in range(len(list(data["reference_clean"]))):
    wer = jiwer.wer(list(data["reference_clean"])[i], list(data["hypothesis_clean"])[i])
    wers.append(wer)
print(f"WER: {np.mean(wers) * 100:.2f} +- {np.std(wers) * 100:.2f}%")

WER: 5.39 +- 6.47%


In [26]:
import torchaudio
import torch
from speechbrain.pretrained import EncoderClassifier
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", savedir="pretrained_models/spkrec-xvect-voxceleb", run_opts={"device":"cuda"})
target_signal, fs =torchaudio.load('./audio/7729.wav')
target_signal_embeddings = classifier.encode_batch(target_signal).squeeze()
for mels, texts, audio in tqdm(loader):
    embedding = classifier.encode_batch(audio).squeeze()
    print(cos(embedding, target_signal_embeddings))

100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 72.30it/s]

tensor([ 0.9981, -0.9844,  0.9353,  0.9083,  0.9383,  0.9480,  0.9983,  0.9970,
         0.9981,  0.9997,  0.9964,  0.9978,  0.9990,  0.9635,  0.9934,  0.9975,
         0.8338,  0.9894,  0.9578, -0.9975,  0.9953,  0.9596,  0.9953,  0.9887,
         0.9954,  0.9746,  0.9962, -0.9790,  0.8314,  0.9283,  0.9997,  0.9971,
         0.9963,  0.6858,  0.9914,  0.9941,  0.9965,  0.9992, -0.9959,  0.9965,
         0.9994,  0.9944,  0.9959,  0.9996,  0.9948,  0.9970,  0.9976,  0.9999,
         0.9999,  0.9850,  0.9986,  0.9825,  0.9440, -0.0214,  0.9859, -0.9651,
         0.9589,  0.9983,  0.9990,  0.9460,  0.9738,  0.9969,  0.9973,  0.3082,
         0.9769,  0.9979,  0.7766,  0.9944,  0.9991,  0.9967,  0.9934,  0.9946,
        -0.9440,  0.9767,  0.9771,  0.9917,  0.9971,  0.9858,  0.9900,  0.9984,
         0.9978,  0.9979,  0.9980,  0.9986,  0.9974,  0.9953,  0.9976,  0.9990,
        -0.9984,  0.9936,  0.9983,  0.9972,  0.9645,  0.9972,  0.9983, -0.9882,
         0.9991,  0.9962,  0.9985,  0.99




In [85]:
from matplotlib import pyplot as plt
import matplotlib
from importlib import reload
plt.style.use('ggplot')
matplotlib=reload(matplotlib)
plt.figure(figsize=(6,4.5))
x = [1,3,5,10,15]
y1 = [17.39,15.22,17.39,19.57,19.57]
y2 = [97.83,47.83,21.74,23.91,23.91]
plt.subplot(211)
plt.plot(x,y1,'--bo', label="Phoneme Hallucinator (Ours)")
plt.plot(x,y2,'--r+', label="kNN-VC")
plt.ylabel("Word Error Rate")

plt.legend()
plt.xticks([])
plt.subplot(212)
y1 = [0.8077,0.8222,0.8203,0.8221,0.8241]
y2 = [0.7524,0.8235, 0.8255, 0.8242, 0.8263]
plt.plot(x,y1,'--bo', label="Phoneme Hallucinator (Ours)")
plt.plot(x,y2,'--r+', label="kNN-VC")
plt.xlabel("Target Voice Duration (s)")
plt.ylabel("Speaker Similarity (cos)")
plt.xticks(x)
#plt.legend()
plt.tight_layout()
plt.savefig("knn-vc-ablation.pdf",dpi=300)

In [11]:
import torchaudio
import torch
from speechbrain.pretrained import EncoderClassifier
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", savedir="pretrained_models/spkrec-xvect-voxceleb", run_opts={"device":"cuda"})
target_signal, fs =torchaudio.load('./audio/7729_short.wav')
our_signal, fs =torchaudio.load('./amazing_sound.wav')
freevc_signal, fs =torchaudio.load('./freevc.wav')
target_signal_embeddings = classifier.encode_batch(target_signal).squeeze()
our_signal_embeddings = classifier.encode_batch(our_signal).squeeze()
freevc_signal_embeddings = classifier.encode_batch(freevc_signal).squeeze()
print(cos(our_signal_embeddings, target_signal_embeddings))
print(cos(freevc_signal_embeddings, target_signal_embeddings))

RuntimeError: Failed to open the input "./amazing_sound.wav" (No such file or directory).

In [11]:
import random
import numpy as np
import sklearn.metrics

def compute_eer(label, pred):
    # all fpr, tpr, fnr, fnr, threshold are lists (in the format of np.array)
    fpr, tpr, threshold = sklearn.metrics.roc_curve(label, pred)
    fnr = 1 - tpr

    # the threshold of fnr == fpr
    eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))]

    # theoretically eer from fpr and eer from fnr should be identical but they can be slightly differ in reality
    eer_1 = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
    eer_2 = fnr[np.nanargmin(np.absolute((fnr - fpr)))]

    # return the mean of eer from fpr and from fnr
    eer = (eer_1 + eer_2) / 2
    return eer


class SpeechData(torch.utils.data.Dataset):
    def __init__(self, device=DEVICE):
        self.device = device
        self.dataset = []
        self.ours_data_path = "/media/leo/OS/knn-vc-master/eval/expand_result_XXL_cond_unpeq_100k_nondenoise"
        self.freevc_data_path = "/media/leo/work/projects/FreeVC/eval"
        self.target_path = "/media/leo/OS/knn-vc-master/eval/data_short"
        #self.target_path = "/media/leo/OS/knn-vc-master/eval/data"
        for each in os.listdir(self.ours_data_path):
            self.dataset.append(os.path.join(self.ours_data_path, each))
        self.p = re.compile(".*to_{(.*)}_{(.*)}.*.wav")
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, item):
        path = self.dataset[item]
        ours_filename = os.path.basename(path)
        freevc_filename = os.path.join(self.freevc_data_path, ours_filename)
        result = self.p.search(path)
        spk = result.group(1)
        utt = result.group(2)
        tgt_filename = os.path.join(self.target_path, spk, utt + ".flac")
        tgt_others = os.listdir(os.path.join(self.target_path, spk))
        tgt_others.remove(utt + ".flac")
        tgt_others = [each for each in tgt_others if '.txt' not in each]
        tgt_others = random.choice(tgt_others)
        tgt_others_filename = os.path.join(self.target_path, spk, tgt_others)
        ours_audio, sr = torchaudio.load(path)
        freevc_audio, _ = torchaudio.load(freevc_filename)
        tgt_audio, _ = torchaudio.load(tgt_filename)
        tgt_other_audio, _ = torchaudio.load(tgt_others_filename)
        assert sr == 16000
        return (ours_audio[0].to(self.device), freevc_audio[0].to(self.device), tgt_audio[0].to(self.device), tgt_other_audio[0].to(self.device))

loader = torch.utils.data.DataLoader(SpeechData(), batch_size=1)
from speechbrain.pretrained import EncoderClassifier
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", savedir="pretrained_models/spkrec-xvect-voxceleb", run_opts={"device":"cuda"})
cos_ours = []
cos_freevc = []
cos_tgt = []
from tqdm import tqdm
for ours, freevc, tgt, tgt_other in tqdm(loader):
    ours_embed = classifier.encode_batch(ours).squeeze()
    freevc_embed = classifier.encode_batch(freevc).squeeze()
    tgt_embed = classifier.encode_batch(tgt).squeeze()
    tgt_others_embed = classifier.encode_batch(tgt_other).squeeze()
    
    cos_ours.append(cos(ours_embed, tgt_embed).item())
    cos_freevc.append(cos(freevc_embed, tgt_embed).item())
    cos_tgt.append(cos(tgt_embed, tgt_others_embed).item())
ours = cos_ours + cos_tgt
freevc = cos_freevc + cos_tgt
gt = [0] * len(cos_ours) + [1] * len(cos_tgt)
print(compute_eer(gt, ours))
print(compute_eer(gt, freevc))
print(np.mean(cos_ours),np.std(cos_ours))
print(np.mean(cos_freevc),np.std(cos_freevc))


    

Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /opt/conda/conda-bld/pytorch_1678402411778/work/aten/src/ATen/native/SpectralOps.cpp:862.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
100%|███████████████████████████████████| 9500/9500 [03:58<00:00, 39.89it/s]

0.472
0.3551578947368421
0.9540138582367647 0.010178320460466229
0.9460661411410883 0.011913444046103392





In [16]:
import numpy as np
from scipy.optimize import brentq
from scipy.interpolate import interp1d

def calculate_eer(genuine_scores, impostor_scores):
    # Create a function to interpolate the Receiver Operating Characteristic (ROC) curve
    fpr, tpr, thresholds = roc_curve(genuine_scores, impostor_scores)
    roc_interp = interp1d(fpr, tpr)

    # Find the point on the ROC curve closest to the coordinate (1, 1)
    eer_threshold = brentq(lambda x : 1. - x - roc_interp(x), 0., 1.)
    eer = roc_interp(eer_threshold)

    return eer
print(calculate_eer(gt,ours))

0.4961052631578947
