File size: 1,455 Bytes
2d8da09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import SoundScribe.SpeakerID.nemo.collections.asr as nemo_asr
import torch
import os
# Load the pretrained speaker-verification model once, at import time.
speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
    "nvidia/speakerverification_en_titanet_large"
)

# Enrolled voices: every entry of the voices directory whose name does not
# start with a dot.
speakers = os.listdir("SoundScribe/voices")
identified_speakers = []
for speaker in speakers:
    if speaker.startswith("."):
        continue
    identified_speakers.append(speaker)

# One reference embedding per enrolled voice, in the same order as
# identified_speakers, so the two lists can be zipped together later.
embeddings = [
    speaker_model.get_embedding(f"./SoundScribe/voices/{voice_file}").squeeze()
    for voice_file in identified_speakers
]
def verify_speaker(file, threshold=0.7):
    """Return the enrolled speaker whose voice best matches *file*.

    The embedding of *file* is compared with every entry of the module-level
    ``embeddings`` list using cosine similarity rescaled from [-1, 1] to
    [0, 1]. Each comparison is printed as ``"<voice file> - <score>"``.

    Args:
        file: Audio input forwarded unchanged to
            ``speaker_model.get_embedding`` (presumably a path to an audio
            file -- confirm against callers).
        threshold: Rescaled similarity a voice must strictly exceed to count
            as a match. Defaults to 0.7, the previously hard-coded value.

    Returns:
        The best-matching voice's file name up to its first ``"."``, or
        ``"Unidentified User"`` when no voice scores above *threshold*.
    """
    query = speaker_model.get_embedding(file).squeeze()

    best_score = None
    probably_speaker = ""
    for reference, speaker in zip(embeddings, identified_speakers):
        # cosine_similarity normalises both vectors itself (with an epsilon
        # guard on the norms), so no manual normalisation is needed and a
        # zero vector yields a finite score instead of NaN.
        cosine = torch.nn.functional.cosine_similarity(query, reference, dim=0)
        # Map [-1, 1] onto [0, 1] so the threshold reads as a fraction.
        similarity_score = (cosine + 1) / 2
        print(f"{speaker} - {similarity_score}")

        # Track the running best instead of rescanning a list of scores.
        # ">=" keeps the earlier behaviour on ties: the later voice wins.
        if similarity_score > threshold and (
            best_score is None or similarity_score >= best_score
        ):
            best_score = similarity_score
            probably_speaker = speaker.split(".")[0]

    return probably_speaker or "Unidentified User"
def find_user(audio):
    """Identify the speaker of *audio*, print the label, and return it.

    *audio* is passed straight to ``verify_speaker``; whatever label that
    call produces is echoed to stdout and handed back to the caller.
    """
    identified = verify_speaker(audio)
    print(identified)
    return identified