|
import SoundScribe.SpeakerID.nemo.collections.asr as nemo_asr |
|
import torch |
|
import os |
|
|
|
|
|
# Load the pretrained speaker model identified by its published name.
speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
    "nvidia/speakerverification_en_titanet_large"
)

# Raw directory listing of the enrolled voice samples.
speakers = os.listdir("SoundScribe/voices")

# Keep only the entries whose name does not begin with a dot.
identified_speakers = []
for speaker in speakers:
    if speaker.startswith("."):
        continue
    identified_speakers.append(speaker)

# One squeezed embedding per enrolled file, in the same order as
# ``identified_speakers`` so the two lists can be zipped together.
embeddings = [
    speaker_model.get_embedding("./SoundScribe/voices/" + file_name).squeeze()
    for file_name in identified_speakers
]
|
|
|
def verify_speaker(file, *, threshold=0.7):
    """Return the enrolled speaker whose voice best matches ``file``.

    The embedding of ``file`` is compared against every entry of the
    module-level ``embeddings`` list using cosine similarity rescaled from
    [-1, 1] to [0, 1]. Each score is printed as it is computed.

    Args:
        file: Audio input passed unchanged to ``speaker_model.get_embedding``.
        threshold: Rescaled similarity that a candidate must strictly exceed
            to be accepted. Defaults to 0.7.

    Returns:
        The best-scoring enrolled file name truncated at its first ".", or
        ``"Unidentified User"`` when no score exceeds ``threshold``. On an
        exact tie the later entry of ``identified_speakers`` wins.
    """
    query = speaker_model.get_embedding(file).squeeze()

    # Loop-invariant: the query is the same for every comparison, so it is
    # normalised (and its self dot product taken) only once.
    X = query / torch.linalg.norm(query)
    query_dot = torch.dot(X, X)

    best_score = None
    probably_speaker = ""
    for embs2, speaker in zip(embeddings, identified_speakers):
        Y = embs2 / torch.linalg.norm(embs2)

        # X and Y are already unit length, so the denominator is ~1; it is
        # kept so the scores stay numerically identical to earlier versions.
        similarity_score = torch.dot(X, Y) / ((query_dot * torch.dot(Y, Y)) ** 0.5)
        # Map cosine similarity from [-1, 1] onto [0, 1].
        similarity_score = (similarity_score + 1) / 2
        print(f"{speaker} - {similarity_score}")

        # Track the running best instead of rebuilding max() over a list on
        # every iteration; ">=" keeps the later-entry-wins tie rule.
        if similarity_score > threshold and (
            best_score is None or similarity_score >= best_score
        ):
            best_score = similarity_score
            # NOTE: everything after the first "." is dropped, so a file
            # named "a.b.wav" is reported as "a".
            probably_speaker = speaker.split(".")[0]

    return probably_speaker or "Unidentified User"
|
|
|
|
|
def find_user(audio):
    """Identify the speaker in ``audio``, print the result and return it.

    Args:
        audio: Audio input forwarded unchanged to ``verify_speaker``.

    Returns:
        Whatever ``verify_speaker`` returns for ``audio``.
    """
    identified = verify_speaker(audio)
    print(identified)
    return identified