File size: 1,455 Bytes
2d8da09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import SoundScribe.SpeakerID.nemo.collections.asr as nemo_asr
import torch
import os


# Load the pretrained speaker model identified by the checkpoint name below.
speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
    "nvidia/speakerverification_en_titanet_large"
)

# Every entry of the voices directory is a candidate enrolled speaker,
# except entries whose name starts with "." (skipped below).
speakers = os.listdir("SoundScribe/voices")
identified_speakers = []
for speaker in speakers:
    if speaker.startswith("."):
        continue
    identified_speakers.append(speaker)

# One squeezed embedding per enrolled voice, kept in the same order as
# identified_speakers so the two lists can be iterated in lockstep.
embeddings = [
    speaker_model.get_embedding("./SoundScribe/voices/" + voice_file).squeeze()
    for voice_file in identified_speakers
]

def verify_speaker(file, threshold=0.7):
    """Return the enrolled speaker whose voice best matches *file*.

    The embedding of *file* is compared with every entry of the module-level
    ``embeddings`` list using cosine similarity rescaled from [-1, 1] to
    [0, 1]. Each candidate's score is printed as it is computed.

    Args:
        file: Audio input forwarded unchanged to
            ``speaker_model.get_embedding`` (presumably a path to an audio
            file, as for the enrolled voices -- confirm against callers).
        threshold: Rescaled similarity a candidate must exceed to be
            accepted. Defaults to 0.7.

    Returns:
        The enrolled file name up to its first ``"."`` for the
        highest-scoring candidate above *threshold* (on a tie the later
        candidate wins), or ``"Unidentified User"`` when none qualifies.
    """
    embs1 = speaker_model.get_embedding(file).squeeze()
    # The query direction does not depend on the candidate, so normalise it
    # once instead of on every loop iteration.
    query = embs1 / torch.linalg.norm(embs1)

    best_score = None
    probably_speaker = ""
    for embs2, speaker in zip(embeddings, identified_speakers):
        candidate = embs2 / torch.linalg.norm(embs2)
        # The dot product of two unit vectors is their cosine similarity;
        # shift and halve it so the score lies in [0, 1].
        similarity_score = (torch.dot(query, candidate) + 1) / 2
        print(f"{speaker} - {similarity_score}")

        # Track the running best directly. ">=" keeps the later candidate on
        # an exact tie.
        if similarity_score > threshold and (
            best_score is None or similarity_score >= best_score
        ):
            best_score = similarity_score
            probably_speaker = speaker.split(".")[0]

    return probably_speaker or "Unidentified User"


def find_user(audio):
    """Identify the speaker of *audio*, print the result and return it.

    Args:
        audio: Passed straight through to ``verify_speaker``.

    Returns:
        Whatever ``verify_speaker`` returns for *audio*.
    """
    identified = verify_speaker(audio)
    print(identified)
    return identified