CRYSTAL-R1 / SoundScribe /speakerID.py
crystal-technologies's picture
Upload 1287 files
2d8da09
raw
history blame
1.46 kB
import SoundScribe.SpeakerID.nemo.collections.asr as nemo_asr
import torch
import os
# Load the pretrained speaker-verification checkpoint once, at import time;
# every embedding below (and in verify_speaker) is produced by this instance.
speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
    "nvidia/speakerverification_en_titanet_large")

# Raw directory listing of the enrolled-voice folder (relative to the CWD).
speakers = os.listdir("SoundScribe/voices")

# Enrolled voice files: every listed entry except those whose name starts
# with "." (hidden files).
identified_speakers = []
for speaker in speakers:
    if speaker.startswith("."):
        continue
    identified_speakers.append(speaker)

# One squeezed reference embedding per enrolled voice file, kept in the same
# order as identified_speakers so the two lists can be zipped together.
embeddings = [
    speaker_model.get_embedding("./SoundScribe/voices/" + voice_file).squeeze()
    for voice_file in identified_speakers
]
def verify_speaker(file, threshold=0.7):
    """Return the enrolled speaker whose voice best matches ``file``.

    The embedding of ``file`` is compared against every module-level
    reference embedding using cosine similarity rescaled from [-1, 1] to
    [0, 1]. Each per-speaker score is printed as it is computed.

    Args:
        file: Audio input handed unchanged to ``speaker_model.get_embedding``
            (presumably a path to an audio file -- confirm against the model
            API).
        threshold: Minimum rescaled similarity (exclusive) a speaker must
            exceed to be considered a match. Defaults to 0.7.

    Returns:
        The matching voice file's name up to its first ".", or
        "Unidentified User" when no enrolled speaker scores above
        ``threshold``. When several speakers tie for the best score, the one
        listed last wins.
    """
    query = speaker_model.get_embedding(file).squeeze()

    # The query side of the comparison does not depend on the enrolled
    # speaker, so normalise it (and take its self-dot-product) only once.
    X = query / torch.linalg.norm(query)
    x_dot_x = torch.dot(X, X)

    best_score = None
    probably_speaker = ""
    for reference, speaker in zip(embeddings, identified_speakers):
        Y = reference / torch.linalg.norm(reference)
        # Cosine similarity; X and Y are already unit length, so the
        # denominator is ~1 and only absorbs floating-point drift.
        similarity_score = torch.dot(X, Y) / ((x_dot_x * torch.dot(Y, Y)) ** 0.5)
        # Map [-1, 1] onto [0, 1] so the threshold reads as a fraction.
        similarity_score = (similarity_score + 1) / 2
        print(f"{speaker} - {similarity_score}")

        if similarity_score > threshold and (
            best_score is None or similarity_score >= best_score
        ):
            # ">=" keeps the "last speaker wins on a tie" behaviour.
            best_score = similarity_score
            probably_speaker = speaker.split(".")[0]

    return probably_speaker or "Unidentified User"
def find_user(audio):
    """Identify the speaker in ``audio``, print the result, and return it.

    ``audio`` is forwarded unchanged to ``verify_speaker``; the returned
    value is whatever ``verify_speaker`` produced.
    """
    identified = verify_speaker(audio)
    print(identified)
    return identified