Spaces:
Running
Running
File size: 2,030 Bytes
2c7b92a ec804af 2c7b92a 6b6fc4f 2a30323 6b6fc4f 15564d2 2c7b92a 15564d2 2c7b92a 15564d2 2c7b92a 15564d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import torch
import torchaudio
import numpy as np
from decoder_base import AcousticModel
class InferencePipeline:
    """Voice-conversion pipeline: content encoder -> acoustic model -> vocoder.

    On construction the pipeline fetches the ``hubert_soft`` content encoder
    and the ``hifigan_hubert_soft`` vocoder through ``torch.hub``, restores an
    ``AcousticModel`` from a local checkpoint, and loads one target speaker
    embedding from a ``.npy`` file. ``voice_conversion`` then converts a
    source recording and writes the result to a wav file.
    """

    # Sample rate (Hz) the source audio is resampled to and the converted
    # audio is saved at.
    SAMPLE_RATE = 16000

    def __init__(self, ckpts_path='model-best.pt', spk_emb_path='p226_007_mic1.npy'):
        """Load the three models and the target speaker embedding.

        Args:
            ckpts_path: Path to the acoustic-model checkpoint. The file must
                contain the state dict under the ``'acoustic-model'`` key.
            spk_emb_path: Path to a ``.npy`` file with the target speaker
                embedding.
        """
        # download hubert content encoder
        self.hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)

        # initialize decoder with checkpoint
        self.model = AcousticModel()
        # NOTE(security): torch.load unpickles the file, so only checkpoints
        # from a trusted source should be passed here.
        cp = torch.load(ckpts_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(cp['acoustic-model'])

        # download vocoder
        self.hifigan = torch.hub.load(
            "bshall/hifigan:main",
            "hifigan_hubert_soft",
            trust_repo=True,
            map_location=torch.device('cpu'),
        )

        # load target speaker embedding and add a leading (batch) dimension
        self.trg_spk_emb = torch.from_numpy(np.load(spk_emb_path)).unsqueeze(0)

    @staticmethod
    def _to_mono_batch(source):
        """Downmix a (channels, frames) waveform to mono and add a batch dim.

        Multi-channel input is averaged across channels; single-channel input
        is passed through unchanged. The result has shape (1, 1, frames).
        """
        # NOTE(review): the content encoder is assumed to take a single
        # channel (batch, 1, frames) -- confirm against the bshall/hubert docs.
        if source.size(0) > 1:
            source = source.mean(dim=0, keepdim=True)
        return source.unsqueeze(0)

    def voice_conversion(self, audio_data, output_audio_path="output.wav"):
        """Convert the source audio to the target speaker and save it.

        Args:
            audio_data: Path of the source audio file, or a tuple whose first
                element is that path.
            output_audio_path: Where the converted wav file is written.

        Returns:
            The path of the written wav file (``output_audio_path``).

        Raises:
            ValueError: If ``audio_data`` is ``None``.
        """
        if audio_data is None:
            raise ValueError("No audio provided for voice conversion")

        # Extract the file path from the tuple
        audio_path = audio_data[0] if isinstance(audio_data, tuple) else audio_data
        print(f"Loading audio from: {audio_path}")

        # load source audio
        source, sr = torchaudio.load(audio_path)
        source = torchaudio.functional.resample(source, sr, self.SAMPLE_RATE)
        # Stereo (or wider) recordings are downmixed so they do not reach the
        # encoder as a multi-channel tensor.
        source = self._to_mono_batch(source)

        # run inference
        self.model.eval()
        with torch.inference_mode():
            # Extract speech units
            units = self.hubert.units(source)
            # Generate target spectrogram
            mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
            # Generate audio waveform
            target = self.hifigan(mel)

        # Drop the leading dimension of the vocoder output and write it out.
        torchaudio.save(output_audio_path, target.squeeze(0), self.SAMPLE_RATE)
        return output_audio_path