Spaces:
Running
Running
Update inference.py
Browse files- inference.py +9 -11
inference.py
CHANGED
@@ -17,22 +17,22 @@ class InferencePipeline():
|
|
17 |
# download vocoder
|
18 |
self.hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, map_location=torch.device('cpu'))
|
19 |
|
20 |
-
# load source audio
|
21 |
-
#self.source, sr = torchaudio.load("test.wav")
|
22 |
-
#self.source = torchaudio.functional.resample(self.source, sr, 16000)
|
23 |
-
#self.source = self.source.unsqueeze(0)#.cuda()
|
24 |
-
|
25 |
# load target speaker embedding
|
26 |
self.trg_spk_emb = np.load('p225_007_mic1.npy')
|
27 |
self.trg_spk_emb = torch.from_numpy(self.trg_spk_emb)
|
28 |
self.trg_spk_emb = self.trg_spk_emb.unsqueeze(0)#.cuda()
|
29 |
|
30 |
-
def voice_conversion(self,
|
|
|
|
|
|
|
|
|
|
|
31 |
# run inference
|
32 |
self.model.eval()
|
33 |
with torch.inference_mode():
|
34 |
# Extract speech units
|
35 |
-
units = self.hubert.units(
|
36 |
# Generate target spectrogram
|
37 |
mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
|
38 |
# Generate audio waveform
|
@@ -41,8 +41,6 @@ class InferencePipeline():
|
|
41 |
# Assuming `target` is a tensor with the audio waveform
|
42 |
# Convert it to numpy array and save it as an output audio file
|
43 |
output_audio_path = "output.wav"
|
44 |
-
torchaudio.save(
|
45 |
-
|
46 |
-
return output_audio_path
|
47 |
|
48 |
-
|
|
|
17 |
# download vocoder
|
18 |
self.hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, map_location=torch.device('cpu'))
|
19 |
|
|
|
|
|
|
|
|
|
|
|
20 |
# load target speaker embedding
|
21 |
self.trg_spk_emb = np.load('p225_007_mic1.npy')
|
22 |
self.trg_spk_emb = torch.from_numpy(self.trg_spk_emb)
|
23 |
self.trg_spk_emb = self.trg_spk_emb.unsqueeze(0)#.cuda()
|
24 |
|
25 |
+
def voice_conversion(self, audio_path):
|
26 |
+
# load source audio
|
27 |
+
source, sr = torchaudio.load(audio_path) #"test.wav")
|
28 |
+
source = torchaudio.functional.resample(source, sr, 16000)
|
29 |
+
source = source.unsqueeze(0)#.cuda()
|
30 |
+
|
31 |
# run inference
|
32 |
self.model.eval()
|
33 |
with torch.inference_mode():
|
34 |
# Extract speech units
|
35 |
+
units = self.hubert.units(source)
|
36 |
# Generate target spectrogram
|
37 |
mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
|
38 |
# Generate audio waveform
|
|
|
41 |
# Assuming `target` is a tensor with the audio waveform
|
42 |
# Convert it to numpy array and save it as an output audio file
|
43 |
output_audio_path = "output.wav"
|
44 |
+
torchaudio.save("output.wav", target.squeeze(0), 16000)
|
|
|
|
|
45 |
|
46 |
+
return output_audio_path
|