Spaces:

uzdzn
/

voice_conversion_demo

Running

App Files Files Community

voice_conversion_demo / inference.py

uzdzn

Update inference.py

ec804af verified 9 months ago

raw

history blame

2.03 kB

	import torch
	import torchaudio
	import numpy as np
	from decoder_base import AcousticModel

	class InferencePipeline:
	    """Voice-conversion pipeline: content encoder -> acoustic model -> vocoder.

	    On construction this loads a soft-HuBERT content encoder and a HiFi-GAN
	    vocoder through ``torch.hub``, an ``AcousticModel`` decoder restored from
	    a local checkpoint (mapped to CPU), and a target speaker embedding stored
	    as a ``.npy`` file. ``voice_conversion`` then converts one audio file.
	    """

	    # Rate (Hz) the source audio is resampled to and the output file is
	    # written with.
	    SAMPLE_RATE = 16000

	    def __init__(self, ckpts_path='model-best.pt', spk_emb_path='p226_007_mic1.npy'):
	        """Load every model and the target speaker embedding.

	        Args:
	            ckpts_path: Path of the decoder checkpoint. The loaded object
	                must contain the state dict under the ``'acoustic-model'`` key.
	            spk_emb_path: Path of the ``.npy`` file holding the target
	                speaker embedding.
	        """
	        # Download the HuBERT content encoder.
	        self.hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)

	        # Initialize the decoder from the checkpoint.
	        # NOTE: torch.load unpickles the file, so only point ckpts_path at
	        # checkpoints you trust.
	        self.model = AcousticModel()
	        cp = torch.load(ckpts_path, map_location=torch.device('cpu'))
	        self.model.load_state_dict(cp['acoustic-model'])

	        # Download the vocoder.
	        self.hifigan = torch.hub.load(
	            "bshall/hifigan:main",
	            "hifigan_hubert_soft",
	            trust_repo=True,
	            map_location=torch.device('cpu'),
	        )

	        # Load the target speaker embedding and prepend an axis of size one
	        # (presumably the batch axis expected by the decoder -- TODO confirm).
	        spk_emb = torch.from_numpy(np.load(spk_emb_path))
	        self.trg_spk_emb = spk_emb.unsqueeze(0)

	    @staticmethod
	    def _to_mono(waveform):
	        """Average a (channels, samples) waveform down to a single channel.

	        A waveform that already has one channel is returned unchanged.
	        """
	        if waveform.size(0) > 1:
	            return waveform.mean(dim=0, keepdim=True)
	        return waveform

	    def voice_conversion(self, audio_data):
	        """Convert the given recording to the target speaker's voice.

	        Args:
	            audio_data: Path of the source audio file, or a tuple whose first
	                element is that path.

	        Returns:
	            The path of the written output file, ``"output.wav"`` (relative
	            to the current working directory; overwritten on every call).
	        """
	        # Extract the file path from the tuple, if one was passed.
	        audio_path = audio_data[0] if isinstance(audio_data, tuple) else audio_data
	        print(f"Loading audio from: {audio_path}")

	        # Load the source audio. Multi-channel files are downmixed first:
	        # without this a stereo file would reach the content encoder as a
	        # (1, 2, samples) tensor, whereas the encoder is assumed to take a
	        # single channel -- see the bshall/hubert usage notes.
	        source, sr = torchaudio.load(audio_path)
	        source = self._to_mono(source)
	        source = torchaudio.functional.resample(source, sr, self.SAMPLE_RATE)
	        source = source.unsqueeze(0)

	        # Run inference.
	        self.model.eval()
	        with torch.inference_mode():
	            # Extract speech units.
	            units = self.hubert.units(source)
	            # Generate the target spectrogram.
	            mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
	            # Generate the audio waveform.
	            target = self.hifigan(mel)

	        # Drop the leading axis of the vocoder output and write it to disk.
	        output_audio_path = "output.wav"
	        torchaudio.save(output_audio_path, target.squeeze(0), self.SAMPLE_RATE)

	        return output_audio_path