File size: 2,030 Bytes
2c7b92a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec804af
2c7b92a
 
 
6b6fc4f
 
 
2a30323
6b6fc4f
15564d2
 
 
 
 
2c7b92a
 
 
 
15564d2
2c7b92a
 
 
 
 
 
 
 
15564d2
2c7b92a
15564d2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import torch
import torchaudio
import numpy as np
from decoder_base import AcousticModel

class InferencePipeline():
    """Voice-conversion inference pipeline running on CPU.

    Combines three components that are loaded once at construction time:
    a HuBERT-soft content encoder and a HiFi-GAN vocoder (both fetched
    through ``torch.hub``), and a local ``AcousticModel`` decoder restored
    from a checkpoint file. A target speaker embedding is loaded from a
    ``.npy`` file and reused for every conversion.
    """

    # Sample rate (Hz) the source audio is resampled to and the output is
    # written at.
    SAMPLE_RATE = 16000

    def __init__(self, ckpts_path='model-best.pt', spk_emb_path='p226_007_mic1.npy'):
        """Load all models and the target speaker embedding.

        Args:
            ckpts_path: Path to the decoder checkpoint. The file must contain
                a state dict under the ``'acoustic-model'`` key.
            spk_emb_path: Path to a ``.npy`` file holding the target speaker
                embedding.
        """
        # download hubert content encoder
        self.hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)

        # initialize decoder with checkpoint
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        self.model = AcousticModel()
        cp = torch.load(ckpts_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(cp['acoustic-model'])

        # download vocoder
        self.hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, map_location=torch.device('cpu'))

        # load target speaker embedding and add a leading batch dimension
        spk_emb = torch.from_numpy(np.load(spk_emb_path))
        self.trg_spk_emb = spk_emb.unsqueeze(0)

    @staticmethod
    def _resolve_audio_path(audio_data):
        """Return the audio path: the first element of a tuple, else the value itself."""
        return audio_data[0] if isinstance(audio_data, tuple) else audio_data

    @staticmethod
    def _to_mono(waveform):
        """Average a (channels, time) waveform down to a single channel.

        Waveforms that already have one channel are returned unchanged.
        """
        if waveform.dim() == 2 and waveform.size(0) > 1:
            return waveform.mean(dim=0, keepdim=True)
        return waveform

    def voice_conversion(self, audio_data, output_audio_path="output.wav"):
        """Convert the source audio to the target speaker's voice.

        Args:
            audio_data: Path to the source audio file, or a tuple whose first
                element is that path.
            output_audio_path: Where the converted waveform is written.
                Defaults to ``"output.wav"`` in the working directory.

        Returns:
            The path of the written audio file (``output_audio_path``).
        """
        # Extract the file path from the tuple
        audio_path = self._resolve_audio_path(audio_data)
        print(f"Loading audio from: {audio_path}")

        # load source audio
        source, sr = torchaudio.load(audio_path)
        source = torchaudio.functional.resample(source, sr, self.SAMPLE_RATE)
        # Multi-channel input would otherwise reach the encoder as
        # (1, channels, time); collapse it to one channel before batching.
        source = self._to_mono(source)
        source = source.unsqueeze(0)

        # run inference
        self.model.eval()
        with torch.inference_mode():
            # Extract speech units
            units = self.hubert.units(source)
            # Generate target spectrogram; the last two axes are swapped
            # before vocoding (presumably time-major -> channel-major for
            # the vocoder; confirm against AcousticModel.generate).
            mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
            # Generate audio waveform
            target = self.hifigan(mel)

        # Drop the leading batch dimension and write the waveform to disk.
        torchaudio.save(output_audio_path, target.squeeze(0), self.SAMPLE_RATE)

        return output_audio_path