import io
import logging
import torch
import numpy as np
import soundfile as sf
import librosa
from flask import Flask, request, send_file
from flask_cors import CORS
from ddsp.vocoder import load_model, F0_Extractor, Volume_Extractor, Units_Encoder
from ddsp.core import upsample
from enhancer import Enhancer
app = Flask(__name__)
CORS(app)
logging.getLogger("numba").setLevel(logging.WARNING)

@app.route("/voiceChangeModel", methods=["POST"])
def voice_change_model():
    request_form = request.form
    wave_file = request.files.get("sample", None)
    # length of the safe leading pad (seconds)
    f_safe_prefix_pad_length = float(request_form.get("fSafePrefixPadLength", 0))
    print("f_safe_prefix_pad_length: " + str(f_safe_prefix_pad_length))
    # pitch shift in semitones
    f_pitch_change = float(request_form.get("fPitchChange", 0))
    # requested speaker id
    int_speak_id = int(request_form.get("sSpeakId", 0))
    if enable_spk_id_cover:
        int_speak_id = spk_id
    # print("speaker: " + str(int_speak_id))
    # sample rate expected by the DAW
    daw_sample = int(float(request_form.get("sampleRate", 0)))
    # read the wav payload from the HTTP request
    input_wav_read = io.BytesIO(wave_file.read())
    # model inference
    _audio, _model_sr = svc_model.infer(input_wav_read, f_pitch_change, int_speak_id, f_safe_prefix_pad_length)
    # resample to the DAW's rate (keyword arguments are required by librosa >= 0.10)
    tar_audio = librosa.resample(_audio, orig_sr=_model_sr, target_sr=daw_sample)
    # return the audio
    out_wav_path = io.BytesIO()
    sf.write(out_wav_path, tar_audio, daw_sample, format="wav")
    out_wav_path.seek(0)
    return send_file(out_wav_path, download_name="temp.wav", as_attachment=True)
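
# Illustrative client call (a sketch, not part of the original plugin protocol):
# the form field names match the handler above and the port matches app.run() below.
#   curl -X POST http://127.0.0.1:6844/voiceChangeModel \
#        -F "sample=@input.wav" \
#        -F "fPitchChange=0" \
#        -F "sSpeakId=1" \
#        -F "sampleRate=44100" \
#        -F "fSafePrefixPadLength=0" \
#        -o output.wav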

class SvcDDSP:
    def __init__(self, model_path, vocoder_based_enhancer, enhancer_adaptive_key, input_pitch_extractor,
                 f0_min, f0_max, threshold, spk_id, spk_mix_dict, enable_spk_id_cover):
        self.model_path = model_path
        self.vocoder_based_enhancer = vocoder_based_enhancer
        self.enhancer_adaptive_key = enhancer_adaptive_key
        self.input_pitch_extractor = input_pitch_extractor
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.threshold = threshold
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.spk_id = spk_id
        self.spk_mix_dict = spk_mix_dict
        self.enable_spk_id_cover = enable_spk_id_cover
        # load ddsp model
        self.model, self.args = load_model(self.model_path, device=self.device)
        # load units encoder
        if self.args.data.encoder == 'cnhubertsoftfish':
            cnhubertsoft_gate = self.args.data.cnhubertsoft_gate
        else:
            cnhubertsoft_gate = 10
        self.units_encoder = Units_Encoder(
            self.args.data.encoder,
            self.args.data.encoder_ckpt,
            self.args.data.encoder_sample_rate,
            self.args.data.encoder_hop_size,
            cnhubertsoft_gate=cnhubertsoft_gate,
            device=self.device)
        # load enhancer
        if self.vocoder_based_enhancer:
            self.enhancer = Enhancer(self.args.enhancer.type, self.args.enhancer.ckpt, device=self.device)
    def infer(self, input_wav, pitch_adjust, speaker_id, safe_prefix_pad_length):
        print("Infer!")
        # load input
        audio, sample_rate = librosa.load(input_wav, sr=None, mono=True)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio)
        hop_size = self.args.data.block_size * sample_rate / self.args.data.sampling_rate
        # safe front silence
        if safe_prefix_pad_length > 0.03:
            silence_front = safe_prefix_pad_length - 0.03
        else:
            silence_front = 0
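        # silence_front (in seconds) is passed to both the f0 extractor and the
        # enhancer below so they can skip over the padded leading silence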
        # extract f0
        pitch_extractor = F0_Extractor(
            self.input_pitch_extractor,
            sample_rate,
            hop_size,
            float(self.f0_min),
            float(self.f0_max))
        f0 = pitch_extractor.extract(audio, uv_interp=True, device=self.device, silence_front=silence_front)
        f0 = torch.from_numpy(f0).float().to(self.device).unsqueeze(-1).unsqueeze(0)
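        # shifting by pitch_adjust semitones multiplies f0 by 2 ** (pitch_adjust / 12)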
        f0 = f0 * 2 ** (float(pitch_adjust) / 12)
        # extract volume
        volume_extractor = Volume_Extractor(hop_size)
        volume = volume_extractor.extract(audio)
        mask = (volume > 10 ** (float(self.threshold) / 20)).astype('float')
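        # pad with edge values, then take a sliding max over a 9-frame window:
        # this dilates the voiced mask by 4 frames on each side, so the volume
        # gate opens slightly before and closes slightly after each voiced region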
        mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
        mask = np.array([np.max(mask[n : n + 9]) for n in range(len(mask) - 8)])
        mask = torch.from_numpy(mask).float().to(self.device).unsqueeze(-1).unsqueeze(0)
        mask = upsample(mask, self.args.data.block_size).squeeze(-1)
        volume = torch.from_numpy(volume).float().to(self.device).unsqueeze(-1).unsqueeze(0)
        # extract units
        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
        units = self.units_encoder.encode(audio_t, sample_rate, hop_size)
        # spk_id or spk_mix_dict
        if self.enable_spk_id_cover:
            spk_id = self.spk_id
        else:
            spk_id = speaker_id
        spk_id = torch.LongTensor(np.array([[spk_id]])).to(self.device)
        # forward and return the output
        with torch.no_grad():
            output, _, (s_h, s_n) = self.model(units, f0, volume, spk_id=spk_id, spk_mix_dict=self.spk_mix_dict)
            output *= mask
            if self.vocoder_based_enhancer:
                output, output_sample_rate = self.enhancer.enhance(
                    output,
                    self.args.data.sampling_rate,
                    f0,
                    self.args.data.block_size,
                    adaptive_key=self.enhancer_adaptive_key,
                    silence_front=silence_front)
            else:
                output_sample_rate = self.args.data.sampling_rate
            output = output.squeeze().cpu().numpy()
            return output, output_sample_rate
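
# Offline usage sketch (illustrative only; "input.wav" / "output.wav" are
# placeholders, and the constructor arguments mirror the defaults set in
# __main__ below):
#   model = SvcDDSP("exp/multi_speaker/model_300000.pt", True, 0, 'crepe',
#                   50, 1100, -60, 1, None, True)
#   audio, sr = model.infer("input.wav", pitch_adjust=0, speaker_id=1,
#                           safe_prefix_pad_length=0)
#   sf.write("output.wav", audio, sr)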

if __name__ == "__main__":
    # Only the parameters below need to be passed in for ddsp-svc.
    # This interfaces with 串串香火锅's VST plugin, https://github.com/zhaohui8969/VST_NetProcess-. The latest version is recommended.
    # The Flask part is adapted from code written by 小狼 of diff-svc.
    # The config file must be in the same directory as the model.
    checkpoint_path = "exp/multi_speaker/model_300000.pt"
    # Whether to enhance the output with the pretrained vocoder-based enhancer; this places higher demands on the hardware.
    use_vocoder_based_enhancer = True
    # Used together with the enhancer: 0 gives high audio quality within the normal
    # vocal range (up to G5); values above 0 help prevent cracking on very high notes.
    enhancer_adaptive_key = 0
    # f0 extractor: one of parselmouth, dio, harvest, crepe
    select_pitch_extractor = 'crepe'
    # f0 range limits (Hz)
    limit_f0_min = 50
    limit_f0_max = 1100
    # volume response threshold (dB)
    threshold = -60
    # Default speaker, and whether the default speaker should take priority and
    # override the speaker id passed in by the VST.
    spk_id = 1
    enable_spk_id_cover = True
    # Speaker mix dictionary (timbre-blending feature).
    # Setting this to a non-None dictionary overrides spk_id.
    spk_mix_dict = None  # e.g. {1: 0.5, 2: 0.5} mixes the timbres of speakers 1 and 2 at a 0.5:0.5 ratio
    svc_model = SvcDDSP(checkpoint_path, use_vocoder_based_enhancer, enhancer_adaptive_key, select_pitch_extractor,
                        limit_f0_min, limit_f0_max, threshold, spk_id, spk_mix_dict, enable_spk_id_cover)
    # This corresponds to the VST plugin: the port numbers must match.
    app.run(port=6844, host="0.0.0.0", debug=False, threaded=False)