File size: 16,353 Bytes
72dd395 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 |
import io
import os
# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
import gradio as gr
import gradio.processing_utils as gr_pu
import librosa
import numpy as np
import soundfile
from inference.infer_tool import Svc
import logging
import re
import json
import subprocess
import edge_tts
import asyncio
from scipy.io import wavfile
import librosa
import torch
import time
import traceback
from itertools import chain
from utils import mix_model
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('multipart').setLevel(logging.WARNING)
model = None
spk = None
debug = False
cuda = {}
if torch.cuda.is_available():
for i in range(torch.cuda.device_count()):
device_name = torch.cuda.get_device_properties(i).name
cuda[f"CUDA:{i} {device_name}"] = f"cuda:{i}"
def upload_mix_append_file(files,sfiles):
try:
if(sfiles == None):
file_paths = [file.name for file in files]
else:
file_paths = [file.name for file in chain(files,sfiles)]
p = {file:100 for file in file_paths}
return file_paths,mix_model_output1.update(value=json.dumps(p,indent=2))
except Exception as e:
if debug: traceback.print_exc()
raise gr.Error(e)
def mix_submit_click(js,mode):
try:
assert js.lstrip()!=""
modes = {"凸组合":0, "线性组合":1}
mode = modes[mode]
data = json.loads(js)
data = list(data.items())
model_path,mix_rate = zip(*data)
path = mix_model(model_path,mix_rate,mode)
return f"成功,文件被保存在了{path}"
except Exception as e:
if debug: traceback.print_exc()
raise gr.Error(e)
def updata_mix_info(files):
try:
if files == None : return mix_model_output1.update(value="")
p = {file.name:100 for file in files}
return mix_model_output1.update(value=json.dumps(p,indent=2))
except Exception as e:
if debug: traceback.print_exc()
raise gr.Error(e)
def modelAnalysis(model_path,config_path,cluster_model_path,device,enhance):
global model
try:
device = cuda[device] if "CUDA" in device else device
model = Svc(model_path.name, config_path.name, device=device if device!="Auto" else None, cluster_model_path = cluster_model_path.name if cluster_model_path != None else "",nsf_hifigan_enhance=enhance)
spks = list(model.spk2id.keys())
device_name = torch.cuda.get_device_properties(model.dev).name if "cuda" in str(model.dev) else str(model.dev)
msg = f"成功加载模型到设备{device_name}上\n"
if cluster_model_path is None:
msg += "未加载聚类模型\n"
else:
msg += f"聚类模型{cluster_model_path.name}加载成功\n"
msg += "当前模型的可用音色:\n"
for i in spks:
msg += i + " "
return sid.update(choices = spks,value=spks[0]), msg
except Exception as e:
if debug: traceback.print_exc()
raise gr.Error(e)
def modelUnload():
global model
if model is None:
return sid.update(choices = [],value=""),"没有模型需要卸载!"
else:
model.unload_model()
model = None
torch.cuda.empty_cache()
return sid.update(choices = [],value=""),"模型卸载完毕!"
def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold):
global model
try:
if input_audio is None:
raise gr.Error("你需要上传音频")
if model is None:
raise gr.Error("你需要指定模型")
sampling_rate, audio = input_audio
# print(audio.shape,sampling_rate)
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
temp_path = "temp.wav"
soundfile.write(temp_path, audio, sampling_rate, format="wav")
_audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold)
model.clear_empty()
os.remove(temp_path)
#构建保存文件的路径,并保存到results文件夹内
try:
timestamp = str(int(time.time()))
filename = sid + "_" + timestamp + ".wav"
output_file = os.path.join("./results", filename)
soundfile.write(output_file, _audio, model.target_sample, format="wav")
return f"推理成功,音频文件保存为results/{filename}", (model.target_sample, _audio)
except Exception as e:
if debug: traceback.print_exc()
return f"文件保存失败,请手动保存", (model.target_sample, _audio)
except Exception as e:
if debug: traceback.print_exc()
raise gr.Error(e)
def tts_func(_text,_rate,_voice):
#使用edge-tts把文字转成音频
# voice = "zh-CN-XiaoyiNeural"#女性,较高音
# voice = "zh-CN-YunxiNeural"#男性
voice = "zh-CN-YunxiNeural"#男性
if ( _voice == "女" ) : voice = "zh-CN-XiaoyiNeural"
output_file = _text[0:10]+".wav"
# communicate = edge_tts.Communicate(_text, voice)
# await communicate.save(output_file)
if _rate>=0:
ratestr="+{:.0%}".format(_rate)
elif _rate<0:
ratestr="{:.0%}".format(_rate)#减号自带
p=subprocess.Popen("edge-tts "+
" --text "+_text+
" --write-media "+output_file+
" --voice "+voice+
" --rate="+ratestr
,shell=True,
stdout=subprocess.PIPE,
stdin=subprocess.PIPE)
p.wait()
return output_file
def text_clear(text):
return re.sub(r"[\n\,\(\) ]", "", text)
def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key,cr_threshold):
#使用edge-tts把文字转成音频
text2tts=text_clear(text2tts)
output_file=tts_func(text2tts,tts_rate,tts_voice)
#调整采样率
sr2=44100
wav, sr = librosa.load(output_file)
wav2 = librosa.resample(wav, orig_sr=sr, target_sr=sr2)
save_path2= text2tts[0:10]+"_44k"+".wav"
wavfile.write(save_path2,sr2,
(wav2 * np.iinfo(np.int16).max).astype(np.int16)
)
#读取音频
sample_rate, data=gr_pu.audio_from_file(save_path2)
vc_input=(sample_rate, data)
a,b=vc_fn(sid, vc_input, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold)
os.remove(output_file)
os.remove(save_path2)
return a,b
def debug_change():
global debug
debug = debug_button.value
with gr.Blocks(
theme=gr.themes.Base(
primary_hue = gr.themes.colors.green,
font=["Source Sans Pro", "Arial", "sans-serif"],
font_mono=['JetBrains mono', "Consolas", 'Courier New']
),
) as app:
with gr.Tabs():
with gr.TabItem("推理"):
gr.Markdown(value="""
So-vits-svc 4.0 推理 webui
""")
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> 模型设置</font>
""")
model_path = gr.File(label="选择模型文件")
config_path = gr.File(label="选择配置文件")
cluster_model_path = gr.File(label="选择聚类模型文件(没有可以不选)")
device = gr.Dropdown(label="推理设备,默认为自动选择CPU和GPU", choices=["Auto",*cuda.keys(),"CPU"], value="Auto")
enhance = gr.Checkbox(label="是否使用NSF_HIFIGAN增强,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭", value=False)
with gr.Column():
gr.Markdown(value="""
<font size=3>左侧文件全部选择完毕后(全部文件模块显示download),点击“加载模型”进行解析:</font>
""")
model_load_button = gr.Button(value="加载模型", variant="primary")
model_unload_button = gr.Button(value="卸载模型", variant="primary")
sid = gr.Dropdown(label="音色(说话人)")
sid_output = gr.Textbox(label="Output Message")
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> 推理设置</font>
""")
auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声勾选此项会究极跑调)", value=False)
F0_mean_pooling = gr.Checkbox(label="是否对F0使用均值滤波器(池化),对部分哑音有改善。注意,启动该选项会导致推理速度下降,默认关闭", value=False)
vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
cluster_ratio = gr.Number(label="聚类模型混合比例,0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0)
slice_db = gr.Number(label="切片阈值", value=-40)
noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4)
with gr.Column():
pad_seconds = gr.Number(label="推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现", value=0.5)
cl_num = gr.Number(label="音频自动切片,0为不切片,单位为秒(s)", value=0)
lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=0)
lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75)
enhancer_adaptive_key = gr.Number(label="使增强器适应更高的音域(单位为半音数)|默认为0", value=0)
cr_threshold = gr.Number(label="F0过滤阈值,只有启动f0_mean_pooling时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音", value=0.05)
with gr.Tabs():
with gr.TabItem("音频转音频"):
vc_input3 = gr.Audio(label="选择音频")
vc_submit = gr.Button("音频转换", variant="primary")
with gr.TabItem("文字转音频"):
text2tts=gr.Textbox(label="在此输入要转译的文字。注意,使用该功能建议打开F0预测,不然会很怪")
tts_rate = gr.Number(label="tts语速", value=0)
tts_voice = gr.Radio(label="性别",choices=["男","女"], value="男")
vc_submit2 = gr.Button("文字转换", variant="primary")
with gr.Row():
with gr.Column():
vc_output1 = gr.Textbox(label="Output Message")
with gr.Column():
vc_output2 = gr.Audio(label="Output Audio", interactive=False)
with gr.TabItem("小工具/实验室特性"):
gr.Markdown(value="""
<font size=2> So-vits-svc 4.0 小工具/实验室特性</font>
""")
with gr.Tabs():
with gr.TabItem("静态声线融合"):
gr.Markdown(value="""
<font size=2> 介绍:该功能可以将多个声音模型合成为一个声音模型(多个模型参数的凸组合或线性组合),从而制造出现实中不存在的声线
注意:
1.该功能仅支持单说话人的模型
2.如果强行使用多说话人模型,需要保证多个模型的说话人数量相同,这样可以混合同一个SpaekerID下的声音
3.保证所有待混合模型的config.json中的model字段是相同的
4.输出的混合模型可以使用待合成模型的任意一个config.json,但聚类模型将不能使用
5.批量上传模型的时候最好把模型放到一个文件夹选中后一起上传
6.混合比例调整建议大小在0-100之间,也可以调为其他数字,但在线性组合模式下会出现未知的效果
7.混合完毕后,文件将会保存在项目根目录中,文件名为output.pth
8.凸组合模式会将混合比例执行Softmax使混合比例相加为1,而线性组合模式不会
</font>
""")
mix_model_path = gr.Files(label="选择需要混合模型文件")
mix_model_upload_button = gr.UploadButton("选择/追加需要混合模型文件", file_count="multiple", variant="primary")
mix_model_output1 = gr.Textbox(
label="混合比例调整,单位/%",
interactive = True
)
mix_mode = gr.Radio(choices=["凸组合", "线性组合"], label="融合模式",value="凸组合",interactive = True)
mix_submit = gr.Button("声线融合启动", variant="primary")
mix_model_output2 = gr.Textbox(
label="Output Message"
)
mix_model_path.change(updata_mix_info,[mix_model_path],[mix_model_output1])
mix_model_upload_button.upload(upload_mix_append_file, [mix_model_upload_button,mix_model_path], [mix_model_path,mix_model_output1])
mix_submit.click(mix_submit_click, [mix_model_output1,mix_mode], [mix_model_output2])
with gr.Tabs():
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> WebUI设置</font>
""")
debug_button = gr.Checkbox(label="Debug模式,如果向社区反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug)
vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
debug_button.change(debug_change,[],[])
model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance],[sid,sid_output])
model_unload_button.click(modelUnload,[],[sid,sid_output])
app.launch()
|