# Julius8888's picture
# Upload 226 files
# bc4f3f8 verified
# flake8: noqa: E402
# E402 (module-level import not at top of file) is suppressed on purpose:
# logger levels are configured between the imports below.
import os
import logging
import re_matching
from tools.sentence import split_by_language
# Raise the threshold of these third-party loggers to WARNING.
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)
# Root logger: INFO level, "| name | level | message" layout.
logging.basicConfig(
    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
)
logger = logging.getLogger(__name__)
import torch
import ssl
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, i.e. certificate verification is switched off for the
# whole process. Presumably a workaround so the nltk download below works
# on hosts with broken certificate stores - confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context
import nltk
# Requests the "cmudict" resource through nltk every time the module loads.
nltk.download('cmudict')
import utils
from infer import infer, latest_version, get_net_g, infer_multilang
import gradio as gr
import webbrowser
import numpy as np
from config import config
from tools.translate import translate
import librosa
# Synthesizer model; stays None until the __main__ section assigns it.
net_g = None
device = config.webui_config.device
if device == "mps":
    # NOTE(review): presumably lets operators that are unsupported on the
    # MPS backend fall back to the CPU - confirm against PyTorch docs.
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
def generate_audio(
    slices,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
    reference_audio,
    emotion,
    style_text,
    style_weight,
    skip_start=False,
    skip_end=False,
):
    """Run ``infer`` on every piece of ``slices`` and collect the audio.

    Each result is passed through gradio's ``convert_to_16_bit_wav`` and the
    converted arrays are returned as a list, in the order of ``slices``.
    Inference runs inside ``torch.no_grad()`` and uses the module globals
    ``hps``, ``net_g`` and ``device``.

    Note: the ``skip_start`` / ``skip_end`` argument values are never used.
    For every piece ``infer`` receives ``skip_start=True`` unless it is the
    first piece and ``skip_end=True`` unless it is the last piece.
    """

    def _render(position, segment):
        waveform = infer(
            segment,
            reference_audio=reference_audio,
            emotion=emotion,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
            sid=speaker,
            language=language,
            hps=hps,
            net_g=net_g,
            device=device,
            skip_start=position != 0,
            skip_end=position != len(slices) - 1,
            style_text=style_text,
            style_weight=style_weight,
        )
        return gr.processing_utils.convert_to_16_bit_wav(waveform)

    with torch.no_grad():
        return [_render(position, segment) for position, segment in enumerate(slices)]
def generate_audio_multilang(
    slices,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
    reference_audio,
    emotion,
    skip_start=False,
    skip_end=False,
):
    """Run ``infer_multilang`` on every piece of ``slices``.

    ``language`` is indexed in parallel with ``slices``: piece ``i`` is
    synthesized with ``language[i]``. Each result is converted with gradio's
    ``convert_to_16_bit_wav`` and the converted arrays are returned as a
    list. Inference runs inside ``torch.no_grad()`` and uses the module
    globals ``hps``, ``net_g`` and ``device``.

    Note: the ``skip_start`` / ``skip_end`` argument values are never used.
    For every piece the callee receives ``skip_start=True`` unless it is the
    first piece and ``skip_end=True`` unless it is the last piece.
    """

    def _render(position, segment):
        waveform = infer_multilang(
            segment,
            reference_audio=reference_audio,
            emotion=emotion,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
            sid=speaker,
            language=language[position],
            hps=hps,
            net_g=net_g,
            device=device,
            skip_start=position != 0,
            skip_end=position != len(slices) - 1,
        )
        return gr.processing_utils.convert_to_16_bit_wav(waveform)

    with torch.no_grad():
        return [_render(position, segment) for position, segment in enumerate(slices)]
def tts_split(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    cut_by_sent,
    interval_between_para,
    interval_between_sent,
    reference_audio,
    emotion,
    style_text,
    style_weight,
):
    """Synthesize ``text`` paragraph by paragraph, inserting pauses.

    Runs of blank lines are collapsed and every ``|`` is removed before the
    text is cut into paragraphs with ``re_matching.cut_para``. When
    ``cut_by_sent`` is true each paragraph is further cut with
    ``re_matching.cut_sent``.

    Args:
        cut_by_sent: also split every paragraph into sentences.
        interval_between_para: pause after each paragraph, in seconds.
        interval_between_sent: pause after each sentence, in seconds; only
            used when ``cut_by_sent`` is true.
        reference_audio, emotion, style_text, style_weight and the remaining
            synthesis parameters are forwarded unchanged to ``process_text``.

    Returns:
        ``("Success", (sampling_rate, audio))`` on success. If
        ``process_text`` reports invalid input through its
        ``(message, (sampling_rate, audio))`` tuple, that tuple is returned
        as is. If nothing was synthesized, ``("Empty input text", None)``.
    """
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")
    text = text.replace("|", "")

    # Pause lengths must be computed at the rate that is reported with the
    # returned audio, otherwise the pauses have the wrong duration for any
    # model that is not sampled at 44100 Hz.
    sampling_rate = hps.data.sampling_rate

    def _silence(seconds, dtype=np.float64):
        return np.zeros(int(sampling_rate * seconds), dtype=dtype)

    def _synthesize(fragment):
        return process_text(
            fragment,
            speaker,
            sdp_ratio,
            noise_scale,
            noise_scale_w,
            length_scale,
            language,
            reference_audio,
            emotion,
            style_text,
            style_weight,
        )

    audio_list = []
    for paragraph in re_matching.cut_para(text):
        if paragraph == "":
            continue

        if not cut_by_sent:
            pieces = _synthesize(paragraph)
            if isinstance(pieces, tuple):
                # process_text reports invalid "mix" text as a ready-made
                # (message, audio) result instead of a list of arrays.
                return pieces
            audio_list += pieces
            audio_list.append(_silence(interval_between_para, np.int16))
            continue

        sentence_audio = []
        for sentence in re_matching.cut_sent(paragraph):
            if sentence == "":
                continue
            pieces = _synthesize(sentence)
            if isinstance(pieces, tuple):
                return pieces
            sentence_audio += pieces
            sentence_audio.append(_silence(interval_between_sent))
        if not sentence_audio:
            # Nothing to concatenate or normalize for this paragraph.
            continue
        extra_pause = interval_between_para - interval_between_sent
        if extra_pause > 0:
            sentence_audio.append(_silence(extra_pause))
        # Convert the whole paragraph at once so that volume normalization
        # is applied over the complete group of sentences.
        audio_list.append(
            gr.processing_utils.convert_to_16_bit_wav(np.concatenate(sentence_audio))
        )

    if not audio_list:
        # np.concatenate raises on an empty sequence.
        return ("Empty input text", None)
    return ("Success", (sampling_rate, np.concatenate(audio_list)))
def process_mix(slice):
    """Group one parsed "mix" slice into per-segment text and language lists.

    ``slice`` is a sequence whose last item is the speaker and whose other
    items are ``(language, content)`` pairs. ``content`` may contain ``|``
    separators; empty parts are dropped. The first part of a pair is appended
    to the current segment, every further part opens a new segment.

    The input is not modified, so the function can be called repeatedly on
    the same slice.

    Returns:
        ``(texts, languages, speaker)`` where ``texts`` and ``languages`` are
        parallel lists of lists (one inner list per segment).

    Raises:
        IndexError: if ``slice`` is empty.
    """
    _speaker = slice[-1]
    _text, _lang = [], []
    for lang, content in slice[:-1]:
        parts = [part for part in content.split("|") if part != ""]
        if not parts:
            continue
        if not _text:
            # Very first content: every part is its own segment.
            _text = [[part] for part in parts]
            _lang = [[lang] for _ in parts]
            continue
        # The first part continues the segment that is currently open ...
        _text[-1].append(parts[0])
        _lang[-1].append(lang)
        # ... and each remaining part starts a new one.
        _text += [[part] for part in parts[1:]]
        _lang += [[lang] for _ in parts[1:]]
    return _text, _lang, _speaker
def process_auto(text):
    """Split ``text`` on ``|`` and detect the language of each sentence.

    Every non-empty ``|``-separated chunk is passed to ``split_by_language``
    with the target languages ``zh``, ``ja`` and ``en``. Empty sentences are
    skipped and the language codes are upper-cased.

    Returns:
        ``(texts, languages)``: two parallel lists holding one inner list of
        sentences / language codes per chunk.
    """
    grouped_text, grouped_lang = [], []
    for chunk in filter(None, text.split("|")):
        detected = [
            (sentence, lang.upper())
            for sentence, lang in split_by_language(
                chunk, target_languages=["zh", "ja", "en"]
            )
            if sentence != ""
        ]
        grouped_text.append([sentence for sentence, _ in detected])
        grouped_lang.append([code for _, code in detected])
    return grouped_text, grouped_lang
def process_text(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    reference_audio,
    emotion,
    style_text=None,
    style_weight=0,
):
    """Synthesize ``text`` according to the selected ``language`` mode.

    * ``"mix"``: the text is validated and parsed with ``re_matching``; each
      parsed slice is synthesized with the speaker named in the text.
    * ``"auto"`` (case-insensitive): languages are detected with
      ``process_auto`` and the code ``JA`` is rewritten to ``JP``.
    * anything else: the text is split on ``|`` and synthesized in the given
      language; only this mode uses ``style_text`` / ``style_weight``.

    Returns:
        A list of audio arrays. Exception: when ``"mix"`` text fails
        validation the return value is instead the tuple
        ``(message, (sampling_rate, half_a_second_of_zeros))`` - callers have
        to check for it.
    """
    if language == "mix":
        is_valid, message = re_matching.validate_text(text)
        if not is_valid:
            rate = hps.data.sampling_rate
            return message, (rate, np.zeros(rate // 2))

        mixed_audio = []
        for matched in re_matching.text_matching(text):
            _text, _lang, _speaker = process_mix(matched)
            if _speaker is None:
                continue
            print(f"Text: {_text}\nLang: {_lang}")
            mixed_audio.extend(
                generate_audio_multilang(
                    slices=_text,
                    sdp_ratio=sdp_ratio,
                    noise_scale=noise_scale,
                    noise_scale_w=noise_scale_w,
                    length_scale=length_scale,
                    speaker=_speaker,
                    language=_lang,
                    reference_audio=reference_audio,
                    emotion=emotion,
                )
            )
        return mixed_audio

    if language.lower() == "auto":
        _text, _lang = process_auto(text)
        print(f"Text: {_text}\nLang: {_lang}")
        _lang = [[code.replace("JA", "JP") for code in codes] for codes in _lang]
        return list(
            generate_audio_multilang(
                slices=_text,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                speaker=speaker,
                language=_lang,
                reference_audio=reference_audio,
                emotion=emotion,
            )
        )

    return list(
        generate_audio(
            slices=text.split("|"),
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
            speaker=speaker,
            language=language,
            reference_audio=reference_audio,
            emotion=emotion,
            style_text=style_text,
            style_weight=style_weight,
        )
    )
def tts_fn(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    reference_audio,
    emotion,
    prompt_mode,
    style_text=None,
    style_weight=0,
):
    """Synthesize ``text`` in one go and return ``(status, audio)``.

    Args:
        reference_audio: path of the audio prompt; only used when
            ``prompt_mode`` is ``"Audio prompt"``, in which case it is loaded
            with ``load_audio``. In every other mode it is discarded.
        prompt_mode: ``"Audio prompt"`` or any other value (text prompt).
        style_text: an empty string is treated as ``None``.
        The remaining parameters are forwarded to ``process_text``.

    Returns:
        ``("Success", (sampling_rate, audio))`` on success,
        ``("Invalid audio prompt", None)`` when audio prompt mode is selected
        without a file, the ``(message, (sampling_rate, audio))`` tuple of
        ``process_text`` when it rejects the input, or
        ``("No audio generated", None)`` when nothing was synthesized.
    """
    if style_text == "":
        style_text = None

    if prompt_mode == "Audio prompt":
        if reference_audio is None:
            return ("Invalid audio prompt", None)
        reference_audio = load_audio(reference_audio)[1]
    else:
        reference_audio = None

    audio_list = process_text(
        text,
        speaker,
        sdp_ratio,
        noise_scale,
        noise_scale_w,
        length_scale,
        language,
        reference_audio,
        emotion,
        style_text,
        style_weight,
    )
    if isinstance(audio_list, tuple):
        # process_text reports invalid "mix" text as a ready-made
        # (message, audio) result; concatenating it would raise.
        return audio_list
    if not audio_list:
        # np.concatenate raises on an empty sequence.
        return ("No audio generated", None)
    return "Success", (hps.data.sampling_rate, np.concatenate(audio_list))
def format_utils(text, speaker):
    """Rewrite plain ``text`` in the tagged format of the "mix" mode.

    Languages are detected with ``process_auto``. The result has the form
    ``[speaker]<lang>sentence<lang>sentence|<lang>sentence`` with lower-cased
    language tags and ``|`` between the groups returned by ``process_auto``.

    Returns:
        ``("mix", formatted_text)``. When no group is detected the text is
        just ``[speaker]``.
    """
    _text, _lang = process_auto(text)
    groups = [
        "".join(
            f"<{lang.lower()}>{content}" for lang, content in zip(lang_s, content_s)
        )
        for lang_s, content_s in zip(_lang, _text)
    ]
    # Joining avoids cutting off the closing "]" when there are no groups,
    # which stripping a trailing "|" by slicing would do.
    return "mix", f"[{speaker}]" + "|".join(groups)
def load_audio(path):
    """Load the audio file at ``path`` with librosa at 48000 Hz.

    Returns:
        ``(sampling_rate, audio)`` - note the order is the reverse of what
        ``librosa.load`` returns.
    """
    # The sample rate must be passed by keyword: it is keyword-only in
    # librosa >= 0.10, and the keyword form also works on older releases.
    audio, sr = librosa.load(path, sr=48000)
    return sr, audio
def gr_util(item):
    """Build the visibility updates for the two prompt widgets.

    Returns a pair of update dictionaries. The first is visible exactly when
    ``item`` equals ``"Text prompt"``, the second is visible in every other
    case.
    """
    text_selected = bool(item == "Text prompt")
    return (
        {"visible": text_selected, "__type__": "update"},
        {"visible": not text_selected, "__type__": "update"},
    )
import json
def load_json(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())
# Script entry point: load the model, build the Gradio UI and start serving.
if __name__ == "__main__":
    if config.webui_config.debug:
        logger.info("Enable DEBUG-LEVEL log")
        # logging.basicConfig() does nothing once the root logger has
        # handlers, and it was already configured earlier in this module, so
        # the level has to be changed on the root logger directly.
        logging.getLogger().setLevel(logging.DEBUG)
    hps = utils.get_hparams_from_file(config.webui_config.config_path)
    # Default to the latest version when the config does not specify one.
    version = getattr(hps, "version", latest_version)
    net_g = get_net_g(
        model_path=config.webui_config.model, version=version, device=device, hps=hps
    )
    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    languages = ["ZH", "JP", "EN", "auto", "mix"]
    # Supplies the "author" and "voice" entries shown in the page header.
    author_and_voice_data = load_json('author_and_voice_data.json')
    with gr.Blocks() as app:
        with gr.Row():
            # Left column: credits, text input and synthesis parameters.
            with gr.Column():
                gr.Markdown(value=f"""
作者:{author_and_voice_data["author"]}\n
聲音歸屬:{author_and_voice_data["voice"]}\n
Bert-VITS2項目:https://github.com/fishaudio/Bert-VITS2\n
Bert-VITS2-Colab:https://github.com/ADT109119/Bert-VITS2-Colab\n
使用本模型請嚴格遵守法規! \n
發布二創作品請標註本計畫作者及連結、作品使用Bert-VITS2 AI生成! \n
【提示】手機端容易誤觸調節,請刷新恢復預設! 每次產生的結果都不一樣,效果不好請嘗試多次產生與調節,選擇最佳結果! \n """)
                text = gr.TextArea(
                    label="輸入文本內容",
                    placeholder="""
推薦不同語言分開推理,因為無法連貫且可能影響最終效果!
若選擇語言為\'mix\',必須依照格式輸入,否則報錯:
格式舉例(zh是中文,jp是日語,en是英語;不區分大小寫):
[說話者]<zh>你好 <jp>こんにちは <en>Hello
另外,所有的語言選項都可以用'|'分割長段實現分句生成。
""", )
                speaker = gr.Dropdown(
                    choices=speakers, value=speakers[0], label="Speaker"
                )
                # The prompt-mode widgets below are created hidden.
                _ = gr.Markdown(
                    value="提示模式(Prompt mode):可選文字提示或音訊提示,用於產生文字或音訊指定風格的聲音。\n",
                    visible=False,
                )
                prompt_mode = gr.Radio(
                    ["Text prompt", "Audio prompt"],
                    label="Prompt Mode",
                    value="Text prompt",
                    visible=False,
                )
                text_prompt = gr.Textbox(
                    label="Text prompt",
                    placeholder="用文字描述生成風格。如:Happy",
                    value="Happy",
                    visible=False,
                )
                audio_prompt = gr.Audio(
                    label="Audio prompt", type="filepath", visible=False
                )
                sdp_ratio = gr.Slider(
                    minimum=0, maximum=1, value=0.5, step=0.01, label="SDP Ratio"
                )
                noise_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=0.5, step=0.01, label="Noise"
                )
                noise_scale_w = gr.Slider(
                    minimum=0.1, maximum=2, value=0.9, step=0.01, label="Noise_W"
                )
                length_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=1.0, step=0.01, label="Length"
                )
                language = gr.Dropdown(
                    choices=languages, value=languages[0], label="Language"
                )
                btn = gr.Button("點擊生成", variant="primary")
            # Right column: style text, split options and outputs.
            with gr.Column():
                with gr.Accordion("融合文本語義", open=False):
                    gr.Markdown(
                        value="使用輔助文本的語意來輔助生成對話(語言保持與主文本相同)\n\n"
                        "**注意**:不要使用**指令式文字**(如:開心),要使用**帶有強烈情感的文本**(如:我好快樂!!!)\n\n"
                        "效果較不明確,留空即為不使用該功能"
                    )
                    style_text = gr.Textbox(label="輔助文本")
                    style_weight = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.7,
                        step=0.1,
                        label="Weight",
                        info="主文本和輔助文本的bert混合比率,0表示僅主文本,1表示僅輔助文本",
                    )
                with gr.Row():
                    with gr.Column():
                        interval_between_sent = gr.Slider(
                            minimum=0,
                            maximum=5,
                            value=0.2,
                            step=0.1,
                            label="句間停頓(秒),勾選按句切分才生效",
                        )
                        interval_between_para = gr.Slider(
                            minimum=0,
                            maximum=10,
                            value=1,
                            step=0.1,
                            label="段間停頓(秒),需要大於句間停頓才有效",
                        )
                        opt_cut_by_sent = gr.Checkbox(
                            label="按句切分 在按段落切分的基礎上再按句子切分文本"
                        )
                        slicer = gr.Button("切分生成", variant="primary")
                text_output = gr.Textbox(label="狀態訊息")
                audio_output = gr.Audio(label="輸出音頻")
                # (A parameter-explanation image panel is disabled here.)
        # One-shot synthesis; the text prompt is passed as `emotion`.
        btn.click(
            tts_fn,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                audio_prompt,
                text_prompt,
                prompt_mode,
                style_text,
                style_weight,
            ],
            outputs=[text_output, audio_output],
            api_name="api"
        )
        # Paragraph / sentence split synthesis with configurable pauses.
        slicer.click(
            tts_split,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
                opt_cut_by_sent,
                interval_between_para,
                interval_between_sent,
                audio_prompt,
                text_prompt,
                style_text,
                style_weight,
            ],
            outputs=[text_output, audio_output],
        )
        # Show either the text prompt or the audio prompt widget.
        prompt_mode.change(
            gr_util,
            inputs=[prompt_mode],
            outputs=[text_prompt, audio_prompt],
        )
        audio_prompt.upload(
            load_audio,
            inputs=[audio_prompt],
            outputs=[audio_prompt],
        )
    app.launch(show_error=True)