Linly-Talker / VITS /XTTS.py
linxianzhong0128's picture
Upload folder using huggingface_hub
7088d16 verified
import os
import subprocess
import uuid
import time
import torch
import torchaudio
# langid is used to detect language for longer text
# Most users expect text to be their own language, there is checkbox to disable it
import langid
import re
import gradio as gr
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
class XTTS():
def __init__(self):
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
# ModelManager().download_model(model_name)
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
# print("XTTS downloaded")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
self.model = Xtts.init_from_config(config)
self.model.load_checkpoint(
config,
checkpoint_path=os.path.join(model_path, "model.pth"),
vocab_path=os.path.join(model_path, "vocab.json"),
eval=True,
use_deepspeed=True,
)
self.model.cuda()
self.supported_languages = config.languages
def predict(self,
prompt,
language,
audio_file_pth,
voice_cleanup,
):
# 模型不支持语言
if language not in self.supported_languages:
gr.Warning(
f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
)
return (
None,
None,
None,
None,
)
language_predicted = langid.classify(prompt)[
0
].strip() # strip need as there is space at end!
# tts expects chinese as zh-cn
if language_predicted == "zh":
# we use zh-cn
language_predicted = "zh-cn"
print(f"Detected language:{language_predicted}, Chosen language:{language}")
speaker_wav = audio_file_pth
# Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
# This is fast filtering not perfect
# Apply all on demand
lowpassfilter = denoise = trim = loudness = True
if lowpassfilter:
lowpass_highpass = "lowpass=8000,highpass=75,"
else:
lowpass_highpass = ""
if trim:
# better to remove silence in beginning and end for microphone
trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
else:
trim_silence = ""
if voice_cleanup:
try:
out_filename = (
speaker_wav + str(uuid.uuid4()) + ".wav"
) # ffmpeg to know output format
# we will use newer ffmpeg as that has afftn denoise filter
shell_command = f"ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(
" "
)
command_result = subprocess.run(
[item for item in shell_command],
capture_output=False,
text=True,
check=True,
)
speaker_wav = out_filename
print("Filtered microphone input")
except subprocess.CalledProcessError:
# There was an error - command exited with non-zero code
print("Error: failed filtering, use original microphone input")
else:
speaker_wav = speaker_wav
if len(prompt) < 2:
gr.Warning("Please give a longer prompt text")
return (
None,
None,
None,
None,
)
metrics_text = ""
t_latent = time.time()
# note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
try:
(
gpt_cond_latent,
speaker_embedding,
) = self.model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
except Exception as e:
print("Speaker encoding error", str(e))
gr.Warning(
"It appears something wrong with reference, did you unmute your microphone?"
)
return (
None,
None,
None,
)
latent_calculation_time = time.time() - t_latent
# metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
# temporary comma fix
prompt= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2\2",prompt)
wav_chunks = []
## Direct mode
print("I: Generating new audio...")
t0 = time.time()
out = self.model.inference(
prompt,
language,
gpt_cond_latent,
speaker_embedding,
repetition_penalty=5.0,
temperature=0.75,
)
inference_time = time.time() - t0
print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
metrics_text+=f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
real_time_factor= (time.time() - t0) / out['wav'].shape[-1] * 24000
print(f"Real-time factor (RTF): {real_time_factor}")
metrics_text+=f"Real-time factor (RTF): {real_time_factor:.2f}\n"
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
return (
"output.wav",
metrics_text,
speaker_wav,
)
title = "语音克隆 Coqui🐸 XTTS"
examples = [
[
"Once when I was six years old I saw a magnificent picture",
"en",
"examples/female.wav",
None,
False,
False,
False,
True,
],
[
"Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
"fr",
"examples/male.wav",
None,
False,
False,
False,
True,
],
[
"Als ich sechs war, sah ich einmal ein wunderbares Bild",
"de",
"examples/female.wav",
None,
False,
False,
False,
True,
],
[
"Cuando tenía seis años, vi una vez una imagen magnífica",
"es",
"examples/male.wav",
None,
False,
False,
False,
True,
],
[
"Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
"pt",
"examples/female.wav",
None,
False,
False,
False,
True,
],
[
"Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
"pl",
"examples/male.wav",
None,
False,
False,
False,
True,
],
[
"Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
"it",
"examples/female.wav",
None,
False,
False,
False,
True,
],
[
"Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
"tr",
"examples/female.wav",
None,
False,
False,
False,
True,
],
[
"Когда мне было шесть лет, я увидел однажды удивительную картинку",
"ru",
"examples/female.wav",
None,
False,
False,
False,
True,
],
[
"Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
"nl",
"examples/male.wav",
None,
False,
False,
False,
True,
],
[
"Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
"cs",
"examples/female.wav",
None,
False,
False,
False,
True,
],
[
"当我还只有六岁的时候, 看到了一副精彩的插画",
"zh-cn",
"examples/female.wav",
None,
False,
False,
False,
True,
],
[
"かつて 六歳のとき、素晴らしい絵を見ました",
"ja",
"examples/female.wav",
None,
False,
True,
False,
True,
],
[
"한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
"ko",
"examples/female.wav",
None,
False,
True,
False,
True,
],
[
"Egyszer hat éves koromban láttam egy csodálatos képet",
"hu",
"examples/male.wav",
None,
False,
True,
False,
True,
],
]
def main():
with gr.Blocks(analytics_enabled=False) as demo:
with gr.Row():
with gr.Column():
gr.Markdown(
"""
## 语音克隆 (Coqui TTS)
"""
)
with gr.Column():
# 用于对齐图像的占位符
pass
with gr.Row():
with gr.Column():
gr.Markdown("文本提示: 一次一两个句子最好。最多200个文本字符。")
with gr.Column():
gr.Markdown("语言: 选择用于合成语音的输出语言。")
with gr.Row():
with gr.Column():
input_text_gr = gr.Textbox(
label="文本提示",
info="一次一两个句子最好。最多200个文本字符。",
value="嗨,我是你的新语音克隆。请尽量上传高质量的音频。",
)
language_gr = gr.Dropdown(
label="语言",
info="选择用于合成语音的输出语言",
choices=[
"en",
"es",
"fr",
"de",
"it",
"pt",
"pl",
"tr",
"ru",
"nl",
"cs",
"ar",
"zh-cn",
"ja",
"ko",
"hu",
"hi"
],
max_choices=1,
value="en",
)
ref_gr = gr.Audio(
label="参考音频",
type="filepath",
value="examples/female.wav",
sources=["microphone", "upload"],
)
clean_ref_gr = gr.Checkbox(
label="清理参考语音",
value=False,
info="如果您的麦克风或参考语音有噪音,此选项可以改善输出。",
)
tts_button = gr.Button("发送", elem_id="send-btn", visible=True)
with gr.Column():
audio_gr = gr.Audio(label="合成音频", autoplay=True)
out_text_gr = gr.Text(label="指标")
ref_audio_gr = gr.Audio(label="使用的参考音频")
with gr.Row():
gr.Examples(examples,
label="示例",
inputs=[input_text_gr, language_gr, ref_gr, clean_ref_gr],
outputs=[audio_gr, out_text_gr, ref_audio_gr],
fn=XTTS().predict,
cache_examples=False,)
tts_button.click(XTTS().predict, [input_text_gr, language_gr, ref_gr, clean_ref_gr], outputs=[audio_gr, out_text_gr, ref_audio_gr])
return demo
if __name__ == "__main__":
demo = main()
demo.launch()