Serhiy Stetskovych committed 3d2700d
Parent(s): 9cb2738

New multispeaker model
Changed files:
- .gitattributes +24 -0
- app.py +49 -77
- pflow/data/text_mel_datamodule.py +7 -2
- pflow/models/pflow_tts.py +5 -3
- pflow/text/cleaners.py +2 -2
- pflow/text/symbols.py +28 -2
- pflow/utils/model.py +5 -2
- pflow/utils/utils.py +0 -1
- prompts/speaker_1.wav +3 -0
- prompts/speaker_10.wav +3 -0
- prompts/speaker_11.wav +3 -0
- prompts/speaker_12.wav +3 -0
- prompts/speaker_13.wav +3 -0
- prompts/speaker_14.wav +3 -0
- prompts/speaker_15.wav +3 -0
- prompts/speaker_16.wav +3 -0
- prompts/speaker_17.wav +3 -0
- prompts/speaker_18.wav +3 -0
- prompts/speaker_19.wav +3 -0
- prompts/speaker_2.wav +3 -0
- prompts/speaker_20.wav +3 -0
- prompts/speaker_21.wav +3 -0
- prompts/speaker_22.wav +3 -0
- prompts/speaker_23.wav +3 -0
- prompts/speaker_24.wav +3 -0
- prompts/speaker_3.wav +3 -0
- prompts/speaker_5.wav +3 -0
- prompts/speaker_6.wav +3 -0
- prompts/speaker_7.wav +3 -0
- prompts/speaker_8.wav +3 -0
- prompts/speaker_9.wav +3 -0
.gitattributes
CHANGED
@@ -40,3 +40,27 @@ checkpoint_epoch=599.ckpt filter=lfs diff=lfs merge=lfs -text
 checkpoint_epoch=649.ckpt filter=lfs diff=lfs merge=lfs -text
 g_00140000_m filter=lfs diff=lfs merge=lfs -text
 checkpoints/g_00120000 filter=lfs diff=lfs merge=lfs -text
+checkpoints/checkpoint_epoch=100.ckpt filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_16.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_23.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_7.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_22.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_3.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_5.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_9.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_13.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_14.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_15.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_17.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_1.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_12.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_19.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_21.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_20.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_24.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_6.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_8.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_10.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_11.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_18.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_2.wav filter=lfs diff=lfs merge=lfs -text
app.py
CHANGED
@@ -1,49 +1,34 @@
 import os
-from pathlib import Path
 import torchaudio
-import gradio as gr
-
-import numpy as np
-
 import torch
-import
-
-
-from hifigan.config import v1
-from hifigan.denoiser import Denoiser
-from hifigan.env import AttrDict
-from hifigan.models import Generator as HiFiGAN
+import gradio as gr
 
 
 from pflow.models.pflow_tts import pflowTTS
-from pflow.text import text_to_sequence, sequence_to_text
+from pflow.text import text_to_sequence, sequence_to_text, cleaned_text_to_sequence
 from pflow.utils.utils import intersperse
 from pflow.data.text_mel_datamodule import mel_spectrogram
 from pflow.utils.model import normalize
 from vocos import Vocos
 
 
-
-
-
-PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=649.ckpt'
-#PFLOW_MODEL_PATH = 'checkpoint_m_epoch=054.ckpt'
+PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=100.ckpt'
 VOCODER22_MODEL_PATH = 'BSC-LT/vocos-mel-22khz'
 VOCODER44_MODEL_PATH = 'patriotyk/vocos-mel-hifigan-compat-44100khz'
 
-HIFIGAN_MODEL_PATH = 'checkpoints/g_00120000'
-
-
-volnorm = torchaudio.transforms.Vol(gain=-32, gain_type="db")
-
 
+volnorm = torchaudio.transforms.Vol(gain=-15, gain_type="db")
 prompts_dir = 'prompts'
-print(os.listdir(prompts_dir))
 prompts_list = sorted(os.listdir(prompts_dir), key=lambda x: x.split('.')[0])
 
-def process_text(text: str, device: torch.device):
+def process_text(text: str, device: torch.device, ipa=False):
+    if ipa:
+        seq = cleaned_text_to_sequence(text)
+    else:
+        seq = text_to_sequence(text, ["ukr_cleaners"])
+
     x = torch.tensor(
-        intersperse(
+        intersperse(seq, 0),
         dtype=torch.long,
        device=device,
    )[None]
@@ -53,18 +38,6 @@ def process_text(text: str, device: torch.device):
 
 
 
-
-def load_hifigan(checkpoint_path, device):
-    h = AttrDict(v1)
-    hifigan = HiFiGAN(h).to(device)
-    hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)["generator"])
-    _ = hifigan.eval()
-    hifigan.remove_weight_norm()
-    return hifigan
-
-
-
-
 def load_vocos(checkpoint_path, config_path, device):
     model = Vocos.from_hparams(config_path).to(device)
 
@@ -75,9 +48,6 @@ def load_vocos(checkpoint_path, config_path, device):
     return model
 
 
-def to_waveform(mel, vocoder, denoiser=None):
-    return vocoder.decode(mel).clamp(-1, 1).cpu().squeeze()
-
 
 def get_device():
     if torch.cuda.is_available():
@@ -93,51 +63,54 @@ device = get_device()
 model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
 _ = model.eval()
 
-
-hifigan = load_hifigan(HIFIGAN_MODEL_PATH, device)
-vocos_22050 = Vocos.from_pretrained(VOCODER22_MODEL_PATH).to(device)
-#vocos_44100 = load_vocos('checkpoints/vocos_checkpoint_epoch=209_step=3924480_val_loss=3.7036_44100_11.ckpt', 'vocos.yaml', device)
 vocos_44100 = Vocos.from_pretrained(VOCODER44_MODEL_PATH).to(device)
-
+vocos_22050 = Vocos.from_pretrained(VOCODER22_MODEL_PATH).to(device)
 
 
 @torch.inference_mode()
-def synthesise(text, prompt_selection, speed):
+def synthesise(text, ipa, prompt_selection, audio_prompt, temperature, speed):
+    print(text, prompt_selection, temperature, speed)
     if len(text) > 1000:
         raise gr.Error("Текст повинен бути коротшим за 1000 символів.")
 
-
-
-
-
+    if audio_prompt:
+        wav, sr = torchaudio.load(audio_prompt)
+        wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=44100)
+    else:
+        prompt_audio_path = os.path.join(prompts_dir, prompt_selection)
+        wav, _ = torchaudio.load(prompt_audio_path)
 
+    if ipa:
+        text_processed = process_text(ipa, device, ipa=True)
+    else:
+        text_processed = process_text(text.strip(), device, ipa=False)
+
+    prompt = mel_spectrogram(volnorm(wav), 2048, 80, 44100, 512, 2048, 0, 8000, center=False)[:,:,:264]
+
+
     output = model.synthesise(
         text_processed["x"].to(device),
         text_processed["x_lengths"].to(device),
         n_timesteps=40,
-        temperature=
+        temperature=temperature,
         length_scale=1/speed,
         prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
 
-        guidance_scale=
+        guidance_scale=1.8
 
     )
-
+
     waveform_vocos_44100 = vocos_44100.decode(output["mel"]).cpu().squeeze()
-
-
-
-
-    return text_processed['x_phones'][1::2], (44100, waveform_vocos_44100.numpy()), (22050, waveform_vocos.numpy()), (22050, transform(waveform_hifigan).numpy())
+    waveform_vocos = vocos_22050.decode(output["mel"]).cpu().squeeze()
+    return text_processed['x_phones'][1::2], (44100, waveform_vocos_44100.numpy()), (22050, waveform_vocos.numpy())
 
 
 description = f'''
-
-
-
-
-
-HIFIGAN 22050 аудіо - {HIFIGAN_MODEL_PATH}
+Модель натренована на приватному датасеті з аудіо книжок створненому за допомогою програми
+[narizaka](https://github.com/patriotyk/narizaka).
+Програма може не коректно визначати деякі наголоси і не дуже добре перетворює цифри, акроніми і різні скорочення в словесну форму.
+На даний момент, відкритого рішення для української мови для цих проблем нема, тому якщо у вас є запитання,
+чи ви хочете допомогти їх вирішити приєднуйтесь до нашого чату в [телеграм](https://t.me/speech_synthesis_uk) або [discord](https://discord.gg/yVAjkBgmt4)
 '''
 
 
@@ -147,7 +120,10 @@ if __name__ == "__main__":
        description=description,
        inputs=[
            gr.Text(label='Текст для синтезу:', lines=5, max_lines=10),
-           gr.
+           gr.Text(label='Aбо IPA:', lines=5, max_lines=10),
+           gr.Dropdown(label="Виберіть промт", choices=prompts_list, value=prompts_list[0]),
+           gr.Audio(label="Або завантажте свій:", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'}),
+           gr.Slider(minimum=0.0, maximum=1.0, label="Шум", value=0.7),
            gr.Slider(minimum=0.6, maximum=2.0, label="Швидкість", value=1.1)
        ],
        outputs=[
@@ -164,21 +140,17 @@ if __name__ == "__main__":
                streaming=False,
                type="numpy",
            ),
-           gr.Audio(
-               label="HIFIGAN 22050 аудіо:",
-               autoplay=False,
-               streaming=False,
-               type="numpy",
-           )
 
        ],
        allow_flagging ='manual',
-       #flagging_options=[("Якщо дуже погоне аудіо, тисни цю кнопку.", "negative")],
        cache_examples=True,
-       title='',
-
-
-
+       title='Генерація української мови за допомогою pflowtts.',
+       examples=[
+           ['Мені тринадцятий минало. Я пас ягнята за селом. Чи то так сонечко сіяло, Чи так мені чого було? Мені так любо, любо стало, Неначе в бога. Уже прокликали до паю, А я собі у бур\'яні Молюся богу І не знаю, Чого маленькому мені Тоді так приязно молилось, Чого так весело було?', "meˈnʲi trɪˈnad͡zʲt͡sʲɐtɪi̯ mɪˈnaɫɔ. jɐ pɐs jɐɦˈnʲatɐ zɐ seˈɫɔm. t͡ʃɪ tɔ tɐk ˈsɔnet͡ʃkɔ sʲiˈjɐɫɔ, t͡ʃɪ tɐk meˈnʲi t͡ʃɔˈɦɔ bʊˈɫɔ? meˈnʲi tɐk ˈlʲubɔ, ˈlʲubɔ ˈstaɫɔ, neˈnat͡ʃe ʋ ˈbɔɦɐ. ʊˈʒɛ prɔkɫɪkɐɫɪ dɔ ˈpajʊ, ɐ jɐ soˈbʲi ʊ bur-jɐˈnʲi moˈlʲusʲɐ ˈbɔɦʊ i ne ˈznajʊ, t͡ʃɔˈɦɔ mɐˈɫɛnʲkɔmʊ meˈnʲi toˈdʲi tɐk ˈprɪjɐznɔ mɔˈɫɪɫɔsʲ, t͡ʃɔˈɦɔ tɐk ˈʋɛseɫɔ bʊˈɫɔ?", '', 'prompts/speaker_22.wav', 0.6, 1.1],
+           ['Ти, малий, скажи малому, хай малий малому скаже, хай малий теля прив\'яже.', '', '', 'prompts/speaker_11.wav', 0.4, 1.1 ],
+           ['По мірі розвитку клубу зростатиме і кількість його членів, а отже, команда менеджменту теж буде пропорційно збільшуватись. Яка ж команда потрібна клубу, що налічує, скажімо, сто осіб, і які компетенції повинна мати?', '', '', 'prompts/speaker_20.wav', 0.7, 1.1],
+           ['Да ти дєтка гоніш! один рік? І що? Як ви задрали нити, рік вона не може, в когось діти мруть в день народження, викидні, а вона, бляха, рік не може, купи собі рожеве поні і реви побільше, дурепа.', 'dɐ tɪ dʲetkɐ ɦɔnʲiʃ! ɔˈdɪn rʲik? i ʃt͡ʃɔ? jɐk ʋɪ zɐˈdraɫɪ ˈnɪtɪ, rʲik wɔˈna ne ˈmɔʒe, ʋ kɔɦɔsʲ ˈdʲitɪ mrʊtʲ ʋ denʲ nɐˈrɔd͡ʒenʲːɐ, ˈʋɪkɪdʲnʲi, ɐ wɔˈna, ˈblʲaxɐ, rʲik ne ˈmɔʒe, kʊpɪ soˈbʲi rɔˈʒɛʋe ˈpɔnʲi i reʋɪ poˈbʲilʲʃe, dʊˈrɛpɐ.', '', 'prompts/speaker_5.wav', 0.7, 1.2]
+       ],
    )
    i.queue(max_size=20, default_concurrency_limit=4)
    i.launch(share=False, server_name="0.0.0.0")
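Note on the rewritten synthesise(): it conditions the model on roughly the first three seconds of the reference audio, since 264 mel frames x 512-sample hop / 44100 Hz is about 3.06 s, which lines up with prompt_size=264 in pflowTTS further down. A minimal sketch of that prompt-building step, assuming the positional arguments of mel_spectrogram follow the usual HiFi-GAN order (they are passed positionally in the diff, so the names below are an assumption):

# Sketch of the prompt construction; mel_spectrogram argument meanings are
# assumed to be (n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax).
import torchaudio
from pflow.data.text_mel_datamodule import mel_spectrogram

volnorm = torchaudio.transforms.Vol(gain=-15, gain_type="db")  # level-normalize the reference

wav, sr = torchaudio.load("prompts/speaker_1.wav")
wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=44100)

# 264 frames * 512 hop / 44100 Hz ~= 3.06 s of reference speech
prompt = mel_spectrogram(volnorm(wav), 2048, 80, 44100, 512, 2048, 0, 8000, center=False)[:, :, :264]
print(prompt.shape)  # (1, 80, 264) whenever the reference is at least ~3 s long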
pflow/data/text_mel_datamodule.py
CHANGED
@@ -39,6 +39,7 @@ class TextMelDataModule(LightningDataModule):
         f_max,
         data_statistics,
         seed,
+        min_sample_size,
     ):
         super().__init__()
 
@@ -68,6 +69,7 @@ class TextMelDataModule(LightningDataModule):
             self.hparams.f_max,
             self.hparams.data_statistics,
             self.hparams.seed,
+            self.hparams.min_sample_size,
         )
         self.validset = TextMelDataset(  # pylint: disable=attribute-defined-outside-init
             self.hparams.valid_filelist_path,
@@ -83,6 +85,7 @@ class TextMelDataModule(LightningDataModule):
             self.hparams.f_max,
             self.hparams.data_statistics,
             self.hparams.seed,
+            self.hparams.min_sample_size,
         )
 
     def train_dataloader(self):
@@ -134,6 +137,7 @@ class TextMelDataset(torch.utils.data.Dataset):
         f_max=8000,
         data_parameters=None,
         seed=None,
+        min_sample_size=4,
     ):
         self.filepaths_and_text = parse_filelist(filelist_path)
         self.n_spks = n_spks
@@ -146,6 +150,7 @@ class TextMelDataset(torch.utils.data.Dataset):
         self.win_length = win_length
         self.f_min = f_min
         self.f_max = f_max
+        self.min_sample_size = min_sample_size
         if data_parameters is not None:
             self.data_parameters = data_parameters
         else:
@@ -196,9 +201,9 @@ class TextMelDataset(torch.utils.data.Dataset):
 
     def __getitem__(self, index):
         datapoint = self.get_datapoint(self.filepaths_and_text[index])
-        if datapoint["wav"].shape[1] <=
+        if datapoint["wav"].shape[1] <= self.min_sample_size * self.sample_rate:
             '''
-            skip datapoint if too short (3s)
+            skip datapoint if too short (<4s , prompt is 3s)
             TODO To not waste data, we can concatenate wavs less than 3s and use them
             TODO as a hyperparameter; multispeaker dataset can use another wav of same speaker
             '''
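The new min_sample_size threads from TextMelDataModule down to TextMelDataset.__getitem__, which now skips any clip of at most min_sample_size seconds (min_sample_size * sample_rate samples): the first 3 s of each clip are consumed as the speaker prompt, so shorter clips leave almost no target speech. The hunk cuts off inside the docstring, so the fallback path is not shown; a minimal sketch assuming the dataset simply redraws a random index:

# Standalone sketch of the length filter; the redraw-on-short-clip fallback
# is an assumption, since the diff ends before the body of the if-branch.
import random

def __getitem__(self, index):  # method of TextMelDataset
    datapoint = self.get_datapoint(self.filepaths_and_text[index])
    # wav has shape (channels, samples); min_sample_size is in seconds,
    # so the threshold is min_sample_size * sample_rate samples.
    if datapoint["wav"].shape[1] <= self.min_sample_size * self.sample_rate:
        return self.__getitem__(random.randrange(len(self.filepaths_and_text)))
    return datapoint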
pflow/models/pflow_tts.py
CHANGED
@@ -5,7 +5,7 @@ import random
 import torch
 import torch.nn.functional as F
 
-
+from pflow import utils
 from pflow.models.baselightningmodule import BaseLightningClass
 from pflow.models.components.flow_matching import CFM
 from pflow.models.components.speech_prompt_encoder import TextEncoder
@@ -19,7 +19,7 @@ from pflow.utils.model import (
 from pflow.models.components import commons
 from pflow.models.components.aligner import Aligner, ForwardSumLoss, BinLoss
 
-
+log = utils.get_pylogger(__name__)
 
 class pflowTTS(BaseLightningClass):  #
     def __init__(
@@ -31,6 +31,7 @@ class pflowTTS(BaseLightningClass):  #
         cfm,
         data_statistics,
         prompt_size=264,
+        dur_p_use_log=False,
         optimizer=None,
         scheduler=None,
         **kwargs,
@@ -42,6 +43,7 @@ class pflowTTS(BaseLightningClass):  #
         self.n_vocab = n_vocab
         self.n_feats = n_feats
         self.prompt_size = prompt_size
+        self.dur_p_use_log = dur_p_use_log
         speech_in_channels = n_feats
 
         self.encoder = TextEncoder(
@@ -151,7 +153,7 @@ class pflowTTS(BaseLightningClass):  #
         )
 
         logw_ = torch.log(1e-8 + attn.sum(2)) * x_mask
-        dur_loss = duration_loss(logw, logw_, x_lengths)
+        dur_loss = duration_loss(logw, logw_, x_lengths, use_log=self.dur_p_use_log)
 
         # aln_hard, aln_soft, aln_log, aln_mask = self.aligner(
         #     mu_x.transpose(1,2), x_mask, y, y_mask
pflow/text/cleaners.py
CHANGED
@@ -3,7 +3,7 @@ from ukrainian_word_stress import Stressifier
 import regex
 import re
 from ipa_uk import ipa
-stressify = Stressifier()
+stressify = Stressifier(stress_symbol="ˈ")
 
 
 _whitespace_re = re.compile(r"\s+")
@@ -15,5 +15,5 @@ def ukr_cleaners(text):
     text = collapse_whitespace(text)
     text = norm(text).lower()
 
-    text = regex.sub(r'[
+    text = regex.sub(r'[^\ˈ\p{L}\p{N}\?\!\,\.\-\: ]', '', text)
     return ipa(stressify(text), False)
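Two coordinated changes here: Stressifier(stress_symbol="ˈ") makes the stress annotation use the IPA primary-stress character, and the rewritten whitelist regex keeps that character along with letters (\p{L}), digits (\p{N}) and basic punctuation, stripping everything else before the text reaches ipa(). A quick self-contained check of the new pattern:

# Demonstrates the updated whitelist from ukr_cleaners: the stress mark
# survives, while guillemets and parentheses are stripped.
import regex

keep = regex.compile(r'[^\ˈ\p{L}\p{N}\?\!\,\.\-\: ]')
print(keep.sub('', '«прˈивіт», (світе)!'))  # -> 'прˈивіт, світе!'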
pflow/text/symbols.py
CHANGED
@@ -2,14 +2,40 @@
 
 Defines the set of symbols used in text input to the model.
 """
+# _pad = "_"
+# _punctuation = '-´;:,.!?¡¿—…"«»“” '
+# _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+# _letters_ipa = (
+#     "éýíó'̯'͡ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+# )
+
+
+# # Export all symbols:
+# symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+
+# # Special symbol ids
+# SPACE_ID = symbols.index(" ")
+
+# _pad = "_"
+# _punctuation = '()-;:,.!?¡¿—…"«»“” '
+# _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzАБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгґдеєжзиіїйклмнопрстуфхцчшщьюя"
+# _letters_ipa = (
+#     "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ̯͡"
+# )
+
+# # Export all symbols:
+# symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+
+# # Special symbol ids
+# SPACE_ID = symbols.index(" ")
+
 _pad = "_"
 _punctuation = '-´;:,.!?¡¿—…"«»“” '
 _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
 _letters_ipa = (
-    "éýíó'̯'
+    "éýíó'̯'͡ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲ'̩'ᵻ"
 )
 
-
 # Export all symbols:
 symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
 
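The two commented-out blocks preserve earlier symbol inventories for reference; the active set keeps the accented vowels éýíó but drops ʷ, ˠ, ˤ, ˞ and the arrow marks carried by the second commented variant. Every character the cleaner emits must appear in this list, since downstream encoding is an index lookup; a sketch of that lookup (the real implementation lives in pflow.text, so encode here is illustrative):

# Illustrative symbol-to-id lookup over the exported inventory; the actual
# encoding function in pflow.text may differ, this only shows the contract.
from pflow.text.symbols import symbols

_symbol_to_id = {s: i for i, s in enumerate(symbols)}

def encode(cleaned_ipa):
    # A KeyError here means the cleaner produced a character missing from _letters_ipa.
    return [_symbol_to_id[ch] for ch in cleaned_ipa]

print(encode("meˈnʲi"))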
pflow/utils/model.py
CHANGED
@@ -41,8 +41,11 @@ def generate_path(duration, mask):
     return path
 
 
-def duration_loss(logw, logw_, lengths):
-    loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths)
+def duration_loss(logw, logw_, lengths, use_log=False):
+    if use_log:
+        loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths)
+    else:
+        loss = torch.sum((torch.exp(logw) - torch.exp(logw_)) ** 2) / torch.sum(lengths)
     return loss
 
 
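duration_loss now defaults to penalising the squared error between exponentiated durations, i.e. in actual frame counts, while use_log=True keeps the old log-domain objective; pflowTTS selects between the two via dur_p_use_log. A worked comparison on tiny tensors:

# Both branches take log-durations; only the domain of the squared error differs.
import torch
from pflow.utils.model import duration_loss

logw  = torch.log(torch.tensor([[2.0, 8.0]]))   # predicted: 2 and 8 frames
logw_ = torch.log(torch.tensor([[4.0, 16.0]]))  # target:    4 and 16 frames
lengths = torch.tensor([2])

print(duration_loss(logw, logw_, lengths, use_log=True))   # log-domain: 2*log(2)^2 / 2 ~= 0.48
print(duration_loss(logw, logw_, lengths, use_log=False))  # frame-domain: ((2-4)^2 + (8-16)^2) / 2 = 34.0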
pflow/utils/utils.py
CHANGED
@@ -206,7 +206,6 @@ def get_user_data_dir(appname="pflow_tts"):
 
 
 def assert_model_downloaded(checkpoint_path, url, use_wget=False):
-    print(checkpoint_path)
     if Path(checkpoint_path).exists():
         log.debug(f"[+] Model already present at {checkpoint_path}!")
         return
prompts/speaker_1.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0dd649f417994c06fa40481eb41f7356eeb401881f567668c4526dd58567e10
+size 344026
prompts/speaker_10.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:686e88155eef9caac215f4a538be47656d4b992a3ae59f33e026b6f547bef1e8
+size 379046
prompts/speaker_11.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d86982b3e7dba7704088f8459e90c8aacfb9b27a011372c05746c3b906aa88fb
+size 396946
prompts/speaker_12.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1521a2400b70de8ab89dd7083ea7b2d69547dfa0fd44ba98d42c1c16ba959438
+size 458686
prompts/speaker_13.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2ce78bb8e505ad6687139c3fb6ff8ca0d649a139d65e58c0cbfff1b9095efc7
+size 458426
prompts/speaker_14.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:397b9e44669ead0f6baac314ad2f0b32218db9054b20a9c371474b5d49525ef9
+size 480736
prompts/speaker_15.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d614a13acf2f999a715194c3efb4f74a30f33b98823e6bf693bf26bf79d6f653
+size 573086
prompts/speaker_16.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c90db783c7be686c70d1b2d15400f414bc6ede6f59952b9a1b5c6bb1a96a16c7
+size 511346
prompts/speaker_17.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2ce2df72aa34de17f0e833c903a561e650b35d3b03b142c0bd6d0f1d7d4d4e2
+size 635086
prompts/speaker_18.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f0212146531c97752dbdf5615950298518748e655c9cf0d8f07f57e571eaf9a
+size 445198
prompts/speaker_19.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6e3e9226de668b70ab0c61e79b9ab42f3a5fb933fe874565b0a6daea1f0e570
+size 427556
prompts/speaker_2.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22b38f05add585086e84aa5b1ad5cc60902e97d911c7f5c25efa9907e3a3bd32
+size 674514
prompts/speaker_20.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b27c473712a82d638acfba3a1ea54bde3f0efefe111452101f3798c3aaa46f05
+size 485146
prompts/speaker_21.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f24e374c7ffcc4a2ceee696d6ee21996300997bf56a437212455780d2feab34
+size 379046
prompts/speaker_22.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ad69e9dd8c7c4886319730811920c0e4f56f486203b4ef76acab9f145cd9168
+size 1300738
prompts/speaker_23.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e97404bfd2b086ccb387ec5dff7ab5285dcb68ff789be361b03b0f368425ccf1
+size 1243406
prompts/speaker_24.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8dbbf87ac7a9c229ef8ab0cdc446c1e2408eeb7c0dab56784c221313fe2ffb7
+size 626266
prompts/speaker_3.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aef97ae48b0aac98ac3b3eb01494546d3a39fa1d864361efb01878f21b933fc7
+size 582166
prompts/speaker_5.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2430689318110e4730db7d9f83ad404535a1ac6c632b59483c0693d4804b6f9
+size 418738
prompts/speaker_6.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc9155a46c023af54cd0c1154a2d7fbc0a9f91d7aa1a43fef44d8726439d7c77
+size 388126
prompts/speaker_7.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db46d1910a8fe98d6d8e9b8cb680d732a1800cd8241eb7aa7690d30a2d46931d
+size 462836
prompts/speaker_8.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2895ab68d8d8093a3b51aac1202b0eb68e2d97082f399a09332ccd8df31377e0
+size 418736
prompts/speaker_9.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52555d2474f8d67237e11ef4c329c89388743a826767fb8a9431c196c2a5a021
+size 551036