Serhiy Stetskovych committed
Commit 3d2700d
1 Parent(s): 9cb2738

New multispeaker model

.gitattributes CHANGED

```diff
@@ -40,3 +40,27 @@ checkpoint_epoch=599.ckpt filter=lfs diff=lfs merge=lfs -text
 checkpoint_epoch=649.ckpt filter=lfs diff=lfs merge=lfs -text
 g_00140000_m filter=lfs diff=lfs merge=lfs -text
 checkpoints/g_00120000 filter=lfs diff=lfs merge=lfs -text
+checkpoints/checkpoint_epoch=100.ckpt filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_16.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_23.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_7.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_22.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_3.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_5.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_9.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_13.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_14.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_15.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_17.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_1.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_12.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_19.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_21.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_20.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_24.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_6.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_8.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_10.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_11.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_18.wav filter=lfs diff=lfs merge=lfs -text
+prompts/speaker_2.wav filter=lfs diff=lfs merge=lfs -text
```
app.py CHANGED

```diff
@@ -1,49 +1,34 @@
 import os
-from pathlib import Path
 import torchaudio
-import gradio as gr
-
-import numpy as np
-
 import torch
-import json
-
-
-from hifigan.config import v1
-from hifigan.denoiser import Denoiser
-from hifigan.env import AttrDict
-from hifigan.models import Generator as HiFiGAN
+import gradio as gr
 
 
 from pflow.models.pflow_tts import pflowTTS
-from pflow.text import text_to_sequence, sequence_to_text
+from pflow.text import text_to_sequence, sequence_to_text, cleaned_text_to_sequence
 from pflow.utils.utils import intersperse
 from pflow.data.text_mel_datamodule import mel_spectrogram
 from pflow.utils.model import normalize
 from vocos import Vocos
 
 
-
-
-
-PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=649.ckpt'
-#PFLOW_MODEL_PATH = 'checkpoint_m_epoch=054.ckpt'
+PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=100.ckpt'
 VOCODER22_MODEL_PATH = 'BSC-LT/vocos-mel-22khz'
 VOCODER44_MODEL_PATH = 'patriotyk/vocos-mel-hifigan-compat-44100khz'
 
-HIFIGAN_MODEL_PATH = 'checkpoints/g_00120000'
-
-
-volnorm = torchaudio.transforms.Vol(gain=-32, gain_type="db")
-
 
+volnorm = torchaudio.transforms.Vol(gain=-15, gain_type="db")
 prompts_dir = 'prompts'
-print(os.listdir(prompts_dir))
 prompts_list = sorted(os.listdir(prompts_dir), key=lambda x: x.split('.')[0])
 
-def process_text(text: str, device: torch.device):
+def process_text(text: str, device: torch.device, ipa=False):
+    if ipa:
+        seq = cleaned_text_to_sequence(text)
+    else:
+        seq = text_to_sequence(text, ["ukr_cleaners"])
+
     x = torch.tensor(
-        intersperse(text_to_sequence(text, ["ukr_cleaners"]), 0),
+        intersperse(seq, 0),
         dtype=torch.long,
         device=device,
     )[None]
@@ -53,18 +38,6 @@ def process_text(text: str, device: torch.device):
 
 
 
-
-def load_hifigan(checkpoint_path, device):
-    h = AttrDict(v1)
-    hifigan = HiFiGAN(h).to(device)
-    hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)["generator"])
-    _ = hifigan.eval()
-    hifigan.remove_weight_norm()
-    return hifigan
-
-
-
-
 def load_vocos(checkpoint_path, config_path, device):
     model = Vocos.from_hparams(config_path).to(device)
 
@@ -75,9 +48,6 @@ def load_vocos(checkpoint_path, config_path, device):
     return model
 
 
-def to_waveform(mel, vocoder, denoiser=None):
-    return vocoder.decode(mel).clamp(-1, 1).cpu().squeeze()
-
 
 def get_device():
     if torch.cuda.is_available():
@@ -93,51 +63,54 @@ device = get_device()
 model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
 _ = model.eval()
 
-
-hifigan = load_hifigan(HIFIGAN_MODEL_PATH, device)
-vocos_22050 = Vocos.from_pretrained(VOCODER22_MODEL_PATH).to(device)
-#vocos_44100 = load_vocos('checkpoints/vocos_checkpoint_epoch=209_step=3924480_val_loss=3.7036_44100_11.ckpt', 'vocos.yaml', device)
 vocos_44100 = Vocos.from_pretrained(VOCODER44_MODEL_PATH).to(device)
-denoiser = None #Denoiser(vocoder, mode="zeros")
+vocos_22050 = Vocos.from_pretrained(VOCODER22_MODEL_PATH).to(device)
 
 
 @torch.inference_mode()
-def synthesise(text, prompt_selection, speed):
+def synthesise(text, ipa, prompt_selection, audio_prompt, temperature, speed):
+    print(text, prompt_selection, temperature, speed)
     if len(text) > 1000:
         raise gr.Error("Текст повинен бути коротшим за 1000 символів.")
 
-    text_processed = process_text(text.strip(), device)
-    prompt_audio_path = os.path.join(prompts_dir, prompt_selection)
-    wav, sr = torchaudio.load(prompt_audio_path)
-    prompt = mel_spectrogram(volnorm(wav), 1024, 80, 22050, 256, 1024, 0, 8000, center=False)[:,:,:264]
+    if audio_prompt:
+        wav, sr = torchaudio.load(audio_prompt)
+        wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=44100)
+    else:
+        prompt_audio_path = os.path.join(prompts_dir, prompt_selection)
+        wav, _ = torchaudio.load(prompt_audio_path)
 
+    if ipa:
+        text_processed = process_text(ipa, device, ipa=True)
+    else:
+        text_processed = process_text(text.strip(), device, ipa=False)
+
+    prompt = mel_spectrogram(volnorm(wav), 2048, 80, 44100, 512, 2048, 0, 8000, center=False)[:,:,:264]
+
     output = model.synthesise(
         text_processed["x"].to(device),
         text_processed["x_lengths"].to(device),
         n_timesteps=40,
-        temperature=0.0,
+        temperature=temperature,
         length_scale=1/speed,
         prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
-        guidance_scale=2.0
+        guidance_scale=1.8
     )
-    waveform_vocos = vocos_22050.decode(output["mel"]).cpu().squeeze()
+
     waveform_vocos_44100 = vocos_44100.decode(output["mel"]).cpu().squeeze()
-    waveform_hifigan = hifigan(output["mel"]).clamp(-1, 1).cpu().squeeze()
-    transform = torchaudio.transforms.Vol(gain=-18, gain_type="db")
-
-
-    return text_processed['x_phones'][1::2], (44100, waveform_vocos_44100.numpy()), (22050, waveform_vocos.numpy()), (22050, transform(waveform_hifigan).numpy())
+    waveform_vocos = vocos_22050.decode(output["mel"]).cpu().squeeze()
+    return text_processed['x_phones'][1::2], (44100, waveform_vocos_44100.numpy()), (22050, waveform_vocos.numpy())
 
 
 description = f'''
-# Експериментальна апка для генерації аудіо з тексту.
-
-pflow checkpoint {PFLOW_MODEL_PATH}
-Vocos 44100 аудіо - {VOCODER44_MODEL_PATH}
-Vocos 22050 аудіо - {VOCODER22_MODEL_PATH}
-HIFIGAN 22050 аудіо - {HIFIGAN_MODEL_PATH}
+Модель натренована на приватному датасеті з аудіокнижок, створеному за допомогою програми
+[narizaka](https://github.com/patriotyk/narizaka).
+Програма може некоректно визначати деякі наголоси і не дуже добре перетворює цифри, акроніми і різні скорочення в словесну форму.
+На даний момент відкритого рішення цих проблем для української мови нема, тому якщо у вас є запитання,
+чи ви хочете допомогти їх вирішити, приєднуйтесь до нашого чату в [телеграм](https://t.me/speech_synthesis_uk) або [discord](https://discord.gg/yVAjkBgmt4)
 '''
 
 
@@ -147,7 +120,10 @@ if __name__ == "__main__":
         description=description,
         inputs=[
             gr.Text(label='Текст для синтезу:', lines=5, max_lines=10),
-            gr.Dropdown(label="Prompt audio", choices=prompts_list, value=prompts_list[0]),
+            gr.Text(label='Або IPA:', lines=5, max_lines=10),
+            gr.Dropdown(label="Виберіть промт", choices=prompts_list, value=prompts_list[0]),
+            gr.Audio(label="Або завантажте свій:", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'}),
+            gr.Slider(minimum=0.0, maximum=1.0, label="Шум", value=0.7),
             gr.Slider(minimum=0.6, maximum=2.0, label="Швидкість", value=1.1)
         ],
         outputs=[
@@ -164,21 +140,17 @@ if __name__ == "__main__":
                 streaming=False,
                 type="numpy",
             ),
-            gr.Audio(
-                label="HIFIGAN 22050 аудіо:",
-                autoplay=False,
-                streaming=False,
-                type="numpy",
-            )
 
         ],
         allow_flagging='manual',
-        #flagging_options=[("Якщо дуже погоне аудіо, тисни цю кнопку.", "negative")],
         cache_examples=True,
-        title='',
-        # description=description,
-        # article=article,
-        # examples=examples,
+        title='Генерація української мови за допомогою pflowtts.',
+        examples=[
+            ['Мені тринадцятий минало. Я пас ягнята за селом. Чи то так сонечко сіяло, Чи так мені чого було? Мені так любо, любо стало, Неначе в бога. Уже прокликали до паю, А я собі у бур\'яні Молюся богу І не знаю, Чого маленькому мені Тоді так приязно молилось, Чого так весело було?', "meˈnʲi trɪˈnad͡zʲt͡sʲɐtɪi̯ mɪˈnaɫɔ. jɐ pɐs jɐɦˈnʲatɐ zɐ seˈɫɔm. t͡ʃɪ tɔ tɐk ˈsɔnet͡ʃkɔ sʲiˈjɐɫɔ, t͡ʃɪ tɐk meˈnʲi t͡ʃɔˈɦɔ bʊˈɫɔ? meˈnʲi tɐk ˈlʲubɔ, ˈlʲubɔ ˈstaɫɔ, neˈnat͡ʃe ʋ ˈbɔɦɐ. ʊˈʒɛ prɔkɫɪkɐɫɪ dɔ ˈpajʊ, ɐ jɐ soˈbʲi ʊ bur-jɐˈnʲi moˈlʲusʲɐ ˈbɔɦʊ i ne ˈznajʊ, t͡ʃɔˈɦɔ mɐˈɫɛnʲkɔmʊ meˈnʲi toˈdʲi tɐk ˈprɪjɐznɔ mɔˈɫɪɫɔsʲ, t͡ʃɔˈɦɔ tɐk ˈʋɛseɫɔ bʊˈɫɔ?", '', 'prompts/speaker_22.wav', 0.6, 1.1],
+            ['Ти, малий, скажи малому, хай малий малому скаже, хай малий теля прив\'яже.', '', '', 'prompts/speaker_11.wav', 0.4, 1.1],
+            ['По мірі розвитку клубу зростатиме і кількість його членів, а отже, команда менеджменту теж буде пропорційно збільшуватись. Яка ж команда потрібна клубу, що налічує, скажімо, сто осіб, і які компетенції повинна мати?', '', '', 'prompts/speaker_20.wav', 0.7, 1.1],
+            ['Да ти дєтка гоніш! один рік? І що? Як ви задрали нити, рік вона не може, в когось діти мруть в день народження, викидні, а вона, бляха, рік не може, купи собі рожеве поні і реви побільше, дурепа.', 'dɐ tɪ dʲetkɐ ɦɔˈnʲiʃ! ɔˈdɪn rʲik? i ʃt͡ʃɔ? jɐk ʋɪ zɐˈdraɫɪ ˈnɪtɪ, rʲik wɔˈna ne ˈmɔʒe, ʋ kɔɦɔsʲ ˈdʲitɪ mrʊtʲ ʋ denʲ nɐˈrɔd͡ʒenʲːɐ, ˈʋɪkɪdʲnʲi, ɐ wɔˈna, ˈblʲaxɐ, rʲik ne ˈmɔʒe, kʊpɪ soˈbʲi rɔˈʒɛʋe ˈpɔnʲi i reʋɪ poˈbʲilʲʃe, dʊˈrɛpɐ.', '', 'prompts/speaker_5.wav', 0.7, 1.2]
+        ],
     )
     i.queue(max_size=20, default_concurrency_limit=4)
     i.launch(share=False, server_name="0.0.0.0")
```
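
For reference, a minimal sketch of driving the reworked pipeline outside Gradio. It assumes the checkpoint and prompt files added in this commit, condenses `process_text` from app.py (the non-IPA branch), and mirrors the app's default parameter values:

```python
import torch
import torchaudio

from pflow.models.pflow_tts import pflowTTS
from pflow.text import text_to_sequence
from pflow.utils.utils import intersperse
from pflow.data.text_mel_datamodule import mel_spectrogram
from pflow.utils.model import normalize
from vocos import Vocos

device = torch.device("cpu")
model = pflowTTS.load_from_checkpoint('checkpoints/checkpoint_epoch=100.ckpt', map_location=device).eval()
vocoder = Vocos.from_pretrained('patriotyk/vocos-mel-hifigan-compat-44100khz').to(device)

def process_text(text, device):
    # Condensed from app.py (non-IPA branch).
    x = torch.tensor(intersperse(text_to_sequence(text, ["ukr_cleaners"]), 0),
                     dtype=torch.long, device=device)[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
    return {"x": x, "x_lengths": x_lengths}

# Speaker prompt: -15 dB gain, 80-bin mel at 44.1 kHz (n_fft=2048, hop=512),
# truncated to the 264-frame (~3 s) prompt window -- exactly as in synthesise().
wav, sr = torchaudio.load('prompts/speaker_22.wav')
wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=44100)
wav = torchaudio.transforms.Vol(gain=-15, gain_type="db")(wav)
prompt = mel_spectrogram(wav, 2048, 80, 44100, 512, 2048, 0, 8000, center=False)[:, :, :264]

text_processed = process_text("Привіт, світе!", device)
with torch.inference_mode():
    output = model.synthesise(
        text_processed["x"],
        text_processed["x_lengths"],
        n_timesteps=40,         # flow-matching steps
        temperature=0.7,        # the app's "Шум" slider default
        length_scale=1 / 1.1,   # the app's speed default
        prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
        guidance_scale=1.8,
    )

audio_44k = vocoder.decode(output["mel"]).cpu().squeeze()
torchaudio.save('out.wav', audio_44k.unsqueeze(0), 44100)
```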
pflow/data/text_mel_datamodule.py CHANGED

```diff
@@ -39,6 +39,7 @@ class TextMelDataModule(LightningDataModule):
         f_max,
         data_statistics,
         seed,
+        min_sample_size,
     ):
         super().__init__()
 
@@ -68,6 +69,7 @@ class TextMelDataModule(LightningDataModule):
             self.hparams.f_max,
             self.hparams.data_statistics,
             self.hparams.seed,
+            self.hparams.min_sample_size,
         )
         self.validset = TextMelDataset(  # pylint: disable=attribute-defined-outside-init
             self.hparams.valid_filelist_path,
@@ -83,6 +85,7 @@ class TextMelDataModule(LightningDataModule):
             self.hparams.f_max,
             self.hparams.data_statistics,
             self.hparams.seed,
+            self.hparams.min_sample_size,
         )
 
     def train_dataloader(self):
@@ -134,6 +137,7 @@ class TextMelDataset(torch.utils.data.Dataset):
         f_max=8000,
         data_parameters=None,
         seed=None,
+        min_sample_size=4,
     ):
         self.filepaths_and_text = parse_filelist(filelist_path)
         self.n_spks = n_spks
@@ -146,6 +150,7 @@ class TextMelDataset(torch.utils.data.Dataset):
         self.win_length = win_length
         self.f_min = f_min
         self.f_max = f_max
+        self.min_sample_size = min_sample_size
         if data_parameters is not None:
             self.data_parameters = data_parameters
         else:
@@ -196,9 +201,9 @@ class TextMelDataset(torch.utils.data.Dataset):
 
     def __getitem__(self, index):
         datapoint = self.get_datapoint(self.filepaths_and_text[index])
-        if datapoint["wav"].shape[1] <= 66150:
+        if datapoint["wav"].shape[1] <= self.min_sample_size * self.sample_rate:
             '''
-            skip datapoint if too short (3s)
+            skip datapoint if too short (<4s, prompt is 3s)
             TODO To not waste data, we can concatenate wavs less than 3s and use them
             TODO as a hyperparameter; multispeaker dataset can use another wav of same speaker
             '''
```
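
The hard-coded 66150-sample cutoff (3 s at 22050 Hz) becomes a `min_sample_size` argument in seconds, threaded through the datamodule into both datasets. A small sketch of the new filter's arithmetic (the sample rate below is illustrative; it comes from the dataset's hparams):

```python
sample_rate = 22050
min_sample_size = 4  # seconds; new default, so a full 3 s prompt always fits

threshold = min_sample_size * sample_rate  # 88200 samples vs. the old 66150

def keep(num_samples: int) -> bool:
    # Mirrors __getitem__: clips at or below the threshold are skipped,
    # per the docstring above.
    return num_samples > threshold

print(keep(66150))   # False -- a 3 s clip is now too short
print(keep(110250))  # True  -- a 5 s clip passes
```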
pflow/models/pflow_tts.py CHANGED

```diff
@@ -5,7 +5,7 @@ import random
 import torch
 import torch.nn.functional as F
 
-
+from pflow import utils
 from pflow.models.baselightningmodule import BaseLightningClass
 from pflow.models.components.flow_matching import CFM
 from pflow.models.components.speech_prompt_encoder import TextEncoder
@@ -19,7 +19,7 @@ from pflow.utils.model import (
 from pflow.models.components import commons
 from pflow.models.components.aligner import Aligner, ForwardSumLoss, BinLoss
 
-
+log = utils.get_pylogger(__name__)
 
 class pflowTTS(BaseLightningClass): #
     def __init__(
@@ -31,6 +31,7 @@ class pflowTTS(BaseLightningClass): #
         cfm,
         data_statistics,
         prompt_size=264,
+        dur_p_use_log=False,
         optimizer=None,
         scheduler=None,
         **kwargs,
@@ -42,6 +43,7 @@ class pflowTTS(BaseLightningClass): #
         self.n_vocab = n_vocab
         self.n_feats = n_feats
         self.prompt_size = prompt_size
+        self.dur_p_use_log = dur_p_use_log
         speech_in_channels = n_feats
 
         self.encoder = TextEncoder(
@@ -151,7 +153,7 @@ class pflowTTS(BaseLightningClass): #
         )
 
         logw_ = torch.log(1e-8 + attn.sum(2)) * x_mask
-        dur_loss = duration_loss(logw, logw_, x_lengths)
+        dur_loss = duration_loss(logw, logw_, x_lengths, use_log=self.dur_p_use_log)
 
         # aln_hard, aln_soft, aln_log, aln_mask = self.aligner(
         #     mu_x.transpose(1,2), x_mask, y, y_mask
```
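
The new `dur_p_use_log` flag (default `False`) picks the domain in which the duration predictor is penalised: log-durations as before, or linear frame counts via `exp` (see the `duration_loss` diff further below). A sketch of the target construction and both variants on toy tensors:

```python
import torch
from pflow.utils.model import duration_loss

# Toy alignment: 4 phonemes over 10 mel frames (batch of 1).
attn = torch.zeros(1, 4, 10)
attn[0, 0, :2] = 1   # phoneme 0 -> 2 frames
attn[0, 1, 2:5] = 1  # phoneme 1 -> 3 frames
attn[0, 2, 5:9] = 1  # phoneme 2 -> 4 frames
attn[0, 3, 9:] = 1   # phoneme 3 -> 1 frame
x_mask = torch.ones(1, 4)
x_lengths = torch.tensor([4])

logw_ = torch.log(1e-8 + attn.sum(2)) * x_mask          # target, as in the training step
logw = torch.log(torch.tensor([[2.0, 3.0, 4.0, 2.0]]))  # prediction; last phoneme off by 1 frame

print(duration_loss(logw, logw_, x_lengths, use_log=True))   # MSE in log domain
print(duration_loss(logw, logw_, x_lengths, use_log=False))  # MSE in frames (new default)
```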
pflow/text/cleaners.py CHANGED

```diff
@@ -3,7 +3,7 @@ from ukrainian_word_stress import Stressifier
 import regex
 import re
 from ipa_uk import ipa
-stressify = Stressifier()
+stressify = Stressifier(stress_symbol="ˈ")
 
 
 _whitespace_re = re.compile(r"\s+")
@@ -15,5 +15,5 @@ def ukr_cleaners(text):
     text = collapse_whitespace(text)
     text = norm(text).lower()
 
-    text = regex.sub(r'[^\p{L}\p{N}\?\!\,\.\-\: ]', '', text)
+    text = regex.sub(r'[^\ˈ\p{L}\p{N}\?\!\,\.\-\: ]', '', text)
     return ipa(stressify(text), False)
```
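
The two changes work together: `Stressifier(stress_symbol="ˈ")` makes the IPA primary-stress mark part of the text itself, and the widened character whitelist keeps it. A minimal sketch of the interaction, assuming dictionary-based stress placement by `ukrainian_word_stress` (exact mark positions are not verified here):

```python
import regex
from ukrainian_word_stress import Stressifier

stressify = Stressifier(stress_symbol="ˈ")

stressed = stressify("мені тринадцятий минало")
print(stressed)  # ˈ marks inserted where the stress dictionary resolves them

# The old filter r'[^\p{L}\p{N}\?\!\,\.\-\: ]' would strip the ˈ mark;
# the new one whitelists it, so stress information survives into ipa().
kept = regex.sub(r'[^\ˈ\p{L}\p{N}\?\!\,\.\-\: ]', '', stressed)
print("ˈ" in kept)
```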
pflow/text/symbols.py CHANGED

```diff
@@ -2,14 +2,40 @@
 
 Defines the set of symbols used in text input to the model.
 """
+# _pad = "_"
+# _punctuation = '-´;:,.!?¡¿—…"«»“” '
+# _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+# _letters_ipa = (
+#     "éýíó'̯'͡ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+# )
+
+
+# # Export all symbols:
+# symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+
+# # Special symbol ids
+# SPACE_ID = symbols.index(" ")
+
+# _pad = "_"
+# _punctuation = '()-;:,.!?¡¿—…"«»“” '
+# _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzАБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгґдеєжзиіїйклмнопрстуфхцчшщьюя"
+# _letters_ipa = (
+#     "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ̯͡"
+# )
+
+# # Export all symbols:
+# symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+
+# # Special symbol ids
+# SPACE_ID = symbols.index(" ")
+
 _pad = "_"
 _punctuation = '-´;:,.!?¡¿—…"«»“” '
 _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
 _letters_ipa = (
-    "éýíó'̯'͡ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+    "éýíó'̯'͡ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲ'̩'ᵻ"
 )
-
 # Export all symbols:
 symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
 
```
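
Why trimming the active `_letters_ipa` set matters: each character of the cleaned IPA string is mapped to an integer id by its position in `symbols`, which is what `cleaned_text_to_sequence` (now imported in app.py) relies on. A sketch of the usual lookup; the exact helper lives in `pflow.text` and this mirrors the common Matcha/VITS pattern rather than quoting it:

```python
from pflow.text.symbols import symbols

_symbol_to_id = {s: i for i, s in enumerate(symbols)}

def to_ids(cleaned_text: str) -> list[int]:
    # Raises KeyError for characters outside the active set, e.g. the
    # removed ʷ ˠ ˤ ↑ ↓ modifiers -- the cleaners must never emit them.
    return [_symbol_to_id[ch] for ch in cleaned_text]

print(to_ids("ˈa"))  # ids of the stress mark and a Latin letter
```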
pflow/utils/model.py CHANGED

```diff
@@ -41,8 +41,11 @@ def generate_path(duration, mask):
     return path
 
 
-def duration_loss(logw, logw_, lengths):
-    loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths)
+def duration_loss(logw, logw_, lengths, use_log=False):
+    if use_log:
+        loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths)
+    else:
+        loss = torch.sum((torch.exp(logw) - torch.exp(logw_)) ** 2) / torch.sum(lengths)
     return loss
 
 
```
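
The design difference between the branches: in the log domain a fixed relative error costs the same for short and long phonemes, while the new default compares `exp` values, i.e. frame counts, so absolute errors on long phonemes dominate. A quick illustration with hypothetical numbers:

```python
import torch
from pflow.utils.model import duration_loss

lengths = torch.tensor([1])

for frames in (2.0, 20.0):
    target = torch.log(torch.tensor([[frames]]))
    pred = target + 0.1  # same log-domain overshoot at both scales
    log_l = duration_loss(pred, target, lengths, use_log=True)
    lin_l = duration_loss(pred, target, lengths, use_log=False)
    print(f"{frames:>4} frames: log-domain {log_l.item():.4f}, frame-domain {lin_l.item():.4f}")

# The log-domain loss is 0.01 in both cases; the frame-domain loss grows
# roughly 100x for the longer phoneme, so long-duration mistakes are
# penalised much harder under the new default.
```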
pflow/utils/utils.py CHANGED

```diff
@@ -206,7 +206,6 @@ def get_user_data_dir(appname="pflow_tts"):
 
 
 def assert_model_downloaded(checkpoint_path, url, use_wget=False):
-    print(checkpoint_path)
     if Path(checkpoint_path).exists():
         log.debug(f"[+] Model already present at {checkpoint_path}!")
         return
```
prompts/speaker_1.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0dd649f417994c06fa40481eb41f7356eeb401881f567668c4526dd58567e10
+size 344026
```

prompts/speaker_10.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:686e88155eef9caac215f4a538be47656d4b992a3ae59f33e026b6f547bef1e8
+size 379046
```

prompts/speaker_11.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d86982b3e7dba7704088f8459e90c8aacfb9b27a011372c05746c3b906aa88fb
+size 396946
```

prompts/speaker_12.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1521a2400b70de8ab89dd7083ea7b2d69547dfa0fd44ba98d42c1c16ba959438
+size 458686
```

prompts/speaker_13.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2ce78bb8e505ad6687139c3fb6ff8ca0d649a139d65e58c0cbfff1b9095efc7
+size 458426
```

prompts/speaker_14.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:397b9e44669ead0f6baac314ad2f0b32218db9054b20a9c371474b5d49525ef9
+size 480736
```

prompts/speaker_15.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d614a13acf2f999a715194c3efb4f74a30f33b98823e6bf693bf26bf79d6f653
+size 573086
```

prompts/speaker_16.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c90db783c7be686c70d1b2d15400f414bc6ede6f59952b9a1b5c6bb1a96a16c7
+size 511346
```

prompts/speaker_17.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2ce2df72aa34de17f0e833c903a561e650b35d3b03b142c0bd6d0f1d7d4d4e2
+size 635086
```

prompts/speaker_18.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f0212146531c97752dbdf5615950298518748e655c9cf0d8f07f57e571eaf9a
+size 445198
```

prompts/speaker_19.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6e3e9226de668b70ab0c61e79b9ab42f3a5fb933fe874565b0a6daea1f0e570
+size 427556
```

prompts/speaker_2.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22b38f05add585086e84aa5b1ad5cc60902e97d911c7f5c25efa9907e3a3bd32
+size 674514
```

prompts/speaker_20.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b27c473712a82d638acfba3a1ea54bde3f0efefe111452101f3798c3aaa46f05
+size 485146
```

prompts/speaker_21.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f24e374c7ffcc4a2ceee696d6ee21996300997bf56a437212455780d2feab34
+size 379046
```

prompts/speaker_22.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ad69e9dd8c7c4886319730811920c0e4f56f486203b4ef76acab9f145cd9168
+size 1300738
```

prompts/speaker_23.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e97404bfd2b086ccb387ec5dff7ab5285dcb68ff789be361b03b0f368425ccf1
+size 1243406
```

prompts/speaker_24.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8dbbf87ac7a9c229ef8ab0cdc446c1e2408eeb7c0dab56784c221313fe2ffb7
+size 626266
```

prompts/speaker_3.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aef97ae48b0aac98ac3b3eb01494546d3a39fa1d864361efb01878f21b933fc7
+size 582166
```

prompts/speaker_5.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2430689318110e4730db7d9f83ad404535a1ac6c632b59483c0693d4804b6f9
+size 418738
```

prompts/speaker_6.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc9155a46c023af54cd0c1154a2d7fbc0a9f91d7aa1a43fef44d8726439d7c77
+size 388126
```

prompts/speaker_7.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db46d1910a8fe98d6d8e9b8cb680d732a1800cd8241eb7aa7690d30a2d46931d
+size 462836
```

prompts/speaker_8.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2895ab68d8d8093a3b51aac1202b0eb68e2d97082f399a09332ccd8df31377e0
+size 418736
```

prompts/speaker_9.wav ADDED

```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52555d2474f8d67237e11ef4c329c89388743a826767fb8a9431c196c2a5a021
+size 551036
```