Spaces:
Running
on
Zero
Running
on
Zero
update to current version
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- InferenceInterfaces/ControllableInterface.py +52 -7
- InferenceInterfaces/ToucanTTSInterface.py +7 -7
- InferenceInterfaces/UtteranceCloner.py +4 -4
- Models/ToucanTTS_Meta/best.pt +0 -3
- Models/Vocoder/best.pt +0 -3
- {Architectures → Modules}/Aligner/Aligner.py +0 -0
- {Architectures → Modules}/Aligner/CodecAlignerDataset.py +0 -0
- {Architectures → Modules}/Aligner/README.md +0 -0
- {Architectures → Modules}/Aligner/Reconstructor.py +0 -0
- {Architectures → Modules}/Aligner/__init__.py +0 -0
- {Architectures → Modules}/Aligner/autoaligner_train_loop.py +2 -2
- {Architectures → Modules}/ControllabilityGAN/GAN.py +1 -1
- {Architectures → Modules}/ControllabilityGAN/__init__.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/dataset/__init__.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/dataset/speaker_embeddings_dataset.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/wgan/__init__.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/wgan/init_weights.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/wgan/init_wgan.py +2 -2
- {Architectures → Modules}/ControllabilityGAN/wgan/resnet_1.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/wgan/resnet_init.py +4 -4
- {Architectures → Modules}/ControllabilityGAN/wgan/wgan_qc.py +0 -0
- {Architectures → Modules}/EmbeddingModel/GST.py +1 -1
- {Architectures → Modules}/EmbeddingModel/README.md +0 -0
- {Architectures → Modules}/EmbeddingModel/StyleEmbedding.py +2 -2
- {Architectures → Modules}/EmbeddingModel/StyleTTSEncoder.py +0 -0
- {Architectures → Modules}/EmbeddingModel/__init__.py +0 -0
- {Architectures → Modules}/GeneralLayers/Attention.py +0 -0
- {Architectures → Modules}/GeneralLayers/ConditionalLayerNorm.py +0 -1
- {Architectures → Modules}/GeneralLayers/Conformer.py +27 -17
- {Architectures → Modules}/GeneralLayers/Convolution.py +0 -0
- {Architectures → Modules}/GeneralLayers/DurationPredictor.py +3 -3
- {Architectures → Modules}/GeneralLayers/EncoderLayer.py +1 -1
- {Architectures → Modules}/GeneralLayers/LayerNorm.py +0 -0
- {Architectures → Modules}/GeneralLayers/LengthRegulator.py +0 -0
- {Architectures → Modules}/GeneralLayers/MultiLayeredConv1d.py +0 -0
- {Architectures → Modules}/GeneralLayers/MultiSequential.py +0 -0
- {Architectures → Modules}/GeneralLayers/PositionalEncoding.py +0 -0
- {Architectures → Modules}/GeneralLayers/PositionwiseFeedForward.py +0 -0
- {Architectures → Modules}/GeneralLayers/README.md +0 -0
- {Architectures → Modules}/GeneralLayers/ResidualBlock.py +0 -0
- {Architectures → Modules}/GeneralLayers/ResidualStack.py +0 -0
- {Architectures → Modules}/GeneralLayers/STFT.py +0 -0
- {Architectures → Modules}/GeneralLayers/Swish.py +0 -0
- {Architectures → Modules}/GeneralLayers/VariancePredictor.py +3 -3
- {Architectures → Modules}/GeneralLayers/__init__.py +0 -0
- {Architectures → Modules}/README.md +0 -0
- {Architectures → Modules}/ToucanTTS/CodecDiscriminator.py +0 -0
- {Architectures → Modules}/ToucanTTS/CodecRefinementTransformer.py +2 -2
- {Architectures → Modules}/ToucanTTS/DurationCalculator.py +0 -0
- {Architectures → Modules}/ToucanTTS/EnergyCalculator.py +1 -1
InferenceInterfaces/ControllableInterface.py
CHANGED
@@ -2,8 +2,8 @@ import os
|
|
2 |
|
3 |
import torch
|
4 |
|
5 |
-
from Architectures.ControllabilityGAN.GAN import GanWrapper
|
6 |
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
|
|
|
7 |
from Utility.storage_config import MODELS_DIR
|
8 |
|
9 |
|
@@ -16,14 +16,18 @@ class ControllableInterface:
|
|
16 |
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
17 |
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
|
18 |
self.device = "cuda" if gpu_id != "cpu" else "cpu"
|
19 |
-
self.model = ToucanTTSInterface(device=self.device, tts_model_path="Meta"
|
20 |
self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device=self.device)
|
21 |
self.generated_speaker_embeds = list()
|
22 |
self.available_artificial_voices = available_artificial_voices
|
|
|
|
|
23 |
|
24 |
def read(self,
|
25 |
prompt,
|
26 |
-
|
|
|
|
|
27 |
voice_seed,
|
28 |
prosody_creativity,
|
29 |
duration_scaling_factor,
|
@@ -38,7 +42,15 @@ class ControllableInterface:
|
|
38 |
emb_slider_6,
|
39 |
loudness_in_db
|
40 |
):
|
41 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
self.wgan.set_latent(voice_seed)
|
43 |
controllability_vector = torch.tensor([emb_slider_1,
|
44 |
emb_slider_2,
|
@@ -49,13 +61,46 @@ class ControllableInterface:
|
|
49 |
embedding = self.wgan.modify_embed(controllability_vector)
|
50 |
self.model.set_utterance_embedding(embedding=embedding)
|
51 |
else:
|
52 |
-
self.model.set_utterance_embedding(
|
53 |
|
54 |
phones = self.model.text2phone.get_phone_string(prompt)
|
55 |
if len(phones) > 1800:
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
-
print(prompt)
|
59 |
wav, sr, fig = self.model(prompt,
|
60 |
input_is_phones=False,
|
61 |
duration_scaling_factor=duration_scaling_factor,
|
|
|
2 |
|
3 |
import torch
|
4 |
|
|
|
5 |
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
|
6 |
+
from Modules.ControllabilityGAN.GAN import GanWrapper
|
7 |
from Utility.storage_config import MODELS_DIR
|
8 |
|
9 |
|
|
|
16 |
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
17 |
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
|
18 |
self.device = "cuda" if gpu_id != "cpu" else "cpu"
|
19 |
+
self.model = ToucanTTSInterface(device=self.device, tts_model_path="Meta")
|
20 |
self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device=self.device)
|
21 |
self.generated_speaker_embeds = list()
|
22 |
self.available_artificial_voices = available_artificial_voices
|
23 |
+
self.current_language = ""
|
24 |
+
self.current_accent = ""
|
25 |
|
26 |
def read(self,
|
27 |
prompt,
|
28 |
+
reference_audio,
|
29 |
+
language,
|
30 |
+
accent,
|
31 |
voice_seed,
|
32 |
prosody_creativity,
|
33 |
duration_scaling_factor,
|
|
|
42 |
emb_slider_6,
|
43 |
loudness_in_db
|
44 |
):
|
45 |
+
if self.current_language != language:
|
46 |
+
self.model.set_phonemizer_language(language)
|
47 |
+
print(f"switched phonemizer language to {language}")
|
48 |
+
self.current_language = language
|
49 |
+
if self.current_accent != accent:
|
50 |
+
self.model.set_accent_language(accent)
|
51 |
+
print(f"switched accent language to {accent}")
|
52 |
+
self.current_accent = accent
|
53 |
+
if reference_audio is None:
|
54 |
self.wgan.set_latent(voice_seed)
|
55 |
controllability_vector = torch.tensor([emb_slider_1,
|
56 |
emb_slider_2,
|
|
|
61 |
embedding = self.wgan.modify_embed(controllability_vector)
|
62 |
self.model.set_utterance_embedding(embedding=embedding)
|
63 |
else:
|
64 |
+
self.model.set_utterance_embedding(reference_audio)
|
65 |
|
66 |
phones = self.model.text2phone.get_phone_string(prompt)
|
67 |
if len(phones) > 1800:
|
68 |
+
if language == "deu":
|
69 |
+
prompt = "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf."
|
70 |
+
elif language == "ell":
|
71 |
+
prompt = "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη."
|
72 |
+
elif language == "spa":
|
73 |
+
prompt = "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes."
|
74 |
+
elif language == "fin":
|
75 |
+
prompt = "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan."
|
76 |
+
elif language == "rus":
|
77 |
+
prompt = "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей."
|
78 |
+
elif language == "hun":
|
79 |
+
prompt = "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre."
|
80 |
+
elif language == "nld":
|
81 |
+
prompt = "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen."
|
82 |
+
elif language == "fra":
|
83 |
+
prompt = "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties."
|
84 |
+
elif language == 'pol':
|
85 |
+
prompt = "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części."
|
86 |
+
elif language == 'por':
|
87 |
+
prompt = "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes."
|
88 |
+
elif language == 'ita':
|
89 |
+
prompt = "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti."
|
90 |
+
elif language == 'cmn':
|
91 |
+
prompt = "你的输入太长了。请尝试使用较短的文本或将其拆分为多个部分。"
|
92 |
+
elif language == 'vie':
|
93 |
+
prompt = "Đầu vào của bạn quá dài. Vui lòng thử một văn bản ngắn hơn hoặc chia nó thành nhiều phần."
|
94 |
+
else:
|
95 |
+
prompt = "Your input was too long. Please try either a shorter text or split it into several parts."
|
96 |
+
if self.current_language != "eng":
|
97 |
+
self.model.set_phonemizer_language("eng")
|
98 |
+
self.current_language = "eng"
|
99 |
+
if self.current_accent != "eng":
|
100 |
+
self.model.set_accent_language("eng")
|
101 |
+
self.current_accent = "eng"
|
102 |
|
103 |
+
print(prompt + "\n\n")
|
104 |
wav, sr, fig = self.model(prompt,
|
105 |
input_is_phones=False,
|
106 |
duration_scaling_factor=duration_scaling_factor,
|
InferenceInterfaces/ToucanTTSInterface.py
CHANGED
@@ -10,8 +10,8 @@ import torch
|
|
10 |
from speechbrain.pretrained import EncoderClassifier
|
11 |
from torchaudio.transforms import Resample
|
12 |
|
13 |
-
from
|
14 |
-
from
|
15 |
from Preprocessing.AudioPreprocessor import AudioPreprocessor
|
16 |
from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
|
17 |
from Preprocessing.TextFrontend import get_language_id
|
@@ -109,7 +109,7 @@ class ToucanTTSInterface(torch.nn.Module):
|
|
109 |
self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True, device=self.device)
|
110 |
|
111 |
def set_accent_language(self, lang_id):
|
112 |
-
if lang_id in
|
113 |
if lang_id == 'vi-so' or lang_id == 'vi-ctr':
|
114 |
lang_id = 'vie'
|
115 |
elif lang_id == 'spa-lat':
|
@@ -121,7 +121,7 @@ class ToucanTTSInterface(torch.nn.Module):
|
|
121 |
elif lang_id == 'en-sc' or lang_id == 'en-us':
|
122 |
lang_id = 'eng'
|
123 |
else:
|
124 |
-
# no clue where these others are even coming from, they are not in ISO 639-
|
125 |
lang_id = 'eng'
|
126 |
|
127 |
self.lang_id = get_language_id(lang_id).to(self.device)
|
@@ -139,7 +139,7 @@ class ToucanTTSInterface(torch.nn.Module):
|
|
139 |
input_is_phones=False,
|
140 |
return_plot_as_filepath=False,
|
141 |
loudness_in_db=-24.0,
|
142 |
-
prosody_creativity=0.
|
143 |
"""
|
144 |
duration_scaling_factor: reasonable values are 0.8 < scale < 1.2.
|
145 |
1.0 means no scaling happens, higher values increase durations for the whole
|
@@ -241,7 +241,7 @@ class ToucanTTSInterface(torch.nn.Module):
|
|
241 |
dur_list=None,
|
242 |
pitch_list=None,
|
243 |
energy_list=None,
|
244 |
-
prosody_creativity=0.
|
245 |
"""
|
246 |
Args:
|
247 |
silent: Whether to be verbose about the process
|
@@ -299,7 +299,7 @@ class ToucanTTSInterface(torch.nn.Module):
|
|
299 |
pitch_variance_scale=1.0,
|
300 |
energy_variance_scale=1.0,
|
301 |
blocking=False,
|
302 |
-
prosody_creativity=0.
|
303 |
if text.strip() == "":
|
304 |
return
|
305 |
wav, sr = self(text,
|
|
|
10 |
from speechbrain.pretrained import EncoderClassifier
|
11 |
from torchaudio.transforms import Resample
|
12 |
|
13 |
+
from Modules.ToucanTTS.InferenceToucanTTS import ToucanTTS
|
14 |
+
from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
|
15 |
from Preprocessing.AudioPreprocessor import AudioPreprocessor
|
16 |
from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
|
17 |
from Preprocessing.TextFrontend import get_language_id
|
|
|
109 |
self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True, device=self.device)
|
110 |
|
111 |
def set_accent_language(self, lang_id):
|
112 |
+
if lang_id in {'ajp', 'ajt', 'lak', 'lno', 'nul', 'pii', 'plj', 'slq', 'smd', 'snb', 'tpw', 'wya', 'zua', 'en-us', 'en-sc', 'fr-be', 'fr-sw', 'pt-br', 'spa-lat', 'vi-ctr', 'vi-so'}:
|
113 |
if lang_id == 'vi-so' or lang_id == 'vi-ctr':
|
114 |
lang_id = 'vie'
|
115 |
elif lang_id == 'spa-lat':
|
|
|
121 |
elif lang_id == 'en-sc' or lang_id == 'en-us':
|
122 |
lang_id = 'eng'
|
123 |
else:
|
124 |
+
# no clue where these others are even coming from, they are not in ISO 639-3
|
125 |
lang_id = 'eng'
|
126 |
|
127 |
self.lang_id = get_language_id(lang_id).to(self.device)
|
|
|
139 |
input_is_phones=False,
|
140 |
return_plot_as_filepath=False,
|
141 |
loudness_in_db=-24.0,
|
142 |
+
prosody_creativity=0.1):
|
143 |
"""
|
144 |
duration_scaling_factor: reasonable values are 0.8 < scale < 1.2.
|
145 |
1.0 means no scaling happens, higher values increase durations for the whole
|
|
|
241 |
dur_list=None,
|
242 |
pitch_list=None,
|
243 |
energy_list=None,
|
244 |
+
prosody_creativity=0.1):
|
245 |
"""
|
246 |
Args:
|
247 |
silent: Whether to be verbose about the process
|
|
|
299 |
pitch_variance_scale=1.0,
|
300 |
energy_variance_scale=1.0,
|
301 |
blocking=False,
|
302 |
+
prosody_creativity=0.1):
|
303 |
if text.strip() == "":
|
304 |
return
|
305 |
wav, sr = self(text,
|
InferenceInterfaces/UtteranceCloner.py
CHANGED
@@ -4,11 +4,11 @@ import numpy
|
|
4 |
import soundfile as sf
|
5 |
import torch
|
6 |
|
7 |
-
from Architectures.Aligner.Aligner import Aligner
|
8 |
-
from Architectures.ToucanTTS.DurationCalculator import DurationCalculator
|
9 |
-
from Architectures.ToucanTTS.EnergyCalculator import EnergyCalculator
|
10 |
-
from Architectures.ToucanTTS.PitchCalculator import Parselmouth
|
11 |
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
|
|
|
|
|
|
|
|
|
12 |
from Preprocessing.AudioPreprocessor import AudioPreprocessor
|
13 |
from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
|
14 |
from Preprocessing.articulatory_features import get_feature_to_index_lookup
|
|
|
4 |
import soundfile as sf
|
5 |
import torch
|
6 |
|
|
|
|
|
|
|
|
|
7 |
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
|
8 |
+
from Modules.Aligner.Aligner import Aligner
|
9 |
+
from Modules.ToucanTTS.DurationCalculator import DurationCalculator
|
10 |
+
from Modules.ToucanTTS.EnergyCalculator import EnergyCalculator
|
11 |
+
from Modules.ToucanTTS.PitchCalculator import Parselmouth
|
12 |
from Preprocessing.AudioPreprocessor import AudioPreprocessor
|
13 |
from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
|
14 |
from Preprocessing.articulatory_features import get_feature_to_index_lookup
|
Models/ToucanTTS_Meta/best.pt
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:3f1f562f9473f227b4425938c80dec1808d0cd3a54fd3629b327613dae3be694
|
3 |
-
size 112081651
|
|
|
|
|
|
|
|
Models/Vocoder/best.pt
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:68a4db7d7d96a554eab75c5d8b79267760d7d4c7af65504947ab807ab18d680b
|
3 |
-
size 56113099
|
|
|
|
|
|
|
|
{Architectures → Modules}/Aligner/Aligner.py
RENAMED
File without changes
|
{Architectures → Modules}/Aligner/CodecAlignerDataset.py
RENAMED
File without changes
|
{Architectures → Modules}/Aligner/README.md
RENAMED
File without changes
|
{Architectures → Modules}/Aligner/Reconstructor.py
RENAMED
File without changes
|
{Architectures → Modules}/Aligner/__init__.py
RENAMED
File without changes
|
{Architectures → Modules}/Aligner/autoaligner_train_loop.py
RENAMED
@@ -8,8 +8,8 @@ from torch.optim import RAdam
|
|
8 |
from torch.utils.data.dataloader import DataLoader
|
9 |
from tqdm import tqdm
|
10 |
|
11 |
-
from
|
12 |
-
from
|
13 |
from Preprocessing.AudioPreprocessor import AudioPreprocessor
|
14 |
from Preprocessing.EnCodecAudioPreprocessor import CodecAudioPreprocessor
|
15 |
|
|
|
8 |
from torch.utils.data.dataloader import DataLoader
|
9 |
from tqdm import tqdm
|
10 |
|
11 |
+
from Modules.Aligner.Aligner import Aligner
|
12 |
+
from Modules.Aligner.Reconstructor import Reconstructor
|
13 |
from Preprocessing.AudioPreprocessor import AudioPreprocessor
|
14 |
from Preprocessing.EnCodecAudioPreprocessor import CodecAudioPreprocessor
|
15 |
|
{Architectures → Modules}/ControllabilityGAN/GAN.py
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
import torch
|
2 |
|
3 |
-
from
|
4 |
|
5 |
|
6 |
class GanWrapper:
|
|
|
1 |
import torch
|
2 |
|
3 |
+
from Modules.ControllabilityGAN.wgan.init_wgan import create_wgan
|
4 |
|
5 |
|
6 |
class GanWrapper:
|
{Architectures → Modules}/ControllabilityGAN/__init__.py
RENAMED
File without changes
|
{Architectures → Modules}/ControllabilityGAN/dataset/__init__.py
RENAMED
File without changes
|
{Architectures → Modules}/ControllabilityGAN/dataset/speaker_embeddings_dataset.py
RENAMED
File without changes
|
{Architectures → Modules}/ControllabilityGAN/wgan/__init__.py
RENAMED
File without changes
|
{Architectures → Modules}/ControllabilityGAN/wgan/init_weights.py
RENAMED
File without changes
|
{Architectures → Modules}/ControllabilityGAN/wgan/init_wgan.py
RENAMED
@@ -1,7 +1,7 @@
|
|
1 |
import torch
|
2 |
|
3 |
-
from
|
4 |
-
from
|
5 |
|
6 |
|
7 |
def create_wgan(parameters, device, optimizer='adam'):
|
|
|
1 |
import torch
|
2 |
|
3 |
+
from Modules.ControllabilityGAN.wgan.resnet_init import init_resnet
|
4 |
+
from Modules.ControllabilityGAN.wgan.wgan_qc import WassersteinGanQuadraticCost
|
5 |
|
6 |
|
7 |
def create_wgan(parameters, device, optimizer='adam'):
|
{Architectures → Modules}/ControllabilityGAN/wgan/resnet_1.py
RENAMED
File without changes
|
{Architectures → Modules}/ControllabilityGAN/wgan/resnet_init.py
RENAMED
@@ -1,7 +1,7 @@
|
|
1 |
-
from
|
2 |
-
from
|
3 |
-
from
|
4 |
-
from
|
5 |
|
6 |
|
7 |
def init_resnet(parameters):
|
|
|
1 |
+
from Modules.ControllabilityGAN.wgan.init_weights import weights_init_D
|
2 |
+
from Modules.ControllabilityGAN.wgan.init_weights import weights_init_G
|
3 |
+
from Modules.ControllabilityGAN.wgan.resnet_1 import ResNet_D
|
4 |
+
from Modules.ControllabilityGAN.wgan.resnet_1 import ResNet_G
|
5 |
|
6 |
|
7 |
def init_resnet(parameters):
|
{Architectures → Modules}/ControllabilityGAN/wgan/wgan_qc.py
RENAMED
File without changes
|
{Architectures → Modules}/EmbeddingModel/GST.py
RENAMED
@@ -3,7 +3,7 @@
|
|
3 |
|
4 |
import torch
|
5 |
|
6 |
-
from
|
7 |
|
8 |
|
9 |
class GSTStyleEncoder(torch.nn.Module):
|
|
|
3 |
|
4 |
import torch
|
5 |
|
6 |
+
from Modules.GeneralLayers.Attention import MultiHeadedAttention as BaseMultiHeadedAttention
|
7 |
|
8 |
|
9 |
class GSTStyleEncoder(torch.nn.Module):
|
{Architectures → Modules}/EmbeddingModel/README.md
RENAMED
File without changes
|
{Architectures → Modules}/EmbeddingModel/StyleEmbedding.py
RENAMED
@@ -1,7 +1,7 @@
|
|
1 |
import torch
|
2 |
|
3 |
-
from
|
4 |
-
from
|
5 |
|
6 |
|
7 |
class StyleEmbedding(torch.nn.Module):
|
|
|
1 |
import torch
|
2 |
|
3 |
+
from Modules.EmbeddingModel.GST import GSTStyleEncoder
|
4 |
+
from Modules.EmbeddingModel.StyleTTSEncoder import StyleEncoder as StyleTTSEncoder
|
5 |
|
6 |
|
7 |
class StyleEmbedding(torch.nn.Module):
|
{Architectures → Modules}/EmbeddingModel/StyleTTSEncoder.py
RENAMED
File without changes
|
{Architectures → Modules}/EmbeddingModel/__init__.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/Attention.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/ConditionalLayerNorm.py
RENAMED
@@ -112,7 +112,6 @@ class AdaIN1d(nn.Module):
|
|
112 |
self.fc = nn.Linear(style_dim, num_features * 2)
|
113 |
|
114 |
def forward(self, x, s):
|
115 |
-
s = torch.nn.functional.normalize(s)
|
116 |
h = self.fc(s)
|
117 |
h = h.view(h.size(0), h.size(1), 1)
|
118 |
gamma, beta = torch.chunk(h, chunks=2, dim=1)
|
|
|
112 |
self.fc = nn.Linear(style_dim, num_features * 2)
|
113 |
|
114 |
def forward(self, x, s):
|
|
|
115 |
h = self.fc(s)
|
116 |
h = h.view(h.size(0), h.size(1), 1)
|
117 |
gamma, beta = torch.chunk(h, chunks=2, dim=1)
|
{Architectures → Modules}/GeneralLayers/Conformer.py
RENAMED
@@ -4,16 +4,16 @@ Taken from ESPNet, but heavily modified
|
|
4 |
|
5 |
import torch
|
6 |
|
7 |
-
from
|
8 |
-
from
|
9 |
-
from
|
10 |
-
from
|
11 |
-
from
|
12 |
-
from
|
13 |
-
from
|
14 |
-
from
|
15 |
-
from
|
16 |
-
from
|
17 |
from Utility.utils import integrate_with_utt_embed
|
18 |
|
19 |
|
@@ -88,6 +88,8 @@ class Conformer(torch.nn.Module):
|
|
88 |
self.language_embedding_projection = lambda x: x
|
89 |
else:
|
90 |
self.language_embedding_projection = torch.nn.Linear(lang_emb_size, attention_dim)
|
|
|
|
|
91 |
# self-attention module definition
|
92 |
encoder_selfattn_layer = RelPositionMultiHeadedAttention
|
93 |
encoder_selfattn_layer_args = (attention_heads, attention_dim, attention_dropout_rate, zero_triu)
|
@@ -130,6 +132,7 @@ class Conformer(torch.nn.Module):
|
|
130 |
if lang_ids is not None:
|
131 |
lang_embs = self.language_embedding(lang_ids)
|
132 |
projected_lang_embs = self.language_embedding_projection(lang_embs).unsqueeze(-1).transpose(1, 2)
|
|
|
133 |
xs = xs + projected_lang_embs # offset phoneme representation by language specific offset
|
134 |
|
135 |
xs = self.pos_enc(xs)
|
@@ -139,21 +142,28 @@ class Conformer(torch.nn.Module):
|
|
139 |
if isinstance(xs, tuple):
|
140 |
x, pos_emb = xs[0], xs[1]
|
141 |
if self.conformer_type != "encoder":
|
142 |
-
x = integrate_with_utt_embed(hs=x,
|
|
|
|
|
|
|
143 |
xs = (x, pos_emb)
|
144 |
else:
|
145 |
if self.conformer_type != "encoder":
|
146 |
-
xs = integrate_with_utt_embed(hs=xs,
|
|
|
|
|
|
|
147 |
xs, masks = encoder(xs, masks)
|
148 |
|
149 |
if isinstance(xs, tuple):
|
150 |
xs = xs[0]
|
151 |
|
152 |
-
if self.use_output_norm and not (self.utt_embed and self.conformer_type == "encoder"):
|
153 |
-
xs = self.output_norm(xs)
|
154 |
-
|
155 |
if self.utt_embed and self.conformer_type == "encoder":
|
156 |
-
xs = integrate_with_utt_embed(hs=xs,
|
157 |
-
|
|
|
|
|
|
|
|
|
158 |
|
159 |
return xs, masks
|
|
|
4 |
|
5 |
import torch
|
6 |
|
7 |
+
from Modules.GeneralLayers.Attention import RelPositionMultiHeadedAttention
|
8 |
+
from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d
|
9 |
+
from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
|
10 |
+
from Modules.GeneralLayers.Convolution import ConvolutionModule
|
11 |
+
from Modules.GeneralLayers.EncoderLayer import EncoderLayer
|
12 |
+
from Modules.GeneralLayers.LayerNorm import LayerNorm
|
13 |
+
from Modules.GeneralLayers.MultiLayeredConv1d import MultiLayeredConv1d
|
14 |
+
from Modules.GeneralLayers.MultiSequential import repeat
|
15 |
+
from Modules.GeneralLayers.PositionalEncoding import RelPositionalEncoding
|
16 |
+
from Modules.GeneralLayers.Swish import Swish
|
17 |
from Utility.utils import integrate_with_utt_embed
|
18 |
|
19 |
|
|
|
88 |
self.language_embedding_projection = lambda x: x
|
89 |
else:
|
90 |
self.language_embedding_projection = torch.nn.Linear(lang_emb_size, attention_dim)
|
91 |
+
self.language_emb_norm = LayerNorm(attention_dim)
|
92 |
+
|
93 |
# self-attention module definition
|
94 |
encoder_selfattn_layer = RelPositionMultiHeadedAttention
|
95 |
encoder_selfattn_layer_args = (attention_heads, attention_dim, attention_dropout_rate, zero_triu)
|
|
|
132 |
if lang_ids is not None:
|
133 |
lang_embs = self.language_embedding(lang_ids)
|
134 |
projected_lang_embs = self.language_embedding_projection(lang_embs).unsqueeze(-1).transpose(1, 2)
|
135 |
+
projected_lang_embs = self.language_emb_norm(projected_lang_embs)
|
136 |
xs = xs + projected_lang_embs # offset phoneme representation by language specific offset
|
137 |
|
138 |
xs = self.pos_enc(xs)
|
|
|
142 |
if isinstance(xs, tuple):
|
143 |
x, pos_emb = xs[0], xs[1]
|
144 |
if self.conformer_type != "encoder":
|
145 |
+
x = integrate_with_utt_embed(hs=x,
|
146 |
+
utt_embeddings=utterance_embedding,
|
147 |
+
projection=self.decoder_embedding_projections[encoder_index],
|
148 |
+
embedding_training=self.use_conditional_layernorm_embedding_integration)
|
149 |
xs = (x, pos_emb)
|
150 |
else:
|
151 |
if self.conformer_type != "encoder":
|
152 |
+
xs = integrate_with_utt_embed(hs=xs,
|
153 |
+
utt_embeddings=utterance_embedding,
|
154 |
+
projection=self.decoder_embedding_projections[encoder_index],
|
155 |
+
embedding_training=self.use_conditional_layernorm_embedding_integration)
|
156 |
xs, masks = encoder(xs, masks)
|
157 |
|
158 |
if isinstance(xs, tuple):
|
159 |
xs = xs[0]
|
160 |
|
|
|
|
|
|
|
161 |
if self.utt_embed and self.conformer_type == "encoder":
|
162 |
+
xs = integrate_with_utt_embed(hs=xs,
|
163 |
+
utt_embeddings=utterance_embedding,
|
164 |
+
projection=self.encoder_embedding_projection,
|
165 |
+
embedding_training=self.use_conditional_layernorm_embedding_integration)
|
166 |
+
elif self.use_output_norm:
|
167 |
+
xs = self.output_norm(xs)
|
168 |
|
169 |
return xs, masks
|
{Architectures → Modules}/GeneralLayers/Convolution.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/DurationPredictor.py
RENAMED
@@ -5,9 +5,9 @@
|
|
5 |
|
6 |
import torch
|
7 |
|
8 |
-
from
|
9 |
-
from
|
10 |
-
from
|
11 |
from Utility.utils import integrate_with_utt_embed
|
12 |
|
13 |
|
|
|
5 |
|
6 |
import torch
|
7 |
|
8 |
+
from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d
|
9 |
+
from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
|
10 |
+
from Modules.GeneralLayers.LayerNorm import LayerNorm
|
11 |
from Utility.utils import integrate_with_utt_embed
|
12 |
|
13 |
|
{Architectures → Modules}/GeneralLayers/EncoderLayer.py
RENAMED
@@ -7,7 +7,7 @@
|
|
7 |
import torch
|
8 |
from torch import nn
|
9 |
|
10 |
-
from
|
11 |
|
12 |
|
13 |
class EncoderLayer(nn.Module):
|
|
|
7 |
import torch
|
8 |
from torch import nn
|
9 |
|
10 |
+
from Modules.GeneralLayers.LayerNorm import LayerNorm
|
11 |
|
12 |
|
13 |
class EncoderLayer(nn.Module):
|
{Architectures → Modules}/GeneralLayers/LayerNorm.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/LengthRegulator.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/MultiLayeredConv1d.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/MultiSequential.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/PositionalEncoding.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/PositionwiseFeedForward.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/README.md
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/ResidualBlock.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/ResidualStack.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/STFT.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/Swish.py
RENAMED
File without changes
|
{Architectures → Modules}/GeneralLayers/VariancePredictor.py
RENAMED
@@ -6,9 +6,9 @@ from abc import ABC
|
|
6 |
|
7 |
import torch
|
8 |
|
9 |
-
from
|
10 |
-
from
|
11 |
-
from
|
12 |
from Utility.utils import integrate_with_utt_embed
|
13 |
|
14 |
|
|
|
6 |
|
7 |
import torch
|
8 |
|
9 |
+
from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d
|
10 |
+
from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
|
11 |
+
from Modules.GeneralLayers.LayerNorm import LayerNorm
|
12 |
from Utility.utils import integrate_with_utt_embed
|
13 |
|
14 |
|
{Architectures → Modules}/GeneralLayers/__init__.py
RENAMED
File without changes
|
{Architectures → Modules}/README.md
RENAMED
File without changes
|
{Architectures → Modules}/ToucanTTS/CodecDiscriminator.py
RENAMED
File without changes
|
{Architectures → Modules}/ToucanTTS/CodecRefinementTransformer.py
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
import torch
|
2 |
|
3 |
-
from
|
4 |
|
5 |
|
6 |
class CodecRefinementTransformer(torch.nn.Module):
|
@@ -151,7 +151,7 @@ def one_hot_sequence_to_token_sequence(batch_of_indexes_one_hot_per_codebook):
|
|
151 |
|
152 |
|
153 |
if __name__ == '__main__':
|
154 |
-
from
|
155 |
from Utility.utils import make_pad_mask
|
156 |
|
157 |
# prepare dummy inputs
|
|
|
1 |
import torch
|
2 |
|
3 |
+
from Modules.GeneralLayers.Conformer import Conformer
|
4 |
|
5 |
|
6 |
class CodecRefinementTransformer(torch.nn.Module):
|
|
|
151 |
|
152 |
|
153 |
if __name__ == '__main__':
|
154 |
+
from Modules.ToucanTTS.ToucanTTS import ToucanTTS
|
155 |
from Utility.utils import make_pad_mask
|
156 |
|
157 |
# prepare dummy inputs
|
{Architectures → Modules}/ToucanTTS/DurationCalculator.py
RENAMED
File without changes
|
{Architectures → Modules}/ToucanTTS/EnergyCalculator.py
RENAMED
@@ -5,7 +5,7 @@
|
|
5 |
import torch
|
6 |
import torch.nn.functional as F
|
7 |
|
8 |
-
from
|
9 |
from Utility.utils import pad_list
|
10 |
|
11 |
|
|
|
5 |
import torch
|
6 |
import torch.nn.functional as F
|
7 |
|
8 |
+
from Modules.GeneralLayers.STFT import STFT
|
9 |
from Utility.utils import pad_list
|
10 |
|
11 |
|