Spaces:
Running
Running
Mylo
commited on
Commit
•
9449f27
1
Parent(s):
d6a5ff9
Initial commit
Browse files- app.py +87 -0
- hubert/__init__.py +0 -0
- hubert/customtokenizer.py +182 -0
- hubert/hubert_manager.py +33 -0
- hubert/pre_kmeans_hubert.py +85 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os.path
|
2 |
+
import uuid
|
3 |
+
|
4 |
+
import gradio
|
5 |
+
import numpy
|
6 |
+
import torch
|
7 |
+
|
8 |
+
from hubert.hubert_manager import HuBERTManager
|
9 |
+
from hubert.pre_kmeans_hubert import CustomHubert
|
10 |
+
from hubert.customtokenizer import CustomTokenizer
|
11 |
+
from encodec import EncodecModel
|
12 |
+
from encodec.utils import convert_audio
|
13 |
+
|
14 |
+
|
15 |
+
hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed())
|
16 |
+
tokenizer_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model='quantifier_V1_hubert_base_ls960_23.pth'))
|
17 |
+
encodec_model = EncodecModel.encodec_model_24khz()
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
def clone(audio, *args):
|
22 |
+
sr, wav = audio
|
23 |
+
if wav.shape[0] == 2: # Stereo to mono if needed
|
24 |
+
wav = wav.mean(0, keepdim=True)
|
25 |
+
|
26 |
+
wav = wav[-int(sr*20):] # Take only the last 20 seconds
|
27 |
+
|
28 |
+
wav = wav.reshape(1, -1) # Reshape from gradio style to HuBERT shape. (N, 1) to (1, N)
|
29 |
+
|
30 |
+
wav = torch.tensor(wav, dtype=torch.float32)
|
31 |
+
|
32 |
+
semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
|
33 |
+
semantic_tokens = tokenizer_model.get_token(semantic_vectors)
|
34 |
+
|
35 |
+
encodec_model.set_target_bandwidth(6.0)
|
36 |
+
wav = convert_audio(wav, sr, encodec_model.sample_rate, 1)
|
37 |
+
wav = wav.unsqueeze(0)
|
38 |
+
|
39 |
+
with torch.no_grad():
|
40 |
+
encoded_frames = encodec_model.encode(wav)
|
41 |
+
|
42 |
+
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) # [B, n_q, T]
|
43 |
+
|
44 |
+
if not os.path.isdir('data/speakers'):
|
45 |
+
os.makedirs('data/speakers')
|
46 |
+
|
47 |
+
file_path = f'data/speakers/{uuid.uuid4().hex}.npz'
|
48 |
+
|
49 |
+
numpy.savez(
|
50 |
+
file_path,
|
51 |
+
semantic_prompt=semantic_tokens,
|
52 |
+
fine_prompt=codes,
|
53 |
+
coarse_prompt=codes[:2, :]
|
54 |
+
)
|
55 |
+
|
56 |
+
return file_path
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
iface = gradio.interface.Interface(fn=clone, inputs=[
|
61 |
+
'audio',
|
62 |
+
gradio.Markdown(
|
63 |
+
'''
|
64 |
+
# Bark text to speech voice cloning
|
65 |
+
[Model](https://huggingface.co/GitMylo/bark-voice-cloning/), [Model GitHub](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer), [Webui GitHub](https://github.com/gitmylo/audio-webui)
|
66 |
+
|
67 |
+
For faster creation of voice clones [Duplicate this space](https://huggingface.co/spaces/GitMylo/bark-voice-cloning?duplicate=true)
|
68 |
+
|
69 |
+
Uploaded audio files get cut to 20 seconds in order to keep it fast for everyone. Only the last 20 seconds will be used. (Bark only uses the last 14 seconds anyway)
|
70 |
+
|
71 |
+
## Tips for better cloning
|
72 |
+
### Make sure these things are **NOT** in your voice input: (in no particular order)
|
73 |
+
* Noise (You can use a noise remover before)
|
74 |
+
* Music (There are also music remover tools) (Unless you want music in the background)
|
75 |
+
* A cut-off at the end (This will cause it to try and continue on the generation)
|
76 |
+
* Under 1 second of training data (i personally suggest around 10 seconds for good potential, but i've had great results with 5 seconds as well.)
|
77 |
+
|
78 |
+
### What makes for good prompt audio? (in no particular order)
|
79 |
+
* Clearly spoken
|
80 |
+
* No weird background noises
|
81 |
+
* Only one speaker
|
82 |
+
* Audio which ends after a sentence ends
|
83 |
+
* Regular/common voice (They usually have more success, it's still capable of cloning complex voices, but not as good at it)
|
84 |
+
* Around 10 seconds of data
|
85 |
+
''')
|
86 |
+
], outputs='file')
|
87 |
+
iface.launch()
|
hubert/__init__.py
ADDED
File without changes
|
hubert/customtokenizer.py
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os.path
|
3 |
+
from zipfile import ZipFile
|
4 |
+
|
5 |
+
import numpy
|
6 |
+
import torch
|
7 |
+
from torch import nn, optim
|
8 |
+
from torch.serialization import MAP_LOCATION
|
9 |
+
|
10 |
+
|
11 |
+
class CustomTokenizer(nn.Module):
|
12 |
+
def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
|
13 |
+
super(CustomTokenizer, self).__init__()
|
14 |
+
next_size = input_size
|
15 |
+
if version == 0:
|
16 |
+
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
|
17 |
+
next_size = hidden_size
|
18 |
+
if version == 1:
|
19 |
+
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
|
20 |
+
self.intermediate = nn.Linear(hidden_size, 4096)
|
21 |
+
next_size = 4096
|
22 |
+
|
23 |
+
self.fc = nn.Linear(next_size, output_size)
|
24 |
+
self.softmax = nn.LogSoftmax(dim=1)
|
25 |
+
self.optimizer: optim.Optimizer = None
|
26 |
+
self.lossfunc = nn.CrossEntropyLoss()
|
27 |
+
self.input_size = input_size
|
28 |
+
self.hidden_size = hidden_size
|
29 |
+
self.output_size = output_size
|
30 |
+
self.version = version
|
31 |
+
|
32 |
+
def forward(self, x):
|
33 |
+
x, _ = self.lstm(x)
|
34 |
+
if self.version == 1:
|
35 |
+
x = self.intermediate(x)
|
36 |
+
x = self.fc(x)
|
37 |
+
x = self.softmax(x)
|
38 |
+
return x
|
39 |
+
|
40 |
+
@torch.no_grad()
|
41 |
+
def get_token(self, x):
|
42 |
+
"""
|
43 |
+
Used to get the token for the first
|
44 |
+
:param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
|
45 |
+
:return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
|
46 |
+
"""
|
47 |
+
return torch.argmax(self(x), dim=1)
|
48 |
+
|
49 |
+
def prepare_training(self):
|
50 |
+
self.optimizer = optim.Adam(self.parameters(), 0.001)
|
51 |
+
|
52 |
+
def train_step(self, x_train, y_train, log_loss=False):
|
53 |
+
# y_train = y_train[:-1]
|
54 |
+
# y_train = y_train[1:]
|
55 |
+
|
56 |
+
optimizer = self.optimizer
|
57 |
+
lossfunc = self.lossfunc
|
58 |
+
# Zero the gradients
|
59 |
+
self.zero_grad()
|
60 |
+
|
61 |
+
# Forward pass
|
62 |
+
y_pred = self(x_train)
|
63 |
+
|
64 |
+
y_train_len = len(y_train)
|
65 |
+
y_pred_len = y_pred.shape[0]
|
66 |
+
|
67 |
+
if y_train_len > y_pred_len:
|
68 |
+
diff = y_train_len - y_pred_len
|
69 |
+
y_train = y_train[diff:]
|
70 |
+
elif y_train_len < y_pred_len:
|
71 |
+
diff = y_pred_len - y_train_len
|
72 |
+
y_pred = y_pred[:-diff, :]
|
73 |
+
|
74 |
+
y_train_hot = torch.zeros(len(y_train), self.output_size)
|
75 |
+
y_train_hot[range(len(y_train)), y_train] = 1
|
76 |
+
y_train_hot = y_train_hot.to('cuda')
|
77 |
+
|
78 |
+
# Calculate the loss
|
79 |
+
loss = lossfunc(y_pred, y_train_hot)
|
80 |
+
|
81 |
+
# Print loss
|
82 |
+
if log_loss:
|
83 |
+
print('Loss', loss.item())
|
84 |
+
|
85 |
+
# Backward pass
|
86 |
+
loss.backward()
|
87 |
+
|
88 |
+
# Update the weights
|
89 |
+
optimizer.step()
|
90 |
+
|
91 |
+
def save(self, path):
|
92 |
+
info_path = os.path.basename(path) + '/.info'
|
93 |
+
torch.save(self.state_dict(), path)
|
94 |
+
data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
|
95 |
+
with ZipFile(path, 'a') as model_zip:
|
96 |
+
model_zip.writestr(info_path, data_from_model.save())
|
97 |
+
model_zip.close()
|
98 |
+
|
99 |
+
@staticmethod
|
100 |
+
def load_from_checkpoint(path, map_location: MAP_LOCATION = None):
|
101 |
+
old = True
|
102 |
+
with ZipFile(path) as model_zip:
|
103 |
+
filesMatch = [file for file in model_zip.namelist() if file.endswith('/.info')]
|
104 |
+
file = filesMatch[0] if filesMatch else None
|
105 |
+
if file:
|
106 |
+
old = False
|
107 |
+
data_from_model = Data.load(model_zip.read(file).decode('utf-8'))
|
108 |
+
model_zip.close()
|
109 |
+
if old:
|
110 |
+
model = CustomTokenizer()
|
111 |
+
else:
|
112 |
+
model = CustomTokenizer(data_from_model.hidden_size, data_from_model.input_size, data_from_model.output_size, data_from_model.version)
|
113 |
+
model.load_state_dict(torch.load(path, map_location))
|
114 |
+
return model
|
115 |
+
|
116 |
+
|
117 |
+
|
118 |
+
class Data:
|
119 |
+
input_size: int
|
120 |
+
hidden_size: int
|
121 |
+
output_size: int
|
122 |
+
version: int
|
123 |
+
|
124 |
+
def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
|
125 |
+
self.input_size = input_size
|
126 |
+
self.hidden_size = hidden_size
|
127 |
+
self.output_size = output_size
|
128 |
+
self.version = version
|
129 |
+
|
130 |
+
@staticmethod
|
131 |
+
def load(string):
|
132 |
+
data = json.loads(string)
|
133 |
+
return Data(data['input_size'], data['hidden_size'], data['output_size'], data['version'])
|
134 |
+
|
135 |
+
def save(self):
|
136 |
+
data = {
|
137 |
+
'input_size': self.input_size,
|
138 |
+
'hidden_size': self.hidden_size,
|
139 |
+
'output_size': self.output_size,
|
140 |
+
'version': self.version,
|
141 |
+
}
|
142 |
+
return json.dumps(data)
|
143 |
+
|
144 |
+
|
145 |
+
def auto_train(data_path, save_path='model.pth', load_model: str | None = None, save_epochs=1):
|
146 |
+
data_x, data_y = [], []
|
147 |
+
|
148 |
+
if load_model and os.path.isfile(load_model):
|
149 |
+
print('Loading model from', load_model)
|
150 |
+
model_training = CustomTokenizer.load_from_checkpoint(load_model, 'cuda')
|
151 |
+
else:
|
152 |
+
print('Creating new model.')
|
153 |
+
model_training = CustomTokenizer(version=1).to('cuda') # Settings for the model to run without lstm
|
154 |
+
save_path = os.path.join(data_path, save_path)
|
155 |
+
base_save_path = '.'.join(save_path.split('.')[:-1])
|
156 |
+
|
157 |
+
sem_string = '_semantic.npy'
|
158 |
+
feat_string = '_semantic_features.npy'
|
159 |
+
|
160 |
+
ready = os.path.join(data_path, 'ready')
|
161 |
+
for input_file in os.listdir(ready):
|
162 |
+
full_path = os.path.join(ready, input_file)
|
163 |
+
if input_file.endswith(sem_string):
|
164 |
+
data_y.append(numpy.load(full_path))
|
165 |
+
elif input_file.endswith(feat_string):
|
166 |
+
data_x.append(numpy.load(full_path))
|
167 |
+
model_training.prepare_training()
|
168 |
+
|
169 |
+
epoch = 1
|
170 |
+
|
171 |
+
while 1:
|
172 |
+
for i in range(save_epochs):
|
173 |
+
j = 0
|
174 |
+
for x, y in zip(data_x, data_y):
|
175 |
+
model_training.train_step(torch.tensor(x).to('cuda'), torch.tensor(y).to('cuda'), j % 50 == 0) # Print loss every 50 steps
|
176 |
+
j += 1
|
177 |
+
save_p = save_path
|
178 |
+
save_p_2 = f'{base_save_path}_epoch_{epoch}.pth'
|
179 |
+
model_training.save(save_p)
|
180 |
+
model_training.save(save_p_2)
|
181 |
+
print(f'Epoch {epoch} completed')
|
182 |
+
epoch += 1
|
hubert/hubert_manager.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os.path
|
2 |
+
import shutil
|
3 |
+
import urllib.request
|
4 |
+
|
5 |
+
import huggingface_hub
|
6 |
+
|
7 |
+
|
8 |
+
class HuBERTManager:
|
9 |
+
@staticmethod
|
10 |
+
def make_sure_hubert_installed(download_url: str = 'https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt', file_name: str = 'hubert.pt'):
|
11 |
+
install_dir = os.path.join('data', 'models', 'hubert')
|
12 |
+
if not os.path.isdir(install_dir):
|
13 |
+
os.makedirs(install_dir, exist_ok=True)
|
14 |
+
install_file = os.path.join(install_dir, file_name)
|
15 |
+
if not os.path.isfile(install_file):
|
16 |
+
print('Downloading HuBERT base model')
|
17 |
+
urllib.request.urlretrieve(download_url, install_file)
|
18 |
+
print('Downloaded HuBERT')
|
19 |
+
return install_file
|
20 |
+
|
21 |
+
|
22 |
+
@staticmethod
|
23 |
+
def make_sure_tokenizer_installed(model: str = 'quantifier_hubert_base_ls960_14.pth', repo: str = 'GitMylo/bark-voice-cloning', local_file: str = 'tokenizer.pth'):
|
24 |
+
install_dir = os.path.join('data', 'models', 'hubert')
|
25 |
+
if not os.path.isdir(install_dir):
|
26 |
+
os.makedirs(install_dir, exist_ok=True)
|
27 |
+
install_file = os.path.join(install_dir, local_file)
|
28 |
+
if not os.path.isfile(install_file):
|
29 |
+
print('Downloading HuBERT custom tokenizer')
|
30 |
+
huggingface_hub.hf_hub_download(repo, model, local_dir=install_dir, local_dir_use_symlinks=False)
|
31 |
+
shutil.move(os.path.join(install_dir, model), install_file)
|
32 |
+
print('Downloaded tokenizer')
|
33 |
+
return install_file
|
hubert/pre_kmeans_hubert.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
from einops import pack, unpack
|
6 |
+
|
7 |
+
import fairseq
|
8 |
+
|
9 |
+
from torchaudio.functional import resample
|
10 |
+
|
11 |
+
import logging
|
12 |
+
logging.root.setLevel(logging.ERROR)
|
13 |
+
|
14 |
+
|
15 |
+
def exists(val):
|
16 |
+
return val is not None
|
17 |
+
|
18 |
+
|
19 |
+
def default(val, d):
|
20 |
+
return val if exists(val) else d
|
21 |
+
|
22 |
+
|
23 |
+
class CustomHubert(nn.Module):
|
24 |
+
"""
|
25 |
+
checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
|
26 |
+
or you can train your own
|
27 |
+
"""
|
28 |
+
|
29 |
+
def __init__(
|
30 |
+
self,
|
31 |
+
checkpoint_path,
|
32 |
+
target_sample_hz=16000,
|
33 |
+
seq_len_multiple_of=None,
|
34 |
+
output_layer=9
|
35 |
+
):
|
36 |
+
super().__init__()
|
37 |
+
self.target_sample_hz = target_sample_hz
|
38 |
+
self.seq_len_multiple_of = seq_len_multiple_of
|
39 |
+
self.output_layer = output_layer
|
40 |
+
|
41 |
+
model_path = Path(checkpoint_path)
|
42 |
+
|
43 |
+
assert model_path.exists(), f'path {checkpoint_path} does not exist'
|
44 |
+
|
45 |
+
checkpoint = torch.load(checkpoint_path)
|
46 |
+
load_model_input = {checkpoint_path: checkpoint}
|
47 |
+
model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)
|
48 |
+
|
49 |
+
self.model = model[0]
|
50 |
+
self.model.eval()
|
51 |
+
|
52 |
+
@property
|
53 |
+
def groups(self):
|
54 |
+
return 1
|
55 |
+
|
56 |
+
@torch.no_grad()
|
57 |
+
def forward(
|
58 |
+
self,
|
59 |
+
wav_input,
|
60 |
+
flatten=True,
|
61 |
+
input_sample_hz=None
|
62 |
+
):
|
63 |
+
device = wav_input.device
|
64 |
+
|
65 |
+
if exists(input_sample_hz):
|
66 |
+
wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
|
67 |
+
|
68 |
+
embed = self.model(
|
69 |
+
wav_input,
|
70 |
+
features_only=True,
|
71 |
+
mask=False, # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
|
72 |
+
output_layer=self.output_layer
|
73 |
+
)
|
74 |
+
|
75 |
+
embed, packed_shape = pack([embed['x']], '* d')
|
76 |
+
|
77 |
+
# codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())
|
78 |
+
|
79 |
+
codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device) # .long()
|
80 |
+
|
81 |
+
if flatten:
|
82 |
+
return codebook_indices
|
83 |
+
|
84 |
+
codebook_indices, = unpack(codebook_indices, packed_shape, '*')
|
85 |
+
return codebook_indices
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
torchaudio
|
3 |
+
encodec
|
4 |
+
joblib
|
5 |
+
fairseq
|