seanghay committed on
Commit
b9c7aba
·
verified ·
1 Parent(s): d5ed1ca

remove unused code

Browse files
Files changed (9) hide show
  1. README.md +11 -0
  2. app.py +0 -1
  3. attentions.py +0 -3
  4. commons.py +0 -3
  5. data_utils.py +0 -392
  6. losses.py +0 -4
  7. mel_processing.py +0 -12
  8. modules.py +0 -1
  9. preprocess.py +0 -25
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: KLEA
3
+ emoji: 📈
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 3.46.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
app.py CHANGED
@@ -1,7 +1,6 @@
1
  # -*- coding: utf-8 -*-
2
  import gradio as gr
3
  from models import SynthesizerTrn
4
- from scipy.io.wavfile import write
5
  from khmer_phonemizer import phonemize_single
6
  import utils
7
  import commons
 
1
  # -*- coding: utf-8 -*-
2
  import gradio as gr
3
  from models import SynthesizerTrn
 
4
  from khmer_phonemizer import phonemize_single
5
  import utils
6
  import commons
attentions.py CHANGED
@@ -1,10 +1,7 @@
1
- import copy
2
  import math
3
- import numpy as np
4
  import torch
5
  from torch import nn
6
  from torch.nn import functional as F
7
-
8
  import commons
9
  import modules
10
  from modules import LayerNorm
 
 
1
  import math
 
2
  import torch
3
  from torch import nn
4
  from torch.nn import functional as F
 
5
  import commons
6
  import modules
7
  from modules import LayerNorm
commons.py CHANGED
@@ -1,10 +1,7 @@
1
  import math
2
- import numpy as np
3
  import torch
4
- from torch import nn
5
  from torch.nn import functional as F
6
 
7
-
8
  def init_weights(m, mean=0.0, std=0.01):
9
  classname = m.__class__.__name__
10
  if classname.find("Conv") != -1:
 
1
  import math
 
2
  import torch
 
3
  from torch.nn import functional as F
4
 
 
5
  def init_weights(m, mean=0.0, std=0.01):
6
  classname = m.__class__.__name__
7
  if classname.find("Conv") != -1:
data_utils.py DELETED
@@ -1,392 +0,0 @@
1
- import time
2
- import os
3
- import random
4
- import numpy as np
5
- import torch
6
- import torch.utils.data
7
-
8
- import commons
9
- from mel_processing import spectrogram_torch
10
- from utils import load_wav_to_torch, load_filepaths_and_text
11
- from text import text_to_sequence, cleaned_text_to_sequence
12
-
13
-
14
- class TextAudioLoader(torch.utils.data.Dataset):
15
- """
16
- 1) loads audio, text pairs
17
- 2) normalizes text and converts them to sequences of integers
18
- 3) computes spectrograms from audio files.
19
- """
20
- def __init__(self, audiopaths_and_text, hparams):
21
- self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
22
- self.text_cleaners = hparams.text_cleaners
23
- self.max_wav_value = hparams.max_wav_value
24
- self.sampling_rate = hparams.sampling_rate
25
- self.filter_length = hparams.filter_length
26
- self.hop_length = hparams.hop_length
27
- self.win_length = hparams.win_length
28
- self.sampling_rate = hparams.sampling_rate
29
-
30
- self.cleaned_text = getattr(hparams, "cleaned_text", False)
31
-
32
- self.add_blank = hparams.add_blank
33
- self.min_text_len = getattr(hparams, "min_text_len", 1)
34
- self.max_text_len = getattr(hparams, "max_text_len", 190)
35
-
36
- random.seed(1234)
37
- random.shuffle(self.audiopaths_and_text)
38
- self._filter()
39
-
40
-
41
- def _filter(self):
42
- """
43
- Filter text & store spec lengths
44
- """
45
- # Store spectrogram lengths for Bucketing
46
- # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
47
- # spec_length = wav_length // hop_length
48
-
49
- audiopaths_and_text_new = []
50
- lengths = []
51
- for audiopath, text in self.audiopaths_and_text:
52
- if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
53
- audiopaths_and_text_new.append([audiopath, text])
54
- lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
55
- self.audiopaths_and_text = audiopaths_and_text_new
56
- self.lengths = lengths
57
-
58
- def get_audio_text_pair(self, audiopath_and_text):
59
- # separate filename and text
60
- audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
61
- text = self.get_text(text)
62
- spec, wav = self.get_audio(audiopath)
63
- return (text, spec, wav)
64
-
65
- def get_audio(self, filename):
66
- audio, sampling_rate = load_wav_to_torch(filename)
67
- if sampling_rate != self.sampling_rate:
68
- raise ValueError("{} {} SR doesn't match target {} SR".format(
69
- sampling_rate, self.sampling_rate))
70
- audio_norm = audio / self.max_wav_value
71
- audio_norm = audio_norm.unsqueeze(0)
72
- spec_filename = filename.replace(".wav", ".spec.pt")
73
- if os.path.exists(spec_filename):
74
- spec = torch.load(spec_filename)
75
- else:
76
- spec = spectrogram_torch(audio_norm, self.filter_length,
77
- self.sampling_rate, self.hop_length, self.win_length,
78
- center=False)
79
- spec = torch.squeeze(spec, 0)
80
- torch.save(spec, spec_filename)
81
- return spec, audio_norm
82
-
83
- def get_text(self, text):
84
- if self.cleaned_text:
85
- text_norm = cleaned_text_to_sequence(text)
86
- else:
87
- text_norm = text_to_sequence(text, self.text_cleaners)
88
- if self.add_blank:
89
- text_norm = commons.intersperse(text_norm, 0)
90
- text_norm = torch.LongTensor(text_norm)
91
- return text_norm
92
-
93
- def __getitem__(self, index):
94
- return self.get_audio_text_pair(self.audiopaths_and_text[index])
95
-
96
- def __len__(self):
97
- return len(self.audiopaths_and_text)
98
-
99
-
100
- class TextAudioCollate():
101
- """ Zero-pads model inputs and targets
102
- """
103
- def __init__(self, return_ids=False):
104
- self.return_ids = return_ids
105
-
106
- def __call__(self, batch):
107
- """Collate's training batch from normalized text and audio
108
- PARAMS
109
- ------
110
- batch: [text_normalized, spec_normalized, wav_normalized]
111
- """
112
- # Right zero-pad all one-hot text sequences to max input length
113
- _, ids_sorted_decreasing = torch.sort(
114
- torch.LongTensor([x[1].size(1) for x in batch]),
115
- dim=0, descending=True)
116
-
117
- max_text_len = max([len(x[0]) for x in batch])
118
- max_spec_len = max([x[1].size(1) for x in batch])
119
- max_wav_len = max([x[2].size(1) for x in batch])
120
-
121
- text_lengths = torch.LongTensor(len(batch))
122
- spec_lengths = torch.LongTensor(len(batch))
123
- wav_lengths = torch.LongTensor(len(batch))
124
-
125
- text_padded = torch.LongTensor(len(batch), max_text_len)
126
- spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
127
- wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
128
- text_padded.zero_()
129
- spec_padded.zero_()
130
- wav_padded.zero_()
131
- for i in range(len(ids_sorted_decreasing)):
132
- row = batch[ids_sorted_decreasing[i]]
133
-
134
- text = row[0]
135
- text_padded[i, :text.size(0)] = text
136
- text_lengths[i] = text.size(0)
137
-
138
- spec = row[1]
139
- spec_padded[i, :, :spec.size(1)] = spec
140
- spec_lengths[i] = spec.size(1)
141
-
142
- wav = row[2]
143
- wav_padded[i, :, :wav.size(1)] = wav
144
- wav_lengths[i] = wav.size(1)
145
-
146
- if self.return_ids:
147
- return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing
148
- return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths
149
-
150
-
151
- """Multi speaker version"""
152
- class TextAudioSpeakerLoader(torch.utils.data.Dataset):
153
- """
154
- 1) loads audio, speaker_id, text pairs
155
- 2) normalizes text and converts them to sequences of integers
156
- 3) computes spectrograms from audio files.
157
- """
158
- def __init__(self, audiopaths_sid_text, hparams):
159
- self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
160
- self.text_cleaners = hparams.text_cleaners
161
- self.max_wav_value = hparams.max_wav_value
162
- self.sampling_rate = hparams.sampling_rate
163
- self.filter_length = hparams.filter_length
164
- self.hop_length = hparams.hop_length
165
- self.win_length = hparams.win_length
166
- self.sampling_rate = hparams.sampling_rate
167
-
168
- self.cleaned_text = getattr(hparams, "cleaned_text", False)
169
-
170
- self.add_blank = hparams.add_blank
171
- self.min_text_len = getattr(hparams, "min_text_len", 1)
172
- self.max_text_len = getattr(hparams, "max_text_len", 190)
173
-
174
- random.seed(1234)
175
- random.shuffle(self.audiopaths_sid_text)
176
- self._filter()
177
-
178
- def _filter(self):
179
- """
180
- Filter text & store spec lengths
181
- """
182
- # Store spectrogram lengths for Bucketing
183
- # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
184
- # spec_length = wav_length // hop_length
185
-
186
- audiopaths_sid_text_new = []
187
- lengths = []
188
- for audiopath, sid, text in self.audiopaths_sid_text:
189
- if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
190
- audiopaths_sid_text_new.append([audiopath, sid, text])
191
- lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
192
- self.audiopaths_sid_text = audiopaths_sid_text_new
193
- self.lengths = lengths
194
-
195
- def get_audio_text_speaker_pair(self, audiopath_sid_text):
196
- # separate filename, speaker_id and text
197
- audiopath, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2]
198
- text = self.get_text(text)
199
- spec, wav = self.get_audio(audiopath)
200
- sid = self.get_sid(sid)
201
- return (text, spec, wav, sid)
202
-
203
- def get_audio(self, filename):
204
- audio, sampling_rate = load_wav_to_torch(filename)
205
- if sampling_rate != self.sampling_rate:
206
- raise ValueError("{} {} SR doesn't match target {} SR".format(
207
- sampling_rate, self.sampling_rate))
208
- audio_norm = audio / self.max_wav_value
209
- audio_norm = audio_norm.unsqueeze(0)
210
- spec_filename = filename.replace(".wav", ".spec.pt")
211
- if os.path.exists(spec_filename):
212
- spec = torch.load(spec_filename)
213
- else:
214
- spec = spectrogram_torch(audio_norm, self.filter_length,
215
- self.sampling_rate, self.hop_length, self.win_length,
216
- center=False)
217
- spec = torch.squeeze(spec, 0)
218
- torch.save(spec, spec_filename)
219
- return spec, audio_norm
220
-
221
- def get_text(self, text):
222
- if self.cleaned_text:
223
- text_norm = cleaned_text_to_sequence(text)
224
- else:
225
- text_norm = text_to_sequence(text, self.text_cleaners)
226
- if self.add_blank:
227
- text_norm = commons.intersperse(text_norm, 0)
228
- text_norm = torch.LongTensor(text_norm)
229
- return text_norm
230
-
231
- def get_sid(self, sid):
232
- sid = torch.LongTensor([int(sid)])
233
- return sid
234
-
235
- def __getitem__(self, index):
236
- return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
237
-
238
- def __len__(self):
239
- return len(self.audiopaths_sid_text)
240
-
241
-
242
- class TextAudioSpeakerCollate():
243
- """ Zero-pads model inputs and targets
244
- """
245
- def __init__(self, return_ids=False):
246
- self.return_ids = return_ids
247
-
248
- def __call__(self, batch):
249
- """Collate's training batch from normalized text, audio and speaker identities
250
- PARAMS
251
- ------
252
- batch: [text_normalized, spec_normalized, wav_normalized, sid]
253
- """
254
- # Right zero-pad all one-hot text sequences to max input length
255
- _, ids_sorted_decreasing = torch.sort(
256
- torch.LongTensor([x[1].size(1) for x in batch]),
257
- dim=0, descending=True)
258
-
259
- max_text_len = max([len(x[0]) for x in batch])
260
- max_spec_len = max([x[1].size(1) for x in batch])
261
- max_wav_len = max([x[2].size(1) for x in batch])
262
-
263
- text_lengths = torch.LongTensor(len(batch))
264
- spec_lengths = torch.LongTensor(len(batch))
265
- wav_lengths = torch.LongTensor(len(batch))
266
- sid = torch.LongTensor(len(batch))
267
-
268
- text_padded = torch.LongTensor(len(batch), max_text_len)
269
- spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
270
- wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
271
- text_padded.zero_()
272
- spec_padded.zero_()
273
- wav_padded.zero_()
274
- for i in range(len(ids_sorted_decreasing)):
275
- row = batch[ids_sorted_decreasing[i]]
276
-
277
- text = row[0]
278
- text_padded[i, :text.size(0)] = text
279
- text_lengths[i] = text.size(0)
280
-
281
- spec = row[1]
282
- spec_padded[i, :, :spec.size(1)] = spec
283
- spec_lengths[i] = spec.size(1)
284
-
285
- wav = row[2]
286
- wav_padded[i, :, :wav.size(1)] = wav
287
- wav_lengths[i] = wav.size(1)
288
-
289
- sid[i] = row[3]
290
-
291
- if self.return_ids:
292
- return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
293
- return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
294
-
295
-
296
- class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
297
- """
298
- Maintain similar input lengths in a batch.
299
- Length groups are specified by boundaries.
300
- Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
301
-
302
- It removes samples which are not included in the boundaries.
303
- Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
304
- """
305
- def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
306
- super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
307
- self.lengths = dataset.lengths
308
- self.batch_size = batch_size
309
- self.boundaries = boundaries
310
-
311
- self.buckets, self.num_samples_per_bucket = self._create_buckets()
312
- self.total_size = sum(self.num_samples_per_bucket)
313
- self.num_samples = self.total_size // self.num_replicas
314
-
315
- def _create_buckets(self):
316
- buckets = [[] for _ in range(len(self.boundaries) - 1)]
317
- for i in range(len(self.lengths)):
318
- length = self.lengths[i]
319
- idx_bucket = self._bisect(length)
320
- if idx_bucket != -1:
321
- buckets[idx_bucket].append(i)
322
-
323
- for i in range(len(buckets) - 1, 0, -1):
324
- if len(buckets[i]) == 0:
325
- buckets.pop(i)
326
- self.boundaries.pop(i+1)
327
-
328
- num_samples_per_bucket = []
329
- for i in range(len(buckets)):
330
- len_bucket = len(buckets[i])
331
- total_batch_size = self.num_replicas * self.batch_size
332
- rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
333
- num_samples_per_bucket.append(len_bucket + rem)
334
- return buckets, num_samples_per_bucket
335
-
336
- def __iter__(self):
337
- # deterministically shuffle based on epoch
338
- g = torch.Generator()
339
- g.manual_seed(self.epoch)
340
-
341
- indices = []
342
- if self.shuffle:
343
- for bucket in self.buckets:
344
- indices.append(torch.randperm(len(bucket), generator=g).tolist())
345
- else:
346
- for bucket in self.buckets:
347
- indices.append(list(range(len(bucket))))
348
-
349
- batches = []
350
- for i in range(len(self.buckets)):
351
- bucket = self.buckets[i]
352
- len_bucket = len(bucket)
353
- ids_bucket = indices[i]
354
- num_samples_bucket = self.num_samples_per_bucket[i]
355
-
356
- # add extra samples to make it evenly divisible
357
- rem = num_samples_bucket - len_bucket
358
- ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
359
-
360
- # subsample
361
- ids_bucket = ids_bucket[self.rank::self.num_replicas]
362
-
363
- # batching
364
- for j in range(len(ids_bucket) // self.batch_size):
365
- batch = [bucket[idx] for idx in ids_bucket[j*self.batch_size:(j+1)*self.batch_size]]
366
- batches.append(batch)
367
-
368
- if self.shuffle:
369
- batch_ids = torch.randperm(len(batches), generator=g).tolist()
370
- batches = [batches[i] for i in batch_ids]
371
- self.batches = batches
372
-
373
- assert len(self.batches) * self.batch_size == self.num_samples
374
- return iter(self.batches)
375
-
376
- def _bisect(self, x, lo=0, hi=None):
377
- if hi is None:
378
- hi = len(self.boundaries) - 1
379
-
380
- if hi > lo:
381
- mid = (hi + lo) // 2
382
- if self.boundaries[mid] < x and x <= self.boundaries[mid+1]:
383
- return mid
384
- elif x <= self.boundaries[mid]:
385
- return self._bisect(x, lo, mid)
386
- else:
387
- return self._bisect(x, mid + 1, hi)
388
- else:
389
- return -1
390
-
391
- def __len__(self):
392
- return self.num_samples // self.batch_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
losses.py CHANGED
@@ -1,8 +1,4 @@
1
  import torch
2
- from torch.nn import functional as F
3
-
4
- import commons
5
-
6
 
7
  def feature_loss(fmap_r, fmap_g):
8
  loss = 0
 
1
  import torch
 
 
 
 
2
 
3
  def feature_loss(fmap_r, fmap_g):
4
  loss = 0
mel_processing.py CHANGED
@@ -1,21 +1,9 @@
1
- import math
2
- import os
3
- import random
4
  import torch
5
- from torch import nn
6
- import torch.nn.functional as F
7
  import torch.utils.data
8
- import numpy as np
9
- import librosa
10
- import librosa.util as librosa_util
11
- from librosa.util import normalize, pad_center, tiny
12
- from scipy.signal import get_window
13
- from scipy.io.wavfile import read
14
  from librosa.filters import mel as librosa_mel_fn
15
 
16
  MAX_WAV_VALUE = 32768.0
17
 
18
-
19
  def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20
  """
21
  PARAMS
 
 
 
 
1
  import torch
 
 
2
  import torch.utils.data
 
 
 
 
 
 
3
  from librosa.filters import mel as librosa_mel_fn
4
 
5
  MAX_WAV_VALUE = 32768.0
6
 
 
7
  def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
8
  """
9
  PARAMS
modules.py CHANGED
@@ -1,7 +1,6 @@
1
  import copy
2
  import math
3
  import numpy as np
4
- import scipy
5
  import torch
6
  from torch import nn
7
  from torch.nn import functional as F
 
1
  import copy
2
  import math
3
  import numpy as np
 
4
  import torch
5
  from torch import nn
6
  from torch.nn import functional as F
preprocess.py DELETED
@@ -1,25 +0,0 @@
1
- import argparse
2
- import text
3
- from utils import load_filepaths_and_text
4
-
5
- if __name__ == '__main__':
6
- parser = argparse.ArgumentParser()
7
- parser.add_argument("--out_extension", default="cleaned")
8
- parser.add_argument("--text_index", default=1, type=int)
9
- parser.add_argument("--filelists", nargs="+", default=["filelists/ljs_audio_text_val_filelist.txt", "filelists/ljs_audio_text_test_filelist.txt"])
10
- parser.add_argument("--text_cleaners", nargs="+", default=["english_cleaners2"])
11
-
12
- args = parser.parse_args()
13
-
14
-
15
- for filelist in args.filelists:
16
- print("START:", filelist)
17
- filepaths_and_text = load_filepaths_and_text(filelist)
18
- for i in range(len(filepaths_and_text)):
19
- original_text = filepaths_and_text[i][args.text_index]
20
- cleaned_text = text._clean_text(original_text, args.text_cleaners)
21
- filepaths_and_text[i][args.text_index] = cleaned_text
22
-
23
- new_filelist = filelist + "." + args.out_extension
24
- with open(new_filelist, "w", encoding="utf-8") as f:
25
- f.writelines(["|".join(x) + "\n" for x in filepaths_and_text])