Spaces:
Running
Running
Mahiruoshi
commited on
Commit
โข
dc23363
1
Parent(s):
f6fda8c
Upload 112 files
Browse files- app.py +4 -6
- bert/bert-base-japanese-v3/README.md +1 -1
- bert/bert-base-japanese-v3/vocab.txt +1 -1
- bert/chinese-roberta-wwm-ext-large/.gitignore +1 -0
- bert/chinese-roberta-wwm-ext-large/README.md +5 -5
- bert/chinese-roberta-wwm-ext-large/added_tokens.json +1 -1
- bert/chinese-roberta-wwm-ext-large/special_tokens_map.json +1 -1
- bert/chinese-roberta-wwm-ext-large/tokenizer.json +0 -0
- bert/chinese-roberta-wwm-ext-large/tokenizer_config.json +1 -1
- bert_gen.py +6 -7
- configs/config.json +250 -27
- data_utils.py +1 -1
- filelists/esd.list +3 -0
- image/41JjBPWdHtL._SX342_SY445_.jpg +0 -0
- image/41JjBPWdHtL.jpg +0 -0
- logs/Bangdream/G_7000.pth +3 -0
- logs/Bangdream/config.json +154 -0
- models.py +1 -1
- monotonic_align/__pycache__/__init__.cpython-39.pyc +0 -0
- monotonic_align/__pycache__/core.cpython-39.pyc +0 -0
- preprocess_text.py +16 -1
- requirements.txt +0 -3
- text/__init__.py +0 -1
- text/__pycache__/__init__.cpython-39.pyc +0 -0
- text/__pycache__/chinese.cpython-39.pyc +0 -0
- text/__pycache__/chinese_bert.cpython-39.pyc +0 -0
- text/__pycache__/cleaner.cpython-39.pyc +0 -0
- text/__pycache__/english_bert_mock.cpython-39.pyc +0 -0
- text/__pycache__/japanese.cpython-39.pyc +0 -0
- text/__pycache__/japanese_bert.cpython-39.pyc +0 -0
- text/__pycache__/symbols.cpython-39.pyc +0 -0
- text/__pycache__/tone_sandhi.cpython-39.pyc +0 -0
- train_ms.py +2 -6
- utils.py +3 -4
- webui.py +224 -0
app.py
CHANGED
@@ -216,11 +216,9 @@ WrapStyle: 0
|
|
216 |
PlayResX: 640
|
217 |
PlayResY: 360
|
218 |
ScaledBorderAndShadow: yes
|
219 |
-
|
220 |
[V4+ Styles]
|
221 |
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
|
222 |
Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
|
223 |
-
|
224 |
[Events]
|
225 |
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
226 |
"""
|
@@ -338,7 +336,7 @@ def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale
|
|
338 |
if __name__ == "__main__":
|
339 |
parser = argparse.ArgumentParser()
|
340 |
parser.add_argument(
|
341 |
-
"-m", "--model", default="./logs/BangDream/
|
342 |
)
|
343 |
parser.add_argument(
|
344 |
"-c",
|
@@ -387,7 +385,7 @@ if __name__ == "__main__":
|
|
387 |
]
|
388 |
with gr.Blocks() as app:
|
389 |
gr.Markdown(
|
390 |
-
f"
|
391 |
)
|
392 |
for band in BandList:
|
393 |
with gr.TabItem(band):
|
@@ -444,9 +442,9 @@ if __name__ == "__main__":
|
|
444 |
with gr.Row():
|
445 |
with gr.Column():
|
446 |
gr.Markdown(
|
447 |
-
f"ไป <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>ๆ็ๅๅฎข็ซ็น</a>
|
448 |
)
|
449 |
-
inputFile = gr.inputs.File(label="
|
450 |
groupSize = gr.Slider(
|
451 |
minimum=10, maximum=1000,value = i[1], step=1, label="ๅฝไธช้ณ้ขๆไปถๅ
ๅซ็ๆๅคงๅญๆฐ"
|
452 |
)
|
|
|
216 |
PlayResX: 640
|
217 |
PlayResY: 360
|
218 |
ScaledBorderAndShadow: yes
|
|
|
219 |
[V4+ Styles]
|
220 |
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
|
221 |
Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
|
|
|
222 |
[Events]
|
223 |
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
224 |
"""
|
|
|
336 |
if __name__ == "__main__":
|
337 |
parser = argparse.ArgumentParser()
|
338 |
parser.add_argument(
|
339 |
+
"-m", "--model", default="./logs/BangDream/G_7000.pth", help="path of your model"
|
340 |
)
|
341 |
parser.add_argument(
|
342 |
"-c",
|
|
|
385 |
]
|
386 |
with gr.Blocks() as app:
|
387 |
gr.Markdown(
|
388 |
+
f"ๅฐๆญ้ฆ้ฆๅ
จๅTTS,ไฝฟ็จๆฌๆจกๅ่ฏทไธฅๆ ผ้ตๅฎๆณๅพๆณ่ง!\n ๅๅธไบๅไฝๅ่ฏทๆ ๆณจๆฌ้กน็ฎไฝ่
<a href='https://space.bilibili.com/19874615/'>B็ซ@Mahiroshi</a>ๅ้กน็ฎ้พๆฅ\nไป <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>ๆ็ๅๅฎข็ซ็น</a> ๆฅ็ไฝฟ็จ่ฏดๆ</a>"
|
389 |
)
|
390 |
for band in BandList:
|
391 |
with gr.TabItem(band):
|
|
|
442 |
with gr.Row():
|
443 |
with gr.Column():
|
444 |
gr.Markdown(
|
445 |
+
f"ไป <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>ๆ็ๅๅฎข็ซ็น</a> ๆฅ็่ชๅถgalgameไฝฟ็จ่ฏดๆ\n</a>"
|
446 |
)
|
447 |
+
inputFile = gr.inputs.File(label="ไธไผ txt(ๅฏ่ฎพ็ฝฎ่ง่ฒๅฏนๅบ่กจ)ใepubๆmobiๆไปถ")
|
448 |
groupSize = gr.Slider(
|
449 |
minimum=10, maximum=1000,value = i[1], step=1, label="ๅฝไธช้ณ้ขๆไปถๅ
ๅซ็ๆๅคงๅญๆฐ"
|
450 |
)
|
bert/bert-base-japanese-v3/README.md
CHANGED
@@ -50,4 +50,4 @@ The pretrained models are distributed under the Apache License 2.0.
|
|
50 |
|
51 |
## Acknowledgments
|
52 |
|
53 |
-
This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
|
|
|
50 |
|
51 |
## Acknowledgments
|
52 |
|
53 |
+
This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
|
bert/bert-base-japanese-v3/vocab.txt
CHANGED
@@ -13,7 +13,7 @@
|
|
13 |
[unused7]
|
14 |
[unused8]
|
15 |
[unused9]
|
16 |
-
|
17 |
!
|
18 |
"
|
19 |
#
|
|
|
13 |
[unused7]
|
14 |
[unused8]
|
15 |
[unused9]
|
16 |
+
|
17 |
!
|
18 |
"
|
19 |
#
|
bert/chinese-roberta-wwm-ext-large/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*.bin
|
bert/chinese-roberta-wwm-ext-large/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
language:
|
3 |
- zh
|
4 |
tags:
|
5 |
- bert
|
@@ -9,9 +9,9 @@ license: "apache-2.0"
|
|
9 |
# Please use 'Bert' related functions to load this model!
|
10 |
|
11 |
## Chinese BERT with Whole Word Masking
|
12 |
-
For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
|
13 |
|
14 |
-
**[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
|
15 |
Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
|
16 |
|
17 |
This repository is developed based on๏ผhttps://github.com/google-research/bert
|
@@ -46,7 +46,7 @@ If you find the technical report or resource is useful, please cite the followin
|
|
46 |
pages = "657--668",
|
47 |
}
|
48 |
```
|
49 |
-
- Secondary: https://arxiv.org/abs/1906.08101
|
50 |
```
|
51 |
@article{chinese-bert-wwm,
|
52 |
title={Pre-Training with Whole Word Masking for Chinese BERT},
|
@@ -54,4 +54,4 @@ If you find the technical report or resource is useful, please cite the followin
|
|
54 |
journal={arXiv preprint arXiv:1906.08101},
|
55 |
year={2019}
|
56 |
}
|
57 |
-
```
|
|
|
1 |
---
|
2 |
+
language:
|
3 |
- zh
|
4 |
tags:
|
5 |
- bert
|
|
|
9 |
# Please use 'Bert' related functions to load this model!
|
10 |
|
11 |
## Chinese BERT with Whole Word Masking
|
12 |
+
For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
|
13 |
|
14 |
+
**[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
|
15 |
Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
|
16 |
|
17 |
This repository is developed based on๏ผhttps://github.com/google-research/bert
|
|
|
46 |
pages = "657--668",
|
47 |
}
|
48 |
```
|
49 |
+
- Secondary: https://arxiv.org/abs/1906.08101
|
50 |
```
|
51 |
@article{chinese-bert-wwm,
|
52 |
title={Pre-Training with Whole Word Masking for Chinese BERT},
|
|
|
54 |
journal={arXiv preprint arXiv:1906.08101},
|
55 |
year={2019}
|
56 |
}
|
57 |
+
```
|
bert/chinese-roberta-wwm-ext-large/added_tokens.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{}
|
|
|
1 |
+
{}
|
bert/chinese-roberta-wwm-ext-large/special_tokens_map.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
|
|
|
1 |
+
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
|
bert/chinese-roberta-wwm-ext-large/tokenizer.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
bert/chinese-roberta-wwm-ext-large/tokenizer_config.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"init_inputs": []}
|
|
|
1 |
+
{"init_inputs": []}
|
bert_gen.py
CHANGED
@@ -21,13 +21,12 @@ def process_line(line):
|
|
21 |
word2ph = [i for i in word2ph]
|
22 |
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
word2ph[0] += 1
|
31 |
|
32 |
bert_path = wav_path.replace(".wav", ".bert.pt")
|
33 |
|
|
|
21 |
word2ph = [i for i in word2ph]
|
22 |
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
|
23 |
|
24 |
+
phone = commons.intersperse(phone, 0)
|
25 |
+
tone = commons.intersperse(tone, 0)
|
26 |
+
language = commons.intersperse(language, 0)
|
27 |
+
for i in range(len(word2ph)):
|
28 |
+
word2ph[i] = word2ph[i] * 2
|
29 |
+
word2ph[0] += 1
|
|
|
30 |
|
31 |
bert_path = wav_path.replace(".wav", ".bert.pt")
|
32 |
|
configs/config.json
CHANGED
@@ -10,7 +10,7 @@
|
|
10 |
0.99
|
11 |
],
|
12 |
"eps": 1e-09,
|
13 |
-
"batch_size":
|
14 |
"fp16_run": false,
|
15 |
"lr_decay": 0.999875,
|
16 |
"segment_size": 16384,
|
@@ -35,31 +35,254 @@
|
|
35 |
"n_speakers": 256,
|
36 |
"cleaned_text": true,
|
37 |
"spk2id": {
|
38 |
-
"
|
39 |
-
"
|
40 |
-
"
|
41 |
-
"
|
42 |
-
"
|
43 |
-
"
|
44 |
-
"
|
45 |
-
"
|
46 |
-
"
|
47 |
-
"
|
48 |
-
"
|
49 |
-
"
|
50 |
-
"
|
51 |
-
"
|
52 |
-
"
|
53 |
-
"
|
54 |
-
"
|
55 |
-
"
|
56 |
-
"
|
57 |
-
"
|
58 |
-
"
|
59 |
-
"
|
60 |
-
"
|
61 |
-
"
|
62 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
}
|
64 |
},
|
65 |
"model": {
|
@@ -116,4 +339,4 @@
|
|
116 |
"use_spectral_norm": false,
|
117 |
"gin_channels": 256
|
118 |
}
|
119 |
-
}
|
|
|
10 |
0.99
|
11 |
],
|
12 |
"eps": 1e-09,
|
13 |
+
"batch_size": 8,
|
14 |
"fp16_run": false,
|
15 |
"lr_decay": 0.999875,
|
16 |
"segment_size": 16384,
|
|
|
35 |
"n_speakers": 256,
|
36 |
"cleaned_text": true,
|
37 |
"spk2id": {
|
38 |
+
"ไธนๆ": 0,
|
39 |
+
"ๅ
ๆๆ": 1,
|
40 |
+
"็ฉน": 2,
|
41 |
+
"ใไฟกไฝฟใ": 3,
|
42 |
+
"ๅฒ็ฆ็ฝ": 4,
|
43 |
+
"ๅฝฆๅฟ": 5,
|
44 |
+
"ๆด้": 6,
|
45 |
+
"ๆฐๅธๅพท": 7,
|
46 |
+
"็ด ่ฃณ": 8,
|
47 |
+
"็ปฟ่่": 9,
|
48 |
+
"็ฝๅน": 10,
|
49 |
+
"่พไธๅฆฒ": 11,
|
50 |
+
"้ปๅก": 12,
|
51 |
+
"ไธนๆข": 13,
|
52 |
+
"ๅธ้ฒ็ฆ": 14,
|
53 |
+
"็ฝ้ฒ": 15,
|
54 |
+
"่ดนๆฏๆผ": 16,
|
55 |
+
"ๅไบ": 17,
|
56 |
+
"ๅฏๅฏๅฉไบ": 18,
|
57 |
+
"ๆฏๅ
": 19,
|
58 |
+
"่บไธๅๅง": 20,
|
59 |
+
"้้": 21,
|
60 |
+
"ๅ
ฌ่พๅธๅ
": 22,
|
61 |
+
"ๅก่ๅก": 23,
|
62 |
+
"ๅคงๆฏซ": 24,
|
63 |
+
"้ฉญ็ฉบ": 25,
|
64 |
+
"ๅๅค": 26,
|
65 |
+
"ๅฅฅๅๆ ผ": 27,
|
66 |
+
"ๅจๅก่": 28,
|
67 |
+
"ๆกๅ": 29,
|
68 |
+
"็ฆๅฐ็น": 30,
|
69 |
+
"้ฟๅ
ฐ": 31,
|
70 |
+
"ไผฆ็บณๅพท": 32,
|
71 |
+
"ไฝฉๆ": 33,
|
72 |
+
"ๅกๆณข็น": 34,
|
73 |
+
"ๅธๅง": 35,
|
74 |
+
"ๅธๆฏๅก": 36,
|
75 |
+
"้้": 37,
|
76 |
+
"ไธๆไธ": 38,
|
77 |
+
"ๅ": 39,
|
78 |
+
"ๅงฌๅญ": 40,
|
79 |
+
"ๅธๆดๅฆฎๅจ
": 41,
|
80 |
+
"ๅธๅฟ": 42,
|
81 |
+
"ๆ": 43,
|
82 |
+
"็ฌฆ็": 44,
|
83 |
+
"่ๅ
": 45,
|
84 |
+
"้ถ็ผ": 46,
|
85 |
+
"้ๆต": 47,
|
86 |
+
"ใๅๅฃซใ": 48,
|
87 |
+
"ใๅคง่ไธธใ": 49,
|
88 |
+
"ไนๆก่ฃ็ฝ": 50,
|
89 |
+
"ไฝ่ฅฟๆฉๆฏ": 51,
|
90 |
+
"ๅปๆด": 52,
|
91 |
+
"ๅๆ": 53,
|
92 |
+
"ๅก็ปด": 54,
|
93 |
+
"ๅฏ่": 55,
|
94 |
+
"ๅ็": 56,
|
95 |
+
"ๅ่ๅฐ": 57,
|
96 |
+
"ๅกๆฐยทๆๅพทๅกๅฐผ": 58,
|
97 |
+
"ๅคงๆ
ๆ ็": 59,
|
98 |
+
"ๅฎตๅฎซ": 60,
|
99 |
+
"ๅบท็บณ": 61,
|
100 |
+
"ๅฝฑ": 62,
|
101 |
+
"ๆซๅไธๅถ": 63,
|
102 |
+
"ๆฌง่ฒๅฆฎ": 64,
|
103 |
+
"็ไนไธฝ": 65,
|
104 |
+
"็็": 66,
|
105 |
+
"็ฐ้ๅด": 67,
|
106 |
+
"็ ็ณ": 68,
|
107 |
+
"็ฅ้็ปซๅ": 69,
|
108 |
+
"็ฝ่่ไบ": 70,
|
109 |
+
"่ๆณทไธๆ": 71,
|
110 |
+
"่ๆ": 72,
|
111 |
+
"่ฟชๅธ้
": 73,
|
112 |
+
"้็ฆป": 74,
|
113 |
+
"้ฟๅ": 75,
|
114 |
+
"้ฟๅจ่ถ": 76,
|
115 |
+
"้ฟๆๅคซ": 77,
|
116 |
+
"้ทๆณฝ": 78,
|
117 |
+
"้ฆ่ฑ": 79,
|
118 |
+
"้พไบ": 80,
|
119 |
+
"ใๅ
ฌๅญใ": 81,
|
120 |
+
"ใ็ฝ่ๅ
็ใ": 82,
|
121 |
+
"ไผ่": 83,
|
122 |
+
"ๅฏ็็ณ": 84,
|
123 |
+
"ๅฒๅนณ": 85,
|
124 |
+
"ๅคๆด่": 86,
|
125 |
+
"ๅฎๆ": 87,
|
126 |
+
"ๅทด่พพ็ปด": 88,
|
127 |
+
"ๅผๅคงๅฐ": 89,
|
128 |
+
"ๆฏๅฆๅฉ": 90,
|
129 |
+
"ๆฏไผฝๅฐ": 91,
|
130 |
+
"ๆตทๅฆฎ่ถ": 92,
|
131 |
+
"็ฑๅพท็ณ": 93,
|
132 |
+
"็บณ่ฅฟๅฆฒ": 94,
|
133 |
+
"่ๅญ": 95,
|
134 |
+
"่ๅฎๅจ": 96,
|
135 |
+
"้ฟๅฎ": 97,
|
136 |
+
"้ฟ็ฅ": 98,
|
137 |
+
"ไธนๅๅฐ": 99,
|
138 |
+
"ไธฝ่": 100,
|
139 |
+
"ไบ้": 101,
|
140 |
+
"ๅ
ๅคช": 102,
|
141 |
+
"ๅ
ๅ้จ็น": 103,
|
142 |
+
"ๅ
็ฝ็ดข": 104,
|
143 |
+
"ๅๆ": 105,
|
144 |
+
"ๅๅๆผ": 106,
|
145 |
+
"ๅคฉ็ฎๅไบ": 107,
|
146 |
+
"ๅฅฅๅ
น": 108,
|
147 |
+
"ๆถ้พ": 109,
|
148 |
+
"ๆฉๆ": 110,
|
149 |
+
"ๆๆๅคซ": 111,
|
150 |
+
"ๆพๆตฆ": 112,
|
151 |
+
"ๆๅ้": 113,
|
152 |
+
"็้จ": 114,
|
153 |
+
"็ณๅคด": 115,
|
154 |
+
"็บฏๆฐด็ฒพ็ต๏ผ": 116,
|
155 |
+
"็พฝ็็ฐๅ้นค": 117,
|
156 |
+
"่ฑไพๆ": 118,
|
157 |
+
"่ฒ่ฐขๅฐ": 119,
|
158 |
+
"่จ็ฌ": 120,
|
159 |
+
"่ฏบ่พๅฐ": 121,
|
160 |
+
"่ต่ฏบ": 122,
|
161 |
+
"่พ็ฑ": 123,
|
162 |
+
"่ฟชๅจๆณฝ้ป": 124,
|
163 |
+
"้ฃ็ปด่ฑ็น": 125,
|
164 |
+
"ๅ
ซ้็ฅๅญ": 126,
|
165 |
+
"ๅฏไบ": 127,
|
166 |
+
"ๅด่น้ฟ": 128,
|
167 |
+
"ๅๅพท": 129,
|
168 |
+
"ๅคฉๅ": 130,
|
169 |
+
"ๅฅณๅฃซ": 131,
|
170 |
+
"ๆ็ญ ": 132,
|
171 |
+
"ๆ็บณ้": 133,
|
172 |
+
"ๆดพ่": 134,
|
173 |
+
"ๆตๆตช่
": 135,
|
174 |
+
"ๆทฑๆธไฝฟๅพ": 136,
|
175 |
+
"็ๆ ผไธฝ็น": 137,
|
176 |
+
"็้ฒ็": 138,
|
177 |
+
"็ด": 139,
|
178 |
+
"็ถ็ถ": 140,
|
179 |
+
"็ไบๅ้ฃ็ๅ": 141,
|
180 |
+
"็ปฎ่ฏ่ฏ": 142,
|
181 |
+
"่ไผฏ็น": 143,
|
182 |
+
"่ง": 144,
|
183 |
+
"่ซๅจ": 145,
|
184 |
+
"่ก็ง": 146,
|
185 |
+
"่ฟๅๆฏ": 147,
|
186 |
+
"้ฟไฝฉๆฎ": 148,
|
187 |
+
"้นฟ้ๅฅๅฅ": 149,
|
188 |
+
"ไธไธ": 150,
|
189 |
+
"ไผ่ฟชๅจ
": 151,
|
190 |
+
"ๅๆฅ": 152,
|
191 |
+
"ๅ่ไธ": 153,
|
192 |
+
"ๅๅฐๆฌฃๆ น": 154,
|
193 |
+
"ๅๆณฝ": 155,
|
194 |
+
"ๅก็ๆฏ": 156,
|
195 |
+
"ๅคๅ
ฐ": 157,
|
196 |
+
"ๅธธไน็ท": 158,
|
197 |
+
"ๆฆ": 159,
|
198 |
+
"ๆดๅ ๆฏ้ทๅธ": 160,
|
199 |
+
"็ฌผ้็ถไธๅฟ": 161,
|
200 |
+
"็บณๆฏๅฐ": 162,
|
201 |
+
"่กๆก": 163,
|
202 |
+
"่พๅฐๆตทๆฃฎ": 164,
|
203 |
+
"่พ่ไธ": 165,
|
204 |
+
"่ฒ็ฑณๅฐผ": 166,
|
205 |
+
"่็ไนๆฏ": 167,
|
206 |
+
"่ฟชๅฅฅๅจ": 168,
|
207 |
+
"้ฟๆ": 169,
|
208 |
+
"้ฟๆด็ฆ": 170,
|
209 |
+
"้่กๅฒฉๆฌ็่ยทๅ
็ด ็ๅฝ": 171,
|
210 |
+
"้ท็ตๅฐๅ": 172,
|
211 |
+
"้ญ": 173,
|
212 |
+
"้นฟ้้ขๅนณ่": 174,
|
213 |
+
"ใๅฅณๅฃซใ": 175,
|
214 |
+
"ใๆฃๅ
ตใ": 176,
|
215 |
+
"ๅๅ
": 177,
|
216 |
+
"ๅฆฎ้ฒ": 178,
|
217 |
+
"ๅจ็ปดๅจ
": 179,
|
218 |
+
"ๅฎ็": 180,
|
219 |
+
"ๆ
งๅฟ": 181,
|
220 |
+
"ๆๅ
": 182,
|
221 |
+
"ๆ้ฉฌ": 183,
|
222 |
+
"ๆๆๆซ่พฐๅคฉๅ": 184,
|
223 |
+
"ๆ็ฝ": 185,
|
224 |
+
"ๆตฎๆธธๆฐด่ๅ
ฝยทๅ
็ด ็ๅฝ": 186,
|
225 |
+
"็็ปฏ": 187,
|
226 |
+
"็ๅกๅ": 188,
|
227 |
+
"็พ้ป": 189,
|
228 |
+
"็ฅๆ": 190,
|
229 |
+
"็ฑณๅก": 191,
|
230 |
+
"่ฅฟๆๆฐ": 192,
|
231 |
+
"่ฟชๅขๅ
": 193,
|
232 |
+
"้ไบ": 194,
|
233 |
+
"้ฟๆๅฐ": 195,
|
234 |
+
"้ๅคซๆผ": 196,
|
235 |
+
"ไธๆ": 197,
|
236 |
+
"ไน
ๅฉ้กป": 198,
|
237 |
+
"ๅ่ฏ": 199,
|
238 |
+
"ๅๅฃฐๆตท่บ": 200,
|
239 |
+
"ๅค่": 201,
|
240 |
+
"ๅฎ่ฅฟ": 202,
|
241 |
+
"ๅพทๆฒๆฒๅ
": 203,
|
242 |
+
"ๆ่ตซๆผ": 204,
|
243 |
+
"ๆๅฐผ": 205,
|
244 |
+
"ๆฅๅฐๆฏ": 206,
|
245 |
+
"ๆทฑๆธๆณๅธ": 207,
|
246 |
+
"ๆธฉ่ฟช": 208,
|
247 |
+
"็ฑ่ดๅฐ": 209,
|
248 |
+
"็็ๅฎซๅฟๆตท": 210,
|
249 |
+
"็ญๅฐผ็น": 211,
|
250 |
+
"็ณๅฆฎ็น": 212,
|
251 |
+
"็ณ้นค": 213,
|
252 |
+
"็ฅ้็ปซไบบ": 214,
|
253 |
+
"่พไผฏ็น": 215,
|
254 |
+
"่ๅงฅๅงฅ": 216,
|
255 |
+
"่จ่ตซๅ่": 217,
|
256 |
+
"่จ้ฝๅ ": 218,
|
257 |
+
"้ฟๅฐๅก็ฑณ": 219,
|
258 |
+
"้ฟ่ดๅค": 220,
|
259 |
+
"anzai": 221,
|
260 |
+
"ไน
ๅฒๅฟ": 222,
|
261 |
+
"ไนๆก้ฐๆฒป": 223,
|
262 |
+
"ไบๅ ": 224,
|
263 |
+
"ไผๅฉไบๆฏ": 225,
|
264 |
+
"ๅๆดไผ": 226,
|
265 |
+
"ๅกๅก่พ": 227,
|
266 |
+
"ๆ้ฝ": 228,
|
267 |
+
"ๆ้ง": 229,
|
268 |
+
"ๆฏ่ฑ": 230,
|
269 |
+
"ๆฒๆๆผ": 231,
|
270 |
+
"ๆตท่ญๅค": 232,
|
271 |
+
"็ฝๆฏ": 233,
|
272 |
+
"็ฉบ": 234,
|
273 |
+
"่พๆ": 235,
|
274 |
+
"่ญ่ญๆ": 236,
|
275 |
+
"่ซๅกไผๆ": 237,
|
276 |
+
"่บๅฟ": 238,
|
277 |
+
"่พพ่พพๅฉไบ": 239,
|
278 |
+
"่ฟ่": 240,
|
279 |
+
"้ฟ็": 241,
|
280 |
+
"้ฟๅทดๅพไผ": 242,
|
281 |
+
"้ๆฏๅ": 243,
|
282 |
+
"่ซๅผ": 244,
|
283 |
+
"ๅคๅฝฆ": 245,
|
284 |
+
"ๅทฆ็ถ": 246,
|
285 |
+
"ๆ ่ด": 247
|
286 |
}
|
287 |
},
|
288 |
"model": {
|
|
|
339 |
"use_spectral_norm": false,
|
340 |
"gin_channels": 256
|
341 |
}
|
342 |
+
}
|
data_utils.py
CHANGED
@@ -155,7 +155,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|
155 |
if language_str == "ZH":
|
156 |
bert = bert
|
157 |
ja_bert = torch.zeros(768, len(phone))
|
158 |
-
elif language_str == "
|
159 |
ja_bert = bert
|
160 |
bert = torch.zeros(1024, len(phone))
|
161 |
else:
|
|
|
155 |
if language_str == "ZH":
|
156 |
bert = bert
|
157 |
ja_bert = torch.zeros(768, len(phone))
|
158 |
+
elif language_str == "JP":
|
159 |
ja_bert = bert
|
160 |
bert = torch.zeros(1024, len(phone))
|
161 |
else:
|
filelists/esd.list
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
Example:
|
2 |
+
{wav_path}|{speaker_name}|{language}|{text}
|
3 |
+
ๆดพ่_1.wav|ๆดพ่|ZH|ๅ้ข็ๅบๅ๏ผไปฅๅๅๆฅๆข็ดขๅง๏ผ
|
image/41JjBPWdHtL._SX342_SY445_.jpg
ADDED
image/41JjBPWdHtL.jpg
ADDED
logs/Bangdream/G_7000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:92e3ea6239c8f2b16efff571ba07232dd5de71067d2fc87e3f2e0ef490e2d7eb
|
3 |
+
size 857912686
|
logs/Bangdream/config.json
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 1000,
|
5 |
+
"seed": 52,
|
6 |
+
"epochs": 10000,
|
7 |
+
"learning_rate": 0.0003,
|
8 |
+
"betas": [
|
9 |
+
0.8,
|
10 |
+
0.99
|
11 |
+
],
|
12 |
+
"eps": 1e-09,
|
13 |
+
"batch_size": 16,
|
14 |
+
"fp16_run": false,
|
15 |
+
"lr_decay": 0.999875,
|
16 |
+
"segment_size": 16384,
|
17 |
+
"init_lr_ratio": 1,
|
18 |
+
"warmup_epochs": 0,
|
19 |
+
"c_mel": 45,
|
20 |
+
"c_kl": 1.0,
|
21 |
+
"skip_optimizer": true
|
22 |
+
},
|
23 |
+
"data": {
|
24 |
+
"training_files": "filelists/train.list",
|
25 |
+
"validation_files": "filelists/val.list",
|
26 |
+
"max_wav_value": 32768.0,
|
27 |
+
"sampling_rate": 44100,
|
28 |
+
"filter_length": 2048,
|
29 |
+
"hop_length": 512,
|
30 |
+
"win_length": 2048,
|
31 |
+
"n_mel_channels": 128,
|
32 |
+
"mel_fmin": 0.0,
|
33 |
+
"mel_fmax": null,
|
34 |
+
"add_blank": true,
|
35 |
+
"n_speakers": 256,
|
36 |
+
"cleaned_text": true,
|
37 |
+
"spk2id": {
|
38 |
+
"ไธๆไธ": 0,
|
39 |
+
"้ฆๆพ": 1,
|
40 |
+
"ๆๅฒ": 2,
|
41 |
+
"ๆฒ็ถพ": 3,
|
42 |
+
"ใใฟ": 4,
|
43 |
+
"ใใ": 5,
|
44 |
+
"ๆฒ็ถพใใใฟใใใ": 6,
|
45 |
+
"ๅทด": 7,
|
46 |
+
"ไธๅ": 8,
|
47 |
+
"ใพใใช": 9,
|
48 |
+
"ใใ": 10,
|
49 |
+
"ๆๆฅ้ฆ": 11,
|
50 |
+
"๏ผ๏ผ๏ผ": 12,
|
51 |
+
"ใฒใพใ": 13,
|
52 |
+
"ใขใซ": 14,
|
53 |
+
"ใคใใฟ": 15,
|
54 |
+
"่ญ": 16,
|
55 |
+
"ใชใต": 17,
|
56 |
+
"ๅ่": 18,
|
57 |
+
"่ฑ้ณ": 19,
|
58 |
+
"ใคใด": 20,
|
59 |
+
"ๆฅ่": 21,
|
60 |
+
"ๅๅธ้ฃ": 22,
|
61 |
+
"็ดๅค": 23,
|
62 |
+
"ใใใ": 24,
|
63 |
+
"็พๅฒ": 25,
|
64 |
+
"่ซ": 26,
|
65 |
+
"ใฏใใฟ": 27,
|
66 |
+
"ใใใทใงใซ": 28,
|
67 |
+
"ใใชใผ": 29,
|
68 |
+
"ๆช็ใใญใใใใผ": 30,
|
69 |
+
"ใใณใชใผใ": 31,
|
70 |
+
"ๅฝฉ": 32,
|
71 |
+
"้บปๅผฅ": 33,
|
72 |
+
"็ๅญ": 34,
|
73 |
+
"ใใ": 35,
|
74 |
+
"ใใใช": 36,
|
75 |
+
"ใพใใ": 37,
|
76 |
+
"ใคใใ": 38,
|
77 |
+
"้ๅญ": 39,
|
78 |
+
"ไธๆทฑ": 40,
|
79 |
+
"็ ๅฏ": 41,
|
80 |
+
"ๅ
ญ่ฑ": 42,
|
81 |
+
"ใใฌใช": 43,
|
82 |
+
"ใฌใคใค": 44,
|
83 |
+
"ใในใญใณใฐ": 45,
|
84 |
+
"ใใฅใใฅ": 46,
|
85 |
+
"ใพใใ": 47,
|
86 |
+
"ใญใใฏ": 48,
|
87 |
+
"ไปค็้ฃ": 49,
|
88 |
+
"CHIYU": 50,
|
89 |
+
"ใฌใค": 51,
|
90 |
+
"็": 52,
|
91 |
+
"ใใ": 53,
|
92 |
+
"็ฅฅๅญ": 54,
|
93 |
+
"็ซๅธ": 55,
|
94 |
+
"็ฆ": 56,
|
95 |
+
"ๆ้ณ": 57,
|
96 |
+
"ๆฅฝๅฅ": 58,
|
97 |
+
"ๆตท้ด": 59
|
98 |
+
}
|
99 |
+
},
|
100 |
+
"model": {
|
101 |
+
"use_spk_conditioned_encoder": true,
|
102 |
+
"use_noise_scaled_mas": true,
|
103 |
+
"use_mel_posterior_encoder": false,
|
104 |
+
"use_duration_discriminator": true,
|
105 |
+
"inter_channels": 192,
|
106 |
+
"hidden_channels": 192,
|
107 |
+
"filter_channels": 768,
|
108 |
+
"n_heads": 2,
|
109 |
+
"n_layers": 6,
|
110 |
+
"kernel_size": 3,
|
111 |
+
"p_dropout": 0.1,
|
112 |
+
"resblock": "1",
|
113 |
+
"resblock_kernel_sizes": [
|
114 |
+
3,
|
115 |
+
7,
|
116 |
+
11
|
117 |
+
],
|
118 |
+
"resblock_dilation_sizes": [
|
119 |
+
[
|
120 |
+
1,
|
121 |
+
3,
|
122 |
+
5
|
123 |
+
],
|
124 |
+
[
|
125 |
+
1,
|
126 |
+
3,
|
127 |
+
5
|
128 |
+
],
|
129 |
+
[
|
130 |
+
1,
|
131 |
+
3,
|
132 |
+
5
|
133 |
+
]
|
134 |
+
],
|
135 |
+
"upsample_rates": [
|
136 |
+
8,
|
137 |
+
8,
|
138 |
+
2,
|
139 |
+
2,
|
140 |
+
2
|
141 |
+
],
|
142 |
+
"upsample_initial_channel": 512,
|
143 |
+
"upsample_kernel_sizes": [
|
144 |
+
16,
|
145 |
+
16,
|
146 |
+
8,
|
147 |
+
2,
|
148 |
+
2
|
149 |
+
],
|
150 |
+
"n_layers_q": 3,
|
151 |
+
"use_spectral_norm": false,
|
152 |
+
"gin_channels": 256
|
153 |
+
}
|
154 |
+
}
|
models.py
CHANGED
@@ -763,7 +763,7 @@ class SynthesizerTrn(nn.Module):
|
|
763 |
gin_channels=256,
|
764 |
use_sdp=True,
|
765 |
n_flow_layer=4,
|
766 |
-
n_layers_trans_flow=
|
767 |
flow_share_parameter=False,
|
768 |
use_transformer_flow=True,
|
769 |
**kwargs
|
|
|
763 |
gin_channels=256,
|
764 |
use_sdp=True,
|
765 |
n_flow_layer=4,
|
766 |
+
n_layers_trans_flow=6,
|
767 |
flow_share_parameter=False,
|
768 |
use_transformer_flow=True,
|
769 |
**kwargs
|
monotonic_align/__pycache__/__init__.cpython-39.pyc
CHANGED
Binary files a/monotonic_align/__pycache__/__init__.cpython-39.pyc and b/monotonic_align/__pycache__/__init__.cpython-39.pyc differ
|
|
monotonic_align/__pycache__/core.cpython-39.pyc
CHANGED
Binary files a/monotonic_align/__pycache__/core.cpython-39.pyc and b/monotonic_align/__pycache__/core.cpython-39.pyc differ
|
|
preprocess_text.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import json
|
|
|
2 |
from collections import defaultdict
|
3 |
from random import shuffle
|
4 |
from typing import Optional
|
@@ -11,7 +12,7 @@ from text.cleaner import clean_text
|
|
11 |
@click.command()
|
12 |
@click.option(
|
13 |
"--transcription-path",
|
14 |
-
default="filelists/
|
15 |
type=click.Path(exists=True, file_okay=True, dir_okay=False),
|
16 |
)
|
17 |
@click.option("--cleaned-path", default=None)
|
@@ -67,13 +68,27 @@ def main(
|
|
67 |
current_sid = 0
|
68 |
|
69 |
with open(transcription_path, encoding="utf-8") as f:
|
|
|
|
|
|
|
70 |
for line in f.readlines():
|
71 |
utt, spk, language, text, phones, tones, word2ph = line.strip().split("|")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
spk_utt_map[spk].append(line)
|
73 |
|
74 |
if spk not in spk_id_map.keys():
|
75 |
spk_id_map[spk] = current_sid
|
76 |
current_sid += 1
|
|
|
77 |
|
78 |
train_list = []
|
79 |
val_list = []
|
|
|
1 |
import json
|
2 |
+
import os.path
|
3 |
from collections import defaultdict
|
4 |
from random import shuffle
|
5 |
from typing import Optional
|
|
|
12 |
@click.command()
|
13 |
@click.option(
|
14 |
"--transcription-path",
|
15 |
+
default="filelists/genshin.list",
|
16 |
type=click.Path(exists=True, file_okay=True, dir_okay=False),
|
17 |
)
|
18 |
@click.option("--cleaned-path", default=None)
|
|
|
68 |
current_sid = 0
|
69 |
|
70 |
with open(transcription_path, encoding="utf-8") as f:
|
71 |
+
audioPaths = set()
|
72 |
+
countSame = 0
|
73 |
+
countNotFound = 0
|
74 |
for line in f.readlines():
|
75 |
utt, spk, language, text, phones, tones, word2ph = line.strip().split("|")
|
76 |
+
if utt in audioPaths:
|
77 |
+
# ่ฟๆปคๆฐๆฎ้้่ฏฏ๏ผ็ธๅ็้ณ้ขๅน้
ๅคไธชๆๆฌ๏ผๅฏผ่ดๅ็ปญbertๅบ้ฎ้ข
|
78 |
+
print(f"้ๅค้ณ้ขๆๆฌ๏ผ{line}")
|
79 |
+
countSame += 1
|
80 |
+
continue
|
81 |
+
if not os.path.isfile(utt):
|
82 |
+
print(f"ๆฒกๆๆพๅฐๅฏนๅบ็้ณ้ข๏ผ{utt}")
|
83 |
+
countNotFound += 1
|
84 |
+
continue
|
85 |
+
audioPaths.add(utt)
|
86 |
spk_utt_map[spk].append(line)
|
87 |
|
88 |
if spk not in spk_id_map.keys():
|
89 |
spk_id_map[spk] = current_sid
|
90 |
current_sid += 1
|
91 |
+
print(f"ๆป้ๅค้ณ้ขๆฐ๏ผ{countSame}๏ผๆปๆชๆพๅฐ็้ณ้ขๆฐ:{countNotFound}")
|
92 |
|
93 |
train_list = []
|
94 |
val_list = []
|
requirements.txt
CHANGED
@@ -21,6 +21,3 @@ unidic-lite
|
|
21 |
cmudict
|
22 |
fugashi
|
23 |
num2words
|
24 |
-
PyPDF2
|
25 |
-
ebooklib
|
26 |
-
beautifulsoup4
|
|
|
21 |
cmudict
|
22 |
fugashi
|
23 |
num2words
|
|
|
|
|
|
text/__init__.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
from text.symbols import *
|
2 |
|
3 |
-
|
4 |
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
5 |
|
6 |
|
|
|
1 |
from text.symbols import *
|
2 |
|
|
|
3 |
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
4 |
|
5 |
|
text/__pycache__/__init__.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/__init__.cpython-39.pyc and b/text/__pycache__/__init__.cpython-39.pyc differ
|
|
text/__pycache__/chinese.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/chinese.cpython-39.pyc and b/text/__pycache__/chinese.cpython-39.pyc differ
|
|
text/__pycache__/chinese_bert.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/chinese_bert.cpython-39.pyc and b/text/__pycache__/chinese_bert.cpython-39.pyc differ
|
|
text/__pycache__/cleaner.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/cleaner.cpython-39.pyc and b/text/__pycache__/cleaner.cpython-39.pyc differ
|
|
text/__pycache__/english_bert_mock.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/english_bert_mock.cpython-39.pyc and b/text/__pycache__/english_bert_mock.cpython-39.pyc differ
|
|
text/__pycache__/japanese.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/japanese.cpython-39.pyc and b/text/__pycache__/japanese.cpython-39.pyc differ
|
|
text/__pycache__/japanese_bert.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/japanese_bert.cpython-39.pyc and b/text/__pycache__/japanese_bert.cpython-39.pyc differ
|
|
text/__pycache__/symbols.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/symbols.cpython-39.pyc and b/text/__pycache__/symbols.cpython-39.pyc differ
|
|
text/__pycache__/tone_sandhi.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/tone_sandhi.cpython-39.pyc and b/text/__pycache__/tone_sandhi.cpython-39.pyc differ
|
|
train_ms.py
CHANGED
@@ -42,12 +42,6 @@ torch.backends.cuda.enable_mem_efficient_sdp(
|
|
42 |
torch.backends.cuda.enable_math_sdp(True)
|
43 |
global_step = 0
|
44 |
|
45 |
-
import os
|
46 |
-
|
47 |
-
os.environ['MASTER_ADDR'] = '127.0.0.1'
|
48 |
-
os.environ['MASTER_PORT'] = '8880'
|
49 |
-
os.environ['WORLD_SIZE'] = '1'
|
50 |
-
os.environ['RANK'] = '0'
|
51 |
|
52 |
def run():
|
53 |
dist.init_process_group(
|
@@ -197,6 +191,8 @@ def run():
|
|
197 |
optim_g.param_groups[0]["initial_lr"] = g_resume_lr
|
198 |
if not optim_d.param_groups[0].get("initial_lr"):
|
199 |
optim_d.param_groups[0]["initial_lr"] = d_resume_lr
|
|
|
|
|
200 |
|
201 |
epoch_str = max(epoch_str, 1)
|
202 |
global_step = (epoch_str - 1) * len(train_loader)
|
|
|
42 |
torch.backends.cuda.enable_math_sdp(True)
|
43 |
global_step = 0
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
def run():
|
47 |
dist.init_process_group(
|
|
|
191 |
optim_g.param_groups[0]["initial_lr"] = g_resume_lr
|
192 |
if not optim_d.param_groups[0].get("initial_lr"):
|
193 |
optim_d.param_groups[0]["initial_lr"] = d_resume_lr
|
194 |
+
if not optim_dur_disc.param_groups[0].get("initial_lr"):
|
195 |
+
optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr
|
196 |
|
197 |
epoch_str = max(epoch_str, 1)
|
198 |
global_step = (epoch_str - 1) * len(train_loader)
|
utils.py
CHANGED
@@ -206,15 +206,14 @@ def get_hparams(init=True):
|
|
206 |
config_path = args.config
|
207 |
config_save_path = os.path.join(model_dir, "config.json")
|
208 |
if init:
|
209 |
-
with open(config_path, "r") as f:
|
210 |
data = f.read()
|
211 |
-
with open(config_save_path, "w") as f:
|
212 |
f.write(data)
|
213 |
else:
|
214 |
-
with open(config_save_path, "r") as f:
|
215 |
data = f.read()
|
216 |
config = json.loads(data)
|
217 |
-
|
218 |
hparams = HParams(**config)
|
219 |
hparams.model_dir = model_dir
|
220 |
return hparams
|
|
|
206 |
config_path = args.config
|
207 |
config_save_path = os.path.join(model_dir, "config.json")
|
208 |
if init:
|
209 |
+
with open(config_path, "r", encoding="utf-8") as f:
|
210 |
data = f.read()
|
211 |
+
with open(config_save_path, "w", encoding="utf-8") as f:
|
212 |
f.write(data)
|
213 |
else:
|
214 |
+
with open(config_save_path, "r", vencoding="utf-8") as f:
|
215 |
data = f.read()
|
216 |
config = json.loads(data)
|
|
|
217 |
hparams = HParams(**config)
|
218 |
hparams.model_dir = model_dir
|
219 |
return hparams
|
webui.py
ADDED
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# flake8: noqa: E402
|
2 |
+
|
3 |
+
import sys, os
|
4 |
+
import logging
|
5 |
+
|
6 |
+
logging.getLogger("numba").setLevel(logging.WARNING)
|
7 |
+
logging.getLogger("markdown_it").setLevel(logging.WARNING)
|
8 |
+
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
9 |
+
logging.getLogger("matplotlib").setLevel(logging.WARNING)
|
10 |
+
|
11 |
+
logging.basicConfig(
|
12 |
+
level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
|
13 |
+
)
|
14 |
+
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
+
|
17 |
+
import torch
|
18 |
+
import argparse
|
19 |
+
import commons
|
20 |
+
import utils
|
21 |
+
from models import SynthesizerTrn
|
22 |
+
from text.symbols import symbols
|
23 |
+
from text import cleaned_text_to_sequence, get_bert
|
24 |
+
from text.cleaner import clean_text
|
25 |
+
import gradio as gr
|
26 |
+
import webbrowser
|
27 |
+
import numpy as np
|
28 |
+
|
29 |
+
net_g = None
|
30 |
+
|
31 |
+
if sys.platform == "darwin" and torch.backends.mps.is_available():
|
32 |
+
device = "mps"
|
33 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
34 |
+
else:
|
35 |
+
device = "cuda"
|
36 |
+
|
37 |
+
|
38 |
+
def get_text(text, language_str, hps):
    """Convert raw text into the tensor inputs the synthesizer expects.

    Args:
        text: Raw input text in the given language.
        language_str: Language code, "ZH" or "JP".
        hps: Hyper-parameter namespace; only ``hps.data.add_blank`` is read.

    Returns:
        Tuple ``(bert, ja_bert, phone, tone, language)`` where ``bert`` is the
        Chinese BERT feature (1024 x n_phones), ``ja_bert`` the Japanese BERT
        feature (768 x n_phones) — the one not matching the input language is
        zero-filled — and ``phone``/``tone``/``language`` are LongTensors of
        per-phoneme ids.
    """
    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    if hps.data.add_blank:
        # Interleave a blank (id 0) between every symbol; word2ph must be
        # scaled the same way so BERT features stay aligned to phonemes.
        phone = commons.intersperse(phone, 0)
        tone = commons.intersperse(tone, 0)
        language = commons.intersperse(language, 0)
        for i in range(len(word2ph)):
            word2ph[i] = word2ph[i] * 2
        word2ph[0] += 1
    bert = get_bert(norm_text, word2ph, language_str, device)
    del word2ph
    # Single informative alignment check (the original asserted twice, once
    # with a bare `phone` message and once with this f-string).
    assert bert.shape[-1] == len(
        phone
    ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"

    # The model consumes both a Chinese (1024-dim) and a Japanese (768-dim)
    # BERT feature per phoneme; zero-fill whichever one was not computed.
    if language_str == "ZH":
        ja_bert = torch.zeros(768, len(phone))
    elif language_str == "JP":
        ja_bert = bert
        bert = torch.zeros(1024, len(phone))
    else:
        bert = torch.zeros(1024, len(phone))
        ja_bert = torch.zeros(768, len(phone))

    phone = torch.LongTensor(phone)
    tone = torch.LongTensor(tone)
    language = torch.LongTensor(language)
    return bert, ja_bert, phone, tone, language
|
71 |
+
|
72 |
+
|
73 |
+
def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, language):
    """Synthesize one text segment and return the waveform.

    Args:
        text: Text to synthesize (single segment, no '|' splitting here).
        sdp_ratio: Blend between stochastic and deterministic duration predictors.
        noise_scale, noise_scale_w, length_scale: Sampling controls forwarded
            to the model.
        sid: Speaker name; mapped to an id via ``hps.data.spk2id``.
        language: Language code passed through to ``get_text``.

    Returns:
        A float32 numpy array with the generated audio on the CPU.
    """
    global net_g
    bert, ja_bert, phone_seq, tone_seq, lang_seq = get_text(text, language, hps)
    with torch.no_grad():
        # Move everything onto the inference device and add a batch axis.
        x_tst = phone_seq.to(device).unsqueeze(0)
        tone_seq = tone_seq.to(device).unsqueeze(0)
        lang_seq = lang_seq.to(device).unsqueeze(0)
        bert = bert.to(device).unsqueeze(0)
        ja_bert = ja_bert.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([phone_seq.size(0)]).to(device)
        del phone_seq
        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
        raw = net_g.infer(
            x_tst,
            x_tst_lengths,
            speakers,
            tone_seq,
            lang_seq,
            bert,
            ja_bert,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0]
        audio = raw.data.cpu().float().numpy()
        # Drop device tensors promptly; empty_cache is a no-op when CUDA
        # has not been initialized.
        del x_tst, tone_seq, lang_seq, bert, x_tst_lengths, speakers
        torch.cuda.empty_cache()
        return audio
|
106 |
+
|
107 |
+
|
108 |
+
def tts_fn(
    text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language
):
    """Gradio callback: split the input on '|' and synthesize each segment.

    One second of silence is appended after every segment (including the
    last) so segments are audibly separated in the concatenated output.

    Returns:
        ``("Success", (sampling_rate, waveform))`` — the tuple shape expected
        by the Message textbox and the gr.Audio output component.
    """
    # Renamed the loop variable from `slice`, which shadowed the builtin.
    segments = text.split("|")
    audio_list = []
    with torch.no_grad():
        for segment in segments:
            audio = infer(
                segment,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language,
            )
            audio_list.append(audio)
            # One second of silence between (and after) segments.
            silence = np.zeros(hps.data.sampling_rate)
            audio_list.append(silence)
    audio_concat = np.concatenate(audio_list)
    return "Success", (hps.data.sampling_rate, audio_concat)
|
129 |
+
|
130 |
+
|
131 |
+
if __name__ == "__main__":
    # Command-line entry point: parse options, load the model, build and
    # launch the Gradio demo UI.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", default="./logs/as/G_8000.pth", help="path of your model"
    )
    parser.add_argument(
        "-c",
        "--config",
        default="./configs/config.json",
        help="path of your config file",
    )
    parser.add_argument(
        "--share", default=False, help="make link public", action="store_true"
    )
    parser.add_argument(
        "-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log"
    )

    args = parser.parse_args()
    if args.debug:
        logger.info("Enable DEBUG-LEVEL log")
        logging.basicConfig(level=logging.DEBUG)
    hps = utils.get_hparams_from_file(args.config)

    # Select the best available device, overriding the module-level default
    # and adding a CPU fallback.
    device = (
        "cuda:0"
        if torch.cuda.is_available()
        else (
            "mps"
            if sys.platform == "darwin" and torch.backends.mps.is_available()
            else "cpu"
        )
    )
    # Build the synthesizer with the architecture described by the config.
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model,
    ).to(device)
    _ = net_g.eval()

    # Load generator weights only; optimizer state is not needed for inference.
    _ = utils.load_checkpoint(args.model, net_g, None, skip_optimizer=True)

    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    languages = ["ZH", "JP"]
    # Assemble the Gradio UI: inputs in the left column, outputs on the right.
    with gr.Blocks() as app:
        with gr.Row():
            with gr.Column():
                text = gr.TextArea(
                    label="Text",
                    placeholder="Input Text Here",
                    value="ๅ่ก่ไธๅ่ก่็ฎ๏ผไธๅ่ก่ๅๅ่ก่็ฎใ",
                )
                speaker = gr.Dropdown(
                    choices=speakers, value=speakers[0], label="Speaker"
                )
                # SDP ratio blends the stochastic and deterministic duration
                # predictors inside the model.
                sdp_ratio = gr.Slider(
                    minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
                )
                noise_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise Scale"
                )
                noise_scale_w = gr.Slider(
                    minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise Scale W"
                )
                length_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=1, step=0.1, label="Length Scale"
                )
                language = gr.Dropdown(
                    choices=languages, value=languages[0], label="Language"
                )
                btn = gr.Button("Generate!", variant="primary")
            with gr.Column():
                text_output = gr.Textbox(label="Message")
                audio_output = gr.Audio(label="Output Audio")

        btn.click(
            tts_fn,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
            ],
            outputs=[text_output, audio_output],
        )

    # NOTE(review): the browser is opened before launch(), so the page may
    # load before the server is ready and need a manual refresh.
    webbrowser.open("http://127.0.0.1:7860")
    app.launch(share=args.share)
|