Artrajz committed on
Commit
960cd20
·
1 Parent(s): 40ac4d0
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitignore +19 -0
  2. Dockerfile +38 -0
  3. Dockerfile_GPU +38 -0
  4. LICENSE +661 -0
  5. README.md +5 -10
  6. README_zh.md +584 -0
  7. api_test.py +575 -0
  8. app.py +74 -0
  9. bert_vits2/LICENSE +674 -0
  10. bert_vits2/README.md +5 -0
  11. bert_vits2/__init__.py +2 -0
  12. bert_vits2/attentions.py +352 -0
  13. bert_vits2/bert_vits2.py +403 -0
  14. bert_vits2/clap_wrapper.py +17 -0
  15. bert_vits2/commons.py +158 -0
  16. bert_vits2/g2pW/pypinyin_G2pW_bv2/__init__.py +5 -0
  17. bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw.py +121 -0
  18. bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/__init__.py +0 -0
  19. bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/bopomofo_to_pinyin_wo_tune_dict.json +1 -0
  20. bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/char_bopomofo_dict.json +0 -0
  21. bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/char_convert.py +44 -0
  22. bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/dataset.py +181 -0
  23. bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/onnx_api.py +273 -0
  24. bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/utils.py +144 -0
  25. bert_vits2/get_emo.py +92 -0
  26. bert_vits2/models.py +799 -0
  27. bert_vits2/models_ja_extra.py +1016 -0
  28. bert_vits2/models_v230.py +1019 -0
  29. bert_vits2/modules.py +459 -0
  30. bert_vits2/requirements.txt +15 -0
  31. bert_vits2/text/__init__.py +25 -0
  32. bert_vits2/text/chinese.py +198 -0
  33. bert_vits2/text/chinese_bert.py +59 -0
  34. bert_vits2/text/chinese_bert_extra.py +60 -0
  35. bert_vits2/text/chinese_v100.py +197 -0
  36. bert_vits2/text/chinese_v240.py +211 -0
  37. bert_vits2/text/cleaner.py +53 -0
  38. bert_vits2/text/cmudict.rep +0 -0
  39. bert_vits2/text/cmudict_cache.pickle +3 -0
  40. bert_vits2/text/english.py +449 -0
  41. bert_vits2/text/english_bert_mock.py +36 -0
  42. bert_vits2/text/english_bert_mock_v200.py +22 -0
  43. bert_vits2/text/english_v200.py +360 -0
  44. bert_vits2/text/english_v230.py +493 -0
  45. bert_vits2/text/japanese.py +428 -0
  46. bert_vits2/text/japanese_bert.py +43 -0
  47. bert_vits2/text/japanese_bert_extra.py +42 -0
  48. bert_vits2/text/japanese_bert_v111.py +22 -0
  49. bert_vits2/text/japanese_bert_v200.py +39 -0
  50. bert_vits2/text/japanese_extra.py +524 -0
.gitignore ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **/__pycache__
2
+ /Model/
3
+ /logs/
4
+ /cache/
5
+ /upload/
6
+ **/pytorch_model.bin
7
+ **/spm.model
8
+ /**/*.pt
9
+ /**/*.onnx
10
+ phrases_dict.txt
11
+ /config.yml
12
+ /config.yaml
13
+ /data/emotional/dimensional_emotion_model/model.onnx
14
+ /data/hubert_soft/hubert-soft-0d54a1f4.pt
15
+ /data/emotional/dimensional_emotion_npy/
16
+ /data/bert/vits_chinese_bert/prosody_model.pt
17
+ /data/emotional/dimensional_emotion_npy/
18
+ /data/models/
19
+ /vits/text/chinese_dialect_lexicons
Dockerfile ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM artrajz/pytorch:2.2.1-cpu-py3.10.11-ubuntu22.04
2
+
3
+ RUN mkdir -p /app
4
+ WORKDIR /app
5
+
6
+ ENV DEBIAN_FRONTEND=noninteractive
7
+
8
+
9
+ RUN apt-get update && \
10
+ apt-get install -yq build-essential espeak-ng cmake wget ca-certificates tzdata && \
11
+ update-ca-certificates && \
12
+ apt-get clean && \
13
+ apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
14
+ rm -rf /var/lib/apt/lists/*
15
+
16
+ # Install jemalloc
17
+ RUN wget https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2 && \
18
+ tar -xvf jemalloc-5.3.0.tar.bz2 && \
19
+ cd jemalloc-5.3.0 && \
20
+ ./configure && \
21
+ make -j$(nproc) && \
22
+ make install && \
23
+ cd .. && \
24
+ rm -rf jemalloc-5.3.0* && \
25
+ ldconfig
26
+
27
+ ENV LD_PRELOAD=/usr/local/lib/libjemalloc.so
28
+
29
+ COPY requirements.txt /app/
30
+ RUN pip install gunicorn --no-cache-dir && \
31
+ pip install -r requirements.txt --no-cache-dir && \
32
+ rm -rf /root/.cache/pip/*
33
+
34
+ COPY . /app
35
+
36
+ EXPOSE 23456
37
+
38
+ CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
Dockerfile_GPU ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM artrajz/pytorch:2.2.1-cu118-py3.10.11-ubuntu22.04
2
+
3
+ RUN mkdir -p /app
4
+ WORKDIR /app
5
+
6
+ ENV DEBIAN_FRONTEND=noninteractive
7
+
8
+ RUN apt-get update && \
9
+ apt-get install -yq build-essential espeak-ng cmake wget ca-certificates tzdata && \
10
+ update-ca-certificates && \
11
+ apt-get clean && \
12
+ apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
13
+ rm -rf /var/lib/apt/lists/*
14
+
15
+
16
+ # Install jemalloc
17
+ RUN wget https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2 && \
18
+ tar -xvf jemalloc-5.3.0.tar.bz2 && \
19
+ cd jemalloc-5.3.0 && \
20
+ ./configure && \
21
+ make -j$(nproc) && \
22
+ make install && \
23
+ cd .. && \
24
+ rm -rf jemalloc-5.3.0* && \
25
+ ldconfig
26
+
27
+ ENV LD_PRELOAD=/usr/local/lib/libjemalloc.so
28
+
29
+ COPY requirements.txt /app/
30
+ RUN pip install gunicorn --no-cache-dir && \
31
+ pip install -r requirements.txt --no-cache-dir && \
32
+ rm -rf /root/.cache/pip/*
33
+
34
+ COPY . /app
35
+
36
+ EXPOSE 23456
37
+
38
+ CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
LICENSE ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GNU AFFERO GENERAL PUBLIC LICENSE
2
+ Version 3, 19 November 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU Affero General Public License is a free, copyleft license for
11
+ software and other kinds of works, specifically designed to ensure
12
+ cooperation with the community in the case of network server software.
13
+
14
+ The licenses for most software and other practical works are designed
15
+ to take away your freedom to share and change the works. By contrast,
16
+ our General Public Licenses are intended to guarantee your freedom to
17
+ share and change all versions of a program--to make sure it remains free
18
+ software for all its users.
19
+
20
+ When we speak of free software, we are referring to freedom, not
21
+ price. Our General Public Licenses are designed to make sure that you
22
+ have the freedom to distribute copies of free software (and charge for
23
+ them if you wish), that you receive source code or can get it if you
24
+ want it, that you can change the software or use pieces of it in new
25
+ free programs, and that you know you can do these things.
26
+
27
+ Developers that use our General Public Licenses protect your rights
28
+ with two steps: (1) assert copyright on the software, and (2) offer
29
+ you this License which gives you legal permission to copy, distribute
30
+ and/or modify the software.
31
+
32
+ A secondary benefit of defending all users' freedom is that
33
+ improvements made in alternate versions of the program, if they
34
+ receive widespread use, become available for other developers to
35
+ incorporate. Many developers of free software are heartened and
36
+ encouraged by the resulting cooperation. However, in the case of
37
+ software used on network servers, this result may fail to come about.
38
+ The GNU General Public License permits making a modified version and
39
+ letting the public access it on a server without ever releasing its
40
+ source code to the public.
41
+
42
+ The GNU Affero General Public License is designed specifically to
43
+ ensure that, in such cases, the modified source code becomes available
44
+ to the community. It requires the operator of a network server to
45
+ provide the source code of the modified version running there to the
46
+ users of that server. Therefore, public use of a modified version, on
47
+ a publicly accessible server, gives the public access to the source
48
+ code of the modified version.
49
+
50
+ An older license, called the Affero General Public License and
51
+ published by Affero, was designed to accomplish similar goals. This is
52
+ a different license, not a version of the Affero GPL, but Affero has
53
+ released a new version of the Affero GPL which permits relicensing under
54
+ this license.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ TERMS AND CONDITIONS
60
+
61
+ 0. Definitions.
62
+
63
+ "This License" refers to version 3 of the GNU Affero General Public License.
64
+
65
+ "Copyright" also means copyright-like laws that apply to other kinds of
66
+ works, such as semiconductor masks.
67
+
68
+ "The Program" refers to any copyrightable work licensed under this
69
+ License. Each licensee is addressed as "you". "Licensees" and
70
+ "recipients" may be individuals or organizations.
71
+
72
+ To "modify" a work means to copy from or adapt all or part of the work
73
+ in a fashion requiring copyright permission, other than the making of an
74
+ exact copy. The resulting work is called a "modified version" of the
75
+ earlier work or a work "based on" the earlier work.
76
+
77
+ A "covered work" means either the unmodified Program or a work based
78
+ on the Program.
79
+
80
+ To "propagate" a work means to do anything with it that, without
81
+ permission, would make you directly or secondarily liable for
82
+ infringement under applicable copyright law, except executing it on a
83
+ computer or modifying a private copy. Propagation includes copying,
84
+ distribution (with or without modification), making available to the
85
+ public, and in some countries other activities as well.
86
+
87
+ To "convey" a work means any kind of propagation that enables other
88
+ parties to make or receive copies. Mere interaction with a user through
89
+ a computer network, with no transfer of a copy, is not conveying.
90
+
91
+ An interactive user interface displays "Appropriate Legal Notices"
92
+ to the extent that it includes a convenient and prominently visible
93
+ feature that (1) displays an appropriate copyright notice, and (2)
94
+ tells the user that there is no warranty for the work (except to the
95
+ extent that warranties are provided), that licensees may convey the
96
+ work under this License, and how to view a copy of this License. If
97
+ the interface presents a list of user commands or options, such as a
98
+ menu, a prominent item in the list meets this criterion.
99
+
100
+ 1. Source Code.
101
+
102
+ The "source code" for a work means the preferred form of the work
103
+ for making modifications to it. "Object code" means any non-source
104
+ form of a work.
105
+
106
+ A "Standard Interface" means an interface that either is an official
107
+ standard defined by a recognized standards body, or, in the case of
108
+ interfaces specified for a particular programming language, one that
109
+ is widely used among developers working in that language.
110
+
111
+ The "System Libraries" of an executable work include anything, other
112
+ than the work as a whole, that (a) is included in the normal form of
113
+ packaging a Major Component, but which is not part of that Major
114
+ Component, and (b) serves only to enable use of the work with that
115
+ Major Component, or to implement a Standard Interface for which an
116
+ implementation is available to the public in source code form. A
117
+ "Major Component", in this context, means a major essential component
118
+ (kernel, window system, and so on) of the specific operating system
119
+ (if any) on which the executable work runs, or a compiler used to
120
+ produce the work, or an object code interpreter used to run it.
121
+
122
+ The "Corresponding Source" for a work in object code form means all
123
+ the source code needed to generate, install, and (for an executable
124
+ work) run the object code and to modify the work, including scripts to
125
+ control those activities. However, it does not include the work's
126
+ System Libraries, or general-purpose tools or generally available free
127
+ programs which are used unmodified in performing those activities but
128
+ which are not part of the work. For example, Corresponding Source
129
+ includes interface definition files associated with source files for
130
+ the work, and the source code for shared libraries and dynamically
131
+ linked subprograms that the work is specifically designed to require,
132
+ such as by intimate data communication or control flow between those
133
+ subprograms and other parts of the work.
134
+
135
+ The Corresponding Source need not include anything that users
136
+ can regenerate automatically from other parts of the Corresponding
137
+ Source.
138
+
139
+ The Corresponding Source for a work in source code form is that
140
+ same work.
141
+
142
+ 2. Basic Permissions.
143
+
144
+ All rights granted under this License are granted for the term of
145
+ copyright on the Program, and are irrevocable provided the stated
146
+ conditions are met. This License explicitly affirms your unlimited
147
+ permission to run the unmodified Program. The output from running a
148
+ covered work is covered by this License only if the output, given its
149
+ content, constitutes a covered work. This License acknowledges your
150
+ rights of fair use or other equivalent, as provided by copyright law.
151
+
152
+ You may make, run and propagate covered works that you do not
153
+ convey, without conditions so long as your license otherwise remains
154
+ in force. You may convey covered works to others for the sole purpose
155
+ of having them make modifications exclusively for you, or provide you
156
+ with facilities for running those works, provided that you comply with
157
+ the terms of this License in conveying all material for which you do
158
+ not control copyright. Those thus making or running the covered works
159
+ for you must do so exclusively on your behalf, under your direction
160
+ and control, on terms that prohibit them from making any copies of
161
+ your copyrighted material outside their relationship with you.
162
+
163
+ Conveying under any other circumstances is permitted solely under
164
+ the conditions stated below. Sublicensing is not allowed; section 10
165
+ makes it unnecessary.
166
+
167
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168
+
169
+ No covered work shall be deemed part of an effective technological
170
+ measure under any applicable law fulfilling obligations under article
171
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172
+ similar laws prohibiting or restricting circumvention of such
173
+ measures.
174
+
175
+ When you convey a covered work, you waive any legal power to forbid
176
+ circumvention of technological measures to the extent such circumvention
177
+ is effected by exercising rights under this License with respect to
178
+ the covered work, and you disclaim any intention to limit operation or
179
+ modification of the work as a means of enforcing, against the work's
180
+ users, your or third parties' legal rights to forbid circumvention of
181
+ technological measures.
182
+
183
+ 4. Conveying Verbatim Copies.
184
+
185
+ You may convey verbatim copies of the Program's source code as you
186
+ receive it, in any medium, provided that you conspicuously and
187
+ appropriately publish on each copy an appropriate copyright notice;
188
+ keep intact all notices stating that this License and any
189
+ non-permissive terms added in accord with section 7 apply to the code;
190
+ keep intact all notices of the absence of any warranty; and give all
191
+ recipients a copy of this License along with the Program.
192
+
193
+ You may charge any price or no price for each copy that you convey,
194
+ and you may offer support or warranty protection for a fee.
195
+
196
+ 5. Conveying Modified Source Versions.
197
+
198
+ You may convey a work based on the Program, or the modifications to
199
+ produce it from the Program, in the form of source code under the
200
+ terms of section 4, provided that you also meet all of these conditions:
201
+
202
+ a) The work must carry prominent notices stating that you modified
203
+ it, and giving a relevant date.
204
+
205
+ b) The work must carry prominent notices stating that it is
206
+ released under this License and any conditions added under section
207
+ 7. This requirement modifies the requirement in section 4 to
208
+ "keep intact all notices".
209
+
210
+ c) You must license the entire work, as a whole, under this
211
+ License to anyone who comes into possession of a copy. This
212
+ License will therefore apply, along with any applicable section 7
213
+ additional terms, to the whole of the work, and all its parts,
214
+ regardless of how they are packaged. This License gives no
215
+ permission to license the work in any other way, but it does not
216
+ invalidate such permission if you have separately received it.
217
+
218
+ d) If the work has interactive user interfaces, each must display
219
+ Appropriate Legal Notices; however, if the Program has interactive
220
+ interfaces that do not display Appropriate Legal Notices, your
221
+ work need not make them do so.
222
+
223
+ A compilation of a covered work with other separate and independent
224
+ works, which are not by their nature extensions of the covered work,
225
+ and which are not combined with it such as to form a larger program,
226
+ in or on a volume of a storage or distribution medium, is called an
227
+ "aggregate" if the compilation and its resulting copyright are not
228
+ used to limit the access or legal rights of the compilation's users
229
+ beyond what the individual works permit. Inclusion of a covered work
230
+ in an aggregate does not cause this License to apply to the other
231
+ parts of the aggregate.
232
+
233
+ 6. Conveying Non-Source Forms.
234
+
235
+ You may convey a covered work in object code form under the terms
236
+ of sections 4 and 5, provided that you also convey the
237
+ machine-readable Corresponding Source under the terms of this License,
238
+ in one of these ways:
239
+
240
+ a) Convey the object code in, or embodied in, a physical product
241
+ (including a physical distribution medium), accompanied by the
242
+ Corresponding Source fixed on a durable physical medium
243
+ customarily used for software interchange.
244
+
245
+ b) Convey the object code in, or embodied in, a physical product
246
+ (including a physical distribution medium), accompanied by a
247
+ written offer, valid for at least three years and valid for as
248
+ long as you offer spare parts or customer support for that product
249
+ model, to give anyone who possesses the object code either (1) a
250
+ copy of the Corresponding Source for all the software in the
251
+ product that is covered by this License, on a durable physical
252
+ medium customarily used for software interchange, for a price no
253
+ more than your reasonable cost of physically performing this
254
+ conveying of source, or (2) access to copy the
255
+ Corresponding Source from a network server at no charge.
256
+
257
+ c) Convey individual copies of the object code with a copy of the
258
+ written offer to provide the Corresponding Source. This
259
+ alternative is allowed only occasionally and noncommercially, and
260
+ only if you received the object code with such an offer, in accord
261
+ with subsection 6b.
262
+
263
+ d) Convey the object code by offering access from a designated
264
+ place (gratis or for a charge), and offer equivalent access to the
265
+ Corresponding Source in the same way through the same place at no
266
+ further charge. You need not require recipients to copy the
267
+ Corresponding Source along with the object code. If the place to
268
+ copy the object code is a network server, the Corresponding Source
269
+ may be on a different server (operated by you or a third party)
270
+ that supports equivalent copying facilities, provided you maintain
271
+ clear directions next to the object code saying where to find the
272
+ Corresponding Source. Regardless of what server hosts the
273
+ Corresponding Source, you remain obligated to ensure that it is
274
+ available for as long as needed to satisfy these requirements.
275
+
276
+ e) Convey the object code using peer-to-peer transmission, provided
277
+ you inform other peers where the object code and Corresponding
278
+ Source of the work are being offered to the general public at no
279
+ charge under subsection 6d.
280
+
281
+ A separable portion of the object code, whose source code is excluded
282
+ from the Corresponding Source as a System Library, need not be
283
+ included in conveying the object code work.
284
+
285
+ A "User Product" is either (1) a "consumer product", which means any
286
+ tangible personal property which is normally used for personal, family,
287
+ or household purposes, or (2) anything designed or sold for incorporation
288
+ into a dwelling. In determining whether a product is a consumer product,
289
+ doubtful cases shall be resolved in favor of coverage. For a particular
290
+ product received by a particular user, "normally used" refers to a
291
+ typical or common use of that class of product, regardless of the status
292
+ of the particular user or of the way in which the particular user
293
+ actually uses, or expects or is expected to use, the product. A product
294
+ is a consumer product regardless of whether the product has substantial
295
+ commercial, industrial or non-consumer uses, unless such uses represent
296
+ the only significant mode of use of the product.
297
+
298
+ "Installation Information" for a User Product means any methods,
299
+ procedures, authorization keys, or other information required to install
300
+ and execute modified versions of a covered work in that User Product from
301
+ a modified version of its Corresponding Source. The information must
302
+ suffice to ensure that the continued functioning of the modified object
303
+ code is in no case prevented or interfered with solely because
304
+ modification has been made.
305
+
306
+ If you convey an object code work under this section in, or with, or
307
+ specifically for use in, a User Product, and the conveying occurs as
308
+ part of a transaction in which the right of possession and use of the
309
+ User Product is transferred to the recipient in perpetuity or for a
310
+ fixed term (regardless of how the transaction is characterized), the
311
+ Corresponding Source conveyed under this section must be accompanied
312
+ by the Installation Information. But this requirement does not apply
313
+ if neither you nor any third party retains the ability to install
314
+ modified object code on the User Product (for example, the work has
315
+ been installed in ROM).
316
+
317
+ The requirement to provide Installation Information does not include a
318
+ requirement to continue to provide support service, warranty, or updates
319
+ for a work that has been modified or installed by the recipient, or for
320
+ the User Product in which it has been modified or installed. Access to a
321
+ network may be denied when the modification itself materially and
322
+ adversely affects the operation of the network or violates the rules and
323
+ protocols for communication across the network.
324
+
325
+ Corresponding Source conveyed, and Installation Information provided,
326
+ in accord with this section must be in a format that is publicly
327
+ documented (and with an implementation available to the public in
328
+ source code form), and must require no special password or key for
329
+ unpacking, reading or copying.
330
+
331
+ 7. Additional Terms.
332
+
333
+ "Additional permissions" are terms that supplement the terms of this
334
+ License by making exceptions from one or more of its conditions.
335
+ Additional permissions that are applicable to the entire Program shall
336
+ be treated as though they were included in this License, to the extent
337
+ that they are valid under applicable law. If additional permissions
338
+ apply only to part of the Program, that part may be used separately
339
+ under those permissions, but the entire Program remains governed by
340
+ this License without regard to the additional permissions.
341
+
342
+ When you convey a copy of a covered work, you may at your option
343
+ remove any additional permissions from that copy, or from any part of
344
+ it. (Additional permissions may be written to require their own
345
+ removal in certain cases when you modify the work.) You may place
346
+ additional permissions on material, added by you to a covered work,
347
+ for which you have or can give appropriate copyright permission.
348
+
349
+ Notwithstanding any other provision of this License, for material you
350
+ add to a covered work, you may (if authorized by the copyright holders of
351
+ that material) supplement the terms of this License with terms:
352
+
353
+ a) Disclaiming warranty or limiting liability differently from the
354
+ terms of sections 15 and 16 of this License; or
355
+
356
+ b) Requiring preservation of specified reasonable legal notices or
357
+ author attributions in that material or in the Appropriate Legal
358
+ Notices displayed by works containing it; or
359
+
360
+ c) Prohibiting misrepresentation of the origin of that material, or
361
+ requiring that modified versions of such material be marked in
362
+ reasonable ways as different from the original version; or
363
+
364
+ d) Limiting the use for publicity purposes of names of licensors or
365
+ authors of the material; or
366
+
367
+ e) Declining to grant rights under trademark law for use of some
368
+ trade names, trademarks, or service marks; or
369
+
370
+ f) Requiring indemnification of licensors and authors of that
371
+ material by anyone who conveys the material (or modified versions of
372
+ it) with contractual assumptions of liability to the recipient, for
373
+ any liability that these contractual assumptions directly impose on
374
+ those licensors and authors.
375
+
376
+ All other non-permissive additional terms are considered "further
377
+ restrictions" within the meaning of section 10. If the Program as you
378
+ received it, or any part of it, contains a notice stating that it is
379
+ governed by this License along with a term that is a further
380
+ restriction, you may remove that term. If a license document contains
381
+ a further restriction but permits relicensing or conveying under this
382
+ License, you may add to a covered work material governed by the terms
383
+ of that license document, provided that the further restriction does
384
+ not survive such relicensing or conveying.
385
+
386
+ If you add terms to a covered work in accord with this section, you
387
+ must place, in the relevant source files, a statement of the
388
+ additional terms that apply to those files, or a notice indicating
389
+ where to find the applicable terms.
390
+
391
+ Additional terms, permissive or non-permissive, may be stated in the
392
+ form of a separately written license, or stated as exceptions;
393
+ the above requirements apply either way.
394
+
395
+ 8. Termination.
396
+
397
+ You may not propagate or modify a covered work except as expressly
398
+ provided under this License. Any attempt otherwise to propagate or
399
+ modify it is void, and will automatically terminate your rights under
400
+ this License (including any patent licenses granted under the third
401
+ paragraph of section 11).
402
+
403
+ However, if you cease all violation of this License, then your
404
+ license from a particular copyright holder is reinstated (a)
405
+ provisionally, unless and until the copyright holder explicitly and
406
+ finally terminates your license, and (b) permanently, if the copyright
407
+ holder fails to notify you of the violation by some reasonable means
408
+ prior to 60 days after the cessation.
409
+
410
+ Moreover, your license from a particular copyright holder is
411
+ reinstated permanently if the copyright holder notifies you of the
412
+ violation by some reasonable means, this is the first time you have
413
+ received notice of violation of this License (for any work) from that
414
+ copyright holder, and you cure the violation prior to 30 days after
415
+ your receipt of the notice.
416
+
417
+ Termination of your rights under this section does not terminate the
418
+ licenses of parties who have received copies or rights from you under
419
+ this License. If your rights have been terminated and not permanently
420
+ reinstated, you do not qualify to receive new licenses for the same
421
+ material under section 10.
422
+
423
+ 9. Acceptance Not Required for Having Copies.
424
+
425
+ You are not required to accept this License in order to receive or
426
+ run a copy of the Program. Ancillary propagation of a covered work
427
+ occurring solely as a consequence of using peer-to-peer transmission
428
+ to receive a copy likewise does not require acceptance. However,
429
+ nothing other than this License grants you permission to propagate or
430
+ modify any covered work. These actions infringe copyright if you do
431
+ not accept this License. Therefore, by modifying or propagating a
432
+ covered work, you indicate your acceptance of this License to do so.
433
+
434
+ 10. Automatic Licensing of Downstream Recipients.
435
+
436
+ Each time you convey a covered work, the recipient automatically
437
+ receives a license from the original licensors, to run, modify and
438
+ propagate that work, subject to this License. You are not responsible
439
+ for enforcing compliance by third parties with this License.
440
+
441
+ An "entity transaction" is a transaction transferring control of an
442
+ organization, or substantially all assets of one, or subdividing an
443
+ organization, or merging organizations. If propagation of a covered
444
+ work results from an entity transaction, each party to that
445
+ transaction who receives a copy of the work also receives whatever
446
+ licenses to the work the party's predecessor in interest had or could
447
+ give under the previous paragraph, plus a right to possession of the
448
+ Corresponding Source of the work from the predecessor in interest, if
449
+ the predecessor has it or can get it with reasonable efforts.
450
+
451
+ You may not impose any further restrictions on the exercise of the
452
+ rights granted or affirmed under this License. For example, you may
453
+ not impose a license fee, royalty, or other charge for exercise of
454
+ rights granted under this License, and you may not initiate litigation
455
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
456
+ any patent claim is infringed by making, using, selling, offering for
457
+ sale, or importing the Program or any portion of it.
458
+
459
+ 11. Patents.
460
+
461
+ A "contributor" is a copyright holder who authorizes use under this
462
+ License of the Program or a work on which the Program is based. The
463
+ work thus licensed is called the contributor's "contributor version".
464
+
465
+ A contributor's "essential patent claims" are all patent claims
466
+ owned or controlled by the contributor, whether already acquired or
467
+ hereafter acquired, that would be infringed by some manner, permitted
468
+ by this License, of making, using, or selling its contributor version,
469
+ but do not include claims that would be infringed only as a
470
+ consequence of further modification of the contributor version. For
471
+ purposes of this definition, "control" includes the right to grant
472
+ patent sublicenses in a manner consistent with the requirements of
473
+ this License.
474
+
475
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
476
+ patent license under the contributor's essential patent claims, to
477
+ make, use, sell, offer for sale, import and otherwise run, modify and
478
+ propagate the contents of its contributor version.
479
+
480
+ In the following three paragraphs, a "patent license" is any express
481
+ agreement or commitment, however denominated, not to enforce a patent
482
+ (such as an express permission to practice a patent or covenant not to
483
+ sue for patent infringement). To "grant" such a patent license to a
484
+ party means to make such an agreement or commitment not to enforce a
485
+ patent against the party.
486
+
487
+ If you convey a covered work, knowingly relying on a patent license,
488
+ and the Corresponding Source of the work is not available for anyone
489
+ to copy, free of charge and under the terms of this License, through a
490
+ publicly available network server or other readily accessible means,
491
+ then you must either (1) cause the Corresponding Source to be so
492
+ available, or (2) arrange to deprive yourself of the benefit of the
493
+ patent license for this particular work, or (3) arrange, in a manner
494
+ consistent with the requirements of this License, to extend the patent
495
+ license to downstream recipients. "Knowingly relying" means you have
496
+ actual knowledge that, but for the patent license, your conveying the
497
+ covered work in a country, or your recipient's use of the covered work
498
+ in a country, would infringe one or more identifiable patents in that
499
+ country that you have reason to believe are valid.
500
+
501
+ If, pursuant to or in connection with a single transaction or
502
+ arrangement, you convey, or propagate by procuring conveyance of, a
503
+ covered work, and grant a patent license to some of the parties
504
+ receiving the covered work authorizing them to use, propagate, modify
505
+ or convey a specific copy of the covered work, then the patent license
506
+ you grant is automatically extended to all recipients of the covered
507
+ work and works based on it.
508
+
509
+ A patent license is "discriminatory" if it does not include within
510
+ the scope of its coverage, prohibits the exercise of, or is
511
+ conditioned on the non-exercise of one or more of the rights that are
512
+ specifically granted under this License. You may not convey a covered
513
+ work if you are a party to an arrangement with a third party that is
514
+ in the business of distributing software, under which you make payment
515
+ to the third party based on the extent of your activity of conveying
516
+ the work, and under which the third party grants, to any of the
517
+ parties who would receive the covered work from you, a discriminatory
518
+ patent license (a) in connection with copies of the covered work
519
+ conveyed by you (or copies made from those copies), or (b) primarily
520
+ for and in connection with specific products or compilations that
521
+ contain the covered work, unless you entered into that arrangement,
522
+ or that patent license was granted, prior to 28 March 2007.
523
+
524
+ Nothing in this License shall be construed as excluding or limiting
525
+ any implied license or other defenses to infringement that may
526
+ otherwise be available to you under applicable patent law.
527
+
528
+ 12. No Surrender of Others' Freedom.
529
+
530
+ If conditions are imposed on you (whether by court order, agreement or
531
+ otherwise) that contradict the conditions of this License, they do not
532
+ excuse you from the conditions of this License. If you cannot convey a
533
+ covered work so as to satisfy simultaneously your obligations under this
534
+ License and any other pertinent obligations, then as a consequence you may
535
+ not convey it at all. For example, if you agree to terms that obligate you
536
+ to collect a royalty for further conveying from those to whom you convey
537
+ the Program, the only way you could satisfy both those terms and this
538
+ License would be to refrain entirely from conveying the Program.
539
+
540
+ 13. Remote Network Interaction; Use with the GNU General Public License.
541
+
542
+ Notwithstanding any other provision of this License, if you modify the
543
+ Program, your modified version must prominently offer all users
544
+ interacting with it remotely through a computer network (if your version
545
+ supports such interaction) an opportunity to receive the Corresponding
546
+ Source of your version by providing access to the Corresponding Source
547
+ from a network server at no charge, through some standard or customary
548
+ means of facilitating copying of software. This Corresponding Source
549
+ shall include the Corresponding Source for any work covered by version 3
550
+ of the GNU General Public License that is incorporated pursuant to the
551
+ following paragraph.
552
+
553
+ Notwithstanding any other provision of this License, you have
554
+ permission to link or combine any covered work with a work licensed
555
+ under version 3 of the GNU General Public License into a single
556
+ combined work, and to convey the resulting work. The terms of this
557
+ License will continue to apply to the part which is the covered work,
558
+ but the work with which it is combined will remain governed by version
559
+ 3 of the GNU General Public License.
560
+
561
+ 14. Revised Versions of this License.
562
+
563
+ The Free Software Foundation may publish revised and/or new versions of
564
+ the GNU Affero General Public License from time to time. Such new versions
565
+ will be similar in spirit to the present version, but may differ in detail to
566
+ address new problems or concerns.
567
+
568
+ Each version is given a distinguishing version number. If the
569
+ Program specifies that a certain numbered version of the GNU Affero General
570
+ Public License "or any later version" applies to it, you have the
571
+ option of following the terms and conditions either of that numbered
572
+ version or of any later version published by the Free Software
573
+ Foundation. If the Program does not specify a version number of the
574
+ GNU Affero General Public License, you may choose any version ever published
575
+ by the Free Software Foundation.
576
+
577
+ If the Program specifies that a proxy can decide which future
578
+ versions of the GNU Affero General Public License can be used, that proxy's
579
+ public statement of acceptance of a version permanently authorizes you
580
+ to choose that version for the Program.
581
+
582
+ Later license versions may give you additional or different
583
+ permissions. However, no additional obligations are imposed on any
584
+ author or copyright holder as a result of your choosing to follow a
585
+ later version.
586
+
587
+ 15. Disclaimer of Warranty.
588
+
589
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597
+
598
+ 16. Limitation of Liability.
599
+
600
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608
+ SUCH DAMAGES.
609
+
610
+ 17. Interpretation of Sections 15 and 16.
611
+
612
+ If the disclaimer of warranty and limitation of liability provided
613
+ above cannot be given local legal effect according to their terms,
614
+ reviewing courts shall apply local law that most closely approximates
615
+ an absolute waiver of all civil liability in connection with the
616
+ Program, unless a warranty or assumption of liability accompanies a
617
+ copy of the Program in return for a fee.
618
+
619
+ END OF TERMS AND CONDITIONS
620
+
621
+ How to Apply These Terms to Your New Programs
622
+
623
+ If you develop a new program, and you want it to be of the greatest
624
+ possible use to the public, the best way to achieve this is to make it
625
+ free software which everyone can redistribute and change under these terms.
626
+
627
+ To do so, attach the following notices to the program. It is safest
628
+ to attach them to the start of each source file to most effectively
629
+ state the exclusion of warranty; and each file should have at least
630
+ the "copyright" line and a pointer to where the full notice is found.
631
+
632
+ <one line to give the program's name and a brief idea of what it does.>
633
+ Copyright (C) <year> <name of author>
634
+
635
+ This program is free software: you can redistribute it and/or modify
636
+ it under the terms of the GNU Affero General Public License as published
637
+ by the Free Software Foundation, either version 3 of the License, or
638
+ (at your option) any later version.
639
+
640
+ This program is distributed in the hope that it will be useful,
641
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
642
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643
+ GNU Affero General Public License for more details.
644
+
645
+ You should have received a copy of the GNU Affero General Public License
646
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
647
+
648
+ Also add information on how to contact you by electronic and paper mail.
649
+
650
+ If your software can interact with users remotely through a computer
651
+ network, you should also make sure that it provides a way for users to
652
+ get its source. For example, if your program is a web application, its
653
+ interface could display a "Source" link that leads users to an archive
654
+ of the code. There are many ways you could offer source, and different
655
+ solutions will be better for different programs; see section 13 for the
656
+ specific requirements.
657
+
658
+ You should also get your employer (if you work as a programmer) or school,
659
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
660
+ For more information on this, and how to apply and follow the GNU AGPL, see
661
+ <https://www.gnu.org/licenses/>.
README.md CHANGED
@@ -1,12 +1,7 @@
1
- ---
2
- title: Vits Simple Api Gsv
3
- emoji: 🐨
4
- colorFrom: yellow
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 4.21.0
 
 
8
  app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ license: agpl-3.0
2
+ title: vits-simple-api-gsv
 
 
 
3
  sdk: gradio
4
+ pinned: true
5
+ python_version: 3.10.11
6
+ emoji: 👀
7
  app_file: app.py
 
 
 
 
README_zh.md ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="title" align=center>
2
+ <h1>vits-simple-api</h1>
3
+ <div>Simply call the vits api</div>
4
+ <br/>
5
+ <br/>
6
+ <p>
7
+ <img src="https://img.shields.io/github/license/Artrajz/vits-simple-api">
8
+ <img src="https://img.shields.io/badge/python-3.10-green">
9
+ <a href="https://hub.docker.com/r/artrajz/vits-simple-api">
10
+ <img src="https://img.shields.io/docker/pulls/artrajz/vits-simple-api"></a>
11
+ </p>
12
+ <a href="https://github.com/Artrajz/vits-simple-api/blob/main/README.md">English</a>|<a href="https://github.com/Artrajz/vits-simple-api/blob/main/README_zh.md">中文文档</a>
13
+ <br/>
14
+ </div>
15
+
16
+
17
+
18
+
19
+
20
+ # Feature
21
+
22
+ - [x] VITS语音合成,语音转换
23
+ - [x] HuBert-soft VITS模型
24
+ - [x] W2V2 VITS / [emotional-vits](https://github.com/innnky/emotional-vits)维度情感模型
25
+ - [x] [vits_chinese](https://github.com/PlayVoice/vits_chinese)
26
+ - [x] [Bert-VITS2](https://github.com/Stardust-minus/Bert-VITS2)
27
+ - [x] [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
28
+ - [x] 加载多模型
29
+ - [x] 自动识别语言并处理,根据模型的cleaner设置语言类型识别的范围,支持自定义语言类型范围
30
+ - [x] 自定义默认参数
31
+ - [x] 长文本批处理
32
+ - [x] GPU加速推理
33
+ - [x] SSML语音合成标记语言(完善中...)
34
+
35
+
36
+ ## 在线demo
37
+
38
+ [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Artrajz/vits-simple-api) 感谢hugging face喵
39
+
40
+ 注意不同的id支持的语言可能有所不同。[speakers](https://artrajz-vits-simple-api.hf.space/voice/speakers)
41
+
42
+
43
+ - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164`
44
+ - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=我觉得1%2B1≠3&id=164&lang=zh`(get中一些字符需要转义不然会被过滤掉)
45
+ - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=Difficult the first time, easy the second.&id=4`
46
+ - 激动:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=111`
47
+ - 小声:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=2077`
48
+
49
+ https://user-images.githubusercontent.com/73542220/237995061-c1f25b4e-dd86-438a-9363-4bb1fe65b425.mov
50
+
51
+ # 部署
52
+
53
+ 有三种部署方式可供选择。不论你选择哪一种,完成部署后都需要导入模型才能使用。
54
+
55
+ ## Docker部署(Linux推荐)
56
+
57
+ ### 步骤1: 镜像拉取
58
+
59
+ 运行以下命令以拉取 Docker 镜像,根据脚本中的提示选择需要下载的必要文件和拉取镜像:
60
+
61
+ ```bash
62
+ bash -c "$(wget -O- https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/vits-simple-api-installer-latest.sh)"
63
+ ```
64
+
65
+ 项目配置文件以及模型文件夹的默认路径为`/usr/local/vits-simple-api/`
66
+
67
+ ### 步骤2: 启动
68
+
69
+ 运行以下命令启动容器:
70
+
71
+ ```bash
72
+ docker-compose up -d
73
+ ```
74
+
75
+ ### 镜像更新
76
+
77
+ 运行以下命令更新镜像:
78
+
79
+ ```bash
80
+ docker-compose pull
81
+ ```
82
+
83
+ 重新启动容器:
84
+
85
+ ```bash
86
+ docker-compose up -d
87
+ ```
88
+
89
+ ## 虚拟环境部署
90
+
91
+ ### 步骤1: 克隆项目
92
+
93
+ 使用以下命令克隆项目仓库:
94
+
95
+ ```bash
96
+ git clone https://github.com/Artrajz/vits-simple-api.git
97
+ ```
98
+
99
+ ### 步骤2: 下载 Python 依赖
100
+
101
+ 推荐使用 Python 3.10版本的虚拟环境。运行以下命令安装项目所需的 Python 依赖:
102
+
103
+ 如果遇到某些无法安装的依赖,请看下面的常见问题。
104
+
105
+ ```bash
106
+ pip install -r requirements.txt
107
+ ```
108
+
109
+ ### 步骤3: 启动
110
+
111
+ 运行以下命令启动程序:
112
+
113
+ ```bash
114
+ python app.py
115
+ ```
116
+
117
+ ## Windows快速部署包
118
+
119
+ ### 步骤1:下载并解压部署包
120
+
121
+ 进入[releases页面](https://github.com/Artrajz/vits-simple-api/releases)下载并解压最新的部署包
122
+
123
+ ### 步骤2:启动
124
+
125
+ 运行start.bat启动程序
126
+
127
+ ## 模型加载
128
+
129
+ ### 步骤1: 下载 VITS 模型
130
+
131
+ 将 VITS 模型文件下载并放入 `data/models`文件夹。
132
+
133
+ ### 步骤2: 加载模型
134
+
135
+ #### 自动加载模型
136
+
137
+ v0.6.6版本之后默认会**自动加载**`data/models`文件夹下的所有模型,方便新手使用。
138
+
139
+ #### 手动加载模型
140
+
141
+ 首次启动之后会生成一个config.yaml配置文件,需要将`tts_config.auto_load`改为`false`以启用手动加载模式。
142
+
143
+ 可以修改配置文件中的`tts_config.models`或者在浏览器中进入管理员后台进行修改。
144
+
145
+ **注意:v0.6.6版本之后已修改模型读取路径,请重新按照以下步骤配置模型路径!**
146
+
147
+ 路径可填绝对路径或相对路径,相对路径则是从项目根目录中的`data/models`文件夹开始。
148
+
149
+ 比如`data/models`文件夹中有如下文件
150
+
151
+ ```
152
+ ├─model1
153
+ │ │─G_1000.pth
154
+ │ └─config.json
155
+ └─model2
156
+ │─G_1000.pth
157
+ └─config.json
158
+ ```
159
+
160
+ 填写
161
+
162
+ ```yaml
163
+ tts_config:
164
+ auto_load: false
165
+ models:
166
+ - config_path: model1/config.json
167
+ model_path: model1/G_1000.pth
168
+ - config_path: model2/config.json
169
+ model_path: model2/G_1000.pth
170
+ # GPT-SoVITS则为
171
+ - sovits_path: gpt_sovits1/model1_e8_s11536.pth
172
+ gpt_path: gpt_sovits1/model1-e15.ckpt
173
+ - sovits_path: gpt_sovits2/model2_e8_s11536.pth
174
+ gpt_path: gpt_sovits2/model2-e15.ckpt
175
+
176
+ ```
177
+
178
+ 在管理员后台加载模型比较方便,但如果想加载`data/models`文件夹之外的模型,则只能通过修改config.yaml配置文件来加载,方法是直接填写绝对路径。
179
+
180
+ 绝对路径填写:
181
+
182
+ ```yaml
183
+ tts_config:
184
+ auto_load: false
185
+ models:
186
+ - config_path: D://model3/config.json
187
+ model_path: D://model3/G_1000.pth
188
+ ```
189
+
190
+ - models_path:是相对于data目录下的模型文件夹,默认为models,auto_load为true时将会加载models_path目录下的所有模型。
191
+
192
+ #### 其他模型
193
+
194
+ bert模型以及情感模型下载之后放在`data/bert`文件夹和`data/emotional`文件夹中,找到对应名字放入即可。
195
+
196
+ # GPU 加速
197
+
198
+ ## windows
199
+
200
+ ### 安装CUDA
201
+
202
+ 查看显卡最高支持CUDA的版本
203
+
204
+ ```
205
+ nvidia-smi
206
+ ```
207
+
208
+ 以CUDA11.7为例,[官网](https://developer.nvidia.com/cuda-11-7-0-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exe_local)
209
+
210
+ ### 安装GPU版pytorch
211
+
212
+ CUDA11.7对应的pytorch是用这个命令安装,推荐使用1.13.1+cu117,其他版本可能存在内存不稳定的问题。
213
+
214
+ ```
215
+ pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
216
+ ```
217
+
218
+ ## Linux
219
+
220
+ 安装过程类似,可以查阅网上的安装资料。也可以直接使用docker部署脚本中的gpu版本。
221
+
222
+ # WebUI
223
+
224
+ ## 推理前端
225
+
226
+ http://127.0.0.1:23456
227
+
228
+ *在默认端口为23456的情况下,端口可修改
229
+
230
+ ## 管理员后台
231
+
232
+ 默认为http://127.0.0.1:23456/admin
233
+
234
+ **初始账号密码在初次启动后,在config.yaml中搜索admin可找到。**
235
+
236
+ # 功能选项说明
237
+
238
+ ## 关闭管理员后台
239
+
240
+ 由于管理员后台可以对模型进行加载和卸载操作,虽然有登录验证的保障,为了绝对安全,当对公网开放时,可以在`config.yaml`中关闭管理员后台:
241
+
242
+ ```yaml
243
+ 'IS_ADMIN_ENABLED': !!bool 'false'
244
+ ```
245
+
246
+ ## Bert-VITS2配置使用语言/Bert模型
247
+
248
+ 在Bert-VITS2 v2.0以后,一个模型需要加载三个不同语言的Bert模型。如果只需要使用其中一或两种语言,可以在模型的config.json的data中,添加`lang`参数,值为`["zh"]`,表示该模型只使用中文,同时也只会加载中文的Bert模型。值为`["zh","ja"]`表示只使用中日双语,同时也只会加载中文和日文的Bert模型。以此类推。
249
+
250
+ 示例:
251
+
252
+ ```json
253
+ "data": {
254
+ "lang": ["zh","ja"],
255
+ "training_files": "filelists/train.list",
256
+ "validation_files": "filelists/val.list",
257
+ "max_wav_value": 32768.0,
258
+ ...
259
+ ```
260
+
261
+ ## 自定义中文多音字词典
262
+
263
+ 如果遇到多音字发音不正确,可以尝试用这种办法解决。
264
+
265
+ 在data目录创建并打开phrases_dict.txt添加多音字词。
266
+
267
+ ```python
268
+ {
269
+ "一骑当千": [["yí"], ["jì"], ["dāng"], ["qiān"]],
270
+ }
271
+ ```
272
+
273
+ ## GPT-SoVITS参考音频预设
274
+
275
+ 在config.yaml中找到gpt_sovits的配置,在presets下添加预设,预设可添加多个,其中key作为预设名称,如下有两个默认的预设default和default2:
276
+
277
+ ```
278
+ gpt_sovits_config:
279
+ hz: 50
280
+ is_half: false
281
+ id: 0
282
+ lang: auto
283
+ format: wav
284
+ segment_size: 50
285
+ presets:
286
+ default:
287
+ refer_wav_path: null
288
+ prompt_text: null
289
+ prompt_lang: auto
290
+ default2:
291
+ refer_wav_path: null
292
+ prompt_text: null
293
+ prompt_lang: auto
294
+ ```
295
+
296
+ ## 阅读API
297
+
298
+ 在[开源阅读](https://gedoor.github.io/)中测试
299
+
300
+ 可使用多种模型朗读,包括VITS,Bert-VITS2,GPT-SoVITS,`in`开头的参数配置的是对话即引号中的文本的说话人,`nr`开头的参数配置的是旁白。
301
+
302
+ 使用GPT-SoVITS需要提前在`config.yaml`配置好`presets`里的参考音频,并修改下方url中的preset
303
+
304
+ url中的IP可在API启动后找到,一般使用192.168开头的局域网IP。
305
+
306
+ 修改好后,选择朗读引擎-添加朗读引擎-粘贴源,并启用该朗读引擎。
307
+
308
+ ```js
309
+ {
310
+ "concurrentRate": "1",
311
+ "contentType": "audio/wav",
312
+ "enabledCookieJar": false,
313
+ "header": "",
314
+ "id": 1709643305070,
315
+ "lastUpdateTime": 1709821070082,
316
+ "loginCheckJs": "",
317
+ "loginUi": "",
318
+ "loginUrl": "",
319
+ "name": "vits-simple-api",
320
+ "url": "http://192.168.xxx.xxx:23456/voice/reading?text={{java.encodeURI(speakText)}}&in_model_type=GPT-SOVITS&in_id=0&in_preset=default&nr_model_type=BERT-VITS2&nr_id=0&nr_preset=default&format=wav&lang=zh"
321
+ }
322
+ ```
323
+
324
+
325
+
326
+ # 常见问题
327
+
328
+ ## fasttext依赖安装问题
329
+
330
+ windows下可能安装不了fasttext,可以用以下命令安装,附[wheels下载地址](https://www.lfd.uci.edu/~gohlke/pythonlibs/#fasttext)
331
+
332
+ ```
333
+ # python3.10 win_amd64
334
+ pip install https://github.com/Artrajz/archived/raw/main/fasttext/fasttext-0.9.2-cp310-cp310-win_amd64.whl
335
+ ```
336
+
337
+ 或者
338
+
339
+ ```
340
+ pip install fasttext -i https://pypi.artrajz.cn/simple
341
+ ```
342
+
343
+ ## pyopenjtalk依赖安装问题
344
+
345
+ 由于pypi.org没有pyopenjtalk的whl文件,通常需要从源代码来安装,这一过程对于一些人来说可能比较麻烦,所以你也可以使用我构建的whl来安装。
346
+
347
+ ```
348
+ pip install pyopenjtalk -i https://pypi.artrajz.cn/simple
349
+ ```
350
+
351
+ ## Bert-VITS2版本兼容
352
+
353
+ 修改Bert-VITS2模型的config.json,加入版本号参数`"version": "x.x.x"`,比如模型版本为1.0.1时,配置文件应该写成:
354
+
355
+ ```
356
+ {
357
+ "version": "1.0.1",
358
+ "train": {
359
+ "log_interval": 10,
360
+ "eval_interval": 100,
361
+ "seed": 52,
362
+ ...
363
+ ```
364
+
365
+ 需要注意的是,中文特化版的版本号应改为`extra`或`zh-clap`,特化修复版的版本号为`2.4`或`extra-fix`。
366
+
367
+ # API
368
+
369
+ ## GET
370
+
371
+ #### speakers list
372
+
373
+ - GET http://127.0.0.1:23456/voice/speakers
374
+
375
+ 返回id对应角色的映射表
376
+
377
+ #### voice vits
378
+
379
+ - GET http://127.0.0.1:23456/voice/vits?text=text
380
+
381
+ 其他参数不指定时均为默认值
382
+
383
+ - GET http://127.0.0.1:23456/voice/vits?text=[ZH]text[ZH][JA]text[JA]&lang=mix
384
+
385
+ lang=mix时文本要标注
386
+
387
+ - GET http://127.0.0.1:23456/voice/vits?text=text&id=142&format=wav&lang=zh&length=1.4
388
+
389
+ 文本为text,角色id为142,音频格式为wav,文本语言为zh,语音长度为1.4,其余参数默认
390
+
391
+ #### check
392
+
393
+ - GET http://127.0.0.1:23456/voice/check?id=0&model=vits
394
+
395
+ ## POST
396
+
397
+ - 见`api_test.py`
398
+
399
+
400
+
401
+ ## API KEY
402
+
403
+ 在config.yaml中设置`api_key_enabled: true`以启用,api key填写:`api_key: api-key`。
404
+
405
+ 启用后,GET请求中使用需要增加参数api_key,POST请求中使用需要在header中添加参数`X-API-KEY`。
406
+
407
+ # Parameter
408
+
409
+ ## VITS语音合成
410
+
411
+ | Name | Parameter | Is must | Default | Type | Instruction |
412
+ | ------------- | ------------ | ------- | -------------------- | ----- | ------------------------------------------------------------ |
413
+ | 合成文本 | text | true | | str | 需要合成语音的文本。 |
414
+ | 角色id | id | false | 从`config.yaml`中获取 | int | 即说话人id。 |
415
+ | 音频格式 | format | false | 从`config.yaml`中获取 | str | 支持wav,ogg,silk,mp3,flac |
416
+ | 文本语言 | lang | false | 从`config.yaml`中获取 | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
417
+ | 语音长度/语速 | length | false | 从`config.yaml`中获取 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢。 |
418
+ | 噪声 | noise | false | 从`config.yaml`中获取 | float | 样本噪声,控制合成的随机性。 |
419
+ | sdp噪声 | noisew | false | 从`config.yaml`中获取 | float | 随机时长预测器噪声,控制音素发音长度。 |
420
+ | 分段阈值 | segment_size | false | 从`config.yaml`中获取 | int | 按标点符号分段,加起来大于segment_size时为一段文本。segment_size<=0表示不分段。 |
421
+ | 流式响应 | streaming | false | false | bool | 流式合成语音,更快的首包响应。 |
422
+
423
+ ## VITS 语音转换
424
+
425
+ | Name | Parameter | Is must | Default | Type | Instruction |
426
+ | ---------- | ----------- | ------- | ------- | ---- | ---------------------- |
427
+ | 上传音频 | upload | true | | file | wav or ogg |
428
+ | 源角色id | original_id | true | | int | 上传文件所使用的角色id |
429
+ | 目标角色id | target_id | true | | int | 要转换的目标角色id |
430
+
431
+ ## HuBert-VITS 语音转换
432
+
433
+ | Name | Parameter | Is must | Default | Type | Instruction |
434
+ | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------ |
435
+ | 上传音频 | upload | true | | file | 需要转换说话人的音频文件。 |
436
+ | 目标角色id | id | true | | int | 目标说话人id。 |
437
+ | 音频格式 | format | true | | str | wav,ogg,silk |
438
+ | 语音长度/语速 | length | true | | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
439
+ | 噪声 | noise | true | | float | 样本噪声,控制合成的随机性。 |
440
+ | sdp噪声 | noisew | true | | float | 随机时长预测器噪声,控制音素发音长度。 |
441
+
442
+ ## W2V2-VITS
443
+
444
+ | Name | Parameter | Is must | Default | Type | Instruction |
445
+ | ------------- | ------------ | ------- | -------------------- | ----- | ------------------------------------------------------------ |
446
+ | 合成文本 | text | true | | str | 需要合成语音的文本。 |
447
+ | 角色id | id | false | 从`config.yaml`中获取 | int | 即说话人id。 |
448
+ | 音频格式 | format | false | 从`config.yaml`中获取 | str | 支持wav,ogg,silk,mp3,flac |
449
+ | 文本语言 | lang | false | 从`config.yaml`中获取 | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
450
+ | 语音长度/语速 | length | false | 从`config.yaml`中获取 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
451
+ | 噪声 | noise | false | 从`config.yaml`中获取 | float | 样本噪声,控制合成的随机性。 |
452
+ | sdp噪声 | noisew | false | 从`config.yaml`中获取 | float | 随机时长预测器噪声,控制音素发音长度。 |
453
+ | 分段阈值 | segment_size | false | 从`config.yaml`中获取 | int | 按标点符号分段,加起来大于segment_size时为一段文本。segment_size<=0表示不分段。 |
454
+ | 维度情感 | emotion | false | 0 | int | 范围取决于npy情感参考文件,如[innnky](https://huggingface.co/spaces/innnky/nene-emotion/tree/main)的all_emotions.npy模型范围是0-5457 |
455
+
456
+ ## Dimensional emotion
457
+
458
+ | Name | Parameter | Is must | Default | Type | Instruction |
459
+ | -------- | --------- | ------- | ------- | ---- | ----------------------------- |
460
+ | 上传音频 | upload | true | | file | 返回存储维度情感向量的npy文件 |
461
+
462
+ ## Bert-VITS2语音合成
463
+
464
+ | Name | Parameter | Is must | Default | Type | Instruction |
465
+ | -------------- | --------------- | ------- | -------------------- | ----- | ------------------------------------------------------------ |
466
+ | 合成文本 | text | true | | str | 需要合成语音的文本。 |
467
+ | 角色id | id | false | 从`config.yaml`中获取 | int | 即说话人id。 |
468
+ | 音频格式 | format | false | 从`config.yaml`中获取 | str | 支持wav,ogg,silk,mp3,flac |
469
+ | 文本语言 | lang | false | 从`config.yaml`中获取 | str | auto为自动识别语言模式,也是默认模式,但目前只支持识别整段文本的语言,无法细分到每个句子。其余可选语言zh和ja。 |
470
+ | 语音长度/语速 | length | false | 从`config.yaml`中获取 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢。 |
471
+ | 噪声 | noise | false | 从`config.yaml`中获取 | float | 样本噪声,控制合成的随机性。 |
472
+ | sdp噪声 | noisew | false | 从`config.yaml`中获取 | float | 随机时长预测器噪声,控制音素发音长度。 |
473
+ | 分段阈值 | segment_size | false | 从`config.yaml`中获取 | int | 按标点符号分段,加起来大于segment_size时为一段文本。segment_size<=0表示不分段。 |
474
+ | SDP/DP混合比 | sdp_ratio | false | 从`config.yaml`中获取 | float | SDP在合成时的占比,理论上此比率越高,合成的语音语调方差越大。 |
475
+ | 情感控制 | emotion | false | 从`config.yaml`中获取 | int | Bert-VITS2 v2.1可用,范围为0-9 |
476
+ | 情感参考音频 | reference_audio | false | None | | Bert-VITS2 v2.1 使用参考音频来控制合成音频的情感 |
477
+ | 文本提示词 | text_prompt | false | 从`config.yaml`中获取 | str | Bert-VITS2 v2.2 文本提示词,用于控制情感 |
478
+ | 文本提示词 | style_text | false | 从`config.yaml`中获取 | str | Bert-VITS2 v2.3 文本提示词,用于控制情感 |
479
+ | 文本提示词权重 | style_weight | false | 从`config.yaml`中获取 | float | Bert-VITS2 v2.3 文本提示词的权重 |
480
+ | 流式响应 | streaming | false | false | bool | 流式合成语音,更快的首包响应。 |
481
+
482
+ ## GPT-SoVITS语音合成
483
+
484
+ | Name | Parameter | Is must | Default | Type | Instruction |
485
+ | ------------ | --------------- | ------- | --------------------- | ----- | ------------------------------------------------------------ |
486
+ | 合成文本 | text | true | | str | 需要合成语音的文本。 |
487
+ | 角色id | id | false | 从`config.yaml`中获取 | int | 即说话人id。在GPT-SoVITS中,每一个模型作为一个角色id,音色通过参考音频预设来切换。 |
488
+ | 音频格式 | format | false | 从`config.yaml`中获取 | str | 支持wav,ogg,silk,mp3,flac |
489
+ | 文本语言 | lang | false | 从`config.yaml`中获取 | str | auto为自动识别语言模式,也是默认模式,但目前只支持识别整段文本的语言,无法细分到每个句子。 |
490
+ | 参考音频 | reference_audio | false | None | | 参考音频是必须的,但是可以被预设代替 |
491
+ | 参考音频文本 | prompt_text | false | 从`config.yaml`中获取 | str | 需要和参考音频实际文本保持一致。 |
492
+ | 参考音频语言 | prompt_lang | false | 从`config.yaml`中获取 | str | 默认为auto,自动识别文本语言。如果识别失败则手动填写,中文就是zh,日文是ja,英文是en。 |
493
+ | 参考音频预设 | preset | false | default | str | 通过提前设置好的预设代替参考音频,可设置多个预设。 |
494
+
495
+
496
+ ## SSML语音合成标记语言
497
+
498
+ 目前支持的元素与属性
499
+
500
+ `speak`元素
501
+
502
+ | Attribute | Description | Is must |
503
+ | ------------ | ------------------------------------------------------------ | ------- |
504
+ | id | 默认值从`config.yaml`中读取 | false |
505
+ | lang | 默认值从`config.yaml`中读取 | false |
506
+ | length | 默认值从`config.yaml`中读取 | false |
507
+ | noise | 默认值从`config.yaml`中读取 | false |
508
+ | noisew | 默认值从`config.yaml`中读取 | false |
509
+ | segment_size | 按标点符号分段,加起来大于segment_size时为一段文本。segment_size<=0表示不分段,这里默认为0。 | false |
510
+ | model_type | 默认为VITS,可选W2V2-VITS,BERT-VITS2 | false |
511
+ | emotion | 只有用W2V2-VITS时`emotion`才会生效,范围取决于npy情感参考文件 | false |
512
+ | sdp_ratio | 只有用BERT-VITS2时`sdp_ratio`才会生效 | false |
513
+
514
+ `voice`元素
515
+
516
+ 优先级大于`speak`
517
+
518
+ | Attribute | Description | Is must |
519
+ | ------------ | ------------------------------------------------------------ | ------- |
520
+ | id | 默认值从`config.yaml`中读取 | false |
521
+ | lang | 默认值从`config.yaml`中读取 | false |
522
+ | length | 默认值从`config.yaml`中读取 | false |
523
+ | noise | 默认值从`config.yaml`中读取 | false |
524
+ | noisew | 默认值从`config.yaml`中读取 | false |
525
+ | segment_size | 按标点符号分段,加起来大于segment_size时为一段文本。segment_size<=0表示不分段,这里默认为0。 | false |
526
+ | model_type | 默认为VITS,可选W2V2-VITS,BERT-VITS2 | false |
527
+ | emotion | 只有用W2V2-VITS时`emotion`才会生效,范围取决于npy情感参考文件 | false |
528
+ | sdp_ratio | 只有用BERT-VITS2时`sdp_ratio`才会生效 | false |
529
+
530
+ `break`元素
531
+
532
+ | Attribute | Description | Is must |
533
+ | --------- | ------------------------------------------------------------ | ------- |
534
+ | strength | x-weak,weak,medium(默认值),strong,x-strong | false |
535
+ | time | 暂停的绝对持续时间,以秒为单位(例如 `2s`)或以毫秒为单位(例如 `500ms`)。 有效值的范围为 0 到 5000 毫秒。 如果设置的值大于支持的最大值,则服务将使用 `5000ms`。 如果设置了 `time` 属性,则会忽略 `strength` 属性。 | false |
536
+
537
+ | Strength | Relative Duration |
538
+ | :------- | :---------------- |
539
+ | x-weak | 250 毫秒 |
540
+ | weak | 500 毫秒 |
541
+ | medium | 750 毫秒 |
541
+ | strong | 1000 毫秒 |
543
+ | x-strong | 1250 毫秒 |
544
+
545
+ ## 阅读
546
+
547
+ | Name | Parameter | Is must | Default | Type | Instruction |
548
+ | -------------------- | ------------- | ------- | --------------------- | ---- | ------------------------------------------------------------ |
549
+ | 合成文本 | text | true | | str | 需要合成语音的文本。 |
550
+ | 对话角色模型类型 | in_model_type | false | 从`config.yaml`中获取 | str | |
551
+ | 对话角色id | in_id | false | 从`config.yaml`中获取 | int | |
552
+ | 对话角色参考音频预设 | in_preset | false | default | str | 通过提前设置好的预设代替参考音频,可设置多个预设。 |
553
+ | 旁白角色模型类型 | nr_model_type | false | 从`config.yaml`中获取 | str | |
554
+ | 旁白角色id | nr_id | false | 从`config.yaml`中获取 | int | |
555
+ | 旁白角色参考音频预设 | nr_preset | false | default | str | 通过提前设置好的预设代替参考音频,可设置多个预设。 |
556
+ | 音频格式 | format | false | 从`config.yaml`中获取 | str | 支持wav,ogg,silk,mp3,flac |
557
+ | 文本语言 | lang | false | 从`config.yaml`中获取 | str | auto为自动识别语言模式,也是默认模式,但目前只支持识别整段文本的语言,无法细分到每个句子。 |
558
+
559
+ 模型的其他参数将使用config.yaml文件中对应模型的默认参数。
560
+
561
+
562
+
563
+ ## 示例
564
+
565
+ 见`api_test.py`
566
+
567
+ # 交流平台
568
+
569
+ 现在只有 [Q群](https://qm.qq.com/cgi-bin/qm/qr?k=-1GknIe4uXrkmbDKBGKa1aAUteq40qs_&jump_from=webapi&authKey=x5YYt6Dggs1ZqWxvZqvj3fV8VUnxRyXm5S5Kzntc78+Nv3iXOIawplGip9LWuNR/)
570
+
571
+ # 鸣谢
572
+
573
+ - vits:https://github.com/jaywalnut310/vits
574
+ - MoeGoe:https://github.com/CjangCjengh/MoeGoe
575
+ - emotional-vits:https://github.com/innnky/emotional-vits
576
+ - vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
577
+ - vits_chinese:https://github.com/PlayVoice/vits_chinese
578
+ - Bert_VITS2:https://github.com/fishaudio/Bert-VITS2
579
+ - GPT-SoVITS:https://github.com/RVC-Boss/GPT-SoVITS
580
+
581
+ # 感谢所有的贡献者
582
+
583
+ <a href="https://github.com/artrajz/vits-simple-api/graphs/contributors" target="_blank">
584
+ <img src="https://contrib.rocks/image?repo=artrajz/vits-simple-api"/></a>
api_test.py ADDED
@@ -0,0 +1,575 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ import requests
4
+ import os
5
+ import time
6
+ import random
7
+ import string
8
+ from requests_toolbelt.multipart.encoder import MultipartEncoder
9
+
10
+ absolute_path = os.path.dirname(__file__)
11
+ base_url = "http://127.0.0.1:23456"
12
+
13
+
14
+ # 映射表
15
def voice_speakers():
    """Fetch the speaker mapping from the server, print it and return it.

    Sends a POST to ``{base_url}/voice/speakers``, decodes the JSON body,
    prints every top-level key followed by each entry stored under that key,
    and returns the decoded body unchanged.
    """
    response = requests.post(url=f"{base_url}/voice/speakers")
    speakers = response.json()
    for key in speakers:
        print(key)
        for entry in speakers[key]:
            print(entry)
    return speakers
25
+
26
+
27
+ # 语音合成 voice vits
28
def voice_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, segment_size=50,
               save_audio=True,
               save_path=None):
    """Request VITS speech synthesis and optionally save the returned audio.

    The arguments are sent as multipart form fields to
    ``{base_url}/voice/vits`` (numeric values are stringified first). The
    output file name is taken from the ``filename=`` part of the response's
    ``Content-Disposition`` header.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``. The file is placed in ``save_path`` when given, else in the
        directory of this script.
    """
    form = MultipartEncoder(
        fields={
            "text": text,
            "id": str(id),
            "format": format,
            "lang": lang,
            "length": str(length),
            "noise": str(noise),
            "noisew": str(noisew),
            "segment_size": str(segment_size)
        },
        boundary='----VoiceConversionFormBoundary' + ''.join(
            random.sample(string.ascii_letters + string.digits, 16)),
    )
    response = requests.post(url=f"{base_url}/voice/vits", data=form,
                             headers={"Content-Type": form.content_type})

    filename = re.findall("filename=(.+)", response.headers["Content-Disposition"])[0]
    target_dir = absolute_path if save_path is None else save_path
    destination = os.path.join(target_dir, filename)

    if not save_audio:
        return None
    with open(destination, "wb") as out:
        out.write(response.content)
    print(destination)
    return destination
59
+
60
+
61
def _wav_total_size(buffer):
    """Return the byte length of the RIFF file starting at ``buffer[0]``.

    The RIFF header stores "file size minus 8" as a little-endian 32-bit
    integer at offset 4. Returns ``None`` while fewer than 8 bytes are
    buffered, i.e. the size field is not fully available yet.
    """
    if len(buffer) < 8:
        return None
    return int.from_bytes(buffer[4:8], byteorder='little') + 8


def _split_wav_stream(chunks):
    """Cut a stream of concatenated WAV files into one ``bytes`` object each.

    ``chunks`` is any iterable of byte strings. Trailing bytes that do not
    form a complete file are discarded.
    """
    buffer = bytearray()
    audios = []
    for chunk in chunks:
        buffer += chunk
        # A single chunk may complete several small files, so keep extracting.
        while True:
            size = _wav_total_size(buffer)
            if size is None or len(buffer) < size:
                break
            audios.append(bytes(buffer[:size]))
            del buffer[:size]
    return audios


def voice_vits_streaming(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, segment_size=50,
                         save_path=None, save_audio=True):
    """Request streamed synthesis and save every received WAV segment.

    This replaces two definitions of the same name (the first one was
    shadowed by the second and therefore never ran). The form fields, with
    ``streaming`` set to ``'True'``, are posted to ``{base_url}/voice`` and
    the body is consumed incrementally. The body is treated as a sequence of
    concatenated RIFF/WAV files, each of which is written to
    ``<name>-<index>.wav`` where ``<name>`` is the server-provided file name
    without its extension.

    Args:
        save_path: Directory for the output; defaults to this script's folder.
        save_audio: When false the stream is still consumed but nothing is
            written and ``None`` is returned.

    Returns:
        The un-indexed base path (directory + server file name) when
        ``save_audio`` is true, otherwise ``None``. Note that the files
        actually written carry the ``-<index>`` suffix.
    """
    fields = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "length": str(length),
        "noise": str(noise),
        "noisew": str(noisew),
        "segment_size": str(segment_size),
        "streaming": 'True'
    }
    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

    m = MultipartEncoder(fields=fields, boundary=boundary)
    headers = {"Content-Type": m.content_type}
    url = f"{base_url}/voice"

    res = requests.post(url=url, data=m, headers=headers, stream=True)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    # Do not touch res.content here: it would download the whole body first
    # and make stream=True pointless.
    audios = _split_wav_stream(res.iter_content(chunk_size=1024))

    if not save_audio:
        return None

    stem = os.path.splitext(path)[0]
    for i, audio in enumerate(audios):
        segment_path = f"{stem}-{i}.wav"
        with open(segment_path, "wb") as f:
            f.write(audio)
        print(segment_path)
    return path
158
+
159
+
160
+ # 语音转换 hubert-vits
161
def voice_hubert_vits(upload_path, id, format="wav", length=1, noise=0.667, noisew=0.8, save_audio=True,
                      save_path=None):
    """Upload an audio file for hubert-vits voice conversion.

    The file at ``upload_path`` and the remaining arguments are posted as
    multipart form data to ``{base_url}/voice/hubert-vits``. The output file
    name comes from the response's ``Content-Disposition`` header.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``.
    """
    upload_name = os.path.basename(upload_path)
    # Use the real extension: "a.b.wav" must yield "wav", and a name without
    # any dot must not raise.
    extension = os.path.splitext(upload_name)[1].lstrip(".")
    upload_type = f'audio/{extension}' if extension else 'application/octet-stream'

    with open(upload_path, 'rb') as upload_file:
        fields = {
            "upload": (upload_name, upload_file, upload_type),
            "id": str(id),
            "format": format,
            "length": str(length),
            "noise": str(noise),
            "noisew": str(noisew),
        }
        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

        m = MultipartEncoder(fields=fields, boundary=boundary)
        headers = {"Content-Type": m.content_type}
        url = f"{base_url}/voice/hubert-vits"

        # The encoder reads the file while the request is being sent, so the
        # POST has to happen before the file is closed.
        res = requests.post(url=url, data=m, headers=headers)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None
193
+
194
+
195
+ # 维度情感模型 w2v2-vits
196
def voice_w2v2_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, segment_size=50,
                    emotion=0,
                    save_audio=True, save_path=None):
    """Request w2v2-vits synthesis and optionally save the returned audio.

    Posts the arguments (including ``emotion``) as multipart form fields to
    ``{base_url}/voice/w2v2-vits``. The output file name is read from the
    response's ``Content-Disposition`` header.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``.
    """
    form = MultipartEncoder(
        fields={
            "text": text,
            "id": str(id),
            "format": format,
            "lang": lang,
            "length": str(length),
            "noise": str(noise),
            "noisew": str(noisew),
            "segment_size": str(segment_size),
            "emotion": str(emotion)
        },
        boundary='----VoiceConversionFormBoundary' + ''.join(
            random.sample(string.ascii_letters + string.digits, 16)),
    )
    response = requests.post(url=f"{base_url}/voice/w2v2-vits", data=form,
                             headers={"Content-Type": form.content_type})

    filename = re.findall("filename=(.+)", response.headers["Content-Disposition"])[0]
    target_dir = absolute_path if save_path is None else save_path
    destination = os.path.join(target_dir, filename)

    if not save_audio:
        return None
    with open(destination, "wb") as out:
        out.write(response.content)
    print(destination)
    return destination
228
+
229
+
230
+ # 语音转换 同VITS模型内角色之间的音色转换
231
def voice_conversion(upload_path, original_id, target_id, save_audio=True, save_path=None):
    """Upload an audio file to convert it from one speaker id to another.

    The file and both ids are posted as multipart form data to
    ``{base_url}/voice/conversion``. The output file name comes from the
    response's ``Content-Disposition`` header.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``.
    """
    upload_name = os.path.basename(upload_path)
    # Use the real extension: "a.b.wav" must yield "wav", and a name without
    # any dot must not raise.
    extension = os.path.splitext(upload_name)[1].lstrip(".")
    upload_type = f'audio/{extension}' if extension else 'application/octet-stream'

    with open(upload_path, 'rb') as upload_file:
        fields = {
            "upload": (upload_name, upload_file, upload_type),
            "original_id": str(original_id),
            "target_id": str(target_id),
        }
        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
        m = MultipartEncoder(fields=fields, boundary=boundary)

        headers = {"Content-Type": m.content_type}
        url = f"{base_url}/voice/conversion"

        # The encoder reads the file while the request is being sent, so the
        # POST has to happen before the file is closed.
        res = requests.post(url=url, data=m, headers=headers)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None
261
+
262
+
263
def voice_ssml(ssml, save_audio=True, save_path=None):
    """Send an SSML document for synthesis and optionally save the audio.

    The markup is posted as the single multipart form field ``ssml`` to
    ``{base_url}/voice/ssml``. The output file name is read from the
    response's ``Content-Disposition`` header.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``.
    """
    form = MultipartEncoder(
        fields={
            "ssml": ssml,
        },
        boundary='----VoiceConversionFormBoundary' + ''.join(
            random.sample(string.ascii_letters + string.digits, 16)),
    )
    response = requests.post(url=f"{base_url}/voice/ssml", data=form,
                             headers={"Content-Type": form.content_type})

    filename = re.findall("filename=(.+)", response.headers["Content-Disposition"])[0]
    target_dir = absolute_path if save_path is None else save_path
    destination = os.path.join(target_dir, filename)

    if not save_audio:
        return None
    with open(destination, "wb") as out:
        out.write(response.content)
    print(destination)
    return destination
286
+
287
+
288
def voice_dimensional_emotion(upload_path, save_audio=True,
                              save_path=None):
    """Upload an audio file to the dimensional-emotion endpoint.

    The file is posted as multipart form data to
    ``{base_url}/voice/dimension-emotion`` and the response body is saved
    under the file name given in the ``Content-Disposition`` header.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``.
    """
    upload_name = os.path.basename(upload_path)
    # Use the real extension: "a.b.wav" must yield "wav", and a name without
    # any dot must not raise.
    extension = os.path.splitext(upload_name)[1].lstrip(".")
    upload_type = f'audio/{extension}' if extension else 'application/octet-stream'

    with open(upload_path, 'rb') as upload_file:
        fields = {
            "upload": (upload_name, upload_file, upload_type),
        }
        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

        m = MultipartEncoder(fields=fields, boundary=boundary)
        headers = {"Content-Type": m.content_type}
        url = f"{base_url}/voice/dimension-emotion"

        # The encoder reads the file while the request is being sent, so the
        # POST has to happen before the file is closed.
        res = requests.post(url=url, data=m, headers=headers)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None
315
+
316
+
317
def vits_json(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, segment_size=50,
              save_audio=True, save_path=None):
    """Request synthesis with a JSON body and optionally save the audio.

    The arguments are serialised with ``json.dumps`` and posted to
    ``{base_url}/voice`` with ``Content-Type: application/json``. The output
    file name is read from the response's ``Content-Disposition`` header.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``. Nothing is written to disk when ``save_audio`` is false.
    """
    fields = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "length": str(length),
        "noise": str(noise),
        "noisew": str(noisew),
        "segment_size": str(segment_size)
    }
    body = json.dumps(fields)
    url = f"{base_url}/voice"
    header = {"Content-Type": 'application/json'}
    res = requests.post(url=url, data=body, headers=header)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    # Write only on request; an earlier unconditional write made
    # save_audio=False ineffective and wrote the file twice otherwise.
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None
349
+
350
+
351
+ # Bert_vits2
352
def voice_bert_vits2(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, segment_size=50,
                     sdp_ratio=0.2, save_audio=True, save_path=None):
    """Request Bert-VITS2 synthesis and optionally save the returned audio.

    Posts the arguments (including ``sdp_ratio``) as multipart form fields to
    ``{base_url}/voice/bert-vits2``. The output file name is read from the
    response's ``Content-Disposition`` header.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``.
    """
    form = MultipartEncoder(
        fields={
            "text": text,
            "id": str(id),
            "format": format,
            "lang": lang,
            "length": str(length),
            "noise": str(noise),
            "noisew": str(noisew),
            "segment_size": str(segment_size),
            "sdp_ratio": str(sdp_ratio)
        },
        boundary='----VoiceConversionFormBoundary' + ''.join(
            random.sample(string.ascii_letters + string.digits, 16)),
    )
    response = requests.post(url=f"{base_url}/voice/bert-vits2", data=form,
                             headers={"Content-Type": form.content_type})

    filename = re.findall("filename=(.+)", response.headers["Content-Disposition"])[0]
    target_dir = absolute_path if save_path is None else save_path
    destination = os.path.join(target_dir, filename)

    if not save_audio:
        return None
    with open(destination, "wb") as out:
        out.write(response.content)
    print(destination)
    return destination
383
+
384
+
385
+ # gpt_sovits
386
def voice_gpt_sovits(text, id=0, format="wav", lang="auto", preset=None, prompt_text=None, prompt_lang="auto",
                     segment_size=50, reference_audio=None, save_audio=True, save_path=None):
    """Request GPT-SoVITS synthesis and optionally save the returned audio.

    The arguments are posted as multipart form fields to
    ``{base_url}/voice/gpt-sovits``. When ``reference_audio`` (a file path)
    is given, the file is read into memory and attached as the
    ``reference_audio`` field; otherwise that field is ``None``.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``.
    """
    upload_name, upload_type, upload_file = None, None, None
    if reference_audio is not None:
        upload_name = os.path.basename(reference_audio)
        # Use the real extension: "a.b.wav" must yield "wav", and a name
        # without any dot must not raise.
        extension = os.path.splitext(upload_name)[1].lstrip(".")
        upload_type = f'audio/{extension}' if extension else 'application/octet-stream'
        with open(reference_audio, 'rb') as f:
            upload_file = f.read()

    fields = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "segment_size": str(segment_size),
        "preset": preset,
        "reference_audio": (upload_name, upload_file, upload_type) if reference_audio else None,
        "prompt_text": prompt_text,
        "prompt_lang": prompt_lang
    }
    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

    m = MultipartEncoder(fields=fields, boundary=boundary)
    headers = {"Content-Type": m.content_type}
    url = f"{base_url}/voice/gpt-sovits"

    res = requests.post(url=url, data=m, headers=headers)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None
424
+
425
+
426
+ # Reading
427
def voice_reading_get(text, in_model_type, in_id, nr_model_type, nr_id, format="wav", lang="auto", preset=None,
                      save_audio=True, save_path=None):
    """Call the reading endpoint with a GET request and optionally save the audio.

    The arguments are passed as query parameters to
    ``{base_url}/voice/reading``. They are handed to ``requests`` via
    ``params=`` so that the text is URL-encoded (characters such as ``&`` or
    ``#`` no longer corrupt the query) and a ``preset`` of ``None`` is left
    out instead of being sent as the literal string ``"None"``.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``. Nothing is written to disk when ``save_audio`` is false.
    """
    query = {
        "text": text,
        "in_model_type": in_model_type,
        "in_id": in_id,
        "preset": preset,
        "nr_model_type": nr_model_type,
        "nr_id": nr_id,
        "lang": lang,
        "format": format,
    }
    res = requests.get(url=f"{base_url}/voice/reading", params=query)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    # Write only on request; an earlier unconditional write made
    # save_audio=False ineffective and wrote the file twice otherwise.
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None
447
+
448
+
449
+ # Reading
450
def voice_reading_json(text, in_model_type, in_id, nr_model_type, nr_id, format="wav", lang="auto", preset=None,
                       save_audio=True, save_path=None):
    """Call the reading endpoint with a JSON body and optionally save the audio.

    The arguments are serialised with ``json.dumps`` and posted to
    ``{base_url}/voice/reading`` with ``Content-Type: application/json``.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``. Nothing is written to disk when ``save_audio`` is false.
    """
    # NOTE(review): ``preset`` is accepted but never sent to the server. The
    # expected field name is not visible here - confirm against the API
    # before forwarding it.
    fields = {
        "text": text,
        "in_model_type": in_model_type,
        "in_id": str(in_id),
        "nr_model_type": nr_model_type,
        "nr_id": str(nr_id),
        "format": format,
        "lang": lang,
    }
    body = json.dumps(fields)
    url = f"{base_url}/voice/reading"
    header = {"Content-Type": 'application/json'}
    res = requests.post(url=url, data=body, headers=header)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    # Write only on request; an earlier unconditional write made
    # save_audio=False ineffective and wrote the file twice otherwise.
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None
481
+
482
+
483
+ # Reading
484
def voice_reading(text, in_model_type, in_id, nr_model_type, nr_id, format="wav", lang="auto", preset=None,
                  save_audio=True, save_path=None):
    """Call the reading endpoint with multipart form data and optionally save the audio.

    Posts the arguments as form fields to ``{base_url}/voice/reading``. The
    output file name is read from the response's ``Content-Disposition``
    header.

    Returns:
        The path of the written file when ``save_audio`` is true, otherwise
        ``None``.
    """
    # NOTE(review): ``preset`` is accepted but not included in the form
    # fields - confirm whether that is intended.
    form = MultipartEncoder(
        fields={
            "text": text,
            "in_model_type": in_model_type,
            "in_id": str(in_id),
            "nr_model_type": nr_model_type,
            "nr_id": str(nr_id),
            "format": format,
            "lang": lang,
        },
        boundary='----VoiceConversionFormBoundary' + ''.join(
            random.sample(string.ascii_letters + string.digits, 16)),
    )
    response = requests.post(url=f"{base_url}/voice/reading", data=form,
                             headers={"Content-Type": form.content_type})

    filename = re.findall("filename=(.+)", response.headers["Content-Disposition"])[0]
    target_dir = absolute_path if save_path is None else save_path
    destination = os.path.join(target_dir, filename)

    if not save_audio:
        return None
    with open(destination, "wb") as out:
        out.write(response.content)
    print(destination)
    return destination
513
+
514
+
515
def test_interface(text):
    """Call ``voice_vits`` 100 times and report timing and failures.

    Each round sleeps one second, times a single ``voice_vits`` call made
    with ``save_audio=False`` and prints the round index, the text length and
    the elapsed seconds. Any exception is counted and printed together with
    the running error total; the loop then continues with the next round.
    """
    failures = 0
    for attempt in range(100):
        try:
            time.sleep(1)
            started = time.time()
            voice_vits(text, format="wav", lang="zh", save_audio=False)
            elapsed = time.time() - started
            print(f"{attempt}:len:{len(text)}耗时:{elapsed}")
        except Exception as exc:
            failures += 1
            print(exc)
            print(f"error_num={failures}")
528
+
529
+
530
+ if __name__ == '__main__':
531
+ cache_path = os.path.join(os.path.curdir, "cache")
532
+
533
+ text = "你好,こんにちは"
534
+
535
+ ssml = """
536
+ <speak lang="zh" format="mp3" length="1.2">
537
+ <voice id="0" model_type="GPT-SOVITS" preset="default">这几天心里颇不宁静。</voice>
538
+ <voice id="0" model_type="Bert-VITS2">今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。</voice>
539
+ <voice id="142">月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;</voice>
540
+ <voice id="0" model_type="Bert-VITS2">妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。</voice>
541
+ <voice id="120">我悄悄地披了大衫,带上门出去。</voice><break time="2s"/>
542
+ <voice id="121">沿着荷塘,是一条曲折的小煤屑路。</voice>
543
+ <voice id="122">这是一条幽僻的路;白天也少人走,夜晚更加寂寞。</voice>
544
+ <voice id="123">荷塘四面,长着许多树,蓊蓊郁郁的。</voice>
545
+ <voice id="124">路的一旁,是些杨柳,和一些不知道名字的树。</voice>
546
+ <voice id="125">没有月光的晚上,这路上阴森森的,有些怕人。</voice>
547
+ <voice id="126">今晚却很好,虽然月光也还是淡淡的。</voice><break time="2s"/>
548
+ <voice id="127">路上只我一个人,背着手踱着。</voice>
549
+ <voice id="128">这一片天地好像是我的;我也像超出了平常的自己,到了另一个世界里。</voice>
550
+ <voice id="129">我爱热闹,也爱冷静;<break strength="x-weak"/>爱群居,也爱独处。</voice>
551
+ <voice id="130">像今晚上,一个人在这苍茫的月下,什么都可以想,什么都可以不想,便觉是个自由的人。</voice>
552
+ <voice id="131">白天里一定要做的事,一定要说的话,现在都可不理。</voice>
553
+ <voice id="132">这是独处的妙处,我且受用这无边的荷香月色好了。</voice>
554
+ </speak>
555
+ """
556
+
557
+ # path = voice_vits(text, save_path=cache_path)
558
+ # path =voice_vits_streaming(text, save_path=cache_path)
559
+ # path = voice_w2v2_vits(text, save_path=cache_path)
560
+ # path = voice_conversion(path, 1, 3, save_path=cache_path)
561
+ # path = voice_hubert_vits(path, 0, save_path=cache_path)
562
+ # path = voice_dimensional_emotion(path, save_path=cache_path)
563
+ # path = voice_ssml(ssml, save_path=cache_path)
564
+ # path = voice_bert_vits2("你好", lang="zh", save_path=cache_path)
565
+ # path = voice_bert_vits2("こんにちは", lang="ja", save_path=cache_path)
566
+ # path = voice_gpt_sovits(text=text, id=2, preset="wz")
567
+ # path = voice_gpt_sovits(text=text, id=2, reference_audio=r"H:\git\vits-simple-api\data\reference_audio\wz_10068.wav",prompt_text="……嗯……大概、快上课的时候开始的。到这个程度的话,……半个小时吧?")
568
+
569
+ # os.system(path)
570
+
571
+ # text = "你好“你的修炼速度有些出乎我的意料”"
572
+ # path = voice_reading_json(text=text, in_model_type="GPT-SOVITS", preset="wz", in_id=2, nr_model_type="BERT-VITS2",
573
+ # nr_id=0)
574
+
575
+ # os.system(path)
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+
3
+ from flask import Flask
4
+ from flask_apscheduler import APScheduler
5
+ from flask_login import LoginManager
6
+ from flask_wtf import CSRFProtect
7
+
8
+ from utils.data_utils import clean_folder
9
+ from utils.phrases_dict import phrases_dict_init
10
+ from tts_app.frontend.views import frontend
11
+ from tts_app.voice_api.views import voice_api
12
+ from tts_app.auth.views import auth
13
+ from tts_app.admin.views import admin
14
+
15
+ from contants import config
16
+
17
+ app = Flask(__name__, template_folder=os.path.join(os.path.dirname(__file__), 'tts_app', 'templates'),
18
+ static_folder=os.path.join(os.path.dirname(__file__), 'tts_app', 'static'))
19
+
20
+ app.config.from_pyfile("config.py")
21
+ # app.config.update(config)
22
+
23
+ phrases_dict_init()
24
+
25
+ csrf = CSRFProtect(app)
26
+ # 禁用tts api请求的CSRF防护
27
+ csrf.exempt(voice_api)
28
+
29
# The login machinery is only set up when the admin backend is enabled.
if config.system.is_admin_enabled:
    login_manager = LoginManager()
    login_manager.init_app(app)
    # View that flask-login redirects to when authentication is required.
    login_manager.login_view = 'auth.login'


    @login_manager.user_loader
    def load_user(user_id):
        """Return the configured admin object if its id equals ``user_id``, else ``None``."""
        admin_user = config.admin
        return admin_user if admin_user.get_id() == user_id else None
41
+
42
+ # Initialize scheduler
43
+ scheduler = APScheduler()
44
+ scheduler.init_app(app)
45
+ if config.system.clean_interval_seconds > 0:
46
+ scheduler.start()
47
+
48
+ app.register_blueprint(frontend, url_prefix='/')
49
+ app.register_blueprint(voice_api, url_prefix='/voice')
50
+ if config.system.is_admin_enabled:
51
+ app.register_blueprint(auth, url_prefix=config.system.admin_route)
52
+ app.register_blueprint(admin, url_prefix=config.system.admin_route)
53
+
54
+
55
def create_folders(paths):
    """Create every directory in ``paths`` that does not exist yet.

    Paths that already exist (whatever their type) are left untouched;
    missing ones are created together with any missing parents.
    """
    missing = (candidate for candidate in paths if not os.path.exists(candidate))
    for candidate in missing:
        os.makedirs(candidate, exist_ok=True)
59
+
60
+
61
+ create_folders([os.path.join(config.abs_path, config.system.upload_folder),
62
+ os.path.join(config.abs_path, config.system.cache_path), ])
63
+
64
+
65
+ # regular cleaning
66
@scheduler.task('interval', id='clean_task', seconds=config.system.clean_interval_seconds,
                misfire_grace_time=900)
def clean_task():
    """Run ``clean_folder`` on the upload folder and then on the cache folder.

    Registered with the scheduler as an interval job whose period is
    ``config.system.clean_interval_seconds``. Both folders are resolved
    relative to ``config.abs_path``.
    """
    for folder in (config.system.upload_folder, config.system.cache_path):
        clean_folder(os.path.join(config.abs_path, folder))
71
+
72
+
73
+ if __name__ == '__main__':
74
+ app.run(host=config.http_service.host, port=config.http_service.port, debug=config.http_service.debug)
bert_vits2/LICENSE ADDED
@@ -0,0 +1,674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GNU GENERAL PUBLIC LICENSE
2
+ Version 3, 29 June 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU General Public License is a free, copyleft license for
11
+ software and other kinds of works.
12
+
13
+ The licenses for most software and other practical works are designed
14
+ to take away your freedom to share and change the works. By contrast,
15
+ the GNU General Public License is intended to guarantee your freedom to
16
+ share and change all versions of a program--to make sure it remains free
17
+ software for all its users. We, the Free Software Foundation, use the
18
+ GNU General Public License for most of our software; it applies also to
19
+ any other work released this way by its authors. You can apply it to
20
+ your programs, too.
21
+
22
+ When we speak of free software, we are referring to freedom, not
23
+ price. Our General Public Licenses are designed to make sure that you
24
+ have the freedom to distribute copies of free software (and charge for
25
+ them if you wish), that you receive source code or can get it if you
26
+ want it, that you can change the software or use pieces of it in new
27
+ free programs, and that you know you can do these things.
28
+
29
+ To protect your rights, we need to prevent others from denying you
30
+ these rights or asking you to surrender the rights. Therefore, you have
31
+ certain responsibilities if you distribute copies of the software, or if
32
+ you modify it: responsibilities to respect the freedom of others.
33
+
34
+ For example, if you distribute copies of such a program, whether
35
+ gratis or for a fee, you must pass on to the recipients the same
36
+ freedoms that you received. You must make sure that they, too, receive
37
+ or can get the source code. And you must show them these terms so they
38
+ know their rights.
39
+
40
+ Developers that use the GNU GPL protect your rights with two steps:
41
+ (1) assert copyright on the software, and (2) offer you this License
42
+ giving you legal permission to copy, distribute and/or modify it.
43
+
44
+ For the developers' and authors' protection, the GPL clearly explains
45
+ that there is no warranty for this free software. For both users' and
46
+ authors' sake, the GPL requires that modified versions be marked as
47
+ changed, so that their problems will not be attributed erroneously to
48
+ authors of previous versions.
49
+
50
+ Some devices are designed to deny users access to install or run
51
+ modified versions of the software inside them, although the manufacturer
52
+ can do so. This is fundamentally incompatible with the aim of
53
+ protecting users' freedom to change the software. The systematic
54
+ pattern of such abuse occurs in the area of products for individuals to
55
+ use, which is precisely where it is most unacceptable. Therefore, we
56
+ have designed this version of the GPL to prohibit the practice for those
57
+ products. If such problems arise substantially in other domains, we
58
+ stand ready to extend this provision to those domains in future versions
59
+ of the GPL, as needed to protect the freedom of users.
60
+
61
+ Finally, every program is threatened constantly by software patents.
62
+ States should not allow patents to restrict development and use of
63
+ software on general-purpose computers, but in those that do, we wish to
64
+ avoid the special danger that patents applied to a free program could
65
+ make it effectively proprietary. To prevent this, the GPL assures that
66
+ patents cannot be used to render the program non-free.
67
+
68
+ The precise terms and conditions for copying, distribution and
69
+ modification follow.
70
+
71
+ TERMS AND CONDITIONS
72
+
73
+ 0. Definitions.
74
+
75
+ "This License" refers to version 3 of the GNU General Public License.
76
+
77
+ "Copyright" also means copyright-like laws that apply to other kinds of
78
+ works, such as semiconductor masks.
79
+
80
+ "The Program" refers to any copyrightable work licensed under this
81
+ License. Each licensee is addressed as "you". "Licensees" and
82
+ "recipients" may be individuals or organizations.
83
+
84
+ To "modify" a work means to copy from or adapt all or part of the work
85
+ in a fashion requiring copyright permission, other than the making of an
86
+ exact copy. The resulting work is called a "modified version" of the
87
+ earlier work or a work "based on" the earlier work.
88
+
89
+ A "covered work" means either the unmodified Program or a work based
90
+ on the Program.
91
+
92
+ To "propagate" a work means to do anything with it that, without
93
+ permission, would make you directly or secondarily liable for
94
+ infringement under applicable copyright law, except executing it on a
95
+ computer or modifying a private copy. Propagation includes copying,
96
+ distribution (with or without modification), making available to the
97
+ public, and in some countries other activities as well.
98
+
99
+ To "convey" a work means any kind of propagation that enables other
100
+ parties to make or receive copies. Mere interaction with a user through
101
+ a computer network, with no transfer of a copy, is not conveying.
102
+
103
+ An interactive user interface displays "Appropriate Legal Notices"
104
+ to the extent that it includes a convenient and prominently visible
105
+ feature that (1) displays an appropriate copyright notice, and (2)
106
+ tells the user that there is no warranty for the work (except to the
107
+ extent that warranties are provided), that licensees may convey the
108
+ work under this License, and how to view a copy of this License. If
109
+ the interface presents a list of user commands or options, such as a
110
+ menu, a prominent item in the list meets this criterion.
111
+
112
+ 1. Source Code.
113
+
114
+ The "source code" for a work means the preferred form of the work
115
+ for making modifications to it. "Object code" means any non-source
116
+ form of a work.
117
+
118
+ A "Standard Interface" means an interface that either is an official
119
+ standard defined by a recognized standards body, or, in the case of
120
+ interfaces specified for a particular programming language, one that
121
+ is widely used among developers working in that language.
122
+
123
+ The "System Libraries" of an executable work include anything, other
124
+ than the work as a whole, that (a) is included in the normal form of
125
+ packaging a Major Component, but which is not part of that Major
126
+ Component, and (b) serves only to enable use of the work with that
127
+ Major Component, or to implement a Standard Interface for which an
128
+ implementation is available to the public in source code form. A
129
+ "Major Component", in this context, means a major essential component
130
+ (kernel, window system, and so on) of the specific operating system
131
+ (if any) on which the executable work runs, or a compiler used to
132
+ produce the work, or an object code interpreter used to run it.
133
+
134
+ The "Corresponding Source" for a work in object code form means all
135
+ the source code needed to generate, install, and (for an executable
136
+ work) run the object code and to modify the work, including scripts to
137
+ control those activities. However, it does not include the work's
138
+ System Libraries, or general-purpose tools or generally available free
139
+ programs which are used unmodified in performing those activities but
140
+ which are not part of the work. For example, Corresponding Source
141
+ includes interface definition files associated with source files for
142
+ the work, and the source code for shared libraries and dynamically
143
+ linked subprograms that the work is specifically designed to require,
144
+ such as by intimate data communication or control flow between those
145
+ subprograms and other parts of the work.
146
+
147
+ The Corresponding Source need not include anything that users
148
+ can regenerate automatically from other parts of the Corresponding
149
+ Source.
150
+
151
+ The Corresponding Source for a work in source code form is that
152
+ same work.
153
+
154
+ 2. Basic Permissions.
155
+
156
+ All rights granted under this License are granted for the term of
157
+ copyright on the Program, and are irrevocable provided the stated
158
+ conditions are met. This License explicitly affirms your unlimited
159
+ permission to run the unmodified Program. The output from running a
160
+ covered work is covered by this License only if the output, given its
161
+ content, constitutes a covered work. This License acknowledges your
162
+ rights of fair use or other equivalent, as provided by copyright law.
163
+
164
+ You may make, run and propagate covered works that you do not
165
+ convey, without conditions so long as your license otherwise remains
166
+ in force. You may convey covered works to others for the sole purpose
167
+ of having them make modifications exclusively for you, or provide you
168
+ with facilities for running those works, provided that you comply with
169
+ the terms of this License in conveying all material for which you do
170
+ not control copyright. Those thus making or running the covered works
171
+ for you must do so exclusively on your behalf, under your direction
172
+ and control, on terms that prohibit them from making any copies of
173
+ your copyrighted material outside their relationship with you.
174
+
175
+ Conveying under any other circumstances is permitted solely under
176
+ the conditions stated below. Sublicensing is not allowed; section 10
177
+ makes it unnecessary.
178
+
179
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180
+
181
+ No covered work shall be deemed part of an effective technological
182
+ measure under any applicable law fulfilling obligations under article
183
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184
+ similar laws prohibiting or restricting circumvention of such
185
+ measures.
186
+
187
+ When you convey a covered work, you waive any legal power to forbid
188
+ circumvention of technological measures to the extent such circumvention
189
+ is effected by exercising rights under this License with respect to
190
+ the covered work, and you disclaim any intention to limit operation or
191
+ modification of the work as a means of enforcing, against the work's
192
+ users, your or third parties' legal rights to forbid circumvention of
193
+ technological measures.
194
+
195
+ 4. Conveying Verbatim Copies.
196
+
197
+ You may convey verbatim copies of the Program's source code as you
198
+ receive it, in any medium, provided that you conspicuously and
199
+ appropriately publish on each copy an appropriate copyright notice;
200
+ keep intact all notices stating that this License and any
201
+ non-permissive terms added in accord with section 7 apply to the code;
202
+ keep intact all notices of the absence of any warranty; and give all
203
+ recipients a copy of this License along with the Program.
204
+
205
+ You may charge any price or no price for each copy that you convey,
206
+ and you may offer support or warranty protection for a fee.
207
+
208
+ 5. Conveying Modified Source Versions.
209
+
210
+ You may convey a work based on the Program, or the modifications to
211
+ produce it from the Program, in the form of source code under the
212
+ terms of section 4, provided that you also meet all of these conditions:
213
+
214
+ a) The work must carry prominent notices stating that you modified
215
+ it, and giving a relevant date.
216
+
217
+ b) The work must carry prominent notices stating that it is
218
+ released under this License and any conditions added under section
219
+ 7. This requirement modifies the requirement in section 4 to
220
+ "keep intact all notices".
221
+
222
+ c) You must license the entire work, as a whole, under this
223
+ License to anyone who comes into possession of a copy. This
224
+ License will therefore apply, along with any applicable section 7
225
+ additional terms, to the whole of the work, and all its parts,
226
+ regardless of how they are packaged. This License gives no
227
+ permission to license the work in any other way, but it does not
228
+ invalidate such permission if you have separately received it.
229
+
230
+ d) If the work has interactive user interfaces, each must display
231
+ Appropriate Legal Notices; however, if the Program has interactive
232
+ interfaces that do not display Appropriate Legal Notices, your
233
+ work need not make them do so.
234
+
235
+ A compilation of a covered work with other separate and independent
236
+ works, which are not by their nature extensions of the covered work,
237
+ and which are not combined with it such as to form a larger program,
238
+ in or on a volume of a storage or distribution medium, is called an
239
+ "aggregate" if the compilation and its resulting copyright are not
240
+ used to limit the access or legal rights of the compilation's users
241
+ beyond what the individual works permit. Inclusion of a covered work
242
+ in an aggregate does not cause this License to apply to the other
243
+ parts of the aggregate.
244
+
245
+ 6. Conveying Non-Source Forms.
246
+
247
+ You may convey a covered work in object code form under the terms
248
+ of sections 4 and 5, provided that you also convey the
249
+ machine-readable Corresponding Source under the terms of this License,
250
+ in one of these ways:
251
+
252
+ a) Convey the object code in, or embodied in, a physical product
253
+ (including a physical distribution medium), accompanied by the
254
+ Corresponding Source fixed on a durable physical medium
255
+ customarily used for software interchange.
256
+
257
+ b) Convey the object code in, or embodied in, a physical product
258
+ (including a physical distribution medium), accompanied by a
259
+ written offer, valid for at least three years and valid for as
260
+ long as you offer spare parts or customer support for that product
261
+ model, to give anyone who possesses the object code either (1) a
262
+ copy of the Corresponding Source for all the software in the
263
+ product that is covered by this License, on a durable physical
264
+ medium customarily used for software interchange, for a price no
265
+ more than your reasonable cost of physically performing this
266
+ conveying of source, or (2) access to copy the
267
+ Corresponding Source from a network server at no charge.
268
+
269
+ c) Convey individual copies of the object code with a copy of the
270
+ written offer to provide the Corresponding Source. This
271
+ alternative is allowed only occasionally and noncommercially, and
272
+ only if you received the object code with such an offer, in accord
273
+ with subsection 6b.
274
+
275
+ d) Convey the object code by offering access from a designated
276
+ place (gratis or for a charge), and offer equivalent access to the
277
+ Corresponding Source in the same way through the same place at no
278
+ further charge. You need not require recipients to copy the
279
+ Corresponding Source along with the object code. If the place to
280
+ copy the object code is a network server, the Corresponding Source
281
+ may be on a different server (operated by you or a third party)
282
+ that supports equivalent copying facilities, provided you maintain
283
+ clear directions next to the object code saying where to find the
284
+ Corresponding Source. Regardless of what server hosts the
285
+ Corresponding Source, you remain obligated to ensure that it is
286
+ available for as long as needed to satisfy these requirements.
287
+
288
+ e) Convey the object code using peer-to-peer transmission, provided
289
+ you inform other peers where the object code and Corresponding
290
+ Source of the work are being offered to the general public at no
291
+ charge under subsection 6d.
292
+
293
+ A separable portion of the object code, whose source code is excluded
294
+ from the Corresponding Source as a System Library, need not be
295
+ included in conveying the object code work.
296
+
297
+ A "User Product" is either (1) a "consumer product", which means any
298
+ tangible personal property which is normally used for personal, family,
299
+ or household purposes, or (2) anything designed or sold for incorporation
300
+ into a dwelling. In determining whether a product is a consumer product,
301
+ doubtful cases shall be resolved in favor of coverage. For a particular
302
+ product received by a particular user, "normally used" refers to a
303
+ typical or common use of that class of product, regardless of the status
304
+ of the particular user or of the way in which the particular user
305
+ actually uses, or expects or is expected to use, the product. A product
306
+ is a consumer product regardless of whether the product has substantial
307
+ commercial, industrial or non-consumer uses, unless such uses represent
308
+ the only significant mode of use of the product.
309
+
310
+ "Installation Information" for a User Product means any methods,
311
+ procedures, authorization keys, or other information required to install
312
+ and execute modified versions of a covered work in that User Product from
313
+ a modified version of its Corresponding Source. The information must
314
+ suffice to ensure that the continued functioning of the modified object
315
+ code is in no case prevented or interfered with solely because
316
+ modification has been made.
317
+
318
+ If you convey an object code work under this section in, or with, or
319
+ specifically for use in, a User Product, and the conveying occurs as
320
+ part of a transaction in which the right of possession and use of the
321
+ User Product is transferred to the recipient in perpetuity or for a
322
+ fixed term (regardless of how the transaction is characterized), the
323
+ Corresponding Source conveyed under this section must be accompanied
324
+ by the Installation Information. But this requirement does not apply
325
+ if neither you nor any third party retains the ability to install
326
+ modified object code on the User Product (for example, the work has
327
+ been installed in ROM).
328
+
329
+ The requirement to provide Installation Information does not include a
330
+ requirement to continue to provide support service, warranty, or updates
331
+ for a work that has been modified or installed by the recipient, or for
332
+ the User Product in which it has been modified or installed. Access to a
333
+ network may be denied when the modification itself materially and
334
+ adversely affects the operation of the network or violates the rules and
335
+ protocols for communication across the network.
336
+
337
+ Corresponding Source conveyed, and Installation Information provided,
338
+ in accord with this section must be in a format that is publicly
339
+ documented (and with an implementation available to the public in
340
+ source code form), and must require no special password or key for
341
+ unpacking, reading or copying.
342
+
343
+ 7. Additional Terms.
344
+
345
+ "Additional permissions" are terms that supplement the terms of this
346
+ License by making exceptions from one or more of its conditions.
347
+ Additional permissions that are applicable to the entire Program shall
348
+ be treated as though they were included in this License, to the extent
349
+ that they are valid under applicable law. If additional permissions
350
+ apply only to part of the Program, that part may be used separately
351
+ under those permissions, but the entire Program remains governed by
352
+ this License without regard to the additional permissions.
353
+
354
+ When you convey a copy of a covered work, you may at your option
355
+ remove any additional permissions from that copy, or from any part of
356
+ it. (Additional permissions may be written to require their own
357
+ removal in certain cases when you modify the work.) You may place
358
+ additional permissions on material, added by you to a covered work,
359
+ for which you have or can give appropriate copyright permission.
360
+
361
+ Notwithstanding any other provision of this License, for material you
362
+ add to a covered work, you may (if authorized by the copyright holders of
363
+ that material) supplement the terms of this License with terms:
364
+
365
+ a) Disclaiming warranty or limiting liability differently from the
366
+ terms of sections 15 and 16 of this License; or
367
+
368
+ b) Requiring preservation of specified reasonable legal notices or
369
+ author attributions in that material or in the Appropriate Legal
370
+ Notices displayed by works containing it; or
371
+
372
+ c) Prohibiting misrepresentation of the origin of that material, or
373
+ requiring that modified versions of such material be marked in
374
+ reasonable ways as different from the original version; or
375
+
376
+ d) Limiting the use for publicity purposes of names of licensors or
377
+ authors of the material; or
378
+
379
+ e) Declining to grant rights under trademark law for use of some
380
+ trade names, trademarks, or service marks; or
381
+
382
+ f) Requiring indemnification of licensors and authors of that
383
+ material by anyone who conveys the material (or modified versions of
384
+ it) with contractual assumptions of liability to the recipient, for
385
+ any liability that these contractual assumptions directly impose on
386
+ those licensors and authors.
387
+
388
+ All other non-permissive additional terms are considered "further
389
+ restrictions" within the meaning of section 10. If the Program as you
390
+ received it, or any part of it, contains a notice stating that it is
391
+ governed by this License along with a term that is a further
392
+ restriction, you may remove that term. If a license document contains
393
+ a further restriction but permits relicensing or conveying under this
394
+ License, you may add to a covered work material governed by the terms
395
+ of that license document, provided that the further restriction does
396
+ not survive such relicensing or conveying.
397
+
398
+ If you add terms to a covered work in accord with this section, you
399
+ must place, in the relevant source files, a statement of the
400
+ additional terms that apply to those files, or a notice indicating
401
+ where to find the applicable terms.
402
+
403
+ Additional terms, permissive or non-permissive, may be stated in the
404
+ form of a separately written license, or stated as exceptions;
405
+ the above requirements apply either way.
406
+
407
+ 8. Termination.
408
+
409
+ You may not propagate or modify a covered work except as expressly
410
+ provided under this License. Any attempt otherwise to propagate or
411
+ modify it is void, and will automatically terminate your rights under
412
+ this License (including any patent licenses granted under the third
413
+ paragraph of section 11).
414
+
415
+ However, if you cease all violation of this License, then your
416
+ license from a particular copyright holder is reinstated (a)
417
+ provisionally, unless and until the copyright holder explicitly and
418
+ finally terminates your license, and (b) permanently, if the copyright
419
+ holder fails to notify you of the violation by some reasonable means
420
+ prior to 60 days after the cessation.
421
+
422
+ Moreover, your license from a particular copyright holder is
423
+ reinstated permanently if the copyright holder notifies you of the
424
+ violation by some reasonable means, this is the first time you have
425
+ received notice of violation of this License (for any work) from that
426
+ copyright holder, and you cure the violation prior to 30 days after
427
+ your receipt of the notice.
428
+
429
+ Termination of your rights under this section does not terminate the
430
+ licenses of parties who have received copies or rights from you under
431
+ this License. If your rights have been terminated and not permanently
432
+ reinstated, you do not qualify to receive new licenses for the same
433
+ material under section 10.
434
+
435
+ 9. Acceptance Not Required for Having Copies.
436
+
437
+ You are not required to accept this License in order to receive or
438
+ run a copy of the Program. Ancillary propagation of a covered work
439
+ occurring solely as a consequence of using peer-to-peer transmission
440
+ to receive a copy likewise does not require acceptance. However,
441
+ nothing other than this License grants you permission to propagate or
442
+ modify any covered work. These actions infringe copyright if you do
443
+ not accept this License. Therefore, by modifying or propagating a
444
+ covered work, you indicate your acceptance of this License to do so.
445
+
446
+ 10. Automatic Licensing of Downstream Recipients.
447
+
448
+ Each time you convey a covered work, the recipient automatically
449
+ receives a license from the original licensors, to run, modify and
450
+ propagate that work, subject to this License. You are not responsible
451
+ for enforcing compliance by third parties with this License.
452
+
453
+ An "entity transaction" is a transaction transferring control of an
454
+ organization, or substantially all assets of one, or subdividing an
455
+ organization, or merging organizations. If propagation of a covered
456
+ work results from an entity transaction, each party to that
457
+ transaction who receives a copy of the work also receives whatever
458
+ licenses to the work the party's predecessor in interest had or could
459
+ give under the previous paragraph, plus a right to possession of the
460
+ Corresponding Source of the work from the predecessor in interest, if
461
+ the predecessor has it or can get it with reasonable efforts.
462
+
463
+ You may not impose any further restrictions on the exercise of the
464
+ rights granted or affirmed under this License. For example, you may
465
+ not impose a license fee, royalty, or other charge for exercise of
466
+ rights granted under this License, and you may not initiate litigation
467
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
468
+ any patent claim is infringed by making, using, selling, offering for
469
+ sale, or importing the Program or any portion of it.
470
+
471
+ 11. Patents.
472
+
473
+ A "contributor" is a copyright holder who authorizes use under this
474
+ License of the Program or a work on which the Program is based. The
475
+ work thus licensed is called the contributor's "contributor version".
476
+
477
+ A contributor's "essential patent claims" are all patent claims
478
+ owned or controlled by the contributor, whether already acquired or
479
+ hereafter acquired, that would be infringed by some manner, permitted
480
+ by this License, of making, using, or selling its contributor version,
481
+ but do not include claims that would be infringed only as a
482
+ consequence of further modification of the contributor version. For
483
+ purposes of this definition, "control" includes the right to grant
484
+ patent sublicenses in a manner consistent with the requirements of
485
+ this License.
486
+
487
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
488
+ patent license under the contributor's essential patent claims, to
489
+ make, use, sell, offer for sale, import and otherwise run, modify and
490
+ propagate the contents of its contributor version.
491
+
492
+ In the following three paragraphs, a "patent license" is any express
493
+ agreement or commitment, however denominated, not to enforce a patent
494
+ (such as an express permission to practice a patent or covenant not to
495
+ sue for patent infringement). To "grant" such a patent license to a
496
+ party means to make such an agreement or commitment not to enforce a
497
+ patent against the party.
498
+
499
+ If you convey a covered work, knowingly relying on a patent license,
500
+ and the Corresponding Source of the work is not available for anyone
501
+ to copy, free of charge and under the terms of this License, through a
502
+ publicly available network server or other readily accessible means,
503
+ then you must either (1) cause the Corresponding Source to be so
504
+ available, or (2) arrange to deprive yourself of the benefit of the
505
+ patent license for this particular work, or (3) arrange, in a manner
506
+ consistent with the requirements of this License, to extend the patent
507
+ license to downstream recipients. "Knowingly relying" means you have
508
+ actual knowledge that, but for the patent license, your conveying the
509
+ covered work in a country, or your recipient's use of the covered work
510
+ in a country, would infringe one or more identifiable patents in that
511
+ country that you have reason to believe are valid.
512
+
513
+ If, pursuant to or in connection with a single transaction or
514
+ arrangement, you convey, or propagate by procuring conveyance of, a
515
+ covered work, and grant a patent license to some of the parties
516
+ receiving the covered work authorizing them to use, propagate, modify
517
+ or convey a specific copy of the covered work, then the patent license
518
+ you grant is automatically extended to all recipients of the covered
519
+ work and works based on it.
520
+
521
+ A patent license is "discriminatory" if it does not include within
522
+ the scope of its coverage, prohibits the exercise of, or is
523
+ conditioned on the non-exercise of one or more of the rights that are
524
+ specifically granted under this License. You may not convey a covered
525
+ work if you are a party to an arrangement with a third party that is
526
+ in the business of distributing software, under which you make payment
527
+ to the third party based on the extent of your activity of conveying
528
+ the work, and under which the third party grants, to any of the
529
+ parties who would receive the covered work from you, a discriminatory
530
+ patent license (a) in connection with copies of the covered work
531
+ conveyed by you (or copies made from those copies), or (b) primarily
532
+ for and in connection with specific products or compilations that
533
+ contain the covered work, unless you entered into that arrangement,
534
+ or that patent license was granted, prior to 28 March 2007.
535
+
536
+ Nothing in this License shall be construed as excluding or limiting
537
+ any implied license or other defenses to infringement that may
538
+ otherwise be available to you under applicable patent law.
539
+
540
+ 12. No Surrender of Others' Freedom.
541
+
542
+ If conditions are imposed on you (whether by court order, agreement or
543
+ otherwise) that contradict the conditions of this License, they do not
544
+ excuse you from the conditions of this License. If you cannot convey a
545
+ covered work so as to satisfy simultaneously your obligations under this
546
+ License and any other pertinent obligations, then as a consequence you may
547
+ not convey it at all. For example, if you agree to terms that obligate you
548
+ to collect a royalty for further conveying from those to whom you convey
549
+ the Program, the only way you could satisfy both those terms and this
550
+ License would be to refrain entirely from conveying the Program.
551
+
552
+ 13. Use with the GNU Affero General Public License.
553
+
554
+ Notwithstanding any other provision of this License, you have
555
+ permission to link or combine any covered work with a work licensed
556
+ under version 3 of the GNU Affero General Public License into a single
557
+ combined work, and to convey the resulting work. The terms of this
558
+ License will continue to apply to the part which is the covered work,
559
+ but the special requirements of the GNU Affero General Public License,
560
+ section 13, concerning interaction through a network will apply to the
561
+ combination as such.
562
+
563
+ 14. Revised Versions of this License.
564
+
565
+ The Free Software Foundation may publish revised and/or new versions of
566
+ the GNU General Public License from time to time. Such new versions will
567
+ be similar in spirit to the present version, but may differ in detail to
568
+ address new problems or concerns.
569
+
570
+ Each version is given a distinguishing version number. If the
571
+ Program specifies that a certain numbered version of the GNU General
572
+ Public License "or any later version" applies to it, you have the
573
+ option of following the terms and conditions either of that numbered
574
+ version or of any later version published by the Free Software
575
+ Foundation. If the Program does not specify a version number of the
576
+ GNU General Public License, you may choose any version ever published
577
+ by the Free Software Foundation.
578
+
579
+ If the Program specifies that a proxy can decide which future
580
+ versions of the GNU General Public License can be used, that proxy's
581
+ public statement of acceptance of a version permanently authorizes you
582
+ to choose that version for the Program.
583
+
584
+ Later license versions may give you additional or different
585
+ permissions. However, no additional obligations are imposed on any
586
+ author or copyright holder as a result of your choosing to follow a
587
+ later version.
588
+
589
+ 15. Disclaimer of Warranty.
590
+
591
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599
+
600
+ 16. Limitation of Liability.
601
+
602
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610
+ SUCH DAMAGES.
611
+
612
+ 17. Interpretation of Sections 15 and 16.
613
+
614
+ If the disclaimer of warranty and limitation of liability provided
615
+ above cannot be given local legal effect according to their terms,
616
+ reviewing courts shall apply local law that most closely approximates
617
+ an absolute waiver of all civil liability in connection with the
618
+ Program, unless a warranty or assumption of liability accompanies a
619
+ copy of the Program in return for a fee.
620
+
621
+ END OF TERMS AND CONDITIONS
622
+
623
+ How to Apply These Terms to Your New Programs
624
+
625
+ If you develop a new program, and you want it to be of the greatest
626
+ possible use to the public, the best way to achieve this is to make it
627
+ free software which everyone can redistribute and change under these terms.
628
+
629
+ To do so, attach the following notices to the program. It is safest
630
+ to attach them to the start of each source file to most effectively
631
+ state the exclusion of warranty; and each file should have at least
632
+ the "copyright" line and a pointer to where the full notice is found.
633
+
634
+ <one line to give the program's name and a brief idea of what it does.>
635
+ Copyright (C) <year> <name of author>
636
+
637
+ This program is free software: you can redistribute it and/or modify
638
+ it under the terms of the GNU General Public License as published by
639
+ the Free Software Foundation, either version 3 of the License, or
640
+ (at your option) any later version.
641
+
642
+ This program is distributed in the hope that it will be useful,
643
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
644
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645
+ GNU General Public License for more details.
646
+
647
+ You should have received a copy of the GNU General Public License
648
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
649
+
650
+ Also add information on how to contact you by electronic and paper mail.
651
+
652
+ If the program does terminal interaction, make it output a short
653
+ notice like this when it starts in an interactive mode:
654
+
655
+ <program> Copyright (C) <year> <name of author>
656
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657
+ This is free software, and you are welcome to redistribute it
658
+ under certain conditions; type `show c' for details.
659
+
660
+ The hypothetical commands `show w' and `show c' should show the appropriate
661
+ parts of the General Public License. Of course, your program's commands
662
+ might be different; for a GUI interface, you would use an "about box".
663
+
664
+ You should also get your employer (if you work as a programmer) or school,
665
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
666
+ For more information on this, and how to apply and follow the GNU GPL, see
667
+ <https://www.gnu.org/licenses/>.
668
+
669
+ The GNU General Public License does not permit incorporating your program
670
+ into proprietary programs. If your program is a subroutine library, you
671
+ may consider it more useful to permit linking proprietary applications with
672
+ the library. If this is what you want to do, use the GNU Lesser General
673
+ Public License instead of this License. But first, please read
674
+ <https://www.gnu.org/licenses/why-not-lgpl.html>.
bert_vits2/README.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Bert-VITS2
2
+
3
+ VITS2 backbone with BERT
4
+ ## 成熟的旅行者/开拓者/舰长/博士/sensei/猎魔人/喵喵露/V应该参阅代码自己学习如何训练。
5
+ ### 严禁将此项目用于一切违反《中华人民共和国宪法》,《中华人民共和国刑法》,《中华人民共和国治安管理处罚法》和《中华人民共和国民法典》之用途。
bert_vits2/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from bert_vits2.bert_vits2 import Bert_VITS2
2
+ from bert_vits2 import text
bert_vits2/attentions.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+ from bert_vits2 import commons
6
+ from torch.nn.utils import weight_norm, remove_weight_norm
7
+
8
+
9
class LayerNorm(nn.Module):
    """Layer normalisation over dimension 1 (the channel axis) of the input.

    ``F.layer_norm`` normalises the last dimension, so the input is transposed
    so that dimension 1 becomes last, normalised, and transposed back.
    """

    def __init__(self, channels, eps=1e-5):
        """Create learnable scale (``gamma``) and shift (``beta``) of size ``channels``."""
        super().__init__()
        self.channels = channels
        self.eps = eps

        # Registration order (gamma, then beta) is kept so state-dict layout is unchanged.
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Return ``x`` normalised along dimension 1, with the same shape as ``x``."""
        channels_last = x.transpose(1, -1)
        normalised = F.layer_norm(channels_last, (self.channels,), self.gamma, self.beta, self.eps)
        return normalised.transpose(1, -1)
22
+
23
+
24
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: ``tanh`` of the first channels times ``sigmoid`` of the rest.

    The two inputs are summed; along dimension 1 the first ``n_channels[0]``
    channels go through ``tanh`` and the remaining ones through ``sigmoid``.
    ``n_channels`` is indexed with ``[0]``, so it must be an indexable tensor.
    """
    split = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split, :])
    gate_part = torch.sigmoid(summed[:, split:, :])
    return tanh_part * gate_part
32
+
33
+
34
class Encoder(nn.Module):
    """Stack of self-attention and feed-forward layers with optional conditioning.

    Every layer applies ``MultiHeadAttention`` (self-attention with a relative
    window) and an ``FFN``, each followed by dropout, a residual connection and
    ``LayerNorm``.  When ``gin_channels`` is supplied through ``kwargs`` and is
    non-zero, a linear projection of the conditioning input ``g`` is added to
    the hidden states right before layer ``cond_layer_idx`` (default ``2``).
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4,
                 isflow=True, **kwargs):
        """Build ``n_layers`` attention/FFN layers; ``isflow`` is accepted but unused."""
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        # Defaulting to n_layers means no loop index ever matches, i.e. conditioning is off.
        self.cond_layer_idx = self.n_layers
        if 'gin_channels' in kwargs:
            self.gin_channels = kwargs['gin_channels']
            if self.gin_channels != 0:
                self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
                # vits2 says 3rd block, so idx is 2 by default
                self.cond_layer_idx = kwargs.get('cond_layer_idx', 2)
                assert self.cond_layer_idx < self.n_layers, 'cond_layer_idx should be less than n_layers'

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Sub-modules are created layer by layer so random initialisation order is unchanged.
        for _ in range(self.n_layers):
            attention = MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
                                           window_size=window_size)
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, g=None):
        """Encode ``x`` under ``x_mask``; ``g`` is the optional conditioning input."""
        pair_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2)
        for idx, (attention, norm_attn, feed_forward, norm_ffn) in enumerate(layers):
            if g is not None and idx == self.cond_layer_idx:
                # Project the conditioning to hidden size and inject it once.
                g = self.spk_emb_linear(g.transpose(1, 2)).transpose(1, 2)
                x = (x + g) * x_mask

            attended = self.drop(attention(x, x, pair_mask))
            x = norm_attn(x + attended)

            transformed = self.drop(feed_forward(x, x_mask))
            x = norm_ffn(x + transformed)
        return x * x_mask
90
+
91
+
92
class Decoder(nn.Module):
    """Stack of masked self-attention, encoder-decoder attention and causal FFN layers.

    Each sub-layer is followed by dropout, a residual connection and
    ``LayerNorm``.  The self-attention mask comes from
    ``commons.subsequent_mask``.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
                 proximal_bias=False, proximal_init=True, **kwargs):
        """Build ``n_layers`` decoder layers; extra ``kwargs`` are accepted and ignored."""
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Sub-modules are created layer by layer so random initialisation order is unchanged.
        for _ in range(self.n_layers):
            self_attention = MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
                                                proximal_bias=proximal_bias, proximal_init=proximal_init)
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            cross_attention = MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout,
                               causal=True)
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        causal_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        cross_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(self.self_attn_layers, self.norm_layers_0,
                     self.encdec_attn_layers, self.norm_layers_1,
                     self.ffn_layers, self.norm_layers_2)
        for self_attention, norm_self, cross_attention, norm_cross, feed_forward, norm_ffn in layers:
            x = norm_self(x + self.drop(self_attention(x, x, causal_mask)))
            x = norm_cross(x + self.drop(cross_attention(x, h, cross_mask)))
            x = norm_ffn(x + self.drop(feed_forward(x, x_mask)))
        return x * x_mask
146
+
147
+
148
class MultiHeadAttention(nn.Module):
    """Multi-head attention over ``[b, d, t]`` tensors with optional extras.

    Optional features, all restricted to self-attention (equal query and key
    lengths): learned relative-position embeddings within ``window_size``,
    a proximal bias favouring nearby positions, and a band mask of half-width
    ``block_length``.  The most recent attention weights are kept in
    ``self.attn``.
    """

    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True,
                 block_length=None, proximal_bias=False, proximal_init=False):
        """Create the 1x1 projections and, if requested, the relative embeddings."""
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One embedding table shared by all heads, or one table per head.
            rel_heads = 1 if heads_share else n_heads
            rel_scale = self.k_channels ** -0.5
            rel_shape = (rel_heads, window_size * 2 + 1, self.k_channels)
            self.emb_rel_k = nn.Parameter(torch.randn(*rel_shape) * rel_scale)
            self.emb_rel_v = nn.Parameter(torch.randn(*rel_shape) * rel_scale)

        for projection in (self.conv_q, self.conv_k, self.conv_v):
            nn.init.xavier_uniform_(projection.weight)
        if proximal_init:
            # Start with identical query and key projections.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values) and project the result."""
        queries = self.conv_q(x)
        keys = self.conv_k(c)
        values = self.conv_v(c)

        attended, self.attn = self.attention(queries, keys, values, mask=attn_mask)
        return self.conv_o(attended)

    def attention(self, query, key, value, mask=None):
        """Return ``(output [b, d, t_t], weights [b, n_h, t_t, t_s])``."""
        batch, dim, src_len = key.size()
        tgt_len = query.size(2)

        # reshape [b, d, t] -> [b, n_h, t, d_k]
        query = query.view(batch, self.n_heads, self.k_channels, tgt_len).transpose(2, 3)
        key = key.view(batch, self.n_heads, self.k_channels, src_len).transpose(2, 3)
        value = value.view(batch, self.n_heads, self.k_channels, src_len).transpose(2, 3)

        scaled_query = query / math.sqrt(self.k_channels)
        scores = torch.matmul(scaled_query, key.transpose(-2, -1))
        if self.window_size is not None:
            assert src_len == tgt_len, "Relative attention is only available for self-attention."
            rel_keys = self._get_relative_embeddings(self.emb_rel_k, src_len)
            rel_logits = self._matmul_with_relative_keys(scaled_query, rel_keys)
            scores = scores + self._relative_position_to_absolute_position(rel_logits)
        if self.proximal_bias:
            assert src_len == tgt_len, "Proximal bias is only available for self-attention."
            bias = self._attention_bias_proximal(src_len)
            scores = scores + bias.to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert src_len == tgt_len, "Local attention is only available for self-attention."
                band = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(band == 0, -1e4)

        weights = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        weights = self.drop(weights)
        output = torch.matmul(weights, value)
        if self.window_size is not None:
            rel_weights = self._absolute_position_to_relative_position(weights)
            rel_values = self._get_relative_embeddings(self.emb_rel_v, src_len)
            output = output + self._matmul_with_relative_values(rel_weights, rel_values)
        # [b, n_h, t_t, d_k] -> [b, d, t_t]
        output = output.transpose(2, 3).contiguous().view(batch, dim, tgt_len)
        return output, weights

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Return the ``2 * length - 1`` embeddings needed for a sequence of ``length``.

        The table is zero-padded on both sides when ``length`` exceeds the
        window, and sliced when it is shorter.
        """
        window = self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad = max(length - window, 0)
        start = max(window - length, 0)
        stop = start + 2 * length - 1
        if pad > 0:
            # Pad only the position axis (dimension 1).
            table = F.pad(relative_embeddings, (0, 0, pad, pad))
        else:
            table = relative_embeddings
        return table[:, start:stop]

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, (0, 1))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        flat = x.view([batch, heads, length * 2 * length])
        flat = F.pad(flat, (0, length - 1))

        # Reshape and slice out the padded elements.
        skewed = flat.view([batch, heads, length + 1, 2 * length - 1])
        return skewed[:, :, :length, length - 1:]

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(x, (0, length - 1))
        flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        flat = F.pad(flat, (length, 0))
        skewed = flat.view([batch, heads, length, 2 * length])
        return skewed[:, :, :, 1:]

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        positions = torch.arange(length, dtype=torch.float32)
        distance = torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1))
        return (-torch.log1p(distance)).unsqueeze(0).unsqueeze(0)
303
+
304
+
305
class FFN(nn.Module):
    """Two-layer convolutional feed-forward block with masking.

    The input is padded so the time length is preserved: on the left only
    when ``causal`` is true, otherwise on both sides.  The activation is a
    sigmoid-based GELU approximation when ``activation == "gelu"`` and ReLU
    otherwise.
    """

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None,
                 causal=False):
        """Create the two convolutions, dropout and select the padding function."""
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking before each conv and at the end."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        projected = self.conv_2(self.padding(hidden * x_mask))
        return projected * x_mask

    def _causal_padding(self, x):
        """Left-pad the last dimension by ``kernel_size - 1`` (no-op for kernel size 1)."""
        if self.kernel_size == 1:
            return x
        left = self.kernel_size - 1
        return F.pad(x, (left, 0))

    def _same_padding(self, x):
        """Pad the last dimension on both sides so the convolution keeps the length."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        return F.pad(x, (left, right))
bert_vits2/bert_vits2.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ import numpy as np
4
+ import torch
5
+
6
+ from bert_vits2 import commons
7
+ from bert_vits2 import utils as bert_vits2_utils
8
+ from bert_vits2.clap_wrapper import get_clap_audio_feature, get_clap_text_feature
9
+ from bert_vits2.get_emo import get_emo
10
+ from bert_vits2.models import SynthesizerTrn
11
+ from bert_vits2.models_v230 import SynthesizerTrn as SynthesizerTrn_v230
12
+ from bert_vits2.models_ja_extra import SynthesizerTrn as SynthesizerTrn_ja_extra
13
+ from bert_vits2.text import *
14
+ from bert_vits2.text.cleaner import clean_text
15
+ from bert_vits2.utils import process_legacy_versions
16
+ from contants import config
17
+ from utils import get_hparams_from_file
18
+ from utils.sentence import split_languages
19
+
20
+
21
class Bert_VITS2:
    """Inference wrapper around a Bert-VITS2 synthesizer checkpoint.

    The constructor reads the hyper-parameters, detects the model version and
    derives the per-version settings (symbol table, tone count, BERT model
    names, text/BERT module suffixes).  ``load_model`` builds the network and
    loads the weights; ``infer`` / ``infer_multilang`` synthesise audio.
    """

    def __init__(self, model_path, config, device=torch.device("cpu"), **kwargs):
        """Read hyper-parameters and configure version-specific settings.

        Args:
            model_path: checkpoint path handed to ``load_checkpoint`` later.
            config: path to a hyper-parameter file (when a ``str``) or an
                already loaded hyper-parameter object.
            device: torch device used for inference.
            **kwargs: accepted for interface compatibility; unused here.
        """
        self.model_path = model_path
        self.hps_ms = get_hparams_from_file(config) if isinstance(config, str) else config
        self.n_speakers = getattr(self.hps_ms.data, 'n_speakers', 0)
        # Speaker names ordered by their numeric id.
        self.speakers = [item[0] for item in
                         sorted(list(getattr(self.hps_ms.data, 'spk2id', {'0': 0}).items()), key=lambda x: x[1])]
        self.symbols = symbols
        self.sampling_rate = self.hps_ms.data.sampling_rate

        self.bert_model_names = {}
        self.zh_bert_extra = False
        self.ja_bert_extra = False
        self.ja_bert_dim = 1024
        self.num_tones = num_tones
        self.pinyinPlus = None

        # Compatible with legacy versions.  A missing version must not crash on
        # ``.lower()``; it falls through to the "newest version" default below.
        raw_version = process_legacy_versions(self.hps_ms)
        self.version = raw_version.lower().replace("-", "_") if raw_version else None
        self.text_extra_str_map = {"zh": "", "ja": "", "en": ""}
        self.bert_extra_str_map = {"zh": "", "ja": "", "en": ""}
        self.hps_ms.model.emotion_embedding = None

        self._apply_version_settings()

        if "zh" in self.lang and "zh" not in self.bert_model_names.keys():
            self.bert_model_names.update({"zh": "CHINESE_ROBERTA_WWM_EXT_LARGE"})

        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}

        self.device = device

    def _register_bert(self, lang, model_name):
        """Record ``model_name`` as the BERT model for ``lang`` if the model supports ``lang``."""
        if lang in self.lang:
            self.bert_model_names.update({lang: model_name})

    def _configure_v230(self):
        """Apply the v2.3 settings (also used when no version information is found)."""
        self.version = "2.3"
        self.lang = getattr(self.hps_ms.data, "lang", ["zh", "ja", "en"])
        self.num_tones = num_tones
        self.text_extra_str_map.update({"en": "_v230"})
        self._register_bert("ja", "DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM")
        self._register_bert("en", "DEBERTA_V3_LARGE")

    def _apply_version_settings(self):
        """Set languages, tone count, BERT models and suffix maps for ``self.version``.

        ``self.version`` has already been lower-cased with ``-`` replaced by
        ``_``, so every literal compared here must use underscores.
        """
        version = self.version
        data = self.hps_ms.data
        model = self.hps_ms.model

        if version in ("1.0", "1.0.0", "1.0.1"):
            # chinese-roberta-wwm-ext-large
            self.version = "1.0"
            self.symbols = symbols_legacy
            model.n_layers_trans_flow = 3
            self.lang = getattr(data, "lang", ["zh"])
            self.ja_bert_dim = 768
            self.num_tones = num_tones_v111
            self.text_extra_str_map.update({"zh": "_v100"})

        elif version == "1.1.0_transition":
            # chinese-roberta-wwm-ext-large
            # The normalised form is matched; the hyphenated literal used
            # previously could never be equal after the "-" -> "_" replacement.
            self.version = "1.1.0-transition"
            model.n_layers_trans_flow = 3
            self.lang = getattr(data, "lang", ["zh", "ja"])
            self.ja_bert_dim = 768
            self.num_tones = num_tones_v111
            self._register_bert("ja", "BERT_BASE_JAPANESE_V3")
            self.text_extra_str_map.update({"zh": "_v100", "ja": "_v111"})
            self.bert_extra_str_map.update({"ja": "_v111"})

        elif version in ("1.1", "1.1.0", "1.1.1"):
            # chinese-roberta-wwm-ext-large, bert-base-japanese-v3
            self.version = "1.1"
            model.n_layers_trans_flow = 6
            self.lang = getattr(data, "lang", ["zh", "ja"])
            self.ja_bert_dim = 768
            self.num_tones = num_tones_v111
            self._register_bert("ja", "BERT_BASE_JAPANESE_V3")
            self.text_extra_str_map.update({"zh": "_v100", "ja": "_v111"})
            self.bert_extra_str_map.update({"ja": "_v111"})

        elif version in ("2.0", "2.0.0", "2.0.1", "2.0.2"):
            # chinese-roberta-wwm-ext-large, deberta-v2-large-japanese, deberta-v3-large
            self.version = "2.0"
            model.n_layers_trans_flow = 4
            self.lang = getattr(data, "lang", ["zh", "ja", "en"])
            self.num_tones = num_tones
            self._register_bert("ja", "DEBERTA_V2_LARGE_JAPANESE")
            self._register_bert("en", "DEBERTA_V3_LARGE")
            self.text_extra_str_map.update({"zh": "_v100", "ja": "_v200", "en": "_v200"})
            self.bert_extra_str_map.update({"ja": "_v200", "en": "_v200"})

        elif version in ("2.1", "2.1.0"):
            # chinese-roberta-wwm-ext-large, deberta-v2-large-japanese-char-wwm,
            # deberta-v3-large, wav2vec2-large-robust-12-ft-emotion-msp-dim
            self.version = "2.1"
            model.n_layers_trans_flow = 4
            model.emotion_embedding = 1
            self.lang = getattr(data, "lang", ["zh", "ja", "en"])
            self.num_tones = num_tones
            self._register_bert("ja", "DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM")
            self._register_bert("en", "DEBERTA_V3_LARGE")

        elif version in ("2.2", "2.2.0"):
            # chinese-roberta-wwm-ext-large, deberta-v2-large-japanese-char-wwm,
            # deberta-v3-large, clap-htsat-fused
            self.version = "2.2"
            model.n_layers_trans_flow = 4
            model.emotion_embedding = 2
            self.lang = getattr(data, "lang", ["zh", "ja", "en"])
            self.num_tones = num_tones
            self._register_bert("ja", "DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM")
            self._register_bert("en", "DEBERTA_V3_LARGE")

        elif version in ("2.3", "2.3.0"):
            # chinese-roberta-wwm-ext-large, deberta-v2-large-japanese-char-wwm, deberta-v3-large
            self._configure_v230()

        elif version in ("extra", "zh_clap"):
            # Erlangshen-MegatronBert-1.3B-Chinese, clap-htsat-fused
            self.version = "extra"
            model.emotion_embedding = 2
            model.n_layers_trans_flow = 6
            self.lang = ["zh"]
            self.num_tones = num_tones
            self.zh_bert_extra = True
            self.bert_model_names.update({"zh": "Erlangshen_MegatronBert_1.3B_Chinese"})
            self.bert_extra_str_map.update({"zh": "_extra"})

        elif version in ("extra_fix", "2.4", "2.4.0"):
            # Erlangshen-MegatronBert-1.3B-Chinese, clap-htsat-fused
            self.version = "2.4"
            model.emotion_embedding = 2
            model.n_layers_trans_flow = 6
            self.lang = ["zh"]
            self.num_tones = num_tones
            self.zh_bert_extra = True
            self.bert_model_names.update({"zh": "Erlangshen_MegatronBert_1.3B_Chinese"})
            self.bert_extra_str_map.update({"zh": "_extra"})
            self.text_extra_str_map.update({"zh": "_v240"})

        elif version == "ja_extra":
            # deberta-v2-large-japanese-char-wwm
            self.version = "ja_extra"
            model.emotion_embedding = 2
            model.n_layers_trans_flow = 6
            self.lang = ["ja"]
            self.num_tones = num_tones
            self.ja_bert_extra = True
            self.bert_model_names.update({"ja": "DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM"})
            self.bert_extra_str_map.update({"ja": "_extra"})
            self.text_extra_str_map.update({"ja": "_extra"})

        else:
            logging.debug("Version information not found. Loaded as the newest version: v2.3.")
            self._configure_v230()

    def load_model(self, model_handler):
        """Build the synthesizer for the detected version and load its checkpoint.

        ``model_handler`` is stored and later used to obtain BERT, emotion and
        CLAP models.
        """
        self.model_handler = model_handler

        if self.version in ["2.3", "extra", "2.4"]:
            Synthesizer = SynthesizerTrn_v230
        elif self.version == "ja_extra":
            Synthesizer = SynthesizerTrn_ja_extra
        else:
            Synthesizer = SynthesizerTrn

        if self.version == "2.4":
            self.pinyinPlus = self.model_handler.get_pinyinPlus()
        self.net_g = Synthesizer(
            len(self.symbols),
            self.hps_ms.data.filter_length // 2 + 1,
            self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
            # Same defaulted value the constructor exposes as ``self.n_speakers``.
            n_speakers=self.n_speakers,
            symbols=self.symbols,
            ja_bert_dim=self.ja_bert_dim,
            num_tones=self.num_tones,
            zh_bert_extra=self.zh_bert_extra,
            **self.hps_ms.model).to(self.device)
        _ = self.net_g.eval()
        bert_vits2_utils.load_checkpoint(self.model_path, self.net_g, None, skip_optimizer=True, version=self.version)

    def get_speakers(self):
        """Return the speaker names ordered by speaker id."""
        return self.speakers

    def get_text(self, text, language_str, hps, style_text=None, style_weight=0.7):
        """Convert ``text`` to model inputs for one language.

        Returns:
            ``(zh_bert, ja_bert, en_bert, phone, tone, language)``.  For the
            "extra" models only the matching BERT feature is a tensor and the
            other two are ``None``; otherwise the features of the other
            languages are zero tensors with one column per phone.
        """
        clean_text_lang_str = language_str + self.text_extra_str_map.get(language_str, "")
        bert_feature_lang_str = language_str + self.bert_extra_str_map.get(language_str, "")

        tokenizer, _ = self.model_handler.get_bert_model(self.bert_model_names[language_str])

        norm_text, phone, tone, word2ph = clean_text(text, clean_text_lang_str, tokenizer, self.pinyinPlus)

        phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str, self._symbol_to_id)

        if hps.data.add_blank:
            phone = commons.intersperse(phone, 0)
            tone = commons.intersperse(tone, 0)
            language = commons.intersperse(language, 0)
            # Interspersing doubles the phones per word and adds one extra
            # leading blank, so the word-to-phone counts follow suit.
            for i in range(len(word2ph)):
                word2ph[i] = word2ph[i] * 2
            word2ph[0] += 1

        if style_text == "" or self.zh_bert_extra:
            style_text = None

        bert = self.model_handler.get_bert_feature(norm_text, word2ph, bert_feature_lang_str,
                                                   self.bert_model_names[language_str], style_text, style_weight)
        del word2ph
        assert bert.shape[-1] == len(phone), f"Bert seq len {bert.shape[-1]} != {len(phone)}"

        if self.zh_bert_extra:
            zh_bert = bert
            ja_bert, en_bert = None, None
        elif self.ja_bert_extra:
            ja_bert = bert
            zh_bert, en_bert = None, None
        elif language_str == "zh":
            zh_bert = bert
            ja_bert = torch.zeros(self.ja_bert_dim, len(phone))
            en_bert = torch.zeros(1024, len(phone))
        elif language_str == "ja":
            zh_bert = torch.zeros(1024, len(phone))
            ja_bert = bert
            en_bert = torch.zeros(1024, len(phone))
        elif language_str == "en":
            zh_bert = torch.zeros(1024, len(phone))
            ja_bert = torch.zeros(self.ja_bert_dim, len(phone))
            en_bert = bert
        else:
            zh_bert = torch.zeros(1024, len(phone))
            ja_bert = torch.zeros(self.ja_bert_dim, len(phone))
            en_bert = torch.zeros(1024, len(phone))

        phone = torch.LongTensor(phone)
        tone = torch.LongTensor(tone)
        language = torch.LongTensor(language)
        return zh_bert, ja_bert, en_bert, phone, tone, language

    def _get_emo(self, reference_audio, emotion):
        """Return the emotion input: derived from ``reference_audio`` if given, else ``emotion`` (default 0)."""
        if reference_audio:
            emo = torch.from_numpy(
                get_emo(reference_audio, self.model_handler.emotion_model,
                        self.model_handler.emotion_processor))
        else:
            if emotion is None:
                emotion = 0
            emo = torch.Tensor([emotion])

        return emo

    def _get_clap(self, reference_audio, text_prompt):
        """Return the CLAP embedding of the reference audio (ndarray) or of a text prompt.

        When no audio array is given and ``text_prompt`` is ``None``, the
        configured default prompt is used.
        """
        if isinstance(reference_audio, np.ndarray):
            emo = get_clap_audio_feature(reference_audio, self.model_handler.clap_model,
                                         self.model_handler.clap_processor, self.device)
        else:
            if text_prompt is None:
                text_prompt = config.bert_vits2_config.text_prompt
            emo = get_clap_text_feature(text_prompt, self.model_handler.clap_model,
                                        self.model_handler.clap_processor, self.device)
        emo = torch.squeeze(emo, dim=1).unsqueeze(0)
        return emo

    def _infer(self, id, phones, tones, lang_ids, zh_bert, ja_bert, en_bert, sdp_ratio, noise, noisew, length,
               emo=None):
        """Run the synthesizer on prepared inputs and return the waveform as a NumPy array."""
        with torch.no_grad():
            x_tst = phones.to(self.device).unsqueeze(0)
            tones = tones.to(self.device).unsqueeze(0)
            lang_ids = lang_ids.to(self.device).unsqueeze(0)
            # The "extra" models carry a single BERT feature; the others stay None.
            if self.zh_bert_extra:
                zh_bert = zh_bert.to(self.device).unsqueeze(0)
            elif self.ja_bert_extra:
                ja_bert = ja_bert.to(self.device).unsqueeze(0)
            else:
                zh_bert = zh_bert.to(self.device).unsqueeze(0)
                ja_bert = ja_bert.to(self.device).unsqueeze(0)
                en_bert = en_bert.to(self.device).unsqueeze(0)
            x_tst_lengths = torch.LongTensor([phones.size(0)]).to(self.device)
            speakers = torch.LongTensor([int(id)]).to(self.device)
            audio = self.net_g.infer(x_tst,
                                     x_tst_lengths,
                                     speakers,
                                     tones,
                                     lang_ids,
                                     zh_bert=zh_bert,
                                     ja_bert=ja_bert,
                                     en_bert=en_bert,
                                     sdp_ratio=sdp_ratio,
                                     noise_scale=noise,
                                     noise_scale_w=noisew,
                                     length_scale=length,
                                     emo=emo
                                     )[0][0, 0].data.cpu().float().numpy()

        torch.cuda.empty_cache()
        return audio

    def _get_emotion_input(self, reference_audio, emotion, text_prompt):
        """Return the emotion/CLAP conditioning required by the model, or ``None``."""
        if self.hps_ms.model.emotion_embedding == 1:
            return self._get_emo(reference_audio, emotion).to(self.device).unsqueeze(0)
        if self.hps_ms.model.emotion_embedding == 2:
            return self._get_clap(reference_audio, text_prompt)
        return None

    @staticmethod
    def _trim_edges(tensor, skip_start, skip_end):
        """Drop the first 3 / last 2 entries of the last dimension; ``None`` passes through.

        NOTE(review): 3 and 2 presumably correspond to the boundary and blank
        tokens added around each segment - confirm against the text cleaner.
        """
        if tensor is None:
            return None
        if skip_start:
            tensor = tensor[..., 3:]
        if skip_end:
            tensor = tensor[..., :-2]
        return tensor

    @staticmethod
    def _cat_optional(chunks, dim):
        """Concatenate the non-``None`` tensors in ``chunks``; return ``None`` if there are none."""
        present = [chunk for chunk in chunks if chunk is not None]
        return torch.cat(present, dim=dim) if present else None

    def infer(self, text, id, lang, sdp_ratio, noise, noisew, length, reference_audio=None, emotion=None,
              text_prompt=None, style_text=None, style_weigth=0.7, **kwargs):
        """Synthesise ``text`` in a single language ``lang`` for speaker ``id``."""
        zh_bert, ja_bert, en_bert, phones, tones, lang_ids = self.get_text(text, lang, self.hps_ms, style_text,
                                                                           style_weigth)

        emo = self._get_emotion_input(reference_audio, emotion, text_prompt)

        return self._infer(id, phones, tones, lang_ids, zh_bert, ja_bert, en_bert, sdp_ratio, noise, noisew, length,
                           emo)

    def infer_multilang(self, text, id, lang, sdp_ratio, noise, noisew, length, reference_audio=None, emotion=None,
                        text_prompt=None, style_text=None, style_weigth=0.7, **kwargs):
        """Synthesise mixed-language ``text`` by splitting it into per-language segments.

        Segments are converted separately, trimmed at the joins and
        concatenated before a single synthesizer call.  BERT features that
        ``get_text`` returns as ``None`` (the "extra" models) are kept as
        ``None`` instead of being sliced or concatenated.
        """
        sentences_list = split_languages(text, self.lang, expand_abbreviations=True, expand_hyphens=True)

        emo = self._get_emotion_input(reference_audio, emotion, text_prompt)

        phones, tones, lang_ids, zh_bert, ja_bert, en_bert = [], [], [], [], [], []

        last_idx = len(sentences_list) - 1
        for idx, (_text, lang) in enumerate(sentences_list):
            skip_start = idx != 0
            skip_end = idx != last_idx
            features = self.get_text(_text, lang, self.hps_ms, style_text, style_weigth)
            _zh_bert, _ja_bert, _en_bert, _phones, _tones, _lang_ids = (
                self._trim_edges(feature, skip_start, skip_end) for feature in features)

            phones.append(_phones)
            tones.append(_tones)
            lang_ids.append(_lang_ids)
            zh_bert.append(_zh_bert)
            ja_bert.append(_ja_bert)
            en_bert.append(_en_bert)

        zh_bert = self._cat_optional(zh_bert, dim=1)
        ja_bert = self._cat_optional(ja_bert, dim=1)
        en_bert = self._cat_optional(en_bert, dim=1)
        phones = torch.cat(phones, dim=0)
        tones = torch.cat(tones, dim=0)
        lang_ids = torch.cat(lang_ids, dim=0)

        audio = self._infer(id, phones, tones, lang_ids, zh_bert, ja_bert, en_bert, sdp_ratio, noise,
                            noisew, length, emo)

        return audio
bert_vits2/clap_wrapper.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
def get_clap_audio_feature(audio_data, clap_model, processor, device):
    """Return the transposed CLAP audio embedding of ``audio_data``.

    The audio is run through ``processor`` (48000 sampling rate, PyTorch
    tensors), moved to ``device`` and passed to
    ``clap_model.get_audio_features`` with gradient tracking disabled.
    """
    with torch.no_grad():
        model_inputs = processor(audios=audio_data, return_tensors="pt", sampling_rate=48000)
        model_inputs = model_inputs.to(device)
        embedding = clap_model.get_audio_features(**model_inputs)
        embedding = embedding.float()
    return embedding.T
11
+
12
+
13
def get_clap_text_feature(text, clap_model, processor, device):
    """Return the transposed CLAP text embedding of ``text``.

    The text is run through ``processor`` (PyTorch tensors), moved to
    ``device`` and passed to ``clap_model.get_text_features`` with gradient
    tracking disabled.
    """
    with torch.no_grad():
        model_inputs = processor(text=text, return_tensors="pt")
        model_inputs = model_inputs.to(device)
        embedding = clap_model.get_text_features(**model_inputs)
        embedding = embedding.float()
    return embedding.T
bert_vits2/commons.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+
6
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw ``m.weight`` from N(mean, std) when ``m`` is a convolution.

    A module is treated as a convolution when its class name contains
    "Conv"; any other module is left untouched.
    """
    if "Conv" not in m.__class__.__name__:
        return
    m.weight.data.normal_(mean, std)
10
+
11
+
12
def get_padding(kernel_size, dilation=1):
    """Return half of ``kernel_size * dilation - dilation``, truncated to int."""
    half_span = (kernel_size * dilation - dilation) / 2
    return int(half_span)
14
+
15
+
16
def convert_pad_shape(pad_shape):
    """Flatten a list of pad pairs into one list, taking the pairs in reverse order."""
    flattened = []
    for pair in pad_shape[::-1]:
        flattened.extend(pair)
    return flattened
20
+
21
+
22
def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after the elements of ``lst``."""
    spaced = [item] * (2 * len(lst) + 1)
    # Original elements land on the odd positions; even positions keep ``item``.
    for position, element in enumerate(lst):
        spaced[2 * position + 1] = element
    return spaced
26
+
27
+
28
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)

    Element-wise value computed from the means (``m_*``) and log standard
    deviations (``logs_*``) of P and Q.
    """
    kl = (logs_q - logs_p) - 0.5
    spread = torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)
    inverse_q_variance = torch.exp(-2.0 * logs_q)
    kl += 0.5 * spread * inverse_q_variance
    return kl
35
+
36
+
37
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Squeeze the uniform draw into [0.00001, 0.99999) so neither log sees 0.
    uniform = torch.rand(shape)
    uniform = uniform * 0.99998 + 0.00001
    inner_log = torch.log(uniform)
    return -torch.log(-inner_log)
41
+
42
+
43
def rand_gumbel_like(x):
    """Return Gumbel noise with the size, dtype and device of ``x``."""
    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
46
+
47
+
48
def slice_segments(x, ids_str, segment_size=4):
    """Gather ``segment_size`` consecutive positions along dim 2 of ``x``.

    ``ids_str`` gives one start index per row of dim 0; the same window is
    taken for every entry of dim 1.
    """
    batch, channels = x.size(0), x.size(1)
    starts = ids_str.view(batch, 1, 1).repeat(1, channels, 1)
    offsets = torch.arange(segment_size, device=x.device)
    return torch.gather(x, 2, starts + offsets)
53
+
54
+
55
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Cut one random window of ``segment_size`` positions from dim 2 of ``x``.

    Args:
        x: 3-D tensor; it is unpacked as ``b, d, t = x.size()``.
        x_lengths: usable length per row of dim 0. When omitted, the full
            length ``t`` is used for every row.
        segment_size: number of positions in each window.

    Returns:
        ``(segments, ids_str)``: the gathered windows and the start index
        chosen for each row.
    """
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    # torch.clamp only accepts tensors, so the plain-int default (and any int
    # passed by a caller) must be converted first; tensors pass through.
    x_lengths = torch.as_tensor(x_lengths, device=x.device)
    ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
    ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str
63
+
64
+
65
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape ``(1, channels, length)``.

    The first ``channels // 2`` rows are sines and the next ``channels // 2``
    rows are cosines of the position, scaled by geometrically spaced
    timescales from ``min_timescale`` to ``max_timescale``. When ``channels``
    is odd, one trailing zero row is appended.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # With channels of 2 or 3 there is a single timescale, so the plain
    # ``num_timescales - 1`` divisor would be zero; clamp it to at least 1.
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(
        num_timescales - 1, 1
    )
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Pad one zero row at the end of dim 0 when channels is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
79
+
80
+
81
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Return ``x`` plus the timing signal built for its last two dimensions."""
    _, n_channels, n_steps = x.size()
    timing = get_timing_signal_1d(n_steps, n_channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
85
+
86
+
87
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Return ``x`` concatenated along ``axis`` with the timing signal built for it."""
    _, n_channels, n_steps = x.size()
    timing = get_timing_signal_1d(n_steps, n_channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
91
+
92
+
93
def subsequent_mask(length):
    """Return a ``(1, 1, length, length)`` lower-triangular mask of ones."""
    lower_triangle = torch.tril(torch.ones(length, length))
    return lower_triangle[None, None]
96
+
97
+
98
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # Gated activation: add the two inputs, then multiply the tanh of the
    # first n_channels[0] channels (dim 1) by the sigmoid of the remaining
    # channels.
    # NOTE(review): n_channels is indexed with [0], so it is presumably a
    # one-element integer tensor - confirm against callers.
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts
106
+
107
+
108
def convert_pad_shape(pad_shape):
    """Flatten a list of pad pairs into one list, taking the pairs in reverse order."""
    # Same behaviour as the identically named function defined earlier in
    # this module; being later, this definition is the one that takes effect.
    return [value for pair in pad_shape[::-1] for value in pair]
112
+
113
+
114
def shift_1d(x):
    """Shift ``x`` one step to the right along its last dimension, zero-filling the front."""
    # [1, 0, 0, 0, 0, 0] is convert_pad_shape([[0, 0], [0, 0], [1, 0]]):
    # one pad element in front of the last dimension, nothing elsewhere.
    padded = F.pad(x, [1, 0, 0, 0, 0, 0])
    return padded[:, :, :-1]
117
+
118
+
119
def sequence_mask(length, max_length=None):
    """Return a boolean mask whose row ``i`` is True for positions below ``length[i]``.

    The mask has ``max_length`` columns, or ``length.max()`` columns when
    ``max_length`` is not given.
    """
    width = length.max() if max_length is None else max_length
    positions = torch.arange(width, dtype=length.dtype, device=length.device)
    return positions.unsqueeze(0) < length.unsqueeze(1)
124
+
125
+
126
def generate_path(duration, mask):
    """
    Build an alignment path from cumulative durations.

    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """
    batch, _, n_frames, n_tokens = mask.shape

    # For each token, mark every frame that lies before its cumulative end.
    cumulative = torch.cumsum(duration, -1).view(batch * n_tokens)
    covered = sequence_mask(cumulative, n_frames).to(mask.dtype).view(batch, n_tokens, n_frames)

    # Subtracting the previous token's coverage leaves only each token's own frames.
    previous = F.pad(covered, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    own_frames = covered - previous

    return own_frames.unsqueeze(1).transpose(2, 3) * mask
141
+
142
+
143
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients element-wise in place and return their total norm.

    Args:
        parameters: a tensor or an iterable of tensors; entries whose
            ``grad`` is None are ignored.
        clip_value: gradients are clamped to ``[-clip_value, clip_value]``.
            Pass None to only measure the norm.
        norm_type: order of the norm; ``float("inf")`` gives the largest
            per-parameter infinity norm.

    Returns:
        The total gradient norm, measured before clamping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    # The accumulate-then-root formula is meaningless for the infinity norm
    # (x ** inf, then ** 0), so that case takes the maximum instead.
    use_max = norm_type == float("inf")
    total_norm = 0.0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type).item()
        if use_max:
            total_norm = max(total_norm, param_norm)
        else:
            total_norm += param_norm ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    if not use_max:
        total_norm = total_norm ** (1.0 / norm_type)
    return total_norm
bert_vits2/g2pW/pypinyin_G2pW_bv2/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .g2pw import G2PWPinyin
4
+
5
+ __all__ = ["G2PWPinyin"]
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from pypinyin.constants import RE_HANS
4
+ from pypinyin.core import Pinyin, Style
5
+ from pypinyin.seg.simpleseg import simple_seg
6
+ from pypinyin.converter import UltimateConverter
7
+ from pypinyin.contrib.tone_convert import to_tone
8
+ from .g2pw1.onnx_api import G2PWOnnxConverter
9
+
10
class G2PWPinyin(Pinyin):
    """``Pinyin`` front end whose converter is backed by a G2PW ONNX model."""

    def __init__(self, model_dir="G2PWModel/", model_source=None, num_workers=None, batch_size=None,
                 turnoff_tqdm=True, enable_non_tradional_chinese=True, v_to_u=False,
                 neutral_tone_with_five=False, tone_sandhi=False, **kwargs):
        """Load the G2PW model and wrap it in a ``Converter``.

        ``num_workers``, ``batch_size``, ``turnoff_tqdm`` and ``**kwargs`` are
        accepted but not used by this constructor.
        """
        g2pw = G2PWOnnxConverter(
            model_dir=model_dir,
            style="pinyin",
            model_source=model_source,
            enable_non_tradional_chinese=enable_non_tradional_chinese,
        )
        self._g2pw = g2pw
        self._converter = Converter(
            g2pw,
            v_to_u=v_to_u,
            neutral_tone_with_five=neutral_tone_with_five,
            tone_sandhi=tone_sandhi,
        )

    def get_seg(self, **kwargs):
        """Return pypinyin's ``simple_seg`` as the segmentation function."""
        return simple_seg
39
+
40
+
41
class Converter(UltimateConverter):
    """``UltimateConverter`` that asks a G2PW instance for pinyin first."""

    def __init__(self, g2pw_instance, v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False,
                 **kwargs):
        """Forward the pypinyin options to the base class and keep the G2PW callable."""
        super().__init__(
            v_to_u=v_to_u,
            neutral_tone_with_five=neutral_tone_with_five,
            tone_sandhi=tone_sandhi,
            **kwargs
        )
        self._g2pw = g2pw_instance

    def convert(self, words, style, heteronym, errors, strict, **kwargs):
        """Convert ``words`` to a list of pinyin lists, without duplicates or empty items."""
        if RE_HANS.match(words):
            candidates = self._to_pinyin(
                words, style=style, heteronym=heteronym, errors=errors, strict=strict
            )
            adjusted = self.post_pinyin(words, heteronym, candidates)
            if adjusted is not None:
                candidates = adjusted
            candidates = self.convert_styles(candidates, words, style, heteronym, errors, strict)
        else:
            # Text not matched by RE_HANS goes through the no-pinyin handler.
            candidates = []
            fallback = self.handle_nopinyin(
                words, style=style, errors=errors, heteronym=heteronym, strict=strict
            )
            if fallback:
                candidates.extend(fallback)

        return _remove_dup_and_empty(candidates)

    def _to_pinyin(self, han, style, heteronym, errors, strict, **kwargs):
        """Return one pinyin list per character of ``han``, preferring G2PW predictions."""
        predictions = self._g2pw(han)

        # Nothing came back from g2pw: use pypinyin's original logic instead.
        if not predictions:
            return super().convert(han, Style.TONE, heteronym, errors, strict, **kwargs)

        result = []
        for index, predicted in enumerate(predictions[0]):
            if predicted is not None:
                result.append([to_tone(predicted)])
                continue
            # Character not supported by g2pw: use pypinyin's original logic.
            result.extend(
                super().convert(han[index], Style.TONE, heteronym, errors, strict, **kwargs)
            )
        return result
100
+
101
+
102
+ def _remove_dup_items(lst, remove_empty=False):
103
+ new_lst = []
104
+ for item in lst:
105
+ if remove_empty and not item:
106
+ continue
107
+ if item not in new_lst:
108
+ new_lst.append(item)
109
+ return new_lst
110
+
111
+
112
+ def _remove_dup_and_empty(lst_list):
113
+ new_lst_list = []
114
+ for lst in lst_list:
115
+ lst = _remove_dup_items(lst, remove_empty=True)
116
+ if lst:
117
+ new_lst_list.append(lst)
118
+ else:
119
+ new_lst_list.append([""])
120
+
121
+ return new_lst_list
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/__init__.py ADDED
File without changes
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/bopomofo_to_pinyin_wo_tune_dict.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"ㄌㄧㄥ": "ling", "ㄩㄢ": "yuan", "ㄒㄧㄥ": "xing", "ㄑㄧㄡ": "qiu", "ㄊㄧㄢ": "tian", "ㄎㄨㄚ": "kua", "ㄨ": "wu", "ㄧㄣ": "yin", "ㄧ": "yi", "ㄒㄧㄝ": "xie", "ㄔㄡ": "chou", "ㄋㄨㄛ": "nuo", "ㄉㄢ": "dan", "ㄒㄩ": "xu", "ㄒㄩㄥ": "xiong", "ㄌㄧㄡ": "liu", "ㄌㄧㄣ": "lin", "ㄒㄧㄤ": "xiang", "ㄩㄥ": "yong", "ㄒㄧㄣ": "xin", "ㄓㄣ": "zhen", "ㄉㄞ": "dai", "ㄆㄢ": "pan", "ㄖㄨ": "ru", "ㄇㄚ": "ma", "ㄑㄧㄢ": "qian", "ㄘ": "ci", "ㄓㄨㄥ": "zhong", "ㄋㄟ": "nei", "ㄔㄥ": "cheng", "ㄈㄥ": "feng", "ㄓㄨㄛ": "zhuo", "ㄈㄤ": "fang", "ㄠ": "ao", "ㄗㄨㄛ": "zuo", "ㄓㄡ": "zhou", "ㄉㄨㄥ": "dong", "ㄙㄨ": "su", "ㄑㄩㄥ": "qiong", "ㄎㄨㄤ": "kuang", "ㄨㄤ": "wang", "ㄌㄟ": "lei", "ㄋㄠ": "nao", "ㄓㄨ": "zhu", "ㄕㄨ": "shu", "ㄕㄣ": "shen", "ㄐㄧㄝ": "jie", "ㄉㄧㄝ": "die", "ㄔ": "chi", "ㄌㄨㄥ": "long", "ㄧㄥ": "ying", "ㄅㄥ": "beng", "ㄌㄢ": "lan", "ㄇㄧㄠ": "miao", "ㄌㄧ": "li", "ㄐㄧ": "ji", "ㄩ": "yu", "ㄌㄨㄛ": "luo", "ㄔㄞ": "chai", "ㄏㄨㄣ": "hun", "ㄏㄨㄟ": "hui", "ㄖㄠ": "rao", "ㄏㄢ": "han", "ㄒㄧ": "xi", "ㄊㄞ": "tai", "ㄧㄠ": "yao", "ㄐㄩㄣ": "jun", "ㄌㄩㄝ": "lve", "ㄊㄤ": "tang", "ㄓㄠ": "zhao", "ㄓㄞ": "zhai", "ㄓㄚ": "zha", "ㄦ": "er", "ㄖㄢ": "ran", "ㄑㄧ": "qi", "ㄙㄜ": "se", "ㄙ": "si", "ㄙㄚ": "sa", "ㄎㄨㄟ": "kui", "ㄆㄨ": "pu", "ㄊㄚ": "ta", "ㄉㄨ": "du", "ㄊㄨ": "tu", "ㄧㄤ": "yang", "ㄡ": "ou", "ㄇㄧㄢ": "mian", "ㄨㄣ": "wen", "ㄉㄧㄠ": "diao", "ㄇㄧㄝ": "mie", "ㄨㄚ": "wa", "ㄋㄧㄠ": "niao", "ㄧㄡ": "you", "ㄔㄜ": "che", "ㄑㄩㄢ": "quan", "ㄘㄞ": "cai", "ㄌㄧㄤ": "liang", "ㄍㄨ": "gu", "ㄇㄠ": "mao", "ㄍㄨㄚ": "gua", "ㄙㄨㄟ": "sui", "ㄇㄢ": "man", "ㄕ": "shi", "ㄎㄡ": "kou", "ㄊㄧㄥ": "ting", "ㄅㄧㄥ": "bing", "ㄏㄨㄛ": "huo", "ㄍㄨㄥ": "gong", "ㄑㄧㄣ": "qin", "ㄐㄩㄥ": "jiong", "ㄌㄨ": "lu", "ㄋㄢ": "nan", "ㄅㄧ": "bi", "ㄑㄧㄚ": "qia", "ㄆㄧ": "pi", "ㄉㄧㄢ": "dian", "ㄈㄨ": "fu", "ㄍㄜ": "ge", "ㄅㄞ": "bai", "ㄍㄢ": "gan", "ㄒㄩㄢ": "xuan", "ㄌㄤ": "lang", "ㄕㄜ": "she", "ㄏㄨㄚ": "hua", "ㄊㄡ": "tou", "ㄆㄧㄢ": "pian", "ㄉㄧ": "di", "ㄖㄨㄢ": "ruan", "ㄜ": "e", "ㄑㄧㄝ": "qie", "ㄉㄡ": "dou", "ㄖㄨㄟ": "rui", "ㄘㄨㄟ": "cui", "ㄐㄧㄢ": "jian", "ㄔㄨㄥ": "chong", "ㄉㄥ": "deng", "ㄐㄩㄝ": "jue", "ㄒㄩㄝ": "xue", "ㄒㄧㄠ": "xiao", "ㄗㄢ": "zan", "ㄓㄢ": "zhan", "ㄗㄡ": "zou", "ㄘㄡ": "cou", "ㄔㄨㄚ": "chua", "ㄈㄟ": "fei", "ㄅㄟ": "bei", "ㄔㄨ": "chu", "ㄅㄚ": "ba", "ㄎㄨㄞ": 
"kuai", "ㄒㄧㄚ": "xia", "ㄏㄜ": "he", "ㄅㄧㄝ": "bie", "ㄌㄩ": "lv", "ㄙㄨㄢ": "suan", "ㄏㄥ": "heng", "ㄍㄨㄟ": "gui", "ㄌㄡ": "lou", "ㄊㄧ": "ti", "ㄌㄜ": "le", "ㄙㄨㄣ": "sun", "ㄒㄧㄢ": "xian", "ㄑㄩㄝ": "que", "ㄓ": "zhi", "ㄐㄧㄚ": "jia", "ㄏㄨ": "hu", "ㄌㄚ": "la", "ㄎㄜ": "ke", "ㄞ": "ai", "ㄨㄟ": "wei", "ㄏㄨㄢ": "huan", "ㄕㄨㄚ": "shua", "ㄕㄨㄤ": "shuang", "ㄍㄞ": "gai", "ㄏㄞ": "hai", "ㄧㄢ": "yan", "ㄈㄢ": "fan", "ㄆㄤ": "pang", "ㄙㄨㄥ": "song", "ㄋㄜ": "ne", "ㄔㄣ": "chen", "ㄍㄨㄛ": "guo", "ㄣ": "en", "ㄋㄍ": "ng", "ㄆㄚ": "pa", "ㄈㄚ": "fa", "ㄆㄡ": "pou", "ㄏㄡ": "hou", "ㄑㄩ": "qu", "ㄒㄩㄣ": "xun", "ㄋㄧㄝ": "nie", "ㄏㄨㄥ": "hong", "ㄊㄨㄣ": "tun", "ㄨㄞ": "wai", "ㄕㄡ": "shou", "ㄧㄝ": "ye", "ㄐㄩ": "ju", "ㄙㄡ": "sou", "ㄌㄨㄣ": "lun", "ㄋㄧㄚ": "nia", "ㄆㄣ": "pen", "ㄈㄣ": "fen", "ㄔㄨㄣ": "chun", "ㄋㄧㄡ": "niu", "ㄖㄡ": "rou", "ㄉㄨㄛ": "duo", "ㄗㄜ": "ze", "ㄕㄥ": "sheng", "ㄎㄨ": "ku", "ㄧㄚ": "ya", "ㄓㄨㄟ": "zhui", "ㄍㄡ": "gou", "ㄅㄛ": "bo", "ㄋㄚ": "na", "ㄒㄧㄡ": "xiu", "ㄘㄨ": "cu", "ㄎㄨㄛ": "kuo", "ㄌㄠ": "lao", "ㄘㄨㄥ": "cong", "ㄉㄚ": "da", "ㄆㄛ": "po", "ㄙㄞ": "sai", "ㄌㄥ": "leng", "ㄖㄨㄥ": "rong", "ㄋㄧ": "ni", "ㄆㄠ": "pao", "ㄎㄢ": "kan", "ㄨㄥ": "weng", "ㄨㄢ": "wan", "ㄏㄠ": "hao", "ㄐㄧㄥ": "jing", "ㄊㄢ": "tan", "ㄅㄨ": "bu", "ㄗㄤ": "zang", "ㄐㄧㄡ": "jiu", "ㄇㄟ": "mei", "ㄇㄨ": "mu", "ㄉㄨㄟ": "dui", "ㄅㄤ": "bang", "ㄅㄠ": "bao", "ㄔㄤ": "chang", "ㄓㄤ": "zhang", "ㄗㄨㄥ": "zong", "ㄍㄨㄣ": "gun", "ㄌㄧㄠ": "liao", "ㄔㄢ": "chan", "ㄓㄜ": "zhe", "ㄇㄥ": "meng", "ㄑㄧㄠ": "qiao", "ㄋㄤ": "nang", "ㄩㄣ": "yun", "ㄎㄞ": "kai", "ㄍㄠ": "gao", "ㄊㄠ": "tao", "ㄕㄢ": "shan", "ㄌㄞ": "lai", "ㄅㄢ": "ban", "ㄎㄨㄥ": "kong", "ㄔㄨㄛ": "chuo", "ㄋㄨ": "nu", "ㄆㄟ": "pei", "ㄆㄥ": "peng", "ㄘㄢ": "can", "ㄙㄨㄛ": "suo", "ㄊㄨㄥ": "tong", "ㄑㄧㄤ": "qiang", "ㄙㄠ": "sao", "ㄓㄨㄢ": "zhuan", "ㄢ": "an", "ㄔㄚ": "cha", "ㄕㄚ": "sha", "ㄌㄧㄢ": "lian", "ㄇㄧ": "mi", "ㄋㄡ": "nou", "ㄘㄠ": "cao", "ㄙㄣ": "sen", "ㄋㄣ": "nen", "ㄋㄧㄢ": "nian", "ㄇㄞ": "mai", "ㄩㄝ": "yue", "ㄋㄞ": "nai", "ㄏㄨㄞ": "huai", "ㄗ": "zi", "ㄌㄨㄢ": "luan", "ㄉ��ㄥ": "ding", "ㄇㄤ": "mang", "ㄋㄧㄥ": "ning", "ㄇㄧㄥ": "ming", "ㄗㄨㄟ": "zui", "ㄎㄤ": "kang", "ㄉㄜ": "de", "ㄅㄧㄢ": "bian", "ㄐㄧㄣ": "jin", "ㄔㄨㄟ": "chui", "ㄊㄨㄟ": "tui", "ㄗㄚ": "za", "ㄘㄣ": "cen", "ㄇㄧㄣ": 
"min", "ㄏㄨㄤ": "huang", "ㄗㄨ": "zu", "ㄘㄨㄛ": "cuo", "ㄊㄨㄛ": "tuo", "ㄑㄩㄣ": "qun", "ㄅㄧㄣ": "bin", "ㄊㄧㄠ": "tiao", "ㄍㄤ": "gang", "ㄉㄨㄢ": "duan", "ㄅㄧㄠ": "biao", "ㄉㄠ": "dao", "ㄖㄨㄣ": "run", "ㄐㄧㄠ": "jiao", "ㄨㄛ": "wo", "ㄘㄨㄢ": "cuan", "ㄖㄣ": "ren", "ㄇㄣ": "men", "ㄓㄨㄣ": "zhun", "ㄎㄨㄣ": "kun", "ㄔㄨㄤ": "chuang", "ㄗㄠ": "zao", "ㄓㄥ": "zheng", "ㄆㄧㄣ": "pin", "ㄅㄣ": "ben", "ㄐㄧㄤ": "jiang", "ㄐㄩㄢ": "juan", "ㄘㄥ": "ceng", "ㄏㄤ": "hang", "ㄋㄧㄣ": "nin", "ㄌㄧㄝ": "lie", "ㄍㄨㄤ": "guang", "ㄙㄢ": "san", "ㄊㄜ": "te", "ㄕㄨㄣ": "shun", "ㄕㄨㄟ": "shui", "ㄔㄠ": "chao", "ㄘㄜ": "ce", "ㄍㄨㄞ": "guai", "ㄎㄥ": "keng", "ㄕㄞ": "shai", "ㄉㄣ": "den", "ㄊㄨㄢ": "tuan", "ㄆㄧㄠ": "piao", "ㄑㄧㄥ": "qing", "ㄍㄥ": "geng", "ㄔㄨㄞ": "chuai", "ㄕㄠ": "shao", "ㄍㄣ": "gen", "ㄋㄨㄢ": "nuan", "ㄖㄥ": "reng", "ㄇㄡ": "mou", "ㄆㄞ": "pai", "ㄤ": "ang", "ㄎㄚ": "ka", "ㄍㄨㄢ": "guan", "ㄕㄨㄛ": "shuo", "ㄏㄣ": "hen", "ㄔㄨㄢ": "chuan", "ㄎㄨㄢ": "kuan", "ㄏㄟ": "hei", "ㄇㄛ": "mo", "ㄗㄞ": "zai", "ㄋㄥ": "neng", "ㄕㄨㄞ": "shuai", "ㄖㄜ": "re", "ㄋㄩ": "nv", "ㄆㄧㄥ": "ping", "ㄘㄤ": "cang", "ㄋㄨㄥ": "nong", "ㄎㄠ": "kao", "ㄗㄨㄢ": "zuan", "ㄎㄣ": "ken", "ㄍㄚ": "ga", "ㄗㄣ": "zen", "ㄉㄤ": "dang", "ㄗㄥ": "zeng", "ㄉㄨㄣ": "dun", "ㄘㄚ": "ca", "ㄖㄤ": "rang", "ㄘㄨㄣ": "cun", "ㄖㄨㄛ": "ruo", "ㄊㄧㄝ": "tie", "ㄊㄥ": "teng", "ㄙㄥ": "seng", "ㄖ": "ri", "ㄗㄨㄣ": "zun", "ㄋㄧㄤ": "niang", "ㄋㄩㄝ": "nve", "ㄙㄤ": "sang", "ㄓㄨㄤ": "zhuang", "ㄕㄤ": "shang", "ㄆㄧㄝ": "pie", "ㄕㄨㄢ": "shuan", "ㄈㄡ": "fou", "ㄉㄧㄡ": "diu", "ㄇㄜ": "me", "ㄈㄛ": "fo", "ㄌㄧㄚ": "lia", "ㄎㄟ": "kei", "ㄏㄚ": "ha", "ㄚ": "a", "ㄌㄛ": "lo", "ㄧㄛ": "yo", "ㄛ": "o", "ㄏㄋㄍ": "hng", "ㄋ": "n", "ㄌㄣ": "len", "ㄉㄧㄚ": "dia", "ㄇㄧㄡ": "miu", "ㄉㄟ": "dei", "ㄏㄇ": "hm", "ㄋㄨㄣ": "nun", "ㄓㄨㄞ": "zhuai", "ㄊㄟ": "tei", "ㄗㄟ": "zei", "ㄓㄨㄚ": "zhua", "ㄖㄨㄚ": "rua", "ê": "ê", "ㄟ": "ei", "ㄍㄟ": "gei", "ㄈㄧㄠ": "fiao", "ㄕㄟ": "shei", "ㄓㄟ": "zhei", "ㄥ": "eng", "ㄘㄟ": "cei", "ㄉㄧㄣ": "din", "ㄅㄧㄤ": "biang", "ㄧㄞ": "yai"}
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/char_bopomofo_dict.json ADDED
The diff for this file is too large to render. See raw diff
 
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/char_convert.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters.
16
+ """
17
+ simplified_charcters = "制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁���稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩
盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣
纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢��尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕
昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎���蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬
黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓
��鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤"
18
+
19
+ traditional_characters = "制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩
盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨��倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅
瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢���鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬
昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙��舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣
窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒���踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰
鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤"
20
+
21
# The two tables are position-aligned: the i-th simplified character pairs with
# the i-th traditional character, so both must have the same length. (The check
# used to compare the simplified table with itself and therefore never failed.)
assert len(simplified_charcters) == len(traditional_characters)

# Lookup tables in both directions. When a character occurs more than once in
# a table, the pairing that appears last wins.
s2t_dict = {}
t2s_dict = {}
for i, item in enumerate(simplified_charcters):
    s2t_dict[item] = traditional_characters[i]
    t2s_dict[traditional_characters[i]] = item
28
+
29
+
30
def tranditional_to_simplified(text: str) -> str:
    """Convert the traditional Chinese characters in ``text`` to simplified.

    Each character is looked up in the module-level ``t2s_dict`` table;
    characters without an entry are passed through unchanged.
    """
    return "".join(t2s_dict.get(char, char) for char in text)
32
+
33
+
34
def simplified_to_traditional(text: str) -> str:
    """Convert the simplified Chinese characters in ``text`` to traditional.

    Each character is looked up in the module-level ``s2t_dict`` table;
    characters without an entry are passed through unchanged.
    """
    return "".join(s2t_dict.get(char, char) for char in text)
36
+
37
+
38
if __name__ == "__main__":
    # Demo: print a traditional-Chinese sample, its simplified form, and the
    # result of converting that simplified form back to traditional.
    sample = "一般是指存取一個應用程式啟動時始終顯示在網站或網頁瀏覽器中的一個或多個初始網頁等畫面存在的站點"
    print(sample)
    simplified = tranditional_to_simplified(sample)
    print(simplified)
    print(simplified_to_traditional(simplified))
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/dataset.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Credits
16
+ This code is modified from https://github.com/GitYCC/g2pW
17
+ """
18
+ from typing import Dict
19
+ from typing import List
20
+ from typing import Tuple
21
+
22
+ import numpy as np
23
+
24
+ from .utils import tokenize_and_map
25
+
26
+ ANCHOR_CHAR = "▁"
27
+
28
+
29
def prepare_onnx_input(
    tokenizer,
    labels: List[str],
    char2phonemes: Dict[str, List[int]],
    chars: List[str],
    texts: List[str],
    query_ids: List[int],
    use_mask: bool = False,
    window_size: int = None,
    max_len: int = 512,
) -> Dict[str, np.array]:
    """Build the batch of model inputs for a list of (text, query position) pairs.

    Args:
        tokenizer: Object providing ``tokenize`` (via ``tokenize_and_map``) and
            ``convert_tokens_to_ids``.
        labels: All phoneme labels; its length fixes the phoneme-mask width.
        char2phonemes: For each character, the indices into ``labels`` that
            are allowed for it (only consulted when ``use_mask`` is true).
        chars: Characters known to the model; the position of the query
            character in this list becomes its ``char_ids`` entry.
        texts: One text per query.
        query_ids: Index of the query character inside the matching text.
        use_mask: Restrict the phoneme mask to ``char2phonemes[query_char]``;
            otherwise every label is enabled.
        window_size: When truthy, each text is first cut to a window around
            its query character (see ``_truncate_texts``).
        max_len: Maximum token count including the ``[CLS]``/``[SEP]`` markers.

    Returns:
        A dict with the keys ``input_ids``, ``token_type_ids``,
        ``attention_masks``, ``phoneme_masks``, ``char_ids`` and
        ``position_ids``. If tokenizing any text raises, a warning is printed
        and an empty dict is returned for the whole batch.
    """
    if window_size is not None:
        truncated_texts, truncated_query_ids = _truncate_texts(
            window_size=window_size, texts=texts, query_ids=query_ids
        )

    # The windowed inputs are used only for a truthy window size; a window
    # size of 0 falls back to the untouched inputs.
    if window_size:
        source_texts, source_query_ids = truncated_texts, truncated_query_ids
    else:
        source_texts, source_query_ids = texts, query_ids

    batch_input_ids = []
    batch_token_type_ids = []
    batch_attention_masks = []
    batch_phoneme_masks = []
    batch_char_ids = []
    batch_position_ids = []

    for idx in range(len(texts)):
        text = source_texts[idx].lower()
        query_id = source_query_ids[idx]

        try:
            tokens, text2token, token2text = tokenize_and_map(
                tokenizer=tokenizer, text=text
            )
        except Exception:
            print(f'warning: text "{text}" is invalid')
            return {}

        # Keep the token sequence within max_len, centred on the query char.
        text, query_id, tokens, text2token, token2text = _truncate(
            max_len=max_len,
            text=text,
            query_id=query_id,
            tokens=tokens,
            text2token=text2token,
            token2text=token2text,
        )

        processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
        sequence_shape = (len(processed_tokens),)

        input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
        token_type_id = list(np.zeros(sequence_shape, dtype=int))
        attention_mask = list(np.ones(sequence_shape, dtype=int))

        query_char = text[query_id]
        if use_mask:
            phoneme_mask = [
                1 if i in char2phonemes[query_char] else 0
                for i in range(len(labels))
            ]
        else:
            phoneme_mask = [1] * len(labels)
        char_id = chars.index(query_char)
        # +1 because the [CLS] token occupies the first position.
        position_id = text2token[query_id] + 1

        batch_input_ids.append(input_id)
        batch_token_type_ids.append(token_type_id)
        batch_attention_masks.append(attention_mask)
        batch_phoneme_masks.append(phoneme_mask)
        batch_char_ids.append(char_id)
        batch_position_ids.append(position_id)

    return {
        "input_ids": np.array(batch_input_ids).astype(np.int64),
        "token_type_ids": np.array(batch_token_type_ids).astype(np.int64),
        "attention_masks": np.array(batch_attention_masks).astype(np.int64),
        "phoneme_masks": np.array(batch_phoneme_masks).astype(np.float32),
        "char_ids": np.array(batch_char_ids).astype(np.int64),
        "position_ids": np.array(batch_position_ids).astype(np.int64),
    }
103
+
104
+
105
+ def _truncate_texts(
106
+ window_size: int, texts: List[str], query_ids: List[int]
107
+ ) -> Tuple[List[str], List[int]]:
108
+ truncated_texts = []
109
+ truncated_query_ids = []
110
+ for text, query_id in zip(texts, query_ids):
111
+ start = max(0, query_id - window_size // 2)
112
+ end = min(len(text), query_id + window_size // 2)
113
+ truncated_text = text[start:end]
114
+ truncated_texts.append(truncated_text)
115
+
116
+ truncated_query_id = query_id - start
117
+ truncated_query_ids.append(truncated_query_id)
118
+ return truncated_texts, truncated_query_ids
119
+
120
+
121
+ def _truncate(
122
+ max_len: int,
123
+ text: str,
124
+ query_id: int,
125
+ tokens: List[str],
126
+ text2token: List[int],
127
+ token2text: List[Tuple[int]],
128
+ ):
129
+ truncate_len = max_len - 2
130
+ if len(tokens) <= truncate_len:
131
+ return (text, query_id, tokens, text2token, token2text)
132
+
133
+ token_position = text2token[query_id]
134
+
135
+ token_start = token_position - truncate_len // 2
136
+ token_end = token_start + truncate_len
137
+ font_exceed_dist = -token_start
138
+ back_exceed_dist = token_end - len(tokens)
139
+ if font_exceed_dist > 0:
140
+ token_start += font_exceed_dist
141
+ token_end += font_exceed_dist
142
+ elif back_exceed_dist > 0:
143
+ token_start -= back_exceed_dist
144
+ token_end -= back_exceed_dist
145
+
146
+ start = token2text[token_start][0]
147
+ end = token2text[token_end - 1][1]
148
+
149
+ return (
150
+ text[start:end],
151
+ query_id - start,
152
+ tokens[token_start:token_end],
153
+ [i - token_start if i is not None else None for i in text2token[start:end]],
154
+ [(s - start, e - start) for s, e in token2text[token_start:token_end]],
155
+ )
156
+
157
+
158
def get_phoneme_labels(
    polyphonic_chars: List[List[str]],
) -> Tuple[List[str], Dict[str, List[int]]]:
    """Build the phoneme label list and the per-character label indices.

    Args:
        polyphonic_chars: ``(char, phoneme)`` pairs.

    Returns:
        ``labels`` — the distinct phonemes in sorted order — and
        ``char2phonemes``, mapping each character to the indices in ``labels``
        of its phonemes, in the order the pairs were given.
    """
    labels = sorted({phoneme for _, phoneme in polyphonic_chars})
    # Index the labels once instead of calling labels.index() for every pair.
    label_index = {label: idx for idx, label in enumerate(labels)}
    char2phonemes = {}
    for char, phoneme in polyphonic_chars:
        char2phonemes.setdefault(char, []).append(label_index[phoneme])
    return labels, char2phonemes
168
+
169
+
170
def get_char_phoneme_labels(
    polyphonic_chars: List[List[str]],
) -> Tuple[List[str], Dict[str, List[int]]]:
    """Build character-specific phoneme labels and the per-character indices.

    Unlike ``get_phoneme_labels``, every label is the string
    ``"<char> <phoneme>"``, so the same phoneme under two characters yields
    two labels.

    Args:
        polyphonic_chars: ``(char, phoneme)`` pairs.

    Returns:
        ``labels`` — the distinct ``"<char> <phoneme>"`` strings in sorted
        order — and ``char2phonemes``, mapping each character to the indices
        in ``labels`` of its entries, in the order the pairs were given.
    """
    labels = sorted({f"{char} {phoneme}" for char, phoneme in polyphonic_chars})
    # Index the labels once instead of calling labels.index() for every pair.
    label_index = {label: idx for idx, label in enumerate(labels)}
    char2phonemes = {}
    for char, phoneme in polyphonic_chars:
        char2phonemes.setdefault(char, []).append(label_index[f"{char} {phoneme}"])
    return labels, char2phonemes
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/onnx_api.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Credits
16
+ This code is modified from https://github.com/GitYCC/g2pW
17
+ """
18
+ import json
19
+ import os
20
+ from typing import Any
21
+ from typing import Dict
22
+ from typing import List
23
+ from typing import Tuple
24
+ import numpy as np
25
+ import onnxruntime
26
+ from opencc import OpenCC
27
+ from transformers import BertTokenizer
28
+ from pypinyin import pinyin
29
+ from pypinyin import Style
30
+
31
+ from .dataset import get_char_phoneme_labels
32
+ from .dataset import get_phoneme_labels
33
+ from .dataset import prepare_onnx_input
34
+ from .utils import load_config
35
+ from .char_convert import tranditional_to_simplified
36
+
37
+ model_version = "1.1"
38
+
39
+
40
def predict(
    session, onnx_input: Dict[str, Any], labels: List[str]
) -> Tuple[List[str], List[float]]:
    """Run the model on a prepared batch and pick the best label per row.

    Args:
        session: Object with a ``run(output_names, input_feed)`` method whose
            first output holds one score row per query.
        onnx_input: Batch as produced by ``prepare_onnx_input``. Its
            ``attention_masks`` / ``phoneme_masks`` entries are fed to the
            model under the singular names ``attention_mask`` /
            ``phoneme_mask``.
        labels: Label for each score column.

    Returns:
        The predicted label of every row and the score of that prediction.
    """
    feed = {
        "input_ids": onnx_input["input_ids"],
        "token_type_ids": onnx_input["token_type_ids"],
        "attention_mask": onnx_input["attention_masks"],
        "phoneme_mask": onnx_input["phoneme_masks"],
        "char_ids": onnx_input["char_ids"],
        "position_ids": onnx_input["position_ids"],
    }
    probs = session.run([], feed)[0]

    best_columns = np.argmax(probs, axis=1).tolist()
    confidences = [row[col] for col, row in zip(best_columns, probs.tolist())]
    predictions = [labels[col] for col in best_columns]
    return predictions, confidences
65
+
66
+
67
class G2PWOnnxConverter:
    """Per-character pronunciation converter backed by a g2pW ONNX model.

    Characters listed as polyphonic are resolved by the ONNX model, characters
    listed as monophonic come from a fixed table, and everything else falls
    back to pypinyin.
    """

    def __init__(
        self,
        model_dir: str,
        model_source=None,
        style: str = "bopomofo",
        enable_non_tradional_chinese: bool = False,
    ):
        """Load the ONNX session, tokenizer and character tables.

        Args:
            model_dir: Directory holding ``g2pW.onnx``, ``config.py``,
                ``POLYPHONIC_CHARS.txt`` and ``MONOPHONIC_CHARS.txt``.
            model_source: Tokenizer location; when falsy, the ``model_source``
                value from the loaded config is used instead.
            style: ``"bopomofo"`` to return model labels unchanged or
                ``"pinyin"`` to convert them; any other value raises
                ``KeyError``.
            enable_non_tradional_chinese: When true, input sentences are
                first passed through OpenCC with the ``s2tw`` profile.
        """
        sess_options = onnxruntime.SessionOptions()
        sess_options.graph_optimization_level = (
            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        )
        sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
        # os.cpu_count() may return None; use 0 in that case.
        # NOTE(review): 0 is understood to mean "let ONNX Runtime choose" — confirm.
        cpu_count = os.cpu_count()
        sess_options.intra_op_num_threads = cpu_count - 1 if cpu_count else 0

        model_path = os.path.join(model_dir, "g2pW.onnx")
        try:
            self.session_g2pw = onnxruntime.InferenceSession(
                model_path,
                sess_options=sess_options,
                providers=["CUDAExecutionProvider"],
            )
        except Exception:
            # The CUDA provider could not be used: retry with the default
            # providers. (Not a bare except, so Ctrl-C still propagates.)
            self.session_g2pw = onnxruntime.InferenceSession(
                model_path, sess_options=sess_options
            )
        self.config = load_config(
            os.path.join(model_dir, "config.py"), use_default=True
        )

        self.model_source = (
            os.path.join(os.path.abspath(os.curdir), model_source)
            if model_source
            else os.path.join(os.path.abspath(os.curdir), self.config.model_source)
        )
        self.enable_opencc = enable_non_tradional_chinese

        self.tokenizer = (
            BertTokenizer.from_pretrained(self.model_source)
            if model_source
            else BertTokenizer.from_pretrained(self.config.model_source)
        )
        polyphonic_chars_path = os.path.join(model_dir, "POLYPHONIC_CHARS.txt")
        monophonic_chars_path = os.path.join(model_dir, "MONOPHONIC_CHARS.txt")

        self.polyphonic_chars = self._read_tsv_rows(polyphonic_chars_path)
        # Characters taken out of the model-resolved (polyphonic) set below.
        self.non_polyphonic = {
            "一",
            "不",
            "和",
            "咋",
            "嗲",
            "剖",
            "差",
            "攢",
            "倒",
            "難",
            "奔",
            "勁",
            "拗",
            "肖",
            "瘙",
            "誒",
            "泊",
            "听",
            "噢",
        }
        # Characters taken out of the fixed monophonic table below.
        self.non_monophonic = {"似", "攢"}
        self.monophonic_chars = self._read_tsv_rows(monophonic_chars_path)
        self.labels, self.char2phonemes = (
            get_char_phoneme_labels(polyphonic_chars=self.polyphonic_chars)
            if self.config.use_char_phoneme
            else get_phoneme_labels(polyphonic_chars=self.polyphonic_chars)
        )

        self.chars = sorted(list(self.char2phonemes.keys()))

        self.polyphonic_chars_new = set(self.chars)
        for char in self.non_polyphonic:
            if char in self.polyphonic_chars_new:
                self.polyphonic_chars_new.remove(char)

        self.monophonic_chars_dict = {
            char: phoneme for char, phoneme in self.monophonic_chars
        }
        for char in self.non_monophonic:
            if char in self.monophonic_chars_dict:
                self.monophonic_chars_dict.pop(char)

        self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"]

        with open(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "bopomofo_to_pinyin_wo_tune_dict.json",
            ),
            "r",
            encoding="utf-8",
        ) as fr:
            self.bopomofo_convert_dict = json.load(fr)
        self.style_convert_func = {
            "bopomofo": lambda x: x,
            "pinyin": self._convert_bopomofo_to_pinyin,
        }[style]

        with open(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "char_bopomofo_dict.json"
            ),
            "r",
            encoding="utf-8",
        ) as fr:
            self.char_bopomofo_dict = json.load(fr)

        if self.enable_opencc:
            self.cc = OpenCC("s2tw")

    @staticmethod
    def _read_tsv_rows(path: str) -> List[List[str]]:
        """Read a UTF-8 tab-separated file into one list of fields per line."""
        with open(path, encoding="utf-8") as fr:
            return [line.split("\t") for line in fr.read().strip().split("\n")]

    def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
        """Convert a tone-suffixed bopomofo syllable to tone-suffixed pinyin.

        The final character is the tone digit; the rest is looked up in
        ``bopomofo_convert_dict``. If the syllable is unknown, a warning is
        printed and ``None`` is returned.
        """
        tone = bopomofo[-1]
        assert tone in "12345"
        component = self.bopomofo_convert_dict.get(bopomofo[:-1])
        if component:
            return component + tone
        else:
            print(f'Warning: "{bopomofo}" cannot convert to pinyin')
            return None

    def __call__(self, sentences: List[str]) -> List[List[str]]:
        """Return, for each sentence, one pronunciation entry per character.

        A single string is accepted and treated as a one-sentence list.
        """
        if isinstance(sentences, str):
            sentences = [sentences]

        if self.enable_opencc:
            translated_sentences = []
            for sent in sentences:
                translated_sent = self.cc.convert(sent)
                # Results are indexed by character position, so the
                # conversion must not change the sentence length.
                assert len(translated_sent) == len(sent)
                translated_sentences.append(translated_sent)
            sentences = translated_sentences

        texts, query_ids, sent_ids, partial_results = self._prepare_data(
            sentences=sentences
        )
        if len(texts) == 0:
            # No sentence contains a polyphonic character.
            return partial_results

        onnx_input = prepare_onnx_input(
            tokenizer=self.tokenizer,
            labels=self.labels,
            char2phonemes=self.char2phonemes,
            chars=self.chars,
            texts=texts,
            query_ids=query_ids,
            use_mask=self.config.use_mask,
            window_size=None,
        )

        preds, confidences = predict(
            session=self.session_g2pw, onnx_input=onnx_input, labels=self.labels
        )
        if self.config.use_char_phoneme:
            # Labels are "<char> <phoneme>"; keep only the phoneme part.
            preds = [pred.split(" ")[1] for pred in preds]

        # Fill the slots left open for polyphonic characters.
        results = partial_results
        for sent_id, query_id, pred in zip(sent_ids, query_ids, preds):
            results[sent_id][query_id] = self.style_convert_func(pred)

        return results

    def _prepare_data(
        self, sentences: List[str]
    ) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
        """Split the work between the model, the fixed table and pypinyin.

        Returns:
            ``texts`` / ``query_ids`` / ``sent_ids`` — one entry per
            polyphonic character (its sentence, its position in it, and the
            sentence index) — and ``partial_results``, the per-sentence
            result lists in which polyphonic positions are still ``None``.
        """
        texts, query_ids, sent_ids, partial_results = [], [], [], []
        for sent_id, sent in enumerate(sentences):
            # pypinyin works better on Simplified than on Traditional Chinese.
            sent_s = tranditional_to_simplified(sent)
            pypinyin_result = pinyin(
                sent_s, neutral_tone_with_five=True, style=Style.TONE3
            )
            partial_result = [None] * len(sent)
            for i, char in enumerate(sent):
                if char in self.polyphonic_chars_new:
                    texts.append(sent)
                    query_ids.append(i)
                    sent_ids.append(sent_id)
                elif char in self.monophonic_chars_dict:
                    partial_result[i] = self.style_convert_func(
                        self.monophonic_chars_dict[char]
                    )
                else:
                    # Every remaining character, whether or not it is in
                    # char_bopomofo_dict, takes the pypinyin reading.
                    partial_result[i] = pypinyin_result[i][0]

            partial_results.append(partial_result)
        return texts, query_ids, sent_ids, partial_results
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/utils.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Credits
16
+ This code is modified from https://github.com/GitYCC/g2pW
17
+ """
18
+ import os
19
+ import re
20
+
21
+
22
def wordize_and_map(text: str):
    """Split ``text`` into words and build index maps in both directions.

    A run of ASCII letters/digits forms one word, a run of spaces forms no
    word, and every other character is a word on its own.

    Returns:
        A 3-tuple of

        * the list of words,
        * a list with one entry per character of ``text`` holding the index
          of the word it belongs to (``None`` for spaces),
        * a list with one ``(start, end)`` character span (end exclusive) per
          word.
    """
    space_run = re.compile(r" +")
    alnum_run = re.compile(r"[a-zA-Z0-9]+")

    words = []
    index_map_from_text_to_word = []
    index_map_from_word_to_text = []

    # Walk the text with a cursor rather than re-slicing the remainder on
    # every step, which would copy the tail each time.
    pos = 0
    text_len = len(text)
    while pos < text_len:
        match_space = space_run.match(text, pos)
        if match_space:
            index_map_from_text_to_word += [None] * (match_space.end() - pos)
            pos = match_space.end()
            continue

        match_en = alnum_run.match(text, pos)
        # An alphanumeric run is one word; anything else is a one-char word.
        word_end = match_en.end() if match_en else pos + 1

        index_map_from_word_to_text.append((pos, word_end))
        index_map_from_text_to_word += [len(words)] * (word_end - pos)
        words.append(text[pos:word_end])
        pos = word_end
    return words, index_map_from_text_to_word, index_map_from_word_to_text
56
+
57
+
58
def tokenize_and_map(tokenizer, text: str):
    """Tokenize ``text`` word by word and map characters to tokens and back.

    Each word from ``wordize_and_map`` is passed to ``tokenizer.tokenize``. A
    word that yields no tokens, or only ``[UNK]``, becomes a single ``[UNK]``
    token covering the whole word. Otherwise each piece covers as many
    characters as its length without a leading ``##``.

    Returns:
        A 3-tuple of

        * the token list,
        * a list with one entry per character of ``text`` holding the index
          of its token (``None`` for spaces),
        * a list with one ``(start, end)`` character span per token.
    """
    words, text2word, word2text = wordize_and_map(text=text)

    tokens = []
    token_spans = []
    for word, (word_start, word_end) in zip(words, word2text):
        pieces = tokenizer.tokenize(word)

        if not pieces or pieces == ["[UNK]"]:
            token_spans.append((word_start, word_end))
            tokens.append("[UNK]")
            continue

        cursor = word_start
        for piece in pieces:
            piece_end = cursor + len(re.sub(r"^##", "", piece))
            token_spans.append((cursor, piece_end))
            tokens.append(piece)
            cursor = piece_end

    # The char->word list is reused and overwritten in place with token
    # indices; entries for spaces keep their None.
    text_to_token = text2word
    for token_idx, (span_start, span_end) in enumerate(token_spans):
        for char_pos in range(span_start, span_end):
            text_to_token[char_pos] = token_idx

    return tokens, text_to_token, token_spans
85
+
86
+
87
+ def _load_config(config_path: os.PathLike):
88
+ import importlib.util
89
+
90
+ spec = importlib.util.spec_from_file_location("__init__", config_path)
91
+ config = importlib.util.module_from_spec(spec)
92
+ spec.loader.exec_module(config)
93
+ return config
94
+
95
+
96
# Fallback values that load_config() applies to attributes (and, for dict
# values, to keys) missing from a loaded config module.
default_config_dict = {
    "manual_seed": 1313,
    "model_source": "bert-base-chinese",
    "window_size": 32,
    "num_workers": 2,
    "use_mask": True,
    "use_char_phoneme": False,
    "use_conditional": True,
    "param_conditional": {
        "affect_location": "softmax",
        "bias": True,
        "char-linear": True,
        "pos-linear": False,
        "char+pos-second": True,
        "char+pos-second_lowrank": False,
        "lowrank_size": 0,
        "char+pos-second_fm": False,
        "fm_size": 0,
        "fix_mode": None,
        "count_json": "train.count.json",
    },
    "lr": 5e-5,
    "val_interval": 200,
    "num_iter": 10000,
    "use_focal": False,
    "param_focal": {"alpha": 0.0, "gamma": 0.7},
    "use_pos": True,
    # This key used to be "param_pos " (trailing space), so these defaults
    # were stored under an attribute name that ``config.param_pos`` never
    # reads.
    "param_pos": {
        "weight": 0.1,
        "pos_joint_training": True,
        "train_pos_path": "train.pos",
        "valid_pos_path": "dev.pos",
        "test_pos_path": "test.pos",
    },
}
131
+
132
+
133
def load_config(config_path: os.PathLike, use_default: bool = False):
    """Load a config module, optionally filling gaps from the defaults.

    Args:
        config_path: Path of the Python config file to execute.
        use_default: When true, every attribute missing from the loaded
            module is set from ``default_config_dict``. For defaults that
            are dicts and already present on the module, only the missing
            keys are added.

    Returns:
        The loaded config module.
    """
    config = _load_config(config_path)
    if not use_default:
        return config

    for name, default in default_config_dict.items():
        if not hasattr(config, name):
            setattr(config, name, default)
            continue
        if isinstance(default, dict):
            existing = getattr(config, name)
            for key, fallback in default.items():
                if key not in existing:
                    existing[key] = fallback
    return config
bert_vits2/get_emo.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import Wav2Vec2Processor
6
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
7
+ Wav2Vec2Model,
8
+ Wav2Vec2PreTrainedModel,
9
+ )
10
+
11
+ from contants import config
12
+
13
+
14
class RegressionHead(nn.Module):
    r"""Output head: dropout, linear, tanh, dropout, then a final projection."""

    def __init__(self, config):
        """Create the layers from ``config.hidden_size``,
        ``config.final_dropout`` and ``config.num_labels``."""
        super().__init__()

        hidden_size = config.hidden_size
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Map ``features`` to ``config.num_labels`` outputs; extra keyword
        arguments are accepted and ignored."""
        hidden = self.dense(self.dropout(features))
        activated = torch.tanh(hidden)
        return self.out_proj(self.dropout(activated))
33
+
34
+
35
class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Wav2Vec2 encoder followed by a ``RegressionHead`` on the pooled output."""

    def __init__(self, config):
        """Build the encoder and the head from ``config``, then initialise
        the weights."""
        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
        self,
        input_values,
    ):
        """Return ``(pooled, logits)``.

        ``pooled`` is the encoder's first output averaged over dimension 1,
        and ``logits`` is the head applied to ``pooled``.
        """
        encoder_outputs = self.wav2vec2(input_values)
        pooled = torch.mean(encoder_outputs[0], dim=1)
        return pooled, self.classifier(pooled)
56
+
57
+
58
def process_func(
    x: np.ndarray,
    sampling_rate: int,
    model: EmotionModel,
    processor: Wav2Vec2Processor,
    device: str,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Run ``model`` on a raw audio signal and return a NumPy array.

    Args:
        x: Raw audio signal handed to ``processor``.
        sampling_rate: Sampling rate passed on to ``processor``.
        model: Model returning a ``(pooled, logits)`` pair.
        processor: Callable whose result holds ``"input_values"``.
        device: Device the model and the input are moved to.
        embeddings: Return the first model output (pooled hidden states)
            when true, otherwise the second (logits).
    """
    model = model.to(device)
    processed = processor(x, sampling_rate=sampling_rate)
    first_signal = processed["input_values"][0]
    # Re-add the leading dimension dropped by taking the first item.
    batch = torch.from_numpy(first_signal).unsqueeze(0).to(device)

    # Inference only: no gradients needed.
    output_index = 0 if embeddings else 1
    with torch.no_grad():
        selected = model(batch)[output_index]

    return selected.detach().cpu().numpy()
80
+
81
+
82
def get_emo(audio, emotion_model, processor):
    """Return the emotion embedding of an audio file.

    The audio is loaded at 16 kHz and passed through ``process_func`` with
    ``embeddings=True`` on the device taken from ``config.system.device``;
    the leading dimension of the result is squeezed away.

    Args:
        audio: Audio source accepted by ``librosa.load``.
        emotion_model: Model handed to ``process_func``.
        processor: Processor handed to ``process_func``.
    """
    # ``sr`` is passed by keyword: current librosa releases reject it as a
    # positional argument.
    wav, sr = librosa.load(audio, sr=16000)
    device = config.system.device
    return process_func(
        # ``np.float`` was only an alias of the builtin float (float64) and
        # no longer exists in recent NumPy, so name the dtype explicitly.
        np.expand_dims(wav, 0).astype(np.float64),
        sr,
        emotion_model,
        processor,
        device,
        embeddings=True,
    ).squeeze(0)
bert_vits2/models.py ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from bert_vits2 import commons
7
+ from bert_vits2 import modules
8
+ from bert_vits2 import attentions
9
+
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from vector_quantize_pytorch import VectorQuantize
13
+
14
+ from bert_vits2.commons import init_weights, get_padding
15
+ from bert_vits2.text import num_languages
16
+
17
+
18
class DurationDiscriminator(nn.Module):  # vits2
    """Scores duration sequences given text-side features.

    ``forward`` encodes the (detached) input features once, then scores each
    of two duration tensors against that encoding.
    NOTE(review): tensors appear to be laid out as (batch, channels, time),
    as the Conv1d layers expect — confirm against the caller.
    """

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
        """Create the layers.

        Args:
            in_channels: Channel count of the input features.
            filter_channels: Channel count used inside the network.
            kernel_size: Kernel size of the non-pointwise convolutions.
            p_dropout: Dropout probability.
            gin_channels: Channel count of the optional conditioning input;
                0 means no conditioning layer is created.
        """
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        half_kernel = kernel_size // 2

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=half_kernel)
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=half_kernel)
        self.norm_2 = modules.LayerNorm(filter_channels)
        # Lifts the single-channel duration to filter_channels channels.
        self.dur_proj = nn.Conv1d(1, filter_channels, 1)

        # Takes the features concatenated with the projected duration.
        self.pre_out_conv_1 = nn.Conv1d(2 * filter_channels, filter_channels, kernel_size, padding=half_kernel)
        self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
        self.pre_out_conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=half_kernel)
        self.pre_out_norm_2 = modules.LayerNorm(filter_channels)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

        self.output_layer = nn.Sequential(
            nn.Linear(filter_channels, 1),
            nn.Sigmoid()
        )

    def _conv_block(self, x, x_mask, conv, norm):
        """Masked convolution followed by ReLU, normalisation and dropout."""
        return self.drop(norm(torch.relu(conv(x * x_mask))))

    def forward_probability(self, x, x_mask, dur, g=None):
        """Score one duration tensor against the encoded features ``x``.

        ``g`` is accepted for signature compatibility and is not used.
        """
        joined = torch.cat([x, self.dur_proj(dur)], dim=1)
        joined = self._conv_block(joined, x_mask, self.pre_out_conv_1, self.pre_out_norm_1)
        joined = self._conv_block(joined, x_mask, self.pre_out_conv_2, self.pre_out_norm_2)
        joined = joined * x_mask
        # Put channels last so the linear output layer acts on every step.
        return self.output_layer(joined.transpose(1, 2))

    def forward(self, x, x_mask, dur_r, dur_hat, g=None):
        """Return ``[score(dur_r), score(dur_hat)]``.

        ``x`` and ``g`` are detached first, so no gradient flows back into
        whatever produced them.
        """
        x = torch.detach(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self._conv_block(x, x_mask, self.conv_1, self.norm_1)
        x = self._conv_block(x, x_mask, self.conv_2, self.norm_2)

        return [self.forward_probability(x, x_mask, dur, g) for dur in (dur_r, dur_hat)]
86
+
87
+
88
class Block(nn.Module):
    """Pre-norm residual block: ``x + MLP(LayerNorm(x))``."""

    def __init__(self, in_dim, hidden_dim) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(in_dim)
        self.mlp = MLP(in_dim, hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the normalised MLP branch and add it back onto the input."""
        residual = self.mlp(self.norm(x))
        return x + residual
97
+
98
+
99
class MLP(nn.Module):
    """Bias-free gated feed-forward layer.

    Computes ``c_proj(silu(c_fc1(x)) * c_fc2(x))``: both ``c_fc1`` and
    ``c_fc2`` map ``in_dim -> hidden_dim`` and ``c_proj`` maps back to
    ``in_dim``.
    """

    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)

    def forward(self, x: torch.Tensor):
        """Return the gated projection of ``x`` (last dimension must be ``in_dim``)."""
        gate = F.silu(self.c_fc1(x))
        value = self.c_fc2(x)
        return self.c_proj(gate * value)
110
+
111
+
112
class TransformerCouplingBlock(nn.Module):
    """Stack of ``n_flows`` transformer coupling layers, each followed by a ``Flip``.

    With ``share_parameter=True`` a single ``attentions.FFT`` instance is
    created and handed to every coupling layer; otherwise ``None`` is passed
    and each layer is left to build its own.
    """

    def __init__(self,
                 channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 n_flows=4,
                 gin_channels=0,
                 share_parameter=False
                 ):

        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()

        if share_parameter:
            self.wn = attentions.FFT(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout,
                                     isflow=True, gin_channels=self.gin_channels)
        else:
            self.wn = None

        for _ in range(n_flows):
            coupling = modules.TransformerCouplingLayer(channels, hidden_channels, kernel_size, n_layers, n_heads,
                                                        p_dropout, filter_channels, mean_only=True,
                                                        wn_sharing_parameter=self.wn,
                                                        gin_channels=self.gin_channels)
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flows forward, or in reversed order when ``reverse`` is set."""
        if reverse:
            # Inverse pass: flows return only the transformed tensor.
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        # Forward pass: each flow returns (tensor, logdet); the logdet is discarded.
        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
154
+
155
+
156
class StochasticDurationPredictor(nn.Module):
    """Flow-based duration predictor.

    ``forward(..., reverse=False)`` requires ``w`` and returns ``nll + logq``
    summed over dims 1 and 2 (one value per batch item).
    ``forward(..., reverse=True)`` draws noise scaled by ``noise_scale``, runs
    the flows in reverse and returns the first channel of the result as
    ``logw``.
    """

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
        super().__init__()
        # The ``filter_channels`` argument is ignored: it is overwritten with ``in_channels``.
        filter_channels = in_channels  # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        # Main flow stack: ElementwiseAffine, then n_flows x (ConvFlow, Flip).
        self.log_flow = modules.Log()
        self.flows = nn.ModuleList()
        self.flows.append(modules.ElementwiseAffine(2))
        for i in range(n_flows):
            self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.flows.append(modules.Flip())

        # Posterior stack used only when reverse=False; always 4 ConvFlow/Flip pairs,
        # independent of ``n_flows``.
        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        self.post_flows = nn.ModuleList()
        self.post_flows.append(modules.ElementwiseAffine(2))
        for i in range(4):
            self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.post_flows.append(modules.Flip())

        # Feature pre-processing applied to ``x`` in both directions.
        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        # ``cond`` only exists when gin_channels != 0; passing ``g`` otherwise fails in forward.
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        # Both inputs are detached, so no gradient flows back into whatever produced them.
        x = torch.detach(x)
        x = self.pre(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if not reverse:
            flows = self.flows
            assert w is not None

            # Posterior branch: transform noise e_q conditioned on x + h_w, accumulating log-dets.
            logdet_tot_q = 0
            h_w = self.post_pre(w)
            h_w = self.post_convs(h_w, x_mask)
            h_w = self.post_proj(h_w) * x_mask
            e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
            z_q = e_q
            for flow in self.post_flows:
                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
                logdet_tot_q += logdet_q
            z_u, z1 = torch.split(z_q, [1, 1], 1)
            u = torch.sigmoid(z_u) * x_mask
            z0 = (w - u) * x_mask
            # log|d sigmoid(z_u)/d z_u| = logsigmoid(z_u) + logsigmoid(-z_u)
            logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
            logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q

            # Main branch: pass [log-flowed z0, z1] through the flows and score under a unit Gaussian.
            logdet_tot = 0
            z0, logdet = self.log_flow(z0, x_mask)
            logdet_tot += logdet
            z = torch.cat([z0, z1], 1)
            for flow in flows:
                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
                logdet_tot = logdet_tot + logdet
            nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
            return nll + logq  # [b]
        else:
            flows = list(reversed(self.flows))
            # In reversed order the last two entries are self.flows[1] (the first ConvFlow) and
            # self.flows[0] (ElementwiseAffine); this drops the ConvFlow and keeps the affine last.
            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
            for flow in flows:
                z = flow(z, x_mask, g=x, reverse=reverse)
            # Only the first of the two channels is returned.
            z0, z1 = torch.split(z, [1, 1], 1)
            logw = z0
            return logw
235
+
236
+
237
class DurationPredictor(nn.Module):
    """Deterministic duration predictor: two conv/relu/norm/dropout stages and a 1x1 projection.

    The input (and the optional conditioning ``g``) is detached before use, and
    the single-channel output is masked with ``x_mask``.
    """

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        same_pad = kernel_size // 2

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=same_pad)
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=same_pad)
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        # Conditioning projection is only created when a conditioning size is given.
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        """Return the masked one-channel prediction for ``x``."""
        h = torch.detach(x)
        if g is not None:
            g = torch.detach(g)
            h = h + self.cond(g)

        for conv, norm in ((self.conv_1, self.norm_1), (self.conv_2, self.norm_2)):
            h = conv(h * x_mask)
            h = torch.relu(h)
            h = norm(h)
            h = self.drop(h)

        h = self.proj(h * x_mask)
        return h * x_mask
272
+
273
+
274
class TextEncoder(nn.Module):
    """Text encoder combining symbol, tone, language, BERT and optional emotion features.

    ``forward`` returns ``(x, m, logs, x_mask)``, where ``m`` and ``logs`` are
    the two halves of a ``2 * out_channels`` projection of the encoder output.

    ``emotion_embedding`` selects the emotion path: ``1`` builds
    ``emo_proj`` / ``emo_quantizer`` / ``emo_q_proj`` (1024-dim input), ``2``
    builds ``in_feature_net`` / ``emo_vq`` / ``out_feature_net`` (512-dim
    input); any other value disables emotion conditioning.
    """

    def __init__(self,
                 n_vocab,
                 out_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 n_speakers,
                 gin_channels=0,
                 symbols=None,
                 ja_bert_dim=1024,
                 num_tones=None,
                 emotion_embedding=1,
                 zh_bert_extra=False,
                 ):
        super().__init__()
        # ``n_vocab`` is stored but the embedding size comes from ``len(symbols)``;
        # ``n_speakers`` is accepted but not used in this class.
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels
        # ``symbols`` and ``num_tones`` default to None but are required in practice:
        # ``len(None)`` / ``nn.Embedding(None, ...)`` would fail here.
        self.emb = nn.Embedding(len(symbols), hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5)
        self.tone_emb = nn.Embedding(num_tones, hidden_channels)
        nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels ** -0.5)
        self.language_emb = nn.Embedding(num_languages, hidden_channels)
        nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels ** -0.5)
        self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
        self.zh_bert_extra = zh_bert_extra
        # With zh_bert_extra the Chinese BERT features are first reduced from 2048 to 1024 channels.
        if self.zh_bert_extra:
            self.bert_pre_proj = nn.Conv1d(2048, 1024, 1)
        self.ja_bert_proj = nn.Conv1d(ja_bert_dim, hidden_channels, 1)
        self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
        self.emotion_embedding = emotion_embedding

        if self.emotion_embedding == 1:
            self.emo_proj = nn.Linear(1024, 1024)
            self.emo_quantizer = VectorQuantize(
                dim=1024,
                codebook_size=10,
                decay=0.8,
                commitment_weight=1.0,
                learnable_codebook=True,
                ema_update=False,
            )
            self.emo_q_proj = nn.Linear(1024, hidden_channels)
        elif self.emotion_embedding == 2:
            self.in_feature_net = nn.Sequential(
                # input is assumed to be an already normalized embedding
                nn.Linear(512, 1028, bias=False),
                nn.GELU(),
                nn.LayerNorm(1028),
                *[Block(1028, 512) for _ in range(1)],
                nn.Linear(1028, 512, bias=False),
                # normalize before passing to VQ?
                # nn.GELU(),
                # nn.LayerNorm(512),
            )
            self.emo_vq = VectorQuantize(
                dim=512,
                codebook_size=64,
                codebook_dim=32,
                commitment_weight=0.1,
                decay=0.85,
                heads=32,
                kmeans_iters=20,
                separate_codebook_per_head=True,
                stochastic_sample_codes=True,
                threshold_ema_dead_code=2,
            )
            self.out_feature_net = nn.Linear(512, hidden_channels)

        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            gin_channels=self.gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, tone, language, zh_bert, ja_bert, en_bert, emo=None, sid=None, g=None):
        # ``sid`` is accepted but not used in this method.
        x = self.emb(x) + self.tone_emb(tone) + self.language_emb(language)

        if self.zh_bert_extra:
            zh_bert = self.bert_pre_proj(zh_bert)
        # The Conv1d projections are channel-first; transpose to match the [b, t, h] embeddings.
        x += self.bert_proj(zh_bert).transpose(1, 2)
        x += self.ja_bert_proj(ja_bert).transpose(1, 2)
        x += self.en_bert_proj(en_bert).transpose(1, 2)

        # NOTE(review): x is scaled by sqrt(hidden_channels) here AND again after the
        # emotion branch below, so the text/BERT sum ends up multiplied by
        # hidden_channels overall. Confirm this double scaling matches the checkpoints
        # this encoder is meant to load.
        x *= math.sqrt(self.hidden_channels)  # [b, t, h]
        if self.emotion_embedding == 1:
            # ``emo`` must not be None on this path (``emo.size`` is called unconditionally).
            # emo = emo.to(zh_bert_emb.device)
            if emo.size(-1) == 1024:
                # A 1024-wide ``emo`` is treated as a raw embedding: project, then quantize
                # each batch item separately.
                emo_emb = self.emo_proj(emo.unsqueeze(1))
                emo_emb_ = []
                for i in range(emo_emb.size(0)):
                    temp_emo_emb, _, _ = self.emo_quantizer(
                        emo_emb[i].unsqueeze(0).to(emo.device)
                    )
                    emo_emb_.append(temp_emo_emb)
                emo_emb = torch.cat(emo_emb_, dim=0).to(emo_emb.device)
            else:
                # Otherwise ``emo`` is treated as codebook indices and looked up directly.
                emo_emb = (
                    self.emo_quantizer.get_output_from_indices(emo.to(torch.long))
                    .unsqueeze(0)
                    .to(emo.device)
                )

            x += self.emo_q_proj(emo_emb)
        elif self.emotion_embedding == 2:
            emo_emb = self.in_feature_net(emo)
            emo_emb, _, _ = self.emo_vq(emo_emb.unsqueeze(1))
            emo_emb = self.out_feature_net(emo_emb)
            x += emo_emb

        x *= math.sqrt(self.hidden_channels)  # [b, t, h]
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)

        x = self.encoder(x * x_mask, x_mask, g=g)
        stats = self.proj(x) * x_mask

        # First ``out_channels`` channels are ``m``, the remaining ones are ``logs``.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask
407
+
408
+
409
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` residual coupling layers, each followed by a ``Flip``."""

    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 n_flows=4,
                 gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate,
                                                     n_layers, gin_channels=gin_channels, mean_only=True)
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flows forward, or in reversed order when ``reverse`` is set."""
        if reverse:
            # Inverse pass: flows return only the transformed tensor.
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        # Forward pass: each flow returns (tensor, logdet); the logdet is discarded.
        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
442
+
443
+
444
class PosteriorEncoder(nn.Module):
    """Encode an input sequence into ``(z, m, logs, x_mask)``.

    ``m`` and ``logs`` are the two halves of a ``2 * out_channels`` projection,
    and ``z = (m + noise * exp(logs)) * x_mask`` with standard-normal noise.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return the sampled latent, its mean, its log-scale and the length mask."""
        # Length mask with a singleton channel axis, cast to the input dtype.
        length_mask = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(length_mask, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
474
+
475
+
476
class Generator(torch.nn.Module):
    """Upsampling decoder: a pre-conv, transposed-conv stages with averaged residual blocks, a post-conv and tanh.

    After each upsampling stage the outputs of ``num_kernels`` residual blocks
    are averaged. The output has a single channel.
    """

    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
                 upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)

        # The string '1' selects ResBlock1; anything else selects ResBlock2.
        if resblock == '1':
            resblock_cls = modules.ResBlock1
        else:
            resblock_cls = modules.ResBlock2

        # Each stage halves the channel count while upsampling by ``rate``.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            in_ch = upsample_initial_channel // (2 ** stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            up = ConvTranspose1d(in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2)
            self.ups.append(weight_norm(up))

        # Residual blocks are stored flat, grouped per stage.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilation in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(resblock_cls(ch, kernel, dilation))

        # ``ch`` is the channel count of the last stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x`` (optionally conditioned on ``g``) into a tanh-bounded single-channel signal."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)

            base = stage * self.num_kernels
            xs = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[base + offset](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels

        # The final activation uses F.leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsampling layers and residual blocks."""
        print('Removing weight norm...')
        for up in self.ups:
            # Resolves to the module-level torch helper, not this method.
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
530
+
531
+
532
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the time axis into ``(t // period, period)`` and applies 2-D convs.

    ``forward`` returns the flattened final score and the list of
    intermediate feature maps.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm

        # Same comparison as the original selection (weight_norm only when the flag == False).
        if use_spectral_norm == False:  # noqa: E712
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        pad = (get_padding(kernel_size, 1), 0)
        widths = [1, 32, 128, 512, 1024]
        layers = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad))
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ]
        # The last stacked conv keeps 1024 channels and uses stride 1.
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(scores, feature_maps)`` for a ``[b, c, t]`` input."""
        fmap = []

        # 1d to 2d: reflect-pad the tail so the length is a multiple of the period.
        b, c, t = x.shape
        leftover = t % self.period
        if leftover != 0:
            n_pad = self.period - leftover
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
567
+
568
+
569
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: a stack of (grouped) 1-D convolutions over the raw signal.

    ``forward`` returns the flattened final score and the list of
    intermediate feature maps.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()

        # Same comparison as the original selection (weight_norm only when the flag == False).
        if use_spectral_norm == False:  # noqa: E712
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, kernel, stride, groups, padding)
        conv_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
            for c_in, c_out, kernel, stride, groups, pad in conv_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(scores, feature_maps)`` for the input signal."""
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
595
+
596
+
597
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` followed by ``DiscriminatorP`` instances for periods 2, 3, 5, 7 and 11."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        periods = [2, 3, 5, 7, 11]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            members.append(DiscriminatorP(period, use_spectral_norm=use_spectral_norm))
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and ``y_hat``.

        Returns four parallel lists: scores for ``y``, scores for ``y_hat``,
        feature maps for ``y`` and feature maps for ``y_hat``.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_r, maps_r = disc(y)
            score_g, maps_g = disc(y_hat)
            y_d_rs.append(score_r)
            y_d_gs.append(score_g)
            fmap_rs.append(maps_r)
            fmap_gs.append(maps_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
620
+
621
+
622
class ReferenceEncoder(nn.Module):
    '''
    inputs --- [N, Ty/r, n_mels*r]  mels
    outputs --- [N, gin_channels]

    Six stride-2 3x3 convolutions, a GRU over the remaining time steps, and a
    linear projection of the GRU's final hidden state.
    '''

    def __init__(self, spec_channels, gin_channels=0):

        super().__init__()
        self.spec_channels = spec_channels
        ref_enc_filters = [32, 32, 64, 64, 128, 128]
        n_convs = len(ref_enc_filters)
        channel_chain = [1] + ref_enc_filters

        conv_layers = []
        for c_in, c_out in zip(channel_chain[:-1], channel_chain[1:]):
            conv = nn.Conv2d(in_channels=c_in,
                             out_channels=c_out,
                             kernel_size=(3, 3),
                             stride=(2, 2),
                             padding=(1, 1))
            conv_layers.append(weight_norm(conv))
        self.convs = nn.ModuleList(conv_layers)

        # Size of the frequency axis left after all the strided convolutions.
        freq_bins = self.calculate_channels(spec_channels, 3, 2, 1, n_convs)
        self.gru = nn.GRU(input_size=ref_enc_filters[-1] * freq_bins,
                          hidden_size=256 // 2,
                          batch_first=True)
        self.proj = nn.Linear(128, gin_channels)

    def forward(self, inputs, mask=None):
        """Encode ``inputs`` to one vector per batch item; ``mask`` is accepted but unused."""
        batch = inputs.size(0)
        feats = inputs.view(batch, 1, -1, self.spec_channels)  # [N, 1, Ty, n_freqs]
        for conv in self.convs:
            feats = F.relu(conv(feats))  # ends as [N, 128, Ty//2^K, n_mels//2^K]

        feats = feats.transpose(1, 2)  # [N, Ty//2^K, 128, n_mels//2^K]
        steps = feats.size(1)
        batch = feats.size(0)
        feats = feats.contiguous().view(batch, steps, -1)  # [N, Ty//2^K, 128*n_mels//2^K]

        self.gru.flatten_parameters()
        _, last_hidden = self.gru(feats)  # last_hidden --- [1, N, 128]

        return self.proj(last_hidden.squeeze(0))

    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
        """Return the length left after ``n_convs`` convolutions with the given geometry."""
        remaining = L
        for _ in range(n_convs):
            remaining = (remaining - kernel_size + 2 * pad) // stride + 1
        return remaining
671
+
672
+
673
class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training

    Wires together the text encoder (``enc_p``), posterior encoder (``enc_q``),
    flow (``flow``), both duration predictors (``sdp`` and ``dp``) and the
    waveform decoder (``dec``). Only the inference path (``infer``) is
    implemented in this class.

    Speaker conditioning comes from an embedding table when ``n_speakers > 0``
    and from a ``ReferenceEncoder`` over ``y`` otherwise.
    """

    def __init__(self,
                 n_vocab,
                 spec_channels,
                 segment_size,
                 inter_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 resblock,
                 resblock_kernel_sizes,
                 resblock_dilation_sizes,
                 upsample_rates,
                 upsample_initial_channel,
                 upsample_kernel_sizes,
                 n_speakers=256,
                 gin_channels=256,
                 use_sdp=True,
                 n_flow_layer=4,
                 n_layers_trans_flow=6,
                 flow_share_parameter=False,
                 use_transformer_flow=True,
                 symbols=None,
                 ja_bert_dim=1024,
                 num_tones=None,
                 emotion_embedding=False,
                 zh_bert_extra=False,
                 **kwargs):

        super().__init__()
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.n_layers_trans_flow = n_layers_trans_flow
        self.use_spk_conditioned_encoder = kwargs.get("use_spk_conditioned_encoder", True)
        self.use_sdp = use_sdp
        self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
        self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
        self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
        self.current_mas_noise_scale = self.mas_noise_scale_initial

        # The text encoder is speaker-conditioned only when requested AND a conditioning
        # size exists. Fall back to 0 otherwise: previously the attribute was left unset
        # in that case and the TextEncoder construction below raised AttributeError.
        if self.use_spk_conditioned_encoder and gin_channels > 0:
            self.enc_gin_channels = gin_channels
        else:
            self.enc_gin_channels = 0

        self.emotion_embedding = emotion_embedding
        self.enc_p = TextEncoder(n_vocab,
                                 inter_channels,
                                 hidden_channels,
                                 filter_channels,
                                 n_heads,
                                 n_layers,
                                 kernel_size,
                                 p_dropout,
                                 self.n_speakers,
                                 gin_channels=self.enc_gin_channels,
                                 symbols=symbols,
                                 ja_bert_dim=ja_bert_dim,
                                 num_tones=num_tones,
                                 emotion_embedding=self.emotion_embedding,
                                 zh_bert_extra=zh_bert_extra,
                                 )
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
                             upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
                                      gin_channels=gin_channels)
        if use_transformer_flow:
            self.flow = TransformerCouplingBlock(inter_channels, hidden_channels, filter_channels, n_heads,
                                                 n_layers_trans_flow, 5, p_dropout, n_flow_layer,
                                                 gin_channels=gin_channels, share_parameter=flow_share_parameter)
        else:
            self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,
                                              gin_channels=gin_channels)
        # Both duration predictors are always built; ``sdp_ratio`` blends them in ``infer``.
        self.sdp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
        self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)

        if self.n_speakers > 0:
            self.emb_g = nn.Embedding(self.n_speakers, gin_channels)
        else:
            self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)

    def infer(self, x, x_lengths, sid, tone, language, zh_bert, ja_bert, en_bert, noise_scale=.667, length_scale=1,
              noise_scale_w=0.8, max_len=None, sdp_ratio=0, y=None, emo=None):
        """Synthesize audio from encoded text.

        Args:
            x, x_lengths, tone, language, zh_bert, ja_bert, en_bert, emo:
                forwarded to the text encoder.
            sid: speaker ids, used when ``n_speakers > 0``.
            noise_scale: scale of the noise added to the prior mean.
            length_scale: multiplier applied to the predicted durations.
            noise_scale_w: noise scale passed to the stochastic duration predictor.
            max_len: optional cap on the number of latent frames decoded.
            sdp_ratio: blend weight of the stochastic predictor (``1 - sdp_ratio``
                goes to the deterministic one).
            y: reference input for ``ref_enc``; required when ``n_speakers <= 0``.

        Returns:
            ``(o, attn, y_mask, (z, z_p, m_p, logs_p))`` where ``o`` is the decoder output.
        """
        # x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, tone, language, bert)
        # g = self.gst(y)
        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, tone, language, zh_bert, ja_bert, en_bert, emo, sid, g=g)

        # Blend the stochastic and deterministic log-duration predictions.
        logw = (self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * (sdp_ratio)
                + self.dp(x, x_mask, g=g) * (1 - sdp_ratio))
        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        # At least one output frame per batch item.
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        # Expand the text-level statistics to frame level along the alignment path.
        # [b, t', t], [b, t, d] -> [b, d, t']
        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
        # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)
        o = self.dec((z * y_mask)[:, :, :max_len], g=g)
        return o, attn, y_mask, (z, z_p, m_p, logs_p)
bert_vits2/models_ja_extra.py ADDED
@@ -0,0 +1,1016 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from bert_vits2 import commons
7
+ from bert_vits2 import modules
8
+ from bert_vits2 import attentions
9
+
10
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+
13
+ from bert_vits2.commons import init_weights, get_padding
14
+ from bert_vits2.text import symbols, num_tones, num_languages
15
+
16
+ from vector_quantize_pytorch import VectorQuantize
17
+
18
+
19
class DurationDiscriminator(nn.Module):  # vits2
    """Discriminator that scores a duration track with a bidirectional LSTM.

    ``forward`` runs the (detached) features through two conv/norm stages and
    then scores ``dur_r`` and ``dur_hat`` separately, returning both sigmoid
    outputs in a list.
    """

    def __init__(
        self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
    ):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        same_pad = kernel_size // 2

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=same_pad)
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=same_pad)
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.dur_proj = nn.Conv1d(1, filter_channels, 1)

        # Input is features concatenated with projected durations; output is 2 * filter_channels
        # wide because the LSTM is bidirectional.
        self.LSTM = nn.LSTM(
            2 * filter_channels, filter_channels, batch_first=True, bidirectional=True
        )

        # Conditioning projection is only created when a conditioning size is given.
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

        self.output_layer = nn.Sequential(
            nn.Linear(2 * filter_channels, 1), nn.Sigmoid()
        )

    def forward_probability(self, x, dur):
        """Score one duration tensor ``dur`` against features ``x``."""
        joint = torch.cat([x, self.dur_proj(dur)], dim=1)
        joint = joint.transpose(1, 2)  # the LSTM is batch_first, so time goes on axis 1
        lstm_out, _ = self.LSTM(joint)
        return self.output_layer(lstm_out)

    def forward(self, x, x_mask, dur_r, dur_hat, g=None):
        """Return ``[prob(dur_r), prob(dur_hat)]``; ``x`` and ``g`` are detached first."""
        h = torch.detach(x)
        if g is not None:
            g = torch.detach(g)
            h = h + self.cond(g)
        for conv, norm in ((self.conv_1, self.norm_1), (self.conv_2, self.norm_2)):
            h = conv(h * x_mask)
            h = torch.relu(h)
            h = norm(h)
            h = self.drop(h)

        return [self.forward_probability(h, dur) for dur in (dur_r, dur_hat)]
81
+
82
+
83
class TransformerCouplingBlock(nn.Module):
    """Stack of ``n_flows`` transformer coupling layers, each followed by a flip.

    When ``share_parameter`` is true a single ``attentions.FFT`` is built and
    handed to every coupling layer via ``wn_sharing_parameter``; otherwise
    ``None`` is passed.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        n_flows=4,
        gin_channels=0,
        share_parameter=False,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()

        if share_parameter:
            self.wn = attentions.FFT(
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers,
                kernel_size,
                p_dropout,
                isflow=True,
                gin_channels=self.gin_channels,
            )
        else:
            self.wn = None

        for _ in range(n_flows):
            coupling = modules.TransformerCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                n_layers,
                n_heads,
                p_dropout,
                filter_channels,
                mean_only=True,
                wn_sharing_parameter=self.wn,
                gin_channels=self.gin_channels,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flows in order, or in reverse order when ``reverse`` is set.

        In the forward direction each flow returns a pair and only the first
        element is kept; in the reverse direction each flow returns ``x`` only.
        """
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
147
+
148
+
149
class StochasticDurationPredictor(nn.Module):
    """Flow-based duration predictor.

    With ``reverse=False`` it returns ``nll + logq`` (one value per batch item)
    for the supplied durations ``w``. With ``reverse=True`` it draws noise and
    runs the flows backwards to produce ``logw``.
    """

    def __init__(
        self,
        in_channels,
        filter_channels,
        kernel_size,
        p_dropout,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        # The filter_channels argument is deliberately overridden.
        filter_channels = in_channels  # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.log_flow = modules.Log()

        # Main flow stack: one affine layer, then (ConvFlow, Flip) pairs.
        self.flows = nn.ModuleList()
        self.flows.append(modules.ElementwiseAffine(2))
        for _ in range(n_flows):
            self.flows.append(
                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
            )
            self.flows.append(modules.Flip())

        # Posterior branch, conditioned on the durations w.
        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(
            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
        )
        self.post_flows = nn.ModuleList()
        self.post_flows.append(modules.ElementwiseAffine(2))
        # The posterior stack always has 4 pairs, independent of n_flows.
        for _ in range(4):
            self.post_flows.append(
                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
            )
            self.post_flows.append(modules.Flip())

        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(
            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
        )
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        """Return the loss term (``reverse=False``) or sampled ``logw``.

        ``w`` is required when ``reverse`` is false. ``x`` and ``g`` are
        detached before use.
        """
        x = self.pre(torch.detach(x))
        if g is not None:
            x = x + self.cond(torch.detach(g))
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if reverse:
            flows = list(reversed(self.flows))
            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
            z = (
                torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
                * noise_scale
            )
            for flow in flows:
                z = flow(z, x_mask, g=x, reverse=reverse)
            # Only the first of the two channels is the log-duration.
            logw, _ = torch.split(z, [1, 1], 1)
            return logw

        assert w is not None

        # Posterior branch: encode w and push noise through post_flows.
        h_w = self.post_pre(w)
        h_w = self.post_convs(h_w, x_mask)
        h_w = self.post_proj(h_w) * x_mask
        e_q = (
            torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype)
            * x_mask
        )
        z_q = e_q
        logdet_tot_q = 0
        for flow in self.post_flows:
            z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
            logdet_tot_q += logdet_q
        z_u, z1 = torch.split(z_q, [1, 1], 1)
        u = torch.sigmoid(z_u) * x_mask
        z0 = (w - u) * x_mask
        # Log-determinant contribution of the sigmoid applied to z_u.
        logdet_tot_q += torch.sum(
            (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
        )
        logq = (
            torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2])
            - logdet_tot_q
        )

        # Main branch: log-flow on z0, then the full flow stack.
        logdet_tot = 0
        z0, logdet = self.log_flow(z0, x_mask)
        logdet_tot += logdet
        z = torch.cat([z0, z1], 1)
        for flow in self.flows:
            z, logdet = flow(z, x_mask, g=x, reverse=reverse)
            logdet_tot = logdet_tot + logdet
        nll = (
            torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2])
            - logdet_tot
        )
        return nll + logq  # [b]
258
+
259
+
260
class DurationPredictor(nn.Module):
    """Deterministic duration predictor.

    Two conv -> ReLU -> LayerNorm -> dropout stages followed by a 1x1
    projection to a single channel; the result is masked by ``x_mask``.
    """

    def __init__(
        self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
    ):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        # kernel_size // 2 keeps the time length unchanged for odd kernels.
        same_pad = kernel_size // 2

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(
            in_channels, filter_channels, kernel_size, padding=same_pad
        )
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(
            filter_channels, filter_channels, kernel_size, padding=same_pad
        )
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        # The conditioning projection only exists when conditioning is enabled.
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        """Return the masked single-channel prediction for ``x``.

        ``x`` and ``g`` are detached, so no gradient flows back into whatever
        produced them.
        """
        x = torch.detach(x)
        if g is not None:
            x = x + self.cond(torch.detach(g))

        stages = ((self.conv_1, self.norm_1), (self.conv_2, self.norm_2))
        for conv, norm in stages:
            x = self.drop(norm(torch.relu(conv(x * x_mask))))

        return self.proj(x * x_mask) * x_mask
301
+
302
+
303
class Bottleneck(nn.Sequential):
    """Sequential of two bias-free ``Linear(in_dim, hidden_dim)`` layers.

    Both layers take ``in_dim`` inputs, so calling the container end to end
    only works when ``hidden_dim == in_dim``.
    """

    def __init__(self, in_dim, hidden_dim):
        super().__init__(
            nn.Linear(in_dim, hidden_dim, bias=False),
            nn.Linear(in_dim, hidden_dim, bias=False),
        )
308
+
309
+
310
class Block(nn.Module):
    """Pre-norm residual block: ``x + MLP(LayerNorm(x))``."""

    def __init__(self, in_dim, hidden_dim) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(in_dim)
        self.mlp = MLP(in_dim, hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the MLP applied to the normalized ``x``."""
        residual = self.mlp(self.norm(x))
        return x + residual
319
+
320
+
321
class MLP(nn.Module):
    """Gated feed-forward layer: ``c_proj(silu(c_fc1(x)) * c_fc2(x))``.

    All three linear layers are bias-free; the output has ``in_dim`` features.
    """

    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)

    def forward(self, x: torch.Tensor):
        """Apply the SiLU-gated projection to ``x``."""
        gate = F.silu(self.c_fc1(x))
        gated = gate * self.c_fc2(x)
        return self.c_proj(gated)
332
+
333
+
334
class TextEncoder(nn.Module):
    """Text encoder combining symbol, tone, language, BERT and emotion inputs.

    The summed embeddings are scaled by ``sqrt(hidden_channels)``, run through
    ``attentions.Encoder`` and projected to ``2 * out_channels`` statistics
    that are split into ``m`` and ``logs``.
    """

    def __init__(
        self,
        n_vocab,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        gin_channels=0,
    ):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        # All three embedding tables share the same init scale.
        init_std = hidden_channels ** -0.5
        # The table is sized from the symbol list, not from n_vocab.
        self.emb = nn.Embedding(len(symbols), hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, init_std)
        self.tone_emb = nn.Embedding(num_tones, hidden_channels)
        nn.init.normal_(self.tone_emb.weight, 0.0, init_std)
        self.language_emb = nn.Embedding(num_languages, hidden_channels)
        nn.init.normal_(self.language_emb.weight, 0.0, init_std)

        # BERT features are expected with 1024 channels.
        self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)

        # Emotion branch: 512-d input -> MLP -> vector quantizer -> hidden.
        self.in_feature_net = nn.Sequential(
            # input is assumed to be an already normalized embedding
            nn.Linear(512, 1028, bias=False),
            nn.GELU(),
            nn.LayerNorm(1028),
            Block(1028, 512),
            nn.Linear(1028, 512, bias=False),
        )
        self.emo_vq = VectorQuantize(
            dim=512,
            codebook_size=256,
            codebook_dim=16,
            commitment_weight=0.1,
            decay=0.99,
            heads=32,
            kmeans_iters=20,
            separate_codebook_per_head=True,
            stochastic_sample_codes=True,
            threshold_ema_dead_code=2,
            use_cosine_sim=True,
        )
        self.out_feature_net = nn.Linear(512, hidden_channels)

        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            gin_channels=self.gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, tone, language, bert, emo, g=None):
        """Encode the inputs.

        Returns ``(x, m, logs, x_mask, loss_commit)`` where ``loss_commit`` is
        the mean of the third value returned by the vector quantizer.
        """
        bert_emb = self.bert_proj(bert).transpose(1, 2)

        emo_emb = self.in_feature_net(emo)
        # unsqueeze(1) adds a length-1 axis so it broadcasts over time below.
        emo_emb, _, loss_commit = self.emo_vq(emo_emb.unsqueeze(1))
        loss_commit = loss_commit.mean()
        emo_emb = self.out_feature_net(emo_emb)

        summed = (
            self.emb(x)
            + self.tone_emb(tone)
            + self.language_emb(language)
            + bert_emb
            + emo_emb
        )
        x = summed * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = torch.transpose(x, 1, -1)  # [b, h, t]

        lengths_mask = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(lengths_mask, 1).to(x.dtype)

        x = self.encoder(x * x_mask, x_mask, g=g)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask, loss_commit
432
+
433
+
434
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` residual coupling layers, each followed by a flip."""

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flows in order, or in reverse order when ``reverse`` is set.

        In the forward direction each flow returns a pair and only the first
        element is kept; in the reverse direction each flow returns ``x`` only.
        """
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
477
+
478
+
479
class PosteriorEncoder(nn.Module):
    """Encode an input sequence into ``m``/``logs`` and a sampled latent ``z``."""

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        # Twice out_channels: one half for m, one half for logs.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)``.

        ``z`` is ``m + noise * exp(logs)``, masked by ``x_mask``.
        """
        lengths_mask = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(lengths_mask, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
519
+
520
+
521
class Generator(torch.nn.Module):
    """Upsampling decoder.

    Each stage applies a leaky ReLU, a weight-normalized transposed
    convolution and the mean of ``num_kernels`` residual blocks. A final
    convolution and ``tanh`` produce a single output channel.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # The string "1" selects ResBlock1; anything else selects ResBlock2.
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        # The channel count halves at every upsampling stage.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            upsample = ConvTranspose1d(
                upsample_initial_channel // (2 ** stage),
                upsample_initial_channel // (2 ** (stage + 1)),
                kernel,
                rate,
                padding=(kernel - rate) // 2,
            )
            self.ups.append(weight_norm(upsample))

        # One group of residual blocks per stage, stored flat.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(resblock_cls(ch, kernel, dilation))

        # ch still holds the channel count of the last stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x`` (optionally conditioned on ``g``) to one output channel."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)

            # Accumulate this stage's residual blocks, then average them.
            base = stage * self.num_kernels
            xs = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[base + offset](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels

        # The final activation uses F.leaky_relu's default slope.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Remove weight normalization from upsampling layers and resblocks."""
        print("Removing weight norm...")
        for upsample in self.ups:
            remove_weight_norm(upsample)
        for block in self.resblocks:
            block.remove_weight_norm()
596
+
597
+
598
class DiscriminatorP(torch.nn.Module):
    """Period discriminator.

    The 1-D input is padded to a multiple of ``period``, reshaped to
    ``[b, c, t // period, period]`` and passed through a stack of 2-D
    convolutions whose kernels span only the first of those two axes.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Only the literal False selects weight_norm.
        norm_f = spectral_norm if use_spectral_norm is not False else weight_norm

        widths = [1, 32, 128, 512, 1024]
        layers = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            layers.append(
                norm_f(
                    Conv2d(
                        c_in,
                        c_out,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                )
            )
        # The last layer keeps the channel count and uses stride 1.
        layers.append(
            norm_f(
                Conv2d(
                    1024,
                    1024,
                    (kernel_size, 1),
                    1,
                    padding=(get_padding(kernel_size, 1), 0),
                )
            )
        )
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened scores, feature maps)`` for ``x``."""
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:  # pad first
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
675
+
676
+
677
class DiscriminatorS(torch.nn.Module):
    """Discriminator built from a stack of strided, grouped 1-D convolutions."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        # Only the literal False selects weight_norm.
        norm_f = spectral_norm if use_spectral_norm is not False else weight_norm

        # (in_channels, out_channels, kernel, stride, groups, padding)
        conv_specs = (
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
                for c_in, c_out, kernel, stride, groups, pad in conv_specs
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened scores, feature maps)`` for ``x``."""
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
705
+
706
+
707
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` plus one ``DiscriminatorP`` per period."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            members.append(
                DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every discriminator on ``y`` and ``y_hat``.

        Returns four lists, one entry per discriminator: scores for ``y``,
        scores for ``y_hat``, feature maps for ``y``, feature maps for
        ``y_hat``.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_r, fmap_r = disc(y)
            score_g, fmap_g = disc(y_hat)
            y_d_rs.append(score_r)
            y_d_gs.append(score_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
732
+
733
+
734
class WavLMDiscriminator(nn.Module):
    """Discriminator over stacked hidden features.

    The input is expected to have ``slm_hidden * slm_layers`` channels. It is
    projected to ``initial_channel`` channels, passed through three
    length-preserving convolutions with leaky ReLU, and reduced to one
    channel that is flattened per batch item.

    Args:
        slm_hidden: Feature size per layer of the upstream model.
        slm_layers: Number of stacked layers in the input.
        initial_channel: Channel count after the input projection.
        use_spectral_norm: Use spectral norm when truthy, weight norm
            otherwise.
    """

    def __init__(
        self, slm_hidden=768, slm_layers=13, initial_channel=64, use_spectral_norm=False
    ):
        super(WavLMDiscriminator, self).__init__()
        # Truthiness check instead of `== False`, so None/0 also pick weight_norm.
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.pre = norm_f(
            Conv1d(slm_hidden * slm_layers, initial_channel, 1, 1, padding=0)
        )

        self.convs = nn.ModuleList(
            [
                norm_f(
                    nn.Conv1d(
                        initial_channel, initial_channel * 2, kernel_size=5, padding=2
                    )
                ),
                norm_f(
                    nn.Conv1d(
                        initial_channel * 2,
                        initial_channel * 4,
                        kernel_size=5,
                        padding=2,
                    )
                ),
                norm_f(
                    nn.Conv1d(initial_channel * 4, initial_channel * 4, 5, 1, padding=2)
                ),
            ]
        )

        self.conv_post = norm_f(Conv1d(initial_channel * 4, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return the flattened score tensor for ``x``.

        Unlike the other discriminators in this file, no feature maps are
        returned.
        """
        x = self.pre(x)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
        x = self.conv_post(x)

        return torch.flatten(x, 1, -1)
781
+
782
+
783
class ReferenceEncoder(nn.Module):
    """Encode a reference spectrogram into a single conditioning vector.

    inputs --- reshaped to [N, 1, Ty, spec_channels]
    outputs --- [N, gin_channels]
    """

    def __init__(self, spec_channels, gin_channels=0):
        super().__init__()
        self.spec_channels = spec_channels

        ref_enc_filters = [32, 32, 64, 64, 128, 128]
        n_convs = len(ref_enc_filters)
        in_filters = [1] + ref_enc_filters[:-1]

        # Each stride-2 conv roughly halves both the time and frequency axes.
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    nn.Conv2d(
                        in_channels=c_in,
                        out_channels=c_out,
                        kernel_size=(3, 3),
                        stride=(2, 2),
                        padding=(1, 1),
                    )
                )
                for c_in, c_out in zip(in_filters, ref_enc_filters)
            ]
        )

        # Frequency bins left after all convs; sets the GRU input width.
        out_channels = self.calculate_channels(spec_channels, 3, 2, 1, n_convs)
        self.gru = nn.GRU(
            input_size=ref_enc_filters[-1] * out_channels,
            hidden_size=256 // 2,
            batch_first=True,
        )
        self.proj = nn.Linear(128, gin_channels)

    def forward(self, inputs, mask=None):
        """Return the projected final GRU state; ``mask`` is not used."""
        batch = inputs.size(0)
        out = inputs.view(batch, 1, -1, self.spec_channels)  # [N, 1, Ty, n_freqs]
        for conv in self.convs:
            out = F.relu(conv(out))  # final: [N, 128, Ty//2^K, n_mels//2^K]

        out = out.transpose(1, 2)  # [N, Ty//2^K, 128, n_mels//2^K]
        batch, steps = out.size(0), out.size(1)
        out = out.contiguous().view(batch, steps, -1)  # [N, Ty//2^K, 128*n_mels//2^K]

        self.gru.flatten_parameters()
        _, final_state = self.gru(out)  # final_state --- [1, N, 128]

        return self.proj(final_state.squeeze(0))

    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
        """Return the length left after ``n_convs`` identical convolutions."""
        for _ in range(n_convs):
            L = (L - kernel_size + 2 * pad) // stride + 1
        return L
840
+
841
+
842
class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training

    Wires together the text encoder (``enc_p``), posterior encoder
    (``enc_q``), flow, both duration predictors (``sdp``/``dp``) and the
    decoder (``dec``). Only ``infer`` is defined on this class.

    Optional keyword arguments read from ``**kwargs``:
        use_spk_conditioned_encoder (default True)
        use_noise_scaled_mas (default False)
        mas_noise_scale_initial (default 0.01)
        noise_scale_delta (default 2e-6)
    """

    def __init__(
        self,
        n_vocab,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        n_speakers=256,
        gin_channels=256,
        use_sdp=True,
        n_flow_layer=4,
        n_layers_trans_flow=6,
        flow_share_parameter=False,
        use_transformer_flow=True,
        **kwargs
    ):
        super().__init__()
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.n_layers_trans_flow = n_layers_trans_flow
        self.use_spk_conditioned_encoder = kwargs.get(
            "use_spk_conditioned_encoder", True
        )
        self.use_sdp = use_sdp
        self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
        self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
        self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
        self.current_mas_noise_scale = self.mas_noise_scale_initial

        # Always define enc_gin_channels. Previously it was only assigned when
        # speaker conditioning was enabled, so any other configuration raised
        # AttributeError while building the text encoder below.
        # NOTE(review): confirm attentions.Encoder accepts gin_channels=0, and
        # whether infer() should then pass g=None to enc_p.
        if self.use_spk_conditioned_encoder and gin_channels > 0:
            self.enc_gin_channels = gin_channels
        else:
            self.enc_gin_channels = 0

        self.enc_p = TextEncoder(
            n_vocab,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            gin_channels=self.enc_gin_channels,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        if use_transformer_flow:
            self.flow = TransformerCouplingBlock(
                inter_channels,
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers_trans_flow,
                5,
                p_dropout,
                n_flow_layer,
                gin_channels=gin_channels,
                share_parameter=flow_share_parameter,
            )
        else:
            self.flow = ResidualCouplingBlock(
                inter_channels,
                hidden_channels,
                5,
                1,
                n_flow_layer,
                gin_channels=gin_channels,
            )
        self.sdp = StochasticDurationPredictor(
            hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
        )
        self.dp = DurationPredictor(
            hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
        )

        # Speaker conditioning: an embedding table when speakers are
        # enumerated, otherwise a reference encoder over a spectrogram.
        if n_speakers >= 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)
        else:
            self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)

    def infer(
        self,
        x,
        x_lengths,
        sid,
        tone,
        language,
        ja_bert,
        emo,
        noise_scale=0.667,
        length_scale=1,
        noise_scale_w=0.8,
        max_len=None,
        sdp_ratio=0,
        y=None,
        **kwargs
    ):
        """Synthesize output from encoded text.

        Args:
            x, x_lengths, tone, language, ja_bert, emo: Passed to ``enc_p``.
            sid: Speaker ids for ``emb_g``; used when ``n_speakers > 0``.
            noise_scale: Scales the noise added to the prior mean.
            length_scale: Multiplies the predicted durations.
            noise_scale_w: Noise scale handed to the stochastic predictor.
            max_len: Optional cap on the number of latent frames decoded.
            sdp_ratio: Weight of the stochastic predictor; the deterministic
                predictor gets ``1 - sdp_ratio``.
            y: Reference input for ``ref_enc``; required when
                ``n_speakers`` is 0.
            **kwargs: Accepted and ignored.

        Returns:
            ``(o, attn, y_mask, (z, z_p, m_p, logs_p))``.
        """
        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        x, m_p, logs_p, x_mask, _ = self.enc_p(
            x, x_lengths, tone, language, ja_bert, emo, g=g
        )
        # Blend the stochastic and deterministic log-duration predictions.
        logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * (
            sdp_ratio
        ) + self.dp(x, x_mask, g=g) * (1 - sdp_ratio)
        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        # Clamp so that every item has at least one output frame.
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
            x_mask.dtype
        )
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)
        o = self.dec((z * y_mask)[:, :, :max_len], g=g)
        return o, attn, y_mask, (z, z_p, m_p, logs_p)
bert_vits2/models_v230.py ADDED
@@ -0,0 +1,1019 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+ from vector_quantize_pytorch import VectorQuantize
6
+
7
+ from bert_vits2 import commons
8
+ from bert_vits2 import modules
9
+ from bert_vits2 import attentions
10
+
11
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
12
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
+
14
+ from bert_vits2.commons import init_weights, get_padding
15
+ from bert_vits2.text import symbols, num_tones, num_languages
16
+
17
+
18
class DurationDiscriminator(nn.Module):  # vits2
    """Discriminator that scores two duration sequences against text features.

    ``forward`` sends the detached encoder features through two
    conv -> ReLU -> LayerNorm -> dropout stages and then scores ``dur_r`` and
    ``dur_hat`` independently with a shared bidirectional LSTM head.
    """

    def __init__(
        self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
    ):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        # kernel_size // 2 keeps the time length unchanged for odd kernels.
        same_pad = kernel_size // 2

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(
            in_channels, filter_channels, kernel_size, padding=same_pad
        )
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(
            filter_channels, filter_channels, kernel_size, padding=same_pad
        )
        self.norm_2 = modules.LayerNorm(filter_channels)
        # Lifts the single-channel duration track to filter_channels.
        self.dur_proj = nn.Conv1d(1, filter_channels, 1)

        # Input is text features concatenated with projected durations.
        self.LSTM = nn.LSTM(
            2 * filter_channels, filter_channels, batch_first=True, bidirectional=True
        )

        # The conditioning projection only exists when conditioning is enabled.
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

        self.output_layer = nn.Sequential(
            nn.Linear(2 * filter_channels, 1), nn.Sigmoid()
        )

    def forward_probability(self, x, dur):
        """Return a per-frame probability for one duration sequence.

        ``dur`` must have a single channel (it goes through ``dur_proj``); it is
        concatenated with ``x`` along the channel axis before the LSTM.
        """
        features = torch.cat([x, self.dur_proj(dur)], dim=1)
        # The LSTM is batch_first, so move channels to the last axis.
        hidden, _ = self.LSTM(features.transpose(1, 2))
        return self.output_layer(hidden)

    def forward(self, x, x_mask, dur_r, dur_hat, g=None):
        """Return ``[prob(dur_r), prob(dur_hat)]``.

        ``x`` and ``g`` are detached, so no gradient flows back into whatever
        produced them.
        """
        x = torch.detach(x)
        if g is not None:
            x = x + self.cond(torch.detach(g))

        stages = ((self.conv_1, self.norm_1), (self.conv_2, self.norm_2))
        for conv, norm in stages:
            x = self.drop(norm(torch.relu(conv(x * x_mask))))

        return [self.forward_probability(x, dur) for dur in (dur_r, dur_hat)]
80
+
81
+
82
class TransformerCouplingBlock(nn.Module):
    """Stack of ``n_flows`` transformer coupling layers, each followed by a flip.

    When ``share_parameter`` is true a single ``attentions.FFT`` is built and
    handed to every coupling layer via ``wn_sharing_parameter``; otherwise
    ``None`` is passed.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        n_flows=4,
        gin_channels=0,
        share_parameter=False,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()

        if share_parameter:
            self.wn = attentions.FFT(
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers,
                kernel_size,
                p_dropout,
                isflow=True,
                gin_channels=self.gin_channels,
            )
        else:
            self.wn = None

        for _ in range(n_flows):
            coupling = modules.TransformerCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                n_layers,
                n_heads,
                p_dropout,
                filter_channels,
                mean_only=True,
                wn_sharing_parameter=self.wn,
                gin_channels=self.gin_channels,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flows in order, or in reverse order when ``reverse`` is set.

        In the forward direction each flow returns a pair and only the first
        element is kept; in the reverse direction each flow returns ``x`` only.
        """
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
146
+
147
+
148
class StochasticDurationPredictor(nn.Module):
    """Flow-based duration model.

    With ``reverse=False`` it returns a per-sample loss term (``nll + logq``)
    for the given durations ``w``; with ``reverse=True`` it draws noise and
    runs the flows backwards to produce ``logw``.
    """

    def __init__(
        self,
        in_channels,
        filter_channels,
        kernel_size,
        p_dropout,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        filter_channels = in_channels  # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.log_flow = modules.Log()
        self.flows = self._make_flow_stack(n_flows, filter_channels, kernel_size)

        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(
            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
        )
        # The posterior stack always uses four ConvFlow/Flip pairs.
        self.post_flows = self._make_flow_stack(4, filter_channels, kernel_size)

        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(
            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
        )
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    @staticmethod
    def _make_flow_stack(n_conv_flows, filter_channels, kernel_size):
        """Build ``ElementwiseAffine`` followed by ``n_conv_flows`` ConvFlow/Flip pairs."""
        stack = nn.ModuleList()
        stack.append(modules.ElementwiseAffine(2))
        for _ in range(n_conv_flows):
            stack.append(
                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
            )
            stack.append(modules.Flip())
        return stack

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        """Return the loss term for ``w`` (forward) or a sampled ``logw`` (reverse)."""
        # Inputs are detached, so this module does not back-propagate into
        # its producers.
        x = self.pre(torch.detach(x))
        if g is not None:
            x = x + self.cond(torch.detach(g))
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if reverse:
            return self._sample_logw(x, x_mask, noise_scale)

        assert w is not None
        return self._loss_term(x, x_mask, w)

    def _loss_term(self, x, x_mask, w):
        """Compute ``nll + logq`` for durations ``w`` conditioned on ``x``."""
        log_2pi = math.log(2 * math.pi)

        h_w = self.post_pre(w)
        h_w = self.post_convs(h_w, x_mask)
        h_w = self.post_proj(h_w) * x_mask
        # Noise is drawn with the default generator and then moved to x's
        # device/dtype.
        e_q = (
            torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype)
            * x_mask
        )

        posterior_cond = x + h_w
        z_q = e_q
        logdet_tot_q = 0
        for flow in self.post_flows:
            z_q, logdet_q = flow(z_q, x_mask, g=posterior_cond)
            logdet_tot_q += logdet_q

        z_u, z1 = torch.split(z_q, [1, 1], 1)
        u = torch.sigmoid(z_u) * x_mask
        z0 = (w - u) * x_mask
        logdet_tot_q += torch.sum(
            (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
        )
        logq = (
            torch.sum(-0.5 * (log_2pi + (e_q ** 2)) * x_mask, [1, 2])
            - logdet_tot_q
        )

        z0, logdet = self.log_flow(z0, x_mask)
        logdet_tot = 0
        logdet_tot = logdet_tot + logdet
        z = torch.cat([z0, z1], 1)
        for flow in self.flows:
            z, logdet = flow(z, x_mask, g=x, reverse=False)
            logdet_tot = logdet_tot + logdet
        nll = (
            torch.sum(0.5 * (log_2pi + (z ** 2)) * x_mask, [1, 2])
            - logdet_tot
        )
        return nll + logq  # [b]

    def _sample_logw(self, x, x_mask, noise_scale):
        """Draw scaled noise and invert the flows to obtain ``logw``."""
        backwards = list(reversed(self.flows))
        backwards = backwards[:-2] + [backwards[-1]]  # remove a useless vflow
        z = (
            torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
            * noise_scale
        )
        for flow in backwards:
            z = flow(z, x_mask, g=x, reverse=True)
        logw, _ = torch.split(z, [1, 1], 1)
        return logw
257
+
258
+
259
class DurationPredictor(nn.Module):
    """Deterministic duration head.

    Two conv / ReLU / LayerNorm / dropout stages followed by a 1x1 projection
    down to a single channel; the result is masked by ``x_mask``.
    """

    def __init__(
        self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
    ):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        half_kernel = kernel_size // 2

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(
            in_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(
            filter_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        # The conditioning projection only exists when a conditioning width
        # was requested.
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        """Return the masked single-channel prediction for ``x``."""
        # Inputs are detached so this head does not train its producers.
        x = torch.detach(x)
        if g is not None:
            x = x + self.cond(torch.detach(g))

        x = self.drop(self.norm_1(torch.relu(self.conv_1(x * x_mask))))
        x = self.drop(self.norm_2(torch.relu(self.conv_2(x * x_mask))))

        return self.proj(x * x_mask) * x_mask
300
+
301
+
302
class Bottleneck(nn.Sequential):
    """Two bias-free linear layers chained in an ``nn.Sequential``.

    Both layers are built as ``in_dim -> hidden_dim``, so calling the module
    only works when ``in_dim == hidden_dim``.
    """

    def __init__(self, in_dim, hidden_dim):
        first = nn.Linear(in_dim, hidden_dim, bias=False)
        second = nn.Linear(in_dim, hidden_dim, bias=False)
        super().__init__(first, second)
307
+
308
+
309
class Block(nn.Module):
    """Pre-norm residual block: ``x + MLP(LayerNorm(x))``."""

    def __init__(self, in_dim, hidden_dim) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(in_dim)
        self.mlp = MLP(in_dim, hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the MLP output for the normalised input back onto ``x``."""
        residual = self.mlp(self.norm(x))
        return x + residual
318
+
319
+
320
class MLP(nn.Module):
    """Gated feed-forward layer: ``c_proj(silu(c_fc1(x)) * c_fc2(x))``.

    All three linear layers are bias-free.
    """

    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)

    def forward(self, x: torch.Tensor):
        """Apply the SiLU-gated projection and map back to ``in_dim``."""
        gate = F.silu(self.c_fc1(x))
        value = self.c_fc2(x)
        return self.c_proj(gate * value)
331
+
332
+
333
class TextEncoder(nn.Module):
    """Embed symbols, tones, languages and BERT features, then encode them.

    With ``zh_bert_extra`` the Chinese BERT input is first reduced from 2048
    to 1024 channels and a vector-quantised emotion embedding is added;
    otherwise separate Japanese and English BERT projections are added.
    """

    def __init__(
        self,
        n_vocab,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        gin_channels=0,
        zh_bert_extra=False,
    ):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        init_std = hidden_channels ** -0.5

        # The symbol table size comes from the imported ``symbols`` list,
        # not from ``n_vocab``.
        self.emb = nn.Embedding(len(symbols), hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, init_std)
        self.tone_emb = nn.Embedding(num_tones, hidden_channels)
        nn.init.normal_(self.tone_emb.weight, 0.0, init_std)
        self.language_emb = nn.Embedding(num_languages, hidden_channels)
        nn.init.normal_(self.language_emb.weight, 0.0, init_std)

        self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
        self.zh_bert_extra = zh_bert_extra
        if self.zh_bert_extra:
            self.bert_pre_proj = nn.Conv1d(2048, 1024, 1)
            self.in_feature_net = nn.Sequential(
                # input is assumed to an already normalized embedding
                nn.Linear(512, 1028, bias=False),
                nn.GELU(),
                nn.LayerNorm(1028),
                Block(1028, 512),
                nn.Linear(1028, 512, bias=False),
            )
            self.emo_vq = VectorQuantize(
                dim=512,
                codebook_size=64,
                codebook_dim=32,
                commitment_weight=0.1,
                decay=0.85,
                heads=32,
                kmeans_iters=20,
                separate_codebook_per_head=True,
                stochastic_sample_codes=True,
                threshold_ema_dead_code=2,
            )
            self.out_feature_net = nn.Linear(512, hidden_channels)
        else:
            self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
            self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)

        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            gin_channels=self.gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, tone, language, zh_bert, ja_bert, en_bert, emo=None, g=None):
        """Return ``(encoded, m, logs, x_mask)`` for a batch of symbol ids."""
        x = self.emb(x) + self.tone_emb(tone) + self.language_emb(language)

        if self.zh_bert_extra:
            zh_bert = self.bert_pre_proj(zh_bert)
            emo_emb = self.in_feature_net(emo)
            emo_emb, _, _ = self.emo_vq(emo_emb.unsqueeze(1))
            x += self.out_feature_net(emo_emb)
            x += self.bert_proj(zh_bert).transpose(1, 2)
        else:
            x += self.bert_proj(zh_bert).transpose(1, 2)
            x += self.ja_bert_proj(ja_bert).transpose(1, 2)
            x += self.en_bert_proj(en_bert).transpose(1, 2)

        x *= math.sqrt(self.hidden_channels)  # [b, t, h]
        x = x.transpose(1, -1)  # [b, h, t]
        valid = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = valid.unsqueeze(1).to(x.dtype)

        x = self.encoder(x * x_mask, x_mask, g=g)
        stats = self.proj(x) * x_mask

        # The projection has 2 * out_channels channels: first half is m,
        # second half is logs.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask
431
+
432
+
433
class ResidualCouplingBlock(nn.Module):
    """Stack of mean-only residual coupling layers, each followed by a ``Flip``."""

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flows in order, or in reversed order when ``reverse`` is true."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        # In the forward direction each flow returns a pair; only the first
        # element is kept.
        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
476
+
477
+
478
class PosteriorEncoder(nn.Module):
    """Encode an input sequence into ``(m, logs)`` and a sample ``z`` drawn from them."""

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` where ``z = m + noise * exp(logs)``, masked."""
        valid = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = valid.unsqueeze(1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        # The projection has 2 * out_channels channels: first half is m,
        # second half is logs.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
518
+
519
+
520
class Generator(torch.nn.Module):
    """Upsampling decoder.

    Each stage applies a weight-normalised transposed convolution and then
    averages the outputs of ``num_kernels`` residual blocks; a final conv and
    ``tanh`` produce a single-channel output.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # "1" selects ResBlock1; any other value selects ResBlock2.
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        # Every upsampling stage halves the channel count.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            upsampler = ConvTranspose1d(
                upsample_initial_channel // (2 ** stage),
                upsample_initial_channel // (2 ** (stage + 1)),
                kernel,
                rate,
                padding=(kernel - rate) // 2,
            )
            self.ups.append(weight_norm(upsampler))

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilations in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(resblock_cls(ch, kernel, dilations))

        # ``ch`` still holds the channel count of the last stage here.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x`` (optionally conditioned on ``g``) into a ``tanh``-bounded signal."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)

            first_block = stage * self.num_kernels
            xs = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first_block + offset](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels

        # The last activation uses leaky_relu's default slope, not
        # LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsamplers and residual blocks."""
        print("Removing weight norm...")
        for upsampler in self.ups:
            remove_weight_norm(upsampler)
        for block in self.resblocks:
            block.remove_weight_norm()
595
+
596
+
597
class DiscriminatorP(torch.nn.Module):
    """Discriminator that folds a 1-D signal into ``period`` columns and applies 2-D convs."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm

        kernel = (kernel_size, 1)
        padding = (get_padding(kernel_size, 1), 0)

        # Four strided layers widen the channels 1 -> 32 -> 128 -> 512 -> 1024;
        # a fifth layer keeps 1024 channels with stride 1.
        widths = [1, 32, 128, 512, 1024]
        layers = [
            norm_f(Conv2d(c_in, c_out, kernel, (stride, 1), padding=padding))
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ]
        layers.append(norm_f(Conv2d(1024, 1024, kernel, 1, padding=padding)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)`` for a ``(b, c, t)`` input."""
        fmap = []

        # 1d to 2d: reflect-pad the time axis up to a multiple of the period
        # so it can be reshaped into rows of ``period`` samples.
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
674
+
675
+
676
class DiscriminatorS(torch.nn.Module):
    """Discriminator built from a stack of grouped 1-D convolutions."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm

        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList(
            [
                norm_f(
                    Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad)
                )
                for c_in, c_out, kernel, stride, groups, pad in layer_specs
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)`` for the input signal."""
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
704
+
705
+
706
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` plus a ``DiscriminatorP`` for each of the periods 2, 3, 5, 7, 11."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        periods = [2, 3, 5, 7, 11]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            members.append(
                DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Run every discriminator on both inputs.

        Returns four lists, each with one entry per discriminator: scores for
        ``y``, scores for ``y_hat``, feature maps for ``y`` and feature maps
        for ``y_hat``.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_r, maps_r = disc(y)
            score_g, maps_g = disc(y_hat)
            y_d_rs.append(score_r)
            y_d_gs.append(score_g)
            fmap_rs.append(maps_r)
            fmap_gs.append(maps_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
731
+
732
+
733
class WavLMDiscriminator(nn.Module):
    """Convolutional discriminator over stacked feature layers.

    The input is expected to carry ``slm_hidden * slm_layers`` channels; a 1x1
    conv reduces it to ``initial_channel`` channels before three width-5 convs
    and a final single-channel conv.
    """

    def __init__(
        self, slm_hidden=768, slm_layers=13, initial_channel=64, use_spectral_norm=False
    ):
        super().__init__()
        # ``==`` (not ``is``) is kept so that falsy-but-equal values such as 0
        # still select weight_norm, as before.
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm  # noqa: E712
        self.pre = norm_f(
            Conv1d(slm_hidden * slm_layers, initial_channel, 1, 1, padding=0)
        )

        self.convs = nn.ModuleList(
            [
                norm_f(
                    nn.Conv1d(
                        initial_channel, initial_channel * 2, kernel_size=5, padding=2
                    )
                ),
                norm_f(
                    nn.Conv1d(
                        initial_channel * 2,
                        initial_channel * 4,
                        kernel_size=5,
                        padding=2,
                    )
                ),
                norm_f(
                    nn.Conv1d(initial_channel * 4, initial_channel * 4, 5, 1, padding=2)
                ),
            ]
        )

        self.conv_post = norm_f(Conv1d(initial_channel * 4, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return the score for ``x`` flattened over every non-batch dimension.

        Unlike the other discriminators in this file, no feature maps are
        returned, so intermediate activations are not collected.
        """
        x = self.pre(x)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
        x = self.conv_post(x)

        return torch.flatten(x, 1, -1)
780
+
781
+
782
class ReferenceEncoder(nn.Module):
    """Summarise a spectrogram into one vector of size ``gin_channels``.

    Six stride-2 2-D convolutions shrink the input, a GRU consumes the
    result along the remaining time axis, and its final hidden state is
    projected to ``gin_channels``.
    """

    def __init__(self, spec_channels, gin_channels=0):
        super().__init__()
        self.spec_channels = spec_channels
        ref_enc_filters = [32, 32, 64, 64, 128, 128]
        n_convs = len(ref_enc_filters)
        widths = [1] + ref_enc_filters

        self.convs = nn.ModuleList(
            [
                weight_norm(
                    nn.Conv2d(
                        in_channels=c_in,
                        out_channels=c_out,
                        kernel_size=(3, 3),
                        stride=(2, 2),
                        padding=(1, 1),
                    )
                )
                for c_in, c_out in zip(widths[:-1], widths[1:])
            ]
        )

        # Size of the frequency axis after all stride-2 convolutions.
        reduced_bins = self.calculate_channels(spec_channels, 3, 2, 1, n_convs)
        self.gru = nn.GRU(
            input_size=ref_enc_filters[-1] * reduced_bins,
            hidden_size=256 // 2,
            batch_first=True,
        )
        self.proj = nn.Linear(128, gin_channels)

    def forward(self, inputs, mask=None):
        """Return the projected final GRU state; ``mask`` is accepted but unused."""
        batch = inputs.size(0)
        out = inputs.view(batch, 1, -1, self.spec_channels)  # [N, 1, Ty, n_freqs]
        for conv in self.convs:
            out = F.relu(conv(out))  # [N, 128, Ty//2^K, n_mels//2^K]

        out = out.transpose(1, 2)  # [N, Ty//2^K, 128, n_mels//2^K]
        batch, steps = out.size(0), out.size(1)
        out = out.contiguous().view(batch, steps, -1)  # [N, Ty//2^K, 128*n_mels//2^K]

        self.gru.flatten_parameters()
        _, final_state = self.gru(out)  # final_state --- [1, N, 128]

        return self.proj(final_state.squeeze(0))

    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
        """Return the length left after ``n_convs`` convolutions applied to length ``L``."""
        for _ in range(n_convs):
            L = (L - kernel_size + 2 * pad) // stride + 1
        return L
839
+
840
+
841
class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training

    Wires together the text encoder (``enc_p``), the posterior encoder
    (``enc_q``), the flow, both duration predictors (``sdp`` and ``dp``), the
    decoder (``dec``) and either a speaker embedding table (``emb_g``) or a
    reference encoder (``ref_enc``).

    Optional keyword arguments read from ``**kwargs``:
    ``use_spk_conditioned_encoder`` (default True), ``use_noise_scaled_mas``
    (default False), ``mas_noise_scale_initial`` (default 0.01) and
    ``noise_scale_delta`` (default 2e-6).
    """

    def __init__(
        self,
        n_vocab,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        n_speakers=256,
        gin_channels=256,
        use_sdp=True,
        n_flow_layer=4,
        n_layers_trans_flow=4,
        flow_share_parameter=False,
        use_transformer_flow=True,
        zh_bert_extra=False,
        **kwargs
    ):
        super().__init__()
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.n_layers_trans_flow = n_layers_trans_flow
        self.use_spk_conditioned_encoder = kwargs.get(
            "use_spk_conditioned_encoder", True
        )
        self.use_sdp = use_sdp
        self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
        self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
        self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
        self.current_mas_noise_scale = self.mas_noise_scale_initial

        # Always define enc_gin_channels. Previously it was only assigned in
        # the conditioned case, so building the text encoder below raised
        # AttributeError for any other configuration.
        if self.use_spk_conditioned_encoder and gin_channels > 0:
            self.enc_gin_channels = gin_channels
        else:
            self.enc_gin_channels = 0

        self.enc_p = TextEncoder(
            n_vocab,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            gin_channels=self.enc_gin_channels,
            zh_bert_extra=zh_bert_extra,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        if use_transformer_flow:
            self.flow = TransformerCouplingBlock(
                inter_channels,
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers_trans_flow,
                5,
                p_dropout,
                n_flow_layer,
                gin_channels=gin_channels,
                share_parameter=flow_share_parameter,
            )
        else:
            self.flow = ResidualCouplingBlock(
                inter_channels,
                hidden_channels,
                5,
                1,
                n_flow_layer,
                gin_channels=gin_channels,
            )
        self.sdp = StochasticDurationPredictor(
            hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
        )
        self.dp = DurationPredictor(
            hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
        )

        # A speaker table when speakers are enumerated, otherwise a reference
        # encoder that derives the conditioning vector from ``y``.
        if n_speakers >= 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)
        else:
            self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)

    def infer(
        self,
        x,
        x_lengths,
        sid,
        tone,
        language,
        zh_bert,
        ja_bert,
        en_bert,
        emo=None,
        noise_scale=0.667,
        length_scale=1,
        noise_scale_w=0.8,
        max_len=None,
        sdp_ratio=0,
        y=None,
        **kwargs,
    ):
        """Synthesise output for a batch of encoded text.

        ``sdp_ratio`` blends the two duration predictors: the stochastic
        predictor is weighted by ``sdp_ratio`` and the deterministic one by
        ``1 - sdp_ratio``. ``length_scale`` multiplies the predicted
        durations, ``noise_scale`` scales the prior noise and
        ``noise_scale_w`` the duration noise. ``y`` is required only when the
        model has no speaker table.

        Returns ``(o, attn, y_mask, (z, z_p, m_p, logs_p))``.

        Raises:
            ValueError: if the model uses the reference encoder and ``y`` is None.
        """
        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            if y is None:
                raise ValueError(
                    "y is required when the model has no speaker embedding table"
                )
            g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)

        # Only hand g to the text encoder when it was built with a
        # conditioning width. NOTE(review): this assumes attentions.Encoder
        # accepts g=None, as the g=None default on TextEncoder.forward suggests.
        enc_g = g if self.enc_gin_channels > 0 else None
        x, m_p, logs_p, x_mask = self.enc_p(
            x, x_lengths, tone, language, zh_bert, ja_bert, en_bert, emo=emo, g=enc_g
        )
        logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * (
            sdp_ratio
        ) + self.dp(x, x_mask, g=g) * (1 - sdp_ratio)
        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        # Clamp so that every item yields at least one output frame.
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
            x_mask.dtype
        )
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)
        o = self.dec((z * y_mask)[:, :, :max_len], g=g)
        return o, attn, y_mask, (z, z_p, m_p, logs_p)
bert_vits2/modules.py ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import scipy
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
+ from torch.nn.utils import weight_norm, remove_weight_norm
11
+
12
+ from bert_vits2 import commons
13
+ from bert_vits2.commons import init_weights, get_padding
14
+ from bert_vits2.transforms import piecewise_rational_quadratic_transform
15
+ from bert_vits2.attentions import Encoder
16
+
17
+ LRELU_SLOPE = 0.1
18
+
19
+
20
class LayerNorm(nn.Module):
    """Layer normalisation applied over dimension 1 of the input.

    The input is transposed so dimension 1 becomes the last one, normalised
    there with learnable ``gamma`` and ``beta``, and transposed back.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Return ``x`` normalised across dimension 1, with the same layout."""
        channels_last = x.transpose(1, -1)
        normalised = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return normalised.transpose(1, -1)
33
+
34
+
35
class ConvReluNorm(nn.Module):
    """Residual stack of Conv1d -> LayerNorm -> ReLU -> Dropout layers.

    The output of the stack goes through a 1x1 projection and is added back
    to the input. That projection is zero-initialised, so a freshly built
    module returns ``x * x_mask``.

    Raises:
        AssertionError: if ``n_layers`` is not greater than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message now states the bound that is actually enforced
        # (it previously said "larger than 0").
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(p_dropout))
        for _ in range(n_layers - 1):
            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # Start as an identity mapping: the residual branch contributes
        # nothing until the projection is trained.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Return ``(x + proj(stack(x))) * x_mask``."""
        x_org = x
        for conv, norm in zip(self.conv_layers, self.norm_layers):
            # Re-mask before every convolution so masked positions do not
            # leak into valid ones.
            x = conv(x * x_mask)
            x = norm(x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
68
+
69
+
70
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution

    Each layer applies a depth-wise dilated convolution (dilation
    ``kernel_size ** i``), LayerNorm and GELU, then a 1x1 convolution,
    LayerNorm, GELU and dropout, and adds the result to its input.
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            dilation = kernel_size ** layer_idx
            padding = (kernel_size * dilation - dilation) // 2
            # groups=channels makes the convolution depth-wise.
            depthwise = nn.Conv1d(
                channels,
                channels,
                kernel_size,
                groups=channels,
                dilation=dilation,
                padding=padding,
            )
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Return the masked output of the residual stack; ``g`` is added to ``x`` first."""
        if g is not None:
            x = x + g
        for layer_idx in range(self.n_layers):
            branch = self.convs_sep[layer_idx](x * x_mask)
            branch = F.gelu(self.norms_1[layer_idx](branch))
            branch = self.convs_1x1[layer_idx](branch)
            branch = F.gelu(self.norms_2[layer_idx](branch))
            x = x + self.drop(branch)
        return x * x_mask
110
+
111
+
112
class WN(torch.nn.Module):
    """Stack of weight-normalised dilated convolutions with gated activations.

    Each of the ``n_layers`` layers runs a dilated ``Conv1d`` producing
    ``2 * hidden_channels`` channels, combines it with an optional slice of
    the projected conditioning ``g`` through
    ``commons.fused_add_tanh_sigmoid_multiply``, applies dropout, and feeds a
    1x1 "res/skip" convolution. For all but the last layer the first half of
    that output is added to the running input and the second half to the
    output accumulator; the last layer contributes only to the accumulator.

    Args:
        hidden_channels: channel count of the input and of the output.
        kernel_size: odd kernel size of the dilated convolutions.
        dilation_rate: layer ``i`` uses dilation ``dilation_rate ** i``.
        n_layers: number of layers.
        gin_channels: channels of the conditioning input; ``0`` means no
            conditioning layer is created.
        p_dropout: dropout probability applied to the gated activations.
    """

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super().__init__()
        assert (kernel_size % 2 == 1)
        self.hidden_channels = hidden_channels
        # Fixed: the original line ended with a stray comma and therefore
        # stored the 1-tuple ``(kernel_size,)`` instead of the integer.
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One projection yields the conditioning slices for all layers.
            cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # The last layer has no residual half, only the skip half.
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of all skip outputs for input ``x``.

        ``g`` is projected by ``self.cond_layer`` when given; that layer only
        exists if the module was built with ``gin_channels != 0``.
        """
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                # Channel slice of the projected conditioning for layer i.
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            acts = commons.fused_add_tanh_sigmoid_multiply(
                x_in,
                g_l,
                n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                res_acts = res_skip_acts[:, :self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Strip the weight-norm reparametrisation from every wrapped conv."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for layer in self.in_layers:
            torch.nn.utils.remove_weight_norm(layer)
        for layer in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(layer)
186
+
187
+
188
class ResBlock1(torch.nn.Module):
    """Residual block made of three (dilated conv, plain conv) pairs.

    ``convs1`` holds three weight-normalised convolutions using the first
    three entries of ``dilation``; ``convs2`` holds three with dilation 1.
    Both lists are initialised with ``init_weights``.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        rates = (dilation[0], dilation[1], dilation[2])
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                               padding=get_padding(kernel_size, rate)))
            for rate in rates
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
            for _ in range(3)
        ])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs; mask before each conv and at the end."""
        use_mask = x_mask is not None
        for dilated, plain in zip(self.convs1, self.convs2):
            hidden = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                hidden = hidden * x_mask
            hidden = F.leaky_relu(dilated(hidden), LRELU_SLOPE)
            if use_mask:
                hidden = hidden * x_mask
            x = plain(hidden) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Remove weight normalisation from every convolution in the block."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)
231
+
232
+
233
class ResBlock2(torch.nn.Module):
    """Residual block made of two weight-normalised dilated convolutions.

    The convolutions use the first two entries of ``dilation`` and are
    initialised with ``init_weights``.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        rates = (dilation[0], dilation[1])
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                               padding=get_padding(kernel_size, rate)))
            for rate in rates
        ])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both residual convolutions; mask before each conv and at the end."""
        use_mask = x_mask is not None
        for conv in self.convs:
            hidden = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                hidden = hidden * x_mask
            x = conv(hidden) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Remove weight normalisation from every convolution in the block."""
        for conv in self.convs:
            remove_weight_norm(conv)
258
+
259
+
260
class Log(nn.Module):
    """Element-wise log flow.

    Forward returns ``(log(clamp_min(x, 1e-5)) * x_mask, logdet)`` where
    ``logdet`` is the sum of ``-y`` over dimensions 1 and 2. Reverse returns
    ``exp(x) * x_mask`` only.
    """

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if reverse:
            return torch.exp(x) * x_mask
        y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
        return y, torch.sum(-y, [1, 2])
269
+
270
+
271
class Flip(nn.Module):
    """Flow step that reverses dimension 1 of ``x``.

    Forward returns ``(flipped, logdet)`` with ``logdet`` a zero vector of
    length ``x.size(0)`` on ``x``'s dtype and device; reverse returns only
    the flipped tensor.
    """

    def forward(self, x, *args, reverse=False, **kwargs):
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        logdet = torch.zeros(flipped.size(0)).to(dtype=flipped.dtype, device=flipped.device)
        return flipped, logdet
279
+
280
+
281
class ElementwiseAffine(nn.Module):
    """Per-channel affine flow with learnable shift ``m`` and log-scale ``logs``.

    Both parameters have shape ``(channels, 1)`` and start at zero, so a
    freshly built layer is the identity on unmasked positions.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Forward: ``(m + exp(logs) * x) * mask`` plus logdet; reverse: the inverse map."""
        if reverse:
            return (x - self.m) * torch.exp(-self.logs) * x_mask
        y = (self.m + torch.exp(self.logs) * x) * x_mask
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
297
+
298
+
299
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer whose statistics come from a ``WN`` network.

    The input is split in two halves along dimension 1. The first half is
    passed through ``pre`` -> ``enc`` (``WN``) -> ``post`` to produce a shift
    ``m`` and, unless ``mean_only`` is set, a log-scale ``logs`` that
    transform the second half. ``post`` is zero-initialised.
    """

    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 p_dropout=0,
                 gin_channels=0,
                 mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers,
                      p_dropout=p_dropout, gin_channels=gin_channels)
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Forward returns ``(x, logdet)``; reverse returns the inverted ``x`` only."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        hidden = self.enc(self.pre(x0) * x_mask, x_mask, g=g)
        stats = self.post(hidden) * x_mask
        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        x = torch.cat([x0, x1], 1)
        logdet = torch.sum(logs, [1, 2])
        return x, logdet
346
+
347
+
348
class ConvFlow(nn.Module):
    """Coupling flow using a piecewise rational-quadratic spline.

    The first half of the channels is encoded by ``pre`` -> ``DDSConv`` ->
    ``proj`` into ``num_bins * 3 - 1`` spline parameters per channel of the
    second half, which is then transformed by
    ``piecewise_rational_quadratic_transform`` with linear tails.
    ``proj`` is zero-initialised.
    """

    def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
        self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Forward returns ``(x, logdet)``; reverse returns the inverted ``x`` only."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        params = self.convs(self.pre(x0), x_mask, g=g)
        params = self.proj(params) * x_mask

        batch, chans, steps = x0.shape
        # [b, c * k, t] -> [b, c, t, k], k being the per-channel parameter count
        params = params.reshape(batch, chans, -1, steps).permute(0, 1, 3, 2)

        bins = self.num_bins
        scale = math.sqrt(self.filter_channels)
        unnormalized_widths = params[..., :bins] / scale
        unnormalized_heights = params[..., bins:2 * bins] / scale
        unnormalized_derivatives = params[..., 2 * bins:]

        x1, logabsdet = piecewise_rational_quadratic_transform(
            x1,
            unnormalized_widths,
            unnormalized_heights,
            unnormalized_derivatives,
            inverse=reverse,
            tails='linear',
            tail_bound=self.tail_bound,
        )

        x = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if reverse:
            return x
        return x, logdet
393
+
394
+
395
class TransformerCouplingLayer(nn.Module):
    """Affine coupling layer whose statistics come from a transformer ``Encoder``.

    The input is split in two halves along dimension 1. The first half is
    passed through ``pre`` -> ``enc`` -> ``post`` to produce a shift ``m``
    and, unless ``mean_only`` is set, a log-scale ``logs`` that transform the
    second half. ``post`` is zero-initialised.

    Args:
        channels: total channel count; must be even.
        hidden_channels: width of ``pre``/``enc``/``post``.
        kernel_size, n_layers, n_heads, p_dropout, filter_channels,
            gin_channels: forwarded to ``Encoder``.
        mean_only: when true only a shift is predicted (``logs`` is zero).
        wn_sharing_parameter: an already-built encoder module to use as
            ``enc`` instead of constructing a new ``Encoder``.
    """

    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 n_layers,
                 n_heads,
                 p_dropout=0,
                 filter_channels=0,
                 mean_only=False,
                 wn_sharing_parameter=None,
                 gin_channels=0
                 ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, isflow=True,
                           gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Forward returns ``(x, logdet)``; reverse returns the inverted ``x`` only."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet

        x1 = (x1 - m) * torch.exp(-logs) * x_mask
        x = torch.cat([x0, x1], 1)
        return x
        # The original carried an unreachable spline-transform tail here
        # (copied from ConvFlow, referencing undefined names); it was removed.
bert_vits2/requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Cython
2
+ librosa==0.9.1
3
+ matplotlib==3.3.1
4
+ numpy
5
+ phonemizer
6
+ scipy
7
+ tensorboard
8
+ torch
9
+ torchvision
10
+ Unidecode
11
+ amfm_decompy
12
+ jieba
13
+ transformers
14
+ pypinyin
15
+ cn2an
bert_vits2/text/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bert_vits2.text.symbols import *
2
+
3
+
4
def cleaned_text_to_sequence_v111(cleaned_text, tones, language, _symbol_to_id):
    """Map cleaned symbols to ids for model versions <= 1.1.1.

    Args:
        cleaned_text: iterable of symbols, each a key of ``_symbol_to_id``.
        tones: iterable of tone numbers, one per symbol.
        language: key into ``language_tone_start_map_v111`` and
            ``language_id_map``.
        _symbol_to_id: mapping from symbol to integer id.

    Returns:
        ``(phones, tones, lang_ids)``: the symbol ids, the tones shifted by
        the language's tone offset, and the language id repeated once per
        phone.
    """
    phones = []
    for symbol in cleaned_text:
        phones.append(_symbol_to_id[symbol])
    offset = language_tone_start_map_v111[language]
    shifted_tones = [tone + offset for tone in tones]
    lang_ids = [language_id_map[language]] * len(phones)
    return phones, shifted_tones, lang_ids
12
+
13
def cleaned_text_to_sequence(cleaned_text, tones, language, _symbol_to_id):
    """Convert cleaned symbols and tones to id sequences.

    Args:
        cleaned_text: iterable of symbols, each a key of ``_symbol_to_id``.
        tones: iterable of tone numbers, one per symbol.
        language: key into ``language_tone_start_map`` and
            ``language_id_map``.
        _symbol_to_id: mapping from symbol to integer id.

    Returns:
        ``(phones, tones, lang_ids)``: the symbol ids, the tones shifted by
        the language's tone offset, and the language id repeated once per
        phone.
    """
    phones = []
    for symbol in cleaned_text:
        phones.append(_symbol_to_id[symbol])
    offset = language_tone_start_map[language]
    shifted_tones = [tone + offset for tone in tones]
    lang_ids = [language_id_map[language]] * len(phones)
    return phones, shifted_tones, lang_ids
bert_vits2/text/chinese.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+
4
+ import cn2an
5
+ from pypinyin import lazy_pinyin, Style
6
+
7
+ from bert_vits2.text.symbols import punctuation
8
+ from bert_vits2.text.tone_sandhi import ToneSandhi
9
+
10
current_file_path = os.path.dirname(__file__)
# Pinyin -> phoneme-symbol table, read from the tab-separated file that sits
# next to this module. A context manager is used so the handle is closed
# (the original left the file open).
with open(os.path.join(current_file_path, 'opencpop-strict.txt')) as _symbol_table_file:
    pinyin_to_symbol_map = {line.split("\t")[0]: line.strip().split("\t")[1]
                            for line in _symbol_table_file}
13
+
14
+ import jieba.posseg as psg
15
+ from jieba import lcut
16
+
17
+ lcut("预加载")
18
+
19
+ rep_map = {
20
+ ":": ",",
21
+ ";": ",",
22
+ ",": ",",
23
+ "。": ".",
24
+ "!": "!",
25
+ "?": "?",
26
+ "\n": ".",
27
+ "·": ",",
28
+ "、": ",",
29
+ "...": "…",
30
+ "$": ".",
31
+ "“": "'",
32
+ "”": "'",
33
+ '"': "'",
34
+ "‘": "'",
35
+ "’": "'",
36
+ "(": "'",
37
+ ")": "'",
38
+ "(": "'",
39
+ ")": "'",
40
+ "《": "'",
41
+ "》": "'",
42
+ "【": "'",
43
+ "】": "'",
44
+ "[": "'",
45
+ "]": "'",
46
+ "—": "-",
47
+ "~": "-",
48
+ "~": "-",
49
+ "「": "'",
50
+ "」": "'",
51
+ }
52
+
53
+ tone_modifier = ToneSandhi()
54
+
55
+
56
def replace_punctuation(text):
    """Normalise punctuation and drop every unsupported character.

    Steps: replace two interjection characters with stand-ins, map every
    ``rep_map`` key to its replacement, then delete every run of characters
    that is neither in the range U+4E00-U+9FA5 nor listed in ``punctuation``.

    The punctuation entries are escaped before being placed in the character
    class, so entries such as ``-`` or ``]`` are always taken literally
    (the original relied on their position in the list).
    """
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile('|'.join(re.escape(p) for p in rep_map))

    replaced_text = pattern.sub(lambda match: rep_map[match.group()], text)

    allowed = "".join(re.escape(p) for p in punctuation)
    replaced_text = re.sub(r'[^\u4e00-\u9fa5' + allowed + r']+', '', replaced_text)

    return replaced_text
65
+
66
+
67
def g2p(text, **kwargs):
    """Convert normalised Chinese text to phones, tones and ``word2ph``.

    The text is split after every punctuation mark, each sentence is handled
    by ``_g2p``, and a ``'_'`` phone (tone 0, ``word2ph`` 1) is added at both
    ends of the result.

    Raises:
        AssertionError: if the phone count does not match ``sum(word2ph)``
            or ``word2ph`` does not have one entry per character of ``text``.
    """
    # Escape the punctuation so every entry is literal inside the class.
    marks = ''.join(re.escape(p) for p in punctuation)
    pattern = r'(?<=[{0}])\s*'.format(marks)
    sentences = [i for i in re.split(pattern, text) if i.strip() != '']
    phones, tones, word2ph = _g2p(sentences)
    assert sum(word2ph) == len(phones)
    assert len(word2ph) == len(text)  # Sometimes it will crash,you can add a try-catch.
    phones = ['_'] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph
77
+
78
+
79
def _get_initials_finals(word):
    """Return ``(initials, finals)`` of ``word`` as two equally long lists.

    Both lists come from ``lazy_pinyin`` (``Style.INITIALS`` and
    ``Style.FINALS_TONE3``, neutral tone written as 5) and are truncated to
    the shorter of the two results.
    """
    raw_initials = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.INITIALS)
    raw_finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    pairs = list(zip(raw_initials, raw_finals))
    return [c for c, _ in pairs], [v for _, v in pairs]
90
+
91
+
92
+ def _g2p(segments, **kwargs):
93
+ phones_list = []
94
+ tones_list = []
95
+ word2ph = []
96
+ for seg in segments:
97
+ pinyins = []
98
+ # Replace all English words in the sentence
99
+ seg = re.sub('[a-zA-Z]+', '', seg)
100
+ seg_cut = psg.lcut(seg)
101
+ initials = []
102
+ finals = []
103
+ seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104
+ for word, pos in seg_cut:
105
+ if pos == 'eng':
106
+ continue
107
+ sub_initials, sub_finals = _get_initials_finals(word)
108
+ sub_finals = tone_modifier.modified_tone(word, pos,
109
+ sub_finals)
110
+ initials.append(sub_initials)
111
+ finals.append(sub_finals)
112
+
113
+ # assert len(sub_initials) == len(sub_finals) == len(word)
114
+ initials = sum(initials, [])
115
+ finals = sum(finals, [])
116
+ #
117
+ for c, v in zip(initials, finals):
118
+ raw_pinyin = c + v
119
+ # NOTE: post process for pypinyin outputs
120
+ # we discriminate i, ii and iii
121
+ if c == v:
122
+ assert c in punctuation
123
+ phone = [c]
124
+ tone = '0'
125
+ word2ph.append(1)
126
+ else:
127
+ v_without_tone = v[:-1]
128
+ tone = v[-1]
129
+
130
+ pinyin = c + v_without_tone
131
+ assert tone in '12345'
132
+
133
+ if c:
134
+ # 多音节
135
+ v_rep_map = {
136
+ "uei": 'ui',
137
+ 'iou': 'iu',
138
+ 'uen': 'un',
139
+ }
140
+ if v_without_tone in v_rep_map.keys():
141
+ pinyin = c + v_rep_map[v_without_tone]
142
+ else:
143
+ # 单音节
144
+ pinyin_rep_map = {
145
+ 'ing': 'ying',
146
+ 'i': 'yi',
147
+ 'in': 'yin',
148
+ 'u': 'wu',
149
+ }
150
+ if pinyin in pinyin_rep_map.keys():
151
+ pinyin = pinyin_rep_map[pinyin]
152
+ else:
153
+ single_rep_map = {
154
+ 'v': 'yu',
155
+ 'e': 'e',
156
+ 'i': 'y',
157
+ 'u': 'w',
158
+ }
159
+ if pinyin[0] in single_rep_map.keys():
160
+ pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
161
+
162
+ assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
163
+ phone = pinyin_to_symbol_map[pinyin].split(' ')
164
+ word2ph.append(len(phone))
165
+
166
+ phones_list += phone
167
+ tones_list += [int(tone)] * len(phone)
168
+ return phones_list, tones_list, word2ph
169
+
170
+
171
def text_normalize(text):
    """Spell out Arabic numerals in Chinese, then normalise punctuation."""
    with_chinese_numbers = cn2an.transform(text, "an2cn")
    return replace_punctuation(with_chinese_numbers)
178
+
179
+
180
def get_bert_feature(text, word2ph, *args, **kwargs):
    """Forward to ``chinese_bert.get_bert_feature``.

    ``chinese_bert.get_bert_feature`` also requires ``tokenizer`` and
    ``model``; the original wrapper passed only ``text`` and ``word2ph`` and
    therefore always raised ``TypeError``. Extra positional and keyword
    arguments are now passed through unchanged.
    """
    from bert_vits2.text import chinese_bert
    return chinese_bert.get_bert_feature(text, word2ph, *args, **kwargs)
183
+
184
+
185
+ if __name__ == '__main__':
186
+ text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏。"
187
+ text = text_normalize(text)
188
+ print(text)
189
+ phones, tones, word2ph = g2p(text)
190
+
191
+ print(phones, tones, word2ph)
192
+ bert = get_bert_feature(text, word2ph)
193
+
194
+ print(bert.shape)
195
+
196
+ # # 示例用法
197
+ # text = "这是一个示例文本:,你好!这是一个测试...."
198
+ # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
bert_vits2/text/chinese_bert.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from contants import config
4
+
5
+
6
def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, style_text=None, style_weight=0.7,
                     **kwargs):
    """Expand token-level BERT hidden states to phone level.

    ``text`` is tokenised and run through ``model``; the third-from-last
    hidden state of the first batch item is used. Row ``i`` is repeated
    ``word2ph[i]`` times. When ``style_text`` is truthy, every row is blended
    with the mean hidden state of ``style_text`` using ``style_weight``.

    Returns:
        The concatenated rows, transposed.

    Raises:
        AssertionError: if ``len(word2ph) != len(text) + 2``.
    """

    def _hidden_states(sentence):
        encoded = tokenizer(sentence, return_tensors='pt')
        for key in encoded:
            encoded[key] = encoded[key].to(device)
        output = model(**encoded, output_hidden_states=True)
        return torch.cat(output['hidden_states'][-3:-2], -1)[0].float().cpu()

    with torch.no_grad():
        res = _hidden_states(text)
        if style_text:
            style_res_mean = _hidden_states(style_text).mean(0)

    assert len(word2ph) == len(text) + 2
    rows = []
    for idx, repeats in enumerate(word2ph):
        expanded = res[idx].repeat(repeats, 1)
        if style_text:
            expanded = (
                    expanded * (1 - style_weight)
                    + style_res_mean.repeat(repeats, 1) * style_weight
            )
        rows.append(expanded)

    return torch.cat(rows, dim=0).T
38
+
39
+
40
+ if __name__ == '__main__':
41
+
42
+ word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征
43
+ word2phone = [1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2,
44
+ 2, 2, 2, 1]
45
+
46
+ # 计算总帧数
47
+ total_frames = sum(word2phone)
48
+ print(word_level_feature.shape)
49
+ print(word2phone)
50
+ phone_level_feature = []
51
+ for i in range(len(word2phone)):
52
+ print(word_level_feature[i].shape)
53
+
54
+ # 对每个词重复word2phone[i]次
55
+ repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
56
+ phone_level_feature.append(repeat_feature)
57
+
58
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
59
+ print(phone_level_feature.shape) # torch.Size([36, 1024])
bert_vits2/text/chinese_bert_extra.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from contants import config
4
+
5
+
6
def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, style_text=None, style_weight=0.7,
                     **kwargs):
    """Expand normalised token-level BERT hidden states to phone level.

    ``text`` is tokenised and run through ``model``; the third-from-last
    hidden state of the first batch item is normalised along dimension 0.
    Row ``i`` is repeated ``word2ph[i]`` times. When ``style_text`` is
    truthy, every row is blended with the mean (normalised) hidden state of
    ``style_text`` using ``style_weight``.

    Returns:
        The concatenated rows, transposed.

    Raises:
        AssertionError: if ``len(word2ph) != len(text) + 2``.
    """

    def _hidden_states(sentence):
        encoded = tokenizer(sentence, return_tensors='pt')
        for key in encoded:
            encoded[key] = encoded[key].to(device)
        output = model(**encoded, output_hidden_states=True)
        stacked = torch.cat(output["hidden_states"][-3:-2], -1)[0]
        return torch.nn.functional.normalize(stacked, dim=0).float().cpu()

    with torch.no_grad():
        res = _hidden_states(text)
        if style_text:
            style_res_mean = _hidden_states(style_text).mean(0)

    assert len(word2ph) == len(text) + 2
    rows = []
    for idx, repeats in enumerate(word2ph):
        expanded = res[idx].repeat(repeats, 1)
        if style_text:
            expanded = (
                    expanded * (1 - style_weight)
                    + style_res_mean.repeat(repeats, 1) * style_weight
            )
        rows.append(expanded)

    return torch.cat(rows, dim=0).T
39
+
40
+
41
+ if __name__ == '__main__':
42
+
43
+ word_level_feature = torch.rand(38, 2048) # 12个词,每个词2048维特征
44
+ word2phone = [1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2,
45
+ 2, 2, 2, 1]
46
+
47
+ # 计算总帧数
48
+ total_frames = sum(word2phone)
49
+ print(word_level_feature.shape)
50
+ print(word2phone)
51
+ phone_level_feature = []
52
+ for i in range(len(word2phone)):
53
+ print(word_level_feature[i].shape)
54
+
55
+ # 对每个词重复word2phone[i]次
56
+ repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
57
+ phone_level_feature.append(repeat_feature)
58
+
59
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
60
+ print(phone_level_feature.shape) # torch.Size([36, 2048])
bert_vits2/text/chinese_v100.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+
4
+ import cn2an
5
+ from pypinyin import lazy_pinyin, Style
6
+
7
+ from bert_vits2.text.symbols import punctuation
8
+ from bert_vits2.text.tone_sandhi import ToneSandhi
9
+
10
current_file_path = os.path.dirname(__file__)
# Pinyin -> phoneme-symbol table, read from the tab-separated file that sits
# next to this module. A context manager is used so the handle is closed
# (the original left the file open).
with open(os.path.join(current_file_path, 'opencpop-strict.txt')) as _symbol_table_file:
    pinyin_to_symbol_map = {line.split("\t")[0]: line.strip().split("\t")[1]
                            for line in _symbol_table_file}
13
+
14
+ import jieba.posseg as psg
15
+ from jieba import lcut
16
+
17
+ lcut("预加载")
18
+
19
+ rep_map = {
20
+ ':': ',',
21
+ ';': ',',
22
+ ',': ',',
23
+ '。': '.',
24
+ '!': '!',
25
+ '?': '?',
26
+ '\n': '.',
27
+ "·": ",",
28
+ '、': ",",
29
+ '...': '…',
30
+ '$': '.',
31
+ '“': "'",
32
+ '”': "'",
33
+ '‘': "'",
34
+ '’': "'",
35
+ '(': "'",
36
+ ')': "'",
37
+ '(': "'",
38
+ ')': "'",
39
+ '《': "'",
40
+ '》': "'",
41
+ '【': "'",
42
+ '】': "'",
43
+ '[': "'",
44
+ ']': "'",
45
+ '—': "-",
46
+ '~': "-",
47
+ '~': "-",
48
+ '「': "'",
49
+ '」': "'",
50
+ }
51
+
52
+ tone_modifier = ToneSandhi()
53
+
54
+
55
def replace_punctuation(text):
    """Normalise punctuation and drop every unsupported character.

    Steps: replace two interjection characters with stand-ins, map every
    ``rep_map`` key to its replacement, then delete every run of characters
    that is neither in the range U+4E00-U+9FA5 nor listed in ``punctuation``.

    The punctuation entries are escaped before being placed in the character
    class, so entries such as ``-`` or ``]`` are always taken literally
    (the original relied on their position in the list).
    """
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile('|'.join(re.escape(p) for p in rep_map))

    replaced_text = pattern.sub(lambda match: rep_map[match.group()], text)

    allowed = "".join(re.escape(p) for p in punctuation)
    replaced_text = re.sub(r'[^\u4e00-\u9fa5' + allowed + r']+', '', replaced_text)

    return replaced_text
64
+
65
+
66
def g2p(text, **kwargs):
    """Convert normalised Chinese text to phones, tones and ``word2ph``.

    The text is split after every punctuation mark, each sentence is handled
    by ``_g2p``, and a ``'_'`` phone (tone 0, ``word2ph`` 1) is added at both
    ends of the result.

    Raises:
        AssertionError: if the phone count does not match ``sum(word2ph)``
            or ``word2ph`` does not have one entry per character of ``text``.
    """
    # Escape the punctuation so every entry is literal inside the class.
    marks = ''.join(re.escape(p) for p in punctuation)
    pattern = r'(?<=[{0}])\s*'.format(marks)
    sentences = [i for i in re.split(pattern, text) if i.strip() != '']
    phones, tones, word2ph = _g2p(sentences)
    assert sum(word2ph) == len(phones)
    assert len(word2ph) == len(text)  # Sometimes it will crash,you can add a try-catch.
    phones = ['_'] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph
76
+
77
+
78
def _get_initials_finals(word):
    """Return ``(initials, finals)`` of ``word`` as two equally long lists.

    Both lists come from ``lazy_pinyin`` (``Style.INITIALS`` and
    ``Style.FINALS_TONE3``, neutral tone written as 5) and are truncated to
    the shorter of the two results.
    """
    raw_initials = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.INITIALS)
    raw_finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    pairs = list(zip(raw_initials, raw_finals))
    return [c for c, _ in pairs], [v for _, v in pairs]
89
+
90
+
91
+ def _g2p(segments):
92
+ phones_list = []
93
+ tones_list = []
94
+ word2ph = []
95
+ for seg in segments:
96
+ pinyins = []
97
+ # Replace all English words in the sentence
98
+ seg = re.sub('[a-zA-Z]+', '', seg)
99
+ seg_cut = psg.lcut(seg)
100
+ initials = []
101
+ finals = []
102
+ seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
103
+ for word, pos in seg_cut:
104
+ if pos == 'eng':
105
+ continue
106
+ sub_initials, sub_finals = _get_initials_finals(word)
107
+ sub_finals = tone_modifier.modified_tone(word, pos,
108
+ sub_finals)
109
+ initials.append(sub_initials)
110
+ finals.append(sub_finals)
111
+
112
+ # assert len(sub_initials) == len(sub_finals) == len(word)
113
+ initials = sum(initials, [])
114
+ finals = sum(finals, [])
115
+ #
116
+ for c, v in zip(initials, finals):
117
+ raw_pinyin = c + v
118
+ # NOTE: post process for pypinyin outputs
119
+ # we discriminate i, ii and iii
120
+ if c == v:
121
+ assert c in punctuation
122
+ phone = [c]
123
+ tone = '0'
124
+ word2ph.append(1)
125
+ else:
126
+ v_without_tone = v[:-1]
127
+ tone = v[-1]
128
+
129
+ pinyin = c + v_without_tone
130
+ assert tone in '12345'
131
+
132
+ if c:
133
+ # 多音节
134
+ v_rep_map = {
135
+ "uei": 'ui',
136
+ 'iou': 'iu',
137
+ 'uen': 'un',
138
+ }
139
+ if v_without_tone in v_rep_map.keys():
140
+ pinyin = c + v_rep_map[v_without_tone]
141
+ else:
142
+ # 单音节
143
+ pinyin_rep_map = {
144
+ 'ing': 'ying',
145
+ 'i': 'yi',
146
+ 'in': 'yin',
147
+ 'u': 'wu',
148
+ }
149
+ if pinyin in pinyin_rep_map.keys():
150
+ pinyin = pinyin_rep_map[pinyin]
151
+ else:
152
+ single_rep_map = {
153
+ 'v': 'yu',
154
+ 'e': 'e',
155
+ 'i': 'y',
156
+ 'u': 'w',
157
+ }
158
+ if pinyin[0] in single_rep_map.keys():
159
+ pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160
+
161
+ assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162
+ phone = pinyin_to_symbol_map[pinyin].split(' ')
163
+ word2ph.append(len(phone))
164
+
165
+ phones_list += phone
166
+ tones_list += [int(tone)] * len(phone)
167
+ return phones_list, tones_list, word2ph
168
+
169
+
170
def text_normalize(text):
    """Spell out Arabic numerals in Chinese, then normalise punctuation."""
    with_chinese_numbers = cn2an.transform(text, "an2cn")
    return replace_punctuation(with_chinese_numbers)
177
+
178
+
179
def get_bert_feature(text, word2ph, *args, **kwargs):
    """Forward to ``chinese_bert.get_bert_feature``.

    ``chinese_bert.get_bert_feature`` also requires ``tokenizer`` and
    ``model``; the original wrapper passed only ``text`` and ``word2ph`` and
    therefore always raised ``TypeError``. Extra positional and keyword
    arguments are now passed through unchanged.
    """
    from bert_vits2.text import chinese_bert
    return chinese_bert.get_bert_feature(text, word2ph, *args, **kwargs)
182
+
183
+
184
+ if __name__ == '__main__':
185
+ text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏。"
186
+ text = text_normalize(text)
187
+ print(text)
188
+ phones, tones, word2ph = g2p(text)
189
+
190
+ print(phones, tones, word2ph)
191
+ bert = get_bert_feature(text, word2ph)
192
+
193
+ print(bert.shape)
194
+
195
+ # # 示例用法
196
+ # text = "这是一个示例文本:,你好!这是一个测试...."
197
+ # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
bert_vits2/text/chinese_v240.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+
4
+ from pypinyin import Style
5
+ from bert_vits2.text.symbols import punctuation
6
+ from bert_vits2.text.tone_sandhi import ToneSandhi
7
+
8
+ import cn2an
9
+
10
+ normalizer = lambda x: cn2an.transform(x, "an2cn")
11
+
12
current_file_path = os.path.dirname(__file__)
# Pinyin -> phoneme-symbol table, read from the tab-separated file that sits
# next to this module. A context manager is used so the handle is closed
# (the original left the file open).
with open(os.path.join(current_file_path, "opencpop-strict.txt")) as _symbol_table_file:
    pinyin_to_symbol_map = {
        line.split("\t")[0]: line.strip().split("\t")[1]
        for line in _symbol_table_file
    }
17
+
18
+ import jieba.posseg as psg
19
+
20
+ rep_map = {
21
+ ":": ",",
22
+ ";": ",",
23
+ ",": ",",
24
+ "。": ".",
25
+ "!": "!",
26
+ "?": "?",
27
+ "\n": ".",
28
+ "·": ",",
29
+ "、": ",",
30
+ "...": "…",
31
+ "$": ".",
32
+ "“": "'",
33
+ "”": "'",
34
+ '"': "'",
35
+ "‘": "'",
36
+ "’": "'",
37
+ "(": "'",
38
+ ")": "'",
39
+ "(": "'",
40
+ ")": "'",
41
+ "《": "'",
42
+ "》": "'",
43
+ "【": "'",
44
+ "】": "'",
45
+ "[": "'",
46
+ "]": "'",
47
+ "—": "-",
48
+ "~": "-",
49
+ "~": "-",
50
+ "「": "'",
51
+ "」": "'",
52
+ }
53
+
54
+ tone_modifier = ToneSandhi()
55
+
56
+
57
def replace_punctuation(text):
    """Normalise punctuation and drop every unsupported character.

    Steps: replace two interjection characters with stand-ins, map every
    ``rep_map`` key to its replacement, then delete every run of characters
    that is neither in the range U+4E00-U+9FA5 nor listed in ``punctuation``.

    The punctuation entries are escaped before being placed in the character
    class, so entries such as ``-`` or ``]`` are always taken literally
    (the original relied on their position in the list).
    """
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map))

    replaced_text = pattern.sub(lambda match: rep_map[match.group()], text)

    allowed = "".join(re.escape(p) for p in punctuation)
    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5" + allowed + r"]+", "", replaced_text
    )

    return replaced_text
68
+
69
+
70
def g2p(text, pinyinPlus=None, **kwargs):
    """Convert normalised Chinese text to phones, tones and ``word2ph``.

    The text is split after every punctuation mark, each sentence is handled
    by ``_g2p`` (which receives ``pinyinPlus``), and a ``"_"`` phone (tone 0,
    ``word2ph`` 1) is added at both ends of the result.

    Raises:
        AssertionError: if the phone count does not match ``sum(word2ph)``
            or ``word2ph`` does not have one entry per character of ``text``.
    """
    # Escape the punctuation so every entry is literal inside the class.
    marks = "".join(re.escape(p) for p in punctuation)
    pattern = r"(?<=[{0}])\s*".format(marks)
    sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
    phones, tones, word2ph = _g2p(sentences, pinyinPlus)
    assert sum(word2ph) == len(phones)
    assert len(word2ph) == len(text)  # Sometimes it will crash,you can add a try-catch.
    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph
80
+
81
+
82
+ def _get_initials_finalsV2(word, orig_initials, orig_finals):
83
+ initials = []
84
+ finals = []
85
+ for c, v in zip(orig_initials, orig_finals):
86
+ initials.append(c)
87
+ finals.append(v)
88
+ return initials, finals
89
+
90
+
91
+ def _g2p(segments, pinyinPlus, **kwargs):
92
+ phones_list = []
93
+ tones_list = []
94
+ word2ph = []
95
+ for seg in segments:
96
+ # Replace all English words in the sentence
97
+
98
+ seg = re.sub("[a-zA-Z]+", "", seg)
99
+
100
+ seg_cut = psg.lcut(seg)
101
+ initials = []
102
+ finals = []
103
+ seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104
+ allWords = ""
105
+ for word, pos in seg_cut:
106
+ allWords = allWords + word
107
+
108
+ orig_initials = pinyinPlus.lazy_pinyin(
109
+ allWords, neutral_tone_with_five=True, style=Style.INITIALS
110
+ )
111
+ orig_finals = pinyinPlus.lazy_pinyin(
112
+ allWords, neutral_tone_with_five=True, style=Style.FINALS_TONE3
113
+ )
114
+ currentIndex = 0
115
+ for word, pos in seg_cut:
116
+ curr_orig_initials = orig_initials[currentIndex: currentIndex + len(word)]
117
+ curr_orig_finalss = orig_finals[currentIndex: currentIndex + len(word)]
118
+ currentIndex = currentIndex + len(word)
119
+ if pos == "eng":
120
+ continue
121
+ sub_initials, sub_finals = _get_initials_finalsV2(
122
+ word, curr_orig_initials, curr_orig_finalss
123
+ )
124
+ sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
125
+ initials.append(sub_initials)
126
+ finals.append(sub_finals)
127
+
128
+ # assert len(sub_initials) == len(sub_finals) == len(word)
129
+ initials = sum(initials, [])
130
+ finals = sum(finals, [])
131
+ #
132
+ for c, v in zip(initials, finals):
133
+ raw_pinyin = c + v
134
+ # NOTE: post process for pypinyin outputs
135
+ # we discriminate i, ii and iii
136
+ if c == v:
137
+ assert c in punctuation
138
+ phone = [c]
139
+ tone = "0"
140
+ word2ph.append(1)
141
+ else:
142
+ v_without_tone = v[:-1]
143
+ tone = v[-1]
144
+
145
+ pinyin = c + v_without_tone
146
+ assert tone in "12345"
147
+
148
+ if c:
149
+ # 多音节
150
+ v_rep_map = {
151
+ "uei": "ui",
152
+ "iou": "iu",
153
+ "uen": "un",
154
+ }
155
+ if v_without_tone in v_rep_map.keys():
156
+ pinyin = c + v_rep_map[v_without_tone]
157
+ else:
158
+ # 单音节
159
+ pinyin_rep_map = {
160
+ "ing": "ying",
161
+ "i": "yi",
162
+ "in": "yin",
163
+ "u": "wu",
164
+ }
165
+ if pinyin in pinyin_rep_map.keys():
166
+ pinyin = pinyin_rep_map[pinyin]
167
+ else:
168
+ single_rep_map = {
169
+ "v": "yu",
170
+ "e": "e",
171
+ "i": "y",
172
+ "u": "w",
173
+ }
174
+ if pinyin[0] in single_rep_map.keys():
175
+ pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
176
+
177
+ assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
178
+ phone = pinyin_to_symbol_map[pinyin].split(" ")
179
+ word2ph.append(len(phone))
180
+
181
+ phones_list += phone
182
+ tones_list += [int(tone)] * len(phone)
183
+ return phones_list, tones_list, word2ph
184
+
185
+
186
def text_normalize(text):
    """Run the module-level ``normalizer`` on ``text``, then normalise punctuation."""
    return replace_punctuation(normalizer(text))
190
+
191
+
192
def get_bert_feature(text, word2ph, *args, **kwargs):
    """Forward to ``chinese_bert_extra.get_bert_feature``.

    ``chinese_bert_extra.get_bert_feature`` also requires ``tokenizer`` and
    ``model``; the original wrapper passed only ``text`` and ``word2ph`` and
    therefore always raised ``TypeError``. Extra positional and keyword
    arguments are now passed through unchanged.
    """
    from bert_vits2.text import chinese_bert_extra as chinese_bert

    return chinese_bert.get_bert_feature(text, word2ph, *args, **kwargs)
196
+
197
+
198
+ if __name__ == "__main__":
199
+ from bert_vits2.text.chinese_bert import get_bert_feature
200
+
201
+ text = "欸,这个「勾玉」的形状,是不是和那边门上的凹槽很像?"
202
+ text = text_normalize(text)
203
+ print(text)
204
+ phones, tones, word2ph = g2p(text)
205
+ bert = get_bert_feature(text, word2ph)
206
+
207
+ print(phones, tones, word2ph, bert.shape)
208
+
209
+ # # 示例用法
210
+ # text = "这是一个示例文本:,你好!这是一个测试...."
211
+ # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
bert_vits2/text/cleaner.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bert_vits2.text import chinese, japanese, english, cleaned_text_to_sequence, japanese_v111, chinese_v100, \
2
+ japanese_v200, english_v200, english_v230, chinese_v240, japanese_extra
3
+
4
+ language_module_map = {
5
+ 'zh': chinese,
6
+ 'ja': japanese,
7
+ 'en': english,
8
+ 'ja_v111': japanese_v111,
9
+ 'zh_v100': chinese_v100,
10
+ 'ja_v200': japanese_v200,
11
+ 'en_v200': english_v200,
12
+ 'en_v230': english_v230,
13
+ 'zh_v240': chinese_v240,
14
+ 'ja_extra': japanese_extra,
15
+ }
16
+
17
+
18
+ # _loaded_modules = {}
19
+ #
20
+ #
21
+ # def get_language_module(language):
22
+ # if language not in _loaded_modules:
23
+ # module_path = language_module_map.get(language)
24
+ # if not module_path:
25
+ # raise ValueError(f"Unsupported language: {language}")
26
+ #
27
+ # _loaded_modules[language] = importlib.import_module(module_path)
28
+ #
29
+ # return _loaded_modules[language]
30
+
31
+
32
def clean_text(text, language, tokenizer, pinyinPlus=None):
    """Normalize ``text`` and convert it to phones for ``language``.

    Args:
        text: Raw input text.
        language: Key into ``language_module_map``; an unknown key raises
            ``KeyError``.
        tokenizer: Forwarded to the language module's ``g2p`` as a keyword.
        pinyinPlus: Forwarded to the language module's ``g2p`` as a keyword.

    Returns:
        Tuple ``(norm_text, phones, tones, word2ph)``.
    """
    module = language_module_map[language]
    normalized = module.text_normalize(text)
    phones, tones, word2ph = module.g2p(
        normalized,
        tokenizer=tokenizer,
        pinyinPlus=pinyinPlus,
    )
    return normalized, phones, tones, word2ph
37
+
38
+
39
+ # def clean_text_bert(text, language, tokenizer):
40
+ # language_module = language_module_map[language]
41
+ # norm_text = language_module.text_normalize(text)
42
+ # phones, tones, word2ph = language_module.g2p(norm_text, tokenizer)
43
+ # bert = language_module.get_bert_feature(norm_text, word2ph)
44
+ # return phones, tones, bert
45
+
46
+
47
def text_to_sequence(text, language, tokenizer):
    """Clean ``text`` and encode its phones and tones as a sequence.

    Only the phones and tones produced by ``clean_text`` are used; the
    normalized text and ``word2ph`` are discarded.
    """
    _, phones, tones, _ = clean_text(text, language, tokenizer)
    return cleaned_text_to_sequence(phones, tones, language)
50
+
51
+
52
+ if __name__ == '__main__':
53
+ pass
bert_vits2/text/cmudict.rep ADDED
The diff for this file is too large to render. See raw diff
 
bert_vits2/text/cmudict_cache.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
3
+ size 6212655
bert_vits2/text/english.py ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import os
3
+ import re
4
+ from g2p_en import G2p
5
+
6
+ from bert_vits2.text import symbols
7
+
8
+ current_file_path = os.path.dirname(__file__)
9
+ CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
10
+ CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
11
+ _g2p = G2p()
12
+
13
+ arpa = {
14
+ "AH0",
15
+ "S",
16
+ "AH1",
17
+ "EY2",
18
+ "AE2",
19
+ "EH0",
20
+ "OW2",
21
+ "UH0",
22
+ "NG",
23
+ "B",
24
+ "G",
25
+ "AY0",
26
+ "M",
27
+ "AA0",
28
+ "F",
29
+ "AO0",
30
+ "ER2",
31
+ "UH1",
32
+ "IY1",
33
+ "AH2",
34
+ "DH",
35
+ "IY0",
36
+ "EY1",
37
+ "IH0",
38
+ "K",
39
+ "N",
40
+ "W",
41
+ "IY2",
42
+ "T",
43
+ "AA1",
44
+ "ER1",
45
+ "EH2",
46
+ "OY0",
47
+ "UH2",
48
+ "UW1",
49
+ "Z",
50
+ "AW2",
51
+ "AW1",
52
+ "V",
53
+ "UW2",
54
+ "AA2",
55
+ "ER",
56
+ "AW0",
57
+ "UW0",
58
+ "R",
59
+ "OW1",
60
+ "EH1",
61
+ "ZH",
62
+ "AE0",
63
+ "IH2",
64
+ "IH",
65
+ "Y",
66
+ "JH",
67
+ "P",
68
+ "AY1",
69
+ "EY0",
70
+ "OY2",
71
+ "TH",
72
+ "HH",
73
+ "D",
74
+ "ER0",
75
+ "CH",
76
+ "AO1",
77
+ "AE1",
78
+ "AO2",
79
+ "OY1",
80
+ "AY2",
81
+ "IH1",
82
+ "OW0",
83
+ "L",
84
+ "SH",
85
+ }
86
+
87
+
88
def post_replace_ph(ph):
    """Map a single phone or punctuation mark onto the symbol inventory.

    A few punctuation variants (and lowercase ``v``) are rewritten first;
    whatever is still not a member of ``symbols`` becomes ``"UNK"``.
    """
    canonical = {
        ":": ",",
        ";": ",",
        ",": ",",
        "。": ".",
        "!": "!",
        "?": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "…": "...",
        "···": "...",
        "・・・": "...",
        "v": "V",
    }
    candidate = canonical.get(ph, ph)
    return candidate if candidate in symbols else "UNK"
111
+
112
+
113
+ rep_map = {
114
+ ":": ",",
115
+ ";": ",",
116
+ ",": ",",
117
+ "。": ".",
118
+ "!": "!",
119
+ "?": "?",
120
+ "\n": ".",
121
+ ".": ".",
122
+ "…": "...",
123
+ "···": "...",
124
+ "・・・": "...",
125
+ "·": ",",
126
+ "・": ",",
127
+ "、": ",",
128
+ "$": ".",
129
+ "“": "'",
130
+ "”": "'",
131
+ '"': "'",
132
+ "‘": "'",
133
+ "’": "'",
134
+ "(": "'",
135
+ ")": "'",
136
+ "(": "'",
137
+ ")": "'",
138
+ "《": "'",
139
+ "》": "'",
140
+ "【": "'",
141
+ "】": "'",
142
+ "[": "'",
143
+ "]": "'",
144
+ "—": "-",
145
+ "−": "-",
146
+ "~": "-",
147
+ "~": "-",
148
+ "「": "'",
149
+ "」": "'",
150
+ }
151
+
152
+
153
def replace_punctuation(text):
    """Return ``text`` with every ``rep_map`` key replaced by its value.

    The keys are escaped and joined into one alternation, so earlier keys in
    ``rep_map`` win when several could match at the same position.
    """
    alternation = "|".join(map(re.escape, rep_map))
    return re.sub(alternation, lambda hit: rep_map[hit.group()], text)
167
+
168
+
169
def read_dict():
    """Parse the pronouncing dictionary stored at ``CMU_DICT_PATH``.

    Lines before line 49 of the file are skipped. Every remaining line is
    read as a word followed by its pronunciation, where syllables are
    separated by ``" - "`` and phones inside a syllable by single spaces.

    Returns:
        dict mapping each word to a list of syllables; each syllable is a
        list of phone strings.
    """
    first_entry_line = 49
    g2p_dict = {}
    with open(CMU_DICT_PATH) as f:
        for line_number, raw_line in enumerate(f, start=1):
            if line_number < first_entry_line:
                continue
            # Split only at the first run of whitespace so the pronunciation
            # keeps its " - " syllable separators for the next step.
            fields = raw_line.strip().split(maxsplit=1)
            if len(fields) < 2:
                # Blank line or a word without pronunciation: nothing to add.
                continue
            word, pronunciation = fields
            g2p_dict[word] = [
                syllable.split(" ") for syllable in pronunciation.split(" - ")
            ]
    return g2p_dict
191
+
192
+
193
def cache_dict(g2p_dict, file_path):
    """Pickle ``g2p_dict`` into ``file_path``, replacing any existing file."""
    with open(file_path, mode="wb") as sink:
        pickle.dump(g2p_dict, sink)
196
+
197
+
198
def get_dict():
    """Return the pronunciation dictionary, using the pickle cache if present.

    When ``CACHE_PATH`` does not exist the dictionary is parsed with
    ``read_dict`` and written to the cache before being returned.
    """
    if not os.path.exists(CACHE_PATH):
        parsed = read_dict()
        cache_dict(parsed, CACHE_PATH)
        return parsed

    # NOTE(review): unpickling runs code embedded in the file, so this is only
    # safe while the cache file comes from a trusted source.
    with open(CACHE_PATH, "rb") as cached:
        return pickle.load(cached)
207
+
208
+
209
+ eng_dict = get_dict()
210
+
211
+
212
def refine_ph(phn):
    """Split a phone into its lowercase name and a tone number.

    A trailing digit is removed and returned as ``digit + 1``; a phone
    without a trailing digit gets tone ``0``.
    """
    if re.search(r"\d$", phn) is None:
        return phn.lower(), 0
    return phn[:-1].lower(), int(phn[-1]) + 1
218
+
219
+
220
def refine_syllables(syllables):
    """Flatten ``syllables`` into parallel phone and tone lists.

    Every phone of every syllable is passed through ``refine_ph``; the
    results are returned as ``(phonemes, tones)`` in input order.
    """
    refined = [refine_ph(phone) for syllable in syllables for phone in syllable]
    phonemes = [name for name, _ in refined]
    tones = [tone for _, tone in refined]
    return phonemes, tones
230
+
231
+
232
+ import re
233
+ import inflect
234
+
235
+ _inflect = inflect.engine()
236
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
237
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
238
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
239
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
240
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
241
+ _number_re = re.compile(r"[0-9]+")
242
+
243
+ # List of (regular expression, replacement) pairs for abbreviations:
244
+ _abbreviations = [
245
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
246
+ for x in [
247
+ ("mrs", "misess"),
248
+ ("mr", "mister"),
249
+ ("dr", "doctor"),
250
+ ("st", "saint"),
251
+ ("co", "company"),
252
+ ("jr", "junior"),
253
+ ("maj", "major"),
254
+ ("gen", "general"),
255
+ ("drs", "doctors"),
256
+ ("rev", "reverend"),
257
+ ("lt", "lieutenant"),
258
+ ("hon", "honorable"),
259
+ ("sgt", "sergeant"),
260
+ ("capt", "captain"),
261
+ ("esq", "esquire"),
262
+ ("ltd", "limited"),
263
+ ("col", "colonel"),
264
+ ("ft", "fort"),
265
+ ]
266
+ ]
267
+
268
+ # List of (ipa, lazy ipa) pairs:
269
+ _lazy_ipa = [
270
+ (re.compile("%s" % x[0]), x[1])
271
+ for x in [
272
+ ("r", "ɹ"),
273
+ ("æ", "e"),
274
+ ("ɑ", "a"),
275
+ ("ɔ", "o"),
276
+ ("ð", "z"),
277
+ ("θ", "s"),
278
+ ("ɛ", "e"),
279
+ ("ɪ", "i"),
280
+ ("ʊ", "u"),
281
+ ("ʒ", "ʥ"),
282
+ ("ʤ", "ʥ"),
283
+ ("ˈ", "↓"),
284
+ ]
285
+ ]
286
+
287
+ # List of (ipa, lazy ipa2) pairs:
288
+ _lazy_ipa2 = [
289
+ (re.compile("%s" % x[0]), x[1])
290
+ for x in [
291
+ ("r", "ɹ"),
292
+ ("ð", "z"),
293
+ ("θ", "s"),
294
+ ("ʒ", "ʑ"),
295
+ ("ʤ", "dʑ"),
296
+ ("ˈ", "↓"),
297
+ ]
298
+ ]
299
+
300
+ # List of (ipa, ipa2) pairs
301
+ _ipa_to_ipa2 = [
302
+ (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
303
+ ]
304
+
305
+
306
+ def _expand_dollars(m):
307
+ match = m.group(1)
308
+ parts = match.split(".")
309
+ if len(parts) > 2:
310
+ return match + " dollars" # Unexpected format
311
+ dollars = int(parts[0]) if parts[0] else 0
312
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
313
+ if dollars and cents:
314
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
315
+ cent_unit = "cent" if cents == 1 else "cents"
316
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
317
+ elif dollars:
318
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
319
+ return "%s %s" % (dollars, dollar_unit)
320
+ elif cents:
321
+ cent_unit = "cent" if cents == 1 else "cents"
322
+ return "%s %s" % (cents, cent_unit)
323
+ else:
324
+ return "zero dollars"
325
+
326
+
327
+ def _remove_commas(m):
328
+ return m.group(1).replace(",", "")
329
+
330
+
331
def _expand_ordinal(m):
    """Spell out the whole matched ordinal using the ``inflect`` engine."""
    ordinal_text = m.group(0)
    return _inflect.number_to_words(ordinal_text)
333
+
334
+
335
def _expand_number(m):
    """Spell out a matched integer; values in (1000, 3000) are read like years.

    Inside that range: 2000 is "two thousand", 2001-2009 are "two thousand
    <n>", multiples of 100 are "<n> hundred", and everything else is read in
    two-digit groups with "oh" for zero. Other values use the plain inflect
    wording without "and".
    """
    value = int(m.group(0))
    if not (1000 < value < 3000):
        return _inflect.number_to_words(value, andword="")
    if value == 2000:
        return "two thousand"
    if 2000 < value < 2010:
        return "two thousand " + _inflect.number_to_words(value % 100)
    if value % 100 == 0:
        return _inflect.number_to_words(value // 100) + " hundred"
    grouped = _inflect.number_to_words(value, andword="", zero="oh", group=2)
    return grouped.replace(", ", " ")
350
+
351
+
352
+ def _expand_decimal_point(m):
353
+ return m.group(1).replace(".", " point ")
354
+
355
+
356
def normalize_numbers(text):
    """Rewrite numeric expressions in ``text`` as words.

    The substitutions run in a fixed order: thousands separators, pounds,
    dollars, decimal points, ordinals, and finally plain integers.
    """
    passes = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r"\1 pounds"),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in passes:
        text = pattern.sub(replacement, text)
    return text
364
+
365
+
366
def text_normalize(text):
    """Spell out numbers, unify punctuation and space it from the next word."""
    spelled_out = normalize_numbers(text)
    unified = replace_punctuation(spelled_out)
    # Insert a space when punctuation is directly followed by a word character.
    return re.sub(r"([,;.\?\!])([\w])", r"\1 \2", unified)
371
+
372
+
373
def distribute_phone(n_phone, n_word):
    """Spread ``n_phone`` phones as evenly as possible over ``n_word`` slots.

    Every slot receives ``n_phone // n_word`` phones and the leftover phones
    go to the leading slots, one each. This is what repeatedly giving the
    next phone to the first least-loaded slot produces, computed directly
    instead of rescanning the slot list once per phone.

    Args:
        n_phone: Number of phones to hand out.
        n_word: Number of slots to fill.

    Returns:
        List of ``n_word`` counts that sum to ``n_phone``.

    Raises:
        ValueError: If there are phones to hand out but no slot to put them in.
    """
    if n_phone <= 0:
        return [0] * n_word
    if n_word <= 0:
        raise ValueError(
            "cannot distribute %d phones over %d slots" % (n_phone, n_word)
        )
    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
380
+
381
+
382
def sep_text(text):
    """Split ``text`` into words and punctuation marks, dropping whitespace.

    The separators are kept as their own items. Note that ``+`` sits inside
    the character class, so a literal plus sign is a separator as well.
    """
    pieces = re.split(r"([,;.\?\!\s+])", text)
    return [piece for piece in pieces if piece.strip()]
386
+
387
+
388
def g2p(text, tokenizer, **kwargs):
    """Convert English ``text`` into phones, tones and a token-to-phone map.

    Words found in ``eng_dict`` use the dictionary pronunciation; all other
    words are sent through the ``g2p_en`` model. Each word's phones are then
    spread over the sub-tokens ``tokenizer.tokenize`` produces for that word.

    Args:
        text: Normalized input text.
        tokenizer: Object with a ``tokenize(word)`` method.
        **kwargs: Accepted and ignored.

    Returns:
        ``(phones, tones, word2ph)``; ``phones`` and ``tones`` are padded with
        ``"_"`` / ``0`` on both ends and ``word2ph`` with ``1``.
    """
    words = sep_text(text)
    token_groups = [tokenizer.tokenize(word) for word in words]

    phone_groups = []
    tone_groups = []
    for word in words:
        key = word.upper()
        if key in eng_dict:
            word_phones, word_tones = refine_syllables(eng_dict[key])
        else:
            word_phones, word_tones = [], []
            for raw in _g2p(word):
                if raw == " ":
                    continue
                if raw in arpa:
                    raw, tone = refine_ph(raw)
                else:
                    # Not an ARPAbet phone (e.g. punctuation): keep as is.
                    tone = 0
                word_phones.append(raw)
                word_tones.append(tone)
        phone_groups.append([post_replace_ph(phone) for phone in word_phones])
        tone_groups.append(word_tones)

    word2ph = [1]
    for sub_tokens, group in zip(token_groups, phone_groups):
        word2ph.extend(distribute_phone(len(group), len(sub_tokens)))
    word2ph.append(1)

    phones = ["_"] + [phone for group in phone_groups for phone in group] + ["_"]
    tones = [0] + [tone for group in tone_groups for tone in group] + [0]
    assert len(phones) == len(tones), text
    assert len(phones) == sum(word2ph), text

    return phones, tones, word2ph
432
+
433
+
434
def get_bert_feature(text, word2ph, *args, **kwargs):
    """Delegate to ``english_bert_mock.get_bert_feature``.

    The delegate requires ``tokenizer`` and ``model`` in addition to ``text``
    and ``word2ph``, so a call that passes only the first two can never
    succeed. Any extra positional or keyword arguments are therefore
    forwarded unchanged.

    Args:
        text: Input text.
        word2ph: Number of phones per token.
        *args: Extra positional arguments for the delegate.
        **kwargs: Extra keyword arguments for the delegate.

    Returns:
        Whatever the delegate returns.
    """
    from bert_vits2.text import english_bert_mock

    return english_bert_mock.get_bert_feature(text, word2ph, *args, **kwargs)
438
+
439
+
440
if __name__ == "__main__":
    # print(get_dict())
    # print(eng_word_to_phoneme("hello"))
    # NOTE(review): g2p() in this module takes a required ``tokenizer``
    # argument, so this call raises TypeError as written -- pass a tokenizer.
    print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
444
+ # all_phones = set()
445
+ # for k, syllables in eng_dict.items():
446
+ # for group in syllables:
447
+ # for ph in group:
448
+ # all_phones.add(ph)
449
+ # print(all_phones)
bert_vits2/text/english_bert_mock.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from contants import config
4
+
5
+
6
def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, style_text=None, style_weight=0.7,
                     **kwargs):
    """Build phone-level features from a hidden layer of ``model``.

    Args:
        text: Text to encode.
        word2ph: Repeat count for each row of the encoded text; its length
            must equal the number of rows.
        tokenizer: Callable returning model inputs for a string.
        model: Model called with ``output_hidden_states=True``.
        device: Device the inputs are moved to.
        style_text: Optional text whose mean feature is blended in.
        style_weight: Weight of the style feature in the blend.
        **kwargs: Accepted and ignored.

    Returns:
        Transposed concatenation of the repeated (and optionally blended)
        per-row features.
    """

    def _hidden_states(sentence):
        # Tokenize, move every input tensor to ``device`` and return the
        # third-from-last hidden layer of the first batch item on the CPU.
        encoded = tokenizer(sentence, return_tensors="pt")
        for key in encoded:
            encoded[key] = encoded[key].to(device)
        outputs = model(**encoded, output_hidden_states=True)
        return torch.cat(outputs["hidden_states"][-3:-2], -1)[0].float().cpu()

    with torch.no_grad():
        res = _hidden_states(text)
        style_mean = _hidden_states(style_text).mean(0) if style_text else None
    assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))

    per_token = []
    for index, repeat_count in enumerate(word2ph):
        feature = res[index].repeat(repeat_count, 1)
        if style_text:
            feature = (
                feature * (1 - style_weight)
                + style_mean.repeat(repeat_count, 1) * style_weight
            )
        per_token.append(feature)

    return torch.cat(per_token, dim=0).T
bert_vits2/text/english_bert_mock_v200.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from contants import config
4
+
5
+
6
def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, **kwargs):
    """Build phone-level features from a hidden layer of ``model``.

    ``text`` is tokenized, the inputs are moved to ``device`` and the
    third-from-last hidden layer of the first batch item is taken. Row ``i``
    of that layer is repeated ``word2ph[i]`` times; the repeated rows are
    concatenated and returned transposed.
    """
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt")
        for key in encoded:
            encoded[key] = encoded[key].to(device)
        outputs = model(**encoded, output_hidden_states=True)
        res = torch.cat(outputs["hidden_states"][-3:-2], -1)[0].float().cpu()
    # assert len(word2ph) == len(text)+2
    per_token = [
        res[index].repeat(repeat_count, 1)
        for index, repeat_count in enumerate(word2ph)
    ]
    return torch.cat(per_token, dim=0).T
bert_vits2/text/english_v200.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import os
3
+ from g2p_en import G2p
4
+
5
+ from bert_vits2.text import symbols
6
+
7
+ current_file_path = os.path.dirname(__file__)
8
+ CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
9
+ CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
10
+ _g2p = G2p()
11
+
12
+ arpa = {
13
+ "AH0",
14
+ "S",
15
+ "AH1",
16
+ "EY2",
17
+ "AE2",
18
+ "EH0",
19
+ "OW2",
20
+ "UH0",
21
+ "NG",
22
+ "B",
23
+ "G",
24
+ "AY0",
25
+ "M",
26
+ "AA0",
27
+ "F",
28
+ "AO0",
29
+ "ER2",
30
+ "UH1",
31
+ "IY1",
32
+ "AH2",
33
+ "DH",
34
+ "IY0",
35
+ "EY1",
36
+ "IH0",
37
+ "K",
38
+ "N",
39
+ "W",
40
+ "IY2",
41
+ "T",
42
+ "AA1",
43
+ "ER1",
44
+ "EH2",
45
+ "OY0",
46
+ "UH2",
47
+ "UW1",
48
+ "Z",
49
+ "AW2",
50
+ "AW1",
51
+ "V",
52
+ "UW2",
53
+ "AA2",
54
+ "ER",
55
+ "AW0",
56
+ "UW0",
57
+ "R",
58
+ "OW1",
59
+ "EH1",
60
+ "ZH",
61
+ "AE0",
62
+ "IH2",
63
+ "IH",
64
+ "Y",
65
+ "JH",
66
+ "P",
67
+ "AY1",
68
+ "EY0",
69
+ "OY2",
70
+ "TH",
71
+ "HH",
72
+ "D",
73
+ "ER0",
74
+ "CH",
75
+ "AO1",
76
+ "AE1",
77
+ "AO2",
78
+ "OY1",
79
+ "AY2",
80
+ "IH1",
81
+ "OW0",
82
+ "L",
83
+ "SH",
84
+ }
85
+
86
+
87
def post_replace_ph(ph):
    """Map a single phone or punctuation mark onto the symbol inventory.

    A few punctuation variants (and lowercase ``v``) are rewritten first;
    whatever is still not a member of ``symbols`` becomes ``"UNK"``.
    """
    canonical = {
        ":": ",",
        ";": ",",
        ",": ",",
        "。": ".",
        "!": "!",
        "?": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "...": "…",
        "v": "V",
    }
    candidate = canonical.get(ph, ph)
    return candidate if candidate in symbols else "UNK"
108
+
109
+
110
def read_dict():
    """Parse the pronouncing dictionary stored at ``CMU_DICT_PATH``.

    Lines before line 49 of the file are skipped. Every remaining line is
    read as a word followed by its pronunciation, where syllables are
    separated by ``" - "`` and phones inside a syllable by single spaces.

    Returns:
        dict mapping each word to a list of syllables; each syllable is a
        list of phone strings.
    """
    first_entry_line = 49
    g2p_dict = {}
    with open(CMU_DICT_PATH) as f:
        for line_number, raw_line in enumerate(f, start=1):
            if line_number < first_entry_line:
                continue
            # Split only at the first run of whitespace so the pronunciation
            # keeps its " - " syllable separators for the next step.
            fields = raw_line.strip().split(maxsplit=1)
            if len(fields) < 2:
                # Blank line or a word without pronunciation: nothing to add.
                continue
            word, pronunciation = fields
            g2p_dict[word] = [
                syllable.split(" ") for syllable in pronunciation.split(" - ")
            ]
    return g2p_dict
132
+
133
+
134
def cache_dict(g2p_dict, file_path):
    """Pickle ``g2p_dict`` into ``file_path``, replacing any existing file."""
    with open(file_path, mode="wb") as sink:
        pickle.dump(g2p_dict, sink)
137
+
138
+
139
def get_dict():
    """Return the pronunciation dictionary, using the pickle cache if present.

    When ``CACHE_PATH`` does not exist the dictionary is parsed with
    ``read_dict`` and written to the cache before being returned.
    """
    if not os.path.exists(CACHE_PATH):
        parsed = read_dict()
        cache_dict(parsed, CACHE_PATH)
        return parsed

    # NOTE(review): unpickling runs code embedded in the file, so this is only
    # safe while the cache file comes from a trusted source.
    with open(CACHE_PATH, "rb") as cached:
        return pickle.load(cached)
148
+
149
+
150
+ eng_dict = get_dict()
151
+
152
+
153
def refine_ph(phn):
    """Split a phone into its lowercase name and a tone number.

    A trailing digit is removed and returned as ``digit + 1``; a phone
    without a trailing digit gets tone ``0``.
    """
    if re.search(r"\d$", phn) is None:
        return phn.lower(), 0
    return phn[:-1].lower(), int(phn[-1]) + 1
159
+
160
+
161
def refine_syllables(syllables):
    """Flatten ``syllables`` into parallel phone and tone lists.

    Every phone of every syllable is passed through ``refine_ph``; the
    results are returned as ``(phonemes, tones)`` in input order.
    """
    refined = [refine_ph(phone) for syllable in syllables for phone in syllable]
    phonemes = [name for name, _ in refined]
    tones = [tone for _, tone in refined]
    return phonemes, tones
171
+
172
+
173
+ import re
174
+ import inflect
175
+
176
+ _inflect = inflect.engine()
177
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
178
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
179
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
180
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
181
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
182
+ _number_re = re.compile(r"[0-9]+")
183
+
184
+ # List of (regular expression, replacement) pairs for abbreviations:
185
+ _abbreviations = [
186
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
187
+ for x in [
188
+ ("mrs", "misess"),
189
+ ("mr", "mister"),
190
+ ("dr", "doctor"),
191
+ ("st", "saint"),
192
+ ("co", "company"),
193
+ ("jr", "junior"),
194
+ ("maj", "major"),
195
+ ("gen", "general"),
196
+ ("drs", "doctors"),
197
+ ("rev", "reverend"),
198
+ ("lt", "lieutenant"),
199
+ ("hon", "honorable"),
200
+ ("sgt", "sergeant"),
201
+ ("capt", "captain"),
202
+ ("esq", "esquire"),
203
+ ("ltd", "limited"),
204
+ ("col", "colonel"),
205
+ ("ft", "fort"),
206
+ ]
207
+ ]
208
+
209
+ # List of (ipa, lazy ipa) pairs:
210
+ _lazy_ipa = [
211
+ (re.compile("%s" % x[0]), x[1])
212
+ for x in [
213
+ ("r", "ɹ"),
214
+ ("æ", "e"),
215
+ ("ɑ", "a"),
216
+ ("ɔ", "o"),
217
+ ("ð", "z"),
218
+ ("θ", "s"),
219
+ ("ɛ", "e"),
220
+ ("ɪ", "i"),
221
+ ("ʊ", "u"),
222
+ ("ʒ", "ʥ"),
223
+ ("ʤ", "ʥ"),
224
+ ("ˈ", "↓"),
225
+ ]
226
+ ]
227
+
228
+ # List of (ipa, lazy ipa2) pairs:
229
+ _lazy_ipa2 = [
230
+ (re.compile("%s" % x[0]), x[1])
231
+ for x in [
232
+ ("r", "ɹ"),
233
+ ("ð", "z"),
234
+ ("θ", "s"),
235
+ ("ʒ", "ʑ"),
236
+ ("ʤ", "dʑ"),
237
+ ("ˈ", "↓"),
238
+ ]
239
+ ]
240
+
241
+ # List of (ipa, ipa2) pairs
242
+ _ipa_to_ipa2 = [
243
+ (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
244
+ ]
245
+
246
+
247
+ def _expand_dollars(m):
248
+ match = m.group(1)
249
+ parts = match.split(".")
250
+ if len(parts) > 2:
251
+ return match + " dollars" # Unexpected format
252
+ dollars = int(parts[0]) if parts[0] else 0
253
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
254
+ if dollars and cents:
255
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
256
+ cent_unit = "cent" if cents == 1 else "cents"
257
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
258
+ elif dollars:
259
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
260
+ return "%s %s" % (dollars, dollar_unit)
261
+ elif cents:
262
+ cent_unit = "cent" if cents == 1 else "cents"
263
+ return "%s %s" % (cents, cent_unit)
264
+ else:
265
+ return "zero dollars"
266
+
267
+
268
+ def _remove_commas(m):
269
+ return m.group(1).replace(",", "")
270
+
271
+
272
def _expand_ordinal(m):
    """Spell out the whole matched ordinal using the ``inflect`` engine."""
    ordinal_text = m.group(0)
    return _inflect.number_to_words(ordinal_text)
274
+
275
+
276
def _expand_number(m):
    """Spell out a matched integer; values in (1000, 3000) are read like years.

    Inside that range: 2000 is "two thousand", 2001-2009 are "two thousand
    <n>", multiples of 100 are "<n> hundred", and everything else is read in
    two-digit groups with "oh" for zero. Other values use the plain inflect
    wording without "and".
    """
    value = int(m.group(0))
    if not (1000 < value < 3000):
        return _inflect.number_to_words(value, andword="")
    if value == 2000:
        return "two thousand"
    if 2000 < value < 2010:
        return "two thousand " + _inflect.number_to_words(value % 100)
    if value % 100 == 0:
        return _inflect.number_to_words(value // 100) + " hundred"
    grouped = _inflect.number_to_words(value, andword="", zero="oh", group=2)
    return grouped.replace(", ", " ")
291
+
292
+
293
+ def _expand_decimal_point(m):
294
+ return m.group(1).replace(".", " point ")
295
+
296
+
297
def normalize_numbers(text):
    """Rewrite numeric expressions in ``text`` as words.

    The substitutions run in a fixed order: thousands separators, pounds,
    dollars, decimal points, ordinals, and finally plain integers.
    """
    passes = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r"\1 pounds"),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in passes:
        text = pattern.sub(replacement, text)
    return text
305
+
306
+
307
def text_normalize(text):
    """Return ``text`` with its numeric expressions spelled out as words."""
    return normalize_numbers(text)
310
+
311
+
312
def g2p(text, **kwargs):
    """Convert English ``text`` into phones, tones and a word-to-phone map.

    The text is split on punctuation and whitespace. Words found in
    ``eng_dict`` use the dictionary pronunciation; all other words are sent
    through the ``g2p_en`` model.

    Args:
        text: Normalized input text.
        **kwargs: Accepted and ignored.

    Returns:
        ``(phones, tones, word2ph)``; ``phones`` and ``tones`` are padded with
        ``"_"`` / ``0`` on both ends and ``word2ph`` with ``1``.
    """
    phones = ["_"]
    tones = [0]
    word2ph = [1]

    for word in re.split(r"([,;.\-\?\!\s+])", text):
        if not word.strip():
            continue
        key = word.upper()
        if key in eng_dict:
            word_phones, word_tones = refine_syllables(eng_dict[key])
        else:
            word_phones, word_tones = [], []
            for raw in _g2p(word):
                if raw == " ":
                    continue
                if raw in arpa:
                    raw, tone = refine_ph(raw)
                else:
                    # Not an ARPAbet phone (e.g. punctuation): keep as is.
                    tone = 0
                word_phones.append(raw)
                word_tones.append(tone)
        phones.extend(post_replace_ph(phone) for phone in word_phones)
        tones.extend(word_tones)
        word2ph.append(len(word_phones))

    phones.append("_")
    tones.append(0)
    word2ph.append(1)

    return phones, tones, word2ph
343
+
344
+
345
def get_bert_feature(text, word2ph, *args, **kwargs):
    """Delegate to ``english_bert_mock.get_bert_feature``.

    The delegate requires ``tokenizer`` and ``model`` in addition to ``text``
    and ``word2ph``, so a call that passes only the first two can never
    succeed. Any extra positional or keyword arguments are therefore
    forwarded unchanged.

    Args:
        text: Input text.
        word2ph: Number of phones per token.
        *args: Extra positional arguments for the delegate.
        **kwargs: Extra keyword arguments for the delegate.

    Returns:
        Whatever the delegate returns.
    """
    from bert_vits2.text import english_bert_mock

    return english_bert_mock.get_bert_feature(text, word2ph, *args, **kwargs)
349
+
350
+
351
if __name__ == "__main__":
    # Manual smoke test: print the phones, tones and word2ph of one sentence.
    # print(get_dict())
    # print(eng_word_to_phoneme("hello"))
    sample_output = g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")
    print(sample_output)
355
+ # all_phones = set()
356
+ # for k, syllables in eng_dict.items():
357
+ # for group in syllables:
358
+ # for ph in group:
359
+ # all_phones.add(ph)
360
+ # print(all_phones)
bert_vits2/text/english_v230.py ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import os
3
+ from g2p_en import G2p
4
+ from transformers import DebertaV2Tokenizer
5
+
6
+ from bert_vits2.text import symbols
7
+ from bert_vits2.text.symbols import punctuation
8
+
9
+ current_file_path = os.path.dirname(__file__)
10
+ CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
11
+ CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
12
+ _g2p = G2p()
13
+ LOCAL_PATH = "./bert/deberta-v3-large"
14
+ # tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
15
+
16
+ arpa = {
17
+ "AH0",
18
+ "S",
19
+ "AH1",
20
+ "EY2",
21
+ "AE2",
22
+ "EH0",
23
+ "OW2",
24
+ "UH0",
25
+ "NG",
26
+ "B",
27
+ "G",
28
+ "AY0",
29
+ "M",
30
+ "AA0",
31
+ "F",
32
+ "AO0",
33
+ "ER2",
34
+ "UH1",
35
+ "IY1",
36
+ "AH2",
37
+ "DH",
38
+ "IY0",
39
+ "EY1",
40
+ "IH0",
41
+ "K",
42
+ "N",
43
+ "W",
44
+ "IY2",
45
+ "T",
46
+ "AA1",
47
+ "ER1",
48
+ "EH2",
49
+ "OY0",
50
+ "UH2",
51
+ "UW1",
52
+ "Z",
53
+ "AW2",
54
+ "AW1",
55
+ "V",
56
+ "UW2",
57
+ "AA2",
58
+ "ER",
59
+ "AW0",
60
+ "UW0",
61
+ "R",
62
+ "OW1",
63
+ "EH1",
64
+ "ZH",
65
+ "AE0",
66
+ "IH2",
67
+ "IH",
68
+ "Y",
69
+ "JH",
70
+ "P",
71
+ "AY1",
72
+ "EY0",
73
+ "OY2",
74
+ "TH",
75
+ "HH",
76
+ "D",
77
+ "ER0",
78
+ "CH",
79
+ "AO1",
80
+ "AE1",
81
+ "AO2",
82
+ "OY1",
83
+ "AY2",
84
+ "IH1",
85
+ "OW0",
86
+ "L",
87
+ "SH",
88
+ }
89
+
90
+
91
def post_replace_ph(ph):
    """Map a single phone or punctuation mark onto the symbol inventory.

    A few punctuation variants (and lowercase ``v``) are rewritten first;
    whatever is still not a member of ``symbols`` becomes ``"UNK"``.
    """
    canonical = {
        ":": ",",
        ";": ",",
        ",": ",",
        "。": ".",
        "!": "!",
        "?": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "…": "...",
        "···": "...",
        "・・・": "...",
        "v": "V",
    }
    candidate = canonical.get(ph, ph)
    return candidate if candidate in symbols else "UNK"
114
+
115
+
116
+ rep_map = {
117
+ ":": ",",
118
+ ";": ",",
119
+ ",": ",",
120
+ "。": ".",
121
+ "!": "!",
122
+ "?": "?",
123
+ "\n": ".",
124
+ ".": ".",
125
+ "…": "...",
126
+ "···": "...",
127
+ "・・・": "...",
128
+ "·": ",",
129
+ "・": ",",
130
+ "、": ",",
131
+ "$": ".",
132
+ "“": "'",
133
+ "”": "'",
134
+ '"': "'",
135
+ "‘": "'",
136
+ "’": "'",
137
+ "(": "'",
138
+ ")": "'",
139
+ "(": "'",
140
+ ")": "'",
141
+ "《": "'",
142
+ "》": "'",
143
+ "【": "'",
144
+ "】": "'",
145
+ "[": "'",
146
+ "]": "'",
147
+ "—": "-",
148
+ "−": "-",
149
+ "~": "-",
150
+ "~": "-",
151
+ "「": "'",
152
+ "」": "'",
153
+ }
154
+
155
+
156
def replace_punctuation(text):
    """Return ``text`` with every ``rep_map`` key replaced by its value.

    The keys are escaped and joined into one alternation, so earlier keys in
    ``rep_map`` win when several could match at the same position.
    """
    alternation = "|".join(map(re.escape, rep_map))
    return re.sub(alternation, lambda hit: rep_map[hit.group()], text)
170
+
171
+
172
def read_dict():
    """Parse the pronouncing dictionary stored at ``CMU_DICT_PATH``.

    Lines before line 49 of the file are skipped. Every remaining line is
    read as a word followed by its pronunciation, where syllables are
    separated by ``" - "`` and phones inside a syllable by single spaces.

    Returns:
        dict mapping each word to a list of syllables; each syllable is a
        list of phone strings.
    """
    first_entry_line = 49
    g2p_dict = {}
    with open(CMU_DICT_PATH) as f:
        for line_number, raw_line in enumerate(f, start=1):
            if line_number < first_entry_line:
                continue
            # Split only at the first run of whitespace so the pronunciation
            # keeps its " - " syllable separators for the next step.
            fields = raw_line.strip().split(maxsplit=1)
            if len(fields) < 2:
                # Blank line or a word without pronunciation: nothing to add.
                continue
            word, pronunciation = fields
            g2p_dict[word] = [
                syllable.split(" ") for syllable in pronunciation.split(" - ")
            ]
    return g2p_dict
194
+
195
+
196
def cache_dict(g2p_dict, file_path):
    """Pickle ``g2p_dict`` into ``file_path``, replacing any existing file."""
    with open(file_path, mode="wb") as sink:
        pickle.dump(g2p_dict, sink)
199
+
200
+
201
def get_dict():
    """Return the pronunciation dictionary, using the pickle cache if present.

    When ``CACHE_PATH`` does not exist the dictionary is parsed with
    ``read_dict`` and written to the cache before being returned.
    """
    if not os.path.exists(CACHE_PATH):
        parsed = read_dict()
        cache_dict(parsed, CACHE_PATH)
        return parsed

    # NOTE(review): unpickling runs code embedded in the file, so this is only
    # safe while the cache file comes from a trusted source.
    with open(CACHE_PATH, "rb") as cached:
        return pickle.load(cached)
210
+
211
+
212
+ eng_dict = get_dict()
213
+
214
+
215
def refine_ph(phn):
    """Split a phone into its lowercase name and a tone number.

    A trailing digit is removed and returned as ``digit + 1``; a phone
    without a trailing digit gets tone ``3``.
    """
    if re.search(r"\d$", phn) is None:
        return phn.lower(), 3
    return phn[:-1].lower(), int(phn[-1]) + 1
223
+
224
+
225
def refine_syllables(syllables):
    """Flatten ``syllables`` into parallel phone and tone lists.

    Every phone of every syllable is passed through ``refine_ph``; the
    results are returned as ``(phonemes, tones)`` in input order.
    """
    refined = [refine_ph(phone) for syllable in syllables for phone in syllable]
    phonemes = [name for name, _ in refined]
    tones = [tone for _, tone in refined]
    return phonemes, tones
235
+
236
+
237
+ import re
238
+ import inflect
239
+
240
+ _inflect = inflect.engine()
241
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
242
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
243
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
244
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
245
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
246
+ _number_re = re.compile(r"[0-9]+")
247
+
248
+ # List of (regular expression, replacement) pairs for abbreviations:
249
+ _abbreviations = [
250
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
251
+ for x in [
252
+ ("mrs", "misess"),
253
+ ("mr", "mister"),
254
+ ("dr", "doctor"),
255
+ ("st", "saint"),
256
+ ("co", "company"),
257
+ ("jr", "junior"),
258
+ ("maj", "major"),
259
+ ("gen", "general"),
260
+ ("drs", "doctors"),
261
+ ("rev", "reverend"),
262
+ ("lt", "lieutenant"),
263
+ ("hon", "honorable"),
264
+ ("sgt", "sergeant"),
265
+ ("capt", "captain"),
266
+ ("esq", "esquire"),
267
+ ("ltd", "limited"),
268
+ ("col", "colonel"),
269
+ ("ft", "fort"),
270
+ ]
271
+ ]
272
+
273
# List of (ipa, lazy ipa) pairs:
# each entry is (compiled pattern for the IPA symbol, replacement string).
_lazy_ipa = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        ("r", "ɹ"),
        ("æ", "e"),
        ("ɑ", "a"),
        ("ɔ", "o"),
        ("ð", "z"),
        ("θ", "s"),
        ("ɛ", "e"),
        ("ɪ", "i"),
        ("ʊ", "u"),
        ("ʒ", "ʥ"),
        ("ʤ", "ʥ"),
        ("ˈ", "↓"),
    ]
]

# List of (ipa, lazy ipa2) pairs:
# same structure as _lazy_ipa with a smaller substitution set.
_lazy_ipa2 = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        ("r", "ɹ"),
        ("ð", "z"),
        ("θ", "s"),
        ("ʒ", "ʑ"),
        ("ʤ", "dʑ"),
        ("ˈ", "↓"),
    ]
]

# List of (ipa, ipa2) pairs
# (compiled pattern, replacement) for three symbols.
_ipa_to_ipa2 = [
    (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
]
309
+
310
+
311
def _expand_dollars(m):
    """Spell out a dollar amount captured by ``_dollars_re``.

    Args:
        m: regex match whose group 1 is the amount without the "$" sign.

    Returns:
        Text such as "1 dollar, 50 cents", "2 dollars", "1 cent" or
        "zero dollars". An amount with more than one decimal point is
        returned unchanged with " dollars" appended.
    """
    match = m.group(1)
    # The capture group may still contain commas (e.g. "$,5"); they carry no
    # value and would make int() below raise ValueError, so drop them first.
    parts = match.replace(",", "").split(".")
    if len(parts) > 2:
        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    dollar_unit = "dollar" if dollars == 1 else "dollars"
    cent_unit = "cent" if cents == 1 else "cents"
    if dollars and cents:
        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
    if dollars:
        return "%s %s" % (dollars, dollar_unit)
    if cents:
        return "%s %s" % (cents, cent_unit)
    return "zero dollars"
330
+
331
+
332
def _remove_commas(m):
    """Return group 1 of match *m* with every comma removed."""
    digits = m.group(1)
    return "".join(digits.split(","))
334
+
335
+
336
def _expand_ordinal(m):
    """Return the whole match *m* converted by ``_inflect.number_to_words``."""
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
338
+
339
+
340
def _expand_number(m):
    """Spell out the integer matched by *m*.

    Numbers strictly between 1000 and 3000 get special handling: 2000 is
    "two thousand", 2001-2009 are "two thousand <n>", exact hundreds are
    "<n> hundred", and everything else is read in two-digit groups with
    "oh" for zero. All other numbers are spelled out without "and".
    """
    num = int(m.group(0))
    if not 1000 < num < 3000:
        return _inflect.number_to_words(num, andword="")
    if num == 2000:
        return "two thousand"
    if 2000 < num < 2010:
        return "two thousand " + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + " hundred"
    grouped = _inflect.number_to_words(num, andword="", zero="oh", group=2)
    return grouped.replace(", ", " ")
355
+
356
+
357
def _expand_decimal_point(m):
    """Return group 1 of match *m* with each "." replaced by " point "."""
    number = m.group(1)
    return " point ".join(number.split("."))
359
+
360
+
361
def normalize_numbers(text):
    """Rewrite the numeric expressions in *text* as words.

    The substitutions run in a fixed order: commas inside numbers are
    removed, then pounds, dollars, decimals and ordinals are expanded, and
    finally any remaining digit run is spelled out.
    """
    substitutions = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r"\1 pounds"),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
369
+
370
+
371
def text_normalize(text):
    """Normalize English *text* for phonemization.

    Numbers are expanded with ``normalize_numbers``, punctuation is mapped
    with ``replace_punctuation``, and a space is inserted wherever one of
    ``, ; . ? !`` is immediately followed by a word character.
    """
    normalized = replace_punctuation(normalize_numbers(text))
    return re.sub(r"([,;.\?\!])([\w])", r"\1 \2", normalized)
376
+
377
+
378
def distribute_phone(n_phone, n_word):
    """Spread ``n_phone`` phonemes as evenly as possible over ``n_word`` slots.

    Every slot receives ``n_phone // n_word`` phonemes and the remainder is
    handed out one each to the leading slots, which is the result of
    repeatedly giving one phoneme to the first least-loaded slot.

    Args:
        n_phone: number of phonemes to distribute.
        n_word: number of slots (sub-word tokens / characters).

    Returns:
        A list of ``n_word`` counts summing to ``n_phone``.

    Raises:
        ValueError: if there are phonemes to distribute but no slot.
    """
    if n_phone <= 0:
        return [0] * n_word
    if n_word <= 0:
        raise ValueError(
            "cannot distribute %d phoneme(s) over %d word slot(s)" % (n_phone, n_word)
        )
    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
385
+
386
+
387
def sep_text(text):
    """Split *text* on punctuation and whitespace, keeping the punctuation.

    Pieces that are empty or whitespace-only are dropped.
    """
    # NOTE(review): inside the character class "\s+" means "whitespace or a
    # literal '+'", so "+" also acts as a separator - confirm this is intended.
    pieces = re.split(r"([,;.\?\!\s+])", text)
    return [piece for piece in pieces if piece.strip()]
391
+
392
+
393
def text_to_words(text, tokenizer):
    """Group the tokenizer's sub-word tokens of *text* into words.

    A token starting with "▁" opens a new word (the marker is stripped).
    A punctuation token becomes a word of its own when it is the last token
    or when the next token starts a word or is punctuation; otherwise it is
    glued to the current word, as is every other token.

    Returns:
        A list of words, each a list of token strings.
    """
    tokens = tokenizer.tokenize(text)
    last_index = len(tokens) - 1
    words = []
    for idx, tok in enumerate(tokens):
        if tok.startswith("▁"):
            words.append([tok[1:]])
            continue
        if tok in punctuation:
            if idx == last_index:
                words.append([f"{tok}"])
                continue
            following = tokens[idx + 1]
            if following.startswith("▁") or following in punctuation:
                words.append([f"{tok}"])
                continue
        # Continuation piece: attach to the current word, opening one first
        # when this is the very first token.
        if idx == 0:
            words.append([])
        words[-1].append(f"{tok}")
    return words
418
+
419
+
420
def g2p(text, tokenizer, **kwargs):
    """Convert normalized English *text* to phonemes, tones and word2ph.

    Words come from ``text_to_words``. Each piece is looked up in
    ``eng_dict`` (upper-cased); unknown pieces go through ``_g2p``.
    Punctuation is kept as a phoneme with tone 0.

    Returns:
        ``(phones, tones, word2ph)`` where ``phones`` and ``tones`` are padded
        with "_" / 0 at both ends and ``word2ph`` with 1 at both ends, so that
        ``len(phones) == len(tones) == sum(word2ph)``.
    """
    words = text_to_words(text, tokenizer)

    phones = []
    tones = []
    phone_len = []
    for word in words:
        # A multi-piece word containing an apostrophe piece is looked up as one
        # joined string. Only this local view changes: `words` keeps the
        # original pieces, which word2ph is computed from below.
        pieces = ["".join(word)] if len(word) > 1 and "'" in word else word
        word_phones = []
        word_tones = []
        for piece in pieces:
            if piece in punctuation:
                word_phones.append(piece)
                word_tones.append(0)
                continue
            key = piece.upper()
            if key in eng_dict:
                phns, tns = refine_syllables(eng_dict[key])
            else:
                phns, tns = [], []
                for ph in _g2p(piece):
                    if ph == " ":
                        continue
                    if ph in arpa:
                        ph, tn = refine_ph(ph)
                    else:
                        tn = 0
                    phns.append(ph)
                    tns.append(tn)
            word_phones += [post_replace_ph(i) for i in phns]
            word_tones += tns
        phones += word_phones
        tones += word_tones
        phone_len.append(len(word_phones))

    # Spread each word's phonemes over its sub-word tokens.
    word2ph = [1]
    for token, pl in zip(words, phone_len):
        word2ph += distribute_phone(pl, len(token))
    word2ph.append(1)

    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    assert len(phones) == len(tones), text
    assert len(phones) == sum(word2ph), text

    return phones, tones, word2ph
476
+
477
+
478
def get_bert_feature(text, word2ph):
    """Delegate to ``english_bert_mock.get_bert_feature(text, word2ph)``.

    The module is imported inside the function, so it is only loaded when
    this function is called.
    """
    from bert_vits2.text import english_bert_mock

    return english_bert_mock.get_bert_feature(text, word2ph)
482
+
483
+
484
if __name__ == "__main__":
    # Ad-hoc manual check; runs only when the module is executed directly.
    # print(get_dict())
    # print(eng_word_to_phoneme("hello"))
    # NOTE(review): g2p() in this module takes (text, tokenizer, **kwargs);
    # this call passes no tokenizer and would raise TypeError - supply one.
    print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
    # all_phones = set()
    # for k, syllables in eng_dict.items():
    #     for group in syllables:
    #         for ph in group:
    #             all_phones.add(ph)
    # print(all_phones)
bert_vits2/text/japanese.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Convert Japanese text to phonemes which is
2
+ # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
+ import re
4
+ import unicodedata
5
+
6
+ from bert_vits2.text import punctuation, symbols
7
+
8
+ from num2words import num2words
9
+
10
+ import pyopenjtalk
11
+ import jaconv
12
+
13
+
14
def kata2phoneme(text: str) -> list:
    """Convert katakana text to a list of phonemes.

    Leading long-vowel marks "ー" are returned as-is, one list element each.
    Leading characters matched by ``_MARKS`` are emitted one at a time. The
    remaining text is converted with ``pyopenjtalk.g2p``, lower-cased, with
    "cl" replaced by "q".

    Args:
        text: katakana (or symbol) string; surrounding whitespace is ignored.

    Returns:
        List of phoneme / symbol strings.
    """
    text = text.strip()
    if text == "ー":
        return ["ー"]
    elif text.startswith("ー"):
        return ["ー"] + kata2phoneme(text[1:])
    res = []
    while text:
        if re.match(_MARKS, text):
            # Emit only the leading mark. The loop advances one character at
            # a time, so appending the whole remainder would duplicate text.
            res.append(text[0])
            text = text[1:]
            continue
        if text.startswith("ー"):
            # No preceding phoneme is tracked here, so a long-vowel mark that
            # directly follows a symbol is dropped.
            text = text[1:]
            continue
        res += pyopenjtalk.g2p(text).lower().replace("cl", "q").split(" ")
        break
    return res
37
+
38
+
39
def hira2kata(text: str) -> str:
    """Return *text* converted by ``jaconv.hira2kata``."""
    katakana = jaconv.hira2kata(text)
    return katakana
41
+
42
+
43
# Symbol tokens that are passed through unchanged when they have no reading.
_SYMBOL_TOKENS = set("・、。?!")
# Bracket-like tokens that are dropped when they have no reading.
_NO_YOMI_TOKENS = set("「」『』―()[][]")
# Matches one character that is not a letter, digit, kana or kanji.
_MARKS = re.compile(
    r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
48
+
49
+
50
def text2kata(text: str) -> str:
    """Convert *text* to one katakana string using the OpenJTalk frontend.

    For each parsed unit, the surface form is normalized with
    ``replace_punctuation`` and the reading has "’" removed. Units whose
    reading is a symbol are replaced by their normalized surface, or by ","
    when that surface is not in ``rep_map``. Units without a reading are kept
    as-is, except "っ"/"ッ" (emitted as "ッ") and ``_NO_YOMI_TOKENS`` (dropped).

    Returns:
        The joined readings, with hiragana converted to katakana.
    """
    parsed = pyopenjtalk.run_frontend(text)

    res = []
    for parts in parsed:
        word = replace_punctuation(parts["string"])
        yomi = parts["pron"].replace("’", "")
        if yomi:
            if re.match(_MARKS, yomi):
                if len(word) > 1:
                    # Run of several symbols: normalize each one separately.
                    # (The former `sep += word` here used a variable that
                    # does not exist in this function and raised
                    # UnboundLocalError.)
                    res += [replace_punctuation(i) for i in word]
                    continue
                elif word not in rep_map.keys() and word not in rep_map.values():
                    word = ","
                yomi = word
            res.append(yomi)
        else:
            if word in _SYMBOL_TOKENS:
                res.append(word)
            elif word in ("っ", "ッ"):
                res.append("ッ")
            elif word in _NO_YOMI_TOKENS:
                pass
            else:
                res.append(word)
    return hira2kata("".join(res))
80
+
81
+
82
def text2sep_kata(text: str) -> (list, list):
    """Split *text* into words with katakana readings and accent data.

    Uses the same per-unit rules as ``text2kata`` but keeps the units
    separate.

    Returns:
        A 3-tuple ``(sep, kata, accents)``: the normalized surface forms, the
        readings with hiragana converted to katakana, and the result of
        ``get_accent`` on the parsed frontend output.
    """
    parsed = pyopenjtalk.run_frontend(text)

    readings = []
    surfaces = []
    for morph in parsed:
        surface = replace_punctuation(morph["string"])
        reading = morph["pron"].replace("’", "")
        if reading:
            if re.match(_MARKS, reading):
                if len(surface) > 1:
                    # Run of several symbols: one entry per character.
                    chars = [replace_punctuation(ch) for ch in surface]
                    readings += chars
                    surfaces += chars
                    continue
                if surface not in rep_map and surface not in rep_map.values():
                    surface = ","
                reading = surface
            readings.append(reading)
        elif surface in _SYMBOL_TOKENS:
            readings.append(surface)
        elif surface in ("っ", "ッ"):
            readings.append("ッ")
        elif surface not in _NO_YOMI_TOKENS:
            readings.append(surface)
        surfaces.append(surface)
    return surfaces, [hira2kata(i) for i in readings], get_accent(parsed)
114
+
115
+
116
def get_accent(parsed):
    """Return ``(phoneme, accent)`` pairs derived from OpenJTalk labels.

    Labels whose phoneme is "sil" or "pau" are skipped. For the others the
    accent is -1 (falling), 1 (rising) or 0, decided from the numeric fields
    of the current and the following label.
    """
    # NOTE(review): a1/a2 are presumably the accent fields of HTS-style
    # full-context labels - confirm against the pyopenjtalk label format.
    labels = pyopenjtalk.make_label(parsed)

    pairs = []
    for n, label in enumerate(labels):
        phoneme = re.search(r"\-([^\+]*)\+", label).group(1)
        if phoneme in ["sil", "pau"]:
            continue
        a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
        a2 = int(re.search(r"\+(\d+)\+", label).group(1))
        next_label = labels[n + 1]
        if re.search(r"\-([^\+]*)\+", next_label).group(1) in ["sil", "pau"]:
            a2_next = -1
        else:
            a2_next = int(re.search(r"\+(\d+)\+", next_label).group(1))
        if a1 == 0 and a2_next == a2 + 1:
            accent = -1  # Falling
        elif a2 == 1 and a2_next == 2:
            accent = 1  # Rising
        else:
            accent = 0
        pairs.append((phoneme.replace("cl", "q").lower(), accent))
    return pairs
142
+
143
+
144
# Katakana readings for ASCII symbols, lower-case Latin letters and
# lower-case Greek letters.
_ALPHASYMBOL_YOMI = {
    "#": "シャープ",
    "%": "パーセント",
    "&": "アンド",
    "+": "プラス",
    "-": "マイナス",
    ":": "コロン",
    ";": "セミコロン",
    "<": "小なり",
    "=": "イコール",
    ">": "大なり",
    "@": "アット",
    "a": "エー",
    "b": "ビー",
    "c": "シー",
    "d": "ディー",
    "e": "イー",
    "f": "エフ",
    "g": "ジー",
    "h": "エイチ",
    "i": "アイ",
    "j": "ジェー",
    "k": "ケー",
    "l": "エル",
    "m": "エム",
    "n": "エヌ",
    "o": "オー",
    "p": "ピー",
    "q": "キュー",
    "r": "アール",
    "s": "エス",
    "t": "ティー",  # was mojibake; restored to match "d": "ディー"
    "u": "ユー",
    "v": "ブイ",
    "w": "ダブリュー",
    "x": "エックス",
    "y": "ワイ",
    "z": "ゼット",
    "α": "アルファ",
    "β": "ベータ",
    "γ": "ガンマ",
    "δ": "デルタ",
    "ε": "イプシロン",
    "ζ": "ゼータ",
    "η": "イータ",
    "θ": "シータ",
    "ι": "イオタ",
    "κ": "カッパ",
    "λ": "ラムダ",
    "μ": "ミュー",
    "ν": "ニュー",
    "ξ": "クサイ",
    "ο": "オミクロン",
    "π": "パイ",
    "ρ": "ロー",
    "σ": "シグマ",
    "τ": "タウ",
    "υ": "ウプシロン",
    "φ": "ファイ",
    "χ": "カイ",
    "ψ": "プサイ",
    "ω": "オメガ",
}
207
+
208
# Numbers written with comma thousands separators, e.g. "1,234,567".
_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
# Reading appended after the amount for each supported currency sign.
_CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
# Currency sign (group 1) followed by an amount (group 2).
_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
# Integer or decimal number.
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
212
+
213
+
214
def japanese_convert_numbers_to_words(text: str) -> str:
    """Rewrite the numbers in *text* as Japanese words.

    Thousands separators are removed, a currency sign in front of an amount
    is moved behind it as its reading, and every remaining number is
    converted with ``num2words(..., lang="ja")``.
    """

    def _drop_separators(m):
        return m[0].replace(",", "")

    def _currency_to_suffix(m):
        return m[2] + _CURRENCY_MAP.get(m[1], m[1])

    def _spell_number(m):
        return num2words(m[0], lang="ja")

    res = _NUMBER_WITH_SEPARATOR_RX.sub(_drop_separators, text)
    res = _CURRENCY_RX.sub(_currency_to_suffix, res)
    return _NUMBER_RX.sub(_spell_number, res)
219
+
220
+
221
def japanese_convert_alpha_symbols_to_words(text: str) -> str:
    """Replace each character of lower-cased *text* by its katakana reading.

    Characters without an entry in ``_ALPHASYMBOL_YOMI`` are kept unchanged.
    """
    lowered = text.lower()
    return "".join(_ALPHASYMBOL_YOMI.get(ch, ch) for ch in lowered)
223
+
224
+
225
def japanese_text_to_phonemes(text: str) -> str:
    """Convert Japanese text to phonemes.

    Pipeline: NFKC normalization, number expansion, ``text2kata`` and
    finally ``kata2phoneme``, whose result is returned.
    """
    normalized = unicodedata.normalize("NFKC", text)
    with_number_words = japanese_convert_numbers_to_words(normalized)
    # res = japanese_convert_alpha_symbols_to_words(res)
    katakana = text2kata(with_number_words)
    return kata2phoneme(katakana)
233
+
234
+
235
def is_japanese_character(char):
    """Return True if *char* lies in one of the listed Japanese Unicode blocks."""
    # Code point of the single character being tested.
    code_point = ord(char)

    # Unicode ranges of the Japanese writing systems.
    japanese_ranges = (
        (0x3040, 0x309F),  # hiragana
        (0x30A0, 0x30FF),  # katakana
        (0x4E00, 0x9FFF),  # kanji (CJK Unified Ideographs)
        (0x3400, 0x4DBF),  # kanji extension A
        (0x20000, 0x2A6DF),  # kanji extension B
        # further kanji extension ranges can be added here if needed
    )
    return any(low <= code_point <= high for low, high in japanese_ranges)
255
+
256
+
257
# Maps punctuation variants onto the small ASCII set used downstream.
# The table previously listed "(", ")" and "~" twice and mapped "!", "?",
# "," and "." onto themselves, which indicates that the fullwidth forms were
# lost. They are restored as \uXXXX escapes so they cannot be silently
# width-normalized again; every existing ASCII key is kept.
rep_map = {
    ":": ",",
    "\uff1a": ",",  # fullwidth colon
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    ",": ",",
    "\uff0c": ",",  # fullwidth comma
    "。": ".",
    "!": "!",
    "\uff01": "!",  # fullwidth exclamation mark
    "?": "?",
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    ".": ".",
    "\uff0e": ".",  # fullwidth full stop
    "…": "...",
    "···": "...",
    "・・・": "...",
    "·": ",",
    "・": ",",
    "、": ",",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "\uff08": "'",  # fullwidth left parenthesis
    "\uff09": "'",  # fullwidth right parenthesis
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "−": "-",
    "~": "-",
    "\uff5e": "-",  # fullwidth tilde
    "「": "'",
    "」": "'",
}
295
+
296
+
297
def replace_punctuation(text):
    """Normalize punctuation in *text* and drop unsupported characters.

    Every key of ``rep_map`` found in *text* is replaced by its value. Then
    every character that is not hiragana, katakana, kanji, "々" or a member
    of ``punctuation`` is removed.
    """
    pattern = re.compile("|".join(re.escape(p) for p in rep_map))
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    # Escape the punctuation before placing it in the character class, so
    # that "-", "]", "^" or "\" in `punctuation` can never be read as class
    # syntax (for example as a range).
    allowed = (
        r"\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
        + re.escape("".join(punctuation))
    )
    return re.sub("[^" + allowed + "]+", "", replaced_text)
311
+
312
+
313
def text_normalize(text):
    """Normalize Japanese *text* before phonemization.

    Steps: NFKC normalization, number expansion, punctuation replacement and
    removal of the "゙" mark.
    """
    normalized = unicodedata.normalize("NFKC", text)
    with_number_words = japanese_convert_numbers_to_words(normalized)
    # res = "".join([i for i in res if is_japanese_character(i)])
    cleaned = replace_punctuation(with_number_words)
    return cleaned.replace("゙", "")
320
+
321
+
322
def distribute_phone(n_phone, n_word):
    """Spread ``n_phone`` phonemes as evenly as possible over ``n_word`` slots.

    Every slot receives ``n_phone // n_word`` phonemes and the remainder is
    handed out one each to the leading slots, which is the result of
    repeatedly giving one phoneme to the first least-loaded slot.

    Args:
        n_phone: number of phonemes to distribute.
        n_word: number of slots (tokens / characters of the word).

    Returns:
        A list of ``n_word`` counts summing to ``n_phone``.

    Raises:
        ValueError: if there are phonemes to distribute but no slot.
    """
    if n_phone <= 0:
        return [0] * n_word
    if n_word <= 0:
        raise ValueError(
            "cannot distribute %d phoneme(s) over %d word slot(s)" % (n_phone, n_word)
        )
    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
329
+
330
+
331
def handle_long(sep_phonemes):
    """Replace long-vowel marks "ー" by the sound they prolong, in place.

    A mark at the start of a word takes the last phoneme of the previous
    word; a mark inside a word takes the last character of the phoneme
    before it.

    Args:
        sep_phonemes: list of per-word phoneme lists; modified in place.

    Returns:
        The same ``sep_phonemes`` list.
    """
    for i, phonemes in enumerate(sep_phonemes):
        if not phonemes:
            # A word without phonemes has nothing to resolve; indexing it
            # would raise IndexError.
            continue
        if phonemes[0] == "ー":
            # For i == 0 the index -1 wraps to the last word, as before.
            previous = sep_phonemes[i - 1]
            if previous:
                phonemes[0] = previous[-1]
        if "ー" in phonemes:
            for j in range(len(phonemes)):
                if phonemes[j] == "ー":
                    phonemes[j] = phonemes[j - 1][-1]
    return sep_phonemes
340
+
341
+
342
def align_tones(phones, tones):
    """Assign a 0/1 tone to every phoneme of ``phones``.

    ``tones`` is walked in order: whenever the current phoneme equals the
    phoneme of the next unused pair, that pair's accent change is taken and
    accumulated within the word. Each word's values are then shifted right
    by one position, and raised by one when the word contains -1.

    Args:
        phones: list of per-word phoneme lists (may contain punctuation).
        tones: list of ``(phoneme, accent)`` pairs. It is only read; the
            caller's list is no longer consumed.

    Returns:
        Flat list of tones, one per phoneme in ``phones``.
    """
    res = []
    cursor = 0  # index of the next unused (phoneme, accent) pair
    n_tones = len(tones)
    for pho in phones:
        temp = [0] * len(pho)
        for idx, p in enumerate(pho):
            if cursor >= n_tones:
                break
            if p == tones[cursor][0]:
                temp[idx] = tones[cursor][1]
                if idx > 0:
                    temp[idx] += temp[idx - 1]
                cursor += 1
        # Shift right by one: the accent change applies after the phoneme.
        temp = ([0] + temp)[:-1]
        if -1 in temp:
            temp = [i + 1 for i in temp]
        res += temp
    assert all(0 <= i <= 1 for i in res), res
    return res
362
+
363
+
364
def rearrange_tones(tones, phones):
    """Re-encode a tone sequence as per-position codes 0-3.

    Codes: 0 for punctuation, 1 for "same as previous" (and the first
    non-punctuation phoneme), 2 for a rise, and 3 written on the position
    just before a fall.

    Args:
        tones: list of integer tones.
        phones: list of phonemes, parallel to ``tones``.

    Returns:
        List of codes with the same length as ``tones``.
    """
    res = [0] * len(tones)
    for i in range(len(tones)):
        if i == 0:
            # The punctuation test must look at the phoneme: tones are
            # integers and can never be members of `punctuation`.
            if phones[i] not in punctuation:
                res[i] = 1
        elif tones[i] == prev:
            if phones[i] in punctuation:
                res[i] = 0
            else:
                res[i] = 1
        elif tones[i] > prev:
            res[i] = 2
        elif tones[i] < prev:
            res[i - 1] = 3
            res[i] = 1
        prev = tones[i]
    return res
382
+
383
+
384
def g2p(norm_text, tokenizer, **kwargs):
    """Convert normalized Japanese text to phonemes, tones and word2ph.

    Returns:
        ``(phones, tones, word2ph)``; ``phones``/``tones`` are padded with
        "_"/0 at both ends and ``word2ph`` with 1 at both ends.
    """
    sep_text, sep_kata, acc = text2sep_kata(norm_text)
    sep_tokenized = [
        [word] if word in punctuation else tokenizer.tokenize(word)
        for word in sep_text
    ]

    sep_phonemes = handle_long([kata2phoneme(kata) for kata in sep_kata])
    # Sanity check: words MeCab does not know are passed all the way down to
    # here and then blow up. So far only extremely rare, obscure words seem
    # to trigger this.
    for word_phonemes in sep_phonemes:
        for phoneme in word_phonemes:
            assert phoneme in symbols, (sep_text, sep_kata, sep_phonemes)
    tones = align_tones(sep_phonemes, acc)

    # Spread each word's phonemes over its tokens.
    word2ph = [1]
    for token, phoneme in zip(sep_tokenized, sep_phonemes):
        word2ph += distribute_phone(len(phoneme), len(token))
    word2ph.append(1)

    phones = ["_"] + [p for word in sep_phonemes for p in word] + ["_"]
    # tones = [0] + rearrange_tones(tones, phones[1:-1]) + [0]
    tones = [0] + tones + [0]
    assert len(phones) == len(tones)
    return phones, tones, word2ph
413
+
414
+
415
if __name__ == "__main__":
    # Ad-hoc manual check; runs only when the module is executed directly.
    from manager import model_handler

    # NOTE(review): assumes get_bert_model returns (tokenizer, model) - confirm.
    tokenizer, model = model_handler.get_bert_model("DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM")
    text = "hello,こんにちは、世界ー!……"
    from bert_vits2.text.japanese_bert import get_bert_feature

    text = text_normalize(text)
    print(text)

    phones, tones, word2ph = g2p(text, tokenizer)
    # get_bert_feature requires the tokenizer and model as positional
    # arguments; calling it with only (text, word2ph) raises TypeError.
    bert = get_bert_feature(text, word2ph, tokenizer, model)

    print(phones, tones, word2ph, bert.shape)
bert_vits2/text/japanese_bert.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from contants import config
4
+ from bert_vits2.text.japanese import text2sep_kata
5
+
6
+ LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
7
+
8
+
9
def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, style_text=None, style_weight=0.7,
                     **kwargs):
    """Return phone-level BERT features for *text*.

    The text (and optional *style_text*) is first rebuilt from the word
    pieces returned by ``text2sep_kata``. Each token's feature is taken from
    the third-from-last hidden layer and repeated ``word2ph[i]`` times. When
    *style_text* is given, each feature is blended with the mean style
    feature using *style_weight*.

    Returns:
        The concatenated features, transposed.
    """

    def _hidden_states(sentence):
        # Third-from-last hidden layer of the first batch item, on the CPU.
        encoded = tokenizer(sentence, return_tensors="pt")
        for key in encoded:
            encoded[key] = encoded[key].to(device)
        output = model(**encoded, output_hidden_states=True)
        return torch.cat(output["hidden_states"][-3:-2], -1)[0].float().cpu()

    text = "".join(text2sep_kata(text)[0])
    if style_text:
        style_text = "".join(text2sep_kata(style_text)[0])

    with torch.no_grad():
        res = _hidden_states(text)
        if style_text:
            style_res_mean = _hidden_states(style_text).mean(0)

    # One entry per character plus the two boundary tokens.
    assert len(word2ph) == len(text) + 2
    phone_level_feature = []
    for i, n_phones in enumerate(word2ph):
        if style_text:
            repeat_feature = (
                res[i].repeat(n_phones, 1) * (1 - style_weight)
                + style_res_mean.repeat(n_phones, 1) * style_weight
            )
        else:
            repeat_feature = res[i].repeat(n_phones, 1)
        phone_level_feature.append(repeat_feature)

    return torch.cat(phone_level_feature, dim=0).T
bert_vits2/text/japanese_bert_extra.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from contants import config
4
+ from bert_vits2.text.japanese import text2sep_kata
5
+
6
+
7
def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, style_text=None, style_weight=0.7,
                     **kwargs):
    """Return phone-level BERT features for *text*.

    The text (and optional *style_text*) is first rebuilt from the word
    pieces returned by ``text2sep_kata``. Each token's feature is taken from
    the third-from-last hidden layer and repeated ``word2ph[i]`` times. When
    *style_text* is given, each feature is blended with the mean style
    feature using *style_weight*.

    Returns:
        The concatenated features, transposed.
    """

    def _hidden_states(sentence):
        # Third-from-last hidden layer of the first batch item, on the CPU.
        encoded = tokenizer(sentence, return_tensors="pt")
        for key in encoded:
            encoded[key] = encoded[key].to(device)
        output = model(**encoded, output_hidden_states=True)
        return torch.cat(output["hidden_states"][-3:-2], -1)[0].float().cpu()

    text = "".join(text2sep_kata(text)[0])
    if style_text:
        style_text = "".join(text2sep_kata(style_text)[0])

    with torch.no_grad():
        res = _hidden_states(text)
        if style_text:
            style_res_mean = _hidden_states(style_text).mean(0)

    # One entry per character plus the two boundary tokens.
    assert len(word2ph) == len(text) + 2
    phone_level_feature = []
    for i, n_phones in enumerate(word2ph):
        if style_text:
            repeat_feature = (
                res[i].repeat(n_phones, 1) * (1 - style_weight)
                + style_res_mean.repeat(n_phones, 1) * style_weight
            )
        else:
            repeat_feature = res[i].repeat(n_phones, 1)
        phone_level_feature.append(repeat_feature)

    return torch.cat(phone_level_feature, dim=0).T
bert_vits2/text/japanese_bert_v111.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from contants import config
4
+
5
+
6
def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, **kwargs):
    """Return phone-level BERT features for *text*.

    Each token's feature from the third-from-last hidden layer is repeated
    ``word2ph[i]`` times; the concatenation is returned transposed.
    """
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt")
        for key in encoded:
            encoded[key] = encoded[key].to(device)
        output = model(**encoded, output_hidden_states=True)
        token_features = torch.cat(output["hidden_states"][-3:-2], -1)[0].float().cpu()

    # word2ph must have one entry per input token.
    assert encoded["input_ids"].shape[-1] == len(word2ph)
    repeated = [token_features[i].repeat(n_phones, 1) for i, n_phones in enumerate(word2ph)]
    return torch.cat(repeated, dim=0).T
bert_vits2/text/japanese_bert_v200.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from contants import config
4
+ from bert_vits2.text.japanese_v200 import text2sep_kata
5
+
6
+
7
def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, **kwargs):
    """Tokenize *text* word by word and return its phone-level BERT features.

    The words from ``text2sep_kata`` are tokenized separately, converted to
    ids and joined into one sequence wrapped in the ids 2 and 3, which is
    handed to ``get_bert_feature_with_token``.
    """
    # NOTE(review): 2 and 3 are presumably the tokenizer's [CLS]/[SEP] ids - confirm.
    sep_text, _, _ = text2sep_kata(text)
    token_lists = [tokenizer.tokenize(word) for word in sep_text]
    id_lists = [tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]
    sep_ids = [2]
    for ids in id_lists:
        sep_ids.extend(ids)
    sep_ids.append(3)
    return get_bert_feature_with_token(sep_ids, word2ph, tokenizer, model, device)
13
+
14
+
15
def get_bert_feature_with_token(tokens, word2ph, tokenizer, model, device=config.system.device):
    """Return phone-level BERT features for an already tokenized input.

    *tokens* is a list of token ids. The model is run on it as a batch of
    one, with zero token-type ids and an all-ones attention mask. Each
    token's feature from the third-from-last hidden layer is repeated
    ``word2ph[i]`` times; the concatenation is returned transposed.
    """
    with torch.no_grad():
        input_ids = torch.tensor(tokens).to(device).unsqueeze(0)
        inputs = {
            "input_ids": input_ids,
            "token_type_ids": torch.zeros_like(input_ids).to(device),
            "attention_mask": torch.ones_like(input_ids).to(device),
        }
        output = model(**inputs, output_hidden_states=True)
        token_features = torch.cat(output["hidden_states"][-3:-2], -1)[0].float().cpu()

    # word2ph must have one entry per input token.
    assert inputs["input_ids"].shape[-1] == len(word2ph)
    repeated = [token_features[i].repeat(n_phones, 1) for i, n_phones in enumerate(word2ph)]
    return torch.cat(repeated, dim=0).T
bert_vits2/text/japanese_extra.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Convert Japanese text to phonemes which is
2
+ # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
+ import re
4
+ import unicodedata
5
+
6
+ import pyopenjtalk
7
+ from num2words import num2words
8
+
9
+ from bert_vits2.text import punctuation
10
+ from bert_vits2.text.japanese_mora_list import (
11
+ mora_kata_to_mora_phonemes,
12
+ )
13
+
14
# Set of consonants: the first element of every mora phoneme pair, skipping
# entries that have none.
COSONANTS = {
    consonant
    for consonant, _ in mora_kata_to_mora_phonemes.values()
    if consonant is not None
}

# Set of vowels
VOWELS = {"a", "i", "u", "e", "o"}
25
+
26
# Dictionary used during normalization to convert symbols.
# The table previously listed "(" and ")" twice and mapped "!", "?", ","
# and "." onto themselves, which indicates that the fullwidth forms were
# lost. They are restored as \uXXXX escapes so they cannot be silently
# width-normalized again; every existing ASCII key is kept.
rep_map = {
    ":": ",",
    "\uff1a": ",",  # fullwidth colon
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    ",": ",",
    "\uff0c": ",",  # fullwidth comma
    "。": ".",
    "!": "!",
    "\uff01": "!",  # fullwidth exclamation mark
    "?": "?",
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    ".": ".",
    "\uff0e": ".",  # fullwidth full stop
    "…": "...",
    "···": "...",
    "・・・": "...",
    "·": ",",
    "・": ",",
    "、": ",",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "\uff08": "'",  # fullwidth left parenthesis
    "\uff09": "'",  # fullwidth right parenthesis
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "−": "-",
    # The tilde (halfwidth and fullwidth) is intentionally not mapped to "-"
    # here: it was changed to be treated as the long-vowel mark "ー".
    "「": "'",
    "」": "'",
}
65
+
66
+
67
def text_normalize(text):
    """
    Normalize Japanese text.
    The result consists of exactly the following characters:
    - hiragana
    - katakana (including the fullwidth long-vowel mark "ー")
    - kanji
    - halfwidth alphabet (upper and lower case)
    - Greek letters
    - `.` (from the full stop `。`, part of `…`, newlines, etc.)
    - `,` (from the comma `、`, `:`, etc.)
    - `?` (question mark)
    - `!` (exclamation mark)
    - `'` (from `「`, `」`, etc.)
    - `-` (from `―` (a dash, not the long-vowel mark), `-`, etc.)

    Notes:
    - The ellipsis `…` becomes `...` (`なるほど…。` → `なるほど....`)
    - Numbers become kanji (`1,100円` → `千百円`, `52.34` → `五十二点三四`)
    - Position and count of commas, question marks, etc. are preserved
      (`??あ、、!!!` → `??あ,,!!!`)
    """
    # print(f"Before normalization: {text}")
    # NFKC turns the alphabet into halfwidth and the ellipsis into `...`.
    res = unicodedata.normalize("NFKC", text)

    res = japanese_convert_numbers_to_words(res)  # e.g. "100円" → "百円"

    # The tilde is also treated as a long-vowel mark.
    res = res.replace("~", "ー")

    # Normalize punctuation and delete unreadable characters.
    res = replace_punctuation(res)

    # Remove combining dakuten / handakuten. A regular "ば" is left as is;
    # "あ゛" was turned into "あ" + combining mark above and becomes "あ" here.
    for combining_mark in ("\u3099", "\u309A"):
        res = res.replace(combining_mark, "")
    return res
105
+
106
+
107
def replace_punctuation(text: str) -> str:
    """Normalize punctuation to `.`, `,`, `!`, `?`, `'`, `-` and keep only
    characters OpenJTalk can read: kanji, hiragana, katakana, the alphabet
    and Greek letters.
    """
    pattern = re.compile("|".join(re.escape(p) for p in rep_map))

    # Replace punctuation using the dictionary.
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    allowed = (
        # hiragana, katakana, kanji
        r"\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
        # halfwidth alphabet (upper and lower case)
        + r"\u0041-\u005A\u0061-\u007A"
        # fullwidth alphabet (upper and lower case)
        + r"\uFF21-\uFF3A\uFF41-\uFF5A"
        # Greek letters
        + r"\u0370-\u03FF\u1F00-\u1FFF"
        # "!", "?", "…", ",", ".", "'", "-" (`…` is already `...` by now).
        # Escaped so that "-", "]", "^" or "\" in `punctuation` can never be
        # read as character-class syntax (for example as a range).
        + re.escape("".join(punctuation))
    )
    # Delete every character not listed above.
    return re.sub("[^" + allowed + "]+", "", replaced_text)
133
+
134
+
135
# Numbers written with comma thousands separators, e.g. "1,234,567".
_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
# Reading appended after the amount for each supported currency sign.
_CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
# Currency sign (group 1) followed by an amount (group 2).
_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
# Integer or decimal number.
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
139
+
140
+
141
def japanese_convert_numbers_to_words(text: str) -> str:
    """Rewrite the numbers in *text* as Japanese words.

    Thousands separators are removed, a currency sign in front of an amount
    is moved behind it as its reading, and every remaining number is
    converted with ``num2words(..., lang="ja")``.
    """

    def _drop_separators(m):
        return m[0].replace(",", "")

    def _currency_to_suffix(m):
        return m[2] + _CURRENCY_MAP.get(m[1], m[1])

    def _spell_number(m):
        return num2words(m[0], lang="ja")

    res = _NUMBER_WITH_SEPARATOR_RX.sub(_drop_separators, text)
    res = _CURRENCY_RX.sub(_currency_to_suffix, res)
    return _NUMBER_RX.sub(_spell_number, res)
146
+
147
+
148
def g2p(norm_text: str, tokenizer, **kwargs) -> tuple[list[str], list[int], list[int]]:
    """
    Main entry point used elsewhere. Takes `norm_text`, already normalized
    by `text_normalize()`, and returns the tuple
    - phones: list of phonemes (may contain punctuation such as `!`, `,`, `.`)
    - tones: list of accents, 0 (low) or 1 (high), same length as phones
    - word2ph: how many phonemes are assigned to each character of the text
    `_` is added at the start and end of `phones` and `tones`, and 1 is added
    at the start and end of `word2ph` accordingly.
    """
    # Taking accents from pyopenjtalk's full-context labels loses the
    # punctuation positions: "こんにちは、世界。", "こんにちは!世界。" and
    # "こんにちは!!!???世界……。" all give the same result.
    # So a punctuation-free list of phonemes and accents is built first;
    # separately, the phoneme list from pyopenjtalk.run_frontend() (which
    # keeps punctuation) is used, and the accents are re-assigned onto it.

    # (phoneme, accent) pairs with all punctuation removed
    phone_tone_list_wo_punct = g2phone_tone_wo_punct(norm_text)

    # sep_text: list of words; sep_kata: their katakana readings
    sep_text, sep_kata = text2sep_kata(norm_text)

    # sep_phonemes: one phoneme list per word
    sep_phonemes = handle_long([kata2phoneme_list(kata) for kata in sep_kata])

    # phone_w_punct: sep_phonemes flattened, punctuation kept as in the input
    phone_w_punct: list[str] = [p for word in sep_phonemes for p in word]

    # Build accents including punctuation from the punctuation-free ones.
    phone_tone_list = align_tones(phone_w_punct, phone_tone_list_wo_punct)

    # word2ph cannot be determined exactly (irregular kanji readings such as
    # "今日" or "眼鏡" exist), so Bert-VITS2 uses the word segmentation and
    # spreads the phonemes roughly evenly over each word's characters.

    # Split every word of sep_text into its characters.
    sep_tokenized: list[list[str]] = [
        # tokenize presumably splits `word` into characters here
        [word] if word in punctuation else tokenizer.tokenize(word)
        for word in sep_text
    ]

    # For each word, spread its phonemes over its characters.
    word2ph = [1]
    for token, phoneme in zip(sep_tokenized, sep_phonemes):
        word2ph += distribute_phone(len(phoneme), len(token))
    word2ph.append(1)

    # Add the `_` symbol with accent 0 (low) at the start and end.
    phone_tone_list = [("_", 0)] + phone_tone_list + [("_", 0)]

    phones = [phone for phone, _ in phone_tone_list]
    tones = [tone for _, tone in phone_tone_list]

    assert len(phones) == sum(word2ph), f"{len(phones)} != {sum(word2ph)}"

    return phones, tones, word2ph
208
+
209
+
210
def g2phone_tone_wo_punct(text: str) -> list[tuple[str, int]]:
    """
    Return the list of (phoneme, accent) pairs for the text, accent 0 or 1.
    Non-phoneme symbols (punctuation) such as "!", "." and "?" are all
    dropped; no pause marker is kept for them.
    Punctuation is re-inserted later by `align_tones()`.
    "っ" is converted to "q" rather than "cl" ("ん" stays "N").
    Example: "こんにちは、世界ー。。元気?!" →
    [('k', 0), ('o', 0), ('N', 1), ('n', 1), ('i', 1), ('ch', 1), ('i', 1), ('w', 1), ('a', 1), ('s', 1), ('e', 1), ('k', 0), ('a', 0), ('i', 0), ('i', 0), ('g', 1), ('e', 1), ('N', 0), ('k', 0), ('i', 0)]
    """
    prosodies = pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True)
    last_index = len(prosodies) - 1
    result: list[tuple[str, int]] = []
    current_phrase: list[tuple[str, int]] = []
    current_tone = 0
    for i, letter in enumerate(prosodies):
        # Sentence-start marker: ignored.
        if letter == "^":
            assert i == 0, "Unexpected ^"
            continue
        # Markers that end an accent phrase.
        if letter in ("$", "?", "_", "#"):
            # Fix the accent values of the collected phrase to 0/1 and store it.
            result.extend(fix_phone_tone(current_phrase))
            # End markers must be last (a mid-sentence question becomes `_`).
            if letter in ("$", "?"):
                assert i == last_index, f"Unexpected {letter}"
            # What remains is "_" (pause) and "#" (accent phrase boundary);
            # neither is kept. Start the next phrase from tone 0, the
            # reference point for rises and falls (negative values are
            # repaired by `fix_phone_tone`).
            current_phrase = []
            current_tone = 0
            continue
        # Accent rise marker.
        if letter == "[":
            current_tone += 1
            continue
        # Accent fall marker.
        if letter == "]":
            current_tone -= 1
            continue
        # Everything else is a regular phoneme; "cl" ("っ") becomes "q".
        current_phrase.append(("q" if letter == "cl" else letter, current_tone))
    return result
253
+
254
+
255
def text2sep_kata(norm_text: str) -> tuple[list[str], list[str]]:
    """
    Split `norm_text` (already normalized by `text_normalize`) into words and
    return `(words, readings)`, where each reading is either katakana or a
    single punctuation symbol.

    The word segmentation is used by `g2p()` to decide how many phonemes are
    assigned to each character when building `word2ph`.

    Example:
    `私はそう思う!って感じ?` →
    ["私", "は", "そう", "思う", "!", "って", "感じ", "?"], ["ワタシ", "ワ", "ソー", "オモウ", "!", "ッテ", "カンジ", "?"]
    """
    # Punctuation that normalization can leave behind and that OpenJTalk
    # reads as "、".
    readable_marks = (".", ",", "!", "'", "-")

    words: list[str] = []
    readings: list[str] = []
    # Each node is one entry of OpenJTalk's frontend analysis.
    for node in pyopenjtalk.run_frontend(norm_text):
        # surface: the word as written.
        surface = replace_punctuation(node["string"])
        # reading: its pronunciation with the devoicing mark "’" removed.
        reading = node["pron"].replace("’", "")

        # Expected values of `reading` at this point:
        #   - ordinary word          -> katakana, possibly containing "ー"
        #   - word starting with "ー" -> e.g. "ーラー" or "ーーー"
        #   - punctuation / blanks   -> "、"
        #   - "?"                    -> "?" (reported as full-width upstream)
        # Unreadable scripts would also give "、", but normalization should
        # prevent that, and an empty reading is not expected to occur.
        assert reading != "", f"Empty yomi: {surface}"

        if reading == "、":
            if surface not in readable_marks:
                # pyopenjtalk could not read this token at all.
                raise ValueError(f"Cannot read: {surface} in:\n{norm_text}")
            # Keep the original symbol as its own reading.
            reading = surface
        elif reading == "?":
            # NOTE(review): the original comment says this reading comes back
            # full-width; confirm the literal compared above has that width.
            assert surface == "?", f"yomi `?` comes from: {surface}"
            reading = "?"

        words.append(surface)
        readings.append(reading)
    return words, readings
305
+
306
+
307
+ # ESPnetの実装から引用、変更点無し
308
+ # https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
309
def pyopenjtalk_g2p_prosody(text: str, drop_unvoiced_vowels: bool = True) -> list[str]:
    """Extract phoneme + prosody symbol sequence from input full-context labels.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Besides plain phonemes, the output can contain these markers:
    ``^`` (leading ``sil``), ``$`` / ``?`` (trailing ``sil``; statement /
    question), ``_`` (``pau``), ``#`` (accent phrase border), ``[`` (pitch
    rising) and ``]`` (pitch falling).

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): If True, the upper-case vowel labels
            ``A E I O U`` are lower-cased, i.e. treated as normal vowels.

    Returns:
        List[str]: List of phoneme + prosody symbols.

    Raises:
        AttributeError: If a label has no ``-phoneme+`` field
            (``re.search`` returns None).
        AssertionError: If ``sil`` appears anywhere but the first or last label.
        IndexError: If the last label is neither ``sil`` nor ``pau``, because
            the following label is read unconditionally.

    Examples:
        #>>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        #>>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104

    """
    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    N = len(labels)

    phones = []
    for n in range(N):
        lab_curr = labels[n]

        # current phoneme: the field between "-" and "+" in the label
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
        # deal unvoiced vowels as normal vowels
        # NOTE(review): `in "AEIOU"` is a substring test, so it relies on p3
        # being a non-empty single phoneme token.
        if drop_unvoiced_vowels and p3 in "AEIOU":
            p3 = p3.lower()

        # deal with sil at the beginning and the end of text
        if p3 == "sil":
            assert n == 0 or n == N - 1
            if n == 0:
                phones.append("^")
            elif n == N - 1:
                # check question form or not
                # (any value other than 0 or 1, including the helper's
                # no-match result, appends no end marker at all)
                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    phones.append("$")
                elif e3 == 1:
                    phones.append("?")
            continue
        elif p3 == "pau":
            phones.append("_")
            continue
        else:
            phones.append(p3)

        # accent type and position info (forward or backward)
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)

        # number of mora in accent phrase
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

        # Look ahead one label; only reached for non-sil/non-pau phonemes, so
        # a following label is expected to exist (the text ends with sil).
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
        # accent phrase border
        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
            phones.append("#")
        # pitch falling
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            phones.append("]")
        # pitch rising
        elif a2 == 1 and a2_next == 2:
            phones.append("[")

    return phones
383
+
384
+
385
+ def _numeric_feature_by_regex(regex, s):
386
+ match = re.search(regex, s)
387
+ if match is None:
388
+ return -50
389
+ return int(match.group(1))
390
+
391
+
392
def fix_phone_tone(phone_tone_list: list[tuple[str, int]]) -> list[tuple[str, int]]:
    """
    Normalize the tone (accent value) of every entry in `phone_tone_list`
    so that only 0 and 1 remain.

    Example: [(a, 0), (i, -1), (u, -1)] → [(a, 1), (i, 0), (u, 0)]

    Args:
        phone_tone_list: (phoneme, tone) pairs of one accent phrase. The
            tones are expected to be all 0, a mix of 0 and 1, or a mix of
            -1 and 0.

    Returns:
        The input list itself when no change is needed (including the empty
        list), otherwise a new list in which the {-1, 0} tones are shifted
        up to {0, 1}.

    Raises:
        AssertionError: If all tones are equal but not 0.
        ValueError: For any other combination of tone values.
    """
    # An empty phrase (two boundary symbols in a row) has nothing to fix.
    # Without this guard the empty tone set fell through to the ValueError.
    if not phone_tone_list:
        return phone_tone_list

    tone_values = {tone for _, tone in phone_tone_list}
    if len(tone_values) == 1:
        assert tone_values == {0}, tone_values
        return phone_tone_list
    if tone_values == {0, 1}:
        return phone_tone_list
    if tone_values == {-1, 0}:
        # The phrase only falls from its starting level: shift it up by one.
        return [(letter, 0 if tone == -1 else 1) for letter, tone in phone_tone_list]
    raise ValueError(f"Unexpected tone values: {tone_values}")
412
+
413
+
414
def distribute_phone(n_phone: int, n_word: int) -> list[int]:
    """
    Distribute `n_phone` phonemes over `n_word` slots as evenly as possible.

    Slots are filled one phoneme at a time from left to right, round after
    round, so the leftmost slots receive the remainder.

    Example: distribute_phone(5, 3) → [2, 2, 1]

    Args:
        n_phone: Number of phonemes to hand out.
        n_word: Number of slots (characters/tokens of the word).

    Returns:
        A list of length `n_word` whose entries sum to `n_phone`.

    Raises:
        ValueError: If there are phonemes to hand out but no slot to put
            them in.
    """
    if n_phone <= 0:
        return [0] * n_word
    if n_word <= 0:
        # Previously surfaced as an opaque "min() arg is an empty sequence".
        raise ValueError(
            f"Cannot distribute {n_phone} phonemes over {n_word} characters"
        )
    # Closed form of the round-robin fill: every slot gets `base`, and the
    # first `extra` slots get one more.
    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
425
+
426
+
427
def handle_long(sep_phonemes: list[list[str]]) -> list[list[str]]:
    """
    Resolve the long-vowel marks "ー" that are still present in the per-word
    phoneme lists `sep_phonemes`.

    A "ー" at the start of a word is replaced by the last phoneme of the
    preceding word; any later "ー" is replaced by the last character of the
    phoneme just before it. When a leading "ー" has no preceding phoneme
    (first word, or the preceding list is empty) there is nothing to
    lengthen, so it is replaced by the dash symbol "-".

    The inner lists are modified in place, and `sep_phonemes` itself is
    returned.
    """
    for i, phonemes in enumerate(sep_phonemes):
        if not phonemes:
            # Nothing to resolve; also avoids indexing an empty list.
            continue
        if phonemes[0] == "ー":
            # Do not use index -1 for the first word: it would wrap around
            # and copy a phoneme from the *last* word.
            previous = sep_phonemes[i - 1] if i > 0 else []
            phonemes[0] = previous[-1] if previous else "-"
        # Position 0 is resolved above, so the remaining marks always have a
        # left neighbour inside the same word.
        for j in range(1, len(phonemes)):
            if phonemes[j] == "ー":
                phonemes[j] = phonemes[j - 1][-1]
    return sep_phonemes
436
+
437
+
438
def align_tones(
    phones_with_punct: list[str], phone_tone_list: list[tuple[str, int]]
) -> list[tuple[str, int]]:
    """
    Attach tones to a phoneme sequence that still contains punctuation.

    `phone_tone_list` holds the (phoneme, tone) pairs without punctuation;
    it is consumed in order while walking `phones_with_punct`. Punctuation,
    and anything left over once the tone list is exhausted, gets tone 0.

    Example:
    …私は、、そう思う。
    phones_with_punct:
    [".", ".", ".", "w", "a", "t", "a", "sh", "i", "w", "a", ",", ",", "s", "o", "o", "o", "m", "o", "u", "."]
    phone_tone_list:
    [("w", 0), ("a", 0), ("t", 1), ("a", 1), ("sh", 1), ("i", 1), ("w", 1), ("a", 1), ("s", 0), ("o", 0), ("o", 1), ("o", 1), ("m", 1), ("o", 1), ("u", 0)]
    Return:
    [(".", 0), (".", 0), (".", 0), ("w", 0), ("a", 0), ("t", 1), ("a", 1), ("sh", 1), ("i", 1), ("w", 1), ("a", 1), (",", 0), (",", 0), ("s", 0), ("o", 0), ("o", 1), ("o", 1), ("m", 1), ("o", 1), ("u", 0), (".", 0)]

    Raises:
        ValueError: If a phoneme neither matches the next entry of
            `phone_tone_list` nor is punctuation (debug info is printed first).
    """
    aligned: list[tuple[str, int]] = []
    cursor = 0
    n_tones = len(phone_tone_list)
    for symbol in phones_with_punct:
        if cursor < n_tones and symbol == phone_tone_list[cursor][0]:
            # Same phoneme as the next tone entry: take its tone and advance.
            aligned.append((symbol, phone_tone_list[cursor][1]))
            cursor += 1
            continue
        if cursor >= n_tones or symbol in punctuation:
            # Trailing leftovers and punctuation both get tone 0.
            aligned.append((symbol, 0))
            continue
        # The two sequences disagree: dump the state, then fail.
        print(f"phones: {phones_with_punct}")
        print(f"phone_tone_list: {phone_tone_list}")
        print(f"result: {aligned}")
        print(f"tone_index: {cursor}")
        print(f"phone: {symbol}")
        raise ValueError(f"Unexpected phone: {symbol}")
    return aligned
473
+
474
+
475
def kata2phoneme_list(text: str) -> list[str]:
    """
    Convert `text` (katakana, as a rule) into a list of phoneme symbols
    without altering its content.

    Notes:
    - If `text` is punctuation (it may be a single punctuation character),
      it is returned unprocessed as a one-element list.
    - Leading "ー" marks are left as "ー" (they are resolved later by
      `handle_long()`).
    - A "ー" inside the text repeats the last phoneme symbol before it.

    Examples:
    `ーーソーナノカーー` → ["ー", "ー", "s", "o", "o", "n", "a", "n", "o", "k", "a", "a", "a"]
    `?` → ["?"]

    Raises:
        ValueError: If `text` is neither punctuation nor made up solely of
            characters from the katakana block (which includes "ー").
    """
    if text in punctuation:
        return [text]
    if re.fullmatch(r"[\u30A0-\u30FF]+", text) is None:
        raise ValueError(f"Input must be katakana only: {text}")

    # Longest keys first, so multi-character moras win over their prefixes.
    mora_keys = sorted(mora_kata_to_mora_phonemes.keys(), key=len, reverse=True)
    mora_regex = "|".join(re.escape(key) for key in mora_keys)

    def _expand_mora(match):
        # Each mora becomes " <vowel>" or " <consonant> <vowel>".
        consonant, vowel = mora_kata_to_mora_phonemes[match.group()]
        if consonant is None:
            return f" {vowel}"
        return f" {consonant} {vowel}"

    def _stretch(match):
        # Repeat the preceding symbol once for every "ー" that follows it.
        head, bars = match.group(1), match.group(2)
        return head + (" " + head) * len(bars)

    with_spaces = re.sub(mora_regex, _expand_mora, text)
    # Handle the long-vowel mark "ー".
    with_spaces = re.sub(r"(\w)(ー*)", _stretch, with_spaces)
    return with_spaces.strip().split(" ")
507
+
508
+
509
if __name__ == "__main__":
    # Manual smoke test: normalize a mixed English/Japanese sample, run g2p
    # on it and extract BERT features for the result.
    from manager import model_handler

    # Binds `tokenizer` at module level.
    # NOTE(review): g2p() appears to read this module-level name — confirm.
    tokenizer, _ = model_handler.get_bert_model("DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM")
    text = "hello,こんにちは、世界ー~!……"

    from bert_vits2.text.japanese_bert import get_bert_feature

    text = text_normalize(text)
    print(text)

    phones, tones, word2ph = g2p(text)
    print(phones, tones, word2ph)
    bert = get_bert_feature(text, word2ph)

    print(phones, tones, word2ph, bert.shape)