TTS-GPT-SoVITS / text /japanese.py
Ailyth's picture
NEW-0216-050145
3c7a160
raw
history blame
5.98 kB
# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
import re
import sys
import pyopenjtalk
from text import symbols
# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile(
r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
# Regular expression matching non-Japanese characters or punctuation marks:
_japanese_marks = re.compile(
r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
# List of (symbol, Japanese) pairs for marks:
_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("οΌ…", "γƒ‘γƒΌγ‚»γƒ³γƒˆ")]]
# List of (consonant, sokuon) pairs:
_real_sokuon = [
(re.compile("%s" % x[0]), x[1])
for x in [
(r"Q([↑↓]*[kg])", r"k#\1"),
(r"Q([↑↓]*[tdjΚ§])", r"t#\1"),
(r"Q([↑↓]*[sΚƒ])", r"s\1"),
(r"Q([↑↓]*[pb])", r"p#\1"),
]
]
# List of (consonant, hatsuon) pairs:
_real_hatsuon = [
(re.compile("%s" % x[0]), x[1])
for x in [
(r"N([↑↓]*[pbm])", r"m\1"),
(r"N([↑↓]*[Κ§Κ₯j])", r"n^\1"),
(r"N([↑↓]*[tdn])", r"n\1"),
(r"N([↑↓]*[kg])", r"Ε‹\1"),
]
]
def post_replace_ph(ph):
rep_map = {
":": ",",
"οΌ›": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
"Β·": ",",
"、": ",",
"...": "…",
}
if ph in rep_map.keys():
ph = rep_map[ph]
if ph in symbols:
return ph
if ph not in symbols:
ph = "UNK"
return ph
def symbols_to_japanese(text):
for regex, replacement in _symbols_to_japanese:
text = re.sub(regex, replacement, text)
return text
def preprocess_jap(text, with_prosody=False):
"""Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
text = symbols_to_japanese(text)
sentences = re.split(_japanese_marks, text)
marks = re.findall(_japanese_marks, text)
text = []
for i, sentence in enumerate(sentences):
if re.match(_japanese_characters, sentence):
if with_prosody:
text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
else:
p = pyopenjtalk.g2p(sentence)
text += p.split(" ")
if i < len(marks):
if marks[i] == " ":# ι˜²ζ­’ζ„ε€–ηš„UNK
continue
text += [marks[i].replace(" ", "")]
return text
def text_normalize(text):
# todo: jap text normalize
return text
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
"""Extract phoneme + prosoody symbol sequence from input full-context labels.
The algorithm is based on `Prosodic features control by symbols as input of
sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
Args:
text (str): Input text.
drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
Returns:
List[str]: List of phoneme + prosody symbols.
Examples:
>>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
>>> pyopenjtalk_g2p_prosody("こんにけは。")
['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
.. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
"""
labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
N = len(labels)
phones = []
for n in range(N):
lab_curr = labels[n]
# current phoneme
p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
# deal unvoiced vowels as normal vowels
if drop_unvoiced_vowels and p3 in "AEIOU":
p3 = p3.lower()
# deal with sil at the beginning and the end of text
if p3 == "sil":
assert n == 0 or n == N - 1
if n == 0:
phones.append("^")
elif n == N - 1:
# check question form or not
e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
if e3 == 0:
phones.append("$")
elif e3 == 1:
phones.append("?")
continue
elif p3 == "pau":
phones.append("_")
continue
else:
phones.append(p3)
# accent type and position info (forward or backward)
a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
# number of mora in accent phrase
f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
# accent phrase border
if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
phones.append("#")
# pitch falling
elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
phones.append("]")
# pitch rising
elif a2 == 1 and a2_next == 2:
phones.append("[")
return phones
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
def _numeric_feature_by_regex(regex, s):
match = re.search(regex, s)
if match is None:
return -50
return int(match.group(1))
def g2p(norm_text, with_prosody=False):
phones = preprocess_jap(norm_text, with_prosody)
phones = [post_replace_ph(i) for i in phones]
# todo: implement tones and word2ph
return phones
if __name__ == "__main__":
phones = g2p("こんにけは, hello, AKITOです,γ‚ˆγ‚γ—γγŠι‘˜γ„γ—γΎγ™γ­οΌ")
print(phones)