# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py import re import sys import pyopenjtalk from text.symbols import punctuation # Regular expression matching Japanese without punctuation marks: _japanese_characters = re.compile( r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" ) # Regular expression matching non-Japanese characters or punctuation marks: _japanese_marks = re.compile( r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" ) # List of (symbol, Japanese) pairs for marks: _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]] # List of (consonant, sokuon) pairs: _real_sokuon = [ (re.compile("%s" % x[0]), x[1]) for x in [ (r"Q([↑↓]*[kg])", r"k#\1"), (r"Q([↑↓]*[tdjʧ])", r"t#\1"), (r"Q([↑↓]*[sʃ])", r"s\1"), (r"Q([↑↓]*[pb])", r"p#\1"), ] ] # List of (consonant, hatsuon) pairs: _real_hatsuon = [ (re.compile("%s" % x[0]), x[1]) for x in [ (r"N([↑↓]*[pbm])", r"m\1"), (r"N([↑↓]*[ʧʥj])", r"n^\1"), (r"N([↑↓]*[tdn])", r"n\1"), (r"N([↑↓]*[kg])", r"ŋ\1"), ] ] def post_replace_ph(ph): rep_map = { ":": ",", ";": ",", ",": ",", "。": ".", "!": "!", "?": "?", "\n": ".", "·": ",", "、": ",", "...": "…", } if ph in rep_map.keys(): ph = rep_map[ph] # if ph in symbols: # return ph # if ph not in symbols: # ph = "UNK" return ph def replace_consecutive_punctuation(text): punctuations = ''.join(re.escape(p) for p in punctuation) pattern = f'([{punctuations}])([{punctuations}])+' result = re.sub(pattern, r'\1', text) return result def symbols_to_japanese(text): for regex, replacement in _symbols_to_japanese: text = re.sub(regex, replacement, text) return text def preprocess_jap(text, with_prosody=False): """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html""" text = symbols_to_japanese(text) sentences = re.split(_japanese_marks, text) marks = re.findall(_japanese_marks, text) text = [] for i, sentence in enumerate(sentences): if re.match(_japanese_characters, sentence): if with_prosody: text += pyopenjtalk_g2p_prosody(sentence)[1:-1] else: p = pyopenjtalk.g2p(sentence) text += p.split(" ") if i < len(marks): if marks[i] == " ":# 防止意外的UNK continue text += [marks[i].replace(" ", "")] return text def text_normalize(text): # todo: jap text normalize # 避免重复标点引起的参考泄露 text = replace_consecutive_punctuation(text) return text # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True): """Extract phoneme + prosoody symbol sequence from input full-context labels. The algorithm is based on `Prosodic features control by symbols as input of sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks. Args: text (str): Input text. drop_unvoiced_vowels (bool): whether to drop unvoiced vowels. Returns: List[str]: List of phoneme + prosody symbols. Examples: >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody >>> pyopenjtalk_g2p_prosody("こんにちは。") ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$'] .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104 """ labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)) N = len(labels) phones = [] for n in range(N): lab_curr = labels[n] # current phoneme p3 = re.search(r"\-(.*?)\+", lab_curr).group(1) # deal unvoiced vowels as normal vowels if drop_unvoiced_vowels and p3 in "AEIOU": p3 = p3.lower() # deal with sil at the beginning and the end of text if p3 == "sil": assert n == 0 or n == N - 1 if n == 0: phones.append("^") elif n == N - 1: # check question form or not e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr) if e3 == 0: phones.append("$") elif e3 == 1: phones.append("?") continue elif p3 == "pau": phones.append("_") continue else: phones.append(p3) # accent type and position info (forward or backward) a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr) a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr) a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr) # number of mora in accent phrase f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr) a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1]) # accent phrase border if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl": phones.append("#") # pitch falling elif a1 == 0 and a2_next == a2 + 1 and a2 != f1: phones.append("]") # pitch rising elif a2 == 1 and a2_next == 2: phones.append("[") return phones # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py def _numeric_feature_by_regex(regex, s): match = re.search(regex, s) if match is None: return -50 return int(match.group(1)) def g2p(norm_text, with_prosody=True): phones = preprocess_jap(norm_text, with_prosody) phones = [post_replace_ph(i) for i in phones] # todo: implement tones and word2ph return phones if __name__ == "__main__": phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!") print(phones)