Spaces:
Sleeping
Sleeping
# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
import re | |
import sys | |
import pyopenjtalk | |
from bert_vits2.text import symbols | |
# Matches a single character that counts as Japanese text (i.e. anything
# that is not treated as a punctuation mark). The character class is written
# piecewise so each range can be labelled; the resulting pattern string is
# unchanged.
_japanese_characters = re.compile(
    r'['
    r'A-Za-z\d'        # ASCII letters; \d is any Unicode decimal digit
    r'\u3005'          # ideographic iteration mark
    r'\u3040-\u30ff'   # hiragana and katakana blocks
    r'\u4e00-\u9fff'   # CJK unified ideographs
    r'\uff11-\uff19'   # fullwidth digits 1-9
    r'\uff21-\uff3a'   # fullwidth Latin capitals
    r'\uff41-\uff5a'   # fullwidth Latin small letters
    r'\uff66-\uff9d'   # halfwidth katakana
    r']'
)
# Matches a single character that is NOT Japanese text: punctuation marks,
# whitespace and any other symbol outside the ranges below. The character
# class is written piecewise so each range can be labelled; the resulting
# pattern string is unchanged.
_japanese_marks = re.compile(
    r'[^'
    r'A-Za-z\d'        # ASCII letters; \d is any Unicode decimal digit
    r'\u3005'          # ideographic iteration mark
    r'\u3040-\u30ff'   # hiragana and katakana blocks
    r'\u4e00-\u9fff'   # CJK unified ideographs
    r'\uff11-\uff19'   # fullwidth digits 1-9
    r'\uff21-\uff3a'   # fullwidth Latin capitals
    r'\uff41-\uff5a'   # fullwidth Latin small letters
    r'\uff66-\uff9d'   # halfwidth katakana
    r']'
)
# List of (compiled pattern, Japanese reading) pairs for symbols that are
# spelled out before phonemisation.
# The non-ASCII literals are written as \u escapes: the previous literals had
# been damaged by a wrong-codec round trip (UTF-8 bytes shown as Greek
# letters), so the pattern could never match a real fullwidth percent sign.
_symbols_to_japanese = [(re.compile(symbol), reading) for symbol, reading in [
    # FULLWIDTH PERCENT SIGN -> katakana "pa-sento"
    ('\uff05', '\u30d1\u30fc\u30bb\u30f3\u30c8'),
]]
# List of (compiled pattern, replacement) pairs for the sokuon placeholder
# ``Q``: the placeholder is rewritten according to the consonant that follows
# it, optionally preceded by pitch arrows.
# Non-ASCII characters are written as \u escapes because the previous literals
# had been damaged by a wrong-codec round trip and no longer matched:
#   \u2191 / \u2193  upwards / downwards arrow
#   \u02a7           tesh digraph
#   \u0283           esh
# NOTE(review): not referenced in the visible part of this file.
_real_sokuon = [(re.compile(pattern), replacement) for pattern, replacement in [
    ('Q([\u2191\u2193]*[kg])', r'k#\1'),
    ('Q([\u2191\u2193]*[tdj\u02a7])', r't#\1'),
    ('Q([\u2191\u2193]*[s\u0283])', r's\1'),
    ('Q([\u2191\u2193]*[pb])', r'p#\1'),
]]
# List of (compiled pattern, replacement) pairs for the hatsuon placeholder
# ``N``: the placeholder is rewritten according to the consonant that follows
# it, optionally preceded by pitch arrows.
# Non-ASCII characters are written as \u escapes because the previous literals
# had been damaged by a wrong-codec round trip and no longer matched:
#   \u2191 / \u2193  upwards / downwards arrow
#   \u02a7 / \u02a5  tesh digraph / dz-curl digraph
#   \u014b           eng (velar nasal)
# NOTE(review): not referenced in the visible part of this file.
_real_hatsuon = [(re.compile(pattern), replacement) for pattern, replacement in [
    ('N([\u2191\u2193]*[pbm])', r'm\1'),
    ('N([\u2191\u2193]*[\u02a7\u02a5j])', r'n^\1'),
    ('N([\u2191\u2193]*[tdn])', r'n\1'),
    # Not a raw string: re replacement templates do not understand \u escapes,
    # so the eng character must be produced by the string literal itself.
    ('N([\u2191\u2193]*[kg])', '\u014b\\1'),
]]
# Token -> canonical symbol table used by post_replace_ph. Built once at
# import time instead of on every call.
# Non-ASCII keys are written as \u escapes: the previous literals had been
# damaged by a wrong-codec round trip, which collapsed several keys into the
# same garbled string so they silently overwrote one another.
# NOTE(review): the three keys mapping to ',' were indistinguishable in the
# damaged text; they are restored as fullwidth colon, semicolon and comma --
# confirm against the upstream table.
_PH_REPLACEMENTS = {
    '\uff1a': ',',      # fullwidth colon
    '\uff1b': ',',      # fullwidth semicolon
    '\uff0c': ',',      # fullwidth comma
    '\u3002': '.',      # ideographic full stop
    '\uff01': '!',      # fullwidth exclamation mark
    '\uff1f': '?',      # fullwidth question mark
    '\n': '.',
    '\u00b7': ',',      # middle dot
    '\u3001': ',',      # ideographic comma
    '...': '\u2026',    # three ASCII dots -> horizontal ellipsis
    'v': 'V',
}


def post_replace_ph(ph, valid_symbols=None):
    """Normalise one phoneme/punctuation token to a known symbol.

    The token is first looked up in the replacement table (fullwidth
    punctuation to ASCII, ``'...'`` to an ellipsis, ``'v'`` to ``'V'``).
    The result is returned if it belongs to the symbol inventory; anything
    else, including the empty string, becomes ``'UNK'``.

    Args:
        ph: A single token.
        valid_symbols: Optional container of accepted symbols. Defaults to
            the module-level ``symbols`` inventory when omitted.

    Returns:
        The normalised token, or ``'UNK'`` when it is not an accepted symbol.
    """
    if valid_symbols is None:
        valid_symbols = symbols
    ph = _PH_REPLACEMENTS.get(ph, ph)
    return ph if ph in valid_symbols else 'UNK'
def symbols_to_japanese(text):
    """Spell out symbols in *text* using the ``_symbols_to_japanese`` table.

    Each (pattern, reading) pair of the table is applied in order, every
    match of the pattern being replaced by its reading.
    """
    converted = text
    for pattern, reading in _symbols_to_japanese:
        converted = pattern.sub(reading, converted)
    return converted
def preprocess_jap(text):
    """Turn Japanese text into a flat list of phoneme and mark tokens.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    Symbols are spelled out first, then the text is cut at every character
    matched by ``_japanese_marks``. Each chunk that starts with a Japanese
    character is passed to ``pyopenjtalk.g2p`` and its output is split on
    single spaces. The mark that followed the chunk is appended after it with
    spaces removed, so a mark that is a plain space contributes an empty
    string.
    """
    text = symbols_to_japanese(text)
    chunks = _japanese_marks.split(text)
    separators = _japanese_marks.findall(text)
    tokens = []
    for index, chunk in enumerate(chunks):
        # Empty chunks (e.g. between two consecutive marks) never match.
        if _japanese_characters.match(chunk):
            tokens.extend(pyopenjtalk.g2p(chunk).split(" "))
        # There is one separator fewer than there are chunks.
        if index < len(separators):
            tokens.append(separators[index].replace(' ', ''))
    return tokens
def text_normalize(text):
    """Return *text* unchanged; no normalisation is applied here."""
    return text
def g2p(norm_text):
    """Convert normalised text to phones with placeholder tones and word2ph.

    Returns:
        A ``(phones, tones, word2ph)`` tuple of equally long lists: the
        tokens from ``preprocess_jap`` after ``post_replace_ph``, a tone of
        ``0`` for every phone, and a count of ``1`` for every phone.
    """
    phones = list(map(post_replace_ph, preprocess_jap(norm_text)))
    count = len(phones)
    tones = [0] * count
    word2ph = [1] * count
    return phones, tones, word2ph
if __name__ == '__main__':
    # Ad-hoc debugging aid: scan a transcript and stop at the first line
    # whose phoneme sequence contains a bare "z" token, printing the text and
    # its phones.
    # The file is opened with an explicit UTF-8 encoding (the platform
    # default may differ) and inside a context manager so the handle is
    # closed even when sys.exit() is raised.
    with open("../../../Downloads/transcript_utf8.txt", encoding="utf-8") as transcript:
        for line in transcript:
            fields = line.split(":")
            if len(fields) < 2:
                # No ':' on this line, so there is no second field to read.
                continue
            text = fields[1]
            phones, _tones, _word2ph = g2p(text)
            if "z" in phones:
                print(text, phones)
                sys.exit(0)