|
from typing import List |
|
|
|
import jieba |
|
import pypinyin |
|
|
|
from .pinyinToPhonemes import PINYIN_DICT |
|
|
|
|
|
def _chinese_character_to_pinyin(text: str) -> List[str]:
    """Convert *text* to a flat list of pinyin strings via ``pypinyin``.

    ``pypinyin.pinyin`` is called with ``Style.TONE3``, ``heteronym=False``
    and ``neutral_tone_with_five=True``; its nested result (a list of lists)
    is flattened into a single list, preserving order.

    Args:
        text: Input string to transliterate.

    Returns:
        Every item of every sub-list returned by ``pypinyin.pinyin``, in order.
    """
    nested_readings = pypinyin.pinyin(
        text,
        style=pypinyin.Style.TONE3,
        heteronym=False,
        neutral_tone_with_five=True,
    )

    # Flatten one level: each entry of the result is itself a list.
    flattened: List[str] = []
    for readings in nested_readings:
        flattened.extend(readings)
    return flattened
|
|
|
|
|
def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
    """Map one tone-suffixed pinyin token to its phoneme string plus tone.

    The last character of *pinyin* is taken as the tone marker and everything
    before it as the syllable. The syllable is looked up in ``PINYIN_DICT``;
    the first entry of the matching value is used, or an empty string when
    the syllable is not in the dictionary.

    Args:
        pinyin: A non-empty token whose final character is the tone marker.

    Returns:
        The looked-up phoneme string (possibly empty) followed by the tone
        character.

    Raises:
        IndexError: If *pinyin* is an empty string.
    """
    tone_marker = pinyin[-1]
    # Unknown syllables fall back to [""] so indexing [0] yields "".
    candidates = PINYIN_DICT.get(pinyin[:-1], [""])
    return candidates[0] + tone_marker
|
|
|
|
|
def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str:
    """Convert *text* into a separator-joined string of phoneme characters.

    Steps:
      1. Segment *text* with ``jieba.cut(..., HMM=False)`` and join the
         segments with single spaces.
      2. Turn the spaced string into pinyin tokens with
         ``_chinese_character_to_pinyin``.
      3. For each token whose last character is one of ``1``-``5``, replace
         it with the result of ``_chinese_pinyin_to_phoneme``; every other
         token is kept verbatim.
      4. Split each resulting string into individual characters and join all
         of them with *seperator*.

    NOTE(review): the tone check looks only at the final character, so any
    token ending in ``1``-``5`` takes the pinyin branch, even if it is not
    actually pinyin.

    Args:
        text: Input text.
        seperator: String placed between every output character (the
            parameter name's spelling is part of the public interface).

    Returns:
        The individual output characters joined by *seperator*.
    """
    spaced_words = " ".join(jieba.cut(text, HMM=False))

    symbols: List[str] = []
    for piece in _chinese_character_to_pinyin(spaced_words):
        looks_like_pinyin = piece[-1] in "12345"
        converted = _chinese_pinyin_to_phoneme(piece) if looks_like_pinyin else piece
        # Extending with a str adds it one character at a time.
        symbols.extend(converted)

    return seperator.join(symbols)
|
|