|
r""" |
|
Khmer Phonemizer - a free, standalone and open-source Khmer grapheme-to-phoneme (G2P) converter.
|
""" |
|
import os |
|
import csv |
|
from g2p import PhonetisaurusGraph |
|
|
|
def _read_lexicon_file(file):
    r"""
    Load a tab-separated pronunciation lexicon into a dictionary.

    Every non-empty row must hold exactly two tab-separated fields: the word,
    and its phonemes separated by whitespace.

    Args:
        file: Path of the TSV lexicon file.

    Returns:
        A dict mapping each (stripped) word to its list of phoneme strings.
        If a word appears on several rows, the last row wins.

    Raises:
        ValueError: If a non-empty row does not have exactly two fields.
    """
    lexicon = {}
    # Decode explicitly as UTF-8: the lexicon contains Khmer script, which the
    # platform default encoding (e.g. cp1252 on Windows) cannot decode.
    # newline="" lets the csv module do its own line-ending handling.
    with open(file, encoding="utf-8", newline="") as infile:
        for row in csv.reader(infile, delimiter="\t"):
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on the two-field unpacking below.
            if not row:
                continue
            word, phonemes = row
            lexicon[word.strip()] = phonemes.split()
    return lexicon
|
|
|
# The model and the lexicon are resolved relative to this module's directory.
_module_dir = os.path.dirname(__file__)
_graph_file = os.path.join(_module_dir, "km_phonemizer.npz")
_lexicon_file = os.path.join(_module_dir, "km_lexicon.tsv")

# Both resources are loaded once, when the module is imported.
_lexicon_dict = _read_lexicon_file(_lexicon_file)
_graph = PhonetisaurusGraph.load(_graph_file, preload=False)
|
|
|
def _phoneticize(word: str, beam: int, min_beam: int, beam_scale: float):
    r"""
    Run the G2P graph on one word and return its first result.

    The beam, min_beam and beam_scale arguments are forwarded unchanged to
    the graph's g2p_one call. Returns None when the graph produces no result.
    """
    candidates = list(
        _graph.g2p_one(word, beam=beam, min_beam=min_beam, beam_scale=beam_scale)
    )
    return candidates[0] if candidates else None
|
|
|
|
|
def phonemize_single(
    word,
    beam: int = 500,
    min_beam: int = 100,
    beam_scale: float = 0.6,
    use_lexicon: bool = True,
):
    r"""
    Phonemize a single word. The word must match [a-zA-Z\u1780-\u17dd]+

    The word is lower-cased first. When use_lexicon is true and the
    lower-cased word is in the lexicon, the lexicon pronunciation is
    returned; otherwise the word is passed to the G2P model.

    Args:
        word: The word to phonemize, or None.
        beam: Forwarded to the G2P model.
        min_beam: Forwarded to the G2P model.
        beam_scale: Forwarded to the G2P model.
        use_lexicon: Whether to consult the lexicon before the G2P model.

    Returns:
        None if word is None; a list of phoneme strings on a lexicon hit;
        otherwise the first G2P result, or None if the model yields nothing.
    """
    if word is None:
        return None
    word = word.lower()
    if use_lexicon and word in _lexicon_dict:
        # Return a copy: handing out the stored list would let a caller's
        # in-place edit corrupt the shared lexicon for later lookups.
        return list(_lexicon_dict[word])
    return _phoneticize(word, beam=beam, min_beam=min_beam, beam_scale=beam_scale)
|
|