""" dictionary.py Provides the Dictionary class which implements Reader using dictionary lookup. """ from speach import ttlig from config.config import ASCII_SPACE_TOKEN from yomikata import utils from yomikata.reader import Reader import jaconv class Dictionary(Reader): def __init__(self, tagger: str = "unidic") -> None: """Create a Dictionary object to apply furigana using Dictionary lookup Object holds configuration and tokenizer state. Typical usage: ```python reader = Dictionary() furi = Dictionary.furigana("お前はもう死んでいる") # "お{前/まえ}はもう{死/し}んでいる" ``` Args: tagger (str, optional): Tokenizing dictionary to be used。 Defaults to `unidic`. `juman`, `ipadic`, 'sudachi' also possible. """ if tagger == "unidic": import fugashi self.tagger = fugashi.Tagger() self.token_to_surface = lambda word: word.surface self.token_to_pos = lambda word: word.feature.pos1 self.token_to_kana = ( lambda word: jaconv.kata2hira(str(word)) if (word.feature.kana == "*" or word.feature.kana is None) else jaconv.kata2hira(str(word.feature.kana)) ) elif tagger == "ipadic": import fugashi import ipadic self.tagger = fugashi.GenericTagger(ipadic.MECAB_ARGS) self.token_to_surface = lambda word: word.surface self.token_to_pos = lambda word: word.feature[0] self.token_to_kana = ( lambda word: jaconv.kata2hira(str(word.feature[7])) if len(word.feature) >= 8 else jaconv.kata2hira(str(word.surface)) ) elif tagger == "juman": import fugashi import jumandic self.tagger = fugashi.GenericTagger(jumandic.MECAB_ARGS) self.token_to_surface = lambda word: word.surface self.token_to_pos = lambda word: word.feature[0] self.token_to_kana = ( lambda word: word.feature[5] if word.feature[5] != "*" else jaconv.kata2hira(str(word)) ) elif tagger == "sudachi": from sudachipy import dictionary as sudachidict from sudachipy import tokenizer as sudachitokenizer tokenizer_obj = sudachidict.Dictionary(dict="full").create() mode = sudachitokenizer.Tokenizer.SplitMode.C self.tagger = lambda s: tokenizer_obj.tokenize(s, mode) self.token_to_surface = lambda word: word.surface() self.token_to_pos = lambda word: word.part_of_speech()[0] self.token_to_kana = lambda word: jaconv.kata2hira( utils.standardize_text(str(word.reading_form())) ) def furigana(self, text: str) -> str: text = utils.standardize_text(text) text = text.replace(" ", ASCII_SPACE_TOKEN) rubytoken = utils.parse_furigana(text) output = "" for group in rubytoken.groups: if isinstance(group, ttlig.RubyFrag): output += f"{{{group.text}/{group.furi}}}" else: group = group.replace("{", "").replace("}", "") for word in self.tagger(group): kana = self.token_to_kana(word) surface = self.token_to_surface(word) pos = self.token_to_pos(word) if (surface == kana) or pos in ["記号", "補助記号", "特殊"]: output += surface else: output += ttlig.RubyToken.from_furi(surface, kana).to_code() output = output.replace(ASCII_SPACE_TOKEN, " ") return output