"""
dictionary.py

Provides the Dictionary class which implements Reader using dictionary lookup.
"""
import fugashi
import ipadic
import jaconv
import jumandic
from speach import ttlig
from sudachipy import dictionary as sudachidict
from sudachipy import tokenizer as sudachitokenizer

from config.config import ASCII_SPACE_TOKEN
from yomikata import utils
from yomikata.reader import Reader

# Sudachi tokenizer built from the "full" dictionary, using split mode C
# (the coarsest granularity Sudachi offers).
tokenizer_obj = sudachidict.Dictionary(dict="full").create()
mode = sudachitokenizer.Tokenizer.SplitMode.C

# Available morphological taggers, keyed by dictionary name.  Each value maps
# a sentence string to an iterable of token objects; the token object shape
# differs per backend (see the token_to_* dispatch tables below).
taggers = {
    "ipadic": fugashi.GenericTagger(ipadic.MECAB_ARGS),
    "juman": fugashi.GenericTagger(jumandic.MECAB_ARGS),
    "unidic": fugashi.Tagger(),
    "sudachi": lambda s: tokenizer_obj.tokenize(s, mode),
}
# Per-backend extraction of a token's reading as hiragana.
# Each backend stores (or omits) the reading differently, hence one lambda
# per dictionary name.
token_to_kana = {
    # IPADIC: field 7 is the katakana reading when present; otherwise fall
    # back to the surface form.  Either way, convert to hiragana.
    "ipadic": lambda token: jaconv.kata2hira(
        str(token.feature[7] if len(token.feature) >= 8 else token.surface)
    ),
    # Juman: field 5 already holds a usable reading unless it is "*".
    "juman": lambda token: jaconv.kata2hira(str(token))
    if token.feature[5] == "*"
    else token.feature[5],
    # UniDic: use the kana feature unless it is missing ("*" or None).
    "unidic": lambda token: jaconv.kata2hira(
        str(token)
        if (token.feature.kana == "*" or token.feature.kana is None)
        else str(token.feature.kana)
    ),
    # Sudachi: reading_form() returns katakana; standardize then convert.
    "sudachi": lambda token: jaconv.kata2hira(
        utils.standardize_text(str(token.reading_form()))
    ),
}
# Per-backend extraction of a token's surface (written) form.
# Sudachi exposes surface as a method; the fugashi backends as an attribute.
token_to_surface = {
    "ipadic": lambda token: token.surface,
    "juman": lambda token: token.surface,
    "unidic": lambda token: token.surface,
    "sudachi": lambda token: token.surface(),
}
# Per-backend extraction of a token's coarse part-of-speech tag.
token_to_pos = {
    "ipadic": lambda token: token.feature[0],
    "juman": lambda token: token.feature[0],
    # UniDic names its POS fields; pos1 is the top-level category.
    "unidic": lambda token: token.feature.pos1,
    # Sudachi returns a tuple of POS levels; the first is the category.
    "sudachi": lambda token: token.part_of_speech()[0],
}
class Dictionary(Reader):
    """Reader that attaches furigana via morphological-dictionary lookup."""

    def __init__(self, tagger: str = "unidic") -> None:
        """Create a Dictionary object to apply furigana using dictionary lookup.

        Typical usage:
        ```python
        reader = Dictionary()
        furi = reader.furigana("お前はもう死んでいる")
        # "お{前/まえ}はもう{死/し}んでいる"
        ```

        Args:
            tagger (str, optional): Tokenizing dictionary to be used. Defaults
                to "unidic"; "juman", "ipadic" and "sudachi" are also possible.

        Raises:
            KeyError: If `tagger` is not one of the supported dictionary names.
        """
        self.tagger = taggers[tagger]
        self.token_to_kana = token_to_kana[tagger]
        self.token_to_surface = token_to_surface[tagger]
        self.token_to_pos = token_to_pos[tagger]

    def furigana(self, text: str) -> str:
        """Annotate `text` with furigana in {surface/reading} notation.

        Fragments of the input that already carry {surface/reading} markup are
        re-emitted untouched; only the unannotated stretches are tokenized.

        Args:
            text: Input sentence, possibly containing existing furigana markup.

        Returns:
            The sentence with a reading attached to every token whose surface
            differs from its kana reading; symbol/punctuation tokens are left
            bare.
        """
        text = utils.standardize_text(text)
        # Shield ASCII spaces from the tokenizer; restored before returning.
        text = text.replace(" ", ASCII_SPACE_TOKEN)
        rubytoken = utils.parse_furigana(text)
        output = ""
        for group in rubytoken.groups:
            if isinstance(group, ttlig.RubyFrag):
                # Pre-annotated fragment: keep the author's reading as-is.
                output += f"{{{group.text}/{group.furi}}}"
            else:
                # Drop stray braces so they cannot corrupt the output markup.
                group = group.replace("{", "").replace("}", "")
                for word in self.tagger(group):
                    kana = self.token_to_kana(word)
                    surface = self.token_to_surface(word)
                    pos = self.token_to_pos(word)
                    # No annotation when the reading adds nothing (surface is
                    # already kana) or for symbol/punctuation POS tags.
                    if (surface == kana) or pos in ["記号", "補助記号", "特殊"]:
                        output += surface
                    else:
                        output += ttlig.RubyToken.from_furi(surface, kana).to_code()
        output = output.replace(ASCII_SPACE_TOKEN, " ")
        return output