"""
dictionary.py
Provides the Dictionary class which implements Reader using dictionary lookup.
"""

import fugashi
import ipadic
import jaconv
import jumandic
from speach import ttlig
from sudachipy import dictionary as sudachidict
from sudachipy import tokenizer as sudachitokenizer

from config.config import ASCII_SPACE_TOKEN
from yomikata import utils
from yomikata.reader import Reader

# Sudachi tokenizer using the "full" system dictionary (sudachidict_full);
# SplitMode.C splits into the longest possible units.
tokenizer_obj = sudachidict.Dictionary(dict="full").create()
mode = sudachitokenizer.Tokenizer.SplitMode.C

# Morphological taggers, keyed by the dictionary they use.
taggers = {}
taggers["ipadic"] = fugashi.GenericTagger(ipadic.MECAB_ARGS)
taggers["juman"] = fugashi.GenericTagger(jumandic.MECAB_ARGS)
taggers["unidic"] = fugashi.Tagger()
taggers["sudachi"] = lambda s: tokenizer_obj.tokenize(s, mode)

# Extract a hiragana reading from a token. Each tagger exposes readings
# differently; fall back to the surface form when no reading is available.
token_to_kana = {
    "ipadic": lambda word: jaconv.kata2hira(str(word.feature[7]))
    if len(word.feature) >= 8
    else jaconv.kata2hira(str(word.surface)),
    "juman": lambda word: word.feature[5]
    if word.feature[5] != "*"
    else jaconv.kata2hira(str(word)),
    "unidic": lambda word: jaconv.kata2hira(str(word))
    if (word.feature.kana == "*" or word.feature.kana is None)
    else jaconv.kata2hira(str(word.feature.kana)),
    "sudachi": lambda word: jaconv.kata2hira(
        utils.standardize_text(str(word.reading_form()))
    ),
}

# Extract the surface form; Sudachi exposes it as a method rather than an attribute.
token_to_surface = {
    "ipadic": lambda word: word.surface,
    "juman": lambda word: word.surface,
    "unidic": lambda word: word.surface,
    "sudachi": lambda word: word.surface(),
}

# Extract the top-level part-of-speech tag.
token_to_pos = {
    "ipadic": lambda word: word.feature[0],
    "juman": lambda word: word.feature[0],
    "unidic": lambda word: word.feature.pos1,
    "sudachi": lambda word: word.part_of_speech()[0],
}
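
# Illustrative sketch (comments only, not executed): the three adapter tables
# above give all four taggers a uniform interface. The outputs shown are what
# one would expect with the default dictionaries installed; exact values may
# vary with the dictionary version.
#
#   word = taggers["unidic"]("漢字")[0]
#   token_to_surface["unidic"](word)  # "漢字"
#   token_to_kana["unidic"](word)     # "かんじ"
#   token_to_pos["unidic"](word)      # "名詞"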


class Dictionary(Reader):
    def __init__(self, tagger: str = "unidic") -> None:
        """Create a Dictionary object to apply furigana using Dictionary lookup
        Object holds configuration and tokenizer state.

        Typical usage:

        ```python
        reader = Dictionary()
        furi = Dictionary.furigana("お前はもう死んでいる")
        # "お{前/まえ}はもう{死/し}んでいる"
        ```

        Args:
            tagger (str, optional): Tokenizing dictionary to be used。 Defaults to `unidic`. `juman`, `ipadic`, 'sudachi' also possible.
        """

        self.tagger = taggers[tagger]
        self.token_to_kana = token_to_kana[tagger]
        self.token_to_surface = token_to_surface[tagger]
        self.token_to_pos = token_to_pos[tagger]

    def furigana(self, text: str) -> str:
        """Return `text` with readings added in {surface/reading} notation."""
        text = utils.standardize_text(text)
        # Protect ASCII spaces so the tokenizer does not consume them.
        text = text.replace(" ", ASCII_SPACE_TOKEN)
        rubytoken = utils.parse_furigana(text)
        output = ""

        for group in rubytoken.groups:
            if isinstance(group, ttlig.RubyFrag):
                # Fragments that already carry furigana pass through unchanged.
                output += f"{{{group.text}/{group.furi}}}"
            else:
                group = group.replace("{", "").replace("}", "")
                for word in self.tagger(group):
                    kana = self.token_to_kana(word)
                    surface = self.token_to_surface(word)
                    pos = self.token_to_pos(word)
                    # Skip annotation for kana-only tokens and for symbols
                    # (記号/補助記号/特殊 are the symbol tags of the three dictionaries).
                    if (surface == kana) or pos in ["記号", "補助記号", "特殊"]:
                        output += surface
                    else:
                        output += ttlig.RubyToken.from_furi(surface, kana).to_code()
        output = output.replace(ASCII_SPACE_TOKEN, " ")
        return output
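

if __name__ == "__main__":
    # Minimal usage sketch, assuming the default UniDic dictionary is
    # installed; the expected output mirrors the class docstring example.
    reader = Dictionary(tagger="unidic")
    print(reader.furigana("お前はもう死んでいる"))
    # -> お{前/まえ}はもう{死/し}んでいる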