Spaces:
Build error
Build error
Sam Passaglia
committed on
Commit
•
ac462f6
1
Parent(s):
ba074a5
minor
Browse files- yomikata/dictionary.py +48 -49
yomikata/dictionary.py
CHANGED
@@ -3,55 +3,11 @@ dictionary.py
|
|
3 |
Provides the Dictionary class which implements Reader using dictionary lookup.
|
4 |
"""
|
5 |
|
6 |
-
import fugashi
|
7 |
-
import ipadic
|
8 |
-
import jaconv
|
9 |
-
import jumandic
|
10 |
from speach import ttlig
|
11 |
-
from sudachipy import dictionary as sudachidict
|
12 |
-
from sudachipy import tokenizer as sudachitokenizer
|
13 |
-
|
14 |
from config.config import ASCII_SPACE_TOKEN
|
15 |
from yomikata import utils
|
16 |
from yomikata.reader import Reader
|
17 |
-
|
18 |
-
tokenizer_obj = sudachidict.Dictionary(dict="full").create()
|
19 |
-
mode = sudachitokenizer.Tokenizer.SplitMode.C
|
20 |
-
|
21 |
-
taggers = {}
|
22 |
-
taggers["ipadic"] = fugashi.GenericTagger(ipadic.MECAB_ARGS)
|
23 |
-
taggers["juman"] = fugashi.GenericTagger(jumandic.MECAB_ARGS)
|
24 |
-
taggers["unidic"] = fugashi.Tagger()
|
25 |
-
taggers["sudachi"] = lambda s: tokenizer_obj.tokenize(s, mode)
|
26 |
-
|
27 |
-
token_to_kana = {
|
28 |
-
"ipadic": lambda word: jaconv.kata2hira(str(word.feature[7]))
|
29 |
-
if len(word.feature) >= 8
|
30 |
-
else jaconv.kata2hira(str(word.surface)),
|
31 |
-
"juman": lambda word: word.feature[5]
|
32 |
-
if word.feature[5] != "*"
|
33 |
-
else jaconv.kata2hira(str(word)),
|
34 |
-
"unidic": lambda word: jaconv.kata2hira(str(word))
|
35 |
-
if (word.feature.kana == "*" or word.feature.kana is None)
|
36 |
-
else jaconv.kata2hira(str(word.feature.kana)),
|
37 |
-
"sudachi": lambda word: jaconv.kata2hira(
|
38 |
-
utils.standardize_text(str(word.reading_form()))
|
39 |
-
),
|
40 |
-
}
|
41 |
-
|
42 |
-
token_to_surface = {
|
43 |
-
"ipadic": lambda word: word.surface,
|
44 |
-
"juman": lambda word: word.surface,
|
45 |
-
"unidic": lambda word: word.surface,
|
46 |
-
"sudachi": lambda word: word.surface(),
|
47 |
-
}
|
48 |
-
|
49 |
-
token_to_pos = {
|
50 |
-
"ipadic": lambda word: word.feature[0],
|
51 |
-
"juman": lambda word: word.feature[0],
|
52 |
-
"unidic": lambda word: word.feature.pos1,
|
53 |
-
"sudachi": lambda word: word.part_of_speech()[0],
|
54 |
-
}
|
55 |
|
56 |
|
57 |
class Dictionary(Reader):
|
@@ -71,10 +27,53 @@ class Dictionary(Reader):
|
|
71 |
tagger (str, optional): Tokenizing dictionary to be used. Defaults to `unidic`. `juman`, `ipadic`, `sudachi` also possible.
|
72 |
"""
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
def furigana(self, text: str) -> str:
|
80 |
text = utils.standardize_text(text)
|
|
|
3 |
Provides the Dictionary class which implements Reader using dictionary lookup.
|
4 |
"""
|
5 |
|
|
|
|
|
|
|
|
|
6 |
from speach import ttlig
|
|
|
|
|
|
|
7 |
from config.config import ASCII_SPACE_TOKEN
|
8 |
from yomikata import utils
|
9 |
from yomikata.reader import Reader
|
10 |
+
import jaconv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
|
13 |
class Dictionary(Reader):
|
|
|
27 |
tagger (str, optional): Tokenizing dictionary to be used. Defaults to `unidic`. `juman`, `ipadic`, `sudachi` also possible.
|
28 |
"""
|
29 |
|
30 |
+
if tagger == "unidic":
|
31 |
+
import fugashi
|
32 |
+
|
33 |
+
self.tagger = fugashi.Tagger()
|
34 |
+
self.token_to_surface = lambda word: word.surface
|
35 |
+
self.token_to_pos = lambda word: word.feature.pos1
|
36 |
+
self.token_to_kana = (
|
37 |
+
lambda word: jaconv.kata2hira(str(word))
|
38 |
+
if (word.feature.kana == "*" or word.feature.kana is None)
|
39 |
+
else jaconv.kata2hira(str(word.feature.kana))
|
40 |
+
)
|
41 |
+
elif tagger == "ipadic":
|
42 |
+
import fugashi
|
43 |
+
import ipadic
|
44 |
+
|
45 |
+
self.tagger = fugashi.GenericTagger(ipadic.MECAB_ARGS)
|
46 |
+
self.token_to_surface = lambda word: word.surface
|
47 |
+
self.token_to_pos = lambda word: word.feature[0]
|
48 |
+
self.token_to_kana = (
|
49 |
+
lambda word: jaconv.kata2hira(str(word.feature[7]))
|
50 |
+
if len(word.feature) >= 8
|
51 |
+
else jaconv.kata2hira(str(word.surface))
|
52 |
+
)
|
53 |
+
elif tagger == "juman":
|
54 |
+
import fugashi
|
55 |
+
import jumandic
|
56 |
+
|
57 |
+
self.tagger = fugashi.GenericTagger(jumandic.MECAB_ARGS)
|
58 |
+
self.token_to_surface = lambda word: word.surface
|
59 |
+
self.token_to_pos = lambda word: word.feature[0]
|
60 |
+
self.token_to_kana = (
|
61 |
+
lambda word: word.feature[5]
|
62 |
+
if word.feature[5] != "*"
|
63 |
+
else jaconv.kata2hira(str(word))
|
64 |
+
)
|
65 |
+
elif tagger == "sudachi":
|
66 |
+
from sudachipy import dictionary as sudachidict
|
67 |
+
from sudachipy import tokenizer as sudachitokenizer
|
68 |
+
|
69 |
+
tokenizer_obj = sudachidict.Dictionary(dict="full").create()
|
70 |
+
mode = sudachitokenizer.Tokenizer.SplitMode.C
|
71 |
+
self.tagger = lambda s: tokenizer_obj.tokenize(s, mode)
|
72 |
+
self.token_to_surface = lambda word: word.surface()
|
73 |
+
self.token_to_pos = lambda word: word.part_of_speech()[0]
|
74 |
+
self.token_to_kana = lambda word: jaconv.kata2hira(
|
75 |
+
utils.standardize_text(str(word.reading_form()))
|
76 |
+
)
|
77 |
|
78 |
def furigana(self, text: str) -> str:
|
79 |
text = utils.standardize_text(text)
|