Sam Passaglia committed on
Commit
ac462f6
1 Parent(s): ba074a5
Files changed (1) hide show
  1. yomikata/dictionary.py +48 -49
yomikata/dictionary.py CHANGED
@@ -3,55 +3,11 @@ dictionary.py
3
  Provides the Dictionary class which implements Reader using dictionary lookup.
4
  """
5
 
6
- import fugashi
7
- import ipadic
8
- import jaconv
9
- import jumandic
10
  from speach import ttlig
11
- from sudachipy import dictionary as sudachidict
12
- from sudachipy import tokenizer as sudachitokenizer
13
-
14
  from config.config import ASCII_SPACE_TOKEN
15
  from yomikata import utils
16
  from yomikata.reader import Reader
17
-
18
- tokenizer_obj = sudachidict.Dictionary(dict="full").create()
19
- mode = sudachitokenizer.Tokenizer.SplitMode.C
20
-
21
- taggers = {}
22
- taggers["ipadic"] = fugashi.GenericTagger(ipadic.MECAB_ARGS)
23
- taggers["juman"] = fugashi.GenericTagger(jumandic.MECAB_ARGS)
24
- taggers["unidic"] = fugashi.Tagger()
25
- taggers["sudachi"] = lambda s: tokenizer_obj.tokenize(s, mode)
26
-
27
- token_to_kana = {
28
- "ipadic": lambda word: jaconv.kata2hira(str(word.feature[7]))
29
- if len(word.feature) >= 8
30
- else jaconv.kata2hira(str(word.surface)),
31
- "juman": lambda word: word.feature[5]
32
- if word.feature[5] != "*"
33
- else jaconv.kata2hira(str(word)),
34
- "unidic": lambda word: jaconv.kata2hira(str(word))
35
- if (word.feature.kana == "*" or word.feature.kana is None)
36
- else jaconv.kata2hira(str(word.feature.kana)),
37
- "sudachi": lambda word: jaconv.kata2hira(
38
- utils.standardize_text(str(word.reading_form()))
39
- ),
40
- }
41
-
42
- token_to_surface = {
43
- "ipadic": lambda word: word.surface,
44
- "juman": lambda word: word.surface,
45
- "unidic": lambda word: word.surface,
46
- "sudachi": lambda word: word.surface(),
47
- }
48
-
49
- token_to_pos = {
50
- "ipadic": lambda word: word.feature[0],
51
- "juman": lambda word: word.feature[0],
52
- "unidic": lambda word: word.feature.pos1,
53
- "sudachi": lambda word: word.part_of_speech()[0],
54
- }
55
 
56
 
57
  class Dictionary(Reader):
@@ -71,10 +27,53 @@ class Dictionary(Reader):
71
  tagger (str, optional): Tokenizing dictionary to be used. Defaults to `unidic`. `juman`, `ipadic`, and `sudachi` are also possible.
72
  """
73
 
74
- self.tagger = taggers[tagger]
75
- self.token_to_kana = token_to_kana[tagger]
76
- self.token_to_surface = token_to_surface[tagger]
77
- self.token_to_pos = token_to_pos[tagger]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  def furigana(self, text: str) -> str:
80
  text = utils.standardize_text(text)
 
3
  Provides the Dictionary class which implements Reader using dictionary lookup.
4
  """
5
 
 
 
 
 
6
  from speach import ttlig
 
 
 
7
  from config.config import ASCII_SPACE_TOKEN
8
  from yomikata import utils
9
  from yomikata.reader import Reader
10
+ import jaconv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
 
13
  class Dictionary(Reader):
 
27
  tagger (str, optional): Tokenizing dictionary to be used. Defaults to `unidic`. `juman`, `ipadic`, and `sudachi` are also possible.
28
  """
29
 
30
+ if tagger == "unidic":
31
+ import fugashi
32
+
33
+ self.tagger = fugashi.Tagger()
34
+ self.token_to_surface = lambda word: word.surface
35
+ self.token_to_pos = lambda word: word.feature.pos1
36
+ self.token_to_kana = (
37
+ lambda word: jaconv.kata2hira(str(word))
38
+ if (word.feature.kana == "*" or word.feature.kana is None)
39
+ else jaconv.kata2hira(str(word.feature.kana))
40
+ )
41
+ elif tagger == "ipadic":
42
+ import fugashi
43
+ import ipadic
44
+
45
+ self.tagger = fugashi.GenericTagger(ipadic.MECAB_ARGS)
46
+ self.token_to_surface = lambda word: word.surface
47
+ self.token_to_pos = lambda word: word.feature[0]
48
+ self.token_to_kana = (
49
+ lambda word: jaconv.kata2hira(str(word.feature[7]))
50
+ if len(word.feature) >= 8
51
+ else jaconv.kata2hira(str(word.surface))
52
+ )
53
+ elif tagger == "juman":
54
+ import fugashi
55
+ import jumandic
56
+
57
+ self.tagger = fugashi.GenericTagger(jumandic.MECAB_ARGS)
58
+ self.token_to_surface = lambda word: word.surface
59
+ self.token_to_pos = lambda word: word.feature[0]
60
+ self.token_to_kana = (
61
+ lambda word: word.feature[5]
62
+ if word.feature[5] != "*"
63
+ else jaconv.kata2hira(str(word))
64
+ )
65
+ elif tagger == "sudachi":
66
+ from sudachipy import dictionary as sudachidict
67
+ from sudachipy import tokenizer as sudachitokenizer
68
+
69
+ tokenizer_obj = sudachidict.Dictionary(dict="full").create()
70
+ mode = sudachitokenizer.Tokenizer.SplitMode.C
71
+ self.tagger = lambda s: tokenizer_obj.tokenize(s, mode)
72
+ self.token_to_surface = lambda word: word.surface()
73
+ self.token_to_pos = lambda word: word.part_of_speech()[0]
74
+ self.token_to_kana = lambda word: jaconv.kata2hira(
75
+ utils.standardize_text(str(word.reading_form()))
76
+ )
77
 
78
  def furigana(self, text: str) -> str:
79
  text = utils.standardize_text(text)