Spaces:

passaglia
/

yomikata-demo

Build error

App Files Files Community

Sam Passaglia committed on Feb 20, 2023

Commit

ac462f6

•

1 Parent(s): ba074a5

minor

Browse files

Files changed (1) hide show

yomikata/dictionary.py +48 -49

yomikata/dictionary.py CHANGED Viewed

@@ -3,55 +3,11 @@ dictionary.py
 Provides the Dictionary class which implements Reader using dictionary lookup.
 """
-import fugashi
-import ipadic
-import jaconv
-import jumandic
 from speach import ttlig
-from sudachipy import dictionary as sudachidict
-from sudachipy import tokenizer as sudachitokenizer
 from config.config import ASCII_SPACE_TOKEN
 from yomikata import utils
 from yomikata.reader import Reader
-tokenizer_obj = sudachidict.Dictionary(dict="full").create()
-mode = sudachitokenizer.Tokenizer.SplitMode.C
-taggers = {}
-taggers["ipadic"] = fugashi.GenericTagger(ipadic.MECAB_ARGS)
-taggers["juman"] = fugashi.GenericTagger(jumandic.MECAB_ARGS)
-taggers["unidic"] = fugashi.Tagger()
-taggers["sudachi"] = lambda s: tokenizer_obj.tokenize(s, mode)
-token_to_kana = {
-    "ipadic": lambda word: jaconv.kata2hira(str(word.feature[7]))
-    if len(word.feature) >= 8
-    else jaconv.kata2hira(str(word.surface)),
-    "juman": lambda word: word.feature[5]
-    if word.feature[5] != "*"
-    else jaconv.kata2hira(str(word)),
-    "unidic": lambda word: jaconv.kata2hira(str(word))
-    if (word.feature.kana == "*" or word.feature.kana is None)
-    else jaconv.kata2hira(str(word.feature.kana)),
-    "sudachi": lambda word: jaconv.kata2hira(
-        utils.standardize_text(str(word.reading_form()))
-    ),
-}
-token_to_surface = {
-    "ipadic": lambda word: word.surface,
-    "juman": lambda word: word.surface,
-    "unidic": lambda word: word.surface,
-    "sudachi": lambda word: word.surface(),
-}
-token_to_pos = {
-    "ipadic": lambda word: word.feature[0],
-    "juman": lambda word: word.feature[0],
-    "unidic": lambda word: word.feature.pos1,
-    "sudachi": lambda word: word.part_of_speech()[0],
-}
 class Dictionary(Reader):
@@ -71,10 +27,53 @@ class Dictionary(Reader):
             tagger (str, optional): Tokenizing dictionary to be used. Defaults to `unidic`. `juman`, `ipadic`, `sudachi` also possible.
         """
-        self.tagger = taggers[tagger]
-        self.token_to_kana = token_to_kana[tagger]
-        self.token_to_surface = token_to_surface[tagger]
-        self.token_to_pos = token_to_pos[tagger]
     def furigana(self, text: str) -> str:
         text = utils.standardize_text(text)

 Provides the Dictionary class which implements Reader using dictionary lookup.
 """
 from speach import ttlig
 from config.config import ASCII_SPACE_TOKEN
 from yomikata import utils
 from yomikata.reader import Reader
+import jaconv
 class Dictionary(Reader):
             tagger (str, optional): Tokenizing dictionary to be used. Defaults to `unidic`. `juman`, `ipadic`, `sudachi` also possible.
         """
+        if tagger == "unidic":
+            import fugashi
+            self.tagger = fugashi.Tagger()
+            self.token_to_surface = lambda word: word.surface
+            self.token_to_pos = lambda word: word.feature.pos1
+            self.token_to_kana = (
+                lambda word: jaconv.kata2hira(str(word))
+                if (word.feature.kana == "*" or word.feature.kana is None)
+                else jaconv.kata2hira(str(word.feature.kana))
+            )
+        elif tagger == "ipadic":
+            import fugashi
+            import ipadic
+            self.tagger = fugashi.GenericTagger(ipadic.MECAB_ARGS)
+            self.token_to_surface = lambda word: word.surface
+            self.token_to_pos = lambda word: word.feature[0]
+            self.token_to_kana = (
+                lambda word: jaconv.kata2hira(str(word.feature[7]))
+                if len(word.feature) >= 8
+                else jaconv.kata2hira(str(word.surface))
+            )
+        elif tagger == "juman":
+            import fugashi
+            import jumandic
+            self.tagger = fugashi.GenericTagger(jumandic.MECAB_ARGS)
+            self.token_to_surface = lambda word: word.surface
+            self.token_to_pos = lambda word: word.feature[0]
+            self.token_to_kana = (
+                lambda word: word.feature[5]
+                if word.feature[5] != "*"
+                else jaconv.kata2hira(str(word))
+            )
+        elif tagger == "sudachi":
+            from sudachipy import dictionary as sudachidict
+            from sudachipy import tokenizer as sudachitokenizer
+            tokenizer_obj = sudachidict.Dictionary(dict="full").create()
+            mode = sudachitokenizer.Tokenizer.SplitMode.C
+            self.tagger = lambda s: tokenizer_obj.tokenize(s, mode)
+            self.token_to_surface = lambda word: word.surface()
+            self.token_to_pos = lambda word: word.part_of_speech()[0]
+            self.token_to_kana = lambda word: jaconv.kata2hira(
+                utils.standardize_text(str(word.reading_form()))
+            )
     def furigana(self, text: str) -> str:
         text = utils.standardize_text(text)