Spaces: Build error
Sam Passaglia committed · Commit 9d2f9e8
Parent(s): 65d65b7

fixes

Files changed:
- requirements.txt (+2, -1)
- yomikata/dictionary.py (+116, -3)
requirements.txt CHANGED
@@ -16,4 +16,5 @@ datasets>=2.7.1
 pynvml==11.4.1
 sentencepiece>=0.1.97
 streamlit==1.17.0
-rich
+rich
+altair<5
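The new altair<5 pin is most likely the actual build fix: Altair 5 removed the altair.vegalite.v4 API that Streamlit releases before 1.22 reportedly still import, so an unpinned environment that resolves to Altair 5 fails as soon as Streamlit's charting code loads. Below is a minimal sanity-check sketch for the pinned environment, assuming only the version pins in this requirements.txt (it is not output from the Space):

# Hypothetical check that the resolved environment matches the pins above.
import altair
import streamlit

# Altair 5.x drops APIs that Streamlit 1.17 still relies on, hence the "<5" pin.
assert altair.__version__.startswith("4."), "expected altair<5 per requirements.txt"
assert streamlit.__version__ == "1.17.0", "expected streamlit==1.17.0 per requirements.txt"
print("pinned versions look consistent")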
yomikata/dictionary.py CHANGED
@@ -3,11 +3,16 @@ dictionary.py
 Provides the Dictionary class which implements Reader using dictionary lookup.
 """
 
+from difflib import ndiff
+
+import jaconv
+from chirptext import deko
 from speach import ttlig
-from config.config import ASCII_SPACE_TOKEN
+from speach.ttlig import RubyFrag, RubyToken
+
 from yomikata import utils
+from config.config import ASCII_SPACE_TOKEN
 from yomikata.reader import Reader
-import jaconv
 
 
 class Dictionary(Reader):
@@ -93,6 +98,114 @@ class Dictionary(Reader):
             if (surface == kana) or pos in ["記号", "補助記号", "特殊"]:
                 output += surface
             else:
-                output +=
+                output += Dictionary.furi_to_ruby(surface, kana).to_code()
         output = output.replace(ASCII_SPACE_TOKEN, " ")
         return output
+
+    @staticmethod
+    def furi_to_ruby(surface, kana):
+        """Combine a surface string and a kana string into a RubyToken object with furigana.
+
+        Args:
+            surface (str): Surface string
+            kana (str): Kana string
+
+        Returns:
+            RubyToken: RubyToken object with furigana
+
+        This code is modified from the version in the speach library:
+        https://github.com/neocl/speach/
+        https://github.com/neocl/speach/blob/main/speach/ttlig.py
+        :copyright: (c) 2018 Le Tuan Anh <tuananh.ke@gmail.com>
+        :license: MIT
+        """
+
+        def common_substring_from_right(string1, string2):
+            i = -1  # start from the end of strings
+            while -i <= min(len(string1), len(string2)):
+                if string1[i] != string2[i]:  # if characters don't match, break
+                    break
+                i -= 1  # decrement i to move towards start
+            return string1[i + 1 :] if i != -1 else ""  # return common substring
+
+        def assert_rubytoken_kana_match(ruby: RubyToken, kana: str) -> None:
+            assert (
+                "".join(
+                    [token.furi if isinstance(token, RubyFrag) else token for token in ruby.groups]
+                )
+                == kana
+            )
+
+        original_kana = kana
+
+        final_text = common_substring_from_right(surface, kana)
+
+        if final_text:
+            surface = surface[: -len(final_text)]
+            kana = kana[: -len(final_text)]
+
+        ruby = RubyToken(surface=surface)
+        if deko.is_kana(surface):
+            ruby.append(surface)
+            if final_text:
+                ruby.append(final_text)
+            assert_rubytoken_kana_match(ruby, original_kana)
+            return ruby
+
+        edit_seq = ndiff(surface, kana)
+        kanji = ""
+        text = ""
+        furi = ""
+        before = ""
+        expected = ""
+        for item in edit_seq:
+            if item.startswith("- "):
+                # flush text if needed
+                if expected and kanji and furi:
+                    ruby.append(RubyFrag(text=kanji, furi=furi))
+                    kanji = ""
+                    furi = ""
+                    print(ruby)
+                if text:
+                    ruby.append(text)
+                    text = ""
+                kanji += item[2:]
+            elif item.startswith("+ "):
+                if expected and item[2:] == expected:
+                    if expected and kanji and furi:
+                        ruby.append(RubyFrag(text=kanji, furi=furi))
+                        kanji = ""
+                        furi = ""
+                    ruby.append(item[2:])
+                    expected = ""
+                else:
+                    furi += item[2:]
+            elif item.startswith("  "):
+                if before == "-" and not furi:
+                    # shifting happened
+                    expected = item[2:]
+                    furi += item[2:]
+                else:
+                    text += item[2:]
+                    # flush if possible
+                    if kanji and furi:
+                        ruby.append(RubyFrag(text=kanji, furi=furi))
+                        kanji = ""
+                        furi = ""
+            else:
+                # possible error?
+                pass
+            before = item[0]  # end for
+        if kanji:
+            if furi:
+                ruby.append(RubyFrag(text=kanji, furi=furi))
+            else:
+                ruby.append(kanji)
+        elif text:
+            ruby.append(text)
+
+        if final_text:
+            ruby.append(final_text)
+
+        assert_rubytoken_kana_match(ruby, original_kana)
+        return ruby
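For context on what the new Dictionary.furi_to_ruby helper produces, a short usage sketch follows. The input words are illustrative assumptions, and the expected output strings assume ttlig's {surface/reading} furigana code format produced by to_code(); they are not captured from this Space.

# Usage sketch for the helper added above (illustrative inputs).
from yomikata.dictionary import Dictionary

# Kanji stem + kana ending: the shared trailing kana is split off as plain text,
# and the remaining kanji run is paired with its reading via a character-level ndiff.
ruby = Dictionary.furi_to_ruby("勉強する", "べんきょうする")
print(ruby.to_code())  # expected: {勉強/べんきょう}する

# Single-kanji stem, same mechanism.
print(Dictionary.furi_to_ruby("食べる", "たべる").to_code())  # expected: {食/た}べる

Note that the committed helper still appears to contain a debug print(ruby) in the flush branch that handles shifted readings, so it can write to the Space logs at runtime.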