Spaces:
Build error
Build error
Sam Passaglia
committed on
Commit
•
37b2b22
1
Parent(s):
cb3466d
minor
Browse files
app.py
CHANGED
@@ -11,189 +11,190 @@ from yomikata import utils
|
|
11 |
from yomikata.dictionary import Dictionary
|
12 |
from yomikata.utils import parse_furigana
|
13 |
|
|
|
14 |
|
15 |
-
@st.cache
|
16 |
-
def add_border(html: str):
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
def get_random_sentence():
|
23 |
-
from config.config import TEST_DATA_DIR
|
24 |
-
|
25 |
-
df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
|
26 |
-
return df.sample(1).iloc[0].sentence
|
27 |
-
|
28 |
-
|
29 |
-
@st.cache
|
30 |
-
def get_dbert_prediction_and_heteronym_list(text):
|
31 |
-
from yomikata.dbert import dBert
|
32 |
-
|
33 |
-
reader = dBert()
|
34 |
-
return reader.furigana(text), reader.heteronyms
|
35 |
-
|
36 |
-
|
37 |
-
@st.cache
|
38 |
-
def get_stats():
|
39 |
-
from config import config
|
40 |
-
from yomikata.utils import load_dict
|
41 |
-
|
42 |
-
stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
|
43 |
-
|
44 |
-
global_accuracy = stats["test"]["accuracy"]
|
45 |
-
|
46 |
-
stats = stats["test"]["heteronym_performance"]
|
47 |
-
heteronyms = stats.keys()
|
48 |
-
|
49 |
-
accuracy = [stats[heteronym]["accuracy"] for heteronym in heteronyms]
|
50 |
-
|
51 |
-
readings = [
|
52 |
-
"、".join(
|
53 |
-
[
|
54 |
-
"{reading} ({correct}/{n})".format(
|
55 |
-
reading=reading,
|
56 |
-
correct=stats[heteronym]["readings"][reading]["found"][reading],
|
57 |
-
n=stats[heteronym]["readings"][reading]["n"],
|
58 |
-
)
|
59 |
-
for reading in stats[heteronym]["readings"].keys()
|
60 |
-
if (
|
61 |
-
stats[heteronym]["readings"][reading]["found"][reading] != 0
|
62 |
-
or reading != "<OTHER>"
|
63 |
-
)
|
64 |
-
]
|
65 |
-
)
|
66 |
-
for heteronym in heteronyms
|
67 |
-
]
|
68 |
-
|
69 |
-
# if reading != '<OTHER>'
|
70 |
-
|
71 |
-
df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})
|
72 |
|
73 |
-
df = df[df["readings"].str.contains("、")]
|
74 |
|
75 |
-
|
|
|
76 |
|
77 |
-
|
|
|
78 |
|
79 |
-
df = df.sort_values("accuracy", ascending=False, ignore_index=True)
|
80 |
|
81 |
-
|
|
|
|
|
82 |
|
83 |
-
|
|
|
84 |
|
85 |
|
86 |
-
@st.cache
|
87 |
-
def
|
88 |
-
|
89 |
-
|
90 |
-
output_text = ""
|
91 |
-
heteronym_count = 0
|
92 |
-
for token in tokens.groups:
|
93 |
-
if isinstance(token, ttlig.RubyFrag):
|
94 |
-
if heteronym_count != 0:
|
95 |
-
output_text += ", "
|
96 |
|
97 |
-
|
98 |
-
{
|
99 |
-
"start": len(output_text),
|
100 |
-
"end": len(output_text) + len(token.text),
|
101 |
-
"label": token.furi,
|
102 |
-
}
|
103 |
-
)
|
104 |
|
105 |
-
|
106 |
-
heteronym_count += 1
|
107 |
-
else:
|
108 |
-
pass
|
109 |
-
return {
|
110 |
-
"text": output_text,
|
111 |
-
"ents": ents,
|
112 |
-
"title": None,
|
113 |
-
}
|
114 |
|
|
|
|
|
115 |
|
116 |
-
|
117 |
|
118 |
-
#
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
#
|
131 |
-
|
132 |
-
|
133 |
-
#
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
)
|
167 |
-
|
168 |
-
#
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
#
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
#
|
184 |
-
|
185 |
-
|
186 |
-
st.
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
#
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
from yomikata.dictionary import Dictionary
|
12 |
from yomikata.utils import parse_furigana
|
13 |
|
14 |
+
st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
|
15 |
|
16 |
+
# @st.cache
|
17 |
+
# def add_border(html: str):
|
18 |
+
# WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
|
19 |
+
# html = html.replace("\n", " ")
|
20 |
+
# return WRAPPER.format(html)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
|
|
22 |
|
23 |
+
# def get_random_sentence():
|
24 |
+
# from config.config import TEST_DATA_DIR
|
25 |
|
26 |
+
# df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
|
27 |
+
# return df.sample(1).iloc[0].sentence
|
28 |
|
|
|
29 |
|
30 |
+
# @st.cache
|
31 |
+
# def get_dbert_prediction_and_heteronym_list(text):
|
32 |
+
# from yomikata.dbert import dBert
|
33 |
|
34 |
+
# reader = dBert()
|
35 |
+
# return reader.furigana(text), reader.heteronyms
|
36 |
|
37 |
|
38 |
+
# @st.cache
|
39 |
+
# def get_stats():
|
40 |
+
# from config import config
|
41 |
+
# from yomikata.utils import load_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
+
# stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
+
# global_accuracy = stats["test"]["accuracy"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
+
# stats = stats["test"]["heteronym_performance"]
|
48 |
+
# heteronyms = stats.keys()
|
49 |
|
50 |
+
# accuracy = [stats[heteronym]["accuracy"] for heteronym in heteronyms]
|
51 |
|
52 |
+
# readings = [
|
53 |
+
#         "、".join(
|
54 |
+
# [
|
55 |
+
# "{reading} ({correct}/{n})".format(
|
56 |
+
# reading=reading,
|
57 |
+
# correct=stats[heteronym]["readings"][reading]["found"][reading],
|
58 |
+
# n=stats[heteronym]["readings"][reading]["n"],
|
59 |
+
# )
|
60 |
+
# for reading in stats[heteronym]["readings"].keys()
|
61 |
+
# if (
|
62 |
+
# stats[heteronym]["readings"][reading]["found"][reading] != 0
|
63 |
+
# or reading != "<OTHER>"
|
64 |
+
# )
|
65 |
+
# ]
|
66 |
+
# )
|
67 |
+
# for heteronym in heteronyms
|
68 |
+
# ]
|
69 |
+
|
70 |
+
# # if reading != '<OTHER>'
|
71 |
+
|
72 |
+
# df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})
|
73 |
+
|
74 |
+
#     df = df[df["readings"].str.contains("、")]
|
75 |
+
|
76 |
+
# df["readings"] = df["readings"].str.replace("<OTHER>", "Other")
|
77 |
+
|
78 |
+
# df = df.rename(columns={"readings": "readings (test corr./total)"})
|
79 |
+
|
80 |
+
# df = df.sort_values("accuracy", ascending=False, ignore_index=True)
|
81 |
+
|
82 |
+
# df.index += 1
|
83 |
+
|
84 |
+
# return global_accuracy, df
|
85 |
+
|
86 |
+
|
87 |
+
# @st.cache
|
88 |
+
# def furigana_to_spacy(text_with_furigana):
|
89 |
+
# tokens = parse_furigana(text_with_furigana)
|
90 |
+
# ents = []
|
91 |
+
# output_text = ""
|
92 |
+
# heteronym_count = 0
|
93 |
+
# for token in tokens.groups:
|
94 |
+
# if isinstance(token, ttlig.RubyFrag):
|
95 |
+
# if heteronym_count != 0:
|
96 |
+
# output_text += ", "
|
97 |
+
|
98 |
+
# ents.append(
|
99 |
+
# {
|
100 |
+
# "start": len(output_text),
|
101 |
+
# "end": len(output_text) + len(token.text),
|
102 |
+
# "label": token.furi,
|
103 |
+
# }
|
104 |
+
# )
|
105 |
+
|
106 |
+
# output_text += token.text
|
107 |
+
# heteronym_count += 1
|
108 |
+
# else:
|
109 |
+
# pass
|
110 |
+
# return {
|
111 |
+
# "text": output_text,
|
112 |
+
# "ents": ents,
|
113 |
+
# "title": None,
|
114 |
+
# }
|
115 |
+
|
116 |
+
|
117 |
+
# st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
|
118 |
+
|
119 |
+
# # Input text box
|
120 |
+
# st.markdown("Input a Japanese sentence:")
|
121 |
+
|
122 |
+
# if "default_sentence" not in st.session_state:
|
123 |
+
#     st.session_state.default_sentence = "え、{人間/にんげん}というものかい? {人間/にんげん}というものは{角/つの}の{生/は}えない、{生白/なまじろ}い{顔/かお}や{手足/てあし}をした、{何/なん}ともいわれず{気味/きみ}の{悪/わる}いものだよ。"
|
124 |
+
|
125 |
+
# input_text = st.text_area(
|
126 |
+
# "Input a Japanese sentence:",
|
127 |
+
# utils.remove_furigana(st.session_state.default_sentence),
|
128 |
+
# label_visibility="collapsed",
|
129 |
+
# )
|
130 |
+
|
131 |
+
# # Yomikata prediction
|
132 |
+
# dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)
|
133 |
+
|
134 |
+
# # spacy-style output for the predictions
|
135 |
+
# colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
|
136 |
+
# spacy_dict = furigana_to_spacy(dbert_prediction)
|
137 |
+
# label_colors = {
|
138 |
+
# reading: colors[i % len(colors)]
|
139 |
+
# for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
|
140 |
+
# }
|
141 |
+
# html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})
|
142 |
+
|
143 |
+
# if len(spacy_dict["ents"]) > 0:
|
144 |
+
# st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
|
145 |
+
# st.write(
|
146 |
+
# f"{add_border(html)}",
|
147 |
+
# unsafe_allow_html=True,
|
148 |
+
# )
|
149 |
+
# else:
|
150 |
+
# st.markdown("**Yomikata** found no heteronyms in the input text.")
|
151 |
+
|
152 |
+
# # Dictionary + Yomikata prediction
|
153 |
+
# st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")
|
154 |
+
# dictionary = st.radio(
|
155 |
+
# "It can be coupled with a dictionary",
|
156 |
+
# ("sudachi", "unidic", "ipadic", "juman"),
|
157 |
+
# horizontal=True,
|
158 |
+
# label_visibility="collapsed",
|
159 |
+
# )
|
160 |
+
|
161 |
+
# dictreader = Dictionary(dictionary)
|
162 |
+
# dictionary_prediction = dictreader.furigana(dbert_prediction)
|
163 |
+
# html = parse_furigana(dictionary_prediction).to_html()
|
164 |
+
# st.write(
|
165 |
+
# f"{add_border(html)}",
|
166 |
+
# unsafe_allow_html=True,
|
167 |
+
# )
|
168 |
+
|
169 |
+
# # Dictionary alone prediction
|
170 |
+
# if len(spacy_dict["ents"]) > 0:
|
171 |
+
# dictionary_prediction = dictreader.furigana(utils.remove_furigana(input_text))
|
172 |
+
# html = parse_furigana(dictionary_prediction).to_html()
|
173 |
+
# st.markdown("Without **Yomikata** disambiguation, the dictionary would yield:")
|
174 |
+
# st.write(
|
175 |
+
# f"{add_border(html)}",
|
176 |
+
# unsafe_allow_html=True,
|
177 |
+
# )
|
178 |
+
|
179 |
+
# # Randomize button
|
180 |
+
# if st.button("🎲 Randomize the input sentence"):
|
181 |
+
# st.session_state.default_sentence = get_random_sentence()
|
182 |
+
# st.experimental_rerun()
|
183 |
+
|
184 |
+
# # Stats section
|
185 |
+
# global_accuracy, stats_df = get_stats()
|
186 |
+
|
187 |
+
# st.subheader(
|
188 |
+
# f"{len(stats_df)} heteronyms supported, with a global accuracy of {global_accuracy:.0%}"
|
189 |
+
# )
|
190 |
+
|
191 |
+
# st.dataframe(stats_df)
|
192 |
+
|
193 |
+
# # Hide the footer
|
194 |
+
# hide_streamlit_style = """
|
195 |
+
# <style>
|
196 |
+
# #MainMenu {visibility: hidden;}
|
197 |
+
# footer {visibility: hidden;}
|
198 |
+
# </style>
|
199 |
+
# """
|
200 |
+
# st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|