Sam Passaglia committed on
Commit
37b2b22
•
1 Parent(s): cb3466d
Files changed (1) hide show
  1. app.py +173 -172
app.py CHANGED
@@ -11,189 +11,190 @@ from yomikata import utils
11
  from yomikata.dictionary import Dictionary
12
  from yomikata.utils import parse_furigana
13
 
 
14
 
15
- @st.cache
16
- def add_border(html: str):
17
- WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
18
- html = html.replace("\n", " ")
19
- return WRAPPER.format(html)
20
-
21
-
22
- def get_random_sentence():
23
- from config.config import TEST_DATA_DIR
24
-
25
- df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
26
- return df.sample(1).iloc[0].sentence
27
-
28
-
29
- @st.cache
30
- def get_dbert_prediction_and_heteronym_list(text):
31
- from yomikata.dbert import dBert
32
-
33
- reader = dBert()
34
- return reader.furigana(text), reader.heteronyms
35
-
36
-
37
- @st.cache
38
- def get_stats():
39
- from config import config
40
- from yomikata.utils import load_dict
41
-
42
- stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
43
-
44
- global_accuracy = stats["test"]["accuracy"]
45
-
46
- stats = stats["test"]["heteronym_performance"]
47
- heteronyms = stats.keys()
48
-
49
- accuracy = [stats[heteronym]["accuracy"] for heteronym in heteronyms]
50
-
51
- readings = [
52
- "ใ€".join(
53
- [
54
- "{reading} ({correct}/{n})".format(
55
- reading=reading,
56
- correct=stats[heteronym]["readings"][reading]["found"][reading],
57
- n=stats[heteronym]["readings"][reading]["n"],
58
- )
59
- for reading in stats[heteronym]["readings"].keys()
60
- if (
61
- stats[heteronym]["readings"][reading]["found"][reading] != 0
62
- or reading != "<OTHER>"
63
- )
64
- ]
65
- )
66
- for heteronym in heteronyms
67
- ]
68
-
69
- # if reading != '<OTHER>'
70
-
71
- df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})
72
 
73
- df = df[df["readings"].str.contains("ใ€")]
74
 
75
- df["readings"] = df["readings"].str.replace("<OTHER>", "Other")
 
76
 
77
- df = df.rename(columns={"readings": "readings (test corr./total)"})
 
78
 
79
- df = df.sort_values("accuracy", ascending=False, ignore_index=True)
80
 
81
- df.index += 1
 
 
82
 
83
- return global_accuracy, df
 
84
 
85
 
86
- @st.cache
87
- def furigana_to_spacy(text_with_furigana):
88
- tokens = parse_furigana(text_with_furigana)
89
- ents = []
90
- output_text = ""
91
- heteronym_count = 0
92
- for token in tokens.groups:
93
- if isinstance(token, ttlig.RubyFrag):
94
- if heteronym_count != 0:
95
- output_text += ", "
96
 
97
- ents.append(
98
- {
99
- "start": len(output_text),
100
- "end": len(output_text) + len(token.text),
101
- "label": token.furi,
102
- }
103
- )
104
 
105
- output_text += token.text
106
- heteronym_count += 1
107
- else:
108
- pass
109
- return {
110
- "text": output_text,
111
- "ents": ents,
112
- "title": None,
113
- }
114
 
 
 
115
 
116
- st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
117
 
118
- # Input text box
119
- st.markdown("Input a Japanese sentence:")
120
-
121
- if "default_sentence" not in st.session_state:
122
- st.session_state.default_sentence = "ใˆใ€{ไบบ้–“/ใซใ‚“ใ’ใ‚“}ใจใ„ใ†ใ‚‚ใฎใ‹ใ„? {ไบบ้–“/ใซใ‚“ใ’ใ‚“}ใจใ„ใ†ใ‚‚ใฎใฏ{่ง’/ใคใฎ}ใฎ{็”Ÿ/ใฏ}ใˆใชใ„ใ€{็”Ÿ็™ฝ/ใชใพใ˜ใ‚}ใ„{้ก”/ใ‹ใŠ}ใ‚„{ๆ‰‹่ถณ/ใฆใ‚ใ—}ใ‚’ใ—ใŸใ€{ไฝ•/ใชใ‚“}ใจใ‚‚ใ„ใ‚ใ‚Œใš{ๆฐ—ๅ‘ณ/ใใฟ}ใฎ{ๆ‚ช/ใ‚ใ‚‹}ใ„ใ‚‚ใฎใ ใ‚ˆใ€‚"
123
-
124
- input_text = st.text_area(
125
- "Input a Japanese sentence:",
126
- utils.remove_furigana(st.session_state.default_sentence),
127
- label_visibility="collapsed",
128
- )
129
-
130
- # Yomikata prediction
131
- dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)
132
-
133
- # spacy-style output for the predictions
134
- colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
135
- spacy_dict = furigana_to_spacy(dbert_prediction)
136
- label_colors = {
137
- reading: colors[i % len(colors)]
138
- for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
139
- }
140
- html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})
141
-
142
- if len(spacy_dict["ents"]) > 0:
143
- st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
144
- st.write(
145
- f"{add_border(html)}",
146
- unsafe_allow_html=True,
147
- )
148
- else:
149
- st.markdown("**Yomikata** found no heteronyms in the input text.")
150
-
151
- # Dictionary + Yomikata prediction
152
- st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")
153
- dictionary = st.radio(
154
- "It can be coupled with a dictionary",
155
- ("sudachi", "unidic", "ipadic", "juman"),
156
- horizontal=True,
157
- label_visibility="collapsed",
158
- )
159
-
160
- dictreader = Dictionary(dictionary)
161
- dictionary_prediction = dictreader.furigana(dbert_prediction)
162
- html = parse_furigana(dictionary_prediction).to_html()
163
- st.write(
164
- f"{add_border(html)}",
165
- unsafe_allow_html=True,
166
- )
167
-
168
- # Dictionary alone prediction
169
- if len(spacy_dict["ents"]) > 0:
170
- dictionary_prediction = dictreader.furigana(utils.remove_furigana(input_text))
171
- html = parse_furigana(dictionary_prediction).to_html()
172
- st.markdown("Without **Yomikata** disambiguation, the dictionary would yield:")
173
- st.write(
174
- f"{add_border(html)}",
175
- unsafe_allow_html=True,
176
- )
177
-
178
- # Randomize button
179
- if st.button("๐ŸŽฒ Randomize the input sentence"):
180
- st.session_state.default_sentence = get_random_sentence()
181
- st.experimental_rerun()
182
-
183
- # Stats section
184
- global_accuracy, stats_df = get_stats()
185
-
186
- st.subheader(
187
- f"{len(stats_df)} heteronyms supported, with a global accuracy of {global_accuracy:.0%}"
188
- )
189
-
190
- st.dataframe(stats_df)
191
-
192
- # Hide the footer
193
- hide_streamlit_style = """
194
- <style>
195
- #MainMenu {visibility: hidden;}
196
- footer {visibility: hidden;}
197
- </style>
198
- """
199
- st.markdown(hide_streamlit_style, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  from yomikata.dictionary import Dictionary
12
  from yomikata.utils import parse_furigana
13
 
14
+ st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
15
 
16
+ # @st.cache
17
+ # def add_border(html: str):
18
+ # WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
19
+ # html = html.replace("\n", " ")
20
+ # return WRAPPER.format(html)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
 
22
 
23
+ # def get_random_sentence():
24
+ # from config.config import TEST_DATA_DIR
25
 
26
+ # df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
27
+ # return df.sample(1).iloc[0].sentence
28
 
 
29
 
30
+ # @st.cache
31
+ # def get_dbert_prediction_and_heteronym_list(text):
32
+ # from yomikata.dbert import dBert
33
 
34
+ # reader = dBert()
35
+ # return reader.furigana(text), reader.heteronyms
36
 
37
 
38
+ # @st.cache
39
+ # def get_stats():
40
+ # from config import config
41
+ # from yomikata.utils import load_dict
 
 
 
 
 
 
42
 
43
+ # stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
 
 
 
 
 
 
44
 
45
+ # global_accuracy = stats["test"]["accuracy"]
 
 
 
 
 
 
 
 
46
 
47
+ # stats = stats["test"]["heteronym_performance"]
48
+ # heteronyms = stats.keys()
49
 
50
+ # accuracy = [stats[heteronym]["accuracy"] for heteronym in heteronyms]
51
 
52
+ # readings = [
53
+ # "ใ€".join(
54
+ # [
55
+ # "{reading} ({correct}/{n})".format(
56
+ # reading=reading,
57
+ # correct=stats[heteronym]["readings"][reading]["found"][reading],
58
+ # n=stats[heteronym]["readings"][reading]["n"],
59
+ # )
60
+ # for reading in stats[heteronym]["readings"].keys()
61
+ # if (
62
+ # stats[heteronym]["readings"][reading]["found"][reading] != 0
63
+ # or reading != "<OTHER>"
64
+ # )
65
+ # ]
66
+ # )
67
+ # for heteronym in heteronyms
68
+ # ]
69
+
70
+ # # if reading != '<OTHER>'
71
+
72
+ # df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})
73
+
74
+ # df = df[df["readings"].str.contains("ใ€")]
75
+
76
+ # df["readings"] = df["readings"].str.replace("<OTHER>", "Other")
77
+
78
+ # df = df.rename(columns={"readings": "readings (test corr./total)"})
79
+
80
+ # df = df.sort_values("accuracy", ascending=False, ignore_index=True)
81
+
82
+ # df.index += 1
83
+
84
+ # return global_accuracy, df
85
+
86
+
87
+ # @st.cache
88
+ # def furigana_to_spacy(text_with_furigana):
89
+ # tokens = parse_furigana(text_with_furigana)
90
+ # ents = []
91
+ # output_text = ""
92
+ # heteronym_count = 0
93
+ # for token in tokens.groups:
94
+ # if isinstance(token, ttlig.RubyFrag):
95
+ # if heteronym_count != 0:
96
+ # output_text += ", "
97
+
98
+ # ents.append(
99
+ # {
100
+ # "start": len(output_text),
101
+ # "end": len(output_text) + len(token.text),
102
+ # "label": token.furi,
103
+ # }
104
+ # )
105
+
106
+ # output_text += token.text
107
+ # heteronym_count += 1
108
+ # else:
109
+ # pass
110
+ # return {
111
+ # "text": output_text,
112
+ # "ents": ents,
113
+ # "title": None,
114
+ # }
115
+
116
+
117
+ # st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
118
+
119
+ # # Input text box
120
+ # st.markdown("Input a Japanese sentence:")
121
+
122
+ # if "default_sentence" not in st.session_state:
123
+ # st.session_state.default_sentence = "ใˆใ€{ไบบ้–“/ใซใ‚“ใ’ใ‚“}ใจใ„ใ†ใ‚‚ใฎใ‹ใ„? {ไบบ้–“/ใซใ‚“ใ’ใ‚“}ใจใ„ใ†ใ‚‚ใฎใฏ{่ง’/ใคใฎ}ใฎ{็”Ÿ/ใฏ}ใˆใชใ„ใ€{็”Ÿ็™ฝ/ใชใพใ˜ใ‚}ใ„{้ก”/ใ‹ใŠ}ใ‚„{ๆ‰‹่ถณ/ใฆใ‚ใ—}ใ‚’ใ—ใŸใ€{ไฝ•/ใชใ‚“}ใจใ‚‚ใ„ใ‚ใ‚Œใš{ๆฐ—ๅ‘ณ/ใใฟ}ใฎ{ๆ‚ช/ใ‚ใ‚‹}ใ„ใ‚‚ใฎใ ใ‚ˆใ€‚"
124
+
125
+ # input_text = st.text_area(
126
+ # "Input a Japanese sentence:",
127
+ # utils.remove_furigana(st.session_state.default_sentence),
128
+ # label_visibility="collapsed",
129
+ # )
130
+
131
+ # # Yomikata prediction
132
+ # dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)
133
+
134
+ # # spacy-style output for the predictions
135
+ # colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
136
+ # spacy_dict = furigana_to_spacy(dbert_prediction)
137
+ # label_colors = {
138
+ # reading: colors[i % len(colors)]
139
+ # for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
140
+ # }
141
+ # html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})
142
+
143
+ # if len(spacy_dict["ents"]) > 0:
144
+ # st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
145
+ # st.write(
146
+ # f"{add_border(html)}",
147
+ # unsafe_allow_html=True,
148
+ # )
149
+ # else:
150
+ # st.markdown("**Yomikata** found no heteronyms in the input text.")
151
+
152
+ # # Dictionary + Yomikata prediction
153
+ # st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")
154
+ # dictionary = st.radio(
155
+ # "It can be coupled with a dictionary",
156
+ # ("sudachi", "unidic", "ipadic", "juman"),
157
+ # horizontal=True,
158
+ # label_visibility="collapsed",
159
+ # )
160
+
161
+ # dictreader = Dictionary(dictionary)
162
+ # dictionary_prediction = dictreader.furigana(dbert_prediction)
163
+ # html = parse_furigana(dictionary_prediction).to_html()
164
+ # st.write(
165
+ # f"{add_border(html)}",
166
+ # unsafe_allow_html=True,
167
+ # )
168
+
169
+ # # Dictionary alone prediction
170
+ # if len(spacy_dict["ents"]) > 0:
171
+ # dictionary_prediction = dictreader.furigana(utils.remove_furigana(input_text))
172
+ # html = parse_furigana(dictionary_prediction).to_html()
173
+ # st.markdown("Without **Yomikata** disambiguation, the dictionary would yield:")
174
+ # st.write(
175
+ # f"{add_border(html)}",
176
+ # unsafe_allow_html=True,
177
+ # )
178
+
179
+ # # Randomize button
180
+ # if st.button("๐ŸŽฒ Randomize the input sentence"):
181
+ # st.session_state.default_sentence = get_random_sentence()
182
+ # st.experimental_rerun()
183
+
184
+ # # Stats section
185
+ # global_accuracy, stats_df = get_stats()
186
+
187
+ # st.subheader(
188
+ # f"{len(stats_df)} heteronyms supported, with a global accuracy of {global_accuracy:.0%}"
189
+ # )
190
+
191
+ # st.dataframe(stats_df)
192
+
193
+ # # Hide the footer
194
+ # hide_streamlit_style = """
195
+ # <style>
196
+ # #MainMenu {visibility: hidden;}
197
+ # footer {visibility: hidden;}
198
+ # </style>
199
+ # """
200
+ # st.markdown(hide_streamlit_style, unsafe_allow_html=True)