Spaces: Build error
Sam Passaglia · committed · Commit 9aba307
Parent(s): f73b6d4
initial commit
This view is limited to 50 files because it contains too many changes. See raw diff.
- app.py +178 -0
- config/__pycache__/config.cpython-310.pyc +0 -0
- config/config.py +102 -0
- config/dbert-train-args.json +21 -0
- config/heteronyms.json +559 -0
- config/heteronyms_Sato2022.json +211 -0
- pyproject.toml +65 -0
- requirements.txt +25 -0
- robot_reading.png +0 -0
- stores/dbert/added_tokens.json +64 -0
- stores/dbert/config.json +634 -0
- stores/dbert/heteronyms.json +567 -0
- stores/dbert/label_encoder.json +306 -0
- stores/dbert/pytorch_model.bin +3 -0
- stores/dbert/special_tokens_map.json +7 -0
- stores/dbert/tokenizer_config.json +22 -0
- stores/dbert/training_args.bin +3 -0
- stores/dbert/training_performance.json +0 -0
- stores/dbert/vocab.txt +0 -0
- yomikata/__init__.py +0 -0
- yomikata/__pycache__/__init__.cpython-310.pyc +0 -0
- yomikata/__pycache__/dbert.cpython-310.pyc +0 -0
- yomikata/__pycache__/dictionary.cpython-310.pyc +0 -0
- yomikata/__pycache__/evaluate.cpython-310.pyc +0 -0
- yomikata/__pycache__/main.cpython-310.pyc +0 -0
- yomikata/__pycache__/reader.cpython-310.pyc +0 -0
- yomikata/__pycache__/t5.cpython-310.pyc +0 -0
- yomikata/__pycache__/utils.cpython-310.pyc +0 -0
- yomikata/dataset/__init__.py +0 -0
- yomikata/dataset/__pycache__/__init__.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/aozora.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/bccwj.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/kwdlc.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/ndlbib.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/pronunciations.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/repair_long_vowels.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/split.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/sudachi.cpython-310.pyc +0 -0
- yomikata/dataset/__pycache__/unidic.cpython-310.pyc +0 -0
- yomikata/dataset/aozora.py +117 -0
- yomikata/dataset/bccwj.py +206 -0
- yomikata/dataset/kwdlc.py +109 -0
- yomikata/dataset/ndlbib.py +46 -0
- yomikata/dataset/pronunciations.py +57 -0
- yomikata/dataset/repair_long_vowels.py +62 -0
- yomikata/dataset/split.py +271 -0
- yomikata/dataset/sudachi.py +50 -0
- yomikata/dataset/unidic.py +44 -0
- yomikata/dbert.py +414 -0
- yomikata/dictionary.py +99 -0
app.py
ADDED
@@ -0,0 +1,178 @@
"""app.py
streamlit demo of yomikata"""
import pandas as pd
import spacy
import streamlit as st
from speach import ttlig

from yomikata import utils
from yomikata.dictionary import Dictionary
from yomikata.utils import parse_furigana
from pathlib import Path


@st.cache_data
def add_border(html: str):
    WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
    html = html.replace("\n", " ")
    return WRAPPER.format(html)


def get_random_sentence():
    from config.config import TEST_DATA_DIR

    df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
    return df.sample(1).iloc[0].sentence


@st.cache_data
def get_dbert_prediction_and_heteronym_list(text):
    from yomikata.dbert import dBert

    reader = dBert()
    return reader.furigana(text), reader.heteronyms


@st.cache_data
def get_stats():
    from config import config
    from yomikata.utils import load_dict

    stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))

    global_accuracy = stats["test"]["accuracy"]

    stats = stats["test"]["heteronym_performance"]
    heteronyms = stats.keys()

    accuracy = [stats[heteronym]["accuracy"] for heteronym in heteronyms]

    # one "reading (correct/total)" entry per reading, joined with "、";
    # <OTHER> readings that were never found are skipped
    readings = [
        "、".join(
            [
                "{reading} ({correct}/{n})".format(
                    reading=reading,
                    correct=stats[heteronym]["readings"][reading]["found"][reading],
                    n=stats[heteronym]["readings"][reading]["n"],
                )
                for reading in stats[heteronym]["readings"].keys()
                if stats[heteronym]["readings"][reading]["found"][reading] != 0 or reading != "<OTHER>"
            ]
        )
        for heteronym in heteronyms
    ]

    df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})

    df = df[df["readings"].str.contains("、")]

    df["readings"] = df["readings"].str.replace("<OTHER>", "Other")

    df = df.rename(columns={"readings": "readings (test corr./total)"})

    df = df.sort_values("accuracy", ascending=False, ignore_index=True)

    df.index += 1

    return global_accuracy, df


@st.cache_data
def furigana_to_spacy(text_with_furigana):
    tokens = parse_furigana(text_with_furigana)
    ents = []
    output_text = ""
    heteronym_count = 0
    for token in tokens.groups:
        if isinstance(token, ttlig.RubyFrag):
            if heteronym_count != 0:
                output_text += ", "

            ents.append(
                {
                    "start": len(output_text),
                    "end": len(output_text) + len(token.text),
                    "label": token.furi,
                }
            )

            output_text += token.text
            heteronym_count += 1
        else:
            pass
    return {
        "text": output_text,
        "ents": ents,
        "title": None,
    }


st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")

# Input text box
st.markdown("Input a Japanese sentence:")

if "default_sentence" not in st.session_state:
    st.session_state.default_sentence = "え、{人間/にんげん}というものかい? {人間/にんげん}というものは{角/つの}の{生/は}えない、{生白/なまじろ}い{顔/かお}や{手足/てあし}をした、{何/なん}ともいわれず{気味/きみ}の{悪/わる}いものだよ。"

input_text = st.text_area(
    "Input a Japanese sentence:",
    utils.remove_furigana(st.session_state.default_sentence),
    label_visibility="collapsed",
)

# Yomikata prediction
dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)

# spacy-style output for the predictions
colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
spacy_dict = furigana_to_spacy(dbert_prediction)
label_colors = {
    reading: colors[i % len(colors)]
    for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
}
html = spacy.displacy.render(
    spacy_dict, style="ent", manual=True, options={"colors": label_colors}
)

if len(spacy_dict["ents"]) > 0:
    st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
    st.write(
        f"{add_border(html)}",
        unsafe_allow_html=True,
    )
else:
    st.markdown("**Yomikata** found no heteronyms in the input text.")

# Dictionary + Yomikata prediction
st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")
dictionary = st.radio(
    "It can be coupled with a dictionary",
    ("sudachi", "unidic", "ipadic", "juman"),
    horizontal=True,
    label_visibility="collapsed",
)

dictreader = Dictionary(dictionary)
dictionary_prediction = dictreader.furigana(dbert_prediction)
html = parse_furigana(dictionary_prediction).to_html()
st.write(
    f"{add_border(html)}",
    unsafe_allow_html=True,
)

# Dictionary alone prediction
if len(spacy_dict["ents"]) > 0:
    dictionary_prediction = dictreader.furigana(utils.remove_furigana(input_text))
    html = parse_furigana(dictionary_prediction).to_html()
    st.markdown("Without **Yomikata** disambiguation, the dictionary would yield:")
    st.write(
        f"{add_border(html)}",
        unsafe_allow_html=True,
    )

# Randomize button
if st.button("🎲 Randomize the input sentence"):
    st.session_state.default_sentence = get_random_sentence()
    st.experimental_rerun()

# Stats section
global_accuracy, stats_df = get_stats()

st.subheader(f"{len(stats_df)} heteronyms supported, with a global accuracy of {global_accuracy:.0%}")

st.dataframe(stats_df)

# Hide the footer
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
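Note: the demo's core call is reader.furigana(text) on a yomikata.dbert.dBert instance, which returns the sentence with {surface/reading} furigana annotations on recognized heteronyms. A minimal non-Streamlit sketch of the same API, assuming the yomikata package and the stores/dbert files from this commit are available locally:

# Minimal sketch of the reader API used by the demo above.
from yomikata.dbert import dBert
from yomikata.dictionary import Dictionary

reader = dBert()                    # loads the fine-tuned BERT from stores/dbert
text = "そこの角を曲がってください"
print(reader.furigana(text))        # annotates heteronyms only, e.g. {角/かど}

# Chain a dictionary to fill in readings for the remaining words,
# mirroring what the demo does with dictreader.furigana(dbert_prediction)
dictreader = Dictionary("sudachi")
print(dictreader.furigana(reader.furigana(text)))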
config/__pycache__/config.cpython-310.pyc
ADDED
Binary file (1.95 kB).
config/config.py
ADDED
@@ -0,0 +1,102 @@
# config.py

import json
import logging.config
import sys
from pathlib import Path

import mlflow
from rich.logging import RichHandler

# Base and Config Directories
BASE_DIR = Path(__file__).parent.parent.absolute()
CONFIG_DIR = Path(BASE_DIR, "config")

# Data Directories
RAW_DATA_DIR = Path(BASE_DIR, "raw_data")
SENTENCE_DATA_DIR = Path(BASE_DIR, "sentence_data")
TRAIN_DATA_DIR = Path(SENTENCE_DATA_DIR, "train")
VAL_DATA_DIR = Path(SENTENCE_DATA_DIR, "val")
TEST_DATA_DIR = Path(SENTENCE_DATA_DIR, "test")
READING_DATA_DIR = Path(BASE_DIR, "reading_data")

# Logs Directory
LOGS_DIR = Path(BASE_DIR, "logs")

# Model Storage Directory
STORES_DIR = Path(BASE_DIR, "stores")
RUN_REGISTRY = Path(STORES_DIR, "runs")

# Create dirs
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
SENTENCE_DATA_DIR.mkdir(parents=True, exist_ok=True)
TRAIN_DATA_DIR.mkdir(parents=True, exist_ok=True)
VAL_DATA_DIR.mkdir(parents=True, exist_ok=True)
TEST_DATA_DIR.mkdir(parents=True, exist_ok=True)
READING_DATA_DIR.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(parents=True, exist_ok=True)
STORES_DIR.mkdir(parents=True, exist_ok=True)
RUN_REGISTRY.mkdir(parents=True, exist_ok=True)

# Special tokens reserved
ASCII_SPACE_TOKEN = "\U0000FFFF"  # replaces ordinary space characters before text is sent to mecab, since mecab treats ordinary spaces as word separators

# Seed
SEED = 1271297

# Training parameters
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15
assert TRAIN_SIZE + VAL_SIZE + TEST_SIZE == 1

# Heteronym list
with open(Path(CONFIG_DIR, "heteronyms.json")) as fp:
    HETERONYMS = json.load(fp)

# MLFlow model registry
mlflow.set_tracking_uri("file://" + str(RUN_REGISTRY.absolute()))

# Logger
logging_config = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "minimal": {"format": "%(message)s"},
        "detailed": {
            "format": "%(levelname)s %(asctime)s [%(name)s:%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n"
        },
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "stream": sys.stdout,
            "formatter": "minimal",
            "level": logging.DEBUG,
        },
        "info": {
            "class": "logging.handlers.RotatingFileHandler",
            "filename": Path(LOGS_DIR, "info.log"),
            "maxBytes": 10485760,  # 10 MB
            "backupCount": 10,
            "formatter": "detailed",
            "level": logging.INFO,
        },
        "error": {
            "class": "logging.handlers.RotatingFileHandler",
            "filename": Path(LOGS_DIR, "error.log"),
            "maxBytes": 10485760,  # 10 MB
            "backupCount": 10,
            "formatter": "detailed",
            "level": logging.ERROR,
        },
    },
    "root": {
        "handlers": ["console", "info", "error"],
        "level": logging.INFO,
        "propagate": True,
    },
}
logging.config.dictConfig(logging_config)
logger = logging.getLogger()
logger.handlers[0] = RichHandler(markup=True)
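Note: because this module runs the directory creation and dictConfig at import time, any script that imports it gets the data tree and the Rich-formatted root logger for free. A small sketch of that intended usage (hedged; this mirrors how app.py above pulls TEST_DATA_DIR and STORES_DIR from the module):

# Sketch: importing config.config creates the directories and wires the logger.
from config import config
from config.config import logger, TEST_DATA_DIR

logger.info("test data lives in %s", TEST_DATA_DIR)  # console via RichHandler, plus logs/info.log
logger.error("errors also land in logs/error.log")
print(len(config.HETERONYMS))  # heteronym dict loaded from config/heteronyms.json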
config/dbert-train-args.json
ADDED
@@ -0,0 +1,21 @@
{
    "model": "dBert",
    "dataset": "optimized_strict_heteronyms",
    "experiment": "train-dBert",
    "run": "test",
    "num_train_epochs": 10,
    "evaluation_strategy": "steps",
    "eval_steps": 300,
    "logging_strategy": "steps",
    "logging_steps": 300,
    "save_strategy": "steps",
    "save_steps": 300,
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 128,
    "per_device_eval_batch_size": 128,
    "load_best_model_at_end": true,
    "metric_for_best_model": "loss",
    "weight_decay": 0.01,
    "save_total_limit": 5,
    "report_to": "mlflow"
}
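Note: the keys from num_train_epochs down match Hugging Face TrainingArguments fields, while the first four (model, dataset, experiment, run) look like yomikata-specific bookkeeping. A hedged sketch of how such a file could be split and consumed; the actual consumption logic lives in yomikata/dbert.py, which is not shown in full in this diff, so treat this as an illustration rather than the repo's code:

# Hedged sketch: separate the run metadata from the HF TrainingArguments fields.
import json
from transformers import TrainingArguments

with open("config/dbert-train-args.json") as fp:
    args = json.load(fp)

meta = {k: args.pop(k) for k in ("model", "dataset", "experiment", "run")}
training_args = TrainingArguments(output_dir=f"runs/{meta['run']}", **args)
print(training_args.learning_rate)  # 2e-05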
config/heteronyms.json
ADDED
@@ -0,0 +1,559 @@
{
    "表": {"ひょう": 3349, "おもて": 3034, "あらわ": 2474, "あら": 731},
    "角": {"かく": 4360, "かど": 2303, "つの": 372, "すみ": 70},
    "大分": {"おおいた": 3358, "だいぶ": 797, "だいぶん": 97},
    "国立": {"こくりつ": 19256, "くにたち": 246},
    "人気": {"にんき": 7383, "ひとけ": 149, "じんき": 44},
    "市場": {"しじょう": 85107, "いちば": 781},
    "気質": {"きしつ": 1108, "かたぎ": 398},
    "上方": {"かみがた": 1411, "じょうほう": 656},
    "上手": {"じょうず": 8065, "うま": 706, "かみて": 150, "うわて": 57},
    "下手": {"へた": 849, "したて": 128, "べた": 121, "しもて": 50},
    "仮名": {"かな": 1407, "がな": 129, "かめい": 115},
    "礼拝": {"れいはい": 841, "らいはい": 62},
    "遺言": {"ゆいごん": 3152, "いげん": 67, "いごん": 57},
    "口腔": {"こうこう": 6475, "こうくう": 5577},
    "骨": {"ほね": 10697, "こつ": 5870},
    "一途": {"いちず": 576, "いっと": 139},
    "一言": {"ひとこと": 2567, "いちげん": 133, "いちごん": 106},
    "最中": {"さいちゅう": 520, "さなか": 43},
    "一目": {"ひとめ": 1596, "いちもく": 210},
    "係": {"かか": 14218, "かかわ": 9804, "がかり": 234, "かかり": 227},
    "足跡": {"あしあと": 2626, "そくせき": 1862},
    "今日": {"きょう": 17624, "こんにち": 6772},
    "明日": {"あす": 9824, "あした": 6606, "みょうにち": 66},
    "生物": {"せいぶつ": 26088, "いきもの": 55},
    "変化": {"へんか": 87895, "へんげ": 337},
    "大事": {"だいじ": 5293, "おおごと": 54},
    "大家": {"たいか": 586, "おおや": 238, "たいけ": 79},
    "心中": {"しんじゅう": 1541, "しんちゅう": 250, "しんぢゅう": 127},
    "一行": {"いっこう": 1112, "いちぎょう": 95},
    "一時": {"いちじ": 2649, "いっとき": 381, "いちどき": 47},
    "一方": {"いっぽう": 5327, "ひとかた": 112, "いちほう": 42},
    "一夜": {"いちや": 1148, "ひとよ": 82},
    "下野": {"しもつけ": 530, "げや": 104, "しもの": 57},
    "花弁": {"かべん": 213, "はなびら": 58},
    "玩具": {"がんぐ": 1354, "おもちゃ": 238},
    "強力": {"きょうりょく": 2319, "ごうりき": 51},
    "金色": {"きんいろ": 942, "こんじき": 484},
    "経緯": {"けいい": 7659, "いきさつ": 56},
    "故郷": {"こきょう": 3840, "ふるさと": 506, "くに": 122},
    "紅葉": {"こうよう": 856, "もみじ": 339},
    "根本": {"こんぽん": 2872, "ねもと": 262},
    "山陰": {"さんいん": 2094, "やまかげ": 51},
    "上下": {"じょうげ": 1549, "うえした": 97},
    "身体": {"しんたい": 20301, "からだ": 3375},
    "水面": {"すいめん": 1387, "みなも": 91},
    "世論": {"よろん": 4554, "せろん": 1934},
    "清水": {"しみず": 4114, "きよみず": 98},
    "大手": {"おおて": 6695, "おおで": 119},
    "大人": {"おとな": 11037, "たいじん": 113, "うし": 59},
    "大勢": {"おおぜい": 1290, "たいせい": 398},
    "中間": {"ちゅうかん": 17669, "ちゅうげん": 144},
    "日向": {"ひゅうが": 800, "ひなた": 318},
    "夫婦": {"ふうふ": 9165, "めおと": 354},
    "牧場": {"ぼくじょう": 1913, "まきば": 159},
    "末期": {"まっき": 3569, "まつご": 78},
    "利益": {"りえき": 13434, "りやく": 209},
    "一味": {"いちみ": 442, "ひとあじ": 60},
    "魚": {"さかな": 5857, "うお": 1706, "ぎょ": 413, "ざかな": 50},
    "施行": {"しこう": 18724, "せこう": 70},
    "施工": {"せこう": 25734, "しこう": 48, "せこ": 43},
    "転生": {"てんせい": 911, "てんしょう": 175},
    "博士": {"はくし": 17017, "はかせ": 2462},
    "眼鏡": {"めがね": 2040, "がんきょう": 102},
    "文字": {"もじ": 9583, "もんじ": 633},
    "文書": {"ぶんしょ": 15094, "もんじょ": 5879, "もんしょ": 51},
    "現世": {"げんせい": 192, "げんせ": 125},
    "日中": {"にっちゅう": 12478, "にちじゅう": 117},
    "夜中": {"よなか": 723, "やちゅう": 106},
    "二人": {"ふたり": 22151, "ににん": 256},
    "見物": {"けんぶつ": 1832, "みもの": 61},
    "清浄": {"せいじょう": 800, "しょうじょう": 46},
    "谷間": {"たにま": 1089, "たにあい": 67},
    "追従": {"ついじゅう": 1000, "ついしょう": 73},
    "墓石": {"はかいし": 323, "ぼせき": 257},
    "漢書": {"かんじょ": 171, "かんしょ": 66, "からぶみ": 47},
    "作法": {"さほう": 3905, "さくほう": 427},
    "半月": {"はんつき": 388, "はんげつ": 85},
    "黒子": {"ほくろ": 200, "くろこ": 183},
    "競売": {"けいばい": 937, "きょうばい": 332},
    "開眼": {"かいげん": 338, "かいがん": 144},
    "求道": {"きゅうどう": 379, "ぐどう": 81},
    "施業": {"せぎょう": 602, "しぎょう": 264},
    "借家": {"しゃっか": 505, "しゃくや": 394},
    "法衣": {"ころも": 115, "ほうえ": 87},
    "昨日": {"きのう": 2670, "さくじつ": 713},
    "風車": {"ふうしゃ": 1133, "かざぐるま": 678},
    "寒気": {"かんき": 153, "さむけ": 79},
    "背筋": {"せすじ": 177, "はいきん": 43},
    "逆手": {"さかて": 169, "ぎゃくて": 116},
    "生花": {"いけばな": 283, "せいか": 91},
    "白髪": {"しらが": 313, "はくはつ": 113},
    "一月": {"ひとつき": 301, "いちがつ": 282},
    "一寸": {"ちょっと": 1481, "いっすん": 111},
    "一声": {"ひとこえ": 253, "いっせい": 109},
    "一日": {"いちにち": 1711, "ついたち": 866, "いちじつ": 41},
    "一分": {"いちぶん": 75, "いちぶ": 62},
    "一文": {"いちもん": 86, "いちぶん": 48},
    "何時": {"いつ": 1248, "なんじ": 159, "なんどき": 63},
    "何分": {"なにぶん": 379, "なんぷん": 51},
    "気骨": {"きこつ": 140, "きぼね": 67},
    "銀杏": {"いちょう": 322, "ぎんなん": 85},
    "細々": {"こまごま": 88, "ほそぼそ": 67},
    "細目": {"さいもく": 962, "ほそめ": 123},
    "疾風": {"しっぷう": 544, "はやて": 94, "かぜ": 68},
    "菖蒲": {"しょうぶ": 165, "あやめ": 65},
    "船底": {"せんてい": 246, "ふなぞこ": 80},
    "相乗": {"そうじょう": 732, "あいの": 89},
    "造作": {"ぞうさ": 188, "ぞうさく": 65},
    "頭数": {"あたまかず": 168, "とうすう": 119},
    "二重": {"にじゅう": 5418, "ふたえ": 65},
    "日暮": {"ひぐ": 403, "ひぐれ": 97, "ひぐらし": 81},
    "梅雨": {"つゆ": 471, "ばいう": 284},
    "風穴": {"かざあな": 300, "ふうけつ": 68},
    "分別": {"ふんべつ": 1280, "ぶんべつ": 635},
    "夜話": {"やわ": 2153, "よばなし": 52},
    "野兎": {"やと": 176, "のうさぎ": 43},
    "冷水": {"れいすい": 189, "ひやみず": 153},
    "連中": {"れんじゅう": 853, "れんちゅう": 691},
    "飛沫": {"ひまつ": 223, "しぶき": 96},
    "翡翠": {"ひすい": 177, "かわせみ": 94},
    "一昨日": {"おととい": 208, "いっさくじつ": 71},
    "一昨年": {"おととし": 72, "いっさくねん": 59},
    "十八番": {"じゅうはちばん": 212, "おはこ": 41},
    "明後日": {"あさって": 186, "みょうごにち": 60},
    "石綿": {"いしわた": 1702, "せきめん": 360},
    "公文": {"こうぶん": 196, "くもん": 46},
    "読本": {"どくほん": 12176, "とくほん": 2414, "よみほん": 121},
    "古本": {"ふるほん": 550, "こほん": 109},
    "町家": {"まちや": 655, "ちょうか": 216},
    "米": {"べい": 17392, "こめ": 9021, "まい": 2829, "よね": 620, "ごめ": 164, "めーとる": 112}
}
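Note: the values are per-reading sentence counts in the corpus, so the file also yields a natural baseline, always guessing each heteronym's most frequent reading. A hedged sketch, with the file path as in this commit:

# Sketch: the "always pick the most frequent reading" baseline from the counts above.
import json

with open("config/heteronyms.json") as fp:
    heteronyms = json.load(fp)

# top five heteronyms by corpus frequency, with their majority-reading accuracy
for word, counts in sorted(heteronyms.items(), key=lambda kv: -sum(kv[1].values()))[:5]:
    total = sum(counts.values())
    top_reading, top_count = max(counts.items(), key=lambda kv: kv[1])
    print(f"{word}: majority reading {top_reading}, baseline accuracy {top_count / total:.0%}")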
config/heteronyms_Sato2022.json
ADDED
@@ -0,0 +1,211 @@
{
    "heteronyms_in_bert": {
        "表": 2, "角": 4, "大分": 2, "国立": 2, "人気": 3, "市場": 2, "気質": 2, "役所": 2,
        "上方": 2, "上手": 3, "下手": 3, "人事": 2, "金星": 2, "仮名": 2, "内面": 2, "礼拝": 2,
        "遺言": 3, "口腔": 2, "後世": 2, "骨": 2, "一途": 2, "一言": 3, "最中": 3, "一目": 2,
        "係": 3, "足跡": 2, "今日": 2, "明日": 3, "生物": 3, "変化": 2, "大事": 2, "水車": 2,
        "一見": 2, "一端": 2, "大家": 3, "心中": 2, "書物": 2, "一角": 2, "一行": 3, "一時": 3,
        "一定": 2, "一方": 2, "一夜": 2, "下野": 3, "化学": 2, "火口": 2, "花弁": 2, "玩具": 2,
        "強力": 3, "金色": 2, "経緯": 2, "故郷": 2, "紅葉": 2, "行方": 3, "根本": 2, "左右": 3,
        "山陰": 2, "十分": 2, "上下": 5, "身体": 2, "水面": 2, "世論": 2, "清水": 3, "大手": 2,
        "大人": 4, "大勢": 3, "中間": 5, "日向": 42, "日時": 3, "夫婦": 2, "牧場": 2, "末期": 2,
        "利益": 2, "工夫": 2, "一味": 2, "魚": 3, "区分": 2, "施行": 4, "施工": 2, "転生": 2,
        "博士": 2, "法華": 2, "真面目": 3, "眼鏡": 2, "文字": 2, "文書": 3, "律令": 2, "現世": 2,
        "日中": 2, "夜中": 3, "前世": 2, "二人": 2, "立像": 2
    },
    "heteronyms_not_in_bert": {
        "教化": 3, "見物": 2, "清浄": 2, "谷間": 2, "追従": 2, "墓石": 2, "大文字": 2, "漢書": 2,
        "作法": 2, "兵法": 2, "大人気": 2, "半月": 2, "黒子": 2, "外面": 2, "競売": 2, "開眼": 2,
        "求道": 2, "血脈": 2, "施業": 2, "借家": 2, "頭蓋骨": 2, "法衣": 2, "昨日": 2, "氷柱": 2,
        "風車": 2, "寒気": 2, "背筋": 2, "逆手": 2, "色紙": 2, "生花": 3, "白髪": 2, "貼付": 2,
        "一回": 2, "一期": 2, "一月": 3, "一所": 2, "一寸": 2, "一声": 2, "一石": 2, "一日": 4,
        "一分": 3, "一文": 3, "一片": 3, "何時": 3, "何分": 2, "火煙": 2, "火傷": 2, "火床": 3,
        "火先": 2, "火筒": 2, "芥子": 3, "気骨": 2, "銀杏": 3, "元金": 2, "五分": 2, "後々": 2,
        "後生": 2, "御供": 4, "細々": 3, "細目": 2, "三位": 2, "疾風": 3, "菖蒲": 2, "世人": 2,
        "世路": 2, "船底": 2, "早急": 2, "相乗": 2, "造作": 2, "他言": 2, "東雲": 2, "頭数": 2,
        "二重": 2, "日供": 2, "日次": 4, "日暮": 3, "日来": 3, "梅雨": 2, "風穴": 2, "仏語": 3,
        "分別": 2, "面子": 2, "木目": 2, "目下": 2, "夜直": 2, "夜来": 2, "夜話": 2, "野兎": 2,
        "野馬": 3, "野分": 2, "野辺": 2, "野面": 3, "野立": 3, "冷水": 2, "連中": 2, "飛沫": 2,
        "翡翠": 2, "餃子": 2, "一足": 2, "意気地": 2, "一昨日": 3, "一昨年": 2, "十八番": 2, "十六夜": 2,
        "明後日": 2, "石綿": 2, "公文": 2, "読本": 3, "仏国": 3, "古本": 2, "町家": 2, "遊行": 2
    }
}
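Note: the values here are the number of readings per heteronym, split by whether the surface form already exists in the BERT vocabulary; the second group must be appended as new tokens (see stores/dbert/added_tokens.json below). A quick hedged sketch to summarize the two groups, with the file path as in this commit:

# Sketch: group sizes and total reading counts for the Sato 2022 heteronym list.
import json

with open("config/heteronyms_Sato2022.json") as fp:
    sato = json.load(fp)

for group, entries in sato.items():
    print(group, len(entries), "heteronyms,", sum(entries.values()), "readings in total")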
pyproject.toml
ADDED
@@ -0,0 +1,65 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "yomikata"
version = "0.0.1"
authors = [{name="Sam Passaglia"}]
description = "Japanese kanji disambiguation"
readme = "README.md"
requires-python = ">=3.8"
classifiers = [
    "Programming Language :: Python :: 3",
    "Operating System :: OS Independent",
    "License :: OSI Approved :: MIT License"
]
dynamic = ["dependencies"]

[project.urls]
"Homepage" = "https://github.com/passaglia/yomikata"
"Demo" = "https://huggingface.co/spaces/passaglia/yomikata"
"Bug Tracker" = "https://github.com/passaglia/yomikata/issues"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}

[tool.setuptools]
packages = ["yomikata", "config"]

[tool.flake8]
exclude = "venv"
ignore = ["E203", "E501", "W503", "E226"]
max-line-length = 79
# E501: line too long
# W503: line break occurred before binary operator
# E226: missing whitespace around arithmetic operator
# E203: whitespace before ':'

# iSort
[tool.isort]
profile = "black"
line_length = 79
multi_line_output = 3
include_trailing_comma = true
virtual_env = "venv"

# Black formatting
[tool.black]
line-length = 100
include = '\.pyi?$'
exclude = '''
/(
    .eggs       # exclude a few common directories
  | .git        # in the root of the project
  | .hg
  | .mypy_cache
  | .tox
  | venv
  | _build
  | buck-out
  | build
  | dist
)/
'''
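Note: with dynamic = ["dependencies"] and the [tool.setuptools.dynamic] table, setuptools reads the dependency pins from requirements.txt at build time. A quick way to inspect that wiring (hedged sketch; tomllib ships with Python 3.11+, older versions need the third-party tomli package):

# Sketch: confirm the dynamic-dependencies wiring in this pyproject.toml.
import tomllib  # Python 3.11+; on older versions: import tomli as tomllib

with open("pyproject.toml", "rb") as fp:
    cfg = tomllib.load(fp)

print(cfg["project"]["dynamic"])             # ['dependencies']
print(cfg["tool"]["setuptools"]["dynamic"])  # {'dependencies': {'file': ['requirements.txt']}}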
requirements.txt
ADDED
@@ -0,0 +1,25 @@
numpy==1.24.0
pandas==1.5.2
pretty-errors==1.2.25
fugashi==1.2.1
ipadic==1.0.0
jumandic==1.0.0
jaconv==0.3
fugashi[unidic]  # python -m unidic download
sudachidict_full
scikit-learn==1.2.0
speach==0.1a15.post1
torch==1.13.1
transformers==4.25.1
datasets==2.7.1
pynvml==11.4.1
sentencepiece==0.1.97
typer==0.7.0
rich==12.6.0
unidic-lite
japanize_matplotlib
mlflow-skinny==2.1.1
streamlit==1.18.1
black
flake8
isort
robot_reading.png
ADDED
stores/dbert/added_tokens.json
ADDED
@@ -0,0 +1,64 @@
{
    "一分": 32813, "一声": 32824, "一寸": 32779, "一文": 32798, "一日": 32791,
    "一昨年": 32825, "一昨日": 32822, "一月": 32783, "二重": 32782, "何分": 32772,
    "何時": 32773, "作法": 32816, "借家": 32819, "公文": 32780, "冷水": 32796,
    "分別": 32827, "十八番": 32810, "半月": 32801, "古本": 32805, "墓石": 32814,
    "夜話": 32806, "大文字": 32774, "寒気": 32804, "施業": 32775, "日暮": 32786,
    "明後日": 32808, "昨日": 32788, "梅雨": 32803, "気骨": 32777, "求道": 32784,
    "法衣": 32821, "清浄": 32785, "漢書": 32776, "生花": 32811, "町家": 32797,
    "疾風": 32789, "白髪": 32794, "相乗": 32809, "石綿": 32781, "競売": 32799,
    "細々": 32769, "細目": 32815, "翡翠": 32826, "背筋": 32823, "船底": 32812,
    "菖蒲": 32820, "見物": 32829, "読本": 32795, "谷間": 32800, "追従": 32828,
    "逆手": 32778, "造作": 32818, "連中": 32770, "野兎": 32807, "銀杏": 32768,
    "開眼": 32790, "頭数": 32792, "頭蓋骨": 32817, "風穴": 32802, "風車": 32793,
    "飛沫": 32787, "黒子": 32771
}
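Note: these are the heteronym surfaces that get appended to the tokenizer after the base vocabulary (the ids run 32768-32829, which is why stores/dbert/config.json below reports vocab_size 32830). A hedged sketch of the standard Hugging Face pattern for adding such tokens; the repo's actual training code is in yomikata/dbert.py, not shown in full in this diff:

# Hedged sketch: add heteronym surfaces as single tokens and grow the embeddings.
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v2")
model = AutoModelForTokenClassification.from_pretrained(
    "cl-tohoku/bert-base-japanese-v2", num_labels=302
)

new_tokens = ["銀杏", "細々", "連中"]  # a few of the 62 surfaces above
num_added = tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))  # embedding matrix must match the new vocab
print(num_added, len(tokenizer))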
stores/dbert/config.json
ADDED
@@ -0,0 +1,634 @@
1 |
+
{
|
2 |
+
"_name_or_path": "cl-tohoku/bert-base-japanese-v2",
|
3 |
+
"architectures": [
|
4 |
+
"BertForTokenClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 768,
|
11 |
+
"id2label": {
|
12 |
+
"0": "LABEL_0",
|
13 |
+
"1": "LABEL_1",
|
14 |
+
"2": "LABEL_2",
|
15 |
+
"3": "LABEL_3",
|
16 |
+
"4": "LABEL_4",
|
17 |
+
"5": "LABEL_5",
|
18 |
+
"6": "LABEL_6",
|
19 |
+
"7": "LABEL_7",
|
20 |
+
"8": "LABEL_8",
|
21 |
+
"9": "LABEL_9",
|
22 |
+
"10": "LABEL_10",
|
23 |
+
"11": "LABEL_11",
|
24 |
+
"12": "LABEL_12",
|
25 |
+
"13": "LABEL_13",
|
26 |
+
"14": "LABEL_14",
|
27 |
+
"15": "LABEL_15",
|
28 |
+
"16": "LABEL_16",
|
29 |
+
"17": "LABEL_17",
|
30 |
+
"18": "LABEL_18",
|
31 |
+
"19": "LABEL_19",
|
32 |
+
"20": "LABEL_20",
|
33 |
+
"21": "LABEL_21",
|
34 |
+
"22": "LABEL_22",
|
35 |
+
"23": "LABEL_23",
|
36 |
+
"24": "LABEL_24",
|
37 |
+
"25": "LABEL_25",
|
38 |
+
"26": "LABEL_26",
|
39 |
+
"27": "LABEL_27",
|
40 |
+
"28": "LABEL_28",
|
41 |
+
"29": "LABEL_29",
|
42 |
+
"30": "LABEL_30",
|
43 |
+
"31": "LABEL_31",
|
44 |
+
"32": "LABEL_32",
|
45 |
+
"33": "LABEL_33",
|
46 |
+
"34": "LABEL_34",
|
47 |
+
"35": "LABEL_35",
|
48 |
+
"36": "LABEL_36",
|
49 |
+
"37": "LABEL_37",
|
50 |
+
"38": "LABEL_38",
|
51 |
+
"39": "LABEL_39",
|
52 |
+
"40": "LABEL_40",
|
53 |
+
"41": "LABEL_41",
|
54 |
+
"42": "LABEL_42",
|
55 |
+
"43": "LABEL_43",
|
56 |
+
"44": "LABEL_44",
|
57 |
+
"45": "LABEL_45",
|
58 |
+
"46": "LABEL_46",
|
59 |
+
"47": "LABEL_47",
|
60 |
+
"48": "LABEL_48",
|
61 |
+
"49": "LABEL_49",
|
62 |
+
"50": "LABEL_50",
|
63 |
+
"51": "LABEL_51",
|
64 |
+
"52": "LABEL_52",
|
65 |
+
"53": "LABEL_53",
|
66 |
+
"54": "LABEL_54",
|
67 |
+
"55": "LABEL_55",
|
68 |
+
"56": "LABEL_56",
|
69 |
+
"57": "LABEL_57",
|
70 |
+
"58": "LABEL_58",
|
71 |
+
"59": "LABEL_59",
|
72 |
+
"60": "LABEL_60",
|
73 |
+
"61": "LABEL_61",
|
74 |
+
"62": "LABEL_62",
|
75 |
+
"63": "LABEL_63",
|
76 |
+
"64": "LABEL_64",
|
77 |
+
"65": "LABEL_65",
|
78 |
+
"66": "LABEL_66",
|
79 |
+
"67": "LABEL_67",
|
80 |
+
"68": "LABEL_68",
|
81 |
+
"69": "LABEL_69",
|
82 |
+
"70": "LABEL_70",
|
83 |
+
"71": "LABEL_71",
|
84 |
+
"72": "LABEL_72",
|
85 |
+
"73": "LABEL_73",
|
86 |
+
"74": "LABEL_74",
|
87 |
+
"75": "LABEL_75",
|
88 |
+
"76": "LABEL_76",
|
89 |
+
"77": "LABEL_77",
|
90 |
+
"78": "LABEL_78",
|
91 |
+
"79": "LABEL_79",
|
92 |
+
"80": "LABEL_80",
|
93 |
+
"81": "LABEL_81",
|
94 |
+
"82": "LABEL_82",
|
95 |
+
"83": "LABEL_83",
|
96 |
+
"84": "LABEL_84",
|
97 |
+
"85": "LABEL_85",
|
98 |
+
"86": "LABEL_86",
|
99 |
+
"87": "LABEL_87",
|
100 |
+
"88": "LABEL_88",
|
101 |
+
"89": "LABEL_89",
|
102 |
+
"90": "LABEL_90",
|
103 |
+
"91": "LABEL_91",
|
104 |
+
"92": "LABEL_92",
|
105 |
+
"93": "LABEL_93",
|
106 |
+
"94": "LABEL_94",
|
107 |
+
"95": "LABEL_95",
|
108 |
+
"96": "LABEL_96",
|
109 |
+
"97": "LABEL_97",
|
110 |
+
"98": "LABEL_98",
|
111 |
+
"99": "LABEL_99",
|
112 |
+
"100": "LABEL_100",
|
113 |
+
"101": "LABEL_101",
|
114 |
+
"102": "LABEL_102",
|
115 |
+
"103": "LABEL_103",
|
116 |
+
"104": "LABEL_104",
|
117 |
+
"105": "LABEL_105",
|
118 |
+
"106": "LABEL_106",
|
119 |
+
"107": "LABEL_107",
|
120 |
+
"108": "LABEL_108",
|
121 |
+
"109": "LABEL_109",
|
122 |
+
"110": "LABEL_110",
|
123 |
+
"111": "LABEL_111",
|
124 |
+
"112": "LABEL_112",
|
125 |
+
"113": "LABEL_113",
|
126 |
+
"114": "LABEL_114",
|
127 |
+
"115": "LABEL_115",
|
128 |
+
"116": "LABEL_116",
|
129 |
+
"117": "LABEL_117",
|
130 |
+
"118": "LABEL_118",
|
131 |
+
"119": "LABEL_119",
|
132 |
+
"120": "LABEL_120",
|
133 |
+
"121": "LABEL_121",
|
134 |
+
"122": "LABEL_122",
|
135 |
+
"123": "LABEL_123",
|
136 |
+
"124": "LABEL_124",
|
137 |
+
"125": "LABEL_125",
|
138 |
+
"126": "LABEL_126",
|
139 |
+
"127": "LABEL_127",
|
140 |
+
"128": "LABEL_128",
|
141 |
+
"129": "LABEL_129",
|
142 |
+
"130": "LABEL_130",
|
143 |
+
"131": "LABEL_131",
|
144 |
+
"132": "LABEL_132",
|
145 |
+
"133": "LABEL_133",
|
146 |
+
"134": "LABEL_134",
|
147 |
+
"135": "LABEL_135",
|
148 |
+
"136": "LABEL_136",
|
149 |
+
"137": "LABEL_137",
|
150 |
+
"138": "LABEL_138",
|
151 |
+
"139": "LABEL_139",
|
152 |
+
"140": "LABEL_140",
|
153 |
+
"141": "LABEL_141",
|
154 |
+
"142": "LABEL_142",
|
155 |
+
"143": "LABEL_143",
|
156 |
+
"144": "LABEL_144",
|
157 |
+
"145": "LABEL_145",
|
158 |
+
"146": "LABEL_146",
|
159 |
+
"147": "LABEL_147",
|
160 |
+
"148": "LABEL_148",
|
161 |
+
"149": "LABEL_149",
|
162 |
+
"150": "LABEL_150",
|
163 |
+
"151": "LABEL_151",
|
164 |
+
"152": "LABEL_152",
|
165 |
+
"153": "LABEL_153",
|
166 |
+
"154": "LABEL_154",
|
167 |
+
"155": "LABEL_155",
|
168 |
+
"156": "LABEL_156",
|
169 |
+
"157": "LABEL_157",
|
170 |
+
"158": "LABEL_158",
|
171 |
+
"159": "LABEL_159",
|
172 |
+
"160": "LABEL_160",
|
173 |
+
"161": "LABEL_161",
|
174 |
+
"162": "LABEL_162",
|
175 |
+
"163": "LABEL_163",
|
176 |
+
"164": "LABEL_164",
|
177 |
+
"165": "LABEL_165",
|
178 |
+
"166": "LABEL_166",
|
179 |
+
"167": "LABEL_167",
|
180 |
+
"168": "LABEL_168",
|
181 |
+
"169": "LABEL_169",
|
182 |
+
"170": "LABEL_170",
|
183 |
+
"171": "LABEL_171",
|
184 |
+
"172": "LABEL_172",
|
185 |
+
"173": "LABEL_173",
|
186 |
+
"174": "LABEL_174",
|
187 |
+
"175": "LABEL_175",
|
188 |
+
"176": "LABEL_176",
|
189 |
+
"177": "LABEL_177",
|
190 |
+
"178": "LABEL_178",
|
191 |
+
"179": "LABEL_179",
|
192 |
+
"180": "LABEL_180",
|
193 |
+
"181": "LABEL_181",
|
194 |
+
"182": "LABEL_182",
|
195 |
+
"183": "LABEL_183",
|
196 |
+
"184": "LABEL_184",
|
197 |
+
"185": "LABEL_185",
|
198 |
+
"186": "LABEL_186",
|
199 |
+
"187": "LABEL_187",
|
200 |
+
"188": "LABEL_188",
|
201 |
+
"189": "LABEL_189",
|
202 |
+
"190": "LABEL_190",
|
203 |
+
"191": "LABEL_191",
|
204 |
+
"192": "LABEL_192",
|
205 |
+
"193": "LABEL_193",
|
206 |
+
"194": "LABEL_194",
|
207 |
+
"195": "LABEL_195",
|
208 |
+
"196": "LABEL_196",
|
209 |
+
"197": "LABEL_197",
|
210 |
+
"198": "LABEL_198",
|
211 |
+
"199": "LABEL_199",
|
212 |
+
"200": "LABEL_200",
|
213 |
+
"201": "LABEL_201",
|
214 |
+
"202": "LABEL_202",
|
215 |
+
"203": "LABEL_203",
|
216 |
+
"204": "LABEL_204",
|
217 |
+
"205": "LABEL_205",
|
218 |
+
"206": "LABEL_206",
|
219 |
+
"207": "LABEL_207",
|
220 |
+
"208": "LABEL_208",
|
221 |
+
"209": "LABEL_209",
|
222 |
+
"210": "LABEL_210",
|
223 |
+
"211": "LABEL_211",
|
224 |
+
"212": "LABEL_212",
|
225 |
+
"213": "LABEL_213",
|
226 |
+
"214": "LABEL_214",
|
227 |
+
"215": "LABEL_215",
|
228 |
+
"216": "LABEL_216",
|
229 |
+
"217": "LABEL_217",
|
230 |
+
"218": "LABEL_218",
|
231 |
+
"219": "LABEL_219",
|
232 |
+
"220": "LABEL_220",
|
233 |
+
"221": "LABEL_221",
|
234 |
+
"222": "LABEL_222",
|
235 |
+
"223": "LABEL_223",
|
236 |
+
"224": "LABEL_224",
|
237 |
+
"225": "LABEL_225",
|
238 |
+
"226": "LABEL_226",
|
239 |
+
"227": "LABEL_227",
|
240 |
+
"228": "LABEL_228",
|
241 |
+
"229": "LABEL_229",
|
242 |
+
"230": "LABEL_230",
|
243 |
+
"231": "LABEL_231",
|
244 |
+
"232": "LABEL_232",
|
245 |
+
"233": "LABEL_233",
|
246 |
+
"234": "LABEL_234",
|
247 |
+
"235": "LABEL_235",
|
248 |
+
"236": "LABEL_236",
|
249 |
+
"237": "LABEL_237",
|
250 |
+
"238": "LABEL_238",
|
251 |
+
"239": "LABEL_239",
|
252 |
+
"240": "LABEL_240",
|
253 |
+
"241": "LABEL_241",
|
254 |
+
"242": "LABEL_242",
|
255 |
+
"243": "LABEL_243",
|
256 |
+
"244": "LABEL_244",
|
257 |
+
"245": "LABEL_245",
|
258 |
+
"246": "LABEL_246",
|
259 |
+
"247": "LABEL_247",
|
260 |
+
"248": "LABEL_248",
|
261 |
+
"249": "LABEL_249",
|
262 |
+
"250": "LABEL_250",
|
263 |
+
"251": "LABEL_251",
|
264 |
+
"252": "LABEL_252",
|
265 |
+
"253": "LABEL_253",
|
266 |
+
"254": "LABEL_254",
|
267 |
+
"255": "LABEL_255",
|
268 |
+
"256": "LABEL_256",
|
269 |
+
"257": "LABEL_257",
|
270 |
+
"258": "LABEL_258",
|
271 |
+
"259": "LABEL_259",
|
272 |
+
"260": "LABEL_260",
|
273 |
+
"261": "LABEL_261",
|
274 |
+
"262": "LABEL_262",
|
275 |
+
"263": "LABEL_263",
|
276 |
+
"264": "LABEL_264",
|
277 |
+
"265": "LABEL_265",
|
278 |
+
"266": "LABEL_266",
|
279 |
+
"267": "LABEL_267",
|
280 |
+
"268": "LABEL_268",
|
281 |
+
"269": "LABEL_269",
|
282 |
+
"270": "LABEL_270",
|
283 |
+
"271": "LABEL_271",
|
284 |
+
"272": "LABEL_272",
|
285 |
+
"273": "LABEL_273",
|
286 |
+
"274": "LABEL_274",
|
287 |
+
"275": "LABEL_275",
|
288 |
+
"276": "LABEL_276",
|
289 |
+
"277": "LABEL_277",
|
290 |
+
"278": "LABEL_278",
|
291 |
+
"279": "LABEL_279",
|
292 |
+
"280": "LABEL_280",
|
293 |
+
"281": "LABEL_281",
|
294 |
+
"282": "LABEL_282",
|
295 |
+
"283": "LABEL_283",
|
296 |
+
"284": "LABEL_284",
|
297 |
+
"285": "LABEL_285",
|
298 |
+
"286": "LABEL_286",
|
299 |
+
"287": "LABEL_287",
|
300 |
+
"288": "LABEL_288",
|
301 |
+
"289": "LABEL_289",
|
302 |
+
"290": "LABEL_290",
|
303 |
+
"291": "LABEL_291",
|
304 |
+
"292": "LABEL_292",
|
305 |
+
"293": "LABEL_293",
|
306 |
+
"294": "LABEL_294",
|
307 |
+
"295": "LABEL_295",
|
308 |
+
"296": "LABEL_296",
|
309 |
+
"297": "LABEL_297",
|
310 |
+
"298": "LABEL_298",
|
311 |
+
"299": "LABEL_299",
|
312 |
+
"300": "LABEL_300",
|
313 |
+
"301": "LABEL_301"
|
314 |
+
},
|
315 |
+
"initializer_range": 0.02,
|
316 |
+
"intermediate_size": 3072,
|
317 |
+
"label2id": {
|
318 |
+
"LABEL_0": 0,
|
319 |
+
"LABEL_1": 1,
|
320 |
+
"LABEL_10": 10,
|
321 |
+
"LABEL_100": 100,
|
322 |
+
"LABEL_101": 101,
|
323 |
+
"LABEL_102": 102,
|
324 |
+
"LABEL_103": 103,
|
325 |
+
"LABEL_104": 104,
|
326 |
+
"LABEL_105": 105,
|
327 |
+
"LABEL_106": 106,
|
328 |
+
"LABEL_107": 107,
|
329 |
+
"LABEL_108": 108,
|
330 |
+
"LABEL_109": 109,
|
331 |
+
"LABEL_11": 11,
|
332 |
+
"LABEL_110": 110,
|
333 |
+
"LABEL_111": 111,
|
334 |
+
"LABEL_112": 112,
|
335 |
+
"LABEL_113": 113,
|
336 |
+
"LABEL_114": 114,
|
337 |
+
"LABEL_115": 115,
|
338 |
+
"LABEL_116": 116,
|
339 |
+
"LABEL_117": 117,
|
340 |
+
"LABEL_118": 118,
|
341 |
+
"LABEL_119": 119,
|
342 |
+
"LABEL_12": 12,
|
343 |
+
"LABEL_120": 120,
|
344 |
+
"LABEL_121": 121,
|
345 |
+
"LABEL_122": 122,
|
346 |
+
"LABEL_123": 123,
|
347 |
+
"LABEL_124": 124,
|
348 |
+
"LABEL_125": 125,
|
349 |
+
"LABEL_126": 126,
|
350 |
+
"LABEL_127": 127,
|
351 |
+
"LABEL_128": 128,
|
352 |
+
"LABEL_129": 129,
|
353 |
+
"LABEL_13": 13,
|
354 |
+
"LABEL_130": 130,
|
355 |
+
"LABEL_131": 131,
|
356 |
+
"LABEL_132": 132,
|
357 |
+
"LABEL_133": 133,
|
358 |
+
"LABEL_134": 134,
|
359 |
+
"LABEL_135": 135,
|
360 |
+
"LABEL_136": 136,
|
361 |
+
"LABEL_137": 137,
|
362 |
+
"LABEL_138": 138,
|
363 |
+
"LABEL_139": 139,
|
364 |
+
"LABEL_14": 14,
|
365 |
+
"LABEL_140": 140,
|
366 |
+
"LABEL_141": 141,
|
367 |
+
"LABEL_142": 142,
|
368 |
+
"LABEL_143": 143,
|
369 |
+
"LABEL_144": 144,
|
370 |
+
"LABEL_145": 145,
|
371 |
+
"LABEL_146": 146,
|
372 |
+
"LABEL_147": 147,
|
373 |
+
"LABEL_148": 148,
|
374 |
+
"LABEL_149": 149,
|
375 |
+
"LABEL_15": 15,
|
376 |
+
"LABEL_150": 150,
|
377 |
+
"LABEL_151": 151,
|
378 |
+
"LABEL_152": 152,
|
379 |
+
"LABEL_153": 153,
|
380 |
+
"LABEL_154": 154,
|
381 |
+
"LABEL_155": 155,
|
382 |
+
"LABEL_156": 156,
|
383 |
+
"LABEL_157": 157,
|
384 |
+
"LABEL_158": 158,
|
385 |
+
"LABEL_159": 159,
|
386 |
+
"LABEL_16": 16,
|
387 |
+
"LABEL_160": 160,
|
388 |
+
"LABEL_161": 161,
|
389 |
+
"LABEL_162": 162,
|
390 |
+
"LABEL_163": 163,
|
391 |
+
"LABEL_164": 164,
|
392 |
+
"LABEL_165": 165,
|
393 |
+
"LABEL_166": 166,
|
394 |
+
"LABEL_167": 167,
|
395 |
+
"LABEL_168": 168,
|
396 |
+
"LABEL_169": 169,
|
397 |
+
"LABEL_17": 17,
|
398 |
+
"LABEL_170": 170,
|
399 |
+
"LABEL_171": 171,
|
400 |
+
"LABEL_172": 172,
|
401 |
+
"LABEL_173": 173,
|
402 |
+
"LABEL_174": 174,
|
403 |
+
"LABEL_175": 175,
|
404 |
+
"LABEL_176": 176,
|
405 |
+
"LABEL_177": 177,
|
406 |
+
"LABEL_178": 178,
|
407 |
+
"LABEL_179": 179,
|
408 |
+
"LABEL_18": 18,
|
409 |
+
"LABEL_180": 180,
|
410 |
+
"LABEL_181": 181,
|
411 |
+
"LABEL_182": 182,
|
412 |
+
"LABEL_183": 183,
|
413 |
+
"LABEL_184": 184,
|
414 |
+
"LABEL_185": 185,
|
415 |
+
"LABEL_186": 186,
|
416 |
+
"LABEL_187": 187,
|
417 |
+
"LABEL_188": 188,
|
418 |
+
"LABEL_189": 189,
|
419 |
+
"LABEL_19": 19,
|
420 |
+
"LABEL_190": 190,
|
421 |
+
"LABEL_191": 191,
|
422 |
+
"LABEL_192": 192,
|
423 |
+
"LABEL_193": 193,
|
424 |
+
"LABEL_194": 194,
|
425 |
+
"LABEL_195": 195,
|
426 |
+
"LABEL_196": 196,
|
427 |
+
"LABEL_197": 197,
|
428 |
+
"LABEL_198": 198,
|
429 |
+
"LABEL_199": 199,
|
430 |
+
"LABEL_2": 2,
|
431 |
+
"LABEL_20": 20,
|
432 |
+
"LABEL_200": 200,
|
433 |
+
"LABEL_201": 201,
|
434 |
+
"LABEL_202": 202,
|
435 |
+
"LABEL_203": 203,
|
436 |
+
"LABEL_204": 204,
|
437 |
+
"LABEL_205": 205,
|
438 |
+
"LABEL_206": 206,
|
439 |
+
"LABEL_207": 207,
|
440 |
+
"LABEL_208": 208,
|
441 |
+
"LABEL_209": 209,
|
442 |
+
"LABEL_21": 21,
|
443 |
+
"LABEL_210": 210,
|
444 |
+
"LABEL_211": 211,
|
445 |
+
"LABEL_212": 212,
|
446 |
+
"LABEL_213": 213,
|
447 |
+
"LABEL_214": 214,
|
448 |
+
"LABEL_215": 215,
|
449 |
+
"LABEL_216": 216,
|
450 |
+
"LABEL_217": 217,
|
451 |
+
"LABEL_218": 218,
|
452 |
+
"LABEL_219": 219,
|
453 |
+
"LABEL_22": 22,
|
454 |
+
"LABEL_220": 220,
|
455 |
+
"LABEL_221": 221,
|
456 |
+
"LABEL_222": 222,
|
457 |
+
"LABEL_223": 223,
|
458 |
+
"LABEL_224": 224,
|
459 |
+
"LABEL_225": 225,
|
460 |
+
"LABEL_226": 226,
|
461 |
+
"LABEL_227": 227,
|
462 |
+
"LABEL_228": 228,
|
463 |
+
"LABEL_229": 229,
|
464 |
+
"LABEL_23": 23,
|
465 |
+
"LABEL_230": 230,
|
466 |
+
"LABEL_231": 231,
|
467 |
+
"LABEL_232": 232,
|
468 |
+
"LABEL_233": 233,
|
469 |
+
"LABEL_234": 234,
|
470 |
+
"LABEL_235": 235,
|
471 |
+
"LABEL_236": 236,
|
472 |
+
"LABEL_237": 237,
|
473 |
+
"LABEL_238": 238,
|
474 |
+
"LABEL_239": 239,
|
475 |
+
"LABEL_24": 24,
|
476 |
+
"LABEL_240": 240,
|
477 |
+
"LABEL_241": 241,
|
478 |
+
"LABEL_242": 242,
|
479 |
+
"LABEL_243": 243,
|
480 |
+
"LABEL_244": 244,
|
481 |
+
"LABEL_245": 245,
|
482 |
+
"LABEL_246": 246,
|
483 |
+
"LABEL_247": 247,
|
484 |
+
"LABEL_248": 248,
|
485 |
+
"LABEL_249": 249,
|
486 |
+
"LABEL_25": 25,
|
487 |
+
"LABEL_250": 250,
|
488 |
+
"LABEL_251": 251,
|
489 |
+
"LABEL_252": 252,
|
490 |
+
"LABEL_253": 253,
|
491 |
+
"LABEL_254": 254,
|
492 |
+
"LABEL_255": 255,
|
493 |
+
"LABEL_256": 256,
|
494 |
+
"LABEL_257": 257,
|
495 |
+
"LABEL_258": 258,
|
496 |
+
"LABEL_259": 259,
|
497 |
+
"LABEL_26": 26,
|
498 |
+
"LABEL_260": 260,
|
499 |
+
"LABEL_261": 261,
|
500 |
+
"LABEL_262": 262,
|
501 |
+
"LABEL_263": 263,
|
502 |
+
"LABEL_264": 264,
|
503 |
+
"LABEL_265": 265,
|
504 |
+
"LABEL_266": 266,
|
505 |
+
"LABEL_267": 267,
|
506 |
+
"LABEL_268": 268,
|
507 |
+
"LABEL_269": 269,
|
508 |
+
"LABEL_27": 27,
|
509 |
+
"LABEL_270": 270,
|
510 |
+
"LABEL_271": 271,
|
511 |
+
"LABEL_272": 272,
|
512 |
+
"LABEL_273": 273,
|
513 |
+
"LABEL_274": 274,
|
514 |
+
"LABEL_275": 275,
|
515 |
+
"LABEL_276": 276,
|
516 |
+
"LABEL_277": 277,
|
517 |
+
"LABEL_278": 278,
|
518 |
+
"LABEL_279": 279,
|
519 |
+
"LABEL_28": 28,
|
520 |
+
"LABEL_280": 280,
|
521 |
+
"LABEL_281": 281,
|
522 |
+
"LABEL_282": 282,
|
523 |
+
"LABEL_283": 283,
|
524 |
+
"LABEL_284": 284,
|
525 |
+
"LABEL_285": 285,
|
526 |
+
"LABEL_286": 286,
|
527 |
+
"LABEL_287": 287,
|
528 |
+
"LABEL_288": 288,
|
529 |
+
"LABEL_289": 289,
|
530 |
+
"LABEL_29": 29,
|
531 |
+
"LABEL_290": 290,
|
532 |
+
"LABEL_291": 291,
|
533 |
+
"LABEL_292": 292,
|
534 |
+
"LABEL_293": 293,
|
535 |
+
"LABEL_294": 294,
|
536 |
+
"LABEL_295": 295,
|
537 |
+
"LABEL_296": 296,
|
538 |
+
"LABEL_297": 297,
|
539 |
+
"LABEL_298": 298,
|
540 |
+
"LABEL_299": 299,
|
541 |
+
"LABEL_3": 3,
|
542 |
+
"LABEL_30": 30,
|
543 |
+
"LABEL_300": 300,
|
544 |
+
"LABEL_301": 301,
|
545 |
+
"LABEL_31": 31,
|
546 |
+
"LABEL_32": 32,
|
547 |
+
"LABEL_33": 33,
|
548 |
+
"LABEL_34": 34,
|
549 |
+
"LABEL_35": 35,
|
550 |
+
"LABEL_36": 36,
|
551 |
+
"LABEL_37": 37,
|
552 |
+
"LABEL_38": 38,
|
553 |
+
"LABEL_39": 39,
|
554 |
+
"LABEL_4": 4,
|
555 |
+
"LABEL_40": 40,
|
556 |
+
"LABEL_41": 41,
|
557 |
+
"LABEL_42": 42,
|
558 |
+
"LABEL_43": 43,
|
559 |
+
"LABEL_44": 44,
|
560 |
+
"LABEL_45": 45,
|
561 |
+
"LABEL_46": 46,
|
562 |
+
"LABEL_47": 47,
|
563 |
+
"LABEL_48": 48,
|
564 |
+
"LABEL_49": 49,
|
565 |
+
"LABEL_5": 5,
|
566 |
+
"LABEL_50": 50,
|
567 |
+
"LABEL_51": 51,
|
568 |
+
"LABEL_52": 52,
|
569 |
+
"LABEL_53": 53,
|
570 |
+
"LABEL_54": 54,
|
571 |
+
"LABEL_55": 55,
|
572 |
+
"LABEL_56": 56,
|
573 |
+
"LABEL_57": 57,
|
574 |
+
"LABEL_58": 58,
|
575 |
+
"LABEL_59": 59,
|
576 |
+
"LABEL_6": 6,
|
577 |
+
"LABEL_60": 60,
|
578 |
+
"LABEL_61": 61,
|
579 |
+
"LABEL_62": 62,
|
580 |
+
"LABEL_63": 63,
|
581 |
+
"LABEL_64": 64,
|
582 |
+
"LABEL_65": 65,
|
583 |
+
"LABEL_66": 66,
|
584 |
+
"LABEL_67": 67,
|
585 |
+
"LABEL_68": 68,
|
586 |
+
"LABEL_69": 69,
|
587 |
+
"LABEL_7": 7,
|
588 |
+
"LABEL_70": 70,
|
589 |
+
"LABEL_71": 71,
|
590 |
+
"LABEL_72": 72,
|
591 |
+
"LABEL_73": 73,
|
592 |
+
"LABEL_74": 74,
|
593 |
+
"LABEL_75": 75,
|
594 |
+
"LABEL_76": 76,
|
595 |
+
"LABEL_77": 77,
|
596 |
+
"LABEL_78": 78,
|
597 |
+
"LABEL_79": 79,
|
598 |
+
"LABEL_8": 8,
|
599 |
+
"LABEL_80": 80,
|
600 |
+
"LABEL_81": 81,
|
601 |
+
"LABEL_82": 82,
|
602 |
+
"LABEL_83": 83,
|
603 |
+
"LABEL_84": 84,
|
604 |
+
"LABEL_85": 85,
|
605 |
+
"LABEL_86": 86,
|
606 |
+
"LABEL_87": 87,
|
607 |
+
"LABEL_88": 88,
|
608 |
+
"LABEL_89": 89,
|
609 |
+
"LABEL_9": 9,
|
610 |
+
"LABEL_90": 90,
|
611 |
+
"LABEL_91": 91,
|
612 |
+
"LABEL_92": 92,
|
613 |
+
"LABEL_93": 93,
|
614 |
+
"LABEL_94": 94,
|
615 |
+
"LABEL_95": 95,
|
616 |
+
"LABEL_96": 96,
|
617 |
+
"LABEL_97": 97,
|
618 |
+
"LABEL_98": 98,
|
619 |
+
"LABEL_99": 99
|
620 |
+
},
|
621 |
+
"layer_norm_eps": 1e-12,
|
622 |
+
"max_position_embeddings": 512,
|
623 |
+
"model_type": "bert",
|
624 |
+
"num_attention_heads": 12,
|
625 |
+
"num_hidden_layers": 12,
|
626 |
+
"pad_token_id": 0,
|
627 |
+
"position_embedding_type": "absolute",
|
628 |
+
"tokenizer_class": "BertJapaneseTokenizer",
|
629 |
+
"torch_dtype": "float32",
|
630 |
+
"transformers_version": "4.25.1",
|
631 |
+
"type_vocab_size": 2,
|
632 |
+
"use_cache": true,
|
633 |
+
"vocab_size": 32830
|
634 |
+
}
|
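Note: the label2id entries above are the generic LABEL_N placeholders that transformers writes out; the human-readable class names are stored separately in label_encoder.json below. As a quick consistency check, here is a minimal Python sketch (the stores/dbert paths follow this commit's layout; it assumes the files are available locally):

import json
from pathlib import Path

store = Path("stores/dbert")
config_json = json.loads((store / "config.json").read_text())
label_encoder = json.loads((store / "label_encoder.json").read_text())

# Both files describe the same 302-way classification head:
# LABEL_0 ... LABEL_301 on one side, "surface:reading" classes on the other.
assert len(config_json["label2id"]) == len(label_encoder["class_to_index"]) == 302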
stores/dbert/heteronyms.json
ADDED
@@ -0,0 +1,567 @@
{
  "表": {"ひょう": 3349, "おもて": 3034, "あらわ": 2474, "あら": 731},
  "角": {"かく": 4360, "かど": 2303, "つの": 372, "すみ": 70},
  "大分": {"おおいた": 3358, "だいぶ": 797, "だいぶん": 97},
  "国立": {"こくりつ": 19256, "くにたち": 246},
  "人気": {"にんき": 7383, "ひとけ": 149, "じんき": 44},
  "市場": {"しじょう": 85107, "いちば": 781},
  "気質": {"きしつ": 1108, "かたぎ": 398},
  "上方": {"かみがた": 1411, "じょうほう": 656},
  "上手": {"じょうず": 8065, "うま": 706, "かみて": 150, "うわて": 57},
  "下手": {"へた": 849, "したて": 128, "べた": 121, "しもて": 50},
  "仮名": {"かな": 1407, "がな": 129, "かめい": 115},
  "礼拝": {"れいはい": 841, "らいはい": 62},
  "遺言": {"ゆいごん": 3152, "いげん": 67, "いごん": 57},
  "口腔": {"こうこう": 6475, "こうくう": 5577},
  "骨": {"ほね": 10697, "こつ": 5870},
  "一途": {"いちず": 576, "いっと": 139},
  "一言": {"ひとこと": 2567, "いちげん": 133, "いちごん": 106},
  "最中": {"さいちゅう": 520, "さなか": 43},
  "一目": {"ひとめ": 1596, "いちもく": 210},
  "係": {"かか": 14218, "かかわ": 9804, "がかり": 234, "かかり": 227},
  "足跡": {"あしあと": 2626, "そくせき": 1862},
  "今日": {"きょう": 17624, "こんにち": 6772},
  "明日": {"あす": 9824, "あした": 6606, "みょうにち": 66},
  "生物": {"せいぶつ": 26088, "いきもの": 55},
  "変化": {"へんか": 87895, "へんげ": 337},
  "大事": {"だいじ": 5293, "おおごと": 54},
  "大家": {"たいか": 586, "おおや": 238, "たいけ": 79},
  "心中": {"しんじゅう": 1541, "しんちゅう": 250, "しんぢゅう": 127},
  "一行": {"いっこう": 1112, "いちぎょう": 95},
  "一時": {"いちじ": 2649, "いっとき": 381, "いちどき": 47},
  "一方": {"いっぽう": 5327, "ひとかた": 112, "いちほう": 42},
  "一夜": {"いちや": 1148, "ひとよ": 82},
  "下野": {"しもつけ": 530, "げや": 104, "しもの": 57},
  "花弁": {"かべん": 213, "はなびら": 58},
  "玩具": {"がんぐ": 1354, "おもちゃ": 238},
  "強力": {"きょうりょく": 2319, "ごうりき": 51},
  "金色": {"きんいろ": 942, "こんじき": 484},
  "経緯": {"けいい": 7659, "いきさつ": 56},
  "故郷": {"こきょう": 3840, "ふるさと": 506, "くに": 122},
  "紅葉": {"こうよう": 856, "もみじ": 339},
  "根本": {"こんぽん": 2872, "ねもと": 262},
  "山陰": {"さんいん": 2094, "やまかげ": 51},
  "上下": {"じょうげ": 1549, "うえした": 97},
  "身体": {"しんたい": 20301, "からだ": 3375},
  "水面": {"すいめん": 1387, "みなも": 91},
  "世論": {"よろん": 4554, "せろん": 1934},
  "清水": {"しみず": 4114, "きよみず": 98},
  "大手": {"おおて": 6695, "おおで": 119},
  "大人": {"おとな": 11037, "たいじん": 113, "うし": 59},
  "大勢": {"おおぜい": 1290, "たいせい": 398},
  "中間": {"ちゅうかん": 17669, "ちゅうげん": 144},
  "日向": {"ひゅうが": 800, "ひなた": 318},
  "夫婦": {"ふうふ": 9165, "めおと": 354},
  "牧場": {"ぼくじょう": 1913, "まきば": 159},
  "末期": {"まっき": 3569, "まつご": 78},
  "利益": {"りえき": 13434, "りやく": 209},
  "一味": {"いちみ": 442, "ひとあじ": 60},
  "魚": {"さかな": 5857, "うお": 1706, "ぎょ": 413, "ざかな": 50},
  "施行": {"しこう": 18724, "せこう": 70},
  "施工": {"せこう": 25734, "しこう": 48, "せこ": 43},
  "転生": {"てんせい": 911, "てんしょう": 175},
  "博士": {"はくし": 17017, "はかせ": 2462},
  "眼鏡": {"めがね": 2040, "がんきょう": 102},
  "文字": {"もじ": 9583, "もんじ": 633},
  "文書": {"ぶんしょ": 15094, "もんじょ": 5879, "もんしょ": 51},
  "現世": {"げんせい": 192, "げんせ": 125},
  "日中": {"にっちゅう": 12478, "にちじゅう": 117},
  "夜中": {"よなか": 723, "やちゅう": 106},
  "二人": {"ふたり": 22151, "ににん": 256},
  "見物": {"けんぶつ": 1832, "みもの": 61},
  "清浄": {"せいじょう": 800, "しょうじょう": 46},
  "谷間": {"たにま": 1089, "たにあい": 67},
  "追従": {"ついじゅう": 1000, "ついしょう": 73},
  "墓石": {"はかいし": 323, "ぼせき": 257},
  "大文字": {"おおもじ": 65, "だいもんじ": 46},
  "漢書": {"かんじょ": 171, "かんしょ": 66, "からぶみ": 47},
  "作法": {"さほう": 3905, "さくほう": 427},
  "半月": {"はんつき": 388, "はんげつ": 85},
  "黒子": {"ほくろ": 200, "くろこ": 183},
  "競売": {"けいばい": 937, "きょうばい": 332},
  "開眼": {"かいげん": 338, "かいがん": 144},
  "求道": {"きゅうどう": 379, "ぐどう": 81},
  "施業": {"せぎょう": 602, "しぎょう": 264},
  "借家": {"しゃっか": 505, "しゃくや": 394},
  "頭蓋骨": {"ずがいこつ": 377, "とうがいこつ": 187},
  "法衣": {"ころも": 115, "ほうえ": 87},
  "昨日": {"きのう": 2670, "さくじつ": 713},
  "風車": {"ふうしゃ": 1133, "かざぐるま": 678},
  "寒気": {"かんき": 153, "さむけ": 79},
  "背筋": {"せすじ": 177, "はいきん": 43},
  "逆手": {"さかて": 169, "ぎゃくて": 116},
  "生花": {"いけばな": 283, "せいか": 91},
  "白髪": {"しらが": 313, "はくはつ": 113},
  "一月": {"ひとつき": 301, "いちがつ": 282},
  "一寸": {"ちょっと": 1481, "いっすん": 111},
  "一声": {"ひとこえ": 253, "いっせい": 109},
  "一日": {"いちにち": 1711, "ついたち": 866, "いちじつ": 41},
  "一分": {"いちぶん": 75, "いちぶ": 62},
  "一文": {"いちもん": 86, "いちぶん": 48},
  "何時": {"いつ": 1248, "なんじ": 159, "なんどき": 63},
  "何分": {"なにぶん": 379, "なんぷん": 51},
  "気骨": {"きこつ": 140, "きぼね": 67},
  "銀杏": {"いちょう": 322, "ぎんなん": 85},
  "細々": {"こまごま": 88, "ほそぼそ": 67},
  "細目": {"さいもく": 962, "ほそめ": 123},
  "疾風": {"しっぷう": 544, "はやて": 94, "かぜ": 68},
  "菖蒲": {"しょうぶ": 165, "あやめ": 65},
  "船底": {"せんてい": 246, "ふなぞこ": 80},
  "相乗": {"そうじょう": 732, "あいの": 89},
  "造作": {"ぞうさ": 188, "ぞうさく": 65},
  "頭数": {"あたまかず": 168, "とうすう": 119},
  "二重": {"にじゅう": 5418, "ふたえ": 65},
  "日暮": {"ひぐ": 403, "ひぐれ": 97, "ひぐらし": 81},
  "梅雨": {"つゆ": 471, "ばいう": 284},
  "風穴": {"かざあな": 300, "ふうけつ": 68},
  "分別": {"ふんべつ": 1280, "ぶんべつ": 635},
  "夜話": {"やわ": 2153, "よばなし": 52},
  "野兎": {"やと": 176, "のうさぎ": 43},
  "冷水": {"れいすい": 189, "ひやみず": 153},
  "連中": {"れんじゅう": 853, "れんちゅう": 691},
  "飛沫": {"ひまつ": 223, "しぶき": 96},
  "翡翠": {"ひすい": 177, "かわせみ": 94},
  "一昨日": {"おととい": 208, "いっさくじつ": 71},
  "一昨年": {"おととし": 72, "いっさくねん": 59},
  "十八番": {"じゅうはちばん": 212, "おはこ": 41},
  "明後日": {"あさって": 186, "みょうごにち": 60},
  "石綿": {"いしわた": 1702, "せきめん": 360},
  "公文": {"こうぶん": 196, "くもん": 46},
  "読本": {"どくほん": 12176, "とくほん": 2414, "よみほん": 121},
  "古本": {"ふるほん": 550, "こほん": 109},
  "町家": {"まちや": 655, "ちょうか": 216},
  "米": {"べい": 17392, "こめ": 9021, "まい": 2829, "よね": 620, "ごめ": 164, "めーとる": 112}
}
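The numbers in heteronyms.json appear to be per-reading occurrence counts for each heteronym (presumably tallied from the training corpus; that interpretation is an assumption). A small sketch of one way to use them, e.g. to see which heteronyms a majority-reading baseline would handle worst:

import json

with open("stores/dbert/heteronyms.json") as f:  # path per this commit
    heteronyms = json.load(f)

# Rank heteronyms by how little of the data the most frequent reading covers;
# these are the words where context-dependent disambiguation matters most.
def majority_share(readings):
    return max(readings.values()) / sum(readings.values())

for surface, readings in sorted(heteronyms.items(), key=lambda kv: majority_share(kv[1]))[:5]:
    top = max(readings, key=readings.get)
    print(f"{surface}: majority reading {top} covers {majority_share(readings):.0%}")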
stores/dbert/label_encoder.json
ADDED
@@ -0,0 +1,306 @@
{
  "class_to_index": {
    "<OTHER>": 0,
    "\u4e00\u5206:\u3044\u3061\u3076": 1,
    "\u4e00\u5206:\u3044\u3061\u3076\u3093": 2,
    "\u4e00\u5473:\u3044\u3061\u307f": 3,
    "\u4e00\u5473:\u3072\u3068\u3042\u3058": 4,
    "\u4e00\u58f0:\u3044\u3063\u305b\u3044": 5,
    "\u4e00\u58f0:\u3072\u3068\u3053\u3048": 6,
    "\u4e00\u591c:\u3044\u3061\u3084": 7,
    "\u4e00\u591c:\u3072\u3068\u3088": 8,
    "\u4e00\u5bf8:\u3044\u3063\u3059\u3093": 9,
    "\u4e00\u5bf8:\u3061\u3087\u3063\u3068": 10,
    "\u4e00\u6587:\u3044\u3061\u3076\u3093": 11,
    "\u4e00\u6587:\u3044\u3061\u3082\u3093": 12,
    "\u4e00\u65b9:\u3044\u3061\u307b\u3046": 13,
    "\u4e00\u65b9:\u3044\u3063\u307d\u3046": 14,
    "\u4e00\u65b9:\u3072\u3068\u304b\u305f": 15,
    "\u4e00\u65e5:\u3044\u3061\u3058\u3064": 16,
    "\u4e00\u65e5:\u3044\u3061\u306b\u3061": 17,
    "\u4e00\u65e5:\u3064\u3044\u305f\u3061": 18,
    "\u4e00\u6628\u5e74:\u3044\u3063\u3055\u304f\u306d\u3093": 19,
    "\u4e00\u6628\u5e74:\u304a\u3068\u3068\u3057": 20,
    "\u4e00\u6628\u65e5:\u3044\u3063\u3055\u304f\u3058\u3064": 21,
    "\u4e00\u6628\u65e5:\u304a\u3068\u3068\u3044": 22,
    "\u4e00\u6642:\u3044\u3061\u3058": 23,
    "\u4e00\u6642:\u3044\u3061\u3069\u304d": 24,
    "\u4e00\u6642:\u3044\u3063\u3068\u304d": 25,
    "\u4e00\u6708:\u3044\u3061\u304c\u3064": 26,
    "\u4e00\u6708:\u3072\u3068\u3064\u304d": 27,
    "\u4e00\u76ee:\u3044\u3061\u3082\u304f": 28,
    "\u4e00\u76ee:\u3072\u3068\u3081": 29,
    "\u4e00\u884c:\u3044\u3061\u304e\u3087\u3046": 30,
    "\u4e00\u884c:\u3044\u3063\u3053\u3046": 31,
    "\u4e00\u8a00:\u3044\u3061\u3052\u3093": 32,
    "\u4e00\u8a00:\u3044\u3061\u3054\u3093": 33,
    "\u4e00\u8a00:\u3072\u3068\u3053\u3068": 34,
    "\u4e00\u9014:\u3044\u3061\u305a": 35,
    "\u4e00\u9014:\u3044\u3063\u3068": 36,
    "\u4e0a\u4e0b:\u3046\u3048\u3057\u305f": 37,
    "\u4e0a\u4e0b:\u3058\u3087\u3046\u3052": 38,
    "\u4e0a\u624b:\u3046\u307e": 39,
    "\u4e0a\u624b:\u3046\u308f\u3066": 40,
    "\u4e0a\u624b:\u304b\u307f\u3066": 41,
    "\u4e0a\u624b:\u3058\u3087\u3046\u305a": 42,
    "\u4e0a\u65b9:\u304b\u307f\u304c\u305f": 43,
    "\u4e0a\u65b9:\u3058\u3087\u3046\u307b\u3046": 44,
    "\u4e0b\u624b:\u3057\u305f\u3066": 45,
    "\u4e0b\u624b:\u3057\u3082\u3066": 46,
    "\u4e0b\u624b:\u3078\u305f": 47,
    "\u4e0b\u624b:\u3079\u305f": 48,
    "\u4e0b\u91ce:\u3052\u3084": 49,
    "\u4e0b\u91ce:\u3057\u3082\u3064\u3051": 50,
    "\u4e0b\u91ce:\u3057\u3082\u306e": 51,
    "\u4e16\u8ad6:\u305b\u308d\u3093": 52,
    "\u4e16\u8ad6:\u3088\u308d\u3093": 53,
    "\u4e2d\u9593:\u3061\u3085\u3046\u304b\u3093": 54,
    "\u4e2d\u9593:\u3061\u3085\u3046\u3052\u3093": 55,
    "\u4e8c\u4eba:\u306b\u306b\u3093": 56,
    "\u4e8c\u4eba:\u3075\u305f\u308a": 57,
    "\u4e8c\u91cd:\u306b\u3058\u3085\u3046": 58,
    "\u4e8c\u91cd:\u3075\u305f\u3048": 59,
    "\u4eba\u6c17:\u3058\u3093\u304d": 60,
    "\u4eba\u6c17:\u306b\u3093\u304d": 61,
    "\u4eba\u6c17:\u3072\u3068\u3051": 62,
    "\u4eca\u65e5:\u304d\u3087\u3046": 63,
    "\u4eca\u65e5:\u3053\u3093\u306b\u3061": 64,
    "\u4eee\u540d:\u304b\u306a": 65,
    "\u4eee\u540d:\u304b\u3081\u3044": 66,
    "\u4eee\u540d:\u304c\u306a": 67,
    "\u4f55\u5206:\u306a\u306b\u3076\u3093": 68,
    "\u4f55\u5206:\u306a\u3093\u3077\u3093": 69,
    "\u4f55\u6642:\u3044\u3064": 70,
    "\u4f55\u6642:\u306a\u3093\u3058": 71,
    "\u4f55\u6642:\u306a\u3093\u3069\u304d": 72,
    "\u4f5c\u6cd5:\u3055\u304f\u307b\u3046": 73,
    "\u4f5c\u6cd5:\u3055\u307b\u3046": 74,
    "\u4fc2:\u304b\u304b": 75,
    "\u4fc2:\u304b\u304b\u308a": 76,
    "\u4fc2:\u304b\u304b\u308f": 77,
    "\u4fc2:\u304c\u304b\u308a": 78,
    "\u501f\u5bb6:\u3057\u3083\u304f\u3084": 79,
    "\u501f\u5bb6:\u3057\u3083\u3063\u304b": 80,
    "\u516c\u6587:\u304f\u3082\u3093": 81,
    "\u516c\u6587:\u3053\u3046\u3076\u3093": 82,
    "\u51b7\u6c34:\u3072\u3084\u307f\u305a": 83,
    "\u51b7\u6c34:\u308c\u3044\u3059\u3044": 84,
    "\u5206\u5225:\u3075\u3093\u3079\u3064": 85,
    "\u5206\u5225:\u3076\u3093\u3079\u3064": 86,
    "\u5229\u76ca:\u308a\u3048\u304d": 87,
    "\u5229\u76ca:\u308a\u3084\u304f": 88,
    "\u5341\u516b\u756a:\u304a\u306f\u3053": 89,
    "\u5341\u516b\u756a:\u3058\u3085\u3046\u306f\u3061\u3070\u3093": 90,
    "\u534a\u6708:\u306f\u3093\u3052\u3064": 91,
    "\u534a\u6708:\u306f\u3093\u3064\u304d": 92,
    "\u535a\u58eb:\u306f\u304b\u305b": 93,
    "\u535a\u58eb:\u306f\u304f\u3057": 94,
    "\u53e3\u8154:\u3053\u3046\u304f\u3046": 95,
    "\u53e3\u8154:\u3053\u3046\u3053\u3046": 96,
    "\u53e4\u672c:\u3053\u307b\u3093": 97,
    "\u53e4\u672c:\u3075\u308b\u307b\u3093": 98,
    "\u56fd\u7acb:\u304f\u306b\u305f\u3061": 99,
    "\u56fd\u7acb:\u3053\u304f\u308a\u3064": 100,
    "\u5893\u77f3:\u306f\u304b\u3044\u3057": 101,
    "\u5893\u77f3:\u307c\u305b\u304d": 102,
    "\u5909\u5316:\u3078\u3093\u304b": 103,
    "\u5909\u5316:\u3078\u3093\u3052": 104,
    "\u591c\u4e2d:\u3084\u3061\u3085\u3046": 105,
    "\u591c\u4e2d:\u3088\u306a\u304b": 106,
    "\u591c\u8a71:\u3084\u308f": 107,
    "\u591c\u8a71:\u3088\u3070\u306a\u3057": 108,
    "\u5927\u4e8b:\u304a\u304a\u3054\u3068": 109,
    "\u5927\u4e8b:\u3060\u3044\u3058": 110,
    "\u5927\u4eba:\u3046\u3057": 111,
    "\u5927\u4eba:\u304a\u3068\u306a": 112,
    "\u5927\u4eba:\u305f\u3044\u3058\u3093": 113,
    "\u5927\u5206:\u304a\u304a\u3044\u305f": 114,
    "\u5927\u5206:\u3060\u3044\u3076": 115,
    "\u5927\u5206:\u3060\u3044\u3076\u3093": 116,
    "\u5927\u52e2:\u304a\u304a\u305c\u3044": 117,
    "\u5927\u52e2:\u305f\u3044\u305b\u3044": 118,
    "\u5927\u5bb6:\u304a\u304a\u3084": 119,
    "\u5927\u5bb6:\u305f\u3044\u304b": 120,
    "\u5927\u5bb6:\u305f\u3044\u3051": 121,
    "\u5927\u624b:\u304a\u304a\u3066": 122,
    "\u5927\u624b:\u304a\u304a\u3067": 123,
    "\u5927\u6587\u5b57:\u304a\u304a\u3082\u3058": 124,
    "\u5927\u6587\u5b57:\u3060\u3044\u3082\u3093\u3058": 125,
    "\u592b\u5a66:\u3075\u3046\u3075": 126,
    "\u592b\u5a66:\u3081\u304a\u3068": 127,
    "\u5bd2\u6c17:\u304b\u3093\u304d": 128,
    "\u5bd2\u6c17:\u3055\u3080\u3051": 129,
    "\u5c71\u9670:\u3055\u3093\u3044\u3093": 130,
    "\u5c71\u9670:\u3084\u307e\u304b\u3052": 131,
    "\u5e02\u5834:\u3044\u3061\u3070": 132,
    "\u5e02\u5834:\u3057\u3058\u3087\u3046": 133,
    "\u5f37\u529b:\u304d\u3087\u3046\u308a\u3087\u304f": 134,
    "\u5f37\u529b:\u3054\u3046\u308a\u304d": 135,
    "\u5fc3\u4e2d:\u3057\u3093\u3058\u3085\u3046": 136,
    "\u5fc3\u4e2d:\u3057\u3093\u3061\u3085\u3046": 137,
    "\u5fc3\u4e2d:\u3057\u3093\u3062\u3085\u3046": 138,
    "\u6545\u90f7:\u304f\u306b": 139,
    "\u6545\u90f7:\u3053\u304d\u3087\u3046": 140,
    "\u6545\u90f7:\u3075\u308b\u3055\u3068": 141,
    "\u6587\u5b57:\u3082\u3058": 142,
    "\u6587\u5b57:\u3082\u3093\u3058": 143,
    "\u6587\u66f8:\u3076\u3093\u3057\u3087": 144,
    "\u6587\u66f8:\u3082\u3093\u3057\u3087": 145,
    "\u6587\u66f8:\u3082\u3093\u3058\u3087": 146,
    "\u65bd\u5de5:\u3057\u3053\u3046": 147,
    "\u65bd\u5de5:\u305b\u3053": 148,
    "\u65bd\u5de5:\u305b\u3053\u3046": 149,
    "\u65bd\u696d:\u3057\u304e\u3087\u3046": 150,
    "\u65bd\u696d:\u305b\u304e\u3087\u3046": 151,
    "\u65bd\u884c:\u3057\u3053\u3046": 152,
    "\u65bd\u884c:\u305b\u3053\u3046": 153,
    "\u65e5\u4e2d:\u306b\u3061\u3058\u3085\u3046": 154,
    "\u65e5\u4e2d:\u306b\u3063\u3061\u3085\u3046": 155,
    "\u65e5\u5411:\u3072\u306a\u305f": 156,
    "\u65e5\u5411:\u3072\u3085\u3046\u304c": 157,
    "\u65e5\u66ae:\u3072\u3050": 158,
    "\u65e5\u66ae:\u3072\u3050\u3089\u3057": 159,
    "\u65e5\u66ae:\u3072\u3050\u308c": 160,
    "\u660e\u5f8c\u65e5:\u3042\u3055\u3063\u3066": 161,
    "\u660e\u5f8c\u65e5:\u307f\u3087\u3046\u3054\u306b\u3061": 162,
    "\u660e\u65e5:\u3042\u3057\u305f": 163,
    "\u660e\u65e5:\u3042\u3059": 164,
    "\u660e\u65e5:\u307f\u3087\u3046\u306b\u3061": 165,
    "\u6628\u65e5:\u304d\u306e\u3046": 166,
    "\u6628\u65e5:\u3055\u304f\u3058\u3064": 167,
    "\u6700\u4e2d:\u3055\u3044\u3061\u3085\u3046": 168,
    "\u6700\u4e2d:\u3055\u306a\u304b": 169,
    "\u672b\u671f:\u307e\u3063\u304d": 170,
    "\u672b\u671f:\u307e\u3064\u3054": 171,
    "\u6839\u672c:\u3053\u3093\u307d\u3093": 172,
    "\u6839\u672c:\u306d\u3082\u3068": 173,
    "\u6885\u96e8:\u3064\u3086": 174,
    "\u6885\u96e8:\u3070\u3044\u3046": 175,
    "\u6c17\u8cea:\u304b\u305f\u304e": 176,
    "\u6c17\u8cea:\u304d\u3057\u3064": 177,
    "\u6c17\u9aa8:\u304d\u3053\u3064": 178,
    "\u6c17\u9aa8:\u304d\u307c\u306d": 179,
    "\u6c34\u9762:\u3059\u3044\u3081\u3093": 180,
    "\u6c34\u9762:\u307f\u306a\u3082": 181,
    "\u6c42\u9053:\u304d\u3085\u3046\u3069\u3046": 182,
    "\u6c42\u9053:\u3050\u3069\u3046": 183,
    "\u6cd5\u8863:\u3053\u308d\u3082": 184,
    "\u6cd5\u8863:\u307b\u3046\u3048": 185,
    "\u6e05\u6c34:\u304d\u3088\u307f\u305a": 186,
    "\u6e05\u6c34:\u3057\u307f\u305a": 187,
    "\u6e05\u6d44:\u3057\u3087\u3046\u3058\u3087\u3046": 188,
    "\u6e05\u6d44:\u305b\u3044\u3058\u3087\u3046": 189,
    "\u6f22\u66f8:\u304b\u3089\u3076\u307f": 190,
    "\u6f22\u66f8:\u304b\u3093\u3057\u3087": 191,
    "\u6f22\u66f8:\u304b\u3093\u3058\u3087": 192,
    "\u7267\u5834:\u307c\u304f\u3058\u3087\u3046": 193,
    "\u7267\u5834:\u307e\u304d\u3070": 194,
    "\u73a9\u5177:\u304a\u3082\u3061\u3083": 195,
    "\u73a9\u5177:\u304c\u3093\u3050": 196,
    "\u73fe\u4e16:\u3052\u3093\u305b": 197,
    "\u73fe\u4e16:\u3052\u3093\u305b\u3044": 198,
    "\u751f\u7269:\u3044\u304d\u3082\u306e": 199,
    "\u751f\u7269:\u305b\u3044\u3076\u3064": 200,
    "\u751f\u82b1:\u3044\u3051\u3070\u306a": 201,
    "\u751f\u82b1:\u305b\u3044\u304b": 202,
    "\u753a\u5bb6:\u3061\u3087\u3046\u304b": 203,
    "\u753a\u5bb6:\u307e\u3061\u3084": 204,
    "\u75be\u98a8:\u304b\u305c": 205,
    "\u75be\u98a8:\u3057\u3063\u3077\u3046": 206,
    "\u75be\u98a8:\u306f\u3084\u3066": 207,
    "\u767d\u9aea:\u3057\u3089\u304c": 208,
    "\u767d\u9aea:\u306f\u304f\u306f\u3064": 209,
    "\u76f8\u4e57:\u3042\u3044\u306e": 210,
    "\u76f8\u4e57:\u305d\u3046\u3058\u3087\u3046": 211,
    "\u773c\u93e1:\u304c\u3093\u304d\u3087\u3046": 212,
    "\u773c\u93e1:\u3081\u304c\u306d": 213,
    "\u77f3\u7dbf:\u3044\u3057\u308f\u305f": 214,
    "\u77f3\u7dbf:\u305b\u304d\u3081\u3093": 215,
    "\u793c\u62dd:\u3089\u3044\u306f\u3044": 216,
    "\u793c\u62dd:\u308c\u3044\u306f\u3044": 217,
    "\u7af6\u58f2:\u304d\u3087\u3046\u3070\u3044": 218,
    "\u7af6\u58f2:\u3051\u3044\u3070\u3044": 219,
    "\u7c73:\u3053\u3081": 220,
    "\u7c73:\u3054\u3081": 221,
    "\u7c73:\u3079\u3044": 222,
    "\u7c73:\u307e\u3044": 223,
    "\u7c73:\u3081\u30fc\u3068\u308b": 224,
    "\u7c73:\u3088\u306d": 225,
    "\u7d05\u8449:\u3053\u3046\u3088\u3046": 226,
    "\u7d05\u8449:\u3082\u307f\u3058": 227,
    "\u7d30\u3005:\u3053\u307e\u3054\u307e": 228,
    "\u7d30\u3005:\u307b\u305d\u307c\u305d": 229,
    "\u7d30\u76ee:\u3055\u3044\u3082\u304f": 230,
    "\u7d30\u76ee:\u307b\u305d\u3081": 231,
    "\u7d4c\u7def:\u3044\u304d\u3055\u3064": 232,
    "\u7d4c\u7def:\u3051\u3044\u3044": 233,
    "\u7fe1\u7fe0:\u304b\u308f\u305b\u307f": 234,
    "\u7fe1\u7fe0:\u3072\u3059\u3044": 235,
    "\u80cc\u7b4b:\u305b\u3059\u3058": 236,
    "\u80cc\u7b4b:\u306f\u3044\u304d\u3093": 237,
    "\u8239\u5e95:\u305b\u3093\u3066\u3044": 238,
    "\u8239\u5e95:\u3075\u306a\u305e\u3053": 239,
    "\u82b1\u5f01:\u304b\u3079\u3093": 240,
    "\u82b1\u5f01:\u306f\u306a\u3073\u3089": 241,
    "\u83d6\u84b2:\u3042\u3084\u3081": 242,
    "\u83d6\u84b2:\u3057\u3087\u3046\u3076": 243,
    "\u8868:\u3042\u3089": 244,
    "\u8868:\u3042\u3089\u308f": 245,
    "\u8868:\u304a\u3082\u3066": 246,
    "\u8868:\u3072\u3087\u3046": 247,
    "\u898b\u7269:\u3051\u3093\u3076\u3064": 248,
    "\u898b\u7269:\u307f\u3082\u306e": 249,
    "\u89d2:\u304b\u304f": 250,
    "\u89d2:\u304b\u3069": 251,
    "\u89d2:\u3059\u307f": 252,
    "\u89d2:\u3064\u306e": 253,
    "\u8aad\u672c:\u3068\u304f\u307b\u3093": 254,
    "\u8aad\u672c:\u3069\u304f\u307b\u3093": 255,
    "\u8aad\u672c:\u3088\u307f\u307b\u3093": 256,
    "\u8c37\u9593:\u305f\u306b\u3042\u3044": 257,
    "\u8c37\u9593:\u305f\u306b\u307e": 258,
    "\u8db3\u8de1:\u3042\u3057\u3042\u3068": 259,
    "\u8db3\u8de1:\u305d\u304f\u305b\u304d": 260,
    "\u8eab\u4f53:\u304b\u3089\u3060": 261,
    "\u8eab\u4f53:\u3057\u3093\u305f\u3044": 262,
    "\u8ee2\u751f:\u3066\u3093\u3057\u3087\u3046": 263,
    "\u8ee2\u751f:\u3066\u3093\u305b\u3044": 264,
    "\u8ffd\u5f93:\u3064\u3044\u3057\u3087\u3046": 265,
    "\u8ffd\u5f93:\u3064\u3044\u3058\u3085\u3046": 266,
    "\u9006\u624b:\u304e\u3083\u304f\u3066": 267,
    "\u9006\u624b:\u3055\u304b\u3066": 268,
    "\u9020\u4f5c:\u305e\u3046\u3055": 269,
    "\u9020\u4f5c:\u305e\u3046\u3055\u304f": 270,
    "\u9023\u4e2d:\u308c\u3093\u3058\u3085\u3046": 271,
    "\u9023\u4e2d:\u308c\u3093\u3061\u3085\u3046": 272,
    "\u907a\u8a00:\u3044\u3052\u3093": 273,
    "\u907a\u8a00:\u3044\u3054\u3093": 274,
    "\u907a\u8a00:\u3086\u3044\u3054\u3093": 275,
    "\u91ce\u514e:\u306e\u3046\u3055\u304e": 276,
    "\u91ce\u514e:\u3084\u3068": 277,
    "\u91d1\u8272:\u304d\u3093\u3044\u308d": 278,
    "\u91d1\u8272:\u3053\u3093\u3058\u304d": 279,
    "\u9280\u674f:\u3044\u3061\u3087\u3046": 280,
    "\u9280\u674f:\u304e\u3093\u306a\u3093": 281,
    "\u958b\u773c:\u304b\u3044\u304c\u3093": 282,
    "\u958b\u773c:\u304b\u3044\u3052\u3093": 283,
    "\u982d\u6570:\u3042\u305f\u307e\u304b\u305a": 284,
    "\u982d\u6570:\u3068\u3046\u3059\u3046": 285,
    "\u982d\u84cb\u9aa8:\u305a\u304c\u3044\u3053\u3064": 286,
    "\u982d\u84cb\u9aa8:\u3068\u3046\u304c\u3044\u3053\u3064": 287,
    "\u98a8\u7a74:\u304b\u3056\u3042\u306a": 288,
    "\u98a8\u7a74:\u3075\u3046\u3051\u3064": 289,
    "\u98a8\u8eca:\u304b\u3056\u3050\u308b\u307e": 290,
    "\u98a8\u8eca:\u3075\u3046\u3057\u3083": 291,
    "\u98db\u6cab:\u3057\u3076\u304d": 292,
    "\u98db\u6cab:\u3072\u307e\u3064": 293,
    "\u9aa8:\u3053\u3064": 294,
    "\u9aa8:\u307b\u306d": 295,
    "\u9b5a:\u3046\u304a": 296,
    "\u9b5a:\u304e\u3087": 297,
    "\u9b5a:\u3055\u304b\u306a": 298,
    "\u9b5a:\u3056\u304b\u306a": 299,
    "\u9ed2\u5b50:\u304f\u308d\u3053": 300,
    "\u9ed2\u5b50:\u307b\u304f\u308d": 301
  }
}
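The \uXXXX sequences above are ordinary JSON unicode escapes; json.load decodes them back to "surface:reading" strings (e.g. index 247 is 表:ひょう). A minimal sketch for mapping the model's output indices to readings:

import json

with open("stores/dbert/label_encoder.json") as f:
    class_to_index = json.load(f)["class_to_index"]

# Invert the mapping; json.load has already decoded the escapes.
index_to_class = {i: c for c, i in class_to_index.items()}
print(index_to_class[247])  # -> 表:ひょう
surface, reading = index_to_class[247].split(":")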
stores/dbert/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:da84a3d4be38191f4485086a8b4c7013a2ab33cf2c7d20df6c3fdfe0092041af
size 443657837
stores/dbert/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
stores/dbert/tokenizer_config.json
ADDED
@@ -0,0 +1,22 @@
{
  "cls_token": "[CLS]",
  "do_lower_case": false,
  "do_subword_tokenize": true,
  "do_word_tokenize": true,
  "jumanpp_kwargs": null,
  "mask_token": "[MASK]",
  "mecab_kwargs": {
    "mecab_dic": "unidic_lite"
  },
  "model_max_length": 1000000000000000019884624838656,
  "name_or_path": "cl-tohoku/bert-base-japanese-v2",
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "special_tokens_map_file": null,
  "subword_tokenizer_type": "wordpiece",
  "sudachi_kwargs": null,
  "tokenizer_class": "BertJapaneseTokenizer",
  "unk_token": "[UNK]",
  "word_tokenizer_type": "mecab"
}
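This tokenizer config resolves to BertJapaneseTokenizer, inherited from cl-tohoku/bert-base-japanese-v2: MeCab word tokenization over the unidic_lite dictionary, then WordPiece subwords. A hedged loading sketch (it assumes the fugashi and unidic-lite packages are installed, which transformers needs for this tokenizer class):

from transformers import AutoTokenizer

# Loads vocab.txt, tokenizer_config.json and special_tokens_map.json from the store.
tokenizer = AutoTokenizer.from_pretrained("stores/dbert")
print(tokenizer.tokenize("日本語を分かち書きする"))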
stores/dbert/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aee0041b2ad4b019fea4db1f8aabd34b0081878cae2c17395657331db1adbb70
size 3579
stores/dbert/training_performance.json
ADDED
The diff for this file is too large to render.
stores/dbert/vocab.txt
ADDED
The diff for this file is too large to render.
yomikata/__init__.py
ADDED
File without changes
yomikata/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (155 Bytes).
yomikata/__pycache__/dbert.cpython-310.pyc
ADDED
Binary file (10.9 kB).
yomikata/__pycache__/dictionary.cpython-310.pyc
ADDED
Binary file (4 kB).
yomikata/__pycache__/evaluate.cpython-310.pyc
ADDED
Binary file (4.98 kB).
yomikata/__pycache__/main.cpython-310.pyc
ADDED
Binary file (2.84 kB).
yomikata/__pycache__/reader.cpython-310.pyc
ADDED
Binary file (774 Bytes).
yomikata/__pycache__/t5.cpython-310.pyc
ADDED
Binary file (5.18 kB).
yomikata/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (12.1 kB).
yomikata/dataset/__init__.py
ADDED
File without changes
yomikata/dataset/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (163 Bytes).
yomikata/dataset/__pycache__/aozora.cpython-310.pyc
ADDED
Binary file (2.99 kB).
yomikata/dataset/__pycache__/bccwj.cpython-310.pyc
ADDED
Binary file (5.31 kB).
yomikata/dataset/__pycache__/kwdlc.cpython-310.pyc
ADDED
Binary file (2.47 kB).
yomikata/dataset/__pycache__/ndlbib.cpython-310.pyc
ADDED
Binary file (1.3 kB).
yomikata/dataset/__pycache__/pronunciations.cpython-310.pyc
ADDED
Binary file (1.44 kB).
yomikata/dataset/__pycache__/repair_long_vowels.cpython-310.pyc
ADDED
Binary file (2.13 kB).
yomikata/dataset/__pycache__/split.cpython-310.pyc
ADDED
Binary file (8.08 kB).
yomikata/dataset/__pycache__/sudachi.cpython-310.pyc
ADDED
Binary file (1.15 kB).
yomikata/dataset/__pycache__/unidic.cpython-310.pyc
ADDED
Binary file (1.27 kB).
yomikata/dataset/aozora.py
ADDED
@@ -0,0 +1,117 @@
"""aozora.py
Data processing script for aozora bunko file from https://github.com/ndl-lab/huriganacorpus-aozora
"""

import warnings
from pathlib import Path

import pandas as pd
from pandas.errors import ParserError
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils
from yomikata.dataset.repair_long_vowels import repair_long_vowels

warnings.filterwarnings("ignore")


def read_file(file: str):
    # logger.info("reading file")
    with open(file) as f:
        rows = [
            line.rstrip("\n").rstrip("\r").split("\t")[0:3] for line in f.readlines()
        ]
    df = pd.DataFrame(rows, columns=["word", "furigana", "type"])

    # logger.info("removing unused rows")
    # remove unused rows
    df = df[~df["type"].isin(["[入力 読み]", "分かち書き"])]
    df = df[~pd.isna(df["word"])]
    df = df[~pd.isnull(df["word"])]
    df = df[df["word"] != ""]

    # logger.info("organizing into sentences")
    # now organize remaining rows into sentences
    gyou_df = pd.DataFrame(columns=["sentence", "furigana", "sentenceid"])
    sentence = ""
    furigana = ""
    sentenceid = None
    gyous = []
    for row in df.itertuples():
        if row.type in ["[入力文]"]:
            sentence = row.word
        elif row.type in ["漢字"]:
            furigana += ttlig.RubyToken.from_furi(
                row.word, repair_long_vowels(row.furigana, row.word)
            ).to_code()
        elif row.word.split(":")[0] in ["行番号"]:
            if sentenceid:  # this handles the first row
                gyous.append([sentence, furigana, sentenceid])
            sentenceid = file.name + "_" + row.word.split(":")[1].strip()
            sentence = None
            furigana = ""
        else:
            furigana += row.word

    # last row handling
    gyous.append([sentence, furigana, sentenceid])

    # make dataframe
    gyou_df = pd.DataFrame(gyous, columns=["sentence", "furigana", "sentenceid"])
    gyou_df = gyou_df[~pd.isna(gyou_df.sentence)]

    # logger.info("cleaning rows")
    # clean rows
    gyou_df["furigana"] = gyou_df["furigana"].apply(utils.standardize_text)
    gyou_df["sentence"] = gyou_df["sentence"].apply(
        lambda s: utils.standardize_text(
            s.replace("|", "").replace(" ", "").replace("※", "")
        )
    )

    # logger.info("removing errors")
    # remove non-matching rows
    gyou_df = gyou_df[
        gyou_df["sentence"] == gyou_df["furigana"].apply(utils.remove_furigana)
    ]

    # remove known errors
    error_ids = []
    gyou_df = gyou_df[~gyou_df["sentenceid"].isin(error_ids)]

    # remove duplicates
    gyou_df = gyou_df.drop_duplicates()

    return gyou_df


def aozora_data():
    """Extract, load and transform the aozora data"""

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "aozora").glob("*/*/*.txt"))

    with open(Path(config.SENTENCE_DATA_DIR, "aozora.csv"), "w") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            continue  # skip the file; otherwise the previous file's df would be written again

        df.to_csv(
            Path(config.SENTENCE_DATA_DIR, "aozora.csv"),
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved all aozora data!")


if __name__ == "__main__":
    aozora_data()
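read_file turns one corpus file into a DataFrame of (sentence, furigana, sentenceid) rows, with readings encoded as ruby codes by speach.ttlig (from my reading of RubyToken.to_code, codes of the shape {表/おもて}; that shape is an assumption, not confirmed here). A usage sketch with a hypothetical file path; the real files live under config.RAW_DATA_DIR/aozora:

from pathlib import Path

from yomikata.dataset.aozora import read_file

# Hypothetical path; read_file expects the tab-separated format of the
# huriganacorpus-aozora files.
df = read_file(Path("data/raw/aozora/works/000001/example.txt"))
print(df.head())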
yomikata/dataset/bccwj.py
ADDED
@@ -0,0 +1,206 @@
"""bccwj.py
Data processing script for files downloaded from Chuunagon search
Chuunagon URL: https://chunagon.ninjal.ac.jp/

Download with the settings
文脈中の区切り記号 |
文脈中の文区切り記号 #
前後文脈の語数 10
検索対象(固定長・可変長) 両方
共起条件の範囲 文境界をまたがない

ダウンロードオプション
システム Linux
文字コード UTF-8
改行コード LF
出力ファイルが一つの場合は Zip 圧縮を行わない 検索条件式ごとに出力ファイルを分割する
インラインタグを使用 CHECK BOTH 語彙素読み AND 発音形出現形語種 BOX
(発音形出現形 is the actual pronounced one, but displays e.g. よう れい as よー れー)
タグの区切り記号 :
"""

import warnings
from pathlib import Path

import jaconv
import pandas as pd
from speach.ttlig import RubyToken

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")

SENTENCE_SPLIT_CHAR = "#"
WORD_SPLIT_CHAR = "|"
READING_SEP_CHAR = ":"


def read_bccwj_file(filename: str):
    """Parse one Chuunagon export file into sentence/furigana rows."""

    df = pd.read_csv(filename, sep="\t")

    df["前文脈"] = df["前文脈"].fillna("")
    df["後文脈"] = df["後文脈"].fillna("")
    df["full_text"] = (
        df["前文脈"] + df["キー"] + "[" + df["語彙素読み"] + ":" + df["発音形出現形"] + "]" + df["後文脈"]
    )

    def get_sentences(row):
        sentences = row["full_text"].split(SENTENCE_SPLIT_CHAR)
        furigana_sentences = []
        for sentence in sentences:
            words_with_readings = sentence.split(WORD_SPLIT_CHAR)
            furigana_sentence = ""
            for word_with_reading in words_with_readings:
                word = word_with_reading.split("[")[0]
                form, reading = jaconv.kata2hira(
                    word_with_reading.split("[")[1].split("]")[0]
                ).split(READING_SEP_CHAR)

                if (
                    not utils.has_kanji(word)
                    or reading == jaconv.kata2hira(word)
                    or form == ""
                    or reading == ""
                ):
                    furigana_sentence += word
                else:
                    if ("ー" in reading) and ("ー" not in form):
                        indexes_of_dash = [
                            pos for pos, char in enumerate(reading) if char == "ー"
                        ]
                        for index_of_dash in indexes_of_dash:
                            if len(reading) == len(form):
                                dash_reading = form[index_of_dash]
                            else:
                                char_before_dash = reading[index_of_dash - 1]
                                if char_before_dash in "ねめせぜれてでけげへべぺ":
                                    digraphA = char_before_dash + "え"
                                    digraphB = char_before_dash + "い"
                                    if digraphA in form and digraphB not in form:
                                        dash_reading = "え"
                                    elif digraphB in form and digraphA not in form:
                                        dash_reading = "い"
                                    else:
                                        logger.warning(
                                            f"Leaving dash in {word} {form} {reading}"
                                        )
                                        dash_reading = "ー"
                                elif char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ":
                                    dash_reading = "う"
                                elif char_before_dash in "しじみいきぎひびち":
                                    dash_reading = "い"
                                elif char_before_dash in "そぞのこごもろとどよょおほぼぽ":
                                    digraphA = char_before_dash + "お"
                                    digraphB = char_before_dash + "う"
                                    if digraphA in form and digraphB not in form:
                                        dash_reading = "お"
                                    elif digraphB in form and digraphA not in form:
                                        dash_reading = "う"
                                    else:
                                        if digraphA in word and digraphB not in word:
                                            dash_reading = "お"
                                        elif digraphB in word and digraphA not in word:
                                            dash_reading = "う"
                                        else:
                                            logger.warning(
                                                f"Leaving dash in {word} {form} {reading}"
                                            )
                                            dash_reading = "ー"
                                else:
                                    logger.warning(
                                        f"Leaving dash in {word} {form} {reading}"
                                    )
                                    dash_reading = "ー"
                            reading = (
                                reading[:index_of_dash]
                                + dash_reading
                                + reading[index_of_dash + 1 :]
                            )
                    furigana_sentence += RubyToken.from_furi(word, reading).to_code()

            furigana_sentences.append(furigana_sentence)

        furigana_sentences = [
            utils.standardize_text(sentence) for sentence in furigana_sentences
        ]
        sentences = [utils.remove_furigana(sentence) for sentence in furigana_sentences]
        try:
            rowid = row["サンプル ID"]
        except KeyError:
            rowid = row["講演 ID"]
        if len(furigana_sentences) == 1:
            ids = [rowid]
        else:
            ids = [rowid + "_" + str(i) for i in range(len(furigana_sentences))]

        sub_df = pd.DataFrame(
            {"sentence": sentences, "furigana": furigana_sentences, "sentenceid": ids}
        )

        sub_df = sub_df[sub_df["sentence"] != sub_df["furigana"]]

        return sub_df

    # DataFrame.append was removed in pandas 2.0; build the pieces and concatenate
    sub_dfs = [get_sentences(row) for _, row in df.iterrows()]
    output_df = pd.concat(sub_dfs) if sub_dfs else pd.DataFrame()

    return output_df


def bccwj_data():
    """Extract, load and transform the bccwj data"""

    # Extract sentences from the data files
    bccwj_files = list(Path(config.RAW_DATA_DIR, "bccwj").glob("*.txt"))

    df = pd.DataFrame()

    for bccwj_file in bccwj_files:
        logger.info(bccwj_file.name)
        df = pd.concat([df, read_bccwj_file(bccwj_file)])

    # remove known errors
    error_ids = []

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df[df["sentence"] != ""]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)
    assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "bccwj.csv"), index=False)

    logger.info("✅ Saved bccwj data!")


def bccwj_subset(bccwj_file):
    """Extract, load and transform a subset of the bccwj data"""

    df = read_bccwj_file(bccwj_file)

    # remove known errors
    error_ids = []

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)

    # Output
    df.to_csv(
        Path(config.SENTENCE_DATA_DIR, bccwj_file.name.split(".")[0] + ".csv"),
        index=False,
    )

    logger.info("✅ Saved bccwj " + bccwj_file.name.split(".")[0] + " data!")


if __name__ == "__main__":
    bccwj_data()
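The heart of read_bccwj_file is the inline-tag format described in its docstring: words separated by |, with each annotated key followed by [語彙素読み:発音形] in katakana. A standalone sketch of that first parsing step on a made-up fragment (the sample string is invented for illustration):

import jaconv

fragment = "この|市場[シジョウ:シジョー]|は|広い"  # hypothetical Chuunagon-style line

for token in fragment.split("|"):
    if "[" in token:
        word = token.split("[")[0]
        form, reading = jaconv.kata2hira(token.split("[")[1].split("]")[0]).split(":")
        print(word, form, reading)  # 市場 しじょう しじょー

The しじょー output shows why the long-vowel (ー) repair logic in get_sentences is needed before a clean furigana code can be emitted.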
yomikata/dataset/kwdlc.py
ADDED
@@ -0,0 +1,109 @@
"""kwdlc.py
Data processing script for KWDLC files directly in the repository format
KWDLC repository: https://github.com/ku-nlp/KWDLC
"""

import warnings
from pathlib import Path

import pandas as pd
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")


def read_knp_file(filename: str):
    with open(filename) as f:
        contents = f.readlines()

    ids = []
    sentences = []
    furiganas = []
    sentence = ""
    furigana = ""
    for row in contents:
        first_word = row.split(" ")[0]
        if first_word in ["*", "+"]:
            pass
        elif first_word == "#":
            sentence_id = row.split(" ")[1].split("S-ID:")[1]
        elif first_word == "EOS\n":
            sentence = utils.standardize_text(sentence)
            furigana = utils.standardize_text(furigana)
            if sentence == utils.remove_furigana(furigana):
                sentences.append(sentence)
                furiganas.append(furigana)
                ids.append(sentence_id)
            else:
                logger.warning(
                    f"Dropping mismatched line \n Sentence: {sentence} \n Furigana: {furigana}"
                )
            sentence = ""
            furigana = ""
        else:
            words = row.split(" ")
            sentence += words[0]
            if words[0] == words[1]:
                furigana += words[0]
            else:
                furigana += ttlig.RubyToken.from_furi(words[0], words[1]).to_code()

    assert len(ids) == len(sentences)
    assert len(sentences) == len(furiganas)
    return ids, sentences, furiganas  # readings


def kwdlc_data():
    """Extract, load and transform the kwdlc data"""

    # Extract sentences from the data files
    knp_files = list(Path(config.RAW_DATA_DIR, "kwdlc").glob("**/*.knp"))

    all_ids = []
    all_sentences = []
    all_furiganas = []
    for knp_file in knp_files:
        ids, sentences, furiganas = read_knp_file(knp_file)
        all_ids += ids
        all_sentences += sentences
        all_furiganas += furiganas

    # construct dataframe
    df = pd.DataFrame(
        list(
            zip(all_sentences, all_furiganas, all_ids)
        ),  # all_readings, all_furiganas)),
        columns=["sentence", "furigana", "sentenceid"],
    )

    # remove known errors
    error_ids = [
        "w201106-0000547376-1",
        "w201106-0001768070-1-01",
        "w201106-0000785999-1",
        "w201106-0001500842-1",
        "w201106-0000704257-1",
        "w201106-0002300346-3",
        "w201106-0001779669-3",
        "w201106-0000259203-1",
    ]

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)
    # Test
    assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "kwdlc.csv"), index=False)

    logger.info("✅ Saved kwdlc data!")


if __name__ == "__main__":
    kwdlc_data()
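read_knp_file consumes KNP-format annotation: comment lines starting with #, phrase markers * and +, then one token per line with surface and reading as the first two space-separated fields, terminated by EOS. A standalone rerun of those line rules on a made-up fragment (the sample is invented, and the {surface/reading} ruby shape mirrors what ttlig.RubyToken.to_code appears to produce, which is an assumption):

lines = [
    "# S-ID:example-1\n",
    "* 1D\n",
    "+ 1D\n",
    "梅雨 つゆ 梅雨 名詞 ...\n",
    "が が が 助詞 ...\n",
    "明ける あける 明ける 動詞 ...\n",
    "EOS\n",
]
sentence, furigana = "", ""
for row in lines:
    first = row.split(" ")[0]
    if first in ["*", "+", "#"] or first == "EOS\n":
        continue
    surface, reading = row.split(" ")[0], row.split(" ")[1]
    sentence += surface
    furigana += surface if surface == reading else f"{{{surface}/{reading}}}"
print(sentence)  # 梅雨が明ける
print(furigana)  # {梅雨/つゆ}が{明ける/あける}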
yomikata/dataset/ndlbib.py
ADDED
@@ -0,0 +1,46 @@
"""ndlbib.py
Data processing script for ndlbib sentence file from https://github.com/ndl-lab/huriganacorpus-ndlbib
"""

import warnings
from pathlib import Path

from pandas.errors import ParserError

from config import config
from config.config import logger
from yomikata.dataset.aozora import read_file

# ndlbib and aozora use same file structure

warnings.filterwarnings("ignore")


def ndlbib_data():
    """Extract, load and transform the ndlbib data"""

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "shosi").glob("*.txt"))

    with open(Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"), "w") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            continue  # skip the file; otherwise the previous file's df would be written again

        df.to_csv(
            Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"),
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved ndlbib data!")


if __name__ == "__main__":
    ndlbib_data()
yomikata/dataset/pronunciations.py
ADDED
@@ -0,0 +1,57 @@
from pathlib import Path

import jaconv
import pandas as pd
from tqdm import tqdm

from config import config
from config.config import logger
from yomikata import utils


def pronunciation_data():
    """Merge the per-source reading files into a single surface/kana table."""

    data_files = list(Path(config.READING_DATA_DIR).glob("*.csv"))

    df = pd.DataFrame()

    for file in data_files:
        if (file.name == "all.csv") or (file.name == "ambiguous.csv"):
            continue
        output_df = pd.read_csv(file)
        df = pd.concat([df, output_df])

    df["surface"] = df["surface"].astype(str).str.strip()
    df["kana"] = df["kana"].astype(str).str.strip()

    tqdm.pandas()

    df["kana"] = df["kana"].progress_apply(utils.standardize_text)
    df["surface"] = df["surface"].progress_apply(utils.standardize_text)
    df["kana"] = df.progress_apply(lambda row: jaconv.kata2hira(row["kana"]), axis=1)
    df = df[df["surface"] != df["kana"]]
    df = df[df["kana"] != ""]

    df = df[df["surface"].progress_apply(utils.has_kanji)]

    df = df.loc[~df["surface"].str.contains(r"[〜〜()\)\(\*]\.")]

    df = df[["surface", "kana"]]
    df = df.drop_duplicates()

    df.to_csv(Path(config.READING_DATA_DIR, "all.csv"), index=False)

    logger.info("✅ Merged all the pronunciation data!")

    # merged_df = (
    #     df.groupby("surface")["kana"]
    #     .apply(list)
    #     .reset_index(name="pronunciations")
    # )

    # ambiguous_df = merged_df[merged_df["pronunciations"].apply(len) > 1]
    # ambiguous_df.to_csv(Path(config.READING_DATA_DIR, "ambiguous.csv"), index=False)


if __name__ == "__main__":
    pronunciation_data()
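The commented-out block above sketches the natural next step: grouping the merged table to find surfaces with more than one recorded reading. Spelled out as a hedged sketch (it assumes all.csv has already been produced by pronunciation_data):

from pathlib import Path

import pandas as pd

from config import config

df = pd.read_csv(Path(config.READING_DATA_DIR, "all.csv"))
merged = df.groupby("surface")["kana"].apply(list).reset_index(name="pronunciations")
ambiguous = merged[merged["pronunciations"].apply(len) > 1]
print(f"{len(ambiguous)} surfaces have more than one recorded reading")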
yomikata/dataset/repair_long_vowels.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

pronunciation_df = pd.read_csv(Path(config.READING_DATA_DIR, "all.csv"))
pronunciation_df = pronunciation_df.groupby("surface")["kana"].apply(list)


def repair_long_vowels(kana: str, kanji: str = None) -> str:
    """Replace long-vowel marks (ー) in a kana reading with explicit vowels.

    Args:
        kana (str): reading which may contain ー
        kanji (str): surface form, optional; used to look up candidate readings when a dash is ambiguous

    Returns:
        str: the reading with long-vowel marks resolved where possible
    """

    reading = kana
    indices_of_dash = [pos for pos, char in enumerate(reading) if char == "ー"]

    # get rid of non-ambiguous dashes
    for index_of_dash in indices_of_dash:
        char_before_dash = reading[index_of_dash - 1]
        if char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ":
            reading = reading[:index_of_dash] + "う" + reading[index_of_dash + 1 :]
        elif char_before_dash in "しじみいきぎひびちぢぃ":
            reading = reading[:index_of_dash] + "い" + reading[index_of_dash + 1 :]

    indices_of_not_dash = [pos for pos, char in enumerate(reading) if char != "ー"]
    if len(indices_of_not_dash) != len(reading):
        if not kanji:
            logger.info("Disambiguating this dash requires kanji")
            logger.info(f"Left dash in {reading}")
        else:
            try:
                candidate_pronunciations = list(pronunciation_df[kanji])
            except KeyError:
                candidate_pronunciations = []

            candidate_pronunciations = list(set(candidate_pronunciations))

            candidate_pronunciations = [
                x for x in candidate_pronunciations if len(x) == len(reading)
            ]
            candidate_pronunciations = [
                x
                for x in candidate_pronunciations
                if all([x[i] == reading[i] for i in indices_of_not_dash])
            ]

            if len(candidate_pronunciations) == 1:
                reading = candidate_pronunciations[0]
            else:
                pass
                # logger.warning(f"Left dashes in {kanji} {reading}")

    return reading
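For a concrete sense of the two branches above (unambiguous vs. kanji-assisted), here is a small usage sketch; the words are our own examples, and the second call assumes pronunciations.py has already produced all.csv so that pronunciation_df is populated:

```python
from yomikata.dataset.repair_long_vowels import repair_long_vowels

# ふ is in the う-row list, so the dash resolves without any kanji lookup:
# "ふーふ" (夫婦) -> "ふうふ"
print(repair_long_vowels("ふーふ"))

# ろ is in neither list, so the dash is ambiguous; the kanji 廊下 is used
# to match against candidate readings in the merged reading data.
# Expect "ろうか" if exactly one candidate fits, otherwise the dash is kept.
print(repair_long_vowels("ろーか", "廊下"))
```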
yomikata/dataset/split.py
ADDED
@@ -0,0 +1,271 @@
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from speach.ttlig import RubyFrag, RubyToken

from config import config
from config.config import logger
from yomikata import utils
from yomikata.dictionary import Dictionary


def train_val_test_split(X, y, train_size, val_size, test_size):
    """Split dataset into data splits."""
    assert (train_size + val_size + test_size) == 1
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_, y_, train_size=val_size / (test_size + val_size)
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


def filter_simple(input_file, output_file, heteronyms) -> None:
    """Filter out sentences which don't contain any heteronyms."""

    df = pd.read_csv(input_file)  # load
    logger.info(f"Prefilter size: {len(df)}")

    df = df[df["sentence"].str.contains(r"|".join(heteronyms))]
    logger.info(f"Postfilter size: {len(df)}")

    df.to_csv(output_file, index=False)


def filter_dictionary(input_file, output_file, heteronyms, dictionary) -> None:
    """Filter out sentences which contain heteronyms only as part of a compound known to the dictionary."""
    df = pd.read_csv(input_file)  # load
    logger.info(f"Prefilter size: {len(df)}")

    df["contains_heteronym"] = df["sentence"].apply(
        lambda s: not set(
            [dictionary.token_to_surface(m) for m in dictionary.tagger(s)]
        ).isdisjoint(heteronyms)
    )

    df = df[df["contains_heteronym"]]
    logger.info(f"Postfilter size: {len(df)}")

    df.to_csv(output_file, index=False)


def regroup_furigana(s, heteronym, heteronym_dict, dictionary, verbose=False):
    rubytokens = utils.parse_furigana(s)
    output_tokens = []
    for token in rubytokens.groups:
        if isinstance(token, RubyFrag):
            # this is a token with furigana
            if heteronym in token.text and token.text != heteronym:
                # it includes the heteronym but is not exactly the heteronym,
                # so we try to regroup it.
                # (A dictionary check — `if len(dictionary.tagger(token.text)) > 1:` —
                # was tried here, but it is not foolproof: sometimes a word is in the
                # dictionary and found here, yet a parse of the whole sentence splits
                # the word in two. It was removed because even a dictionary compound
                # goes through training, so we might as well try to regroup it to
                # avoid it becoming an <OTHER>.)
                viable_regroupings = []
                for reading in heteronym_dict[heteronym]:
                    regrouped_tokens = regroup_furigana_tokens(
                        [token], heteronym, reading, verbose=verbose
                    )
                    if regrouped_tokens != [token]:
                        if verbose:
                            print("viable regrouping found")
                        viable_regroupings.append(regrouped_tokens)
                if len(viable_regroupings) == 1:
                    output_tokens += viable_regroupings[0]
                    continue
                else:
                    if verbose:
                        print("multiple viable readings found, cannot regroup")
                    pass
        output_tokens.append(token)

    output_string = RubyToken(groups=output_tokens).to_code()
    assert utils.furigana_to_kana(output_string) == utils.furigana_to_kana(s)
    assert utils.remove_furigana(output_string) == utils.remove_furigana(s)
    return output_string


def regroup_furigana_tokens(ruby_tokens, heteronym, reading, verbose=False):
    if not len(ruby_tokens) == 1:
        raise ValueError("regroup failed, no support yet for token merging")
    ruby_token = ruby_tokens[0]
    text = ruby_token.text
    furi = ruby_token.furi
    try:
        split_text = [
            text[0 : text.index(heteronym)],
            heteronym,
            text[text.index(heteronym) + len(heteronym) :],
        ]
        split_text = [text for text in split_text if text != ""]
    except ValueError:
        if verbose:
            print("regroup failed, heteronym not in token text")
        return ruby_tokens

    try:
        split_furi = [
            furi[0 : furi.index(reading)],
            reading,
            furi[furi.index(reading) + len(reading) :],
        ]
        split_furi = [furi for furi in split_furi if furi != ""]
    except ValueError:
        if verbose:
            print("regroup failed, reading not in token furi")
        return ruby_tokens

    if not len(split_text) == len(split_furi):
        if verbose:
            print(
                "regroup failed, failed to find heteronym and its reading in the same place in the inputs"
            )
        return ruby_tokens

    regrouped_tokens = [
        RubyFrag(text=split_text[i], furi=split_furi[i]) for i in range(len(split_text))
    ]

    if not "".join([token.furi for token in ruby_tokens]) == "".join(
        [token.furi for token in regrouped_tokens]
    ):
        if verbose:
            print(
                "regroup failed, reading of produced result does not agree with reading of input"
            )
        return ruby_tokens
    if not [token.furi for token in regrouped_tokens if token.text == heteronym] == [
        reading
    ]:
        if verbose:
            print("regroup failed, the heteronym did not get assigned the reading")
        return ruby_tokens
    return regrouped_tokens


def optimize_furigana(input_file, output_file, heteronym_dict, dictionary) -> None:
    df = pd.read_csv(input_file)  # load
    logger.info("Optimizing furigana using heteronym list and dictionary")
    for heteronym in heteronym_dict.keys():
        logger.info(f"Heteronym {heteronym} {heteronym_dict[heteronym]}")
        n_with_het = sum(df["sentence"].str.contains(heteronym))
        rows_to_rearrange = df["sentence"].str.contains(heteronym)
        optimized_rows = df.loc[rows_to_rearrange, "furigana"].apply(
            lambda s: regroup_furigana(s, heteronym, heteronym_dict, dictionary)
        )
        n_rearranged = sum(df.loc[rows_to_rearrange, "furigana"] != optimized_rows)
        logger.info(f"{n_rearranged}/{n_with_het} sentences were optimized")
        df.loc[rows_to_rearrange, "furigana"] = optimized_rows
    df.to_csv(output_file, index=False)


def remove_other_readings(input_file, output_file, heteronym_dict):
    df = pd.read_csv(input_file)  # load
    logger.info(f"Prefilter size: {len(df)}")
    df["keep_row"] = False
    for heteronym in heteronym_dict.keys():
        logger.info(heteronym)
        n_with_het = sum(df["sentence"].str.contains(heteronym))
        keep_for_het = df["furigana"].str.contains(
            r"|".join(
                [f"{{{heteronym}/{reading}}}" for reading in heteronym_dict[heteronym]]
            )
        )
        df["keep_row"] = df["keep_row"] | keep_for_het
        logger.info(
            f"Dropped {n_with_het-sum(keep_for_het)}/{n_with_het} sentences whose reading is not in the expected list"
        )
    df = df.loc[df["keep_row"]]
    df = df.drop("keep_row", axis=1)
    df.to_csv(output_file, index=False)


def check_data(input_file) -> bool:

    df = pd.read_csv(input_file)  # load
    df["furigana-test"] = df["sentence"] == df["furigana"].apply(utils.remove_furigana)
    assert df["furigana-test"].all()
    df["sentence-standardize-test"] = df["sentence"] == df["sentence"].apply(
        utils.standardize_text
    )
    assert df["sentence-standardize-test"].all()

    return True


def split_data(data_file) -> None:

    df = pd.read_csv(data_file)  # load

    X = df["sentence"].values
    y = df["furigana"].values

    (X_train, X_val, X_test, y_train, y_val, y_test) = train_val_test_split(
        X=X,
        y=y,
        train_size=config.TRAIN_SIZE,
        val_size=config.VAL_SIZE,
        test_size=config.TEST_SIZE,
    )

    train_df = pd.DataFrame({"sentence": X_train, "furigana": y_train})
    val_df = pd.DataFrame({"sentence": X_val, "furigana": y_val})
    test_df = pd.DataFrame({"sentence": X_test, "furigana": y_test})

    train_df.to_csv(Path(config.TRAIN_DATA_DIR, "train_" + data_file.name), index=False)
    val_df.to_csv(Path(config.VAL_DATA_DIR, "val_" + data_file.name), index=False)
    test_df.to_csv(Path(config.TEST_DATA_DIR, "test_" + data_file.name), index=False)


if __name__ == "__main__":

    input_files = [
        Path(config.SENTENCE_DATA_DIR, "aozora.csv"),
        Path(config.SENTENCE_DATA_DIR, "kwdlc.csv"),
        Path(config.SENTENCE_DATA_DIR, "bccwj.csv"),
        Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"),
    ]

    logger.info("Merging sentence data")
    utils.merge_csvs(input_files, Path(config.SENTENCE_DATA_DIR, "all.csv"), n_header=1)

    logger.info("Rough filtering for sentences with heteronyms")
    filter_simple(
        Path(config.SENTENCE_DATA_DIR, "all.csv"),
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms_simple.csv"),
        config.HETERONYMS.keys(),
    )

    logger.info("Sudachi dictionary filtering of heteronyms appearing only in known compounds")
    filter_dictionary(
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms_simple.csv"),
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms.csv"),
        config.HETERONYMS.keys(),
        Dictionary("sudachi"),
    )

    logger.info("Optimizing furigana")
    optimize_furigana(
        Path(config.SENTENCE_DATA_DIR, "have_heteronyms.csv"),
        Path(config.SENTENCE_DATA_DIR, "optimized_heteronyms.csv"),
        config.HETERONYMS,
        Dictionary("sudachi"),
    )

    logger.info("Removing heteronyms with unexpected readings")
    remove_other_readings(
        Path(config.SENTENCE_DATA_DIR, "optimized_heteronyms.csv"),
        Path(config.SENTENCE_DATA_DIR, "optimized_strict_heteronyms.csv"),
        config.HETERONYMS,
    )

    logger.info("Running checks on data")
    test_result = check_data(
        Path(config.SENTENCE_DATA_DIR, "optimized_strict_heteronyms.csv")
    )

    logger.info("Performing train/val/test split")
    split_data(Path(config.SENTENCE_DATA_DIR, "optimized_strict_heteronyms.csv"))

    logger.info("Data splits successfully generated!")
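To make the regrouping concrete: regroup_furigana_tokens splits one over-wide ruby fragment so the heteronym carries its own furigana. A worked example with values of our own choosing (大人 read おとな inside the compound 大人気/おとなげ):

```python
from speach.ttlig import RubyFrag

from yomikata.dataset.split import regroup_furigana_tokens

# {大人気/おとなげ} buries the heteronym 大人 inside a larger fragment.
token = RubyFrag(text="大人気", furi="おとなげ")

# Splitting on the reading おとな yields {大人/おとな}{気/げ},
# so 大人 can now serve as a labeled training example.
for frag in regroup_furigana_tokens([token], "大人", "おとな"):
    print(frag.text, frag.furi)  # 大人 おとな, then 気 げ
```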
yomikata/dataset/sudachi.py
ADDED
@@ -0,0 +1,50 @@
"""sudachi.py
Data processing script for the sudachi dictionary
"""

import warnings
from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

warnings.filterwarnings("ignore")


def sudachi_data():

    sudachi_files = list(Path(config.RAW_DATA_DIR, "sudachi").glob("*.csv"))

    df = pd.DataFrame()

    for file in sudachi_files:
        logger.info(file.name)
        # Load file
        df = pd.concat(
            [
                df,
                pd.read_csv(
                    file,
                    header=None,
                ),
            ]
        )

    # column 0 is the surface form, column 11 the reading, column 5 the POS
    df["surface"] = df[0].astype(str).str.strip()
    df["kana"] = df[11].astype(str).str.strip()
    df["type"] = df[5].astype(str).str.strip()
    df = df[df["kana"] != "*"]
    df = df[df["surface"] != df["kana"]]
    df = df[df["type"] != "補助記号"]  # drop supplementary symbols

    df = df[["surface", "kana"]]

    df.to_csv(Path(config.READING_DATA_DIR, "sudachi.csv"), index=False)

    logger.info("✅ Processed sudachi data!")


if __name__ == "__main__":
    sudachi_data()
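Every reading-data script here (this one, unidic.py below, and pronunciations.py) emits the same two-column surface/kana schema, which is what the merge step relies on. A quick sanity check, assuming the script has been run so the CSV exists:

```python
from pathlib import Path

import pandas as pd

from config import config

df = pd.read_csv(Path(config.READING_DATA_DIR, "sudachi.csv"))
assert list(df.columns) == ["surface", "kana"]  # schema shared by all reading CSVs
print(df.head())
```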
yomikata/dataset/unidic.py
ADDED
@@ -0,0 +1,44 @@
"""unidic.py
Data processing script for the unidic dictionary
"""

import warnings
from pathlib import Path

import pandas as pd

from config import config
from config.config import logger

warnings.filterwarnings("ignore")


def unidic_data():
    """Extract, load and transform the unidic data"""

    # Find the unidic data file
    unidic_file = list(Path(config.RAW_DATA_DIR, "unidic").glob("*.csv"))[0]

    # Load file
    df = pd.read_csv(
        unidic_file,
        header=None,
        names="surface id1 id2 id3 pos1 pos2 pos3 pos4 cType "
        "cForm lForm lemma orth orthBase pron pronBase goshu iType iForm fType "
        "fForm iConType fConType type kana kanaBase form formBase aType aConType "
        "aModType lid lemma_id".split(" "),
    )

    df["surface"] = df["surface"].astype(str).str.strip()
    df["kana"] = df["kana"].astype(str).str.strip()
    df = df[df["kana"] != "*"]
    df = df[df["surface"] != df["kana"]]
    df = df[["surface", "kana"]]

    df.to_csv(Path(config.READING_DATA_DIR, "unidic.csv"), index=False)

    logger.info("✅ Processed unidic data!")


if __name__ == "__main__":
    unidic_data()
yomikata/dbert.py
ADDED
@@ -0,0 +1,414 @@
"""
dbert.py
Provides the dBert class that implements Reader using BERT contextual embeddings to disambiguate heteronyms.
"""

import logging
import os
from pathlib import Path

import numpy as np
import torch
from speach.ttlig import RubyFrag, RubyToken
from transformers import (
    AutoModelForTokenClassification,
    BertJapaneseTokenizer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from config import config
from config.config import logger
from yomikata import utils
from yomikata.reader import Reader
from yomikata.utils import LabelEncoder

logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("transformers.trainer").setLevel(logging.ERROR)
logging.getLogger("datasets").setLevel(logging.ERROR)


class dBert(Reader):
    def __init__(
        self,
        artifacts_dir: Path = Path(config.STORES_DIR, "dbert"),
        reinitialize: bool = False,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    ) -> None:

        # Set the device
        self.device = device
        logger.info(f"Running on {self.device}")
        if self.device.type == "cuda":
            logger.info(torch.cuda.get_device_name(0))

        # Hardcoded parameters
        self.max_length = 128

        # Load the model
        self.artifacts_dir = artifacts_dir
        if reinitialize:

            # load tokenizer from upstream huggingface repository
            default_model = "cl-tohoku/bert-base-japanese-v2"
            self.tokenizer = BertJapaneseTokenizer.from_pretrained(default_model)
            logger.info(f"Using {default_model} tokenizer")

            # load the heteronyms list
            self.heteronyms = config.HETERONYMS

            # make the label encoder
            label_list = ["<OTHER>"]
            for i, heteronym in enumerate(self.heteronyms.keys()):
                for j, reading in enumerate(self.heteronyms[heteronym]):
                    label_list.append(heteronym + ":" + reading)

            self.label_encoder = LabelEncoder()
            self.label_encoder.fit(label_list)

            logger.info("Made label encoder with default heteronyms")

            # add surface forms to tokenizer vocab
            surfaces = list(
                set([x.split(":")[0] for x in self.label_encoder.classes if x != "<OTHER>"])
            )

            new_tokens = [
                surface
                for surface in surfaces
                if surface
                not in (list(self.tokenizer.vocab.keys()) + list(self.tokenizer.get_added_vocab()))
            ]

            self.tokenizer.add_tokens(new_tokens)
            if len(new_tokens) > 0:
                logger.info(f"Added {len(new_tokens)} surface forms to tokenizer vocab")

            # check that new tokens were added properly
            assert [
                self.tokenizer.decode(
                    self.tokenizer.encode(
                        [surface],
                        add_special_tokens=False,
                    )
                )
                for surface in surfaces
            ] == surfaces

            self.surfaceIDs = self.tokenizer.encode(
                list(set([x.split(":")[0] for x in self.label_encoder.classes if x != "<OTHER>"])),
                add_special_tokens=False,
            )
            assert len(self.surfaceIDs) == len(surfaces)

            # Load model from upstream huggingface repository
            self.model = AutoModelForTokenClassification.from_pretrained(
                default_model, num_labels=len(self.label_encoder.classes)
            )
            self.model.resize_token_embeddings(len(self.tokenizer))
            logger.info(f"Using model {default_model}")

            self.save(artifacts_dir)
        else:
            self.load(artifacts_dir)

    def load(self, directory):
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(directory)
        self.model = AutoModelForTokenClassification.from_pretrained(directory).to(self.device)
        self.label_encoder = LabelEncoder.load(Path(directory, "label_encoder.json"))
        self.heteronyms = utils.load_dict(Path(directory, "heteronyms.json"))

        self.surfaceIDs = self.tokenizer.encode(
            list(set([x.split(":")[0] for x in self.label_encoder.classes if x != "<OTHER>"])),
            add_special_tokens=False,
        )
        logger.info(f"Loaded model from directory {directory}")

    def save(self, directory):
        self.tokenizer.save_pretrained(directory)
        self.model.save_pretrained(directory)
        self.label_encoder.save(Path(directory, "label_encoder.json"))
        utils.save_dict(self.heteronyms, Path(directory, "heteronyms.json"))
        logger.info(f"Saved model to directory {directory}")

    def batch_preprocess_function(self, entries, pad=False):
        inputs = [entry for entry in entries["sentence"]]
        furiganas = [entry for entry in entries["furigana"]]
        if pad:
            tokenized_inputs = self.tokenizer(
                inputs,
                max_length=self.max_length,
                truncation=True,
                padding="max_length",
                # return_tensors="np",
            )
        else:
            tokenized_inputs = self.tokenizer(
                inputs,
                max_length=self.max_length,
                truncation=True,
            )

        labels = []
        for i, input_ids in enumerate(tokenized_inputs["input_ids"]):
            furigana_temp = furiganas[i]
            label_ids = []
            assert inputs[i] == utils.remove_furigana(furiganas[i])
            for j, input_id in enumerate(input_ids):
                if input_id not in self.surfaceIDs:
                    label = -100
                else:
                    surface = self.tokenizer.decode([input_id])
                    try:
                        reading_start_idx = furigana_temp.index(surface) + len(surface)
                        furigana_temp = furigana_temp[reading_start_idx + 1 :]
                        reading_end_idx = furigana_temp.index("}")
                        reading = furigana_temp[:reading_end_idx]
                        furigana_temp = furigana_temp[reading_end_idx + 1 :]
                        label = self.label_encoder.class_to_index[surface + ":" + reading]
                    except KeyError:
                        # this means there's an unknown reading
                        label = 0
                    except ValueError:
                        # this means that the surface form is not present in the furigana
                        # probably it got split between two different words
                        label = 0
                label_ids.append(label)
            assert len(label_ids) == len(input_ids)
            labels.append(label_ids)

        assert len(labels) == len(tokenized_inputs["input_ids"])

        return {
            "input_ids": tokenized_inputs["input_ids"],
            "attention_mask": tokenized_inputs["attention_mask"],
            "labels": labels,
        }

    def train(self, dataset, training_args={}) -> dict:

        dataset = dataset.map(
            self.batch_preprocess_function, batched=True, fn_kwargs={"pad": False}
        )
        dataset = dataset.filter(
            lambda entry: any(x in entry["input_ids"] for x in list(self.surfaceIDs))
        )

        # put the model in training mode
        self.model.train()

        default_training_args = {
            "output_dir": self.artifacts_dir,
            "num_train_epochs": 10,
            "evaluation_strategy": "steps",
            "eval_steps": 10,
            "logging_strategy": "steps",
            "logging_steps": 10,
            "save_strategy": "steps",
            "save_steps": 10,
            "learning_rate": 2e-5,
            "per_device_train_batch_size": 128,
            "per_device_eval_batch_size": 128,
            "load_best_model_at_end": True,
            "metric_for_best_model": "loss",
            "weight_decay": 0.01,
            "save_total_limit": 3,
            "fp16": True,
            "report_to": "tensorboard",
        }

        default_training_args.update(training_args)
        training_args = default_training_args

        # Not padding in batch_preprocess_function so need data_collator for trainer
        data_collator = DataCollatorForTokenClassification(tokenizer=self.tokenizer, padding=True)

        if "val" in list(dataset):
            trainer = Trainer(
                model=self.model,
                args=TrainingArguments(**training_args),
                train_dataset=dataset["train"],
                eval_dataset=dataset["val"],
                tokenizer=self.tokenizer,
                callbacks=[
                    EarlyStoppingCallback(early_stopping_patience=5),
                ],
                data_collator=data_collator,
            )
        else:
            trainer = Trainer(
                model=self.model,
                args=TrainingArguments(**training_args),
                train_dataset=dataset["train"],
                tokenizer=self.tokenizer,
                data_collator=data_collator,
            )

        result = trainer.train()

        # Output some training information
        print(f"Time: {result.metrics['train_runtime']:.2f}")
        print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
        gpu_index = int(os.environ["CUDA_VISIBLE_DEVICES"])
        utils.print_gpu_utilization(gpu_index)

        # Get metrics for each train/val/test split
        self.model.eval()
        full_performance = {}
        for key in dataset.keys():
            max_evals = min(100000, len(dataset[key]))
            # max_evals = len(dataset[key])
            logger.info(f"getting predictions for {key}")
            subset = dataset[key].shuffle().select(range(max_evals))
            prediction_output = trainer.predict(subset)
            logger.info(f"processing predictions for {key}")
            metrics = prediction_output[2]
            labels = prediction_output[1]
            predictions = np.argmax(prediction_output[0], axis=2)

            true_inputs = [
                self.tokenizer.decode([input_id])
                for row in subset["input_ids"]
                for input_id in row
                if input_id in self.surfaceIDs
            ]

            true_predictions = [
                str(self.label_encoder.index_to_class[p])
                for prediction, label in zip(predictions, labels)
                for (p, l) in zip(prediction, label)
                if l != -100
            ]

            true_labels = [
                str(self.label_encoder.index_to_class[l])
                for prediction, label in zip(predictions, labels)
                for (p, l) in zip(prediction, label)
                if l != -100
            ]

            logger.info("processing performance")
            performance = {
                heteronym: {
                    "n": 0,
                    "readings": {
                        reading: {
                            "n": 0,
                            "found": {
                                readingprime: 0
                                for readingprime in list(self.heteronyms[heteronym].keys())
                                + ["<OTHER>"]
                            },
                        }
                        for reading in list(self.heteronyms[heteronym].keys()) + ["<OTHER>"]
                    },
                }
                for heteronym in self.heteronyms.keys()
            }

            for i, surface in enumerate(true_inputs):
                performance[surface]["n"] += 1

                true_reading = true_labels[i].split(":")[-1]

                performance[surface]["readings"][true_reading]["n"] += 1

                if true_predictions[i] != "<OTHER>":
                    if true_predictions[i].split(":")[0] != surface:
                        logger.warning(f"big failure at {surface} {true_predictions[i]}")
                        found_reading = "<OTHER>"
                    else:
                        found_reading = true_predictions[i].split(":")[1]
                else:
                    found_reading = "<OTHER>"

                performance[surface]["readings"][true_reading]["found"][found_reading] += 1

                # if found_reading != true_reading:
                #     logger.info(
                #         f"Predicted {found_reading} instead of {true_reading} in {subset['furigana'][furi_rows[i]]}"
                #     )

            n = 0
            correct = 0
            for surface in performance.keys():
                for true_reading in performance[surface]["readings"].keys():
                    performance[surface]["readings"][true_reading]["accuracy"] = np.round(
                        performance[surface]["readings"][true_reading]["found"][true_reading]
                        / np.array(performance[surface]["readings"][true_reading]["n"]),
                        3,
                    )

                performance[surface]["accuracy"] = np.round(
                    sum(
                        performance[surface]["readings"][true_reading]["found"][true_reading]
                        for true_reading in performance[surface]["readings"].keys()
                    )
                    / np.array(performance[surface]["n"]),
                    3,
                )

                correct += sum(
                    performance[surface]["readings"][true_reading]["found"][true_reading]
                    for true_reading in performance[surface]["readings"].keys()
                )
                n += performance[surface]["n"]

            performance = {
                "metrics": metrics,
                "accuracy": round(correct / n, 3),
                "heteronym_performance": performance,
            }

            full_performance[key] = performance

        trainer.save_model()

        return full_performance

    def furigana(self, text: str) -> str:

        text = utils.standardize_text(text)
        text = utils.remove_furigana(text)
        text = text.replace("{", "").replace("}", "")

        self.model.eval()

        text_encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )

        input_ids = text_encoded["input_ids"].to(self.device)
        input_mask = text_encoded["attention_mask"].to(self.device)

        logits = self.model(input_ids=input_ids, attention_mask=input_mask).logits

        predictions = torch.argmax(logits, dim=2)

        output_ruby = []
        for (i, p) in enumerate(predictions[0]):
            text = self.tokenizer.decode([input_ids[0][i]])
            if text in ["[CLS]", "[SEP]"]:
                continue
            if text[:2] == "##":
                text = text[2:]
            if input_ids[0][i].item() in self.surfaceIDs:

                furi = self.label_encoder.index_to_class[p.item()]

                if furi == "<OTHER>":
                    output_ruby.append(f"{{{text}}}")
                elif furi.split(":")[0] != text:
                    output_ruby.append(f"{{{text}}}")
                else:
                    output_ruby.append(RubyFrag(text=text, furi=furi.split(":")[1]))
            else:
                output_ruby.append(text)

        return RubyToken(groups=output_ruby).to_code()
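With the trained artifacts shipped in this commit under stores/dbert, inference reduces to constructing the reader and calling furigana(); the sentence below is our own example:

```python
from yomikata.dbert import dBert

# Loads the tokenizer, fine-tuned model, label encoder, and heteronym list
# from the saved artifacts directory (stores/dbert by default).
reader = dBert()

print(reader.furigana("畳の表を換える"))
# Heteronyms the model resolves come back as {surface/reading} ruby,
# heteronyms predicted as <OTHER> come back as {surface} with no reading,
# and all other tokens are returned as plain text.
```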
yomikata/dictionary.py
ADDED
@@ -0,0 +1,99 @@
"""
dictionary.py
Provides the Dictionary class which implements Reader using dictionary lookup.
"""

import fugashi
import ipadic
import jaconv
import jumandic
from speach import ttlig
from sudachipy import dictionary as sudachidict
from sudachipy import tokenizer as sudachitokenizer

from config.config import ASCII_SPACE_TOKEN
from yomikata import utils
from yomikata.reader import Reader

tokenizer_obj = sudachidict.Dictionary(dict="full").create()
mode = sudachitokenizer.Tokenizer.SplitMode.C

taggers = {}
taggers["ipadic"] = fugashi.GenericTagger(ipadic.MECAB_ARGS)
taggers["juman"] = fugashi.GenericTagger(jumandic.MECAB_ARGS)
taggers["unidic"] = fugashi.Tagger()
taggers["sudachi"] = lambda s: tokenizer_obj.tokenize(s, mode)

token_to_kana = {
    "ipadic": lambda word: jaconv.kata2hira(str(word.feature[7]))
    if len(word.feature) >= 8
    else jaconv.kata2hira(str(word.surface)),
    "juman": lambda word: word.feature[5]
    if word.feature[5] != "*"
    else jaconv.kata2hira(str(word)),
    "unidic": lambda word: jaconv.kata2hira(str(word))
    if (word.feature.kana == "*" or word.feature.kana is None)
    else jaconv.kata2hira(str(word.feature.kana)),
    "sudachi": lambda word: jaconv.kata2hira(
        utils.standardize_text(str(word.reading_form()))
    ),
}

token_to_surface = {
    "ipadic": lambda word: word.surface,
    "juman": lambda word: word.surface,
    "unidic": lambda word: word.surface,
    "sudachi": lambda word: word.surface(),
}

token_to_pos = {
    "ipadic": lambda word: word.feature[0],
    "juman": lambda word: word.feature[0],
    "unidic": lambda word: word.feature.pos1,
    "sudachi": lambda word: word.part_of_speech()[0],
}


class Dictionary(Reader):
    def __init__(self, tagger: str = "unidic") -> None:
        """Create a Dictionary object to apply furigana using dictionary lookup.
        The object holds configuration and tokenizer state.

        Typical usage:

        ```python
        reader = Dictionary()
        furi = reader.furigana("お前はもう死んでいる")
        # "お{前/まえ}はもう{死/し}んでいる"
        ```

        Args:
            tagger (str, optional): Tokenizing dictionary to be used. Defaults to `unidic`; `juman`, `ipadic`, and `sudachi` are also possible.
        """

        self.tagger = taggers[tagger]
        self.token_to_kana = token_to_kana[tagger]
        self.token_to_surface = token_to_surface[tagger]
        self.token_to_pos = token_to_pos[tagger]

    def furigana(self, text: str) -> str:
        text = utils.standardize_text(text)
        text = text.replace(" ", ASCII_SPACE_TOKEN)
        rubytoken = utils.parse_furigana(text)
        output = ""

        for group in rubytoken.groups:
            if isinstance(group, ttlig.RubyFrag):
                output += f"{{{group.text}/{group.furi}}}"
            else:
                group = group.replace("{", "").replace("}", "")
                for word in self.tagger(group):
                    kana = self.token_to_kana(word)
                    surface = self.token_to_surface(word)
                    pos = self.token_to_pos(word)
                    if (surface == kana) or pos in ["記号", "補助記号", "特殊"]:
                        output += surface
                    else:
                        output += ttlig.RubyToken.from_furi(surface, kana).to_code()
        output = output.replace(ASCII_SPACE_TOKEN, " ")
        return output
|