"""bccwj.py
Data processing script for files downloaded from Chuunagon search
Chuunagon URL: https://chunagon.ninjal.ac.jp/

Download with the settings
文脈中の区切り記号 |
文脈中の文区切り記号 #
前後文脈の語数 10
検索対象(固定長・可変長) 両方
共起条件の範囲 文境界をまたがない

ダウンロードオプション
システム Linux
文字コード UTF-8
改行コード LF
出力ファイルが一つの場合は Zip 圧縮を行わない 検索条件式ごとに出力ファイルを分割する
インラインタグを使用  CHECK BOTH 語彙素読み AND 発音形出現形語種 BOX
(発音形出現形 is the actual pronounced one, but displays e.g. よう れい as よー れー)
タグの区切り記号 :
"""

import warnings
from pathlib import Path

import jaconv
import pandas as pd
from speach.ttlig import RubyToken

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")

SENTENCE_SPLIT_CHAR = "#"
WORD_SPLIT_CHAR = "|"
READING_SEP_CHAR = ":"
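# A downloaded row's text joins sentences with "#" and words with "|"; each
# word carries an inline tag like 東京[トウキョウ:トーキョー] (a hypothetical
# example: lexeme reading and pronounced form separated by ":").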


def read_bccwj_file(filename: str):
    """ """

    df = pd.read_csv(filename, sep="\t")

    df["前文脈"] = df["前文脈"].fillna("")
    df["後文脈"] = df["後文脈"].fillna("")
    df["full_text"] = (
        df["前文脈"] + df["キー"] + "[" + df["語彙素読み"] + ":" + df["発音形出現形"] + "]" + df["後文脈"]
    )

    def get_sentences(row):
        sentences = row["full_text"].split(SENTENCE_SPLIT_CHAR)
        furigana_sentences = []
        for sentence in sentences:
            words_with_readings = sentence.split(WORD_SPLIT_CHAR)
            furigana_sentence = ""
            for word_with_reading in words_with_readings:
                word = word_with_reading.split("[")[0]
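                # The inline tag holds 語彙素読み:発音形出現形 in katakana;
                # convert to hiragana, so form is the lexeme reading and
                # reading is the pronounced form.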
                form, reading = jaconv.kata2hira(
                    word_with_reading.split("[")[1].split("]")[0]
                ).split(READING_SEP_CHAR)

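                # No furigana needed: the token has no kanji, or its reading
                # is just the surface form in kana.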
                if (
                    not utils.has_kanji(word)
                    or reading == jaconv.kata2hira(word)
                    or form == ""
                    or reading == ""
                ):
                    furigana_sentence += word
                else:
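                    # The pronounced form writes long vowels with ー (e.g.
                    # とーきょー) while the lexeme reading spells them out
                    # (とうきょう); resolve each ー to the vowel it stands for.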
                    if ("ー" in reading) and ("ー" not in form):
                        indexes_of_dash = [
                            pos for pos, char in enumerate(reading) if char == "ー"
                        ]
                        for index_of_dash in indexes_of_dash:
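                            # If the two readings align one-to-one, take the
                            # kana at the dash's position in the lexeme reading.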
                            if len(reading) == len(form):
                                dash_reading = form[index_of_dash]
                            else:
                                char_before_dash = reading[index_of_dash - 1]
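                                # Otherwise infer the vowel from the kana row of
                                # the character before the dash: e-row kana may
                                # lengthen as え or い and o-row as お or う
                                # (disambiguated against form/word); u-row and
                                # i-row are unambiguous.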
                                if char_before_dash in "ねめせぜれてでけげへべぺ":
                                    digraphA = char_before_dash + "え"
                                    digraphB = char_before_dash + "い"
                                    if digraphA in form and digraphB not in form:
                                        dash_reading = "え"
                                    elif digraphB in form and digraphA not in form:
                                        dash_reading = "い"
                                    else:
                                        logger.warning(
                                            f"Leaving dash in {word} {form} {reading}"
                                        )
                                        dash_reading = "ー"
                                elif char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ":
                                    dash_reading = "う"
                                elif char_before_dash in "しじみいきぎひびち":
                                    dash_reading = "い"
                                elif char_before_dash in "そぞのこごもろとどよょおほぼぽ":
                                    digraphA = char_before_dash + "お"
                                    digraphB = char_before_dash + "う"
                                    if digraphA in form and digraphB not in form:
                                        dash_reading = "お"
                                    elif digraphB in form and digraphA not in form:
                                        dash_reading = "う"
                                    else:
                                        if digraphA in word and digraphB not in word:
                                            dash_reading = "お"
                                        elif digraphB in word and digraphA not in word:
                                            dash_reading = "う"
                                        else:
                                            logger.warning(
                                                f"Leaving dash in {word} {form} {reading}"
                                            )
                                            dash_reading = "ー"
                                else:
                                    logger.warning(
                                        f"Leaving dash in {word} {form} {reading}"
                                    )
                                    dash_reading = "ー"
                            reading = (
                                reading[:index_of_dash]
                                + dash_reading
                                + reading[index_of_dash + 1 :]
                            )
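                    # Attach the resolved reading to the word as furigana
                    # markup via speach's RubyToken.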
                    furigana_sentence += RubyToken.from_furi(word, reading).to_code()

            furigana_sentences.append(furigana_sentence)

        furigana_sentences = [
            utils.standardize_text(sentence) for sentence in furigana_sentences
        ]
        sentences = [utils.remove_furigana(sentence) for sentence in furigana_sentences]
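        # Written-corpus downloads label rows with a sample ID; fall back to
        # the talk ID (講演 ID) used by spoken-corpus downloads.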
        try:
            rowid = row["サンプル ID"]
        except KeyError:
            rowid = row["講演 ID"]
        if len(furigana_sentences) == 1:
            ids = [rowid]
        else:
            ids = [rowid + "_" + str(i) for i in range(len(furigana_sentences))]

        sub_df = pd.DataFrame(
            {"sentence": sentences, "furigana": furigana_sentences, "sentenceid": ids}
        )

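        # Keep only sentences that actually gained furigana annotations.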
        sub_df = sub_df[sub_df["sentence"] != sub_df["furigana"]]

        return sub_df

    # DataFrame.append was removed in pandas 2.0; collect and concat instead.
    output_df = pd.concat(
        [get_sentences(row) for _, row in df.iterrows()], ignore_index=True
    )

    return output_df


def bccwj_data():
    """Extract, load and transform the bccwj data"""

    # Extract sentences from the data files
    bccwj_files = list(Path(config.RAW_DATA_DIR, "bccwj").glob("*.txt"))

    df = pd.DataFrame()

    for bccwj_file in bccwj_files:
        logger.info(bccwj_file.name)
        df = pd.concat([df, read_bccwj_file(bccwj_file)])

    # remove known errors
    error_ids = []

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df[df["sentence"] != ""]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)
    assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "bccwj.csv"), index=False)

    logger.info("✅ Saved bccwj data!")


def bccwj_subset(bccwj_file):
    """Extract, load and transform a subset of the bccwj data"""

    df = read_bccwj_file(bccwj_file)

    # remove known errors
    error_ids = []

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)

    # Output
    df.to_csv(
        Path(config.SENTENCE_DATA_DIR, bccwj_file.name.split(".")[0] + ".csv"),
        index=False,
    )

    logger.info("✅ Saved bccwj " + bccwj_file.name.split(".")[0] + " data!")


if __name__ == "__main__":
    bccwj_data()