"""bccwj.py
Data processing script for files downloaded from Chuunagon search
Chuunagon URL: https://chunagon.ninjal.ac.jp/
Download with the settings
文脈中の区切り記号 |
文脈中の文区切り記号 #
前後文脈の語数 10
検索対象(固定長・可変長) 両方
共起条件の範囲 文境界をまたがない
ダウンロードオプション
システム Linux
文字コード UTF-8
改行コード LF
出力ファイルが一つの場合は Zip 圧縮を行わない 検索条件式ごとに出力ファイルを分割する
インラインタグを使用 CHECK BOTH 語彙素読み AND 発音形出現形語種 BOX
(発音形出現形 is the actual pronounced one, but displays e.g. よう れい as よー れー)
タグの区切り記号 :
"""
import warnings
from pathlib import Path

import jaconv
import pandas as pd
from speach.ttlig import RubyToken

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")

SENTENCE_SPLIT_CHAR = "#"
WORD_SPLIT_CHAR = "|"
READING_SEP_CHAR = ":"
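
# Illustrative only (hypothetical tokens, not drawn from the corpus): with the
# settings above, every word in a downloaded line carries an inline
# [語彙素読み:発音形出現形] tag, e.g.
#   これ[コレ:コレ]|は[ハ:ワ]|用例[ヨウレイ:ヨーレー]|です[デス:デス]|。#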


def read_bccwj_file(filename: str):
    """Parse one Chuunagon export file into a DataFrame of (sentence, furigana, sentenceid) rows."""
df = pd.read_csv(filename, sep="\t")
df["前文脈"] = df["前文脈"].fillna("")
df["後文脈"] = df["後文脈"].fillna("")
df["full_text"] = (
df["前文脈"] + df["キー"] + "[" + df["語彙素読み"] + ":" + df["発音形出現形"] + "]" + df["後文脈"]
)

    def get_sentences(row):
        """Split one result row's full_text into standardized furigana sentences."""
sentences = row["full_text"].split(SENTENCE_SPLIT_CHAR)
furigana_sentences = []
for sentence in sentences:
words_with_readings = sentence.split(WORD_SPLIT_CHAR)
furigana_sentence = ""
for word_with_reading in words_with_readings:
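                # Each token looks like 語[ヨミ:ハツオン]; split off the surface
                # form, then convert the two tag readings to hiragana.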
word = word_with_reading.split("[")[0]
form, reading = jaconv.kata2hira(
word_with_reading.split("[")[1].split("]")[0]
).split(READING_SEP_CHAR)
if (
not utils.has_kanji(word)
or reading == jaconv.kata2hira(word)
or form == ""
or reading == ""
):
furigana_sentence += word
else:
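                    # 発音形出現形 writes long vowels with ー (e.g. ヨウ → ヨー).
                    # Resolve each ー to an actual vowel kana so the furigana is
                    # written the way the word is spelled, using 語彙素読み (form)
                    # to disambiguate.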
if ("ー" in reading) and ("ー" not in form):
indexes_of_dash = [
pos for pos, char in enumerate(reading) if char == "ー"
]
for index_of_dash in indexes_of_dash:
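                            # If the two readings align one-to-one, the vowel can
                            # be copied straight from 語彙素読み at the same index.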
if len(reading) == len(form):
dash_reading = form[index_of_dash]
else:
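                                # Otherwise infer the vowel from the kana before the
                                # ー: え-row kana may lengthen as え or い, お-row as
                                # お or う; pick whichever digraph actually occurs in
                                # 語彙素読み (or, failing that, the surface form).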
char_before_dash = reading[index_of_dash - 1]
if char_before_dash in "ねめせぜれてでけげへべぺ":
digraphA = char_before_dash + "え"
digraphB = char_before_dash + "い"
if digraphA in form and digraphB not in form:
dash_reading = "え"
elif digraphB in form and digraphA not in form:
dash_reading = "い"
else:
logger.warning(
f"Leaving dash in {word} {form} {reading}"
)
dash_reading = "ー"
elif char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ":
dash_reading = "う"
elif char_before_dash in "しじみいきぎひびち":
dash_reading = "い"
elif char_before_dash in "そぞのこごもろとどよょおほぼぽ":
digraphA = char_before_dash + "お"
digraphB = char_before_dash + "う"
if digraphA in form and digraphB not in form:
dash_reading = "お"
elif digraphB in form and digraphA not in form:
dash_reading = "う"
else:
if digraphA in word and digraphB not in word:
dash_reading = "お"
elif digraphB in word and digraphA not in word:
dash_reading = "う"
else:
logger.warning(
f"Leaving dash in {word} {form} {reading}"
)
dash_reading = "ー"
else:
logger.warning(
f"Leaving dash in {word} {form} {reading}"
)
dash_reading = "ー"
reading = (
reading[:index_of_dash]
+ dash_reading
+ reading[index_of_dash + 1 :]
)
furigana_sentence += RubyToken.from_furi(word, reading).to_code()
furigana_sentences.append(furigana_sentence)
furigana_sentences = [
utils.standardize_text(sentence) for sentence in furigana_sentences
]
sentences = [utils.remove_furigana(sentence) for sentence in furigana_sentences]
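        # BCCWJ exports key rows by サンプル ID; spoken-corpus (CSJ) exports
        # use 講演 ID instead.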
try:
rowid = row["サンプル ID"]
except KeyError:
rowid = row["講演 ID"]
if len(furigana_sentences) == 1:
ids = [rowid]
else:
ids = [rowid + "_" + str(i) for i in range(len(furigana_sentences))]
sub_df = pd.DataFrame(
{"sentence": sentences, "furigana": furigana_sentences, "sentenceid": ids}
)
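        # Keep only sentences that actually gained furigana annotation.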
sub_df = sub_df[sub_df["sentence"] != sub_df["furigana"]]
return sub_df

    # DataFrame.append was removed in pandas 2.x; collect per-row frames instead.
    output_df = pd.concat(
        [get_sentences(row) for _, row in df.iterrows()],
        ignore_index=True,
    )
    return output_df


def bccwj_data():
"""Extract, load and transform the bccwj data"""
# Extract sentences from the data files
bccwj_files = list(Path(config.RAW_DATA_DIR, "bccwj").glob("*.txt"))
df = pd.DataFrame()
for bccwj_file in bccwj_files:
logger.info(bccwj_file.name)
df = pd.concat([df, read_bccwj_file(bccwj_file)])
    # remove known errors (add bad sentence IDs here as they are discovered)
    error_ids = []
df = df[~df["sentenceid"].isin(error_ids)]
df = df[df["sentence"] != ""]
df = df.drop_duplicates()
df["furigana"] = df["furigana"].apply(utils.standardize_text)
df["sentence"] = df["sentence"].apply(utils.standardize_text)
assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()
# Output
df.to_csv(Path(config.SENTENCE_DATA_DIR, "bccwj.csv"), index=False)
logger.info("✅ Saved bccwj data!")


def bccwj_subset(bccwj_file):
"""Extract, load and transform a subset of the bccwj data"""
df = read_bccwj_file(bccwj_file)
    # remove known errors (add bad sentence IDs here as they are discovered)
    error_ids = []
df = df[~df["sentenceid"].isin(error_ids)]
df = df.drop_duplicates()
df["furigana"] = df["furigana"].apply(utils.standardize_text)
df["sentence"] = df["sentence"].apply(utils.standardize_text)
# Output
df.to_csv(
Path(config.SENTENCE_DATA_DIR, bccwj_file.name.split(".")[0] + ".csv"),
index=False,
)
logger.info("✅ Saved bccwj " + bccwj_file.name.split(".")[0] + " data!")


if __name__ == "__main__":
bccwj_data()