"""bccwj.py Data processing script for files downloaded from Chuunagon search Chuunagon URL: https://chunagon.ninjal.ac.jp/ Download with the settings 文脈中の区切り記号 | 文脈中の文区切り記号 # 前後文脈の語数 10 検索対象(固定長・可変長) 両方 共起条件の範囲 文境界をまたがない ダウンロードオプション システム Linux 文字コード UTF-8 改行コード LF 出力ファイルが一つの場合は Zip 圧縮を行わない 検索条件式ごとに出力ファイルを分割する インラインタグを使用 CHECK BOTH 語彙素読み AND 発音形出現形語種 BOX (発音形出現形 is the actual pronounced one, but displays e.g. よう れい as よー れー) タグの区切り記号 : """ import warnings from pathlib import Path import jaconv import pandas as pd from speach.ttlig import RubyToken from config import config from config.config import logger from yomikata import utils warnings.filterwarnings("ignore") SENTENCE_SPLIT_CHAR = "#" WORD_SPLIT_CHAR = "|" READING_SEP_CHAR = ":" def read_bccwj_file(filename: str): """ """ df = pd.read_csv(filename, sep="\t") df["前文脈"] = df["前文脈"].fillna("") df["後文脈"] = df["後文脈"].fillna("") df["full_text"] = ( df["前文脈"] + df["キー"] + "[" + df["語彙素読み"] + ":" + df["発音形出現形"] + "]" + df["後文脈"] ) def get_sentences(row): sentences = row["full_text"].split(SENTENCE_SPLIT_CHAR) furigana_sentences = [] for sentence in sentences: words_with_readings = sentence.split(WORD_SPLIT_CHAR) furigana_sentence = "" for word_with_reading in words_with_readings: word = word_with_reading.split("[")[0] form, reading = jaconv.kata2hira( word_with_reading.split("[")[1].split("]")[0] ).split(READING_SEP_CHAR) if ( not utils.has_kanji(word) or reading == jaconv.kata2hira(word) or form == "" or reading == "" ): furigana_sentence += word else: if ("ー" in reading) and ("ー" not in form): indexes_of_dash = [ pos for pos, char in enumerate(reading) if char == "ー" ] for index_of_dash in indexes_of_dash: if len(reading) == len(form): dash_reading = form[index_of_dash] else: char_before_dash = reading[index_of_dash - 1] if char_before_dash in "ねめせぜれてでけげへべぺ": digraphA = char_before_dash + "え" digraphB = char_before_dash + "い" if digraphA in form and digraphB not in form: dash_reading = "え" elif digraphB in form and digraphA not in form: dash_reading = "い" else: logger.warning( f"Leaving dash in {word} {form} {reading}" ) dash_reading = "ー" elif char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ": dash_reading = "う" elif char_before_dash in "しじみいきぎひびち": dash_reading = "い" elif char_before_dash in "そぞのこごもろとどよょおほぼぽ": digraphA = char_before_dash + "お" digraphB = char_before_dash + "う" if digraphA in form and digraphB not in form: dash_reading = "お" elif digraphB in form and digraphA not in form: dash_reading = "う" else: if digraphA in word and digraphB not in word: dash_reading = "お" elif digraphB in word and digraphA not in word: dash_reading = "う" else: logger.warning( f"Leaving dash in {word} {form} {reading}" ) dash_reading = "ー" else: logger.warning( f"Leaving dash in {word} {form} {reading}" ) dash_reading = "ー" reading = ( reading[:index_of_dash] + dash_reading + reading[index_of_dash + 1 :] ) furigana_sentence += RubyToken.from_furi(word, reading).to_code() furigana_sentences.append(furigana_sentence) furigana_sentences = [ utils.standardize_text(sentence) for sentence in furigana_sentences ] sentences = [utils.remove_furigana(sentence) for sentence in furigana_sentences] try: rowid = row["サンプル ID"] except KeyError: rowid = row["講演 ID"] if len(furigana_sentences) == 1: ids = [rowid] else: ids = [rowid + "_" + str(i) for i in range(len(furigana_sentences))] sub_df = pd.DataFrame( {"sentence": sentences, "furigana": furigana_sentences, "sentenceid": ids} ) sub_df = sub_df[sub_df["sentence"] != sub_df["furigana"]] return sub_df output_df = 
def bccwj_data():
    """Extract, load and transform the bccwj data"""

    # Extract sentences from the data files
    bccwj_files = list(Path(config.RAW_DATA_DIR, "bccwj").glob("*.txt"))

    df = pd.DataFrame()
    for bccwj_file in bccwj_files:
        logger.info(bccwj_file.name)
        df = pd.concat([df, read_bccwj_file(bccwj_file)])

    # Remove known errors
    error_ids = []
    df = df[~df["sentenceid"].isin(error_ids)]
    df = df[df["sentence"] != ""]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)
    # Sanity check: stripping furigana must recover the plain sentence.
    assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "bccwj.csv"), index=False)

    logger.info("✅ Saved bccwj data!")


def bccwj_subset(bccwj_file):
    """Extract, load and transform a subset of the bccwj data"""

    df = read_bccwj_file(bccwj_file)

    # Remove known errors
    error_ids = []
    df = df[~df["sentenceid"].isin(error_ids)]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)

    # Output
    df.to_csv(
        Path(config.SENTENCE_DATA_DIR, bccwj_file.name.split(".")[0] + ".csv"),
        index=False,
    )

    logger.info("✅ Saved bccwj " + bccwj_file.name.split(".")[0] + " data!")


if __name__ == "__main__":
    bccwj_data()
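
# Example usage (the file name below is an assumption; adjust to your data):
#   python bccwj.py
# processes every *.txt in RAW_DATA_DIR/bccwj into SENTENCE_DATA_DIR/bccwj.csv.
# To process a single Chunagon download instead:
#   from pathlib import Path
#   from bccwj import bccwj_subset
#   bccwj_subset(Path(config.RAW_DATA_DIR, "bccwj", "my_search.txt"))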