"""bccwj.py
Data processing script for files downloaded from Chuunagon search
Chuunagon URL: https://chunagon.ninjal.ac.jp/
Download with the settings
文脈中の区切り記号 |
文脈中の文区切り記号 #
前後文脈の語数 10
検索対象(固定長・可変長) 両方
共起条件の範囲 文境界をまたがない
ダウンロードオプション
システム Linux
文字コード UTF-8
改行コード LF
出力ファイルが一つの場合は Zip 圧縮を行わない 検索条件式ごとに出力ファイルを分割する
インラインタグを使用 CHECK BOTH 語彙素読み AND 発音形出現形語種 BOX
(発音形出現形 is the actual pronounced one, but displays e.g. よう れい as よー れー)
タグの区切り記号 :
"""
import warnings
from pathlib import Path

import jaconv
import pandas as pd
from speach.ttlig import RubyToken

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")

# Delimiters chosen in the Chuunagon download settings (see module docstring)
SENTENCE_SPLIT_CHAR = "#"
WORD_SPLIT_CHAR = "|"
READING_SEP_CHAR = ":"
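
# With these settings, each row's reconstructed full_text looks like, e.g.
# (hypothetical token values):
#   これ[コレ:コレ]|は|東京[トウキョウ:トーキョー]|です#...
# i.e. words separated by |, sentences by #, and each word carrying an inline
# [語彙素読み:発音形出現形] tag.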


def read_bccwj_file(filename: str) -> pd.DataFrame:
    """Parse one Chuunagon export file into a DataFrame with one row per
    sentence, holding the plain sentence, its furigana-annotated form, and a
    sentence id."""
df = pd.read_csv(filename, sep="\t")
df["前文脈"] = df["前文脈"].fillna("")
df["後文脈"] = df["後文脈"].fillna("")
df["full_text"] = (
df["前文脈"] + df["キー"] + "[" + df["語彙素読み"] + ":" + df["発音形出現形"] + "]" + df["後文脈"]
)

    def get_sentences(row):
        """Split one row's full_text into furigana-annotated sentences."""
sentences = row["full_text"].split(SENTENCE_SPLIT_CHAR)
furigana_sentences = []
for sentence in sentences:
words_with_readings = sentence.split(WORD_SPLIT_CHAR)
furigana_sentence = ""
            for word_with_reading in words_with_readings:
                # Each token looks like 表層形[語彙素読み:発音形出現形];
                # `form` is the lexeme reading and `reading` the pronounced
                # form, both converted to hiragana
                word = word_with_reading.split("[")[0]
                form, reading = jaconv.kata2hira(
                    word_with_reading.split("[")[1].split("]")[0]
                ).split(READING_SEP_CHAR)
                # Skip furigana when it would add nothing: kanji-free words,
                # readings identical to the word itself, or empty tags
                if (
                    not utils.has_kanji(word)
                    or reading == jaconv.kata2hira(word)
                    or form == ""
                    or reading == ""
                ):
                    furigana_sentence += word
else:
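                    # The pronounced form writes long vowels as ー; resolve each
                    # dash to an explicit vowel so the furigana is fully spelled out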
if ("ー" in reading) and ("ー" not in form):
indexes_of_dash = [
pos for pos, char in enumerate(reading) if char == "ー"
]
for index_of_dash in indexes_of_dash:
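                            # If the readings have equal length, the lexeme
                            # reading holds the explicit vowel at the dash
                            # position; otherwise infer it from the preceding kana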
if len(reading) == len(form):
dash_reading = form[index_of_dash]
else:
char_before_dash = reading[index_of_dash - 1]
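                                # After an e-vowel kana, ー can stand for え or
                                # い; let the lexeme reading (form) disambiguate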
if char_before_dash in "ねめせぜれてでけげへべぺ":
digraphA = char_before_dash + "え"
digraphB = char_before_dash + "い"
if digraphA in form and digraphB not in form:
dash_reading = "え"
elif digraphB in form and digraphA not in form:
dash_reading = "い"
else:
logger.warning(
f"Leaving dash in {word} {form} {reading}"
)
dash_reading = "ー"
elif char_before_dash in "ぬつづむるくぐすずゆゅふぶぷ":
dash_reading = "う"
elif char_before_dash in "しじみいきぎひびち":
dash_reading = "い"
elif char_before_dash in "そぞのこごもろとどよょおほぼぽ":
digraphA = char_before_dash + "お"
digraphB = char_before_dash + "う"
if digraphA in form and digraphB not in form:
dash_reading = "お"
elif digraphB in form and digraphA not in form:
dash_reading = "う"
else:
if digraphA in word and digraphB not in word:
dash_reading = "お"
elif digraphB in word and digraphA not in word:
dash_reading = "う"
else:
logger.warning(
f"Leaving dash in {word} {form} {reading}"
)
dash_reading = "ー"
else:
logger.warning(
f"Leaving dash in {word} {form} {reading}"
)
dash_reading = "ー"
reading = (
reading[:index_of_dash]
+ dash_reading
+ reading[index_of_dash + 1 :]
)
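                    # e.g. (hypothetical): word="東京" gives form="とうきょう",
                    # reading="とーきょー"; the lengths match, so the dashes at
                    # positions 1 and 4 both take form's "う", and the word is
                    # annotated with the furigana とうきょう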
furigana_sentence += RubyToken.from_furi(word, reading).to_code()
furigana_sentences.append(furigana_sentence)
furigana_sentences = [
utils.standardize_text(sentence) for sentence in furigana_sentences
]
sentences = [utils.remove_furigana(sentence) for sentence in furigana_sentences]
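        # Row id column: written-corpus exports use サンプル ID, spoken-corpus
        # exports use 講演 ID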
try:
rowid = row["サンプル ID"]
except KeyError:
rowid = row["講演 ID"]
if len(furigana_sentences) == 1:
ids = [rowid]
else:
ids = [rowid + "_" + str(i) for i in range(len(furigana_sentences))]
sub_df = pd.DataFrame(
{"sentence": sentences, "furigana": furigana_sentences, "sentenceid": ids}
)
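        # Keep only sentences that actually gained furigana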
sub_df = sub_df[sub_df["sentence"] != sub_df["furigana"]]
return sub_df
    # Parse every row and stack the per-row sentence tables (DataFrame.append
    # was removed in pandas 2.0, so collect and concatenate instead)
    sub_dfs = [get_sentences(row) for _, row in df.iterrows()]
    return pd.concat(sub_dfs, ignore_index=True) if sub_dfs else pd.DataFrame()


def bccwj_data():
    """Extract, load, and transform the bccwj data"""

    # Extract sentences from the data files
bccwj_files = list(Path(config.RAW_DATA_DIR, "bccwj").glob("*.txt"))
df = pd.DataFrame()
for bccwj_file in bccwj_files:
logger.info(bccwj_file.name)
df = pd.concat([df, read_bccwj_file(bccwj_file)])
    # Remove known-bad sentence ids (none recorded at present)
    error_ids = []
    df = df[~df["sentenceid"].isin(error_ids)]
df = df[df["sentence"] != ""]
df = df.drop_duplicates()
df["furigana"] = df["furigana"].apply(utils.standardize_text)
df["sentence"] = df["sentence"].apply(utils.standardize_text)
assert (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all()
# Output
df.to_csv(Path(config.SENTENCE_DATA_DIR, "bccwj.csv"), index=False)
logger.info("✅ Saved bccwj data!")


def bccwj_subset(bccwj_file: Path):
    """Extract, load, and transform a subset of the bccwj data"""
df = read_bccwj_file(bccwj_file)
    # Remove known-bad sentence ids (none recorded at present)
    error_ids = []
df = df[~df["sentenceid"].isin(error_ids)]
df = df.drop_duplicates()
df["furigana"] = df["furigana"].apply(utils.standardize_text)
df["sentence"] = df["sentence"].apply(utils.standardize_text)
# Output
df.to_csv(
Path(config.SENTENCE_DATA_DIR, bccwj_file.name.split(".")[0] + ".csv"),
index=False,
)
logger.info("✅ Saved bccwj " + bccwj_file.name.split(".")[0] + " data!")


if __name__ == "__main__":
bccwj_data()