Spaces:
Build error
Build error
File size: 3,511 Bytes
9aba307 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
"""aozora.py
Data processing script for aozora bunko file from https://github.com/ndl-lab/huriganacorpus-aozora
"""
import warnings
from pathlib import Path
import pandas as pd
from pandas.errors import ParserError
from speach import ttlig
from config import config
from config.config import logger
from yomikata import utils
from yomikata.dataset.repair_long_vowels import repair_long_vowels
# NOTE(review): blanket-suppresses *all* warnings module-wide, not just one
# category — confirm this is intentional and not hiding real issues.
warnings.filterwarnings("ignore")
def read_file(file: Path) -> pd.DataFrame:
    """Parse one aozora furigana-corpus TSV file into a sentence DataFrame.

    Args:
        file: Path to a tab-separated corpus file whose rows are
            ``word<TAB>furigana<TAB>type``. Must be a ``pathlib.Path``
            (its ``.name`` is used to build sentence ids) — the previous
            ``str`` annotation was wrong.

    Returns:
        DataFrame with columns ``sentence``, ``furigana`` and
        ``sentenceid``, deduplicated and restricted to rows whose
        furigana-stripped text matches the sentence.
    """
    # assumes the corpus files are UTF-8 — TODO confirm against the dataset
    with open(file, encoding="utf-8") as f:
        rows = [line.rstrip("\n").rstrip("\r").split("\t")[0:3] for line in f]
    df = pd.DataFrame(rows, columns=["word", "furigana", "type"])

    # remove bookkeeping rows and empty words
    df = df[~df["type"].isin(["[入力 読み]", "分かち書き"])]
    # pd.isna and pd.isnull are aliases; one filter suffices (was duplicated)
    df = df[~pd.isna(df["word"])]
    df = df[df["word"] != ""]

    # now organize remaining rows into sentences
    sentence = ""
    furigana = ""
    sentenceid = None
    gyous = []
    for row in df.itertuples():
        if row.type == "[入力文]":
            sentence = row.word
        elif row.type == "漢字":
            furigana += ttlig.RubyToken.from_furi(
                row.word, repair_long_vowels(row.furigana, row.word)
            ).to_code()
        elif row.word.split(":")[0] == "行番号":
            # a line-number row starts a new sentence: flush the previous one
            # (sentenceid is None only before the first line-number row)
            if sentenceid is not None:
                gyous.append([sentence, furigana, sentenceid])
            sentenceid = file.name + "_" + row.word.split(":")[1].strip()
            sentence = None
            furigana = ""
        else:
            furigana += row.word
    # flush the last sentence
    gyous.append([sentence, furigana, sentenceid])

    # make dataframe (removed dead pre-initialization of gyou_df)
    gyou_df = pd.DataFrame(gyous, columns=["sentence", "furigana", "sentenceid"])
    gyou_df = gyou_df[~pd.isna(gyou_df.sentence)]

    # clean rows
    gyou_df["furigana"] = gyou_df["furigana"].apply(utils.standardize_text)
    gyou_df["sentence"] = gyou_df["sentence"].apply(
        lambda s: utils.standardize_text(
            s.replace("|", "").replace(" ", "").replace("※", "")
        )
    )

    # remove rows where stripping furigana does not reproduce the sentence
    gyou_df = gyou_df[
        gyou_df["sentence"] == gyou_df["furigana"].apply(utils.remove_furigana)
    ]

    # remove known-bad sentence ids (none recorded at present)
    error_ids = []
    gyou_df = gyou_df[~gyou_df["sentenceid"].isin(error_ids)]

    # remove duplicates
    return gyou_df.drop_duplicates()
def aozora_data():
    """Extract, load and transform the aozora data.

    Walks every raw aozora corpus file, converts each with
    ``read_file``, and appends the results to
    ``SENTENCE_DATA_DIR/aozora.csv`` (header written once up front).
    Files that fail to parse are logged and skipped.
    """
    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "aozora").glob("*/*/*.txt"))
    with open(Path(config.SENTENCE_DATA_DIR, "aozora.csv"), "w") as f:
        f.write("sentence,furigana,sentenceid\n")
    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            # BUG FIX: without this `continue` the loop fell through and
            # re-wrote the previous file's df (or raised NameError when
            # the very first file failed to parse).
            continue
        df.to_csv(
            Path(config.SENTENCE_DATA_DIR, "aozora.csv"),
            mode="a",
            index=False,
            header=False,
        )
    logger.info("✅ Saved all aozora data!")
# Script entry point: run the full aozora extract/transform pipeline.
if __name__ == "__main__":
    aozora_data()
|