"""aozora.py

Data processing script for aozora bunko file from https://github.com/ndl-lab/huriganacorpus-aozora
"""
import warnings
from pathlib import Path

import pandas as pd
from pandas.errors import ParserError
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils
from yomikata.dataset.repair_long_vowels import repair_long_vowels

warnings.filterwarnings("ignore")
def read_file(file: Path) -> pd.DataFrame:
    """Parse one aozora furigana-corpus file into a sentence-level DataFrame.

    Args:
        file: Path to a tab-separated corpus file whose lines are
            ``word<TAB>furigana<TAB>type`` records.

    Returns:
        DataFrame with columns ``sentence``, ``furigana`` and
        ``sentenceid``, one row per corpus line, cleaned and deduplicated.
    """
    # Keep only the first three tab-separated fields of each line.
    # The corpus is UTF-8; pin the encoding rather than relying on the locale.
    with open(file, encoding="utf-8") as f:
        rows = [line.rstrip("\n").rstrip("\r").split("\t")[0:3] for line in f]
    df = pd.DataFrame(rows, columns=["word", "furigana", "type"])

    # Remove rows that carry no usable reading information.
    df = df[~df["type"].isin(["[入力読み]", "分かち書き"])]
    df = df[~pd.isna(df["word"])]
    df = df[~pd.isnull(df["word"])]
    df = df[df["word"] != ""]

    # Reassemble the remaining word-level rows into full sentences.
    sentence = ""
    furigana = ""
    sentenceid = None
    gyous = []
    for row in df.itertuples():
        if row.type in ["[入力文]"]:
            # Raw input-sentence marker: remember the surface text.
            sentence = row.word
        elif row.type in ["漢字"]:
            # Kanji token: encode its (repaired) reading as a ruby tag.
            furigana += ttlig.RubyToken.from_furi(
                row.word, repair_long_vowels(row.furigana, row.word)
            ).to_code()
        elif row.word.split(":")[0] in ["行番号"]:
            # Line-number marker: flush the sentence accumulated so far.
            if sentenceid:  # this handles the first row
                gyous.append([sentence, furigana, sentenceid])
            sentenceid = file.name + "_" + row.word.split(":")[1].strip()
            sentence = None
            furigana = ""
        else:
            # Non-kanji token: no ruby needed, copy the surface form.
            furigana += row.word
    # Flush the last sentence (no trailing line-number marker follows it).
    gyous.append([sentence, furigana, sentenceid])

    # Make dataframe; rows flushed before any sentence was seen have
    # sentence=None and are dropped.
    gyou_df = pd.DataFrame(gyous, columns=["sentence", "furigana", "sentenceid"])
    gyou_df = gyou_df[~pd.isna(gyou_df.sentence)]

    # Clean rows: normalize text, strip markup characters from the sentence.
    gyou_df["furigana"] = gyou_df["furigana"].apply(utils.standardize_text)
    gyou_df["sentence"] = gyou_df["sentence"].apply(
        lambda s: utils.standardize_text(
            s.replace("|", "").replace(" ", "").replace("※", "")
        )
    )

    # Remove non-matching rows: the furigana stripped of readings must
    # reproduce the plain sentence exactly.
    gyou_df = gyou_df[
        gyou_df["sentence"] == gyou_df["furigana"].apply(utils.remove_furigana)
    ]

    # Remove known errors (none currently listed).
    error_ids = []
    gyou_df = gyou_df[~gyou_df["sentenceid"].isin(error_ids)]

    # Remove duplicates.
    return gyou_df.drop_duplicates()
def aozora_data():
    """Extract, load and transform the aozora data.

    Reads every corpus file under ``RAW_DATA_DIR/aozora`` and appends the
    parsed sentences to ``SENTENCE_DATA_DIR/aozora.csv``.
    """
    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "aozora").glob("*/*/*.txt"))

    # Write the CSV header once; each file's rows are appended below it.
    with open(Path(config.SENTENCE_DATA_DIR, "aozora.csv"), "w") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            # Skip this file entirely. Without the continue, the stale `df`
            # from the previous iteration was appended again (or a NameError
            # was raised if the very first file failed to parse).
            continue
        df.to_csv(
            Path(config.SENTENCE_DATA_DIR, "aozora.csv"),
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved all aozora data!")
# Script entry point: run the full aozora ETL when invoked directly.
if __name__ == "__main__":
    aozora_data()