Spaces:

passaglia
/

yomikata-demo

Build error

yomikata-demo / yomikata /dataset /aozora.py

Sam Passaglia

initial commit

9aba307 over 1 year ago

3.51 kB

	"""aozora.py
	Data processing script for aozora bunko file from https://github.com/ndl-lab/huriganacorpus-aozora
	"""

	import warnings
	from pathlib import Path

	import pandas as pd
	from pandas.errors import ParserError
	from speach import ttlig

	from config import config
	from config.config import logger
	from yomikata import utils
	from yomikata.dataset.repair_long_vowels import repair_long_vowels

	warnings.filterwarnings("ignore")


	def read_file(file: str):
	# logger.info("reading file")
	with open(file) as f:
	rows = [
	line.rstrip("\n").rstrip("\r").split("\t")[0:3] for line in f.readlines()
	]
	df = pd.DataFrame(rows, columns=["word", "furigana", "type"])

	# logger.info("removing unused rows")
	# remove unused rows
	df = df[~df["type"].isin(["[入力読み]", "分かち書き"])]
	df = df[~pd.isna(df["word"])]
	df = df[~pd.isnull(df["word"])]
	df = df[df["word"] != ""]

	# logger.info("organizing into sentences")
	# now organize remaining rows into sentences
	gyou_df = pd.DataFrame(columns=["sentence", "furigana", "sentenceid"])
	sentence = ""
	furigana = ""
	sentenceid = None
	gyous = []
	for row in df.itertuples():
	if row.type in ["[入力文]"]:
	sentence = row.word
	elif row.type in ["漢字"]:
	furigana += ttlig.RubyToken.from_furi(
	row.word, repair_long_vowels(row.furigana, row.word)
	).to_code()
	elif row.word.split(":")[0] in ["行番号"]:
	if sentenceid: # this handles the first row
	gyous.append([sentence, furigana, sentenceid])
	sentenceid = file.name + "_" + row.word.split(":")[1].strip()
	sentence = None
	furigana = ""
	else:
	furigana += row.word

	# last row handling
	gyous.append([sentence, furigana, sentenceid])

	# make dataframe
	gyou_df = pd.DataFrame(gyous, columns=["sentence", "furigana", "sentenceid"])
	gyou_df = gyou_df[~pd.isna(gyou_df.sentence)]

	# logger.info("cleaning rows")
	# clean rows
	gyou_df["furigana"] = gyou_df["furigana"].apply(utils.standardize_text)
	gyou_df["sentence"] = gyou_df["sentence"].apply(
	lambda s: utils.standardize_text(
	s.replace("\|", "").replace(" ", "").replace("※", "")
	)
	)

	# logger.info("removing errors")
	# remove non-matching rows
	gyou_df = gyou_df[
	gyou_df["sentence"] == gyou_df["furigana"].apply(utils.remove_furigana)
	]

	# remove known errors
	error_ids = []
	gyou_df = gyou_df[~gyou_df["sentenceid"].isin(error_ids)]

	# remove duplicates
	gyou_df = gyou_df.drop_duplicates()

	return gyou_df


	def aozora_data():
	"""Extract, load and transform the aozora data"""

	# Extract sentences from the data files
	files = list(Path(config.RAW_DATA_DIR, "aozora").glob("//*.txt"))

	with open(Path(config.SENTENCE_DATA_DIR, "aozora.csv"), "w") as f:
	f.write("sentence,furigana,sentenceid\n")

	for i, file in enumerate(files):
	logger.info(f"{i+1}/{len(files)} {file.name}")
	try:
	df = read_file(file)
	except ParserError:
	logger.error(f"Parser error on {file}")

	df.to_csv(
	Path(config.SENTENCE_DATA_DIR, "aozora.csv"),
	mode="a",
	index=False,
	header=False,
	)

	logger.info("✅ Saved all aozora data!")


	if __name__ == "__main__":
	aozora_data()