# Provenance: Sam Passaglia, "initial commit" (9aba307), 1.17 kB
"""ndlbib.py
Data processing script for ndlbib sentence file from https://github.com/ndl-lab/huriganacorpus-ndlbib
"""
import warnings
from pathlib import Path
from pandas.errors import ParserError
from config import config
from config.config import logger
from yomikata.dataset.aozora import read_file
# ndlbib and aozora use the same file structure, so the aozora reader is reused here
warnings.filterwarnings("ignore")
def ndlbib_data():
    """Extract, load and transform the ndlbib data.

    Reads every ``*.txt`` file under ``config.RAW_DATA_DIR / "shosi"`` with
    ``read_file`` and appends the resulting rows to
    ``config.SENTENCE_DATA_DIR / "ndlbib.csv"``. The CSV is recreated on every
    run and begins with the header line ``sentence,furigana,sentenceid``.

    Files for which ``read_file`` raises ``ParserError`` are logged and
    skipped; they contribute no rows to the output.
    """
    output_path = Path(config.SENTENCE_DATA_DIR, "ndlbib.csv")

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "shosi").glob("*.txt"))

    # Start from a fresh file holding only the header line; each parsed file
    # is appended below without a header. UTF-8 matches the default encoding
    # that DataFrame.to_csv uses for the appended rows.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            # Skip this file: falling through would either hit an unbound
            # `df` (first file) or re-append the previous file's rows.
            continue

        df.to_csv(output_path, mode="a", index=False, header=False)

    logger.info("✅ Saved ndlbib data!")
# Script entry point: build the ndlbib sentence CSV when this file is run directly.
if __name__ == "__main__":
    ndlbib_data()