Spaces:

passaglia
/

yomikata-demo

Build error

File size: 1,165 Bytes

9aba307

"""ndlbib.py
Data processing script for ndlbib sentence file from https://github.com/ndl-lab/huriganacorpus-ndlbib
"""

import warnings
from pathlib import Path

from pandas.errors import ParserError

from config import config
from config.config import logger
from yomikata.dataset.aozora import read_file

# The ndlbib and aozora corpora use the same file structure, which is why
# aozora's read_file is imported above and reused for ndlbib.

# Suppress every warning category process-wide; this takes effect as soon as
# the module is imported, not only when it is run as a script.
warnings.filterwarnings("ignore")


def ndlbib_data():
    """Extract, load and transform the ndlbib data.

    Every ``*.txt`` file under ``config.RAW_DATA_DIR / "shosi"`` is parsed
    with ``read_file`` and its rows are appended to
    ``config.SENTENCE_DATA_DIR / "ndlbib.csv"``. The output file is recreated
    on each run with a ``sentence,furigana,sentenceid`` header line.

    Files that raise ``pandas.errors.ParserError`` are logged and skipped;
    nothing is written for them.
    """
    output_path = Path(config.SENTENCE_DATA_DIR, "ndlbib.csv")

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "shosi").glob("*.txt"))

    # Start from a fresh file holding only the header. The encoding is spelled
    # out so the header matches the UTF-8 default that to_csv uses for the
    # appended rows, regardless of the platform locale.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            # presumably returns a pandas DataFrame whose columns match the
            # header written above -- confirm against aozora.read_file
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            # Skip this file. Without this, `df` would be unbound on the first
            # iteration (NameError) or still hold the previous file's rows,
            # which would then be appended a second time.
            continue

        df.to_csv(
            output_path,
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved ndlbib data!")


# Run the ndlbib extraction when this file is executed directly as a script.
if __name__ == "__main__":
    ndlbib_data()