Spaces:
Build error
Build error
"""ndlbib.py | |
Data processing script for ndlbib sentence file from https://github.com/ndl-lab/huriganacorpus-ndlbib | |
""" | |
import warnings | |
from pathlib import Path | |
from pandas.errors import ParserError | |
from config import config | |
from config.config import logger | |
from yomikata.dataset.aozora import read_file | |
# ndlbib and aozora use same file structure | |
warnings.filterwarnings("ignore") | |
def ndlbib_data(): | |
"""Extract, load and transform the ndlbib data""" | |
# Extract sentences from the data files | |
files = list(Path(config.RAW_DATA_DIR, "shosi").glob("*.txt")) | |
with open(Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"), "w") as f: | |
f.write("sentence,furigana,sentenceid\n") | |
for i, file in enumerate(files): | |
logger.info(f"{i+1}/{len(files)} {file.name}") | |
try: | |
df = read_file(file) | |
except ParserError: | |
logger.error(f"Parser error on {file}") | |
df.to_csv( | |
Path(config.SENTENCE_DATA_DIR, "ndlbib.csv"), | |
mode="a", | |
index=False, | |
header=False, | |
) | |
logger.info("✅ Saved ndlbib data!") | |
if __name__ == "__main__": | |
ndlbib_data() | |