Spaces:
Build error
Build error
File size: 1,165 Bytes
9aba307 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
"""ndlbib.py
Data processing script for ndlbib sentence file from https://github.com/ndl-lab/huriganacorpus-ndlbib
"""
import warnings
from pathlib import Path
from pandas.errors import ParserError
from config import config
from config.config import logger
from yomikata.dataset.aozora import read_file
# ndlbib and aozora use the same file structure, so the aozora reader is reused here
warnings.filterwarnings("ignore")
def ndlbib_data():
    """Extract, load and transform the ndlbib data.

    Every ``*.txt`` file found under ``config.RAW_DATA_DIR / "shosi"`` is
    parsed with ``read_file`` and the result is appended to
    ``config.SENTENCE_DATA_DIR / "ndlbib.csv"``. The output file is recreated
    on every run and starts with the header ``sentence,furigana,sentenceid``.

    Files for which ``read_file`` raises ``ParserError`` are logged and
    skipped; they contribute no rows to the output.

    NOTE(review): this assumes ``read_file`` returns a pandas DataFrame whose
    columns are in the same order as the header written here -- confirm
    against ``yomikata.dataset.aozora.read_file``.
    """
    output_path = Path(config.SENTENCE_DATA_DIR, "ndlbib.csv")

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "shosi").glob("*.txt"))

    # Start from a fresh file holding only the header; rows are appended below.
    with open(output_path, "w") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            # Skip the write: `df` is either unbound (first file) or still
            # holds the previous file's rows, which would be appended twice.
            continue

        df.to_csv(
            output_path,
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved ndlbib data!")
# Run the ndlbib extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    ndlbib_data()
|