File size: 1,165 Bytes
9aba307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""ndlbib.py
Data processing script for ndlbib sentence file from https://github.com/ndl-lab/huriganacorpus-ndlbib
"""

import warnings
from pathlib import Path

from pandas.errors import ParserError

from config import config
from config.config import logger
from yomikata.dataset.aozora import read_file

# ndlbib and aozora use the same file structure, so aozora's read_file is reused here

# Module-level side effect: installs an "ignore" filter with no category or
# module restriction, so all warnings are suppressed once this module is loaded.
warnings.filterwarnings("ignore")


def ndlbib_data():
    """Extract, load and transform the ndlbib data.

    Collects every ``*.txt`` file under ``config.RAW_DATA_DIR / "shosi"``,
    parses each one with ``read_file`` and appends the result to
    ``config.SENTENCE_DATA_DIR / "ndlbib.csv"``.

    The output file is (re)created on every call and starts with the header
    row ``sentence,furigana,sentenceid``. A file that raises ``ParserError``
    while being read is logged and skipped; it contributes no rows.
    """

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "shosi").glob("*.txt"))
    output_path = Path(config.SENTENCE_DATA_DIR, "ndlbib.csv")

    # Truncate any previous output and write the header once; the per-file
    # writes below append without a header.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            # presumably returns a pandas DataFrame -- only .to_csv is relied on here
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            # Skip this file: without this, `df` is either unbound (first
            # file) or still holds the previous file's rows, which would be
            # appended a second time.
            continue

        df.to_csv(
            output_path,
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved ndlbib data!")


# Run the full ndlbib extraction when this file is executed as a script.
if __name__ == "__main__":
    ndlbib_data()