"""aozora.py
Data processing script for aozora bunko file from https://github.com/ndl-lab/huriganacorpus-aozora
"""

import warnings
from pathlib import Path

import pandas as pd
from pandas.errors import ParserError
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils
from yomikata.dataset.repair_long_vowels import repair_long_vowels

warnings.filterwarnings("ignore")
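
# Corpus format, as inferred from the parsing logic in read_file() below
# (illustrative, not an authoritative spec): each row is tab-separated with
# up to three fields, word / furigana / type. "[入力文]" rows carry the full
# sentence text in the word field, "漢字" rows carry a kanji token plus its
# reading, rows whose word field is "行番号:N" mark the start of source line
# N, and all remaining rows are kana or punctuation tokens.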


def read_file(file: Path) -> pd.DataFrame:
    """Parse one corpus file into sentence/furigana/sentenceid rows."""
    # read the tab-separated rows, keeping at most the first three fields
    with open(file) as f:
        rows = [line.rstrip("\r\n").split("\t")[0:3] for line in f]
    df = pd.DataFrame(rows, columns=["word", "furigana", "type"])
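    # pandas pads rows that have fewer than three fields with NaN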

    # remove marker rows and rows with missing words
    df = df[~df["type"].isin(["[入力 読み]", "分かち書き"])]
    df = df[~pd.isna(df["word"])]
    df = df[df["word"] != ""]

    # organize the remaining rows into sentences, one per 行番号 (line number) marker
    sentence = ""
    furigana = ""
    sentenceid = None
    gyous = []
    for row in df.itertuples():
        if row.type == "[入力文]":
            # the full text of the current sentence
            sentence = row.word
        elif row.type == "漢字":
            # kanji token: encode the surface form and its repaired reading as ruby markup
            furigana += ttlig.RubyToken.from_furi(
                row.word, repair_long_vowels(row.furigana, row.word)
            ).to_code()
        elif row.word.split(":")[0] == "行番号":
            # line-number marker: flush the finished sentence and start a new one
            if sentenceid:  # no sentence has been read before the first marker
                gyous.append([sentence, furigana, sentenceid])
            sentenceid = file.name + "_" + row.word.split(":")[1].strip()
            sentence = None
            furigana = ""
        else:
            # kana and punctuation carry no reading; append the surface form as-is
            furigana += row.word

    # flush the final sentence, which has no following 行番号 marker
    gyous.append([sentence, furigana, sentenceid])

    # build the dataframe and drop entries that never received sentence text
    gyou_df = pd.DataFrame(gyous, columns=["sentence", "furigana", "sentenceid"])
    gyou_df = gyou_df[~pd.isna(gyou_df.sentence)]

    # normalize both columns; Aozora markup characters (|, ※) and spaces are
    # also stripped from the sentence
    gyou_df["furigana"] = gyou_df["furigana"].apply(utils.standardize_text)
    gyou_df["sentence"] = gyou_df["sentence"].apply(
        lambda s: utils.standardize_text(
            s.replace("|", "").replace(" ", "").replace("※", "")
        )
    )

    # keep only rows where stripping the readings from the furigana string
    # reproduces the sentence exactly
    gyou_df = gyou_df[
        gyou_df["sentence"] == gyou_df["furigana"].apply(utils.remove_furigana)
    ]

    # remove known-bad sentences (populate error_ids with sentenceids as errors are found)
    error_ids = []
    gyou_df = gyou_df[~gyou_df["sentenceid"].isin(error_ids)]

    # remove duplicates
    gyou_df = gyou_df.drop_duplicates()

    return gyou_df
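
# A minimal usage sketch for a single corpus file (the path is hypothetical):
#
#   df = read_file(Path("aozora/000879/files/127.txt"))
#   print(df.head())  # columns: sentence, furigana, sentenceid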


def aozora_data():
    """Extract, load and transform the aozora data"""

    # Extract sentences from the data files
    files = list(Path(config.RAW_DATA_DIR, "aozora").glob("*/*/*.txt"))

    # write the CSV header once; each file's rows are appended beneath it
    with open(Path(config.SENTENCE_DATA_DIR, "aozora.csv"), "w") as f:
        f.write("sentence,furigana,sentenceid\n")

    for i, file in enumerate(files):
        logger.info(f"{i+1}/{len(files)} {file.name}")
        try:
            df = read_file(file)
        except ParserError:
            logger.error(f"Parser error on {file}")
            continue  # without this, df would be undefined or stale from the previous file
        df.to_csv(
            Path(config.SENTENCE_DATA_DIR, "aozora.csv"),
            mode="a",
            index=False,
            header=False,
        )

    logger.info("✅ Saved all aozora data!")


if __name__ == "__main__":
    aozora_data()