File size: 3,141 Bytes
9aba307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""kwdlc.py
Data processing script for KWDLC files directly in the repository format
KWDLC repository: https://github.com/ku-nlp/KWDLC
"""

import warnings
from pathlib import Path

import pandas as pd
from speach import ttlig

from config import config
from config.config import logger
from yomikata import utils

warnings.filterwarnings("ignore")


def read_knp_file(filename: str):
    """Parse one KNP-format file and return aligned sentence data.

    Walks the file line by line, accumulating the surface text and the
    furigana (ruby) markup for each sentence, and flushes the accumulators
    when the ``EOS`` terminator line is reached.  Sentences whose furigana
    markup does not round-trip back to the surface text are dropped with a
    warning rather than kept inconsistently.

    Args:
        filename: Path to a ``.knp`` file from the KWDLC repository.

    Returns:
        A tuple ``(ids, sentences, furiganas)`` of three equal-length lists:
        sentence ids, standardized surface sentences, and standardized
        furigana strings with ruby markup.
    """
    # KWDLC files are UTF-8; be explicit so the platform default encoding
    # cannot corrupt the Japanese text.
    with open(filename, encoding="utf-8") as f:
        contents = f.readlines()

    ids = []
    sentences = []
    furiganas = []
    sentence = ""
    furigana = ""
    # Set by the "#" header line that precedes each sentence; initialized so
    # a malformed file (EOS before any header) cannot raise NameError.
    sentence_id = None
    for row in contents:
        first_word = row.split(" ")[0]
        if first_word in ["*", "+"]:
            # Phrase/chunk annotation lines carry no surface text; skip.
            pass
        elif first_word == "#":
            # Header line, e.g. "# S-ID:w201106-... ..." -> extract the id.
            sentence_id = row.split(" ")[1].split("S-ID:")[1]
        elif first_word == "EOS\n":
            # End of sentence: standardize both strings and keep the pair
            # only if the furigana markup reproduces the surface sentence.
            sentence = utils.standardize_text(sentence)
            furigana = utils.standardize_text(furigana)
            if sentence == utils.remove_furigana(furigana):
                sentences.append(sentence)
                furiganas.append(furigana)
                ids.append(sentence_id)
            else:
                logger.warning(
                    f"Dropping mismatched line \n Sentence: {sentence} \n  Furigana: {furigana}"
                )
            sentence = ""
            furigana = ""
        else:
            # Morpheme line: "<surface> <reading> ...".
            words = row.split(" ")
            sentence += words[0]
            if words[0] == words[1]:
                # Reading equals surface form: no ruby markup needed.
                furigana += words[0]
            else:
                furigana += ttlig.RubyToken.from_furi(words[0], words[1]).to_code()

    # Sanity: the three parallel lists must stay in lockstep.
    assert len(ids) == len(sentences)
    assert len(sentences) == len(furiganas)
    return ids, sentences, furiganas


def kwdlc_data():
    """Extract, load and transform the kwdlc data.

    Reads every ``.knp`` file under ``RAW_DATA_DIR/kwdlc``, builds a
    sentence/furigana dataframe, removes known-bad sentence ids and
    duplicates, validates that every furigana string reproduces its surface
    sentence, and writes the result to ``SENTENCE_DATA_DIR/kwdlc.csv``.

    Raises:
        ValueError: If any furigana row fails to round-trip back to its
            surface sentence after standardization.
    """
    # Extract sentences from the data files
    knp_files = list(Path(config.RAW_DATA_DIR, "kwdlc").glob("**/*.knp"))

    all_ids = []
    all_sentences = []
    all_furiganas = []
    for knp_file in knp_files:
        ids, sentences, furiganas = read_knp_file(knp_file)
        all_ids += ids
        all_sentences += sentences
        all_furiganas += furiganas

    # construct dataframe
    df = pd.DataFrame(
        list(zip(all_sentences, all_furiganas, all_ids)),
        columns=["sentence", "furigana", "sentenceid"],
    )

    # Sentence ids known to carry annotation errors in the upstream data.
    error_ids = [
        "w201106-0000547376-1",
        "w201106-0001768070-1-01",
        "w201106-0000785999-1",
        "w201106-0001500842-1",
        "w201106-0000704257-1",
        "w201106-0002300346-3",
        "w201106-0001779669-3",
        "w201106-0000259203-1",
    ]

    df = df[~df["sentenceid"].isin(error_ids)]
    df = df.drop_duplicates()
    df["furigana"] = df["furigana"].apply(utils.standardize_text)
    df["sentence"] = df["sentence"].apply(utils.standardize_text)

    # Validate the whole frame. Use an explicit raise rather than `assert`
    # so the check is not silently stripped under `python -O`.
    if not (df["sentence"] == df["furigana"].apply(utils.remove_furigana)).all():
        raise ValueError("kwdlc data: furigana does not round-trip to sentence")

    # Output
    df.to_csv(Path(config.SENTENCE_DATA_DIR, "kwdlc.csv"), index=False)

    logger.info("✅ Saved kwdlc data!")


# Script entry point: run the full KWDLC extract/transform/save pipeline.
if __name__ == "__main__":
    kwdlc_data()