File size: 1,598 Bytes
9aba307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from pathlib import Path

import jaconv
import pandas as pd
from tqdm import tqdm

from config import config
from config.config import logger
from yomikata import utils


def pronunciation_data():
    """Merge the per-source reading CSVs into a single ``all.csv``.

    Reads every ``*.csv`` file in ``config.READING_DATA_DIR`` except the
    generated ``all.csv`` and ``ambiguous.csv``, concatenates them, and keeps
    only the ``surface`` and ``kana`` columns. Both columns are stripped and
    passed through ``utils.standardize_text``; ``kana`` is additionally
    converted with ``jaconv.kata2hira``. Rows are then dropped when:

    * ``surface`` or ``kana`` is missing, or ``kana`` is empty,
    * ``surface`` and ``kana`` are identical,
    * ``utils.has_kanji`` is falsy for ``surface``,
    * ``surface`` matches the symbol pattern used below,
    * the (``surface``, ``kana``) pair duplicates an earlier row.

    The result is written to ``all.csv`` in ``config.READING_DATA_DIR``.
    If no input files are found, a warning is logged and nothing is written.
    """
    reading_dir = Path(config.READING_DATA_DIR)
    generated_files = {"all.csv", "ambiguous.csv"}

    # Sorted so the row order of the output (and which duplicate is kept)
    # does not depend on the filesystem's listing order.
    data_files = sorted(
        path for path in reading_dir.glob("*.csv") if path.name not in generated_files
    )

    if not data_files:
        logger.warning("No pronunciation data files found in %s", reading_dir)
        return

    # Concatenate once instead of growing a DataFrame inside the loop.
    df = pd.concat([pd.read_csv(path) for path in data_files], ignore_index=True)

    # Drop missing values before the string conversion: astype(str) would turn
    # NaN into the literal text "nan", which then survives the empty-kana
    # filter and ends up in the output as a bogus reading.
    df = df.dropna(subset=["surface", "kana"]).copy()

    for column in ("surface", "kana"):
        df[column] = df[column].astype(str).str.strip()

    tqdm.pandas()

    df["kana"] = df["kana"].progress_apply(utils.standardize_text)
    df["surface"] = df["surface"].progress_apply(utils.standardize_text)
    # Column-wise apply; a row-wise apply (axis=1) is not needed to map one column.
    df["kana"] = df["kana"].progress_apply(jaconv.kata2hira)

    df = df[(df["surface"] != df["kana"]) & (df["kana"] != "")]

    # astype(bool) keeps the mask a boolean indexer even when the frame is empty.
    df = df[df["surface"].progress_apply(utils.has_kanji).astype(bool)]

    # NOTE(review): this pattern only matches one of the listed symbols when it
    # is immediately followed by a literal "." -- confirm the trailing `\.` is
    # intended rather than a filter on the symbols alone.
    df = df.loc[~df["surface"].str.contains(r"[〜〜()\)\(\*]\.")]

    df = df[["surface", "kana"]]
    df = df.drop_duplicates()

    df.to_csv(reading_dir / "all.csv", index=False)

    logger.info("✅ Merged all the pronunciation data!")

    # Disabled: export of surfaces that have more than one reading. This is
    # the step that would produce the "ambiguous.csv" skipped above.
    # merged_df = (
    #     df.groupby("surface")["kana"]
    #     .apply(list)
    #     .reset_index(name="pronunciations")
    # )

    # ambiguous_df = merged_df[merged_df["pronunciations"].apply(len) > 1]
    # ambiguous_df.to_csv(Path(config.READING_DATA_DIR, "ambiguous.csv"), index=False)


# Script entry point: rebuild the merged pronunciation CSV when this module is
# executed directly (no effect when it is imported).
if __name__ == "__main__":
    pronunciation_data()